├── __init__.py ├── bucket_generation ├── __init__.py ├── add_to_path.sh ├── replayExisting.py ├── generators │ ├── random_baseline │ │ └── guesser.py │ ├── character_grams │ │ └── guesser.py │ ├── n_grams │ │ └── guesser.py │ ├── continella │ │ └── guesser.py │ ├── pcfg │ │ └── guesser.py │ ├── token_pcfg │ │ └── guesser.py │ └── rnn │ │ └── guesser.py └── utils.py ├── bucket_extraction ├── utils │ ├── __init__.py │ └── extract_utils.py ├── __init__.py ├── feed_to_validator │ └── feed_to_validator.py ├── grayhatwarfare │ └── grayhatwarfare.py ├── bing │ └── bing.py ├── farsight │ └── farsight.py └── virustotal │ └── virustotal.py ├── data ├── extraction │ ├── bing │ │ └── .gitignore │ ├── farsight │ │ └── .gitignore │ ├── grayhatwarfare │ │ └── .gitignore │ └── virustotal │ │ └── .gitignore └── validation │ ├── aliyuncs.com │ └── .gitignore │ ├── s3.amazonaws.com │ └── .gitignore │ └── storage.googleapis.com │ └── .gitignore ├── final_output ├── .gitignore └── gather_all_buckets.sh ├── images └── flow.jpg ├── requirements.txt ├── .env.example ├── .gitignore ├── utils.py ├── bucket_validation ├── test_bloom │ └── test_bloom.go ├── bloom │ └── add_to_bloom.go └── listener.go ├── README.md ├── main.py └── LICENSE /__init__.py: -------------------------------------------------------------------------------- 1 | from utils import * -------------------------------------------------------------------------------- /bucket_generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bucket_extraction/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/extraction/bing/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /data/extraction/farsight/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /bucket_generation/add_to_path.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=$(pwd) -------------------------------------------------------------------------------- /data/extraction/grayhatwarfare/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /data/extraction/virustotal/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /data/validation/aliyuncs.com/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /data/validation/s3.amazonaws.com/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /data/validation/storage.googleapis.com/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 
!.gitignore -------------------------------------------------------------------------------- /final_output/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !gather_all_buckets.sh 3 | !.gitignore -------------------------------------------------------------------------------- /bucket_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from bucket_extraction.utils.extract_utils import * 2 | -------------------------------------------------------------------------------- /images/flow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-esrg/stratosphere/HEAD/images/flow.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | grequests==0.6.0 2 | argparse==1.4.0 3 | python-dotenv 4 | requests 5 | pystalk 6 | numpy 7 | keras 8 | tensorflow 9 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Bing Cognitive Search API key 2 | # https://docs.microsoft.com/en-us/azure/cognitive-services/bing-web-search/ 3 | BING_API_KEY=1 4 | 5 | # Farsight API key 6 | FARSIGHT_API_KEY=1 7 | 8 | # GrayHat Warfare access token 9 | GRAYHAT_ACCESS_TOKEN=1 10 | 11 | # VirusTotal API key 12 | VIRUSTOTAL_API_KEY=1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.code-workspace 2 | *__pycache__* 3 | .DS_Store 4 | *.pyc 5 | .vscode 6 | bucket_validation/listener-config.json 7 | *.ipynb_checkpoints* 8 | *.ipynb 9 | bucket_validation/bloom/candidate_set.bloom 10 | bucket_validation/bloom/old_candidate_set.bloom 11 | bucket_generation/github_orgs/.env 12 | bucket_generation/github_orgs/repo_counts_for_1k_orgs.png 13 | .env 14 | -------------------------------------------------------------------------------- /bucket_extraction/feed_to_validator/feed_to_validator.py: -------------------------------------------------------------------------------- 1 | from pystalk import BeanstalkClient 2 | import time 3 | 4 | beanstalk_client = BeanstalkClient('127.0.0.1', 11301) 5 | 6 | def feedToValidator(file, label): 7 | with open(file, 'r') as f: 8 | lines = list(f) 9 | for line in lines: 10 | line = line.strip() 11 | print('CAND:', line) 12 | beanstalk_client.put_job("extraction/" + label + "," + line) 13 | -------------------------------------------------------------------------------- /bucket_generation/replayExisting.py: -------------------------------------------------------------------------------- 1 | from pystalk import BeanstalkClient 2 | import time 3 | from bucket_extraction import getBucketsFromText 4 | 5 | beanstalk_client = BeanstalkClient('127.0.0.1', 11301) 6 | 7 | def replayExisting(file, label): 8 | with open(file, 'r') as f: 9 | for line in f: 10 | buckets = list(getBucketsFromText(line)) 11 | for bucket in buckets: 12 | print('CAND:', bucket) 13 | beanstalk_client.put_job("generation/" + label + "," + bucket) 14 | time.sleep(1/200) 15 | 16 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from bucket_extraction import 
getBucketsFromText 2 | 3 | def initializeSetFromTextFile(path, setType): 4 | """ 5 | Initialize set comprised of lines from the text file 6 | :param path: path to text file 7 | :param setType: a set that we will add lines to 8 | """ 9 | with open(path,'r') as f: 10 | for line in f: 11 | setType.add(line.strip()) 12 | 13 | def getBucketsFromTextFile(path): 14 | buckets = set() 15 | with open(path, 'r') as f: 16 | lines = f.readlines() 17 | for line in lines: 18 | buckets = buckets.union(getBucketsFromText(line)) 19 | return buckets 20 | -------------------------------------------------------------------------------- /final_output/gather_all_buckets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | shopt -s extglob 3 | 4 | # Helper script to gather data from all folders 5 | cat ./data/validation/@(s3.amazonaws.com|storage.googleapis.com|aliyuncs.com|bucket_types)/private.txt > ./final_output/all_platforms_private.txt 6 | sort -u -t, -k1,1 ./final_output/all_platforms_private.txt -o ./final_output/all_platforms_private.txt 7 | cat ./data/validation/@(s3.amazonaws.com|storage.googleapis.com|aliyuncs.com|bucket_types)/public.txt > ./final_output/all_platforms_public.txt 8 | sort -u -t, -k1,1 ./final_output/all_platforms_public.txt -o ./final_output/all_platforms_public.txt 9 | cat ./final_output/all_platforms_public.txt ./final_output/all_platforms_private.txt | sort -u > ./final_output/all_platforms_all.txt 10 | 11 | -------------------------------------------------------------------------------- /bucket_extraction/utils/extract_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scrape a given site's source for buckets. 3 | """ 4 | import requests 5 | import re 6 | 7 | def getBucketsFromText(text): 8 | regexes = [r'([\w\d_\.-]+)\.s3[\w\d-]*\.amazonaws\.com', 9 | r'([\w\d_\.-]+)\.storage\.googleapis\.com', 10 | r'([\w\d_\.-]+)\.[\w\d\.-]*\.cdn\.digitaloceanspaces\.com', 11 | r'([\w\d_-]+)\.oss[\w\d-]*\.aliyuncs\.com', 12 | r'^[^.]*s3[\w\d-]*\.amazonaws\.com\/([\w\d_.-]+)', 13 | r'^[^.]*s3[\w\d\.-]*\.wasabisys\.com\/([\w\d_.-]+)', 14 | r'^[^.]*storage\.googleapis\.com\/([\w\d_.-]+)', 15 | r'^[^.]*oss[\w\d_-]*\.aliyuncs\.com\/([\w\d_.-]+)'] 16 | for regex in regexes: 17 | found = re.findall(regex, text.lower()) 18 | if len(found) > 0: 19 | return found 20 | return set() 21 | 22 | -------------------------------------------------------------------------------- /bucket_extraction/grayhatwarfare/grayhatwarfare.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import os 4 | 5 | def getGrayhatWarfare(): 6 | buckets = [] 7 | num_buckets = 100000 8 | current = 0 9 | chunk_size = 50000 10 | 11 | access_token = os.getenv("GRAYHAT_ACCESS_TOKEN") 12 | 13 | while current < num_buckets: 14 | resp = requests.get("https://buckets.grayhatwarfare.com/api/v1/buckets/" + str(current) + "/" + str(chunk_size) + "?access_token=" + access_token) 15 | resp_json = json.loads(resp.text) 16 | for bucket in resp_json["buckets"]: 17 | if bucket["type"] == "aws": 18 | buckets.append(bucket["bucket"]) 19 | print(len(resp_json["buckets"])) 20 | 21 | current += chunk_size 22 | out = './data/extraction/grayhatwarfare/grayhatwarfare.txt' 23 | with open(out, 'w+') as f: 24 | for b in buckets: 25 | f.write(b + '\n') 26 | print("Wrote buckets to " + out) 27 | 28 | if __name__ == "__main__": 29 | getGrayhatWarfare() 
-------------------------------------------------------------------------------- /bucket_generation/generators/random_baseline/guesser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | import string 4 | 5 | from bucket_generation.utils import getBeanstalkClient, addArguments 6 | 7 | def randomlyGuessBucketNames(numCharacters=5, numTrials=float("inf"), name="random"): 8 | beanstalkClient = getBeanstalkClient() 9 | while numTrials > 0: 10 | numTrials -= 1 11 | randomBucket = "".join( 12 | [ 13 | random.choice(list(string.ascii_lowercase) + list(string.digits) + ["-",".","_"]) 14 | for _ in range(numCharacters) 15 | ] 16 | ) 17 | print(f"CAND: {randomBucket}") 18 | beanstalkClient.put_job(f"generation/{name},{randomBucket}") 19 | 20 | 21 | if __name__ == "__main__": 22 | parser = argparse.ArgumentParser(description='Run the random baseline generator.') 23 | addArguments(parser) 24 | parser.add_argument("--character_num", type=int, help="The number of characters to generate.") 25 | args = parser.parse_args() 26 | randomlyGuessBucketNames(name=args.name, numTrials=int(args.num_trials), numCharacters=int(args.character_num)) 27 | -------------------------------------------------------------------------------- /bucket_validation/test_bloom/test_bloom.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | /** 4 | * Loads the candidate-set bloom filter and tests the given command-line arguments for membership. 5 | */ 6 | 7 | import ( 8 | "bufio" 9 | "log" 10 | "os" 11 | "strings" 12 | 13 | "github.com/willf/bloom" 14 | ) 15 | 16 | /** 17 | * Adds all the lines of text from the textfile to the bloom filter. 18 | */ 19 | func addFromTextFile(filter *bloom.BloomFilter, filePath string) { 20 | file, err := os.Open(filePath) 21 | if err != nil { 22 | log.Fatal(err) 23 | } 24 | defer file.Close() 25 | scanner := bufio.NewScanner(file) 26 | for scanner.Scan() { 27 | line := scanner.Text() 28 | lineContents := strings.Split(line, ",") 29 | if len(lineContents) == 2 { // Of the form "bucket,timestamp" 30 | filter.AddString(lineContents[0]) 31 | } else { // Normal form: a bare bucket name 32 | filter.AddString(scanner.Text()) 33 | } 34 | } 35 | } 36 | 37 | func main() { 38 | filter := bloom.NewWithEstimates(300000000, .000001) 39 | 40 | f1, err := os.OpenFile("./bucket_validation/bloom/candidate_set.bloom", os.O_CREATE|os.O_RDONLY, 0644) 41 | if err != nil { 42 | panic(err) 43 | } 44 | defer f1.Close() 45 | 46 | r := bufio.NewReader(f1) 47 | readsize, err := filter.ReadFrom(r) 48 | 49 | if err != nil { 50 | panic(err) 51 | } 52 | log.Println(readsize) 53 | 54 | for i := 1; i < len(os.Args); i++ { 55 | log.Println(os.Args[i], filter.TestString(os.Args[i])) 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /bucket_validation/bloom/add_to_bloom.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | /** 4 | * Creates a bloom filter and initializes it from all lines read from the validator files. 5 | */ 6 | 7 | import ( 8 | "bufio" 9 | "log" 10 | "os" 11 | "strings" 12 | 13 | "github.com/willf/bloom" 14 | ) 15 | 16 | /** 17 | * Adds all the lines of text from the textfile to the bloom filter. 
18 | */ 19 | func addFromTextFile(filter *bloom.BloomFilter, filePath string) { 20 | file, err := os.Open(filePath) 21 | if err != nil { 22 | log.Fatal(err) 23 | } 24 | defer file.Close() 25 | scanner := bufio.NewScanner(file) 26 | for scanner.Scan() { 27 | line := scanner.Text() 28 | lineContents := strings.Split(line, ",") 29 | if len(lineContents) == 2 { // Of the form "bucket,timestamp" 30 | filter.AddString(lineContents[0]) 31 | } else { // Normal form: a bare bucket name 32 | filter.AddString(scanner.Text()) 33 | } 34 | } 35 | } 36 | 37 | func main() { 38 | filter := bloom.NewWithEstimates(300000000, .000001) 39 | 40 | f1, err := os.OpenFile("./bucket_validation/bloom/candidate_set.bloom", os.O_CREATE|os.O_RDONLY, 0644) 41 | if err != nil { 42 | panic(err) 43 | } 44 | defer f1.Close() 45 | 46 | r := bufio.NewReader(f1) 47 | readsize, err := filter.ReadFrom(r) 48 | 49 | if err != nil { 50 | panic(err) 51 | } 52 | log.Println(readsize) 53 | 54 | // Add the bucket names from the given text file to the filter 55 | addFromTextFile(filter, os.Args[1]) 56 | 57 | f2, err := os.OpenFile("./bucket_validation/bloom/candidate_set.bloom", os.O_WRONLY, 0644) 58 | if err != nil { 59 | panic(err) 60 | } 61 | defer f2.Close() 62 | 63 | w := bufio.NewWriter(f2) 64 | size, err := filter.WriteTo(w) 65 | if err != nil { 66 | panic(err) 67 | } 68 | // Flush the buffered writer so the updated filter is fully persisted before the file is closed. 69 | if err := w.Flush(); err != nil { 70 | panic(err) 71 | } 72 | log.Println(size) 73 | 74 | } 75 | -------------------------------------------------------------------------------- /bucket_extraction/bing/bing.py: -------------------------------------------------------------------------------- 1 | import grequests 2 | import csv 3 | from .. import getBucketsFromText 4 | import random 5 | import string 6 | import os 7 | 8 | subscription_key = os.getenv("BING_API_KEY") 9 | search_url = "https://api.cognitive.microsoft.com/bing/v7.0/search" 10 | 11 | headers = {"Ocp-Apim-Subscription-Key": subscription_key} 12 | 13 | NUM_SEARCHES = 100 14 | NUM_THREADS = 10 15 | SEED_LENGTH = 3 16 | OUTPUT_NAME = "./data/extraction/bing/buckets_output.txt" 17 | 18 | def exception(request, exception): 19 | print("Error: {}: {}".format(request.url, exception)) 20 | 21 | def getBucketsFromBing(): 22 | 23 | try: 24 | with open(OUTPUT_NAME, "r") as f: 25 | buckets = set(line.strip() for line in f) 26 | except Exception as e: 27 | buckets = set() 28 | 29 | initLen = len(buckets) 30 | 31 | reqs = [] 32 | 33 | for i in range(0, NUM_SEARCHES): 34 | rand = ''.join(random.choice(string.ascii_lowercase) for _ in range(SEED_LENGTH)) 35 | platform = random.choice(["s3.amazonaws.com", "storage.googleapis.com", "oss.aliyuncs.com"]) 36 | reqs.append(grequests.get( 37 | search_url, 38 | headers=headers, 39 | params={ 40 | "q": "site:" + platform + " \"" + rand + "\"", 41 | "responseFilter": "Webpages", 42 | "count": 50, 43 | "offset": 0 44 | }, 45 | stream=False 46 | )) 47 | 48 | results = grequests.map(reqs, exception_handler=exception, size=NUM_THREADS) 49 | 50 | for result in results: 51 | if result is None: 52 | continue 53 | 54 | result.close() 55 | 56 | if result.status_code != 200: 57 | print(result) 58 | continue 59 | 60 | parsed = result.json() 61 | if "webPages" in parsed and "value" in parsed["webPages"]: 62 | for page in parsed["webPages"]["value"]: 63 | if "snippet" in page: 64 | buckets = buckets.union(getBucketsFromText(page["snippet"])) 65 | buckets = buckets.union(getBucketsFromText(page["url"])) 66 | 67 | numAdded = len(buckets) - initLen 68 | ratio = numAdded / NUM_SEARCHES 69 | print("Discovered {} new buckets. 
({} buckets / search)".format(numAdded, ratio)) 70 | print("Wrote buckets to " + OUTPUT_NAME) 71 | 72 | with open(OUTPUT_NAME, 'w+') as f: 73 | f.write("\n".join(buckets)) 74 | -------------------------------------------------------------------------------- /bucket_extraction/farsight/farsight.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from .. import getBucketsFromText 4 | import os 5 | 6 | API_ENDPOINTS = [("https://api.dnsdb.info/lookup/rrset/name/", "rrname"), ("https://api.dnsdb.info/lookup/rdata/name/", "rdata")] 7 | regions = ["us-east-2", "us-east-1", "us-west-1", "us-west-2", "af-south-1", "ap-east-1", "ap-south-1", "ap-northeast-3", "ap-northeast-2", "ap-southeast-1", "ap-southeast-2", "ap-northeast-1", "ca-central-1", "cn-north-1", "cn-northwest-1", "eu-central-1", "eu-west-1", "eu-west-2", "eu-south-1", "eu-west-3", "eu-north-1", "me-south-1", "sa-east-1", "us-gov-east-1", "us-gov-west-1"] 8 | 9 | API_KEY = os.getenv("FARSIGHT_API_KEY") 10 | OUTPUT_NAME = "./data/extraction/farsight/" 11 | 12 | api_limit = 1000000 13 | 14 | def lookupFile(file, type): 15 | files = [] 16 | with open(file, 'r') as f: 17 | for line in f: 18 | text = line.strip() 19 | if "{region}" in text: 20 | all_regions = [] 21 | for region in regions: 22 | all_regions.append(text.replace("{region}", region)) 23 | files.append(all_regions) 24 | else: 25 | files.append([text]) 26 | for endpoint_list in files: 27 | lookup(endpoint_list, type + "/") 28 | 29 | def lookup(endpoints, directory=""): 30 | all_domains = set() 31 | for domain in endpoints: 32 | for endpoint_pair in API_ENDPOINTS: 33 | (endpoint, field_name) = endpoint_pair 34 | offset = 0 35 | while True: 36 | url = "{}*.{}?limit={}&offset={}".format(endpoint, domain, api_limit, offset) 37 | print("Fetching " + url) 38 | headers = {'Accept': 'application/json', 'X-API-Key': API_KEY} 39 | resp = requests.get(url, headers=headers) 40 | if resp.status_code != 200: 41 | break 42 | split = resp.text.split("\n") 43 | for line in split: 44 | if line.strip() == "": 45 | continue 46 | resp_json = json.loads(line) 47 | if field_name in resp_json: 48 | dns_val = resp_json[field_name] 49 | if dns_val[-1] == ".": 50 | dns_val = dns_val[:-1] 51 | all_domains.add(dns_val) 52 | print(len(split)) 53 | if len(split) < api_limit or offset >= 4000000: 54 | break 55 | offset += api_limit 56 | out = './data/extraction/farsight/' + directory + domain + ".txt" 57 | with open(out, 'w+') as f: 58 | f.write("\n".join(all_domains)) 59 | print("Wrote buckets to " + out) 60 | 61 | -------------------------------------------------------------------------------- /bucket_extraction/virustotal/virustotal.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import subprocess 3 | import grequests 4 | from .. 
import getBucketsFromText 5 | import os 6 | 7 | S3_IP_URL = "https://ip-ranges.amazonaws.com/ip-ranges.json" 8 | API_KEY = os.getenv("VIRUSTOTAL_API_KEY") 9 | IP_LOOKUP_ENDPOINT = "https://www.virustotal.com/vtapi/v2/ip-address/report" 10 | BATCH_SEARCH = 100 11 | NUM_THREADS = 10 12 | OUTPUT_NAME = "./data/extraction/virustotal/buckets_output.txt" 13 | 14 | def exception(request, exception): 15 | print("Error: {}: {}".format(request.url, exception)) 16 | 17 | # Fetches CIDR ranges of all S3 IPs from Amazon 18 | def getS3IPs(): 19 | json = requests.get(S3_IP_URL).json() 20 | cidrs = [] 21 | for prefix in json["prefixes"]: 22 | if prefix["service"] == "AMAZON" and "ip_prefix" in prefix: 23 | cidrs.append(prefix["ip_prefix"]) 24 | with open('./data/extraction/virustotal/all_ips.txt', 'w+') as f: 25 | f.write("\n".join(cidrs)) 26 | print("Wrote cidrs to all_ips.txt") 27 | 28 | # Runs a ping scan via Zmap for all S3 IPs 29 | def runZmap(): 30 | with open("./data/extraction/virustotal/all_ips.txt", "r") as f: 31 | cidrs = f.read().splitlines() 32 | print(" ".join(cidrs)) 33 | command = "sudo zmap -i ens8 --probe-module=icmp_echoscan -B 10M -o ./data/extraction/virustotal/live_ips.txt " + " ".join(cidrs) 34 | subprocess.call(command, shell=True) 35 | subprocess.call("cp ./data/extraction/virustotal/live_ips.txt ./data/extraction/virustotal/rem_ips.txt", shell=True) 36 | 37 | def lookup(num): 38 | with open("./data/extraction/virustotal/rem_ips.txt", "r") as f: 39 | ips = f.read().splitlines() 40 | 41 | domains = set() 42 | with open("./data/extraction/virustotal/domains_output.txt", "r") as f: 43 | domains = set(line.strip() for line in f) 44 | buckets = set() 45 | with open(OUTPUT_NAME, "r") as f: 46 | buckets = set(line.strip() for line in f) 47 | while num > 0: 48 | numSearches = min(num, BATCH_SEARCH) 49 | num -= numSearches 50 | reqs = [] 51 | for i in range(0, numSearches): 52 | ip = ips.pop(0) 53 | reqs.append(grequests.get( 54 | IP_LOOKUP_ENDPOINT, 55 | params={ 56 | "apikey": API_KEY, 57 | "ip": ip 58 | }, 59 | stream=False)) 60 | 61 | results = grequests.map(reqs, exception_handler=exception, size=NUM_THREADS) 62 | for result in results: 63 | if result is None: 64 | print("Error: null response") 65 | continue 66 | parsed = result.json() 67 | if ("response_code" in parsed and parsed["response_code"] == "0") or "resolutions" not in parsed: 68 | result.close() 69 | continue 70 | for res in parsed["resolutions"]: 71 | if "hostname" in res and res["hostname"] is not None: 72 | domains.add(res["hostname"]) 73 | buckets = buckets.union(getBucketsFromText(res["hostname"])) 74 | result.close() 75 | 76 | with open(OUTPUT_NAME, 'w+') as f: 77 | f.write("\n".join(buckets)) 78 | 79 | with open("./bucket_extraction/virustotal/domains_output.txt", 'w+') as f: 80 | f.write("\n".join(domains)) 81 | 82 | with open("./bucket_extraction/virustotal/rem_ips.txt", "w+") as f: 83 | f.write("\n".join(ips)) 84 | 85 | print("Wrote buckets to " + OUTPUT_NAME) 86 | 87 | 88 | -------------------------------------------------------------------------------- /bucket_generation/generators/character_grams/guesser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inspired from https://dl.acm.org/doi/pdf/10.1145/1102120.1102168 and https://hal.archives-ouvertes.fr/hal-01112124/file/omen.pdf 3 | Do n-grams where n=4, but look at the character level. 
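For example, with a hypothetical training name such as "prod-logs", the four-character context "rod-" gains extra weight for the next character "l"; candidates are then sampled one character at a time, conditioned on the previous (up to) four characters, with Laplace smoothing over a-z, 0-9, "-", ".", and "_".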
4 | """ 5 | import argparse 6 | from collections import Counter, defaultdict 7 | import string 8 | 9 | import numpy as np 10 | 11 | from bucket_extraction.utils.extract_utils import getBucketsFromText 12 | import bucket_generation.utils as gen_utils 13 | 14 | def generateLaplaceDistribution(): 15 | return Counter(list(string.ascii_lowercase) + list(string.digits) + ["-",".","_"]) 16 | 17 | def getCounters(buckets): 18 | """ 19 | Build distribution from left to right of prev4 chars -> next char. 20 | We will have some default Laplace smoothing, both to add a little bit of randomness, 21 | and to make sure that we have at least some distribution if the sequence 22 | has not been encountered before. 23 | """ 24 | lengthDistribution = Counter() 25 | counters = defaultdict(generateLaplaceDistribution) 26 | for bucket in buckets: 27 | bucketString = bucket.lower().strip() 28 | for i in range(len(bucketString)): 29 | counters[ 30 | bucketString[max(0, i-4): i] # If we aren't at fourth character yet, just do previous characters. 31 | ][bucketString[i]] += 1 32 | lengthDistribution[len(bucket)] += 1000 33 | return counters, lengthDistribution 34 | 35 | def sampleFromCounter(counter): 36 | total = sum(counter.values()) 37 | return np.random.choice([k for k,v in counter.items()], p=[v/total for k,v in counter.items()]) 38 | 39 | def generateCandidates(name="c4grams", startingCandidates=None, beanstalkPort=None, numTrials=float("inf"), public=False): 40 | beanstalkClient = gen_utils.getBeanstalkClient(port=beanstalkPort) 41 | previouslySeen = startingCandidates | gen_utils.readBucketsFromFile(f"./data/generation/{name}.txt") 42 | 43 | # Randomly generate template according to distro 44 | while numTrials > 0: 45 | print("Updating character-level 4-grams.") 46 | # In intervals of 10,000 guesses, update our PCFG from our successful guesses. 
47 | with gen_utils.Profiler(gen_utils.ProfilerType.TRAIN, name): 48 | candidates = startingCandidates | gen_utils.getExistingAlreadyGuessedBuckets(name, public=public) 49 | counters, lengthDistribution = getCounters(candidates) 50 | 51 | 52 | for _ in range(int(1e4)): 53 | with gen_utils.Profiler(gen_utils.ProfilerType.GENERATE, name) as p: 54 | bucket = "" 55 | bucketLength = sampleFromCounter(lengthDistribution) 56 | while len(bucket) < bucketLength: 57 | bucket += sampleFromCounter(counters[bucket[max(0, len(bucket)-4): len(bucket)]]) 58 | 59 | p.bucket(bucket) 60 | if bucket not in previouslySeen: 61 | previouslySeen.add(bucket) 62 | print('CAND:', bucket) 63 | beanstalkClient.put_job(f"generation/{name},{bucket}") 64 | numTrials -= 1 65 | 66 | 67 | 68 | if __name__ == "__main__": 69 | parser = argparse.ArgumentParser(description='Run the a character level n-grams.') 70 | gen_utils.addArguments(parser) 71 | args = parser.parse_args() 72 | candidates = gen_utils.getStartBucketNames(args) 73 | generateCandidates(name=args.name, startingCandidates=candidates, public=args.public, numTrials=int(args.num_trials) or float("inf")) 74 | -------------------------------------------------------------------------------- /bucket_generation/generators/n_grams/guesser.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | from collections import Counter, defaultdict 4 | import json 5 | import random 6 | import re 7 | import time 8 | 9 | import numpy as np 10 | from pystalk import BeanstalkClient 11 | 12 | from bucket_extraction import getBucketsFromText 13 | import bucket_generation.utils as generation_utils 14 | 15 | 16 | def generateNGrams(candidates): 17 | ngrams = defaultdict(lambda: Counter()) 18 | lengthDistribution = Counter() 19 | delimiterDistribution = Counter() 20 | for bucket in candidates: 21 | splitBucket = re.split(r'([\.|\-|_])', bucket.lower().strip()) 22 | delimiterDistribution.update( 23 | [ 24 | d for d in splitBucket if d in [".", "-", "_"] 25 | ] 26 | ) 27 | tokens = [t for t in splitBucket if t not in [".", "-", "_"]] 28 | for i in range(len(list(tokens))): 29 | ngrams[ 30 | tuple(tokens[max(0, i-1): i]) 31 | ][tokens[i]] += 1 32 | lengthDistribution[len(tokens)] += 1 33 | return ngrams, lengthDistribution, delimiterDistribution 34 | 35 | def sampleFromCounter(counter): 36 | total = sum(counter.values()) 37 | return np.random.choice([k for k,v in counter.items()], p=[v/total for k,v in counter.items()]) 38 | 39 | def streamNGramCandidates( 40 | startingCandidates=None, beanstalkPort=None, numTrials=float("inf"), name="ngrams", experiment=False, public=False, 41 | ): 42 | candidates = startingCandidates or generation_utils.getExistingBuckets(public=public) 43 | previouslySeen = startingCandidates | generation_utils.readBucketsFromFile(f"./data/generation/{name}.txt") 44 | beanstalkClient = generation_utils.getBeanstalkClient(port=beanstalkPort) 45 | 46 | while numTrials > 0: 47 | # Update our prior distribution for every 10,000 candidates. 48 | print("Initializing bigram distribution.") 49 | 50 | with generation_utils.Profiler(generation_utils.ProfilerType.TRAIN, name): 51 | if experiment: 52 | # add all existing buckets that have been guessed by ngrams and are in seed set. 
53 | candidates |= generation_utils.getExistingAlreadyGuessedBuckets(name, public=public) 54 | nGrams, lengthDistribution, delimiterDistribution = generateNGrams(candidates) 55 | 56 | 57 | for _ in range(int(1e4)): 58 | with generation_utils.Profiler(generation_utils.ProfilerType.GENERATE, name) as p: 59 | bucket = [] 60 | bucketLength = sampleFromCounter(lengthDistribution) 61 | for _ in range(bucketLength): 62 | if len(bucket) > 0: 63 | bucket += [sampleFromCounter(delimiterDistribution)] 64 | ngramsKey = tuple(bucket[-2:-1]) 65 | if ngramsKey in nGrams: 66 | bucket += sampleFromCounter(nGrams[ngramsKey]) 67 | bucket = "".join(bucket) 68 | p.bucket(bucket) 69 | if len(bucket) < 64 and bucket not in previouslySeen: 70 | previouslySeen.add(bucket) 71 | beanstalkClient.put_job("generation/{},{}".format(name, bucket)) 72 | print("Generated: " + bucket) 73 | numTrials -= 1 74 | 75 | if __name__ == "__main__": 76 | parser = argparse.ArgumentParser(description='Run the ngrams generator.') 77 | generation_utils.addArguments(parser) 78 | args = parser.parse_args() 79 | candidates = generation_utils.getStartBucketNames(args) 80 | streamNGramCandidates(name=args.name, startingCandidates=candidates, public=args.public, numTrials=int(args.num_trials) or float("inf")) -------------------------------------------------------------------------------- /bucket_generation/generators/continella/guesser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate bucket names according to the algorithms specified in Continella et al: 3 | https://re.public.polimi.it/retrieve/handle/11311/1065367/314518/2018-continella-bucketsec.pdf 4 | 5 | - Enumerate all 3/4 character combinations 6 | - Enumerate word mutations 7 | """ 8 | import argparse 9 | from enum import Enum, auto 10 | import random 11 | import string 12 | 13 | 14 | import bucket_generation.utils as utils 15 | 16 | def generateRandomThreeOrFour(beanstalkPort=None, numTrials=float("inf"), candidates=None, name="continella_threefour"): 17 | beanstalkClient = utils.getBeanstalkClient(port=beanstalkPort) 18 | allPossibleCandidates = set() 19 | for char1 in list(string.ascii_lowercase) + [""]: 20 | for char2 in list(string.ascii_lowercase): 21 | for char3 in list(string.ascii_lowercase): 22 | for char4 in list(string.ascii_lowercase): 23 | allPossibleCandidates.add("".join([char1,char2,char3,char4])) 24 | candidates = random.sample(allPossibleCandidates, min(numTrials, len(allPossibleCandidates))) 25 | for candidate in candidates: 26 | beanstalkClient.put_job("generation/{},{}".format(name, candidate)) 27 | 28 | class Mutation(Enum): 29 | 30 | DELETE = auto() 31 | DUPLICATE = auto() 32 | CONCATENATE = auto() 33 | END = auto() 34 | 35 | def mutateWords(beanstalkPort=None, numTrials=float("inf"), name="continella_dictionary"): 36 | with open("/usr/share/dict/words", "r") as f: 37 | dictionary = [ 38 | l.strip().lower() for l in f.readlines() 39 | if l.strip().isalnum() and all(ord(c) < 128 for c in l.strip()) # alphanumeric ascii characters 40 | ] 41 | prevCandidates = utils.readBucketsFromFile("./data/generation/{}.txt".format(name)) 42 | beanstalkClient = utils.getBeanstalkClient(port=beanstalkPort) 43 | 44 | while numTrials > 0: 45 | with utils.Profiler(utils.ProfilerType.GENERATE, name) as p: 46 | word = random.choice(dictionary) 47 | mutation = random.choice([mutation for mutation in Mutation]) 48 | assert any([mutation == m for m in Mutation]), "wrong equals" 49 | while mutation != Mutation.END: 50 | if 
mutation == Mutation.DELETE and len(word) > 1: 51 | deletedCharIdx = random.randint(0, len(word) - 1) 52 | word = word[:deletedCharIdx] + word[deletedCharIdx + 1:] 53 | elif mutation == Mutation.DUPLICATE and len(word) < 63: # Can't exceed limit. 54 | dupCharIdx = random.randint(0, len(word) - 1) 55 | word = word[:dupCharIdx] + 2 * word[dupCharIdx] + word[dupCharIdx + 1:] 56 | elif mutation == Mutation.CONCATENATE: 57 | otherWord = random.choice(dictionary) 58 | if len(word) + len(otherWord) < 63: 59 | if random.random() < .5: 60 | word += otherWord 61 | else: 62 | word = otherWord + word 63 | mutation = random.choice([mutation for mutation in Mutation]) 64 | p.bucket(word) 65 | if word not in prevCandidates: 66 | print(word) 67 | prevCandidates.add(word) 68 | beanstalkClient.put_job("generation/{},{}".format(name, word)) 69 | numTrials -= 1 70 | 71 | if __name__ == "__main__": 72 | parser = argparse.ArgumentParser(description='Run the continella experiments.') 73 | utils.addArguments(parser) 74 | parser.add_argument("--mutateWords", action="store_true", help="Run the mutateWords experiment.") 75 | parser.add_argument("--generateRandom34", action="store_true", help="Generate all 3/4 character sequences.") 76 | args = parser.parse_args() 77 | assert args.mutateWords != args.generateRandom34, "One of --mutateWords and --generateRandom34 must be selected." 78 | 79 | if args.mutateWords: 80 | mutateWords(numTrials=int(args.num_trials) or float("inf"), name=args.name) 81 | elif args.generateRandom34: 82 | generateRandomThreeOrFour(name=args.name) 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /bucket_generation/generators/pcfg/guesser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inspired from https://web.eecs.utk.edu/~mschucha/netsec/readings/cfgPass.pdf 3 | Predict token based on distribution of tokens bucketd in types as CFG. 4 | To begin, we will just do a really primitive survey: 5 | break bucket into delimiters, characters of a certain length, and 6 | numbers of certain length. We can definitely extend this to 7 | with more specific types. 
8 | Types : Ci -> i characters, Ni -> i numbers, 9 | B -> T-T-T 10 | """ 11 | import argparse 12 | from collections import Counter 13 | import re 14 | 15 | import numpy as np 16 | 17 | from bucket_extraction.utils.extract_utils import getBucketsFromText 18 | import bucket_generation.utils as gen_utils 19 | 20 | templates = Counter() 21 | C = {} 22 | N = {} 23 | for i in range(64): 24 | C[str(i)] = Counter() 25 | N[str(i)] = Counter() 26 | 27 | def getType(c): 28 | if c.isalpha(): 29 | return 'C' 30 | if c.isnumeric(): 31 | return 'N' 32 | else: 33 | return c 34 | 35 | def updateCounters(bucket): 36 | template = '' 37 | while len(bucket) > 0: 38 | ci = re.search('([a-z]*)', bucket).group() 39 | if len(ci) > 0: 40 | template += 'C' + str(len(ci)) 41 | C[str(len(ci))][ci] += 1 42 | bucket = bucket[len(ci):] 43 | continue 44 | 45 | ni = re.search('([0-9]*)', bucket).group() 46 | if len(ni) > 0: 47 | template += 'N' + str(len(ni)) 48 | bucket = bucket[len(ni):] 49 | N[str(len(ni))][ni] += 1 50 | continue 51 | 52 | other = re.search('([^a-z0-9]*)', bucket).group() 53 | template += other 54 | bucket = bucket[len(other):] 55 | templates[template] += 1 56 | 57 | def sampleFromCounter(counter): 58 | total = sum(counter.values()) 59 | return np.random.choice([k for k,v in counter.items()], p=[v/total for k,v in counter.items()]) 60 | 61 | def generatePCFGCandidates(name="pcfg", startingCandidates=None, beanstalkPort=None, numTrials=float("inf"), public=False): 62 | beanstalkClient = gen_utils.getBeanstalkClient(port=beanstalkPort) 63 | candidates = startingCandidates or gen_utils.getExistingBuckets(public=public) 64 | previouslySeen = startingCandidates | gen_utils.readBucketsFromFile(f"./data/generation/{name}.txt") 65 | 66 | # Randomly generate template according to distro 67 | while numTrials > 0: 68 | print("Updating PCFG.") 69 | # In intervals of 10,000 guesses, update our PCFG from our successful guesses. 
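# Illustrative example with a hypothetical seed name: updateCounters("logs2020-backup") records the template C4N4-C6
# and counts "logs" under C["4"], "2020" under N["4"], and "backup" under C["6"]; the loop below then samples a
# template from `templates` and fills each Ci/Ni slot from the matching length-indexed counter.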
70 | with gen_utils.Profiler(gen_utils.ProfilerType.TRAIN, name): 71 | candidates = startingCandidates | gen_utils.getExistingAlreadyGuessedBuckets(name, public=public) 72 | for candidate in candidates: 73 | updateCounters(candidate.strip().lower()) 74 | 75 | 76 | for _ in range(int(1e4)): 77 | with gen_utils.Profiler(gen_utils.ProfilerType.GENERATE, name) as p: 78 | template = sampleFromCounter(templates) 79 | print(template) 80 | bucket = '' 81 | while len(template) > 0: 82 | if template[0] == 'C': 83 | ni = re.search('([0-9]*)', template[1]).group() 84 | i = ni 85 | try: 86 | bucket += sampleFromCounter(C[i]) 87 | except KeyError: 88 | import pdb 89 | pdb.set_trace() 90 | template = template[1+len(ni):] 91 | elif template[0] == 'N': 92 | ni = re.search('([0-9]*)', template[1]).group() 93 | i = ni 94 | template = template[1+len(ni):] 95 | bucket += sampleFromCounter(N[i]) 96 | else: 97 | bucket += template[0] 98 | template = template[1:] 99 | p.bucket(bucket) 100 | if bucket not in previouslySeen: 101 | previouslySeen.add(bucket) 102 | print('CAND:', bucket) 103 | beanstalkClient.put_job(f"generation/{name},{bucket}") 104 | numTrials -= 1 105 | 106 | 107 | 108 | if __name__ == "__main__": 109 | parser = argparse.ArgumentParser(description='Run the PCFG generator.') 110 | gen_utils.addArguments(parser) 111 | args = parser.parse_args() 112 | candidates = gen_utils.getStartBucketNames(args) 113 | generatePCFGCandidates(name=args.name, startingCandidates=candidates, public=args.public, numTrials=int(args.num_trials) or float("inf")) 114 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stratosphere 2 | 3 | Stratosphere uses password generation algorithms to discover publicly accessible cloud storage buckets. Stratosphere includes infrastructure for extracting, generating, and validating bucket names across Amazon S3, Google Cloud Storage, and Alibaba's Object Storage Service. 4 | 5 | For more information about Stratosphere, please check out our [research paper](https://zakird.com/papers/s3.pdf). 6 | 7 | ![](images/flow.jpg) 8 | 9 | ## Installation 10 | 11 | 1. Install Python dependencies: `pip install -r requirements.txt` 12 | 2. Install Go dependencies: `cd bucket_validation && go get` 13 | 3. Install [ZMap](https://github.com/zmap/zmap) and [ZGrab2](https://github.com/zmap/zgrab2) 14 | 4. Install [beanstalkd](https://beanstalkd.github.io/) 15 | 5. Run `cp .env.example .env` and configure relevant API keys 16 | 6. Configure `bucket_validation/listener-config.json` with source IPs, if you would like to use more than one source IP 17 | 18 | ## Usage 19 | 20 | ### Extraction 21 | 22 | The extraction phase gathers buckets to seed generation algorithms. All extractors will write candidate bucket names to files in `./data/extraction/`, which can then be run through the validator to collect valid bucket names. 23 | 24 | Examples to extract buckets from various sources: 25 | 26 | Bing: `python main.py --bing` 27 | 28 | Farsight: `python main.py --farsight --domain s3.amazonaws.com` (a file of domain names can be provided via `python main.py --farsight -f ./file.txt`) 29 | 30 | GrayHat Warfare: `python main.py --grayhatwarfare` 31 | 32 | VirusTotal is a 3-part process: 33 | 34 | 1. Run `python main.py --virustotal --ips` to fetch S3 IP blocks (similar IP ranges can be found for Google Cloud Storage and Alibaba) 35 | 2. 
Run `python main.py --virustotal --pingAll` to ping all IP addresses via ZMap 36 | 3. Run `python main.py --virustotal --lookup -n 10000` where `-n` is the maximum number of IPs to be validated (to allow running in batches) 37 | 38 | Lastly, you may bring your own data sources. To use unvalidated data (e.g. buckets that may or may not exist), call `feedToValidator` to validate buckets (see "Validating extracted buckets" below). To use validated data, create a folder in `data/validation` with a unique name. Place private buckets in `private.txt` and public buckets in `public.txt`. As always, invoke `gather_all_buckets.sh` in `final_output` to combine found buckets. 39 | 40 | ### Validation 41 | 42 | The validation phase fetches buckets to check whether the bucket exists, and if the bucket is public or private. The source can either be extracted buckets or generated buckets. 43 | 44 | The validator will output bucket names in `./data/validation/`, with folders for each platform. Within each platform folder, the validator will write files `public.txt`, `private.txt`, and `no_such_bucket.txt` to indicate the response received for each bucket. 45 | 46 | 1. Run beanstalk in the background: `./beanstalkd -l 127.0.0.1 -p 11301 &` 47 | 2. Run listener: `go run bucket_validation/listener.go` 48 | 49 | We recommend running the listener in a separate shell, such as [screen](https://www.gnu.org/software/screen/), for debugging. 50 | 51 | The listener will continually poll the Beanstalk queue and can be left running. 52 | 53 | #### Validating extracted buckets 54 | 55 | To validate extracted buckets, run `python main.py --feedToValidator -f data/extraction/bing/buckets_output.txt --label bing`, where `-f` is the name of the file containing buckets and `--label` is a label to identify the source. 56 | 57 | This will feed all found buckets to the Beanstalk queue, which will be processed by the listener. 58 | 59 | #### Combining buckets 60 | 61 | In order to combine buckets, the `gather_all_buckets.sh` script in `final_output` can be run to aggregate and deduplicate found buckets across all three sources. This will create three files: `all_platforms_private.txt`, which contains all private buckets, `all_platforms_public.txt`, which contains all public buckets, and `all_platforms_all.txt` which contains all buckets across all platforms. 62 | 63 | ### Generation 64 | 65 | The generation phase generates new bucket names based on previously seen buckets. 66 | 67 | The generators rely on the `all_platforms_private.txt` and `all_platforms_public.txt` files in `final_output`. Thus, after running the validator on extracted sources, be sure to run `final_output/gather_all_buckets.sh` to generate these files. 68 | 69 | In order to run the generators, you will need to add the project to your PYTHONPATH. 
You can do this by running the following: 70 | 71 | ``` 72 | source bucket_generation/add_to_path.sh 73 | python bucket_generation/generators/<generator>/guesser.py [--public] [--num_trials N] 74 | ``` 75 | 76 | Examples to generate buckets using different algorithms: 77 | 78 | LSTM RNN Generator: `python bucket_generation/generators/rnn/guesser.py rnn --stream --forward` 79 | 80 | LSTM RNN Train: `python bucket_generation/generators/rnn/guesser.py rnn --train --forward` 81 | 82 | Token PCFG: `python bucket_generation/generators/token_pcfg/guesser.py` 83 | 84 | Character 5-Grams: `python bucket_generation/generators/character_grams/guesser.py` 85 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from gevent import monkey as curious_george 2 | curious_george.patch_all(thread=False, select=False) 3 | 4 | from dotenv import load_dotenv 5 | load_dotenv() 6 | 7 | from bucket_extraction.bing.bing import getBucketsFromBing 8 | from bucket_extraction.virustotal.virustotal import getS3IPs 9 | from bucket_extraction.grayhatwarfare.grayhatwarfare import getGrayhatWarfare 10 | from bucket_extraction.virustotal.virustotal import runZmap 11 | from bucket_extraction.virustotal.virustotal import lookup 12 | from bucket_generation.generators.n_grams.n_grams import streamNGramCandidates 13 | from bucket_generation.evaulator.evaluate_performance import evaluatePerformance, fetchExtractedNames 14 | 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser(description='Stratosphere: Discover public cloud storage buckets') 18 | parser.add_argument('--backward', help='Train the RNN backwards', action='store_true') 19 | parser.add_argument('--bing', help='Get S3 bucket names from Bing API searches', action='store_true') 20 | parser.add_argument('--eval', help='Evaluate generator performance with provided date of form %%Y_%%M_%%d.', type=str) 21 | parser.add_argument('--fetch', help='Load dataset of JUST extracted bucket names.', action='store_true') 22 | parser.add_argument('--lookup', help='Checks all IPs against VirusTotal', action='store_true') 23 | parser.add_argument('--ips', help='Fetch S3 IPs for Zmap', action='store_true') 24 | parser.add_argument('-n', help='Number of requests to make', type=int,) 25 | parser.add_argument('-i', help='Network interface for zgrab to use') 26 | parser.add_argument('--label', help='Output filename label') 27 | parser.add_argument('--ngrams', help='Generate candidates using NGrams', action='store_true') 28 | parser.add_argument('--ngrams2', help='Generate candidates using NGrams', action='store_true') 29 | parser.add_argument('--ngrams3', help='Generate candidates using NGrams', action='store_true') 30 | parser.add_argument('--templates', help='Template bucket names', action='store_true') 31 | parser.add_argument('-f', help='Path to buckets_output.txt file', type=str) 32 | parser.add_argument('--type', help='Service type', type=str) 33 | parser.add_argument('--rnn', help='Run RNN LSTM generator.', action='store_true') 34 | parser.add_argument('--train', help='Train', action='store_true') 35 | parser.add_argument('--stream', help='Continuously stream candidates', action='store_true') 36 | parser.add_argument('-r', help='Number of candidates per second to generate', type=int) 37 | parser.add_argument('--pingAll', help='Ping all S3 IPs with Zmap', action='store_true') 38 | parser.add_argument('--pcfg', help='Run PCFG generator', action='store_true') 39 | 
parser.add_argument('--token', help='Run token PCFG generator', action='store_true') 40 | parser.add_argument('--farsight', help='Get bucket names from FarSight', action='store_true') 41 | parser.add_argument('--virustotal', help='Get S3 bucket names from VirusTotal', action='store_true') 42 | parser.add_argument('--grayhatwarfare', help='Get bucket names from grayhatwarfare', action='store_true') 43 | parser.add_argument('--feedToValidator', help='Send buckets_output.txt candidates to validator', action='store_true') 44 | parser.add_argument('--replayExisting', help='Replay existing buckets with a new domain', action='store_true') 45 | parser.add_argument('--domain', help='Root domain to look up', type=str) 46 | 47 | args = parser.parse_args() 48 | if args.bing: 49 | getBucketsFromBing() 50 | elif args.grayhatwarfare: 51 | getGrayhatWarfare() 52 | elif args.virustotal: 53 | if args.ips: 54 | getS3IPs() 55 | if args.pingAll: 56 | runZmap() 57 | if args.lookup: 58 | lookup(args.n) 59 | elif args.farsight: 60 | if args.f: 61 | from bucket_extraction.farsight.farsight import lookupFile 62 | lookupFile(args.f, args.type) 63 | else: 64 | from bucket_extraction.farsight.farsight import lookup 65 | lookup([args.domain]) 66 | elif args.ngrams: 67 | streamNGramCandidates(args.r) 68 | elif args.ngrams2: 69 | from bucket_generation.generators.n_grams2.n_grams2 import streamNGrams2 70 | streamNGrams2(args.r) 71 | elif args.ngrams3: 72 | from bucket_generation.generators.n_grams2.n_grams2 import streamNGrams3 73 | streamNGrams3(args.r) 74 | elif args.templates: 75 | from bucket_generation.generators.templates.templates import steamCandidates 76 | steamCandidates(args.n) 77 | elif args.replayExisting: 78 | from bucket_generation.replayExisting import replayExisting 79 | replayExisting(args.f, args.label) 80 | elif args.rnn: 81 | from bucket_generation.generators.rnn.rnn import streamRNNGuesses, runTraining 82 | if args.stream: 83 | streamRNNGuesses(not args.backward) 84 | if args.train: 85 | runTraining(not args.backward) 86 | elif args.feedToValidator: 87 | from bucket_extraction.feed_to_validator.feed_to_validator import feedToValidator 88 | feedToValidator(args.f, args.label) 89 | elif args.pcfg: 90 | from bucket_generation.generators.pcfg.guesser import generatePCFGCandidates 91 | generatePCFGCandidates() 92 | elif args.token: 93 | from bucket_generation.generators.token_pcfg.guesser import generatePCFGCandidates 94 | generatePCFGCandidates() 95 | elif args.eval: 96 | evaluatePerformance(args.eval) 97 | elif args.fetch: 98 | fetchExtractedNames() 99 | else: 100 | print("Error: command not found.") 101 | parser.print_help() 102 | 103 | -------------------------------------------------------------------------------- /bucket_generation/utils.py: -------------------------------------------------------------------------------- 1 | from argparse import Action 2 | from enum import Enum, auto 3 | import random 4 | import time 5 | from utils import getBucketsFromText 6 | from pystalk import BeanstalkClient 7 | import json 8 | import argparse 9 | 10 | 11 | def randLines(file, n): 12 | """ 13 | Grabs n random lines in a file using Algorithm R. 
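(Reservoir sampling: the first n lines fill the reservoir, and each later line overwrites a randomly chosen slot with steadily decreasing probability, so a bounded, roughly uniform sample is kept without loading the whole file into memory.)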
14 | """ 15 | lines = [] 16 | for num, line in enumerate(file): 17 | if num < n: 18 | lines.append(line) 19 | else: 20 | randNum = random.randrange(0, num) 21 | if randNum < n: 22 | lines[randNum] = line 23 | return lines 24 | 25 | def readFile(file, removeSuffix=False): 26 | result = [] 27 | with open(file, 'r') as f: 28 | for line in f: 29 | if line is not None: 30 | stripped = line.strip() 31 | if removeSuffix and ".s3.amazonaws.com" in stripped: 32 | stripped = stripped.split(".s3.amazonaws.com")[0] 33 | result.append(stripped) 34 | return result 35 | 36 | def readFullBucketNamesFromFile(path, timePeriod=None): 37 | result = set([]) 38 | firstTimestamp = None 39 | with open(path, 'r') as f: 40 | for line in f: 41 | splitLine = line.strip().split(",") 42 | if len(splitLine) == 2: 43 | bucket, timestamp = splitLine 44 | timestamp = int(timestamp) 45 | if timePeriod: 46 | if firstTimestamp: 47 | if timestamp - firstTimestamp > timePeriod: 48 | print(path, timestamp, firstTimestamp) 49 | break 50 | else: 51 | firstTimestamp = timestamp 52 | result.add(bucket) 53 | return result 54 | 55 | class BucketType(Enum): 56 | PUBLIC = auto() 57 | PRIVATE = auto() 58 | 59 | def getFullBuckets(bucketType): 60 | assert ( 61 | bucketType == BucketType.PUBLIC or bucketType == BucketType.PRIVATE 62 | ), "Bucket type must be one of PUBLIC/PRIVATE" 63 | if bucketType == BucketType.PUBLIC: 64 | return readFullBucketNamesFromFile("./final_output/all_platforms_public.txt") 65 | if bucketType == BucketType.PRIVATE: 66 | return readFullBucketNamesFromFile("./final_output/all_platforms_private.txt") 67 | 68 | class GeneratorBeanstalkClient(BeanstalkClient): 69 | 70 | def __init__(self, address, port): 71 | super().__init__(address, port) 72 | 73 | def put_job(self, string, **kwargs): 74 | # first, check if the job queue isn't too big. 75 | # if so, sleep proportional to size so that we should slow down at around 10M queue size. 76 | jobsReady = super().stats()["current-jobs-ready"] 77 | time.sleep(jobsReady/1e5) 78 | super().put_job(string, **kwargs) 79 | 80 | 81 | def getExistingAlreadyGuessedBuckets(name, public=False): 82 | """ 83 | Given the generator name, load the dataset comprised 84 | of already guessed buckets from that generator that also happen to exist. 85 | :param: The generator name. 86 | """ 87 | return getExistingBuckets(public=public) & \ 88 | readBucketsFromFile(f"./data/generation/{name}.txt") 89 | 90 | def getExistingBuckets(public=False): 91 | filePath = './final_output/all_platforms_all.txt' 92 | if public: 93 | filePath = "./final_output/all_platforms_public.txt" 94 | return readBucketsFromFile(filePath) 95 | 96 | def readBucketsFromFile(path): 97 | try: 98 | with open(path, "r") as f: 99 | return set([bucket for line in f.readlines() for bucket in getBucketsFromText(line)]) 100 | except FileNotFoundError: 101 | return set() 102 | 103 | 104 | def getStartBucketNames(args): 105 | if args.experiment: 106 | import bucket_generation.evaulator.evaluate_performance as evaluate_performance 107 | candidates = evaluate_performance.loadExtractedNames("2020_07_20") 108 | if args.public: 109 | candidates &= getExistingBuckets(public=True) 110 | return candidates 111 | return None 112 | 113 | class ProfilerType(Enum): 114 | TRAIN = "train" 115 | GENERATE = "generate" 116 | 117 | class Profiler: 118 | 119 | def __init__(self, profilerType, name): 120 | assert profilerType in ProfilerType, "Don't know where to write these profiled results." 
121 | assert type(name) == str, "Name must be a string corresponding to the profiler." 122 | self.type = profilerType 123 | self.name = name 124 | self.bucket_name = "" 125 | 126 | def __enter__(self): 127 | self.start = time.process_time() 128 | return self 129 | 130 | def bucket(self, bucket): 131 | self.bucket_name = bucket 132 | 133 | def __exit__(self, exc_type, exc_vlaue, exc_tb): 134 | self.end = time.process_time() 135 | with open(f"./data/timing/{self.type.value}/{self.name}", "a+") as f: 136 | f.write(f"{self.bucket_name},{self.end - self.start},{time.time()}\n") 137 | 138 | 139 | def getBeanstalkClient(port=None): 140 | """ 141 | Start up a beanstalkclient. 142 | """ 143 | config = {} 144 | if not port: 145 | with open('./bucket_validation/listener-config.json', 'r') as f: 146 | config = json.load(f) 147 | port = config["BeanstalkHost"].split(":")[1] 148 | return GeneratorBeanstalkClient("127.0.0.1", port) 149 | 150 | 151 | def getPreviousCandidates(): 152 | candidates = set() 153 | with open('./final_output/all_platforms_public.txt', 'r') as f: 154 | for line in f: 155 | try: 156 | cands = list(getBucketsFromText(line)) 157 | if len(cands) > 0: 158 | candidates.add(cands[0]) 159 | except Exception as e: 160 | pass 161 | return candidates 162 | 163 | def addArguments(parser): 164 | parser.add_argument("name", type=str, help="generator identifier") 165 | parser.add_argument("--num_trials", type=str, help="Number of trials to run generator.") 166 | parser.add_argument("--port", type=int, help="The beanstalk job queue port.") 167 | parser.add_argument("--public", action="store_true", help="Only load the public buckets in our models.") -------------------------------------------------------------------------------- /bucket_generation/generators/token_pcfg/guesser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inspired from https://web.eecs.utk.edu/~mschucha/netsec/readings/cfgPass.pdf 3 | Predict token based on distribution of tokens bucketd in types as CFG. 4 | To begin, we will just do a really primitive survey: 5 | break bucket into delimiters, characters of a certain length, and 6 | numbers of certain length. We can definitely extend this to 7 | with more specific types. 8 | Types : Ci -> i characters, Ni -> i numbers, 9 | B -> T-T-T 10 | """ 11 | import argparse 12 | from collections import Counter 13 | from enum import Enum, auto 14 | import re 15 | 16 | import numpy as np 17 | 18 | from bucket_extraction.utils.extract_utils import getBucketsFromText 19 | import bucket_generation.utils as gen_utils 20 | 21 | class Type(Enum): 22 | """ 23 | Token types. 
These source files are sourced from the data/aux text files 24 | """ 25 | TECH = auto() # tech term from 26 | TEMPLATE = auto() 27 | WORD = auto() 28 | TLD = auto() 29 | FILE = auto() 30 | NUMBER = auto() 31 | COMPOUND = auto() 32 | DOMAIN = auto() 33 | OTHER = auto() 34 | 35 | def getType(token, tech_terms, suffixes, file_extensions, domains, dictionary_words): 36 | """ 37 | :param: token a string that we are tying to determine its type 38 | :params: sets of strings that we will check for membership 39 | :return: the token type as an enum 40 | """ 41 | if len(token) == 0: 42 | return '' 43 | if token in tech_terms: 44 | return Type.TECH 45 | if token in suffixes: 46 | return Type.TLD 47 | elif token in file_extensions or token[:-1] in file_extensions: 48 | return Type.FILE 49 | elif token in domains or token[:-1] in domains: 50 | return Type.DOMAIN 51 | elif token in dictionary_words or token[:-1] in dictionary_words: 52 | return Type.WORD 53 | elif token.isdigit(): 54 | return Type.NUMBER 55 | else: 56 | for i in range(len(token)): 57 | if token[:i] in dictionary_words and token[i:] in dictionary_words: 58 | return Type.COMPOUND 59 | return Type.OTHER 60 | 61 | def loadTypeSets(): 62 | """ 63 | Reads the type sets from the right text files and returns their sets 64 | """ 65 | # ./data/aux/dictionary.txt' -- from https://github.com/dwyl/english-words/ 66 | # instead, let's use SCOWL like Continella for better comparison. (/usr/share/dict/wodrds) 67 | with open('/usr/share/dict/words') as f: 68 | words = set([l.lower().strip() for l in f.readlines()]) 69 | 70 | # TLDs but a little less strict (i.e. co.uk is not a TLD but a common suffix) 71 | # from https://publicsuffix.org/list/public_suffix_list.dat 72 | with open('./data/aux/public_suffix_list.dat') as f: 73 | suffixes = set([l.lower().strip() for l in f.readlines()][1:]) 74 | 75 | # Just sourced manually via the "Other" section 76 | with open('./data/aux/tech_terms.txt') as f: 77 | techTerms = set([l.lower().strip() for l in f.readlines()]) 78 | 79 | # From https://s3-us-west-1.amazonaws.com/umbrella-static/index.html 80 | with open('./data/aux/top-1e5-domains.txt') as f: 81 | domains = set([line.split('.')[-2] for line in f.readlines() if len(line.split('.')) >= 2]) 82 | 83 | # sourced manually form wikipedia: https://en.wikipedia.org/wiki/List_of_file_formats 84 | with open('./data/aux/wikipedia-file-extensions.txt') as f: 85 | files = set([l.lower().strip() for l in f.readlines()][1:]) 86 | return techTerms, suffixes, files, domains, words 87 | 88 | 89 | def updateCounters(buckets): 90 | """ 91 | Generate distributions for each CFG node 92 | :return: a counter for templates, dictionary words, tech words, files, domains, compound words, TLDS, and numbers 93 | """ 94 | 95 | techTerms, suffixes, files, domains, words = loadTypeSets() 96 | 97 | counters = { key: Counter() for key in Type } 98 | delimiters = re.compile('[-._]') 99 | for bucket in buckets: 100 | tokens = delimiters.split(bucket.lower()) 101 | 102 | bucketDelimiters = list(delimiters.finditer(bucket)) 103 | template = '' 104 | for i, token in enumerate(tokens): 105 | tokenType = getType(token, techTerms, suffixes, files, domains, words) 106 | if tokenType != '': 107 | template += tokenType.name 108 | counters[tokenType][token] += 1 109 | if i != len(tokens) - 1: 110 | template += bucketDelimiters[i].group() 111 | 112 | counters[Type.TEMPLATE][template] += 1 113 | return counters 114 | 115 | 116 | def sampleFromCounter(counter): 117 | total = sum(counter.values()) 118 | 
return np.random.choice([k for k,v in counter.items()], p=[v/total for k,v in counter.items()]) 119 | 120 | def generatePCFGCandidates(startingCandidates=None, beanstalkPort=None, name="token_pcfg", numTrials=float("inf"), public=False): 121 | beanstalkClient = gen_utils.getBeanstalkClient(port=beanstalkPort) 122 | previouslySeen = startingCandidates | gen_utils.readBucketsFromFile(f"./data/generation/{name}.txt") 123 | 124 | # Randomly generate template according to distro 125 | delimiters = re.compile('[-._]') 126 | while numTrials > 0: 127 | 128 | # Every 10,000 guesses, update the PCFG. 129 | print("Updating PCFG.") 130 | with gen_utils.Profiler(gen_utils.ProfilerType.TRAIN, name): 131 | candidates = startingCandidates | gen_utils.getExistingAlreadyGuessedBuckets(name, public=public) 132 | counters = updateCounters(candidates) 133 | 134 | for i in range(int(1e4)): 135 | with gen_utils.Profiler(gen_utils.ProfilerType.GENERATE, name) as p: 136 | template = sampleFromCounter(counters[Type.TEMPLATE]) 137 | tokens = delimiters.split(template) 138 | templateDelimiters = list(delimiters.finditer(template)) 139 | bucket = '' 140 | for idx, token in enumerate(tokens): 141 | if token != '': 142 | bucket += sampleFromCounter(counters[Type[token]]) 143 | if idx != len(tokens) - 1: 144 | bucket += templateDelimiters[idx].group() 145 | p.bucket(bucket) 146 | if bucket not in previouslySeen: 147 | numTrials -= 1 148 | previouslySeen.add(bucket) 149 | print('CAND:', bucket) 150 | beanstalkClient.put_job(f"generation/{name},{bucket}") 151 | 152 | if __name__ == "__main__": 153 | parser = argparse.ArgumentParser(description='Run the Token PCFG generator.') 154 | gen_utils.addArguments(parser) 155 | args = parser.parse_args() 156 | candidates = gen_utils.getStartBucketNames(args) 157 | generatePCFGCandidates(name=args.name, startingCandidates=candidates, public=args.public, numTrials=int(args.num_trials) or float("inf")) 158 | 159 | 160 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /bucket_generation/generators/rnn/guesser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Jack Cable and Drew Gregory 3 | LSTM Generator to produce S3 Bucket Candidate Names. 4 | This uses a one-to-many design so that we can generate names from scratch (only basing off of a single character). 
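The model is seeded with a starting character drawn from the observed starting-character distribution, then predicts one character at a time until the termination character '\r' is sampled (see generateText below).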
5 | Inspired by: 6 | - https://towardsdatascience.com/generating-text-using-a-recurrent-neural-network-1c3bfee27a5e 7 | - https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py 8 | - https://stackoverflow.com/questions/38714959/understanding-keras-lstms?rq=1 9 | Here was a useful answer to why the output dimensions are for multiple sequences: 10 | - https://stackoverflow.com/questions/43702481/why-does-keras-lstm-batch-size-used-for-prediction-have-to-be-the-same-as-fittin 11 | """ 12 | import argparse 13 | from datetime import date 14 | import json 15 | import numpy as np 16 | import random 17 | import time 18 | 19 | from keras.callbacks import ModelCheckpoint, LambdaCallback 20 | from keras.layers import Activation, Dense, Flatten, LSTM, Masking 21 | from keras.models import Sequential, load_model 22 | from keras.callbacks import ReduceLROnPlateau 23 | from keras.optimizers import RMSprop 24 | 25 | import bucket_generation.utils as generation_utils 26 | from bucket_extraction.utils.extract_utils import getBucketsFromText 27 | from bucket_generation.utils import getExistingAlreadyGuessedBuckets, getExistingBuckets 28 | 29 | 30 | beanstalkClient = None 31 | 32 | def sample(preds, temperature=1.0): 33 | # helper function to sample an index from a probability array 34 | preds = np.asarray(preds).astype('float64') 35 | preds = np.log(preds) / temperature 36 | exp_preds = np.exp(preds) 37 | preds = exp_preds / np.sum(exp_preds) 38 | probas = np.random.multinomial(1, preds, 1) 39 | return np.argmax(probas) 40 | 41 | def standardizeText(line, forward=True): 42 | """ 43 | Remove whitespace, lowercase, 44 | and end with termination character \r 45 | """ 46 | text = line.strip().lower()[:63] 47 | return (text if forward else text[::-1]) + '\r' 48 | 49 | def buildModel(uniqueChars): 50 | # This is because we have variable length input sequences and thus different 51 | # dimensions, see https://github.com/keras-team/keras/issues/6776 52 | 53 | hiddenUnits = 64 54 | model = Sequential() 55 | inShape = (64, uniqueChars) # bucket names need to be between 3-63 chars 56 | model.add( 57 | LSTM( 58 | hiddenUnits, input_shape=inShape, 59 | return_sequences=True, 60 | ) 61 | ) 62 | model.add(Flatten()) # https://github.com/keras-team/keras/issues/6351 63 | model.add(Dense(uniqueChars, activation='softmax')) 64 | optimizer = RMSprop(lr=0.01) 65 | model.compile(loss='categorical_crossentropy', optimizer=optimizer) 66 | return model 67 | 68 | def addNamesToCorpus(x,y, names, startingCharCounts, forward): 69 | for bucket_name in names: 70 | goodName = standardizeText(bucket_name,forward=forward) 71 | for i in range(len(goodName)-1): 72 | x.append(goodName[:i+1]) 73 | y.append(goodName[i+1]) 74 | startC = goodName[0] 75 | if startC not in startingCharCounts: 76 | startingCharCounts[startC] = 0 77 | startingCharCounts[startC] += 1 78 | 79 | 80 | def addNamesToCorpusFromFile(x,y, filename, startingCharCounts, forward): 81 | buckets = set(random.sample(generation_utils.readBucketsFromFile(filename), k=int(1e4))) 82 | addNamesToCorpus(x,y, buckets, startingCharCounts, forward) 83 | 84 | 85 | def generateText(startingCounts, model, indicesChar, charIndices, forward): 86 | startingChar = startingCounts[ 87 | sample([c[1] for c in startingCounts]) 88 | ][0] 89 | sentence = startingChar 90 | for _ in range(63): 91 | x_pred = np.zeros((1, 64, 40)) 92 | for t, char in enumerate(sentence): 93 | x_pred[0, t, charIndices[char]] = 1. 
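        # Predict the next-character distribution from the one-hot encoded prefix, then sample the next character from it.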
94 | preds = model.predict(x_pred, verbose=0)[0] 95 | nextIndex = sample(preds) 96 | if nextIndex == charIndices['\r']: 97 | if len(sentence) <= 3: 98 | continue 99 | else: 100 | break 101 | sentence += indicesChar[str(nextIndex)] 102 | return sentence if forward else sentence[::-1] 103 | 104 | def onEpochEnd(epoch, logs, startingCounts, model, indicesChar, charIndices, forward): 105 | print('FINISHED EPOCH', epoch) 106 | for _ in range(10): 107 | print(generateText(startingCounts, model, indicesChar, charIndices, forward)) 108 | 109 | 110 | def trainModel( 111 | startingCharCounts, model, filepath, charIndices, indicesChar, forward, 112 | candidates=None, name=None, public=False): 113 | 114 | # Collect all bucket names and starting character distribution 115 | sentences = [] 116 | nextChars = [] 117 | candidates = candidates or getExistingBuckets(public=public) 118 | candidates |= getExistingAlreadyGuessedBuckets(name, public=public) 119 | 120 | # This many candidates wouldn't fit in memory, so let's grab 10,000 buckets at random. 121 | sampledBucketNames = random.sample( 122 | candidates, 123 | int(1e4) 124 | ) 125 | addNamesToCorpus(sentences, nextChars, sampledBucketNames, startingCharCounts, forward) 126 | 127 | 128 | x = np.zeros((len(sentences), 64, 40), dtype=np.bool) 129 | y = np.zeros((len(sentences), 40), dtype=np.bool) 130 | 131 | for i, sentence in enumerate(sentences): 132 | for t, char in enumerate(sentence): 133 | x[i, t, charIndices[char]] = 1 134 | y[i, charIndices[nextChars[i]]] = 1 135 | print('NUM SENTENCES', len(sentences)) 136 | startingCounts = list(startingCharCounts.items()) 137 | 138 | checkpoint = ModelCheckpoint(filepath, monitor='loss', 139 | verbose=1, save_best_only=True, 140 | mode='min') 141 | checkpoint_backup = ModelCheckpoint("{}.{}".format(filepath, date.today().strftime("%Y_%m_%d")),monitor='loss', 142 | verbose=1, save_best_only=True, 143 | mode='min') 144 | reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, 145 | patience=1, min_lr=0.00001) 146 | 147 | print_callback = LambdaCallback(on_epoch_end=lambda x,y: onEpochEnd( 148 | x, y,startingCounts, model, indicesChar, charIndices, forward 149 | )) 150 | callbacks = [print_callback, checkpoint, checkpoint_backup, reduce_lr] 151 | print('FITTING') 152 | print(len(x),len(y)) 153 | while True: 154 | try: 155 | model.fit(x, y, batch_size=1000, epochs=10, callbacks=callbacks, use_multiprocessing=True) 156 | break 157 | except OSError as e: 158 | # Just retry, after waiting some time. 159 | time.sleep(17) 160 | return model 161 | 162 | def makeGuesses(model, startingCharCounts, charIndices, indicesChar, forward, name="name", previous=None): 163 | candidates = previous or set() 164 | startingCounts = list(startingCharCounts.items()) 165 | for _ in range(10000): 166 | with generation_utils.Profiler(generation_utils.ProfilerType.GENERATE, name) as p: 167 | cand = generateText(startingCounts, model, indicesChar, charIndices, forward) 168 | p.bucket(cand) 169 | print(len(candidates)) 170 | if cand not in candidates: 171 | print('CAND', cand) 172 | beanstalkClient.put_job(f"generation/{name},{cand}") 173 | candidates.add(cand) 174 | else: 175 | print("ALREADY GUESSED") 176 | 177 | 178 | def runTraining(name="rnn", forward=True, filepath=None, candidates=None, public=False): 179 | chars = 40 180 | assert filepath, "No weights filepath provided." 
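    # Resume from previously saved weights when available; otherwise build a fresh model below.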
181 | try: 182 | model = load_model(filepath) 183 | except Exception as e: 184 | print("COULDNT LOAD MODEL", e) 185 | model = buildModel(chars) 186 | model.summary() 187 | charIndices = {} 188 | with open('./data/generation/rnn/charIndices.json') as f: 189 | charIndices = json.load(f,) 190 | indicesChar = {} 191 | with open('./data/generation/rnn/indicesChar.json') as f: 192 | indicesChar = json.load(f,) 193 | startingCharCounts = {} 194 | while True: 195 | with generation_utils.Profiler(generation_utils.ProfilerType.TRAIN, name): 196 | model = trainModel( 197 | startingCharCounts, model, filepath, charIndices, indicesChar,forward, 198 | candidates=candidates, 199 | name=name, public=public 200 | ) 201 | 202 | def streamRNNGuesses( 203 | forward=True, beanstalkPort=None, name="rnn", numTrials=None, weights_path=None, seedSet=None 204 | ): 205 | 206 | if not numTrials: 207 | numTrials = float("inf") 208 | 209 | global beanstalkClient 210 | beanstalkClient = generation_utils.getBeanstalkClient(port=beanstalkPort) 211 | 212 | filepath = weights_path 213 | charIndices = {} 214 | with open('./data/generation/rnn/charIndices.json') as f: 215 | charIndices = json.load(f,) 216 | indicesChar = {} 217 | with open('./data/generation/rnn/indicesChar.json') as f: 218 | indicesChar = json.load(f,) 219 | startingCharCounts = {} 220 | sentences = [] 221 | nextChars = [] 222 | numTrials /= 1e4 223 | previouslySeen = generation_utils.readBucketsFromFile(f"data/generation/{name}.txt") | (seedSet or set()) 224 | # This is just to load up the startingCharCounts. 225 | addNamesToCorpusFromFile(sentences, nextChars, './final_output/all_platforms_all.txt', startingCharCounts, forward) 226 | sentences = [] 227 | nextChars = [] 228 | while numTrials > 0: 229 | try: 230 | model = load_model(filepath) 231 | except Exception as e: 232 | print("COULDNT LOAD MODEL, WAITING A MINUTE", e) 233 | time.sleep(60) 234 | continue 235 | model.summary() 236 | makeGuesses(model, startingCharCounts, charIndices, indicesChar, forward, name=name, previous=previouslySeen) 237 | numTrials -= 1 238 | 239 | 240 | if __name__ == "__main__": 241 | parser = argparse.ArgumentParser(description='Train rnn.') 242 | generation_utils.addArguments(parser) 243 | parser.add_argument("--train", action="store_true", help="Train rnn instead of stream guesses.") 244 | parser.add_argument("--forward", action="store_true", help="Run the rnn in forward vs. backward mode.") 245 | parser.add_argument("--stream", action="store_true", help="Stream guesses based off of the model.") 246 | 247 | args = parser.parse_args() 248 | name = args.name or "rnn" 249 | assert args.train or args.stream, "Must have one of --stream or --train." 
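    # Weights files are keyed by generator name and direction, e.g. data/generation/rnn/rnn_weights_forward.hdf5.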
250 | weights_path = "data/generation/rnn/{}_weights_{}.hdf5".format( 251 | name, 252 | "forward" if args.forward else "backward", 253 | ) 254 | if args.stream: 255 | extractedCandidates = generation_utils.getStartBucketNames(args) if args.experiment else None 256 | streamRNNGuesses( 257 | beanstalkPort=args.port, 258 | forward=args.forward, 259 | name=name, 260 | numTrials=int(args.num_trials) or float("inf"), 261 | weights_path=weights_path, 262 | seedSet=extractedCandidates, 263 | ) 264 | elif args.train: 265 | 266 | extractedCandidates = generation_utils.getStartBucketNames(args) if args.experiment else None 267 | runTraining( 268 | name=name, 269 | forward=args.forward, 270 | filepath=weights_path, 271 | candidates=extractedCandidates, 272 | public=args.public, 273 | ) 274 | -------------------------------------------------------------------------------- /bucket_validation/listener.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | /* 4 | This Listener takes in lines of IP addresses as input and delgates their results 5 | to processes. 6 | */ 7 | 8 | import ( 9 | "bufio" 10 | "encoding/json" 11 | "fmt" 12 | "io" 13 | "log" 14 | "os" 15 | "os/exec" 16 | "os/signal" 17 | "strconv" 18 | "strings" 19 | "sync" 20 | "syscall" 21 | "time" 22 | 23 | "github.com/willf/bloom" 24 | 25 | "github.com/beanstalkd/go-beanstalk" 26 | "github.com/spf13/viper" 27 | ) 28 | 29 | // Configuration class allows you to specify the source ips and read limit (in KB) 30 | type Configuration struct { 31 | NumSenders int 32 | BeanstalkHost string 33 | ReadLimitPerHost int 34 | SourceIPs []string 35 | } 36 | 37 | /* 38 | Validator represents a process that takes in S3 hostnames and resolves their results to an 39 | output file. 40 | */ 41 | type Validator struct { 42 | stdIn io.WriteCloser 43 | stdOut io.ReadCloser 44 | cmd *exec.Cmd 45 | ip string 46 | lastResponseReceived time.Time 47 | lastResponseMutex sync.Mutex 48 | } 49 | 50 | //OutputFile represented the bucketed output types, abstracted just in case we want more member fields. 
51 | type OutputFile struct { 52 | f *os.File 53 | } 54 | 55 | var config Configuration 56 | 57 | // The list of hosts to try a bucket against if no host is provided 58 | var hosts []string 59 | 60 | // The list of hosts that are accepted 61 | var acceptedHosts []string 62 | 63 | var validators []*Validator 64 | 65 | var responseChan chan string 66 | 67 | // The maximum number of times a single bucket is tried 68 | var MAX_RETRIES = 3 69 | 70 | type OpenRequest struct { 71 | provider string 72 | source string 73 | numAttempts int 74 | lastTried time.Time 75 | } 76 | 77 | // A map from bucket names to their sources and number of retiries 78 | var openRequests map[string]OpenRequest = make(map[string]OpenRequest) 79 | var openRequestsMutex sync.Mutex 80 | 81 | // Mutex for bloom filter 82 | var previouslySeenMutex sync.Mutex 83 | 84 | func handleErrorFatal(err error) { 85 | if err != nil { 86 | log.Fatal(err) 87 | } 88 | } 89 | 90 | func handleError(err error) { 91 | if err != nil { 92 | log.Println(err) 93 | } 94 | } 95 | 96 | func initializeZGrab(numSenders int, readLimitPerHost int, sourceIP string) *Validator { 97 | path, ok := os.LookupEnv("GOPATH") 98 | if ok { 99 | cmd := exec.Command(path+"/bin/zgrab2", "http", 100 | "--use-https", 101 | "--port", "443", 102 | "--read-limit-per-host", fmt.Sprintf("%d", readLimitPerHost), 103 | "--senders", fmt.Sprintf("%d", numSenders), 104 | "--source-ip", sourceIP, 105 | "--flush") 106 | stdout, err := cmd.StdoutPipe() 107 | handleError(err) 108 | cmd.Stderr = os.Stderr 109 | stdin, err := cmd.StdinPipe() 110 | handleError(err) 111 | err = cmd.Start() 112 | handleError(err) 113 | fmt.Printf("ZGrab running with %d senders on IP %s \n", numSenders, sourceIP) 114 | return &Validator{stdin, stdout, cmd, sourceIP, time.Now(), sync.Mutex{}} 115 | } 116 | fmt.Printf("GOPATH not set. 
%s", path) 117 | 118 | return nil 119 | } 120 | 121 | func delegateRequestJobs(files map[string]OutputFile, beanstalkHost string, prevSeen *bloom.BloomFilter) { 122 | jobQueue, err := beanstalk.Dial("tcp", beanstalkHost) 123 | handleErrorFatal(err) 124 | for { 125 | for i, v := range validators { 126 | v.lastResponseMutex.Lock() 127 | // If more than one minute has elapsed, restart validator 128 | if time.Since(v.lastResponseReceived) > time.Minute { 129 | closeValidator(v) 130 | initiateValidator(i) 131 | receiveResponse(validators[i]) 132 | v.lastResponseReceived = time.Now() 133 | } 134 | v.lastResponseMutex.Unlock() 135 | // Check if any buckets should be retried 136 | shouldContinue := false 137 | openRequestsMutex.Lock() 138 | for bucket, openRequest := range openRequests { 139 | if openRequest.numAttempts > MAX_RETRIES { 140 | delete(openRequests, bucket) 141 | continue 142 | } 143 | if time.Since(openRequest.lastTried) > 60*time.Minute { 144 | spawnBucket(bucket, openRequest.provider, openRequest.source, v, openRequest.numAttempts+1, false, prevSeen) 145 | shouldContinue = true 146 | break 147 | } 148 | } 149 | openRequestsMutex.Unlock() 150 | if shouldContinue { 151 | continue 152 | } 153 | 154 | id, body, err := jobQueue.Reserve(5 * time.Second) 155 | log.Println("Reserved job " + string(body)) 156 | if err != nil { 157 | if !strings.Contains(err.Error(), "timeout") { // Don't print if it's a timeout 158 | log.Println("Error reserving job: " + err.Error()) 159 | } 160 | continue 161 | } 162 | jobContents := strings.Split(string(body), ",") 163 | if len(jobContents) != 2 { 164 | log.Println("INVALID FORMAT FOR " + string(body) + ": NEEDS ',' DELIMITER") 165 | err = jobQueue.Delete(id) 166 | continue 167 | } 168 | path := jobContents[0] 169 | bucket := jobContents[1] 170 | 171 | // If a host is already provided, we use only that host 172 | hostFound := false 173 | for _, host := range acceptedHosts { 174 | if strings.Contains(bucket, host) { 175 | spawnBucket(bucket, host, path, v, 1, true, prevSeen) 176 | hostFound = true 177 | break 178 | } 179 | } 180 | 181 | // Otherwise, we try on all hosts 182 | if !hostFound { 183 | for _, host := range hosts { 184 | bucketName := bucket 185 | if host == "oss-us-east-1.aliyuncs.com" { 186 | // For alibaba, replace dots with hyphens 187 | bucketName = strings.Replace(bucketName, ".", "-", -1) 188 | } 189 | spawnBucket(bucketName+"."+host, host, path, v, 1, true, prevSeen) 190 | } 191 | } 192 | 193 | err = jobQueue.Delete(id) 194 | handleError(err) 195 | } 196 | log.Println("Sleeping for 0.25 second") 197 | time.Sleep(time.Duration(250) * time.Millisecond) 198 | } 199 | 200 | } 201 | 202 | func spawnBucket(bucket string, host string, path string, v *Validator, count int, shouldLock bool, prevSeen *bloom.BloomFilter) { 203 | 204 | // First, confirm that we have not tried the bucket before. 205 | // Currently commented out - TODO: add option to use bloom filter 206 | // previouslySeenMutex.Lock() 207 | // old := prevSeen.Test([]byte(bucket)) 208 | // previouslySeenMutex.Unlock() 209 | old := false 210 | 211 | if old { // The bucket has already been tried: just ignore. 
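		// Drop any pending retry entry for this name and return without re-sending it to ZGrab.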
212 | log.Println("Have seen " + bucket) 213 | if shouldLock { 214 | openRequestsMutex.Lock() 215 | } 216 | delete(openRequests, bucket) 217 | if shouldLock { 218 | openRequestsMutex.Unlock() 219 | } 220 | return 221 | } 222 | 223 | log.Println("Have not seen " + bucket) 224 | fmt.Println("Sending: " + bucket) 225 | 226 | go writeWithTimeout(v.stdIn, []byte(bucket+"\n")) 227 | 228 | if shouldLock { 229 | openRequestsMutex.Lock() 230 | } 231 | openRequests[bucket] = OpenRequest{host, path, count, time.Now()} 232 | if shouldLock { 233 | openRequestsMutex.Unlock() 234 | } 235 | } 236 | 237 | func writeWithTimeout(stdIn io.WriteCloser, text []byte) { 238 | c := make(chan string, 1) 239 | go func() { 240 | stdIn.Write(text) 241 | c <- "done" 242 | }() 243 | select { 244 | case <-c: 245 | case <-time.After(500 * time.Millisecond): 246 | } 247 | } 248 | 249 | func receiveResponses() { 250 | for _, v := range validators { 251 | receiveResponse(v) 252 | } 253 | } 254 | 255 | func receiveResponse(v *Validator) { 256 | go func(v *Validator, c chan string) { 257 | scanner := bufio.NewScanner(v.stdOut) 258 | if scanner != nil { 259 | for scanner.Scan() { 260 | v.lastResponseMutex.Lock() 261 | v.lastResponseReceived = time.Now() 262 | v.lastResponseMutex.Unlock() 263 | 264 | text := scanner.Text() 265 | c <- text 266 | } 267 | } 268 | }(v, responseChan) 269 | } 270 | 271 | func writeResponses(files map[string]OutputFile, prevSeen *bloom.BloomFilter) { 272 | for result := range responseChan { 273 | var responseBody interface{} 274 | err := json.Unmarshal([]byte(result), &responseBody) 275 | if err != nil { 276 | fmt.Fprintln(os.Stderr, err) 277 | } else { 278 | responseJSON := responseBody.(map[string]interface{}) 279 | domain := responseJSON["domain"].(string) 280 | data := responseJSON["data"].(map[string]interface{}) 281 | if data["http"] == nil { 282 | continue 283 | } 284 | http := data["http"].(map[string]interface{}) 285 | if http["result"] == nil { 286 | continue 287 | } 288 | resultJSON := http["result"].(map[string]interface{}) 289 | if resultJSON["response"] == nil { 290 | continue 291 | } 292 | response := resultJSON["response"].(map[string]interface{}) 293 | statusCode := int(response["status_code"].(float64)) 294 | fmt.Printf("%d %s\n", statusCode, domain) 295 | 296 | // Add bucket to our previously seen set. 297 | previouslySeenMutex.Lock() 298 | prevSeen.Add([]byte(domain)) 299 | previouslySeenMutex.Unlock() 300 | 301 | // Alibaba: Check if response is redirecting to a different bucket 302 | if strings.Contains(domain, "oss-us-east-1.aliyuncs.com") && response["body"] != nil { 303 | body := response["body"].(string) 304 | if statusCode == 403 && strings.Contains(body, "must be addressed") && strings.Contains(body, "") { 305 | newHost := strings.Split(strings.Split(body, "")[0], "")[1] 306 | bucket := strings.Split(domain, ".oss-us-east-1.aliyuncs.com")[0] + "." 
+ newHost 307 | openRequestsMutex.Lock() 308 | origRequest := openRequests[domain] 309 | delete(openRequests, domain) 310 | // Add to pending queue with 0 time to force trying on new host 311 | openRequests[bucket] = OpenRequest{origRequest.provider, origRequest.source, origRequest.numAttempts + 1, time.Time{}} 312 | openRequestsMutex.Unlock() 313 | continue 314 | } 315 | } 316 | 317 | toLog := fmt.Sprintf("%s,%d\n", domain, time.Now().Unix()) 318 | 319 | openRequestsMutex.Lock() 320 | request := openRequests[domain] 321 | getFile(request.source, files).f.WriteString(toLog) 322 | delete(openRequests, domain) 323 | openRequestsMutex.Unlock() 324 | 325 | for _, host := range acceptedHosts { 326 | if strings.Contains(domain, host) { 327 | getFile(strconv.Itoa(statusCode)+host, files).f.WriteString(toLog) 328 | break 329 | } 330 | } 331 | } 332 | } 333 | } 334 | 335 | func closeAllValidators() { 336 | for _, v := range validators { 337 | closeValidator(v) 338 | } 339 | } 340 | 341 | func closeValidator(v *Validator) { 342 | v.stdIn.Close() 343 | v.cmd.Process.Kill() 344 | v.cmd.Wait() 345 | } 346 | 347 | func getFile(path string, files map[string]OutputFile) OutputFile { 348 | if !strings.Contains(path, "..") { 349 | if val, ok := files[path]; ok { 350 | fmt.Println("VAL: " + path) 351 | 352 | return val 353 | } 354 | 355 | f, err := os.OpenFile( 356 | "./data/"+path+".txt", 357 | os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 358 | if err == nil { 359 | files[path] = OutputFile{f} 360 | 361 | return files[path] 362 | } 363 | fmt.Fprintln(os.Stderr, err) 364 | 365 | } 366 | return OutputFile{nil} 367 | } 368 | 369 | func closeAllValidatorsOnSignal(files map[string]OutputFile, prevSeen *bloom.BloomFilter) { 370 | // Intercept sigint 371 | sig := make(chan os.Signal, 2) 372 | signal.Notify(sig, os.Interrupt, syscall.SIGTERM) 373 | 374 | go func() { 375 | <-sig 376 | fmt.Println("Stopping. 
Closing all validators.") 377 | closeAllValidators() 378 | for _, v := range files { 379 | v.f.Close() 380 | } 381 | 382 | // Also write to the bloom filter file 383 | previouslySeenMutex.Lock() 384 | f, err := os.OpenFile("./bucket_validation/bloom/candidate_set.bloom", os.O_CREATE|os.O_RDWR, 0644) 385 | if err != nil { 386 | panic(err) 387 | } 388 | defer f.Close() 389 | w := bufio.NewWriter(f) 390 | prevSeen.WriteTo(w) 391 | previouslySeenMutex.Unlock() 392 | os.Exit(0) 393 | }() 394 | 395 | } 396 | 397 | func openFiles() map[string]OutputFile { 398 | types := map[string]string{ 399 | "200": "public", 400 | "400": "invalid_bucket", 401 | "403": "private", 402 | "404": "no_such_bucket", 403 | "500": "error", 404 | } 405 | files := make(map[string]OutputFile) 406 | for k, v := range types { 407 | for _, host := range acceptedHosts { 408 | f, err := os.OpenFile( 409 | "./data/validation/"+host+"/"+v+".txt", 410 | os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 411 | if err == nil { 412 | files[k+host] = OutputFile{f} 413 | } else { 414 | fmt.Fprintln(os.Stderr, err) 415 | } 416 | } 417 | } 418 | return files 419 | } 420 | 421 | func parseConfig() { 422 | viper.SetConfigName("bucket_validation/listener-config") 423 | viper.AddConfigPath(".") 424 | err := viper.ReadInConfig() 425 | handleErrorFatal(err) 426 | err = viper.Unmarshal(&config) 427 | handleErrorFatal(err) 428 | } 429 | 430 | // Clears the job queue by deleting all items 431 | func clearQueue(beanstalkHost string) { 432 | jobQueue, err := beanstalk.Dial("tcp", beanstalkHost) 433 | handleError(err) 434 | for { 435 | id, _, err := jobQueue.Reserve(5 * time.Second) 436 | if err != nil { 437 | if !strings.Contains(err.Error(), "timeout") { // Don't print if it's a timeout 438 | log.Println("Error reserving job: " + err.Error()) 439 | } 440 | continue 441 | } 442 | err = jobQueue.Delete(id) 443 | handleError(err) 444 | } 445 | } 446 | 447 | func initiateValidators() { 448 | validators = make([]*Validator, 0) 449 | if len(config.SourceIPs) == 0 { 450 | validators = append(validators, initializeZGrab(config.NumSenders, config.ReadLimitPerHost, "")) 451 | } else { 452 | for _, ip := range config.SourceIPs { 453 | validators = append(validators, initializeZGrab(config.NumSenders, config.ReadLimitPerHost, ip)) 454 | } 455 | } 456 | } 457 | 458 | func initiateValidator(i int) { 459 | newValidator := initializeZGrab(config.NumSenders, config.ReadLimitPerHost, validators[i].ip) 460 | validators[i].stdIn = newValidator.stdIn 461 | validators[i].stdOut = newValidator.stdOut 462 | validators[i].cmd = newValidator.cmd 463 | } 464 | 465 | func loadBloomFilter() *bloom.BloomFilter { 466 | filter := bloom.NewWithEstimates(300000000, .000001) 467 | // TODO: Add support for Bloom Filter 468 | // f, err := os.OpenFile("./bucket_validation/bloom/candidate_set.bloom", os.O_CREATE|os.O_RDWR, 0644) 469 | // if err != nil { 470 | // panic(err) 471 | // } 472 | // defer f.Close() 473 | // r := bufio.NewReader(f) 474 | // filter.ReadFrom(r) 475 | return filter 476 | } 477 | 478 | func main() { 479 | 480 | responseChan = make(chan string) 481 | // The hosts that we automatically try against. 
For Alibaba, we initially try 482 | // against one region and get the new region from the response if the bucket exists 483 | hosts = []string{"s3.amazonaws.com", "storage.googleapis.com", "oss-us-east-1.aliyuncs.com"} 484 | acceptedHosts = []string{"s3.amazonaws.com", "storage.googleapis.com", "aliyuncs.com"} 485 | 486 | // clearQueue(config.BeanstalkHost) 487 | 488 | filter := loadBloomFilter() 489 | 490 | parseConfig() 491 | initiateValidators() 492 | 493 | outputFiles := openFiles() 494 | closeAllValidatorsOnSignal(outputFiles, filter) 495 | go delegateRequestJobs(outputFiles, config.BeanstalkHost, filter) 496 | receiveResponses() 497 | writeResponses(outputFiles, filter) 498 | 499 | } 500 | --------------------------------------------------------------------------------
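A note on the Alibaba redirect handling in bucket_validation/listener.go: OSS answers a mis-addressed bucket with a 403 whose body names the correct region inside an <Endpoint>...</Endpoint> element, and writeResponses re-queues the bucket against that host. The tag literals appear to have been dropped from the strings.Contains/strings.Split calls in the listing above, so the sketch below shows the assumed extraction as a self-contained Go program; the tag names are taken from the standard OSS error format, not from the original source.

package main

import (
	"fmt"
	"strings"
)

// extractAliyunEndpoint pulls the region endpoint out of an Alibaba OSS
// "must be addressed using the specified endpoint" error body.
// The "<Endpoint>" / "</Endpoint>" tag names are assumed from the standard
// OSS error XML; listener.go presumably splits on the same literals.
func extractAliyunEndpoint(body string) (string, bool) {
	if !strings.Contains(body, "<Endpoint>") || !strings.Contains(body, "</Endpoint>") {
		return "", false
	}
	prefix := strings.SplitN(body, "</Endpoint>", 2)[0]
	parts := strings.SplitN(prefix, "<Endpoint>", 2)
	if len(parts) < 2 {
		return "", false
	}
	return parts[1], true
}

func main() {
	body := `<Error><Code>AccessDenied</Code><Message>The bucket you are attempting to access must be addressed using the specified endpoint.</Message><Endpoint>oss-cn-hangzhou.aliyuncs.com</Endpoint></Error>`
	if endpoint, ok := extractAliyunEndpoint(body); ok {
		// Retry the bucket as <bucket>.<endpoint>, mirroring the openRequests re-queue in writeResponses.
		fmt.Println(endpoint)
	}
}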