├── __init__.py ├── bucket_generation ├── __init__.py ├── add_to_path.sh ├── replayExisting.py ├── generators │ ├── random_baseline │ │ └── guesser.py │ ├── character_grams │ │ └── guesser.py │ ├── n_grams │ │ └── guesser.py │ ├── continella │ │ └── guesser.py │ ├── pcfg │ │ └── guesser.py │ ├── token_pcfg │ │ └── guesser.py │ └── rnn │ │ └── guesser.py └── utils.py ├── bucket_extraction ├── utils │ ├── __init__.py │ └── extract_utils.py ├── __init__.py ├── feed_to_validator │ └── feed_to_validator.py ├── grayhatwarfare │ └── grayhatwarfare.py ├── bing │ └── bing.py ├── farsight │ └── farsight.py └── virustotal │ └── virustotal.py ├── data ├── extraction │ ├── bing │ │ └── .gitignore │ ├── farsight │ │ └── .gitignore │ ├── grayhatwarfare │ │ └── .gitignore │ └── virustotal │ │ └── .gitignore └── validation │ ├── aliyuncs.com │ └── .gitignore │ ├── s3.amazonaws.com │ └── .gitignore │ └── storage.googleapis.com │ └── .gitignore ├── final_output ├── .gitignore └── gather_all_buckets.sh ├── images └── flow.jpg ├── requirements.txt ├── .env.example ├── .gitignore ├── utils.py ├── bucket_validation ├── test_bloom │ └── test_bloom.go ├── bloom │ └── add_to_bloom.go └── listener.go ├── README.md ├── main.py └── LICENSE /__init__.py: -------------------------------------------------------------------------------- 1 | from utils import * -------------------------------------------------------------------------------- /bucket_generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bucket_extraction/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/extraction/bing/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /data/extraction/farsight/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /bucket_generation/add_to_path.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=$(pwd) -------------------------------------------------------------------------------- /data/extraction/grayhatwarfare/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /data/extraction/virustotal/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /data/validation/aliyuncs.com/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /data/validation/s3.amazonaws.com/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /data/validation/storage.googleapis.com/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 
!.gitignore -------------------------------------------------------------------------------- /final_output/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !gather_all_buckets.sh 3 | !.gitignore -------------------------------------------------------------------------------- /bucket_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from bucket_extraction.utils.extract_utils import * 2 | -------------------------------------------------------------------------------- /images/flow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-esrg/stratosphere/HEAD/images/flow.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | grequests==0.6.0 2 | argparse==1.4.0 3 | python-dotenv 4 | requests 5 | pystalk 6 | numpy 7 | keras 8 | tensorflow 9 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Bing Cognitive Search API key 2 | # https://docs.microsoft.com/en-us/azure/cognitive-services/bing-web-search/ 3 | BING_API_KEY=1 4 | 5 | # Farsight API key 6 | FARSIGHT_API_KEY=1 7 | 8 | # GrayHat Warfare access token 9 | GRAYHAT_ACCESS_TOKEN=1 10 | 11 | # VirusTotal API key 12 | VIRUSTOTAL_API_KEY=1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.code-workspace 2 | *__pycache__* 3 | .DS_Store 4 | *.pyc 5 | .vscode 6 | bucket_validation/listener-config.json 7 | *.ipynb_checkpoints* 8 | *.ipynb 9 | bucket_validation/bloom/candidate_set.bloom 10 | bucket_validation/bloom/old_candidate_set.bloom 11 | bucket_generation/github_orgs/.env 12 | bucket_generation/github_orgs/repo_counts_for_1k_orgs.png 13 | .env 14 | -------------------------------------------------------------------------------- /bucket_extraction/feed_to_validator/feed_to_validator.py: -------------------------------------------------------------------------------- 1 | from pystalk import BeanstalkClient 2 | import time 3 | 4 | beanstalk_client = BeanstalkClient('127.0.0.1', 11301) 5 | 6 | def feedToValidator(file, label): 7 | with open(file, 'r') as f: 8 | lines = list(f) 9 | for line in lines: 10 | line = line.strip() 11 | print('CAND:', line) 12 | beanstalk_client.put_job("extraction/" + label + "," + line) 13 | -------------------------------------------------------------------------------- /bucket_generation/replayExisting.py: -------------------------------------------------------------------------------- 1 | from pystalk import BeanstalkClient 2 | import time 3 | from bucket_extraction import getBucketsFromText 4 | 5 | beanstalk_client = BeanstalkClient('127.0.0.1', 11301) 6 | 7 | def replayExisting(file, label): 8 | with open(file, 'r') as f: 9 | for line in f: 10 | buckets = list(getBucketsFromText(line)) 11 | for bucket in buckets: 12 | print('CAND:', bucket) 13 | beanstalk_client.put_job("generation/" + label + "," + bucket) 14 | time.sleep(1/200) 15 | 16 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from bucket_extraction import 
getBucketsFromText 2 | 3 | def initializeSetFromTextFile(path, setType): 4 | """ 5 | Initialize set comprised of lines from the text file 6 | :param path: path to text file 7 | :param setType: a set that we will add lines to 8 | """ 9 | with open(path,'r') as f: 10 | for line in f: 11 | setType.add(line.strip()) 12 | 13 | def getBucketsFromTextFile(path): 14 | buckets = set() 15 | with open(path, 'r') as f: 16 | lines = f.readlines() 17 | for line in lines: 18 | buckets = buckets.union(getBucketsFromText(line)) 19 | return buckets 20 | -------------------------------------------------------------------------------- /final_output/gather_all_buckets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | shopt -s extglob 3 | 4 | # Helper script to gather data from all folders 5 | cat ./data/validation/@(s3.amazonaws.com|storage.googleapis.com|aliyuncs.com|bucket_types)/private.txt > ./final_output/all_platforms_private.txt 6 | sort -u -t, -k1,1 ./final_output/all_platforms_private.txt -o ./final_output/all_platforms_private.txt 7 | cat ./data/validation/@(s3.amazonaws.com|storage.googleapis.com|aliyuncs.com|bucket_types)/public.txt > ./final_output/all_platforms_public.txt 8 | sort -u -t, -k1,1 ./final_output/all_platforms_public.txt -o ./final_output/all_platforms_public.txt 9 | cat ./final_output/all_platforms_public.txt ./final_output/all_platforms_private.txt | sort -u > ./final_output/all_platforms_all.txt 10 | 11 | -------------------------------------------------------------------------------- /bucket_extraction/utils/extract_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scrape a given site's source for buckets. 3 | """ 4 | import requests 5 | import re 6 | 7 | def getBucketsFromText(text): 8 | regexes = [r'([\w\d_\.-]+)\.s3[\w\d-]*\.amazonaws\.com', 9 | r'([\w\d_\.-]+)\.storage\.googleapis\.com', 10 | r'([\w\d_\.-]+)\.[\w\d\.-]*\.cdn\.digitaloceanspaces\.com', 11 | r'([\w\d_-]+)\.oss[\w\d-]*\.aliyuncs\.com', 12 | r'^[^.]*s3[\w\d-]*\.amazonaws\.com\/([\w\d_.-]+)', 13 | r'^[^.]*s3[\w\d\.-]*\.wasabisys\.com\/([\w\d_.-]+)', 14 | r'^[^.]*storage\.googleapis\.com\/([\w\d_.-]+)', 15 | r'^[^.]*oss[\w\d_-]*\.aliyuncs\.com\/([\w\d_.-]+)'] 16 | for regex in regexes: 17 | found = re.findall(regex, text.lower()) 18 | if len(found) > 0: 19 | return found 20 | return set() 21 | 22 | -------------------------------------------------------------------------------- /bucket_extraction/grayhatwarfare/grayhatwarfare.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import os 4 | 5 | def getGrayhatWarfare(): 6 | buckets = [] 7 | num_buckets = 100000 8 | current = 0 9 | chunk_size = 50000 10 | 11 | access_token = os.getenv("GRAYHAT_ACCESS_TOKEN") 12 | 13 | while current < num_buckets: 14 | resp = requests.get("https://buckets.grayhatwarfare.com/api/v1/buckets/" + str(current) + "/" + str(chunk_size) + "?access_token=" + access_token) 15 | resp_json = json.loads(resp.text) 16 | for bucket in resp_json["buckets"]: 17 | if bucket["type"] == "aws": 18 | buckets.append(bucket["bucket"]) 19 | print(len(resp_json["buckets"])) 20 | 21 | current += chunk_size 22 | out = './data/extraction/grayhatwarfare/grayhatwarfare.txt' 23 | with open(out, 'w+') as f: 24 | for b in buckets: 25 | f.write(b + '\n') 26 | print("Wrote buckets to " + out) 27 | 28 | if __name__ == "__main__": 29 | getGrayhatWarfare() 
-------------------------------------------------------------------------------- /bucket_generation/generators/random_baseline/guesser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | import string 4 | 5 | from bucket_generation.utils import getBeanstalkClient, addArguments 6 | 7 | def randomlyGuessBucketNames(numCharacters=5, numTrials=float("inf"), name="random"): 8 | beanstalkClient = getBeanstalkClient() 9 | while numTrials > 0: 10 | numTrials -= 1 11 | randomBucket = "".join( 12 | [ 13 | random.choice(list(string.ascii_lowercase) + list(string.digits) + ["-",".","_"]) 14 | for _ in range(numCharacters) 15 | ] 16 | ) 17 | print(f"CAND: {randomBucket}") 18 | beanstalkClient.put_job(f"generation/{name},{randomBucket}") 19 | 20 | 21 | if __name__ == "__main__": 22 | parser = argparse.ArgumentParser(description='Run the random baseline generator.') 23 | addArguments(parser) 24 | parser.add_argument("--character_num", type=int, help="The number of characters to generate.") 25 | args = parser.parse_args() 26 | randomlyGuessBucketNames(name=args.name, numTrials=int(args.num_trials), numCharacters=int(args.character_num)) 27 | -------------------------------------------------------------------------------- /bucket_validation/test_bloom/test_bloom.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | /** 4 | * Loads the candidate-set bloom filter and tests the given command-line arguments for membership. 5 | */ 6 | 7 | import ( 8 | "bufio" 9 | "log" 10 | "os" 11 | "strings" 12 | 13 | "github.com/willf/bloom" 14 | ) 15 | 16 | /** 17 | * Adds all the lines of text from the textfile to the bloom filter. 18 | */ 19 | func addFromTextFile(filter *bloom.BloomFilter, filePath string) { 20 | file, err := os.Open(filePath) 21 | if err != nil { 22 | log.Fatal(err) 23 | } 24 | defer file.Close() 25 | scanner := bufio.NewScanner(file) 26 | for scanner.Scan() { 27 | line := scanner.Text() 28 | lineContents := strings.Split(line, ",") 29 | if len(lineContents) == 2 { // Of the form "bucket,timestamp" 30 | filter.AddString(lineContents[0]) 31 | } else { // Normal form: a bare bucket name 32 | filter.AddString(scanner.Text()) 33 | } 34 | } 35 | } 36 | 37 | func main() { 38 | filter := bloom.NewWithEstimates(300000000, .000001) 39 | 40 | f1, err := os.OpenFile("./bucket_validation/bloom/candidate_set.bloom", os.O_CREATE|os.O_RDONLY, 0644) 41 | if err != nil { 42 | panic(err) 43 | } 44 | defer f1.Close() 45 | 46 | r := bufio.NewReader(f1) 47 | readsize, err := filter.ReadFrom(r) 48 | 49 | if err != nil { 50 | panic(err) 51 | } 52 | log.Println(readsize) 53 | 54 | for i := 1; i < len(os.Args); i++ { 55 | log.Println(os.Args[i], filter.TestString(os.Args[i])) 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /bucket_validation/bloom/add_to_bloom.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | /** 4 | * Creates a bloom filter and initializes it from all lines read from the validator files. 5 | */ 6 | 7 | import ( 8 | "bufio" 9 | "log" 10 | "os" 11 | "strings" 12 | 13 | "github.com/willf/bloom" 14 | ) 15 | 16 | /** 17 | * Adds all the lines of text from the textfile to the bloom filter. 
18 | */ 19 | func addFromTextFile(filter *bloom.BloomFilter, filePath string) { 20 | file, err := os.Open(filePath) 21 | if err != nil { 22 | log.Fatal(err) 23 | } 24 | defer file.Close() 25 | scanner := bufio.NewScanner(file) 26 | for scanner.Scan() { 27 | line := scanner.Text() 28 | lineContents := strings.Split(line, ",") 29 | if len(lineContents) == 2 { // Of the form "bucket,timestamp" 30 | filter.AddString(lineContents[0]) 31 | } else { // Normal form: a bare bucket name 32 | filter.AddString(scanner.Text()) 33 | } 34 | } 35 | } 36 | 37 | func main() { 38 | filter := bloom.NewWithEstimates(300000000, .000001) 39 | 40 | f1, err := os.OpenFile("./bucket_validation/bloom/candidate_set.bloom", os.O_CREATE|os.O_RDONLY, 0644) 41 | if err != nil { 42 | panic(err) 43 | } 44 | defer f1.Close() 45 | 46 | r := bufio.NewReader(f1) 47 | readsize, err := filter.ReadFrom(r) 48 | 49 | if err != nil { 50 | panic(err) 51 | } 52 | log.Println(readsize) 53 | 54 | // Add the bucket names from the given text file to the filter 55 | addFromTextFile(filter, os.Args[1]) 56 | 57 | f2, err := os.OpenFile("./bucket_validation/bloom/candidate_set.bloom", os.O_WRONLY, 0644) 58 | if err != nil { 59 | panic(err) 60 | } 61 | defer f2.Close() 62 | 63 | w := bufio.NewWriter(f2) 64 | size, err := filter.WriteTo(w) 65 | if err != nil { 66 | panic(err) 67 | } 68 | // Flush the buffered writer so the updated filter is fully persisted before the file is closed. 69 | if err := w.Flush(); err != nil { 70 | panic(err) 71 | } 72 | log.Println(size) 73 | 74 | } 75 | -------------------------------------------------------------------------------- /bucket_extraction/bing/bing.py: -------------------------------------------------------------------------------- 1 | import grequests 2 | import csv 3 | from .. import getBucketsFromText 4 | import random 5 | import string 6 | import os 7 | 8 | subscription_key = os.getenv("BING_API_KEY") 9 | search_url = "https://api.cognitive.microsoft.com/bing/v7.0/search" 10 | 11 | headers = {"Ocp-Apim-Subscription-Key": subscription_key} 12 | 13 | NUM_SEARCHES = 100 14 | NUM_THREADS = 10 15 | SEED_LENGTH = 3 16 | OUTPUT_NAME = "./data/extraction/bing/buckets_output.txt" 17 | 18 | def exception(request, exception): 19 | print("Error: {}: {}".format(request.url, exception)) 20 | 21 | def getBucketsFromBing(): 22 | 23 | try: 24 | with open(OUTPUT_NAME, "r") as f: 25 | buckets = set(line.strip() for line in f) 26 | except Exception as e: 27 | buckets = set() 28 | 29 | initLen = len(buckets) 30 | 31 | reqs = [] 32 | 33 | for i in range(0, NUM_SEARCHES): 34 | rand = ''.join(random.choice(string.ascii_lowercase) for _ in range(SEED_LENGTH)) 35 | platform = random.choice(["s3.amazonaws.com", "storage.googleapis.com", "oss.aliyuncs.com"]) 36 | reqs.append(grequests.get( 37 | search_url, 38 | headers=headers, 39 | params={ 40 | "q": "site:" + platform + " \"" + rand + "\"", 41 | "responseFilter": "Webpages", 42 | "count": 50, 43 | "offset": 0 44 | }, 45 | stream=False 46 | )) 47 | 48 | results = grequests.map(reqs, exception_handler=exception, size=NUM_THREADS) 49 | 50 | for result in results: 51 | if result is None: 52 | continue 53 | 54 | result.close() 55 | 56 | if result.status_code != 200: 57 | print(result) 58 | continue 59 | 60 | parsed = result.json() 61 | if "webPages" in parsed and "value" in parsed["webPages"]: 62 | for page in parsed["webPages"]["value"]: 63 | if "snippet" in page: 64 | buckets = buckets.union(getBucketsFromText(page["snippet"])) 65 | buckets = buckets.union(getBucketsFromText(page["url"])) 66 | 67 | numAdded = len(buckets) - initLen 68 | ratio = numAdded / NUM_SEARCHES 69 | print("Discovered {} new buckets. 
({} buckets / search)".format(numAdded, ratio)) 70 | print("Wrote buckets to " + OUTPUT_NAME) 71 | 72 | with open(OUTPUT_NAME, 'w+') as f: 73 | f.write("\n".join(buckets)) 74 | -------------------------------------------------------------------------------- /bucket_extraction/farsight/farsight.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from .. import getBucketsFromText 4 | import os 5 | 6 | API_ENDPOINTS = [("https://api.dnsdb.info/lookup/rrset/name/", "rrname"), ("https://api.dnsdb.info/lookup/rdata/name/", "rdata")] 7 | regions = ["us-east-2", "us-east-1", "us-west-1", "us-west-2", "af-south-1", "ap-east-1", "ap-south-1", "ap-northeast-3", "ap-northeast-2", "ap-southeast-1", "ap-southeast-2", "ap-northeast-1", "ca-central-1", "cn-north-1", "cn-northwest-1", "eu-central-1", "eu-west-1", "eu-west-2", "eu-south-1", "eu-west-3", "eu-north-1", "me-south-1", "sa-east-1", "us-gov-east-1", "us-gov-west-1"] 8 | 9 | API_KEY = os.getenv("FARSIGHT_API_KEY") 10 | OUTPUT_NAME = "./data/extraction/farsight/" 11 | 12 | api_limit = 1000000 13 | 14 | def lookupFile(file, type): 15 | files = [] 16 | with open(file, 'r') as f: 17 | for line in f: 18 | text = line.strip() 19 | if "{region}" in text: 20 | all_regions = [] 21 | for region in regions: 22 | all_regions.append(text.replace("{region}", region)) 23 | files.append(all_regions) 24 | else: 25 | files.append([text]) 26 | for endpoint_list in files: 27 | lookup(endpoint_list, type + "/") 28 | 29 | def lookup(endpoints, directory=""): 30 | all_domains = set() 31 | for domain in endpoints: 32 | for endpoint_pair in API_ENDPOINTS: 33 | (endpoint, field_name) = endpoint_pair 34 | offset = 0 35 | while True: 36 | url = "{}*.{}?limit={}&offset={}".format(endpoint, domain, api_limit, offset) 37 | print("Fetching " + url) 38 | headers = {'Accept': 'application/json', 'X-API-Key': API_KEY} 39 | resp = requests.get(url, headers=headers) 40 | if resp.status_code != 200: 41 | break 42 | split = resp.text.split("\n") 43 | for line in split: 44 | if line.strip() == "": 45 | continue 46 | resp_json = json.loads(line) 47 | if field_name in resp_json: 48 | dns_val = resp_json[field_name] 49 | if dns_val[-1] == ".": 50 | dns_val = dns_val[:-1] 51 | all_domains.add(dns_val) 52 | print(len(split)) 53 | if len(split) < api_limit or offset >= 4000000: 54 | break 55 | offset += api_limit 56 | out = './data/extraction/farsight/' + directory + domain + ".txt" 57 | with open(out, 'w+') as f: 58 | f.write("\n".join(all_domains)) 59 | print("Wrote buckets to " + out) 60 | 61 | -------------------------------------------------------------------------------- /bucket_extraction/virustotal/virustotal.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import subprocess 3 | import grequests 4 | from .. 
import getBucketsFromText 5 | import os 6 | 7 | S3_IP_URL = "https://ip-ranges.amazonaws.com/ip-ranges.json" 8 | API_KEY = os.getenv("VIRUSTOTAL_API_KEY") 9 | IP_LOOKUP_ENDPOINT = "https://www.virustotal.com/vtapi/v2/ip-address/report" 10 | BATCH_SEARCH = 100 11 | NUM_THREADS = 10 12 | OUTPUT_NAME = "./data/extraction/virustotal/buckets_output.txt" 13 | 14 | def exception(request, exception): 15 | print("Error: {}: {}".format(request.url, exception)) 16 | 17 | # Fetches CIDR ranges of all S3 IPs from Amazon 18 | def getS3IPs(): 19 | json = requests.get(S3_IP_URL).json() 20 | cidrs = [] 21 | for prefix in json["prefixes"]: 22 | if prefix["service"] == "AMAZON" and "ip_prefix" in prefix: 23 | cidrs.append(prefix["ip_prefix"]) 24 | with open('./data/extraction/virustotal/all_ips.txt', 'w+') as f: 25 | f.write("\n".join(cidrs)) 26 | print("Wrote cidrs to all_ips.txt") 27 | 28 | # Runs a ping scan via Zmap for all S3 IPs 29 | def runZmap(): 30 | with open("./data/extraction/virustotal/all_ips.txt", "r") as f: 31 | cidrs = f.read().splitlines() 32 | print(" ".join(cidrs)) 33 | command = "sudo zmap -i ens8 --probe-module=icmp_echoscan -B 10M -o ./data/extraction/virustotal/live_ips.txt " + " ".join(cidrs) 34 | subprocess.call(command, shell=True) 35 | subprocess.call("cp ./data/extraction/virustotal/live_ips.txt ./data/extraction/virustotal/rem_ips.txt", shell=True) 36 | 37 | def lookup(num): 38 | with open("./data/extraction/virustotal/rem_ips.txt", "r") as f: 39 | ips = f.read().splitlines() 40 | 41 | domains = set() 42 | with open("./data/extraction/virustotal/domains_output.txt", "r") as f: 43 | domains = set(line.strip() for line in f) 44 | buckets = set() 45 | with open(OUTPUT_NAME, "r") as f: 46 | buckets = set(line.strip() for line in f) 47 | while num > 0: 48 | numSearches = min(num, BATCH_SEARCH) 49 | num -= numSearches 50 | reqs = [] 51 | for i in range(0, numSearches): 52 | ip = ips.pop(0) 53 | reqs.append(grequests.get( 54 | IP_LOOKUP_ENDPOINT, 55 | params={ 56 | "apikey": API_KEY, 57 | "ip": ip 58 | }, 59 | stream=False)) 60 | 61 | results = grequests.map(reqs, exception_handler=exception, size=NUM_THREADS) 62 | for result in results: 63 | if result is None: 64 | print("Error: null response") 65 | continue 66 | parsed = result.json() 67 | if ("response_code" in parsed and parsed["response_code"] == "0") or "resolutions" not in parsed: 68 | result.close() 69 | continue 70 | for res in parsed["resolutions"]: 71 | if "hostname" in res and res["hostname"] is not None: 72 | domains.add(res["hostname"]) 73 | buckets = buckets.union(getBucketsFromText(res["hostname"])) 74 | result.close() 75 | 76 | with open(OUTPUT_NAME, 'w+') as f: 77 | f.write("\n".join(buckets)) 78 | 79 | with open("./bucket_extraction/virustotal/domains_output.txt", 'w+') as f: 80 | f.write("\n".join(domains)) 81 | 82 | with open("./bucket_extraction/virustotal/rem_ips.txt", "w+") as f: 83 | f.write("\n".join(ips)) 84 | 85 | print("Wrote buckets to " + OUTPUT_NAME) 86 | 87 | 88 | -------------------------------------------------------------------------------- /bucket_generation/generators/character_grams/guesser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inspired from https://dl.acm.org/doi/pdf/10.1145/1102120.1102168 and https://hal.archives-ouvertes.fr/hal-01112124/file/omen.pdf 3 | Do n-grams where n=4, but look at the character level. 
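For example, with a hypothetical training name such as "prod-logs", the four-character context "rod-" gains extra weight for the next character "l"; candidates are then sampled one character at a time, conditioned on the previous (up to) four characters, with Laplace smoothing over a-z, 0-9, "-", ".", and "_".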
4 | """ 5 | import argparse 6 | from collections import Counter, defaultdict 7 | import string 8 | 9 | import numpy as np 10 | 11 | from bucket_extraction.utils.extract_utils import getBucketsFromText 12 | import bucket_generation.utils as gen_utils 13 | 14 | def generateLaplaceDistribution(): 15 | return Counter(list(string.ascii_lowercase) + list(string.digits) + ["-",".","_"]) 16 | 17 | def getCounters(buckets): 18 | """ 19 | Build distribution from left to right of prev4 chars -> next char. 20 | We will have some default Laplace smoothing, both to add a little bit of randomness, 21 | and to make sure that we have at least some distribution if the sequence 22 | has not been encountered before. 23 | """ 24 | lengthDistribution = Counter() 25 | counters = defaultdict(generateLaplaceDistribution) 26 | for bucket in buckets: 27 | bucketString = bucket.lower().strip() 28 | for i in range(len(bucketString)): 29 | counters[ 30 | bucketString[max(0, i-4): i] # If we aren't at fourth character yet, just do previous characters. 31 | ][bucketString[i]] += 1 32 | lengthDistribution[len(bucket)] += 1000 33 | return counters, lengthDistribution 34 | 35 | def sampleFromCounter(counter): 36 | total = sum(counter.values()) 37 | return np.random.choice([k for k,v in counter.items()], p=[v/total for k,v in counter.items()]) 38 | 39 | def generateCandidates(name="c4grams", startingCandidates=None, beanstalkPort=None, numTrials=float("inf"), public=False): 40 | beanstalkClient = gen_utils.getBeanstalkClient(port=beanstalkPort) 41 | previouslySeen = startingCandidates | gen_utils.readBucketsFromFile(f"./data/generation/{name}.txt") 42 | 43 | # Randomly generate template according to distro 44 | while numTrials > 0: 45 | print("Updating character-level 4-grams.") 46 | # In intervals of 10,000 guesses, update our PCFG from our successful guesses. 
47 | with gen_utils.Profiler(gen_utils.ProfilerType.TRAIN, name): 48 | candidates = startingCandidates | gen_utils.getExistingAlreadyGuessedBuckets(name, public=public) 49 | counters, lengthDistribution = getCounters(candidates) 50 | 51 | 52 | for _ in range(int(1e4)): 53 | with gen_utils.Profiler(gen_utils.ProfilerType.GENERATE, name) as p: 54 | bucket = "" 55 | bucketLength = sampleFromCounter(lengthDistribution) 56 | while len(bucket) < bucketLength: 57 | bucket += sampleFromCounter(counters[bucket[max(0, len(bucket)-4): len(bucket)]]) 58 | 59 | p.bucket(bucket) 60 | if bucket not in previouslySeen: 61 | previouslySeen.add(bucket) 62 | print('CAND:', bucket) 63 | beanstalkClient.put_job(f"generation/{name},{bucket}") 64 | numTrials -= 1 65 | 66 | 67 | 68 | if __name__ == "__main__": 69 | parser = argparse.ArgumentParser(description='Run the a character level n-grams.') 70 | gen_utils.addArguments(parser) 71 | args = parser.parse_args() 72 | candidates = gen_utils.getStartBucketNames(args) 73 | generateCandidates(name=args.name, startingCandidates=candidates, public=args.public, numTrials=int(args.num_trials) or float("inf")) 74 | -------------------------------------------------------------------------------- /bucket_generation/generators/n_grams/guesser.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | from collections import Counter, defaultdict 4 | import json 5 | import random 6 | import re 7 | import time 8 | 9 | import numpy as np 10 | from pystalk import BeanstalkClient 11 | 12 | from bucket_extraction import getBucketsFromText 13 | import bucket_generation.utils as generation_utils 14 | 15 | 16 | def generateNGrams(candidates): 17 | ngrams = defaultdict(lambda: Counter()) 18 | lengthDistribution = Counter() 19 | delimiterDistribution = Counter() 20 | for bucket in candidates: 21 | splitBucket = re.split(r'([\.|\-|_])', bucket.lower().strip()) 22 | delimiterDistribution.update( 23 | [ 24 | d for d in splitBucket if d in [".", "-", "_"] 25 | ] 26 | ) 27 | tokens = [t for t in splitBucket if t not in [".", "-", "_"]] 28 | for i in range(len(list(tokens))): 29 | ngrams[ 30 | tuple(tokens[max(0, i-1): i]) 31 | ][tokens[i]] += 1 32 | lengthDistribution[len(tokens)] += 1 33 | return ngrams, lengthDistribution, delimiterDistribution 34 | 35 | def sampleFromCounter(counter): 36 | total = sum(counter.values()) 37 | return np.random.choice([k for k,v in counter.items()], p=[v/total for k,v in counter.items()]) 38 | 39 | def streamNGramCandidates( 40 | startingCandidates=None, beanstalkPort=None, numTrials=float("inf"), name="ngrams", experiment=False, public=False, 41 | ): 42 | candidates = startingCandidates or generation_utils.getExistingBuckets(public=public) 43 | previouslySeen = startingCandidates | generation_utils.readBucketsFromFile(f"./data/generation/{name}.txt") 44 | beanstalkClient = generation_utils.getBeanstalkClient(port=beanstalkPort) 45 | 46 | while numTrials > 0: 47 | # Update our prior distribution for every 10,000 candidates. 48 | print("Initializing bigram distribution.") 49 | 50 | with generation_utils.Profiler(generation_utils.ProfilerType.TRAIN, name): 51 | if experiment: 52 | # add all existing buckets that have been guessed by ngrams and are in seed set. 
53 | candidates |= generation_utils.getExistingAlreadyGuessedBuckets(name, public=public) 54 | nGrams, lengthDistribution, delimiterDistribution = generateNGrams(candidates) 55 | 56 | 57 | for _ in range(int(1e4)): 58 | with generation_utils.Profiler(generation_utils.ProfilerType.GENERATE, name) as p: 59 | bucket = [] 60 | bucketLength = sampleFromCounter(lengthDistribution) 61 | for _ in range(bucketLength): 62 | if len(bucket) > 0: 63 | bucket += [sampleFromCounter(delimiterDistribution)] 64 | ngramsKey = tuple(bucket[-2:-1]) 65 | if ngramsKey in nGrams: 66 | bucket += sampleFromCounter(nGrams[ngramsKey]) 67 | bucket = "".join(bucket) 68 | p.bucket(bucket) 69 | if len(bucket) < 64 and bucket not in previouslySeen: 70 | previouslySeen.add(bucket) 71 | beanstalkClient.put_job("generation/{},{}".format(name, bucket)) 72 | print("Generated: " + bucket) 73 | numTrials -= 1 74 | 75 | if __name__ == "__main__": 76 | parser = argparse.ArgumentParser(description='Run the ngrams generator.') 77 | generation_utils.addArguments(parser) 78 | args = parser.parse_args() 79 | candidates = generation_utils.getStartBucketNames(args) 80 | streamNGramCandidates(name=args.name, startingCandidates=candidates, public=args.public, numTrials=int(args.num_trials) or float("inf")) -------------------------------------------------------------------------------- /bucket_generation/generators/continella/guesser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate bucket names according to the algorithms specified in Continella et al: 3 | https://re.public.polimi.it/retrieve/handle/11311/1065367/314518/2018-continella-bucketsec.pdf 4 | 5 | - Enumerate all 3/4 character combinations 6 | - Enumerate word mutations 7 | """ 8 | import argparse 9 | from enum import Enum, auto 10 | import random 11 | import string 12 | 13 | 14 | import bucket_generation.utils as utils 15 | 16 | def generateRandomThreeOrFour(beanstalkPort=None, numTrials=float("inf"), candidates=None, name="continella_threefour"): 17 | beanstalkClient = utils.getBeanstalkClient(port=beanstalkPort) 18 | allPossibleCandidates = set() 19 | for char1 in list(string.ascii_lowercase) + [""]: 20 | for char2 in list(string.ascii_lowercase): 21 | for char3 in list(string.ascii_lowercase): 22 | for char4 in list(string.ascii_lowercase): 23 | allPossibleCandidates.add("".join([char1,char2,char3,char4])) 24 | candidates = random.sample(allPossibleCandidates, min(numTrials, len(allPossibleCandidates))) 25 | for candidate in candidates: 26 | beanstalkClient.put_job("generation/{},{}".format(name, candidate)) 27 | 28 | class Mutation(Enum): 29 | 30 | DELETE = auto() 31 | DUPLICATE = auto() 32 | CONCATENATE = auto() 33 | END = auto() 34 | 35 | def mutateWords(beanstalkPort=None, numTrials=float("inf"), name="continella_dictionary"): 36 | with open("/usr/share/dict/words", "r") as f: 37 | dictionary = [ 38 | l.strip().lower() for l in f.readlines() 39 | if l.strip().isalnum() and all(ord(c) < 128 for c in l.strip()) # alphanumeric ascii characters 40 | ] 41 | prevCandidates = utils.readBucketsFromFile("./data/generation/{}.txt".format(name)) 42 | beanstalkClient = utils.getBeanstalkClient(port=beanstalkPort) 43 | 44 | while numTrials > 0: 45 | with utils.Profiler(utils.ProfilerType.GENERATE, name) as p: 46 | word = random.choice(dictionary) 47 | mutation = random.choice([mutation for mutation in Mutation]) 48 | assert any([mutation == m for m in Mutation]), "wrong equals" 49 | while mutation != Mutation.END: 50 | if 
mutation == Mutation.DELETE and len(word) > 1: 51 | deletedCharIdx = random.randint(0, len(word) - 1) 52 | word = word[:deletedCharIdx] + word[deletedCharIdx + 1:] 53 | elif mutation == Mutation.DUPLICATE and len(word) < 63: # Can't exceed limit. 54 | dupCharIdx = random.randint(0, len(word) - 1) 55 | word = word[:dupCharIdx] + 2 * word[dupCharIdx] + word[dupCharIdx + 1:] 56 | elif mutation == Mutation.CONCATENATE: 57 | otherWord = random.choice(dictionary) 58 | if len(word) + len(otherWord) < 63: 59 | if random.random() < .5: 60 | word += otherWord 61 | else: 62 | word = otherWord + word 63 | mutation = random.choice([mutation for mutation in Mutation]) 64 | p.bucket(word) 65 | if word not in prevCandidates: 66 | print(word) 67 | prevCandidates.add(word) 68 | beanstalkClient.put_job("generation/{},{}".format(name, word)) 69 | numTrials -= 1 70 | 71 | if __name__ == "__main__": 72 | parser = argparse.ArgumentParser(description='Run the continella experiments.') 73 | utils.addArguments(parser) 74 | parser.add_argument("--mutateWords", action="store_true", help="Run the mutateWords experiment.") 75 | parser.add_argument("--generateRandom34", action="store_true", help="Generate all 3/4 character sequences.") 76 | args = parser.parse_args() 77 | assert args.mutateWords != args.generateRandom34, "One of --mutateWords and --generateRandom34 must be selected." 78 | 79 | if args.mutateWords: 80 | mutateWords(numTrials=int(args.num_trials) or float("inf"), name=args.name) 81 | elif args.generateRandom34: 82 | generateRandomThreeOrFour(name=args.name) 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /bucket_generation/generators/pcfg/guesser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inspired from https://web.eecs.utk.edu/~mschucha/netsec/readings/cfgPass.pdf 3 | Predict token based on distribution of tokens bucketd in types as CFG. 4 | To begin, we will just do a really primitive survey: 5 | break bucket into delimiters, characters of a certain length, and 6 | numbers of certain length. We can definitely extend this to 7 | with more specific types. 
8 | Types : Ci -> i characters, Ni -> i numbers, 9 | B -> T-T-T 10 | """ 11 | import argparse 12 | from collections import Counter 13 | import re 14 | 15 | import numpy as np 16 | 17 | from bucket_extraction.utils.extract_utils import getBucketsFromText 18 | import bucket_generation.utils as gen_utils 19 | 20 | templates = Counter() 21 | C = {} 22 | N = {} 23 | for i in range(64): 24 | C[str(i)] = Counter() 25 | N[str(i)] = Counter() 26 | 27 | def getType(c): 28 | if c.isalpha(): 29 | return 'C' 30 | if c.isnumeric(): 31 | return 'N' 32 | else: 33 | return c 34 | 35 | def updateCounters(bucket): 36 | template = '' 37 | while len(bucket) > 0: 38 | ci = re.search('([a-z]*)', bucket).group() 39 | if len(ci) > 0: 40 | template += 'C' + str(len(ci)) 41 | C[str(len(ci))][ci] += 1 42 | bucket = bucket[len(ci):] 43 | continue 44 | 45 | ni = re.search('([0-9]*)', bucket).group() 46 | if len(ni) > 0: 47 | template += 'N' + str(len(ni)) 48 | bucket = bucket[len(ni):] 49 | N[str(len(ni))][ni] += 1 50 | continue 51 | 52 | other = re.search('([^a-z0-9]*)', bucket).group() 53 | template += other 54 | bucket = bucket[len(other):] 55 | templates[template] += 1 56 | 57 | def sampleFromCounter(counter): 58 | total = sum(counter.values()) 59 | return np.random.choice([k for k,v in counter.items()], p=[v/total for k,v in counter.items()]) 60 | 61 | def generatePCFGCandidates(name="pcfg", startingCandidates=None, beanstalkPort=None, numTrials=float("inf"), public=False): 62 | beanstalkClient = gen_utils.getBeanstalkClient(port=beanstalkPort) 63 | candidates = startingCandidates or gen_utils.getExistingBuckets(public=public) 64 | previouslySeen = startingCandidates | gen_utils.readBucketsFromFile(f"./data/generation/{name}.txt") 65 | 66 | # Randomly generate template according to distro 67 | while numTrials > 0: 68 | print("Updating PCFG.") 69 | # In intervals of 10,000 guesses, update our PCFG from our successful guesses. 
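# Illustrative example with a hypothetical seed name: updateCounters("logs2020-backup") records the template C4N4-C6
# and counts "logs" under C["4"], "2020" under N["4"], and "backup" under C["6"]; the loop below then samples a
# template from `templates` and fills each Ci/Ni slot from the matching length-indexed counter.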
70 | with gen_utils.Profiler(gen_utils.ProfilerType.TRAIN, name): 71 | candidates = startingCandidates | gen_utils.getExistingAlreadyGuessedBuckets(name, public=public) 72 | for candidate in candidates: 73 | updateCounters(candidate.strip().lower()) 74 | 75 | 76 | for _ in range(int(1e4)): 77 | with gen_utils.Profiler(gen_utils.ProfilerType.GENERATE, name) as p: 78 | template = sampleFromCounter(templates) 79 | print(template) 80 | bucket = '' 81 | while len(template) > 0: 82 | if template[0] == 'C': 83 | ni = re.search('([0-9]*)', template[1]).group() 84 | i = ni 85 | try: 86 | bucket += sampleFromCounter(C[i]) 87 | except KeyError: 88 | import pdb 89 | pdb.set_trace() 90 | template = template[1+len(ni):] 91 | elif template[0] == 'N': 92 | ni = re.search('([0-9]*)', template[1]).group() 93 | i = ni 94 | template = template[1+len(ni):] 95 | bucket += sampleFromCounter(N[i]) 96 | else: 97 | bucket += template[0] 98 | template = template[1:] 99 | p.bucket(bucket) 100 | if bucket not in previouslySeen: 101 | previouslySeen.add(bucket) 102 | print('CAND:', bucket) 103 | beanstalkClient.put_job(f"generation/{name},{bucket}") 104 | numTrials -= 1 105 | 106 | 107 | 108 | if __name__ == "__main__": 109 | parser = argparse.ArgumentParser(description='Run the PCFG generator.') 110 | gen_utils.addArguments(parser) 111 | args = parser.parse_args() 112 | candidates = gen_utils.getStartBucketNames(args) 113 | generatePCFGCandidates(name=args.name, startingCandidates=candidates, public=args.public, numTrials=int(args.num_trials) or float("inf")) 114 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stratosphere 2 | 3 | Stratosphere uses password generation algorithms to discover publicly accessible cloud storage buckets. Stratosphere includes infrastructure for extracting, generating, and validating bucket names across Amazon S3, Google Cloud Storage, and Alibaba's Object Storage Service. 4 | 5 | For more information about Stratosphere, please check out our [research paper](https://zakird.com/papers/s3.pdf). 6 | 7 | ![](images/flow.jpg) 8 | 9 | ## Installation 10 | 11 | 1. Install Python dependencies: `pip install -r requirements.txt` 12 | 2. Install Go dependencies: `cd bucket_validation && go get` 13 | 3. Install [ZMap](https://github.com/zmap/zmap) and [ZGrab2](https://github.com/zmap/zgrab2) 14 | 4. Install [beanstalkd](https://beanstalkd.github.io/) 15 | 5. Run `cp .env.example .env` and configure relevant API keys 16 | 6. Configure `bucket_validation/listener-config.json` with source IPs, if you would like to use more than one source IP 17 | 18 | ## Usage 19 | 20 | ### Extraction 21 | 22 | The extraction phase gathers buckets to seed generation algorithms. All extractors will write candidate bucket names to files in `./data/extraction/`, which can then be run through the validator to collect valid bucket names. 23 | 24 | Examples to extract buckets from various sources: 25 | 26 | Bing: `python main.py --bing` 27 | 28 | Farsight: `python main.py --farsight --domain s3.amazonaws.com` (a file of domain names can be provided via `python main.py --farsight -f ./file.txt`) 29 | 30 | GrayHat Warfare: `python main.py --grayhatwarfare` 31 | 32 | VirusTotal is a 3-part process: 33 | 34 | 1. Run `python main.py --virustotal --ips` to fetch S3 IP blocks (similar IP ranges can be found for Google Cloud Storage and Alibaba) 35 | 2. 
Run `python main.py --virustotal --pingAll` to ping all IP addresses via ZMap 36 | 3. Run `python main.py --virustotal --lookup -n 10000` where `-n` is the maximum number of IPs to be validated (to allow running in batches) 37 | 38 | Lastly, you may bring your own data sources. To use unvalidated data (e.g. buckets that may or may not exist), call `feedToValidator` to validate buckets (see "Validating extracted buckets" below). To use validated data, create a folder in `data/validation` with a unique name. Place private buckets in `private.txt` and public buckets in `public.txt`. As always, invoke `gather_all_buckets.sh` in `final_output` to combine found buckets. 39 | 40 | ### Validation 41 | 42 | The validation phase fetches buckets to check whether the bucket exists, and if the bucket is public or private. The source can either be extracted buckets or generated buckets. 43 | 44 | The validator will output bucket names in `./data/validation/`, with folders for each platform. Within each platform folder, the validator will write files `public.txt`, `private.txt`, and `no_such_bucket.txt` to indicate the response received for each bucket. 45 | 46 | 1. Run beanstalk in the background: `./beanstalkd -l 127.0.0.1 -p 11301 &` 47 | 2. Run listener: `go run bucket_validation/listener.go` 48 | 49 | We recommend running the listener in a separate shell, such as [screen](https://www.gnu.org/software/screen/), for debugging. 50 | 51 | The listener will continually poll the Beanstalk queue and can be left running. 52 | 53 | #### Validating extracted buckets 54 | 55 | To validate extracted buckets, run `python main.py --feedToValidator -f data/extraction/bing/buckets_output.txt --label bing`, where `-f` is the name of the file containing buckets and `--label` is a label to identify the source. 56 | 57 | This will feed all found buckets to the Beanstalk queue, which will be processed by the listener. 58 | 59 | #### Combining buckets 60 | 61 | In order to combine buckets, the `gather_all_buckets.sh` script in `final_output` can be run to aggregate and deduplicate found buckets across all three sources. This will create three files: `all_platforms_private.txt`, which contains all private buckets, `all_platforms_public.txt`, which contains all public buckets, and `all_platforms_all.txt` which contains all buckets across all platforms. 62 | 63 | ### Generation 64 | 65 | The generation phase generates new bucket names based on previously seen buckets. 66 | 67 | The generators rely on the `all_platforms_private.txt` and `all_platforms_public.txt` files in `final_output`. Thus, after running the validator on extracted sources, be sure to run `final_output/gather_all_buckets.sh` to generate these files. 68 | 69 | In order to run the generators, you will need to add the project to your PYTHONPATH. 
You can do this by running the following: 70 | 71 | ``` 72 | source bucket_generation/add_to_path.sh 73 | python bucket_generation/generators/<generator>/guesser.py [--public] [--num_trials N] 74 | ``` 75 | 76 | Examples to generate buckets using different algorithms: 77 | 78 | LSTM RNN Generator: `python bucket_generation/generators/rnn/guesser.py rnn --stream --forward` 79 | 80 | LSTM RNN Train: `python bucket_generation/generators/rnn/guesser.py rnn --train --forward` 81 | 82 | Token PCFG: `python bucket_generation/generators/token_pcfg/guesser.py` 83 | 84 | Character 5-Grams: `python bucket_generation/generators/character_grams/guesser.py` 85 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from gevent import monkey as curious_george 2 | curious_george.patch_all(thread=False, select=False) 3 | 4 | from dotenv import load_dotenv 5 | load_dotenv() 6 | 7 | from bucket_extraction.bing.bing import getBucketsFromBing 8 | from bucket_extraction.virustotal.virustotal import getS3IPs 9 | from bucket_extraction.grayhatwarfare.grayhatwarfare import getGrayhatWarfare 10 | from bucket_extraction.virustotal.virustotal import runZmap 11 | from bucket_extraction.virustotal.virustotal import lookup 12 | from bucket_generation.generators.n_grams.n_grams import streamNGramCandidates 13 | from bucket_generation.evaulator.evaluate_performance import evaluatePerformance, fetchExtractedNames 14 | 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser(description='Stratosphere: Discover public cloud storage buckets') 18 | parser.add_argument('--backward', help='Train the RNN backwards', action='store_true') 19 | parser.add_argument('--bing', help='Get S3 bucket names from Bing API searches', action='store_true') 20 | parser.add_argument('--eval', help='Evaluate generator performance with provided date of form %%Y_%%M_%%d.', type=str) 21 | parser.add_argument('--fetch', help='Load dataset of JUST extracted bucket names.', action='store_true') 22 | parser.add_argument('--lookup', help='Checks all IPs against VirusTotal', action='store_true') 23 | parser.add_argument('--ips', help='Fetch S3 IPs for Zmap', action='store_true') 24 | parser.add_argument('-n', help='Number of requests to make', type=int,) 25 | parser.add_argument('-i', help='Network interface for zgrab to use') 26 | parser.add_argument('--label', help='Output filename label') 27 | parser.add_argument('--ngrams', help='Generate candidates using NGrams', action='store_true') 28 | parser.add_argument('--ngrams2', help='Generate candidates using NGrams', action='store_true') 29 | parser.add_argument('--ngrams3', help='Generate candidates using NGrams', action='store_true') 30 | parser.add_argument('--templates', help='Template bucket names', action='store_true') 31 | parser.add_argument('-f', help='Path to buckets_output.txt file', type=str) 32 | parser.add_argument('--type', help='Service type', type=str) 33 | parser.add_argument('--rnn', help='Run RNN LSTM generator.', action='store_true') 34 | parser.add_argument('--train', help='Train', action='store_true') 35 | parser.add_argument('--stream', help='Continuously stream candidates', action='store_true') 36 | parser.add_argument('-r', help='Number of candidates per second to generate', type=int) 37 | parser.add_argument('--pingAll', help='Ping all S3 IPs with Zmap', action='store_true') 38 | parser.add_argument('--pcfg', help='Run PCFG generator', action='store_true') 39 | 
parser.add_argument('--token', help='Run token PCFG generator', action='store_true') 40 | parser.add_argument('--farsight', help='Get bucket names from FarSight', action='store_true') 41 | parser.add_argument('--virustotal', help='Get S3 bucket names from VirusTotal', action='store_true') 42 | parser.add_argument('--grayhatwarfare', help='Get bucket names from grayhatwarfare', action='store_true') 43 | parser.add_argument('--feedToValidator', help='Send buckets_output.txt candidates to validator', action='store_true') 44 | parser.add_argument('--replayExisting', help='Replay existing buckets with a new domain', action='store_true') 45 | parser.add_argument('--domain', help='Root domain to look up', type=str) 46 | 47 | args = parser.parse_args() 48 | if args.bing: 49 | getBucketsFromBing() 50 | elif args.grayhatwarfare: 51 | getGrayhatWarfare() 52 | elif args.virustotal: 53 | if args.ips: 54 | getS3IPs() 55 | if args.pingAll: 56 | runZmap() 57 | if args.lookup: 58 | lookup(args.n) 59 | elif args.farsight: 60 | if args.f: 61 | from bucket_extraction.farsight.farsight import lookupFile 62 | lookupFile(args.f, args.type) 63 | else: 64 | from bucket_extraction.farsight.farsight import lookup 65 | lookup([args.domain]) 66 | elif args.ngrams: 67 | streamNGramCandidates(args.r) 68 | elif args.ngrams2: 69 | from bucket_generation.generators.n_grams2.n_grams2 import streamNGrams2 70 | streamNGrams2(args.r) 71 | elif args.ngrams3: 72 | from bucket_generation.generators.n_grams2.n_grams2 import streamNGrams3 73 | streamNGrams3(args.r) 74 | elif args.templates: 75 | from bucket_generation.generators.templates.templates import steamCandidates 76 | steamCandidates(args.n) 77 | elif args.replayExisting: 78 | from bucket_generation.replayExisting import replayExisting 79 | replayExisting(args.f, args.label) 80 | elif args.rnn: 81 | from bucket_generation.generators.rnn.rnn import streamRNNGuesses, runTraining 82 | if args.stream: 83 | streamRNNGuesses(not args.backward) 84 | if args.train: 85 | runTraining(not args.backward) 86 | elif args.feedToValidator: 87 | from bucket_extraction.feed_to_validator.feed_to_validator import feedToValidator 88 | feedToValidator(args.f, args.label) 89 | elif args.pcfg: 90 | from bucket_generation.generators.pcfg.guesser import generatePCFGCandidates 91 | generatePCFGCandidates() 92 | elif args.token: 93 | from bucket_generation.generators.token_pcfg.guesser import generatePCFGCandidates 94 | generatePCFGCandidates() 95 | elif args.eval: 96 | evaluatePerformance(args.eval) 97 | elif args.fetch: 98 | fetchExtractedNames() 99 | else: 100 | print("Error: command not found.") 101 | parser.print_help() 102 | 103 | -------------------------------------------------------------------------------- /bucket_generation/utils.py: -------------------------------------------------------------------------------- 1 | from argparse import Action 2 | from enum import Enum, auto 3 | import random 4 | import time 5 | from utils import getBucketsFromText 6 | from pystalk import BeanstalkClient 7 | import json 8 | import argparse 9 | 10 | 11 | def randLines(file, n): 12 | """ 13 | Grabs n random lines in a file using Algorithm R. 
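(Reservoir sampling: the first n lines fill the reservoir, and each later line overwrites a randomly chosen slot with steadily decreasing probability, so a bounded, roughly uniform sample is kept without loading the whole file into memory.)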
14 | """ 15 | lines = [] 16 | for num, line in enumerate(file): 17 | if num < n: 18 | lines.append(line) 19 | else: 20 | randNum = random.randrange(0, num) 21 | if randNum < n: 22 | lines[randNum] = line 23 | return lines 24 | 25 | def readFile(file, removeSuffix=False): 26 | result = [] 27 | with open(file, 'r') as f: 28 | for line in f: 29 | if line is not None: 30 | stripped = line.strip() 31 | if removeSuffix and ".s3.amazonaws.com" in stripped: 32 | stripped = stripped.split(".s3.amazonaws.com")[0] 33 | result.append(stripped) 34 | return result 35 | 36 | def readFullBucketNamesFromFile(path, timePeriod=None): 37 | result = set([]) 38 | firstTimestamp = None 39 | with open(path, 'r') as f: 40 | for line in f: 41 | splitLine = line.strip().split(",") 42 | if len(splitLine) == 2: 43 | bucket, timestamp = splitLine 44 | timestamp = int(timestamp) 45 | if timePeriod: 46 | if firstTimestamp: 47 | if timestamp - firstTimestamp > timePeriod: 48 | print(path, timestamp, firstTimestamp) 49 | break 50 | else: 51 | firstTimestamp = timestamp 52 | result.add(bucket) 53 | return result 54 | 55 | class BucketType(Enum): 56 | PUBLIC = auto() 57 | PRIVATE = auto() 58 | 59 | def getFullBuckets(bucketType): 60 | assert ( 61 | bucketType == BucketType.PUBLIC or bucketType == BucketType.PRIVATE 62 | ), "Bucket type must be one of PUBLIC/PRIVATE" 63 | if bucketType == BucketType.PUBLIC: 64 | return readFullBucketNamesFromFile("./final_output/all_platforms_public.txt") 65 | if bucketType == BucketType.PRIVATE: 66 | return readFullBucketNamesFromFile("./final_output/all_platforms_private.txt") 67 | 68 | class GeneratorBeanstalkClient(BeanstalkClient): 69 | 70 | def __init__(self, address, port): 71 | super().__init__(address, port) 72 | 73 | def put_job(self, string, **kwargs): 74 | # first, check if the job queue isn't too big. 75 | # if so, sleep proportional to size so that we should slow down at around 10M queue size. 76 | jobsReady = super().stats()["current-jobs-ready"] 77 | time.sleep(jobsReady/1e5) 78 | super().put_job(string, **kwargs) 79 | 80 | 81 | def getExistingAlreadyGuessedBuckets(name, public=False): 82 | """ 83 | Given the generator name, load the dataset comprised 84 | of already guessed buckets from that generator that also happen to exist. 85 | :param: The generator name. 86 | """ 87 | return getExistingBuckets(public=public) & \ 88 | readBucketsFromFile(f"./data/generation/{name}.txt") 89 | 90 | def getExistingBuckets(public=False): 91 | filePath = './final_output/all_platforms_all.txt' 92 | if public: 93 | filePath = "./final_output/all_platforms_public.txt" 94 | return readBucketsFromFile(filePath) 95 | 96 | def readBucketsFromFile(path): 97 | try: 98 | with open(path, "r") as f: 99 | return set([bucket for line in f.readlines() for bucket in getBucketsFromText(line)]) 100 | except FileNotFoundError: 101 | return set() 102 | 103 | 104 | def getStartBucketNames(args): 105 | if args.experiment: 106 | import bucket_generation.evaulator.evaluate_performance as evaluate_performance 107 | candidates = evaluate_performance.loadExtractedNames("2020_07_20") 108 | if args.public: 109 | candidates &= getExistingBuckets(public=True) 110 | return candidates 111 | return None 112 | 113 | class ProfilerType(Enum): 114 | TRAIN = "train" 115 | GENERATE = "generate" 116 | 117 | class Profiler: 118 | 119 | def __init__(self, profilerType, name): 120 | assert profilerType in ProfilerType, "Don't know where to write these profiled results." 
121 | assert type(name) == str, "Name must be a string corresponding to the profiler." 122 | self.type = profilerType 123 | self.name = name 124 | self.bucket_name = "" 125 | 126 | def __enter__(self): 127 | self.start = time.process_time() 128 | return self 129 | 130 | def bucket(self, bucket): 131 | self.bucket_name = bucket 132 | 133 | def __exit__(self, exc_type, exc_vlaue, exc_tb): 134 | self.end = time.process_time() 135 | with open(f"./data/timing/{self.type.value}/{self.name}", "a+") as f: 136 | f.write(f"{self.bucket_name},{self.end - self.start},{time.time()}\n") 137 | 138 | 139 | def getBeanstalkClient(port=None): 140 | """ 141 | Start up a beanstalkclient. 142 | """ 143 | config = {} 144 | if not port: 145 | with open('./bucket_validation/listener-config.json', 'r') as f: 146 | config = json.load(f) 147 | port = config["BeanstalkHost"].split(":")[1] 148 | return GeneratorBeanstalkClient("127.0.0.1", port) 149 | 150 | 151 | def getPreviousCandidates(): 152 | candidates = set() 153 | with open('./final_output/all_platforms_public.txt', 'r') as f: 154 | for line in f: 155 | try: 156 | cands = list(getBucketsFromText(line)) 157 | if len(cands) > 0: 158 | candidates.add(cands[0]) 159 | except Exception as e: 160 | pass 161 | return candidates 162 | 163 | def addArguments(parser): 164 | parser.add_argument("name", type=str, help="generator identifier") 165 | parser.add_argument("--num_trials", type=str, help="Number of trials to run generator.") 166 | parser.add_argument("--port", type=int, help="The beanstalk job queue port.") 167 | parser.add_argument("--public", action="store_true", help="Only load the public buckets in our models.") -------------------------------------------------------------------------------- /bucket_generation/generators/token_pcfg/guesser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inspired from https://web.eecs.utk.edu/~mschucha/netsec/readings/cfgPass.pdf 3 | Predict token based on distribution of tokens bucketd in types as CFG. 4 | To begin, we will just do a really primitive survey: 5 | break bucket into delimiters, characters of a certain length, and 6 | numbers of certain length. We can definitely extend this to 7 | with more specific types. 8 | Types : Ci -> i characters, Ni -> i numbers, 9 | B -> T-T-T 10 | """ 11 | import argparse 12 | from collections import Counter 13 | from enum import Enum, auto 14 | import re 15 | 16 | import numpy as np 17 | 18 | from bucket_extraction.utils.extract_utils import getBucketsFromText 19 | import bucket_generation.utils as gen_utils 20 | 21 | class Type(Enum): 22 | """ 23 | Token types. 
These source files are sourced from the data/aux text files 24 | """ 25 | TECH = auto() # tech term from 26 | TEMPLATE = auto() 27 | WORD = auto() 28 | TLD = auto() 29 | FILE = auto() 30 | NUMBER = auto() 31 | COMPOUND = auto() 32 | DOMAIN = auto() 33 | OTHER = auto() 34 | 35 | def getType(token, tech_terms, suffixes, file_extensions, domains, dictionary_words): 36 | """ 37 | :param: token a string that we are tying to determine its type 38 | :params: sets of strings that we will check for membership 39 | :return: the token type as an enum 40 | """ 41 | if len(token) == 0: 42 | return '' 43 | if token in tech_terms: 44 | return Type.TECH 45 | if token in suffixes: 46 | return Type.TLD 47 | elif token in file_extensions or token[:-1] in file_extensions: 48 | return Type.FILE 49 | elif token in domains or token[:-1] in domains: 50 | return Type.DOMAIN 51 | elif token in dictionary_words or token[:-1] in dictionary_words: 52 | return Type.WORD 53 | elif token.isdigit(): 54 | return Type.NUMBER 55 | else: 56 | for i in range(len(token)): 57 | if token[:i] in dictionary_words and token[i:] in dictionary_words: 58 | return Type.COMPOUND 59 | return Type.OTHER 60 | 61 | def loadTypeSets(): 62 | """ 63 | Reads the type sets from the right text files and returns their sets 64 | """ 65 | # ./data/aux/dictionary.txt' -- from https://github.com/dwyl/english-words/ 66 | # instead, let's use SCOWL like Continella for better comparison. (/usr/share/dict/wodrds) 67 | with open('/usr/share/dict/words') as f: 68 | words = set([l.lower().strip() for l in f.readlines()]) 69 | 70 | # TLDs but a little less strict (i.e. co.uk is not a TLD but a common suffix) 71 | # from https://publicsuffix.org/list/public_suffix_list.dat 72 | with open('./data/aux/public_suffix_list.dat') as f: 73 | suffixes = set([l.lower().strip() for l in f.readlines()][1:]) 74 | 75 | # Just sourced manually via the "Other" section 76 | with open('./data/aux/tech_terms.txt') as f: 77 | techTerms = set([l.lower().strip() for l in f.readlines()]) 78 | 79 | # From https://s3-us-west-1.amazonaws.com/umbrella-static/index.html 80 | with open('./data/aux/top-1e5-domains.txt') as f: 81 | domains = set([line.split('.')[-2] for line in f.readlines() if len(line.split('.')) >= 2]) 82 | 83 | # sourced manually form wikipedia: https://en.wikipedia.org/wiki/List_of_file_formats 84 | with open('./data/aux/wikipedia-file-extensions.txt') as f: 85 | files = set([l.lower().strip() for l in f.readlines()][1:]) 86 | return techTerms, suffixes, files, domains, words 87 | 88 | 89 | def updateCounters(buckets): 90 | """ 91 | Generate distributions for each CFG node 92 | :return: a counter for templates, dictionary words, tech words, files, domains, compound words, TLDS, and numbers 93 | """ 94 | 95 | techTerms, suffixes, files, domains, words = loadTypeSets() 96 | 97 | counters = { key: Counter() for key in Type } 98 | delimiters = re.compile('[-._]') 99 | for bucket in buckets: 100 | tokens = delimiters.split(bucket.lower()) 101 | 102 | bucketDelimiters = list(delimiters.finditer(bucket)) 103 | template = '' 104 | for i, token in enumerate(tokens): 105 | tokenType = getType(token, techTerms, suffixes, files, domains, words) 106 | if tokenType != '': 107 | template += tokenType.name 108 | counters[tokenType][token] += 1 109 | if i != len(tokens) - 1: 110 | template += bucketDelimiters[i].group() 111 | 112 | counters[Type.TEMPLATE][template] += 1 113 | return counters 114 | 115 | 116 | def sampleFromCounter(counter): 117 | total = sum(counter.values()) 118 | 
return np.random.choice([k for k,v in counter.items()], p=[v/total for k,v in counter.items()]) 119 | 120 | def generatePCFGCandidates(startingCandidates=None, beanstalkPort=None, name="token_pcfg", numTrials=float("inf"), public=False): 121 | beanstalkClient = gen_utils.getBeanstalkClient(port=beanstalkPort) 122 | previouslySeen = startingCandidates | gen_utils.readBucketsFromFile(f"./data/generation/{name}.txt") 123 | 124 | # Randomly generate template according to distro 125 | delimiters = re.compile('[-._]') 126 | while numTrials > 0: 127 | 128 | # Every 10,000 guesses, update the PCFG. 129 | print("Updating PCFG.") 130 | with gen_utils.Profiler(gen_utils.ProfilerType.TRAIN, name): 131 | candidates = startingCandidates | gen_utils.getExistingAlreadyGuessedBuckets(name, public=public) 132 | counters = updateCounters(candidates) 133 | 134 | for i in range(int(1e4)): 135 | with gen_utils.Profiler(gen_utils.ProfilerType.GENERATE, name) as p: 136 | template = sampleFromCounter(counters[Type.TEMPLATE]) 137 | tokens = delimiters.split(template) 138 | templateDelimiters = list(delimiters.finditer(template)) 139 | bucket = '' 140 | for idx, token in enumerate(tokens): 141 | if token != '': 142 | bucket += sampleFromCounter(counters[Type[token]]) 143 | if idx != len(tokens) - 1: 144 | bucket += templateDelimiters[idx].group() 145 | p.bucket(bucket) 146 | if bucket not in previouslySeen: 147 | numTrials -= 1 148 | previouslySeen.add(bucket) 149 | print('CAND:', bucket) 150 | beanstalkClient.put_job(f"generation/{name},{bucket}") 151 | 152 | if __name__ == "__main__": 153 | parser = argparse.ArgumentParser(description='Run the Token PCFG generator.') 154 | gen_utils.addArguments(parser) 155 | args = parser.parse_args() 156 | candidates = gen_utils.getStartBucketNames(args) 157 | generatePCFGCandidates(name=args.name, startingCandidates=candidates, public=args.public, numTrials=int(args.num_trials) or float("inf")) 158 | 159 | 160 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /bucket_generation/generators/rnn/guesser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Jack Cable and Drew Gregory 3 | LSTM Generator to produce S3 Bucket Candidate Names. 4 | This uses a one-to-many design so that we can generate names from scratch (only basing off of a single character). 
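The model is seeded with a starting character drawn from the observed starting-character distribution, then predicts one character at a time until the termination character '\r' is sampled (see generateText below).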
5 | Inspired by: 6 | - https://towardsdatascience.com/generating-text-using-a-recurrent-neural-network-1c3bfee27a5e 7 | - https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py 8 | - https://stackoverflow.com/questions/38714959/understanding-keras-lstms?rq=1 9 | Here was a useful answer to why the output dimensions are for multiple sequences: 10 | - https://stackoverflow.com/questions/43702481/why-does-keras-lstm-batch-size-used-for-prediction-have-to-be-the-same-as-fittin 11 | """ 12 | import argparse 13 | from datetime import date 14 | import json 15 | import numpy as np 16 | import random 17 | import time 18 | 19 | from keras.callbacks import ModelCheckpoint, LambdaCallback 20 | from keras.layers import Activation, Dense, Flatten, LSTM, Masking 21 | from keras.models import Sequential, load_model 22 | from keras.callbacks import ReduceLROnPlateau 23 | from keras.optimizers import RMSprop 24 | 25 | import bucket_generation.utils as generation_utils 26 | from bucket_extraction.utils.extract_utils import getBucketsFromText 27 | from bucket_generation.utils import getExistingAlreadyGuessedBuckets, getExistingBuckets 28 | 29 | 30 | beanstalkClient = None 31 | 32 | def sample(preds, temperature=1.0): 33 | # helper function to sample an index from a probability array 34 | preds = np.asarray(preds).astype('float64') 35 | preds = np.log(preds) / temperature 36 | exp_preds = np.exp(preds) 37 | preds = exp_preds / np.sum(exp_preds) 38 | probas = np.random.multinomial(1, preds, 1) 39 | return np.argmax(probas) 40 | 41 | def standardizeText(line, forward=True): 42 | """ 43 | Remove whitespace, lowercase, 44 | and end with termination character \r 45 | """ 46 | text = line.strip().lower()[:63] 47 | return (text if forward else text[::-1]) + '\r' 48 | 49 | def buildModel(uniqueChars): 50 | # This is because we have variable length input sequences and thus different 51 | # dimensions, see https://github.com/keras-team/keras/issues/6776 52 | 53 | hiddenUnits = 64 54 | model = Sequential() 55 | inShape = (64, uniqueChars) # bucket names need to be between 3-63 chars 56 | model.add( 57 | LSTM( 58 | hiddenUnits, input_shape=inShape, 59 | return_sequences=True, 60 | ) 61 | ) 62 | model.add(Flatten()) # https://github.com/keras-team/keras/issues/6351 63 | model.add(Dense(uniqueChars, activation='softmax')) 64 | optimizer = RMSprop(lr=0.01) 65 | model.compile(loss='categorical_crossentropy', optimizer=optimizer) 66 | return model 67 | 68 | def addNamesToCorpus(x,y, names, startingCharCounts, forward): 69 | for bucket_name in names: 70 | goodName = standardizeText(bucket_name,forward=forward) 71 | for i in range(len(goodName)-1): 72 | x.append(goodName[:i+1]) 73 | y.append(goodName[i+1]) 74 | startC = goodName[0] 75 | if startC not in startingCharCounts: 76 | startingCharCounts[startC] = 0 77 | startingCharCounts[startC] += 1 78 | 79 | 80 | def addNamesToCorpusFromFile(x,y, filename, startingCharCounts, forward): 81 | buckets = set(random.sample(generation_utils.readBucketsFromFile(filename), k=int(1e4))) 82 | addNamesToCorpus(x,y, buckets, startingCharCounts, forward) 83 | 84 | 85 | def generateText(startingCounts, model, indicesChar, charIndices, forward): 86 | startingChar = startingCounts[ 87 | sample([c[1] for c in startingCounts]) 88 | ][0] 89 | sentence = startingChar 90 | for _ in range(63): 91 | x_pred = np.zeros((1, 64, 40)) 92 | for t, char in enumerate(sentence): 93 | x_pred[0, t, charIndices[char]] = 1. 
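        # Predict the next-character distribution from the one-hot encoded prefix, then sample the next character from it.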
94 | preds = model.predict(x_pred, verbose=0)[0] 95 | nextIndex = sample(preds) 96 | if nextIndex == charIndices['\r']: 97 | if len(sentence) <= 3: 98 | continue 99 | else: 100 | break 101 | sentence += indicesChar[str(nextIndex)] 102 | return sentence if forward else sentence[::-1] 103 | 104 | def onEpochEnd(epoch, logs, startingCounts, model, indicesChar, charIndices, forward): 105 | print('FINISHED EPOCH', epoch) 106 | for _ in range(10): 107 | print(generateText(startingCounts, model, indicesChar, charIndices, forward)) 108 | 109 | 110 | def trainModel( 111 | startingCharCounts, model, filepath, charIndices, indicesChar, forward, 112 | candidates=None, name=None, public=False): 113 | 114 | # Collect all bucket names and starting character distribution 115 | sentences = [] 116 | nextChars = [] 117 | candidates = candidates or getExistingBuckets(public=public) 118 | candidates |= getExistingAlreadyGuessedBuckets(name, public=public) 119 | 120 | # This many candidates wouldn't fit in memory, so let's grab 10,000 buckets at random. 121 | sampledBucketNames = random.sample( 122 | candidates, 123 | int(1e4) 124 | ) 125 | addNamesToCorpus(sentences, nextChars, sampledBucketNames, startingCharCounts, forward) 126 | 127 | 128 | x = np.zeros((len(sentences), 64, 40), dtype=np.bool) 129 | y = np.zeros((len(sentences), 40), dtype=np.bool) 130 | 131 | for i, sentence in enumerate(sentences): 132 | for t, char in enumerate(sentence): 133 | x[i, t, charIndices[char]] = 1 134 | y[i, charIndices[nextChars[i]]] = 1 135 | print('NUM SENTENCES', len(sentences)) 136 | startingCounts = list(startingCharCounts.items()) 137 | 138 | checkpoint = ModelCheckpoint(filepath, monitor='loss', 139 | verbose=1, save_best_only=True, 140 | mode='min') 141 | checkpoint_backup = ModelCheckpoint("{}.{}".format(filepath, date.today().strftime("%Y_%m_%d")),monitor='loss', 142 | verbose=1, save_best_only=True, 143 | mode='min') 144 | reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, 145 | patience=1, min_lr=0.00001) 146 | 147 | print_callback = LambdaCallback(on_epoch_end=lambda x,y: onEpochEnd( 148 | x, y,startingCounts, model, indicesChar, charIndices, forward 149 | )) 150 | callbacks = [print_callback, checkpoint, checkpoint_backup, reduce_lr] 151 | print('FITTING') 152 | print(len(x),len(y)) 153 | while True: 154 | try: 155 | model.fit(x, y, batch_size=1000, epochs=10, callbacks=callbacks, use_multiprocessing=True) 156 | break 157 | except OSError as e: 158 | # Just retry, after waiting some time. 159 | time.sleep(17) 160 | return model 161 | 162 | def makeGuesses(model, startingCharCounts, charIndices, indicesChar, forward, name="name", previous=None): 163 | candidates = previous or set() 164 | startingCounts = list(startingCharCounts.items()) 165 | for _ in range(10000): 166 | with generation_utils.Profiler(generation_utils.ProfilerType.GENERATE, name) as p: 167 | cand = generateText(startingCounts, model, indicesChar, charIndices, forward) 168 | p.bucket(cand) 169 | print(len(candidates)) 170 | if cand not in candidates: 171 | print('CAND', cand) 172 | beanstalkClient.put_job(f"generation/{name},{cand}") 173 | candidates.add(cand) 174 | else: 175 | print("ALREADY GUESSED") 176 | 177 | 178 | def runTraining(name="rnn", forward=True, filepath=None, candidates=None, public=False): 179 | chars = 40 180 | assert filepath, "No weights filepath provided." 
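    # Resume from previously saved weights when available; otherwise build a fresh model below.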
181 | try: 182 | model = load_model(filepath) 183 | except Exception as e: 184 | print("COULDNT LOAD MODEL", e) 185 | model = buildModel(chars) 186 | model.summary() 187 | charIndices = {} 188 | with open('./data/generation/rnn/charIndices.json') as f: 189 | charIndices = json.load(f,) 190 | indicesChar = {} 191 | with open('./data/generation/rnn/indicesChar.json') as f: 192 | indicesChar = json.load(f,) 193 | startingCharCounts = {} 194 | while True: 195 | with generation_utils.Profiler(generation_utils.ProfilerType.TRAIN, name): 196 | model = trainModel( 197 | startingCharCounts, model, filepath, charIndices, indicesChar,forward, 198 | candidates=candidates, 199 | name=name, public=public 200 | ) 201 | 202 | def streamRNNGuesses( 203 | forward=True, beanstalkPort=None, name="rnn", numTrials=None, weights_path=None, seedSet=None 204 | ): 205 | 206 | if not numTrials: 207 | numTrials = float("inf") 208 | 209 | global beanstalkClient 210 | beanstalkClient = generation_utils.getBeanstalkClient(port=beanstalkPort) 211 | 212 | filepath = weights_path 213 | charIndices = {} 214 | with open('./data/generation/rnn/charIndices.json') as f: 215 | charIndices = json.load(f,) 216 | indicesChar = {} 217 | with open('./data/generation/rnn/indicesChar.json') as f: 218 | indicesChar = json.load(f,) 219 | startingCharCounts = {} 220 | sentences = [] 221 | nextChars = [] 222 | numTrials /= 1e4 223 | previouslySeen = generation_utils.readBucketsFromFile(f"data/generation/{name}.txt") | (seedSet or set()) 224 | # This is just to load up the startingCharCounts. 225 | addNamesToCorpusFromFile(sentences, nextChars, './final_output/all_platforms_all.txt', startingCharCounts, forward) 226 | sentences = [] 227 | nextChars = [] 228 | while numTrials > 0: 229 | try: 230 | model = load_model(filepath) 231 | except Exception as e: 232 | print("COULDNT LOAD MODEL, WAITING A MINUTE", e) 233 | time.sleep(60) 234 | continue 235 | model.summary() 236 | makeGuesses(model, startingCharCounts, charIndices, indicesChar, forward, name=name, previous=previouslySeen) 237 | numTrials -= 1 238 | 239 | 240 | if __name__ == "__main__": 241 | parser = argparse.ArgumentParser(description='Train rnn.') 242 | generation_utils.addArguments(parser) 243 | parser.add_argument("--train", action="store_true", help="Train rnn instead of stream guesses.") 244 | parser.add_argument("--forward", action="store_true", help="Run the rnn in forward vs. backward mode.") 245 | parser.add_argument("--stream", action="store_true", help="Stream guesses based off of the model.") 246 | 247 | args = parser.parse_args() 248 | name = args.name or "rnn" 249 | assert args.train or args.stream, "Must have one of --stream or --train." 
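    # Weights files are keyed by generator name and direction, e.g. data/generation/rnn/rnn_weights_forward.hdf5.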
250 | weights_path = "data/generation/rnn/{}_weights_{}.hdf5".format( 251 | name, 252 | "forward" if args.forward else "backward", 253 | ) 254 | if args.stream: 255 | extractedCandidates = generation_utils.getStartBucketNames(args) if args.experiment else None 256 | streamRNNGuesses( 257 | beanstalkPort=args.port, 258 | forward=args.forward, 259 | name=name, 260 | numTrials=int(args.num_trials) or float("inf"), 261 | weights_path=weights_path, 262 | seedSet=extractedCandidates, 263 | ) 264 | elif args.train: 265 | 266 | extractedCandidates = generation_utils.getStartBucketNames(args) if args.experiment else None 267 | runTraining( 268 | name=name, 269 | forward=args.forward, 270 | filepath=weights_path, 271 | candidates=extractedCandidates, 272 | public=args.public, 273 | ) 274 | -------------------------------------------------------------------------------- /bucket_validation/listener.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | /* 4 | This Listener takes in lines of IP addresses as input and delgates their results 5 | to processes. 6 | */ 7 | 8 | import ( 9 | "bufio" 10 | "encoding/json" 11 | "fmt" 12 | "io" 13 | "log" 14 | "os" 15 | "os/exec" 16 | "os/signal" 17 | "strconv" 18 | "strings" 19 | "sync" 20 | "syscall" 21 | "time" 22 | 23 | "github.com/willf/bloom" 24 | 25 | "github.com/beanstalkd/go-beanstalk" 26 | "github.com/spf13/viper" 27 | ) 28 | 29 | // Configuration class allows you to specify the source ips and read limit (in KB) 30 | type Configuration struct { 31 | NumSenders int 32 | BeanstalkHost string 33 | ReadLimitPerHost int 34 | SourceIPs []string 35 | } 36 | 37 | /* 38 | Validator represents a process that takes in S3 hostnames and resolves their results to an 39 | output file. 40 | */ 41 | type Validator struct { 42 | stdIn io.WriteCloser 43 | stdOut io.ReadCloser 44 | cmd *exec.Cmd 45 | ip string 46 | lastResponseReceived time.Time 47 | lastResponseMutex sync.Mutex 48 | } 49 | 50 | //OutputFile represented the bucketed output types, abstracted just in case we want more member fields. 
51 | type OutputFile struct { 52 | f *os.File 53 | } 54 | 55 | var config Configuration 56 | 57 | // The list of hosts to try a bucket against if no host is provided 58 | var hosts []string 59 | 60 | // The list of hosts that are accepted 61 | var acceptedHosts []string 62 | 63 | var validators []*Validator 64 | 65 | var responseChan chan string 66 | 67 | // The maximum number of times a single bucket is tried 68 | var MAX_RETRIES = 3 69 | 70 | type OpenRequest struct { 71 | provider string 72 | source string 73 | numAttempts int 74 | lastTried time.Time 75 | } 76 | 77 | // A map from bucket names to their sources and number of retiries 78 | var openRequests map[string]OpenRequest = make(map[string]OpenRequest) 79 | var openRequestsMutex sync.Mutex 80 | 81 | // Mutex for bloom filter 82 | var previouslySeenMutex sync.Mutex 83 | 84 | func handleErrorFatal(err error) { 85 | if err != nil { 86 | log.Fatal(err) 87 | } 88 | } 89 | 90 | func handleError(err error) { 91 | if err != nil { 92 | log.Println(err) 93 | } 94 | } 95 | 96 | func initializeZGrab(numSenders int, readLimitPerHost int, sourceIP string) *Validator { 97 | path, ok := os.LookupEnv("GOPATH") 98 | if ok { 99 | cmd := exec.Command(path+"/bin/zgrab2", "http", 100 | "--use-https", 101 | "--port", "443", 102 | "--read-limit-per-host", fmt.Sprintf("%d", readLimitPerHost), 103 | "--senders", fmt.Sprintf("%d", numSenders), 104 | "--source-ip", sourceIP, 105 | "--flush") 106 | stdout, err := cmd.StdoutPipe() 107 | handleError(err) 108 | cmd.Stderr = os.Stderr 109 | stdin, err := cmd.StdinPipe() 110 | handleError(err) 111 | err = cmd.Start() 112 | handleError(err) 113 | fmt.Printf("ZGrab running with %d senders on IP %s \n", numSenders, sourceIP) 114 | return &Validator{stdin, stdout, cmd, sourceIP, time.Now(), sync.Mutex{}} 115 | } 116 | fmt.Printf("GOPATH not set. 
%s", path) 117 | 118 | return nil 119 | } 120 | 121 | func delegateRequestJobs(files map[string]OutputFile, beanstalkHost string, prevSeen *bloom.BloomFilter) { 122 | jobQueue, err := beanstalk.Dial("tcp", beanstalkHost) 123 | handleErrorFatal(err) 124 | for { 125 | for i, v := range validators { 126 | v.lastResponseMutex.Lock() 127 | // If more than one minute has elapsed, restart validator 128 | if time.Since(v.lastResponseReceived) > time.Minute { 129 | closeValidator(v) 130 | initiateValidator(i) 131 | receiveResponse(validators[i]) 132 | v.lastResponseReceived = time.Now() 133 | } 134 | v.lastResponseMutex.Unlock() 135 | // Check if any buckets should be retried 136 | shouldContinue := false 137 | openRequestsMutex.Lock() 138 | for bucket, openRequest := range openRequests { 139 | if openRequest.numAttempts > MAX_RETRIES { 140 | delete(openRequests, bucket) 141 | continue 142 | } 143 | if time.Since(openRequest.lastTried) > 60*time.Minute { 144 | spawnBucket(bucket, openRequest.provider, openRequest.source, v, openRequest.numAttempts+1, false, prevSeen) 145 | shouldContinue = true 146 | break 147 | } 148 | } 149 | openRequestsMutex.Unlock() 150 | if shouldContinue { 151 | continue 152 | } 153 | 154 | id, body, err := jobQueue.Reserve(5 * time.Second) 155 | log.Println("Reserved job " + string(body)) 156 | if err != nil { 157 | if !strings.Contains(err.Error(), "timeout") { // Don't print if it's a timeout 158 | log.Println("Error reserving job: " + err.Error()) 159 | } 160 | continue 161 | } 162 | jobContents := strings.Split(string(body), ",") 163 | if len(jobContents) != 2 { 164 | log.Println("INVALID FORMAT FOR " + string(body) + ": NEEDS ',' DELIMITER") 165 | err = jobQueue.Delete(id) 166 | continue 167 | } 168 | path := jobContents[0] 169 | bucket := jobContents[1] 170 | 171 | // If a host is already provided, we use only that host 172 | hostFound := false 173 | for _, host := range acceptedHosts { 174 | if strings.Contains(bucket, host) { 175 | spawnBucket(bucket, host, path, v, 1, true, prevSeen) 176 | hostFound = true 177 | break 178 | } 179 | } 180 | 181 | // Otherwise, we try on all hosts 182 | if !hostFound { 183 | for _, host := range hosts { 184 | bucketName := bucket 185 | if host == "oss-us-east-1.aliyuncs.com" { 186 | // For alibaba, replace dots with hyphens 187 | bucketName = strings.Replace(bucketName, ".", "-", -1) 188 | } 189 | spawnBucket(bucketName+"."+host, host, path, v, 1, true, prevSeen) 190 | } 191 | } 192 | 193 | err = jobQueue.Delete(id) 194 | handleError(err) 195 | } 196 | log.Println("Sleeping for 0.25 second") 197 | time.Sleep(time.Duration(250) * time.Millisecond) 198 | } 199 | 200 | } 201 | 202 | func spawnBucket(bucket string, host string, path string, v *Validator, count int, shouldLock bool, prevSeen *bloom.BloomFilter) { 203 | 204 | // First, confirm that we have not tried the bucket before. 205 | // Currently commented out - TODO: add option to use bloom filter 206 | // previouslySeenMutex.Lock() 207 | // old := prevSeen.Test([]byte(bucket)) 208 | // previouslySeenMutex.Unlock() 209 | old := false 210 | 211 | if old { // The bucket has already been tried: just ignore. 
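		// Drop any pending retry entry for this name and return without re-sending it to ZGrab.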
212 | log.Println("Have seen " + bucket) 213 | if shouldLock { 214 | openRequestsMutex.Lock() 215 | } 216 | delete(openRequests, bucket) 217 | if shouldLock { 218 | openRequestsMutex.Unlock() 219 | } 220 | return 221 | } 222 | 223 | log.Println("Have not seen " + bucket) 224 | fmt.Println("Sending: " + bucket) 225 | 226 | go writeWithTimeout(v.stdIn, []byte(bucket+"\n")) 227 | 228 | if shouldLock { 229 | openRequestsMutex.Lock() 230 | } 231 | openRequests[bucket] = OpenRequest{host, path, count, time.Now()} 232 | if shouldLock { 233 | openRequestsMutex.Unlock() 234 | } 235 | } 236 | 237 | func writeWithTimeout(stdIn io.WriteCloser, text []byte) { 238 | c := make(chan string, 1) 239 | go func() { 240 | stdIn.Write(text) 241 | c <- "done" 242 | }() 243 | select { 244 | case <-c: 245 | case <-time.After(500 * time.Millisecond): 246 | } 247 | } 248 | 249 | func receiveResponses() { 250 | for _, v := range validators { 251 | receiveResponse(v) 252 | } 253 | } 254 | 255 | func receiveResponse(v *Validator) { 256 | go func(v *Validator, c chan string) { 257 | scanner := bufio.NewScanner(v.stdOut) 258 | if scanner != nil { 259 | for scanner.Scan() { 260 | v.lastResponseMutex.Lock() 261 | v.lastResponseReceived = time.Now() 262 | v.lastResponseMutex.Unlock() 263 | 264 | text := scanner.Text() 265 | c <- text 266 | } 267 | } 268 | }(v, responseChan) 269 | } 270 | 271 | func writeResponses(files map[string]OutputFile, prevSeen *bloom.BloomFilter) { 272 | for result := range responseChan { 273 | var responseBody interface{} 274 | err := json.Unmarshal([]byte(result), &responseBody) 275 | if err != nil { 276 | fmt.Fprintln(os.Stderr, err) 277 | } else { 278 | responseJSON := responseBody.(map[string]interface{}) 279 | domain := responseJSON["domain"].(string) 280 | data := responseJSON["data"].(map[string]interface{}) 281 | if data["http"] == nil { 282 | continue 283 | } 284 | http := data["http"].(map[string]interface{}) 285 | if http["result"] == nil { 286 | continue 287 | } 288 | resultJSON := http["result"].(map[string]interface{}) 289 | if resultJSON["response"] == nil { 290 | continue 291 | } 292 | response := resultJSON["response"].(map[string]interface{}) 293 | statusCode := int(response["status_code"].(float64)) 294 | fmt.Printf("%d %s\n", statusCode, domain) 295 | 296 | // Add bucket to our previously seen set. 297 | previouslySeenMutex.Lock() 298 | prevSeen.Add([]byte(domain)) 299 | previouslySeenMutex.Unlock() 300 | 301 | // Alibaba: Check if response is redirecting to a different bucket 302 | if strings.Contains(domain, "oss-us-east-1.aliyuncs.com") && response["body"] != nil { 303 | body := response["body"].(string) 304 | if statusCode == 403 && strings.Contains(body, "must be addressed") && strings.Contains(body, "") { 305 | newHost := strings.Split(strings.Split(body, "")[0], "")[1] 306 | bucket := strings.Split(domain, ".oss-us-east-1.aliyuncs.com")[0] + "." 
+ newHost 307 | openRequestsMutex.Lock() 308 | origRequest := openRequests[domain] 309 | delete(openRequests, domain) 310 | // Add to pending queue with 0 time to force trying on new host 311 | openRequests[bucket] = OpenRequest{origRequest.provider, origRequest.source, origRequest.numAttempts + 1, time.Time{}} 312 | openRequestsMutex.Unlock() 313 | continue 314 | } 315 | } 316 | 317 | toLog := fmt.Sprintf("%s,%d\n", domain, time.Now().Unix()) 318 | 319 | openRequestsMutex.Lock() 320 | request := openRequests[domain] 321 | getFile(request.source, files).f.WriteString(toLog) 322 | delete(openRequests, domain) 323 | openRequestsMutex.Unlock() 324 | 325 | for _, host := range acceptedHosts { 326 | if strings.Contains(domain, host) { 327 | getFile(strconv.Itoa(statusCode)+host, files).f.WriteString(toLog) 328 | break 329 | } 330 | } 331 | } 332 | } 333 | } 334 | 335 | func closeAllValidators() { 336 | for _, v := range validators { 337 | closeValidator(v) 338 | } 339 | } 340 | 341 | func closeValidator(v *Validator) { 342 | v.stdIn.Close() 343 | v.cmd.Process.Kill() 344 | v.cmd.Wait() 345 | } 346 | 347 | func getFile(path string, files map[string]OutputFile) OutputFile { 348 | if !strings.Contains(path, "..") { 349 | if val, ok := files[path]; ok { 350 | fmt.Println("VAL: " + path) 351 | 352 | return val 353 | } 354 | 355 | f, err := os.OpenFile( 356 | "./data/"+path+".txt", 357 | os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 358 | if err == nil { 359 | files[path] = OutputFile{f} 360 | 361 | return files[path] 362 | } 363 | fmt.Fprintln(os.Stderr, err) 364 | 365 | } 366 | return OutputFile{nil} 367 | } 368 | 369 | func closeAllValidatorsOnSignal(files map[string]OutputFile, prevSeen *bloom.BloomFilter) { 370 | // Intercept sigint 371 | sig := make(chan os.Signal, 2) 372 | signal.Notify(sig, os.Interrupt, syscall.SIGTERM) 373 | 374 | go func() { 375 | <-sig 376 | fmt.Println("Stopping. 
Closing all validators.") 377 | closeAllValidators() 378 | for _, v := range files { 379 | v.f.Close() 380 | } 381 | 382 | // Also write to the bloom filter file 383 | previouslySeenMutex.Lock() 384 | f, err := os.OpenFile("./bucket_validation/bloom/candidate_set.bloom", os.O_CREATE|os.O_RDWR, 0644) 385 | if err != nil { 386 | panic(err) 387 | } 388 | defer f.Close() 389 | w := bufio.NewWriter(f) 390 | prevSeen.WriteTo(w) 391 | previouslySeenMutex.Unlock() 392 | os.Exit(0) 393 | }() 394 | 395 | } 396 | 397 | func openFiles() map[string]OutputFile { 398 | types := map[string]string{ 399 | "200": "public", 400 | "400": "invalid_bucket", 401 | "403": "private", 402 | "404": "no_such_bucket", 403 | "500": "error", 404 | } 405 | files := make(map[string]OutputFile) 406 | for k, v := range types { 407 | for _, host := range acceptedHosts { 408 | f, err := os.OpenFile( 409 | "./data/validation/"+host+"/"+v+".txt", 410 | os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 411 | if err == nil { 412 | files[k+host] = OutputFile{f} 413 | } else { 414 | fmt.Fprintln(os.Stderr, err) 415 | } 416 | } 417 | } 418 | return files 419 | } 420 | 421 | func parseConfig() { 422 | viper.SetConfigName("bucket_validation/listener-config") 423 | viper.AddConfigPath(".") 424 | err := viper.ReadInConfig() 425 | handleErrorFatal(err) 426 | err = viper.Unmarshal(&config) 427 | handleErrorFatal(err) 428 | } 429 | 430 | // Clears the job queue by deleting all items 431 | func clearQueue(beanstalkHost string) { 432 | jobQueue, err := beanstalk.Dial("tcp", beanstalkHost) 433 | handleError(err) 434 | for { 435 | id, _, err := jobQueue.Reserve(5 * time.Second) 436 | if err != nil { 437 | if !strings.Contains(err.Error(), "timeout") { // Don't print if it's a timeout 438 | log.Println("Error reserving job: " + err.Error()) 439 | } 440 | continue 441 | } 442 | err = jobQueue.Delete(id) 443 | handleError(err) 444 | } 445 | } 446 | 447 | func initiateValidators() { 448 | validators = make([]*Validator, 0) 449 | if len(config.SourceIPs) == 0 { 450 | validators = append(validators, initializeZGrab(config.NumSenders, config.ReadLimitPerHost, "")) 451 | } else { 452 | for _, ip := range config.SourceIPs { 453 | validators = append(validators, initializeZGrab(config.NumSenders, config.ReadLimitPerHost, ip)) 454 | } 455 | } 456 | } 457 | 458 | func initiateValidator(i int) { 459 | newValidator := initializeZGrab(config.NumSenders, config.ReadLimitPerHost, validators[i].ip) 460 | validators[i].stdIn = newValidator.stdIn 461 | validators[i].stdOut = newValidator.stdOut 462 | validators[i].cmd = newValidator.cmd 463 | } 464 | 465 | func loadBloomFilter() *bloom.BloomFilter { 466 | filter := bloom.NewWithEstimates(300000000, .000001) 467 | // TODO: Add support for Bloom Filter 468 | // f, err := os.OpenFile("./bucket_validation/bloom/candidate_set.bloom", os.O_CREATE|os.O_RDWR, 0644) 469 | // if err != nil { 470 | // panic(err) 471 | // } 472 | // defer f.Close() 473 | // r := bufio.NewReader(f) 474 | // filter.ReadFrom(r) 475 | return filter 476 | } 477 | 478 | func main() { 479 | 480 | responseChan = make(chan string) 481 | // The hosts that we automatically try against. 
For Alibaba, we initially try 482 | // against one region and get the new region from the response if the bucket exists 483 | hosts = []string{"s3.amazonaws.com", "storage.googleapis.com", "oss-us-east-1.aliyuncs.com"} 484 | acceptedHosts = []string{"s3.amazonaws.com", "storage.googleapis.com", "aliyuncs.com"} 485 | 486 | // clearQueue(config.BeanstalkHost) 487 | 488 | filter := loadBloomFilter() 489 | 490 | parseConfig() 491 | initiateValidators() 492 | 493 | outputFiles := openFiles() 494 | closeAllValidatorsOnSignal(outputFiles, filter) 495 | go delegateRequestJobs(outputFiles, config.BeanstalkHost, filter) 496 | receiveResponses() 497 | writeResponses(outputFiles, filter) 498 | 499 | } 500 | --------------------------------------------------------------------------------
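A note on the Alibaba redirect handling in bucket_validation/listener.go: OSS answers a mis-addressed bucket with a 403 whose body names the correct region inside an <Endpoint>...</Endpoint> element, and writeResponses re-queues the bucket against that host. The tag literals appear to have been dropped from the strings.Contains/strings.Split calls in the listing above, so the sketch below shows the assumed extraction as a self-contained Go program; the tag names are taken from the standard OSS error format, not from the original source.

package main

import (
	"fmt"
	"strings"
)

// extractAliyunEndpoint pulls the region endpoint out of an Alibaba OSS
// "must be addressed using the specified endpoint" error body.
// The "<Endpoint>" / "</Endpoint>" tag names are assumed from the standard
// OSS error XML; listener.go presumably splits on the same literals.
func extractAliyunEndpoint(body string) (string, bool) {
	if !strings.Contains(body, "<Endpoint>") || !strings.Contains(body, "</Endpoint>") {
		return "", false
	}
	prefix := strings.SplitN(body, "</Endpoint>", 2)[0]
	parts := strings.SplitN(prefix, "<Endpoint>", 2)
	if len(parts) < 2 {
		return "", false
	}
	return parts[1], true
}

func main() {
	body := `<Error><Code>AccessDenied</Code><Message>The bucket you are attempting to access must be addressed using the specified endpoint.</Message><Endpoint>oss-cn-hangzhou.aliyuncs.com</Endpoint></Error>`
	if endpoint, ok := extractAliyunEndpoint(body); ok {
		// Retry the bucket as <bucket>.<endpoint>, mirroring the openRequests re-queue in writeResponses.
		fmt.Println(endpoint)
	}
}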