"""commoncrawl -- gathers URLs from Common Crawl.

Queries every collection listed by the Common Crawl index server for
captures under a domain and prints each distinct URL once.

Usage::

    python cc.py -d test.com
"""
import argparse
import json
import logging
import queue
import threading

logger = logging.getLogger(__name__)

# Base address of the index server; both the collection list and the
# per-collection queries are fetched from here over HTTPS.
INDEX_HOST = "https://index.commoncrawl.org"


class commonCrawlDataClass():
    """Collect the distinct URLs recorded for a domain across all indexes.

    Attributes:
        domain: Domain whose captures are looked up (queried as ``domain/*``).
        domains: Distinct URLs found so far, in discovery order.
        q: Queue of index ids still to be processed by the workers.
        timeout: Per-request timeout in seconds.
        jsonIndexData: Unused; kept only so existing attribute access works.
    """

    def __init__(self, domain, timeout=30):
        """Store the target domain and set up the work queue.

        Args:
            domain: Domain name to look up, e.g. ``"test.com"``.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.jsonIndexData = ""
        self.domain = domain
        self.domains = []
        self.q = queue.Queue()
        self.timeout = timeout
        # Workers run concurrently: the set gives O(1) duplicate checks and
        # the lock makes "check, record, print" a single atomic step.
        self._seen = set()
        self._lock = threading.Lock()

    def getIndexes(self):
        """Fetch the list of collections and enqueue every index id.

        Raises:
            requests.RequestException: If the request fails or returns an
                HTTP error status.
            ValueError: If the response body is not valid JSON.
        """
        # Imported here so the module can be imported (and its parsing logic
        # used) without the third-party dependency being installed.
        import requests

        response = requests.get(INDEX_HOST + "/collinfo.json",
                                timeout=self.timeout)
        response.raise_for_status()
        for index in json.loads(response.text):
            self.q.put(index['id'])

    def getIndexData(self, indexID):
        """Query one index for the domain and record the URLs it returns.

        This is best-effort: a failed request is logged at debug level and
        skipped so one bad index does not stop the others.

        Args:
            indexID: Identifier of the index to query (an ``id`` value from
                the collection list).
        """
        import requests

        # Let requests build the query string so the domain is URL-encoded.
        params = {"url": self.domain + "/*", "output": "json"}
        try:
            response = requests.get(INDEX_HOST + "/" + indexID + "-index",
                                    params=params, timeout=self.timeout)
        except requests.RequestException as exc:
            logger.debug("Request for index %s failed: %s", indexID, exc)
            return
        self._record_urls(response.text)

    def _record_urls(self, text):
        """Parse newline-delimited JSON and record each unseen ``url``."""
        for line in text.split("\n"):
            line = line.strip()
            if not line:
                continue
            try:
                url = json.loads(line)['url']
            except (ValueError, KeyError, TypeError):
                # Not a capture record (invalid JSON, or an object without
                # a 'url' key); skip this line but keep reading the rest.
                continue
            if not isinstance(url, str):
                continue
            with self._lock:
                if url in self._seen:
                    continue
                self._seen.add(url)
                self.domains.append(url)
                # Printed under the lock so lines from different workers
                # cannot interleave.
                print(url)

    def worker(self):
        """Process index ids from the queue forever (run as a daemon thread)."""
        while True:
            indexID = self.q.get()
            try:
                self.getIndexData(indexID)
            except Exception:
                # Thread boundary: log and keep the worker alive so the
                # remaining queue items are still processed.
                logger.exception("Unexpected error while processing index %s",
                                 indexID)
            finally:
                # Always acknowledge the item, otherwise q.join() in start()
                # would block forever.
                self.q.task_done()

    def start(self, threads=10):
        """Load the index list, start the workers and wait until done.

        Args:
            threads: Number of worker threads to start (at least 1).

        Raises:
            ValueError: If ``threads`` is less than 1.
        """
        if threads < 1:
            raise ValueError("threads must be at least 1")
        self.getIndexes()
        for _ in range(threads):
            t = threading.Thread(target=self.worker)
            t.daemon = True
            t.start()
        self.q.join()


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Gathers urls from common crawl")
    parser.add_argument("-d", "--domain", help="Domain Name; EX: test.com")
    args = parser.parse_args(argv)

    if not args.domain:
        # Without a domain there is nothing to do; show usage rather than
        # exiting silently.
        parser.print_help()
        return

    cc = commonCrawlDataClass(args.domain)
    cc.start()


if __name__ == "__main__":
    main()