├── README.md
└── cc.py


/README.md:
--------------------------------------------------------------------------------
1 | # commoncrawl
2 | Gathers URLs for a given domain from the Common Crawl index. Usage: `python cc.py -d example.com`
3 | 


--------------------------------------------------------------------------------
/cc.py:
--------------------------------------------------------------------------------
import argparse
import json
import queue
import sys
import threading

import requests
 6 | 
 7 | class commonCrawlDataClass():
 8 | 
 9 |     def __init__(self,domain):
10 |         self.jsonIndexData = ""
11 |         self.domain = domain
12 |         self.domains = []
13 |         self.q = queue.Queue()
14 | 
15 |     def getIndexes(self):
16 |         indexURL = "https://index.commoncrawl.org/collinfo.json"
17 |         r = requests.get(indexURL)
18 |         jsonIndexData = json.loads(r.text)
19 |         for index in jsonIndexData:
20 |             self.q.put(index['id'])
21 | 
22 |     def getIndexData(self,indexID):
23 |         try:
24 | 	    commonCrawlURL = "http://index.commoncrawl.org/"+indexID+"-index?url="+self.domain+"/*&output=json"
25 |             r = requests.get(commonCrawlURL)
26 |             data = r.text.split("\n")[:-1]
27 |             for entry in data:
28 |                 url = json.loads(entry)['url']
29 |                 if url not in self.domains:
30 |                     self.domains.append(url)
31 |                     print(url)
32 |         except:
33 |             pass
34 |     def worker(self):
35 |         while 1:
36 |             indexID = self.q.get()
37 |             self.getIndexData(indexID)
38 |             self.q.task_done()
39 | 
40 |     def start(self):
41 |         self.getIndexes()
42 |         for i in range(0,10):
43 |             t = threading.Thread(target=self.worker)
44 |             t.daemon = True
45 |             t.start()
46 |         self.q.join()
47 | 
48 | 
def main(argv=None):
    """Command-line entry point: crawl the domain given with -d/--domain.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    When no domain is supplied the usage text is printed and nothing is
    crawled.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d","--domain", help="Domain Name; EX: test.com")
    args = parser.parse_args(argv)

    if not args.domain:
        # Without a domain there is nothing to do; show how to call the tool
        # instead of exiting silently.
        parser.print_help()
        return

    cc = commonCrawlDataClass(args.domain)
    cc.start()


# Guarded so that importing this module (e.g. to reuse the class) neither
# parses the importer's command line nor starts a crawl.
if __name__ == "__main__":
    main()
57 | 


--------------------------------------------------------------------------------