├── .gitignore
├── README.md
├── __init__.py
├── actions.py
├── geoloc_by_domain.py
├── geolocatisation
│   ├── GeoLiteCity.dat
│   ├── __init__.py
│   ├── dschield.py
│   ├── geolocalisation.py
│   └── result.txt
├── harvesting
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── bingsearch.js
│   ├── content.py
│   ├── content_search.py
│   ├── crawler.py
│   ├── dynamic.js
│   ├── filters.py
│   ├── googlesearch.js
│   ├── keywords
│   ├── metaextract.js
│   ├── pastebin.js
│   ├── pastebin.py
│   ├── pastebinExtract.py
│   ├── pastebintest.py
│   ├── pastebintext.js
│   ├── pholcidae.py
│   ├── random_user_agent.py
│   ├── search.py
│   ├── user_agents
│   ├── white_list.py
│   └── yahoosearch.js
├── history
│   ├── __init__.py
│   └── history.py
├── main.py
├── mongodb
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── mongodb.py
│   └── mongodb.pyc
├── network
│   ├── IPy.py
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── make_networks.py
│   ├── networks.py
│   ├── networks.pyc
│   └── search_on_network.py
├── processing
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── bulk.py
│   ├── categoryze_result.py
│   ├── clean_db.py
│   ├── compare.py
│   ├── create_request.py
│   ├── create_result.py
│   ├── createcorpus.py
│   ├── dnstree.py
│   ├── filters.py
│   ├── filters.pyc
│   ├── gouv.log
│   ├── gouv_domaine.txt
│   ├── gouv_metadatas.txt
│   ├── metadataextract.py
│   └── metadataextract.pyc
├── scanners
│   ├── __init__.py
│   └── networks.py
├── screenshots
│   ├── __init__.py
│   ├── make_screenshots.py
│   ├── screenshots.js
│   └── screenshots.py
└── storage
    ├── __init__.py
    └── redis_record.py
/.gitignore:
--------------------------------------------------------------------------------
1 | #compiled file
2 | *.pyc
3 | #log
4 | *.log
5 | #screen
6 | *.png
7 | #data
8 | *.csv
9 | .project
10 | .settings/org.eclipse.ltk.core.refactoring.prefs
11 | .pydevproject
12 | *.txt
13 | *.tar.gz
14 | *.zip
15 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/README.md
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/__init__.py
--------------------------------------------------------------------------------
/actions.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Feb 1, 2013
3 |
4 | @author: slarinier
5 | '''
6 |
7 | from libnmap.parser import NmapParser
8 | from libnmap.process import NmapProcess
9 | import pymongo
10 | from pymongo import MongoClient
11 | import threading
12 |
13 | from harvesting import search
14 | from harvesting.crawler import Record, CrawlerThread
15 | import mongodb
16 | from network import make_networks, networks
17 | from network.IPy import IP
18 | from processing import metadataextract
19 | from processing.clean_db import Cleandb
20 | from processing.create_result import Create_Result
21 | from processing.dnstree import DNSTree
22 | from screenshots.screenshots import Screenshots
23 | from scanners.networks import Networks
24 |
25 | class Actions(object):
26 | '''
27 | classdocs
28 | '''
29 | def __init__(self, db_value):
30 | self.db_value = db_value
31 | connection = MongoClient(host='localhost', port=27017)
32 | self.db = connection[db_value]
33 |
34 | def create_network(self):
35 | network=make_networks.make_networks('localhost', self.db_value)
36 | network.createNetworks('new_domaines')
37 | network.exportFile(self.db_value+'_network.log')
38 |
39 | def create_result(self,collection,criteria):
40 | createResult=Create_Result(self.db_value,criteria)
41 | if collection=='scanners':
42 | createResult.processScanners(collection)
43 | return
44 | createResult.process(collection)
45 |
46 | def metasearch(self,criteria,scriptsJS,geoloc):
47 | print "########### Meta Search ###########"
48 | main_thread = threading.currentThread()
49 | thread_pool=[]
50 | for criterius in criteria:
51 | for script in scriptsJS:
52 | gs=search.search(100,criterius,script,self.db_value)
53 | gs.start()
54 | thread_pool.append(gs)
55 | for t in thread_pool:
56 | t.join()
57 | for t in thread_pool:
58 | t.record()
59 | print "########### Search terminated ###########"
60 |
61 | print "########### Resolve IP ############"
62 | networks.resolve(geoloc,self.db_value)
63 |
64 | def search_ip(self,geoloc,scriptsJS,ip_range):
65 | main_thread = threading.currentThread()
66 | print "########### Search by IP ###########"
67 | ips=[]
68 | domaines=self.db.new_domaines.find()
69 | thread_pool=[]
70 | cache={}
71 | for domaine in domaines:
72 | try:
73 | ips.append(domaine['ip'])
74 |
75 | except KeyError:
76 | print domaine
77 | i=0
78 |         print 'IPs are: ' + str(ips)
79 | ip_to_add=[]
80 | if ip_range:
81 | ip_to_add=[str(x) for x in IP(ip_range)]
82 | ips[len(ips):]=ip_to_add
83 | for ip in set(ips):
84 | if ip != '0.0.0.0':
85 | i+=1
86 | gs=search.search(20,'ip:'+str(ip),scriptsJS[1],self.db_value)
87 | gs.start()
88 | thread_pool.append(gs)
89 | if i % 10 ==0:
90 | for t in thread_pool:
91 | t.join()
92 | for t in thread_pool:
93 | t.record()
94 | print "########### Search terminated ###########"
95 | print "########### Search by network ###########"
96 |
97 | print "########### Resolve IP ############"
98 | networks.resolve(geoloc,self.db_value)
99 |
100 | def scan_network(self):
101 | pass
102 | def scan_nmap(self,ip_range,options):
103 | ips=[]
104 | domaines=self.db.new_domaines.find()
105 | thread_pool=[]
106 | cache={}
107 | for domaine in domaines:
108 | try:
109 | ips.append(domaine['ip'])
110 | cache[domaine['ip']]=domaine
111 | except KeyError:
112 | print domaine
113 | net=Networks(list(set(ips)),options)
114 | net.run()
115 | report=net.make_report()
116 | #net.record_report(report,cache,self.db.new_domaines)
117 | pass
118 | def screenshots(self,db_value,threadpool):
119 | connection= MongoClient(host='localhost', port=27017)
120 | db=connection[db_value]
121 | domaines=db.new_domaines.distinct('domaine')
122 | i=0
123 | main_thread = threading.currentThread()
124 | threadpools=[]
125 |         print "Taking " + str(len(domaines)) + " screenshots"
126 | for domaine in domaines:
127 | i+=1
128 | screen=Screenshots(domaines, 'screenshots/screenshots.js', 'screenshots/screenshots/'+db_value, domaine)
129 | screen.start()
130 | threadpools.append(screen)
131 | if i % int(threadpool)== 0:
132 | for t in threadpools:
133 | t.join()
134 |
135 |     def metadata_extract(self, db):
136 | main_thread = threading.currentThread()
137 | print "########## Meta Data IP ##########"
138 | mdb=mongodb.mongodb('localhost',27017,db)
139 | i=0
140 |
141 | for domaine in mdb.selectall('new_domaines'):
142 | i+=1
143 | url=domaine['url']
144 | domaine_value=domaine['domaine']
145 | print url
146 | if not 'meta' in domaine:
147 | domaine['meta']='ok'
148 | mtd=metadataextract.metadataextract('harvesting/metaextract.js',db,domaine_value,url)
149 | mtd.start()
150 | if i % 30==0:
151 | for t in threading.enumerate():
152 | if t is not main_thread:
153 | t.join(2)
154 |
155 | def dnstree(self,db_value):
156 | dnst=DNSTree(db_value)
157 | dnst.process()
158 |
159 | def crawl(self,list_domains):
160 | main_thread = threading.currentThread()
161 | #domaines=self.db.new_domaines.distinct('domaine')
162 | domains=list_domains.split(',')
163 | threadpool=[]
164 | lock=threading.Lock()
165 | rec=Record(self.db_value,lock)
166 | rec.start()
167 | i=0
168 | for domain in domains:
169 | i=i+1
170 | cw=CrawlerThread(domain,self.db,lock)
171 |             cw.start()
172 |             threadpool.append(cw)
173 | if i % 5==0:
174 | for t in threading.enumerate():
175 | if t is not main_thread:
176 | t.join(2)
177 | stop=True
178 |
179 | while(stop):
180 | for t in threadpool:
181 |                 if not t.isAlive():
182 | threadpool.remove(t)
183 | if len(threadpool)==0:
184 | stop=False
185 |
186 | def clean_db(self,pathfilters):
187 | print "#####Clean DB####"
188 | directory = "screenshots/screenshots/"+self.db_value
189 | filters=[]
190 | with open(pathfilters,'r') as fw:
191 | for ligne in fw:
192 | filters.append(ligne.strip())
193 | cl=Cleandb(self.db_value, directory, filters)
194 | cl.clean()
195 |
196 | def reset(self):
197 |
198 | for domaine in self.db.new_domaines.find():
199 | domaine['meta']=None
200 |             self.db.new_domaines.save(domaine)
201 |
202 | def init(self,db,coll,attrib):
203 |
204 | self.db.create_collection(coll)
205 | self.db[coll].ensure_index([(attrib,pymongo.ASCENDING)],unique=True)
206 |
--------------------------------------------------------------------------------
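
Usage sketch for actions.py: Actions wires the harvesting, network and processing packages together, and main.py only parses arguments before dispatching to it. A minimal sketch of driving it directly, assuming a local MongoDB on port 27017; the database name and criteria are placeholders, and geoloc is forwarded untouched to networks.resolve(), exactly as main.py does:

    # hypothetical driver mirroring main.py's 'metasearch' branch
    from actions import Actions

    scriptsJS = ['harvesting/googlesearch.js',
                 'harvesting/bingsearch.js',
                 'harvesting/yahoosearch.js']

    act = Actions('osint_demo')          # placeholder database name
    act.metasearch(['casperjs', 'phantomjs'], scriptsJS, '')
    act.dnstree('osint_demo')
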
/geoloc_by_domain.py:
--------------------------------------------------------------------------------
1 | from network import networks
2 | import argparse
3 | import sys
4 | from geolocatisation import dschield
5 |
6 | parser = argparse.ArgumentParser(description='Geolocalisation by domains')
7 | parser.add_argument('--domaine', dest='fqdn',help='make a fqdn for geolocalisation')
8 | parser.add_argument('--filename',dest='list_domaine')
9 | parser.add_argument('--geoloc_file',dest='geoloc_file')
10 | parser.add_argument('--resolve_dns',dest='resolve_dns')
11 | parser.add_argument('--geoloc_country',dest='geoloc_country')
12 | parser.add_argument('--outfile',dest='outfile')
13 |
14 | args=parser.parse_args()
15 | domaines=[]
16 | geoloc=[]
17 | geoloc_country=False
18 | geoloc_file=False
19 | if args.fqdn != None:
20 | domaines=[args.fqdn]
21 | if args.list_domaine != None:
22 |     print "Reading domain list"
23 | with open(args.list_domaine,'r') as fr:
24 | for ligne in fr:
25 | domaines.append(ligne.strip())
26 | if args.geoloc_file != None:
27 |     print "Loading geolocalisation data"
28 | geoloc_file=True
29 | if args.geoloc_file == None:
30 | parser.print_help()
31 | sys.exit(-1)
32 | print "geoloc"
33 |
34 | if args.geoloc_country:
35 | print "Geolocalisation country ok"
36 | geoloc_country=True
37 | domaines=list(set(domaines))
38 | print "Domaines list: "+str(len(domaines))
39 | for domaine in domaines:
40 | ip='0.0.0.0'
41 | ip=networks.resolve_dns(domaine)
42 | if ip != None:
43 | temp=ip+','+domaine
44 | if geoloc_file == True:
45 | geo=networks.geolocIP(args.geoloc_file,ip)
46 | country=networks.geolocCountry(args.geoloc_file,ip)
47 | if country:
48 | temp=temp+','+country
49 | if geo:
50 | temp=temp+','+geo
51 | if geoloc_country ==True:
52 | ds=dschield.dschield('http://dshield.org/ipinfo_ascii.html?ip=')
53 | ip,country,asname,network=ds.response(ip)
54 | temp=temp+','+country
55 | print temp
56 | geoloc.append(temp)
57 | else:
58 | geoloc.append('DNS Failure: '+domaine)
59 | if args.outfile != None:
60 | with open(args.outfile,'w') as fw:
61 | for ligne in geoloc:
62 | fw.write(ligne+'\n')
63 |
64 |
65 |
--------------------------------------------------------------------------------
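
Usage sketch for geoloc_by_domain.py: per domain, the script boils down to networks.resolve_dns() followed by networks.geolocIP()/geolocCountry() against the --geoloc_file database (the bundled geolocatisation/GeoLiteCity.dat is the natural candidate) and, optionally, a dshield lookup. A condensed sketch for one FQDN, with a placeholder domain:

    from network import networks

    domaine = 'example.org'                              # placeholder FQDN
    geoloc_file = 'geolocatisation/GeoLiteCity.dat'      # assumed GeoLite path
    ip = networks.resolve_dns(domaine)
    if ip is not None:
        country = networks.geolocCountry(geoloc_file, ip)
        geo = networks.geolocIP(geoloc_file, ip)
        print ip + ',' + domaine + ',' + str(country) + ',' + str(geo)
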
/geolocatisation/GeoLiteCity.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/geolocatisation/GeoLiteCity.dat
--------------------------------------------------------------------------------
/geolocatisation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/geolocatisation/__init__.py
--------------------------------------------------------------------------------
/geolocatisation/dschield.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | import re
3 | class dschield(object):
4 |
5 | def __init__(self,url):
6 | self.url=url
7 |
8 | def response(self,ip):
9 | dschieldContent=urllib2.urlopen(self.url+ip)
10 | value=dschieldContent.read()
11 |         pattern = r'country= (\w+)'
12 |
13 |         reg = re.compile(pattern)
14 |         m = reg.search(value)
15 |         country = ''
16 |         if m:
17 |             country = m.group(1)
18 |         pattern = r'asname= (.+)'
19 |         reg = re.compile(pattern)
20 |         m = reg.search(value)
21 |         asname = ''
22 |         if m:
23 |             asname = m.group(1)
24 |         pattern = r'network= (.+)'
25 |         reg = re.compile(pattern)
26 | m = reg.search(value)
27 | network=''
28 | if m:
29 | network=m.group(1)
30 | network=network.split(' ')[0]
31 | if country != '' and asname !='' and network !='':
32 | return (ip,country,asname,network)
33 | return ('127.0.0.1','mars','alien','nothing')
34 |
--------------------------------------------------------------------------------
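
Usage sketch for dschield.py, matching the call in geoloc_by_domain.py: response() queries dshield.org and returns an (ip, country, asname, network) tuple, falling back to the placeholder tuple when nothing can be parsed:

    from geolocatisation import dschield

    ds = dschield.dschield('http://dshield.org/ipinfo_ascii.html?ip=')
    ip, country, asname, network = ds.response('8.8.8.8')    # any public IP
    print country, asname, network
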
/geolocatisation/geolocalisation.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Feb 20, 2013
3 |
4 | @author: slarinier
5 | '''
6 | from pymongo.connection import Connection
7 |
8 | class Geolocalisation(object):
9 | '''
10 | classdocs
11 | '''
12 |
13 |
14 | def __init__(self,list_domaine,db_value):
15 | '''
16 | Constructor
17 | '''
18 |         self.list_domaine = list_domaine
19 |         self.db_value = db_value
20 |
21 |     def geolochoffline(self):
22 |         pass
23 |
24 |     def geolocOnline(self):
25 |         pass
26 |
27 |
--------------------------------------------------------------------------------
/geolocatisation/result.txt:
--------------------------------------------------------------------------------
1 | 31.184.244.9,onlinetracksz.net,24.0_54.0,RU
2 | 31.184.244.9,httpsites.org,24.0_54.0,RU
3 | 31.184.244.9,onlinegreencm.org,24.0_54.0,RU
4 | 31.184.244.9,onlinegiigii.com,24.0_54.0,RU
5 | 31.184.244.9,onlinefishmw3bid.net,24.0_54.0,RU
6 | 31.184.244.9,onlineliverss.org,24.0_54.0,RU
7 | 31.184.244.9,onlinemooviii.com,24.0_54.0,RU
8 | 31.184.244.9,onlinegiigii.net,24.0_54.0,RU
9 | 62.109.12.39,62.109.12.39,55.7522_37.6156,RU
10 | 31.184.244.9,httpsites.net,24.0_54.0,RU
11 | DNS Failure: sauth-yandex.ru
12 | 31.184.244.9,onlinepainrs.com,24.0_54.0,RU
13 | 31.184.244.9,onlinegreenguide.com,24.0_54.0,RU
14 | 31.184.244.9,onlinepainrs.net,24.0_54.0,RU
15 | 31.184.244.9,onlineliververs.net,24.0_54.0,RU
16 | 31.184.244.9,online-moo-viii.net,24.0_54.0,RU
17 | 31.184.244.9,onlinemaris.com,24.0_54.0,RU
18 | 31.184.244.9,onlinegreenguide.net,24.0_54.0,RU
19 | 31.184.244.9,httpblogs.com,24.0_54.0,RU
20 | 31.184.244.9,onlinecodmw3buy.net,24.0_54.0,RU
21 | 31.184.244.9,onlinemaris.net,24.0_54.0,RU
22 | 31.184.244.9,onlinemooviii.net,24.0_54.0,RU
23 | 173.45.252.44,oase2.net,38.6446_-90.2533,US
24 | 92.63.106.133,www.money-yanbex.ru,60.0_100.0,RU
25 | 31.184.244.9,31.184.244.9,24.0_54.0,RU
26 | 31.184.244.219,onlinemoneysstock.org,24.0_54.0,RU
27 | 31.184.244.219,onlinefundsgoods.org,24.0_54.0,RU
28 | 31.184.244.219,livemoneysgoods.org,24.0_54.0,RU
29 | 31.184.244.219,onlineincomegoods.org,24.0_54.0,RU
30 | DNS Failure: newdomeninfo.info
31 | 31.184.244.9,onlineliververs.com,24.0_54.0,RU
32 | 31.184.244.9,onlineliverss.com,24.0_54.0,RU
33 | 31.184.244.9,onlineliverss.net,24.0_54.0,RU
34 | DNS Failure: onlinecashsstt.org
35 | DNS Failure: internetmoneysstt.org
36 | 69.43.161.151,moneyinternetlovesff.info,-27.0_133.0,US
37 | 31.184.244.219,livewindowsxpf4.info,24.0_54.0,RU
38 | 31.184.244.219,onlinewinsphonessite.org,24.0_54.0,RU
39 | 141.8.224.162,webstockcwo.info,47.0_8.0,CH
40 | DNS Failure: internetwindowslive.info
41 | 31.184.244.219,theonlinewinsphones.org,24.0_54.0,RU
42 | 31.184.244.219,webwindowsproc.info,24.0_54.0,RU
43 | 31.184.244.219,internetwindowslows.com,24.0_54.0,RU
44 | DNS Failure: moneydigitallovesff.info
45 | 31.184.244.219,internet-wins-phones.org,24.0_54.0,RU
46 | 31.184.244.219,livewindowsproc.info,24.0_54.0,RU
47 | 31.184.244.219,onlinewindowsxpf4site.info,24.0_54.0,RU
48 | 31.184.244.219,webwindowslows.com,24.0_54.0,RU
49 | 31.184.244.219,webbuildingstore.info,24.0_54.0,RU
50 | DNS Failure: livemoneysstt.org
51 | DNS Failure: moneylivelovesff.info
52 | 69.43.161.161,stockonlinelovesff.info,-27.0_133.0,US
53 | 69.43.161.156,moneyweblovesff.info,-27.0_133.0,US
54 | 31.184.244.219,digitalwindowsproc.info,24.0_54.0,RU
55 | DNS Failure: cashonlinelovesff.info
56 | 31.184.244.219,onlinemoneyssuv.info,24.0_54.0,RU
57 | 31.184.244.219,onlinemicrosoftproc.info,24.0_54.0,RU
58 | 31.184.244.219,onlinewindowsxpf4s.info,24.0_54.0,RU
59 | 69.43.161.161,dollaronlinelovesff.info,-27.0_133.0,US
60 | 31.184.244.219,digitalwinsphones.org,24.0_54.0,RU
61 | 62.109.23.82,l2-pantheon.ru,59.8944_30.2642,RU
62 | 31.184.244.219,onlinefinanses2f.info,24.0_54.0,RU
63 | 141.8.224.162,internetstockcwo.info,47.0_8.0,CH
64 |
--------------------------------------------------------------------------------
/harvesting/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/harvesting/__init__.py
--------------------------------------------------------------------------------
/harvesting/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/harvesting/__init__.pyc
--------------------------------------------------------------------------------
/harvesting/bingsearch.js:
--------------------------------------------------------------------------------
1 | var links = [];
2 | var casper = require('casper').create();
3 | var padding=casper.cli.get(0)
4 | var criteria=casper.cli.get(1)
5 | function getLinks() {
6 |
7 | var links = document.querySelectorAll('h2 a')
8 | return Array.prototype.map.call(links, function(e) {
9 | return e.getAttribute('href')
10 | });
11 | }
12 |
13 |
14 | casper.start();
15 |
16 | casper.open('http://www.bing.com/search?q='+criteria+'&go=&qs=ds&filt=all&first='+padding+'&FORM=PERE')
17 | casper.then(function() {
18 |     // collect the result links from the Bing results page
19 |
20 |
21 |     links = this.evaluate(getLinks);
22 |
23 |
24 | });
25 |
26 |
27 |
28 | casper.run(function() {
29 | // echo results in some pretty fashion
30 | this.echo(links.length + ' links found:');
31 | this.echo(' - ' + links.join('\n - ')).exit();
32 | });
33 |
--------------------------------------------------------------------------------
/harvesting/content.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 1, 2012
3 |
4 | @author: slarinier
5 | '''
6 | import re
7 | from content_search import Content_search
8 |
9 | class Content(object):
10 | '''
11 | classdocs
12 | '''
13 | _instance = None
14 | def __new__(cls, *args, **kwargs):
15 | if not cls._instance:
16 | cls._instance = super(Content, cls).__new__(cls, *args, **kwargs)
17 | return cls._instance
18 |
19 | def __init__(self,filetoload='keywords'):
20 | '''
21 | Constructor
22 | '''
23 | self.filetoload=filetoload
24 | self.keywords=[]
25 | with open(self.filetoload,'r') as fr:
26 | for ligne in fr:
27 | self.keywords.append(ligne.strip())
28 |
29 | def analyse(self,ligne):
30 | if ligne.find('&') != -1:
31 | return 'keywords_and'
32 | else :
33 | return 'keyword_only'
34 |
35 | def search(self,keyword,data):
36 | action=self.analyse(keyword)
37 | cs = Content_search(action,data)
38 | find=getattr(cs, action)(keyword)
39 | return find
40 |
41 |
42 |
--------------------------------------------------------------------------------
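
Usage sketch for content.py: Content is a singleton keyed on a keywords file (one keyword per line, with 'a&b' meaning both terms must be present, as in harvesting/keywords), and search() dispatches to Content_search.keyword_only or keywords_and accordingly. Paths are assumed relative to the repository root:

    from harvesting.content import Content

    content = Content('harvesting/keywords')
    data = 'login: user password: secret'
    for keyword in content.keywords:
        if content.search(keyword, data.lower()):
            print 'matched: ' + keyword
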
/harvesting/content_search.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 2, 2012
3 |
4 | @author: slarinier
5 | '''
6 | import re
7 | class Content_search(object):
8 | '''
9 | classdocs
10 | '''
11 |
12 |
13 | def __init__(self,action,data):
14 | '''
15 | Constructor
16 | '''
17 | self.action=action
18 | self.data=data
19 |
20 | def keyword_only(self,keyword):
21 | tokens=re.findall(keyword, self.data)
22 | if len(tokens) > 0:
23 | return True
24 | return False
25 |
26 | def keywords_and(self,keywords):
27 | keywords=keywords.split('&')
28 |
29 | for keyword in keywords:
30 | if self.keyword_only(keyword) == False:
31 | return False
32 | return True
33 |
34 |
35 |
--------------------------------------------------------------------------------
/harvesting/crawler.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jan 7, 2013
3 |
4 | @author: slarinier
5 | '''
6 | from selenium import webdriver
7 | from pymongo import MongoClient
8 | from threading import Thread
9 | import redis
10 | import threading
11 | from pyfaup.faup import Faup
12 | import time
13 | from storage.redis_record import RedisRecord
14 | from filters import Filters
15 | from urllib2 import URLError
16 | from collections import deque
17 |
18 |
19 | class CrawlerThread(threading.Thread):
20 | def __init__(self, domain, db_value, lock):
21 | threading.Thread.__init__(self)
22 | self.domain = domain
23 | self.lock = lock
24 |
25 | def run(self):
26 | cw = Crawler(webdriver.Firefox(), self.lock, "http://" + self.domain)
27 | cw.init()
28 | cw.navigation()
29 |
30 |
31 | class Record(threading.Thread):
32 | def __init__(self, db_value, lock):
33 | self.r = RedisRecord()
34 |         self.connection = MongoClient(host='localhost', port=27017)
35 | self.db = self.connection[db_value]
36 |
37 | threading.Thread.__init__(self)
38 | self.lock = lock
39 |
40 | def run(self):
41 | i = 0
42 | while (True):
43 | i = i + 1
44 | if i % 1000 == 0:
45 | time.sleep(10)
46 | self.lock.acquire()
47 | self.r.switchDB(1)
48 | url = self.r.rpop('crawl')
49 | self.lock.release()
50 | # print url
51 | fex = Faup()
52 | if url:
53 | print "url found: " + url
54 | try:
55 | fex.decode(url)
56 | domain = fex.get_host()
57 | entry = self.db.new_domaines.find_one({'domaine': domain})
58 |                     if entry == None:
59 |                         print "record: " + domain
60 |                         self.db.new_domaines.save({'domaine': domain, 'urls': [url]})
61 |                     else:
62 |                         urls_stored = entry['urls']
63 |                         if not url in urls_stored:
64 |                             urls_stored.append(url)
65 |                             entry['urls'] = urls_stored
66 |                             self.db.new_domaines.save(entry)
67 | except:
68 | print "parsing fault " + url
69 |
70 |
71 | class Crawler(object):
72 | def __init__(self, driver, lock, first_url, db_int=1):
73 | self.driver = driver
74 | self.driver.implicitly_wait(10)
75 | self.driver.set_page_load_timeout(30)
76 | self.r = RedisRecord()
77 | self.lock = lock
78 | self.queue = deque([])
79 | self.queue.append(first_url)
80 | self.dbs = [1, 2]
81 |
82 | def init(self):
83 | self.r.init(self.dbs)
84 | url = self.queue.popleft()
85 | self.driver.get(url)
86 | self.parser(url)
87 |
88 | def parser(self, url):
89 | self.r.switchDB(1)
90 | if not self.r.get(url):
91 | self.driver.get(url)
92 | elem_links = self.driver.find_elements_by_tag_name('a')
93 | self.lock.acquire()
94 | self.sort([link.get_attribute("href") for link in elem_links], url)
95 | self.lock.release()
96 | self.r.switchDB(1)
97 | self.r.put(url, url)
98 |
99 | def navigation(self):
100 |
101 | while (len(self.queue) > 0):
102 | url = self.queue.popleft()
103 | try:
104 | # self.driver.refresh()
105 | self.r.switchDB(1)
106 | self.parser(url)
107 |
108 | except URLError as e:
109 | print url
110 | except IOError as e:
111 | self.r.switchDB(2)
112 | print "I/O error({0}): {1}".format(e.errno, e.strerror)
113 | # self.r.put(new_url,new_url)
114 | self.r.switchDB(1)
115 |             except Exception:
116 | continue
117 | try:
118 | self.driver.quit()
119 |             print "Finished crawling site " + url
120 |         except URLError as e:
121 |             self.driver = getattr(webdriver, 'Firefox')()
122 |             print 'boom'
123 | self.lock.acquire()
124 | self.r.switchDB(1)
125 | self.r.put(url, url)
126 | self.lock.release()
127 |
128 | def sort(self, elem_links, url):
129 | fex = Faup()
130 | f = Filters()
131 | f.load()
132 | self.r.switchDB(1)
133 |         try:
134 |             for link in elem_links:
135 |                 new_url = link
136 |                 extend = True
137 |                 domainfilter = True
138 |                 schemefilter = True
139 | self.r.switchDB(2)
140 | if not self.r.get(new_url) and new_url:
141 | self.r.switchDB(1)
142 | if not self.r.get(new_url):
143 | fex.decode(new_url)
144 | domain = fex.get_host()
145 | if f.isfilteredscheme(fex.get_scheme()):
146 | self.r.switchDB(2)
147 | self.r.put(new_url, new_url)
148 | schemefilter = False
149 | if f.isfiltereddomains(domain):
150 | self.r.switchDB(2)
151 | self.r.put(new_url, new_url)
152 | domainfilter = False
153 | if f.isfilteredextention(fex.get_resource_path()):
154 | extend = False
155 | self.r.switchDB(2)
156 | self.r.put(new_url, new_url)
157 |
158 | if extend and domainfilter and schemefilter:
159 | self.r.switchDB(1)
160 | self.r.rpush('crawl', new_url)
161 | self.queue.append(new_url)
162 | except TypeError as e:
163 | print "TypeError"
164 |
--------------------------------------------------------------------------------
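
Usage sketch for crawler.py, mirroring Actions.crawl: a single Record thread drains the Redis 'crawl' list into the new_domaines collection while CrawlerThread instances (one Selenium/Firefox driver each) feed it; Record loops forever, so the process runs until it is killed. The database name and domains below are placeholders:

    import threading
    from harvesting.crawler import Record, CrawlerThread

    db_value = 'osint_demo'
    lock = threading.Lock()
    rec = Record(db_value, lock)
    rec.start()
    for domain in ['example.org', 'example.net']:
        cw = CrawlerThread(domain, db_value, lock)
        cw.start()
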
/harvesting/dynamic.js:
--------------------------------------------------------------------------------
1 | var casper = require("casper").create({
2 | verbose: true
3 | });
4 | url = casper.cli.get(0)
5 | // The base links array
6 | var links = [
7 | url
8 | ];
9 |
10 | // If we don't set a limit, it could go on forever
11 | var upTo = ~~casper.cli.get(1) || 10;
12 |
13 | var currentLink = 0;
14 |
15 | // Get the links, and add them to the links array
16 | // (It could be done all in one step, but it is intentionally splitted)
17 | function addLinks(link) {
18 | this.then(function() {
19 | var found = this.evaluate(searchLinks);
20 | this.echo(found.length + " links found on " + link);
21 | links = links.concat(found);
22 | });
23 | }
24 |
25 | // Fetch all elements from the page and return
26 | // the ones which contains a href starting with 'http://'
27 | function searchLinks() {
28 | var filter, map;
29 | filter = Array.prototype.filter;
30 | map = Array.prototype.map;
31 | return map.call(filter.call(document.querySelectorAll("a"), function(a) {
32 | return (/^http:\/\/.*/i).test(a.getAttribute("href"));
33 | }), function(a) {
34 | return a.getAttribute("href");
35 | });
36 | }
37 |
38 | // Just opens the page and prints the title
39 | function start(link) {
40 | this.start(link, function() {
41 | this.echo('Page title: ' + this.getTitle());
42 | });
43 | }
44 |
45 | // As long as it has a next link, and is under the maximum limit, will keep running
46 | function check() {
47 | if (links[currentLink] && currentLink < upTo) {
48 | this.echo('--- Link ' + currentLink + ' ---');
49 | start.call(this, links[currentLink]);
50 | addLinks.call(this, links[currentLink]);
51 | currentLink++;
52 | this.run(check);
53 | } else {
54 | this.echo("All done.");
55 | this.exit();
56 | }
57 | }
58 |
59 | casper.start().then(function() {
60 | this.echo("Starting");
61 | });
62 |
63 | casper.run(check);
64 |
--------------------------------------------------------------------------------
/harvesting/filters.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Jun 12 17:40:53 2013
4 |
5 | @author: slarinier
6 | """
7 | class Filters(object):
8 | def __init__(self,pathextention='harvesting/filtered_extensions',pathscheme='harvesting/filtered_schemes',pathdomain='harvesting/filtered_domains'):
9 | self.pathdomain=pathdomain
10 | self.pathscheme=pathscheme
11 | self.pathextentions=pathextention
12 | self.domains=[]
13 | self.schemes=[]
14 | self.extentions=[]
15 | def load(self):
16 | with open(self.pathdomain,"r") as fr:
17 | self.domains=[line.strip() for line in fr]
18 | with open(self.pathscheme,"r") as fr:
19 | self.schemes=[line.strip() for line in fr]
20 | with open(self.pathextentions,"r") as fr:
21 | self.extentions=[line.strip() for line in fr]
22 | def isfilteredextention(self,path):
23 | try:
24 | for ext in self.extentions:
25 | if path.endswith(ext):
26 | return True
27 | return False
28 | except:
29 | print "extension error"
30 |
31 |     def isfilteredscheme(self, scheme):
32 |         return scheme in self.schemes
33 | def isfiltereddomains(self,domain):
34 | try:
35 | tokens=domain.split('.')[::-1]
36 | for d in self.domains:
37 | d_tokens=d.split('.')[::-1]
38 | d_reverse=d_tokens[0]+'.'+d_tokens[1]
39 | t_reverse=str(tokens[0]+'.'+tokens[1])
40 | if d_reverse == t_reverse:
41 | return True
42 | except IndexError as e:
43 | if domain.find('.') == -1:
44 | return True
45 | except AttributeError as e:
46 |             print "domain filter: attribute error"
47 | return False
--------------------------------------------------------------------------------
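
Usage sketch for filters.py, mirroring how Crawler.sort uses it to decide whether a URL should be skipped. Note that the default filter-list paths (harvesting/filtered_extensions, filtered_schemes, filtered_domains) are not present in the tree above and would need to exist, one entry per line:

    from pyfaup.faup import Faup
    from harvesting.filters import Filters

    f = Filters()
    f.load()
    fex = Faup()
    fex.decode('http://example.org/report.pdf')
    skip = (f.isfilteredscheme(fex.get_scheme())
            or f.isfiltereddomains(fex.get_host())
            or f.isfilteredextention(fex.get_resource_path()))
    print skip
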
/harvesting/googlesearch.js:
--------------------------------------------------------------------------------
1 | var links = [];
2 | var casper = require('casper').create();
3 | var padding=casper.cli.get(0);
4 | var criteria=casper.cli.get(1);
5 | var ua=casper.cli.get(2)
6 | function getLinks() {
7 |
8 | var links = document.querySelectorAll('h3.r a');
9 | return Array.prototype.map.call(links, function(e) {
10 | return e.getAttribute('href')
11 | });
12 | }
13 |
14 |
15 | casper.start();
16 | casper.userAgent(ua)
17 | casper.open('http://google.com/search?q='+criteria+'&start='+padding)
18 | casper.then(function() {
19 |     // collect the result links from the Google results page
20 |
21 |
22 |     links = this.evaluate(getLinks);
23 |
24 |
25 | });
26 |
27 |
28 |
29 | casper.run(function() {
30 | // echo results in some pretty fashion
31 | this.echo(links.length + ' links found:');
32 | this.echo(' - ' + links.join('\n - ')).exit();
33 | });
34 |
--------------------------------------------------------------------------------
/harvesting/keywords:
--------------------------------------------------------------------------------
1 | porn
2 | user&password
3 |
--------------------------------------------------------------------------------
/harvesting/metaextract.js:
--------------------------------------------------------------------------------
1 | var casper = require("casper").create()
2 | , url = casper.cli.get(0)
3 | , metas = [];
4 |
5 | if (!url) {
6 | casper.echo('Usage: casperjs [url]').exit();
7 | }
8 |
9 | casper.start(url, function() {
10 | metas = this.evaluate(function() {
11 | var metas = [];
12 | [].forEach.call(document.querySelectorAll('META'), function(elem) {
13 | var meta = {};
14 | [].slice.call(elem.attributes).forEach(function(attr) {
15 | meta[attr.name] = attr.value;
16 | });
17 | metas.push(meta);
18 | });
19 | return metas;
20 | });
21 | });
22 |
23 | casper.run(function() {
24 | require("utils").dump(metas);
25 | this.exit();
26 | });
27 |
--------------------------------------------------------------------------------
/harvesting/pastebin.js:
--------------------------------------------------------------------------------
1 | var links = [];
2 | var casper = require('casper').create();
3 | var url=casper.cli.get(0);
4 | var ua=casper.cli.get(1)
5 | function getLinks() {
6 |
7 | var links = document.querySelectorAll('tr a');
8 | return Array.prototype.map.call(links, function(e) {
9 | return e.getAttribute('href')
10 | });
11 | }
12 |
13 |
14 | casper.start();
15 | casper.userAgent(ua);
16 | casper.open(url);
17 | casper.then(function() {
18 |     // collect the paste links from the archive page
19 |     links = this.evaluate(getLinks);
20 |
21 |
22 | });
23 |
24 |
25 | casper.run(function() {
26 | // echo results in some pretty fashion
27 | this.echo(links.length + ' links found:');
28 | this.echo(' - ' + links.join('\n - ')).exit();
29 |
30 | });
31 |
--------------------------------------------------------------------------------
/harvesting/pastebin.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | from subprocess import Popen, PIPE
4 | import threading
5 | import pymongo
6 | from pymongo import Connection
7 | from pastebinExtract import pastebinExtract
8 | from random_user_agent import Random_user_agent
9 | import time
10 | class pastebin():
11 | def __init__(self, url,keyword,casperJSScript):
12 | self.url=url
13 | self.keyword=keyword
14 | self.casperJSScript=casperJSScript
15 | self.urls=[]
16 | rua=Random_user_agent()
17 | self.ua=rua.rand()
18 | self.time = rua.randsleep()
19 | self.result=[]
20 | def pastebinArchive(self):
21 | result=subprocess.Popen(['casperjs' ,self.casperJSScript,self.url,'\''+self.ua+'\''],stdout=PIPE)
22 | for ligne in result.stdout:
23 | if ligne.find('/')!=-1 and ligne.find('archive') == -1:
24 | id=ligne.replace(' - /','').strip()
25 | id=id.replace('\n','')
26 | self.urls.append('http://pastebin.com/raw.php?i='+id)
27 | print self.urls
28 |
29 | def pastebinAnalyse(self):
30 | i=0
31 | main_thread = threading.currentThread()
32 | thread_pool=[]
33 | for url in self.urls:
34 | pasteExtract=pastebinExtract(url)
35 | time.sleep(self.time)
36 | pasteExtract.start()
37 | thread_pool.append(pasteExtract)
38 | i+=1
39 | if i % 500 ==0:
40 | for t in threading.enumerate():
41 | if t is not main_thread:
42 | t.join()
43 |
44 | for t in thread_pool:
45 | result =getattr(t,'result')
46 | if result :
47 | self.result.append(result)
48 | return self.result
49 |
50 |
51 |
--------------------------------------------------------------------------------
/harvesting/pastebinExtract.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | from subprocess import Popen, PIPE
4 | import threading
5 | from content import Content
6 | from random_user_agent import Random_user_agent
7 |
8 | class pastebinExtract(threading.Thread):
9 | def __init__(self,url,casperJSScript='pastebintext.js'):
10 | threading.Thread.__init__(self)
11 | self.url=url
12 | self.casperJSScript=casperJSScript
13 | self.content=Content()
14 | self.data=[]
15 | rua=Random_user_agent()
16 | self.ua=rua.rand()
17 | self.result=None
18 |
19 | def run(self):
20 | result=subprocess.Popen(['casperjs' ,self.casperJSScript,self.url,'\''+self.ua+'\''],stdout=PIPE)
21 | for ligne in result.stdout:
22 | record=ligne.strip()
23 | self.data.append(record.lower())
24 |
25 | keywords=getattr(self.content,'keywords')
26 | for keyword in keywords:
27 | if self.content.search(keyword,str(self.data)):
28 | self.result={'url': self.url, 'data': self.data}
--------------------------------------------------------------------------------
/harvesting/pastebintest.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import pastebin
3 |
4 | paste=pastebin.pastebin('http://pastebin.com/archive',[],'pastebin.js')
5 | paste.pastebinArchive()
6 | setattr(paste,'casperJSScript','pastebintext.js')
7 | result=paste.pastebinAnalyse()
8 | print result
9 |
--------------------------------------------------------------------------------
/harvesting/pastebintext.js:
--------------------------------------------------------------------------------
1 |
2 | var casper = require('casper').create();
3 |
4 |
5 | var url=casper.cli.get(0);
6 | var ua =casper.cli.get(1)
7 |
8 | casper.start().then(function() {
9 | this.userAgent(ua);
10 | this.open(url, {
11 | method: 'get',
12 | headers: {
13 | 'Accept': 'application/text'
14 | }
15 | });
16 | });
17 |
18 | casper.run(function() {
19 | this.echo(this.debugPage());
20 | this.exit();
21 | });
22 |
23 |
--------------------------------------------------------------------------------
/harvesting/pholcidae.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 |
3 | import re
4 | import sys
5 |
6 | # importing modules corresponding to Python version
7 |
8 | import urlparse
9 | import urllib2
10 |
11 | class Pholcidae(object):
12 |
13 | """" Pholcidae is a small and fast web crawler. """
14 |
15 | def __init__(self):
16 |
17 | """
18 | @return void
19 |
20 | Creates Pholcidae instance and updates default settings dict.
21 | """
22 |
23 | # default local urllib2 opener
24 | self._opener = None
25 | # creating new sets of unparsed, already parsed and failed URLs
26 | self._unparsed_urls = set()
27 | self._parsed_urls = set()
28 | self._failed_urls = set()
29 | # extending settings with given values
30 | self._extend_settings()
31 | # compiling regular expressions
32 | self._compile_regexs()
33 | # autostart crawler if settings allows
34 | if self._settings.autostart:
35 | self.start()
36 |
37 | def crawl(self, response):
38 |
39 | """
40 | @type response AttrDict
41 | @return void
42 |
43 |         Dummy method which can be overridden by inheriting Pholcidae class.
44 | Use it to get html page and parse it as you want to.
45 | """
46 |
47 | pass
48 |
49 | def start(self):
50 |
51 | """
52 | @return void
53 |
54 | Simple crawler start trigger.
55 | """
56 |
57 | self._get_page()
58 |
59 | ############################################################################
60 | # PRIVATE METHODS #
61 | ############################################################################
62 |
63 | ############################ INIT METHODS ##################################
64 |
65 | def _extend_settings(self):
66 |
67 | """
68 | @return void
69 |
70 | Extends default settings with given settings.
71 | """
72 |
73 | # creating default settings object
74 | self._settings = AttrDict({
75 | # do we need to follow HTTP redirects?
76 | 'follow_redirects': True,
77 | # what page links do we need to parse?
78 | 'valid_links': ['(.*)'],
79 | # what URLs must be excluded
80 | 'exclude_links': [],
81 | # what is an entry point for crawler?
82 | 'start_page': '/',
83 | # which domain should we parse?
84 | 'domain': '',
85 |             # should we ignore pages outside of the given domain?
86 | 'stay_in_domain': True,
87 | # which protocol do we need to use?
88 | 'protocol': 'http://',
89 | # autostart crawler right after initialization?
90 | 'autostart': False,
91 | # cookies to be added to each request
92 | 'cookies': {},
93 | # custom headers to be added to each request
94 | 'headers': {}
95 | })
96 |
97 | # updating settings with given values
98 | self._settings.update(self.settings)
99 |
100 | # creating urllib2 opener
101 | self._create_opener()
102 | # compiling cookies
103 | self._compile_cookies()
104 | # compiling headers
105 | self._compile_headers()
106 |
107 | # adding start point into unparsed list
108 | start_url = '%s%s%s' % (self._settings.protocol, self._settings.domain,
109 | self._settings.start_page)
110 | self._unparsed_urls.add(start_url)
111 |
112 | def _compile_regexs(self):
113 |
114 | """
115 | @return void
116 |
117 | Compiles regular expressions for further use.
118 | """
119 |
120 | # setting default flags
121 | flags = re.I | re.S
122 | # compiling regexs
123 | self._regex = AttrDict({
124 |             # collects all links across given page (the href value is the second group)
125 |             'href_links': re.compile(r'<a\s([^>]*?)href\s*=\s*["\']([^"\'>]+)["\']',
126 |                                      flags=flags),
127 | # valid links regexs
128 | 'valid_link': [],
129 | # invalid links regexs
130 | 'invalid_link': []
131 | })
132 |
133 |         # compiling valid links regexs
134 | for regex in self._settings.valid_links:
135 | self._regex.valid_link.append(re.compile(regex, flags=flags))
136 |
137 | # compiling invalid links regexs
138 | for regex in self._settings.exclude_links:
139 | self._regex.invalid_link.append(re.compile(regex, flags=flags))
140 |
141 | def _compile_cookies(self):
142 |
143 | """
144 | @return void
145 |
146 | Compiles given dict of cookies to string.
147 | """
148 |
149 | compiled = []
150 | for name, value in self._settings.cookies.items():
151 | compiled.append('%s=%s' % (name, value))
152 | self._settings.cookies = ','.join(compiled)
153 | self._opener.addheaders.append(('Cookie', self._settings.cookies))
154 |
155 | def _compile_headers(self):
156 |
157 | """
158 | @return void
159 |
160 | Adds given dict of headers to urllib2 opener.
161 | """
162 |
163 | for header_name, header_value in self._settings.headers.items():
164 | self._opener.addheaders.append((header_name, header_value))
165 |
166 | def _create_opener(self):
167 |
168 | """
169 | @return void
170 |
171 | Creates local urllib2 opener and extends it with custom
172 | redirect handler if needed.
173 | """
174 |
175 | self._opener = urllib2.build_opener()
176 | if not self._settings.follow_redirects:
177 | self._opener = urllib2.build_opener(PholcidaeRedirectHandler,
178 | urllib2.HTTPCookieProcessor())
179 |
180 | ########################## CRAWLING METHODS ################################
181 |
182 | def _get_page(self):
183 |
184 | """
185 | @return bool
186 |
187 | Fetches page by URL.
188 | """
189 |
190 | # iterating over unparsed links
191 | while self._unparsed_urls:
192 | # getting link to get
193 | url = self._unparsed_urls.pop()
194 |
195 | # fetching page
196 | page = self._fetch_url(url)
197 | if page.status not in [500, 404, 502]:
198 | # parsing only valid urls
199 | valid_match = self._is_valid_link(page.url)
200 | if valid_match:
201 | # adding regex match to page object
202 | page.match = valid_match
203 | # sending raw HTML to crawl function
204 | self.crawl(page)
205 | # moving url from unparsed to parsed list
206 | self._parsed_urls.add(url)
207 | # collecting links from page
208 | self._get_page_links(page.body, page.url)
209 | else:
210 | # moving url from unparsed to failed list
211 | self._failed_urls.add(url)
212 |
213 | def _get_page_links(self, raw_html, url):
214 |
215 | """
216 | @type raw_html str
217 | @type url str
218 | @return void
219 |
220 | Parses out all links from crawled web page.
221 | """
222 |
223 | links_groups = self._regex.href_links.findall(str(raw_html))
224 | links = [group[1] for group in links_groups]
225 | for link in links:
226 | # is link not excluded?
227 | if not self._is_excluded(link):
228 | # getting link parts
229 | link_info = urlparse.urlparse(link)
230 | # if link not relative
231 | if link_info.scheme or link_info.netloc:
232 | # if stay_in_domain enabled and link outside of domain scope
233 | if self._settings.stay_in_domain:
234 | try:
235 | is_link = self._settings.domain not in link
236 | except UnicodeDecodeError:
237 | continue
238 | else:
239 | if is_link:
240 | continue
241 | else:
242 | # converting relative link into absolute
243 | link = urlparse.urljoin(url, link)
244 | # if link was not previously parsed
245 | if link not in self._parsed_urls:
246 | if link not in self._failed_urls:
247 | self._unparsed_urls.add(link)
248 |
249 | def _is_valid_link(self, link):
250 |
251 | """
252 | @type link str
253 | @return str
254 |
255 | Compares link with given regex to decide if we need to parse that
256 | page.
257 | """
258 |
259 |         # if hash in URL - assuming anchor or AJAX
260 | if link and '#' not in link:
261 | for regex in self._regex.valid_link:
262 | matches = regex.findall(link)
263 | if matches:
264 | return matches
265 | return ''
266 |
267 | def _is_excluded(self, link):
268 |
269 | """
270 | @type link str
271 | @return bool
272 |
273 |         Checks if link matches excluded regex.
274 | """
275 |
276 | for regex in self._regex.invalid_link:
277 | if regex.search(link):
278 | return True
279 | return False
280 |
281 | ######################### URL FETCHING METHODS #############################
282 |
283 | def _fetch_url(self, url):
284 |
285 | """
286 | @type url str
287 | @return AttrDict
288 |
289 | Fetches given URL and returns data from it.
290 | """
291 |
292 | # empty page container
293 | page = AttrDict()
294 |
295 | try:
296 | # getting response from given URL
297 | resp = self._opener.open(url)
298 | page = AttrDict({
299 | 'body': resp.read(),
300 | 'url': resp.geturl(),
301 | 'headers': AttrDict(dict(resp.headers.items())),
302 | 'cookies': self._parse_cookies(dict(resp.headers.items())),
303 | 'status': resp.getcode()
304 | })
305 | except:
306 | # drop invalid page with 500 HTTP error code
307 | page = AttrDict({'status': 500})
308 | self._failed_urls.add(url)
309 | return page
310 |
311 | def _parse_cookies(self, headers):
312 |
313 | """
314 | @type headers dict
315 | @return AttrDict
316 |
317 | Parses cookies from response headers.
318 | """
319 |
320 | cookies = AttrDict()
321 | # lowering headers keys
322 | headers_lower={}
323 |
324 | for k,v in headers.items():
325 | headers_lower[k.lower()]=v
326 | headers=headers_lower
327 | if 'set-cookie' in headers:
328 | # splitting raw cookies
329 | raw_cookies = headers['set-cookie'].split(';')
330 | # cookie parts to throw out
331 | throw_out = ['expires', 'path', 'domain', 'secure', 'HttpOnly']
332 | for cookie in raw_cookies:
333 | cookie = cookie.split('=')
334 | if cookie[0].strip() not in throw_out:
335 | cookies.update({cookie[0]: cookie[1]})
336 | return cookies
337 |
338 |
339 | class AttrDict(dict):
340 |
341 | """ A dict that allows for object-like property access syntax. """
342 |
343 | def __init__(self, new_dict=None):
344 | dict.__init__(self)
345 | if new_dict:
346 | self.update(new_dict)
347 |
348 | def __getattr__(self, name):
349 | try:
350 | return self[name]
351 | except KeyError:
352 | raise AttributeError(name)
353 |
354 | def __setattr__(self, key, value):
355 | self.update({key: value})
356 |
357 |
358 | class PholcidaeRedirectHandler(urllib2.HTTPRedirectHandler):
359 |
360 | """ Custom URL redirects handler. """
361 |
362 | def http_error_302(self, req, fp, code, msg, headers):
363 | return fp
364 |
365 | http_error_301 = http_error_303 = http_error_307 = http_error_302
366 |
--------------------------------------------------------------------------------
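
Usage sketch for pholcidae.py: the crawler expects a subclass that defines a settings dict (merged over the defaults listed in _extend_settings) and overrides crawl(), which receives an AttrDict with body, url, headers, cookies, status and match. The domain below is a placeholder:

    from harvesting.pholcidae import Pholcidae

    class MyCrawler(Pholcidae):
        settings = {
            'domain': 'example.org',
            'start_page': '/',
            'stay_in_domain': True,
        }

        def crawl(self, page):
            print page.status, page.url

    MyCrawler().start()
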
/harvesting/random_user_agent.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 2, 2012
3 |
4 | @author: slarinier
5 | '''
6 | import random
7 | class Random_user_agent(object):
8 | '''
9 | classdocs
10 | '''
11 | _instance = None
12 | def __init__(self,path_user_agent='harvesting/user_agents'):
13 | '''
14 | Constructor
15 | '''
16 | self.user_agent_list=[]
17 | self.path_user_agent=path_user_agent
18 | with open(self.path_user_agent,'r') as fr:
19 | for user_agent in fr:
20 | if user_agent.find('#') == -1:
21 |                     self.user_agent_list.append(user_agent.strip())
22 |
23 |
24 | def __new__(cls, *args, **kwargs):
25 | if not cls._instance:
26 | cls._instance = super(Random_user_agent, cls).__new__(
27 | cls, *args, **kwargs)
28 | return cls._instance
29 |
30 | def rand(self):
31 | return random.choice(self.user_agent_list)
32 | def randsleep(self):
33 | return random.randrange(1,3,2)
--------------------------------------------------------------------------------
/harvesting/search.py:
--------------------------------------------------------------------------------
1 | from pymongo import MongoClient
2 | from subprocess import PIPE
3 | from white_list import white_list
4 | import re
5 | import subprocess
6 | import threading
7 | from random_user_agent import Random_user_agent
8 |
9 | class search(threading.Thread):
10 | def __init__(self,limit,criteria,scriptjs,db,url_pattern='((https?|ftp|gopher|telnet|file|notes|ms-help):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&])*)'):
11 | threading.Thread.__init__(self)
12 | self.result=[]
13 | self.limit=limit
14 | self.criteria=criteria
15 | self.scriptjs=scriptjs
16 | self.connection= MongoClient(host='localhost', port=27017)
17 | self.db=self.connection[db]
18 | self.whitelist=white_list(db)
19 | self.regex_url=re.compile(url_pattern)
20 | rua=Random_user_agent()
21 | self.ua=rua.rand()
22 | self.urls_by_domaine={}
23 |
24 | def run(self):
25 | i=0
26 | while i < self.limit:
27 | result=subprocess.Popen(['casperjs' ,self.scriptjs,str(i),self.criteria,self.ua],stdout=PIPE)
28 | for ligne in result.stdout:
29 | if ligne.find('/')!=-1 and ligne.find('http://') != -1:
30 | url_information=self.regex_url.search(ligne)
31 | url=url_information.group(1)
32 | domaine=url.split('/')[2]
33 | tokens=domaine.split('.')
34 | racine=tokens[len(tokens)-2]+'.'+tokens[len(tokens)-1]
35 |
36 | print "domain found: "+ domaine
37 |
38 | if not racine in getattr(self.whitelist, 'white_domaine'):
39 | if domaine in self.urls_by_domaine:
40 | urls= self.urls_by_domaine[domaine]
41 | urls.append(url)
42 | self.urls_by_domaine[domaine]=urls
43 | else:
44 | self.urls_by_domaine[domaine]=[url]
45 |
46 | i=i+10
47 |
48 | def record(self):
49 | print "#######################record############################"
50 | domaines = iter(self.urls_by_domaine)
51 | for domaine in domaines:
52 | entry = self.db.new_domaines.find_one({'domaine':domaine})
53 | if entry == None:
54 | self.db.new_domaines.insert_one({'domaine':domaine,'urls':self.urls_by_domaine[domaine],'criteria':[self.criteria]})
55 | else:
56 |
57 |                 try:
58 |                     urls_stored = entry['urls']
59 |                     urls = self.urls_by_domaine[domaine]
60 |                     entry['urls'] = list(set(urls_stored + urls))
61 |                     criteria = entry['criteria']
62 |                     criteria.append(self.criteria)
63 |                     entry['criteria'] = list(set(criteria))
64 |                     self.db.new_domaines.save(entry)
65 |                 except KeyError:
66 |                     criteria = []
67 |                     try:
68 |                         criteria = entry['criteria']
69 |                         criteria.append(self.criteria)
70 |                         criteria = list(set(criteria))
71 |                     except KeyError:
72 |                         criteria.append(self.criteria)
73 |                     if 'urls' not in entry: entry['urls'] = self.urls_by_domaine[domaine]
74 |                     entry['criteria'] = criteria
75 |                     try:
76 |                         self.db.new_domaines.save(entry)
77 |                     except:
78 |                         pass
79 |
80 |
--------------------------------------------------------------------------------
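
Usage sketch for search.py, as in Actions.metasearch: one thread per (criterion, engine script) pair, started, joined, then record() flushes the harvested domains and URLs into the new_domaines collection. The database name and criterion are placeholders:

    from harvesting import search

    gs = search.search(100, 'casperjs', 'harvesting/googlesearch.js', 'osint_demo')
    gs.start()
    gs.join()
    gs.record()
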
/harvesting/white_list.py:
--------------------------------------------------------------------------------
1 | from mongodb import mongodb
2 | import os
3 | import glob
4 | class white_list():
5 |
6 | def __init__(self,db):
7 | self.mdb=mongodb.mongodb('localhost',27017,db)
8 | self.white_list=[]
9 | self.white_domaine=['msn.com','google.com','wikipedia.fr','free.fr','linkedin.com']
10 |
11 | def loadWhiteList(self):
12 | domaines=self.mdb.selectall('white_list')
13 | for domaine in domaines:
14 |             self.white_domaine.append(domaine['domaine'])
15 |
16 | def makeWhiteList(self,path):
17 | list_files=os.walk(path)
18 | for root,dirs,files in list_files:
19 | category=''
20 | for fl in files:
21 | if fl=='domains':
22 | with open(root+'/'+fl,'r') as fr:
23 | root=root.replace(path,'')
24 | if '/' in root:
25 | category=root.replace('/','_')
26 | else:
27 | category=root
28 | for ligne in fr:
29 | item={'domaine':ligne.strip(),'category':category}
30 | self.mdb.update(item,'white_list')
31 | def searchInWhiteList(self,domaine):
32 | result=self.mdb.selectbycreteria('domaine',domaine,'white_list')
33 | if result is not None:
34 | category=result[0]
35 | print category['category']
36 | return category
37 | #def compare_white_list()
38 |
--------------------------------------------------------------------------------
/harvesting/yahoosearch.js:
--------------------------------------------------------------------------------
1 | var links = [];
2 | var casper = require('casper').create();
3 | var padding=casper.cli.get(0)
4 | var criteria=casper.cli.get(1)
5 | var ua = casper.cli.get(2)
6 |
7 | function getLinks() {
8 |
9 | var links = document.querySelectorAll('h3 a');
10 | return Array.prototype.map.call(links, function(e) {
11 | return e.getAttribute('href')
12 | });
13 | }
14 |
15 |
16 | casper.start();
17 | casper.userAgent(ua)
18 | casper.open('http://fr.search.yahoo.com/search?p='+criteria+'&rd=r1&fr=yfp-t-731&fr2=sb-top&xargs=0&pstart=1&b='+padding)
19 | casper.then(function() {
20 |     // collect the result links from the Yahoo results page
21 |
22 |     links = this.evaluate(getLinks);
23 |
24 |
25 | });
26 |
27 |
28 |
29 | casper.run(function() {
30 | // echo results in some pretty fashion
31 | this.echo(links.length + ' links found:');
32 | this.echo(' - ' + links.join('\n - ')).exit();
33 | });
34 |
--------------------------------------------------------------------------------
/history/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/history/__init__.py
--------------------------------------------------------------------------------
/history/history.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jan 18, 2013
3 |
4 | @author: slarinier
5 | '''
6 |
7 | import datetime
8 | import logging
9 |
10 | class History(object):
11 | '''
12 | classdocs
13 | '''
14 |
15 |
16 | def __init__(self):
17 | '''
18 | Constructor
19 | '''
20 | d=datetime.datetime.now()
21 | date_value=d.strftime("%Y-%m-%d")
22 | self.logger=logging.getLogger('history')
23 | hdlr = logging.FileHandler('history/'+date_value+'.log')
24 | formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
25 | hdlr.setFormatter(formatter)
26 | self.logger.addHandler(hdlr)
27 | self.logger.setLevel(logging.INFO)
28 |
29 | def register(self,action):
30 | self.logger.info(action)
31 |
32 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | '''
3 | Created on Sep 25, 2012
4 |
5 | @author: slarinier
6 | '''
7 |
8 | from actions import Actions
9 | import argparse
10 | from history.history import History
11 | import sys
12 | import threading
13 |
14 | if __name__ == '__main__':
15 | scriptsJS = ['harvesting/googlesearch.js', 'harvesting/bingsearch.js', 'harvesting/yahoosearch.js']
16 | h = History()
17 | result = []
18 | domaine_ip = {}
19 |
20 | # limit=sys.argv[4]
21 |
22 |
23 | parser = argparse.ArgumentParser(description='metaharvester')
24 | parser.add_argument('--db', dest='db', help='db in mongo to store informations')
25 | parser.add_argument('--geoloc', dest='geoloc')
26 | parser.add_argument('--action', dest='action')
27 | parser.add_argument('--criteria', dest='criteria')
28 | parser.add_argument('--collection', dest='collection')
29 | parser.add_argument('--attr', dest='attr')
30 | parser.add_argument('--threadpool', dest='threadpool')
31 | parser.add_argument('--filters', dest='filters')
32 | parser.add_argument('--domains', dest='domains')
33 | parser.add_argument('--range', dest='range')
34 | parser.add_argument('--nmap_options', dest='nmap_options')
35 | args = parser.parse_args()
36 | db = args.db
37 | filters = args.filters
38 | criteria = args.criteria
39 | if criteria == None:
40 | criteria = ''
41 | geoloc = args.geoloc
42 | if geoloc == None:
43 | geoloc = ''
44 | collection = args.collection
45 | attr = args.attr
46 |     msg = db + ' ' + args.action + ' ' + criteria
47 | h.register(msg)
48 | act = Actions(db)
49 | if args.action == 'reset':
50 | act.reset()
51 | elif args.action == 'metasearch':
52 | if criteria and scriptsJS and db and geoloc:
53 | criteria = criteria.split(',')
54 | act.metasearch(criteria, scriptsJS, geoloc)
55 | elif args.action == 'search_ip':
56 | act.search_ip(geoloc, scriptsJS, args.range)
57 | elif args.action == 'create_network':
58 | act.create_network()
59 | elif args.action == 'metadata':
60 |         act.metadata_extract(db)
61 | elif args.action == 'create_result':
62 | if not criteria and not db:
63 | parser.print_help()
64 | else:
65 | if collection:
66 | act.create_result(collection, criteria)
67 | elif args.action == 'dnstree':
68 | if db:
69 | act.dnstree(db)
70 | elif args.action == 'crawl' and args.domains:
71 | if db:
72 | act.crawl(args.domains)
73 | elif args.action == 'cleandb':
74 | if db and filters:
75 | act.clean_db(filters)
76 | elif args.action == 'screenshots':
77 | if db and args.threadpool:
78 | act.screenshots(db, args.threadpool)
79 | else:
80 | parser.print_help()
81 | elif args.action == 'init':
82 | if db and attr and collection:
83 | act.init(db, collection, attr)
84 | else:
85 | parser.print_help()
86 | elif args.action == 'nmap':
87 | if args.nmap_options or args.range:
88 | act.scan_nmap(args.range, args.nmap_options)
89 | else:
90 |
91 | parser.print_help()
92 | sys.exit(1)
93 |
--------------------------------------------------------------------------------
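
Example invocations of main.py, matching the argument parser above (the database name, criteria and file paths are placeholders; --geoloc is required by the metasearch branch, and the bundled GeoLiteCity.dat is the natural candidate):

    python main.py --db osint_demo --action metasearch --criteria keyword1,keyword2 --geoloc geolocatisation/GeoLiteCity.dat
    python main.py --db osint_demo --action screenshots --threadpool 5
    python main.py --db osint_demo --action cleandb --filters filters.txt
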
/mongodb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/mongodb/__init__.py
--------------------------------------------------------------------------------
/mongodb/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/mongodb/__init__.pyc
--------------------------------------------------------------------------------
/mongodb/mongodb.py:
--------------------------------------------------------------------------------
1 | from pymongo import MongoClient
2 | import bson
3 | import pymongo
4 |
5 |
6 | class mongodb(object):
7 | def __init__(self, host, port, db):
8 | self.host = host
9 | self.port = port
10 | self.connection = MongoClient(host=host, port=port)
11 | self.db = self.connection[db]
12 |
13 | def insert(self, collection, key, value):
14 | col = self.db[collection]
15 | value_db = {'domaine': value}
16 | # col.create_index([('domaine', pymongo.DESCENDING)])
17 | col.save(value_db)
18 |
19 | def update(self, item, collection):
20 | col = self.db[collection]
21 | try:
22 | col.save(item)
23 | except bson.errors.InvalidStringData:
24 | print 'InvalidString ' + str(item)
25 |
26 | def selectbyDict(self, request, col):
27 | self.col = self.db[col]
28 | return self.col.find(request)
29 |
30 | def selectbycreteria(self, key, criteria, col):
31 | request = {key: criteria}
32 | self.col = self.db[col]
33 | return self.col.find(request)
34 |
35 | def selectall(self, collection):
36 | col = self.db[collection]
37 | return col.find()
38 |
39 | def insertMultiCriteria(self, collection, items):
40 | print "insert " + str(items)
41 | col = self.db[collection]
42 | try:
43 | col.save(items)
44 | except ValueError:
45 | print 'Encoding error: ' + str(items)
46 |
--------------------------------------------------------------------------------
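A minimal usage sketch of the wrapper above (not part of the repository; it assumes a MongoDB server on localhost:27017 and uses 'osint_demo' and 'new_domaines' as placeholder database and collection names, mirroring the import style of processing/createcorpus.py):

    from mongodb import mongodb

    # open a connection and wrap the 'osint_demo' database
    store = mongodb.mongodb('localhost', 27017, 'osint_demo')

    # insert() stores the value under the 'domaine' field
    store.insert('new_domaines', 'domaine', 'example.com')

    # selectbycreteria() builds the query {key: criteria} and returns a cursor
    for doc in store.selectbycreteria('domaine', 'example.com', 'new_domaines'):
        print doc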
/mongodb/mongodb.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/mongodb/mongodb.pyc
--------------------------------------------------------------------------------
/network/IPy.py:
--------------------------------------------------------------------------------
1 | """ IPy - class and tools for handling of IPv4 and IPv6 Addresses and Networks.
2 |
3 | $HeadURL: http://svn.23.nu/svn/repos/IPy/trunk/IPy.py $
4 |
5 | $Id: IPy.py 671 2004-08-22 21:02:29Z md $
6 |
7 | The IP class allows a comfortable parsing and handling for most
8 | notations in use for IPv4 and IPv6 Addresses and Networks. It was
9 | greatly inspired by RIPE's Perl module NET::IP's interface but
10 | doesn't share the implementation. It doesn't support non-CIDR netmasks,
11 | so funky stuff like a netmask 0xffffff0f can't be done here.
12 |
13 | >>> ip = IP('127.0.0.0/30')
14 | >>> for x in ip:
15 | ... print x
16 | ...
17 | 127.0.0.0
18 | 127.0.0.1
19 | 127.0.0.2
20 | 127.0.0.3
21 | >>> ip2 = IP('0x7f000000/30')
22 | >>> ip == ip2
23 | 1
24 | >>> ip.reverseNames()
25 | ['0.0.0.127.in-addr.arpa.', '1.0.0.127.in-addr.arpa.', '2.0.0.127.in-addr.arpa.', '3.0.0.127.in-addr.arpa.']
26 | >>> ip.reverseName()
27 | '0-3.0.0.127.in-addr.arpa.'
28 | >>> ip.iptype()
29 | 'PRIVATE'
30 |
31 | It can detect about a dozen different ways of expressing IP addresses
32 | and networks, parse them and distinguish between IPv4 and IPv6 addresses.
33 |
34 | >>> IP('10.0.0.0/8').version()
35 | 4
36 | >>> IP('::1').version()
37 | 6
38 | >>> print IP(0x7f000001)
39 | 127.0.0.1
40 | >>> print IP('0x7f000001')
41 | 127.0.0.1
42 | >>> print IP('127.0.0.1')
43 | 127.0.0.1
44 | >>> print IP('10')
45 | 10.0.0.0
46 | >>> print IP('1080:0:0:0:8:800:200C:417A')
47 | 1080:0000:0000:0000:0008:0800:200c:417a
48 | >>> print IP('1080::8:800:200C:417A')
49 | 1080:0000:0000:0000:0008:0800:200c:417a
50 | >>> print IP('::1')
51 | 0000:0000:0000:0000:0000:0000:0000:0001
52 | >>> print IP('::13.1.68.3')
53 | 0000:0000:0000:0000:0000:0000:0d01:4403
54 | >>> print IP('127.0.0.0/8')
55 | 127.0.0.0/8
56 | >>> print IP('127.0.0.0/255.0.0.0')
57 | 127.0.0.0/8
58 | >>> print IP('127.0.0.0-127.255.255.255')
59 | 127.0.0.0/8
60 |
61 | Nearly all class methods which return a string have an optional
62 | parameter 'wantprefixlen' which controls whether the prefixlen or netmask
63 | is printed. By default the prefixlen is always shown if the net
64 | contains more than one address.
65 |
66 | wantprefixlen == 0 / None don't return anything 1.2.3.0
67 | wantprefixlen == 1 /prefix 1.2.3.0/24
68 | wantprefixlen == 2 /netmask 1.2.3.0/255.255.255.0
69 | wantprefixlen == 3 -lastip 1.2.3.0-1.2.3.255
70 |
71 | You can also change the defaults on a per-object basis by fiddling with the class members
72 |
73 | NoPrefixForSingleIp
74 | WantPrefixLen
75 |
76 | >>> IP('10.0.0.0/32').strNormal()
77 | '10.0.0.0'
78 | >>> IP('10.0.0.0/24').strNormal()
79 | '10.0.0.0/24'
80 | >>> IP('10.0.0.0/24').strNormal(0)
81 | '10.0.0.0'
82 | >>> IP('10.0.0.0/24').strNormal(1)
83 | '10.0.0.0/24'
84 | >>> IP('10.0.0.0/24').strNormal(2)
85 | '10.0.0.0/255.255.255.0'
86 | >>> IP('10.0.0.0/24').strNormal(3)
87 | '10.0.0.0-10.0.0.255'
88 | >>> ip = IP('10.0.0.0')
89 | >>> print ip
90 | 10.0.0.0
91 | >>> ip.NoPrefixForSingleIp = None
92 | >>> print ip
93 | 10.0.0.0/32
94 | >>> ip.WantPrefixLen = 3
95 | >>> print ip
96 | 10.0.0.0-10.0.0.0
97 |
98 |
99 | Further Information might be available at http://c0re.jp/c0de/IPy/
100 |
101 | Hacked 2001 by drt@un.bewaff.net
102 |
103 | TODO:
104 | * better comparison (__cmp__ and friends)
105 | * tests for __cmp__
106 | * always write hex values lowercase
107 | * interpret 2001:1234:5678:1234/64 as 2001:1234:5678:1234::/64
108 | * move size in bits into class variables to get rid of some "if self._ipversion ..."
109 | * support for base85 encoding
110 | * support for output of IPv6 encoded IPv4 Addresses
111 | * update address type tables
112 | * first-last notation should be allowed for IPv6
113 | * add IPv6 docstring examples
114 | * check better for negative parameters
115 | * add addition / aggregation
116 | * move reverse name stuff out of the classes and refactor it
117 | * support for aggregation of more than two nets at once
118 | * support for aggregation with "holes"
119 | * support for finding common prefix
120 | * '>>' and '<<' for prefix manipulation
121 | * add our own exceptions instead of ValueError all the time
122 | * rename checkPrefix to checkPrefixOk
123 | * add more documentation and doctests
124 | * refactor
125 | """
126 |
127 | __rcsid__ = '$Id: IPy.py 671 2004-08-22 21:02:29Z md $'
128 | __version__ = '0.42'
129 |
130 | import types
131 |
132 | # Definition of the Ranges for IPv4 IPs
133 | # this should include www.iana.org/assignments/ipv4-address-space
134 | # and www.iana.org/assignments/multicast-addresses
135 | IPv4ranges = {
136 | '0' : 'PUBLIC', # fall back
137 | '00000000' : 'PRIVATE', # 0/8
138 | '00001010' : 'PRIVATE', # 10/8
139 | '01111111' : 'PRIVATE', # 127.0/8
140 | '1' : 'PUBLIC', # fall back
141 | '101011000001' : 'PRIVATE', # 172.16/12
142 | '1100000010101000' : 'PRIVATE', # 192.168/16
143 | '11011111' : 'RESERVED', # 223/8
144 | '111' : 'RESERVED' # 224/3
145 | }
146 |
147 | # Definition of the Ranges for IPv6 IPs
148 | # see also www.iana.org/assignments/ipv6-address-space,
149 | # www.iana.org/assignments/ipv6-tla-assignments,
150 | # www.iana.org/assignments/ipv6-multicast-addresses,
151 | # www.iana.org/assignments/ipv6-anycast-addresses
152 | IPv6ranges = {
153 | '00000000' : 'RESERVED', # ::/8
154 | '00000001' : 'UNASSIGNED', # 100::/8
155 | '0000001' : 'NSAP', # 200::/7
156 | '0000010' : 'IPX', # 400::/7
157 | '0000011' : 'UNASSIGNED', # 600::/7
158 | '00001' : 'UNASSIGNED', # 800::/5
159 | '0001' : 'UNASSIGNED', # 1000::/4
160 | '0010000000000000' : 'RESERVED', # 2000::/16 Reserved
161 | '0010000000000001' : 'ASSIGNABLE', # 2001::/16 Sub-TLA Assignments [RFC2450]
162 | '00100000000000010000000': 'ASSIGNABLE IANA', # 2001:0000::/29 - 2001:01F8::/29 IANA
163 | '00100000000000010000001': 'ASSIGNABLE APNIC', # 2001:0200::/29 - 2001:03F8::/29 APNIC
164 | '00100000000000010000010': 'ASSIGNABLE ARIN', # 2001:0400::/29 - 2001:05F8::/29 ARIN
165 | '00100000000000010000011': 'ASSIGNABLE RIPE', # 2001:0600::/29 - 2001:07F8::/29 RIPE NCC
166 | '0010000000000010' : '6TO4', # 2002::/16 "6to4" [RFC3056]
167 | '0011111111111110' : '6BONE', # 3FFE::/16 6bone Testing [RFC2471]
168 | '0011111111111111' : 'RESERVED', # 3FFF::/16 Reserved
169 | '010' : 'GLOBAL-UNICAST', # 4000::/3
170 | '011' : 'UNASSIGNED', # 6000::/3
171 | '100' : 'GEO-UNICAST', # 8000::/3
172 | '101' : 'UNASSIGNED', # A000::/3
173 | '110' : 'UNASSIGNED', # C000::/3
174 | '1110' : 'UNASSIGNED', # E000::/4
175 | '11110' : 'UNASSIGNED', # F000::/5
176 | '111110' : 'UNASSIGNED', # F800::/6
177 | '1111110' : 'UNASSIGNED', # FC00::/7
178 | '111111100' : 'UNASSIGNED', # FE00::/9
179 | '1111111010' : 'LINKLOCAL', # FE80::/10
180 | '1111111011' : 'SITELOCAL', # FEC0::/10
181 | '11111111' : 'MULTICAST', # FF00::/8
182 | '0' * 96 : 'IPV4COMP', # ::/96
183 | '0' * 80 + '1' * 16 : 'IPV4MAP', # ::FFFF:0:0/96
184 | '0' * 128 : 'UNSPECIFIED', # ::/128
185 | '0' * 127 + '1' : 'LOOPBACK' # ::1/128
186 | }
187 |
188 |
189 | class IPint:
190 | """Handling of IP addresses returning integers.
191 |
192 | Use class IP instead because some features are not implemented for
193 | IPint."""
194 |
195 | def __init__(self, data, ipversion = 0):
196 | """Create an instance of an IP object.
197 |
198 | Data can be a network specification or a single IP. IP
199 | Addresses can be specified in all forms understood by
200 | parseAddress(). The size of a network can be specified as
201 |
202 | /prefixlen a.b.c.0/24 2001:658:22a:cafe::/64
203 | -lastIP a.b.c.0-a.b.c.255 2001:658:22a:cafe::-2001:658:22a:cafe:ffff:ffff:ffff:ffff
204 | /decimal netmask a.b.c.d/255.255.255.0 not supported for IPv6
205 |
206 | If no size specification is given a size of 1 address (/32 for
207 | IPv4 and /128 for IPv6) is assumed.
208 |
209 | >>> print IP('127.0.0.0/8')
210 | 127.0.0.0/8
211 | >>> print IP('127.0.0.0/255.0.0.0')
212 | 127.0.0.0/8
213 | >>> print IP('127.0.0.0-127.255.255.255')
214 | 127.0.0.0/8
215 |
216 | See module documentation for more examples.
217 | """
218 |
219 | self.NoPrefixForSingleIp = 1 # Print no Prefixlen for /32 and /128
220 | self.WantPrefixLen = None # Do we want prefix printed by default? see _printPrefix()
221 |
222 | netbits = 0
223 | prefixlen = -1
224 |
225 | # handling of non string values in constructor
226 | if type(data) == types.IntType or type(data) == types.LongType:
227 | self.ip = long(data)
228 | if ipversion == 0:
229 | if self.ip < 0x100000000L:
230 | ipversion = 4
231 | else:
232 | ipversion = 6
233 | if ipversion == 4:
234 | prefixlen = 32
235 | elif ipversion == 6:
236 | prefixlen = 128
237 | else:
238 | raise ValueError, "only IPv4 and IPv6 supported"
239 | self._ipversion = ipversion
240 | self._prefixlen = prefixlen
241 | # handle IP instance as an parameter
242 | elif isinstance(data, IPint):
243 | self._ipversion = data._ipversion
244 | self._prefixlen = data._prefixlen
245 | self.ip = data.ip
246 | else:
247 | # TODO: refactor me!
248 | # splitting of a string into IP and prefixlen et. al.
249 | x = data.split('-')
250 | if len(x) == 2:
251 | # a.b.c.0-a.b.c.255 specification ?
252 | (ip, last) = x
253 | (self.ip, parsedVersion) = parseAddress(ip)
254 | if parsedVersion != 4:
255 | raise ValueError, "first-last notation only allowed for IPv4"
256 | (last, lastversion) = parseAddress(last)
257 | if lastversion != 4:
258 | raise ValueError, "last address should be IPv4, too"
259 | if last < self.ip:
260 | raise ValueError, "last address should be larger than first"
261 | size = last - self.ip
262 | netbits = _count1Bits(size)
263 | elif len(x) == 1:
264 | x = data.split('/')
265 | # if no prefix is given use defaults
266 | if len(x) == 1:
267 | ip = x[0]
268 | prefixlen = -1
269 | elif len(x) > 2:
270 | raise ValueError, "only one '/' allowed in IP Address"
271 | else:
272 | (ip, prefixlen) = x
273 | if prefixlen.find('.') != -1:
274 | # check if the user might have used a netmask like
275 | # a.b.c.d/255.255.255.0
276 | (netmask, vers) = parseAddress(prefixlen)
277 | if vers != 4:
278 | raise ValueError, "netmask must be IPv4"
279 | prefixlen = _netmaskToPrefixlen(netmask)
280 | elif len(x) > 2:
281 | raise ValueError, "only one '-' allowed in IP Address"
282 | else:
283 | raise ValueError, "can't parse"
284 |
285 | (self.ip, parsedVersion) = parseAddress(ip)
286 | if ipversion == 0:
287 | ipversion = parsedVersion
288 | if prefixlen == -1:
289 | if ipversion == 4:
290 | prefixlen = 32 - netbits
291 | elif ipversion == 6:
292 | prefixlen = 128 - netbits
293 | else:
294 | raise ValueError, "only IPv4 and IPv6 supported"
295 | self._ipversion = ipversion
296 | self._prefixlen = int(prefixlen)
297 |
298 | if not _checkNetaddrWorksWithPrefixlen(self.ip, self._prefixlen, self._ipversion):
299 | raise ValueError, "%s goes not well with prefixlen %d" % (hex(self.ip), self._prefixlen)
300 |
301 |
302 | def int(self):
303 | """Return the first / base / network addess as an (long) integer.
304 |
305 | The same as IP[0].
306 |
307 | >>> hex(IP('10.0.0.0/8').int())
308 | '0xA000000L'
309 | """
310 | return self.ip
311 |
312 | def version(self):
313 | """Return the IP version of this Object.
314 |
315 | >>> IP('10.0.0.0/8').version()
316 | 4
317 | >>> IP('::1').version()
318 | 6
319 | """
320 | return self._ipversion
321 |
322 | def prefixlen(self):
323 | """Returns Network Prefixlen.
324 |
325 | >>> IP('10.0.0.0/8').prefixlen()
326 | 8
327 | """
328 | return self._prefixlen
329 |
330 | def net(self):
331 | """Return the base (first) address of a network as an (long) integer."""
332 |
333 | return self.int()
334 |
335 | def broadcast(self):
336 | """Return the broadcast (last) address of a network as an (long) integer.
337 |
338 | The same as IP[-1]."""
339 | return self.int() + self.len() - 1
340 |
341 | def _printPrefix(self, want):
342 | """Prints Prefixlen/Netmask.
343 |
344 | Not really. In fact it is our universal Netmask/Prefixlen printer.
345 | This is considered an internal function.
346 |
347 | want == 0 / None don't return anything 1.2.3.0
348 | want == 1 /prefix 1.2.3.0/24
349 | want == 2 /netmask 1.2.3.0/255.255.255.0
350 | want == 3 -lastip 1.2.3.0-1.2.3.255
351 | """
352 |
353 | if (self._ipversion == 4 and self._prefixlen == 32) or \
354 | (self._ipversion == 6 and self._prefixlen == 128):
355 | if self.NoPrefixForSingleIp:
356 | want = 0
357 | if want == None:
358 | want = self.WantPrefixLen
359 | if want == None:
360 | want = 1
361 | if want:
362 | if want == 2:
363 | # this should work with IP and IPint
364 | netmask = self.netmask()
365 | if type(netmask) != types.IntType and type(netmask) != types.LongType:
366 | netmask = netmask.int()
367 | return "/%s" % (intToIp(netmask, self._ipversion))
368 | elif want == 3:
369 | return "-%s" % (intToIp(self.ip + self.len() - 1, self._ipversion))
370 | else:
371 | # default
372 | return "/%d" % (self._prefixlen)
373 | else:
374 | return ''
375 |
376 | # We have different Favours to convert to:
377 | # strFullsize 127.0.0.1 2001:0658:022a:cafe:0200:c0ff:fe8d:08fa
378 | # strNormal 127.0.0.1 2001:658:22a:cafe:200:c0ff:fe8d:08fa
379 | # strCompressed 127.0.0.1 2001:658:22a:cafe::1
380 | # strHex 0x7F000001L 0x20010658022ACAFE0200C0FFFE8D08FA
381 | # strDec 2130706433 42540616829182469433547974687817795834
382 |
383 | def strBin(self, wantprefixlen = None):
384 | """Return a string representation as a binary value.
385 |
386 | >>> print IP('127.0.0.1').strBin()
387 | 01111111000000000000000000000001
388 | """
389 |
390 |
391 | if self._ipversion == 4:
392 | bits = 32
393 | elif self._ipversion == 6:
394 | bits = 128
395 | else:
396 | raise ValueError, "only IPv4 and IPv6 supported"
397 |
398 | if self.WantPrefixLen == None and wantprefixlen == None:
399 | wantprefixlen = 0
400 | ret = _intToBin(self.ip)
401 | return '0' * (bits - len(ret)) + ret + self._printPrefix(wantprefixlen)
402 |
403 | def strCompressed(self, wantprefixlen = None):
404 | """Return a string representation in compressed format using '::' Notation.
405 |
406 | >>> print IP('127.0.0.1').strCompressed()
407 | 127.0.0.1
408 | >>> print IP('2001:0658:022a:cafe:0200::1').strCompressed()
409 | 2001:658:22a:cafe:200::1
410 | """
411 |
412 | if self.WantPrefixLen == None and wantprefixlen == None:
413 | wantprefixlen = 1
414 |
415 | if self._ipversion == 4:
416 | return self.strFullsize(wantprefixlen)
417 | else:
418 | # find the longest sequence of '0'
419 | hextets = [int(x, 16) for x in self.strFullsize(0).split(':')]
420 | # every element of followingzeros will contain the number of zeros
421 | # following the corresponding element of hextets
422 | followingzeros = [0] * 8
423 | for i in range(len(hextets)):
424 | followingzeros[i] = _countFollowingZeros(hextets[i:])
425 | # compressionpos is the position where we can start removing zeros
426 | compressionpos = followingzeros.index(max(followingzeros))
427 | if max(followingzeros) > 1:
428 | # generate string with the longest number of zeros cut out
429 | # now we need hextets as strings
430 | hextets = [x for x in self.strNormal(0).split(':')]
431 | while compressionpos < len(hextets) and hextets[compressionpos] == '0':
432 | del(hextets[compressionpos])
433 | hextets.insert(compressionpos, '')
434 | if compressionpos + 1 >= len(hextets):
435 | hextets.append('')
436 | if compressionpos == 0:
437 | hextets = [''] + hextets
438 | return ':'.join(hextets) + self._printPrefix(wantprefixlen)
439 | else:
440 | return self.strNormal() + self._printPrefix(wantprefixlen)
441 |
442 | def strNormal(self, wantprefixlen = None):
443 | """Return a string representation in the usual format.
444 |
445 | >>> print IP('127.0.0.1').strNormal()
446 | 127.0.0.1
447 | >>> print IP('2001:0658:022a:cafe:0200::1').strNormal()
448 | 2001:658:22a:cafe:200:0:0:1
449 | """
450 |
451 | if self.WantPrefixLen == None and wantprefixlen == None:
452 | wantprefixlen = 1
453 |
454 | if self._ipversion == 4:
455 | ret = self.strFullsize(0)
456 | elif self._ipversion == 6:
457 | ret = ':'.join([hex(x)[2:] for x in [int(x, 16) for x in self.strFullsize(0).split(':')]])
458 | else:
459 | raise ValueError, "only IPv4 and IPv6 supported"
460 |
461 |
462 |
463 | return ret + self._printPrefix(wantprefixlen)
464 |
465 | def strFullsize(self, wantprefixlen = None):
466 | """Return a string representation in the non mangled format.
467 |
468 | >>> print IP('127.0.0.1').strFullsize()
469 | 127.0.0.1
470 | >>> print IP('2001:0658:022a:cafe:0200::1').strFullsize()
471 | 2001:0658:022a:cafe:0200:0000:0000:0001
472 | """
473 |
474 | if self.WantPrefixLen == None and wantprefixlen == None:
475 | wantprefixlen = 1
476 |
477 | return intToIp(self.ip, self._ipversion).lower() + self._printPrefix(wantprefixlen)
478 |
479 | def strHex(self, wantprefixlen = None):
480 | """Return a string representation in hex format.
481 |
482 | >>> print IP('127.0.0.1').strHex()
483 | 0x7F000001
484 | >>> print IP('2001:0658:022a:cafe:0200::1').strHex()
485 | 0x20010658022ACAFE0200000000000001
486 | """
487 |
488 | if self.WantPrefixLen == None and wantprefixlen == None:
489 | wantprefixlen = 0
490 |
491 | x = hex(self.ip)
492 | if x[-1] == 'L':
493 | x = x[:-1]
494 | return x + self._printPrefix(wantprefixlen)
495 |
496 | def strDec(self, wantprefixlen = None):
497 | """Return a string representation in decimal format.
498 |
499 | >>> print IP('127.0.0.1').strDec()
500 | 2130706433
501 | >>> print IP('2001:0658:022a:cafe:0200::1').strDec()
502 | 42540616829182469433547762482097946625
503 | """
504 |
505 | if self.WantPrefixLen == None and wantprefixlen == None:
506 | wantprefixlen = 0
507 |
508 | x = str(self.ip)
509 | if x[-1] == 'L':
510 | x = x[:-1]
511 | return x + self._printPrefix(wantprefixlen)
512 |
513 | def iptype(self):
514 | """Return a description of the IP type ('PRIVATE', 'RESERVERD', etc).
515 |
516 | >>> print IP('127.0.0.1').iptype()
517 | PRIVATE
518 | >>> print IP('192.168.1.1').iptype()
519 | PRIVATE
520 | >>> print IP('195.185.1.2').iptype()
521 | PUBLIC
522 | >>> print IP('::1').iptype()
523 | LOOPBACK
524 | >>> print IP('2001:0658:022a:cafe:0200::1').iptype()
525 | ASSIGNABLE RIPE
526 |
527 | The type information for IPv6 is out of sync with reality.
528 | """
529 |
530 | # this could be greatly improved
531 |
532 | if self._ipversion == 4:
533 | iprange = IPv4ranges
534 | elif self._ipversion == 6:
535 | iprange = IPv6ranges
536 | else:
537 | raise ValueError, "only IPv4 and IPv6 supported"
538 |
539 | bits = self.strBin()
540 | for i in range(len(bits), 0, -1):
541 | if iprange.has_key(bits[:i]):
542 | return iprange[bits[:i]]
543 | return "unknown"
544 |
545 |
546 | def netmask(self):
547 | """Return netmask as an integer.
548 |
549 | >>> print hex(IP('195.185.0.0/16').netmask().int())
550 | 0xFFFF0000L
551 | """
552 |
553 | # TODO: unify with prefixlenToNetmask?
554 | if self._ipversion == 4:
555 | locallen = 32 - self._prefixlen
556 | elif self._ipversion == 6:
557 | locallen = 128 - self._prefixlen
558 | else:
559 | raise ValueError, "only IPv4 and IPv6 supported"
560 |
561 | return ((2L ** self._prefixlen) - 1) << locallen
562 |
563 |
564 | def strNetmask(self):
565 | """Return netmask as an string. Mostly useful for IPv6.
566 |
567 | >>> print IP('195.185.0.0/16').strNetmask()
568 | 255.255.0.0
569 | >>> print IP('2001:0658:022a:cafe::0/64').strNetmask()
570 | /64
571 | """
572 |
573 | # TODO: unify with prefixlenToNetmask?
574 | if self._ipversion == 4:
575 | locallen = 32 - self._prefixlen
576 | return intToIp(((2L ** self._prefixlen) - 1) << locallen, 4)
577 | elif self._ipversion == 6:
578 | locallen = 128 - self._prefixlen
579 | return "/%d" % self._prefixlen
580 | else:
581 | raise ValueError, "only IPv4 and IPv6 supported"
582 |
583 | def len(self):
584 | """Return the length of an subnet.
585 |
586 | >>> print IP('195.185.1.0/28').len()
587 | 16
588 | >>> print IP('195.185.1.0/24').len()
589 | 256
590 | """
591 |
592 | if self._ipversion == 4:
593 | locallen = 32 - self._prefixlen
594 | elif self._ipversion == 6:
595 | locallen = 128 - self._prefixlen
596 | else:
597 | raise ValueError, "only IPv4 and IPv6 supported"
598 |
599 | return 2L ** locallen
600 |
601 |
602 | def __len__(self):
603 | """Return the length of an subnet.
604 |
605 | Called to implement the built-in function len().
606 | It breaks with IPv6 networks. Does anybody know how to fix this?"""
607 |
608 | # Python < 2.2 has this silly restriction which breaks IPv6
609 | # how about Python >= 2.2 ... ouch - it persists!
610 |
611 | return int(self.len())
612 |
613 |
614 | def __getitem__(self, key):
615 | """Called to implement evaluation of self[key].
616 |
617 | >>> ip=IP('127.0.0.0/30')
618 | >>> for x in ip:
619 | ... print hex(x.int())
620 | ...
621 | 0x7F000000L
622 | 0x7F000001L
623 | 0x7F000002L
624 | 0x7F000003L
625 | >>> hex(ip[2].int())
626 | '0x7F000002L'
627 | >>> hex(ip[-1].int())
628 | '0x7F000003L'
629 | """
630 |
631 | if type(key) != types.IntType and type(key) != types.LongType:
632 | raise TypeError
633 | if abs(key) >= self.len():
634 | raise IndexError
635 | if key < 0:
636 | key = self.len() - abs(key)
637 |
638 | return self.ip + long(key)
639 |
640 |
641 |
642 | def __contains__(self, item):
643 | """Called to implement membership test operators.
644 |
645 | Should return true if item is in self, false otherwise. Item
646 | can be other IP-objects, strings or ints.
647 |
648 | >>> print IP('195.185.1.1').strHex()
649 | 0xC3B90101
650 | >>> 0xC3B90101L in IP('195.185.1.0/24')
651 | 1
652 | >>> '127.0.0.1' in IP('127.0.0.0/24')
653 | 1
654 | >>> IP('127.0.0.0/24') in IP('127.0.0.0/25')
655 | 0
656 | """
657 |
658 | item = IP(item)
659 | if item.ip >= self.ip and item.ip < self.ip + self.len() - item.len() + 1:
660 | return 1
661 | else:
662 | return 0
663 |
664 |
665 | def overlaps(self, item):
666 | """Check if two IP address ranges overlap.
667 |
668 | Returns 0 if the two ranges don't overlap, 1 if the given
669 | range overlaps at the end and -1 if it does at the beginning.
670 |
671 | >>> IP('192.168.0.0/23').overlaps('192.168.1.0/24')
672 | 1
673 | >>> IP('192.168.0.0/23').overlaps('192.168.1.255')
674 | 1
675 | >>> IP('192.168.0.0/23').overlaps('192.168.2.0')
676 | 0
677 | >>> IP('192.168.1.0/24').overlaps('192.168.0.0/23')
678 | -1
679 | """
680 |
681 | item = IP(item)
682 | if item.ip >= self.ip and item.ip < self.ip + self.len():
683 | return 1
684 | elif self.ip >= item.ip and self.ip < item.ip + item.len():
685 | return -1
686 | else:
687 | return 0
688 |
689 |
690 | def __str__(self):
691 | """Dispatch to the prefered String Representation.
692 |
693 | Used to implement str(IP)."""
694 |
695 | return self.strFullsize()
696 |
697 |
698 | def __repr__(self):
699 | """Print a representation of the Object.
700 |
701 | Used to implement repr(IP). Returns a string which evaluates
702 | to an identical Object (without the wantprefixlen stuff - see
703 | module docstring).
704 |
705 | >>> print repr(IP('10.0.0.0/24'))
706 | IP('10.0.0.0/24')
707 | """
708 |
709 | return("IPint('%s')" % (self.strCompressed(1)))
710 |
711 |
712 | def __cmp__(self, other):
713 | """Called by comparison operations.
714 |
715 | Should return a negative integer if self < other, zero if self
716 | == other, a positive integer if self > other.
717 |
718 | Networks with different prefixlen are considered non-equal.
719 | Networks with the same prefixlen and differing addresses are
720 | considered non-equal but are compared by their base address
721 | integer value to aid sorting of IP objects.
722 |
723 | The Version of Objects is not put into consideration.
724 |
725 | >>> IP('10.0.0.0/24') > IP('10.0.0.0')
726 | 1
727 | >>> IP('10.0.0.0/24') < IP('10.0.0.0')
728 | 0
729 | >>> IP('10.0.0.0/24') < IP('12.0.0.0/24')
730 | 1
731 | >>> IP('10.0.0.0/24') > IP('12.0.0.0/24')
732 | 0
733 |
734 | """
735 |
736 | # Im not really sure if this is "the right thing to do"
737 | if self._prefixlen < other.prefixlen():
738 | return (other.prefixlen() - self._prefixlen)
739 | elif self._prefixlen > other.prefixlen():
740 |
741 | # Fixed by Samuel Krempp:
742 |
743 | # The bug is quite obvious really (as 99% bugs are once
744 | # spotted, isn't it ? ;-) Because of precedence of
745 | # multiplication by -1 over the subtraction, prefixlen
746 | # differences were causing the __cmp__ function to always
747 | # return positive numbers, thus the function was failing
748 | # the basic assumptions for a __cmp__ function.
749 |
750 | # Namely we could have (a > b AND b > a), when the
751 | # prefixlen of a and b are different. (eg let
752 | # a=IP("1.0.0.0/24"); b=IP("2.0.0.0/16");) thus, anything
753 | # could happen when launching a sort algorithm..
754 | # everything's in order with the trivial, attached patch.
755 |
756 | return (self._prefixlen - other.prefixlen()) * -1
757 | else:
758 | if self.ip < other.ip:
759 | return -1
760 | elif self.ip > other.ip:
761 | return 1
762 | else:
763 | return 0
764 |
765 |
766 | def __hash__(self):
767 | """Called for the key object for dictionary operations, and by
768 | the built-in function hash(). Should return a 32-bit integer
769 | usable as a hash value for dictionary operations. The only
770 | required property is that objects which compare equal have the
771 | same hash value
772 |
773 | >>> hex(IP('10.0.0.0/24').__hash__())
774 | '0xf5ffffe7'
775 | """
776 |
777 | thehash = int(-1)
778 | ip = self.ip
779 | while ip > 0:
780 | thehash = thehash ^ (ip & 0x7fffffff)
781 | ip = ip >> 32
782 | thehash = thehash ^ self._prefixlen
783 | return int(thehash)
784 |
785 |
786 | class IP(IPint):
787 | """Class for handling IP Addresses and Networks."""
788 |
789 | def net(self):
790 | """Return the base (first) address of a network as an IP object.
791 |
792 | The same as IP[0].
793 |
794 | >>> IP('10.0.0.0/8').net()
795 | IP('10.0.0.0')
796 | """
797 | return IP(IPint.net(self))
798 |
799 | def broadcast(self):
800 | """Return the broadcast (last) address of a network as an IP object.
801 |
802 | The same as IP[-1].
803 |
804 | >>> IP('10.0.0.0/8').broadcast()
805 | IP('10.255.255.255')
806 | """
807 | return IP(IPint.broadcast(self))
808 |
809 | def netmask(self):
810 | """Return netmask as an IP object.
811 |
812 | >>> IP('10.0.0.0/8').netmask()
813 | IP('255.0.0.0')
814 | """
815 | return IP(IPint.netmask(self))
816 |
817 |
818 | def reverseNames(self):
819 | """Return a list with values forming the reverse lookup.
820 |
821 | >>> IP('213.221.113.87/32').reverseNames()
822 | ['87.113.221.213.in-addr.arpa.']
823 | >>> IP('213.221.112.224/30').reverseNames()
824 | ['224.112.221.213.in-addr.arpa.', '225.112.221.213.in-addr.arpa.', '226.112.221.213.in-addr.arpa.', '227.112.221.213.in-addr.arpa.']
825 | >>> IP('127.0.0.0/24').reverseNames()
826 | ['0.0.127.in-addr.arpa.']
827 | >>> IP('127.0.0.0/23').reverseNames()
828 | ['0.0.127.in-addr.arpa.', '1.0.127.in-addr.arpa.']
829 | >>> IP('127.0.0.0/16').reverseNames()
830 | ['0.127.in-addr.arpa.']
831 | >>> IP('127.0.0.0/15').reverseNames()
832 | ['0.127.in-addr.arpa.', '1.127.in-addr.arpa.']
833 | >>> IP('128.0.0.0/8').reverseNames()
834 | ['128.in-addr.arpa.']
835 | >>> IP('128.0.0.0/7').reverseNames()
836 | ['128.in-addr.arpa.', '129.in-addr.arpa.']
837 |
838 | """
839 |
840 | if self._ipversion == 4:
841 | ret =[]
842 | # TODO: Refactor. Add support for IPint objects
843 | if self.len() < 2**8:
844 | for x in self:
845 | ret.append(x.reverseName())
846 | elif self.len() < 2**16L:
847 | for i in range(0, self.len(), 2**8):
848 | ret.append(self[i].reverseName()[2:])
849 | elif self.len() < 2**24L:
850 | for i in range(0, self.len(), 2**16):
851 | ret.append(self[i].reverseName()[4:])
852 | else:
853 | for i in range(0, self.len(), 2**24):
854 | ret.append(self[i].reverseName()[6:])
855 | return ret
856 | elif self._ipversion == 6:
857 | s = hex(self.ip)[2:].lower()
858 | if s[-1] == 'l':
859 | s = s[:-1]
860 | if self._prefixlen % 4 != 0:
861 | raise NotImplementedError, "can't create IPv6 reverse names at sub nibble level"
862 | s = list(s)
863 | s.reverse()
864 | s = '.'.join(s)
865 | first_nibble_index = int(32 - (self._prefixlen / 4)) * 2
866 | return ["%s.ip6.int." % s[first_nibble_index:]]
867 | else:
868 | raise ValueError, "only IPv4 and IPv6 supported"
869 |
870 |
871 |
872 | def reverseName(self):
873 | """Return the value for reverse lookup/PTR records as RfC 2317 look alike.
874 |
875 | RfC 2317 is an ugly hack which only works for sub-/24 e.g. not
876 | for /23. Do not use it. Better set up a Zone for every
877 | address. See reverseName for a way to achieve that.
878 |
879 | >>> print IP('195.185.1.1').reverseName()
880 | 1.1.185.195.in-addr.arpa.
881 | >>> print IP('195.185.1.0/28').reverseName()
882 | 0-15.1.185.195.in-addr.arpa.
883 | """
884 |
885 | if self._ipversion == 4:
886 | s = self.strFullsize(0)
887 | s = s.split('.')
888 | s.reverse()
889 | first_byte_index = int(4 - (self._prefixlen / 8))
890 | if self._prefixlen % 8 != 0:
891 | nibblepart = "%s-%s" % (s[3-(self._prefixlen / 8)], intToIp(self.ip + self.len() - 1, 4).split('.')[-1])
892 | if nibblepart[-1] == 'l':
893 | nibblepart = nibblepart[:-1]
894 | nibblepart += '.'
895 | else:
896 | nibblepart = ""
897 |
898 | s = '.'.join(s[first_byte_index:])
899 | return "%s%s.in-addr.arpa." % (nibblepart, s)
900 |
901 | elif self._ipversion == 6:
902 | s = hex(self.ip)[2:].lower()
903 | if s[-1] == 'l':
904 | s = s[:-1]
905 | if self._prefixlen % 4 != 0:
906 | nibblepart = "%s-%s" % (s[self._prefixlen:], hex(self.ip + self.len() - 1)[2:].lower())
907 | if nibblepart[-1] == 'l':
908 | nibblepart = nibblepart[:-1]
909 | nibblepart += '.'
910 | else:
911 | nibblepart = ""
912 | s = list(s)
913 | s.reverse()
914 | s = '.'.join(s)
915 | first_nibble_index = int(32 - (self._prefixlen / 4)) * 2
916 | return "%s%s.ip6.int." % (nibblepart, s[first_nibble_index:])
917 | else:
918 | raise ValueError, "only IPv4 and IPv6 supported"
919 |
920 | def __getitem__(self, key):
921 | """Called to implement evaluation of self[key].
922 |
923 | >>> ip=IP('127.0.0.0/30')
924 | >>> for x in ip:
925 | ... print str(x)
926 | ...
927 | 127.0.0.0
928 | 127.0.0.1
929 | 127.0.0.2
930 | 127.0.0.3
931 | >>> print str(ip[2])
932 | 127.0.0.2
933 | >>> print str(ip[-1])
934 | 127.0.0.3
935 | """
936 | return IP(IPint.__getitem__(self, key))
937 |
938 | def __repr__(self):
939 | """Print a representation of the Object.
940 |
941 | >>> IP('10.0.0.0/8')
942 | IP('10.0.0.0/8')
943 | """
944 |
945 | return("IP('%s')" % (self.strCompressed(1)))
946 |
947 | def __add__(self, other):
948 | """Emulate numeric objects through network aggregation"""
949 | if self.prefixlen() != other.prefixlen():
950 | raise ValueError, "Only networks with the same prefixlen can be added."
951 | if self.prefixlen < 1:
952 | raise ValueError, "Networks with a prefixlen longer than /1 can't be added."
953 | if self.version() != other.version():
954 | raise ValueError, "Only networks with the same IP version can be added."
955 | if self > other:
956 | # fixed by Skinny Puppy
957 | return other.__add__(self)
958 | else:
959 | ret = IP(self.int())
960 | ret._prefixlen = self.prefixlen() - 1
961 | return ret
962 |
963 | def parseAddress(ipstr):
964 | """Parse a string and return the corrospondending IPaddress and the a guess of the IP version.
965 |
966 | The following forms are recognized:
967 | 0x0123456789abcdef # IPv4 if <= 0xffffffff else IPv6
968 | 123.123.123.123 # IPv4
969 | 123.123 # 0-padded IPv4
970 | 1080:0000:0000:0000:0008:0800:200C:417A
971 | 1080:0:0:0:8:800:200C:417A
972 | 1080:0::8:800:200C:417A
973 | ::1
974 | ::
975 | 0:0:0:0:0:FFFF:129.144.52.38
976 | ::13.1.68.3
977 | ::FFFF:129.144.52.38
978 | """
979 |
980 | # TODO: refactor me!
981 | if ipstr.startswith('0x'):
982 | ret = long(ipstr[2:], 16)
983 | if ret > 0xffffffffffffffffffffffffffffffffL:
984 | raise ValueError, "%r: IP Address can't be bigger than 2^128" % (ipstr)
985 | if ret < 0x100000000L:
986 | return (ret, 4)
987 | else:
988 | return (ret, 6)
989 |
990 | if ipstr.find(':') != -1:
991 | # assume IPv6
992 | if ipstr.find(':::') != -1:
993 | raise ValueError, "%r: IPv6 Address can't contain ':::'" % (ipstr)
994 | hextets = ipstr.split(':')
995 | if ipstr.find('.') != -1:
996 | # this might be a mixed address like '0:0:0:0:0:0:13.1.68.3'
997 | (v4, foo) = parseAddress(hextets[-1])
998 | assert foo == 4
999 | del(hextets[-1])
1000 | hextets.append(hex(v4 >> 16)[2:-1])
1001 | hextets.append(hex(v4 & 0xffff)[2:-1])
1002 | if len(hextets) > 8:
1003 | raise ValueError, "%r: IPv6 Address with more than 8 hexletts" % (ipstr)
1004 | if len(hextets) < 8:
1005 | if '' not in hextets:
1006 | raise ValueError, "%r IPv6 Address with less than 8 hexletts and without '::'" % (ipstr)
1007 | # catch :: at the beginning or end
1008 | if hextets.index('') < len(hextets) - 1 and hextets[hextets.index('')+1] == '':
1009 | hextets.remove('')
1010 | # catch '::'
1011 | if hextets.index('') < len(hextets) - 1 and hextets[hextets.index('')+1] == '':
1012 | hextets.remove('')
1013 |
1014 | for foo in range(9-len(hextets)):
1015 | hextets.insert(hextets.index(''), '0')
1016 | hextets.remove('')
1017 | if '' in hextets:
1018 | raise ValueError, "%r IPv6 Address may contain '::' only once" % (ipstr)
1019 | if '' in hextets:
1020 | raise ValueError, "%r IPv6 Address may contain '::' only if it has less than 8 hextets" % (ipstr)
1021 | num = ''
1022 | for x in hextets:
1023 | if len(x) < 4:
1024 | x = ((4 - len(x)) * '0') + x
1025 | if int(x, 16) < 0 or int(x, 16) > 0xffff:
1026 | raise ValueError, "%r: single hextet must be 0 <= hextet <= 0xffff which isn't true for %s" % (ipstr, x)
1027 | num += x
1028 | return (long(num, 16), 6)
1029 |
1030 | elif len(ipstr) == 32:
1031 | # assume IPv6 in pure hexadecimal notation
1032 | return (long(ipstr, 16), 6)
1033 |
1034 | elif ipstr.find('.') != -1 or (len(ipstr) < 4 and int(ipstr) < 256):
1035 | # assume IPv4 ('127' gets interpreted as '127.0.0.0')
1036 | bytes = ipstr.split('.')
1037 | if len(bytes) > 4:
1038 | raise ValueError, "IPv4 Address with more than 4 bytes"
1039 | bytes += ['0'] * (4 - len(bytes))
1040 | bytes = [long(x) for x in bytes]
1041 | for x in bytes:
1042 | if x > 255 or x < 0:
1043 | raise ValueError, "%r: single byte must be 0 <= byte < 256" % (ipstr)
1044 | return ((bytes[0] << 24) + (bytes[1] << 16) + (bytes[2] << 8) + bytes[3], 4)
1045 |
1046 | else:
1047 | # we try to interpret it as a decimal digit -
1048 | # this only works for numbers > 255 ... others
1049 | # will be interpreted as IPv4 first byte
1050 | ret = long(ipstr)
1051 | if ret > 0xffffffffffffffffffffffffffffffffL:
1052 | raise ValueError, "IP Address cant be bigger than 2^128"
1053 | if ret <= 0xffffffffL:
1054 | return (ret, 4)
1055 | else:
1056 | return (ret, 6)
1057 |
1058 |
1059 | def intToIp(ip, version):
1060 | """Transform an integer string into an IP address."""
1061 |
1062 | # just to be sure and hoping for Python 2.22
1063 | ip = long(ip)
1064 |
1065 | if ip < 0:
1066 | raise ValueError, "IPs can't be negative: %d" % (ip)
1067 |
1068 | ret = ''
1069 | if version == 4:
1070 | if ip > 0xffffffffL:
1071 | raise ValueError, "IPv4 Addresses can't be larger than 0xffffffff: %s" % (hex(ip))
1072 | for l in range(4):
1073 | ret = str(ip & 0xffL) + '.' + ret
1074 | ip = ip >> 8;
1075 | ret = ret[:-1]
1076 | elif version == 6:
1077 | if ip > 0xffffffffffffffffffffffffffffffffL:
1078 | raise ValueError, "IPv6 Addresses can't be larger than 0xffffffffffffffffffffffffffffffff: %s" % (hex(ip))
1079 | l = '0' * 32 + hex(ip)[2:-1]
1080 | for x in range(1,33):
1081 | ret = l[-x] + ret
1082 | if x % 4 == 0:
1083 | ret = ':' + ret
1084 | ret = ret[1:]
1085 | else:
1086 | raise ValueError, "only IPv4 and IPv6 supported"
1087 |
1088 | return ret;
1089 |
1090 | def _ipVersionToLen(version):
1091 | """Return number of bits in address for a certain IP version.
1092 |
1093 | >>> _ipVersionToLen(4)
1094 | 32
1095 | >>> _ipVersionToLen(6)
1096 | 128
1097 | >>> _ipVersionToLen(5)
1098 | Traceback (most recent call last):
1099 | File "", line 1, in ?
1100 | File "IPy.py", line 1076, in _ipVersionToLen
1101 | raise ValueError, "only IPv4 and IPv6 supported"
1102 | ValueError: only IPv4 and IPv6 supported
1103 | """
1104 |
1105 | if version == 4:
1106 | return 32
1107 | elif version == 6:
1108 | return 128
1109 | else:
1110 | raise ValueError, "only IPv4 and IPv6 supported"
1111 |
1112 |
1113 | def _countFollowingZeros(l):
1114 | """Return Nr. of elements containing 0 at the beginning th the list."""
1115 | if len(l) == 0:
1116 | return 0
1117 | elif l[0] != 0:
1118 | return 0
1119 | else:
1120 | return 1 + _countFollowingZeros(l[1:])
1121 |
1122 |
1123 | _BitTable = {'0': '0000', '1': '0001', '2': '0010', '3': '0011',
1124 | '4': '0100', '5': '0101', '6': '0110', '7': '0111',
1125 | '8': '1000', '9': '1001', 'a': '1010', 'b': '1011',
1126 | 'c': '1100', 'd': '1101', 'e': '1110', 'f': '1111'}
1127 |
1128 | def _intToBin(val):
1129 | """Return the binary representation of an integer as string."""
1130 |
1131 | if val < 0:
1132 | raise ValueError, "Only positive Values allowed"
1133 | s = hex(val).lower()
1134 | ret = ''
1135 | if s[-1] == 'l':
1136 | s = s[:-1]
1137 | for x in s[2:]:
1138 | if __debug__:
1139 | if not _BitTable.has_key(x):
1140 | raise AssertionError, "hex() returned strange result"
1141 | ret += _BitTable[x]
1142 | # remove leading zeros
1143 | while ret[0] == '0' and len(ret) > 1:
1144 | ret = ret[1:]
1145 | return ret
1146 |
1147 | def _count1Bits(num):
1148 | """Find the highest bit set to 1 in an integer."""
1149 | ret = 0
1150 | while num > 0:
1151 | num = num >> 1
1152 | ret += 1
1153 | return ret
1154 |
1155 | def _count0Bits(num):
1156 | """Find the highest bit set to 0 in an integer."""
1157 |
1158 | # this could be so easy if _count1Bits(~long(num)) would work as expected
1159 | num = long(num)
1160 | if num < 0:
1161 | raise ValueError, "Only positive Numbers please: %s" % (num)
1162 | ret = 0
1163 | while num > 0:
1164 | if num & 1 == 1:
1165 | break
1166 | num = num >> 1
1167 | ret += 1
1168 | return ret
1169 |
1170 |
1171 | def _checkPrefix(ip, prefixlen, version):
1172 | """Check the validity of a prefix
1173 |
1174 | Checks if the variant part of a prefix only has 0s, and the length is
1175 | correct.
1176 |
1177 | >>> _checkPrefix(0x7f000000L, 24, 4)
1178 | 1
1179 | >>> _checkPrefix(0x7f000001L, 24, 4)
1180 | 0
1181 | >>> repr(_checkPrefix(0x7f000001L, -1, 4))
1182 | 'None'
1183 | >>> repr(_checkPrefix(0x7f000001L, 33, 4))
1184 | 'None'
1185 | """
1186 |
1187 | # TODO: unify this v4/v6/invalid code in a function
1188 | bits = _ipVersionToLen(version)
1189 |
1190 | if prefixlen < 0 or prefixlen > bits:
1191 | return None
1192 |
1193 | if ip == 0:
1194 | zbits = bits + 1
1195 | else:
1196 | zbits = _count0Bits(ip)
1197 | if zbits < bits - prefixlen:
1198 | return 0
1199 | else:
1200 | return 1
1201 |
1202 |
1203 | def _checkNetmask(netmask, masklen):
1204 | """Checks if a netmask is expressable as e prefixlen."""
1205 |
1206 | num = long(netmask)
1207 | bits = masklen
1208 |
1209 | # remove zero bits at the end
1210 | while (num & 1) == 0:
1211 | num = num >> 1
1212 | bits -= 1
1213 | if bits == 0:
1214 | break
1215 | # now check if the rest consists only of ones
1216 | while bits > 0:
1217 | if (num & 1) == 0:
1218 | raise ValueError, "Netmask %s can't be expressed as an prefix." % (hex(netmask))
1219 | num = num >> 1
1220 | bits -= 1
1221 |
1222 |
1223 | def _checkNetaddrWorksWithPrefixlen(net, prefixlen, version):
1224 | """Check if a base addess of e network is compatible with a prefixlen"""
1225 | if net & _prefixlenToNetmask(prefixlen, version) == net:
1226 | return 1
1227 | else:
1228 | return 0
1229 |
1230 |
1231 | def _netmaskToPrefixlen(netmask):
1232 | """Convert an Integer reprsenting a Netmask to an prefixlen.
1233 |
1234 | E.g. 0xffffff00 (255.255.255.0) returns 24
1235 | """
1236 |
1237 | netlen = _count0Bits(netmask)
1238 | masklen = _count1Bits(netmask)
1239 | _checkNetmask(netmask, masklen)
1240 | return masklen - netlen
1241 |
1242 |
1243 | def _prefixlenToNetmask(prefixlen, version):
1244 | """Return a mask of n bits as a long integer.
1245 |
1246 | From 'IP address conversion functions with the builtin socket module' by Alex Martelli
1247 | http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66517
1248 | """
1249 | if prefixlen == 0:
1250 | return 0
1251 | elif prefixlen < 0:
1252 | raise ValueError, "Prefixlen must be > 0"
1253 | return ((2L<<prefixlen-1)-1) << (_ipVersionToLen(version) - prefixlen)
--------------------------------------------------------------------------------
/processing/create_result.py:
--------------------------------------------------------------------------------
20 | critere = [critere]
21 |
22 | with open(self.dbname + '_' + '_'.join(critere) + '.csv', 'w') as fw:
23 | for domaine in domaines:
24 | try:
25 | towrite = ''
26 | for key in critere:
27 | infos = domaine[key]
28 | if len(infos) > 0:
29 | if isinstance(infos, list):
30 | infos = ','.join(infos)
31 | towrite = towrite + ',' + str(infos)
32 |
33 | fw.write(towrite[1:] + '\n')
34 | except KeyError:
35 | print 'domaine: ' + str(domaine)
36 | except pymongo.errors.OperationFailure:
37 | print 'error mongo ' + str(domaine)
38 |
--------------------------------------------------------------------------------
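The CSV export loop above flattens one MongoDB document per row: list values are joined with ',', each selected key is appended after a comma, and the leading comma is stripped before the row is written. A standalone sketch of that row-building step (not part of the repository, shown only to illustrate the flattening):

    # hypothetical helper reproducing the loop body above
    def build_row(domaine, keys):
        towrite = ''
        for key in keys:
            infos = domaine[key]
            if len(infos) > 0:
                if isinstance(infos, list):
                    infos = ','.join(infos)
                towrite = towrite + ',' + str(infos)
        return towrite[1:]

    print build_row({'domaine': 'example.com', 'ips': ['1.2.3.4', '5.6.7.8']}, ['domaine', 'ips'])
    # -> example.com,1.2.3.4,5.6.7.8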
/processing/createcorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from mongodb import mongodb
3 | import sys
4 | import filters
5 | db=sys.argv[1]
6 | mdb=mongodb.mongodb('localhost',27017,db)
7 |
8 | i=0
9 |
10 | with open(db+'_domaine.txt','w') as fw:
11 | fw.write('**** *domaine\n')
12 | for domaine in mdb.selectall('metadatas'):
13 | fw.write(domaine['domaine'])
14 | fw.write('\n')
15 | with open(db+'_metadatas.txt','w') as fw:
16 | fw.write('**** *metadata\n')
17 | for domaine in mdb.selectall('metadatas'):
18 | meta=domaine['meta']
19 | for filt in filters.filters_metadata:
20 | meta=meta.replace(filt,'')
21 | meta=meta.replace(filt.swapcase(),'')
22 | fw.write(meta.encode('ascii','ignore'))
23 | fw.write('\n')
24 |
25 |
26 |
27 | fw.close()
28 |
29 |
30 |
--------------------------------------------------------------------------------
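The script above is invoked with the target database as its only argument and writes two corpus files, e.g. (placeholder database name):

    python createcorpus.py osint_demo
    # writes osint_demo_domaine.txt and osint_demo_metadatas.txt from the 'metadatas' collection

gouv_domaine.txt and gouv_metadatas.txt further down in this tree are presumably the output of such a run against a 'gouv' database.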
/processing/dnstree.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Dec 20, 2012
3 |
4 | @author: slarinier
5 | '''
6 | from pymongo import MongoClient
7 | from pyfaup.faup import Faup
8 |
9 |
10 | class DNSTree(object):
11 | '''
12 | classdocs
13 | '''
14 |
15 | def __init__(self, db_value):
16 | '''
17 | Constructor
18 | '''
19 | connection = MongoClient(host='localhost', port=27017)
20 | self.db = connection[db_value]
21 |
22 | def process(self):
23 | list_domains = self.db['new_domaines'].distinct('domaine')
24 | fex = Faup()
25 | for domain in list_domains:
26 | url = 'http://' + str(domain)
27 | fex.decode(url)
28 |
29 | try:
30 | print (
31 | fex.get_tld() + ',' + fex.get_domain() + ',' + ','.join(fex.get_subdomain().split('.')[::-1]).replace('www',
32 | '')).replace(
33 | ',,', ',')
34 | except:
35 | pass
--------------------------------------------------------------------------------
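A minimal driver for the class above (not part of the repository; it assumes pymongo and pyfaup are installed, a MongoDB server on localhost:27017, and a populated 'new_domaines' collection in the placeholder database 'osint_demo'):

    from processing.dnstree import DNSTree

    # prints one "tld,domain,reversed subdomain labels" line per distinct domain
    tree = DNSTree('osint_demo')
    tree.process()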
/processing/filters.py:
--------------------------------------------------------------------------------
1 | filters_metadata=['charset','text','iso','html','-8859-1','www','fr']
2 |
--------------------------------------------------------------------------------
/processing/filters.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/processing/filters.pyc
--------------------------------------------------------------------------------
/processing/gouv.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/processing/gouv.log
--------------------------------------------------------------------------------
/processing/gouv_domaine.txt:
--------------------------------------------------------------------------------
1 | **** *domaine
2 | legifrance.gouv.fr
3 | www.eure.sit.gouv.fr
4 | archives.livreblancdefenseetsecurite.gouv.fr
5 | www.yonne.sit.gouv.fr
6 | www.internet.gouv.fr
7 | www.direct-fr.com
8 | forum.webmaster-rank.info
9 | www.drees.sante.gouv.fr
10 | www.impots.gouv.fr
11 | www.oncfs.gouv.fr
12 | www.legifrance.gouv.fr
13 | www.interieur.gouv.fr
14 | www.refondonslecole.gouv.fr
15 | www.immigration.gouv.fr
16 | www.dmp.gouv.fr
17 | www.vendee.gouv.fr
18 | www.gouvernement.fr
19 | direccte.gouv.fr
20 | service-public.fr
21 | www.jeunes.gouv.fr
22 | vosdroits.service-public.fr
23 | www.tourisme.gouv.fr
24 | www.service-public.fr
25 | archives.forum.gouv.fr
26 | www.banqoutils.education.gouv.fr
27 | www2.impots.gouv.fr
28 | www.sgdn.gouv.fr
29 | www.cerpet.education.gouv.fr
30 | www.legifrance.org
31 | www-int.dmp.gouv.fr
32 | www.hcst.fr
33 | www.publinetd5.education.fr
34 | archives.dividende-numerique.fr
35 | cpcnu.fr
36 | www.dicod.defense.gouv.fr
37 | www.legifrance.com
38 | delegation.internet.gouv.fr
39 | www.clemi.org
40 | www-prod.sante.gouv.fr
41 | archives.europe.gouv.fr
42 | www.sante.gouv.fr
43 | www.nord.gouv.fr
44 | www.ove-national.education.fr
45 | www.2011-annee-droits-patients.sante.gouv.fr
46 | org-www.sante.gouv.fr
47 | www.emsome.terre.defense.gouv.fr
48 | www.rt519.terre.defense.gouv.fr
49 | archives.surfez-intelligent.gouv.fr
50 | www.restructurations.defense.gouv.fr
51 | www.bretagne.pref.gouv.fr
52 | www.bca13.terre.defense.gouv.fr
53 | ons.education.gouv.fr
54 | www.juryeps.education.fr
55 | 160.92.162.230
56 | www.publinetde.education.fr
57 | publinetce2.education.fr
58 | www.cdj59.org
59 | www.ancien.eure.pref.gouv.fr
60 | www.drdjs-lorraine.jeunesse-sports.gouv.fr
61 | www.hce.education.fr
62 | www.cpcnu.fr
63 | www.bretagne.drjscs.gouv.fr
64 | www.centre.drjscs.gouv.fr
65 | www.paca.drjscs.gouv.fr
66 | www.auvergne.drjscs.gouv.fr
67 | www.observatoire-parite.gouv.fr
68 | www.rama3.terre.defense.gouv.fr
69 | www.securite-sociale.fr
70 | www.haute-normandie.drjscs.gouv.fr
71 | www.bilrif.sga.defense.gouv.fr
72 | www.drjscs.gouv.fr
73 | www.aquitaine.drjscs.gouv.fr
74 | www.sports.gouv.fr
75 | www.anesm.sante.gouv.fr
76 | www.cesat.terre.defense.gouv.fr
77 | www.franche-comte.drjscs.gouv.fr
78 | www.garnison-besancon.terre.defense.gouv.fr
79 | www.etrs.terre.defense.gouv.fr
80 | www.bca7.terre.defense.gouv.fr
81 | www.lorraine.drjscs.gouv.fr
82 | www.midi-pyrenees.drjscs.gouv.fr
83 | www.rhone-alpes.drjscs.gouv.fr
84 | fr.webmaster-rank.info
85 | it.webmaster-rank.info
86 | easy404.webmaster-rank.info
87 | www.webmaster-rank.info
88 | www.pyrenees-atlantiques.pref.gouv.fr
89 | www.vigicrues.developpement-durable.gouv.fr
90 | www.memoiredeshommes.sga.defense.gouv.fr
91 | www.vigicrues.ecologie.gouv.fr
92 | www.servicehistorique.sga.defense.gouv.fr
93 | www.basse-normandie.pref.gouv.fr
94 | www.loire.pref.gouv.fr
95 | www.oise.pref.gouv.fr
96 | www.dordogne.pref.gouv.fr
97 | dordogne.pref.gouv.fr
98 | www.morbihan.pref.gouv.fr
99 | formation.oncfs.gouv.fr
100 | www.martinique.pref.gouv.fr
101 | www.drome.pref.gouv.fr
102 | loire.gouv.fr
103 | www.tarn-et-garonne.pref.gouv.fr
104 | www.cada.fr
105 | www.loire.gouv.fr
106 | m.geoportail.fr
107 | www.conseilculturel-upm.gouv.fr
108 | www.essonne.gouv.fr
109 | archives.internet.gouv.fr
110 | www.like-rank.com
111 | www.drogues.gouv.fr
112 | www.nord.pref.gouv.fr
113 | drogues.gouv.fr
114 | www.haute-saone.pref.gouv.fr
115 | www.maine-et-loire.pref.gouv.fr
116 | www.gopher.com
117 | www.ariege.pref.gouv.fr
118 | www2.direct-fr.com
119 | www.pyrenees-orientales.pref.gouv.fr
120 | www.haut-rhin.pref.gouv.fr
121 | www.isere.pref.gouv.fr
122 | www.somme.pref.gouv.fr
123 | search.kiwee.com
124 | ardeche.pref.gouv.fr
125 | vendee.gouv.fr
126 | www.franche-comte.pref.gouv.fr
127 | org-www.impots.gouv.fr
128 | www.contact.impots.gouv.fr
129 | interne.impots.gouv.fr
130 | contacts.impots.gouv.fr
131 | www.champagne-ardenne.pref.gouv.fr
132 | www.cartocrime.net
133 | www.guadeloupe.dieccte.gouv.fr
134 | www.guyane.dieccte.gouv.fr
135 | www.auvergne.direccte.gouv.fr
136 | www.boamp.fr
137 | www.paca.direccte.gouv.fr
138 | www.gers.pref.gouv.fr
139 | www.savoie.pref.gouv.fr
140 | www.ladocumentationfrancaise.fr
141 | www.vie-publique.fr
142 | www.pme.service-public.fr
143 | www.direccte.gouv.fr
144 | www.alsace.direccte.gouv.fr
145 | www.marne.pref.gouv.fr
146 | lannuaire.service-public.fr
147 | www.corse.direccte.gouv.fr
148 | www.ddjs-ardennes.jeunesse-sports.gouv.fr
149 | www.correze.pref.gouv.fr
150 | www.centre.pref.gouv.fr
151 | landes.pref.gouv.fr
152 | www.recrutement.terre.defense.gouv.fr
153 | search.zip2.com
154 | www.nievre.pref.gouv.fr
155 | www.contacts.impots.gouv.fr
156 | search.firstplace.com
157 | www.poitou-charentes.direccte.gouv.fr
158 | www.commentcamarche.net
159 | www.idf.direccte.gouv.fr
160 | www.ardennes.pref.gouv.fr
161 | www.pays-de-la-loire.direccte.gouv.fr
162 | www.mayotte.dieccte.gouv.fr
163 | experts-univers.com
164 | m.vosdroits.service-public.fr
165 | communaute.vie-publique.fr
166 | discours.vie-publique.fr
167 | interactif.service-public.fr
168 | pme.service-public.fr
169 | www.bourgogne.direccte.gouv.fr
170 | sciencespo.ladocumentationfrancaise.fr
171 | environnement-sante.com
172 | incredimailhosted.infospace.com
173 | mamma.infospace.com
174 | www.concours-civils.defense.gouv.fr
175 | www.concours-civils.sga.defense.gouv.fr
176 | www.leroustidou.com
177 | www.ri92.terre.defense.gouv.fr
178 | www.gites-erable-alsace.com
179 | www.formation.terre.defense.gouv.fr
180 | 90plan.ovh.net
181 | www.sante-environnement-travail.fr
182 | dmp.gouv.fr
183 | ladsetjockeys-lefilm.fr
184 | www.sante-environnement.fr
185 | www.topfouine.com
186 | www.basse-normandie.direccte.gouv.fr
187 | www.laubergine-eygalieres.com
188 | www.bretagne.direccte.gouv.fr
189 | www.meuse.pref.gouv.fr
190 | www.bilrif.defense.gouv.fr
191 | www.antoine.fr
192 | www.terredebruyere.com
193 | www.beghingroux.fr
194 | www.auberge-provencale.fr
195 | www.soirsdefetes.com
196 | www.telestock.fr
197 | sante-environnement.org
198 | www.chaletliotard.fr
199 | www.cars-la-populaire.com
200 | environnement-sante.org
201 | www.sermesdistribution.fr
202 | www.camping-la-pinede.com
203 | patisserieolivierbourau.com
204 | www.gourmets-events.com
205 | www.environnement-sante.net
206 | www.environnement-sante.fr
207 | www.alliancepavillons.org
208 | atelierfeesbrodeuses.fr
209 | www.dermophilindien-lab.com
210 | www.la-cabane-perchee.com
211 | www.aeta-audio.com
212 | www.sahlm79.fr
213 | www.ba118.air.defense.gouv.fr
214 | www.cclinouest.com
215 | www.rg3.terre.defense.gouv.fr
216 | www.iserba.fr
217 | www.fantasyforest.fr
218 | www.televitale.fr
219 | www.serialproducteurs.com
220 | www.ville-saintdie.fr
221 | www.coiffure2010.com
222 | www.cehd.sga.defense.gouv.fr
223 | www.varini.org
224 | www.ain.pref.gouv.fr
225 | www.beauregard-hotel.com
226 | www.transports-bernard.com
227 | www.tattootatouage.com
228 | www.automobile2010.com
229 | www.eetaa722.air.defense.gouv.fr
230 | coiffure2008.com
231 | www.nettoyagebijoux.com
232 | www.stages.defense.gouv.fr
233 | coupe-de-cheveux-homme.com
234 | www.coupedecheveuxfemme.com
235 | www.ba901.air.defense.gouv.fr
236 | www.ba106.air.defense.gouv.fr
237 | www.ba120.air.defense.gouv.fr
238 | www.coiffure2008.com
239 | www.web200708.clarahost.fr
240 | www.beautedeco.com
241 | www.qcclick.com
242 | coiffure2009.com
243 | www.epa749.air.defense.gouv.fr
244 | www.vpgreen.fr
245 | www.bcsfreelance.com
246 | www.lechaletdumoulin.fr
247 | www.media.recrutement.terre.defense.gouv.fr
248 | www.photo-phore.com
249 | www.marocchezlhabitant.com
250 | www.industube.com
251 | www.georget.fr
252 | www.acrie.fr
253 | mobile.recrutement.terre.defense.gouv.fr
254 | www.ba942.air.defense.gouv.fr
255 | www.hotelsatlas.com
256 | www.pharmacie-de-lherm.fr
257 | www.rpmi.fr
258 | 87.106.4.168
259 | www.ba107.air.defense.gouv.fr
260 | www.enligne.recrutement.terre.defense.gouv.fr
261 | www.hotel-st-georges.com
262 | www.ville-challans.fr
263 | www.ba217.air.defense.gouv.fr
264 | www.airmobilite.air.defense.gouv.fr
265 | www.ba721.air.defense.gouv.fr
266 | www.palmiers-ocean.fr
267 | www.quellemutuelles.com
268 | www.cfas.air.defense.gouv.fr
269 | www.ba112.air.defense.gouv.fr
270 | www.cma-bareges.air.defense.gouv.fr
271 | www.da204.air.defense.gouv.fr
272 | ead.ent-etrs.net
273 | www.ent-etrs.net
274 | www.eppa.sante.defense.gouv.fr
275 | www.plasti-ouest.com
276 | pharmacieduvalsaintjean.e-officine.net
277 | www.cedimattp.fr
278 | www.machecoul.com
279 | www.tsr-be.com
280 | pharmaciecentralelens.e-officine.net
281 | www.reseauetudiant.com
282 | twitter-icon.com
283 | search.egreetings.com
284 | www.ado.justice.gouv.fr
285 | www.experatoo.com
286 | www.journaldunet.com
287 | www.annuaires.justice.gouv.fr
288 | www.coiffures2011.net
289 | www.saint-martin-de-sanzay.fr
290 | www.puregourmandise.com
291 | www.yatoshi.com
292 | www.techniques-transparentes.com
293 | vecteurdiffusion.com
294 | www.domaine-sainteleocadie.com
295 | www.lejulesverne-paris.com
296 | www.lewistrondheim.com
297 | arnaudfrichphoto.com
298 | www.cdad-lot.justice.fr
299 | www.cdad-manche.justice.fr
300 | www.metiers.justice.gouv.fr
301 | affinitiz.net
302 | www.alerte-enlevement.gouv.fr
303 | www.ca-paris.justice.fr
304 | www.ciao.fr
305 | www.rip.justice.fr
306 | www.ca-besancon.justice.fr
307 | www.fontainedemars.com
308 | www.ca-bourges.justice.fr
309 | www.cdad-cotedor.justice.fr
310 | www.ca-aixenprovence.justice.fr
311 | www.holiprom.com
312 | www.western-valley.fr
313 | www.infoceane.com
314 | www.bateaux-mouches.fr
315 | www.justice.gouv.fr
316 | www.alaindelorme.com
317 | avocats.fr
318 | anissaledorze.avocats.fr
319 | www.vos-droits.justice.gouv.fr
320 | cmonatelier.cultura.com
321 | isabelle.chevalier-dupont.avocats.fr
322 | reseau.avf.asso.fr
323 | www.ca-amiens.justice.fr
324 | www.boutique-clubdsk.fr
325 | www.noube.fr
326 | www.ca-chambery.justice.fr
327 | www.eng.justice.fr
328 | www.ca-versailles.justice.fr
329 | servirlafrance.com
330 | www.animalnature.fr
331 | reseaulia.com
332 | selli-vine.avocats.fr
333 | kityuko.42stores.com
334 | couturejihanny.42stores.com
335 | www.ca-angers.justice.fr
336 | www.setzaomi.com
337 | www.editions-infini.fr
338 | www.lineab1.fr
339 | corinegaudilliere.avocats.fr
340 | planete-volontaires.fr
341 | blogs.jardiner-malin.fr
342 | loisicrea.com
343 | www.cevennescaravanes.com
344 | www.colorme.ch
345 | affinitiz.com
346 | parentsindignes.42stores.com
347 | www.suite23.fr
348 | www.1bijoux2perles.fr
349 | www.mecaservice.com
350 | www.ptfp.fr
351 | www.nosfell.com
352 | cheminsblancs.com
353 | cubexar.com
354 | www.jetaide.com
355 | forum-centres-d-appels.com
356 | www.avocatforum.com
357 | jetaide.com
358 | www.manzi.be
359 | www.cabasse.com
360 | candyshop.42stores.com
361 | kits-n-scrap.42stores.com
362 | www.lecoinplaisir.com
363 | www.swingromaneacademie.com
364 | www.limprimeur.net
365 | www.fert-demolition.com
366 | www.eguiazabal.com
367 | www.chacunsonchemin.com
368 | www.normanniae.com
369 | www.ot-saverne.fr
370 | www.poleressources95.org
371 | 720plan.ovh.net
372 | www.ba116.air.defense.gouv.fr
373 | www.ypluthier.com
374 | marina-erbarossa.com
375 | www.lamy-diffusion.com
376 | www.ba125.air.defense.gouv.fr
377 | www.leganet.fr
378 | constat-huissier.net
379 | information-juridique.com
380 | famillesdavant.linternaute.com
381 | msn.ciao.fr
382 | ecran-de-veille.linternaute.com
383 | www.forum-entreprise.com
384 | www.cgv-expert.fr
385 | webcam.linternaute.com
386 | programme-tv.linternaute.com
387 | www.guyane.pref.gouv.fr
388 | www.conseil-juridique.net
389 | www.action-collective.com
390 | polardiagram.com
391 | encyclopedie.linternaute.com
392 | www.legavox.fr
393 | site.journaldunet.com
394 | www.juristudiant.com
395 | emploi.journaldunet.com
396 | juristudiant.com
397 | arwatch.org
398 | formation.journaldunet.com
399 | www.inpharma2000.ru
400 | www.yvelines.pref.gouv.fr
401 | ms.ciao.fr
402 | www.veille-reputation.com
403 | www.finistere.pref.gouv.fr
404 | www.sarthe.pref.gouv.fr
405 | www.twitter-icon.com
406 | www.sarthe.gouv.fr
407 | photos.linternaute.com
408 | societe.journaldunet.com
409 | www.portail-mystique.fr
410 | www.moselle.pref.gouv.fr
411 | alavoileblanche.com
412 | piecemontee.com
413 | www.albifun.com
414 | www.urlidea.com
415 | www.guadeloupe.pref.gouv.fr
416 | dhammadana.fr
417 | www.sante-environnement.com
418 | www.escale-wellness.be
419 | www.markosweb.com
420 | www.aquitaine.pref.gouv.fr
421 | www.mc-franquevielle.fr
422 | www.domaine-de-marseillens.com
423 | www.ardeche.pref.gouv.fr
424 | www.lot.pref.gouv.fr
425 | www.charente.pref.gouv.fr
426 | www.indre-et-loire.pref.gouv.fr
427 | www.loiret.pref.gouv.fr
428 | www.motards-idf.fr
429 | www.indre.pref.gouv.fr
430 | www.mjdatabank.com
431 | www.zsysteme.com
432 | www.lemanoir39.com
433 | www.hotel-les-pyrenees.com
434 | www.droitsenfant.com
435 | annecybonlieuhotel.fr
436 | www.manche.pref.gouv.fr
437 | galeriedu7eme.com
438 | www.assoprairieland.com
439 | www.lexilogos.com
440 | www.preparation-physique.net
441 | www.theoutlaw.fr
442 | www.bill-looking.fr
443 | www.landes.pref.gouv.fr
444 | www.aigrehandball.fr
445 | www.iletaitunevoix.org
446 | www.jura.pref.gouv.fr
447 | www.jm-planchon.fr
448 | www.campingchadeyron.com
449 | www.fruirouge.fr
450 | www.campingcassis.com
451 | www.evretz.fr
452 | www.contespedagogiques.be
453 | www.lazare-et-vespucci.com
454 | www.randoleiesclops.fr
455 | www.braccomotos.com
456 | www.hugme.fr
457 | mondolatino.fr
458 | www.pkma.eu
459 | www.photos-allain-mousset.fr
460 | unamourdeuxperles.com
461 | www.vaccination-h1n1.moselle.pref.gouv.fr
462 | jardinvoyageur.com
463 | seine-saint-denis.gouv.fr
464 | www.auvergne.pref.gouv.fr
465 | mobile.hauts-de-seine.gouv.fr
466 | www.pfrh.lorraine.pref.gouv.fr
467 | www.srias.lorraine.pref.gouv.fr
468 | paysages.mayenne.pref.gouv.fr
469 | www.risquesmajeurs-hautes-pyrenees.pref.gouv.fr
470 | 208.76.50.76
471 | ddrm.mayotte.pref.gouv.fr
472 | lot-et-garonne.gouv.fr
473 | www.ppol-taxi.interieur.gouv.fr
474 | nasdaq.infospace.com
475 | www.prse.lorraine.gouv.fr
476 | www.haute-savoie.pref.gouv.fr
477 | www.cakechloes.com
478 | www.languedoc-roussillon.pref.gouv.fr
479 | aveyron.gouv.fr
480 | old.pyrenees-atlantiques.pref.gouv.fr
481 | www.finistere.gouv.fr
482 | www.seine-saint-denis.pref.gouv.fr
483 | www.lorraine.pref.gouv.fr
484 | www.charente-maritime.pref.gouv.fr
485 | www.finances.gouv.fr
486 | laboratoirecentral.interieur.gouv.fr
487 | sas.sante.gouv.fr
488 | yvelines.pref.gouv.fr
489 | www.recherche-biomedicale.sante.gouv.fr
490 | www.datar.gouv.fr
491 | www.lenotre.culture.gouv.fr
492 | www.sanglier5767.com
493 | portailmoselle.dims.fr
494 | baignades.sante.gouv.fr
495 | agriculture.gouv.fr
496 | www.moselle.gouv.fr
497 | voiceillusion.com
498 | ddaf.ain.pref.gouv.fr
499 | www.pref93.pref.gouv.fr
500 | www.srcae.lorraine.gouv.fr
501 | www.diplomatie.gouv.fr
502 | www.economie.gouv.fr
503 | www.developpement-durable.gouv.fr
504 | en.palmiers-ocean.fr
505 | www.sae-diffusion.sante.gouv.fr
506 | www.ddjs-haute-savoie.jeunesse-sports.gouv.fr
507 | www.ile-de-france.sante.gouv.fr
508 | www.coupesdecheveux2011.net
509 | www.eure.sit.gouv.fr
510 | archives.livreblancdefenseetsecurite.gouv.fr
511 | www.oncfs.gouv.fr
512 | www.yonne.sit.gouv.fr
513 | www.internet.gouv.fr
514 | www.impots.gouv.fr
515 | forum.webmaster-rank.info
516 | www2.impots.gouv.fr
517 | direccte.gouv.fr
518 | www.immigration.gouv.fr
519 | www.hcst.fr
520 | www.drees.sante.gouv.fr
521 |
--------------------------------------------------------------------------------
/processing/metadataextract.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | from subprocess import PIPE
3 | import threading
4 | from pymongo import MongoClient
5 | import simplejson
6 | import HTMLParser
7 |
8 |
9 | class metadataextract(threading.Thread):
10 |     """Extract the <meta> contents of a URL through a casperjs script and store them in MongoDB."""
11 |
12 |     def __init__(self, scriptjs, db, domaine, url):
13 |         threading.Thread.__init__(self)
14 |         self.result = []
15 |         self.domaine = domaine
16 |         self.scriptjs = scriptjs
17 |         self.url = url
18 |         self.connection = MongoClient(host='localhost', port=27017)
19 |         self.db = self.connection[db]
20 |
21 |     def run(self):
22 |         # the casperjs script prints the extracted metadata as a JSON array on stdout
23 |         result = subprocess.Popen(['casperjs', self.scriptjs, self.url], stdout=PIPE)
24 |         meta = ''
25 |         contents = []
26 |
27 |         for ligne in result.stdout:
28 |             meta = meta + ligne
29 |
30 |         try:
31 |             data = simplejson.loads(meta)
32 |             if len(data) > 0:
33 |                 for content in data:
34 |                     contents.append(content['content'])
35 |
36 |                 meta = ' '.join(contents)
37 |                 if len(meta) > 0:
38 |                     # unescape HTML entities before saving to the metadatas collection
39 |                     h = HTMLParser.HTMLParser()
40 |                     value_db = {'domaine': self.domaine, 'meta': h.unescape(meta)}
41 |                     self.db.metadatas.save(value_db)
42 |         except ValueError:
43 |             print 'Encoding error: ' + meta
--------------------------------------------------------------------------------
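The class above shells out to a casperjs script, reads the JSON array it prints on stdout and stores the unescaped meta contents in the metadatas collection. A minimal driver sketch, assuming harvesting/metaextract.js is the casperjs side; the database name and the URL below are illustrative placeholders, not values taken from the repository:

    # usage sketch -- database name and URL are placeholder assumptions
    from processing.metadataextract import metadataextract

    sites = [('www.justice.gouv.fr', 'http://www.justice.gouv.fr')]
    threads = [metadataextract('harvesting/metaextract.js', 'gouv', domaine, url)
               for domaine, url in sites]
    for t in threads:
        t.start()
    for t in threads:
        t.join()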
/processing/metadataextract.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/processing/metadataextract.pyc
--------------------------------------------------------------------------------
/scanners/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/scanners/__init__.py
--------------------------------------------------------------------------------
/scanners/networks.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on 14 mai 2014
3 |
4 | @author: slarinier
5 | '''
6 | from libnmap.parser import NmapParser
7 | from libnmap.process import NmapProcess
8 |
9 | class Networks(object):
10 |     '''
11 |     Run an nmap scan with libnmap, build a report and store it in MongoDB.
12 |     '''
13 |
14 |     def __init__(self, targets, options):
15 |         self.nmap = NmapProcess(targets, options)
16 |
17 |     def run(self):
18 |         self.nmap.run()
19 |
20 |     def make_report(self):
21 |         report = NmapParser.parse(self.nmap.stdout)
22 |         result = []
23 |         for host in report.hosts:
24 |             temp = {'ip': host.ipv4}
25 |             services = []
26 |             for service in host.services:
27 |                 # MongoDB forbids dots in key names: rename the script result keys
28 |                 for k in list(service.scripts_results):
29 |                     if '.' in k:
30 |                         v = service.scripts_results[k]
31 |                         del service.scripts_results[k]
32 |                         service.scripts_results[k.replace('.', '_')] = v
33 |                 services.append((service.state, service.port, service.scripts_results))
34 |             temp['services'] = services
35 |             result.append(temp)
36 |         return result
37 |
38 |     def record_report(self, records, cache, coll):
39 |         # attach each scan record to the cached document of its IP and save it
40 |         for r in records:
41 |             doc = cache[r['ip']]
42 |             doc['service'] = r
43 |             try:
44 |                 coll.save(doc)
45 |             except Exception:
46 |                 print doc
--------------------------------------------------------------------------------
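A usage sketch for the scanner wrapper, assuming MongoDB runs locally as in the rest of the project; the target range, nmap options, database and collection names are illustrative assumptions:

    # usage sketch -- targets, options, database and collection names are assumptions
    from pymongo import MongoClient
    from scanners.networks import Networks

    scan = Networks(['192.168.0.0/24'], '-sV')
    scan.run()
    records = scan.make_report()

    coll = MongoClient(host='localhost', port=27017)['osint'].scans
    cache = dict((r['ip'], {'ip': r['ip']}) for r in records)
    scan.record_report(records, cache, coll)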
/screenshots/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/screenshots/__init__.py
--------------------------------------------------------------------------------
/screenshots/make_screenshots.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # usage: make_screenshots.py <domain list file> <casperjs script> <output dir> <pool size>
3 | import screenshots
4 | import sys
5 | import threading
6 |
7 | file_list_websites = sys.argv[1]
8 | jsfile = sys.argv[2]
9 | emplacement = sys.argv[3]
10 | threadpool = sys.argv[4]
11 | domaines = []
12 | main_thread = threading.currentThread()
13 | with open(file_list_websites, 'r') as fr:
14 |     for ligne in fr:
15 |         domaines.append(ligne.strip())
16 |
17 | i = 0
18 | for domaine in domaines:
19 |     i += 1
20 |     screen = screenshots.Screenshots(domaines, jsfile, emplacement, domaine)
21 |     screen.start()
22 |     # wait for the current batch of threads every <pool size> domains
23 |     if i % int(threadpool) == 0:
24 |         for t in threading.enumerate():
25 |             if t is not main_thread:
26 |                 t.join()
--------------------------------------------------------------------------------
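The driver above takes a domain list, a casperjs script, an output directory and a pool size, and joins the worker threads once every pool-size iterations. The same batching pattern in isolation, with a hypothetical Worker thread standing in for Screenshots:

    # pattern sketch -- Worker and the item list are hypothetical stand-ins
    import threading

    class Worker(threading.Thread):
        def __init__(self, item):
            threading.Thread.__init__(self)
            self.item = item

        def run(self):
            print(self.item)

    main_thread = threading.currentThread()
    pool_size = 2
    for i, item in enumerate(['a', 'b', 'c', 'd'], start=1):
        Worker(item).start()
        # join the whole batch before launching the next one
        if i % pool_size == 0:
            for t in threading.enumerate():
                if t is not main_thread:
                    t.join()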
/screenshots/screenshots.js:
--------------------------------------------------------------------------------
1 | // casperjs screenshots.js <name> <url> <output dir>
2 | var casper = require('casper').create();
3 | var terms = casper.cli.get(0),
4 |     url = casper.cli.get(1),
5 |     emplacement = casper.cli.get(2);
6 |
7 | casper.start(url, function() {
8 |     // capture a 1024x768 clip of the page into <output dir>/<name>.png
9 |     this.capture(emplacement + '/' + terms + '.png', {
10 |         top: 10,
11 |         left: 10,
12 |         width: 1024,
13 |         height: 768
14 |     });
15 | });
16 |
17 | casper.run();
--------------------------------------------------------------------------------
/screenshots/screenshots.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Apr 30 12:24:14 2012
4 |
5 | @author: slarinier
6 | """
7 | import subprocess
8 | from subprocess import PIPE
9 | import threading
10 | import time
11 |
12 |
13 | class Screenshots(threading.Thread):
14 |     """Run the casperjs screenshot script against one website in its own thread."""
15 |
16 |     def __init__(self, listofwebsites, jsfile, location, website):
17 |         self.listofwebsites = listofwebsites
18 |         self.jsfile = jsfile
19 |         self.location = location
20 |         self.website = website
21 |         threading.Thread.__init__(self)
22 |
23 |     def run(self):
24 |         # casperjs <script> <name> <url> <output dir> --web-security=no
25 |         args = ['casperjs', self.jsfile, self.website,
26 |                 'http://' + self.website, self.location, '--web-security=no']
27 |         subprocess.Popen(args, stdout=PIPE)
28 |         print "Make screenshot: " + self.website
29 |         time.sleep(3)
--------------------------------------------------------------------------------
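Screenshots is a one-shot thread around the casperjs command shown in its run method; a minimal standalone use, where the script path, output directory and domain are illustrative assumptions:

    # usage sketch -- script path, output directory and domain are assumptions
    from screenshots.screenshots import Screenshots

    shot = Screenshots(['www.justice.gouv.fr'], 'screenshots/screenshots.js',
                       '/tmp/screens', 'www.justice.gouv.fr')
    shot.start()
    shot.join()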
/storage/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/storage/redis_record.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Apr 24 15:30:33 2013
4 |
5 | @author: slarinier
6 | """
7 | import redis
8 |
9 |
10 | class RedisRecord(object):
11 |     """Thin wrapper around redis-py: key/value access, lists and database switching."""
12 |
13 |     def __init__(self, host='localhost', port=6379, db=1):
14 |         pool = redis.ConnectionPool(host=host, port=port, db=db)
15 |         self.r = redis.Redis(connection_pool=pool)
16 |         self.processus_tab = []
17 |
18 |     def delete(self, key):
19 |         self.r.delete(key)
20 |
21 |     def get(self, key):
22 |         return self.r.get(key)
23 |
24 |     def put(self, key, value):
25 |         self.r.set(key, value)
26 |
27 |     def init(self, dbs):
28 |         # empty every database listed in dbs
29 |         for i in dbs:
30 |             self.flushdb(i)
31 |
32 |     def flushdb(self, db_value):
33 |         self.switchDB(db_value)
34 |         self.r.flushdb()
35 |
36 |     def rpush(self, listvalue, item):
37 |         self.r.rpush(listvalue, item)
38 |
39 |     def rpop(self, listvalue):
40 |         return self.r.rpop(listvalue)
41 |
42 |     def switchDB(self, db, host='localhost', port=6379):
43 |         # point the client at another redis database by rebuilding the connection pool
44 |         pool = redis.ConnectionPool(host=host, port=port, db=db)
45 |         self.r = redis.Redis(connection_pool=pool)
46 |
47 |     def currentDB(self):
48 |         return self.r.connection_pool.get_connection(1).db
--------------------------------------------------------------------------------
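RedisRecord wraps one redis-py client per database and is used as a simple key/value and queue store. A short usage sketch; the key and list names are chosen here only for illustration:

    # usage sketch -- key and list names are placeholder assumptions
    from storage.redis_record import RedisRecord

    r = RedisRecord(host='localhost', port=6379, db=1)
    r.put('last_run', '2014-05-14')
    print(r.get('last_run'))

    r.rpush('domains', 'www.justice.gouv.fr')
    print(r.rpop('domains'))

    r.switchDB(2)
    print(r.currentDB())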