├── .gitignore ├── README.md ├── __init__.py ├── actions.py ├── geoloc_by_domain.py ├── geolocatisation ├── GeoLiteCity.dat ├── __init__.py ├── dschield.py ├── geolocalisation.py └── result.txt ├── harvesting ├── __init__.py ├── __init__.pyc ├── bingsearch.js ├── content.py ├── content_search.py ├── crawler.py ├── dynamic.js ├── filters.py ├── googlesearch.js ├── keywords ├── metaextract.js ├── pastebin.js ├── pastebin.py ├── pastebinExtract.py ├── pastebintest.py ├── pastebintext.js ├── pholcidae.py ├── random_user_agent.py ├── search.py ├── user_agents ├── white_list.py └── yahoosearch.js ├── history ├── __init__.py └── history.py ├── main.py ├── mongodb ├── __init__.py ├── __init__.pyc ├── mongodb.py └── mongodb.pyc ├── network ├── IPy.py ├── __init__.py ├── __init__.pyc ├── make_networks.py ├── networks.py ├── networks.pyc └── search_on_network.py ├── processing ├── __init__.py ├── __init__.pyc ├── bulk.py ├── categoryze_result.py ├── clean_db.py ├── compare.py ├── create_request.py ├── create_result.py ├── createcorpus.py ├── dnstree.py ├── filters.py ├── filters.pyc ├── gouv.log ├── gouv_domaine.txt ├── gouv_metadatas.txt ├── metadataextract.py └── metadataextract.pyc ├── scanners ├── __init__.py └── networks.py ├── screenshots ├── __init__.py ├── make_screenshots.py ├── screenshots.js └── screenshots.py └── storage ├── __init__.py └── redis_record.py /.gitignore: -------------------------------------------------------------------------------- 1 | #compiled file 2 | *.pyc 3 | #log 4 | *.log 5 | #screen 6 | *.png 7 | #data 8 | *.csv 9 | .project 10 | .settings/org.eclipse.ltk.core.refactoring.prefs 11 | .pydevproject 12 | *.txt 13 | *.tar.gz 14 | *.zip 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/README.md -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/__init__.py -------------------------------------------------------------------------------- /actions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 1, 2013 3 | 4 | @author: slarinier 5 | ''' 6 | 7 | from libnmap.parser import NmapParser 8 | from libnmap.process import NmapProcess 9 | import pymongo 10 | from pymongo import MongoClient 11 | import threading 12 | 13 | from harvesting import search 14 | from harvesting.crawler import Record, CrawlerThread 15 | import mongodb 16 | from network import make_networks, networks 17 | from network.IPy import IP 18 | from processing import metadataextract 19 | from processing.clean_db import Cleandb 20 | from processing.create_result import Create_Result 21 | from processing.dnstree import DNSTree 22 | from screenshots.screenshots import Screenshots 23 | from scanners.networks import Networks 24 | 25 | class Actions(object): 26 | ''' 27 | classdocs 28 | ''' 29 | def __init__(self, db_value): 30 | self.db_value = db_value 31 | connection = MongoClient(host='localhost', port=27017) 32 | self.db = connection[db_value] 33 | 34 | def create_network(self): 35 | network=make_networks.make_networks('localhost', self.db_value) 36 | network.createNetworks('new_domaines') 37 | 
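        # createNetworks() presumably walks the 'new_domaines' collection and groups
        # the resolved IPs into network ranges (see network/make_networks.py);
        # exportFile() below then dumps those ranges to '<db_value>_network.log'.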
network.exportFile(self.db_value+'_network.log') 38 | 39 | def create_result(self,collection,criteria): 40 | createResult=Create_Result(self.db_value,criteria) 41 | if collection=='scanners': 42 | createResult.processScanners(collection) 43 | return 44 | createResult.process(collection) 45 | 46 | def metasearch(self,criteria,scriptsJS,geoloc): 47 | print "########### Meta Search ###########" 48 | main_thread = threading.currentThread() 49 | thread_pool=[] 50 | for criterius in criteria: 51 | for script in scriptsJS: 52 | gs=search.search(100,criterius,script,self.db_value) 53 | gs.start() 54 | thread_pool.append(gs) 55 | for t in thread_pool: 56 | t.join() 57 | for t in thread_pool: 58 | t.record() 59 | print "########### Search terminated ###########" 60 | 61 | print "########### Resolve IP ############" 62 | networks.resolve(geoloc,self.db_value) 63 | 64 | def search_ip(self,geoloc,scriptsJS,ip_range): 65 | main_thread = threading.currentThread() 66 | print "########### Search by IP ###########" 67 | ips=[] 68 | domaines=self.db.new_domaines.find() 69 | thread_pool=[] 70 | cache={} 71 | for domaine in domaines: 72 | try: 73 | ips.append(domaine['ip']) 74 | 75 | except KeyError: 76 | print domaine 77 | i=0 78 | print 'les IPS sont: '+ str(ips) 79 | ip_to_add=[] 80 | if ip_range: 81 | ip_to_add=[str(x) for x in IP(ip_range)] 82 | ips[len(ips):]=ip_to_add 83 | for ip in set(ips): 84 | if ip != '0.0.0.0': 85 | i+=1 86 | gs=search.search(20,'ip:'+str(ip),scriptsJS[1],self.db_value) 87 | gs.start() 88 | thread_pool.append(gs) 89 | if i % 10 ==0: 90 | for t in thread_pool: 91 | t.join() 92 | for t in thread_pool: 93 | t.record() 94 | print "########### Search terminated ###########" 95 | print "########### Search by network ###########" 96 | 97 | print "########### Resolve IP ############" 98 | networks.resolve(geoloc,self.db_value) 99 | 100 | def scan_network(self): 101 | pass 102 | def scan_nmap(self,ip_range,options): 103 | ips=[] 104 | domaines=self.db.new_domaines.find() 105 | thread_pool=[] 106 | cache={} 107 | for domaine in domaines: 108 | try: 109 | ips.append(domaine['ip']) 110 | cache[domaine['ip']]=domaine 111 | except KeyError: 112 | print domaine 113 | net=Networks(list(set(ips)),options) 114 | net.run() 115 | report=net.make_report() 116 | #net.record_report(report,cache,self.db.new_domaines) 117 | pass 118 | def screenshots(self,db_value,threadpool): 119 | connection= MongoClient(host='localhost', port=27017) 120 | db=connection[db_value] 121 | domaines=db.new_domaines.distinct('domaine') 122 | i=0 123 | main_thread = threading.currentThread() 124 | threadpools=[] 125 | print "print "+ str(len(domaines))+ " screenshots" 126 | for domaine in domaines: 127 | i+=1 128 | screen=Screenshots(domaines, 'screenshots/screenshots.js', 'screenshots/screenshots/'+db_value, domaine) 129 | screen.start() 130 | threadpools.append(screen) 131 | if i % int(threadpool)== 0: 132 | for t in threadpools: 133 | t.join() 134 | 135 | def metadata_exctract(self,db): 136 | main_thread = threading.currentThread() 137 | print "########## Meta Data IP ##########" 138 | mdb=mongodb.mongodb('localhost',27017,db) 139 | i=0 140 | 141 | for domaine in mdb.selectall('new_domaines'): 142 | i+=1 143 | url=domaine['url'] 144 | domaine_value=domaine['domaine'] 145 | print url 146 | if not 'meta' in domaine: 147 | domaine['meta']='ok' 148 | mtd=metadataextract.metadataextract('harvesting/metaextract.js',db,domaine_value,url) 149 | mtd.start() 150 | if i % 30==0: 151 | for t in threading.enumerate(): 152 | if t is not 
main_thread: 153 | t.join(2) 154 | 155 | def dnstree(self,db_value): 156 | dnst=DNSTree(db_value) 157 | dnst.process() 158 | 159 | def crawl(self,list_domains): 160 | main_thread = threading.currentThread() 161 | #domaines=self.db.new_domaines.distinct('domaine') 162 | domains=list_domains.split(',') 163 | threadpool=[] 164 | lock=threading.Lock() 165 | rec=Record(self.db_value,lock) 166 | rec.start() 167 | i=0 168 | for domain in domains: 169 | i=i+1 170 | cw=CrawlerThread(domain,self.db,lock) 171 | cw.run() 172 | 173 | if i % 5==0: 174 | for t in threading.enumerate(): 175 | if t is not main_thread: 176 | t.join(2) 177 | stop=True 178 | 179 | while(stop): 180 | for t in threadpool: 181 | if not t.IsActive(): 182 | threadpool.remove(t) 183 | if len(threadpool)==0: 184 | stop=False 185 | 186 | def clean_db(self,pathfilters): 187 | print "#####Clean DB####" 188 | directory = "screenshots/screenshots/"+self.db_value 189 | filters=[] 190 | with open(pathfilters,'r') as fw: 191 | for ligne in fw: 192 | filters.append(ligne.strip()) 193 | cl=Cleandb(self.db_value, directory, filters) 194 | cl.clean() 195 | 196 | def reset(self): 197 | 198 | for domaine in self.db.new_domaines.find(): 199 | domaine['meta']=None 200 | self.db.update(domaine,'new_domaines') 201 | 202 | def init(self,db,coll,attrib): 203 | 204 | self.db.create_collection(coll) 205 | self.db[coll].ensure_index([(attrib,pymongo.ASCENDING)],unique=True) 206 | -------------------------------------------------------------------------------- /geoloc_by_domain.py: -------------------------------------------------------------------------------- 1 | from network import networks 2 | import argparse 3 | import sys 4 | from geolocatisation import dschield 5 | 6 | parser = argparse.ArgumentParser(description='Geolocalisation by domains') 7 | parser.add_argument('--domaine', dest='fqdn',help='make a fqdn for geolocalisation') 8 | parser.add_argument('--filename',dest='list_domaine') 9 | parser.add_argument('--geoloc_file',dest='geoloc_file') 10 | parser.add_argument('--resolve_dns',dest='resolve_dns') 11 | parser.add_argument('--geoloc_country',dest='geoloc_country') 12 | parser.add_argument('--outfile',dest='outfile') 13 | 14 | args=parser.parse_args() 15 | domaines=[] 16 | geoloc=[] 17 | geoloc_country=False 18 | geoloc_file=False 19 | if args.fqdn != None: 20 | domaines=[args.fqdn] 21 | if args.list_domaine != None: 22 | print "Read Domaine List" 23 | with open(args.list_domaine,'r') as fr: 24 | for ligne in fr: 25 | domaines.append(ligne.strip()) 26 | if args.geoloc_file != None: 27 | print "Geolocalisation Load" 28 | geoloc_file=True 29 | if args.geoloc_file == None: 30 | parser.print_help() 31 | sys.exit(-1) 32 | print "geoloc" 33 | 34 | if args.geoloc_country: 35 | print "Geolocalisation country ok" 36 | geoloc_country=True 37 | domaines=list(set(domaines)) 38 | print "Domaines list: "+str(len(domaines)) 39 | for domaine in domaines: 40 | ip='0.0.0.0' 41 | ip=networks.resolve_dns(domaine) 42 | if ip != None: 43 | temp=ip+','+domaine 44 | if geoloc_file == True: 45 | geo=networks.geolocIP(args.geoloc_file,ip) 46 | country=networks.geolocCountry(args.geoloc_file,ip) 47 | if country: 48 | temp=temp+','+country 49 | if geo: 50 | temp=temp+','+geo 51 | if geoloc_country ==True: 52 | ds=dschield.dschield('http://dshield.org/ipinfo_ascii.html?ip=') 53 | ip,country,asname,network=ds.response(ip) 54 | temp=temp+','+country 55 | print temp 56 | geoloc.append(temp) 57 | else: 58 | geoloc.append('DNS Failure: '+domaine) 59 | if args.outfile != None: 
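    # Each entry collected in 'geoloc' above is either a comma-separated record
    # starting with 'ip,domain' (optionally followed by country and/or coordinates)
    # or a 'DNS Failure: <domain>' marker; they are written out one per line below.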
60 | with open(args.outfile,'w') as fw: 61 | for ligne in geoloc: 62 | fw.write(ligne+'\n') 63 | 64 | 65 | -------------------------------------------------------------------------------- /geolocatisation/GeoLiteCity.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/geolocatisation/GeoLiteCity.dat -------------------------------------------------------------------------------- /geolocatisation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/geolocatisation/__init__.py -------------------------------------------------------------------------------- /geolocatisation/dschield.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import re 3 | class dschield(object): 4 | 5 | def __init__(self,url): 6 | self.url=url 7 | 8 | def response(self,ip): 9 | dschieldContent=urllib2.urlopen(self.url+ip) 10 | value=dschieldContent.read() 11 | patern='country= (\w+)' 12 | 13 | reg =re.compile(patern) 14 | m = reg.search(value) 15 | country='' 16 | if m: 17 | country=m.group(1) 18 | patern='asname= (.+)' 19 | reg =re.compile(patern) 20 | m = reg.search(value) 21 | asname='' 22 | if m: 23 | asname=m.group(1) 24 | patern='network= (.+)' 25 | reg =re.compile(patern) 26 | m = reg.search(value) 27 | network='' 28 | if m: 29 | network=m.group(1) 30 | network=network.split(' ')[0] 31 | if country != '' and asname !='' and network !='': 32 | return (ip,country,asname,network) 33 | return ('127.0.0.1','mars','alien','nothing') 34 | -------------------------------------------------------------------------------- /geolocatisation/geolocalisation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 20, 2013 3 | 4 | @author: slarinier 5 | ''' 6 | from pymongo.connection import Connection 7 | 8 | class Geolocalisation(object): 9 | ''' 10 | classdocs 11 | ''' 12 | 13 | 14 | def __init__(self,list_domaine,db_value): 15 | ''' 16 | Constructor 17 | ''' 18 | 19 | def geolochoffline(self): 20 | 21 | 22 | def geolocOnline(self): 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /geolocatisation/result.txt: -------------------------------------------------------------------------------- 1 | 31.184.244.9,onlinetracksz.net,24.0_54.0,RU 2 | 31.184.244.9,httpsites.org,24.0_54.0,RU 3 | 31.184.244.9,onlinegreencm.org,24.0_54.0,RU 4 | 31.184.244.9,onlinegiigii.com,24.0_54.0,RU 5 | 31.184.244.9,onlinefishmw3bid.net,24.0_54.0,RU 6 | 31.184.244.9,onlineliverss.org,24.0_54.0,RU 7 | 31.184.244.9,onlinemooviii.com,24.0_54.0,RU 8 | 31.184.244.9,onlinegiigii.net,24.0_54.0,RU 9 | 62.109.12.39,62.109.12.39,55.7522_37.6156,RU 10 | 31.184.244.9,httpsites.net,24.0_54.0,RU 11 | DNS Failure: sauth-yandex.ru 12 | 31.184.244.9,onlinepainrs.com,24.0_54.0,RU 13 | 31.184.244.9,onlinegreenguide.com,24.0_54.0,RU 14 | 31.184.244.9,onlinepainrs.net,24.0_54.0,RU 15 | 31.184.244.9,onlineliververs.net,24.0_54.0,RU 16 | 31.184.244.9,online-moo-viii.net,24.0_54.0,RU 17 | 31.184.244.9,onlinemaris.com,24.0_54.0,RU 18 | 31.184.244.9,onlinegreenguide.net,24.0_54.0,RU 19 | 31.184.244.9,httpblogs.com,24.0_54.0,RU 20 | 31.184.244.9,onlinecodmw3buy.net,24.0_54.0,RU 21 | 31.184.244.9,onlinemaris.net,24.0_54.0,RU 22 | 
31.184.244.9,onlinemooviii.net,24.0_54.0,RU 23 | 173.45.252.44,oase2.net,38.6446_-90.2533,US 24 | 92.63.106.133,www.money-yanbex.ru,60.0_100.0,RU 25 | 31.184.244.9,31.184.244.9,24.0_54.0,RU 26 | 31.184.244.219,onlinemoneysstock.org,24.0_54.0,RU 27 | 31.184.244.219,onlinefundsgoods.org,24.0_54.0,RU 28 | 31.184.244.219,livemoneysgoods.org,24.0_54.0,RU 29 | 31.184.244.219,onlineincomegoods.org,24.0_54.0,RU 30 | DNS Failure: newdomeninfo.info 31 | 31.184.244.9,onlineliververs.com,24.0_54.0,RU 32 | 31.184.244.9,onlineliverss.com,24.0_54.0,RU 33 | 31.184.244.9,onlineliverss.net,24.0_54.0,RU 34 | DNS Failure: onlinecashsstt.org 35 | DNS Failure: internetmoneysstt.org 36 | 69.43.161.151,moneyinternetlovesff.info,-27.0_133.0,US 37 | 31.184.244.219,livewindowsxpf4.info,24.0_54.0,RU 38 | 31.184.244.219,onlinewinsphonessite.org,24.0_54.0,RU 39 | 141.8.224.162,webstockcwo.info,47.0_8.0,CH 40 | DNS Failure: internetwindowslive.info 41 | 31.184.244.219,theonlinewinsphones.org,24.0_54.0,RU 42 | 31.184.244.219,webwindowsproc.info,24.0_54.0,RU 43 | 31.184.244.219,internetwindowslows.com,24.0_54.0,RU 44 | DNS Failure: moneydigitallovesff.info 45 | 31.184.244.219,internet-wins-phones.org,24.0_54.0,RU 46 | 31.184.244.219,livewindowsproc.info,24.0_54.0,RU 47 | 31.184.244.219,onlinewindowsxpf4site.info,24.0_54.0,RU 48 | 31.184.244.219,webwindowslows.com,24.0_54.0,RU 49 | 31.184.244.219,webbuildingstore.info,24.0_54.0,RU 50 | DNS Failure: livemoneysstt.org 51 | DNS Failure: moneylivelovesff.info 52 | 69.43.161.161,stockonlinelovesff.info,-27.0_133.0,US 53 | 69.43.161.156,moneyweblovesff.info,-27.0_133.0,US 54 | 31.184.244.219,digitalwindowsproc.info,24.0_54.0,RU 55 | DNS Failure: cashonlinelovesff.info 56 | 31.184.244.219,onlinemoneyssuv.info,24.0_54.0,RU 57 | 31.184.244.219,onlinemicrosoftproc.info,24.0_54.0,RU 58 | 31.184.244.219,onlinewindowsxpf4s.info,24.0_54.0,RU 59 | 69.43.161.161,dollaronlinelovesff.info,-27.0_133.0,US 60 | 31.184.244.219,digitalwinsphones.org,24.0_54.0,RU 61 | 62.109.23.82,l2-pantheon.ru,59.8944_30.2642,RU 62 | 31.184.244.219,onlinefinanses2f.info,24.0_54.0,RU 63 | 141.8.224.162,internetstockcwo.info,47.0_8.0,CH 64 | -------------------------------------------------------------------------------- /harvesting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/harvesting/__init__.py -------------------------------------------------------------------------------- /harvesting/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/harvesting/__init__.pyc -------------------------------------------------------------------------------- /harvesting/bingsearch.js: -------------------------------------------------------------------------------- 1 | var links = []; 2 | var casper = require('casper').create(); 3 | var padding=casper.cli.get(0) 4 | var criteria=casper.cli.get(1) 5 | function getLinks() { 6 | 7 | var links = document.querySelectorAll('h2 a') 8 | return Array.prototype.map.call(links, function(e) { 9 | return e.getAttribute('href') 10 | }); 11 | } 12 | 13 | 14 | casper.start(); 15 | 16 | casper.open('http://www.bing.com/search?q='+criteria+'&go=&qs=ds&filt=all&first='+padding+'&FORM=PERE') 17 | casper.then(function() { 18 | // aggregate results for the 'casperjs' search 19 | 20 | 21 | links = this.evaluate(getLinks); 22 | 23 | 
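    // getLinks() runs inside the page via this.evaluate() and returns the href of
    // every 'h2 a' result anchor on the Bing results page; the array is copied back
    // into the outer 'links' variable and printed in casper.run() below.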
// now search for 'phantomjs' by filling the form again 24 | }); 25 | 26 | 27 | 28 | casper.run(function() { 29 | // echo results in some pretty fashion 30 | this.echo(links.length + ' links found:'); 31 | this.echo(' - ' + links.join('\n - ')).exit(); 32 | }); 33 | -------------------------------------------------------------------------------- /harvesting/content.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 1, 2012 3 | 4 | @author: slarinier 5 | ''' 6 | import re 7 | from content_search import Content_search 8 | 9 | class Content(object): 10 | ''' 11 | classdocs 12 | ''' 13 | _instance = None 14 | def __new__(cls, *args, **kwargs): 15 | if not cls._instance: 16 | cls._instance = super(Content, cls).__new__(cls, *args, **kwargs) 17 | return cls._instance 18 | 19 | def __init__(self,filetoload='keywords'): 20 | ''' 21 | Constructor 22 | ''' 23 | self.filetoload=filetoload 24 | self.keywords=[] 25 | with open(self.filetoload,'r') as fr: 26 | for ligne in fr: 27 | self.keywords.append(ligne.strip()) 28 | 29 | def analyse(self,ligne): 30 | if ligne.find('&') != -1: 31 | return 'keywords_and' 32 | else : 33 | return 'keyword_only' 34 | 35 | def search(self,keyword,data): 36 | action=self.analyse(keyword) 37 | cs = Content_search(action,data) 38 | find=getattr(cs, action)(keyword) 39 | return find 40 | 41 | 42 | -------------------------------------------------------------------------------- /harvesting/content_search.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 2, 2012 3 | 4 | @author: slarinier 5 | ''' 6 | import re 7 | class Content_search(object): 8 | ''' 9 | classdocs 10 | ''' 11 | 12 | 13 | def __init__(self,action,data): 14 | ''' 15 | Constructor 16 | ''' 17 | self.action=action 18 | self.data=data 19 | 20 | def keyword_only(self,keyword): 21 | tokens=re.findall(keyword, self.data) 22 | if len(tokens) > 0: 23 | return True 24 | return False 25 | 26 | def keywords_and(self,keywords): 27 | keywords=keywords.split('&') 28 | 29 | for keyword in keywords: 30 | if self.keyword_only(keyword) == False: 31 | return False 32 | return True 33 | 34 | 35 | -------------------------------------------------------------------------------- /harvesting/crawler.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jan 7, 2013 3 | 4 | @author: slarinier 5 | ''' 6 | from selenium import webdriver 7 | from pymongo import MongoClient 8 | from threading import Thread 9 | import redis 10 | import threading 11 | from pyfaup.faup import Faup 12 | import time 13 | from storage.redis_record import RedisRecord 14 | from filters import Filters 15 | from urllib2 import URLError 16 | from collections import deque 17 | 18 | 19 | class CrawlerThread(threading.Thread): 20 | def __init__(self, domain, db_value, lock): 21 | threading.Thread.__init__(self) 22 | self.domain = domain 23 | self.lock = lock 24 | 25 | def run(self): 26 | cw = Crawler(webdriver.Firefox(), self.lock, "http://" + self.domain) 27 | cw.init() 28 | cw.navigation() 29 | 30 | 31 | class Record(threading.Thread): 32 | def __init__(self, db_value, lock): 33 | self.r = RedisRecord() 34 | self.connection = MongoClient(host='localhost', port=27017, db=db_value) 35 | self.db = self.connection[db_value] 36 | 37 | threading.Thread.__init__(self) 38 | self.lock = lock 39 | 40 | def run(self): 41 | i = 0 42 | while (True): 43 | i = i + 1 44 | if i % 1000 == 0: 45 | time.sleep(10) 
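            # Pop one URL from the shared 'crawl' Redis list (DB 1) under the lock,
            # then upsert its domain into the 'new_domaines' Mongo collection below.
            # Note: when find_one() returns None the new document is saved, but the
            # code still falls through to entry['urls'] and raises a TypeError that
            # is swallowed by the bare except; a 'continue' after the save is
            # probably what is intended here.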
46 | self.lock.acquire() 47 | self.r.switchDB(1) 48 | url = self.r.rpop('crawl') 49 | self.lock.release() 50 | # print url 51 | fex = Faup() 52 | if url: 53 | print "url found: " + url 54 | try: 55 | fex.decode(url) 56 | domain = fex.get_host() 57 | entry = self.db.new_domaines.find_one({'domaine': domain}) 58 | if entry == None: 59 | print "record: " + domain 60 | self.db.new_domaines.save({'domaine': domain, 'urls': [url]}) 61 | 62 | urls_stored = entry['urls'] 63 | if not url in urls_stored: 64 | urls_stored.append(url) 65 | entry['urls'] = urls_stored 66 | self.db.new_domaines.save(entry) 67 | except: 68 | print "parsing fault " + url 69 | 70 | 71 | class Crawler(object): 72 | def __init__(self, driver, lock, first_url, db_int=1): 73 | self.driver = driver 74 | self.driver.implicitly_wait(10) 75 | self.driver.set_page_load_timeout(30) 76 | self.r = RedisRecord() 77 | self.lock = lock 78 | self.queue = deque([]) 79 | self.queue.append(first_url) 80 | self.dbs = [1, 2] 81 | 82 | def init(self): 83 | self.r.init(self.dbs) 84 | url = self.queue.popleft() 85 | self.driver.get(url) 86 | self.parser(url) 87 | 88 | def parser(self, url): 89 | self.r.switchDB(1) 90 | if not self.r.get(url): 91 | self.driver.get(url) 92 | elem_links = self.driver.find_elements_by_tag_name('a') 93 | self.lock.acquire() 94 | self.sort([link.get_attribute("href") for link in elem_links], url) 95 | self.lock.release() 96 | self.r.switchDB(1) 97 | self.r.put(url, url) 98 | 99 | def navigation(self): 100 | 101 | while (len(self.queue) > 0): 102 | url = self.queue.popleft() 103 | try: 104 | # self.driver.refresh() 105 | self.r.switchDB(1) 106 | self.parser(url) 107 | 108 | except URLError as e: 109 | print url 110 | except IOError as e: 111 | self.r.switchDB(2) 112 | print "I/O error({0}): {1}".format(e.errno, e.strerror) 113 | # self.r.put(new_url,new_url) 114 | self.r.switchDB(1) 115 | except e: 116 | continue 117 | try: 118 | self.driver.quit() 119 | print "Fin du crawling du site " + url 120 | except URLError as e: 121 | self.driver = getattr(webdriver, 'Firefox')() 122 | print 'boum' 123 | self.lock.acquire() 124 | self.r.switchDB(1) 125 | self.r.put(url, url) 126 | self.lock.release() 127 | 128 | def sort(self, elem_links, url): 129 | fex = Faup() 130 | f = Filters() 131 | f.load() 132 | self.r.switchDB(1) 133 | extend = True 134 | domainfilter = True 135 | schemefilter = True 136 | try: 137 | for link in elem_links: 138 | new_url = link 139 | self.r.switchDB(2) 140 | if not self.r.get(new_url) and new_url: 141 | self.r.switchDB(1) 142 | if not self.r.get(new_url): 143 | fex.decode(new_url) 144 | domain = fex.get_host() 145 | if f.isfilteredscheme(fex.get_scheme()): 146 | self.r.switchDB(2) 147 | self.r.put(new_url, new_url) 148 | schemefilter = False 149 | if f.isfiltereddomains(domain): 150 | self.r.switchDB(2) 151 | self.r.put(new_url, new_url) 152 | domainfilter = False 153 | if f.isfilteredextention(fex.get_resource_path()): 154 | extend = False 155 | self.r.switchDB(2) 156 | self.r.put(new_url, new_url) 157 | 158 | if extend and domainfilter and schemefilter: 159 | self.r.switchDB(1) 160 | self.r.rpush('crawl', new_url) 161 | self.queue.append(new_url) 162 | except TypeError as e: 163 | print "TypeError" 164 | -------------------------------------------------------------------------------- /harvesting/dynamic.js: -------------------------------------------------------------------------------- 1 | var casper = require("casper").create({ 2 | verbose: true 3 | }); 4 | url = casper.cli.get(0) 5 | // The base 
links array 6 | var links = [ 7 | url 8 | ]; 9 | 10 | // If we don't set a limit, it could go on forever 11 | var upTo = ~~casper.cli.get(0) || 10; 12 | 13 | var currentLink = 0; 14 | 15 | // Get the links, and add them to the links array 16 | // (It could be done all in one step, but it is intentionally splitted) 17 | function addLinks(link) { 18 | this.then(function() { 19 | var found = this.evaluate(searchLinks); 20 | this.echo(found.length + " links found on " + link); 21 | links = links.concat(found); 22 | }); 23 | } 24 | 25 | // Fetch all elements from the page and return 26 | // the ones which contains a href starting with 'http://' 27 | function searchLinks() { 28 | var filter, map; 29 | filter = Array.prototype.filter; 30 | map = Array.prototype.map; 31 | return map.call(filter.call(document.querySelectorAll("a"), function(a) { 32 | return (/^http:\/\/.*/i).test(a.getAttribute("href")); 33 | }), function(a) { 34 | return a.getAttribute("href"); 35 | }); 36 | } 37 | 38 | // Just opens the page and prints the title 39 | function start(link) { 40 | this.start(link, function() { 41 | this.echo('Page title: ' + this.getTitle()); 42 | }); 43 | } 44 | 45 | // As long as it has a next link, and is under the maximum limit, will keep running 46 | function check() { 47 | if (links[currentLink] && currentLink < upTo) { 48 | this.echo('--- Link ' + currentLink + ' ---'); 49 | start.call(this, links[currentLink]); 50 | addLinks.call(this, links[currentLink]); 51 | currentLink++; 52 | this.run(check); 53 | } else { 54 | this.echo("All done."); 55 | this.exit(); 56 | } 57 | } 58 | 59 | casper.start().then(function() { 60 | this.echo("Starting"); 61 | }); 62 | 63 | casper.run(check); 64 | -------------------------------------------------------------------------------- /harvesting/filters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jun 12 17:40:53 2013 4 | 5 | @author: slarinier 6 | """ 7 | class Filters(object): 8 | def __init__(self,pathextention='harvesting/filtered_extensions',pathscheme='harvesting/filtered_schemes',pathdomain='harvesting/filtered_domains'): 9 | self.pathdomain=pathdomain 10 | self.pathscheme=pathscheme 11 | self.pathextentions=pathextention 12 | self.domains=[] 13 | self.schemes=[] 14 | self.extentions=[] 15 | def load(self): 16 | with open(self.pathdomain,"r") as fr: 17 | self.domains=[line.strip() for line in fr] 18 | with open(self.pathscheme,"r") as fr: 19 | self.schemes=[line.strip() for line in fr] 20 | with open(self.pathextentions,"r") as fr: 21 | self.extentions=[line.strip() for line in fr] 22 | def isfilteredextention(self,path): 23 | try: 24 | for ext in self.extentions: 25 | if path.endswith(ext): 26 | return True 27 | return False 28 | except: 29 | print "extension error" 30 | 31 | def isfilteredscheme(self,scheme): 32 | return scheme is self.schemes 33 | def isfiltereddomains(self,domain): 34 | try: 35 | tokens=domain.split('.')[::-1] 36 | for d in self.domains: 37 | d_tokens=d.split('.')[::-1] 38 | d_reverse=d_tokens[0]+'.'+d_tokens[1] 39 | t_reverse=str(tokens[0]+'.'+tokens[1]) 40 | if d_reverse == t_reverse: 41 | return True 42 | except IndexError as e: 43 | if domain.find('.') == -1: 44 | return True 45 | except AttributeError as e: 46 | print "test" 47 | return False -------------------------------------------------------------------------------- /harvesting/googlesearch.js: -------------------------------------------------------------------------------- 1 | var 
links = []; 2 | var casper = require('casper').create(); 3 | var padding=casper.cli.get(0); 4 | var criteria=casper.cli.get(1); 5 | var ua=casper.cli.get(2) 6 | function getLinks() { 7 | 8 | var links = document.querySelectorAll('h3.r a'); 9 | return Array.prototype.map.call(links, function(e) { 10 | return e.getAttribute('href') 11 | }); 12 | } 13 | 14 | 15 | casper.start(); 16 | casper.userAgent(ua) 17 | casper.open('http://google.com/search?q='+criteria+'&start='+padding) 18 | casper.then(function() { 19 | // aggregate results for the 'casperjs' search 20 | 21 | 22 | links = this.evaluate(getLinks); 23 | 24 | // now search for 'phantomjs' by filling the form again 25 | }); 26 | 27 | 28 | 29 | casper.run(function() { 30 | // echo results in some pretty fashion 31 | this.echo(links.length + ' links found:'); 32 | this.echo(' - ' + links.join('\n - ')).exit(); 33 | }); 34 | -------------------------------------------------------------------------------- /harvesting/keywords: -------------------------------------------------------------------------------- 1 | porn 2 | user&password 3 | -------------------------------------------------------------------------------- /harvesting/metaextract.js: -------------------------------------------------------------------------------- 1 | var casper = require("casper").create() 2 | , url = casper.cli.get(0) 3 | , metas = []; 4 | 5 | if (!url) { 6 | casper.echo('Usage: casperjs [url]').exit(); 7 | } 8 | 9 | casper.start(url, function() { 10 | metas = this.evaluate(function() { 11 | var metas = []; 12 | [].forEach.call(document.querySelectorAll('META'), function(elem) { 13 | var meta = {}; 14 | [].slice.call(elem.attributes).forEach(function(attr) { 15 | meta[attr.name] = attr.value; 16 | }); 17 | metas.push(meta); 18 | }); 19 | return metas; 20 | }); 21 | }); 22 | 23 | casper.run(function() { 24 | require("utils").dump(metas); 25 | this.exit(); 26 | }); 27 | -------------------------------------------------------------------------------- /harvesting/pastebin.js: -------------------------------------------------------------------------------- 1 | var links = []; 2 | var casper = require('casper').create(); 3 | var url=casper.cli.get(0); 4 | var ua=casper.cli.get(1) 5 | function getLinks() { 6 | 7 | var links = document.querySelectorAll('tr a'); 8 | return Array.prototype.map.call(links, function(e) { 9 | return e.getAttribute('href') 10 | }); 11 | } 12 | 13 | 14 | casper.start(); 15 | casper.userAgent(ua); 16 | casper.open(url); 17 | casper.then(function() { 18 | // aggregate results for the 'casperjs' search 19 | links = this.evaluate(getLinks); 20 | 21 | // now search for 'phantomjs' by filling the form again 22 | }); 23 | 24 | 25 | casper.run(function() { 26 | // echo results in some pretty fashion 27 | this.echo(links.length + ' links found:'); 28 | this.echo(' - ' + links.join('\n - ')).exit(); 29 | 30 | }); 31 | -------------------------------------------------------------------------------- /harvesting/pastebin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from subprocess import Popen, PIPE 4 | import threading 5 | import pymongo 6 | from pymongo import Connection 7 | from pastebinExtract import pastebinExtract 8 | from random_user_agent import Random_user_agent 9 | import time 10 | class pastebin(): 11 | def __init__(self, url,keyword,casperJSScript): 12 | self.url=url 13 | self.keyword=keyword 14 | self.casperJSScript=casperJSScript 15 | self.urls=[] 16 | 
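        # A random User-Agent string and sleep interval are picked once per instance;
        # pastebinArchive() below scrapes the archive listing with CasperJS and turns
        # each paste id into a 'http://pastebin.com/raw.php?i=<id>' URL, which
        # pastebinAnalyse() then feeds to pastebinExtract threads.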
rua=Random_user_agent() 17 | self.ua=rua.rand() 18 | self.time = rua.randsleep() 19 | self.result=[] 20 | def pastebinArchive(self): 21 | result=subprocess.Popen(['casperjs' ,self.casperJSScript,self.url,'\''+self.ua+'\''],stdout=PIPE) 22 | for ligne in result.stdout: 23 | if ligne.find('/')!=-1 and ligne.find('archive') == -1: 24 | id=ligne.replace(' - /','').strip() 25 | id=id.replace('\n','') 26 | self.urls.append('http://pastebin.com/raw.php?i='+id) 27 | print self.urls 28 | 29 | def pastebinAnalyse(self): 30 | i=0 31 | main_thread = threading.currentThread() 32 | thread_pool=[] 33 | for url in self.urls: 34 | pasteExtract=pastebinExtract(url) 35 | time.sleep(self.time) 36 | pasteExtract.start() 37 | thread_pool.append(pasteExtract) 38 | i+=1 39 | if i % 500 ==0: 40 | for t in threading.enumerate(): 41 | if t is not main_thread: 42 | t.join() 43 | 44 | for t in thread_pool: 45 | result =getattr(t,'result') 46 | if result : 47 | self.result.append(result) 48 | return self.result 49 | 50 | 51 | -------------------------------------------------------------------------------- /harvesting/pastebinExtract.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from subprocess import Popen, PIPE 4 | import threading 5 | from content import Content 6 | from random_user_agent import Random_user_agent 7 | 8 | class pastebinExtract(threading.Thread): 9 | def __init__(self,url,casperJSScript='pastebintext.js'): 10 | threading.Thread.__init__(self) 11 | self.url=url 12 | self.casperJSScript=casperJSScript 13 | self.content=Content() 14 | self.data=[] 15 | rua=Random_user_agent() 16 | self.ua=rua.rand() 17 | self.result=None 18 | 19 | def run(self): 20 | result=subprocess.Popen(['casperjs' ,self.casperJSScript,self.url,'\''+self.ua+'\''],stdout=PIPE) 21 | for ligne in result.stdout: 22 | record=ligne.strip() 23 | self.data.append(record.lower()) 24 | 25 | keywords=getattr(self.content,'keywords') 26 | for keyword in keywords: 27 | if self.content.search(keyword,str(self.data)): 28 | self.result={'url': self.url, 'data': self.data} -------------------------------------------------------------------------------- /harvesting/pastebintest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import pastebin 3 | 4 | paste=pastebin.pastebin('http://pastebin.com/archive',[],'pastebin.js') 5 | paste.pastebinArchive() 6 | setattr(paste,'casperJSScript','pastebintext.js') 7 | result=paste.pastebinAnalyse() 8 | print result 9 | -------------------------------------------------------------------------------- /harvesting/pastebintext.js: -------------------------------------------------------------------------------- 1 | 2 | var casper = require('casper').create(); 3 | 4 | 5 | var url=casper.cli.get(0); 6 | var ua =casper.cli.get(1) 7 | 8 | casper.start().then(function() { 9 | this.userAgent(ua); 10 | this.open(url, { 11 | method: 'get', 12 | headers: { 13 | 'Accept': 'application/text' 14 | } 15 | }); 16 | }); 17 | 18 | casper.run(function() { 19 | this.echo(this.debugPage()); 20 | this.exit(); 21 | }); 22 | 23 | -------------------------------------------------------------------------------- /harvesting/pholcidae.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import re 4 | import sys 5 | 6 | # importing modules corresponding to Python version 7 | 8 | import urlparse 9 | import urllib2 10 | 11 | class 
Pholcidae(object): 12 | 13 | """" Pholcidae is a small and fast web crawler. """ 14 | 15 | def __init__(self): 16 | 17 | """ 18 | @return void 19 | 20 | Creates Pholcidae instance and updates default settings dict. 21 | """ 22 | 23 | # default local urllib2 opener 24 | self._opener = None 25 | # creating new sets of unparsed, already parsed and failed URLs 26 | self._unparsed_urls = set() 27 | self._parsed_urls = set() 28 | self._failed_urls = set() 29 | # extending settings with given values 30 | self._extend_settings() 31 | # compiling regular expressions 32 | self._compile_regexs() 33 | # autostart crawler if settings allows 34 | if self._settings.autostart: 35 | self.start() 36 | 37 | def crawl(self, response): 38 | 39 | """ 40 | @type response AttrDict 41 | @return void 42 | 43 | Dummy method which can be overrided by inheriting Pholcidae class. 44 | Use it to get html page and parse it as you want to. 45 | """ 46 | 47 | pass 48 | 49 | def start(self): 50 | 51 | """ 52 | @return void 53 | 54 | Simple crawler start trigger. 55 | """ 56 | 57 | self._get_page() 58 | 59 | ############################################################################ 60 | # PRIVATE METHODS # 61 | ############################################################################ 62 | 63 | ############################ INIT METHODS ################################## 64 | 65 | def _extend_settings(self): 66 | 67 | """ 68 | @return void 69 | 70 | Extends default settings with given settings. 71 | """ 72 | 73 | # creating default settings object 74 | self._settings = AttrDict({ 75 | # do we need to follow HTTP redirects? 76 | 'follow_redirects': True, 77 | # what page links do we need to parse? 78 | 'valid_links': ['(.*)'], 79 | # what URLs must be excluded 80 | 'exclude_links': [], 81 | # what is an entry point for crawler? 82 | 'start_page': '/', 83 | # which domain should we parse? 84 | 'domain': '', 85 | # should we ignor pages outside of the given domain? 86 | 'stay_in_domain': True, 87 | # which protocol do we need to use? 88 | 'protocol': 'http://', 89 | # autostart crawler right after initialization? 90 | 'autostart': False, 91 | # cookies to be added to each request 92 | 'cookies': {}, 93 | # custom headers to be added to each request 94 | 'headers': {} 95 | }) 96 | 97 | # updating settings with given values 98 | self._settings.update(self.settings) 99 | 100 | # creating urllib2 opener 101 | self._create_opener() 102 | # compiling cookies 103 | self._compile_cookies() 104 | # compiling headers 105 | self._compile_headers() 106 | 107 | # adding start point into unparsed list 108 | start_url = '%s%s%s' % (self._settings.protocol, self._settings.domain, 109 | self._settings.start_page) 110 | self._unparsed_urls.add(start_url) 111 | 112 | def _compile_regexs(self): 113 | 114 | """ 115 | @return void 116 | 117 | Compiles regular expressions for further use. 
118 | """ 119 | 120 | # setting default flags 121 | flags = re.I | re.S 122 | # compiling regexs 123 | self._regex = AttrDict({ 124 | # collects all links across given page 125 | 'href_links': re.compile(r'', 126 | flags=flags), 127 | # valid links regexs 128 | 'valid_link': [], 129 | # invalid links regexs 130 | 'invalid_link': [] 131 | }) 132 | 133 | # complinig valid links regexs 134 | for regex in self._settings.valid_links: 135 | self._regex.valid_link.append(re.compile(regex, flags=flags)) 136 | 137 | # compiling invalid links regexs 138 | for regex in self._settings.exclude_links: 139 | self._regex.invalid_link.append(re.compile(regex, flags=flags)) 140 | 141 | def _compile_cookies(self): 142 | 143 | """ 144 | @return void 145 | 146 | Compiles given dict of cookies to string. 147 | """ 148 | 149 | compiled = [] 150 | for name, value in self._settings.cookies.items(): 151 | compiled.append('%s=%s' % (name, value)) 152 | self._settings.cookies = ','.join(compiled) 153 | self._opener.addheaders.append(('Cookie', self._settings.cookies)) 154 | 155 | def _compile_headers(self): 156 | 157 | """ 158 | @return void 159 | 160 | Adds given dict of headers to urllib2 opener. 161 | """ 162 | 163 | for header_name, header_value in self._settings.headers.items(): 164 | self._opener.addheaders.append((header_name, header_value)) 165 | 166 | def _create_opener(self): 167 | 168 | """ 169 | @return void 170 | 171 | Creates local urllib2 opener and extends it with custom 172 | redirect handler if needed. 173 | """ 174 | 175 | self._opener = urllib2.build_opener() 176 | if not self._settings.follow_redirects: 177 | self._opener = urllib2.build_opener(PholcidaeRedirectHandler, 178 | urllib2.HTTPCookieProcessor()) 179 | 180 | ########################## CRAWLING METHODS ################################ 181 | 182 | def _get_page(self): 183 | 184 | """ 185 | @return bool 186 | 187 | Fetches page by URL. 188 | """ 189 | 190 | # iterating over unparsed links 191 | while self._unparsed_urls: 192 | # getting link to get 193 | url = self._unparsed_urls.pop() 194 | 195 | # fetching page 196 | page = self._fetch_url(url) 197 | if page.status not in [500, 404, 502]: 198 | # parsing only valid urls 199 | valid_match = self._is_valid_link(page.url) 200 | if valid_match: 201 | # adding regex match to page object 202 | page.match = valid_match 203 | # sending raw HTML to crawl function 204 | self.crawl(page) 205 | # moving url from unparsed to parsed list 206 | self._parsed_urls.add(url) 207 | # collecting links from page 208 | self._get_page_links(page.body, page.url) 209 | else: 210 | # moving url from unparsed to failed list 211 | self._failed_urls.add(url) 212 | 213 | def _get_page_links(self, raw_html, url): 214 | 215 | """ 216 | @type raw_html str 217 | @type url str 218 | @return void 219 | 220 | Parses out all links from crawled web page. 221 | """ 222 | 223 | links_groups = self._regex.href_links.findall(str(raw_html)) 224 | links = [group[1] for group in links_groups] 225 | for link in links: 226 | # is link not excluded? 
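            # (Links matching any 'exclude_links' regex are dropped; relative links
            # are made absolute with urlparse.urljoin() further down, and anything
            # not already parsed or failed is queued in self._unparsed_urls.)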
227 | if not self._is_excluded(link): 228 | # getting link parts 229 | link_info = urlparse.urlparse(link) 230 | # if link not relative 231 | if link_info.scheme or link_info.netloc: 232 | # if stay_in_domain enabled and link outside of domain scope 233 | if self._settings.stay_in_domain: 234 | try: 235 | is_link = self._settings.domain not in link 236 | except UnicodeDecodeError: 237 | continue 238 | else: 239 | if is_link: 240 | continue 241 | else: 242 | # converting relative link into absolute 243 | link = urlparse.urljoin(url, link) 244 | # if link was not previously parsed 245 | if link not in self._parsed_urls: 246 | if link not in self._failed_urls: 247 | self._unparsed_urls.add(link) 248 | 249 | def _is_valid_link(self, link): 250 | 251 | """ 252 | @type link str 253 | @return str 254 | 255 | Compares link with given regex to decide if we need to parse that 256 | page. 257 | """ 258 | 259 | # if hash in URL - assumimg anchor or AJAX 260 | if link and '#' not in link: 261 | for regex in self._regex.valid_link: 262 | matches = regex.findall(link) 263 | if matches: 264 | return matches 265 | return '' 266 | 267 | def _is_excluded(self, link): 268 | 269 | """ 270 | @type link str 271 | @return bool 272 | 273 | Checks if link matches exluded regex. 274 | """ 275 | 276 | for regex in self._regex.invalid_link: 277 | if regex.search(link): 278 | return True 279 | return False 280 | 281 | ######################### URL FETCHING METHODS ############################# 282 | 283 | def _fetch_url(self, url): 284 | 285 | """ 286 | @type url str 287 | @return AttrDict 288 | 289 | Fetches given URL and returns data from it. 290 | """ 291 | 292 | # empty page container 293 | page = AttrDict() 294 | 295 | try: 296 | # getting response from given URL 297 | resp = self._opener.open(url) 298 | page = AttrDict({ 299 | 'body': resp.read(), 300 | 'url': resp.geturl(), 301 | 'headers': AttrDict(dict(resp.headers.items())), 302 | 'cookies': self._parse_cookies(dict(resp.headers.items())), 303 | 'status': resp.getcode() 304 | }) 305 | except: 306 | # drop invalid page with 500 HTTP error code 307 | page = AttrDict({'status': 500}) 308 | self._failed_urls.add(url) 309 | return page 310 | 311 | def _parse_cookies(self, headers): 312 | 313 | """ 314 | @type headers dict 315 | @return AttrDict 316 | 317 | Parses cookies from response headers. 318 | """ 319 | 320 | cookies = AttrDict() 321 | # lowering headers keys 322 | headers_lower={} 323 | 324 | for k,v in headers.items(): 325 | headers_lower[k.lower()]=v 326 | headers=headers_lower 327 | if 'set-cookie' in headers: 328 | # splitting raw cookies 329 | raw_cookies = headers['set-cookie'].split(';') 330 | # cookie parts to throw out 331 | throw_out = ['expires', 'path', 'domain', 'secure', 'HttpOnly'] 332 | for cookie in raw_cookies: 333 | cookie = cookie.split('=') 334 | if cookie[0].strip() not in throw_out: 335 | cookies.update({cookie[0]: cookie[1]}) 336 | return cookies 337 | 338 | 339 | class AttrDict(dict): 340 | 341 | """ A dict that allows for object-like property access syntax. """ 342 | 343 | def __init__(self, new_dict=None): 344 | dict.__init__(self) 345 | if new_dict: 346 | self.update(new_dict) 347 | 348 | def __getattr__(self, name): 349 | try: 350 | return self[name] 351 | except KeyError: 352 | raise AttributeError(name) 353 | 354 | def __setattr__(self, key, value): 355 | self.update({key: value}) 356 | 357 | 358 | class PholcidaeRedirectHandler(urllib2.HTTPRedirectHandler): 359 | 360 | """ Custom URL redirects handler. 
""" 361 | 362 | def http_error_302(self, req, fp, code, msg, headers): 363 | return fp 364 | 365 | http_error_301 = http_error_303 = http_error_307 = http_error_302 366 | -------------------------------------------------------------------------------- /harvesting/random_user_agent.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 2, 2012 3 | 4 | @author: slarinier 5 | ''' 6 | import random 7 | class Random_user_agent(object): 8 | ''' 9 | classdocs 10 | ''' 11 | _instance = None 12 | def __init__(self,path_user_agent='harvesting/user_agents'): 13 | ''' 14 | Constructor 15 | ''' 16 | self.user_agent_list=[] 17 | self.path_user_agent=path_user_agent 18 | with open(self.path_user_agent,'r') as fr: 19 | for user_agent in fr: 20 | if user_agent.find('#') == -1: 21 | self.user_agent_list.append(user_agent) 22 | 23 | 24 | def __new__(cls, *args, **kwargs): 25 | if not cls._instance: 26 | cls._instance = super(Random_user_agent, cls).__new__( 27 | cls, *args, **kwargs) 28 | return cls._instance 29 | 30 | def rand(self): 31 | return random.choice(self.user_agent_list) 32 | def randsleep(self): 33 | return random.randrange(1,3,2) -------------------------------------------------------------------------------- /harvesting/search.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | from subprocess import PIPE 3 | from white_list import white_list 4 | import re 5 | import subprocess 6 | import threading 7 | from random_user_agent import Random_user_agent 8 | 9 | class search(threading.Thread): 10 | def __init__(self,limit,criteria,scriptjs,db,url_pattern='((https?|ftp|gopher|telnet|file|notes|ms-help):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&])*)'): 11 | threading.Thread.__init__(self) 12 | self.result=[] 13 | self.limit=limit 14 | self.criteria=criteria 15 | self.scriptjs=scriptjs 16 | self.connection= MongoClient(host='localhost', port=27017) 17 | self.db=self.connection[db] 18 | self.whitelist=white_list(db) 19 | self.regex_url=re.compile(url_pattern) 20 | rua=Random_user_agent() 21 | self.ua=rua.rand() 22 | self.urls_by_domaine={} 23 | 24 | def run(self): 25 | i=0 26 | while i < self.limit: 27 | result=subprocess.Popen(['casperjs' ,self.scriptjs,str(i),self.criteria,self.ua],stdout=PIPE) 28 | for ligne in result.stdout: 29 | if ligne.find('/')!=-1 and ligne.find('http://') != -1: 30 | url_information=self.regex_url.search(ligne) 31 | url=url_information.group(1) 32 | domaine=url.split('/')[2] 33 | tokens=domaine.split('.') 34 | racine=tokens[len(tokens)-2]+'.'+tokens[len(tokens)-1] 35 | 36 | print "domain found: "+ domaine 37 | 38 | if not racine in getattr(self.whitelist, 'white_domaine'): 39 | if domaine in self.urls_by_domaine: 40 | urls= self.urls_by_domaine[domaine] 41 | urls.append(url) 42 | self.urls_by_domaine[domaine]=urls 43 | else: 44 | self.urls_by_domaine[domaine]=[url] 45 | 46 | i=i+10 47 | 48 | def record(self): 49 | print "#######################record############################" 50 | domaines = iter(self.urls_by_domaine) 51 | for domaine in domaines: 52 | entry = self.db.new_domaines.find_one({'domaine':domaine}) 53 | if entry == None: 54 | self.db.new_domaines.insert_one({'domaine':domaine,'urls':self.urls_by_domaine[domaine],'criteria':[self.criteria]}) 55 | else: 56 | 57 | try: 58 | urls_stored = entry['urls'] 59 | urls=self.urls_by_domaine[domaine] 60 | urls_to_store=list(set(urls_stored + urls)) 61 | criteria=entry['criteria'] 62 | 
criteria=list(set(criteria.append(self.criteria))) 63 | entry['criteria']=criteria 64 | self.db.new_domaines.save(entry) 65 | except : 66 | criteria=[] 67 | try : 68 | criteria=entry['criteria'] 69 | criteria=list(set(criteria.append(self.criteria))) 70 | except: 71 | criteria.append(self.criteria) 72 | pass 73 | 74 | entry['criteria']=criteria 75 | try: 76 | self.db.new_domaines.insert_one({'domaine':domaine},{'urls':self.urls_by_domaine[domaine],'criteria':criteria}) 77 | except: 78 | pass 79 | 80 | -------------------------------------------------------------------------------- /harvesting/white_list.py: -------------------------------------------------------------------------------- 1 | from mongodb import mongodb 2 | import os 3 | import glob 4 | class white_list(): 5 | 6 | def __init__(self,db): 7 | self.mdb=mongodb.mongodb('localhost',27017,db) 8 | self.white_list=[] 9 | self.white_domaine=['msn.com','google.com','wikipedia.fr','free.fr','linkedin.com'] 10 | 11 | def loadWhiteList(self): 12 | domaines=self.mdb.selectall('white_list') 13 | for domaine in domaines: 14 | self. white_domaine.append(domaine['domaine']) 15 | 16 | def makeWhiteList(self,path): 17 | list_files=os.walk(path) 18 | for root,dirs,files in list_files: 19 | category='' 20 | for fl in files: 21 | if fl=='domains': 22 | with open(root+'/'+fl,'r') as fr: 23 | root=root.replace(path,'') 24 | if '/' in root: 25 | category=root.replace('/','_') 26 | else: 27 | category=root 28 | for ligne in fr: 29 | item={'domaine':ligne.strip(),'category':category} 30 | self.mdb.update(item,'white_list') 31 | def searchInWhiteList(self,domaine): 32 | result=self.mdb.selectbycreteria('domaine',domaine,'white_list') 33 | if result is not None: 34 | category=result[0] 35 | print category['category'] 36 | return category 37 | #def compare_white_list() 38 | -------------------------------------------------------------------------------- /harvesting/yahoosearch.js: -------------------------------------------------------------------------------- 1 | var links = []; 2 | var casper = require('casper').create(); 3 | var padding=casper.cli.get(0) 4 | var criteria=casper.cli.get(1) 5 | var ua = casper.cli.get(2) 6 | 7 | function getLinks() { 8 | 9 | var links = document.querySelectorAll('h3 a'); 10 | return Array.prototype.map.call(links, function(e) { 11 | return e.getAttribute('href') 12 | }); 13 | } 14 | 15 | 16 | casper.start(); 17 | casper.userAgent(ua) 18 | casper.open('http://fr.yahoo.com/search='+criteria+'&rd=r1&fr=yfp-t-731&fr2=sb-top&xargs=0&pstart=1&b='+padding) 19 | casper.then(function() { 20 | // aggregate results for the 'casperjs' search 21 | 22 | links = this.evaluate(getLinks); 23 | 24 | // now search for 'phantomjs' by filling the form again 25 | }); 26 | 27 | 28 | 29 | casper.run(function() { 30 | // echo results in some pretty fashion 31 | this.echo(links.length + ' links found:'); 32 | this.echo(' - ' + links.join('\n - ')).exit(); 33 | }); 34 | -------------------------------------------------------------------------------- /history/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/history/__init__.py -------------------------------------------------------------------------------- /history/history.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jan 18, 2013 3 | 4 | @author: slarinier 5 | ''' 6 | 7 | import datetime 8 | import 
logging 9 | 10 | class History(object): 11 | ''' 12 | classdocs 13 | ''' 14 | 15 | 16 | def __init__(self): 17 | ''' 18 | Constructor 19 | ''' 20 | d=datetime.datetime.now() 21 | date_value=d.strftime("%Y-%m-%d") 22 | self.logger=logging.getLogger('history') 23 | hdlr = logging.FileHandler('history/'+date_value+'.log') 24 | formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') 25 | hdlr.setFormatter(formatter) 26 | self.logger.addHandler(hdlr) 27 | self.logger.setLevel(logging.INFO) 28 | 29 | def register(self,action): 30 | self.logger.info(action) 31 | 32 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' 3 | Created on Sep 25, 2012 4 | 5 | @author: slarinier 6 | ''' 7 | 8 | from actions import Actions 9 | import argparse 10 | from history.history import History 11 | import sys 12 | import threading 13 | 14 | if __name__ == '__main__': 15 | scriptsJS = ['harvesting/googlesearch.js', 'harvesting/bingsearch.js', 'harvesting/yahoosearch.js'] 16 | h = History() 17 | result = [] 18 | domaine_ip = {} 19 | 20 | # limit=sys.argv[4] 21 | 22 | 23 | parser = argparse.ArgumentParser(description='metaharvester') 24 | parser.add_argument('--db', dest='db', help='db in mongo to store informations') 25 | parser.add_argument('--geoloc', dest='geoloc') 26 | parser.add_argument('--action', dest='action') 27 | parser.add_argument('--criteria', dest='criteria') 28 | parser.add_argument('--collection', dest='collection') 29 | parser.add_argument('--attr', dest='attr') 30 | parser.add_argument('--threadpool', dest='threadpool') 31 | parser.add_argument('--filters', dest='filters') 32 | parser.add_argument('--domains', dest='domains') 33 | parser.add_argument('--range', dest='range') 34 | parser.add_argument('--nmap_options', dest='nmap_options') 35 | args = parser.parse_args() 36 | db = args.db 37 | filters = args.filters 38 | criteria = args.criteria 39 | if criteria == None: 40 | criteria = '' 41 | geoloc = args.geoloc 42 | if geoloc == None: 43 | geoloc = '' 44 | collection = args.collection 45 | attr = args.attr 46 | msg = db + ' ' + ' ' + args.action + ' ' + criteria 47 | h.register(msg) 48 | act = Actions(db) 49 | if args.action == 'reset': 50 | act.reset() 51 | elif args.action == 'metasearch': 52 | if criteria and scriptsJS and db and geoloc: 53 | criteria = criteria.split(',') 54 | act.metasearch(criteria, scriptsJS, geoloc) 55 | elif args.action == 'search_ip': 56 | act.search_ip(geoloc, scriptsJS, args.range) 57 | elif args.action == 'create_network': 58 | act.create_network() 59 | elif args.action == 'metadata': 60 | act.metadata_exctract() 61 | elif args.action == 'create_result': 62 | if not criteria and not db: 63 | parser.print_help() 64 | else: 65 | if collection: 66 | act.create_result(collection, criteria) 67 | elif args.action == 'dnstree': 68 | if db: 69 | act.dnstree(db) 70 | elif args.action == 'crawl' and args.domains: 71 | if db: 72 | act.crawl(args.domains) 73 | elif args.action == 'cleandb': 74 | if db and filters: 75 | act.clean_db(filters) 76 | elif args.action == 'screenshots': 77 | if db and args.threadpool: 78 | act.screenshots(db, args.threadpool) 79 | else: 80 | parser.print_help() 81 | elif args.action == 'init': 82 | if db and attr and collection: 83 | act.init(db, collection, attr) 84 | else: 85 | parser.print_help() 86 | elif args.action == 'nmap': 87 | if args.nmap_options or args.range: 88 | 
act.scan_nmap(args.range, args.nmap_options) 89 | else: 90 | 91 | parser.print_help() 92 | sys.exit(1) 93 | -------------------------------------------------------------------------------- /mongodb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/mongodb/__init__.py -------------------------------------------------------------------------------- /mongodb/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/mongodb/__init__.pyc -------------------------------------------------------------------------------- /mongodb/mongodb.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | import bson 3 | import pymongo 4 | 5 | 6 | class mongodb(object): 7 | def __init__(self, host, port, db): 8 | self.host = host 9 | self.port = port 10 | self.connection = MongoClient(host=host, port=port) 11 | self.db = self.connection[db] 12 | 13 | def insert(self, collection, key, value): 14 | col = self.db[collection] 15 | value_db = {'domaine': value} 16 | # col.create_index([('domaine', pymongo.DESCENDING)]) 17 | col.save(value_db) 18 | 19 | def update(self, item, collection): 20 | col = self.db[collection] 21 | try: 22 | col.save(item) 23 | except bson.errors.InvalidStringData: 24 | print 'InvalidString ' + str(item) 25 | 26 | def selectbyDict(self, request, col): 27 | self.col = self.db[col] 28 | return self.col.find(request) 29 | 30 | def selectbycreteria(self, key, criteria, col): 31 | request = {key: criteria} 32 | self.col = self.db[col] 33 | return self.col.find(request) 34 | 35 | def selectall(self, collection): 36 | col = self.db[collection] 37 | return col.find() 38 | 39 | def insertMultiCriteria(self, collection, items): 40 | print "insert " + str(items) 41 | col = self.db[collection] 42 | try: 43 | col.save(items) 44 | except ValueError: 45 | print 'Erreur encoding: ' + items 46 | -------------------------------------------------------------------------------- /mongodb/mongodb.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/mongodb/mongodb.pyc -------------------------------------------------------------------------------- /network/IPy.py: -------------------------------------------------------------------------------- 1 | """ IPy - class and tools for handling of IPv4 and IPv6 Addresses and Networks. 2 | 3 | $HeadURL: http://svn.23.nu/svn/repos/IPy/trunk/IPy.py $ 4 | 5 | $Id: IPy.py 671 2004-08-22 21:02:29Z md $ 6 | 7 | The IP class allows a comfortable parsing and handling for most 8 | notations in use for IPv4 and IPv6 Addresses and Networks. It was 9 | greatly inspired bei RIPE's Perl module NET::IP's interface but 10 | doesn't share the Implementation. It doesn't share non-CIDR netmasks, 11 | so funky stuff lixe a netmask 0xffffff0f can't be done here. 12 | 13 | >>> ip = IP('127.0.0.0/30') 14 | >>> for x in ip: 15 | ... print x 16 | ... 
17 | 127.0.0.0 18 | 127.0.0.1 19 | 127.0.0.2 20 | 127.0.0.3 21 | >>> ip2 = IP('0x7f000000/30') 22 | >>> ip == ip2 23 | 1 24 | >>> ip.reverseNames() 25 | ['0.0.0.127.in-addr.arpa.', '1.0.0.127.in-addr.arpa.', '2.0.0.127.in-addr.arpa.', '3.0.0.127.in-addr.arpa.'] 26 | >>> ip.reverseName() 27 | '0-3.0.0.127.in-addr.arpa.' 28 | >>> ip.iptype() 29 | 'PRIVATE' 30 | 31 | It can detect about a dozen different ways of expressing IP addresses 32 | and networks, parse them and distinguish between IPv4 and IPv6 addresses. 33 | 34 | >>> IP('10.0.0.0/8').version() 35 | 4 36 | >>> IP('::1').version() 37 | 6 38 | >>> print IP(0x7f000001) 39 | 127.0.0.1 40 | >>> print IP('0x7f000001') 41 | 127.0.0.1 42 | >>> print IP('127.0.0.1') 43 | 127.0.0.1 44 | >>> print IP('10') 45 | 10.0.0.0 46 | >>> print IP('1080:0:0:0:8:800:200C:417A') 47 | 1080:0000:0000:0000:0008:0800:200c:417a 48 | >>> print IP('1080::8:800:200C:417A') 49 | 1080:0000:0000:0000:0008:0800:200c:417a 50 | >>> print IP('::1') 51 | 0000:0000:0000:0000:0000:0000:0000:0001 52 | >>> print IP('::13.1.68.3') 53 | 0000:0000:0000:0000:0000:0000:0d01:4403 54 | >>> print IP('127.0.0.0/8') 55 | 127.0.0.0/8 56 | >>> print IP('127.0.0.0/255.0.0.0') 57 | 127.0.0.0/8 58 | >>> print IP('127.0.0.0-127.255.255.255') 59 | 127.0.0.0/8 60 | 61 | Nearly all class methods which return a string have an optional 62 | parameter 'wantprefixlen' which controlles if the prefixlen or netmask 63 | is printed. Per default the prefilen is always shown if the net 64 | contains more than one address. 65 | 66 | wantprefixlen == 0 / None don't return anything 1.2.3.0 67 | wantprefixlen == 1 /prefix 1.2.3.0/24 68 | wantprefixlen == 2 /netmask 1.2.3.0/255.255.255.0 69 | wantprefixlen == 3 -lastip 1.2.3.0-1.2.3.255 70 | 71 | You can also change the defaults on an per-object basis by fiddeling with the class members 72 | 73 | NoPrefixForSingleIp 74 | WantPrefixLen 75 | 76 | >>> IP('10.0.0.0/32').strNormal() 77 | '10.0.0.0' 78 | >>> IP('10.0.0.0/24').strNormal() 79 | '10.0.0.0/24' 80 | >>> IP('10.0.0.0/24').strNormal(0) 81 | '10.0.0.0' 82 | >>> IP('10.0.0.0/24').strNormal(1) 83 | '10.0.0.0/24' 84 | >>> IP('10.0.0.0/24').strNormal(2) 85 | '10.0.0.0/255.255.255.0' 86 | >>> IP('10.0.0.0/24').strNormal(3) 87 | '10.0.0.0-10.0.0.255' 88 | >>> ip = IP('10.0.0.0') 89 | >>> print ip 90 | 10.0.0.0 91 | >>> ip.NoPrefixForSingleIp = None 92 | >>> print ip 93 | 10.0.0.0/32 94 | >>> ip.WantPrefixLen = 3 95 | >>> print ip 96 | 10.0.0.0-10.0.0.0 97 | 98 | 99 | Further Information might be available at http://c0re.jp/c0de/IPy/ 100 | 101 | Hacked 2001 by drt@un.bewaff.net 102 | 103 | TODO: 104 | * better comparison (__cmp__ and friends) 105 | * tests for __cmp__ 106 | * always write hex values lowercase 107 | * interpret 2001:1234:5678:1234/64 as 2001:1234:5678:1234::/64 108 | * move size in bits into class variables to get rid of some "if self._ipversion ..." 
109 | * support for base85 encoding 110 | * support for output of IPv6 encoded IPv4 Addresses 111 | * update address type tables 112 | * first-last notation should be allowed for IPv6 113 | * add IPv6 docstring examples 114 | * check better for negative parameters 115 | * add addition / aggregation 116 | * move reverse name stuff out of the classes and refactor it 117 | * support for aggregation of more than two nets at once 118 | * support for aggregation with "holes" 119 | * support for finding common prefix 120 | * '>>' and '<<' for prefix manipulation 121 | * add our own exceptions instead ValueError all the time 122 | * rename checkPrefix to checkPrefixOk 123 | * add more documentation and doctests 124 | * refactor 125 | """ 126 | 127 | __rcsid__ = '$Id: IPy.py 671 2004-08-22 21:02:29Z md $' 128 | __version__ = '0.42' 129 | 130 | import types 131 | 132 | # Definition of the Ranges for IPv4 IPs 133 | # this should include www.iana.org/assignments/ipv4-address-space 134 | # and www.iana.org/assignments/multicast-addresses 135 | IPv4ranges = { 136 | '0' : 'PUBLIC', # fall back 137 | '00000000' : 'PRIVATE', # 0/8 138 | '00001010' : 'PRIVATE', # 10/8 139 | '01111111' : 'PRIVATE', # 127.0/8 140 | '1' : 'PUBLIC', # fall back 141 | '101011000001' : 'PRIVATE', # 172.16/12 142 | '1100000010101000' : 'PRIVATE', # 192.168/16 143 | '11011111' : 'RESERVED', # 223/8 144 | '111' : 'RESERVED' # 224/3 145 | } 146 | 147 | # Definition of the Ranges for IPv6 IPs 148 | # see also www.iana.org/assignments/ipv6-address-space, 149 | # www.iana.org/assignments/ipv6-tla-assignments, 150 | # www.iana.org/assignments/ipv6-multicast-addresses, 151 | # www.iana.org/assignments/ipv6-anycast-addresses 152 | IPv6ranges = { 153 | '00000000' : 'RESERVED', # ::/8 154 | '00000001' : 'UNASSIGNED', # 100::/8 155 | '0000001' : 'NSAP', # 200::/7 156 | '0000010' : 'IPX', # 400::/7 157 | '0000011' : 'UNASSIGNED', # 600::/7 158 | '00001' : 'UNASSIGNED', # 800::/5 159 | '0001' : 'UNASSIGNED', # 1000::/4 160 | '0010000000000000' : 'RESERVED', # 2000::/16 Reserved 161 | '0010000000000001' : 'ASSIGNABLE', # 2001::/16 Sub-TLA Assignments [RFC2450] 162 | '00100000000000010000000': 'ASSIGNABLE IANA', # 2001:0000::/29 - 2001:01F8::/29 IANA 163 | '00100000000000010000001': 'ASSIGNABLE APNIC', # 2001:0200::/29 - 2001:03F8::/29 APNIC 164 | '00100000000000010000010': 'ASSIGNABLE ARIN', # 2001:0400::/29 - 2001:05F8::/29 ARIN 165 | '00100000000000010000011': 'ASSIGNABLE RIPE', # 2001:0600::/29 - 2001:07F8::/29 RIPE NCC 166 | '0010000000000010' : '6TO4', # 2002::/16 "6to4" [RFC3056] 167 | '0011111111111110' : '6BONE', # 3FFE::/16 6bone Testing [RFC2471] 168 | '0011111111111111' : 'RESERVED', # 3FFF::/16 Reserved 169 | '010' : 'GLOBAL-UNICAST', # 4000::/3 170 | '011' : 'UNASSIGNED', # 6000::/3 171 | '100' : 'GEO-UNICAST', # 8000::/3 172 | '101' : 'UNASSIGNED', # A000::/3 173 | '110' : 'UNASSIGNED', # C000::/3 174 | '1110' : 'UNASSIGNED', # E000::/4 175 | '11110' : 'UNASSIGNED', # F000::/5 176 | '111110' : 'UNASSIGNED', # F800::/6 177 | '1111110' : 'UNASSIGNED', # FC00::/7 178 | '111111100' : 'UNASSIGNED', # FE00::/9 179 | '1111111010' : 'LINKLOCAL', # FE80::/10 180 | '1111111011' : 'SITELOCAL', # FEC0::/10 181 | '11111111' : 'MULTICAST', # FF00::/8 182 | '0' * 96 : 'IPV4COMP', # ::/96 183 | '0' * 80 + '1' * 16 : 'IPV4MAP', # ::FFFF:0:0/96 184 | '0' * 128 : 'UNSPECIFIED', # ::/128 185 | '0' * 127 + '1' : 'LOOPBACK' # ::1/128 186 | } 187 | 188 | 189 | class IPint: 190 | """Handling of IP addresses returning integers. 
191 | 192 | Use class IP instead because some features are not implemented for 193 | IPint.""" 194 | 195 | def __init__(self, data, ipversion = 0): 196 | """Create an instance of an IP object. 197 | 198 | Data can be a network specification or a single IP. IP 199 | Addresses can be specified in all forms understood by 200 | parseAddress.() the size of a network can be specified as 201 | 202 | /prefixlen a.b.c.0/24 2001:658:22a:cafe::/64 203 | -lastIP a.b.c.0-a.b.c.255 2001:658:22a:cafe::-2001:658:22a:cafe:ffff:ffff:ffff:ffff 204 | /decimal netmask a.b.c.d/255.255.255.0 not supported for IPv6 205 | 206 | If no size specification is given a size of 1 address (/32 for 207 | IPv4 and /128 for IPv6) is assumed. 208 | 209 | >>> print IP('127.0.0.0/8') 210 | 127.0.0.0/8 211 | >>> print IP('127.0.0.0/255.0.0.0') 212 | 127.0.0.0/8 213 | >>> print IP('127.0.0.0-127.255.255.255') 214 | 127.0.0.0/8 215 | 216 | See module documentation for more examples. 217 | """ 218 | 219 | self.NoPrefixForSingleIp = 1 # Print no Prefixlen for /32 and /128 220 | self.WantPrefixLen = None # Do we want prefix printed by default? see _printPrefix() 221 | 222 | netbits = 0 223 | prefixlen = -1 224 | 225 | # handling of non string values in constructor 226 | if type(data) == types.IntType or type(data) == types.LongType: 227 | self.ip = long(data) 228 | if ipversion == 0: 229 | if self.ip < 0x100000000L: 230 | ipversion = 4 231 | else: 232 | ipversion = 6 233 | if ipversion == 4: 234 | prefixlen = 32 235 | elif ipversion == 6: 236 | prefixlen = 128 237 | else: 238 | raise ValueError, "only IPv4 and IPv6 supported" 239 | self._ipversion = ipversion 240 | self._prefixlen = prefixlen 241 | # handle IP instance as an parameter 242 | elif isinstance(data, IPint): 243 | self._ipversion = data._ipversion 244 | self._prefixlen = data._prefixlen 245 | self.ip = data.ip 246 | else: 247 | # TODO: refactor me! 248 | # splitting of a string into IP and prefixlen et. al. 249 | x = data.split('-') 250 | if len(x) == 2: 251 | # a.b.c.0-a.b.c.255 specification ? 
252 | (ip, last) = x 253 | (self.ip, parsedVersion) = parseAddress(ip) 254 | if parsedVersion != 4: 255 | raise ValueError, "first-last notation only allowed for IPv4" 256 | (last, lastversion) = parseAddress(last) 257 | if lastversion != 4: 258 | raise ValueError, "last address should be IPv4, too" 259 | if last < self.ip: 260 | raise ValueError, "last address should be larger than first" 261 | size = last - self.ip 262 | netbits = _count1Bits(size) 263 | elif len(x) == 1: 264 | x = data.split('/') 265 | # if no prefix is given use defaults 266 | if len(x) == 1: 267 | ip = x[0] 268 | prefixlen = -1 269 | elif len(x) > 2: 270 | raise ValueError, "only one '/' allowed in IP Address" 271 | else: 272 | (ip, prefixlen) = x 273 | if prefixlen.find('.') != -1: 274 | # check if the user might have used a netmask like 275 | # a.b.c.d/255.255.255.0 276 | (netmask, vers) = parseAddress(prefixlen) 277 | if vers != 4: 278 | raise ValueError, "netmask must be IPv4" 279 | prefixlen = _netmaskToPrefixlen(netmask) 280 | elif len(x) > 2: 281 | raise ValueError, "only one '-' allowed in IP Address" 282 | else: 283 | raise ValueError, "can't parse" 284 | 285 | (self.ip, parsedVersion) = parseAddress(ip) 286 | if ipversion == 0: 287 | ipversion = parsedVersion 288 | if prefixlen == -1: 289 | if ipversion == 4: 290 | prefixlen = 32 - netbits 291 | elif ipversion == 6: 292 | prefixlen = 128 - netbits 293 | else: 294 | raise ValueError, "only IPv4 and IPv6 supported" 295 | self._ipversion = ipversion 296 | self._prefixlen = int(prefixlen) 297 | 298 | if not _checkNetaddrWorksWithPrefixlen(self.ip, self._prefixlen, self._ipversion): 299 | raise ValueError, "%s goes not well with prefixlen %d" % (hex(self.ip), self._prefixlen) 300 | 301 | 302 | def int(self): 303 | """Return the first / base / network addess as an (long) integer. 304 | 305 | The same as IP[0]. 306 | 307 | >>> hex(IP('10.0.0.0/8').int()) 308 | '0xA000000L' 309 | """ 310 | return self.ip 311 | 312 | def version(self): 313 | """Return the IP version of this Object. 314 | 315 | >>> IP('10.0.0.0/8').version() 316 | 4 317 | >>> IP('::1').version() 318 | 6 319 | """ 320 | return self._ipversion 321 | 322 | def prefixlen(self): 323 | """Returns Network Prefixlen. 324 | 325 | >>> IP('10.0.0.0/8').prefixlen() 326 | 8 327 | """ 328 | return self._prefixlen 329 | 330 | def net(self): 331 | """Return the base (first) address of a network as an (long) integer.""" 332 | 333 | return self.int() 334 | 335 | def broadcast(self): 336 | """Return the broadcast (last) address of a network as an (long) integer. 337 | 338 | The same as IP[-1].""" 339 | return self.int() + self.len() - 1 340 | 341 | def _printPrefix(self, want): 342 | """Prints Prefixlen/Netmask. 343 | 344 | Not really. In fact it is our universal Netmask/Prefixlen printer. 345 | This is considered an internel function. 
346 | 347 | want == 0 / None don't return anything 1.2.3.0 348 | want == 1 /prefix 1.2.3.0/24 349 | want == 2 /netmask 1.2.3.0/255.255.255.0 350 | want == 3 -lastip 1.2.3.0-1.2.3.255 351 | """ 352 | 353 | if (self._ipversion == 4 and self._prefixlen == 32) or \ 354 | (self._ipversion == 6 and self._prefixlen == 128): 355 | if self.NoPrefixForSingleIp: 356 | want = 0 357 | if want == None: 358 | want = self.WantPrefixLen 359 | if want == None: 360 | want = 1 361 | if want: 362 | if want == 2: 363 | # this should work wit IP and IPint 364 | netmask = self.netmask() 365 | if type(netmask) != types.IntType and type(netmask) != types.LongType: 366 | netmask = netmask.int() 367 | return "/%s" % (intToIp(netmask, self._ipversion)) 368 | elif want == 3: 369 | return "-%s" % (intToIp(self.ip + self.len() - 1, self._ipversion)) 370 | else: 371 | # default 372 | return "/%d" % (self._prefixlen) 373 | else: 374 | return '' 375 | 376 | # We have different Favours to convert to: 377 | # strFullsize 127.0.0.1 2001:0658:022a:cafe:0200:c0ff:fe8d:08fa 378 | # strNormal 127.0.0.1 2001:658:22a:cafe:200:c0ff:fe8d:08fa 379 | # strCompressed 127.0.0.1 2001:658:22a:cafe::1 380 | # strHex 0x7F000001L 0x20010658022ACAFE0200C0FFFE8D08FA 381 | # strDec 2130706433 42540616829182469433547974687817795834 382 | 383 | def strBin(self, wantprefixlen = None): 384 | """Return a string representation as a binary value. 385 | 386 | >>> print IP('127.0.0.1').strBin() 387 | 01111111000000000000000000000001 388 | """ 389 | 390 | 391 | if self._ipversion == 4: 392 | bits = 32 393 | elif self._ipversion == 6: 394 | bits = 128 395 | else: 396 | raise ValueError, "only IPv4 and IPv6 supported" 397 | 398 | if self.WantPrefixLen == None and wantprefixlen == None: 399 | wantprefixlen = 0 400 | ret = _intToBin(self.ip) 401 | return '0' * (bits - len(ret)) + ret + self._printPrefix(wantprefixlen) 402 | 403 | def strCompressed(self, wantprefixlen = None): 404 | """Return a string representation in compressed format using '::' Notation. 
405 | 406 | >>> print IP('127.0.0.1').strCompressed() 407 | 127.0.0.1 408 | >>> print IP('2001:0658:022a:cafe:0200::1').strCompressed() 409 | 2001:658:22a:cafe:200::1 410 | """ 411 | 412 | if self.WantPrefixLen == None and wantprefixlen == None: 413 | wantprefixlen = 1 414 | 415 | if self._ipversion == 4: 416 | return self.strFullsize(wantprefixlen) 417 | else: 418 | # find the longest sequence of '0' 419 | hextets = [int(x, 16) for x in self.strFullsize(0).split(':')] 420 | # every element of followingzeros will contain the number of zeros 421 | # following the corrospondending element of hextetes 422 | followingzeros = [0] * 8 423 | for i in range(len(hextets)): 424 | followingzeros[i] = _countFollowingZeros(hextets[i:]) 425 | # compressionpos is the position where we can start removing zeros 426 | compressionpos = followingzeros.index(max(followingzeros)) 427 | if max(followingzeros) > 1: 428 | # genererate string with the longest number of zeros cut out 429 | # now we need hextets as strings 430 | hextets = [x for x in self.strNormal(0).split(':')] 431 | while compressionpos < len(hextets) and hextets[compressionpos] == '0': 432 | del(hextets[compressionpos]) 433 | hextets.insert(compressionpos, '') 434 | if compressionpos + 1 >= len(hextets): 435 | hextets.append('') 436 | if compressionpos == 0: 437 | hextets = [''] + hextets 438 | return ':'.join(hextets) + self._printPrefix(wantprefixlen) 439 | else: 440 | return self.strNormal() + self._printPrefix(wantprefixlen) 441 | 442 | def strNormal(self, wantprefixlen = None): 443 | """Return a string representation in the usual format. 444 | 445 | >>> print IP('127.0.0.1').strNormal() 446 | 127.0.0.1 447 | >>> print IP('2001:0658:022a:cafe:0200::1').strNormal() 448 | 2001:658:22a:cafe:200:0:0:1 449 | """ 450 | 451 | if self.WantPrefixLen == None and wantprefixlen == None: 452 | wantprefixlen = 1 453 | 454 | if self._ipversion == 4: 455 | ret = self.strFullsize(0) 456 | elif self._ipversion == 6: 457 | ret = ':'.join([hex(x)[2:] for x in [int(x, 16) for x in self.strFullsize(0).split(':')]]) 458 | else: 459 | raise ValueError, "only IPv4 and IPv6 supported" 460 | 461 | 462 | 463 | return ret + self._printPrefix(wantprefixlen) 464 | 465 | def strFullsize(self, wantprefixlen = None): 466 | """Return a string representation in the non mangled format. 467 | 468 | >>> print IP('127.0.0.1').strFullsize() 469 | 127.0.0.1 470 | >>> print IP('2001:0658:022a:cafe:0200::1').strFullsize() 471 | 2001:0658:022a:cafe:0200:0000:0000:0001 472 | """ 473 | 474 | if self.WantPrefixLen == None and wantprefixlen == None: 475 | wantprefixlen = 1 476 | 477 | return intToIp(self.ip, self._ipversion).lower() + self._printPrefix(wantprefixlen) 478 | 479 | def strHex(self, wantprefixlen = None): 480 | """Return a string representation in hex format. 481 | 482 | >>> print IP('127.0.0.1').strHex() 483 | 0x7F000001 484 | >>> print IP('2001:0658:022a:cafe:0200::1').strHex() 485 | 0x20010658022ACAFE0200000000000001 486 | """ 487 | 488 | if self.WantPrefixLen == None and wantprefixlen == None: 489 | wantprefixlen = 0 490 | 491 | x = hex(self.ip) 492 | if x[-1] == 'L': 493 | x = x[:-1] 494 | return x + self._printPrefix(wantprefixlen) 495 | 496 | def strDec(self, wantprefixlen = None): 497 | """Return a string representation in decimal format. 
498 | 499 | >>> print IP('127.0.0.1').strDec() 500 | 2130706433 501 | >>> print IP('2001:0658:022a:cafe:0200::1').strDec() 502 | 42540616829182469433547762482097946625 503 | """ 504 | 505 | if self.WantPrefixLen == None and wantprefixlen == None: 506 | wantprefixlen = 0 507 | 508 | x = str(self.ip) 509 | if x[-1] == 'L': 510 | x = x[:-1] 511 | return x + self._printPrefix(wantprefixlen) 512 | 513 | def iptype(self): 514 | """Return a description of the IP type ('PRIVATE', 'RESERVERD', etc). 515 | 516 | >>> print IP('127.0.0.1').iptype() 517 | PRIVATE 518 | >>> print IP('192.168.1.1').iptype() 519 | PRIVATE 520 | >>> print IP('195.185.1.2').iptype() 521 | PUBLIC 522 | >>> print IP('::1').iptype() 523 | LOOPBACK 524 | >>> print IP('2001:0658:022a:cafe:0200::1').iptype() 525 | ASSIGNABLE RIPE 526 | 527 | The type information for IPv6 is out of sync with reality. 528 | """ 529 | 530 | # this could be greatly improved 531 | 532 | if self._ipversion == 4: 533 | iprange = IPv4ranges 534 | elif self._ipversion == 6: 535 | iprange = IPv6ranges 536 | else: 537 | raise ValueError, "only IPv4 and IPv6 supported" 538 | 539 | bits = self.strBin() 540 | for i in range(len(bits), 0, -1): 541 | if iprange.has_key(bits[:i]): 542 | return iprange[bits[:i]] 543 | return "unknown" 544 | 545 | 546 | def netmask(self): 547 | """Return netmask as an integer. 548 | 549 | >>> print hex(IP('195.185.0.0/16').netmask().int()) 550 | 0xFFFF0000L 551 | """ 552 | 553 | # TODO: unify with prefixlenToNetmask? 554 | if self._ipversion == 4: 555 | locallen = 32 - self._prefixlen 556 | elif self._ipversion == 6: 557 | locallen = 128 - self._prefixlen 558 | else: 559 | raise ValueError, "only IPv4 and IPv6 supported" 560 | 561 | return ((2L ** self._prefixlen) - 1) << locallen 562 | 563 | 564 | def strNetmask(self): 565 | """Return netmask as an string. Mostly useful for IPv6. 566 | 567 | >>> print IP('195.185.0.0/16').strNetmask() 568 | 255.255.0.0 569 | >>> print IP('2001:0658:022a:cafe::0/64').strNetmask() 570 | /64 571 | """ 572 | 573 | # TODO: unify with prefixlenToNetmask? 574 | if self._ipversion == 4: 575 | locallen = 32 - self._prefixlen 576 | return intToIp(((2L ** self._prefixlen) - 1) << locallen, 4) 577 | elif self._ipversion == 6: 578 | locallen = 128 - self._prefixlen 579 | return "/%d" % self._prefixlen 580 | else: 581 | raise ValueError, "only IPv4 and IPv6 supported" 582 | 583 | def len(self): 584 | """Return the length of an subnet. 585 | 586 | >>> print IP('195.185.1.0/28').len() 587 | 16 588 | >>> print IP('195.185.1.0/24').len() 589 | 256 590 | """ 591 | 592 | if self._ipversion == 4: 593 | locallen = 32 - self._prefixlen 594 | elif self._ipversion == 6: 595 | locallen = 128 - self._prefixlen 596 | else: 597 | raise ValueError, "only IPv4 and IPv6 supported" 598 | 599 | return 2L ** locallen 600 | 601 | 602 | def __len__(self): 603 | """Return the length of an subnet. 604 | 605 | Called to implement the built-in function len(). 606 | It breaks with IPv6 Networks. Anybody knows how to fix this.""" 607 | 608 | # Python < 2.2 has this silly restriction which breaks IPv6 609 | # how about Python >= 2.2 ... ouch - it presists! 610 | 611 | return int(self.len()) 612 | 613 | 614 | def __getitem__(self, key): 615 | """Called to implement evaluation of self[key]. 616 | 617 | >>> ip=IP('127.0.0.0/30') 618 | >>> for x in ip: 619 | ... print hex(x.int()) 620 | ... 
621 | 0x7F000000L 622 | 0x7F000001L 623 | 0x7F000002L 624 | 0x7F000003L 625 | >>> hex(ip[2].int()) 626 | '0x7F000002L' 627 | >>> hex(ip[-1].int()) 628 | '0x7F000003L' 629 | """ 630 | 631 | if type(key) != types.IntType and type(key) != types.LongType: 632 | raise TypeError 633 | if abs(key) >= self.len(): 634 | raise IndexError 635 | if key < 0: 636 | key = self.len() - abs(key) 637 | 638 | return self.ip + long(key) 639 | 640 | 641 | 642 | def __contains__(self, item): 643 | """Called to implement membership test operators. 644 | 645 | Should return true if item is in self, false otherwise. Item 646 | can be other IP-objects, strings or ints. 647 | 648 | >>> print IP('195.185.1.1').strHex() 649 | 0xC3B90101 650 | >>> 0xC3B90101L in IP('195.185.1.0/24') 651 | 1 652 | >>> '127.0.0.1' in IP('127.0.0.0/24') 653 | 1 654 | >>> IP('127.0.0.0/24') in IP('127.0.0.0/25') 655 | 0 656 | """ 657 | 658 | item = IP(item) 659 | if item.ip >= self.ip and item.ip < self.ip + self.len() - item.len() + 1: 660 | return 1 661 | else: 662 | return 0 663 | 664 | 665 | def overlaps(self, item): 666 | """Check if two IP address ranges overlap. 667 | 668 | Returns 0 if the two ranged don't overlap, 1 if the given 669 | range overlaps at the end and -1 if it does at the beginning. 670 | 671 | >>> IP('192.168.0.0/23').overlaps('192.168.1.0/24') 672 | 1 673 | >>> IP('192.168.0.0/23').overlaps('192.168.1.255') 674 | 1 675 | >>> IP('192.168.0.0/23').overlaps('192.168.2.0') 676 | 0 677 | >>> IP('192.168.1.0/24').overlaps('192.168.0.0/23') 678 | -1 679 | """ 680 | 681 | item = IP(item) 682 | if item.ip >= self.ip and item.ip < self.ip + self.len(): 683 | return 1 684 | elif self.ip >= item.ip and self.ip < item.ip + item.len(): 685 | return -1 686 | else: 687 | return 0 688 | 689 | 690 | def __str__(self): 691 | """Dispatch to the prefered String Representation. 692 | 693 | Used to implement str(IP).""" 694 | 695 | return self.strFullsize() 696 | 697 | 698 | def __repr__(self): 699 | """Print a representation of the Object. 700 | 701 | Used to implement repr(IP). Returns a string which evaluates 702 | to an identical Object (without the wnatprefixlen stuff - see 703 | module docstring. 704 | 705 | >>> print repr(IP('10.0.0.0/24')) 706 | IP('10.0.0.0/24') 707 | """ 708 | 709 | return("IPint('%s')" % (self.strCompressed(1))) 710 | 711 | 712 | def __cmp__(self, other): 713 | """Called by comparison operations. 714 | 715 | Should return a negative integer if self < other, zero if self 716 | == other, a positive integer if self > other. 717 | 718 | Networks with different prefixlen are considered non-equal. 719 | Networks with the same prefixlen and differing addresses are 720 | considered non equal but are compared by thair base address 721 | integer value to aid sorting of IP objects. 722 | 723 | The Version of Objects is not put into consideration. 724 | 725 | >>> IP('10.0.0.0/24') > IP('10.0.0.0') 726 | 1 727 | >>> IP('10.0.0.0/24') < IP('10.0.0.0') 728 | 0 729 | >>> IP('10.0.0.0/24') < IP('12.0.0.0/24') 730 | 1 731 | >>> IP('10.0.0.0/24') > IP('12.0.0.0/24') 732 | 0 733 | 734 | """ 735 | 736 | # Im not really sure if this is "the right thing to do" 737 | if self._prefixlen < other.prefixlen(): 738 | return (other.prefixlen() - self._prefixlen) 739 | elif self._prefixlen > other.prefixlen(): 740 | 741 | # Fixed bySamuel Krempp : 742 | 743 | # The bug is quite obvious really (as 99% bugs are once 744 | # spotted, isn't it ? 
;-) Because of precedence of 745 | # multiplication by -1 over the substraction, prefixlen 746 | # differences were causing the __cmp__ function to always 747 | # return positive numbers, thus the function was failing 748 | # the basic assumptions for a __cmp__ function. 749 | 750 | # Namely we could have (a > b AND b > a), when the 751 | # prefixlen of a and b are different. (eg let 752 | # a=IP("1.0.0.0/24"); b=IP("2.0.0.0/16");) thus, anything 753 | # could happen when launching a sort algorithm.. 754 | # everything's in order with the trivial, attached patch. 755 | 756 | return (self._prefixlen - other.prefixlen()) * -1 757 | else: 758 | if self.ip < other.ip: 759 | return -1 760 | elif self.ip > other.ip: 761 | return 1 762 | else: 763 | return 0 764 | 765 | 766 | def __hash__(self): 767 | """Called for the key object for dictionary operations, and by 768 | the built-in function hash() Should return a 32-bit integer 769 | usable as a hash value for dictionary operations. The only 770 | required property is that objects which compare equal have the 771 | same hash value 772 | 773 | >>> hex(IP('10.0.0.0/24').__hash__()) 774 | '0xf5ffffe7' 775 | """ 776 | 777 | thehash = int(-1) 778 | ip = self.ip 779 | while ip > 0: 780 | thehash = thehash ^ (ip & 0x7fffffff) 781 | ip = ip >> 32 782 | thehash = thehash ^ self._prefixlen 783 | return int(thehash) 784 | 785 | 786 | class IP(IPint): 787 | """Class for handling IP Addresses and Networks.""" 788 | 789 | def net(self): 790 | """Return the base (first) address of a network as an IP object. 791 | 792 | The same as IP[0]. 793 | 794 | >>> IP('10.0.0.0/8').net() 795 | IP('10.0.0.0') 796 | """ 797 | return IP(IPint.net(self)) 798 | 799 | def broadcast(self): 800 | """Return the broadcast (last) address of a network as an IP object. 801 | 802 | The same as IP[-1]. 803 | 804 | >>> IP('10.0.0.0/8').broadcast() 805 | IP('10.255.255.255') 806 | """ 807 | return IP(IPint.broadcast(self)) 808 | 809 | def netmask(self): 810 | """Return netmask as an IP object. 811 | 812 | >>> IP('10.0.0.0/8').netmask() 813 | IP('255.0.0.0') 814 | """ 815 | return IP(IPint.netmask(self)) 816 | 817 | 818 | def reverseNames(self): 819 | """Return a list with values forming the reverse lookup. 820 | 821 | >>> IP('213.221.113.87/32').reverseNames() 822 | ['87.113.221.213.in-addr.arpa.'] 823 | >>> IP('213.221.112.224/30').reverseNames() 824 | ['224.112.221.213.in-addr.arpa.', '225.112.221.213.in-addr.arpa.', '226.112.221.213.in-addr.arpa.', '227.112.221.213.in-addr.arpa.'] 825 | >>> IP('127.0.0.0/24').reverseNames() 826 | ['0.0.127.in-addr.arpa.'] 827 | >>> IP('127.0.0.0/23').reverseNames() 828 | ['0.0.127.in-addr.arpa.', '1.0.127.in-addr.arpa.'] 829 | >>> IP('127.0.0.0/16').reverseNames() 830 | ['0.127.in-addr.arpa.'] 831 | >>> IP('127.0.0.0/15').reverseNames() 832 | ['0.127.in-addr.arpa.', '1.127.in-addr.arpa.'] 833 | >>> IP('128.0.0.0/8').reverseNames() 834 | ['128.in-addr.arpa.'] 835 | >>> IP('128.0.0.0/7').reverseNames() 836 | ['128.in-addr.arpa.', '129.in-addr.arpa.'] 837 | 838 | """ 839 | 840 | if self._ipversion == 4: 841 | ret =[] 842 | # TODO: Refactor. 
Add support for IPint objects 843 | if self.len() < 2**8: 844 | for x in self: 845 | ret.append(x.reverseName()) 846 | elif self.len() < 2**16L: 847 | for i in range(0, self.len(), 2**8): 848 | ret.append(self[i].reverseName()[2:]) 849 | elif self.len() < 2**24L: 850 | for i in range(0, self.len(), 2**16): 851 | ret.append(self[i].reverseName()[4:]) 852 | else: 853 | for i in range(0, self.len(), 2**24): 854 | ret.append(self[i].reverseName()[6:]) 855 | return ret 856 | elif self._ipversion == 6: 857 | s = hex(self.ip)[2:].lower() 858 | if s[-1] == 'l': 859 | s = s[:-1] 860 | if self._prefixlen % 4 != 0: 861 | raise NotImplementedError, "can't create IPv6 reverse names at sub nibble level" 862 | s = list(s) 863 | s.reverse() 864 | s = '.'.join(s) 865 | first_nibble_index = int(32 - (self._prefixlen / 4)) * 2 866 | return ["%s.ip6.int." % s[first_nibble_index:]] 867 | else: 868 | raise ValueError, "only IPv4 and IPv6 supported" 869 | 870 | 871 | 872 | def reverseName(self): 873 | """Return the value for reverse lookup/PTR records as RfC 2317 look alike. 874 | 875 | RfC 2317 is an ugly hack which only works for sub-/24 e.g. not 876 | for /23. Do not use it. Better set up a Zone for every 877 | address. See reverseName for a way to arcive that. 878 | 879 | >>> print IP('195.185.1.1').reverseName() 880 | 1.1.185.195.in-addr.arpa. 881 | >>> print IP('195.185.1.0/28').reverseName() 882 | 0-15.1.185.195.in-addr.arpa. 883 | """ 884 | 885 | if self._ipversion == 4: 886 | s = self.strFullsize(0) 887 | s = s.split('.') 888 | s.reverse() 889 | first_byte_index = int(4 - (self._prefixlen / 8)) 890 | if self._prefixlen % 8 != 0: 891 | nibblepart = "%s-%s" % (s[3-(self._prefixlen / 8)], intToIp(self.ip + self.len() - 1, 4).split('.')[-1]) 892 | if nibblepart[-1] == 'l': 893 | nibblepart = nibblepart[:-1] 894 | nibblepart += '.' 895 | else: 896 | nibblepart = "" 897 | 898 | s = '.'.join(s[first_byte_index:]) 899 | return "%s%s.in-addr.arpa." % (nibblepart, s) 900 | 901 | elif self._ipversion == 6: 902 | s = hex(self.ip)[2:].lower() 903 | if s[-1] == 'l': 904 | s = s[:-1] 905 | if self._prefixlen % 4 != 0: 906 | nibblepart = "%s-%s" % (s[self._prefixlen:], hex(self.ip + self.len() - 1)[2:].lower()) 907 | if nibblepart[-1] == 'l': 908 | nibblepart = nibblepart[:-1] 909 | nibblepart += '.' 910 | else: 911 | nibblepart = "" 912 | s = list(s) 913 | s.reverse() 914 | s = '.'.join(s) 915 | first_nibble_index = int(32 - (self._prefixlen / 4)) * 2 916 | return "%s%s.ip6.int." % (nibblepart, s[first_nibble_index:]) 917 | else: 918 | raise ValueError, "only IPv4 and IPv6 supported" 919 | 920 | def __getitem__(self, key): 921 | """Called to implement evaluation of self[key]. 922 | 923 | >>> ip=IP('127.0.0.0/30') 924 | >>> for x in ip: 925 | ... print str(x) 926 | ... 927 | 127.0.0.0 928 | 127.0.0.1 929 | 127.0.0.2 930 | 127.0.0.3 931 | >>> print str(ip[2]) 932 | 127.0.0.2 933 | >>> print str(ip[-1]) 934 | 127.0.0.3 935 | """ 936 | return IP(IPint.__getitem__(self, key)) 937 | 938 | def __repr__(self): 939 | """Print a representation of the Object. 940 | 941 | >>> IP('10.0.0.0/8') 942 | IP('10.0.0.0/8') 943 | """ 944 | 945 | return("IP('%s')" % (self.strCompressed(1))) 946 | 947 | def __add__(self, other): 948 | """Emulate numeric objects through network aggregation""" 949 | if self.prefixlen() != other.prefixlen(): 950 | raise ValueError, "Only networks with the same prefixlen can be added." 951 | if self.prefixlen < 1: 952 | raise ValueError, "Networks with a prefixlen longer than /1 can't be added." 
953 | if self.version() != other.version(): 954 | raise ValueError, "Only networks with the same IP version can be added." 955 | if self > other: 956 | # fixed by Skinny Puppy 957 | return other.__add__(self) 958 | else: 959 | ret = IP(self.int()) 960 | ret._prefixlen = self.prefixlen() - 1 961 | return ret 962 | 963 | def parseAddress(ipstr): 964 | """Parse a string and return the corrospondending IPaddress and the a guess of the IP version. 965 | 966 | Following Forms ar recorgnized: 967 | 0x0123456789abcdef # IPv4 if <= 0xffffffff else IPv6 968 | 123.123.123.123 # IPv4 969 | 123.123 # 0-padded IPv4 970 | 1080:0000:0000:0000:0008:0800:200C:417A 971 | 1080:0:0:0:8:800:200C:417A 972 | 1080:0::8:800:200C:417A 973 | ::1 974 | :: 975 | 0:0:0:0:0:FFFF:129.144.52.38 976 | ::13.1.68.3 977 | ::FFFF:129.144.52.38 978 | """ 979 | 980 | # TODO: refactor me! 981 | if ipstr.startswith('0x'): 982 | ret = long(ipstr[2:], 16) 983 | if ret > 0xffffffffffffffffffffffffffffffffL: 984 | raise ValueError, "%r: IP Address can't be bigger than 2^128" % (ipstr) 985 | if ret < 0x100000000L: 986 | return (ret, 4) 987 | else: 988 | return (ret, 6) 989 | 990 | if ipstr.find(':') != -1: 991 | # assume IPv6 992 | if ipstr.find(':::') != -1: 993 | raise ValueError, "%r: IPv6 Address can't contain ':::'" % (ipstr) 994 | hextets = ipstr.split(':') 995 | if ipstr.find('.') != -1: 996 | # this might be a mixed address like '0:0:0:0:0:0:13.1.68.3' 997 | (v4, foo) = parseAddress(hextets[-1]) 998 | assert foo == 4 999 | del(hextets[-1]) 1000 | hextets.append(hex(v4 >> 16)[2:-1]) 1001 | hextets.append(hex(v4 & 0xffff)[2:-1]) 1002 | if len(hextets) > 8: 1003 | raise ValueError, "%r: IPv6 Address with more than 8 hexletts" % (ipstr) 1004 | if len(hextets) < 8: 1005 | if '' not in hextets: 1006 | raise ValueError, "%r IPv6 Address with less than 8 hexletts and without '::'" % (ipstr) 1007 | # catch :: at the beginning or end 1008 | if hextets.index('') < len(hextets) - 1 and hextets[hextets.index('')+1] == '': 1009 | hextets.remove('') 1010 | # catch '::' 1011 | if hextets.index('') < len(hextets) - 1 and hextets[hextets.index('')+1] == '': 1012 | hextets.remove('') 1013 | 1014 | for foo in range(9-len(hextets)): 1015 | hextets.insert(hextets.index(''), '0') 1016 | hextets.remove('') 1017 | if '' in hextets: 1018 | raise ValueError, "%r IPv6 Address may contain '::' only once" % (ipstr) 1019 | if '' in hextets: 1020 | raise ValueError, "%r IPv6 Address may contain '::' only if it has less than 8 hextets" % (ipstr) 1021 | num = '' 1022 | for x in hextets: 1023 | if len(x) < 4: 1024 | x = ((4 - len(x)) * '0') + x 1025 | if int(x, 16) < 0 or int(x, 16) > 0xffff: 1026 | raise ValueError, "%r: single hextet must be 0 <= hextet <= 0xffff which isn't true for %s" % (ipstr, x) 1027 | num += x 1028 | return (long(num, 16), 6) 1029 | 1030 | elif len(ipstr) == 32: 1031 | # assume IPv6 in pure hexadecimal notation 1032 | return (long(ipstr, 16), 6) 1033 | 1034 | elif ipstr.find('.') != -1 or (len(ipstr) < 4 and int(ipstr) < 256): 1035 | # assume IPv4 ('127' gets interpreted as '127.0.0.0') 1036 | bytes = ipstr.split('.') 1037 | if len(bytes) > 4: 1038 | raise ValueError, "IPv4 Address with more than 4 bytes" 1039 | bytes += ['0'] * (4 - len(bytes)) 1040 | bytes = [long(x) for x in bytes] 1041 | for x in bytes: 1042 | if x > 255 or x < 0: 1043 | raise ValueError, "%r: single byte must be 0 <= byte < 256" % (ipstr) 1044 | return ((bytes[0] << 24) + (bytes[1] << 16) + (bytes[2] << 8) + bytes[3], 4) 1045 | 1046 | else: 1047 | # we try to 
interprete it as a decimal digit - 1048 | # this ony works for numbers > 255 ... others 1049 | # will be interpreted as IPv4 first byte 1050 | ret = long(ipstr) 1051 | if ret > 0xffffffffffffffffffffffffffffffffL: 1052 | raise ValueError, "IP Address cant be bigger than 2^128" 1053 | if ret <= 0xffffffffL: 1054 | return (ret, 4) 1055 | else: 1056 | return (ret, 6) 1057 | 1058 | 1059 | def intToIp(ip, version): 1060 | """Transform an integer string into an IP address.""" 1061 | 1062 | # just to be sure and hoping for Python 2.22 1063 | ip = long(ip) 1064 | 1065 | if ip < 0: 1066 | raise ValueError, "IPs can't be negative: %d" % (ip) 1067 | 1068 | ret = '' 1069 | if version == 4: 1070 | if ip > 0xffffffffL: 1071 | raise ValueError, "IPv4 Addresses can't be larger than 0xffffffff: %s" % (hex(ip)) 1072 | for l in range(4): 1073 | ret = str(ip & 0xffL) + '.' + ret 1074 | ip = ip >> 8; 1075 | ret = ret[:-1] 1076 | elif version == 6: 1077 | if ip > 0xffffffffffffffffffffffffffffffffL: 1078 | raise ValueError, "IPv6 Addresses can't be larger than 0xffffffffffffffffffffffffffffffff: %s" % (hex(ip)) 1079 | l = '0' * 32 + hex(ip)[2:-1] 1080 | for x in range(1,33): 1081 | ret = l[-x] + ret 1082 | if x % 4 == 0: 1083 | ret = ':' + ret 1084 | ret = ret[1:] 1085 | else: 1086 | raise ValueError, "only IPv4 and IPv6 supported" 1087 | 1088 | return ret; 1089 | 1090 | def _ipVersionToLen(version): 1091 | """Return number of bits in address for a certain IP version. 1092 | 1093 | >>> _ipVersionToLen(4) 1094 | 32 1095 | >>> _ipVersionToLen(6) 1096 | 128 1097 | >>> _ipVersionToLen(5) 1098 | Traceback (most recent call last): 1099 | File "", line 1, in ? 1100 | File "IPy.py", line 1076, in _ipVersionToLen 1101 | raise ValueError, "only IPv4 and IPv6 supported" 1102 | ValueError: only IPv4 and IPv6 supported 1103 | """ 1104 | 1105 | if version == 4: 1106 | return 32 1107 | elif version == 6: 1108 | return 128 1109 | else: 1110 | raise ValueError, "only IPv4 and IPv6 supported" 1111 | 1112 | 1113 | def _countFollowingZeros(l): 1114 | """Return Nr. 
of elements containing 0 at the beginning th the list.""" 1115 | if len(l) == 0: 1116 | return 0 1117 | elif l[0] != 0: 1118 | return 0 1119 | else: 1120 | return 1 + _countFollowingZeros(l[1:]) 1121 | 1122 | 1123 | _BitTable = {'0': '0000', '1': '0001', '2': '0010', '3': '0011', 1124 | '4': '0100', '5': '0101', '6': '0110', '7': '0111', 1125 | '8': '1000', '9': '1001', 'a': '1010', 'b': '1011', 1126 | 'c': '1100', 'd': '1101', 'e': '1110', 'f': '1111'} 1127 | 1128 | def _intToBin(val): 1129 | """Return the binary representation of an integer as string.""" 1130 | 1131 | if val < 0: 1132 | raise ValueError, "Only positive Values allowed" 1133 | s = hex(val).lower() 1134 | ret = '' 1135 | if s[-1] == 'l': 1136 | s = s[:-1] 1137 | for x in s[2:]: 1138 | if __debug__: 1139 | if not _BitTable.has_key(x): 1140 | raise AssertionError, "hex() returned strange result" 1141 | ret += _BitTable[x] 1142 | # remove leading zeros 1143 | while ret[0] == '0' and len(ret) > 1: 1144 | ret = ret[1:] 1145 | return ret 1146 | 1147 | def _count1Bits(num): 1148 | """Find the highest bit set to 1 in an integer.""" 1149 | ret = 0 1150 | while num > 0: 1151 | num = num >> 1 1152 | ret += 1 1153 | return ret 1154 | 1155 | def _count0Bits(num): 1156 | """Find the highest bit set to 0 in an integer.""" 1157 | 1158 | # this could be so easy if _count1Bits(~long(num)) would work as excepted 1159 | num = long(num) 1160 | if num < 0: 1161 | raise ValueError, "Only positive Numbers please: %s" % (num) 1162 | ret = 0 1163 | while num > 0: 1164 | if num & 1 == 1: 1165 | break 1166 | num = num >> 1 1167 | ret += 1 1168 | return ret 1169 | 1170 | 1171 | def _checkPrefix(ip, prefixlen, version): 1172 | """Check the validity of a prefix 1173 | 1174 | Checks if the variant part of a prefix only has 0s, and the length is 1175 | correct. 1176 | 1177 | >>> _checkPrefix(0x7f000000L, 24, 4) 1178 | 1 1179 | >>> _checkPrefix(0x7f000001L, 24, 4) 1180 | 0 1181 | >>> repr(_checkPrefix(0x7f000001L, -1, 4)) 1182 | 'None' 1183 | >>> repr(_checkPrefix(0x7f000001L, 33, 4)) 1184 | 'None' 1185 | """ 1186 | 1187 | # TODO: unify this v4/v6/invalid code in a function 1188 | bits = _ipVersionToLen(version) 1189 | 1190 | if prefixlen < 0 or prefixlen > bits: 1191 | return None 1192 | 1193 | if ip == 0: 1194 | zbits = bits + 1 1195 | else: 1196 | zbits = _count0Bits(ip) 1197 | if zbits < bits - prefixlen: 1198 | return 0 1199 | else: 1200 | return 1 1201 | 1202 | 1203 | def _checkNetmask(netmask, masklen): 1204 | """Checks if a netmask is expressable as e prefixlen.""" 1205 | 1206 | num = long(netmask) 1207 | bits = masklen 1208 | 1209 | # remove zero bits at the end 1210 | while (num & 1) == 0: 1211 | num = num >> 1 1212 | bits -= 1 1213 | if bits == 0: 1214 | break 1215 | # now check if the rest consists only of ones 1216 | while bits > 0: 1217 | if (num & 1) == 0: 1218 | raise ValueError, "Netmask %s can't be expressed as an prefix." % (hex(netmask)) 1219 | num = num >> 1 1220 | bits -= 1 1221 | 1222 | 1223 | def _checkNetaddrWorksWithPrefixlen(net, prefixlen, version): 1224 | """Check if a base addess of e network is compatible with a prefixlen""" 1225 | if net & _prefixlenToNetmask(prefixlen, version) == net: 1226 | return 1 1227 | else: 1228 | return 0 1229 | 1230 | 1231 | def _netmaskToPrefixlen(netmask): 1232 | """Convert an Integer reprsenting a Netmask to an prefixlen. 1233 | 1234 | E.g. 
0xffffff00 (255.255.255.0) returns 24 1235 | """ 1236 | 1237 | netlen = _count0Bits(netmask) 1238 | masklen = _count1Bits(netmask) 1239 | _checkNetmask(netmask, masklen) 1240 | return masklen - netlen 1241 | 1242 | 1243 | def _prefixlenToNetmask(prefixlen, version): 1244 | """Return a mask of n bits as a long integer. 1245 | 1246 | From 'IP address conversion functions with the builtin socket module' by Alex Martelli 1247 | http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66517 1248 | """ 1249 | if prefixlen == 0: 1250 | return 0 1251 | elif prefixlen < 0: 1252 | raise ValueError, "Prefixlen must be > 0" 1253 | return ((2L< 0: 20 | critere = [critere] 21 | 22 | with open(self.dbname + '_' + '_'.join(critere) + '.csv', 'w') as fw: 23 | for domaine in domaines: 24 | try: 25 | towrite = '' 26 | for key in critere: 27 | infos = domaine[key] 28 | if len(infos) > 0: 29 | if isinstance(infos, list): 30 | infos = ','.join(infos) 31 | towrite = towrite + ',' + str(infos) 32 | 33 | fw.write(towrite[1:] + '\n') 34 | except KeyError: 35 | print 'domaine: ' + str(domaine) 36 | except pymongo.errors.OperationFailure: 37 | print 'error mongo ' + str(domaine) 38 | -------------------------------------------------------------------------------- /processing/createcorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from mongodb import mongodb 3 | import sys 4 | import filters 5 | db=sys.argv[1] 6 | mdb=mongodb.mongodb('localhost',27017,db) 7 | 8 | i=0 9 | 10 | with open(db+'_domaine.txt','w') as fw: 11 | fw.write('**** *domaine\n') 12 | for domaine in mdb.selectall('metadatas'): 13 | fw.write(domaine['domaine']) 14 | fw.write('\n') 15 | with open(db+'_metadatas.txt','w') as fw: 16 | fw.write('**** *metadata\n') 17 | for domaine in mdb.selectall('metadatas'): 18 | meta=domaine['meta'] 19 | for filt in filters.filters_metadata: 20 | meta=meta.replace(filt,'') 21 | meta=meta.replace(filt.swapcase(),'') 22 | fw.write(meta.encode('ascii','ignore')) 23 | fw.write('\n') 24 | 25 | 26 | 27 | fw.close() 28 | 29 | 30 | -------------------------------------------------------------------------------- /processing/dnstree.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Dec 20, 2012 3 | 4 | @author: slarinier 5 | ''' 6 | from pymongo import MongoClient 7 | from pyfaup.faup import Faup 8 | 9 | 10 | class DNSTree(object): 11 | ''' 12 | classdocs 13 | ''' 14 | 15 | def __init__(self, db_value): 16 | ''' 17 | Constructor 18 | ''' 19 | connection = MongoClient(host='localhost', port=27017) 20 | self.db = connection[db_value] 21 | 22 | def process(self): 23 | list_domains = self.db['new_domaines'].distinct('domaine') 24 | fex = Faup() 25 | for domain in list_domains: 26 | url = 'http://' + str(domain) 27 | fex.decode(url) 28 | 29 | try: 30 | print ( 31 | fex.get_tld() + ',' + fex.get_domain() + ',' + ','.join(fex.get_subdomain().split('.')[::-1]).replace('www', 32 | '')).replace( 33 | ',,', ',') 34 | except: 35 | pass -------------------------------------------------------------------------------- /processing/filters.py: -------------------------------------------------------------------------------- 1 | filters_metadata=['charset','text','iso','html','-8859-1','www','fr'] 2 | -------------------------------------------------------------------------------- /processing/filters.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/processing/filters.pyc -------------------------------------------------------------------------------- /processing/gouv.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/processing/gouv.log -------------------------------------------------------------------------------- /processing/gouv_domaine.txt: -------------------------------------------------------------------------------- 1 | **** *domaine 2 | legifrance.gouv.fr 3 | www.eure.sit.gouv.fr 4 | archives.livreblancdefenseetsecurite.gouv.fr 5 | www.yonne.sit.gouv.fr 6 | www.internet.gouv.fr 7 | www.direct-fr.com 8 | forum.webmaster-rank.info 9 | www.drees.sante.gouv.fr 10 | www.impots.gouv.fr 11 | www.oncfs.gouv.fr 12 | www.legifrance.gouv.fr 13 | www.interieur.gouv.fr 14 | www.refondonslecole.gouv.fr 15 | www.immigration.gouv.fr 16 | www.dmp.gouv.fr 17 | www.vendee.gouv.fr 18 | www.gouvernement.fr 19 | direccte.gouv.fr 20 | service-public.fr 21 | www.jeunes.gouv.fr 22 | vosdroits.service-public.fr 23 | www.tourisme.gouv.fr 24 | www.service-public.fr 25 | archives.forum.gouv.fr 26 | www.banqoutils.education.gouv.fr 27 | www2.impots.gouv.fr 28 | www.sgdn.gouv.fr 29 | www.cerpet.education.gouv.fr 30 | www.legifrance.org 31 | www-int.dmp.gouv.fr 32 | www.hcst.fr 33 | www.publinetd5.education.fr 34 | archives.dividende-numerique.fr 35 | cpcnu.fr 36 | www.dicod.defense.gouv.fr 37 | www.legifrance.com 38 | delegation.internet.gouv.fr 39 | www.clemi.org 40 | www-prod.sante.gouv.fr 41 | archives.europe.gouv.fr 42 | www.sante.gouv.fr 43 | www.nord.gouv.fr 44 | www.ove-national.education.fr 45 | www.2011-annee-droits-patients.sante.gouv.fr 46 | org-www.sante.gouv.fr 47 | www.emsome.terre.defense.gouv.fr 48 | www.rt519.terre.defense.gouv.fr 49 | archives.surfez-intelligent.gouv.fr 50 | www.restructurations.defense.gouv.fr 51 | www.bretagne.pref.gouv.fr 52 | www.bca13.terre.defense.gouv.fr 53 | ons.education.gouv.fr 54 | www.juryeps.education.fr 55 | 160.92.162.230 56 | www.publinetde.education.fr 57 | publinetce2.education.fr 58 | www.cdj59.org 59 | www.ancien.eure.pref.gouv.fr 60 | www.drdjs-lorraine.jeunesse-sports.gouv.fr 61 | www.hce.education.fr 62 | www.cpcnu.fr 63 | www.bretagne.drjscs.gouv.fr 64 | www.centre.drjscs.gouv.fr 65 | www.paca.drjscs.gouv.fr 66 | www.auvergne.drjscs.gouv.fr 67 | www.observatoire-parite.gouv.fr 68 | www.rama3.terre.defense.gouv.fr 69 | www.securite-sociale.fr 70 | www.haute-normandie.drjscs.gouv.fr 71 | www.bilrif.sga.defense.gouv.fr 72 | www.drjscs.gouv.fr 73 | www.aquitaine.drjscs.gouv.fr 74 | www.sports.gouv.fr 75 | www.anesm.sante.gouv.fr 76 | www.cesat.terre.defense.gouv.fr 77 | www.franche-comte.drjscs.gouv.fr 78 | www.garnison-besancon.terre.defense.gouv.fr 79 | www.etrs.terre.defense.gouv.fr 80 | www.bca7.terre.defense.gouv.fr 81 | www.lorraine.drjscs.gouv.fr 82 | www.midi-pyrenees.drjscs.gouv.fr 83 | www.rhone-alpes.drjscs.gouv.fr 84 | fr.webmaster-rank.info 85 | it.webmaster-rank.info 86 | easy404.webmaster-rank.info 87 | www.webmaster-rank.info 88 | www.pyrenees-atlantiques.pref.gouv.fr 89 | www.vigicrues.developpement-durable.gouv.fr 90 | www.memoiredeshommes.sga.defense.gouv.fr 91 | www.vigicrues.ecologie.gouv.fr 92 | www.servicehistorique.sga.defense.gouv.fr 93 | www.basse-normandie.pref.gouv.fr 94 | www.loire.pref.gouv.fr 95 | www.oise.pref.gouv.fr 96 | www.dordogne.pref.gouv.fr 97 | 
dordogne.pref.gouv.fr 98 | www.morbihan.pref.gouv.fr 99 | formation.oncfs.gouv.fr 100 | www.martinique.pref.gouv.fr 101 | www.drome.pref.gouv.fr 102 | loire.gouv.fr 103 | www.tarn-et-garonne.pref.gouv.fr 104 | www.cada.fr 105 | www.loire.gouv.fr 106 | m.geoportail.fr 107 | www.conseilculturel-upm.gouv.fr 108 | www.essonne.gouv.fr 109 | archives.internet.gouv.fr 110 | www.like-rank.com 111 | www.drogues.gouv.fr 112 | www.nord.pref.gouv.fr 113 | drogues.gouv.fr 114 | www.haute-saone.pref.gouv.fr 115 | www.maine-et-loire.pref.gouv.fr 116 | www.gopher.com 117 | www.ariege.pref.gouv.fr 118 | www2.direct-fr.com 119 | www.pyrenees-orientales.pref.gouv.fr 120 | www.haut-rhin.pref.gouv.fr 121 | www.isere.pref.gouv.fr 122 | www.somme.pref.gouv.fr 123 | search.kiwee.com 124 | ardeche.pref.gouv.fr 125 | vendee.gouv.fr 126 | www.franche-comte.pref.gouv.fr 127 | org-www.impots.gouv.fr 128 | www.contact.impots.gouv.fr 129 | interne.impots.gouv.fr 130 | contacts.impots.gouv.fr 131 | www.champagne-ardenne.pref.gouv.fr 132 | www.cartocrime.net 133 | www.guadeloupe.dieccte.gouv.fr 134 | www.guyane.dieccte.gouv.fr 135 | www.auvergne.direccte.gouv.fr 136 | www.boamp.fr 137 | www.paca.direccte.gouv.fr 138 | www.gers.pref.gouv.fr 139 | www.savoie.pref.gouv.fr 140 | www.ladocumentationfrancaise.fr 141 | www.vie-publique.fr 142 | www.pme.service-public.fr 143 | www.direccte.gouv.fr 144 | www.alsace.direccte.gouv.fr 145 | www.marne.pref.gouv.fr 146 | lannuaire.service-public.fr 147 | www.corse.direccte.gouv.fr 148 | www.ddjs-ardennes.jeunesse-sports.gouv.fr 149 | www.correze.pref.gouv.fr 150 | www.centre.pref.gouv.fr 151 | landes.pref.gouv.fr 152 | www.recrutement.terre.defense.gouv.fr 153 | search.zip2.com 154 | www.nievre.pref.gouv.fr 155 | www.contacts.impots.gouv.fr 156 | search.firstplace.com 157 | www.poitou-charentes.direccte.gouv.fr 158 | www.commentcamarche.net 159 | www.idf.direccte.gouv.fr 160 | www.ardennes.pref.gouv.fr 161 | www.pays-de-la-loire.direccte.gouv.fr 162 | www.mayotte.dieccte.gouv.fr 163 | experts-univers.com 164 | m.vosdroits.service-public.fr 165 | communaute.vie-publique.fr 166 | discours.vie-publique.fr 167 | interactif.service-public.fr 168 | pme.service-public.fr 169 | www.bourgogne.direccte.gouv.fr 170 | sciencespo.ladocumentationfrancaise.fr 171 | environnement-sante.com 172 | incredimailhosted.infospace.com 173 | mamma.infospace.com 174 | www.concours-civils.defense.gouv.fr 175 | www.concours-civils.sga.defense.gouv.fr 176 | www.leroustidou.com 177 | www.ri92.terre.defense.gouv.fr 178 | www.gites-erable-alsace.com 179 | www.formation.terre.defense.gouv.fr 180 | 90plan.ovh.net 181 | www.sante-environnement-travail.fr 182 | dmp.gouv.fr 183 | ladsetjockeys-lefilm.fr 184 | www.sante-environnement.fr 185 | www.topfouine.com 186 | www.basse-normandie.direccte.gouv.fr 187 | www.laubergine-eygalieres.com 188 | www.bretagne.direccte.gouv.fr 189 | www.meuse.pref.gouv.fr 190 | www.bilrif.defense.gouv.fr 191 | www.antoine.fr 192 | www.terredebruyere.com 193 | www.beghingroux.fr 194 | www.auberge-provencale.fr 195 | www.soirsdefetes.com 196 | www.telestock.fr 197 | sante-environnement.org 198 | www.chaletliotard.fr 199 | www.cars-la-populaire.com 200 | environnement-sante.org 201 | www.sermesdistribution.fr 202 | www.camping-la-pinede.com 203 | patisserieolivierbourau.com 204 | www.gourmets-events.com 205 | www.environnement-sante.net 206 | www.environnement-sante.fr 207 | www.alliancepavillons.org 208 | atelierfeesbrodeuses.fr 209 | www.dermophilindien-lab.com 210 | www.la-cabane-perchee.com 
211 | www.aeta-audio.com 212 | www.sahlm79.fr 213 | www.ba118.air.defense.gouv.fr 214 | www.cclinouest.com 215 | www.rg3.terre.defense.gouv.fr 216 | www.iserba.fr 217 | www.fantasyforest.fr 218 | www.televitale.fr 219 | www.serialproducteurs.com 220 | www.ville-saintdie.fr 221 | www.coiffure2010.com 222 | www.cehd.sga.defense.gouv.fr 223 | www.varini.org 224 | www.ain.pref.gouv.fr 225 | www.beauregard-hotel.com 226 | www.transports-bernard.com 227 | www.tattootatouage.com 228 | www.automobile2010.com 229 | www.eetaa722.air.defense.gouv.fr 230 | coiffure2008.com 231 | www.nettoyagebijoux.com 232 | www.stages.defense.gouv.fr 233 | coupe-de-cheveux-homme.com 234 | www.coupedecheveuxfemme.com 235 | www.ba901.air.defense.gouv.fr 236 | www.ba106.air.defense.gouv.fr 237 | www.ba120.air.defense.gouv.fr 238 | www.coiffure2008.com 239 | www.web200708.clarahost.fr 240 | www.beautedeco.com 241 | www.qcclick.com 242 | coiffure2009.com 243 | www.epa749.air.defense.gouv.fr 244 | www.vpgreen.fr 245 | www.bcsfreelance.com 246 | www.lechaletdumoulin.fr 247 | www.media.recrutement.terre.defense.gouv.fr 248 | www.photo-phore.com 249 | www.marocchezlhabitant.com 250 | www.industube.com 251 | www.georget.fr 252 | www.acrie.fr 253 | mobile.recrutement.terre.defense.gouv.fr 254 | www.ba942.air.defense.gouv.fr 255 | www.hotelsatlas.com 256 | www.pharmacie-de-lherm.fr 257 | www.rpmi.fr 258 | 87.106.4.168 259 | www.ba107.air.defense.gouv.fr 260 | www.enligne.recrutement.terre.defense.gouv.fr 261 | www.hotel-st-georges.com 262 | www.ville-challans.fr 263 | www.ba217.air.defense.gouv.fr 264 | www.airmobilite.air.defense.gouv.fr 265 | www.ba721.air.defense.gouv.fr 266 | www.palmiers-ocean.fr 267 | www.quellemutuelles.com 268 | www.cfas.air.defense.gouv.fr 269 | www.ba112.air.defense.gouv.fr 270 | www.cma-bareges.air.defense.gouv.fr 271 | www.da204.air.defense.gouv.fr 272 | ead.ent-etrs.net 273 | www.ent-etrs.net 274 | www.eppa.sante.defense.gouv.fr 275 | www.plasti-ouest.com 276 | pharmacieduvalsaintjean.e-officine.net 277 | www.cedimattp.fr 278 | www.machecoul.com 279 | www.tsr-be.com 280 | pharmaciecentralelens.e-officine.net 281 | www.reseauetudiant.com 282 | twitter-icon.com 283 | search.egreetings.com 284 | www.ado.justice.gouv.fr 285 | www.experatoo.com 286 | www.journaldunet.com 287 | www.annuaires.justice.gouv.fr 288 | www.coiffures2011.net 289 | www.saint-martin-de-sanzay.fr 290 | www.puregourmandise.com 291 | www.yatoshi.com 292 | www.techniques-transparentes.com 293 | vecteurdiffusion.com 294 | www.domaine-sainteleocadie.com 295 | www.lejulesverne-paris.com 296 | www.lewistrondheim.com 297 | arnaudfrichphoto.com 298 | www.cdad-lot.justice.fr 299 | www.cdad-manche.justice.fr 300 | www.metiers.justice.gouv.fr 301 | affinitiz.net 302 | www.alerte-enlevement.gouv.fr 303 | www.ca-paris.justice.fr 304 | www.ciao.fr 305 | www.rip.justice.fr 306 | www.ca-besancon.justice.fr 307 | www.fontainedemars.com 308 | www.ca-bourges.justice.fr 309 | www.cdad-cotedor.justice.fr 310 | www.ca-aixenprovence.justice.fr 311 | www.holiprom.com 312 | www.western-valley.fr 313 | www.infoceane.com 314 | www.bateaux-mouches.fr 315 | www.justice.gouv.fr 316 | www.alaindelorme.com 317 | avocats.fr 318 | anissaledorze.avocats.fr 319 | www.vos-droits.justice.gouv.fr 320 | cmonatelier.cultura.com 321 | isabelle.chevalier-dupont.avocats.fr 322 | reseau.avf.asso.fr 323 | www.ca-amiens.justice.fr 324 | www.boutique-clubdsk.fr 325 | www.noube.fr 326 | www.ca-chambery.justice.fr 327 | www.eng.justice.fr 328 | www.ca-versailles.justice.fr 329 | 
servirlafrance.com 330 | www.animalnature.fr 331 | reseaulia.com 332 | selli-vine.avocats.fr 333 | kityuko.42stores.com 334 | couturejihanny.42stores.com 335 | www.ca-angers.justice.fr 336 | www.setzaomi.com 337 | www.editions-infini.fr 338 | www.lineab1.fr 339 | corinegaudilliere.avocats.fr 340 | planete-volontaires.fr 341 | blogs.jardiner-malin.fr 342 | loisicrea.com 343 | www.cevennescaravanes.com 344 | www.colorme.ch 345 | affinitiz.com 346 | parentsindignes.42stores.com 347 | www.suite23.fr 348 | www.1bijoux2perles.fr 349 | www.mecaservice.com 350 | www.ptfp.fr 351 | www.nosfell.com 352 | cheminsblancs.com 353 | cubexar.com 354 | www.jetaide.com 355 | forum-centres-d-appels.com 356 | www.avocatforum.com 357 | jetaide.com 358 | www.manzi.be 359 | www.cabasse.com 360 | candyshop.42stores.com 361 | kits-n-scrap.42stores.com 362 | www.lecoinplaisir.com 363 | www.swingromaneacademie.com 364 | www.limprimeur.net 365 | www.fert-demolition.com 366 | www.eguiazabal.com 367 | www.chacunsonchemin.com 368 | www.normanniae.com 369 | www.ot-saverne.fr 370 | www.poleressources95.org 371 | 720plan.ovh.net 372 | www.ba116.air.defense.gouv.fr 373 | www.ypluthier.com 374 | marina-erbarossa.com 375 | www.lamy-diffusion.com 376 | www.ba125.air.defense.gouv.fr 377 | www.leganet.fr 378 | constat-huissier.net 379 | information-juridique.com 380 | famillesdavant.linternaute.com 381 | msn.ciao.fr 382 | ecran-de-veille.linternaute.com 383 | www.forum-entreprise.com 384 | www.cgv-expert.fr 385 | webcam.linternaute.com 386 | programme-tv.linternaute.com 387 | www.guyane.pref.gouv.fr 388 | www.conseil-juridique.net 389 | www.action-collective.com 390 | polardiagram.com 391 | encyclopedie.linternaute.com 392 | www.legavox.fr 393 | site.journaldunet.com 394 | www.juristudiant.com 395 | emploi.journaldunet.com 396 | juristudiant.com 397 | arwatch.org 398 | formation.journaldunet.com 399 | www.inpharma2000.ru 400 | www.yvelines.pref.gouv.fr 401 | ms.ciao.fr 402 | www.veille-reputation.com 403 | www.finistere.pref.gouv.fr 404 | www.sarthe.pref.gouv.fr 405 | www.twitter-icon.com 406 | www.sarthe.gouv.fr 407 | photos.linternaute.com 408 | societe.journaldunet.com 409 | www.portail-mystique.fr 410 | www.moselle.pref.gouv.fr 411 | alavoileblanche.com 412 | piecemontee.com 413 | www.albifun.com 414 | www.urlidea.com 415 | www.guadeloupe.pref.gouv.fr 416 | dhammadana.fr 417 | www.sante-environnement.com 418 | www.escale-wellness.be 419 | www.markosweb.com 420 | www.aquitaine.pref.gouv.fr 421 | www.mc-franquevielle.fr 422 | www.domaine-de-marseillens.com 423 | www.ardeche.pref.gouv.fr 424 | www.lot.pref.gouv.fr 425 | www.charente.pref.gouv.fr 426 | www.indre-et-loire.pref.gouv.fr 427 | www.loiret.pref.gouv.fr 428 | www.motards-idf.fr 429 | www.indre.pref.gouv.fr 430 | www.mjdatabank.com 431 | www.zsysteme.com 432 | www.lemanoir39.com 433 | www.hotel-les-pyrenees.com 434 | www.droitsenfant.com 435 | annecybonlieuhotel.fr 436 | www.manche.pref.gouv.fr 437 | galeriedu7eme.com 438 | www.assoprairieland.com 439 | www.lexilogos.com 440 | www.preparation-physique.net 441 | www.theoutlaw.fr 442 | www.bill-looking.fr 443 | www.landes.pref.gouv.fr 444 | www.aigrehandball.fr 445 | www.iletaitunevoix.org 446 | www.jura.pref.gouv.fr 447 | www.jm-planchon.fr 448 | www.campingchadeyron.com 449 | www.fruirouge.fr 450 | www.campingcassis.com 451 | www.evretz.fr 452 | www.contespedagogiques.be 453 | www.lazare-et-vespucci.com 454 | www.randoleiesclops.fr 455 | www.braccomotos.com 456 | www.hugme.fr 457 | mondolatino.fr 458 | www.pkma.eu 459 | 
www.photos-allain-mousset.fr 460 | unamourdeuxperles.com 461 | www.vaccination-h1n1.moselle.pref.gouv.fr 462 | jardinvoyageur.com 463 | seine-saint-denis.gouv.fr 464 | www.auvergne.pref.gouv.fr 465 | mobile.hauts-de-seine.gouv.fr 466 | www.pfrh.lorraine.pref.gouv.fr 467 | www.srias.lorraine.pref.gouv.fr 468 | paysages.mayenne.pref.gouv.fr 469 | www.risquesmajeurs-hautes-pyrenees.pref.gouv.fr 470 | 208.76.50.76 471 | ddrm.mayotte.pref.gouv.fr 472 | lot-et-garonne.gouv.fr 473 | www.ppol-taxi.interieur.gouv.fr 474 | nasdaq.infospace.com 475 | www.prse.lorraine.gouv.fr 476 | www.haute-savoie.pref.gouv.fr 477 | www.cakechloes.com 478 | www.languedoc-roussillon.pref.gouv.fr 479 | aveyron.gouv.fr 480 | old.pyrenees-atlantiques.pref.gouv.fr 481 | www.finistere.gouv.fr 482 | www.seine-saint-denis.pref.gouv.fr 483 | www.lorraine.pref.gouv.fr 484 | www.charente-maritime.pref.gouv.fr 485 | www.finances.gouv.fr 486 | laboratoirecentral.interieur.gouv.fr 487 | sas.sante.gouv.fr 488 | yvelines.pref.gouv.fr 489 | www.recherche-biomedicale.sante.gouv.fr 490 | www.datar.gouv.fr 491 | www.lenotre.culture.gouv.fr 492 | www.sanglier5767.com 493 | portailmoselle.dims.fr 494 | baignades.sante.gouv.fr 495 | agriculture.gouv.fr 496 | www.moselle.gouv.fr 497 | voiceillusion.com 498 | ddaf.ain.pref.gouv.fr 499 | www.pref93.pref.gouv.fr 500 | www.srcae.lorraine.gouv.fr 501 | www.diplomatie.gouv.fr 502 | www.economie.gouv.fr 503 | www.developpement-durable.gouv.fr 504 | en.palmiers-ocean.fr 505 | www.sae-diffusion.sante.gouv.fr 506 | www.ddjs-haute-savoie.jeunesse-sports.gouv.fr 507 | www.ile-de-france.sante.gouv.fr 508 | www.coupesdecheveux2011.net 509 | www.eure.sit.gouv.fr 510 | archives.livreblancdefenseetsecurite.gouv.fr 511 | www.oncfs.gouv.fr 512 | www.yonne.sit.gouv.fr 513 | www.internet.gouv.fr 514 | www.impots.gouv.fr 515 | forum.webmaster-rank.info 516 | www2.impots.gouv.fr 517 | direccte.gouv.fr 518 | www.immigration.gouv.fr 519 | www.hcst.fr 520 | www.drees.sante.gouv.fr 521 |
--------------------------------------------------------------------------------
/processing/metadataextract.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | from subprocess import Popen, PIPE
4 | import threading
5 | import pymongo
6 | from pymongo import MongoClient
7 | import simplejson
8 | import HTMLParser
9 | class metadataextract(threading.Thread):
10 |     def __init__(self,scriptjs,db,domaine,url):
11 |         threading.Thread.__init__(self)
12 |         self.result=[]
13 |         self.domaine=domaine
14 |         self.scriptjs=scriptjs
15 |         self.url=url
16 |         self.connection= MongoClient(host='localhost', port=27017)  # the database is selected on the next line
17 |         self.db=self.connection[db]
18 | 
19 |     def run(self):
20 |         result=subprocess.Popen(['casperjs',self.scriptjs,self.url],stdout=PIPE)
21 |         meta=''
22 |         contents=[]
23 | 
24 |         for ligne in result.stdout:
25 |             meta=meta+ligne
26 | 
27 |         try:
28 |             data = simplejson.loads(meta)
29 |             #print data
30 |             print len(data)
31 |             if len(data) > 0:
32 |                 print data
33 |                 for content in data:
34 |                     contents.append(content['content'])
35 | 
36 |                 meta=' '.join(contents)
37 |                 print meta
38 |                 if len(meta) >0:
39 |                     h = HTMLParser.HTMLParser()
40 |                     print h.unescape(meta)
41 |                     value_db={'domaine':self.domaine,'meta':h.unescape(meta)}
42 |                     self.db.metadatas.save(value_db)
43 |         except ValueError:
44 |             print 'Encoding error: '+ meta
45 | 
--------------------------------------------------------------------------------
/processing/metadataextract.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/processing/metadataextract.pyc
--------------------------------------------------------------------------------
/scanners/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/scanners/__init__.py
--------------------------------------------------------------------------------
/scanners/networks.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on 14 May 2014
3 | 
4 | @author: slarinier
5 | '''
6 | from libnmap.parser import NmapParser
7 | from libnmap.process import NmapProcess
8 | 
9 | class Networks(object):
10 |     '''
11 |     classdocs
12 |     '''
13 | 
14 | 
15 |     def __init__(self, targets,options):
16 |         self.nmap=NmapProcess(targets,options)
17 |     def run(self):
18 |         self.nmap.run()
19 | 
20 |     def make_report(self):
21 |         report=NmapParser.parse(self.nmap.stdout)
22 |         result=[]
23 |         for host in report.hosts:
24 |             temp={}
25 |             print host
26 |             print host.scripts_results
27 |             temp['ip']=host.ipv4
28 |             print [(service.state,service.port,service.scripts_results) for service in host.services]
29 | #            for service in host.services:
30 | #                for k in service.scripts_results.keys():
31 | #                    if '.' in k:  # MongoDB field names cannot contain dots
32 | #                        v=service.scripts_results[k]
33 | #                        del service.scripts_results[k]
34 | #                        service.scripts_results[k.replace('.','_')]=v
35 |             temp['services']=[(service.state,service.port,service.scripts_results) for service in host.services]
36 |             result.append(temp)
37 |         return result
38 |     def record_report(self,records,cache,coll):
39 |         for r in records:
40 |             doc=cache[r['ip']]
41 |             doc['service']=r
42 |             try:
43 |                 coll.save(doc)
44 |             except Exception:
45 |                 print doc
--------------------------------------------------------------------------------
/screenshots/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/screenshots/__init__.py
--------------------------------------------------------------------------------
/screenshots/make_screenshots.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import screenshots
3 | import sys
4 | import threading
5 | 
6 | file_list_websites=sys.argv[1]
7 | jsfile=sys.argv[2]
8 | emplacement=sys.argv[3]
9 | threadpool=sys.argv[4]
10 | domaines=[]
11 | main_thread = threading.currentThread()
12 | with open(file_list_websites,'r') as fr:
13 |     for ligne in fr:
14 |         domaines.append(ligne.strip())  # strip the newline regardless of line ending
15 | print domaines
16 | i=0
17 | for domaine in domaines:
18 |     i+=1
19 |     screen=screenshots.Screenshots(domaines,jsfile,emplacement,domaine)
20 |     screen.start()
21 |     if i % int(threadpool) == 0:  # join the current batch of screenshot threads
22 |         for t in threading.enumerate():
23 |             if t is not main_thread:
24 |                 t.join()
25 | 
26 | 
--------------------------------------------------------------------------------
/screenshots/screenshots.js:
--------------------------------------------------------------------------------
1 | var casper = require('casper').create({
2 | 
3 | })
4 | , terms = casper.cli.get(0),url=casper.cli.get(1),emplacement=casper.cli.get(2),i=0
5 | casper.start(url, function() {
6 |     this.capture(emplacement+'/'+terms+'.png', {
7 |         top: 10,
8 |         left: 10,
9 |         width: 1024,
10 |         height: 768
11 |     },12000);
12 | });
13 | 
14 | casper.run()
15 | 
--------------------------------------------------------------------------------
/screenshots/screenshots.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Apr 30 12:24:14 2012
4 | 
5 | @author: slarinier
6 | """
7 | import subprocess
8 | from subprocess import Popen, PIPE
9 | import threading
10 | import time
11 | 
12 | class Screenshots(threading.Thread):
13 |     def __init__(self,listofwebsites,jsfile,location,website):
14 |         self.listofwebsites=listofwebsites
15 |         self.jsfile=jsfile
16 |         self.location=location
17 |         self.website=website
18 |         threading.Thread.__init__(self)
19 | 
20 | 
21 |     def run(self):
22 |         cmd='casperjs '+self.jsfile+' '+self.website +' http://'+self.website +' '+self.location+' --web-security=no'
23 |         args=cmd.split()
24 |         result=subprocess.Popen(args,stdout=PIPE)
25 |         print "Make screenshots: "+self.website
26 |         time.sleep(3)
27 | 
28 | 
--------------------------------------------------------------------------------
/storage/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/storage/redis_record.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Apr 24 15:30:33 2013
4 | 
5 | @author: slarinier
6 | """
7 | import redis
8 | 
9 | class RedisRecord(object):
10 | 
11 |     def __init__(self,host='localhost',port=6379,db=1):
12 |         pool=redis.ConnectionPool(host=host,port=port,db=db)
13 |         self.r=redis.Redis(connection_pool=pool)
14 |         self.processus_tab=[]
15 |     def delete(self,key):
16 |         self.r.delete(key)
17 |     def get(self,key):
18 |         return self.r.get(key)
19 |     def put(self,key,value):
20 |         self.r.set(key,value)
21 |     def init(self,dbs):
22 |         for i in dbs:
23 |             self.flushdb(i)
24 |     def flushdb(self,db_value):
25 |         self.switchDB(db_value)
26 |         self.r.flushdb()
27 |     def rpush(self,listvalue,item):
28 |         self.r.rpush(listvalue,item)
29 |     def rpop(self,listvalue):
30 |         return self.r.rpop(listvalue)
31 |     def switchDB(self,db,host='localhost',port=6379):
32 |         pool=redis.ConnectionPool(host=host,port=port,db=db)
33 |         self.r=redis.Redis(connection_pool=pool)
34 |     def currentDB(self):
35 |         return self.r.connection_pool.connection_kwargs['db']  # read the db index without checking out a pooled connection
--------------------------------------------------------------------------------
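A minimal usage sketch of the RedisRecord wrapper above, assuming a Redis server is reachable on localhost:6379; the key and list names are illustrative only and not part of the project:

    # -*- coding: utf-8 -*-
    from storage.redis_record import RedisRecord

    store = RedisRecord(host='localhost', port=6379, db=1)
    store.put('last_run', '2014-05-14')                  # simple key/value
    print store.get('last_run')
    store.rpush('domains_to_screenshot', 'example.com')  # queue a domain for a worker
    print store.rpop('domains_to_screenshot')
    store.delete('last_run')
    print store.currentDB()                              # index of the currently selected Redis db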