├── .gitignore
├── README.md
├── __init__.py
├── actions.py
├── geoloc_by_domain.py
├── geolocatisation
│   ├── GeoLiteCity.dat
│   ├── __init__.py
│   ├── dschield.py
│   ├── geolocalisation.py
│   └── result.txt
├── harvesting
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── bingsearch.js
│   ├── content.py
│   ├── content_search.py
│   ├── crawler.py
│   ├── dynamic.js
│   ├── filters.py
│   ├── googlesearch.js
│   ├── keywords
│   ├── metaextract.js
│   ├── pastebin.js
│   ├── pastebin.py
│   ├── pastebinExtract.py
│   ├── pastebintest.py
│   ├── pastebintext.js
│   ├── pholcidae.py
│   ├── random_user_agent.py
│   ├── search.py
│   ├── user_agents
│   ├── white_list.py
│   └── yahoosearch.js
├── history
│   ├── __init__.py
│   └── history.py
├── main.py
├── mongodb
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── mongodb.py
│   └── mongodb.pyc
├── network
│   ├── IPy.py
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── make_networks.py
│   ├── networks.py
│   ├── networks.pyc
│   └── search_on_network.py
├── processing
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── bulk.py
│   ├── categoryze_result.py
│   ├── clean_db.py
│   ├── compare.py
│   ├── create_request.py
│   ├── create_result.py
│   ├── createcorpus.py
│   ├── dnstree.py
│   ├── filters.py
│   ├── filters.pyc
│   ├── gouv.log
│   ├── gouv_domaine.txt
│   ├── gouv_metadatas.txt
│   ├── metadataextract.py
│   └── metadataextract.pyc
├── scanners
│   ├── __init__.py
│   └── networks.py
├── screenshots
│   ├── __init__.py
│   ├── make_screenshots.py
│   ├── screenshots.js
│   └── screenshots.py
└── storage
    ├── __init__.py
    └── redis_record.py
/.gitignore:
--------------------------------------------------------------------------------
1 | #compiled file
2 | *.pyc
3 | #log
4 | *.log
5 | #screen
6 | *.png
7 | #data
8 | *.csv
9 | .project
10 | .settings/org.eclipse.ltk.core.refactoring.prefs
11 | .pydevproject
12 | *.txt
13 | *.tar.gz
14 | *.zip
15 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/README.md
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/__init__.py
--------------------------------------------------------------------------------
/actions.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Feb 1, 2013
3 |
4 | @author: slarinier
5 | '''
6 |
7 | from libnmap.parser import NmapParser
8 | from libnmap.process import NmapProcess
9 | import pymongo
10 | from pymongo import MongoClient
11 | import threading
12 |
13 | from harvesting import search
14 | from harvesting.crawler import Record, CrawlerThread
15 | import mongodb
16 | from network import make_networks, networks
17 | from network.IPy import IP
18 | from processing import metadataextract
19 | from processing.clean_db import Cleandb
20 | from processing.create_result import Create_Result
21 | from processing.dnstree import DNSTree
22 | from screenshots.screenshots import Screenshots
23 | from scanners.networks import Networks
24 |
25 | class Actions(object):
26 | '''
27 | classdocs
28 | '''
29 | def __init__(self, db_value):
30 | self.db_value = db_value
31 | connection = MongoClient(host='localhost', port=27017)
32 | self.db = connection[db_value]
33 |
34 | def create_network(self):
35 | network=make_networks.make_networks('localhost', self.db_value)
36 | network.createNetworks('new_domaines')
37 | network.exportFile(self.db_value+'_network.log')
38 |
39 | def create_result(self,collection,criteria):
40 | createResult=Create_Result(self.db_value,criteria)
41 | if collection=='scanners':
42 | createResult.processScanners(collection)
43 | return
44 | createResult.process(collection)
45 |
46 | def metasearch(self,criteria,scriptsJS,geoloc):
47 | print "########### Meta Search ###########"
48 | main_thread = threading.currentThread()
49 | thread_pool=[]
50 | for criterius in criteria:
51 | for script in scriptsJS:
52 | gs=search.search(100,criterius,script,self.db_value)
53 | gs.start()
54 | thread_pool.append(gs)
55 | for t in thread_pool:
56 | t.join()
57 | for t in thread_pool:
58 | t.record()
59 | print "########### Search terminated ###########"
60 |
61 | print "########### Resolve IP ############"
62 | networks.resolve(geoloc,self.db_value)
63 |
64 | def search_ip(self,geoloc,scriptsJS,ip_range):
65 | main_thread = threading.currentThread()
66 | print "########### Search by IP ###########"
67 | ips=[]
68 | domaines=self.db.new_domaines.find()
69 | thread_pool=[]
70 | cache={}
71 | for domaine in domaines:
72 | try:
73 | ips.append(domaine['ip'])
74 |
75 | except KeyError:
76 | print domaine
77 | i=0
78 |         print 'IPs are: ' + str(ips)
79 | ip_to_add=[]
80 | if ip_range:
81 | ip_to_add=[str(x) for x in IP(ip_range)]
82 | ips[len(ips):]=ip_to_add
83 | for ip in set(ips):
84 | if ip != '0.0.0.0':
85 | i+=1
86 | gs=search.search(20,'ip:'+str(ip),scriptsJS[1],self.db_value)
87 | gs.start()
88 | thread_pool.append(gs)
89 | if i % 10 ==0:
90 | for t in thread_pool:
91 | t.join()
92 | for t in thread_pool:
93 | t.record()
94 | print "########### Search terminated ###########"
95 | print "########### Search by network ###########"
96 |
97 | print "########### Resolve IP ############"
98 | networks.resolve(geoloc,self.db_value)
99 |
100 | def scan_network(self):
101 | pass
102 | def scan_nmap(self,ip_range,options):
103 | ips=[]
104 | domaines=self.db.new_domaines.find()
105 | thread_pool=[]
106 | cache={}
107 | for domaine in domaines:
108 | try:
109 | ips.append(domaine['ip'])
110 | cache[domaine['ip']]=domaine
111 | except KeyError:
112 | print domaine
113 | net=Networks(list(set(ips)),options)
114 | net.run()
115 | report=net.make_report()
116 | #net.record_report(report,cache,self.db.new_domaines)
117 | pass
118 | def screenshots(self,db_value,threadpool):
119 | connection= MongoClient(host='localhost', port=27017)
120 | db=connection[db_value]
121 | domaines=db.new_domaines.distinct('domaine')
122 | i=0
123 | main_thread = threading.currentThread()
124 | threadpools=[]
125 |         print "Taking " + str(len(domaines)) + " screenshots"
126 | for domaine in domaines:
127 | i+=1
128 | screen=Screenshots(domaines, 'screenshots/screenshots.js', 'screenshots/screenshots/'+db_value, domaine)
129 | screen.start()
130 | threadpools.append(screen)
131 | if i % int(threadpool)== 0:
132 | for t in threadpools:
133 | t.join()
134 |
135 |     def metadata_extract(self, db):
136 | main_thread = threading.currentThread()
137 | print "########## Meta Data IP ##########"
138 | mdb=mongodb.mongodb('localhost',27017,db)
139 | i=0
140 |
141 | for domaine in mdb.selectall('new_domaines'):
142 | i+=1
143 | url=domaine['url']
144 | domaine_value=domaine['domaine']
145 | print url
146 | if not 'meta' in domaine:
147 | domaine['meta']='ok'
148 | mtd=metadataextract.metadataextract('harvesting/metaextract.js',db,domaine_value,url)
149 | mtd.start()
150 | if i % 30==0:
151 | for t in threading.enumerate():
152 | if t is not main_thread:
153 | t.join(2)
154 |
155 | def dnstree(self,db_value):
156 | dnst=DNSTree(db_value)
157 | dnst.process()
158 |
159 | def crawl(self,list_domains):
160 | main_thread = threading.currentThread()
161 | #domaines=self.db.new_domaines.distinct('domaine')
162 | domains=list_domains.split(',')
163 | threadpool=[]
164 | lock=threading.Lock()
165 | rec=Record(self.db_value,lock)
166 | rec.start()
167 | i=0
168 | for domain in domains:
169 | i=i+1
170 | cw=CrawlerThread(domain,self.db,lock)
171 |             cw.start()
172 |             threadpool.append(cw)
173 | if i % 5==0:
174 | for t in threading.enumerate():
175 | if t is not main_thread:
176 | t.join(2)
177 | stop=True
178 |
179 | while(stop):
180 | for t in threadpool:
181 |                 if not t.isAlive():
182 | threadpool.remove(t)
183 | if len(threadpool)==0:
184 | stop=False
185 |
186 | def clean_db(self,pathfilters):
187 | print "#####Clean DB####"
188 | directory = "screenshots/screenshots/"+self.db_value
189 | filters=[]
190 | with open(pathfilters,'r') as fw:
191 | for ligne in fw:
192 | filters.append(ligne.strip())
193 | cl=Cleandb(self.db_value, directory, filters)
194 | cl.clean()
195 |
196 | def reset(self):
197 |
198 | for domaine in self.db.new_domaines.find():
199 | domaine['meta']=None
200 |             self.db.new_domaines.save(domaine)
201 |
202 | def init(self,db,coll,attrib):
203 |
204 | self.db.create_collection(coll)
205 | self.db[coll].ensure_index([(attrib,pymongo.ASCENDING)],unique=True)
206 |
--------------------------------------------------------------------------------
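
Usage sketch for actions.py: Actions wires the harvesting, network and processing packages together, and main.py only parses arguments before dispatching to it. A minimal sketch of driving it directly, assuming a local MongoDB on port 27017; the database name and criteria are placeholders, and geoloc is forwarded untouched to networks.resolve(), exactly as main.py does:

    # hypothetical driver mirroring main.py's 'metasearch' branch
    from actions import Actions

    scriptsJS = ['harvesting/googlesearch.js',
                 'harvesting/bingsearch.js',
                 'harvesting/yahoosearch.js']

    act = Actions('osint_demo')          # placeholder database name
    act.metasearch(['casperjs', 'phantomjs'], scriptsJS, '')
    act.dnstree('osint_demo')
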
/geoloc_by_domain.py:
--------------------------------------------------------------------------------
1 | from network import networks
2 | import argparse
3 | import sys
4 | from geolocatisation import dschield
5 |
6 | parser = argparse.ArgumentParser(description='Geolocalisation by domains')
7 | parser.add_argument('--domaine', dest='fqdn',help='make a fqdn for geolocalisation')
8 | parser.add_argument('--filename',dest='list_domaine')
9 | parser.add_argument('--geoloc_file',dest='geoloc_file')
10 | parser.add_argument('--resolve_dns',dest='resolve_dns')
11 | parser.add_argument('--geoloc_country',dest='geoloc_country')
12 | parser.add_argument('--outfile',dest='outfile')
13 |
14 | args=parser.parse_args()
15 | domaines=[]
16 | geoloc=[]
17 | geoloc_country=False
18 | geoloc_file=False
19 | if args.fqdn != None:
20 | domaines=[args.fqdn]
21 | if args.list_domaine != None:
22 |     print "Reading domain list"
23 | with open(args.list_domaine,'r') as fr:
24 | for ligne in fr:
25 | domaines.append(ligne.strip())
26 | if args.geoloc_file != None:
27 |     print "Loading geolocalisation data"
28 | geoloc_file=True
29 | if args.geoloc_file == None:
30 | parser.print_help()
31 | sys.exit(-1)
32 | print "geoloc"
33 |
34 | if args.geoloc_country:
35 | print "Geolocalisation country ok"
36 | geoloc_country=True
37 | domaines=list(set(domaines))
38 | print "Domaines list: "+str(len(domaines))
39 | for domaine in domaines:
40 | ip='0.0.0.0'
41 | ip=networks.resolve_dns(domaine)
42 | if ip != None:
43 | temp=ip+','+domaine
44 | if geoloc_file == True:
45 | geo=networks.geolocIP(args.geoloc_file,ip)
46 | country=networks.geolocCountry(args.geoloc_file,ip)
47 | if country:
48 | temp=temp+','+country
49 | if geo:
50 | temp=temp+','+geo
51 | if geoloc_country ==True:
52 | ds=dschield.dschield('http://dshield.org/ipinfo_ascii.html?ip=')
53 | ip,country,asname,network=ds.response(ip)
54 | temp=temp+','+country
55 | print temp
56 | geoloc.append(temp)
57 | else:
58 | geoloc.append('DNS Failure: '+domaine)
59 | if args.outfile != None:
60 | with open(args.outfile,'w') as fw:
61 | for ligne in geoloc:
62 | fw.write(ligne+'\n')
63 |
64 |
65 |
--------------------------------------------------------------------------------
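
Usage sketch for geoloc_by_domain.py: per domain, the script boils down to networks.resolve_dns() followed by networks.geolocIP()/geolocCountry() against the --geoloc_file database (the bundled geolocatisation/GeoLiteCity.dat is the natural candidate) and, optionally, a dshield lookup. A condensed sketch for one FQDN, with a placeholder domain:

    from network import networks

    domaine = 'example.org'                              # placeholder FQDN
    geoloc_file = 'geolocatisation/GeoLiteCity.dat'      # assumed GeoLite path
    ip = networks.resolve_dns(domaine)
    if ip is not None:
        country = networks.geolocCountry(geoloc_file, ip)
        geo = networks.geolocIP(geoloc_file, ip)
        print ip + ',' + domaine + ',' + str(country) + ',' + str(geo)
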
/geolocatisation/GeoLiteCity.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/geolocatisation/GeoLiteCity.dat
--------------------------------------------------------------------------------
/geolocatisation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/geolocatisation/__init__.py
--------------------------------------------------------------------------------
/geolocatisation/dschield.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | import re
3 | class dschield(object):
4 |
5 | def __init__(self,url):
6 | self.url=url
7 |
8 | def response(self,ip):
9 | dschieldContent=urllib2.urlopen(self.url+ip)
10 | value=dschieldContent.read()
11 |         pattern = r'country= (\w+)'
12 |
13 |         reg = re.compile(pattern)
14 |         m = reg.search(value)
15 |         country = ''
16 |         if m:
17 |             country = m.group(1)
18 |         pattern = r'asname= (.+)'
19 |         reg = re.compile(pattern)
20 |         m = reg.search(value)
21 |         asname = ''
22 |         if m:
23 |             asname = m.group(1)
24 |         pattern = r'network= (.+)'
25 |         reg = re.compile(pattern)
26 | m = reg.search(value)
27 | network=''
28 | if m:
29 | network=m.group(1)
30 | network=network.split(' ')[0]
31 | if country != '' and asname !='' and network !='':
32 | return (ip,country,asname,network)
33 | return ('127.0.0.1','mars','alien','nothing')
34 |
--------------------------------------------------------------------------------
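
Usage sketch for dschield.py, matching the call in geoloc_by_domain.py: response() queries dshield.org and returns an (ip, country, asname, network) tuple, falling back to the placeholder tuple when nothing can be parsed:

    from geolocatisation import dschield

    ds = dschield.dschield('http://dshield.org/ipinfo_ascii.html?ip=')
    ip, country, asname, network = ds.response('8.8.8.8')    # any public IP
    print country, asname, network
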
/geolocatisation/geolocalisation.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Feb 20, 2013
3 |
4 | @author: slarinier
5 | '''
6 | from pymongo.connection import Connection
7 |
8 | class Geolocalisation(object):
9 | '''
10 | classdocs
11 | '''
12 |
13 |
14 | def __init__(self,list_domaine,db_value):
15 | '''
16 | Constructor
17 | '''
18 |         self.list_domaine = list_domaine
19 |         self.db_value = db_value
20 |
21 |     def geolochoffline(self):
22 |         pass
23 |
24 |     def geolocOnline(self):
25 |         pass
26 |
27 |
--------------------------------------------------------------------------------
/geolocatisation/result.txt:
--------------------------------------------------------------------------------
1 | 31.184.244.9,onlinetracksz.net,24.0_54.0,RU
2 | 31.184.244.9,httpsites.org,24.0_54.0,RU
3 | 31.184.244.9,onlinegreencm.org,24.0_54.0,RU
4 | 31.184.244.9,onlinegiigii.com,24.0_54.0,RU
5 | 31.184.244.9,onlinefishmw3bid.net,24.0_54.0,RU
6 | 31.184.244.9,onlineliverss.org,24.0_54.0,RU
7 | 31.184.244.9,onlinemooviii.com,24.0_54.0,RU
8 | 31.184.244.9,onlinegiigii.net,24.0_54.0,RU
9 | 62.109.12.39,62.109.12.39,55.7522_37.6156,RU
10 | 31.184.244.9,httpsites.net,24.0_54.0,RU
11 | DNS Failure: sauth-yandex.ru
12 | 31.184.244.9,onlinepainrs.com,24.0_54.0,RU
13 | 31.184.244.9,onlinegreenguide.com,24.0_54.0,RU
14 | 31.184.244.9,onlinepainrs.net,24.0_54.0,RU
15 | 31.184.244.9,onlineliververs.net,24.0_54.0,RU
16 | 31.184.244.9,online-moo-viii.net,24.0_54.0,RU
17 | 31.184.244.9,onlinemaris.com,24.0_54.0,RU
18 | 31.184.244.9,onlinegreenguide.net,24.0_54.0,RU
19 | 31.184.244.9,httpblogs.com,24.0_54.0,RU
20 | 31.184.244.9,onlinecodmw3buy.net,24.0_54.0,RU
21 | 31.184.244.9,onlinemaris.net,24.0_54.0,RU
22 | 31.184.244.9,onlinemooviii.net,24.0_54.0,RU
23 | 173.45.252.44,oase2.net,38.6446_-90.2533,US
24 | 92.63.106.133,www.money-yanbex.ru,60.0_100.0,RU
25 | 31.184.244.9,31.184.244.9,24.0_54.0,RU
26 | 31.184.244.219,onlinemoneysstock.org,24.0_54.0,RU
27 | 31.184.244.219,onlinefundsgoods.org,24.0_54.0,RU
28 | 31.184.244.219,livemoneysgoods.org,24.0_54.0,RU
29 | 31.184.244.219,onlineincomegoods.org,24.0_54.0,RU
30 | DNS Failure: newdomeninfo.info
31 | 31.184.244.9,onlineliververs.com,24.0_54.0,RU
32 | 31.184.244.9,onlineliverss.com,24.0_54.0,RU
33 | 31.184.244.9,onlineliverss.net,24.0_54.0,RU
34 | DNS Failure: onlinecashsstt.org
35 | DNS Failure: internetmoneysstt.org
36 | 69.43.161.151,moneyinternetlovesff.info,-27.0_133.0,US
37 | 31.184.244.219,livewindowsxpf4.info,24.0_54.0,RU
38 | 31.184.244.219,onlinewinsphonessite.org,24.0_54.0,RU
39 | 141.8.224.162,webstockcwo.info,47.0_8.0,CH
40 | DNS Failure: internetwindowslive.info
41 | 31.184.244.219,theonlinewinsphones.org,24.0_54.0,RU
42 | 31.184.244.219,webwindowsproc.info,24.0_54.0,RU
43 | 31.184.244.219,internetwindowslows.com,24.0_54.0,RU
44 | DNS Failure: moneydigitallovesff.info
45 | 31.184.244.219,internet-wins-phones.org,24.0_54.0,RU
46 | 31.184.244.219,livewindowsproc.info,24.0_54.0,RU
47 | 31.184.244.219,onlinewindowsxpf4site.info,24.0_54.0,RU
48 | 31.184.244.219,webwindowslows.com,24.0_54.0,RU
49 | 31.184.244.219,webbuildingstore.info,24.0_54.0,RU
50 | DNS Failure: livemoneysstt.org
51 | DNS Failure: moneylivelovesff.info
52 | 69.43.161.161,stockonlinelovesff.info,-27.0_133.0,US
53 | 69.43.161.156,moneyweblovesff.info,-27.0_133.0,US
54 | 31.184.244.219,digitalwindowsproc.info,24.0_54.0,RU
55 | DNS Failure: cashonlinelovesff.info
56 | 31.184.244.219,onlinemoneyssuv.info,24.0_54.0,RU
57 | 31.184.244.219,onlinemicrosoftproc.info,24.0_54.0,RU
58 | 31.184.244.219,onlinewindowsxpf4s.info,24.0_54.0,RU
59 | 69.43.161.161,dollaronlinelovesff.info,-27.0_133.0,US
60 | 31.184.244.219,digitalwinsphones.org,24.0_54.0,RU
61 | 62.109.23.82,l2-pantheon.ru,59.8944_30.2642,RU
62 | 31.184.244.219,onlinefinanses2f.info,24.0_54.0,RU
63 | 141.8.224.162,internetstockcwo.info,47.0_8.0,CH
64 |
--------------------------------------------------------------------------------
/harvesting/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/harvesting/__init__.py
--------------------------------------------------------------------------------
/harvesting/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/harvesting/__init__.pyc
--------------------------------------------------------------------------------
/harvesting/bingsearch.js:
--------------------------------------------------------------------------------
1 | var links = [];
2 | var casper = require('casper').create();
3 | var padding=casper.cli.get(0)
4 | var criteria=casper.cli.get(1)
5 | function getLinks() {
6 |
7 | var links = document.querySelectorAll('h2 a')
8 | return Array.prototype.map.call(links, function(e) {
9 | return e.getAttribute('href')
10 | });
11 | }
12 |
13 |
14 | casper.start();
15 |
16 | casper.open('http://www.bing.com/search?q='+criteria+'&go=&qs=ds&filt=all&first='+padding+'&FORM=PERE')
17 | casper.then(function() {
18 |     // collect the result links from the Bing results page
19 |
20 |
21 |     links = this.evaluate(getLinks);
22 |
23 |
24 | });
25 |
26 |
27 |
28 | casper.run(function() {
29 | // echo results in some pretty fashion
30 | this.echo(links.length + ' links found:');
31 | this.echo(' - ' + links.join('\n - ')).exit();
32 | });
33 |
--------------------------------------------------------------------------------
/harvesting/content.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 1, 2012
3 |
4 | @author: slarinier
5 | '''
6 | import re
7 | from content_search import Content_search
8 |
9 | class Content(object):
10 | '''
11 | classdocs
12 | '''
13 | _instance = None
14 | def __new__(cls, *args, **kwargs):
15 | if not cls._instance:
16 | cls._instance = super(Content, cls).__new__(cls, *args, **kwargs)
17 | return cls._instance
18 |
19 | def __init__(self,filetoload='keywords'):
20 | '''
21 | Constructor
22 | '''
23 | self.filetoload=filetoload
24 | self.keywords=[]
25 | with open(self.filetoload,'r') as fr:
26 | for ligne in fr:
27 | self.keywords.append(ligne.strip())
28 |
29 | def analyse(self,ligne):
30 | if ligne.find('&') != -1:
31 | return 'keywords_and'
32 | else :
33 | return 'keyword_only'
34 |
35 | def search(self,keyword,data):
36 | action=self.analyse(keyword)
37 | cs = Content_search(action,data)
38 | find=getattr(cs, action)(keyword)
39 | return find
40 |
41 |
42 |
--------------------------------------------------------------------------------
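
Usage sketch for content.py: Content is a singleton keyed on a keywords file (one keyword per line, with 'a&b' meaning both terms must be present, as in harvesting/keywords), and search() dispatches to Content_search.keyword_only or keywords_and accordingly. Paths are assumed relative to the repository root:

    from harvesting.content import Content

    content = Content('harvesting/keywords')
    data = 'login: user password: secret'
    for keyword in content.keywords:
        if content.search(keyword, data.lower()):
            print 'matched: ' + keyword
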
/harvesting/content_search.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 2, 2012
3 |
4 | @author: slarinier
5 | '''
6 | import re
7 | class Content_search(object):
8 | '''
9 | classdocs
10 | '''
11 |
12 |
13 | def __init__(self,action,data):
14 | '''
15 | Constructor
16 | '''
17 | self.action=action
18 | self.data=data
19 |
20 | def keyword_only(self,keyword):
21 | tokens=re.findall(keyword, self.data)
22 | if len(tokens) > 0:
23 | return True
24 | return False
25 |
26 | def keywords_and(self,keywords):
27 | keywords=keywords.split('&')
28 |
29 | for keyword in keywords:
30 | if self.keyword_only(keyword) == False:
31 | return False
32 | return True
33 |
34 |
35 |
--------------------------------------------------------------------------------
/harvesting/crawler.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jan 7, 2013
3 |
4 | @author: slarinier
5 | '''
6 | from selenium import webdriver
7 | from pymongo import MongoClient
8 | from threading import Thread
9 | import redis
10 | import threading
11 | from pyfaup.faup import Faup
12 | import time
13 | from storage.redis_record import RedisRecord
14 | from filters import Filters
15 | from urllib2 import URLError
16 | from collections import deque
17 |
18 |
19 | class CrawlerThread(threading.Thread):
20 | def __init__(self, domain, db_value, lock):
21 | threading.Thread.__init__(self)
22 | self.domain = domain
23 | self.lock = lock
24 |
25 | def run(self):
26 | cw = Crawler(webdriver.Firefox(), self.lock, "http://" + self.domain)
27 | cw.init()
28 | cw.navigation()
29 |
30 |
31 | class Record(threading.Thread):
32 | def __init__(self, db_value, lock):
33 | self.r = RedisRecord()
34 |         self.connection = MongoClient(host='localhost', port=27017)
35 | self.db = self.connection[db_value]
36 |
37 | threading.Thread.__init__(self)
38 | self.lock = lock
39 |
40 | def run(self):
41 | i = 0
42 | while (True):
43 | i = i + 1
44 | if i % 1000 == 0:
45 | time.sleep(10)
46 | self.lock.acquire()
47 | self.r.switchDB(1)
48 | url = self.r.rpop('crawl')
49 | self.lock.release()
50 | # print url
51 | fex = Faup()
52 | if url:
53 | print "url found: " + url
54 | try:
55 | fex.decode(url)
56 | domain = fex.get_host()
57 | entry = self.db.new_domaines.find_one({'domaine': domain})
58 |                     if entry == None:
59 |                         print "record: " + domain
60 |                         self.db.new_domaines.save({'domaine': domain, 'urls': [url]})
61 |                     else:
62 |                         urls_stored = entry['urls']
63 |                         if not url in urls_stored:
64 |                             urls_stored.append(url)
65 |                             entry['urls'] = urls_stored
66 |                             self.db.new_domaines.save(entry)
67 | except:
68 | print "parsing fault " + url
69 |
70 |
71 | class Crawler(object):
72 | def __init__(self, driver, lock, first_url, db_int=1):
73 | self.driver = driver
74 | self.driver.implicitly_wait(10)
75 | self.driver.set_page_load_timeout(30)
76 | self.r = RedisRecord()
77 | self.lock = lock
78 | self.queue = deque([])
79 | self.queue.append(first_url)
80 | self.dbs = [1, 2]
81 |
82 | def init(self):
83 | self.r.init(self.dbs)
84 | url = self.queue.popleft()
85 | self.driver.get(url)
86 | self.parser(url)
87 |
88 | def parser(self, url):
89 | self.r.switchDB(1)
90 | if not self.r.get(url):
91 | self.driver.get(url)
92 | elem_links = self.driver.find_elements_by_tag_name('a')
93 | self.lock.acquire()
94 | self.sort([link.get_attribute("href") for link in elem_links], url)
95 | self.lock.release()
96 | self.r.switchDB(1)
97 | self.r.put(url, url)
98 |
99 | def navigation(self):
100 |
101 | while (len(self.queue) > 0):
102 | url = self.queue.popleft()
103 | try:
104 | # self.driver.refresh()
105 | self.r.switchDB(1)
106 | self.parser(url)
107 |
108 | except URLError as e:
109 | print url
110 | except IOError as e:
111 | self.r.switchDB(2)
112 | print "I/O error({0}): {1}".format(e.errno, e.strerror)
113 | # self.r.put(new_url,new_url)
114 | self.r.switchDB(1)
115 |             except Exception:
116 | continue
117 | try:
118 | self.driver.quit()
119 |             print "Finished crawling site " + url
120 |         except URLError as e:
121 |             self.driver = getattr(webdriver, 'Firefox')()
122 |             print 'boom'
123 | self.lock.acquire()
124 | self.r.switchDB(1)
125 | self.r.put(url, url)
126 | self.lock.release()
127 |
128 | def sort(self, elem_links, url):
129 | fex = Faup()
130 | f = Filters()
131 | f.load()
132 | self.r.switchDB(1)
133 |         try:
134 |             for link in elem_links:
135 |                 new_url = link
136 |                 extend = True
137 |                 domainfilter = True
138 |                 schemefilter = True
139 | self.r.switchDB(2)
140 | if not self.r.get(new_url) and new_url:
141 | self.r.switchDB(1)
142 | if not self.r.get(new_url):
143 | fex.decode(new_url)
144 | domain = fex.get_host()
145 | if f.isfilteredscheme(fex.get_scheme()):
146 | self.r.switchDB(2)
147 | self.r.put(new_url, new_url)
148 | schemefilter = False
149 | if f.isfiltereddomains(domain):
150 | self.r.switchDB(2)
151 | self.r.put(new_url, new_url)
152 | domainfilter = False
153 | if f.isfilteredextention(fex.get_resource_path()):
154 | extend = False
155 | self.r.switchDB(2)
156 | self.r.put(new_url, new_url)
157 |
158 | if extend and domainfilter and schemefilter:
159 | self.r.switchDB(1)
160 | self.r.rpush('crawl', new_url)
161 | self.queue.append(new_url)
162 | except TypeError as e:
163 | print "TypeError"
164 |
--------------------------------------------------------------------------------
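
Usage sketch for crawler.py, mirroring Actions.crawl: a single Record thread drains the Redis 'crawl' list into the new_domaines collection while CrawlerThread instances (one Selenium/Firefox driver each) feed it; Record loops forever, so the process runs until it is killed. The database name and domains below are placeholders:

    import threading
    from harvesting.crawler import Record, CrawlerThread

    db_value = 'osint_demo'
    lock = threading.Lock()
    rec = Record(db_value, lock)
    rec.start()
    for domain in ['example.org', 'example.net']:
        cw = CrawlerThread(domain, db_value, lock)
        cw.start()
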
/harvesting/dynamic.js:
--------------------------------------------------------------------------------
1 | var casper = require("casper").create({
2 | verbose: true
3 | });
4 | url = casper.cli.get(0)
5 | // The base links array
6 | var links = [
7 | url
8 | ];
9 |
10 | // If we don't set a limit, it could go on forever
11 | var upTo = ~~casper.cli.get(1) || 10;
12 |
13 | var currentLink = 0;
14 |
15 | // Get the links, and add them to the links array
16 | // (It could be done all in one step, but it is intentionally splitted)
17 | function addLinks(link) {
18 | this.then(function() {
19 | var found = this.evaluate(searchLinks);
20 | this.echo(found.length + " links found on " + link);
21 | links = links.concat(found);
22 | });
23 | }
24 |
25 | // Fetch all elements from the page and return
26 | // the ones which contains a href starting with 'http://'
27 | function searchLinks() {
28 | var filter, map;
29 | filter = Array.prototype.filter;
30 | map = Array.prototype.map;
31 | return map.call(filter.call(document.querySelectorAll("a"), function(a) {
32 | return (/^http:\/\/.*/i).test(a.getAttribute("href"));
33 | }), function(a) {
34 | return a.getAttribute("href");
35 | });
36 | }
37 |
38 | // Just opens the page and prints the title
39 | function start(link) {
40 | this.start(link, function() {
41 | this.echo('Page title: ' + this.getTitle());
42 | });
43 | }
44 |
45 | // As long as it has a next link, and is under the maximum limit, will keep running
46 | function check() {
47 | if (links[currentLink] && currentLink < upTo) {
48 | this.echo('--- Link ' + currentLink + ' ---');
49 | start.call(this, links[currentLink]);
50 | addLinks.call(this, links[currentLink]);
51 | currentLink++;
52 | this.run(check);
53 | } else {
54 | this.echo("All done.");
55 | this.exit();
56 | }
57 | }
58 |
59 | casper.start().then(function() {
60 | this.echo("Starting");
61 | });
62 |
63 | casper.run(check);
64 |
--------------------------------------------------------------------------------
/harvesting/filters.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Jun 12 17:40:53 2013
4 |
5 | @author: slarinier
6 | """
7 | class Filters(object):
8 | def __init__(self,pathextention='harvesting/filtered_extensions',pathscheme='harvesting/filtered_schemes',pathdomain='harvesting/filtered_domains'):
9 | self.pathdomain=pathdomain
10 | self.pathscheme=pathscheme
11 | self.pathextentions=pathextention
12 | self.domains=[]
13 | self.schemes=[]
14 | self.extentions=[]
15 | def load(self):
16 | with open(self.pathdomain,"r") as fr:
17 | self.domains=[line.strip() for line in fr]
18 | with open(self.pathscheme,"r") as fr:
19 | self.schemes=[line.strip() for line in fr]
20 | with open(self.pathextentions,"r") as fr:
21 | self.extentions=[line.strip() for line in fr]
22 | def isfilteredextention(self,path):
23 | try:
24 | for ext in self.extentions:
25 | if path.endswith(ext):
26 | return True
27 | return False
28 | except:
29 | print "extension error"
30 |
31 |     def isfilteredscheme(self, scheme):
32 |         return scheme in self.schemes
33 | def isfiltereddomains(self,domain):
34 | try:
35 | tokens=domain.split('.')[::-1]
36 | for d in self.domains:
37 | d_tokens=d.split('.')[::-1]
38 | d_reverse=d_tokens[0]+'.'+d_tokens[1]
39 | t_reverse=str(tokens[0]+'.'+tokens[1])
40 | if d_reverse == t_reverse:
41 | return True
42 | except IndexError as e:
43 | if domain.find('.') == -1:
44 | return True
45 | except AttributeError as e:
46 |             print "domain filter: attribute error"
47 | return False
--------------------------------------------------------------------------------
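
Usage sketch for filters.py, mirroring how Crawler.sort uses it to decide whether a URL should be skipped. Note that the default filter-list paths (harvesting/filtered_extensions, filtered_schemes, filtered_domains) are not present in the tree above and would need to exist, one entry per line:

    from pyfaup.faup import Faup
    from harvesting.filters import Filters

    f = Filters()
    f.load()
    fex = Faup()
    fex.decode('http://example.org/report.pdf')
    skip = (f.isfilteredscheme(fex.get_scheme())
            or f.isfiltereddomains(fex.get_host())
            or f.isfilteredextention(fex.get_resource_path()))
    print skip
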
/harvesting/googlesearch.js:
--------------------------------------------------------------------------------
1 | var links = [];
2 | var casper = require('casper').create();
3 | var padding=casper.cli.get(0);
4 | var criteria=casper.cli.get(1);
5 | var ua=casper.cli.get(2)
6 | function getLinks() {
7 |
8 | var links = document.querySelectorAll('h3.r a');
9 | return Array.prototype.map.call(links, function(e) {
10 | return e.getAttribute('href')
11 | });
12 | }
13 |
14 |
15 | casper.start();
16 | casper.userAgent(ua)
17 | casper.open('http://google.com/search?q='+criteria+'&start='+padding)
18 | casper.then(function() {
19 |     // collect the result links from the Google results page
20 |
21 |
22 |     links = this.evaluate(getLinks);
23 |
24 |
25 | });
26 |
27 |
28 |
29 | casper.run(function() {
30 | // echo results in some pretty fashion
31 | this.echo(links.length + ' links found:');
32 | this.echo(' - ' + links.join('\n - ')).exit();
33 | });
34 |
--------------------------------------------------------------------------------
/harvesting/keywords:
--------------------------------------------------------------------------------
1 | porn
2 | user&password
3 |
--------------------------------------------------------------------------------
/harvesting/metaextract.js:
--------------------------------------------------------------------------------
1 | var casper = require("casper").create()
2 | , url = casper.cli.get(0)
3 | , metas = [];
4 |
5 | if (!url) {
6 | casper.echo('Usage: casperjs [url]').exit();
7 | }
8 |
9 | casper.start(url, function() {
10 | metas = this.evaluate(function() {
11 | var metas = [];
12 | [].forEach.call(document.querySelectorAll('META'), function(elem) {
13 | var meta = {};
14 | [].slice.call(elem.attributes).forEach(function(attr) {
15 | meta[attr.name] = attr.value;
16 | });
17 | metas.push(meta);
18 | });
19 | return metas;
20 | });
21 | });
22 |
23 | casper.run(function() {
24 | require("utils").dump(metas);
25 | this.exit();
26 | });
27 |
--------------------------------------------------------------------------------
/harvesting/pastebin.js:
--------------------------------------------------------------------------------
1 | var links = [];
2 | var casper = require('casper').create();
3 | var url=casper.cli.get(0);
4 | var ua=casper.cli.get(1)
5 | function getLinks() {
6 |
7 | var links = document.querySelectorAll('tr a');
8 | return Array.prototype.map.call(links, function(e) {
9 | return e.getAttribute('href')
10 | });
11 | }
12 |
13 |
14 | casper.start();
15 | casper.userAgent(ua);
16 | casper.open(url);
17 | casper.then(function() {
18 |     // collect the paste links from the archive page
19 |     links = this.evaluate(getLinks);
20 |
21 |
22 | });
23 |
24 |
25 | casper.run(function() {
26 | // echo results in some pretty fashion
27 | this.echo(links.length + ' links found:');
28 | this.echo(' - ' + links.join('\n - ')).exit();
29 |
30 | });
31 |
--------------------------------------------------------------------------------
/harvesting/pastebin.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | from subprocess import Popen, PIPE
4 | import threading
5 | import pymongo
6 | from pymongo import Connection
7 | from pastebinExtract import pastebinExtract
8 | from random_user_agent import Random_user_agent
9 | import time
10 | class pastebin():
11 | def __init__(self, url,keyword,casperJSScript):
12 | self.url=url
13 | self.keyword=keyword
14 | self.casperJSScript=casperJSScript
15 | self.urls=[]
16 | rua=Random_user_agent()
17 | self.ua=rua.rand()
18 | self.time = rua.randsleep()
19 | self.result=[]
20 | def pastebinArchive(self):
21 | result=subprocess.Popen(['casperjs' ,self.casperJSScript,self.url,'\''+self.ua+'\''],stdout=PIPE)
22 | for ligne in result.stdout:
23 | if ligne.find('/')!=-1 and ligne.find('archive') == -1:
24 | id=ligne.replace(' - /','').strip()
25 | id=id.replace('\n','')
26 | self.urls.append('http://pastebin.com/raw.php?i='+id)
27 | print self.urls
28 |
29 | def pastebinAnalyse(self):
30 | i=0
31 | main_thread = threading.currentThread()
32 | thread_pool=[]
33 | for url in self.urls:
34 | pasteExtract=pastebinExtract(url)
35 | time.sleep(self.time)
36 | pasteExtract.start()
37 | thread_pool.append(pasteExtract)
38 | i+=1
39 | if i % 500 ==0:
40 | for t in threading.enumerate():
41 | if t is not main_thread:
42 | t.join()
43 |
44 | for t in thread_pool:
45 | result =getattr(t,'result')
46 | if result :
47 | self.result.append(result)
48 | return self.result
49 |
50 |
51 |
--------------------------------------------------------------------------------
/harvesting/pastebinExtract.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | from subprocess import Popen, PIPE
4 | import threading
5 | from content import Content
6 | from random_user_agent import Random_user_agent
7 |
8 | class pastebinExtract(threading.Thread):
9 | def __init__(self,url,casperJSScript='pastebintext.js'):
10 | threading.Thread.__init__(self)
11 | self.url=url
12 | self.casperJSScript=casperJSScript
13 | self.content=Content()
14 | self.data=[]
15 | rua=Random_user_agent()
16 | self.ua=rua.rand()
17 | self.result=None
18 |
19 | def run(self):
20 | result=subprocess.Popen(['casperjs' ,self.casperJSScript,self.url,'\''+self.ua+'\''],stdout=PIPE)
21 | for ligne in result.stdout:
22 | record=ligne.strip()
23 | self.data.append(record.lower())
24 |
25 | keywords=getattr(self.content,'keywords')
26 | for keyword in keywords:
27 | if self.content.search(keyword,str(self.data)):
28 | self.result={'url': self.url, 'data': self.data}
--------------------------------------------------------------------------------
/harvesting/pastebintest.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import pastebin
3 |
4 | paste=pastebin.pastebin('http://pastebin.com/archive',[],'pastebin.js')
5 | paste.pastebinArchive()
6 | setattr(paste,'casperJSScript','pastebintext.js')
7 | result=paste.pastebinAnalyse()
8 | print result
9 |
--------------------------------------------------------------------------------
/harvesting/pastebintext.js:
--------------------------------------------------------------------------------
1 |
2 | var casper = require('casper').create();
3 |
4 |
5 | var url=casper.cli.get(0);
6 | var ua =casper.cli.get(1)
7 |
8 | casper.start().then(function() {
9 | this.userAgent(ua);
10 | this.open(url, {
11 | method: 'get',
12 | headers: {
13 | 'Accept': 'application/text'
14 | }
15 | });
16 | });
17 |
18 | casper.run(function() {
19 | this.echo(this.debugPage());
20 | this.exit();
21 | });
22 |
23 |
--------------------------------------------------------------------------------
/harvesting/pholcidae.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 |
3 | import re
4 | import sys
5 |
6 | # importing modules corresponding to Python version
7 |
8 | import urlparse
9 | import urllib2
10 |
11 | class Pholcidae(object):
12 |
13 | """" Pholcidae is a small and fast web crawler. """
14 |
15 | def __init__(self):
16 |
17 | """
18 | @return void
19 |
20 | Creates Pholcidae instance and updates default settings dict.
21 | """
22 |
23 | # default local urllib2 opener
24 | self._opener = None
25 | # creating new sets of unparsed, already parsed and failed URLs
26 | self._unparsed_urls = set()
27 | self._parsed_urls = set()
28 | self._failed_urls = set()
29 | # extending settings with given values
30 | self._extend_settings()
31 | # compiling regular expressions
32 | self._compile_regexs()
33 | # autostart crawler if settings allows
34 | if self._settings.autostart:
35 | self.start()
36 |
37 | def crawl(self, response):
38 |
39 | """
40 | @type response AttrDict
41 | @return void
42 |
43 |         Dummy method which can be overridden by inheriting Pholcidae class.
44 | Use it to get html page and parse it as you want to.
45 | """
46 |
47 | pass
48 |
49 | def start(self):
50 |
51 | """
52 | @return void
53 |
54 | Simple crawler start trigger.
55 | """
56 |
57 | self._get_page()
58 |
59 | ############################################################################
60 | # PRIVATE METHODS #
61 | ############################################################################
62 |
63 | ############################ INIT METHODS ##################################
64 |
65 | def _extend_settings(self):
66 |
67 | """
68 | @return void
69 |
70 | Extends default settings with given settings.
71 | """
72 |
73 | # creating default settings object
74 | self._settings = AttrDict({
75 | # do we need to follow HTTP redirects?
76 | 'follow_redirects': True,
77 | # what page links do we need to parse?
78 | 'valid_links': ['(.*)'],
79 | # what URLs must be excluded
80 | 'exclude_links': [],
81 | # what is an entry point for crawler?
82 | 'start_page': '/',
83 | # which domain should we parse?
84 | 'domain': '',
85 |             # should we ignore pages outside of the given domain?
86 | 'stay_in_domain': True,
87 | # which protocol do we need to use?
88 | 'protocol': 'http://',
89 | # autostart crawler right after initialization?
90 | 'autostart': False,
91 | # cookies to be added to each request
92 | 'cookies': {},
93 | # custom headers to be added to each request
94 | 'headers': {}
95 | })
96 |
97 | # updating settings with given values
98 | self._settings.update(self.settings)
99 |
100 | # creating urllib2 opener
101 | self._create_opener()
102 | # compiling cookies
103 | self._compile_cookies()
104 | # compiling headers
105 | self._compile_headers()
106 |
107 | # adding start point into unparsed list
108 | start_url = '%s%s%s' % (self._settings.protocol, self._settings.domain,
109 | self._settings.start_page)
110 | self._unparsed_urls.add(start_url)
111 |
112 | def _compile_regexs(self):
113 |
114 | """
115 | @return void
116 |
117 | Compiles regular expressions for further use.
118 | """
119 |
120 | # setting default flags
121 | flags = re.I | re.S
122 | # compiling regexs
123 | self._regex = AttrDict({
124 |             # collects all links across given page (the href value is the second group)
125 |             'href_links': re.compile(r'<a\s([^>]*?)href\s*=\s*["\']([^"\'>]+)["\']',
126 |                                      flags=flags),
127 | # valid links regexs
128 | 'valid_link': [],
129 | # invalid links regexs
130 | 'invalid_link': []
131 | })
132 |
133 |         # compiling valid links regexs
134 | for regex in self._settings.valid_links:
135 | self._regex.valid_link.append(re.compile(regex, flags=flags))
136 |
137 | # compiling invalid links regexs
138 | for regex in self._settings.exclude_links:
139 | self._regex.invalid_link.append(re.compile(regex, flags=flags))
140 |
141 | def _compile_cookies(self):
142 |
143 | """
144 | @return void
145 |
146 | Compiles given dict of cookies to string.
147 | """
148 |
149 | compiled = []
150 | for name, value in self._settings.cookies.items():
151 | compiled.append('%s=%s' % (name, value))
152 | self._settings.cookies = ','.join(compiled)
153 | self._opener.addheaders.append(('Cookie', self._settings.cookies))
154 |
155 | def _compile_headers(self):
156 |
157 | """
158 | @return void
159 |
160 | Adds given dict of headers to urllib2 opener.
161 | """
162 |
163 | for header_name, header_value in self._settings.headers.items():
164 | self._opener.addheaders.append((header_name, header_value))
165 |
166 | def _create_opener(self):
167 |
168 | """
169 | @return void
170 |
171 | Creates local urllib2 opener and extends it with custom
172 | redirect handler if needed.
173 | """
174 |
175 | self._opener = urllib2.build_opener()
176 | if not self._settings.follow_redirects:
177 | self._opener = urllib2.build_opener(PholcidaeRedirectHandler,
178 | urllib2.HTTPCookieProcessor())
179 |
180 | ########################## CRAWLING METHODS ################################
181 |
182 | def _get_page(self):
183 |
184 | """
185 | @return bool
186 |
187 | Fetches page by URL.
188 | """
189 |
190 | # iterating over unparsed links
191 | while self._unparsed_urls:
192 | # getting link to get
193 | url = self._unparsed_urls.pop()
194 |
195 | # fetching page
196 | page = self._fetch_url(url)
197 | if page.status not in [500, 404, 502]:
198 | # parsing only valid urls
199 | valid_match = self._is_valid_link(page.url)
200 | if valid_match:
201 | # adding regex match to page object
202 | page.match = valid_match
203 | # sending raw HTML to crawl function
204 | self.crawl(page)
205 | # moving url from unparsed to parsed list
206 | self._parsed_urls.add(url)
207 | # collecting links from page
208 | self._get_page_links(page.body, page.url)
209 | else:
210 | # moving url from unparsed to failed list
211 | self._failed_urls.add(url)
212 |
213 | def _get_page_links(self, raw_html, url):
214 |
215 | """
216 | @type raw_html str
217 | @type url str
218 | @return void
219 |
220 | Parses out all links from crawled web page.
221 | """
222 |
223 | links_groups = self._regex.href_links.findall(str(raw_html))
224 | links = [group[1] for group in links_groups]
225 | for link in links:
226 | # is link not excluded?
227 | if not self._is_excluded(link):
228 | # getting link parts
229 | link_info = urlparse.urlparse(link)
230 | # if link not relative
231 | if link_info.scheme or link_info.netloc:
232 | # if stay_in_domain enabled and link outside of domain scope
233 | if self._settings.stay_in_domain:
234 | try:
235 | is_link = self._settings.domain not in link
236 | except UnicodeDecodeError:
237 | continue
238 | else:
239 | if is_link:
240 | continue
241 | else:
242 | # converting relative link into absolute
243 | link = urlparse.urljoin(url, link)
244 | # if link was not previously parsed
245 | if link not in self._parsed_urls:
246 | if link not in self._failed_urls:
247 | self._unparsed_urls.add(link)
248 |
249 | def _is_valid_link(self, link):
250 |
251 | """
252 | @type link str
253 | @return str
254 |
255 | Compares link with given regex to decide if we need to parse that
256 | page.
257 | """
258 |
259 |         # if hash in URL - assuming anchor or AJAX
260 | if link and '#' not in link:
261 | for regex in self._regex.valid_link:
262 | matches = regex.findall(link)
263 | if matches:
264 | return matches
265 | return ''
266 |
267 | def _is_excluded(self, link):
268 |
269 | """
270 | @type link str
271 | @return bool
272 |
273 |         Checks if link matches excluded regex.
274 | """
275 |
276 | for regex in self._regex.invalid_link:
277 | if regex.search(link):
278 | return True
279 | return False
280 |
281 | ######################### URL FETCHING METHODS #############################
282 |
283 | def _fetch_url(self, url):
284 |
285 | """
286 | @type url str
287 | @return AttrDict
288 |
289 | Fetches given URL and returns data from it.
290 | """
291 |
292 | # empty page container
293 | page = AttrDict()
294 |
295 | try:
296 | # getting response from given URL
297 | resp = self._opener.open(url)
298 | page = AttrDict({
299 | 'body': resp.read(),
300 | 'url': resp.geturl(),
301 | 'headers': AttrDict(dict(resp.headers.items())),
302 | 'cookies': self._parse_cookies(dict(resp.headers.items())),
303 | 'status': resp.getcode()
304 | })
305 | except:
306 | # drop invalid page with 500 HTTP error code
307 | page = AttrDict({'status': 500})
308 | self._failed_urls.add(url)
309 | return page
310 |
311 | def _parse_cookies(self, headers):
312 |
313 | """
314 | @type headers dict
315 | @return AttrDict
316 |
317 | Parses cookies from response headers.
318 | """
319 |
320 | cookies = AttrDict()
321 | # lowering headers keys
322 | headers_lower={}
323 |
324 | for k,v in headers.items():
325 | headers_lower[k.lower()]=v
326 | headers=headers_lower
327 | if 'set-cookie' in headers:
328 | # splitting raw cookies
329 | raw_cookies = headers['set-cookie'].split(';')
330 | # cookie parts to throw out
331 | throw_out = ['expires', 'path', 'domain', 'secure', 'HttpOnly']
332 | for cookie in raw_cookies:
333 | cookie = cookie.split('=')
334 | if cookie[0].strip() not in throw_out:
335 | cookies.update({cookie[0]: cookie[1]})
336 | return cookies
337 |
338 |
339 | class AttrDict(dict):
340 |
341 | """ A dict that allows for object-like property access syntax. """
342 |
343 | def __init__(self, new_dict=None):
344 | dict.__init__(self)
345 | if new_dict:
346 | self.update(new_dict)
347 |
348 | def __getattr__(self, name):
349 | try:
350 | return self[name]
351 | except KeyError:
352 | raise AttributeError(name)
353 |
354 | def __setattr__(self, key, value):
355 | self.update({key: value})
356 |
357 |
358 | class PholcidaeRedirectHandler(urllib2.HTTPRedirectHandler):
359 |
360 | """ Custom URL redirects handler. """
361 |
362 | def http_error_302(self, req, fp, code, msg, headers):
363 | return fp
364 |
365 | http_error_301 = http_error_303 = http_error_307 = http_error_302
366 |
--------------------------------------------------------------------------------
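
Usage sketch for pholcidae.py: the crawler expects a subclass that defines a settings dict (merged over the defaults listed in _extend_settings) and overrides crawl(), which receives an AttrDict with body, url, headers, cookies, status and match. The domain below is a placeholder:

    from harvesting.pholcidae import Pholcidae

    class MyCrawler(Pholcidae):
        settings = {
            'domain': 'example.org',
            'start_page': '/',
            'stay_in_domain': True,
        }

        def crawl(self, page):
            print page.status, page.url

    MyCrawler().start()
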
/harvesting/random_user_agent.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 2, 2012
3 |
4 | @author: slarinier
5 | '''
6 | import random
7 | class Random_user_agent(object):
8 | '''
9 | classdocs
10 | '''
11 | _instance = None
12 | def __init__(self,path_user_agent='harvesting/user_agents'):
13 | '''
14 | Constructor
15 | '''
16 | self.user_agent_list=[]
17 | self.path_user_agent=path_user_agent
18 | with open(self.path_user_agent,'r') as fr:
19 | for user_agent in fr:
20 | if user_agent.find('#') == -1:
21 |                     self.user_agent_list.append(user_agent.strip())
22 |
23 |
24 | def __new__(cls, *args, **kwargs):
25 | if not cls._instance:
26 | cls._instance = super(Random_user_agent, cls).__new__(
27 | cls, *args, **kwargs)
28 | return cls._instance
29 |
30 | def rand(self):
31 | return random.choice(self.user_agent_list)
32 | def randsleep(self):
33 | return random.randrange(1,3,2)
--------------------------------------------------------------------------------
/harvesting/search.py:
--------------------------------------------------------------------------------
1 | from pymongo import MongoClient
2 | from subprocess import PIPE
3 | from white_list import white_list
4 | import re
5 | import subprocess
6 | import threading
7 | from random_user_agent import Random_user_agent
8 |
9 | class search(threading.Thread):
10 | def __init__(self,limit,criteria,scriptjs,db,url_pattern='((https?|ftp|gopher|telnet|file|notes|ms-help):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&])*)'):
11 | threading.Thread.__init__(self)
12 | self.result=[]
13 | self.limit=limit
14 | self.criteria=criteria
15 | self.scriptjs=scriptjs
16 | self.connection= MongoClient(host='localhost', port=27017)
17 | self.db=self.connection[db]
18 | self.whitelist=white_list(db)
19 | self.regex_url=re.compile(url_pattern)
20 | rua=Random_user_agent()
21 | self.ua=rua.rand()
22 | self.urls_by_domaine={}
23 |
24 | def run(self):
25 | i=0
26 | while i < self.limit:
27 | result=subprocess.Popen(['casperjs' ,self.scriptjs,str(i),self.criteria,self.ua],stdout=PIPE)
28 | for ligne in result.stdout:
29 | if ligne.find('/')!=-1 and ligne.find('http://') != -1:
30 | url_information=self.regex_url.search(ligne)
31 | url=url_information.group(1)
32 | domaine=url.split('/')[2]
33 | tokens=domaine.split('.')
34 | racine=tokens[len(tokens)-2]+'.'+tokens[len(tokens)-1]
35 |
36 | print "domain found: "+ domaine
37 |
38 | if not racine in getattr(self.whitelist, 'white_domaine'):
39 | if domaine in self.urls_by_domaine:
40 | urls= self.urls_by_domaine[domaine]
41 | urls.append(url)
42 | self.urls_by_domaine[domaine]=urls
43 | else:
44 | self.urls_by_domaine[domaine]=[url]
45 |
46 | i=i+10
47 |
48 | def record(self):
49 | print "#######################record############################"
50 | domaines = iter(self.urls_by_domaine)
51 | for domaine in domaines:
52 | entry = self.db.new_domaines.find_one({'domaine':domaine})
53 | if entry == None:
54 | self.db.new_domaines.insert_one({'domaine':domaine,'urls':self.urls_by_domaine[domaine],'criteria':[self.criteria]})
55 | else:
56 |
57 |                 try:
58 |                     urls_stored = entry['urls']
59 |                     urls = self.urls_by_domaine[domaine]
60 |                     entry['urls'] = list(set(urls_stored + urls))
61 |                     criteria = entry['criteria']
62 |                     criteria.append(self.criteria)
63 |                     entry['criteria'] = list(set(criteria))
64 |                     self.db.new_domaines.save(entry)
65 |                 except KeyError:
66 |                     criteria = []
67 |                     try:
68 |                         criteria = entry['criteria']
69 |                         criteria.append(self.criteria)
70 |                         criteria = list(set(criteria))
71 |                     except KeyError:
72 |                         criteria.append(self.criteria)
73 |                     if 'urls' not in entry: entry['urls'] = self.urls_by_domaine[domaine]
74 |                     entry['criteria'] = criteria
75 |                     try:
76 |                         self.db.new_domaines.save(entry)
77 |                     except:
78 |                         pass
79 |
80 |
--------------------------------------------------------------------------------
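
Usage sketch for search.py, as in Actions.metasearch: one thread per (criterion, engine script) pair, started, joined, then record() flushes the harvested domains and URLs into the new_domaines collection. The database name and criterion are placeholders:

    from harvesting import search

    gs = search.search(100, 'casperjs', 'harvesting/googlesearch.js', 'osint_demo')
    gs.start()
    gs.join()
    gs.record()
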
/harvesting/white_list.py:
--------------------------------------------------------------------------------
1 | from mongodb import mongodb
2 | import os
3 | import glob
4 | class white_list():
5 |
6 | def __init__(self,db):
7 | self.mdb=mongodb.mongodb('localhost',27017,db)
8 | self.white_list=[]
9 | self.white_domaine=['msn.com','google.com','wikipedia.fr','free.fr','linkedin.com']
10 |
11 | def loadWhiteList(self):
12 | domaines=self.mdb.selectall('white_list')
13 | for domaine in domaines:
14 |             self.white_domaine.append(domaine['domaine'])
15 |
16 | def makeWhiteList(self,path):
17 | list_files=os.walk(path)
18 | for root,dirs,files in list_files:
19 | category=''
20 | for fl in files:
21 | if fl=='domains':
22 | with open(root+'/'+fl,'r') as fr:
23 | root=root.replace(path,'')
24 | if '/' in root:
25 | category=root.replace('/','_')
26 | else:
27 | category=root
28 | for ligne in fr:
29 | item={'domaine':ligne.strip(),'category':category}
30 | self.mdb.update(item,'white_list')
31 | def searchInWhiteList(self,domaine):
32 | result=self.mdb.selectbycreteria('domaine',domaine,'white_list')
33 | if result is not None:
34 | category=result[0]
35 | print category['category']
36 | return category
37 | #def compare_white_list()
38 |
--------------------------------------------------------------------------------
/harvesting/yahoosearch.js:
--------------------------------------------------------------------------------
1 | var links = [];
2 | var casper = require('casper').create();
3 | var padding=casper.cli.get(0)
4 | var criteria=casper.cli.get(1)
5 | var ua = casper.cli.get(2)
6 |
7 | function getLinks() {
8 |
9 | var links = document.querySelectorAll('h3 a');
10 | return Array.prototype.map.call(links, function(e) {
11 | return e.getAttribute('href')
12 | });
13 | }
14 |
15 |
16 | casper.start();
17 | casper.userAgent(ua)
18 | casper.open('http://fr.search.yahoo.com/search?p='+criteria+'&rd=r1&fr=yfp-t-731&fr2=sb-top&xargs=0&pstart=1&b='+padding)
19 | casper.then(function() {
20 |     // collect the result links from the Yahoo results page
21 |
22 |     links = this.evaluate(getLinks);
23 |
24 |
25 | });
26 |
27 |
28 |
29 | casper.run(function() {
30 | // echo results in some pretty fashion
31 | this.echo(links.length + ' links found:');
32 | this.echo(' - ' + links.join('\n - ')).exit();
33 | });
34 |
--------------------------------------------------------------------------------
/history/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/history/__init__.py
--------------------------------------------------------------------------------
/history/history.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jan 18, 2013
3 |
4 | @author: slarinier
5 | '''
6 |
7 | import datetime
8 | import logging
9 |
10 | class History(object):
11 | '''
12 | classdocs
13 | '''
14 |
15 |
16 | def __init__(self):
17 | '''
18 | Constructor
19 | '''
20 | d=datetime.datetime.now()
21 | date_value=d.strftime("%Y-%m-%d")
22 | self.logger=logging.getLogger('history')
23 | hdlr = logging.FileHandler('history/'+date_value+'.log')
24 | formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
25 | hdlr.setFormatter(formatter)
26 | self.logger.addHandler(hdlr)
27 | self.logger.setLevel(logging.INFO)
28 |
29 | def register(self,action):
30 | self.logger.info(action)
31 |
32 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | '''
3 | Created on Sep 25, 2012
4 |
5 | @author: slarinier
6 | '''
7 |
8 | from actions import Actions
9 | import argparse
10 | from history.history import History
11 | import sys
12 | import threading
13 |
14 | if __name__ == '__main__':
15 | scriptsJS = ['harvesting/googlesearch.js', 'harvesting/bingsearch.js', 'harvesting/yahoosearch.js']
16 | h = History()
17 | result = []
18 | domaine_ip = {}
19 |
20 | # limit=sys.argv[4]
21 |
22 |
23 | parser = argparse.ArgumentParser(description='metaharvester')
24 | parser.add_argument('--db', dest='db', help='db in mongo to store informations')
25 | parser.add_argument('--geoloc', dest='geoloc')
26 | parser.add_argument('--action', dest='action')
27 | parser.add_argument('--criteria', dest='criteria')
28 | parser.add_argument('--collection', dest='collection')
29 | parser.add_argument('--attr', dest='attr')
30 | parser.add_argument('--threadpool', dest='threadpool')
31 | parser.add_argument('--filters', dest='filters')
32 | parser.add_argument('--domains', dest='domains')
33 | parser.add_argument('--range', dest='range')
34 | parser.add_argument('--nmap_options', dest='nmap_options')
35 | args = parser.parse_args()
36 | db = args.db
37 | filters = args.filters
38 | criteria = args.criteria
39 | if criteria == None:
40 | criteria = ''
41 | geoloc = args.geoloc
42 | if geoloc == None:
43 | geoloc = ''
44 | collection = args.collection
45 | attr = args.attr
46 |     msg = db + ' ' + args.action + ' ' + criteria
47 | h.register(msg)
48 | act = Actions(db)
49 | if args.action == 'reset':
50 | act.reset()
51 | elif args.action == 'metasearch':
52 | if criteria and scriptsJS and db and geoloc:
53 | criteria = criteria.split(',')
54 | act.metasearch(criteria, scriptsJS, geoloc)
55 | elif args.action == 'search_ip':
56 | act.search_ip(geoloc, scriptsJS, args.range)
57 | elif args.action == 'create_network':
58 | act.create_network()
59 | elif args.action == 'metadata':
60 |         act.metadata_extract(db)
61 | elif args.action == 'create_result':
62 | if not criteria and not db:
63 | parser.print_help()
64 | else:
65 | if collection:
66 | act.create_result(collection, criteria)
67 | elif args.action == 'dnstree':
68 | if db:
69 | act.dnstree(db)
70 | elif args.action == 'crawl' and args.domains:
71 | if db:
72 | act.crawl(args.domains)
73 | elif args.action == 'cleandb':
74 | if db and filters:
75 | act.clean_db(filters)
76 | elif args.action == 'screenshots':
77 | if db and args.threadpool:
78 | act.screenshots(db, args.threadpool)
79 | else:
80 | parser.print_help()
81 | elif args.action == 'init':
82 | if db and attr and collection:
83 | act.init(db, collection, attr)
84 | else:
85 | parser.print_help()
86 | elif args.action == 'nmap':
87 | if args.nmap_options or args.range:
88 | act.scan_nmap(args.range, args.nmap_options)
89 | else:
90 |
91 | parser.print_help()
92 | sys.exit(1)
93 |
--------------------------------------------------------------------------------
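
Example invocations of main.py, matching the argument parser above (the database name, criteria and file paths are placeholders; --geoloc is required by the metasearch branch, and the bundled GeoLiteCity.dat is the natural candidate):

    python main.py --db osint_demo --action metasearch --criteria keyword1,keyword2 --geoloc geolocatisation/GeoLiteCity.dat
    python main.py --db osint_demo --action screenshots --threadpool 5
    python main.py --db osint_demo --action cleandb --filters filters.txt
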
/mongodb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/mongodb/__init__.py
--------------------------------------------------------------------------------
/mongodb/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/mongodb/__init__.pyc
--------------------------------------------------------------------------------
/mongodb/mongodb.py:
--------------------------------------------------------------------------------
1 | from pymongo import MongoClient
2 | import bson
3 | import pymongo
4 |
5 |
6 | class mongodb(object):
7 | def __init__(self, host, port, db):
8 | self.host = host
9 | self.port = port
10 | self.connection = MongoClient(host=host, port=port)
11 | self.db = self.connection[db]
12 |
13 | def insert(self, collection, key, value):
14 | col = self.db[collection]
15 | value_db = {'domaine': value}
16 | # col.create_index([('domaine', pymongo.DESCENDING)])
17 | col.save(value_db)
18 |
19 | def update(self, item, collection):
20 | col = self.db[collection]
21 | try:
22 | col.save(item)
23 | except bson.errors.InvalidStringData:
24 | print 'InvalidString ' + str(item)
25 |
26 | def selectbyDict(self, request, col):
27 | self.col = self.db[col]
28 | return self.col.find(request)
29 |
30 | def selectbycreteria(self, key, criteria, col):
31 | request = {key: criteria}
32 | self.col = self.db[col]
33 | return self.col.find(request)
34 |
35 | def selectall(self, collection):
36 | col = self.db[collection]
37 | return col.find()
38 |
39 | def insertMultiCriteria(self, collection, items):
40 | print "insert " + str(items)
41 | col = self.db[collection]
42 | try:
43 | col.save(items)
44 | except ValueError:
45 | print 'Encoding error: ' + str(items)
46 |
--------------------------------------------------------------------------------
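A minimal usage sketch of the wrapper above (not part of the repository; it assumes a MongoDB server on localhost:27017 and uses 'osint_demo' and 'new_domaines' as placeholder database and collection names, mirroring the import style of processing/createcorpus.py):

    from mongodb import mongodb

    # open a connection and wrap the 'osint_demo' database
    store = mongodb.mongodb('localhost', 27017, 'osint_demo')

    # insert() stores the value under the 'domaine' field
    store.insert('new_domaines', 'domaine', 'example.com')

    # selectbycreteria() builds the query {key: criteria} and returns a cursor
    for doc in store.selectbycreteria('domaine', 'example.com', 'new_domaines'):
        print doc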
/mongodb/mongodb.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/mongodb/mongodb.pyc
--------------------------------------------------------------------------------
/network/IPy.py:
--------------------------------------------------------------------------------
1 | """ IPy - class and tools for handling of IPv4 and IPv6 Addresses and Networks.
2 |
3 | $HeadURL: http://svn.23.nu/svn/repos/IPy/trunk/IPy.py $
4 |
5 | $Id: IPy.py 671 2004-08-22 21:02:29Z md $
6 |
7 | The IP class allows a comfortable parsing and handling for most
8 | notations in use for IPv4 and IPv6 Addresses and Networks. It was
9 | greatly inspired by RIPE's Perl module NET::IP's interface but
10 | doesn't share the implementation. It doesn't support non-CIDR netmasks,
11 | so funky stuff like a netmask 0xffffff0f can't be done here.
12 |
13 | >>> ip = IP('127.0.0.0/30')
14 | >>> for x in ip:
15 | ... print x
16 | ...
17 | 127.0.0.0
18 | 127.0.0.1
19 | 127.0.0.2
20 | 127.0.0.3
21 | >>> ip2 = IP('0x7f000000/30')
22 | >>> ip == ip2
23 | 1
24 | >>> ip.reverseNames()
25 | ['0.0.0.127.in-addr.arpa.', '1.0.0.127.in-addr.arpa.', '2.0.0.127.in-addr.arpa.', '3.0.0.127.in-addr.arpa.']
26 | >>> ip.reverseName()
27 | '0-3.0.0.127.in-addr.arpa.'
28 | >>> ip.iptype()
29 | 'PRIVATE'
30 |
31 | It can detect about a dozen different ways of expressing IP addresses
32 | and networks, parse them and distinguish between IPv4 and IPv6 addresses.
33 |
34 | >>> IP('10.0.0.0/8').version()
35 | 4
36 | >>> IP('::1').version()
37 | 6
38 | >>> print IP(0x7f000001)
39 | 127.0.0.1
40 | >>> print IP('0x7f000001')
41 | 127.0.0.1
42 | >>> print IP('127.0.0.1')
43 | 127.0.0.1
44 | >>> print IP('10')
45 | 10.0.0.0
46 | >>> print IP('1080:0:0:0:8:800:200C:417A')
47 | 1080:0000:0000:0000:0008:0800:200c:417a
48 | >>> print IP('1080::8:800:200C:417A')
49 | 1080:0000:0000:0000:0008:0800:200c:417a
50 | >>> print IP('::1')
51 | 0000:0000:0000:0000:0000:0000:0000:0001
52 | >>> print IP('::13.1.68.3')
53 | 0000:0000:0000:0000:0000:0000:0d01:4403
54 | >>> print IP('127.0.0.0/8')
55 | 127.0.0.0/8
56 | >>> print IP('127.0.0.0/255.0.0.0')
57 | 127.0.0.0/8
58 | >>> print IP('127.0.0.0-127.255.255.255')
59 | 127.0.0.0/8
60 |
61 | Nearly all class methods which return a string have an optional
62 | parameter 'wantprefixlen' which controls whether the prefixlen or netmask
63 | is printed. By default the prefixlen is always shown if the net
64 | contains more than one address.
65 |
66 | wantprefixlen == 0 / None don't return anything 1.2.3.0
67 | wantprefixlen == 1 /prefix 1.2.3.0/24
68 | wantprefixlen == 2 /netmask 1.2.3.0/255.255.255.0
69 | wantprefixlen == 3 -lastip 1.2.3.0-1.2.3.255
70 |
71 | You can also change the defaults on a per-object basis by fiddling with the class members
72 |
73 | NoPrefixForSingleIp
74 | WantPrefixLen
75 |
76 | >>> IP('10.0.0.0/32').strNormal()
77 | '10.0.0.0'
78 | >>> IP('10.0.0.0/24').strNormal()
79 | '10.0.0.0/24'
80 | >>> IP('10.0.0.0/24').strNormal(0)
81 | '10.0.0.0'
82 | >>> IP('10.0.0.0/24').strNormal(1)
83 | '10.0.0.0/24'
84 | >>> IP('10.0.0.0/24').strNormal(2)
85 | '10.0.0.0/255.255.255.0'
86 | >>> IP('10.0.0.0/24').strNormal(3)
87 | '10.0.0.0-10.0.0.255'
88 | >>> ip = IP('10.0.0.0')
89 | >>> print ip
90 | 10.0.0.0
91 | >>> ip.NoPrefixForSingleIp = None
92 | >>> print ip
93 | 10.0.0.0/32
94 | >>> ip.WantPrefixLen = 3
95 | >>> print ip
96 | 10.0.0.0-10.0.0.0
97 |
98 |
99 | Further Information might be available at http://c0re.jp/c0de/IPy/
100 |
101 | Hacked 2001 by drt@un.bewaff.net
102 |
103 | TODO:
104 | * better comparison (__cmp__ and friends)
105 | * tests for __cmp__
106 | * always write hex values lowercase
107 | * interpret 2001:1234:5678:1234/64 as 2001:1234:5678:1234::/64
108 | * move size in bits into class variables to get rid of some "if self._ipversion ..."
109 | * support for base85 encoding
110 | * support for output of IPv6 encoded IPv4 Addresses
111 | * update address type tables
112 | * first-last notation should be allowed for IPv6
113 | * add IPv6 docstring examples
114 | * check better for negative parameters
115 | * add addition / aggregation
116 | * move reverse name stuff out of the classes and refactor it
117 | * support for aggregation of more than two nets at once
118 | * support for aggregation with "holes"
119 | * support for finding common prefix
120 | * '>>' and '<<' for prefix manipulation
121 | * add our own exceptions instead of ValueError all the time
122 | * rename checkPrefix to checkPrefixOk
123 | * add more documentation and doctests
124 | * refactor
125 | """
126 |
127 | __rcsid__ = '$Id: IPy.py 671 2004-08-22 21:02:29Z md $'
128 | __version__ = '0.42'
129 |
130 | import types
131 |
132 | # Definition of the Ranges for IPv4 IPs
133 | # this should include www.iana.org/assignments/ipv4-address-space
134 | # and www.iana.org/assignments/multicast-addresses
135 | IPv4ranges = {
136 | '0' : 'PUBLIC', # fall back
137 | '00000000' : 'PRIVATE', # 0/8
138 | '00001010' : 'PRIVATE', # 10/8
139 | '01111111' : 'PRIVATE', # 127.0/8
140 | '1' : 'PUBLIC', # fall back
141 | '101011000001' : 'PRIVATE', # 172.16/12
142 | '1100000010101000' : 'PRIVATE', # 192.168/16
143 | '11011111' : 'RESERVED', # 223/8
144 | '111' : 'RESERVED' # 224/3
145 | }
146 |
147 | # Definition of the Ranges for IPv6 IPs
148 | # see also www.iana.org/assignments/ipv6-address-space,
149 | # www.iana.org/assignments/ipv6-tla-assignments,
150 | # www.iana.org/assignments/ipv6-multicast-addresses,
151 | # www.iana.org/assignments/ipv6-anycast-addresses
152 | IPv6ranges = {
153 | '00000000' : 'RESERVED', # ::/8
154 | '00000001' : 'UNASSIGNED', # 100::/8
155 | '0000001' : 'NSAP', # 200::/7
156 | '0000010' : 'IPX', # 400::/7
157 | '0000011' : 'UNASSIGNED', # 600::/7
158 | '00001' : 'UNASSIGNED', # 800::/5
159 | '0001' : 'UNASSIGNED', # 1000::/4
160 | '0010000000000000' : 'RESERVED', # 2000::/16 Reserved
161 | '0010000000000001' : 'ASSIGNABLE', # 2001::/16 Sub-TLA Assignments [RFC2450]
162 | '00100000000000010000000': 'ASSIGNABLE IANA', # 2001:0000::/29 - 2001:01F8::/29 IANA
163 | '00100000000000010000001': 'ASSIGNABLE APNIC', # 2001:0200::/29 - 2001:03F8::/29 APNIC
164 | '00100000000000010000010': 'ASSIGNABLE ARIN', # 2001:0400::/29 - 2001:05F8::/29 ARIN
165 | '00100000000000010000011': 'ASSIGNABLE RIPE', # 2001:0600::/29 - 2001:07F8::/29 RIPE NCC
166 | '0010000000000010' : '6TO4', # 2002::/16 "6to4" [RFC3056]
167 | '0011111111111110' : '6BONE', # 3FFE::/16 6bone Testing [RFC2471]
168 | '0011111111111111' : 'RESERVED', # 3FFF::/16 Reserved
169 | '010' : 'GLOBAL-UNICAST', # 4000::/3
170 | '011' : 'UNASSIGNED', # 6000::/3
171 | '100' : 'GEO-UNICAST', # 8000::/3
172 | '101' : 'UNASSIGNED', # A000::/3
173 | '110' : 'UNASSIGNED', # C000::/3
174 | '1110' : 'UNASSIGNED', # E000::/4
175 | '11110' : 'UNASSIGNED', # F000::/5
176 | '111110' : 'UNASSIGNED', # F800::/6
177 | '1111110' : 'UNASSIGNED', # FC00::/7
178 | '111111100' : 'UNASSIGNED', # FE00::/9
179 | '1111111010' : 'LINKLOCAL', # FE80::/10
180 | '1111111011' : 'SITELOCAL', # FEC0::/10
181 | '11111111' : 'MULTICAST', # FF00::/8
182 | '0' * 96 : 'IPV4COMP', # ::/96
183 | '0' * 80 + '1' * 16 : 'IPV4MAP', # ::FFFF:0:0/96
184 | '0' * 128 : 'UNSPECIFIED', # ::/128
185 | '0' * 127 + '1' : 'LOOPBACK' # ::1/128
186 | }
187 |
188 |
189 | class IPint:
190 | """Handling of IP addresses returning integers.
191 |
192 | Use class IP instead because some features are not implemented for
193 | IPint."""
194 |
195 | def __init__(self, data, ipversion = 0):
196 | """Create an instance of an IP object.
197 |
198 | Data can be a network specification or a single IP. IP
199 | Addresses can be specified in all forms understood by
200 | parseAddress(). The size of a network can be specified as
201 |
202 | /prefixlen a.b.c.0/24 2001:658:22a:cafe::/64
203 | -lastIP a.b.c.0-a.b.c.255 2001:658:22a:cafe::-2001:658:22a:cafe:ffff:ffff:ffff:ffff
204 | /decimal netmask a.b.c.d/255.255.255.0 not supported for IPv6
205 |
206 | If no size specification is given a size of 1 address (/32 for
207 | IPv4 and /128 for IPv6) is assumed.
208 |
209 | >>> print IP('127.0.0.0/8')
210 | 127.0.0.0/8
211 | >>> print IP('127.0.0.0/255.0.0.0')
212 | 127.0.0.0/8
213 | >>> print IP('127.0.0.0-127.255.255.255')
214 | 127.0.0.0/8
215 |
216 | See module documentation for more examples.
217 | """
218 |
219 | self.NoPrefixForSingleIp = 1 # Print no Prefixlen for /32 and /128
220 | self.WantPrefixLen = None # Do we want prefix printed by default? see _printPrefix()
221 |
222 | netbits = 0
223 | prefixlen = -1
224 |
225 | # handling of non string values in constructor
226 | if type(data) == types.IntType or type(data) == types.LongType:
227 | self.ip = long(data)
228 | if ipversion == 0:
229 | if self.ip < 0x100000000L:
230 | ipversion = 4
231 | else:
232 | ipversion = 6
233 | if ipversion == 4:
234 | prefixlen = 32
235 | elif ipversion == 6:
236 | prefixlen = 128
237 | else:
238 | raise ValueError, "only IPv4 and IPv6 supported"
239 | self._ipversion = ipversion
240 | self._prefixlen = prefixlen
241 | # handle IP instance as an parameter
242 | elif isinstance(data, IPint):
243 | self._ipversion = data._ipversion
244 | self._prefixlen = data._prefixlen
245 | self.ip = data.ip
246 | else:
247 | # TODO: refactor me!
248 | # splitting of a string into IP and prefixlen et. al.
249 | x = data.split('-')
250 | if len(x) == 2:
251 | # a.b.c.0-a.b.c.255 specification ?
252 | (ip, last) = x
253 | (self.ip, parsedVersion) = parseAddress(ip)
254 | if parsedVersion != 4:
255 | raise ValueError, "first-last notation only allowed for IPv4"
256 | (last, lastversion) = parseAddress(last)
257 | if lastversion != 4:
258 | raise ValueError, "last address should be IPv4, too"
259 | if last < self.ip:
260 | raise ValueError, "last address should be larger than first"
261 | size = last - self.ip
262 | netbits = _count1Bits(size)
263 | elif len(x) == 1:
264 | x = data.split('/')
265 | # if no prefix is given use defaults
266 | if len(x) == 1:
267 | ip = x[0]
268 | prefixlen = -1
269 | elif len(x) > 2:
270 | raise ValueError, "only one '/' allowed in IP Address"
271 | else:
272 | (ip, prefixlen) = x
273 | if prefixlen.find('.') != -1:
274 | # check if the user might have used a netmask like
275 | # a.b.c.d/255.255.255.0
276 | (netmask, vers) = parseAddress(prefixlen)
277 | if vers != 4:
278 | raise ValueError, "netmask must be IPv4"
279 | prefixlen = _netmaskToPrefixlen(netmask)
280 | elif len(x) > 2:
281 | raise ValueError, "only one '-' allowed in IP Address"
282 | else:
283 | raise ValueError, "can't parse"
284 |
285 | (self.ip, parsedVersion) = parseAddress(ip)
286 | if ipversion == 0:
287 | ipversion = parsedVersion
288 | if prefixlen == -1:
289 | if ipversion == 4:
290 | prefixlen = 32 - netbits
291 | elif ipversion == 6:
292 | prefixlen = 128 - netbits
293 | else:
294 | raise ValueError, "only IPv4 and IPv6 supported"
295 | self._ipversion = ipversion
296 | self._prefixlen = int(prefixlen)
297 |
298 | if not _checkNetaddrWorksWithPrefixlen(self.ip, self._prefixlen, self._ipversion):
299 | raise ValueError, "%s goes not well with prefixlen %d" % (hex(self.ip), self._prefixlen)
300 |
301 |
302 | def int(self):
303 | """Return the first / base / network addess as an (long) integer.
304 |
305 | The same as IP[0].
306 |
307 | >>> hex(IP('10.0.0.0/8').int())
308 | '0xA000000L'
309 | """
310 | return self.ip
311 |
312 | def version(self):
313 | """Return the IP version of this Object.
314 |
315 | >>> IP('10.0.0.0/8').version()
316 | 4
317 | >>> IP('::1').version()
318 | 6
319 | """
320 | return self._ipversion
321 |
322 | def prefixlen(self):
323 | """Returns Network Prefixlen.
324 |
325 | >>> IP('10.0.0.0/8').prefixlen()
326 | 8
327 | """
328 | return self._prefixlen
329 |
330 | def net(self):
331 | """Return the base (first) address of a network as an (long) integer."""
332 |
333 | return self.int()
334 |
335 | def broadcast(self):
336 | """Return the broadcast (last) address of a network as an (long) integer.
337 |
338 | The same as IP[-1]."""
339 | return self.int() + self.len() - 1
340 |
341 | def _printPrefix(self, want):
342 | """Prints Prefixlen/Netmask.
343 |
344 | Not really. In fact it is our universal Netmask/Prefixlen printer.
345 | This is considered an internal function.
346 |
347 | want == 0 / None don't return anything 1.2.3.0
348 | want == 1 /prefix 1.2.3.0/24
349 | want == 2 /netmask 1.2.3.0/255.255.255.0
350 | want == 3 -lastip 1.2.3.0-1.2.3.255
351 | """
352 |
353 | if (self._ipversion == 4 and self._prefixlen == 32) or \
354 | (self._ipversion == 6 and self._prefixlen == 128):
355 | if self.NoPrefixForSingleIp:
356 | want = 0
357 | if want == None:
358 | want = self.WantPrefixLen
359 | if want == None:
360 | want = 1
361 | if want:
362 | if want == 2:
363 | # this should work with IP and IPint
364 | netmask = self.netmask()
365 | if type(netmask) != types.IntType and type(netmask) != types.LongType:
366 | netmask = netmask.int()
367 | return "/%s" % (intToIp(netmask, self._ipversion))
368 | elif want == 3:
369 | return "-%s" % (intToIp(self.ip + self.len() - 1, self._ipversion))
370 | else:
371 | # default
372 | return "/%d" % (self._prefixlen)
373 | else:
374 | return ''
375 |
376 | # We have different Favours to convert to:
377 | # strFullsize 127.0.0.1 2001:0658:022a:cafe:0200:c0ff:fe8d:08fa
378 | # strNormal 127.0.0.1 2001:658:22a:cafe:200:c0ff:fe8d:08fa
379 | # strCompressed 127.0.0.1 2001:658:22a:cafe::1
380 | # strHex 0x7F000001L 0x20010658022ACAFE0200C0FFFE8D08FA
381 | # strDec 2130706433 42540616829182469433547974687817795834
382 |
383 | def strBin(self, wantprefixlen = None):
384 | """Return a string representation as a binary value.
385 |
386 | >>> print IP('127.0.0.1').strBin()
387 | 01111111000000000000000000000001
388 | """
389 |
390 |
391 | if self._ipversion == 4:
392 | bits = 32
393 | elif self._ipversion == 6:
394 | bits = 128
395 | else:
396 | raise ValueError, "only IPv4 and IPv6 supported"
397 |
398 | if self.WantPrefixLen == None and wantprefixlen == None:
399 | wantprefixlen = 0
400 | ret = _intToBin(self.ip)
401 | return '0' * (bits - len(ret)) + ret + self._printPrefix(wantprefixlen)
402 |
403 | def strCompressed(self, wantprefixlen = None):
404 | """Return a string representation in compressed format using '::' Notation.
405 |
406 | >>> print IP('127.0.0.1').strCompressed()
407 | 127.0.0.1
408 | >>> print IP('2001:0658:022a:cafe:0200::1').strCompressed()
409 | 2001:658:22a:cafe:200::1
410 | """
411 |
412 | if self.WantPrefixLen == None and wantprefixlen == None:
413 | wantprefixlen = 1
414 |
415 | if self._ipversion == 4:
416 | return self.strFullsize(wantprefixlen)
417 | else:
418 | # find the longest sequence of '0'
419 | hextets = [int(x, 16) for x in self.strFullsize(0).split(':')]
420 | # every element of followingzeros will contain the number of zeros
421 | # following the corresponding element of hextets
422 | followingzeros = [0] * 8
423 | for i in range(len(hextets)):
424 | followingzeros[i] = _countFollowingZeros(hextets[i:])
425 | # compressionpos is the position where we can start removing zeros
426 | compressionpos = followingzeros.index(max(followingzeros))
427 | if max(followingzeros) > 1:
428 | # generate string with the longest number of zeros cut out
429 | # now we need hextets as strings
430 | hextets = [x for x in self.strNormal(0).split(':')]
431 | while compressionpos < len(hextets) and hextets[compressionpos] == '0':
432 | del(hextets[compressionpos])
433 | hextets.insert(compressionpos, '')
434 | if compressionpos + 1 >= len(hextets):
435 | hextets.append('')
436 | if compressionpos == 0:
437 | hextets = [''] + hextets
438 | return ':'.join(hextets) + self._printPrefix(wantprefixlen)
439 | else:
440 | return self.strNormal() + self._printPrefix(wantprefixlen)
441 |
442 | def strNormal(self, wantprefixlen = None):
443 | """Return a string representation in the usual format.
444 |
445 | >>> print IP('127.0.0.1').strNormal()
446 | 127.0.0.1
447 | >>> print IP('2001:0658:022a:cafe:0200::1').strNormal()
448 | 2001:658:22a:cafe:200:0:0:1
449 | """
450 |
451 | if self.WantPrefixLen == None and wantprefixlen == None:
452 | wantprefixlen = 1
453 |
454 | if self._ipversion == 4:
455 | ret = self.strFullsize(0)
456 | elif self._ipversion == 6:
457 | ret = ':'.join([hex(x)[2:] for x in [int(x, 16) for x in self.strFullsize(0).split(':')]])
458 | else:
459 | raise ValueError, "only IPv4 and IPv6 supported"
460 |
461 |
462 |
463 | return ret + self._printPrefix(wantprefixlen)
464 |
465 | def strFullsize(self, wantprefixlen = None):
466 | """Return a string representation in the non mangled format.
467 |
468 | >>> print IP('127.0.0.1').strFullsize()
469 | 127.0.0.1
470 | >>> print IP('2001:0658:022a:cafe:0200::1').strFullsize()
471 | 2001:0658:022a:cafe:0200:0000:0000:0001
472 | """
473 |
474 | if self.WantPrefixLen == None and wantprefixlen == None:
475 | wantprefixlen = 1
476 |
477 | return intToIp(self.ip, self._ipversion).lower() + self._printPrefix(wantprefixlen)
478 |
479 | def strHex(self, wantprefixlen = None):
480 | """Return a string representation in hex format.
481 |
482 | >>> print IP('127.0.0.1').strHex()
483 | 0x7F000001
484 | >>> print IP('2001:0658:022a:cafe:0200::1').strHex()
485 | 0x20010658022ACAFE0200000000000001
486 | """
487 |
488 | if self.WantPrefixLen == None and wantprefixlen == None:
489 | wantprefixlen = 0
490 |
491 | x = hex(self.ip)
492 | if x[-1] == 'L':
493 | x = x[:-1]
494 | return x + self._printPrefix(wantprefixlen)
495 |
496 | def strDec(self, wantprefixlen = None):
497 | """Return a string representation in decimal format.
498 |
499 | >>> print IP('127.0.0.1').strDec()
500 | 2130706433
501 | >>> print IP('2001:0658:022a:cafe:0200::1').strDec()
502 | 42540616829182469433547762482097946625
503 | """
504 |
505 | if self.WantPrefixLen == None and wantprefixlen == None:
506 | wantprefixlen = 0
507 |
508 | x = str(self.ip)
509 | if x[-1] == 'L':
510 | x = x[:-1]
511 | return x + self._printPrefix(wantprefixlen)
512 |
513 | def iptype(self):
514 | """Return a description of the IP type ('PRIVATE', 'RESERVERD', etc).
515 |
516 | >>> print IP('127.0.0.1').iptype()
517 | PRIVATE
518 | >>> print IP('192.168.1.1').iptype()
519 | PRIVATE
520 | >>> print IP('195.185.1.2').iptype()
521 | PUBLIC
522 | >>> print IP('::1').iptype()
523 | LOOPBACK
524 | >>> print IP('2001:0658:022a:cafe:0200::1').iptype()
525 | ASSIGNABLE RIPE
526 |
527 | The type information for IPv6 is out of sync with reality.
528 | """
529 |
530 | # this could be greatly improved
531 |
532 | if self._ipversion == 4:
533 | iprange = IPv4ranges
534 | elif self._ipversion == 6:
535 | iprange = IPv6ranges
536 | else:
537 | raise ValueError, "only IPv4 and IPv6 supported"
538 |
539 | bits = self.strBin()
540 | for i in range(len(bits), 0, -1):
541 | if iprange.has_key(bits[:i]):
542 | return iprange[bits[:i]]
543 | return "unknown"
544 |
545 |
546 | def netmask(self):
547 | """Return netmask as an integer.
548 |
549 | >>> print hex(IP('195.185.0.0/16').netmask().int())
550 | 0xFFFF0000L
551 | """
552 |
553 | # TODO: unify with prefixlenToNetmask?
554 | if self._ipversion == 4:
555 | locallen = 32 - self._prefixlen
556 | elif self._ipversion == 6:
557 | locallen = 128 - self._prefixlen
558 | else:
559 | raise ValueError, "only IPv4 and IPv6 supported"
560 |
561 | return ((2L ** self._prefixlen) - 1) << locallen
562 |
563 |
564 | def strNetmask(self):
565 | """Return netmask as an string. Mostly useful for IPv6.
566 |
567 | >>> print IP('195.185.0.0/16').strNetmask()
568 | 255.255.0.0
569 | >>> print IP('2001:0658:022a:cafe::0/64').strNetmask()
570 | /64
571 | """
572 |
573 | # TODO: unify with prefixlenToNetmask?
574 | if self._ipversion == 4:
575 | locallen = 32 - self._prefixlen
576 | return intToIp(((2L ** self._prefixlen) - 1) << locallen, 4)
577 | elif self._ipversion == 6:
578 | locallen = 128 - self._prefixlen
579 | return "/%d" % self._prefixlen
580 | else:
581 | raise ValueError, "only IPv4 and IPv6 supported"
582 |
583 | def len(self):
584 | """Return the length of an subnet.
585 |
586 | >>> print IP('195.185.1.0/28').len()
587 | 16
588 | >>> print IP('195.185.1.0/24').len()
589 | 256
590 | """
591 |
592 | if self._ipversion == 4:
593 | locallen = 32 - self._prefixlen
594 | elif self._ipversion == 6:
595 | locallen = 128 - self._prefixlen
596 | else:
597 | raise ValueError, "only IPv4 and IPv6 supported"
598 |
599 | return 2L ** locallen
600 |
601 |
602 | def __len__(self):
603 | """Return the length of an subnet.
604 |
605 | Called to implement the built-in function len().
606 | It breaks with IPv6 networks. Does anybody know how to fix this?"""
607 |
608 | # Python < 2.2 has this silly restriction which breaks IPv6
609 | # how about Python >= 2.2 ... ouch - it persists!
610 |
611 | return int(self.len())
612 |
613 |
614 | def __getitem__(self, key):
615 | """Called to implement evaluation of self[key].
616 |
617 | >>> ip=IP('127.0.0.0/30')
618 | >>> for x in ip:
619 | ... print hex(x.int())
620 | ...
621 | 0x7F000000L
622 | 0x7F000001L
623 | 0x7F000002L
624 | 0x7F000003L
625 | >>> hex(ip[2].int())
626 | '0x7F000002L'
627 | >>> hex(ip[-1].int())
628 | '0x7F000003L'
629 | """
630 |
631 | if type(key) != types.IntType and type(key) != types.LongType:
632 | raise TypeError
633 | if abs(key) >= self.len():
634 | raise IndexError
635 | if key < 0:
636 | key = self.len() - abs(key)
637 |
638 | return self.ip + long(key)
639 |
640 |
641 |
642 | def __contains__(self, item):
643 | """Called to implement membership test operators.
644 |
645 | Should return true if item is in self, false otherwise. Item
646 | can be other IP-objects, strings or ints.
647 |
648 | >>> print IP('195.185.1.1').strHex()
649 | 0xC3B90101
650 | >>> 0xC3B90101L in IP('195.185.1.0/24')
651 | 1
652 | >>> '127.0.0.1' in IP('127.0.0.0/24')
653 | 1
654 | >>> IP('127.0.0.0/24') in IP('127.0.0.0/25')
655 | 0
656 | """
657 |
658 | item = IP(item)
659 | if item.ip >= self.ip and item.ip < self.ip + self.len() - item.len() + 1:
660 | return 1
661 | else:
662 | return 0
663 |
664 |
665 | def overlaps(self, item):
666 | """Check if two IP address ranges overlap.
667 |
668 | Returns 0 if the two ranges don't overlap, 1 if the given
669 | range overlaps at the end and -1 if it does at the beginning.
670 |
671 | >>> IP('192.168.0.0/23').overlaps('192.168.1.0/24')
672 | 1
673 | >>> IP('192.168.0.0/23').overlaps('192.168.1.255')
674 | 1
675 | >>> IP('192.168.0.0/23').overlaps('192.168.2.0')
676 | 0
677 | >>> IP('192.168.1.0/24').overlaps('192.168.0.0/23')
678 | -1
679 | """
680 |
681 | item = IP(item)
682 | if item.ip >= self.ip and item.ip < self.ip + self.len():
683 | return 1
684 | elif self.ip >= item.ip and self.ip < item.ip + item.len():
685 | return -1
686 | else:
687 | return 0
688 |
689 |
690 | def __str__(self):
691 | """Dispatch to the prefered String Representation.
692 |
693 | Used to implement str(IP)."""
694 |
695 | return self.strFullsize()
696 |
697 |
698 | def __repr__(self):
699 | """Print a representation of the Object.
700 |
701 | Used to implement repr(IP). Returns a string which evaluates
702 | to an identical Object (without the wantprefixlen stuff - see
703 | module docstring).
704 |
705 | >>> print repr(IP('10.0.0.0/24'))
706 | IP('10.0.0.0/24')
707 | """
708 |
709 | return("IPint('%s')" % (self.strCompressed(1)))
710 |
711 |
712 | def __cmp__(self, other):
713 | """Called by comparison operations.
714 |
715 | Should return a negative integer if self < other, zero if self
716 | == other, a positive integer if self > other.
717 |
718 | Networks with different prefixlen are considered non-equal.
719 | Networks with the same prefixlen and differing addresses are
720 | considered non-equal but are compared by their base address
721 | integer value to aid sorting of IP objects.
722 |
723 | The Version of Objects is not put into consideration.
724 |
725 | >>> IP('10.0.0.0/24') > IP('10.0.0.0')
726 | 1
727 | >>> IP('10.0.0.0/24') < IP('10.0.0.0')
728 | 0
729 | >>> IP('10.0.0.0/24') < IP('12.0.0.0/24')
730 | 1
731 | >>> IP('10.0.0.0/24') > IP('12.0.0.0/24')
732 | 0
733 |
734 | """
735 |
736 | # Im not really sure if this is "the right thing to do"
737 | if self._prefixlen < other.prefixlen():
738 | return (other.prefixlen() - self._prefixlen)
739 | elif self._prefixlen > other.prefixlen():
740 |
741 | # Fixed by Samuel Krempp:
742 |
743 | # The bug is quite obvious really (as 99% bugs are once
744 | # spotted, isn't it ? ;-) Because of precedence of
745 | # multiplication by -1 over the subtraction, prefixlen
746 | # differences were causing the __cmp__ function to always
747 | # return positive numbers, thus the function was failing
748 | # the basic assumptions for a __cmp__ function.
749 |
750 | # Namely we could have (a > b AND b > a), when the
751 | # prefixlen of a and b are different. (eg let
752 | # a=IP("1.0.0.0/24"); b=IP("2.0.0.0/16");) thus, anything
753 | # could happen when launching a sort algorithm..
754 | # everything's in order with the trivial, attached patch.
755 |
756 | return (self._prefixlen - other.prefixlen()) * -1
757 | else:
758 | if self.ip < other.ip:
759 | return -1
760 | elif self.ip > other.ip:
761 | return 1
762 | else:
763 | return 0
764 |
765 |
766 | def __hash__(self):
767 | """Called for the key object for dictionary operations, and by
768 | the built-in function hash(). Should return a 32-bit integer
769 | usable as a hash value for dictionary operations. The only
770 | required property is that objects which compare equal have the
771 | same hash value
772 |
773 | >>> hex(IP('10.0.0.0/24').__hash__())
774 | '0xf5ffffe7'
775 | """
776 |
777 | thehash = int(-1)
778 | ip = self.ip
779 | while ip > 0:
780 | thehash = thehash ^ (ip & 0x7fffffff)
781 | ip = ip >> 32
782 | thehash = thehash ^ self._prefixlen
783 | return int(thehash)
784 |
785 |
786 | class IP(IPint):
787 | """Class for handling IP Addresses and Networks."""
788 |
789 | def net(self):
790 | """Return the base (first) address of a network as an IP object.
791 |
792 | The same as IP[0].
793 |
794 | >>> IP('10.0.0.0/8').net()
795 | IP('10.0.0.0')
796 | """
797 | return IP(IPint.net(self))
798 |
799 | def broadcast(self):
800 | """Return the broadcast (last) address of a network as an IP object.
801 |
802 | The same as IP[-1].
803 |
804 | >>> IP('10.0.0.0/8').broadcast()
805 | IP('10.255.255.255')
806 | """
807 | return IP(IPint.broadcast(self))
808 |
809 | def netmask(self):
810 | """Return netmask as an IP object.
811 |
812 | >>> IP('10.0.0.0/8').netmask()
813 | IP('255.0.0.0')
814 | """
815 | return IP(IPint.netmask(self))
816 |
817 |
818 | def reverseNames(self):
819 | """Return a list with values forming the reverse lookup.
820 |
821 | >>> IP('213.221.113.87/32').reverseNames()
822 | ['87.113.221.213.in-addr.arpa.']
823 | >>> IP('213.221.112.224/30').reverseNames()
824 | ['224.112.221.213.in-addr.arpa.', '225.112.221.213.in-addr.arpa.', '226.112.221.213.in-addr.arpa.', '227.112.221.213.in-addr.arpa.']
825 | >>> IP('127.0.0.0/24').reverseNames()
826 | ['0.0.127.in-addr.arpa.']
827 | >>> IP('127.0.0.0/23').reverseNames()
828 | ['0.0.127.in-addr.arpa.', '1.0.127.in-addr.arpa.']
829 | >>> IP('127.0.0.0/16').reverseNames()
830 | ['0.127.in-addr.arpa.']
831 | >>> IP('127.0.0.0/15').reverseNames()
832 | ['0.127.in-addr.arpa.', '1.127.in-addr.arpa.']
833 | >>> IP('128.0.0.0/8').reverseNames()
834 | ['128.in-addr.arpa.']
835 | >>> IP('128.0.0.0/7').reverseNames()
836 | ['128.in-addr.arpa.', '129.in-addr.arpa.']
837 |
838 | """
839 |
840 | if self._ipversion == 4:
841 | ret =[]
842 | # TODO: Refactor. Add support for IPint objects
843 | if self.len() < 2**8:
844 | for x in self:
845 | ret.append(x.reverseName())
846 | elif self.len() < 2**16L:
847 | for i in range(0, self.len(), 2**8):
848 | ret.append(self[i].reverseName()[2:])
849 | elif self.len() < 2**24L:
850 | for i in range(0, self.len(), 2**16):
851 | ret.append(self[i].reverseName()[4:])
852 | else:
853 | for i in range(0, self.len(), 2**24):
854 | ret.append(self[i].reverseName()[6:])
855 | return ret
856 | elif self._ipversion == 6:
857 | s = hex(self.ip)[2:].lower()
858 | if s[-1] == 'l':
859 | s = s[:-1]
860 | if self._prefixlen % 4 != 0:
861 | raise NotImplementedError, "can't create IPv6 reverse names at sub nibble level"
862 | s = list(s)
863 | s.reverse()
864 | s = '.'.join(s)
865 | first_nibble_index = int(32 - (self._prefixlen / 4)) * 2
866 | return ["%s.ip6.int." % s[first_nibble_index:]]
867 | else:
868 | raise ValueError, "only IPv4 and IPv6 supported"
869 |
870 |
871 |
872 | def reverseName(self):
873 | """Return the value for reverse lookup/PTR records as RfC 2317 look alike.
874 |
875 | RfC 2317 is an ugly hack which only works for sub-/24 e.g. not
876 | for /23. Do not use it. Better set up a Zone for every
877 | address. See reverseName for a way to achieve that.
878 |
879 | >>> print IP('195.185.1.1').reverseName()
880 | 1.1.185.195.in-addr.arpa.
881 | >>> print IP('195.185.1.0/28').reverseName()
882 | 0-15.1.185.195.in-addr.arpa.
883 | """
884 |
885 | if self._ipversion == 4:
886 | s = self.strFullsize(0)
887 | s = s.split('.')
888 | s.reverse()
889 | first_byte_index = int(4 - (self._prefixlen / 8))
890 | if self._prefixlen % 8 != 0:
891 | nibblepart = "%s-%s" % (s[3-(self._prefixlen / 8)], intToIp(self.ip + self.len() - 1, 4).split('.')[-1])
892 | if nibblepart[-1] == 'l':
893 | nibblepart = nibblepart[:-1]
894 | nibblepart += '.'
895 | else:
896 | nibblepart = ""
897 |
898 | s = '.'.join(s[first_byte_index:])
899 | return "%s%s.in-addr.arpa." % (nibblepart, s)
900 |
901 | elif self._ipversion == 6:
902 | s = hex(self.ip)[2:].lower()
903 | if s[-1] == 'l':
904 | s = s[:-1]
905 | if self._prefixlen % 4 != 0:
906 | nibblepart = "%s-%s" % (s[self._prefixlen:], hex(self.ip + self.len() - 1)[2:].lower())
907 | if nibblepart[-1] == 'l':
908 | nibblepart = nibblepart[:-1]
909 | nibblepart += '.'
910 | else:
911 | nibblepart = ""
912 | s = list(s)
913 | s.reverse()
914 | s = '.'.join(s)
915 | first_nibble_index = int(32 - (self._prefixlen / 4)) * 2
916 | return "%s%s.ip6.int." % (nibblepart, s[first_nibble_index:])
917 | else:
918 | raise ValueError, "only IPv4 and IPv6 supported"
919 |
920 | def __getitem__(self, key):
921 | """Called to implement evaluation of self[key].
922 |
923 | >>> ip=IP('127.0.0.0/30')
924 | >>> for x in ip:
925 | ... print str(x)
926 | ...
927 | 127.0.0.0
928 | 127.0.0.1
929 | 127.0.0.2
930 | 127.0.0.3
931 | >>> print str(ip[2])
932 | 127.0.0.2
933 | >>> print str(ip[-1])
934 | 127.0.0.3
935 | """
936 | return IP(IPint.__getitem__(self, key))
937 |
938 | def __repr__(self):
939 | """Print a representation of the Object.
940 |
941 | >>> IP('10.0.0.0/8')
942 | IP('10.0.0.0/8')
943 | """
944 |
945 | return("IP('%s')" % (self.strCompressed(1)))
946 |
947 | def __add__(self, other):
948 | """Emulate numeric objects through network aggregation"""
949 | if self.prefixlen() != other.prefixlen():
950 | raise ValueError, "Only networks with the same prefixlen can be added."
951 | if self.prefixlen < 1:
952 | raise ValueError, "Networks with a prefixlen longer than /1 can't be added."
953 | if self.version() != other.version():
954 | raise ValueError, "Only networks with the same IP version can be added."
955 | if self > other:
956 | # fixed by Skinny Puppy
957 | return other.__add__(self)
958 | else:
959 | ret = IP(self.int())
960 | ret._prefixlen = self.prefixlen() - 1
961 | return ret
962 |
963 | def parseAddress(ipstr):
964 | """Parse a string and return the corrospondending IPaddress and the a guess of the IP version.
965 |
966 | The following forms are recognized:
967 | 0x0123456789abcdef # IPv4 if <= 0xffffffff else IPv6
968 | 123.123.123.123 # IPv4
969 | 123.123 # 0-padded IPv4
970 | 1080:0000:0000:0000:0008:0800:200C:417A
971 | 1080:0:0:0:8:800:200C:417A
972 | 1080:0::8:800:200C:417A
973 | ::1
974 | ::
975 | 0:0:0:0:0:FFFF:129.144.52.38
976 | ::13.1.68.3
977 | ::FFFF:129.144.52.38
978 | """
979 |
980 | # TODO: refactor me!
981 | if ipstr.startswith('0x'):
982 | ret = long(ipstr[2:], 16)
983 | if ret > 0xffffffffffffffffffffffffffffffffL:
984 | raise ValueError, "%r: IP Address can't be bigger than 2^128" % (ipstr)
985 | if ret < 0x100000000L:
986 | return (ret, 4)
987 | else:
988 | return (ret, 6)
989 |
990 | if ipstr.find(':') != -1:
991 | # assume IPv6
992 | if ipstr.find(':::') != -1:
993 | raise ValueError, "%r: IPv6 Address can't contain ':::'" % (ipstr)
994 | hextets = ipstr.split(':')
995 | if ipstr.find('.') != -1:
996 | # this might be a mixed address like '0:0:0:0:0:0:13.1.68.3'
997 | (v4, foo) = parseAddress(hextets[-1])
998 | assert foo == 4
999 | del(hextets[-1])
1000 | hextets.append(hex(v4 >> 16)[2:-1])
1001 | hextets.append(hex(v4 & 0xffff)[2:-1])
1002 | if len(hextets) > 8:
1003 | raise ValueError, "%r: IPv6 Address with more than 8 hexletts" % (ipstr)
1004 | if len(hextets) < 8:
1005 | if '' not in hextets:
1006 | raise ValueError, "%r IPv6 Address with less than 8 hexletts and without '::'" % (ipstr)
1007 | # catch :: at the beginning or end
1008 | if hextets.index('') < len(hextets) - 1 and hextets[hextets.index('')+1] == '':
1009 | hextets.remove('')
1010 | # catch '::'
1011 | if hextets.index('') < len(hextets) - 1 and hextets[hextets.index('')+1] == '':
1012 | hextets.remove('')
1013 |
1014 | for foo in range(9-len(hextets)):
1015 | hextets.insert(hextets.index(''), '0')
1016 | hextets.remove('')
1017 | if '' in hextets:
1018 | raise ValueError, "%r IPv6 Address may contain '::' only once" % (ipstr)
1019 | if '' in hextets:
1020 | raise ValueError, "%r IPv6 Address may contain '::' only if it has less than 8 hextets" % (ipstr)
1021 | num = ''
1022 | for x in hextets:
1023 | if len(x) < 4:
1024 | x = ((4 - len(x)) * '0') + x
1025 | if int(x, 16) < 0 or int(x, 16) > 0xffff:
1026 | raise ValueError, "%r: single hextet must be 0 <= hextet <= 0xffff which isn't true for %s" % (ipstr, x)
1027 | num += x
1028 | return (long(num, 16), 6)
1029 |
1030 | elif len(ipstr) == 32:
1031 | # assume IPv6 in pure hexadecimal notation
1032 | return (long(ipstr, 16), 6)
1033 |
1034 | elif ipstr.find('.') != -1 or (len(ipstr) < 4 and int(ipstr) < 256):
1035 | # assume IPv4 ('127' gets interpreted as '127.0.0.0')
1036 | bytes = ipstr.split('.')
1037 | if len(bytes) > 4:
1038 | raise ValueError, "IPv4 Address with more than 4 bytes"
1039 | bytes += ['0'] * (4 - len(bytes))
1040 | bytes = [long(x) for x in bytes]
1041 | for x in bytes:
1042 | if x > 255 or x < 0:
1043 | raise ValueError, "%r: single byte must be 0 <= byte < 256" % (ipstr)
1044 | return ((bytes[0] << 24) + (bytes[1] << 16) + (bytes[2] << 8) + bytes[3], 4)
1045 |
1046 | else:
1047 | # we try to interpret it as a decimal digit -
1048 | # this only works for numbers > 255 ... others
1049 | # will be interpreted as IPv4 first byte
1050 | ret = long(ipstr)
1051 | if ret > 0xffffffffffffffffffffffffffffffffL:
1052 | raise ValueError, "IP Address cant be bigger than 2^128"
1053 | if ret <= 0xffffffffL:
1054 | return (ret, 4)
1055 | else:
1056 | return (ret, 6)
1057 |
1058 |
1059 | def intToIp(ip, version):
1060 | """Transform an integer string into an IP address."""
1061 |
1062 | # just to be sure and hoping for Python 2.22
1063 | ip = long(ip)
1064 |
1065 | if ip < 0:
1066 | raise ValueError, "IPs can't be negative: %d" % (ip)
1067 |
1068 | ret = ''
1069 | if version == 4:
1070 | if ip > 0xffffffffL:
1071 | raise ValueError, "IPv4 Addresses can't be larger than 0xffffffff: %s" % (hex(ip))
1072 | for l in range(4):
1073 | ret = str(ip & 0xffL) + '.' + ret
1074 | ip = ip >> 8;
1075 | ret = ret[:-1]
1076 | elif version == 6:
1077 | if ip > 0xffffffffffffffffffffffffffffffffL:
1078 | raise ValueError, "IPv6 Addresses can't be larger than 0xffffffffffffffffffffffffffffffff: %s" % (hex(ip))
1079 | l = '0' * 32 + hex(ip)[2:-1]
1080 | for x in range(1,33):
1081 | ret = l[-x] + ret
1082 | if x % 4 == 0:
1083 | ret = ':' + ret
1084 | ret = ret[1:]
1085 | else:
1086 | raise ValueError, "only IPv4 and IPv6 supported"
1087 |
1088 | return ret;
1089 |
1090 | def _ipVersionToLen(version):
1091 | """Return number of bits in address for a certain IP version.
1092 |
1093 | >>> _ipVersionToLen(4)
1094 | 32
1095 | >>> _ipVersionToLen(6)
1096 | 128
1097 | >>> _ipVersionToLen(5)
1098 | Traceback (most recent call last):
1099 | File "", line 1, in ?
1100 | File "IPy.py", line 1076, in _ipVersionToLen
1101 | raise ValueError, "only IPv4 and IPv6 supported"
1102 | ValueError: only IPv4 and IPv6 supported
1103 | """
1104 |
1105 | if version == 4:
1106 | return 32
1107 | elif version == 6:
1108 | return 128
1109 | else:
1110 | raise ValueError, "only IPv4 and IPv6 supported"
1111 |
1112 |
1113 | def _countFollowingZeros(l):
1114 | """Return Nr. of elements containing 0 at the beginning th the list."""
1115 | if len(l) == 0:
1116 | return 0
1117 | elif l[0] != 0:
1118 | return 0
1119 | else:
1120 | return 1 + _countFollowingZeros(l[1:])
1121 |
1122 |
1123 | _BitTable = {'0': '0000', '1': '0001', '2': '0010', '3': '0011',
1124 | '4': '0100', '5': '0101', '6': '0110', '7': '0111',
1125 | '8': '1000', '9': '1001', 'a': '1010', 'b': '1011',
1126 | 'c': '1100', 'd': '1101', 'e': '1110', 'f': '1111'}
1127 |
1128 | def _intToBin(val):
1129 | """Return the binary representation of an integer as string."""
1130 |
1131 | if val < 0:
1132 | raise ValueError, "Only positive Values allowed"
1133 | s = hex(val).lower()
1134 | ret = ''
1135 | if s[-1] == 'l':
1136 | s = s[:-1]
1137 | for x in s[2:]:
1138 | if __debug__:
1139 | if not _BitTable.has_key(x):
1140 | raise AssertionError, "hex() returned strange result"
1141 | ret += _BitTable[x]
1142 | # remove leading zeros
1143 | while ret[0] == '0' and len(ret) > 1:
1144 | ret = ret[1:]
1145 | return ret
1146 |
1147 | def _count1Bits(num):
1148 | """Find the highest bit set to 1 in an integer."""
1149 | ret = 0
1150 | while num > 0:
1151 | num = num >> 1
1152 | ret += 1
1153 | return ret
1154 |
1155 | def _count0Bits(num):
1156 | """Find the highest bit set to 0 in an integer."""
1157 |
1158 | # this could be so easy if _count1Bits(~long(num)) would work as expected
1159 | num = long(num)
1160 | if num < 0:
1161 | raise ValueError, "Only positive Numbers please: %s" % (num)
1162 | ret = 0
1163 | while num > 0:
1164 | if num & 1 == 1:
1165 | break
1166 | num = num >> 1
1167 | ret += 1
1168 | return ret
1169 |
1170 |
1171 | def _checkPrefix(ip, prefixlen, version):
1172 | """Check the validity of a prefix
1173 |
1174 | Checks if the variant part of a prefix only has 0s, and the length is
1175 | correct.
1176 |
1177 | >>> _checkPrefix(0x7f000000L, 24, 4)
1178 | 1
1179 | >>> _checkPrefix(0x7f000001L, 24, 4)
1180 | 0
1181 | >>> repr(_checkPrefix(0x7f000001L, -1, 4))
1182 | 'None'
1183 | >>> repr(_checkPrefix(0x7f000001L, 33, 4))
1184 | 'None'
1185 | """
1186 |
1187 | # TODO: unify this v4/v6/invalid code in a function
1188 | bits = _ipVersionToLen(version)
1189 |
1190 | if prefixlen < 0 or prefixlen > bits:
1191 | return None
1192 |
1193 | if ip == 0:
1194 | zbits = bits + 1
1195 | else:
1196 | zbits = _count0Bits(ip)
1197 | if zbits < bits - prefixlen:
1198 | return 0
1199 | else:
1200 | return 1
1201 |
1202 |
1203 | def _checkNetmask(netmask, masklen):
1204 | """Checks if a netmask is expressable as e prefixlen."""
1205 |
1206 | num = long(netmask)
1207 | bits = masklen
1208 |
1209 | # remove zero bits at the end
1210 | while (num & 1) == 0:
1211 | num = num >> 1
1212 | bits -= 1
1213 | if bits == 0:
1214 | break
1215 | # now check if the rest consists only of ones
1216 | while bits > 0:
1217 | if (num & 1) == 0:
1218 | raise ValueError, "Netmask %s can't be expressed as an prefix." % (hex(netmask))
1219 | num = num >> 1
1220 | bits -= 1
1221 |
1222 |
1223 | def _checkNetaddrWorksWithPrefixlen(net, prefixlen, version):
1224 | """Check if a base addess of e network is compatible with a prefixlen"""
1225 | if net & _prefixlenToNetmask(prefixlen, version) == net:
1226 | return 1
1227 | else:
1228 | return 0
1229 |
1230 |
1231 | def _netmaskToPrefixlen(netmask):
1232 | """Convert an Integer reprsenting a Netmask to an prefixlen.
1233 |
1234 | E.g. 0xffffff00 (255.255.255.0) returns 24
1235 | """
1236 |
1237 | netlen = _count0Bits(netmask)
1238 | masklen = _count1Bits(netmask)
1239 | _checkNetmask(netmask, masklen)
1240 | return masklen - netlen
1241 |
1242 |
1243 | def _prefixlenToNetmask(prefixlen, version):
1244 | """Return a mask of n bits as a long integer.
1245 |
1246 | From 'IP address conversion functions with the builtin socket module' by Alex Martelli
1247 | http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66517
1248 | """
1249 | if prefixlen == 0:
1250 | return 0
1251 | elif prefixlen < 0:
1252 | raise ValueError, "Prefixlen must be > 0"
1253 | return ((2L<<prefixlen-1)-1) << (_ipVersionToLen(version) - prefixlen)
--------------------------------------------------------------------------------
/processing/create_result.py:
--------------------------------------------------------------------------------
20 | critere = [critere]
21 |
22 | with open(self.dbname + '_' + '_'.join(critere) + '.csv', 'w') as fw:
23 | for domaine in domaines:
24 | try:
25 | towrite = ''
26 | for key in critere:
27 | infos = domaine[key]
28 | if len(infos) > 0:
29 | if isinstance(infos, list):
30 | infos = ','.join(infos)
31 | towrite = towrite + ',' + str(infos)
32 |
33 | fw.write(towrite[1:] + '\n')
34 | except KeyError:
35 | print 'domaine: ' + str(domaine)
36 | except pymongo.errors.OperationFailure:
37 | print 'error mongo ' + str(domaine)
38 |
--------------------------------------------------------------------------------
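The CSV export loop above flattens one MongoDB document per row: list values are joined with ',', each selected key is appended after a comma, and the leading comma is stripped before the row is written. A standalone sketch of that row-building step (not part of the repository, shown only to illustrate the flattening):

    # hypothetical helper reproducing the loop body above
    def build_row(domaine, keys):
        towrite = ''
        for key in keys:
            infos = domaine[key]
            if len(infos) > 0:
                if isinstance(infos, list):
                    infos = ','.join(infos)
                towrite = towrite + ',' + str(infos)
        return towrite[1:]

    print build_row({'domaine': 'example.com', 'ips': ['1.2.3.4', '5.6.7.8']}, ['domaine', 'ips'])
    # -> example.com,1.2.3.4,5.6.7.8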
/processing/createcorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from mongodb import mongodb
3 | import sys
4 | import filters
5 | db=sys.argv[1]
6 | mdb=mongodb.mongodb('localhost',27017,db)
7 |
8 | i=0
9 |
10 | with open(db+'_domaine.txt','w') as fw:
11 | fw.write('**** *domaine\n')
12 | for domaine in mdb.selectall('metadatas'):
13 | fw.write(domaine['domaine'])
14 | fw.write('\n')
15 | with open(db+'_metadatas.txt','w') as fw:
16 | fw.write('**** *metadata\n')
17 | for domaine in mdb.selectall('metadatas'):
18 | meta=domaine['meta']
19 | for filt in filters.filters_metadata:
20 | meta=meta.replace(filt,'')
21 | meta=meta.replace(filt.swapcase(),'')
22 | fw.write(meta.encode('ascii','ignore'))
23 | fw.write('\n')
24 |
25 |
26 |
27 | fw.close()
28 |
29 |
30 |
--------------------------------------------------------------------------------
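The script above is invoked with the target database as its only argument and writes two corpus files, e.g. (placeholder database name):

    python createcorpus.py osint_demo
    # writes osint_demo_domaine.txt and osint_demo_metadatas.txt from the 'metadatas' collection

gouv_domaine.txt and gouv_metadatas.txt further down in this tree are presumably the output of such a run against a 'gouv' database.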
/processing/dnstree.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Dec 20, 2012
3 |
4 | @author: slarinier
5 | '''
6 | from pymongo import MongoClient
7 | from pyfaup.faup import Faup
8 |
9 |
10 | class DNSTree(object):
11 | '''
12 | classdocs
13 | '''
14 |
15 | def __init__(self, db_value):
16 | '''
17 | Constructor
18 | '''
19 | connection = MongoClient(host='localhost', port=27017)
20 | self.db = connection[db_value]
21 |
22 | def process(self):
23 | list_domains = self.db['new_domaines'].distinct('domaine')
24 | fex = Faup()
25 | for domain in list_domains:
26 | url = 'http://' + str(domain)
27 | fex.decode(url)
28 |
29 | try:
30 | print (
31 | fex.get_tld() + ',' + fex.get_domain() + ',' + ','.join(fex.get_subdomain().split('.')[::-1]).replace('www',
32 | '')).replace(
33 | ',,', ',')
34 | except:
35 | pass
--------------------------------------------------------------------------------
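A minimal driver for the class above (not part of the repository; it assumes pymongo and pyfaup are installed, a MongoDB server on localhost:27017, and a populated 'new_domaines' collection in the placeholder database 'osint_demo'):

    from processing.dnstree import DNSTree

    # prints one "tld,domain,reversed subdomain labels" line per distinct domain
    tree = DNSTree('osint_demo')
    tree.process()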
/processing/filters.py:
--------------------------------------------------------------------------------
1 | filters_metadata=['charset','text','iso','html','-8859-1','www','fr']
2 |
--------------------------------------------------------------------------------
/processing/filters.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/processing/filters.pyc
--------------------------------------------------------------------------------
/processing/gouv.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/processing/gouv.log
--------------------------------------------------------------------------------
/processing/gouv_domaine.txt:
--------------------------------------------------------------------------------
1 | **** *domaine
2 | legifrance.gouv.fr
3 | www.eure.sit.gouv.fr
4 | archives.livreblancdefenseetsecurite.gouv.fr
5 | www.yonne.sit.gouv.fr
6 | www.internet.gouv.fr
7 | www.direct-fr.com
8 | forum.webmaster-rank.info
9 | www.drees.sante.gouv.fr
10 | www.impots.gouv.fr
11 | www.oncfs.gouv.fr
12 | www.legifrance.gouv.fr
13 | www.interieur.gouv.fr
14 | www.refondonslecole.gouv.fr
15 | www.immigration.gouv.fr
16 | www.dmp.gouv.fr
17 | www.vendee.gouv.fr
18 | www.gouvernement.fr
19 | direccte.gouv.fr
20 | service-public.fr
21 | www.jeunes.gouv.fr
22 | vosdroits.service-public.fr
23 | www.tourisme.gouv.fr
24 | www.service-public.fr
25 | archives.forum.gouv.fr
26 | www.banqoutils.education.gouv.fr
27 | www2.impots.gouv.fr
28 | www.sgdn.gouv.fr
29 | www.cerpet.education.gouv.fr
30 | www.legifrance.org
31 | www-int.dmp.gouv.fr
32 | www.hcst.fr
33 | www.publinetd5.education.fr
34 | archives.dividende-numerique.fr
35 | cpcnu.fr
36 | www.dicod.defense.gouv.fr
37 | www.legifrance.com
38 | delegation.internet.gouv.fr
39 | www.clemi.org
40 | www-prod.sante.gouv.fr
41 | archives.europe.gouv.fr
42 | www.sante.gouv.fr
43 | www.nord.gouv.fr
44 | www.ove-national.education.fr
45 | www.2011-annee-droits-patients.sante.gouv.fr
46 | org-www.sante.gouv.fr
47 | www.emsome.terre.defense.gouv.fr
48 | www.rt519.terre.defense.gouv.fr
49 | archives.surfez-intelligent.gouv.fr
50 | www.restructurations.defense.gouv.fr
51 | www.bretagne.pref.gouv.fr
52 | www.bca13.terre.defense.gouv.fr
53 | ons.education.gouv.fr
54 | www.juryeps.education.fr
55 | 160.92.162.230
56 | www.publinetde.education.fr
57 | publinetce2.education.fr
58 | www.cdj59.org
59 | www.ancien.eure.pref.gouv.fr
60 | www.drdjs-lorraine.jeunesse-sports.gouv.fr
61 | www.hce.education.fr
62 | www.cpcnu.fr
63 | www.bretagne.drjscs.gouv.fr
64 | www.centre.drjscs.gouv.fr
65 | www.paca.drjscs.gouv.fr
66 | www.auvergne.drjscs.gouv.fr
67 | www.observatoire-parite.gouv.fr
68 | www.rama3.terre.defense.gouv.fr
69 | www.securite-sociale.fr
70 | www.haute-normandie.drjscs.gouv.fr
71 | www.bilrif.sga.defense.gouv.fr
72 | www.drjscs.gouv.fr
73 | www.aquitaine.drjscs.gouv.fr
74 | www.sports.gouv.fr
75 | www.anesm.sante.gouv.fr
76 | www.cesat.terre.defense.gouv.fr
77 | www.franche-comte.drjscs.gouv.fr
78 | www.garnison-besancon.terre.defense.gouv.fr
79 | www.etrs.terre.defense.gouv.fr
80 | www.bca7.terre.defense.gouv.fr
81 | www.lorraine.drjscs.gouv.fr
82 | www.midi-pyrenees.drjscs.gouv.fr
83 | www.rhone-alpes.drjscs.gouv.fr
84 | fr.webmaster-rank.info
85 | it.webmaster-rank.info
86 | easy404.webmaster-rank.info
87 | www.webmaster-rank.info
88 | www.pyrenees-atlantiques.pref.gouv.fr
89 | www.vigicrues.developpement-durable.gouv.fr
90 | www.memoiredeshommes.sga.defense.gouv.fr
91 | www.vigicrues.ecologie.gouv.fr
92 | www.servicehistorique.sga.defense.gouv.fr
93 | www.basse-normandie.pref.gouv.fr
94 | www.loire.pref.gouv.fr
95 | www.oise.pref.gouv.fr
96 | www.dordogne.pref.gouv.fr
97 | dordogne.pref.gouv.fr
98 | www.morbihan.pref.gouv.fr
99 | formation.oncfs.gouv.fr
100 | www.martinique.pref.gouv.fr
101 | www.drome.pref.gouv.fr
102 | loire.gouv.fr
103 | www.tarn-et-garonne.pref.gouv.fr
104 | www.cada.fr
105 | www.loire.gouv.fr
106 | m.geoportail.fr
107 | www.conseilculturel-upm.gouv.fr
108 | www.essonne.gouv.fr
109 | archives.internet.gouv.fr
110 | www.like-rank.com
111 | www.drogues.gouv.fr
112 | www.nord.pref.gouv.fr
113 | drogues.gouv.fr
114 | www.haute-saone.pref.gouv.fr
115 | www.maine-et-loire.pref.gouv.fr
116 | www.gopher.com
117 | www.ariege.pref.gouv.fr
118 | www2.direct-fr.com
119 | www.pyrenees-orientales.pref.gouv.fr
120 | www.haut-rhin.pref.gouv.fr
121 | www.isere.pref.gouv.fr
122 | www.somme.pref.gouv.fr
123 | search.kiwee.com
124 | ardeche.pref.gouv.fr
125 | vendee.gouv.fr
126 | www.franche-comte.pref.gouv.fr
127 | org-www.impots.gouv.fr
128 | www.contact.impots.gouv.fr
129 | interne.impots.gouv.fr
130 | contacts.impots.gouv.fr
131 | www.champagne-ardenne.pref.gouv.fr
132 | www.cartocrime.net
133 | www.guadeloupe.dieccte.gouv.fr
134 | www.guyane.dieccte.gouv.fr
135 | www.auvergne.direccte.gouv.fr
136 | www.boamp.fr
137 | www.paca.direccte.gouv.fr
138 | www.gers.pref.gouv.fr
139 | www.savoie.pref.gouv.fr
140 | www.ladocumentationfrancaise.fr
141 | www.vie-publique.fr
142 | www.pme.service-public.fr
143 | www.direccte.gouv.fr
144 | www.alsace.direccte.gouv.fr
145 | www.marne.pref.gouv.fr
146 | lannuaire.service-public.fr
147 | www.corse.direccte.gouv.fr
148 | www.ddjs-ardennes.jeunesse-sports.gouv.fr
149 | www.correze.pref.gouv.fr
150 | www.centre.pref.gouv.fr
151 | landes.pref.gouv.fr
152 | www.recrutement.terre.defense.gouv.fr
153 | search.zip2.com
154 | www.nievre.pref.gouv.fr
155 | www.contacts.impots.gouv.fr
156 | search.firstplace.com
157 | www.poitou-charentes.direccte.gouv.fr
158 | www.commentcamarche.net
159 | www.idf.direccte.gouv.fr
160 | www.ardennes.pref.gouv.fr
161 | www.pays-de-la-loire.direccte.gouv.fr
162 | www.mayotte.dieccte.gouv.fr
163 | experts-univers.com
164 | m.vosdroits.service-public.fr
165 | communaute.vie-publique.fr
166 | discours.vie-publique.fr
167 | interactif.service-public.fr
168 | pme.service-public.fr
169 | www.bourgogne.direccte.gouv.fr
170 | sciencespo.ladocumentationfrancaise.fr
171 | environnement-sante.com
172 | incredimailhosted.infospace.com
173 | mamma.infospace.com
174 | www.concours-civils.defense.gouv.fr
175 | www.concours-civils.sga.defense.gouv.fr
176 | www.leroustidou.com
177 | www.ri92.terre.defense.gouv.fr
178 | www.gites-erable-alsace.com
179 | www.formation.terre.defense.gouv.fr
180 | 90plan.ovh.net
181 | www.sante-environnement-travail.fr
182 | dmp.gouv.fr
183 | ladsetjockeys-lefilm.fr
184 | www.sante-environnement.fr
185 | www.topfouine.com
186 | www.basse-normandie.direccte.gouv.fr
187 | www.laubergine-eygalieres.com
188 | www.bretagne.direccte.gouv.fr
189 | www.meuse.pref.gouv.fr
190 | www.bilrif.defense.gouv.fr
191 | www.antoine.fr
192 | www.terredebruyere.com
193 | www.beghingroux.fr
194 | www.auberge-provencale.fr
195 | www.soirsdefetes.com
196 | www.telestock.fr
197 | sante-environnement.org
198 | www.chaletliotard.fr
199 | www.cars-la-populaire.com
200 | environnement-sante.org
201 | www.sermesdistribution.fr
202 | www.camping-la-pinede.com
203 | patisserieolivierbourau.com
204 | www.gourmets-events.com
205 | www.environnement-sante.net
206 | www.environnement-sante.fr
207 | www.alliancepavillons.org
208 | atelierfeesbrodeuses.fr
209 | www.dermophilindien-lab.com
210 | www.la-cabane-perchee.com
211 | www.aeta-audio.com
212 | www.sahlm79.fr
213 | www.ba118.air.defense.gouv.fr
214 | www.cclinouest.com
215 | www.rg3.terre.defense.gouv.fr
216 | www.iserba.fr
217 | www.fantasyforest.fr
218 | www.televitale.fr
219 | www.serialproducteurs.com
220 | www.ville-saintdie.fr
221 | www.coiffure2010.com
222 | www.cehd.sga.defense.gouv.fr
223 | www.varini.org
224 | www.ain.pref.gouv.fr
225 | www.beauregard-hotel.com
226 | www.transports-bernard.com
227 | www.tattootatouage.com
228 | www.automobile2010.com
229 | www.eetaa722.air.defense.gouv.fr
230 | coiffure2008.com
231 | www.nettoyagebijoux.com
232 | www.stages.defense.gouv.fr
233 | coupe-de-cheveux-homme.com
234 | www.coupedecheveuxfemme.com
235 | www.ba901.air.defense.gouv.fr
236 | www.ba106.air.defense.gouv.fr
237 | www.ba120.air.defense.gouv.fr
238 | www.coiffure2008.com
239 | www.web200708.clarahost.fr
240 | www.beautedeco.com
241 | www.qcclick.com
242 | coiffure2009.com
243 | www.epa749.air.defense.gouv.fr
244 | www.vpgreen.fr
245 | www.bcsfreelance.com
246 | www.lechaletdumoulin.fr
247 | www.media.recrutement.terre.defense.gouv.fr
248 | www.photo-phore.com
249 | www.marocchezlhabitant.com
250 | www.industube.com
251 | www.georget.fr
252 | www.acrie.fr
253 | mobile.recrutement.terre.defense.gouv.fr
254 | www.ba942.air.defense.gouv.fr
255 | www.hotelsatlas.com
256 | www.pharmacie-de-lherm.fr
257 | www.rpmi.fr
258 | 87.106.4.168
259 | www.ba107.air.defense.gouv.fr
260 | www.enligne.recrutement.terre.defense.gouv.fr
261 | www.hotel-st-georges.com
262 | www.ville-challans.fr
263 | www.ba217.air.defense.gouv.fr
264 | www.airmobilite.air.defense.gouv.fr
265 | www.ba721.air.defense.gouv.fr
266 | www.palmiers-ocean.fr
267 | www.quellemutuelles.com
268 | www.cfas.air.defense.gouv.fr
269 | www.ba112.air.defense.gouv.fr
270 | www.cma-bareges.air.defense.gouv.fr
271 | www.da204.air.defense.gouv.fr
272 | ead.ent-etrs.net
273 | www.ent-etrs.net
274 | www.eppa.sante.defense.gouv.fr
275 | www.plasti-ouest.com
276 | pharmacieduvalsaintjean.e-officine.net
277 | www.cedimattp.fr
278 | www.machecoul.com
279 | www.tsr-be.com
280 | pharmaciecentralelens.e-officine.net
281 | www.reseauetudiant.com
282 | twitter-icon.com
283 | search.egreetings.com
284 | www.ado.justice.gouv.fr
285 | www.experatoo.com
286 | www.journaldunet.com
287 | www.annuaires.justice.gouv.fr
288 | www.coiffures2011.net
289 | www.saint-martin-de-sanzay.fr
290 | www.puregourmandise.com
291 | www.yatoshi.com
292 | www.techniques-transparentes.com
293 | vecteurdiffusion.com
294 | www.domaine-sainteleocadie.com
295 | www.lejulesverne-paris.com
296 | www.lewistrondheim.com
297 | arnaudfrichphoto.com
298 | www.cdad-lot.justice.fr
299 | www.cdad-manche.justice.fr
300 | www.metiers.justice.gouv.fr
301 | affinitiz.net
302 | www.alerte-enlevement.gouv.fr
303 | www.ca-paris.justice.fr
304 | www.ciao.fr
305 | www.rip.justice.fr
306 | www.ca-besancon.justice.fr
307 | www.fontainedemars.com
308 | www.ca-bourges.justice.fr
309 | www.cdad-cotedor.justice.fr
310 | www.ca-aixenprovence.justice.fr
311 | www.holiprom.com
312 | www.western-valley.fr
313 | www.infoceane.com
314 | www.bateaux-mouches.fr
315 | www.justice.gouv.fr
316 | www.alaindelorme.com
317 | avocats.fr
318 | anissaledorze.avocats.fr
319 | www.vos-droits.justice.gouv.fr
320 | cmonatelier.cultura.com
321 | isabelle.chevalier-dupont.avocats.fr
322 | reseau.avf.asso.fr
323 | www.ca-amiens.justice.fr
324 | www.boutique-clubdsk.fr
325 | www.noube.fr
326 | www.ca-chambery.justice.fr
327 | www.eng.justice.fr
328 | www.ca-versailles.justice.fr
329 | servirlafrance.com
330 | www.animalnature.fr
331 | reseaulia.com
332 | selli-vine.avocats.fr
333 | kityuko.42stores.com
334 | couturejihanny.42stores.com
335 | www.ca-angers.justice.fr
336 | www.setzaomi.com
337 | www.editions-infini.fr
338 | www.lineab1.fr
339 | corinegaudilliere.avocats.fr
340 | planete-volontaires.fr
341 | blogs.jardiner-malin.fr
342 | loisicrea.com
343 | www.cevennescaravanes.com
344 | www.colorme.ch
345 | affinitiz.com
346 | parentsindignes.42stores.com
347 | www.suite23.fr
348 | www.1bijoux2perles.fr
349 | www.mecaservice.com
350 | www.ptfp.fr
351 | www.nosfell.com
352 | cheminsblancs.com
353 | cubexar.com
354 | www.jetaide.com
355 | forum-centres-d-appels.com
356 | www.avocatforum.com
357 | jetaide.com
358 | www.manzi.be
359 | www.cabasse.com
360 | candyshop.42stores.com
361 | kits-n-scrap.42stores.com
362 | www.lecoinplaisir.com
363 | www.swingromaneacademie.com
364 | www.limprimeur.net
365 | www.fert-demolition.com
366 | www.eguiazabal.com
367 | www.chacunsonchemin.com
368 | www.normanniae.com
369 | www.ot-saverne.fr
370 | www.poleressources95.org
371 | 720plan.ovh.net
372 | www.ba116.air.defense.gouv.fr
373 | www.ypluthier.com
374 | marina-erbarossa.com
375 | www.lamy-diffusion.com
376 | www.ba125.air.defense.gouv.fr
377 | www.leganet.fr
378 | constat-huissier.net
379 | information-juridique.com
380 | famillesdavant.linternaute.com
381 | msn.ciao.fr
382 | ecran-de-veille.linternaute.com
383 | www.forum-entreprise.com
384 | www.cgv-expert.fr
385 | webcam.linternaute.com
386 | programme-tv.linternaute.com
387 | www.guyane.pref.gouv.fr
388 | www.conseil-juridique.net
389 | www.action-collective.com
390 | polardiagram.com
391 | encyclopedie.linternaute.com
392 | www.legavox.fr
393 | site.journaldunet.com
394 | www.juristudiant.com
395 | emploi.journaldunet.com
396 | juristudiant.com
397 | arwatch.org
398 | formation.journaldunet.com
399 | www.inpharma2000.ru
400 | www.yvelines.pref.gouv.fr
401 | ms.ciao.fr
402 | www.veille-reputation.com
403 | www.finistere.pref.gouv.fr
404 | www.sarthe.pref.gouv.fr
405 | www.twitter-icon.com
406 | www.sarthe.gouv.fr
407 | photos.linternaute.com
408 | societe.journaldunet.com
409 | www.portail-mystique.fr
410 | www.moselle.pref.gouv.fr
411 | alavoileblanche.com
412 | piecemontee.com
413 | www.albifun.com
414 | www.urlidea.com
415 | www.guadeloupe.pref.gouv.fr
416 | dhammadana.fr
417 | www.sante-environnement.com
418 | www.escale-wellness.be
419 | www.markosweb.com
420 | www.aquitaine.pref.gouv.fr
421 | www.mc-franquevielle.fr
422 | www.domaine-de-marseillens.com
423 | www.ardeche.pref.gouv.fr
424 | www.lot.pref.gouv.fr
425 | www.charente.pref.gouv.fr
426 | www.indre-et-loire.pref.gouv.fr
427 | www.loiret.pref.gouv.fr
428 | www.motards-idf.fr
429 | www.indre.pref.gouv.fr
430 | www.mjdatabank.com
431 | www.zsysteme.com
432 | www.lemanoir39.com
433 | www.hotel-les-pyrenees.com
434 | www.droitsenfant.com
435 | annecybonlieuhotel.fr
436 | www.manche.pref.gouv.fr
437 | galeriedu7eme.com
438 | www.assoprairieland.com
439 | www.lexilogos.com
440 | www.preparation-physique.net
441 | www.theoutlaw.fr
442 | www.bill-looking.fr
443 | www.landes.pref.gouv.fr
444 | www.aigrehandball.fr
445 | www.iletaitunevoix.org
446 | www.jura.pref.gouv.fr
447 | www.jm-planchon.fr
448 | www.campingchadeyron.com
449 | www.fruirouge.fr
450 | www.campingcassis.com
451 | www.evretz.fr
452 | www.contespedagogiques.be
453 | www.lazare-et-vespucci.com
454 | www.randoleiesclops.fr
455 | www.braccomotos.com
456 | www.hugme.fr
457 | mondolatino.fr
458 | www.pkma.eu
459 | www.photos-allain-mousset.fr
460 | unamourdeuxperles.com
461 | www.vaccination-h1n1.moselle.pref.gouv.fr
462 | jardinvoyageur.com
463 | seine-saint-denis.gouv.fr
464 | www.auvergne.pref.gouv.fr
465 | mobile.hauts-de-seine.gouv.fr
466 | www.pfrh.lorraine.pref.gouv.fr
467 | www.srias.lorraine.pref.gouv.fr
468 | paysages.mayenne.pref.gouv.fr
469 | www.risquesmajeurs-hautes-pyrenees.pref.gouv.fr
470 | 208.76.50.76
471 | ddrm.mayotte.pref.gouv.fr
472 | lot-et-garonne.gouv.fr
473 | www.ppol-taxi.interieur.gouv.fr
474 | nasdaq.infospace.com
475 | www.prse.lorraine.gouv.fr
476 | www.haute-savoie.pref.gouv.fr
477 | www.cakechloes.com
478 | www.languedoc-roussillon.pref.gouv.fr
479 | aveyron.gouv.fr
480 | old.pyrenees-atlantiques.pref.gouv.fr
481 | www.finistere.gouv.fr
482 | www.seine-saint-denis.pref.gouv.fr
483 | www.lorraine.pref.gouv.fr
484 | www.charente-maritime.pref.gouv.fr
485 | www.finances.gouv.fr
486 | laboratoirecentral.interieur.gouv.fr
487 | sas.sante.gouv.fr
488 | yvelines.pref.gouv.fr
489 | www.recherche-biomedicale.sante.gouv.fr
490 | www.datar.gouv.fr
491 | www.lenotre.culture.gouv.fr
492 | www.sanglier5767.com
493 | portailmoselle.dims.fr
494 | baignades.sante.gouv.fr
495 | agriculture.gouv.fr
496 | www.moselle.gouv.fr
497 | voiceillusion.com
498 | ddaf.ain.pref.gouv.fr
499 | www.pref93.pref.gouv.fr
500 | www.srcae.lorraine.gouv.fr
501 | www.diplomatie.gouv.fr
502 | www.economie.gouv.fr
503 | www.developpement-durable.gouv.fr
504 | en.palmiers-ocean.fr
505 | www.sae-diffusion.sante.gouv.fr
506 | www.ddjs-haute-savoie.jeunesse-sports.gouv.fr
507 | www.ile-de-france.sante.gouv.fr
508 | www.coupesdecheveux2011.net
509 | www.eure.sit.gouv.fr
510 | archives.livreblancdefenseetsecurite.gouv.fr
511 | www.oncfs.gouv.fr
512 | www.yonne.sit.gouv.fr
513 | www.internet.gouv.fr
514 | www.impots.gouv.fr
515 | forum.webmaster-rank.info
516 | www2.impots.gouv.fr
517 | direccte.gouv.fr
518 | www.immigration.gouv.fr
519 | www.hcst.fr
520 | www.drees.sante.gouv.fr
521 |
--------------------------------------------------------------------------------
/processing/metadataextract.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | from subprocess import PIPE
3 | import threading
4 | from pymongo import MongoClient
5 | import simplejson
6 | import HTMLParser
7 |
8 |
9 | class metadataextract(threading.Thread):
10 |     """Extract the <meta> contents of a URL through a casperjs script and store them in MongoDB."""
11 |
12 |     def __init__(self, scriptjs, db, domaine, url):
13 |         threading.Thread.__init__(self)
14 |         self.result = []
15 |         self.domaine = domaine
16 |         self.scriptjs = scriptjs
17 |         self.url = url
18 |         self.connection = MongoClient(host='localhost', port=27017)
19 |         self.db = self.connection[db]
20 |
21 |     def run(self):
22 |         # the casperjs script prints the extracted metadata as a JSON array on stdout
23 |         result = subprocess.Popen(['casperjs', self.scriptjs, self.url], stdout=PIPE)
24 |         meta = ''
25 |         contents = []
26 |
27 |         for ligne in result.stdout:
28 |             meta = meta + ligne
29 |
30 |         try:
31 |             data = simplejson.loads(meta)
32 |             if len(data) > 0:
33 |                 for content in data:
34 |                     contents.append(content['content'])
35 |
36 |                 meta = ' '.join(contents)
37 |                 if len(meta) > 0:
38 |                     # unescape HTML entities before saving to the metadatas collection
39 |                     h = HTMLParser.HTMLParser()
40 |                     value_db = {'domaine': self.domaine, 'meta': h.unescape(meta)}
41 |                     self.db.metadatas.save(value_db)
42 |         except ValueError:
43 |             print 'Encoding error: ' + meta
--------------------------------------------------------------------------------
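The class above shells out to a casperjs script, reads the JSON array it prints on stdout and stores the unescaped meta contents in the metadatas collection. A minimal driver sketch, assuming harvesting/metaextract.js is the casperjs side; the database name and the URL below are illustrative placeholders, not values taken from the repository:

    # usage sketch -- database name and URL are placeholder assumptions
    from processing.metadataextract import metadataextract

    sites = [('www.justice.gouv.fr', 'http://www.justice.gouv.fr')]
    threads = [metadataextract('harvesting/metaextract.js', 'gouv', domaine, url)
               for domaine, url in sites]
    for t in threads:
        t.start()
    for t in threads:
        t.join()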
/processing/metadataextract.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/processing/metadataextract.pyc
--------------------------------------------------------------------------------
/scanners/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/scanners/__init__.py
--------------------------------------------------------------------------------
/scanners/networks.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on 14 mai 2014
3 |
4 | @author: slarinier
5 | '''
6 | from libnmap.parser import NmapParser
7 | from libnmap.process import NmapProcess
8 |
9 | class Networks(object):
10 |     '''
11 |     Run an nmap scan with libnmap, build a report and store it in MongoDB.
12 |     '''
13 |
14 |     def __init__(self, targets, options):
15 |         self.nmap = NmapProcess(targets, options)
16 |
17 |     def run(self):
18 |         self.nmap.run()
19 |
20 |     def make_report(self):
21 |         report = NmapParser.parse(self.nmap.stdout)
22 |         result = []
23 |         for host in report.hosts:
24 |             temp = {'ip': host.ipv4}
25 |             services = []
26 |             for service in host.services:
27 |                 # MongoDB forbids dots in key names: rename the script result keys
28 |                 for k in list(service.scripts_results):
29 |                     if '.' in k:
30 |                         v = service.scripts_results[k]
31 |                         del service.scripts_results[k]
32 |                         service.scripts_results[k.replace('.', '_')] = v
33 |                 services.append((service.state, service.port, service.scripts_results))
34 |             temp['services'] = services
35 |             result.append(temp)
36 |         return result
37 |
38 |     def record_report(self, records, cache, coll):
39 |         # attach each scan record to the cached document of its IP and save it
40 |         for r in records:
41 |             doc = cache[r['ip']]
42 |             doc['service'] = r
43 |             try:
44 |                 coll.save(doc)
45 |             except Exception:
46 |                 print doc
--------------------------------------------------------------------------------
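A usage sketch for the scanner wrapper, assuming MongoDB runs locally as in the rest of the project; the target range, nmap options, database and collection names are illustrative assumptions:

    # usage sketch -- targets, options, database and collection names are assumptions
    from pymongo import MongoClient
    from scanners.networks import Networks

    scan = Networks(['192.168.0.0/24'], '-sV')
    scan.run()
    records = scan.make_report()

    coll = MongoClient(host='localhost', port=27017)['osint'].scans
    cache = dict((r['ip'], {'ip': r['ip']}) for r in records)
    scan.record_report(records, cache, coll)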
/screenshots/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebdraven/OSINT/460f7f2f7082369090b62f85a57e4bf7c6f41688/screenshots/__init__.py
--------------------------------------------------------------------------------
/screenshots/make_screenshots.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # usage: make_screenshots.py <domain list file> <casperjs script> <output dir> <pool size>
3 | import screenshots
4 | import sys
5 | import threading
6 |
7 | file_list_websites = sys.argv[1]
8 | jsfile = sys.argv[2]
9 | emplacement = sys.argv[3]
10 | threadpool = sys.argv[4]
11 | domaines = []
12 | main_thread = threading.currentThread()
13 | with open(file_list_websites, 'r') as fr:
14 |     for ligne in fr:
15 |         domaines.append(ligne.strip())
16 |
17 | i = 0
18 | for domaine in domaines:
19 |     i += 1
20 |     screen = screenshots.Screenshots(domaines, jsfile, emplacement, domaine)
21 |     screen.start()
22 |     # wait for the current batch of threads every <pool size> domains
23 |     if i % int(threadpool) == 0:
24 |         for t in threading.enumerate():
25 |             if t is not main_thread:
26 |                 t.join()
--------------------------------------------------------------------------------
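The driver above takes a domain list, a casperjs script, an output directory and a pool size, and joins the worker threads once every pool-size iterations. The same batching pattern in isolation, with a hypothetical Worker thread standing in for Screenshots:

    # pattern sketch -- Worker and the item list are hypothetical stand-ins
    import threading

    class Worker(threading.Thread):
        def __init__(self, item):
            threading.Thread.__init__(self)
            self.item = item

        def run(self):
            print(self.item)

    main_thread = threading.currentThread()
    pool_size = 2
    for i, item in enumerate(['a', 'b', 'c', 'd'], start=1):
        Worker(item).start()
        # join the whole batch before launching the next one
        if i % pool_size == 0:
            for t in threading.enumerate():
                if t is not main_thread:
                    t.join()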
/screenshots/screenshots.js:
--------------------------------------------------------------------------------
1 | // casperjs screenshots.js <name> <url> <output dir>
2 | var casper = require('casper').create();
3 | var terms = casper.cli.get(0),
4 |     url = casper.cli.get(1),
5 |     emplacement = casper.cli.get(2);
6 |
7 | casper.start(url, function() {
8 |     // capture a 1024x768 clip of the page into <output dir>/<name>.png
9 |     this.capture(emplacement + '/' + terms + '.png', {
10 |         top: 10,
11 |         left: 10,
12 |         width: 1024,
13 |         height: 768
14 |     });
15 | });
16 |
17 | casper.run();
--------------------------------------------------------------------------------
/screenshots/screenshots.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Apr 30 12:24:14 2012
4 |
5 | @author: slarinier
6 | """
7 | import subprocess
8 | from subprocess import PIPE
9 | import threading
10 | import time
11 |
12 |
13 | class Screenshots(threading.Thread):
14 |     """Run the casperjs screenshot script against one website in its own thread."""
15 |
16 |     def __init__(self, listofwebsites, jsfile, location, website):
17 |         self.listofwebsites = listofwebsites
18 |         self.jsfile = jsfile
19 |         self.location = location
20 |         self.website = website
21 |         threading.Thread.__init__(self)
22 |
23 |     def run(self):
24 |         # casperjs <script> <name> <url> <output dir> --web-security=no
25 |         args = ['casperjs', self.jsfile, self.website,
26 |                 'http://' + self.website, self.location, '--web-security=no']
27 |         subprocess.Popen(args, stdout=PIPE)
28 |         print "Make screenshot: " + self.website
29 |         time.sleep(3)
--------------------------------------------------------------------------------
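Screenshots is a one-shot thread around the casperjs command shown in its run method; a minimal standalone use, where the script path, output directory and domain are illustrative assumptions:

    # usage sketch -- script path, output directory and domain are assumptions
    from screenshots.screenshots import Screenshots

    shot = Screenshots(['www.justice.gouv.fr'], 'screenshots/screenshots.js',
                       '/tmp/screens', 'www.justice.gouv.fr')
    shot.start()
    shot.join()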
/storage/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/storage/redis_record.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Apr 24 15:30:33 2013
4 |
5 | @author: slarinier
6 | """
7 | import redis
8 |
9 |
10 | class RedisRecord(object):
11 |     """Thin wrapper around redis-py: key/value access, lists and database switching."""
12 |
13 |     def __init__(self, host='localhost', port=6379, db=1):
14 |         pool = redis.ConnectionPool(host=host, port=port, db=db)
15 |         self.r = redis.Redis(connection_pool=pool)
16 |         self.processus_tab = []
17 |
18 |     def delete(self, key):
19 |         self.r.delete(key)
20 |
21 |     def get(self, key):
22 |         return self.r.get(key)
23 |
24 |     def put(self, key, value):
25 |         self.r.set(key, value)
26 |
27 |     def init(self, dbs):
28 |         # empty every database listed in dbs
29 |         for i in dbs:
30 |             self.flushdb(i)
31 |
32 |     def flushdb(self, db_value):
33 |         self.switchDB(db_value)
34 |         self.r.flushdb()
35 |
36 |     def rpush(self, listvalue, item):
37 |         self.r.rpush(listvalue, item)
38 |
39 |     def rpop(self, listvalue):
40 |         return self.r.rpop(listvalue)
41 |
42 |     def switchDB(self, db, host='localhost', port=6379):
43 |         # point the client at another redis database by rebuilding the connection pool
44 |         pool = redis.ConnectionPool(host=host, port=port, db=db)
45 |         self.r = redis.Redis(connection_pool=pool)
46 |
47 |     def currentDB(self):
48 |         return self.r.connection_pool.get_connection(1).db
--------------------------------------------------------------------------------
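RedisRecord wraps one redis-py client per database and is used as a simple key/value and queue store. A short usage sketch; the key and list names are chosen here only for illustration:

    # usage sketch -- key and list names are placeholder assumptions
    from storage.redis_record import RedisRecord

    r = RedisRecord(host='localhost', port=6379, db=1)
    r.put('last_run', '2014-05-14')
    print(r.get('last_run'))

    r.rpush('domains', 'www.justice.gouv.fr')
    print(r.rpop('domains'))

    r.switchDB(2)
    print(r.currentDB())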