├── README.md
├── es2json.py
├── json2es.py
└── fdns_download.py

/README.md:
--------------------------------------------------------------------------------
# elasticsearch Python scripts

---

### es2json.py
Exports data from Elasticsearch to a line-delimited JSON file.
The output can be compressed to gz format on the fly during export.

---

### json2es.py
Imports a line-delimited JSON file into Elasticsearch.
A gz-compressed JSON file can be imported directly.
Supports a custom mapping (place a mapping.json in the script's directory; a minimal example follows this README).

---

### fdns_download.py
Downloads the latest dnsrecords_all data set from scans.io, then automatically decompresses and imports it.
Data source: https://scans.io/study/sonar.fdns
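A mapping.json is not included in this repository; json2es.py passes the file's contents straight to indices.create(), so it has to be a full index-creation body. A minimal sketch of producing such a file from Python follows; the field names are only an example, borrowed from the mapping used in fdns_download.py, and should be adjusted to the keys in your own JSON lines.

# Sketch: generate an example mapping.json that json2es.py can pick up.
# The field names below are placeholders, not part of the original repository.
import json

mapping = {
    "mappings": {
        "json": {                     # json2es.py indexes documents with doc_type 'json'
            "properties": {
                "domain":       {"type": "string", "index": "not_analyzed"},
                "record_type":  {"type": "string", "index": "not_analyzed"},
                "record_value": {"type": "string", "index": "not_analyzed"}
            }
        }
    }
}

with open('mapping.json', 'w') as fp:
    json.dump(mapping, fp, indent=2)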
--------------------------------------------------------------------------------
/es2json.py:
--------------------------------------------------------------------------------
#coding: utf-8
import sys
import json
import gzip
import getopt
import threading
import Queue
from datetime import datetime
from elasticsearch import Elasticsearch, helpers

class LogRecord(object):
    """Keeps simple counters and timestamps for the export run."""
    def __init__(self):
        self.begin_time = datetime.now()
        self.end_time = datetime.now()
        self.total = 0
        self.success = 0
        self.failed = 0

class ElasticExportIndex(LogRecord):
    """Exports an Elasticsearch index to a line-delimited JSON file.

    One thread scrolls through the index and pushes documents onto a queue,
    a second thread drains the queue and writes to the (optionally gzipped) file.
    """

    def __init__(self, es_host, index_name, save_name,
                 b_gzip=False, count=False, scroll_size=1000, queue_size=10000):
        LogRecord.__init__(self)
        self.elastic = Elasticsearch(es_host)
        self.index_name = index_name
        self.save_name = save_name
        self.b_gzip = b_gzip
        self.count = int(count) if count else False  # optional cap on exported documents
        self.res_count = 0
        self.scroll_size = scroll_size
        self.queue = Queue.Queue(queue_size)
        self.fquit = object()  # sentinel telling the writer thread to stop
        self.start()

    def read_elastic(self):
        body = {'query': {'match_all': {}}}
        # search_type='scan' works on Elasticsearch 1.x; it was deprecated and
        # later removed, so use helpers.scan() on newer clusters instead.
        res = self.elastic.search(index=self.index_name,
                                  body=body,
                                  scroll='3m',
                                  search_type='scan',
                                  size=self.scroll_size,
                                  request_timeout=60)
        scroll_size = res['hits']['total']
        self.total = scroll_size

        while scroll_size > 0:
            try:
                scroll_id = res['_scroll_id']
                res = self.elastic.scroll(scroll_id=scroll_id, scroll='3m')
                for record in res['hits']['hits']:
                    try:
                        self.queue.put(record['_source'])
                    except Exception as e:
                        print e
                scroll_size = len(res['hits']['hits'])
                self.res_count += scroll_size
                if self.count and (self.count <= self.res_count):
                    break
                print('[+] fetch {}...'.format(self.res_count))
            except Exception as e:
                print e  # scroll_size is left unchanged, so the loop retries

    def write_file(self):
        try:
            if self.b_gzip:
                fp = gzip.open(self.save_name + '.gz', 'wb')
            else:
                fp = open(self.save_name, "wb")
            while True:
                record = self.queue.get()
                if record is self.fquit:
                    self.queue.put(self.fquit)
                    break
                try:
                    fp.write(json.dumps(record) + '\n')
                    self.success += 1
                except Exception as e:
                    self.failed += 1
                    print e
                self.queue.task_done()
        finally:
            fp.close()
            self.end_time = datetime.now()

    def __str__(self):
        return json.dumps({
            "begin_time": str(self.begin_time),
            "end_time": str(self.end_time),
            "total": self.total,
            "success": self.success,
            "failed": self.failed}, indent=4)

    def start(self):
        if not self.elastic.indices.exists(self.index_name):
            print "Error: Elasticsearch index %s does not exist." % self.index_name
            sys.exit(-1)

        read_thread = threading.Thread(target=self.read_elastic)
        read_thread.start()

        write_thread = threading.Thread(target=self.write_file)
        write_thread.start()

        read_thread.join()
        self.queue.join()
        self.queue.put(self.fquit)
        write_thread.join()

def usage():
    print "Example:"
    print "%s -e 192.168.1.121:9200 -i indexname -f result.json -c 10000 -g" % sys.argv[0]
    sys.exit(1)

def main():
    if len(sys.argv[1:]) < 1:
        usage()
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hge:i:f:c:',
                                   ["help", "gzip", "elastic=", "index=", "file=", "count="])
    except getopt.GetoptError as e:
        print str(e)
        usage()

    b_gzip = False
    es_host = ''
    es_index = ''
    save_file = ''
    count = False
    encoding = sys.getfilesystemencoding()

    for o, a in opts:
        a = a.decode(encoding)
        if o in ("-h", "--help"):
            usage()
        elif o in ("-g", "--gzip"):
            b_gzip = True
        elif o in ("-e", "--elastic"):
            es_host = a
        elif o in ("-i", "--index"):
            es_index = a
        elif o in ("-f", "--file"):
            save_file = a
        elif o in ("-c", "--count"):
            count = a
        else:
            assert False, "Unhandled Option"

    if not (es_host and es_index and save_file):
        usage()
    sample = ElasticExportIndex(es_host, es_index, save_file, b_gzip, count)
    print sample

if __name__ == "__main__":
    main()
    print 'Exiting Main Thread...'
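Note that search_type='scan' above only exists on older Elasticsearch releases (it was deprecated in 2.1 and removed later). On newer clusters, roughly the same export can be written with the client's helpers.scan generator. A minimal sketch, with host, index name and output file as placeholder values:

# Sketch: export via elasticsearch.helpers.scan, for clusters where
# search_type='scan' is no longer accepted. Host, index and output
# file below are placeholders, not values from the original scripts.
import json
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch('192.168.1.121:9200')

with open('result.json', 'wb') as fp:
    # helpers.scan wraps the scroll API and yields one hit at a time
    for hit in helpers.scan(es, index='indexname',
                            query={'query': {'match_all': {}}},
                            scroll='3m', size=1000):
        fp.write(json.dumps(hit['_source']) + '\n')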
--------------------------------------------------------------------------------
/json2es.py:
--------------------------------------------------------------------------------
#coding: utf-8
import os
import sys
import json
import gzip
import threading
import Queue
import getopt
from datetime import datetime
from elasticsearch import Elasticsearch, helpers

reload(sys)
sys.setdefaultencoding('utf8')  # Python 2 workaround: implicit str/unicode conversion uses UTF-8

class LogRecord(object):
    """Keeps simple counters and timestamps for the import run."""
    def __init__(self):
        self.begin_time = datetime.now()
        self.end_time = datetime.now()
        self.total = 0
        self.success = 0
        self.failed = 0

class ImportElasticIndex(LogRecord):
    """Imports a line-delimited JSON file (optionally gzipped) into Elasticsearch.

    One thread reads lines from the file into a queue, a second thread drains
    the queue and bulk-indexes the documents.
    """
    def __init__(self, es_host, es_index, src_file,
                 b_gzip=False, bulk_size=1000, queue_size=10000):
        LogRecord.__init__(self)
        self.elastic = Elasticsearch(es_host)
        self.es_index = es_index
        self.doc_type = 'json'
        self.src_file = src_file
        self.b_gzip = b_gzip
        self.bulk_size = bulk_size
        self.queue = Queue.Queue(queue_size)
        self.fquit = object()  # sentinel telling the writer thread to stop
        self.start()

    def is_gzip(self, filename):
        # check the two-byte gzip magic number (helper, not used in the main flow)
        try:
            with open(filename, 'rb') as fp:
                if os.path.getsize(filename) >= 2:
                    return fp.read(2) == '\037\213'
        except IOError:
            pass
        return False

    def read_file(self):
        try:
            if self.b_gzip:
                fp = gzip.open(self.src_file, 'rb')
            else:
                fp = open(self.src_file, 'rb')
            for record in fp:
                try:
                    record = record.strip()
                    self.queue.put(record)
                    self.total += 1
                except Exception as e:
                    print e
        finally:
            fp.close()

    def write_elastic(self):
        actions = []
        while True:
            record = self.queue.get()
            if record is self.fquit:
                self.queue.put(self.fquit)
                # flush whatever is still buffered before exiting
                if len(actions):
                    try:
                        success, failed = helpers.bulk(self.elastic, actions, True)
                        self.success += success
                        self.failed += failed
                        actions[:] = []
                    except Exception as e:
                        print e
                break
            actions.append({
                "_index": self.es_index,
                "_type": self.doc_type,
                "_id": None,
                "_source": record
            })
            if len(actions) >= self.bulk_size:
                try:
                    success, failed = helpers.bulk(self.elastic, actions, True)
                    self.success += success
                    self.failed += failed
                    actions[:] = []
                except Exception as e:
                    print e
            self.queue.task_done()
        self.end_time = datetime.now()

    def __str__(self):
        return json.dumps({
            "begin_time": str(self.begin_time),
            "end_time": str(self.end_time),
            "total": self.total,
            "success": self.success,
            "failed": self.failed}, indent=4)

    def start(self):
        if not os.path.exists(os.path.abspath(self.src_file)):
            print "Error: Source file %s does not exist." % self.src_file
            sys.exit(-1)
        # use a custom mapping if mapping.json sits next to the script,
        # otherwise let Elasticsearch create the index with default mappings
        mapping = None
        if os.path.exists('mapping.json'):
            mapping = json.loads(open('mapping.json').read())
        self.elastic.indices.create(self.es_index, body=mapping, ignore=[400, 404])

        read_thread = threading.Thread(target=self.read_file)
        read_thread.start()

        write_thread = threading.Thread(target=self.write_elastic)
        write_thread.start()

        read_thread.join()
        self.queue.join()
        self.queue.put(self.fquit)
        write_thread.join()

def usage():
    print "Example:"
    print "%s -e 192.168.1.121:9200 -i indexname -f result.json -g" % sys.argv[0]
    sys.exit(1)

def main():
    if len(sys.argv[1:]) < 1:
        usage()
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hge:i:f:',
                                   ["help", "gzip", "elastic=", "index=", "file="])
    except getopt.GetoptError as e:
        print str(e)
        usage()

    b_gzip = False
    es_host = ''
    es_index = ''
    src_file = ''
    encoding = sys.getfilesystemencoding()

    for o, a in opts:
        a = a.decode(encoding)
        if o in ("-h", "--help"):
            usage()
        elif o in ("-g", "--gzip"):
            b_gzip = True
        elif o in ("-e", "--elastic"):
            es_host = a
        elif o in ("-i", "--index"):
            es_index = a
        elif o in ("-f", "--file"):
            src_file = a
        else:
            assert False, "Unhandled Option"

    if not (es_host and es_index and src_file):
        usage()
    sample = ImportElasticIndex(es_host, es_index, src_file, b_gzip)
    print sample

if __name__ == "__main__":
    main()
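For reference, the reader/writer threads and manual batching above can also be collapsed into a single generator handed to helpers.bulk, which chunks the stream itself. A minimal sketch under assumed placeholder values (host, index name, file name), with gzip detected by file extension:

# Sketch: the same import expressed as a generator plus one helpers.bulk call.
# Host, index and file name below are placeholders, not part of the repository.
import gzip
import json
from elasticsearch import Elasticsearch, helpers

def read_actions(path, index, doc_type='json'):
    # yield one bulk action per non-empty JSON line
    opener = gzip.open if path.endswith('.gz') else open
    with opener(path, 'rb') as fp:
        for line in fp:
            line = line.strip()
            if line:
                yield {"_index": index, "_type": doc_type, "_source": line}

es = Elasticsearch('192.168.1.121:9200')
success, failed = helpers.bulk(es, read_actions('result.json.gz', 'indexname'),
                               chunk_size=1000, stats_only=True)
print success, failed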
--------------------------------------------------------------------------------
/fdns_download.py:
--------------------------------------------------------------------------------
#coding: utf-8

import os
import sys
import requests
import shlex
import subprocess
import hashlib
import gzip
import json
import chardet
from datetime import datetime
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch, helpers


class Elastic(object):
    """Thin wrapper around the Elasticsearch client for index creation and bulk inserts."""

    def __init__(self, elastic=None, url='http://192.168.1.121:9200'):
        self.elastic = elastic or Elasticsearch(url)

    def exists(self, index):
        return self.elastic.indices.exists(index)

    def create(self, index, body=None):
        self.elastic.indices.create(index, body=body, ignore=[400])

    def insert(self, index, doc_type, datas, doc_id=None):
        actions = []
        for data in datas:
            actions.append({
                "_index": index,
                "_type": doc_type,
                "_id": doc_id,
                "_source": data
            })
        try:
            helpers.bulk(self.elastic, actions)
        except Exception as e:
            print ("{0} start bulk error: {1}".format(datetime.now(), e))

class Fdns(object):
    """Downloads the latest forward-DNS data set from scans.io and imports it into Elasticsearch."""

    def __init__(self):
        self.website = "https://scans.io/study/sonar.fdns"
        self.downurl, self.code = self.get_download_link()

    def get_download_link(self):
        # scrape the study page for the newest file link and its SHA1 fingerprint
        resp = requests.get(self.website)
        if resp.status_code != 200:
            raise Exception("download page failed: %d" % resp.status_code)
        soup = BeautifulSoup(resp.text, 'lxml')
        try:
            table = soup.findAll(name='table', attrs={'class': 'table table-condensed'})[0]
            tr = table.findAll(name='tr')[-1]
            href = tr.findAll(name='a')[0]['href']
            code = tr.findAll(name='code')[0].text
        except Exception as msg:
            raise Exception("get download link failed: {}".format(msg))
        return (href, code.lower())

    @staticmethod
    def sha1(filepath, block_size=64*1024):
        # hash the file in blocks so large downloads do not need to fit in memory
        try:
            with open(filepath, 'rb') as fd:
                sha1obj = hashlib.sha1()
                while True:
                    data = fd.read(block_size)
                    if not data:
                        break
                    sha1obj.update(data)
                return sha1obj.hexdigest()
        except IOError:
            raise Exception('Invalid file path: {}'.format(filepath))

    @staticmethod
    def make_dirs(dirpath, default='./sample'):
        try:
            if not os.path.exists(dirpath):
                os.makedirs(dirpath)
            return dirpath.rstrip(os.sep)
        except Exception:
            # fall back to the default directory if dirpath cannot be created
            if not os.path.exists(default):
                os.makedirs(default)
            return default.rstrip(os.sep)

    @staticmethod
    def download_file(downurl, localdir=None, showlog=False):
        # shell out to wget so interrupted downloads can be resumed (-c)
        localdir = Fdns.make_dirs(localdir)
        command_line = "wget -c -t100 -P {0} {1}".format(localdir, downurl)
        tmp_cmdline = shlex.split(command_line)
        try:
            proc = subprocess.Popen(args=tmp_cmdline,
                                    stderr=subprocess.STDOUT,
                                    stdout=subprocess.PIPE,
                                    bufsize=0)
        except IOError:
            raise EnvironmentError(1, "wget is not installed or could "
                                      "not be found in system path")
        while showlog and proc.poll() is None:
            for streamline in iter(proc.stdout.readline, ''):
                sys.stdout.write(streamline)
        proc.communicate()
        return proc.returncode

    @staticmethod
    def gzip_extract(gzpath, dstpath, block_size=64*1024):
        # stream-decompress a .gz file to dstpath (helper, not used in the main flow)
        try:
            with gzip.open(gzpath, 'rb') as fr, open(dstpath, 'wb') as fw:
                while True:
                    data = fr.read(block_size)
                    if not data:
                        break
                    fw.write(data)
        except IOError:
            raise Exception('Invalid file path: {}'.format(gzpath))

    @staticmethod
    def import_elastic(elastic, gzfile, step=500, sep=','):
        # read the gzipped records (domain,record_type,record_value) and bulk-index them
        with gzip.open(gzfile) as fd_file_down:
            lines = []
            doc_type = 'json'
            index = os.path.basename(gzfile)
            elastic.create(index=index, body={
                'mappings': {
                    'json': {
                        'properties': {
                            'domain': {
                                'type': 'string',
                                'index': 'not_analyzed'
                            },
                            'record_type': {
                                'type': 'string',
                                'index': 'not_analyzed'
                            },
                            'record_value': {
                                'type': 'string',
                                'index': 'not_analyzed'
                            }
                        }
                    }
                }
            })

            for line in fd_file_down:
                success = False
                try:
                    fields = line.strip().split(sep)
                    data = dict(domain=fields[0], record_type=fields[1], record_value=fields[2])
                    lines.append(json.dumps(data))
                    success = True
                except Exception:
                    pass
                if not success:
                    # retry after guessing the encoding of lines that failed to parse
                    try:
                        encoding = chardet.detect(line).get('encoding')
                        line = line.decode(encoding)
                        fields = line.strip().split(sep)
                        data = dict(domain=fields[0], record_type=fields[1], record_value=fields[2])
                        lines.append(json.dumps(data))
                        success = True
                    except Exception:
                        pass
                if len(lines) >= step:
                    elastic.insert(index, doc_type, lines)
                    lines[:] = []
            if len(lines):
                elastic.insert(index, doc_type, lines)

def main():
    fdns = Fdns()
    localdir = "./sample"
    downfilename = os.path.basename(fdns.downurl)

    print ("{0} start download file: {1}".format(datetime.now(), downfilename))
    while True:
        # wget resumes partial downloads, so just retry until it exits cleanly
        returncode = fdns.download_file(fdns.downurl, localdir=localdir, showlog=False)
        if returncode == 0:
            break

    print ("{0} start sha1 file: {1}".format(datetime.now(), downfilename))
    downfilepath = localdir + os.path.sep + downfilename
    if fdns.sha1(downfilepath) != fdns.code:
        print "download failed, sha1 mismatch: {}".format(fdns.downurl)
        sys.exit()

    print ("{0} start import file: {1}".format(datetime.now(), downfilename))
    elastic = Elastic()
    if not elastic.exists(downfilename):
        fdns.import_elastic(elastic, downfilepath)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
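Once fdns_download.py has imported a data set, the records can be looked up by exact value, since every field is mapped as not_analyzed. A minimal query sketch, with placeholder host, index name and domain (the script names the index after the downloaded file):

# Sketch: query the imported FDNS data by exact domain.
# Host, index name and domain below are placeholders, not part of the repository.
from elasticsearch import Elasticsearch

es = Elasticsearch('http://192.168.1.121:9200')
res = es.search(index='your_downloaded_file.json.gz',   # placeholder: basename of the downloaded file
                body={'query': {'term': {'domain': 'example.com'}}},
                size=10)
for hit in res['hits']['hits']:
    print hit['_source']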