├── README.md
├── es2json.py
├── json2es.py
└── fdns_download.py

/README.md:
--------------------------------------------------------------------------------
# elasticsearch Python scripts

---

### es2json.py
Exports data from Elasticsearch to a line-delimited JSON file.
The output can be compressed to gz format on the fly during export.

---

### json2es.py
Imports a line-delimited JSON file into Elasticsearch.
A gz-compressed JSON file can be imported directly.
Supports a custom mapping (place a mapping.json in the script's directory; a minimal example follows this README).

---

### fdns_download.py
Downloads the latest dnsrecords_all data set from scans.io, then automatically decompresses and imports it.
Data source: https://scans.io/study/sonar.fdns
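A mapping.json is not included in this repository; json2es.py passes the file's contents straight to indices.create(), so it has to be a full index-creation body. A minimal sketch of producing such a file from Python follows; the field names are only an example, borrowed from the mapping used in fdns_download.py, and should be adjusted to the keys in your own JSON lines.

# Sketch: generate an example mapping.json that json2es.py can pick up.
# The field names below are placeholders, not part of the original repository.
import json

mapping = {
    "mappings": {
        "json": {                     # json2es.py indexes documents with doc_type 'json'
            "properties": {
                "domain":       {"type": "string", "index": "not_analyzed"},
                "record_type":  {"type": "string", "index": "not_analyzed"},
                "record_value": {"type": "string", "index": "not_analyzed"}
            }
        }
    }
}

with open('mapping.json', 'w') as fp:
    json.dump(mapping, fp, indent=2)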
--------------------------------------------------------------------------------
/es2json.py:
--------------------------------------------------------------------------------
#coding: utf-8
import sys
import json
import gzip
import getopt
import threading
import Queue
from datetime import datetime
from elasticsearch import Elasticsearch, helpers

class LogRecord(object):
    """Keeps simple counters and timestamps for the export run."""
    def __init__(self):
        self.begin_time = datetime.now()
        self.end_time = datetime.now()
        self.total = 0
        self.success = 0
        self.failed = 0

class ElasticExportIndex(LogRecord):
    """Exports an Elasticsearch index to a line-delimited JSON file.

    One thread scrolls through the index and pushes documents onto a queue,
    a second thread drains the queue and writes to the (optionally gzipped) file.
    """

    def __init__(self, es_host, index_name, save_name,
                 b_gzip=False, count=False, scroll_size=1000, queue_size=10000):
        LogRecord.__init__(self)
        self.elastic = Elasticsearch(es_host)
        self.index_name = index_name
        self.save_name = save_name
        self.b_gzip = b_gzip
        self.count = int(count) if count else False  # optional cap on exported documents
        self.res_count = 0
        self.scroll_size = scroll_size
        self.queue = Queue.Queue(queue_size)
        self.fquit = object()  # sentinel telling the writer thread to stop
        self.start()

    def read_elastic(self):
        body = {'query': {'match_all': {}}}
        # search_type='scan' works on Elasticsearch 1.x; it was deprecated and
        # later removed, so use helpers.scan() on newer clusters instead.
        res = self.elastic.search(index=self.index_name,
                                  body=body,
                                  scroll='3m',
                                  search_type='scan',
                                  size=self.scroll_size,
                                  request_timeout=60)
        scroll_size = res['hits']['total']
        self.total = scroll_size

        while scroll_size > 0:
            try:
                scroll_id = res['_scroll_id']
                res = self.elastic.scroll(scroll_id=scroll_id, scroll='3m')
                for record in res['hits']['hits']:
                    try:
                        self.queue.put(record['_source'])
                    except Exception as e:
                        print e
                scroll_size = len(res['hits']['hits'])
                self.res_count += scroll_size
                if self.count and (self.count <= self.res_count):
                    break
                print('[+] fetch {}...'.format(self.res_count))
            except Exception as e:
                print e  # scroll_size is left unchanged, so the loop retries

    def write_file(self):
        try:
            if self.b_gzip:
                fp = gzip.open(self.save_name + '.gz', 'wb')
            else:
                fp = open(self.save_name, "wb")
            while True:
                record = self.queue.get()
                if record is self.fquit:
                    self.queue.put(self.fquit)
                    break
                try:
                    fp.write(json.dumps(record) + '\n')
                    self.success += 1
                except Exception as e:
                    self.failed += 1
                    print e
                self.queue.task_done()
        finally:
            fp.close()
            self.end_time = datetime.now()

    def __str__(self):
        return json.dumps({
            "begin_time": str(self.begin_time),
            "end_time": str(self.end_time),
            "total": self.total,
            "success": self.success,
            "failed": self.failed}, indent=4)

    def start(self):
        if not self.elastic.indices.exists(self.index_name):
            print "Error: Elasticsearch index %s does not exist." % self.index_name
            sys.exit(-1)

        read_thread = threading.Thread(target=self.read_elastic)
        read_thread.start()

        write_thread = threading.Thread(target=self.write_file)
        write_thread.start()

        read_thread.join()
        self.queue.join()
        self.queue.put(self.fquit)
        write_thread.join()

def usage():
    print "Example:"
    print "%s -e 192.168.1.121:9200 -i indexname -f result.json -c 10000 -g" % sys.argv[0]
    sys.exit(1)

def main():
    if len(sys.argv[1:]) < 1:
        usage()
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hge:i:f:c:',
                                   ["help", "gzip", "elastic=", "index=", "file=", "count="])
    except getopt.GetoptError as e:
        print str(e)
        usage()

    b_gzip = False
    es_host = ''
    es_index = ''
    save_file = ''
    count = False
    encoding = sys.getfilesystemencoding()

    for o, a in opts:
        a = a.decode(encoding)
        if o in ("-h", "--help"):
            usage()
        elif o in ("-g", "--gzip"):
            b_gzip = True
        elif o in ("-e", "--elastic"):
            es_host = a
        elif o in ("-i", "--index"):
            es_index = a
        elif o in ("-f", "--file"):
            save_file = a
        elif o in ("-c", "--count"):
            count = a
        else:
            assert False, "Unhandled Option"

    if not (es_host and es_index and save_file):
        usage()
    sample = ElasticExportIndex(es_host, es_index, save_file, b_gzip, count)
    print sample

if __name__ == "__main__":
    main()
    print 'Exiting Main Thread...'
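Note that search_type='scan' above only exists on older Elasticsearch releases (it was deprecated in 2.1 and removed later). On newer clusters, roughly the same export can be written with the client's helpers.scan generator. A minimal sketch, with host, index name and output file as placeholder values:

# Sketch: export via elasticsearch.helpers.scan, for clusters where
# search_type='scan' is no longer accepted. Host, index and output
# file below are placeholders, not values from the original scripts.
import json
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch('192.168.1.121:9200')

with open('result.json', 'wb') as fp:
    # helpers.scan wraps the scroll API and yields one hit at a time
    for hit in helpers.scan(es, index='indexname',
                            query={'query': {'match_all': {}}},
                            scroll='3m', size=1000):
        fp.write(json.dumps(hit['_source']) + '\n')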
--------------------------------------------------------------------------------
/json2es.py:
--------------------------------------------------------------------------------
#coding: utf-8
import os
import sys
import json
import gzip
import threading
import Queue
import getopt
from datetime import datetime
from elasticsearch import Elasticsearch, helpers

reload(sys)
sys.setdefaultencoding('utf8')  # Python 2 workaround: implicit str/unicode conversion uses UTF-8

class LogRecord(object):
    """Keeps simple counters and timestamps for the import run."""
    def __init__(self):
        self.begin_time = datetime.now()
        self.end_time = datetime.now()
        self.total = 0
        self.success = 0
        self.failed = 0

class ImportElasticIndex(LogRecord):
    """Imports a line-delimited JSON file (optionally gzipped) into Elasticsearch.

    One thread reads lines from the file into a queue, a second thread drains
    the queue and bulk-indexes the documents.
    """
    def __init__(self, es_host, es_index, src_file,
                 b_gzip=False, bulk_size=1000, queue_size=10000):
        LogRecord.__init__(self)
        self.elastic = Elasticsearch(es_host)
        self.es_index = es_index
        self.doc_type = 'json'
        self.src_file = src_file
        self.b_gzip = b_gzip
        self.bulk_size = bulk_size
        self.queue = Queue.Queue(queue_size)
        self.fquit = object()  # sentinel telling the writer thread to stop
        self.start()

    def is_gzip(self, filename):
        # check the two-byte gzip magic number (helper, not used in the main flow)
        try:
            with open(filename, 'rb') as fp:
                if os.path.getsize(filename) >= 2:
                    return fp.read(2) == '\037\213'
        except IOError:
            pass
        return False

    def read_file(self):
        try:
            if self.b_gzip:
                fp = gzip.open(self.src_file, 'rb')
            else:
                fp = open(self.src_file, 'rb')
            for record in fp:
                try:
                    record = record.strip()
                    self.queue.put(record)
                    self.total += 1
                except Exception as e:
                    print e
        finally:
            fp.close()

    def write_elastic(self):
        actions = []
        while True:
            record = self.queue.get()
            if record is self.fquit:
                self.queue.put(self.fquit)
                # flush whatever is still buffered before exiting
                if len(actions):
                    try:
                        success, failed = helpers.bulk(self.elastic, actions, True)
                        self.success += success
                        self.failed += failed
                        actions[:] = []
                    except Exception as e:
                        print e
                break
            actions.append({
                "_index": self.es_index,
                "_type": self.doc_type,
                "_id": None,
                "_source": record
            })
            if len(actions) >= self.bulk_size:
                try:
                    success, failed = helpers.bulk(self.elastic, actions, True)
                    self.success += success
                    self.failed += failed
                    actions[:] = []
                except Exception as e:
                    print e
            self.queue.task_done()
        self.end_time = datetime.now()

    def __str__(self):
        return json.dumps({
            "begin_time": str(self.begin_time),
            "end_time": str(self.end_time),
            "total": self.total,
            "success": self.success,
            "failed": self.failed}, indent=4)

    def start(self):
        if not os.path.exists(os.path.abspath(self.src_file)):
            print "Error: Source file %s does not exist." % self.src_file
            sys.exit(-1)
        # use a custom mapping if mapping.json sits next to the script,
        # otherwise let Elasticsearch create the index with default mappings
        mapping = None
        if os.path.exists('mapping.json'):
            mapping = json.loads(open('mapping.json').read())
        self.elastic.indices.create(self.es_index, body=mapping, ignore=[400, 404])

        read_thread = threading.Thread(target=self.read_file)
        read_thread.start()

        write_thread = threading.Thread(target=self.write_elastic)
        write_thread.start()

        read_thread.join()
        self.queue.join()
        self.queue.put(self.fquit)
        write_thread.join()

def usage():
    print "Example:"
    print "%s -e 192.168.1.121:9200 -i indexname -f result.json -g" % sys.argv[0]
    sys.exit(1)

def main():
    if len(sys.argv[1:]) < 1:
        usage()
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hge:i:f:',
                                   ["help", "gzip", "elastic=", "index=", "file="])
    except getopt.GetoptError as e:
        print str(e)
        usage()

    b_gzip = False
    es_host = ''
    es_index = ''
    src_file = ''
    encoding = sys.getfilesystemencoding()

    for o, a in opts:
        a = a.decode(encoding)
        if o in ("-h", "--help"):
            usage()
        elif o in ("-g", "--gzip"):
            b_gzip = True
        elif o in ("-e", "--elastic"):
            es_host = a
        elif o in ("-i", "--index"):
            es_index = a
        elif o in ("-f", "--file"):
            src_file = a
        else:
            assert False, "Unhandled Option"

    if not (es_host and es_index and src_file):
        usage()
    sample = ImportElasticIndex(es_host, es_index, src_file, b_gzip)
    print sample

if __name__ == "__main__":
    main()
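For reference, the reader/writer threads and manual batching above can also be collapsed into a single generator handed to helpers.bulk, which chunks the stream itself. A minimal sketch under assumed placeholder values (host, index name, file name), with gzip detected by file extension:

# Sketch: the same import expressed as a generator plus one helpers.bulk call.
# Host, index and file name below are placeholders, not part of the repository.
import gzip
import json
from elasticsearch import Elasticsearch, helpers

def read_actions(path, index, doc_type='json'):
    # yield one bulk action per non-empty JSON line
    opener = gzip.open if path.endswith('.gz') else open
    with opener(path, 'rb') as fp:
        for line in fp:
            line = line.strip()
            if line:
                yield {"_index": index, "_type": doc_type, "_source": line}

es = Elasticsearch('192.168.1.121:9200')
success, failed = helpers.bulk(es, read_actions('result.json.gz', 'indexname'),
                               chunk_size=1000, stats_only=True)
print success, failed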
--------------------------------------------------------------------------------
/fdns_download.py:
--------------------------------------------------------------------------------
#coding: utf-8

import os
import sys
import requests
import shlex
import subprocess
import hashlib
import gzip
import json
import chardet
from datetime import datetime
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch, helpers


class Elastic(object):
    """Thin wrapper around the Elasticsearch client for index creation and bulk inserts."""

    def __init__(self, elastic=None, url='http://192.168.1.121:9200'):
        self.elastic = elastic or Elasticsearch(url)

    def exists(self, index):
        return self.elastic.indices.exists(index)

    def create(self, index, body=None):
        self.elastic.indices.create(index, body=body, ignore=[400])

    def insert(self, index, doc_type, datas, doc_id=None):
        actions = []
        for data in datas:
            actions.append({
                "_index": index,
                "_type": doc_type,
                "_id": doc_id,
                "_source": data
            })
        try:
            helpers.bulk(self.elastic, actions)
        except Exception as e:
            print ("{0} start bulk error: {1}".format(datetime.now(), e))

class Fdns(object):
    """Downloads the latest forward-DNS data set from scans.io and imports it into Elasticsearch."""

    def __init__(self):
        self.website = "https://scans.io/study/sonar.fdns"
        self.downurl, self.code = self.get_download_link()

    def get_download_link(self):
        # scrape the study page for the newest file link and its SHA1 fingerprint
        resp = requests.get(self.website)
        if resp.status_code != 200:
            raise Exception("download page failed: %d" % resp.status_code)
        soup = BeautifulSoup(resp.text, 'lxml')
        try:
            table = soup.findAll(name='table', attrs={'class': 'table table-condensed'})[0]
            tr = table.findAll(name='tr')[-1]
            href = tr.findAll(name='a')[0]['href']
            code = tr.findAll(name='code')[0].text
        except Exception as msg:
            raise Exception("get download link failed: {}".format(msg))
        return (href, code.lower())

    @staticmethod
    def sha1(filepath, block_size=64*1024):
        # hash the file in blocks so large downloads do not need to fit in memory
        try:
            with open(filepath, 'rb') as fd:
                sha1obj = hashlib.sha1()
                while True:
                    data = fd.read(block_size)
                    if not data:
                        break
                    sha1obj.update(data)
                return sha1obj.hexdigest()
        except IOError:
            raise Exception('Invalid file path: {}'.format(filepath))

    @staticmethod
    def make_dirs(dirpath, default='./sample'):
        try:
            if not os.path.exists(dirpath):
                os.makedirs(dirpath)
            return dirpath.rstrip(os.sep)
        except Exception:
            # fall back to the default directory if dirpath cannot be created
            if not os.path.exists(default):
                os.makedirs(default)
            return default.rstrip(os.sep)

    @staticmethod
    def download_file(downurl, localdir=None, showlog=False):
        # shell out to wget so interrupted downloads can be resumed (-c)
        localdir = Fdns.make_dirs(localdir)
        command_line = "wget -c -t100 -P {0} {1}".format(localdir, downurl)
        tmp_cmdline = shlex.split(command_line)
        try:
            proc = subprocess.Popen(args=tmp_cmdline,
                                    stderr=subprocess.STDOUT,
                                    stdout=subprocess.PIPE,
                                    bufsize=0)
        except IOError:
            raise EnvironmentError(1, "wget is not installed or could "
                                      "not be found in system path")
        while showlog and proc.poll() is None:
            for streamline in iter(proc.stdout.readline, ''):
                sys.stdout.write(streamline)
        proc.communicate()
        return proc.returncode

    @staticmethod
    def gzip_extract(gzpath, dstpath, block_size=64*1024):
        # stream-decompress a .gz file to dstpath (helper, not used in the main flow)
        try:
            with gzip.open(gzpath, 'rb') as fr, open(dstpath, 'wb') as fw:
                while True:
                    data = fr.read(block_size)
                    if not data:
                        break
                    fw.write(data)
        except IOError:
            raise Exception('Invalid file path: {}'.format(gzpath))

    @staticmethod
    def import_elastic(elastic, gzfile, step=500, sep=','):
        # read the gzipped records (domain,record_type,record_value) and bulk-index them
        with gzip.open(gzfile) as fd_file_down:
            lines = []
            doc_type = 'json'
            index = os.path.basename(gzfile)
            elastic.create(index=index, body={
                'mappings': {
                    'json': {
                        'properties': {
                            'domain': {
                                'type': 'string',
                                'index': 'not_analyzed'
                            },
                            'record_type': {
                                'type': 'string',
                                'index': 'not_analyzed'
                            },
                            'record_value': {
                                'type': 'string',
                                'index': 'not_analyzed'
                            }
                        }
                    }
                }
            })

            for line in fd_file_down:
                success = False
                try:
                    fields = line.strip().split(sep)
                    data = dict(domain=fields[0], record_type=fields[1], record_value=fields[2])
                    lines.append(json.dumps(data))
                    success = True
                except Exception:
                    pass
                if not success:
                    # retry after guessing the encoding of lines that failed to parse
                    try:
                        encoding = chardet.detect(line).get('encoding')
                        line = line.decode(encoding)
                        fields = line.strip().split(sep)
                        data = dict(domain=fields[0], record_type=fields[1], record_value=fields[2])
                        lines.append(json.dumps(data))
                        success = True
                    except Exception:
                        pass
                if len(lines) >= step:
                    elastic.insert(index, doc_type, lines)
                    lines[:] = []
            if len(lines):
                elastic.insert(index, doc_type, lines)

def main():
    fdns = Fdns()
    localdir = "./sample"
    downfilename = os.path.basename(fdns.downurl)

    print ("{0} start download file: {1}".format(datetime.now(), downfilename))
    while True:
        # wget resumes partial downloads, so just retry until it exits cleanly
        returncode = fdns.download_file(fdns.downurl, localdir=localdir, showlog=False)
        if returncode == 0:
            break

    print ("{0} start sha1 file: {1}".format(datetime.now(), downfilename))
    downfilepath = localdir + os.path.sep + downfilename
    if fdns.sha1(downfilepath) != fdns.code:
        print "download failed, sha1 mismatch: {}".format(fdns.downurl)
        sys.exit()

    print ("{0} start import file: {1}".format(datetime.now(), downfilename))
    elastic = Elastic()
    if not elastic.exists(downfilename):
        fdns.import_elastic(elastic, downfilepath)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
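Once fdns_download.py has imported a data set, the records can be looked up by exact value, since every field is mapped as not_analyzed. A minimal query sketch, with placeholder host, index name and domain (the script names the index after the downloaded file):

# Sketch: query the imported FDNS data by exact domain.
# Host, index name and domain below are placeholders, not part of the repository.
from elasticsearch import Elasticsearch

es = Elasticsearch('http://192.168.1.121:9200')
res = es.search(index='your_downloaded_file.json.gz',   # placeholder: basename of the downloaded file
                body={'query': {'term': {'domain': 'example.com'}}},
                size=10)
for hit in res['hits']['hits']:
    print hit['_source']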