├── LICENSE
├── README.md
└── crawl.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Jesse

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Elasticsearch-Crawler
--------------------------------------------------------------------------------
/crawl.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import sys
import glob

if sys.version_info[0] < 3:
    pVer = 2
else:
    pVer = 3

import requests
try:
    from nested_lookup import nested_lookup
except ImportError:
    print("The nested_lookup module was not found.")
    if pVer == 3:
        print("Please run: pip3 install nested_lookup")
    else:
        print("Please run: pip install nested_lookup")

    sys.exit(1)

import time
import json
import os
import socket
import ast
from io import open

if pVer == 3:
    inpFunc = input
else:
    inpFunc = raw_input

size = 1000           # documents requested per scroll page
pagesPerFile = 1000   # scroll pages written to each output file
scrollTimer = "1440"  # scroll context lifetime, in minutes

# Take input for IP address, port, index, and values to save
if len(sys.argv) > 1:
    ipAdr = sys.argv[1]
else:
    ipAdr = inpFunc("IP address: ")
try:
    socket.inet_aton(ipAdr)
except socket.error:
    print("Invalid IP.")
    sys.exit()

if len(sys.argv) > 2:
    port = sys.argv[2]
else:
    port = inpFunc("Port (default is 9200): ")
    if port == "":
        port = "9200"


if len(sys.argv) > 3:
    index = sys.argv[3]
else:
    print("To list all indices go to http://{0}:{1}/_cat/indices?v".format(ipAdr, port))
    index = inpFunc("Index name: ")

if len(sys.argv) > 4:
    save = sys.argv[4:]

else:
    save = []
    print("Field values to obtain (submit an empty line when finished):")

    inp = inpFunc("Value: ")
    while inp != "":
        # A value entered as a Python-style list, e.g. ["outer", "inner"], is treated as a nested lookup path
        if '[' in inp and ']' in inp:
            try:
                save.append(ast.literal_eval(inp))
            except SyntaxError:
                print("Invalid input.")
        else:
            save.append(inp)
        inp = inpFunc("Value: ")
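
# Note on usage and on nested_lookup, added for clarity; the concrete values in
# this example are made up, not taken from the original project:
#
#     python crawl.py 203.0.113.5 9200 bank email ip
#
# would pull the "email" and "ip" fields from every document in the "bank" index.
# nested_lookup(key, document) recursively collects every value stored under the
# given key, e.g. nested_lookup("ip", {"a": {"ip": "1.1.1.1"}, "b": [{"ip": "2.2.2.2"}]})
# returns ["1.1.1.1", "2.2.2.2"]; parse_single() below relies on this to pull
# fields out of arbitrarily nested documents.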

def parse_single(data):
    # Start with an empty string to collect this document's values
    save_data = u""

    # For each value we want to save; a list means a chain of nested lookups
    for i in save:
        # If a list was passed, walk through it to reach the innermost value
        if isinstance(i, list):
            results = data
            for n in range(len(i)):
                results = nested_lookup(i[n], results)
        else:
            # Otherwise just look up the single key
            results = nested_lookup(i, data)

        # If we have a single result that isn't empty, add it to the string
        if len(results) == 1:
            if results[0] != "":
                save_data = u"%s%s," % (save_data, results[0])
        else:
            # If we have a list of results, add each non-empty one to the string
            for n in results:
                if n != "":
                    save_data = u"%s%s," % (save_data, n)

    # Clean up the string
    save_data = save_data.replace(", \n", "")
    save_data = save_data.replace("\n", "")

    return u"%s" % save_data

# Create a session to keep track of cookies/headers
s = requests.session()

newScrollID = False
rJson = ""

# If there is a scrollID file, parse it to figure out where in the search we are
if os.path.isfile("./" + ipAdr + "-scrollID.txt"):
    scrollFile = open(ipAdr + "-scrollID.txt", "r+", encoding="utf-8")
    scrollContents = scrollFile.read().split("\n")
    scrollFile.close()
    scrollID = scrollContents[0]

else:
    newScrollID = True
    # If there is no scrollID file, send an initial request to get a scroll ID so we can
    # pull all the data, not just the single page of results a plain search returns.

    # scrollContents holds the values we need to "scroll" through all the pages of results
    scrollContents = []
    r = s.post("http://" + ipAdr + ":" + port + "/" + index + "/_search?scroll=" + scrollTimer + "m&size=" + str(size), headers={'Content-Type': 'application/json'})
    if not r.ok:
        print("Response not okay, exiting")
        sys.exit(1)

    rJson = json.loads(r.text)

    if 'error' in rJson:
        print("The server returned an error")
        sys.exit(1)

    scrollID = rJson["_scroll_id"]
    # Elasticsearch 7+ reports hits.total as an object; older versions report an integer
    if type(rJson["hits"]["total"]) is not dict:
        totalRequests = str(int(rJson["hits"]["total"] / size))
    else:
        totalRequests = str(int(rJson["hits"]["total"]["value"] / size))

    scrollContents.append(scrollID)
    scrollContents.append(totalRequests)
    scrollContents.append("1")


# Strip all whitespace from the scrollContents
for i in range(len(scrollContents) - 1):
    scrollContents[i] = scrollContents[i].strip()
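
# For reference, a sketch of the scroll response shape this script expects from
# Elasticsearch (an assumption based on the fields read above and below; the
# values here are made up). The code uses "_scroll_id", "hits.total" and
# "hits.hits[*]._source":
#
#     {
#         "_scroll_id": "DXF1ZXJ5QW5kRmV0Y2gBAAAA...",
#         "hits": {
#             "total": 52310,                  # an object {"value": 52310, ...} on ES 7+
#             "hits": [{"_source": {...}}, ...]
#         }
#     }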

# Create the output file. We save 1000 "pages" of results per file.
fileName = ipAdr + "-" + index + "-" + str(int(scrollContents[2]) // pagesPerFile) + ".txt"
f = open(fileName, "a", encoding='utf-16')

if newScrollID:
    # Run each result from the first page through the parsing function
    for hit in rJson["hits"]["hits"]:
        cwd = hit["_source"]
        csv = parse_single(cwd)
        # and write it to the current file, skipping documents where nothing was found
        if "," in csv:
            f.write(u"%s\n" % csv)

# Loop through every request, get the results, parse them, and save them to their respective files
while True:
    scrollContents[2] = str(int(scrollContents[2]) + 1)

    if int(scrollContents[2]) % pagesPerFile == 0:
        # If we've hit the 1000-pages-per-file limit, close the current file and open the next one
        f.close()

        fileName = ipAdr + "-" + index + "-" + str(int(scrollContents[2]) // pagesPerFile) + ".txt"
        f = open(fileName, "a", encoding='utf-16')

    # Get the next "page" of results
    r = s.post("http://" + ipAdr + ":" + str(port) + "/_search/scroll?scroll=" + scrollTimer + "m&scroll_id=" + scrollID, headers={'Content-Type': 'application/json'})
    if not r.ok:
        # This shouldn't happen often unless we're being rate limited
        time.sleep(10)
        continue

    # Update the scroll ID if the server handed back a new one
    rJson = json.loads(r.text)
    scrollID = rJson["_scroll_id"]
    if scrollID != scrollContents[0]:
        scrollContents[0] = scrollID

    # Update the scrollID file so an interrupted run can resume where it left off
    scrollFile = open(ipAdr + "-scrollID.txt", "w", encoding='utf-8')
    for i in scrollContents:
        scrollFile.write(u"%s\n" % i)
    scrollFile.close()

    # If we're out of results, we've scraped everything
    if len(rJson["hits"]["hits"]) == 0:
        print("Got all data")
        f.close()
        # Remove the scroll-tracking file so the next run starts a fresh scroll
        for trackFile in glob.glob(ipAdr + "-scrollID.txt"):
            os.remove(trackFile)
        sys.exit(0)

    # Run each result through the parsing function
    for hit in rJson["hits"]["hits"]:
        cwd = hit["_source"]
        csv = parse_single(cwd)
        # and write it to the current file, skipping documents where nothing was found
        if "," in csv:
            f.write(u"%s\n" % csv)

    time.sleep(1)
--------------------------------------------------------------------------------