├── LICENSE
├── README.md
└── crawl.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Jesse

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Elasticsearch-Crawler
--------------------------------------------------------------------------------
/crawl.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import sys
import glob

if sys.version_info[0] < 3:
    pVer = 2
else:
    pVer = 3

import requests
try:
    from nested_lookup import nested_lookup
except ImportError:
    print("The nested_lookup module was not found.")
    if pVer == 3:
        print("Please run: pip3 install nested_lookup")
    else:
        print("Please run: pip install nested_lookup")

    sys.exit(1)

import time
import json
import os
import socket
import ast
from io import open

if pVer == 3:
    inpFunc = input
else:
    inpFunc = raw_input

size = 1000           # documents requested per scroll page
pagesPerFile = 1000   # scroll pages written to each output file
scrollTimer = "1440"  # scroll context lifetime, in minutes

# Take input for IP address, port, index, and values to save
if len(sys.argv) > 1:
    ipAdr = sys.argv[1]
else:
    ipAdr = inpFunc("IP address: ")
try:
    socket.inet_aton(ipAdr)
except socket.error:
    print("Invalid IP.")
    sys.exit()

if len(sys.argv) > 2:
    port = sys.argv[2]
else:
    port = inpFunc("Port (default is 9200): ")
    if port == "":
        port = "9200"


if len(sys.argv) > 3:
    index = sys.argv[3]
else:
    print("To list all indices go to http://{0}:{1}/_cat/indices?v".format(ipAdr, port))
    index = inpFunc("Index name: ")

if len(sys.argv) > 4:
    save = sys.argv[4:]

else:
    save = []
    print("Field values to obtain (submit an empty line when finished):")

    inp = inpFunc("Value: ")
    while inp != "":
        # A value entered as a Python-style list, e.g. ["outer", "inner"], is treated as a nested lookup path
        if '[' in inp and ']' in inp:
            try:
                save.append(ast.literal_eval(inp))
            except SyntaxError:
                print("Invalid input.")
        else:
            save.append(inp)
        inp = inpFunc("Value: ")
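
# Note on usage and on nested_lookup, added for clarity; the concrete values in
# this example are made up, not taken from the original project:
#
#     python crawl.py 203.0.113.5 9200 bank email ip
#
# would pull the "email" and "ip" fields from every document in the "bank" index.
# nested_lookup(key, document) recursively collects every value stored under the
# given key, e.g. nested_lookup("ip", {"a": {"ip": "1.1.1.1"}, "b": [{"ip": "2.2.2.2"}]})
# returns ["1.1.1.1", "2.2.2.2"]; parse_single() below relies on this to pull
# fields out of arbitrarily nested documents.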

def parse_single(data):
    # Start with an empty string to collect this document's values
    save_data = u""

    # For each value we want to save; a list means a chain of nested lookups
    for i in save:
        # If a list was passed, walk through it to reach the innermost value
        if isinstance(i, list):
            results = data
            for n in range(len(i)):
                results = nested_lookup(i[n], results)
        else:
            # Otherwise just look up the single key
            results = nested_lookup(i, data)

        # If we have a single result that isn't empty, add it to the string
        if len(results) == 1:
            if results[0] != "":
                save_data = u"%s%s," % (save_data, results[0])
        else:
            # If we have a list of results, add each non-empty one to the string
            for n in results:
                if n != "":
                    save_data = u"%s%s," % (save_data, n)

    # Clean up the string
    save_data = save_data.replace(", \n", "")
    save_data = save_data.replace("\n", "")

    return u"%s" % save_data

# Create a session to keep track of cookies/headers
s = requests.session()

newScrollID = False
rJson = ""

# If there is a scrollID file, parse it to figure out where in the search we are
if os.path.isfile("./" + ipAdr + "-scrollID.txt"):
    scrollFile = open(ipAdr + "-scrollID.txt", "r+", encoding="utf-8")
    scrollContents = scrollFile.read().split("\n")
    scrollFile.close()
    scrollID = scrollContents[0]

else:
    newScrollID = True
    # If there is no scrollID file, send an initial request to get a scroll ID so we can
    # pull all the data, not just the single page of results a plain search returns.

    # scrollContents holds the values we need to "scroll" through all the pages of results
    scrollContents = []
    r = s.post("http://" + ipAdr + ":" + port + "/" + index + "/_search?scroll=" + scrollTimer + "m&size=" + str(size), headers={'Content-Type': 'application/json'})
    if not r.ok:
        print("Response not okay, exiting")
        sys.exit(1)

    rJson = json.loads(r.text)

    if 'error' in rJson:
        print("The server returned an error")
        sys.exit(1)

    scrollID = rJson["_scroll_id"]
    # Elasticsearch 7+ reports hits.total as an object; older versions report an integer
    if type(rJson["hits"]["total"]) is not dict:
        totalRequests = str(int(rJson["hits"]["total"] / size))
    else:
        totalRequests = str(int(rJson["hits"]["total"]["value"] / size))

    scrollContents.append(scrollID)
    scrollContents.append(totalRequests)
    scrollContents.append("1")


# Strip all whitespace from the scrollContents
for i in range(len(scrollContents) - 1):
    scrollContents[i] = scrollContents[i].strip()
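
# For reference, a sketch of the scroll response shape this script expects from
# Elasticsearch (an assumption based on the fields read above and below; the
# values here are made up). The code uses "_scroll_id", "hits.total" and
# "hits.hits[*]._source":
#
#     {
#         "_scroll_id": "DXF1ZXJ5QW5kRmV0Y2gBAAAA...",
#         "hits": {
#             "total": 52310,                  # an object {"value": 52310, ...} on ES 7+
#             "hits": [{"_source": {...}}, ...]
#         }
#     }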

# Create the output file. We save 1000 "pages" of results per file.
fileName = ipAdr + "-" + index + "-" + str(int(scrollContents[2]) // pagesPerFile) + ".txt"
f = open(fileName, "a", encoding='utf-16')

if newScrollID:
    # Run each result from the first page through the parsing function
    for hit in rJson["hits"]["hits"]:
        cwd = hit["_source"]
        csv = parse_single(cwd)
        # and write it to the current file, skipping documents where nothing was found
        if "," in csv:
            f.write(u"%s\n" % csv)

# Loop through every request, get the results, parse them, and save them to their respective files
while True:
    scrollContents[2] = str(int(scrollContents[2]) + 1)

    if int(scrollContents[2]) % pagesPerFile == 0:
        # If we've hit the 1000-pages-per-file limit, close the current file and open the next one
        f.close()

        fileName = ipAdr + "-" + index + "-" + str(int(scrollContents[2]) // pagesPerFile) + ".txt"
        f = open(fileName, "a", encoding='utf-16')

    # Get the next "page" of results
    r = s.post("http://" + ipAdr + ":" + str(port) + "/_search/scroll?scroll=" + scrollTimer + "m&scroll_id=" + scrollID, headers={'Content-Type': 'application/json'})
    if not r.ok:
        # This shouldn't happen often unless we're being rate limited
        time.sleep(10)
        continue

    # Update the scroll ID if the server handed back a new one
    rJson = json.loads(r.text)
    scrollID = rJson["_scroll_id"]
    if scrollID != scrollContents[0]:
        scrollContents[0] = scrollID

    # Update the scrollID file so an interrupted run can resume where it left off
    scrollFile = open(ipAdr + "-scrollID.txt", "w", encoding='utf-8')
    for i in scrollContents:
        scrollFile.write(u"%s\n" % i)
    scrollFile.close()

    # If we're out of results, we've scraped everything
    if len(rJson["hits"]["hits"]) == 0:
        print("Got all data")
        f.close()
        # Remove the scroll-tracking file so the next run starts a fresh scroll
        for trackFile in glob.glob(ipAdr + "-scrollID.txt"):
            os.remove(trackFile)
        sys.exit(0)

    # Run each result through the parsing function
    for hit in rJson["hits"]["hits"]:
        cwd = hit["_source"]
        csv = parse_single(cwd)
        # and write it to the current file, skipping documents where nothing was found
        if "," in csv:
            f.write(u"%s\n" % csv)

    time.sleep(1)
--------------------------------------------------------------------------------