├── .gitignore ├── BappDescription.html ├── BappManifest.bmf ├── LICENSE ├── README.md ├── directories.txt ├── issue.png ├── off-by-slash.py └── scrape.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # VSCode 107 | .vscode/settings.json 108 
| 109 | results.txt 110 | domains.txt 111 | -------------------------------------------------------------------------------- /BappDescription.html: -------------------------------------------------------------------------------- 1 |

This extension detects NGINX alias traversal due to misconfiguration.

2 | 3 |

The technique is based on Orange Tsai's BlackHat USA 2018 4 | presentation.

5 | 6 |

A server is assumed to be vulnerable if a request to an existing path like https://example.com/static../ returns the same response as https://example.com/. To eliminate false positives, the misconfiguration has to be confirmed by successfully requesting an existing resource via path traversal. This is done as follows:

7 | 8 |

For the URL https://example.com/folder1/folder2/static/main.css it generates the following links:

9 | 10 |
https://example.com/folder1../folder1/folder2/static/main.css
11 | https://example.com/folder1../%s/folder2/static/main.css
12 | https://example.com/folder1/folder2../folder2/static/main.css
13 | https://example.com/folder1/folder2../%s/static/main.css
14 | https://example.com/folder1/folder2/static../static/main.css
15 | https://example.com/folder1/folder2/static../%s/main.css
16 | 17 |

Here %s stands for common directories used in alias paths, collected from around 9500 nginx configuration files on GitHub (thanks @TomNomNom); see directories.txt.

18 | -------------------------------------------------------------------------------- /BappManifest.bmf: -------------------------------------------------------------------------------- 1 | Uuid: a5fdd2cdffa6410eb530de5a4c294d3a 2 | ExtensionType: 2 3 | Name: NGINX Alias Traversal 4 | RepoName: nginx-alias-traversal 5 | ScreenVersion: 1.1 6 | SerialVersion: 5 7 | MinPlatformVersion: 0 8 | ProOnly: True 9 | Author: Martin Bajanik (@_bayotop) 10 | ShortDescription: Detects NGINX alias traversal due to misconfiguration. 11 | EntryPoint: off-by-slash.py 12 | BuildCommand: 13 | SupportedProducts: Pro 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Martin Bajanik 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # off-by-slash 2 | Burp extension to detect alias traversal via NGINX misconfiguration at scale. Requires Burp Professional. 3 | 4 | ![Issue](issue.png?raw=true "off-by-slash in Burp 2.0") 5 | 6 | ## Usage 7 | 8 | 1. git clone https://github.com/bayotop/off-by-slash/ 9 | 2. Burp -> Extender -> Add -> find and select `off-by-slash.py` 10 | 11 | The extension implements an active scanner check. Simply run a new scan, preferably with an "Audit checks - extensions only" configuration, on static resources identified via Burp's crawler. Alternatively, use `scrape.py` with a list of URLs to scrape static resources from. The results can be directly passed to a new Burp scan (Burp 2.0). 12 | 13 | ## Description 14 | 15 | *https://i.blackhat.com/us-18/Wed-August-8/us-18-Orange-Tsai-Breaking-Parser-Logic-Take-Your-Path-Normalization-Off-And-Pop-0days-Out-2.pdf* 16 | 17 | A server is assumed to be vulnerable if a request to an existing path like `https://example.com/static../` returns the same response as `https://example.com/`. To eliminate false positives the misconfiguration has to be confirmed by successfully requesting an existing resource via path traversal. 
This is done as follows: 18 | 19 | For the URL https://example.com/folder1/folder2/static/main.css it generates the following links: 20 | 21 | ``` 22 | https://example.com/folder1../folder1/folder2/static/main.css 23 | https://example.com/folder1../%s/folder2/static/main.css 24 | https://example.com/folder1/folder2../folder2/static/main.css 25 | https://example.com/folder1/folder2../%s/static/main.css 26 | https://example.com/folder1/folder2/static../static/main.css 27 | https://example.com/folder1/folder2/static../%s/main.css 28 | ``` 29 | 30 | Where `%s` are common directories used in alias paths based on around 9500 nginx configuration files from GH (thanks [@TomNomNom](https://twitter.com/TomNomNom)), see directories.txt. 31 | -------------------------------------------------------------------------------- /directories.txt: -------------------------------------------------------------------------------- 1 | Archipel 2 | _static 3 | acme 4 | acme-challenge 5 | acme_challenges 6 | admin 7 | adminer 8 | alpha 9 | app1-static 10 | app2-static 11 | app_dir 12 | app_nginx_static_path 13 | asset_img 14 | assets 15 | audio 16 | awstats 17 | backend 18 | base_dir 19 | blankon 20 | blob 21 | blue 22 | bootstrap 23 | build 24 | cgi-bin 25 | challenge 26 | challenges 27 | chiminey 28 | clld_dir 29 | collected_static 30 | community 31 | content 32 | counterblockd 33 | counterwallet 34 | css 35 | custom 36 | d 37 | data 38 | dataset1 39 | dataset2 40 | default 41 | demo 42 | demo-app 43 | developerslv 44 | dist 45 | django-blog 46 | django_project_path 47 | doc 48 | docs 49 | download 50 | downloads 51 | error 52 | errors 53 | export 54 | favicons 55 | favs 56 | files 57 | films 58 | flask_test_uploads 59 | fm 60 | font-icons 61 | fonts 62 | frontend 63 | ftp 64 | ftpmaster 65 | hgs-static 66 | higlass-website 67 | home 68 | horde 69 | htdocs 70 | html 71 | httpboot 72 | icon 73 | icons 74 | ikiwiki 75 | image_data 76 | images 77 | img 78 | install 79 | items 80 | javascript 
81 | js 82 | js-plugin 83 | khanlinks 84 | kibana 85 | kolab-syncroton 86 | latest 87 | layout 88 | legal 89 | lemonldap-ng-doc 90 | lemonldap-ng-fr-doc 91 | letsencrypt 92 | lib 93 | libs 94 | log 95 | logging 96 | mailinabox 97 | mailman 98 | main_user 99 | manual 100 | media 101 | memcached 102 | minified 103 | misc 104 | moodledata 105 | msks 106 | munki_repo 107 | music 108 | name 109 | new-js 110 | nginx 111 | noVNC 112 | node_modules 113 | oldsanta 114 | option 115 | outputs 116 | owncloud 117 | packed 118 | patchwork 119 | path 120 | pictures 121 | plugin_static 122 | postfixadmin 123 | prod 124 | project_root 125 | pub 126 | public 127 | public_html 128 | public_root 129 | qv-frontend 130 | repo 131 | repos 132 | repository 133 | resources 134 | resourcesync 135 | results 136 | robots 137 | root 138 | roundcube 139 | roundcubemail 140 | run 141 | script 142 | scripts 143 | shared 144 | shibboleth 145 | site-library 146 | sitestatic 147 | spearmint 148 | src 149 | stackato-pkg 150 | static 151 | static-collected 152 | static-html 153 | static-root 154 | static_prod 155 | static_root 156 | static_user 157 | staticfiles 158 | stats 159 | storage 160 | style 161 | styles 162 | stylesheets 163 | target 164 | temp 165 | templates 166 | test 167 | testfiles 168 | tests 169 | theme 170 | theme_static 171 | thumb 172 | thumbs 173 | tiedostot 174 | tmp 175 | ubuntu 176 | ui 177 | unsplash-downloader 178 | upfiles 179 | upload 180 | uploads 181 | videos 182 | web 183 | web-dist 184 | webroot_path 185 | websocket 186 | webstatic 187 | well-known 188 | whturk 189 | wp-content 190 | www 191 | www-data 192 | zmusic-frontend -------------------------------------------------------------------------------- /issue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PortSwigger/nginx-alias-traversal/43f4a8f5258bfed0c7d91421f7b276332bc55ec0/issue.png 
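Before the extension source itself, the candidate-generation scheme described in the README can be sketched standalone. This is plain Python 3 rather than Burp's Jython API, with illustrative names; `COMMON_DIRECTORIES` stands in for the full contents of directories.txt:

```python
from urllib.parse import urlsplit

# Illustrative subset of directories.txt; the extension loads the full list.
COMMON_DIRECTORIES = ["static", "assets", "files"]

def candidate_urls(url, guess_directories=True):
    # For each folder in the path, rewrite /folder/ to /folder../<target>/ so
    # that a misconfigured "location /folder { alias ...; }" resolves the
    # request one level above the aliased directory.
    path = urlsplit(url).path
    parts = [p for p in path.split("/") if p]
    urls = []
    for part in parts[:-1]:  # skip the file name itself
        # /part/ -> /part../part/ (same folder, reached via traversal)
        urls.append(url.replace("/%s/" % part, "/%s../%s/" % (part, part)))
        if guess_directories:
            for directory in COMMON_DIRECTORIES:
                urls.append(url.replace("/%s/" % part, "/%s../%s/" % (part, directory)))
    return urls
```

For https://example.com/folder1/folder2/static/main.css this reproduces the link shapes listed in the README: one self-referencing traversal per folder, plus one per guessed directory.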
-------------------------------------------------------------------------------- /off-by-slash.py: -------------------------------------------------------------------------------- 1 | from burp import IBurpExtender, IScannerCheck, IScanIssue 2 | from java.io import PrintWriter 3 | from java.net import URL 4 | 5 | # https://i.blackhat.com/us-18/Wed-August-8/us-18-Orange-Tsai-Breaking-Parser-Logic-Take-Your-Path-Normalization-Off-And-Pop-0days-Out-2.pdf 6 | 7 | # Attempts to detect path traversal caused via a common NGINX misconfiguration. 8 | # Example: 9 | # For the URL: https://example.com/folder1/folder2/static/main.css it generates the following links (only if the folders seem vulnerable): 10 | # 11 | # https://example.com/folder1../folder1/folder2/static/main.css 12 | # https://example.com/folder1../%s/folder2/static/main.css 13 | # https://example.com/folder1/folder2../folder2/static/main.css 14 | # https://example.com/folder1/folder2../%s/static/main.css 15 | # https://example.com/folder1/folder2/static../static/main.css 16 | # https://example.com/folder1/folder2/static../%s/main.css 17 | # 18 | # where %s are common directories used in alias paths based on top 10k nginx configuration files from GH (thanks @TomNomNom), see directories.txt. 
19 | 20 | 21 | class BurpExtender(IBurpExtender, IScannerCheck): 22 | scanned_urls = set() 23 | 24 | def registerExtenderCallbacks(self, callbacks): 25 | self._callbacks = callbacks 26 | self._helpers = callbacks.getHelpers() 27 | 28 | callbacks.setExtensionName("NGINX Alias Traversal") 29 | 30 | self._stdout = PrintWriter(callbacks.getStdout(), True) 31 | self._callbacks.registerScannerCheck(self) 32 | 33 | self.enableDirectoryGuessing = True 34 | with open("directories.txt", "r") as f: 35 | self.common_directories = [x.strip() for x in f.readlines()] 36 | 37 | self._stdout.println("GitHub: https://github.com/bayotop/off-by-slash/") 38 | self._stdout.println("Contact: https://twitter.com/_bayotop") 39 | self._stdout.println("") 40 | self._stdout.println("Successfully initialized (v1.1)!") 41 | 42 | def doActiveScan(self, baseRequestResponse, insertionPoint): 43 | scan_issues = [] 44 | 45 | if not self.isGet(baseRequestResponse.getRequest()): 46 | return None 47 | 48 | if not self.isStaticResource(baseRequestResponse): 49 | return None 50 | 51 | # Am I missing cases because of this? 
52 | if self._helpers.analyzeResponse(baseRequestResponse.getResponse()).getStatusCode() != 200: 53 | return None 54 | 55 | # Prevent testing the same paths repeatedly 56 | url = self._helpers.analyzeRequest(baseRequestResponse).getUrl().toString() 57 | url = url[: url.rindex("/")] 58 | 59 | if url in self.scanned_urls: 60 | return None 61 | 62 | self.scanned_urls.add(url) 63 | vulnerable, verifyingRequestResponse = self.detectAliasTraversal(baseRequestResponse) 64 | 65 | if vulnerable: 66 | scan_issues.append(self.generateIssue(baseRequestResponse, verifyingRequestResponse)) 67 | 68 | return scan_issues 69 | 70 | def doPassiveScan(self, baseRequestResponse): 71 | return [] 72 | 73 | def consolidateDuplicateIssues(self, existingIssue, newIssue): 74 | return existingIssue.getIssueName() == newIssue.getIssueName() 75 | 76 | def isGet(self, request): 77 | requestInfo = self._helpers.analyzeRequest(request) 78 | return requestInfo.getMethod() == "GET" 79 | 80 | def isStaticResource(self, requestResponse): 81 | # This likely needs adjustment. 82 | return "." in self._helpers.analyzeRequest(requestResponse).getUrl().getPath().split("/")[-1] 83 | 84 | def detectAliasTraversal(self, requestResponse): 85 | originalUrl = self._helpers.analyzeRequest(requestResponse).getUrl() 86 | urls = self.generateUrls(originalUrl, requestResponse) 87 | 88 | for url in urls: 89 | verifyingRequestResponse = self._callbacks.makeHttpRequest( 90 | requestResponse.getHttpService(), self._helpers.buildHttpRequest(url) 91 | ) 92 | if self.compareResponses(requestResponse.getResponse(), verifyingRequestResponse.getResponse()): 93 | self._stdout.println("Vulnerable: %s" % url) 94 | return True, verifyingRequestResponse 95 | 96 | return False, None 97 | 98 | def generateUrls(self, url, requestResponse): 99 | urls = [] 100 | path = url.getPath() 101 | parts = filter(None, path.split("/")) 102 | 103 | for part in parts: 104 | if "."
in part and part[0] != ".": 105 | continue 106 | 107 | # Checks if /part../ returns the same as / 108 | if not self.quickCheck(url, part, requestResponse): 109 | continue 110 | 111 | self._stdout.println("Potentially vulnerable: %s (folder /%s/)" % (url, part)) 112 | 113 | replacement = "/%s../%s/" % (part, part) 114 | urls.append(URL(url.toString().replace("/%s/" % part, replacement))) 115 | if self.enableDirectoryGuessing: 116 | urls = urls + self.guessDirectories(url, part) 117 | 118 | return urls 119 | 120 | def quickCheck(self, url, part, requestResponse): 121 | replacement = "/%s../" % part 122 | 123 | # https://host/some/part/other -> https://host/some/part../ 124 | probe = url.toString().replace("/%s/" % part, replacement) 125 | probe = URL(probe[: probe.index("../") + 3]) 126 | 127 | # https://host/some/part../ -> https://host/some/ 128 | verifier = URL(probe.toString().replace(replacement, "") + "/") 129 | 130 | expected = self._callbacks.makeHttpRequest( 131 | requestResponse.getHttpService(), self._helpers.buildHttpRequest(verifier) 132 | ) 133 | actual = self._callbacks.makeHttpRequest( 134 | requestResponse.getHttpService(), self._helpers.buildHttpRequest(probe) 135 | ) 136 | 137 | return self.compareResponses(expected.getResponse(), actual.getResponse()) 138 | 139 | def guessDirectories(self, url, part): 140 | urls = [] 141 | 142 | for directory in self.common_directories: 143 | replacement = "/%s../%s/" % (part, directory) 144 | urls.append(URL(url.toString().replace("/%s/" % part, replacement))) 145 | 146 | return urls 147 | 148 | def compareResponses(self, oResponse, vResponse): 149 | vResponseInfo = self._helpers.analyzeResponse(vResponse) 150 | oResponseInfo = self._helpers.analyzeResponse(oResponse) 151 | 152 | if vResponseInfo.getStatusCode() != oResponseInfo.getStatusCode(): 153 | return False 154 | 155 | vBodyOffset = vResponseInfo.getBodyOffset() 156 | vBody = vResponse.tostring()[vBodyOffset:] 157 | 158 | oBodyOffset = 
oResponseInfo.getBodyOffset() 159 | oBody = oResponse.tostring()[oBodyOffset:] 160 | 161 | return str(oBody) == str(vBody) 162 | 163 | def generateIssue(self, baseRequestResponse, verifyingRequestResponse): 164 | name = "Path traversal via misconfigured NGINX alias" 165 | severity = "High" 166 | confidence = "Firm" 167 | detail = """ 168 | Found path traversal at:
169 | <ul> 170 | <li>%s</li> 171 | <li>%s</li> 172 | </ul> 173 | """ % ( 174 | self._helpers.analyzeRequest(baseRequestResponse).getUrl(), 175 | self._helpers.analyzeRequest(verifyingRequestResponse).getUrl(), 176 | ) 177 | # https://github.com/yandex/gixy/blob/master/docs/en/plugins/aliastraversal.md 178 | background = """ 179 | The alias directive is used to replace the path of the specified location. For example, with the following configuration:

180 | 181 |
location /i/ { 
182 |         alias /data/w3/images/;
183 | }

184 | on request of /i/top.gif, the file /data/w3/images/top.gif will be sent.

185 | 186 | But if the location doesn't end with a directory separator (i.e. /):

187 | 188 |
location /i {
189 |         alias /data/w3/images/;
190 | }

191 | on request of /i../app/config.py, the file /data/w3/app/config.py will be sent.

192 | 193 | In other words, the incorrect configuration of alias could allow an attacker to read files stored outside the target folder. 194 | """ 195 | remediation = "Find all 'alias' directives and make sure that the parent prefixed location ends with a directory separator." 196 | 197 | return ScanIssue( 198 | baseRequestResponse.getHttpService(), 199 | self._helpers.analyzeRequest(baseRequestResponse).getUrl(), 200 | [baseRequestResponse, verifyingRequestResponse], 201 | name, 202 | detail, 203 | background, 204 | confidence, 205 | severity, 206 | remediation, 207 | ) 208 | 209 | 210 | class ScanIssue(IScanIssue): 211 | def __init__(self, httpService, url, httpMessages, name, detail, background, confidence, severity, remediation): 212 | self.HttpService = httpService 213 | self.Url = url 214 | self.HttpMessages = httpMessages 215 | self.Name = name 216 | self.Background = background 217 | self.Detail = detail 218 | self.Severity = severity 219 | self.Confidence = confidence 220 | self.Remediation = remediation 221 | return 222 | 223 | def getUrl(self): 224 | return self.Url 225 | 226 | def getIssueName(self): 227 | return self.Name 228 | 229 | def getIssueType(self): 230 | return 0 231 | 232 | def getSeverity(self): 233 | return self.Severity 234 | 235 | def getConfidence(self): 236 | return self.Confidence 237 | 238 | def getIssueBackground(self): 239 | return self.Background 240 | 241 | def getRemediationBackground(self): 242 | return self.Remediation 243 | 244 | def getIssueDetail(self): 245 | return self.Detail 246 | 247 | def getRemediationDetail(self): 248 | return None 249 | 250 | def getHttpMessages(self): 251 | return self.HttpMessages 252 | 253 | def getHttpService(self): 254 | return self.HttpService 255 | -------------------------------------------------------------------------------- /scrape.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import sys 3 | import re 4 | import
requests 5 | import urllib3 6 | from urllib.parse import urlparse 7 | 8 | # 1. Use this to scrape static resources from a list of given URLs 9 | # 2. In Burp, start a new scan and add them as "URLs to Scan" 10 | # 3. Selectively disable other extensions adding active scanner checks and run an "Audit checks - extensions only" scan. 11 | 12 | RESOURCES_PATTERN = r'(?:(?:href|src)=(?:["\']([^\'"]*)[\'"]|([^\s<>]+)))' # @d0nutptr 13 | EXCLUDED_EXTENSIONS = [r"html?", r"as.x?", r"php\d?"] 14 | 15 | RESULTS_FILE = "results.txt" 16 | PROCESSES_COUNT = 4 17 | DONE_FLAG = "__done__" 18 | 19 | 20 | def initiate(pool, results, urls): 21 | jobs = [] 22 | for url in urls: 23 | job = pool.apply_async(scrape, (url, results)) 24 | jobs.append(job) 25 | 26 | try: 27 | for job in jobs: 28 | job.get() 29 | except KeyboardInterrupt: 30 | print("Killed.") 31 | try: 32 | pool.terminate() 33 | pool.close() 34 | finally: 35 | sys.exit(0) 36 | 37 | 38 | def scrape(url, queue): 39 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 40 | results = set() 41 | 42 | print("Scraping %s ..." % url) 43 | try: 44 | response = requests.get(url, verify=False, timeout=3) 45 | if response.history: 46 | url = "{uri.scheme}://{uri.netloc}".format(uri=urlparse(response.url)) 47 | content = response.content 48 | except: 49 | print("Failed on %s: %s" % (url, sys.exc_info()[1])) 50 | return 51 | 52 | matches = re.findall(RESOURCES_PATTERN, content.decode("utf-8", "replace")) 53 | 54 | for match in matches: 55 | for group in match: 56 | results.add(group) 57 | 58 | results = [result for result in results if is_same_origin(url, result) or is_relative(result)] 59 | results = [ 60 | result 61 | for result in results 62 | if ("."
in result.split("/")[-1] and not is_excluded(result.split("/")[-1].split(".")[-1])) 63 | ] 64 | results = [get_full_url(url, result) for result in results] 65 | 66 | print("Found %s resources on %s" % (len(results), url)) 67 | 68 | for result in results: 69 | queue.put(result.replace(" ", "%20")) 70 | 71 | 72 | def writer(queue): 73 | results = set() 74 | while True: 75 | try: 76 | entry = queue.get() 77 | if entry == DONE_FLAG: 78 | return results 79 | 80 | results.add(entry) 81 | except: 82 | # KeyboardInterrupt 83 | break 84 | 85 | 86 | def is_same_origin(origin, url): 87 | return url.startswith(origin + "/") or url.startswith("//%s/" % origin.split("/")[2]) 88 | 89 | 90 | def is_relative(url): 91 | return url.startswith("/") and not (url.startswith("//") or url.startswith("/\\")) 92 | 93 | 94 | def is_excluded(extension): 95 | return any(re.match(ep, extension) for ep in EXCLUDED_EXTENSIONS) 96 | 97 | 98 | def get_full_url(origin, url): 99 | if url.startswith(origin): 100 | return url 101 | if url.startswith("//"): 102 | return origin.split("/")[0] + url 103 | if url.startswith("/"): 104 | return origin + url 105 | 106 | 107 | if __name__ == "__main__": 108 | if len(sys.argv) != 2: 109 | print("Usage: %s <urls_file>" % sys.argv[0]) 110 | sys.exit() 111 | 112 | with open(sys.argv[1]) as f: 113 | urls = [line.strip().rstrip("/") for line in f.readlines()] 114 | 115 | results = multiprocessing.Manager().Queue() 116 | p = multiprocessing.Pool(PROCESSES_COUNT) 117 | 118 | wjob = p.apply_async(writer, (results,)) 119 | initiate(p, results, urls) 120 | 121 | results.put(DONE_FLAG) 122 | resources = wjob.get() 123 | p.close() 124 | 125 | with open(RESULTS_FILE, "w", encoding="utf-8") as f: 126 | for resource in resources: 127 | f.write("%s\n" % resource) 128 | --------------------------------------------------------------------------------
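As a closing note, the pre-screening step in off-by-slash.py's quickCheck is pure string manipulation and can be reproduced outside Burp. The sketch below (plain Python, hypothetical function name) builds the probe and verifier URLs that the extension then fetches through Burp and compares by status code and body:

```python
def quick_check_urls(url, part):
    # A folder /part/ is worth testing only if /part../ answers the same way
    # as its parent directory, so derive both URLs from the original request.
    replacement = "/%s../" % part
    # https://host/some/part/other -> https://host/some/part../
    probe = url.replace("/%s/" % part, replacement)
    probe = probe[: probe.index("../") + 3]
    # https://host/some/part../ -> https://host/some/
    verifier = probe.replace(replacement, "") + "/"
    return probe, verifier
```

If the two URLs return identical responses, the folder is flagged as potentially vulnerable and the full candidate list (including directory guesses) is generated for confirmation.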