├── .gitignore ├── .pre-commit-config.yaml ├── .secrets.baseline ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── requirements-dev.txt ├── requirements.txt ├── setup.py ├── tests ├── __init__.py ├── alexa_test.py ├── data │ └── response.xml ├── opendns_test.py ├── util │ ├── __init__.py │ ├── api_cache_test.py │ ├── error_messages_test.py │ └── http_test.py └── virustotal_test.py ├── threat_intel ├── __init__.py ├── alexaranking.py ├── exceptions.py ├── opendns.py ├── shadowserver.py ├── util │ ├── __init__.py │ ├── api_cache.py │ ├── error_messages.py │ └── http.py └── virustotal.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | .coverage 57 | virtualenv_run/ 58 | .DS_Store 59 | .idea 60 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: git://github.com/pre-commit/pre-commit-hooks 3 | sha: v2.1.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: debug-statements 9 | - id: name-tests-test 10 | - id: check-added-large-files 11 | - id: check-byte-order-marker 12 | - id: fix-encoding-pragma 13 | - id: flake8 14 | - id: requirements-txt-fixer 15 | - repo: git://github.com/asottile/reorder_python_imports 16 | sha: v1.3.4 17 | hooks: 18 | - id: reorder-python-imports 19 | - repo: git@git.yelpcorp.com:mirrors/asottile/add-trailing-comma 20 | rev: v0.7.1 21 | hooks: 22 | - id: add-trailing-comma 23 | - repo: git@git.yelpcorp.com:mirrors/pre-commit/mirrors-autopep8 24 | rev: v1.4.3 25 | hooks: 26 | - id: autopep8 27 | - repo: https://github.com/Yelp/detect-secrets 28 | sha: v0.12.0 29 | hooks: 30 | - id: detect-secrets 31 | args: ['--baseline', '.secrets.baseline'] 32 | exclude: .*tests/.*|\.pre-commit-config\.yaml 33 | -------------------------------------------------------------------------------- /.secrets.baseline: -------------------------------------------------------------------------------- 1 | { 2 | "exclude": { 3 | "files": ".*tests/.*|\\.pre-commit-config\\.yaml", 4 | "lines": null 5 | }, 6 | "generated_at": "2019-02-21T16:33:09Z", 7 | "plugins_used": [ 8 | { 9 | 
"base64_limit": 4.5, 10 | "name": "Base64HighEntropyString" 11 | }, 12 | { 13 | "hex_limit": 3, 14 | "name": "HexHighEntropyString" 15 | }, 16 | { 17 | "name": "PrivateKeyDetector" 18 | } 19 | ], 20 | "results": { 21 | ".travis.yml": [ 22 | { 23 | "hashed_secret": "9510ca1b3eda474063afbc25da5d08ac1314f340", 24 | "line_number": 14, 25 | "type": "Base64 High Entropy String" 26 | } 27 | ], 28 | "README.md": [ 29 | { 30 | "hashed_secret": "d39359993ff73436cd2caf84970d3247051968b5", 31 | "line_number": 463, 32 | "type": "Hex High Entropy String" 33 | }, 34 | { 35 | "hashed_secret": "8b0b46d5092ecb0b2e078091a07c421758d8b51e", 36 | "line_number": 545, 37 | "type": "Hex High Entropy String" 38 | }, 39 | { 40 | "hashed_secret": "1d86040d03a0ace59fa4ef4988341f5dba9ddab8", 41 | "line_number": 719, 42 | "type": "Hex High Entropy String" 43 | }, 44 | { 45 | "hashed_secret": "5ec0c35f36d8a545fb8225c525c9d9c3a3e174fc", 46 | "line_number": 720, 47 | "type": "Hex High Entropy String" 48 | } 49 | ], 50 | "threat_intel/__init__.py": [ 51 | { 52 | "hashed_secret": "d39359993ff73436cd2caf84970d3247051968b5", 53 | "line_number": 370, 54 | "type": "Hex High Entropy String" 55 | }, 56 | { 57 | "hashed_secret": "8b0b46d5092ecb0b2e078091a07c421758d8b51e", 58 | "line_number": 439, 59 | "type": "Hex High Entropy String" 60 | }, 61 | { 62 | "hashed_secret": "1d86040d03a0ace59fa4ef4988341f5dba9ddab8", 63 | "line_number": 522, 64 | "type": "Hex High Entropy String" 65 | }, 66 | { 67 | "hashed_secret": "5ec0c35f36d8a545fb8225c525c9d9c3a3e174fc", 68 | "line_number": 523, 69 | "type": "Hex High Entropy String" 70 | } 71 | ] 72 | }, 73 | "version": "0.12.0" 74 | } 75 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '2.7' 4 | - '3.6' 5 | 6 | install: 7 | - pip install tox-travis 8 | 9 | script: make test 10 | deploy: 11 | provider: 
pypi 12 | user: yelplabs 13 | password: 14 | secure: QG9rd2z6pH4E5NCph+mw739wsaTlTpy1c5+AR1q+w/ZSrMpucNdp1i8BXAgpj2kIvuaIQQd3Behu+SVd7u5TSCZoAE7PxUKBFvEiN/7g++RVlDlPcpXTVQT8qXfvFnTGCnS95pLhXVIMDJU4cUjjDS6kshBVuvn2MTwskY4emow= 15 | on: 16 | tags: true 17 | python: '3.6' 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Yelp.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DELETE_ON_ERROR: 2 | 3 | all: 4 | echo >&2 "Must specify target." 5 | 6 | test: 7 | tox 8 | 9 | venv: 10 | tox -evenv 11 | 12 | install-hooks: venv 13 | virtualenv_run/bin/pre-commit install -f --install-hooks 14 | 15 | clean: 16 | rm -rf build/ dist/ threat_intel.egg-info/ .tox/ virtualenv_run/ 17 | find . -name '*.pyc' -delete 18 | find . -name '__pycache__' -delete 19 | 20 | .PHONY: all test venv clean install-hooks 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # threat_intel [![Build Status: master](https://travis-ci.org/Yelp/threat_intel.svg?branch=master)](https://travis-ci.org/Yelp/threat_intel) [![PyPI](https://img.shields.io/pypi/v/threat_intel.svg)](https://pypi.python.org/pypi/threat_intel) 2 | Threat Intelligence APIs. 
3 | 4 | 5 | ## Supported threat intelligence feeds 6 | 7 | The package contains API wrappers for: 8 | 9 | * Umbrella Investigate API 10 | * VirusTotal API v2.0 11 | * ShadowServer API 12 | 13 | ---- 14 | 15 | ### Umbrella Investigate API 16 | 17 | [Umbrella Investigate](https://docs.umbrella.com/developer/investigate-api/) provides an API that 18 | allows querying for: 19 | 20 | * Domain categorization 21 | * Security information about a domain 22 | * Co-occurrences for a domain 23 | * Related domains for a domain 24 | * Domains related to an IP 25 | * Domain tagging dates for a domain 26 | * DNS RR history for a domain 27 | * WHOIS information 28 | - WHOIS information for an email 29 | - WHOIS information for a nameserver 30 | - Historical WHOIS information for a domain 31 | * Latest malicious domains for an IP 32 | 33 | To use the Investigate API wrapper import `InvestigateApi` class from `threat_intel.opendns` module: 34 | 35 | ```python 36 | from threat_intel.opendns import InvestigateApi 37 | ``` 38 | 39 | To initialize the API wrapper you need the API key: 40 | 41 | ```python 42 | investigate = InvestigateApi("") 43 | ``` 44 | 45 | You can also specify a file name where the API responses will be cached in a JSON file, 46 | to save you the bandwidth for the multiple calls about the same domains or IPs: 47 | 48 | ```python 49 | investigate = InvestigateApi("", cache_file_name="/tmp/cache.opendns.json") 50 | ``` 51 | 52 | #### Domain categorization 53 | 54 | Calls `domains/categorization/?showLabels` Investigate API endpoint. 55 | It takes a list (or any other Python enumerable) of domains and returns 56 | the categories associated with this domains by Umbrella along with a [-1, 0, 1] score, where -1 is a malicious status. 
57 | 58 | ```python 59 | domains = ["google.com", "baidu.com", "bibikun.ru"] 60 | investigate.categorization(domains) 61 | ``` 62 | 63 | will result in: 64 | 65 | ``` 66 | { 67 | "baidu.com": {"status": 1, "content_categories": ["Search Engines"], "security_categories": []}, 68 | "google.com": {"status": 1, "content_categories": ["Search Engines"], "security_categories": []}, 69 | "bibikun.ru": {"status": -1, "content_categories": [], "security_categories": ["Malware"]} 70 | } 71 | ``` 72 | 73 | #### Security information about a domain 74 | 75 | Calls `security/name/` Investigate API endpoint. 76 | It takes any Python enumerable with domains, e.g. list, and returns several security parameters 77 | associated with each domain. 78 | 79 | ```python 80 | domains = ["google.com", "baidu.com", "bibikun.ru"] 81 | investigate.security(domains) 82 | ``` 83 | 84 | will result in: 85 | 86 | ``` 87 | { 88 | "baidu.com": { 89 | "found": true, 90 | "handlings": { 91 | "domaintagging": 0.00032008666962131285, 92 | "blocked": 0.00018876906157154347, 93 | "whitelisted": 0.00019697641207465407, 94 | "expired": 2.462205150933176e-05, 95 | "normal": 0.9992695458052232 96 | }, 97 | "dga_score": 0, 98 | "rip_score": 0, 99 | 100 | .. 101 | 102 | } 103 | } 104 | ``` 105 | 106 | #### Co-occurrences for a domain 107 | 108 | Calls `recommendations/name/` Investigate API endpoint. 109 | Use this method to find out a list of co-occurence domains (domains that are being accessed by the same users within a small window of time) to the one given in a list, or any other Python enumerable. 110 | 111 | ```python 112 | domains = ["google.com", "baidu.com", "bibikun.ru"] 113 | investigate.cooccurrences(domains) 114 | ``` 115 | 116 | will result in: 117 | 118 | ``` 119 | { 120 | "baidu.com": { 121 | "found": true, 122 | "pfs2": [ 123 | ["www.howtoforge.de", 0.14108563836506008], 124 | } 125 | 126 | .. 
127 | 128 | } 129 | ``` 130 | 131 | #### Related domains for a domain 132 | 133 | Calls `links/name/` Investigate API endpoint. 134 | Use this method to find out a list of related domains (domains that have been frequently seen requested around a time window of 60 seconds, but that are not associated with the given domain) to the one given in a list, or any other Python enumerable. 135 | 136 | ```python 137 | domains = ["google.com", "baidu.com", "bibikun.ru"] 138 | investigate.related_domains(domains) 139 | ``` 140 | 141 | will result in: 142 | 143 | ``` 144 | { 145 | "tb1": [ 146 | ["t.co", 11.0], 147 | ] 148 | 149 | .. 150 | 151 | } 152 | ``` 153 | 154 | #### Domain tagging dates for a domain 155 | 156 | Calls `domains/name/` Investigate API endpoint. 157 | 158 | Use this method to get the date range when the domain being queried was a part of the Umbrella block list and how long a domain has been in this list 159 | 160 | ```python 161 | domains = ["google.com", "baidu.com", "bibikun.ru"] 162 | investigate.domain_tag(domains) 163 | ``` 164 | 165 | will result in: 166 | 167 | ``` 168 | { 169 | 'category': u'Malware', 170 | 'url': None, 171 | 'period': { 172 | 'begin': u'2013-09-16', 173 | 'end': u'Current' 174 | } 175 | 176 | .. 177 | 178 | } 179 | ``` 180 | 181 | #### DNS RR history for a Domain 182 | 183 | Calls `dnsdb/name/a/` Investigate API endpoint. 184 | Use this method to find out related domains to domains given in a list, or any other Python enumerable. 185 | 186 | ```python 187 | domains = ["google.com", "baidu.com", "bibikun.ru"] 188 | investigate.dns_rr(domains) 189 | ``` 190 | 191 | will result in: 192 | 193 | ``` 194 | { 195 | 'features': { 196 | 'geo_distance_mean': 0.0, 197 | 'locations': [ 198 | { 199 | 'lat': 59.89440155029297, 200 | 'lon': 30.26420021057129 201 | } 202 | ], 203 | 'rips': 1, 204 | 'is_subdomain': False, 205 | 'ttls_mean': 86400.0, 206 | 'non_routable': False, 207 | } 208 | 209 | .. 
210 | 211 | } 212 | ``` 213 | 214 | #### DNS RR history for an IP 215 | 216 | Calls `dnsdb/ip/a/` Investigate API endpoint. 217 | Use this method to find out related domains to the IP addresses given in a list, or any other Python enumerable. 218 | 219 | ```python 220 | ips = ['8.8.8.8'] 221 | investigate.rr_history(ips) 222 | ``` 223 | 224 | will result in: 225 | 226 | ``` 227 | { 228 | "8.8.8.8": { 229 | "rrs": [ 230 | { 231 | "name": "8.8.8.8", 232 | "type": "A", 233 | "class": "IN", 234 | "rr": "000189.com.", 235 | "ttl": 3600 236 | }, 237 | { 238 | "name": "8.8.8.8", 239 | "type": "A", 240 | "class": "IN", 241 | "rr": "008.no-ip.net.", 242 | "ttl": 60 243 | }, 244 | } 245 | 246 | .. 247 | 248 | } 249 | ``` 250 | 251 | #### WHOIS information for a domain 252 | 253 | ##### WHOIS information for an email 254 | 255 | Calls `whois/emails/{email}` Investigate API endpoint. 256 | 257 | Use this method to see WHOIS information for the email address. For now the Umbrella API will only return at most 500 results. 258 | 259 | ```python 260 | emails = ["dns-admin@google.com"] 261 | investigate.whois_emails(emails) 262 | ``` 263 | 264 | will result in: 265 | 266 | ``` 267 | { 268 | "dns-admin@google.com": { 269 | "totalResults": 500, 270 | "moreDataAvailable": true, 271 | "limit": 500, 272 | "domains": [ 273 | { 274 | "domain": "0emm.com", 275 | "current": true 276 | }, 277 | .. 278 | ] 279 | } 280 | } 281 | ``` 282 | 283 | ##### WHOIS information for a nameserver 284 | 285 | Calls `whois/nameservers/{nameserver}` Investigate API endpoint. 286 | 287 | Use this method to see WHOIS information for the nameserver. For now the Umbrella API will only return at most 500 results. 
288 | 289 | ```python 290 | nameservers = ["ns2.google.com"] 291 | investigate.whois_nameservers(nameservers) 292 | ``` 293 | 294 | will result in: 295 | 296 | ``` 297 | { 298 | "ns2.google.com": { 299 | "totalResults": 500, 300 | "moreDataAvailable": true, 301 | "limit": 500, 302 | "domains": [ 303 | { 304 | "domain": "46645.biz", 305 | "current": true 306 | }, 307 | .. 308 | ] 309 | } 310 | } 311 | ``` 312 | 313 | ##### WHOIS information for a domain 314 | 315 | Calls `whois/{domain}` Investigate API endpoint. 316 | 317 | Use this method to see WHOIS information for the domain. 318 | 319 | ```python 320 | domains = ["google.com"] 321 | investigate.whois_domains(domains) 322 | ``` 323 | 324 | will result in: 325 | 326 | ``` 327 | { 328 | "administrativeContactFax": null, 329 | "whoisServers": null, 330 | "addresses": [ 331 | "1600 amphitheatre parkway", 332 | "please contact contact-admin@google.com, 1600 amphitheatre parkway", 333 | "2400 e. bayshore pkwy" 334 | ], 335 | .. 336 | } 337 | ``` 338 | 339 | ##### Historical WHOIS information for a domain 340 | 341 | Calls `whois/{domain}/history` Investigate API endpoint. 342 | 343 | Use this method to see historical WHOIS information for the domain. 344 | 345 | ```python 346 | domains = ["5esb.biz"] 347 | investigate.whois_domains_history(domains) 348 | ``` 349 | 350 | will result in: 351 | 352 | ``` 353 | { 354 | '5esb.biz':[ 355 | { 356 | u'registrantFaxExt':u'', 357 | u'administrativeContactPostalCode':u'656448', 358 | u'zoneContactCity':u'', 359 | u'addresses':[ 360 | u'nan qu hua yuan xiao he' 361 | ], 362 | .. 363 | }, 364 | .. 365 | ] 366 | } 367 | ``` 368 | 369 | #### Latest malicious domains for an IP 370 | 371 | Calls `ips/{ip}/latest_domains` Investigate API endpoint. 372 | 373 | Use this method to see whether the IP address has any malicious domains associated with it. 
374 | 375 | ```python 376 | ips = ["8.8.8.8"] 377 | investigate.latest_malicious(ips) 378 | ``` 379 | 380 | will result in: 381 | 382 | ``` 383 | { 384 | [ 385 | '7ltd.biz', 386 | 'co0s.ru', 387 | 't0link.in', 388 | ] 389 | 390 | .. 391 | } 392 | ``` 393 | 394 | ---- 395 | 396 | ### VirusTotal API 397 | 398 | [VirusTotal](https://www.virustotal.com/) provides an 399 | [API](https://www.virustotal.com/en/documentation/public-api/) that makes it 400 | possible to query for the reports about: 401 | 402 | * Domains 403 | * URLs 404 | * IPs 405 | * File hashes 406 | * File Upload 407 | * Live Feed 408 | * Advanced search 409 | 410 | To use the VirusTotal API wrapper import `VirusTotalApi` class from `threat_intel.virustotal` module: 411 | 412 | ```python 413 | from threat_intel.virustotal import VirusTotalApi 414 | ``` 415 | 416 | To initialize the API wrapper you need the API key: 417 | 418 | ```python 419 | vt = VirusTotalApi("") 420 | ``` 421 | 422 | VirusTotal API calls allow to squeeze a list of file hashes or URLs into a single HTTP call. 423 | Depending on the API version you are using (public or private) you may need to tune the maximum number 424 | of the resources (file hashes or URLs) that could be passed in a single API call. 425 | You can do it with the `resources_per_req` parameter: 426 | 427 | ```python 428 | vt = VirusTotalApi("", resources_per_req=4) 429 | ``` 430 | 431 | When using the public API your standard request rate allows you too put maximum 4 resources per request. 432 | With private API you are able to put up to 25 resources per call. That is also the default value if you 433 | don't pass the `resources_per_req` parameter. 434 | 435 | Of course when calling the API wrapper methods in the `VirusTotalApi` class you can pass as many resources 436 | as you want and the wrapper will take care of producing as many API calls as necessary to satisfy the request rate. 
437 | 438 | You can also specify the file name where the responses will be cached: 439 | 440 | ```python 441 | vt = VirusTotalApi("", cache_file_name="/tmp/cache.virustotal.json") 442 | ``` 443 | 444 | #### Domain report endpoint 445 | 446 | Calls `domain/report` VirusTotal API endpoint. 447 | Pass a list or any other Python enumerable containing the domains: 448 | 449 | ```python 450 | domains = ["google.com", "baidu.com", "bibikun.ru"] 451 | vt.get_domain_reports(domains) 452 | ``` 453 | 454 | will result in: 455 | 456 | ``` 457 | { 458 | "baidu.com": { 459 | "undetected_referrer_samples": [ 460 | { 461 | "positives": 0, 462 | "total": 56, 463 | "sha256": "e3c1aea1352362e4b5c008e16b03810192d12a4f1cc71245f5a75e796c719c69" 464 | } 465 | ], 466 | 467 | .. 468 | 469 | } 470 | } 471 | ``` 472 | 473 | 474 | #### URL report endpoint 475 | 476 | Calls `url/report` VirusTotal API endpoint. 477 | Pass a list or any other Python enumerable containing the URL addresses: 478 | 479 | ```python 480 | urls = ["http://www.google.com", "http://www.yelp.com"] 481 | vt.get_url_reports(urls) 482 | ``` 483 | 484 | will result in: 485 | 486 | ``` 487 | { 488 | "http://www.google.com": { 489 | "permalink": "https://www.virustotal.com/url/dd014af5ed6b38d9130e3f466f850e46d21b951199d53a18ef29ee9341614eaf/analysis/1423344006/", 490 | "resource": "http://www.google.com", 491 | "url": "http://www.google.com/", 492 | "response_code": 1, 493 | "scan_date": "2015-02-07 21:20:06", 494 | "scan_id": "dd014af5ed6b38d9130e3f466f850e46d21b951199d53a18ef29ee9341614eaf-1423344006", 495 | "verbose_msg": "Scan finished, scan information embedded in this object", 496 | "filescan_id": null, 497 | "positives": 0, 498 | "total": 62, 499 | "scans": { 500 | "CLEAN MX": { 501 | "detected": false, 502 | "result": "clean site" 503 | }, 504 | } 505 | .. 506 | 507 | } 508 | ``` 509 | 510 | #### URL scan endpoint 511 | 512 | Calls 'url/scan' VirusTotal API endpoint. 
513 | Submit a url or any other Python enumerable containing the URL addresses: 514 | 515 | ```python 516 | urls = ["http://www.google.com", "http://www.yelp.com"] 517 | vt.get_url_reports(urls) 518 | ``` 519 | 520 | #### Hash report endpoint 521 | 522 | Calls `file/report` VirusTotal API endpoint. 523 | You can request the file reports passing a list of hashes (md5, sha1 or sha2): 524 | 525 | ```python 526 | file_hashes = [ 527 | "99017f6eebbac24f351415dd410d522d", 528 | "88817f6eebbac24f351415dd410d522d" 529 | ] 530 | 531 | vt.get_file_reports(file_hashes) 532 | ``` 533 | 534 | will result in: 535 | 536 | ``` 537 | { 538 | "88817f6eebbac24f351415dd410d522d": { 539 | "response_code": 0, 540 | "resource": "88817f6eebbac24f351415dd410d522d", 541 | "verbose_msg": "The requested resource is not among the finished, queued or pending scans" 542 | }, 543 | "99017f6eebbac24f351415dd410d522d": { 544 | "scan_id": "52d3df0ed60c46f336c131bf2ca454f73bafdc4b04dfa2aea80746f5ba9e6d1c-1423261860", 545 | "sha1": "4d1740485713a2ab3a4f5822a01f645fe8387f92", 546 | } 547 | 548 | .. 549 | 550 | } 551 | ``` 552 | 553 | #### Hash rescan endpoint 554 | 555 | Calls `file/rescan` VirusTotal API endpoint. Use to rescan a previously submitted file. 556 | You can request the file reports passing a list of hashes (md5, sha1 or sha2): 557 | 558 | #### Hash behaviour endpoint 559 | 560 | Calls `file/behaviour` VirusTotal API endpoint. Use to get a report about the behaviour of the file when executed in a sandboxed environment (Cuckoo sandbox). 561 | You can request the file reports passing a list of hashes (md5, sha1 or sha2): 562 | 563 | ```python 564 | file_hashes = [ 565 | "99017f6eebbac24f351415dd410d522d", 566 | "88817f6eebbac24f351415dd410d522d" 567 | ] 568 | 569 | vt.get_file_behaviour(file_hashes) 570 | ``` 571 | 572 | #### Hash network-traffic endpoint 573 | 574 | Calls `file/network-traffic` VirusTotal API endpoint. 
Use to get the dump of the network traffic generated by the file when executed. 575 | You can request the file reports passing a list of hashes (md5, sha1 or sha2): 576 | 577 | ```python 578 | file_hashes = [ 579 | "99017f6eebbac24f351415dd410d522d", 580 | "88817f6eebbac24f351415dd410d522d" 581 | ] 582 | 583 | vt.get_file_network_traffic(file_hashes) 584 | ``` 585 | 586 | #### Hash download endpoint 587 | 588 | Calls `file/download` VirusTotal API endpoint. Use to download a file by its hash. 589 | You can request the file reports passing a list of hashes (md5, sha1 or sha2): 590 | 591 | ```python 592 | file_hashes = [ 593 | "99017f6eebbac24f351415dd410d522d", 594 | "88817f6eebbac24f351415dd410d522d" 595 | ] 596 | 597 | vt.get_file_download(file_hashes) 598 | ``` 599 | 600 | #### IP reports endpoint 601 | 602 | Calls `ip-address/report` VirusTotal API endpoint. 603 | Pass a list or any other Python enumerable containing the IP addresses: 604 | 605 | ```python 606 | ips = ['90.156.201.27', '198.51.132.80'] 607 | vt.get_ip_reports(ips) 608 | ``` 609 | 610 | will result in: 611 | 612 | ``` 613 | { 614 | "90.156.201.27": { 615 | "asn": "25532", 616 | "country": "RU", 617 | "response_code": 1, 618 | "as_owner": ".masterhost autonomous system", 619 | "verbose_msg": "IP address found in dataset", 620 | "resolutions": [ 621 | { 622 | "last_resolved": "2013-04-01 00:00:00", 623 | "hostname": "027.ru" 624 | }, 625 | { 626 | "last_resolved": "2015-01-20 00:00:00", 627 | "hostname": "600volt.ru" 628 | }, 629 | 630 | .. 631 | 632 | ], 633 | "detected_urls": [ 634 | { 635 | "url": "http://shop.albione.ru/", 636 | "positives": 2, 637 | "total": 52, 638 | "scan_date": "2014-04-06 11:18:17" 639 | }, 640 | { 641 | "url": "http://www.orlov.ru/", 642 | "positives": 3, 643 | "total": 52, 644 | "scan_date": "2014-03-05 09:13:31" 645 | } 646 | ], 647 | }, 648 | 649 | "198.51.132.80": { 650 | 651 | .. 
652 | 653 | } 654 | } 655 | ``` 656 | 657 | #### URL live feed endpoint 658 | 659 | Calls `url/distribution` VirusTotal API endpoint. Use to get a live a feed with the latest URLs submitted to VirusTotal. 660 | 661 | ```python 662 | vt.get_url_distribution() 663 | ``` 664 | 665 | #### Hash live feed endpoint 666 | 667 | Calls `file/distribution` VirusTotal API endpoint. Use to get a live a feed with the latest Hashes submitted to VirusTotal. 668 | 669 | ```python 670 | vt.get_file_distribution() 671 | ``` 672 | 673 | #### Hash search endpoint 674 | 675 | Calls `file/search` VirusTotal API endpoint. Use to search for samples that match some binary/metadata/detection criteria. 676 | 677 | ```python 678 | vt.get_file_search() 679 | ``` 680 | 681 | #### File date endpoint 682 | 683 | Calls `file/clusters` VirusTotal API endpoint. Use to list simililarity clusters for a given time frame. 684 | 685 | ```python 686 | vt.get_file_clusters() 687 | ``` 688 | 689 | --- 690 | 691 | ### ShadowServer API 692 | 693 | [ShadowServer](http://shadowserver.org/) provides and [API](http://bin-test.shadowserver.org/) that allows to test 694 | the hashes against a list of known software applications. 
695 | 696 | To use the ShadowServer API wrapper import `ShadowServerApi` class from `threat_intel.shadowserver` module: 697 | 698 | ```python 699 | from threat_intel.shadowserver import ShadowServerApi 700 | ``` 701 | 702 | To use the API wrapper simply call the `ShadowServerApi` initializer: 703 | 704 | ```python 705 | ss = ShadowServerApi() 706 | ``` 707 | 708 | You can also specify the file name where the API responses will be cached: 709 | 710 | ```python 711 | ss = ShadowServerApi(cache_file_name="/tmp/cache.shadowserver.json") 712 | ``` 713 | 714 | To check whether the hashes are on the ShadowServer list of known hashes, 715 | call `get_bin_test` method and pass enumerable with the hashes you want to test: 716 | 717 | ```python 718 | file_hashes = [ 719 | "99017f6eebbac24f351415dd410d522d", 720 | "88817f6eebbac24f351415dd410d522d" 721 | ] 722 | 723 | ss.get_bin_test(file_hashes) 724 | 725 | ``` 726 | 727 | --- 728 | 729 | ## Installation 730 | 731 | ### Install with `pip` 732 | 733 | ```shell 734 | $ pip install threat_intel 735 | ``` 736 | 737 | ### Testing 738 | Go to town with `make`: 739 | 740 | ```shell 741 | $ sudo pip install tox 742 | $ make test 743 | ``` 744 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -e . 
2 | -r requirements.txt 3 | coverage==3.7.1 4 | mock==1.0.1 5 | pre-commit==1.13.0 6 | pyflakes==0.9.2 7 | testify==0.7.2 8 | tornado==4.5.3 9 | tox==2.3.1 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests-futures==0.9.9 2 | requests[security]==2.21.0 3 | simplejson==3.10.0 4 | six==1.10.0 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from setuptools import find_packages 3 | from setuptools import setup 4 | 5 | 6 | setup( 7 | name="threat_intel", 8 | version='0.2.1', 9 | provides=['threat_intel'], 10 | author="Yelp Security", 11 | url='https://github.com/Yelp/threat_intel', 12 | setup_requires='setuptools', 13 | license='Copyright 2016 Yelp', 14 | author_email="opensource@yelp.com", 15 | description="Collection of the API calls for various threat intel feeds.", 16 | packages=find_packages(), 17 | install_requires=[ 18 | "requests-futures>=0.9.9", 19 | "requests[security]>=2.13.0", 20 | "simplejson>=3.10.0", 21 | "six>=1.10.0", 22 | ], 23 | ) 24 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /tests/alexa_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | import testify as T 4 | from mock import patch 5 | 6 | from threat_intel.alexaranking import AlexaRankingApi 7 | 8 | from requests.models import Response 9 | 10 | 11 | class AlexaRankingApiTest(T.TestCase): 12 | 13 | """Tests requesting reports from AlexaRankingApi.""" 14 | 15 | def 
mock_ok_response(self): 16 | """Mocks a successful request response.""" 17 | content_ok = open("tests/data/response.xml").read() 18 | response = Response() 19 | response.status_code = 200 20 | response._content = content_ok 21 | return response 22 | 23 | def mock_bad_response(self): 24 | """Mocks an unsuccessful request response.""" 25 | response = Response() 26 | content_bad = u'Internal Server Error'.encode('utf-8') 27 | response.status_code = 400 28 | response._content = content_bad 29 | return response 30 | 31 | @T.setup 32 | def setup_ar(self): 33 | self.ar = AlexaRankingApi() 34 | 35 | def _test_api_call( 36 | self, call, request, expected_query_params, api_response, 37 | expected_result): 38 | """ 39 | Tests a AlexaRankingApi call by mocking out the HTTP request. 40 | 41 | Args: 42 | call: Function in AlexaRankingApi to call. 43 | endpoint: Endpoint of AlexaRanking API that is hit. 44 | request: Call arguments. 45 | expected_query_params: Parameters that should be passed to API. 46 | api_response: The expected response by the API. 47 | expected_result: What the call should return. 
48 | """ 49 | with patch.object(self.ar, '_requests') as request_mock: 50 | request_mock.multi_get.return_value = api_response 51 | result = call(request) 52 | request_mock.multi_get.assert_called_with( 53 | self.ar.BASE_URL, 54 | to_json=False, 55 | query_params=expected_query_params) 56 | T.assert_equal(result, expected_result) 57 | 58 | def test_get_alexa_rankings_good_response(self): 59 | successful_response = self.mock_ok_response() 60 | self._test_api_call(call=self.ar.get_alexa_rankings, 61 | request=['domain1.com'], 62 | expected_query_params=[{'url': 'domain1.com'}], 63 | api_response=[successful_response], 64 | expected_result={ 65 | "domain1.com": { 66 | "attributes": { 67 | "domain": "domain1.com", 68 | "popularity": "81743", 69 | "reach": "76276", 70 | "rank": "-67329" 71 | } 72 | } 73 | }) 74 | 75 | def test_get_alexa_rankings_bad_response(self): 76 | unsuccessful_response = self.mock_bad_response() 77 | self._test_api_call(call=self.ar.get_alexa_rankings, 78 | request=['domain2.com'], 79 | expected_query_params=[{'url': 'domain2.com'}], 80 | api_response=[unsuccessful_response], 81 | expected_result={ 82 | "domain2.com": { 83 | "attributes": { 84 | "domain": "domain2.com" 85 | } 86 | } 87 | }) 88 | -------------------------------------------------------------------------------- /tests/data/response.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /tests/opendns_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | from six.moves import builtins 4 | import testify as T 5 | from mock import ANY 6 | from mock import mock_open 7 | from mock import patch 8 | 9 | from threat_intel.opendns import InvestigateApi 10 | from threat_intel.opendns import ResponseError 11 | from threat_intel.util.api_cache import ApiCache 12 | from threat_intel.util.http 
import MultiRequest 13 | 14 | 15 | class InvestigateApiTest(T.TestCase): 16 | 17 | """Tests requesting reports from OpenDNS.""" 18 | 19 | @T.setup 20 | def setup_opendns(self): 21 | self.opendns = InvestigateApi('test_key') 22 | 23 | def _patch_and_assert_categorization(self, all_responses, expected_responses, domains, expected_url, expected_data): 24 | with patch.object(MultiRequest, 'multi_post', autospec=True, return_value=all_responses) as patched_multi_post: 25 | actual_responses = self.opendns.categorization(domains) 26 | 27 | patched_multi_post.assert_called_with(ANY, expected_url, data=expected_data) 28 | assert expected_responses == actual_responses 29 | 30 | def test_categorization(self): 31 | domains = ['yellowstone.org', 'zion.org', 'sequoia.org', 'greatsanddunes.org'] 32 | all_responses = [ 33 | { 34 | u'yellowstone.org': { 35 | u'content_categories': [u'National Parks'], 36 | u'security_categories': [], 37 | u'status': 1 38 | }, 39 | u'zion.org': { 40 | u'content_categories': [u'National Parks'], 41 | u'security_categories': [], 42 | u'status': 1 43 | }, 44 | u'sequoia.org': { 45 | u'content_categories': [u'National Parks'], 46 | u'security_categories': [], 47 | u'status': 1 48 | }, 49 | u'greatsanddunes.org': { 50 | u'content_categories': [u'National Parks'], 51 | u'security_categories': [], 52 | u'status': 1 53 | } 54 | } 55 | ] 56 | 57 | expected_url = u'https://investigate.api.umbrella.com/domains/categorization/?showLabels' 58 | expected_data = ['["yellowstone.org", "zion.org", "sequoia.org", "greatsanddunes.org"]'] 59 | expected_responses = all_responses[0] 60 | 61 | self._patch_and_assert_categorization(all_responses, expected_responses, domains, expected_url, expected_data) 62 | 63 | def test_categorization_domains_limit(self): 64 | self.opendns.MAX_DOMAINS_IN_POST = 2 65 | domains = [ 66 | 'northyorkmoors.org.uk', 'peakdistrict.org.uk', 67 | 'cairngorms.org.uk', 'pembrokeshirecoast.org.uk', 68 | 'northumberland.org.uk'] 69 | all_responses = [ 
70 | { 71 | u'northyorkmoors.org.uk': { 72 | u'content_categories': [u'National Parks'], 73 | u'security_categories': [], 74 | u'status': 1 75 | }, 76 | u'peakdistrict.org.uk': { 77 | u'content_categories': [u'National Parks'], 78 | u'security_categories': [], 79 | u'status': 1 80 | }, 81 | }, 82 | { 83 | u'cairngorms.org.uk': { 84 | u'content_categories': [u'National Parks'], 85 | u'security_categories': [], 86 | u'status': 1 87 | }, 88 | u'pembrokeshirecoast.org.uk': { 89 | u'content_categories': [u'National Parks'], 90 | u'security_categories': [], 91 | u'status': 1 92 | }, 93 | }, 94 | { 95 | u'northumberland.org.uk': { 96 | u'content_categories': [u'National Parks'], 97 | u'security_categories': [], 98 | u'status': 1 99 | } 100 | } 101 | ] 102 | 103 | expected_data = [ 104 | '["northyorkmoors.org.uk", "peakdistrict.org.uk"]', 105 | '["cairngorms.org.uk", "pembrokeshirecoast.org.uk"]', 106 | '["northumberland.org.uk"]'] 107 | expected_responses = { 108 | u'northyorkmoors.org.uk': { 109 | u'content_categories': [u'National Parks'], 110 | u'security_categories': [], 111 | u'status': 1 112 | }, 113 | u'peakdistrict.org.uk': { 114 | u'content_categories': [u'National Parks'], 115 | u'security_categories': [], 116 | u'status': 1 117 | }, 118 | u'cairngorms.org.uk': { 119 | u'content_categories': [u'National Parks'], 120 | u'security_categories': [], 121 | u'status': 1 122 | }, 123 | u'pembrokeshirecoast.org.uk': { 124 | u'content_categories': [u'National Parks'], 125 | u'security_categories': [], 126 | u'status': 1 127 | }, 128 | u'northumberland.org.uk': { 129 | u'content_categories': [u'National Parks'], 130 | u'security_categories': [], 131 | u'status': 1 132 | } 133 | } 134 | 135 | self._patch_and_assert_categorization(all_responses, expected_responses, domains, ANY, expected_data) 136 | 137 | def test_categorization_response_error(self): 138 | """Tests whether the ResponseError is raised when the response 139 | returned from the actual API call is empty. 
140 | """ 141 | domains = ['yosemite.gov', 'joushuatree.gov', 'deathvalley.gov'] 142 | # empty responses should raise an error 143 | all_responses = [{}] 144 | 145 | # mock cache file 146 | mock_read = mock_open(read_data="{}") 147 | 148 | with patch.object( 149 | builtins, 'open', mock_read, create=True 150 | ), patch.object( 151 | ApiCache, 'bulk_lookup', autospec=True, return_value={} 152 | ), patch.object( 153 | MultiRequest, 'multi_post', autospec=True, return_value=all_responses 154 | ): 155 | i = InvestigateApi('hocus pocus', 'cache.json') 156 | with T.assert_raises(ResponseError): 157 | i.categorization(domains) 158 | 159 | def _test_api_call_get(self, call, endpoint, request, expected_url_params, 160 | api_response, expected_result, expected_query_params=None): 161 | """ 162 | Tests a OpenDNS call by mocking out the HTTP GET request. 163 | 164 | Args: 165 | call: function in OpenDNSApi to call. 166 | endpoint: endpoint of OpenDNS API that is hit (appended to base url) 167 | request: call arguments 168 | expected_url_params: URL parameters that should be passed to API 169 | api_response: the expected response by the API 170 | expected_result: what call should return (given the api response provided) 171 | expected_query_params: query parameters that should be passed to API 172 | """ 173 | with patch.object(self.opendns, '_requests') as request_mock: 174 | request_mock.multi_get.return_value = api_response 175 | result = call(request) 176 | 177 | url = self.opendns._to_url(endpoint.format(expected_url_params)) 178 | request_mock.multi_get.assert_called_with([url], expected_query_params) 179 | T.assert_equal(result, expected_result) 180 | 181 | def test_security(self): 182 | self._test_api_call_get(call=self.opendns.security, 183 | endpoint=u'security/name/{0}.json', 184 | request=['domain'], 185 | expected_url_params='domain', 186 | api_response={}, 187 | expected_result={}) 188 | 189 | def test_whois_emails(self): 190 | 
self._test_api_call_get(call=self.opendns.whois_emails, 191 | endpoint=u'whois/emails/{0}', 192 | request=['admin@dns.com'], 193 | expected_url_params='admin@dns.com', 194 | api_response={}, 195 | expected_result={}) 196 | 197 | def test_whois_nameservers(self): 198 | self._test_api_call_get(call=self.opendns.whois_nameservers, 199 | endpoint=u'whois/nameservers/{0}', 200 | request=['ns.dns.com'], 201 | expected_url_params='ns.dns.com', 202 | api_response={}, 203 | expected_result={}) 204 | 205 | def test_whois_domains(self): 206 | self._test_api_call_get(call=self.opendns.whois_domains, 207 | endpoint=u'whois/{0}', 208 | request=['google.com'], 209 | expected_url_params='google.com', 210 | api_response={}, 211 | expected_result={}) 212 | 213 | def test_whois_domains_history(self): 214 | self._test_api_call_get(call=self.opendns.whois_domains_history, 215 | endpoint=u'whois/{0}/history', 216 | request=['5esb.biz'], 217 | expected_url_params='5esb.biz', 218 | api_response={}, 219 | expected_result={}) 220 | 221 | def test_coocurrences(self): 222 | self._test_api_call_get(call=self.opendns.cooccurrences, 223 | endpoint=u'recommendations/name/{0}.json', 224 | request=['domain'], 225 | expected_url_params='domain', 226 | api_response={}, 227 | expected_result={}) 228 | 229 | def test_rr_history(self): 230 | self._test_api_call_get(call=self.opendns.rr_history, 231 | endpoint=u'dnsdb/ip/a/{0}.json', 232 | request=['8.8.8.8'], 233 | expected_url_params='8.8.8.8', 234 | api_response={}, 235 | expected_result={}) 236 | 237 | def test_latest_malicious(self): 238 | self._test_api_call_get(call=self.opendns.latest_malicious, 239 | endpoint=u'ips/{0}/latest_domains', 240 | request=['8.8.8.8'], 241 | expected_url_params='8.8.8.8', 242 | api_response={}, 243 | expected_result={}) 244 | 245 | def test_domain_tag(self): 246 | self._test_api_call_get(call=self.opendns.domain_tag, 247 | endpoint=u'domains/{0}/latest_tags', 248 | request=['domain'], 249 | 
expected_url_params='domain', 250 | api_response={}, 251 | expected_result={}) 252 | 253 | def test_dns_rr(self): 254 | self._test_api_call_get(call=self.opendns.dns_rr, 255 | endpoint=u'dnsdb/name/a/{0}.json', 256 | request=['domain'], 257 | expected_url_params='domain', 258 | api_response={}, 259 | expected_result={}) 260 | 261 | def test_related_domains(self): 262 | self._test_api_call_get(call=self.opendns.related_domains, 263 | endpoint=u'links/name/{0}.json', 264 | request=['domain'], 265 | expected_url_params='domain', 266 | api_response={}, 267 | expected_result={}) 268 | 269 | def test_sample(self): 270 | self._test_api_call_get(call=self.opendns.sample, 271 | endpoint=u'sample/{0}', 272 | request=['0492d93195451e41f568f68e7704eb0812bc2b19'], 273 | expected_url_params='0492d93195451e41f568f68e7704eb0812bc2b19', 274 | api_response={}, 275 | expected_result={}) 276 | 277 | def test_search(self): 278 | self._test_api_call_get(call=self.opendns.search, 279 | endpoint=u'search/{0}', 280 | request=['pattern'], 281 | expected_url_params='pattern', 282 | api_response={}, 283 | expected_result={}, 284 | expected_query_params={'start': '-30days', 285 | 'includecategory': 'false', 286 | 'limit': 1000}) 287 | 288 | def test_risk_score(self): 289 | self._test_api_call_get(call=self.opendns.risk_score, 290 | endpoint=u'domains/risk-score/{0}', 291 | request=['domain'], 292 | expected_url_params='domain', 293 | api_response={}, 294 | expected_result={}) 295 | -------------------------------------------------------------------------------- /tests/util/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /tests/util/api_cache_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | from six.moves import builtins 4 | import simplejson 5 | import testify 
as T 6 | from mock import mock_open 7 | from mock import patch 8 | 9 | from threat_intel.util.api_cache import ApiCache 10 | 11 | 12 | def assert_cache_written(mock_write, patched_open): 13 | T.assert_equal(mock_write.call_count, 1) 14 | 15 | for call in patched_open.mock_calls: 16 | name, args, kwargs = call 17 | if '().write' != name: 18 | continue 19 | 20 | return simplejson.loads(args[0]) 21 | return None 22 | 23 | 24 | def assert_cache_not_written(mock_write): 25 | T.assert_falsey(mock_write.called) 26 | return None 27 | 28 | 29 | class ApiCacheFileIOTest(T.TestCase): 30 | 31 | """Allows for setting and retrieving results of API calls.""" 32 | 33 | @T.setup 34 | def setup_filename(self): 35 | self._file_name = '/tmp/any_name_will_do' 36 | 37 | def _open_cache(self, initial_contents=None, update_cache=True): 38 | """Creates an ApiCache object, mocking the contents of the cache on disk. 39 | 40 | Args: 41 | initial_contents: A dict containing the initial contents of the cache 42 | update_cache: Specifies whether ApiCache should write out the 43 | cache file when closing it 44 | Returns: 45 | ApiCache 46 | """ 47 | if not initial_contents: 48 | initial_contents = {} 49 | 50 | file_contents = simplejson.dumps(initial_contents) 51 | mock_read = mock_open(read_data=file_contents) 52 | with patch.object(builtins, 'open', mock_read, create=True): 53 | api_cache = ApiCache(self._file_name, update_cache=update_cache) 54 | return api_cache 55 | 56 | def _close_cache(self, api_cache, cache_written=True): 57 | """Closes an ApiCache and reads the final contents that were written to disk. 
58 | 59 | Args: 60 | api_cache: An ApiCache instance 61 | cache_written: Specifies whether it should test that the cache 62 | was written out to the cache file or whether to 63 | test that it was not written out 64 | Returns: 65 | A dict representing the contents of the cache that was written 66 | out to the cache file or `None` in case cache was not expected 67 | to be written out 68 | """ 69 | mock_write = mock_open() 70 | with patch.object(builtins, 'open', mock_write, create=True) as patched_open: 71 | api_cache.close() 72 | 73 | if cache_written: 74 | return assert_cache_written(mock_write, patched_open) 75 | 76 | return assert_cache_not_written(mock_write) 77 | 78 | def test_create_cache(self): 79 | initial_contents = { 80 | 'banana': { 81 | 'apple': ['pear', 'panda'], 82 | 'sumo': False, 83 | 'rebel_base_count': 42 84 | }, 85 | 'skiddo': 'Fo Sure', 86 | 'pi': 3.1415 87 | } 88 | 89 | api_cache = self._open_cache(initial_contents) 90 | final_contents = self._close_cache(api_cache) 91 | T.assert_equal(initial_contents, final_contents) 92 | 93 | def test_persist_objects(self): 94 | contents_to_load = { 95 | 'api1': { 96 | 'key1': 'value1', 97 | 'key2': 11, 98 | 'key3': {'some': 'dict'}, 99 | 'key4': ['a', 'list'] 100 | }, 101 | 'api2': { 102 | 'key1': 'value42', 103 | 'key4': 'lavash bread' 104 | } 105 | } 106 | 107 | # Open an empty cache 108 | api_cache = self._open_cache() 109 | 110 | # Load the cache 111 | for api_name in contents_to_load.keys(): 112 | for key in contents_to_load[api_name]: 113 | api_cache.cache_value(api_name, key, contents_to_load[api_name][key]) 114 | 115 | # Verify the cache 116 | for api_name in contents_to_load.keys(): 117 | for key in contents_to_load[api_name]: 118 | expected_val = contents_to_load[api_name][key] 119 | actual_val = api_cache.lookup_value(api_name, key) 120 | T.assert_equal(expected_val, actual_val) 121 | 122 | # Close the cache 123 | final_contents = self._close_cache(api_cache) 124 | T.assert_equal(contents_to_load, 
final_contents) 125 | 126 | def test_do_not_update_cache(self): 127 | initial_contents = { 128 | 'api1': { 129 | 'bingo': 'woohoo' 130 | }, 131 | 'api2': { 132 | 'bongo': 'boo' 133 | } 134 | } 135 | api_cache = self._open_cache(initial_contents, False) 136 | final_contents = self._close_cache(api_cache, cache_written=False) 137 | T.assert_equal(None, final_contents) 138 | -------------------------------------------------------------------------------- /tests/util/error_messages_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from six import StringIO 3 | 4 | import testify as T 5 | from mock import patch 6 | 7 | from threat_intel.exceptions import InvalidRequestError 8 | from threat_intel.util.error_messages import write_error_message 9 | from threat_intel.util.error_messages import write_exception 10 | 11 | 12 | class StdErrTestCase(T.TestCase): 13 | 14 | """Mocks out sys.stderr""" 15 | 16 | @T.setup_teardown 17 | def setupStringIO(self): 18 | self._stringio = StringIO() 19 | with patch('sys.stderr', self._stringio): 20 | yield 21 | 22 | 23 | class WriteExceptionTest(StdErrTestCase): 24 | 25 | def test_simple_exception(self): 26 | try: 27 | raise Exception() 28 | except Exception as e: 29 | write_exception(e) 30 | 31 | output = self._stringio.getvalue() 32 | T.assert_equal(0, output.find('[ERROR]')) 33 | 34 | def test_specific_exception(self): 35 | try: 36 | raise InvalidRequestError() 37 | except Exception as e: 38 | write_exception(e) 39 | 40 | output = self._stringio.getvalue() 41 | T.assert_equal(0, output.find('[ERROR] InvalidRequestError')) 42 | 43 | def test_exception_message(self): 44 | try: 45 | raise InvalidRequestError('Look for me in validation') 46 | except Exception as e: 47 | write_exception(e) 48 | 49 | output = self._stringio.getvalue() 50 | T.assert_equal(0, output.find('[ERROR] InvalidRequestError Look for me in validation')) 51 | 52 | 53 | class 
WriteErrorMessageTest(StdErrTestCase): 54 | 55 | def test_write_error_message(self): 56 | message = 'Look for me in validation' 57 | expected = '[ERROR] Look for me in validation\n' 58 | 59 | write_error_message(message) 60 | 61 | output = self._stringio.getvalue() 62 | T.assert_equal(output, expected) 63 | -------------------------------------------------------------------------------- /tests/util/http_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from itertools import chain 4 | 5 | import testify as T 6 | from mock import MagicMock 7 | from requests.models import Response 8 | from requests_futures import sessions 9 | 10 | from threat_intel.exceptions import InvalidRequestError 11 | from threat_intel.util.http import MultiRequest 12 | 13 | 14 | class MultiRequestTest(T.TestCase): 15 | 16 | def mock_ok_responses(self, number_of_responses): 17 | """Mocks `number_of_responses` response mocks. All of them are with the "200 OK" HTTP status code.""" 18 | responses = [Response() for _ in range(number_of_responses)] 19 | for response in responses: 20 | response.status_code = 200 21 | response._content = u'{"Director": "Alejandro González Iñárritu"}'.encode('utf-8') 22 | return responses 23 | 24 | def mock_forbidden_response(self, response): 25 | """Mocks forbidden response by changing its status code to 403 and the content to indicate the error.""" 26 | response.status_code = 403 27 | response._content = u'Forbidden'.encode('utf-8') 28 | 29 | def mock_unsuccessful_response(self, response): 30 | """Mocks unsuccessful response by changing its status code to 500 and the content to indicate the error.""" 31 | response.status_code = 500 32 | response._content = u'Internal Server Error'.encode('utf-8') 33 | 34 | def mock_not_found_response(self, response): 35 | """Mocks a 404 response by changes its status code""" 36 | response.status_code = 404 37 | response._content = u'Not 
Found'.encode('utf-8') 38 | 39 | def mock_unsuccessful_responses(self, responses): 40 | """Mocks unsuccessful responses by changing their status code to 500 and the content to indicate the error.""" 41 | for response in responses: 42 | self.mock_unsuccessful_response(response) 43 | 44 | def mock_json_convertion_error(self, response): 45 | """Mocks the exception raised in case response cannot be converted to JSON. 46 | Based on http://docs.python-requests.org/en/master/user/quickstart/#json-response-content 47 | """ 48 | response.json = MagicMock(side_effect=ValueError('No JSON object could be decoded')) 49 | response._content = u'This is not JSON'.encode('utf-8') 50 | response.request = MagicMock() 51 | # this is necessary for the log message referencing the URL 52 | response.request.response = response 53 | 54 | def mock_request_futures(self, responses): 55 | """Mocks session.request method call returning `responses`.""" 56 | mock_responder = MagicMock(name='requests_session') 57 | mock_responder.return_value.result = MagicMock(side_effect=responses) 58 | sessions.FuturesSession.get = mock_responder 59 | sessions.FuturesSession.post = mock_responder 60 | return mock_responder 61 | 62 | def test_multi_get_none_response(self): 63 | """Tests the behavior of the `multi_get()` method when one of the responses is `None`.""" 64 | number_of_requests = 10 65 | query_params = [{'Jim Bridger': 'Will Poulter'}] * number_of_requests 66 | responses = self.mock_ok_responses(number_of_requests) 67 | responses[3] = None 68 | self.mock_request_futures(responses) 69 | 70 | actual_responses = MultiRequest(max_retry=1).multi_get('example.com', query_params) 71 | 72 | T.assert_equals(10, len(actual_responses)) 73 | T.assert_is(actual_responses[3], None) 74 | 75 | def test_multi_get_access_forbidden(self): 76 | """Tests the exception handling in the cases when a request returns "403 Forbidden".""" 77 | number_of_requests = 20 78 | query_params = [{'Hugh Glass': 'Leonardo DiCaprio'}] * 
number_of_requests 79 | responses = self.mock_ok_responses(number_of_requests) 80 | self.mock_forbidden_response(responses[13]) 81 | self.mock_request_futures(responses) 82 | 83 | with T.assert_raises_such_that(InvalidRequestError, lambda e: T.assert_equal(str(e), 'Access forbidden')): 84 | MultiRequest().multi_get('example.com', query_params) 85 | 86 | def test_multi_get_max_retry(self): 87 | """Tests the case when the number of the maximum retries is reached, due to the unsuccessful responses. 88 | Request is repeated 3 times (based on `max_retry`), each time there is only one successful response. 89 | Eventually the call to `multi_get` returns the responses among which one is unsuccessful (`None`). 90 | """ 91 | number_of_requests = 4 92 | query_params = [{'John Fitzgerald': 'Tom Hardy'}] * number_of_requests 93 | responses_to_calls = [ 94 | self.mock_ok_responses(number_of_requests), 95 | self.mock_ok_responses(number_of_requests - 1), 96 | self.mock_ok_responses(number_of_requests - 2), 97 | ] 98 | # mock unsuccessful responses to the first call 99 | self.mock_unsuccessful_responses(responses_to_calls[0][0:3]) 100 | # mock unsuccessful responses to the second call 101 | self.mock_unsuccessful_responses(responses_to_calls[1][1:3]) 102 | # mock unsuccessful response to the third call 103 | self.mock_unsuccessful_response(responses_to_calls[2][1]) 104 | get_mock = self.mock_request_futures(chain.from_iterable(responses_to_calls)) 105 | 106 | actual_responses = MultiRequest(max_retry=3).multi_get('example.com', query_params) 107 | 108 | T.assert_equal(get_mock.call_count, 9) 109 | T.assert_is(actual_responses[2], None) 110 | 111 | def test_multi_get_response_to_json(self): 112 | """Tests the exception handling in the cases when the response was supposed to return JSON but did not.""" 113 | number_of_requests = 5 114 | query_params = [{'Andrew Henry': 'Domhnall Gleeson'}] * number_of_requests 115 | responses = self.mock_ok_responses(number_of_requests) 116 | 
self.mock_json_convertion_error(responses[3]) 117 | self.mock_request_futures(responses) 118 | logging.warning = MagicMock() 119 | 120 | actual_responses = MultiRequest().multi_get('example.com', query_params) 121 | 122 | T.assert_equals(5, len(actual_responses)) 123 | T.assert_is(actual_responses[3], None) 124 | logging.warning.called_once_with( 125 | 'Expected response in JSON format from example.com/movie/TheRevenant' 126 | ' but the actual response text is: This is not JSON', 127 | ) 128 | 129 | def test_multi_get_retry_only_unsuccessful_requests(self): 130 | """Tests whether only the unsuccessful requests are passed to the consequitive request calls. 131 | 3 unsuccessful responses to the first request batch and then 2 unsuccessful responses to the second. 132 | The third (and the last) returns successful responses only. 133 | """ 134 | responses_to_calls = [ 135 | self.mock_ok_responses(10), 136 | self.mock_ok_responses(3), 137 | self.mock_ok_responses(2), 138 | ] 139 | # mock unsuccessful responses to the first call 140 | unsuccessful_responses_first_call = [ 141 | responses_to_calls[0][2], 142 | responses_to_calls[0][3], 143 | responses_to_calls[0][5], 144 | ] 145 | self.mock_unsuccessful_responses(unsuccessful_responses_first_call) 146 | # mock unsuccessful responses to the second call 147 | unsuccessful_responses_second_call = [ 148 | responses_to_calls[1][0], 149 | responses_to_calls[1][2], 150 | ] 151 | self.mock_unsuccessful_responses(unsuccessful_responses_second_call) 152 | mock_get = self.mock_request_futures(chain.from_iterable(responses_to_calls)) 153 | 154 | query_params = [ 155 | {'Max Rockatansky': 'Tom Hardy'}, 156 | {'Imperator Furiosa': 'Charlize Theron'}, 157 | {'Nux': 'Nicholas Hoult'}, 158 | {'Immortan Joe': 'Hugh Keays-Byrne'}, 159 | {'Slit': 'Josh Helman'}, 160 | {'Rictus Erectus': 'Nathan Jones'}, 161 | {'Toast the Knowing': 'Zoë Kravitz'}, 162 | {'The Splendid Angharad': 'Rosie Huntington-Whiteley'}, 163 | {'Capable': 'Riley Keough'}, 
164 | {'The Dag': 'Abbey Lee'}, 165 | ] 166 | 167 | MultiRequest().multi_get('example.com', query_params) 168 | T.assert_equal(mock_get.call_count, 15) # 10 + 3 + 2 169 | call_params = [kwargs['params'] for args, kwargs in mock_get.call_args_list] 170 | # Assert retries 171 | call_params_keys = [list(cp.keys())[0] for cp in call_params] 172 | T.assert_equal(call_params_keys.count('Nux'), 3) 173 | T.assert_equal(call_params_keys.count('Immortan Joe'), 2) 174 | T.assert_equal(call_params_keys.count('Rictus Erectus'), 3) 175 | 176 | def test_multi_get_drop_404s(self): 177 | responses_to_calls = self.mock_ok_responses(3) 178 | self.mock_not_found_response(responses_to_calls[1]) 179 | query_params = [{'Hugh Glass': 'Leonardo DiCaprio'}] * 3 180 | get_mock = self.mock_request_futures(responses_to_calls) 181 | result = MultiRequest(drop_404s=True).multi_get('example.org', query_params) 182 | T.assert_equal(get_mock.call_count, 3) 183 | T.assert_is(result[1], None) 184 | -------------------------------------------------------------------------------- /tests/virustotal_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | import testify as T 4 | from mock import patch 5 | from mock import ANY 6 | 7 | from threat_intel.virustotal import VirusTotalApi 8 | 9 | 10 | class VirusTotalApiTest(T.TestCase): 11 | 12 | """Tests requesting reports from VirusTotalApi.""" 13 | 14 | @T.setup 15 | def setup_vt(self): 16 | self.vt = VirusTotalApi('test_key') 17 | 18 | def _test_api_call(self, call, endpoint, request, expected_query_params, api_response, expected_result): 19 | """ 20 | Tests a VirusTotalApi call by mocking out the HTTP request. 21 | 22 | Args: 23 | call: function in VirusTotalApi to call. 
24 | endpoint: endpoint of VirusTotal API that is hit (appended to base url) 25 | request: call arguments 26 | expected_query_params: query parameters that should be passed to API 27 | api_response: the expected response by the API 28 | expected_result: what call should return (given the api response provided) 29 | """ 30 | with patch.object(self.vt, '_requests') as request_mock: 31 | request_mock.multi_get.return_value = api_response 32 | result = call(request) 33 | param_list = [self.vt.BASE_DOMAIN + endpoint.format(param) for param in expected_query_params] 34 | request_mock.multi_get.assert_called_with(param_list, file_download=ANY) 35 | T.assert_equal(result, expected_result) 36 | 37 | def test_get_file_reports(self): 38 | self._test_api_call(call=self.vt.get_file_reports, 39 | endpoint='files/{}', 40 | request=['file1', 'file2'], 41 | expected_query_params=['file1', 'file2'], 42 | api_response=[{'data':{'id': 'file1'}}, {'data':{'id': 'file2'}}], 43 | expected_result={'file1': {'data': {'id': 'file1'}}, 44 | 'file2': {'data': {'id': 'file2'}}}) 45 | 46 | def test_get_file_behaviour(self): 47 | self._test_api_call(call=self.vt.get_file_behaviour, 48 | endpoint='files/{}/behaviours', 49 | request=['file1', 'file2'], 50 | expected_query_params=['file1', 'file2'], 51 | api_response=[{'data':{'id': 'file1'}}, {'data':{'id': 'file2'}}], 52 | expected_result={'file1': {'data': {'id': 'file1'}}, 53 | 'file2': {'data': {'id': 'file2'}}}) 54 | 55 | def test_get_file_download(self): 56 | self._test_api_call(call=self.vt.get_file_download, 57 | endpoint='files/{}/download', 58 | request=['file1', 'file2'], 59 | expected_query_params=['file1', 'file2'], 60 | api_response=[{'data':{'id': 'file1'}}, {'data':{'id': 'file2'}}], 61 | expected_result={'file1': {'data': {'id': 'file1'}}, 62 | 'file2': {'data': {'id': 'file2'}}}) 63 | 64 | def test_get_domain_reports(self): 65 | self._test_api_call(call=self.vt.get_domain_reports, 66 | endpoint='domains/{}', 67 | 
request=['domain1', 'domain2'], 68 | expected_query_params=['domain1', 'domain2'], 69 | api_response=[{}, {}], 70 | expected_result={'domain1': {}, 71 | 'domain2': {}}) 72 | 73 | def test_get_url_reports(self): 74 | self._test_api_call(call=self.vt.get_url_reports, 75 | endpoint='urls/{}', 76 | request=['url1', 'url2'], 77 | expected_query_params = ['url1', 'url2'], 78 | api_response=[{'data':{'id': 'url1'}}, {'data':{'id': 'url2'}}], 79 | expected_result={'url1': {'data': {'id': 'url1'}}, 80 | 'url2': {'data': {'id': 'url2'}}}) 81 | 82 | def test_get_ip_reports(self): 83 | self._test_api_call(call=self.vt.get_ip_reports, 84 | endpoint='ip_addresses/{}', 85 | request=['ip1', 'ip2'], 86 | expected_query_params=['ip1', 'ip2'], 87 | api_response=[{}, {}], 88 | expected_result={'ip1': {}, 89 | 'ip2': {}}) 90 | 91 | def test_get_file_contacted_domains(self): 92 | self._test_api_call(call=self.vt.get_file_contacted_domains, 93 | endpoint='files/{}/contacted_domains', 94 | request=['domain1', 'domain2'], 95 | expected_query_params=['domain1', 'domain2'], 96 | api_response=[{'data':{'id': 'domain1'}}, {'data':{'id': 'domain2'}}], 97 | expected_result={'domain1': {'data': {'id': 'domain1'}}, 98 | 'domain2': {'data': {'id': 'domain2'}}}) 99 | 100 | def test_get_file_contacted_ips(self): 101 | self._test_api_call(call=self.vt.get_file_contacted_ips, 102 | endpoint='files/{}/contacted_ips', 103 | request=['file1', 'file2'], 104 | expected_query_params=['file1', 'file2'], 105 | api_response=[{'data':{'id': 'file1'}}, {'data':{'id': 'file2'}}], 106 | expected_result={'file1': {'data': {'id': 'file1'}}, 107 | 'file2': {'data': {'id': 'file2'}}}) 108 | 109 | def test_get_file_contacted_urls(self): 110 | self._test_api_call(call=self.vt.get_file_contacted_urls, 111 | endpoint='files/{}/contacted_urls', 112 | request=['file1', 'file2'], 113 | expected_query_params=['file1', 'file2'], 114 | api_response=[{'data':{'id': 'file1'}}, {'data':{'id': 'file2'}}], 115 | 
expected_result={'file1': {'data': {'id': 'file1'}}, 116 | 'file2': {'data': {'id': 'file2'}}}) 117 | 118 | def test_get_file_itw_urls(self): 119 | self._test_api_call(call=self.vt.get_file_itw_urls, 120 | endpoint='files/{}/itw_urls', 121 | request=['file1', 'file2'], 122 | expected_query_params=['file1', 'file2'], 123 | api_response=[{'data':{'id': 'file1'}}, {'data':{'id': 'file2'}}], 124 | expected_result={'file1': {'data': {'id': 'file1'}}, 125 | 'file2': {'data': {'id': 'file2'}}}) 126 | 127 | def test_get_domain_communicating_files(self): 128 | self._test_api_call(call=self.vt.get_domain_communicating_files, 129 | endpoint='domains/{}/communicating_files', 130 | request=['domain1', 'domain2'], 131 | expected_query_params=['domain1', 'domain2'], 132 | api_response=[{'data':{'id': 'domain1'}}, {'data':{'id': 'domain2'}}], 133 | expected_result={'domain1': {'data': {'id': 'domain1'}}, 134 | 'domain2': {'data': {'id': 'domain2'}}}) 135 | 136 | def test_get_domain_referrer_files(self): 137 | self._test_api_call(call=self.vt.get_domain_referrer_files, 138 | endpoint='domains/{}/referrer_files', 139 | request=['domain1', 'domain2'], 140 | expected_query_params=['domain1', 'domain2'], 141 | api_response=[{'data':{'id': 'domain1'}}, {'data':{'id': 'domain2'}}], 142 | expected_result={'domain1': {'data': {'id': 'domain1'}}, 143 | 'domain2': {'data': {'id': 'domain2'}}}) 144 | def test_get_domain_reports(self): 145 | self._test_api_call(call=self.vt.get_domain_reports, 146 | endpoint='domains/{}', 147 | request=['domain1', 'domain2'], 148 | expected_query_params=['domain1', 'domain2'], 149 | api_response=[{}, {}], 150 | expected_result={'domain1': {}, 151 | 'domain2': {}}) 152 | 153 | def test_get_file_clusters(self): 154 | self._test_api_call(call=self.vt.get_file_clusters, 155 | endpoint='feeds/file-behaviours/{}', 156 | request=['time1', 'time2'], 157 | expected_query_params=['time1', 'time2'], 158 | api_response=[{'data':{'id': 'time1'}}, {'data':{'id': 'time2'}}], 
159 | expected_result={'time1': {'data': {'id': 'time1'}}, 160 | 'time2': {'data': {'id': 'time2'}}}) 161 | -------------------------------------------------------------------------------- /threat_intel/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | Supported threat intelligence feeds. 5 | 6 | The package contains API wrappers for: 7 | 8 | * OpenDNS Investigate API 9 | * VirusTotal API v2.0 10 | * ShadowServer API 11 | 12 | 13 | OpenDNS Investigate API 14 | ======================= 15 | 16 | OpenDNS Investigate provides an API that allows querying for: 17 | 18 | * Domain categorization 19 | * Security information about a domain 20 | * Co-occurrences for a domain 21 | * Related domains for a domain 22 | * Domains related to an IP 23 | * Domain tagging dates for a domain 24 | * DNS RR history for a domain 25 | * WHOIS information 26 | - WHOIS information for an email 27 | - WHOIS information for a nameserver 28 | - Historical WHOIS information for a domain 29 | * Latest malicious domains for an IP 30 | 31 | To use the Investigate API wrapper import InvestigateApi class from threat_intel.opendns module: 32 | 33 | >>> from threat_intel import InvestigateApi 34 | 35 | To initialize the API wrapper you need the API key: 36 | 37 | >>> investigate = InvestigateApi("") 38 | 39 | You can also specify a file name where the API responses will be cached in a JSON file, to save you the bandwidth for the multiple calls 40 | about the same domains or IPs: 41 | 42 | >>> investigate = InvestigateApi("", cache_file_name="/tmp/cache.opendns.json") 43 | 44 | 45 | Domain categorization 46 | --------------------- 47 | Calls domains/categorization/?showLabels Investigate API endpoint. It takes a list (or any other Python enumerable) of domains and returns 48 | the categories associated with this domains by OpenDNS. 
49 | 50 | >>> domains = ["google.com", "baidu.com", "bibikun.ru"] 51 | >>> investigate.categorization(domains) 52 | 53 | will result in: 54 | { 55 | "baidu.com": {"status": 1, "content_categories": ["Search Engines"], "security_categories": []}, 56 | "google.com": {"status": 1, "content_categories": ["Search Engines"], "security_categories": []}, 57 | "bibikun.ru": {"status": -1, "content_categories": [], "security_categories": ["Malware"]} 58 | } 59 | 60 | 61 | Security information about a domain 62 | ----------------------------------- 63 | Calls security/name/ Investigate API endpoint. It takes any Python enumerable with domains, e.g. list, and returns security parameters 64 | associated with each domain. 65 | 66 | >>> domains = ["google.com", "baidu.com", "bibikun.ru"] 67 | >>> investigate.security(domains) 68 | 69 | will result in: 70 | 71 | { 72 | "baidu.com": { 73 | "found": true, 74 | "handlings": { 75 | "domaintagging": 0.00032008666962131285, 76 | "blocked": 0.00018876906157154347, 77 | "whitelisted": 0.00019697641207465407, 78 | "expired": 2.462205150933176e-05, 79 | "normal": 0.9992695458052232 80 | }, 81 | "dga_score": 0, 82 | "rip_score": 0, 83 | .. 84 | } 85 | } 86 | 87 | 88 | Co-ooccurrences of domain 89 | -------------------------- 90 | Calls recommendations/name/ Investigate API endpoint. Use this method to find out related domains to the one given in a list, or any other 91 | Python enumerable. 92 | 93 | >>> domains = ["google.com", "baidu.com", "bibikun.ru"] 94 | >>> investigate.cooccurrences(domains) 95 | 96 | will result in: 97 | 98 | { 99 | "baidu.com": { 100 | "found": true, 101 | "pfs2": [ 102 | ["www.howtoforge.de", 0.14108563836506008], 103 | .. 104 | } 105 | 106 | 107 | Related domains for a domain 108 | ---------------------------- 109 | 110 | Calls links/name/ Investigate API endpoint. 
Use this method to find out a list of related domains (domains that have been frequently seen 111 | requested around a time window of 60 seconds, but that are not associated with the given domain) to the one given in a list, or any other 112 | Python enumerable. 113 | 114 | >>> domains = ["google.com", "baidu.com", "bibikun.ru"] 115 | >>> investigate.related_domains(domains) 116 | 117 | will result in: 118 | 119 | { 120 | "tb1": [ 121 | ["t.co", 11.0], 122 | ] 123 | 124 | .. 125 | 126 | } 127 | 128 | 129 | Domain tagging dates for a domain 130 | --------------------------------- 131 | 132 | Calls domains/name/ Investigate API endpoint. 133 | 134 | Use this method to get the date range when the domain being queried was a part of the OpenDNS block list and how long a domain has been in 135 | this list 136 | 137 | >>> domains = ["google.com", "baidu.com", "bibikun.ru"] 138 | >>> investigate.domain_tag(domains) 139 | 140 | will result in: 141 | 142 | { 143 | 'category': u'Malware', 144 | 'url': None, 145 | 'period': { 146 | 'begin': u'2013-09-16', 147 | 'end': u'Current' 148 | } 149 | 150 | .. 151 | 152 | } 153 | 154 | 155 | 156 | DNS RR history for an IP 157 | ------------------------ 158 | Calls dnsdb/ip/a/ Investigate API endpoint. Use this method to find out related domains to the IP addresses given in a list, or any other 159 | Python enumerable. 160 | 161 | >>> ips = ['8.8.8.8'] 162 | >>> investigate.rr_history(ips) 163 | 164 | will result in: 165 | 166 | { 167 | "8.8.8.8": { 168 | "rrs": [ 169 | { 170 | "name": "8.8.8.8", 171 | "type": "A", 172 | "class": "IN", 173 | "rr": "000189.com.", 174 | "ttl": 3600 175 | }, 176 | { 177 | "name": "8.8.8.8", 178 | "type": "A", 179 | "class": "IN", 180 | "rr": "008.no-ip.net.", 181 | "ttl": 60 182 | }, 183 | .. 
184 | } 185 | 186 | WHOIS information for a domain 187 | ------------------------------ 188 | 189 | WHOIS information for an email 190 | ------------------------------ 191 | 192 | Calls `whois/emails/{email}` Investigate API endpoint. 193 | 194 | Use this method to see WHOIS information for the email address. (For now the OpenDNS API will only return at most 500 results) 195 | 196 | >>> emails = ["dns-admin@google.com"] 197 | >>> investigate.whois_emails(emails) 198 | 199 | will result in: 200 | 201 | { 202 | "dns-admin@google.com": { 203 | "totalResults": 500, 204 | "moreDataAvailable": true, 205 | "limit": 500, 206 | "domains": [ 207 | { 208 | "domain": "0emm.com", 209 | "current": true 210 | }, 211 | .. 212 | ] 213 | } 214 | } 215 | 216 | WHOIS information for a nameserver 217 | ---------------------------------- 218 | 219 | Calls `whois/nameservers/{nameserver}` Investigate API endpoint. 220 | 221 | Use this method to see WHOIS information for the nameserver. (For now the OpenDNS API will only return at most 500 results) 222 | 223 | >>> nameservers = ["ns2.google.com"] 224 | >>> investigate.whois_nameservers(nameservers) 225 | 226 | will result in: 227 | 228 | { 229 | "ns2.google.com": { 230 | "totalResults": 500, 231 | "moreDataAvailable": true, 232 | "limit": 500, 233 | "domains": [ 234 | { 235 | "domain": "46645.biz", 236 | "current": true 237 | }, 238 | .. 239 | ] 240 | } 241 | } 242 | 243 | WHOIS information for a domain 244 | ------------------------------ 245 | 246 | Calls `whois/{domain}` Investigate API endpoint. 247 | 248 | Use this method to see WHOIS information for the domain. 249 | 250 | >>> domains = ["google.com"] 251 | >>> investigate.whois_domains(domains) 252 | 253 | will result in: 254 | 255 | { 256 | "administrativeContactFax": null, 257 | "whoisServers": null, 258 | "addresses": [ 259 | "1600 amphitheatre parkway", 260 | "please contact contact-admin@google.com, 1600 amphitheatre parkway", 261 | "2400 e. bayshore pkwy" 262 | ], 263 | .. 
264 | } 265 | 266 | Historical WHOIS information for a domain 267 | ----------------------------------------- 268 | 269 | Calls `whois/{domain}/history` Investigate API endpoint. 270 | 271 | Use this method to see historical WHOIS information for the domain. 272 | 273 | >>> domains = ["5esb.biz"] 274 | >>> investigate.whois_domains_history(domains) 275 | 276 | will result in: 277 | 278 | { 279 | '5esb.biz':[ 280 | { 281 | u'registrantFaxExt':u'', 282 | u'administrativeContactPostalCode':u'656448', 283 | u'zoneContactCity':u'', 284 | u'addresses':[ 285 | u'nan qu hua yuan xiao he' 286 | ], 287 | .. 288 | }, 289 | .. 290 | ] 291 | } 292 | 293 | Latest malicious domains for an IP 294 | ---------------------------------- 295 | 296 | Calls `ips/{ip}/latest_domains` Investigate API endpoint. 297 | 298 | Use this method to see whether the IP address has any malicious domains associated with it. 299 | 300 | >>> ips = ["8.8.8.8"] 301 | >>> investigate.latest_malicious(ips) 302 | 303 | will result in: 304 | 305 | { 306 | [ 307 | '7ltd.biz', 308 | 'co0s.ru', 309 | 't0link.in', 310 | ] 311 | 312 | .. 313 | } 314 | 315 | 316 | VirusTotal API 317 | ============== 318 | 319 | VirusTotal provides an API that makes it possible to query for the reports about: 320 | 321 | * Domains 322 | * URLs 323 | * IPs 324 | * File hashes 325 | * File Upload 326 | * Live Feed 327 | * Advanced search 328 | 329 | To use the VirusTotal API wrapper import VirusTotalApi class from threat_intel.virustotal module: 330 | 331 | >>> from threat_intel import VirusTotalApi 332 | 333 | To initialize the API wrapper you need the API key: 334 | 335 | >>> vt = VirusTotalApi("") 336 | 337 | VirusTotal API calls allow to squeeze a list of file hashes or URLs into a single HTTP call. Depending on the API version you are using 338 | (public or private) you may need to tune the maximum number of the resources (file hashes or URLs) that could be passed in a single API 339 | call. 
You can do it with the resources_per_req parameter: 340 | 341 | >>> vt = VirusTotalApi("", resources_per_req=4) 342 | 343 | When using the public API your standard request rate allows you too put maximum 4 resources per request. With private API you are able to 344 | put up to 25 resources per call. That is also the default value if you don't pass the resources_per_req parameter. 345 | 346 | Of course when calling the API wrapper methods in the VirusTotalApi class you can pass as many resources as you want and the wrapper will 347 | take care of producing as many API calls as necessary to satisfy the request rate. 348 | 349 | Similarly to OpenDNS API wrapper, you can also specify the file name where the responses will be cached: 350 | 351 | >>> vt = VirusTotalApi("", cache_file_name="/tmp/cache.virustotal.json") 352 | 353 | 354 | #### Domain reports 355 | 356 | Calls domain/report VirusTotal API endpoint. 357 | Pass a list or any other Python enumerable containing the domains: 358 | 359 | >>> domains = ["google.com", "baidu.com", "bibikun.ru"] 360 | >>> vt.get_domain_reports(domains) 361 | 362 | will result in: 363 | 364 | { 365 | "baidu.com": { 366 | "undetected_referrer_samples": [ 367 | { 368 | "positives": 0, 369 | "total": 56, 370 | "sha256": "e3c1aea1352362e4b5c008e16b03810192d12a4f1cc71245f5a75e796c719c69" 371 | } 372 | ], 373 | .. 374 | } 375 | 376 | 377 | #### URL report endpoint 378 | 379 | Calls 'url/report' VirusTotal API endpoint. 
380 | Pass a list or any other Python enumerable containing the URL addresses: 381 | 382 | >>> urls = ["http://www.google.com", "http://www.yelp.com"] 383 | >>> vt.get_url_reports(urls) 384 | 385 | will result in: 386 | 387 | { 388 | "http://www.google.com": { 389 | "permalink": "https://www.virustotal.com/url/dd014af5ed6b38d9130e3f466f850e46d21b951199d53a18ef29ee9341614eaf/analysis/1423344006/", 390 | "resource": "http://www.google.com", 391 | "url": "http://www.google.com/", 392 | "response_code": 1, 393 | "scan_date": "2015-02-07 21:20:06", 394 | "scan_id": "dd014af5ed6b38d9130e3f466f850e46d21b951199d53a18ef29ee9341614eaf-1423344006", 395 | "verbose_msg": "Scan finished, scan information embedded in this object", 396 | "filescan_id": null, 397 | "positives": 0, 398 | "total": 62, 399 | "scans": { 400 | "CLEAN MX": { 401 | "detected": false, 402 | "result": "clean site" 403 | }, 404 | .. 405 | } 406 | 407 | 408 | #### URL scan endpoint 409 | 410 | Calls url/scan VirusTotal API endpoint. 411 | Submit a url or any other Python enumerable containing the URL addresses: 412 | 413 | >>> urls = ["http://www.google.com", "http://www.yelp.com"] 414 | >>> vt.get_url_reports(urls) 415 | 416 | 417 | #### Hash report endpoint 418 | 419 | Calls file/report VirusTotal API endpoint. 
420 | You can request the file reports passing a list of hashes (md5, sha1 or sha2): 421 | 422 | >>> file_hashes = [ 423 | "99017f6eebbac24f351415dd410d522d", 424 | "88817f6eebbac24f351415dd410d522d" 425 | ] 426 | 427 | >>> vt.get_file_reports(file_hashes) 428 | 429 | will result in: 430 | 431 | { 432 | "88817f6eebbac24f351415dd410d522d": { 433 | "response_code": 0, 434 | "resource": "88817f6eebbac24f351415dd410d522d", 435 | "verbose_msg": "The requested resource is not among the finished, queued or pending scans" 436 | }, 437 | "99017f6eebbac24f351415dd410d522d": { 438 | "scan_id": "52d3df0ed60c46f336c131bf2ca454f73bafdc4b04dfa2aea80746f5ba9e6d1c-1423261860", 439 | "sha1": "4d1740485713a2ab3a4f5822a01f645fe8387f92", 440 | } 441 | 442 | 443 | #### Hash rescan endpoint 444 | 445 | Calls `file/rescan` VirusTotal API endpoint. Use to rescan a previously submitted file. 446 | You can request the file reports passing a list of hashes (md5, sha1 or sha2): 447 | 448 | 449 | #### Hash behaviour endpoint 450 | 451 | Calls `file/behaviour` VirusTotal API endpoint. Use to get a report about the behaviour of the file when executed in a sandboxed 452 | environment (Cuckoo sandbox). You can request the file reports passing a list of hashes (md5, sha1 or sha2): 453 | 454 | 455 | >>> vt.get_file_behaviour(file_hashes) 456 | 457 | 458 | #### Hash network-traffic endpoint 459 | 460 | Calls `file/network-traffic` VirusTotal API endpoint. Use to get the dump of the network traffic generated by the file when executed. 461 | You can request the file reports passing a list of hashes (md5, sha1 or sha2): 462 | 463 | >>> vt.get_file_network_traffic(file_hashes) 464 | 465 | 466 | #### Hash download endpoint 467 | 468 | Calls `file/download` VirusTotal API endpoint. Use to download a file by its hash. 
469 | You can request the file reports passing a list of hashes (md5, sha1 or sha2): 470 | 471 | >>> vt.get_file_download(file_hashes) 472 | 473 | 474 | #### URL live feed endpoint 475 | 476 | Calls `url/distribution` VirusTotal API endpoint. Use to get a live a feed with the latest URLs submitted to VirusTotal. 477 | 478 | >>> vt.get_url_distribution() 479 | 480 | 481 | #### Hash live feed endpoint 482 | 483 | Calls `file/distribution` VirusTotal API endpoint. Use to get a live a feed with the latest Hashes submitted to VirusTotal. 484 | 485 | >>> vt.get_file_distribution() 486 | 487 | 488 | #### Hash search endpoint 489 | 490 | Calls `file/search` VirusTotal API endpoint. Use to search for samples that match some binary/metadata/detection criteria. 491 | 492 | >>> vt.get_file_search() 493 | 494 | 495 | #### File date endpoint 496 | 497 | Calls `file/clusters` VirusTotal API endpoint. Use to list simililarity clusters for a given time frame. 498 | 499 | >>> vt.get_file_clusters() 500 | 501 | 502 | ShadowServer API 503 | ---------------- 504 | ShadowServer provides and API that allows to test the hashes against a list of known software applications. 
505 | 506 | To use the ShadowServer API wrapper import ShadowServerApi class from threat_intel.shadowserver module: 507 | 508 | >>> from threat_intel import ShadowServerApi 509 | 510 | To use the API wrapper simply call the ShadowServerApi initializer: 511 | 512 | >>> ss = ShadowServerApi() 513 | 514 | You can also specify the file name where the API responses will be cached: 515 | 516 | >>> ss = ShadowServerApi(cache_file_name="/tmp/cache.shadowserver.json") 517 | 518 | To check whether the hashes are on the ShadowServer list of known hashes, call get_bin_test method and pass enumerable with the hashes you 519 | want to test: 520 | 521 | >>> file_hashes = [ 522 | "99017f6eebbac24f351415dd410d522d", 523 | "88817f6eebbac24f351415dd410d522d" 524 | ] 525 | 526 | >>> ss.get_bin_test(file_hashes) 527 | 528 | """ 529 | from __future__ import absolute_import 530 | from .exceptions import InvalidRequestError 531 | 532 | from .opendns import InvestigateApi 533 | from .shadowserver import ShadowServerApi 534 | from .virustotal import VirusTotalApi 535 | 536 | __all__ = ['InvalidRequestError', 'InvestigateApi', 'ShadowServerApi', 'VirusTotalApi'] 537 | -------------------------------------------------------------------------------- /threat_intel/alexaranking.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # AlexaRankingsAPI makes calls to the Alexa Ranking API 4 | # 5 | from threat_intel.util.api_cache import ApiCache 6 | from threat_intel.util.http import MultiRequest 7 | import xml.etree.ElementTree as ET 8 | from xml.etree.ElementTree import ParseError 9 | 10 | 11 | class AlexaRankingApi(object): 12 | 13 | BASE_URL = u'https://data.alexa.com/data?cli=10' 14 | 15 | def __init__(self, resources_per_req=10, cache_file_name=None, 16 | update_cache=True, req_timeout=None): 17 | """Establishes basic HTTP params and loads a cache. 
18 | 19 | Args: 20 | resources_per_req: Maximum number of resources (hashes, URLs) 21 | to be send in a single request 22 | cache_file_name: String file name of cache. 23 | update_cache: Determines whether cache should be written out 24 | back to the disk when closing it. 25 | Default is `True`. 26 | req_timeout: Maximum number of seconds to wait without reading 27 | a response byte before deciding an error has occurred. 28 | Default is None. 29 | """ 30 | self._resources_per_req = resources_per_req 31 | self._requests = MultiRequest(req_timeout=req_timeout) 32 | 33 | # Create an ApiCache if instructed to 34 | self._cache = ApiCache(cache_file_name, 35 | update_cache) if cache_file_name else None 36 | 37 | @MultiRequest.error_handling 38 | def get_alexa_rankings(self, domains): 39 | """Retrieves the most recent VT info for a set of domains. 40 | 41 | Args: 42 | domains: list of string domains. 43 | Returns: 44 | A dict with the domain as key and the VT report as value. 45 | """ 46 | api_name = 'alexa_rankings' 47 | 48 | (all_responses, domains) = self._bulk_cache_lookup(api_name, domains) 49 | responses = self._request_reports(domains) 50 | 51 | for domain, response in zip(domains, responses): 52 | xml_response = self._extract_response_xml(domain, response) 53 | if self._cache: 54 | self._cache.cache_value(api_name, domain, response) 55 | all_responses[domain] = xml_response 56 | 57 | return all_responses 58 | 59 | def _request_reports(self, domains): 60 | """Sends multiples requests for the resources to a particular endpoint. 61 | 62 | Args: 63 | resource_param_name: a string name of the resource parameter. 64 | resources: list of of the resources. 65 | endpoint_name: AlexaRankingApi endpoint URL suffix. 66 | Returns: 67 | A list of the responses. 
68 | """ 69 | params = [{'url': domain} for domain in domains] 70 | responses = self._requests.multi_get( 71 | self.BASE_URL, query_params=params, to_json=False) 72 | return responses 73 | 74 | def _extract_response_xml(self, domain, response): 75 | """Extract XML content of an HTTP response into dictionary format. 76 | 77 | Args: 78 | response: HTML Response objects 79 | Returns: 80 | A dictionary: {alexa-ranking key : alexa-ranking value}. 81 | """ 82 | attributes = {} 83 | alexa_keys = {'POPULARITY': 'TEXT', 'REACH': 'RANK', 'RANK': 'DELTA'} 84 | try: 85 | xml_root = ET.fromstring(response._content) 86 | for xml_child in xml_root.findall('SD//'): 87 | if xml_child.tag in alexa_keys and \ 88 | alexa_keys[xml_child.tag] in xml_child.attrib: 89 | attributes[xml_child.tag.lower( 90 | )] = xml_child.attrib[alexa_keys[xml_child.tag]] 91 | except ParseError: 92 | # Skip ill-formatted XML and return no Alexa attributes 93 | pass 94 | attributes['domain'] = domain 95 | return {'attributes': attributes} 96 | 97 | def _bulk_cache_lookup(self, api_name, keys): 98 | """Performes a bulk cache lookup and returns a tuple with the results 99 | found and the keys missing in the cache. If cached is not configured 100 | it will return an empty dictionary of found results and the initial 101 | list of keys. 102 | 103 | Args: 104 | api_name: a string name of the API. 105 | keys: an enumerable of string keys. 106 | Returns: 107 | A tuple: (responses found, missing keys). 
108 | """ 109 | if self._cache: 110 | responses = self._cache.bulk_lookup(api_name, keys) 111 | missing_keys = [key for key in keys if key not in responses.keys()] 112 | return (responses, missing_keys) 113 | 114 | return ({}, keys) 115 | -------------------------------------------------------------------------------- /threat_intel/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # All exceptions thrown by the threat_intel module 4 | # 5 | 6 | 7 | class InvalidRequestError(Exception): 8 | 9 | """Raised by MultiRequest when it can't figure out how to make a request.""" 10 | pass 11 | -------------------------------------------------------------------------------- /threat_intel/opendns.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # InvestigateApi makes calls to the OpenDNS Investigate API. 4 | # 5 | from warnings import warn 6 | 7 | import simplejson 8 | from six.moves import range 9 | 10 | from threat_intel.util.api_cache import ApiCache 11 | from threat_intel.util.error_messages import write_error_message 12 | from threat_intel.util.error_messages import write_exception 13 | from threat_intel.util.http import MultiRequest 14 | 15 | 16 | def _cached_by_domain(api_name): 17 | """A caching wrapper for functions that take a list of domains as 18 | parameters. 19 | 20 | Raises: 21 | ResponseError - if the response received from the endpoint is 22 | not valid. 
23 | """ 24 | 25 | def wrapped(func): 26 | def decorated(self, domains): 27 | if not self._cache: 28 | return func(self, domains) 29 | 30 | all_responses = self._cache.bulk_lookup(api_name, domains) 31 | domains = list(set(domains) - set(all_responses)) 32 | 33 | if domains: 34 | response = func(self, domains) 35 | 36 | if not response: 37 | raise ResponseError("No response for uncached domains") 38 | 39 | for domain in response: 40 | self._cache.cache_value(api_name, domain, response[domain]) 41 | all_responses[domain] = response[domain] 42 | 43 | return all_responses 44 | return decorated 45 | return wrapped 46 | 47 | 48 | class InvestigateApi(object): 49 | 50 | """Calls the OpenDNS investigate API. 51 | 52 | Applies rate limits and issues parallel requests. 53 | """ 54 | 55 | BASE_URL = u'https://investigate.api.umbrella.com/' 56 | 57 | # TODO: consider moving this to a config file 58 | MAX_DOMAINS_IN_POST = 1000 59 | 60 | def __init__(self, api_key, cache_file_name=None, update_cache=True, req_timeout=None): 61 | auth_header = {'Authorization': 'Bearer {0}'.format(api_key)} 62 | self._requests = MultiRequest( 63 | default_headers=auth_header, max_requests=12, rate_limit=30, 64 | req_timeout=req_timeout, drop_404s=True, 65 | ) 66 | 67 | # Create an ApiCache if instructed to 68 | self._cache = ApiCache(cache_file_name, update_cache) if cache_file_name else None 69 | 70 | @classmethod 71 | def _to_url(cls, url_path): 72 | try: 73 | return u'{0}{1}'.format(cls.BASE_URL, url_path) 74 | except Exception as e: 75 | write_error_message(url_path) 76 | write_exception(e) 77 | raise e 78 | 79 | @classmethod 80 | def _to_urls(cls, fmt_url_path, url_path_args): 81 | url_paths = [] 82 | for path_arg in url_path_args: 83 | try: 84 | url_paths.append(fmt_url_path.format(path_arg)) 85 | except Exception as e: 86 | write_error_message(path_arg) 87 | write_exception(e) 88 | raise e 89 | 90 | return [cls._to_url(url_path) for url_path in url_paths] 91 | 92 | 
@MultiRequest.error_handling 93 | def _multi_post(self, url_path, domains): 94 | data = [simplejson.dumps(domains[pos:pos + self.MAX_DOMAINS_IN_POST]) for pos in range(0, len(domains), self.MAX_DOMAINS_IN_POST)] 95 | # multi_post() returns list of dictionaries, so they need to be merged into one dict 96 | all_responses = self._requests.multi_post(self._to_url(url_path), data=data) 97 | responses = {} 98 | for r in all_responses: 99 | responses.update(r) 100 | return responses 101 | 102 | @_cached_by_domain(api_name='opendns-categorization') 103 | def categorization(self, domains): 104 | """Calls categorization end point and adds an 'is_suspicious' key to each response. 105 | 106 | Args: 107 | domains: An enumerable of domains 108 | Returns: 109 | A dict of {domain: categorization_result} 110 | """ 111 | url_path = u'domains/categorization/?showLabels' 112 | return self._multi_post(url_path, domains) 113 | 114 | @_cached_by_domain(api_name='opendns-domain_score') 115 | def domain_score(self, domains): 116 | """Calls domain scores endpoint. 117 | 118 | This method is deprecated since OpenDNS Investigate API 119 | endpoint is also deprecated. 120 | """ 121 | warn( 122 | 'OpenDNS Domain Scores endpoint is deprecated. Use ' 123 | 'InvestigateApi.categorization() instead', DeprecationWarning, 124 | ) 125 | url_path = 'domains/score/' 126 | return self._multi_post(url_path, domains) 127 | 128 | @MultiRequest.error_handling 129 | def _multi_get(self, cache_api_name, fmt_url_path, url_params, query_params=None): 130 | """Makes multiple GETs to an OpenDNS endpoint. 
131 | 132 | Args: 133 | cache_api_name: string api_name for caching 134 | fmt_url_path: format string for building URL paths 135 | url_params: An enumerable of strings used in building URLs 136 | query_params - None / dict / list of dicts containing query params 137 | Returns: 138 | A dict of {url_param: api_result} 139 | """ 140 | all_responses = {} 141 | 142 | if self._cache: 143 | all_responses = self._cache.bulk_lookup(cache_api_name, url_params) 144 | url_params = [key for key in url_params if key not in all_responses.keys()] 145 | 146 | if len(url_params): 147 | urls = self._to_urls(fmt_url_path, url_params) 148 | responses = self._requests.multi_get(urls, query_params) 149 | for url_param, response in zip(url_params, responses): 150 | if self._cache: 151 | self._cache.cache_value(cache_api_name, url_param, response) 152 | all_responses[url_param] = response 153 | 154 | return all_responses 155 | 156 | def security(self, domains): 157 | """Calls security end point and adds an 'is_suspicious' key to each response. 
158 | 159 | Args: 160 | domains: An enumerable of strings 161 | Returns: 162 | A dict of {domain: security_result} 163 | """ 164 | api_name = 'opendns-security' 165 | fmt_url_path = u'security/name/{0}.json' 166 | return self._multi_get(api_name, fmt_url_path, domains) 167 | 168 | def whois_emails(self, emails): 169 | """Calls WHOIS Email end point 170 | 171 | Args: 172 | emails: An enumerable of string Emails 173 | Returns: 174 | A dict of {email: domain_result} 175 | """ 176 | api_name = 'opendns-whois-emails' 177 | fmt_url_path = u'whois/emails/{0}' 178 | return self._multi_get(api_name, fmt_url_path, emails) 179 | 180 | def whois_nameservers(self, nameservers): 181 | """Calls WHOIS Nameserver end point 182 | 183 | Args: 184 | emails: An enumerable of nameservers 185 | Returns: 186 | A dict of {nameserver: domain_result} 187 | """ 188 | api_name = 'opendns-whois-nameservers' 189 | fmt_url_path = u'whois/nameservers/{0}' 190 | return self._multi_get(api_name, fmt_url_path, nameservers) 191 | 192 | def whois_domains(self, domains): 193 | """Calls WHOIS domain end point 194 | 195 | Args: 196 | domains: An enumerable of domains 197 | Returns: 198 | A dict of {domain: domain_result} 199 | """ 200 | api_name = 'opendns-whois-domain' 201 | fmt_url_path = u'whois/{0}' 202 | return self._multi_get(api_name, fmt_url_path, domains) 203 | 204 | def whois_domains_history(self, domains): 205 | """Calls WHOIS domain history end point 206 | 207 | Args: 208 | domains: An enumerable of domains 209 | Returns: 210 | A dict of {domain: domain_history_result} 211 | """ 212 | api_name = 'opendns-whois-domain-history' 213 | fmt_url_path = u'whois/{0}/history' 214 | return self._multi_get(api_name, fmt_url_path, domains) 215 | 216 | def cooccurrences(self, domains): 217 | """Get the domains related to input domains. 
218 | 219 | Args: 220 | domains: an enumerable of strings domain names 221 | Returns: 222 | An enumerable of string domain names 223 | """ 224 | api_name = 'opendns-cooccurrences' 225 | fmt_url_path = u'recommendations/name/{0}.json' 226 | return self._multi_get(api_name, fmt_url_path, domains) 227 | 228 | def domain_tag(self, domains): 229 | """Get the data range when a domain is part of OpenDNS block list. 230 | 231 | Args: 232 | domains: an enumerable of strings domain names 233 | Returns: 234 | An enumerable of string with period, category, and url 235 | """ 236 | api_name = 'opendns-domain_tag' 237 | fmt_url_path = u'domains/{0}/latest_tags' 238 | return self._multi_get(api_name, fmt_url_path, domains) 239 | 240 | def related_domains(self, domains): 241 | """Get list of domain names that have been seen requested around the 242 | same time (up to 60 seconds before or after) to the given domain name. 243 | 244 | Args: 245 | domains: an enumerable of strings domain names 246 | Returns: 247 | An enumerable of [domain name, scores] 248 | """ 249 | api_name = 'opendns-related_domains' 250 | fmt_url_path = u'links/name/{0}.json' 251 | return self._multi_get(api_name, fmt_url_path, domains) 252 | 253 | def rr_history(self, ips): 254 | """Get the domains related to input ips. 255 | 256 | Args: 257 | ips: an enumerable of strings as ips 258 | Returns: 259 | An enumerable of resource records and features 260 | """ 261 | api_name = 'opendns-rr_history' 262 | fmt_url_path = u'dnsdb/ip/a/{0}.json' 263 | return self._multi_get(api_name, fmt_url_path, ips) 264 | 265 | def dns_rr(self, ips): 266 | """Get the domains related to input domains. 
267 | 268 | Args: 269 | domains: an enumerable of strings as domains 270 | Returns: 271 | An enumerable of resource records and features 272 | """ 273 | api_name = 'opendns-dns_rr' 274 | fmt_url_path = u'dnsdb/name/a/{0}.json' 275 | return self._multi_get(api_name, fmt_url_path, ips) 276 | 277 | def latest_malicious(self, ips): 278 | """Get the a list of malicious domains related to input ips. 279 | 280 | Args: 281 | ips: an enumerable of strings as ips 282 | Returns: 283 | An enumerable of strings for the malicious domains 284 | """ 285 | api_name = 'opendns-latest_malicious' 286 | fmt_url_path = u'ips/{0}/latest_domains' 287 | return self._multi_get(api_name, fmt_url_path, ips) 288 | 289 | def sample(self, hashes): 290 | """Get the information about a sample based on its hash. 291 | 292 | Args: 293 | hashes: an enumerable of strings as hashes 294 | Returns: 295 | An enumerable of arrays which contains the information 296 | about the original samples 297 | """ 298 | api_name = 'opendns-sample' 299 | fmt_url_path = u'sample/{0}' 300 | return self._multi_get(api_name, fmt_url_path, hashes) 301 | 302 | def search(self, patterns, start=30, limit=1000, include_category=False): 303 | """Performs pattern searches against the Investigate database. 
304 | 305 | Args: 306 | patterns: An enumerable of RegEx domain patterns to search for 307 | start: How far back results extend from in days (max is 30) 308 | limit: Number of results to show (max is 1000) 309 | include_category: Include OpenDNS security categories 310 | Returns: 311 | An enumerable of matching domain strings 312 | """ 313 | api_name = 'opendns-patterns' 314 | fmt_url_path = u'search/{0}' 315 | start = '-{0}days'.format(start) 316 | include_category = str(include_category).lower() 317 | query_params = { 318 | 'start': start, 319 | 'limit': limit, 320 | 'includecategory': include_category, 321 | } 322 | return self._multi_get(api_name, fmt_url_path, patterns, query_params) 323 | 324 | def risk_score(self, domains): 325 | """Performs Umbrella risk score analysis on the input domains 326 | 327 | Args: 328 | domains: an enumerable of domains 329 | Returns: 330 | An enumerable of associated domain risk scores 331 | """ 332 | api_name = 'opendns-risk_score' 333 | fmt_url_path = u'domains/risk-score/{0}' 334 | return self._multi_get(api_name, fmt_url_path, domains) 335 | 336 | 337 | class ResponseError(Exception): 338 | 339 | """Raised when the response received from the endpoint is not valid.""" 340 | -------------------------------------------------------------------------------- /threat_intel/shadowserver.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # ShadowServerApi makes calls to the ShadowServer APIs. 4 | # 5 | from six.moves import range 6 | import simplejson 7 | 8 | from threat_intel.util.api_cache import ApiCache 9 | from threat_intel.util.http import MultiRequest 10 | 11 | 12 | class ShadowServerApi(object): 13 | BINTEST_URL = u'http://bin-test.shadowserver.org/api' 14 | 15 | def __init__(self, cache_file_name=None, update_cache=True, req_timeout=90.0): 16 | """Establishes basic HTTP params and loads a cache. 
17 | 18 | Args: 19 | cache_file_name: String file name of cache. 20 | update_cache: Determines whether cache should be written out back to the disk when closing it. 21 | Default is `True`. 22 | req_timeout: Maximum number of seconds to wait without reading a response byte before deciding an error has occurred. 23 | Default is 90.0 seconds. 24 | """ 25 | 26 | # TODO - lookup request rate limit 27 | # By observation, ShadowServer can be quite slow, so give it 90 seconds before it times out. 28 | self._requests = MultiRequest(max_requests=2, req_timeout=req_timeout) 29 | 30 | # Create an ApiCache if instructed to 31 | self._cache = ApiCache(cache_file_name, update_cache) if cache_file_name else None 32 | 33 | @MultiRequest.error_handling 34 | def get_bin_test(self, hashes): 35 | """Test hashes against a list of known software applications. 36 | 37 | Known hashes will return a dictionary of information. 38 | Unknown hashes will return nothing. 39 | 40 | Args: 41 | hashes: list of string hashes. 42 | Returns: 43 | A dict with the hash as key and the shadowserver report as value. 44 | """ 45 | all_responses = {} 46 | 47 | if self._cache: 48 | api_name = 'shadowserver-bin-test' 49 | all_responses = self._cache.bulk_lookup(api_name, hashes) 50 | hashes = [key for key in hashes if key not in all_responses.keys()] 51 | all_responses = dict([(key, val) for key, val in all_responses.items() if len(val) >= 2]) 52 | 53 | HASHES_PER_REQ = 25 54 | hash_chunks = ['\n'.join(hashes[pos:pos + HASHES_PER_REQ]) for pos in range(0, len(hashes), HASHES_PER_REQ)] 55 | 56 | responses = self._requests.multi_post(self.BINTEST_URL, data=hash_chunks, to_json=False, send_as_file=True) 57 | for response in responses: 58 | if response is not None and 200 == response.status_code: 59 | response_lines = response.text.split('\n') 60 | for line in response_lines: 61 | # Set an initial val. 62 | val = {} 63 | 64 | # There is just a key, no value. This means the hash was unknown to ShadowServer. 
65 | index_of_first_space = line.find(' ') 66 | if -1 == index_of_first_space: 67 | index_of_first_space = len(line) 68 | key = line[:index_of_first_space].lower() 69 | 70 | # The response only has a JSON body if the hash was known. 71 | json_text = line[index_of_first_space + 1:] 72 | if len(json_text): 73 | try: 74 | val = simplejson.loads(json_text) 75 | # A very short response indicates an error? 76 | if len(val.keys()) >= 2: 77 | all_responses[key] = val 78 | 79 | except ValueError: 80 | # Sometimes ShadowServer returns invalid data. Silently skip it. 81 | pass 82 | 83 | if self._cache: 84 | self._cache.cache_value(api_name, key, val) 85 | 86 | return all_responses 87 | -------------------------------------------------------------------------------- /threat_intel/util/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __all__ = ['api_cache', 'error_messages', 'http'] 4 | -------------------------------------------------------------------------------- /threat_intel/util/api_cache.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # ApiCache creates an on disk cache of API call results 4 | # 5 | import simplejson 6 | from simplejson.scanner import JSONDecodeError 7 | 8 | 9 | class ApiCache(object): 10 | 11 | """creates an on disk cache of API call results.""" 12 | 13 | def __init__(self, cache_file_name, update_cache=True): 14 | """Opens the cache file and reads previous results. 15 | 16 | Args: 17 | cache_file_name: string file name 18 | update_cache: Specifies whether ApiCache should write out the 19 | cache file when closing it 20 | """ 21 | self._cache_file_name = cache_file_name 22 | self._cache = self._read_cache_from_file() 23 | self._update_cache = update_cache 24 | 25 | def __del__(self): 26 | """Ensures cache is persisted to disk before object is destroyed. 
27 | 28 | Using a destructor is a bit inflammatory but it seems like a very nice way to write a file when "everything is done". 29 | The ApiCache avoids circular dependencies so it should work out. 30 | """ 31 | self.close() 32 | 33 | def close(self): 34 | """Write the contents of the cache to disk (only if `update_cache` 35 | parameter during the object initialization was not set to `False`) and 36 | clear the in memory cache.""" 37 | if self._cache: 38 | if self._update_cache: 39 | self._write_cache_to_file() 40 | self._cache = None 41 | 42 | def _write_cache_to_file(self): 43 | """Write the contents of the cache to a file on disk.""" 44 | with(open(self._cache_file_name, 'w')) as fp: 45 | fp.write(simplejson.dumps(self._cache)) 46 | 47 | def _read_cache_from_file(self): 48 | """Read the contents of the cache from a file on disk.""" 49 | cache = {} 50 | try: 51 | with(open(self._cache_file_name, 'r')) as fp: 52 | contents = fp.read() 53 | cache = simplejson.loads(contents) 54 | except (IOError, JSONDecodeError): 55 | # The file could not be read. This is not a problem if the file does not exist. 56 | pass 57 | 58 | return cache 59 | 60 | def cache_value(self, api_name, key, value): 61 | """Add the value of an API call to the cache. 62 | 63 | Args: 64 | api_name: a string name of the API. Keys and values are segmented by api_name. 65 | key: a string key for the specific call. 66 | value: the value of the call using the specific key 67 | """ 68 | self._cache.setdefault(api_name, {}) 69 | self._cache[api_name][key] = value 70 | 71 | def lookup_value(self, api_name, key): 72 | """Add the value of an API call to the cache. 73 | 74 | Args: 75 | api_name: a string name of the API. Keys and values are segmented by api_name. 76 | key: a string key for the specific call. 77 | """ 78 | if api_name in self._cache: 79 | return self._cache[api_name].get(key, None) 80 | return None 81 | 82 | def bulk_lookup(self, api_name, keys): 83 | """Perform lookup on an enumerable of keys. 
def bulk_lookup(self, api_name, keys):
    """Perform lookup on an enumerable of keys.

    Args:
        api_name: a string name of the API. Keys and values are segmented by api_name.
        keys: an enumerable of string keys.
    Returns:
        A dict of key -> cached value for every key with a cache hit.
    """
    cached_data = {}
    for key in keys:
        value = self.lookup_value(api_name, key)
        if value is not None:
            cached_data[key] = value
    return cached_data


# -*- coding: utf-8 -*-
#
# A set of simple methods for writing messages to stderr
#
import sys
from traceback import extract_tb
from traceback import format_list


def write_exception(e):
    """Write an exception, with its traceback, to stderr in '[ERROR]' format."""
    exc_type, __, exc_traceback = sys.exc_info()
    sys.stderr.write('[ERROR] {0} {1}\n'.format(exc_type.__name__, str(e)))
    for trace_line in format_list(extract_tb(exc_traceback)):
        sys.stderr.write(trace_line)


def write_error_message(message):
    """Write a single '[ERROR]'-prefixed message to stderr."""
    for piece in ('[ERROR] ', message, '\n'):
        sys.stderr.write(piece)


# -*- coding: utf-8 -*-
# Utilities for dealing with HTTP requests
#
# RateLimiter helps to only make a certain number of calls per second.
# MultiRequest wraps requests-futures and issues multiple requests at once with an easy to use interface.
# SSLAdapter helps force use of the highest possible version of TLS.
#
import logging
import re
import ssl
import time
from base64 import urlsafe_b64encode
from collections import namedtuple
from collections import OrderedDict
from functools import partial

from requests.adapters import HTTPAdapter
from requests.exceptions import RequestException
from requests_futures.sessions import FuturesSession
from six.moves import range
from urllib3.util.retry import Retry

from threat_intel.exceptions import InvalidRequestError
from threat_intel.util.error_messages import write_error_message
from threat_intel.util.error_messages import write_exception


PreparedRequest = namedtuple('PreparedRequest', ('callable', 'url'))


class SSLAdapter(HTTPAdapter):

    """Attempt to use the highest possible TLS version for HTTPS connections.

    By explictly controlling which TLS version is used when connecting, avoid the client offering only SSLv2 or SSLv3.

    The best version specifier to pass is `ssl.PROTOCOL_TLS`, as this will choose the highest available protocol
    compatible with both client and server. For details see the documentation for `ssl.wrap_socket`
    (https://docs.python.org/2/library/ssl.html#socket-creation).

    To use this class, mount it to a `requests.Session` and then make HTTPS using the session object.

    .. code-block:: python
        # Mount an SSLAdapter in a Session
        session = requests.Session()
        session.mount('https://', SSLAdapter())

        # Make a requests call through the session
        session.get('https://api.github.com/events')

    """

    @staticmethod
    def _preferred_ssl_version():
        """Newest protocol selector available; SSLv23 on very old ssl modules
        that predate `ssl.PROTOCOL_TLS`."""
        try:
            return ssl.PROTOCOL_TLS
        except AttributeError:
            return ssl.PROTOCOL_SSLv23

    def init_poolmanager(self, connections, maxsize, block=False, **pool_kwargs):
        """Called to initialize the HTTPAdapter when no proxy is used."""
        pool_kwargs['ssl_version'] = self._preferred_ssl_version()
        return super(SSLAdapter, self).init_poolmanager(connections, maxsize, block, **pool_kwargs)

    def proxy_manager_for(self, proxy, **proxy_kwargs):
        """Called to initialize the HTTPAdapter when a proxy is used."""
        proxy_kwargs['ssl_version'] = self._preferred_ssl_version()
        return super(SSLAdapter, self).proxy_manager_for(proxy, **proxy_kwargs)
class RateLimiter(object):

    """Limits how many calls can be made per second"""

    CallRecord = namedtuple('CallRecord', ['time', 'num_calls'])

    def __init__(self, calls_per_sec):
        # Budget of calls allowed within any rolling one-second window.
        self._max_calls_per_second = calls_per_sec
        self._call_times = []
        self._outstanding_calls = 0

    def make_calls(self, num_calls=1):
        """Adds appropriate sleep to avoid making too many calls.

        Args:
            num_calls: int the number of calls which will be made
        """
        self._cull()
        while self._outstanding_calls + num_calls > self._max_calls_per_second:
            time.sleep(0)  # yield
            self._cull()

        self._call_times.append(self.CallRecord(time=time.time(), num_calls=num_calls))
        self._outstanding_calls += num_calls

    def _cull(self):
        """Remove calls more than 1 second old from the queue."""
        right_now = time.time()

        # Records are appended in time order, so stop at the first young one.
        cull_from = -1
        for index in range(len(self._call_times)):
            if right_now - self._call_times[index].time >= 1.0:
                cull_from = index
                self._outstanding_calls -= self._call_times[index].num_calls
            else:
                break

        if cull_from > -1:
            self._call_times = self._call_times[cull_from + 1:]


class AvailabilityLimiter(object):

    """Limits the total number of requests issued for a session."""

    def __init__(self, total_retries):
        """Wrapper object for managing total session retry limit.

        Args:
            total_retries: Total request attempts to be made per session.
                This is shared between all request objects.
        """
        self.total_retries = total_retries

    def map_with_retries(self, requests, responses_for_requests):
        """Issues the prepared requests and retries failures while the shared
        session retry budget lasts.

        Args:
            requests: A collection of PreparedRequest objects.
            responses_for_requests: Dictionary mapping of requests to responses.
        Raises:
            InvalidRequestError: when any request draws a 403 Forbidden response.
        """
        retries = []
        response_futures = [preq.callable() for preq in requests]

        for request, response_future in zip(requests, response_futures):
            try:
                response = response_future.result()
                if response is not None and response.status_code == 403:
                    logging.warning('Request to {} caused a 403 response status code.'.format(request.url))
                    raise InvalidRequestError('Access forbidden')
                if response is not None:
                    responses_for_requests[request] = response
            except RequestException as re:
                logging.error('An exception was raised for {}: {}'.format(request.url, re))
                if self.total_retries > 0:
                    self.total_retries -= 1
                    # Only re-queue while budget remains; otherwise the
                    # recursion below would never terminate.
                    retries.append(request)

        # Recursively retry failed requests with the modified total retry count
        if retries:
            self.map_with_retries(retries, responses_for_requests)
167 | """ 168 | 169 | _VERB_GET = 'GET' 170 | _VERB_POST = 'POST' 171 | 172 | def __init__( 173 | self, default_headers=None, max_requests=10, rate_limit=0, 174 | req_timeout=None, max_retry=10, total_retry=100, drop_404s=False, 175 | ): 176 | """Create the MultiRequest. 177 | 178 | Args: 179 | default_headers - A dict of headers which will be added to every request 180 | max_requests - Maximum number of requests to issue at once 181 | rate_limit - Maximum number of requests to issue per second 182 | req_timeout - Maximum number of seconds to wait without reading a response byte before deciding an error has occurred 183 | max_retry - The total number of attempts to retry a single batch of requests 184 | total_retry - The total number of request retries that can be made through the entire session 185 | Note there is a difference between `max_retry` and `total_retry`: 186 | - `max_retry` refers to how many times a batch of requests will be re-issued collectively 187 | - `total_retry` refers to a limit on the total number of outstanding requests made 188 | Once the latter is exhausted, no failed request within the whole session will be retried. 189 | """ 190 | self._default_headers = default_headers 191 | self._max_requests = max_requests 192 | self._req_timeout = req_timeout or 25.0 193 | self._max_retry = max_retry 194 | self._drop_404s = drop_404s 195 | self._rate_limiter = RateLimiter(rate_limit) if rate_limit else None 196 | self._availability_limiter = AvailabilityLimiter(total_retry) if total_retry else None 197 | self._session = FuturesSession(max_workers=max_requests) 198 | retries = Retry(total=0, status_forcelist=[500, 502, 503, 504], raise_on_status=True) 199 | self._session.mount( 200 | 'https://', SSLAdapter( 201 | max_retries=retries, pool_maxsize=max_requests, pool_connections=max_requests, 202 | ), 203 | ) 204 | 205 | def multi_get(self, urls, query_params=None, to_json=True, file_download=False): 206 | """Issue multiple GET requests. 
207 | 208 | Args: 209 | urls - A string URL or list of string URLs 210 | query_params - None, a dict, or a list of dicts representing the query params 211 | to_json - A boolean, should the responses be returned as JSON blobs 212 | file_download - A boolean, whether a file download is expected 213 | 214 | Returns: 215 | a list of dicts if to_json is set of requests.response otherwise. 216 | Raises: 217 | InvalidRequestError - Can not decide how many requests to issue. 218 | """ 219 | return self._multi_request( 220 | MultiRequest._VERB_GET, urls, query_params, 221 | data=None, to_json=to_json, file_download=file_download, 222 | ) 223 | 224 | def multi_post(self, urls, query_params=None, data=None, to_json=True, send_as_file=False): 225 | """Issue multiple POST requests. 226 | 227 | Args: 228 | urls - A string URL or list of string URLs 229 | query_params - None, a dict, or a list of dicts representing the query params 230 | data - None, a dict or string, or a list of dicts and strings representing the data body. 231 | to_json - A boolean, should the responses be returned as JSON blobs 232 | send_as_file - A boolean, should the data be sent as a file. 233 | Returns: 234 | a list of dicts if to_json is set of requests.response otherwise. 235 | Raises: 236 | InvalidRequestError - Can not decide how many requests to issue. 237 | """ 238 | return self._multi_request( 239 | MultiRequest._VERB_POST, urls, query_params, 240 | data, to_json=to_json, send_as_file=send_as_file, 241 | ) 242 | 243 | def _create_request(self, verb, url, query_params=None, data=None, send_as_file=False): 244 | """Helper method to create a single post/get requests. 245 | 246 | Args: 247 | verb - MultiRequest._VERB_POST or MultiRequest._VERB_GET 248 | url - A string URL 249 | query_params - None or a dict 250 | data - None or a string or a dict 251 | send_as_file - A boolean, should the data be sent as a file. 
252 | Returns: 253 | requests.PreparedRequest 254 | Raises: 255 | InvalidRequestError - if an invalid verb is passed in. 256 | """ 257 | 258 | # Prepare a set of kwargs to make it easier to avoid missing default params. 259 | kwargs = { 260 | 'headers': self._default_headers, 261 | 'params': query_params, 262 | 'timeout': self._req_timeout, 263 | } 264 | 265 | if MultiRequest._VERB_POST == verb: 266 | if send_as_file: 267 | kwargs['files'] = {'file': data} 268 | else: 269 | kwargs['data'] = data 270 | return PreparedRequest(partial(self._session.post, url, **kwargs), url) 271 | elif MultiRequest._VERB_GET == verb: 272 | return PreparedRequest(partial(self._session.get, url, **kwargs), url) 273 | else: 274 | raise InvalidRequestError('Invalid verb {0}'.format(verb)) 275 | 276 | def _zip_request_params(self, urls, query_params, data): 277 | """Massages inputs and returns a list of 3-tuples zipping them up. 278 | 279 | This is all the smarts behind deciding how many requests to issue. 280 | It's fine for an input to have 0, 1, or a list of values. 281 | If there are two inputs each with a list of values, the cardinality of those lists much match. 
282 | 283 | Args: 284 | urls - 1 string URL or a list of URLs 285 | query_params - None, 1 dict, or a list of dicts 286 | data - None, 1 dict or string, or a list of dicts or strings 287 | Returns: 288 | A list of 3-tuples (url, query_param, data) 289 | Raises: 290 | InvalidRequestError - if cardinality of lists does not match 291 | """ 292 | 293 | # Everybody gets to be a list 294 | if not isinstance(urls, list): 295 | urls = [urls] 296 | if not isinstance(query_params, list): 297 | query_params = [query_params] 298 | if not isinstance(data, list): 299 | data = [data] 300 | 301 | # Counts must not mismatch 302 | url_count = len(urls) 303 | query_param_count = len(query_params) 304 | data_count = len(data) 305 | 306 | max_count = max(url_count, query_param_count, data_count) 307 | 308 | if ( 309 | max_count > url_count > 1 310 | or max_count > query_param_count > 1 311 | or max_count > data_count > 1 312 | ): 313 | raise InvalidRequestError( 314 | 'Mismatched parameter count url_count:{0} query_param_count:{1} data_count:{2} max_count:{3}', 315 | url_count, query_param_count, data_count, max_count, 316 | ) 317 | 318 | # Pad out lists 319 | if url_count < max_count: 320 | urls = urls * max_count 321 | if query_param_count < max_count: 322 | query_params = query_params * max_count 323 | if data_count < max_count: 324 | data = data * max_count 325 | 326 | return list(zip(urls, query_params, data)) 327 | 328 | def _wait_for_response(self, requests): 329 | """Issues a batch of requests and waits for the responses. 330 | If some of the requests fail it will retry the failed ones up to `_max_retry` times. 
331 | 332 | Args: 333 | requests - A list of requests 334 | Returns: 335 | A list of `requests.models.Response` objects 336 | Raises: 337 | InvalidRequestError - if any of the requests returns "403 Forbidden" response 338 | """ 339 | failed_requests = [] 340 | responses_for_requests = OrderedDict.fromkeys(requests) 341 | 342 | for retry in range(self._max_retry): 343 | try: 344 | logging.debug('Try #{0}'.format(retry + 1)) 345 | self._availability_limiter.map_with_retries(requests, responses_for_requests) 346 | 347 | failed_requests = [] 348 | for request, response in responses_for_requests.items(): 349 | if self._drop_404s and response is not None and response.status_code == 404: 350 | logging.warning('Request to {0} failed with status code 404, dropping.'.format(request.url)) 351 | elif not response: 352 | failed_requests.append((request, response)) 353 | 354 | if not failed_requests: 355 | break 356 | 357 | logging.warning('Try #{0}. Expected {1} successful response(s) but only got {2}.'.format( 358 | retry + 1, len(requests), len(requests) - len(failed_requests), 359 | )) 360 | 361 | # retry only for the failed requests 362 | requests = [fr[0] for fr in failed_requests] 363 | except InvalidRequestError: 364 | raise 365 | except Exception as e: 366 | # log the exception for the informative purposes and pass to the next iteration 367 | logging.exception('Try #{0}. Exception occured: {1}. Retrying.'.format(retry + 1, e)) 368 | pass 369 | 370 | if failed_requests: 371 | logging.warning('Still {0} failed request(s) after {1} retries:'.format( 372 | len(failed_requests), self._max_retry, 373 | )) 374 | for failed_request, failed_response in failed_requests: 375 | if failed_response is not None: 376 | # in case response text does contain some non-ascii characters 377 | failed_response_text = failed_response.text.encode('ascii', 'xmlcharrefreplace') 378 | logging.warning('Request to {0} failed with status code {1}. 
Response text: {2}'.format( 379 | failed_request.url, failed_response.status_code, failed_response_text, 380 | )) 381 | else: 382 | logging.warning('Request to {0} failed with None response.'.format(failed_request.url)) 383 | 384 | return list(responses_for_requests.values()) 385 | 386 | def _handle_file_download(self, response): 387 | name = None 388 | data = None 389 | try: 390 | name = re.findall('filename=(.+)', response.headers['content-disposition'])[0] 391 | data = urlsafe_b64encode(response.text.encode('utf-8')).decode('utf-8') 392 | except Exception: 393 | logging.exception('Unable to extract download data for {} '.format(response.request.url)) 394 | return {'data': {'id': name, 'text': data}} 395 | 396 | def _convert_to_json(self, response): 397 | """Converts response to JSON. 398 | If the response cannot be converted to JSON then `None` is returned. 399 | 400 | Args: 401 | response - An object of type `requests.models.Response` 402 | Returns: 403 | Response in JSON format if the response can be converted to JSON. `None` otherwise. 404 | """ 405 | try: 406 | return response.json() 407 | except ValueError: 408 | logging.warning('Expected response in JSON format from {0} but the actual response text is: {1}'.format( 409 | response.request.url, response.text, 410 | )) 411 | return None 412 | 413 | def _multi_request(self, verb, urls, query_params, data, to_json=True, send_as_file=False, file_download=False): 414 | """Issues multiple batches of simultaneous HTTP requests and waits for responses. 415 | 416 | Args: 417 | verb - MultiRequest._VERB_POST or MultiRequest._VERB_GET 418 | urls - A string URL or list of string URLs 419 | query_params - None, a dict, or a list of dicts representing the query params 420 | data - None, a dict or string, or a list of dicts and strings representing the data body. 
421 | to_json - A boolean, should the responses be returned as JSON blobs 422 | Returns: 423 | If multiple requests are made - a list of dicts if to_json, a list of requests responses otherwise 424 | If a single request is made, the return is not a list 425 | Raises: 426 | InvalidRequestError - if no URL is supplied or if any of the requests returns 403 Access Forbidden response 427 | """ 428 | if not urls: 429 | raise InvalidRequestError('No URL supplied') 430 | 431 | # Break the params into batches of request_params 432 | request_params = self._zip_request_params(urls, query_params, data) 433 | batch_of_params = [ 434 | request_params[pos:pos + self._max_requests] 435 | for pos in range(0, len(request_params), self._max_requests) 436 | ] 437 | 438 | # Iteratively issue each batch, applying the rate limiter if necessary 439 | all_responses = [] 440 | for param_batch in batch_of_params: 441 | if self._rate_limiter: 442 | self._rate_limiter.make_calls(num_calls=len(param_batch)) 443 | 444 | prepared_requests = [ 445 | self._create_request( 446 | verb, url, query_params=query_param, data=datum, send_as_file=send_as_file, 447 | ) for url, query_param, datum in param_batch 448 | ] 449 | 450 | responses = self._wait_for_response(prepared_requests) 451 | for response in responses: 452 | if response and not file_download: 453 | all_responses.append(self._convert_to_json(response) if to_json else response) 454 | elif file_download: 455 | all_responses.append(self._handle_file_download(response)) 456 | else: 457 | all_responses.append(None) 458 | 459 | return all_responses 460 | 461 | def post_file(self, url, file, to_json=True): 462 | request = self._create_request(MultiRequest._VERB_POST, url) 463 | return request 464 | 465 | @classmethod 466 | def error_handling(cls, fn): 467 | """Decorator to handle errors""" 468 | def wrapper(*args, **kwargs): 469 | try: 470 | result = fn(*args, **kwargs) 471 | return result 472 | except InvalidRequestError as e: 473 | 
write_exception(e) 474 | 475 | if hasattr(e, 'request'): 476 | write_error_message('request {0}'.format(repr(e.request))) 477 | if hasattr(e, 'response'): 478 | write_error_message('response {0}'.format(repr(e.response))) 479 | 480 | raise e 481 | return wrapper 482 | -------------------------------------------------------------------------------- /threat_intel/virustotal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # VirusTotalApi makes calls to the VirusTotal API. 4 | # 5 | from six.moves import range 6 | 7 | from threat_intel.util.api_cache import ApiCache 8 | from threat_intel.util.http import MultiRequest 9 | 10 | 11 | class VirusTotalApi(object): 12 | BASE_DOMAIN = u'https://www.virustotal.com/api/v3/' 13 | 14 | def __init__(self, api_key, cache_file_name=None, update_cache=True, req_timeout=None): 15 | """Establishes basic HTTP params and loads a cache. 16 | 17 | Args: 18 | api_key: VirusTotal API key 19 | cache_file_name: String file name of cache. 20 | update_cache: Determines whether cache should be written out back to the disk when closing it. 21 | Default is `True`. 22 | req_timeout: Maximum number of seconds to wait without reading a response byte before deciding an error has occurred. 23 | Default is None. 24 | """ 25 | self._requests = MultiRequest(req_timeout=req_timeout, default_headers={'x-apikey': api_key}, drop_404s=True) 26 | 27 | # Create an ApiCache if instructed to 28 | self._cache = ApiCache(cache_file_name, update_cache) if cache_file_name else None 29 | 30 | @MultiRequest.error_handling 31 | def get_file_reports(self, file_hash_list): 32 | """Retrieves the most recent reports for a set of md5, sha1, and/or sha2 hashes. 33 | 34 | Args: 35 | file_hash_list: list of string hashes. 36 | Returns: 37 | A dict with the hash as key and the VT report as value. 
38 | """ 39 | api_name = 'virustotal-file-reports' 40 | api_endpoint = 'files/{}' 41 | 42 | all_responses, file_hash_list = self._bulk_cache_lookup(api_name, file_hash_list) 43 | response_chunks = self._request_reports(file_hash_list, api_endpoint) 44 | self._extract_response_chunks(all_responses, response_chunks, api_name) 45 | 46 | return all_responses 47 | 48 | @MultiRequest.error_handling 49 | def get_file_behaviour(self, file_hash_list): 50 | """Retrieves a report about the behaviour of a md5, sha1, and/or sha2 hash of 51 | a file when executed in a sandboxed environment (Cuckoo sandbox). 52 | 53 | Args: 54 | file_hash_list: list of string hashes. 55 | """ 56 | api_name = 'virustotal-file-behaviour' 57 | api_endpoint = 'files/{}/behaviours' 58 | 59 | all_responses, file_hash_list = self._bulk_cache_lookup(api_name, file_hash_list) 60 | response_chunks = self._request_reports(file_hash_list, api_endpoint) 61 | self._extract_response_chunks(all_responses, response_chunks, api_name) 62 | 63 | return all_responses 64 | 65 | @MultiRequest.error_handling 66 | def get_file_download(self, file_hash_list): 67 | """Retrieves a file from its a md5, sha1, and/or sha2 hash. 68 | 69 | Args: 70 | file_hash_list: list of string hashes. 71 | Returns: 72 | a base64encoded string of the file 73 | """ 74 | api_name = 'virustotal-file-download' 75 | api_endpoint = 'files/{}/download' 76 | return self._extract_all_responses(file_hash_list, api_endpoint, api_name, file_download=True) 77 | 78 | @MultiRequest.error_handling 79 | def get_file_contacted_domains(self, file_hash_list): 80 | """Retrieves a report about the contacted domains of a md5, sha1, and/or sha2 hash of 81 | file, when it is executed. 82 | 83 | Args: 84 | file_hash_list: list of string hashes. 
85 | """ 86 | api_name = 'virustotal-file-contacted-domains' 87 | api_endpoint = 'files/{}/contacted_domains' 88 | 89 | return self._extract_all_responses(file_hash_list, api_endpoint, api_name) 90 | 91 | @MultiRequest.error_handling 92 | def get_file_contacted_ips(self, file_hash_list): 93 | """Retrieves a report about the contacted ip addresses of a md5, sha1, 94 | and/or sha2 hash of file, when it is executed. 95 | 96 | Args: 97 | resources: list of string hashes. 98 | """ 99 | api_name = 'virustotal-file-contacted-ips' 100 | api_endpoint = 'files/{}/contacted_ips' 101 | 102 | return self._extract_all_responses(file_hash_list, api_endpoint, api_name) 103 | 104 | @MultiRequest.error_handling 105 | def get_file_contacted_urls(self, file_hash_list): 106 | """Retrieves a report about the contacted urls of a md5, sha1, 107 | and/or sha2 hash of file, when it is executed. 108 | 109 | Args: 110 | file_hash_list: list of string hashes. 111 | """ 112 | api_name = 'virustotal-file-contacted-urls' 113 | api_endpoint = 'files/{}/contacted_urls' 114 | 115 | return self._extract_all_responses(file_hash_list, api_endpoint, api_name) 116 | 117 | @MultiRequest.error_handling 118 | def get_file_itw_urls(self, file_hash_list): 119 | """Retrieves a report about the in the wild URLs from where the file 120 | with the hash has been downloaded. 121 | 122 | Args: 123 | file_hash_list: list of string hashes. 124 | """ 125 | api_name = 'virustotal-file-itw-urls' 126 | api_endpoint = 'files/{}/itw_urls' 127 | 128 | return self._extract_all_responses(file_hash_list, api_endpoint, api_name) 129 | 130 | @MultiRequest.error_handling 131 | def get_domain_communicating_files(self, domain_list): 132 | """Retrieves a report about the files that communicate with this internet domain. 133 | 134 | Args: 135 | domain_list: list of string domains. 
136 | """ 137 | api_name = 'virustotal-domain-communicating-files' 138 | api_endpoint = 'domains/{}/communicating_files' 139 | 140 | return self._extract_all_responses(domain_list, api_endpoint, api_name) 141 | 142 | @MultiRequest.error_handling 143 | def get_domain_referrer_files(self, domain_list): 144 | """Retrieves a report about the files containing the internet domain. 145 | 146 | Args: 147 | domain_list: list of string domains. 148 | """ 149 | api_name = 'virustotal-domain-referrer-files' 150 | api_endpoint = 'domains/{}/referrer_files' 151 | 152 | return self._extract_all_responses(domain_list, api_endpoint, api_name) 153 | 154 | @MultiRequest.error_handling 155 | def get_domain_reports(self, domain_list): 156 | """Retrieves the most recent VT info for a set of domains. 157 | 158 | Args: 159 | domain_list: list of string domains. 160 | Returns: 161 | A dict with the domain as key and the VT report as value. 162 | """ 163 | api_name = 'virustotal-domain-reports' 164 | 165 | (all_responses, domain_list) = self._bulk_cache_lookup(api_name, domain_list) 166 | responses = self._request_reports(domain_list, 'domains/{}') 167 | 168 | for domain, response in zip(domain_list, responses): 169 | if self._cache: 170 | self._cache.cache_value(api_name, domain, response) 171 | all_responses[domain] = response 172 | 173 | return all_responses 174 | 175 | @MultiRequest.error_handling 176 | def get_feeds_url(self, time_frame): 177 | """Retrieves a live feed with the latest URLs submitted to VT. 178 | 179 | Args: 180 | time_frame: a list of timeframe strings in date format YYYYMMDDhhmm. 181 | Returns: 182 | A base64 encoded bzip2 compressed UTF-8 text file contains one JSON structure per line. 
183 | """ 184 | api_name = 'virustotal-url-distribution' 185 | all_responses = {} 186 | 187 | response = self._request_reports(time_frame, 'feeds/urls/{}', file_download=True) 188 | self._extract_response_chunks(all_responses, response, api_name) 189 | 190 | return all_responses 191 | 192 | @MultiRequest.error_handling 193 | def get_file_distribution(self, time_frame): 194 | """Retrieves a live feed with the latest hashes submitted to VT. 195 | 196 | Args: 197 | time_frame: A list of strings in format YYYYMMDDhhmm. 198 | Returns: 199 | A dict with the VT report. 200 | """ 201 | all_responses = {} 202 | api_name = 'virustotal-file-distribution' 203 | 204 | response = self._request_reports(time_frame, 'feeds/files/{}') 205 | self._extract_response_chunks(all_responses, response, api_name) 206 | 207 | return all_responses 208 | 209 | @MultiRequest.error_handling 210 | def get_url_reports(self, url_hash_list): 211 | """Retrieves a scan report on a given URL. 212 | 213 | Args: 214 | url_hash_list: list of sha256 hashed urls. 215 | Returns: 216 | A dict with the URL hash as key and the VT report as value. 217 | """ 218 | api_name = 'virustotal-url-reports' 219 | api_endpoint = 'urls/{}' 220 | 221 | return self._extract_all_responses(url_hash_list, api_endpoint, api_name) 222 | 223 | @MultiRequest.error_handling 224 | def get_ip_reports(self, ips): 225 | """Retrieves the most recent VT info for a set of ips. 226 | 227 | Args: 228 | ips: list of IPs. 229 | Returns: 230 | A dict with the IP as key and the VT report as value. 
231 | """ 232 | api_name = 'virustotal-ip-address-reports' 233 | 234 | (all_responses, ips) = self._bulk_cache_lookup(api_name, ips) 235 | responses = self._request_reports(ips, 'ip_addresses/{}') 236 | 237 | for ip, response in zip(ips, responses): 238 | if self._cache: 239 | self._cache.cache_value(api_name, ip, response) 240 | all_responses[ip] = response 241 | 242 | return all_responses 243 | 244 | @MultiRequest.error_handling 245 | def get_file_search(self, query): 246 | """Performs advanced search on samples, matching certain binary/ 247 | metadata/detection criteria. 248 | Possible queries: file size, file type, first or last submission to 249 | VT, number of positives, bynary content, etc. 250 | 251 | Args: 252 | query: dictionary with search arguments 253 | Example: 'query': 'type:peexe size:90kb+ positives:5+ behaviour:"taskkill"' 254 | Returns: 255 | A dict with the VT report. 256 | """ 257 | api_name = 'virustotal-file-search' 258 | api_endpoint = 'intelligence/search?query={}' 259 | 260 | return self._extract_all_responses(query, api_endpoint, api_name) 261 | 262 | @MultiRequest.error_handling 263 | def get_file_clusters(self, time_frame): 264 | """Retrieves file similarity clusters for a given time frame. 265 | 266 | Args: 267 | time_frame: a list of time frames for which we want the clustering details in YYYYMMDDhhmm format. 268 | Returns: 269 | A dict with the VT report. 270 | """ 271 | api_name = 'virustotal-file-clusters' 272 | api_endpoint = 'feeds/file-behaviours/{}' 273 | 274 | return self._extract_all_responses(time_frame, api_endpoint, api_name) 275 | 276 | 277 | def _bulk_cache_lookup(self, api_name, keys): 278 | """Performes a bulk cache lookup and returns a tuple with the results 279 | found and the keys missing in the cache. If cached is not configured 280 | it will return an empty dictionary of found results and the initial 281 | list of keys. 282 | 283 | Args: 284 | api_name: a string name of the API. 
285 | keys: an enumerable of string keys. 286 | Returns: 287 | A tuple: (responses found, missing keys). 288 | """ 289 | if self._cache: 290 | responses = self._cache.bulk_lookup(api_name, keys) 291 | missing_keys = [key for key in keys if key not in responses.keys()] 292 | return (responses, missing_keys) 293 | 294 | return ({}, keys) 295 | 296 | def _request_reports(self, ids, endpoint_name, file_download=False): 297 | """Sends multiples requests for the resources to a particular endpoint. 298 | 299 | Args: 300 | ids: list of the hash identifying the file. 301 | endpoint_name: VirusTotal endpoint URL suffix. 302 | file_download: boolean, whether a file download is expected 303 | Returns: 304 | A list of the responses. 305 | """ 306 | urls = ['{}{}'.format(self.BASE_DOMAIN, endpoint_name.format(id)) for id in ids] 307 | return self._requests.multi_get(urls, file_download=file_download) if urls else [] 308 | 309 | 310 | def _extract_cache_id(self, response): 311 | """Extracts the object hash from the response to be used to 312 | uniquely identify the result. 313 | 314 | Args: 315 | response: response object. 316 | Returns: 317 | A hash that uniquely identities the result. 318 | """ 319 | 320 | cache_id = None 321 | if isinstance(response['data'], list): 322 | if response['data']: 323 | # gets the first data items' id 324 | cache_id = response['data'][0]['id'] 325 | else: 326 | cache_id = response['data']['id'] 327 | # sandbox id output has an underscore as the separator 328 | if cache_id and '_' in cache_id: 329 | cache_id = cache_id.split('_')[0] 330 | return cache_id 331 | 332 | def _extract_all_responses(self, resources, api_endpoint, api_name, file_download=False): 333 | """ Aux function to extract all the API endpoint responses. 334 | 335 | Args: 336 | resources: list of string hashes. 337 | api_endpoint: endpoint path 338 | api_name: endpoint name 339 | Returns: 340 | A dict with the hash as key and the VT report as value. 
341 | """ 342 | all_responses, resources = self._bulk_cache_lookup(api_name, resources) 343 | response_chunks = self._request_reports(resources, api_endpoint, file_download) 344 | self._extract_response_chunks(all_responses, response_chunks, api_name) 345 | 346 | return all_responses 347 | 348 | def _extract_response_chunks(self, all_responses, response_chunks, api_name): 349 | """Extracts and caches the responses from the response chunks in case 350 | of the responses for the requests containing multiple concatenated 351 | resources. Extracted responses are added to the already cached 352 | responses passed in the all_responses parameter. 353 | 354 | Args: 355 | all_responses: a list containing already cached responses. 356 | response_chunks: a list with response chunks. 357 | api_name: a string name of the API. 358 | """ 359 | for response_chunk in response_chunks: 360 | if not isinstance(response_chunk, list): 361 | response_chunk = [response_chunk] 362 | for response in response_chunk: 363 | if not response: 364 | continue 365 | 366 | cache_id = self._extract_cache_id(response) 367 | if cache_id: 368 | if self._cache: 369 | self._cache.cache_value(api_name, cache_id, response) 370 | all_responses[cache_id] = response 371 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | project = threat_intel 3 | envlist = py27,py36 4 | tox_pip_extensions_ext_pip_custom_platform = true 5 | 6 | [testenv] 7 | deps = -r{toxinidir}/requirements-dev.txt 8 | commands = 9 | {envpython} --version 10 | coverage run --source=threat_intel/,tests/ -m testify.test_program --summary --verbose {posargs:tests} 11 | coverage report -m 12 | 13 | [testenv:venv] 14 | envdir = virtualenv_run 15 | basepython = python2.7 16 | commands= 17 | 18 | [flake8] 19 | max_line_length = 140 20 | --------------------------------------------------------------------------------