├── tests
├── __init__.py
├── util
│ ├── __init__.py
│ ├── error_messages_test.py
│ ├── api_cache_test.py
│ └── http_test.py
├── data
│ └── response.xml
├── alexa_test.py
├── virustotal_test.py
└── opendns_test.py
├── MANIFEST.in
├── requirements.txt
├── threat_intel
├── util
│ ├── __init__.py
│ ├── error_messages.py
│ ├── api_cache.py
│ └── http.py
├── exceptions.py
├── shadowserver.py
├── alexaranking.py
├── opendns.py
├── virustotal.py
└── __init__.py
├── requirements-dev.txt
├── Makefile
├── .travis.yml
├── tox.ini
├── setup.py
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── .secrets.baseline
└── README.md
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
3 |
--------------------------------------------------------------------------------
/tests/util/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests-futures==0.9.9
2 | requests[security]==2.21.0
3 | simplejson==3.10.0
4 | six==1.10.0
5 |
--------------------------------------------------------------------------------
/threat_intel/util/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
# Submodules re-exported for `from threat_intel.util import *`.
__all__ = ['api_cache', 'error_messages', 'http']
4 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -e .
2 | -r requirements.txt
3 | coverage==3.7.1
4 | mock==1.0.1
5 | pre-commit==1.13.0
6 | pyflakes==0.9.2
7 | testify==0.7.2
8 | tornado==4.5.3
9 | tox==2.3.1
10 |
--------------------------------------------------------------------------------
/tests/data/response.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/threat_intel/exceptions.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # All exceptions thrown by the threat_intel module
4 | #
5 |
6 |
class InvalidRequestError(Exception):
    """Raised by MultiRequest when it can't figure out how to make a request."""
11 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .DELETE_ON_ERROR:
2 |
3 | all:
4 | echo >&2 "Must specify target."
5 |
6 | test:
7 | tox
8 |
9 | venv:
10 | tox -evenv
11 |
12 | install-hooks: venv
13 | virtualenv_run/bin/pre-commit install -f --install-hooks
14 |
15 | clean:
16 | rm -rf build/ dist/ threat_intel.egg-info/ .tox/ virtualenv_run/
17 | find . -name '*.pyc' -delete
18 | find . -name '__pycache__' -delete
19 |
20 | .PHONY: all test venv clean install-hooks
21 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - '2.7'
4 | - '3.6'
5 |
6 | install:
7 | - pip install tox-travis
8 |
9 | script: make test
10 | deploy:
11 | provider: pypi
12 | user: yelplabs
13 | password:
14 | secure: QG9rd2z6pH4E5NCph+mw739wsaTlTpy1c5+AR1q+w/ZSrMpucNdp1i8BXAgpj2kIvuaIQQd3Behu+SVd7u5TSCZoAE7PxUKBFvEiN/7g++RVlDlPcpXTVQT8qXfvFnTGCnS95pLhXVIMDJU4cUjjDS6kshBVuvn2MTwskY4emow=
15 | on:
16 | tags: true
17 | python: '3.6'
18 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | project = threat_intel
3 | envlist = py27,py36
4 | tox_pip_extensions_ext_pip_custom_platform = true
5 |
6 | [testenv]
7 | deps = -r{toxinidir}/requirements-dev.txt
8 | commands =
9 | {envpython} --version
10 | coverage run --source=threat_intel/,tests/ -m testify.test_program --summary --verbose {posargs:tests}
11 | coverage report -m
12 |
13 | [testenv:venv]
14 | envdir = virtualenv_run
15 | basepython = python2.7
16 | commands=
17 |
18 | [flake8]
19 | max_line_length = 140
20 |
--------------------------------------------------------------------------------
/threat_intel/util/error_messages.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # A set of simple methods for writing messages to stderr
4 | #
5 | import sys
6 | from traceback import extract_tb
7 | from traceback import format_list
8 |
9 |
def write_exception(e):
    """Write an exception's type, message, and traceback to stderr.

    Intended to be called from inside an ``except`` block, in which case the
    active exception's type and traceback (from ``sys.exc_info()``) are used.

    Args:
        e: the exception instance to report.
    """
    exc_type, __, exc_traceback = sys.exc_info()
    # sys.exc_info() returns (None, None, None) when there is no exception
    # being handled; fall back to the exception's own class so we do not
    # crash with AttributeError on `None.__name__`.
    type_name = exc_type.__name__ if exc_type is not None else type(e).__name__
    sys.stderr.write('[ERROR] {0} {1}\n'.format(type_name, str(e)))
    # extract_tb(None) yields an empty list, so this loop is a no-op when
    # there is no traceback to print.
    for line in format_list(extract_tb(exc_traceback)):
        sys.stderr.write(line)
15 |
16 |
def write_error_message(message):
    """Write *message* to stderr, prefixed with an `[ERROR]` tag.

    Args:
        message: a string error message.
    """
    sys.stderr.write('[ERROR] {0}\n'.format(message))
21 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from setuptools import find_packages
3 | from setuptools import setup
4 |
5 |
# Distribution metadata for the threat_intel package. Runtime dependencies
# here are floors (>=), while requirements.txt pins exact versions for CI.
setup(
    name="threat_intel",
    version='0.2.1',
    provides=['threat_intel'],
    author="Yelp Security",
    url='https://github.com/Yelp/threat_intel',
    setup_requires='setuptools',
    license='Copyright 2016 Yelp',
    author_email="opensource@yelp.com",
    description="Collection of the API calls for various threat intel feeds.",
    packages=find_packages(),
    install_requires=[
        "requests-futures>=0.9.9",
        "requests[security]>=2.13.0",
        "simplejson>=3.10.0",
        "six>=1.10.0",
    ],
)
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # PyInstaller
26 | # Usually these files are written by a python script from a template
27 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 |
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 |
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .coverage
39 | .cache
40 | nosetests.xml
41 | coverage.xml
42 |
43 | # Translations
44 | *.mo
45 | *.pot
46 |
47 | # Django stuff:
48 | *.log
49 |
50 | # Sphinx documentation
51 | docs/_build/
52 |
53 | # PyBuilder
54 | target/
55 |
56 | .coverage
57 | virtualenv_run/
58 | .DS_Store
59 | .idea
60 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: git://github.com/pre-commit/pre-commit-hooks
3 | sha: v2.1.0
4 | hooks:
5 | - id: trailing-whitespace
6 | - id: end-of-file-fixer
7 | - id: check-yaml
8 | - id: debug-statements
9 | - id: name-tests-test
10 | - id: check-added-large-files
11 | - id: check-byte-order-marker
12 | - id: fix-encoding-pragma
13 | - id: flake8
14 | - id: requirements-txt-fixer
15 | - repo: git://github.com/asottile/reorder_python_imports
16 | sha: v1.3.4
17 | hooks:
18 | - id: reorder-python-imports
19 | - repo: git@git.yelpcorp.com:mirrors/asottile/add-trailing-comma
20 | rev: v0.7.1
21 | hooks:
22 | - id: add-trailing-comma
23 | - repo: git@git.yelpcorp.com:mirrors/pre-commit/mirrors-autopep8
24 | rev: v1.4.3
25 | hooks:
26 | - id: autopep8
27 | - repo: https://github.com/Yelp/detect-secrets
28 | sha: v0.12.0
29 | hooks:
30 | - id: detect-secrets
31 | args: ['--baseline', '.secrets.baseline']
32 | exclude: .*tests/.*|\.pre-commit-config\.yaml
33 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Yelp.com
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/tests/util/error_messages_test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from six import StringIO
3 |
4 | import testify as T
5 | from mock import patch
6 |
7 | from threat_intel.exceptions import InvalidRequestError
8 | from threat_intel.util.error_messages import write_error_message
9 | from threat_intel.util.error_messages import write_exception
10 |
11 |
class StdErrTestCase(T.TestCase):

    """Base test case that mocks out sys.stderr.

    The replacement StringIO is kept on the instance so subclasses can
    inspect whatever the code under test wrote to stderr.
    """

    @T.setup_teardown
    def setupStringIO(self):
        # The patch stays active for the duration of each test: testify
        # resumes this generator (past the yield) to tear it down.
        self._stringio = StringIO()
        with patch('sys.stderr', self._stringio):
            yield
21 |
22 |
class WriteExceptionTest(StdErrTestCase):

    """Exercises write_exception() against the mocked stderr."""

    def test_simple_exception(self):
        # A generic Exception should still produce the '[ERROR]' prefix.
        try:
            raise Exception()
        except Exception as e:
            write_exception(e)

        output = self._stringio.getvalue()
        # find() == 0 means the output starts with the expected text.
        T.assert_equal(0, output.find('[ERROR]'))

    def test_specific_exception(self):
        # The concrete exception class name should follow the prefix.
        try:
            raise InvalidRequestError()
        except Exception as e:
            write_exception(e)

        output = self._stringio.getvalue()
        T.assert_equal(0, output.find('[ERROR] InvalidRequestError'))

    def test_exception_message(self):
        # The exception's message should appear after the class name.
        try:
            raise InvalidRequestError('Look for me in validation')
        except Exception as e:
            write_exception(e)

        output = self._stringio.getvalue()
        T.assert_equal(0, output.find('[ERROR] InvalidRequestError Look for me in validation'))
51 |
52 |
class WriteErrorMessageTest(StdErrTestCase):

    """Exercises write_error_message() against the mocked stderr."""

    def test_write_error_message(self):
        # The full line, including the '[ERROR] ' prefix and trailing
        # newline, must match exactly.
        message = 'Look for me in validation'
        expected = '[ERROR] Look for me in validation\n'

        write_error_message(message)

        output = self._stringio.getvalue()
        T.assert_equal(output, expected)
63 |
--------------------------------------------------------------------------------
/.secrets.baseline:
--------------------------------------------------------------------------------
1 | {
2 | "exclude": {
3 | "files": ".*tests/.*|\\.pre-commit-config\\.yaml",
4 | "lines": null
5 | },
6 | "generated_at": "2019-02-21T16:33:09Z",
7 | "plugins_used": [
8 | {
9 | "base64_limit": 4.5,
10 | "name": "Base64HighEntropyString"
11 | },
12 | {
13 | "hex_limit": 3,
14 | "name": "HexHighEntropyString"
15 | },
16 | {
17 | "name": "PrivateKeyDetector"
18 | }
19 | ],
20 | "results": {
21 | ".travis.yml": [
22 | {
23 | "hashed_secret": "9510ca1b3eda474063afbc25da5d08ac1314f340",
24 | "line_number": 14,
25 | "type": "Base64 High Entropy String"
26 | }
27 | ],
28 | "README.md": [
29 | {
30 | "hashed_secret": "d39359993ff73436cd2caf84970d3247051968b5",
31 | "line_number": 463,
32 | "type": "Hex High Entropy String"
33 | },
34 | {
35 | "hashed_secret": "8b0b46d5092ecb0b2e078091a07c421758d8b51e",
36 | "line_number": 545,
37 | "type": "Hex High Entropy String"
38 | },
39 | {
40 | "hashed_secret": "1d86040d03a0ace59fa4ef4988341f5dba9ddab8",
41 | "line_number": 719,
42 | "type": "Hex High Entropy String"
43 | },
44 | {
45 | "hashed_secret": "5ec0c35f36d8a545fb8225c525c9d9c3a3e174fc",
46 | "line_number": 720,
47 | "type": "Hex High Entropy String"
48 | }
49 | ],
50 | "threat_intel/__init__.py": [
51 | {
52 | "hashed_secret": "d39359993ff73436cd2caf84970d3247051968b5",
53 | "line_number": 370,
54 | "type": "Hex High Entropy String"
55 | },
56 | {
57 | "hashed_secret": "8b0b46d5092ecb0b2e078091a07c421758d8b51e",
58 | "line_number": 439,
59 | "type": "Hex High Entropy String"
60 | },
61 | {
62 | "hashed_secret": "1d86040d03a0ace59fa4ef4988341f5dba9ddab8",
63 | "line_number": 522,
64 | "type": "Hex High Entropy String"
65 | },
66 | {
67 | "hashed_secret": "5ec0c35f36d8a545fb8225c525c9d9c3a3e174fc",
68 | "line_number": 523,
69 | "type": "Hex High Entropy String"
70 | }
71 | ]
72 | },
73 | "version": "0.12.0"
74 | }
75 |
--------------------------------------------------------------------------------
/threat_intel/util/api_cache.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # ApiCache creates an on disk cache of API call results
4 | #
5 | import simplejson
6 | from simplejson.scanner import JSONDecodeError
7 |
8 |
class ApiCache(object):

    """Creates an on disk cache of API call results.

    The cache is a single JSON file mapping api_name -> {key -> value}.
    """

    def __init__(self, cache_file_name, update_cache=True):
        """Opens the cache file and reads previous results.

        Args:
            cache_file_name: string file name of the JSON cache file.
            update_cache: Specifies whether ApiCache should write out the
                          cache file when closing it.
        """
        self._cache_file_name = cache_file_name
        self._cache = self._read_cache_from_file()
        self._update_cache = update_cache

    def __del__(self):
        """Ensures cache is persisted to disk before object is destroyed.

        Using a destructor is a bit inflammatory but it seems like a very nice way to write a file when "everything is done".
        The ApiCache avoids circular dependencies so it should work out.
        """
        # Guard against a partially constructed instance: if __init__ raised
        # before self._cache was assigned, close() would hit AttributeError
        # during interpreter shutdown.
        if getattr(self, '_cache', None) is not None:
            self.close()

    def close(self):
        """Write the contents of the cache to disk (only if the `update_cache`
        parameter during the object initialization was not set to `False`) and
        clear the in memory cache."""
        # NOTE: an empty cache is deliberately never written out (and is not
        # cleared), preserving the original truthiness-based behavior.
        if self._cache:
            if self._update_cache:
                self._write_cache_to_file()
            self._cache = None

    def _write_cache_to_file(self):
        """Serialize the in-memory cache to JSON and write it to disk."""
        with open(self._cache_file_name, 'w') as fp:
            fp.write(simplejson.dumps(self._cache))

    def _read_cache_from_file(self):
        """Read the contents of the cache from the file on disk.

        Returns:
            A dict with the cached contents, or an empty dict when the file
            is missing or does not contain valid JSON.
        """
        try:
            with open(self._cache_file_name, 'r') as fp:
                return simplejson.loads(fp.read())
        except (IOError, JSONDecodeError):
            # The file could not be read. This is not a problem if the file does not exist.
            return {}

    def cache_value(self, api_name, key, value):
        """Add the value of an API call to the cache.

        Args:
            api_name: a string name of the API. Keys and values are segmented by api_name.
            key: a string key for the specific call.
            value: the value of the call using the specific key.
        """
        self._cache.setdefault(api_name, {})[key] = value

    def lookup_value(self, api_name, key):
        """Look up the cached value of an API call.

        Args:
            api_name: a string name of the API. Keys and values are segmented by api_name.
            key: a string key for the specific call.
        Returns:
            The cached value, or None when it has not been cached.
        """
        return self._cache.get(api_name, {}).get(key)

    def bulk_lookup(self, api_name, keys):
        """Perform lookup on an enumerable of keys.

        Args:
            api_name: a string name of the API. Keys and values are segmented by api_name.
            keys: an enumerable of string keys.
        Returns:
            A dict mapping each key found in the cache to its cached value;
            keys with no cached value are omitted.
        """
        cached_data = {}
        for key in keys:
            value = self.lookup_value(api_name, key)
            if value is not None:
                cached_data[key] = value
        return cached_data
96 |
--------------------------------------------------------------------------------
/tests/alexa_test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | import testify as T
4 | from mock import patch
5 |
6 | from threat_intel.alexaranking import AlexaRankingApi
7 |
8 | from requests.models import Response
9 |
10 |
class AlexaRankingApiTest(T.TestCase):

    """Tests requesting reports from AlexaRankingApi."""

    def mock_ok_response(self):
        """Mocks a successful request response.

        Returns:
            A requests Response with status 200 and the canned XML fixture
            from tests/data/response.xml as its body.
        """
        content_ok = open("tests/data/response.xml").read()
        response = Response()
        response.status_code = 200
        response._content = content_ok
        return response

    def mock_bad_response(self):
        """Mocks an unsuccessful request response.

        Returns:
            A requests Response with status 400 and a non-XML body.
        """
        response = Response()
        content_bad = u'Internal Server Error'.encode('utf-8')
        response.status_code = 400
        response._content = content_bad
        return response

    @T.setup
    def setup_ar(self):
        # Fresh API instance per test; the HTTP layer is mocked out below.
        self.ar = AlexaRankingApi()

    def _test_api_call(
            self, call, request, expected_query_params, api_response,
            expected_result):
        """
        Tests a AlexaRankingApi call by mocking out the HTTP request.

        Args:
            call: Function in AlexaRankingApi to call.
            request: Call arguments.
            expected_query_params: Parameters that should be passed to API.
            api_response: The canned response the mocked API returns.
            expected_result: What the call should return.
        """
        with patch.object(self.ar, '_requests') as request_mock:
            request_mock.multi_get.return_value = api_response
            result = call(request)
            # Verify both the outgoing request shape and the parsed result.
            request_mock.multi_get.assert_called_with(
                self.ar.BASE_URL,
                to_json=False,
                query_params=expected_query_params)
            T.assert_equal(result, expected_result)

    def test_get_alexa_rankings_good_response(self):
        # A 200 XML response should be parsed into the ranking attributes.
        successful_response = self.mock_ok_response()
        self._test_api_call(call=self.ar.get_alexa_rankings,
                            request=['domain1.com'],
                            expected_query_params=[{'url': 'domain1.com'}],
                            api_response=[successful_response],
                            expected_result={
                                "domain1.com": {
                                    "attributes": {
                                        "domain": "domain1.com",
                                        "popularity": "81743",
                                        "reach": "76276",
                                        "rank": "-67329"
                                    }
                                }
                            })

    def test_get_alexa_rankings_bad_response(self):
        # A non-XML error body should yield only the 'domain' attribute.
        unsuccessful_response = self.mock_bad_response()
        self._test_api_call(call=self.ar.get_alexa_rankings,
                            request=['domain2.com'],
                            expected_query_params=[{'url': 'domain2.com'}],
                            api_response=[unsuccessful_response],
                            expected_result={
                                "domain2.com": {
                                    "attributes": {
                                        "domain": "domain2.com"
                                    }
                                }
                            })
88 |
--------------------------------------------------------------------------------
/threat_intel/shadowserver.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # ShadowServerApi makes calls to the ShadowServer APIs.
4 | #
5 | from six.moves import range
6 | import simplejson
7 |
8 | from threat_intel.util.api_cache import ApiCache
9 | from threat_intel.util.http import MultiRequest
10 |
11 |
class ShadowServerApi(object):

    """Queries the ShadowServer bin-test service, which maps hashes of known
    software binaries to metadata about the application they belong to."""

    BINTEST_URL = u'http://bin-test.shadowserver.org/api'

    def __init__(self, cache_file_name=None, update_cache=True, req_timeout=90.0):
        """Establishes basic HTTP params and loads a cache.

        Args:
            cache_file_name: String file name of cache.
            update_cache: Determines whether cache should be written out back to the disk when closing it.
                Default is `True`.
            req_timeout: Maximum number of seconds to wait without reading a response byte before deciding an error has occurred.
                Default is 90.0 seconds.
        """

        # TODO - lookup request rate limit
        # By observation, ShadowServer can be quite slow, so give it 90 seconds before it times out.
        self._requests = MultiRequest(max_requests=2, req_timeout=req_timeout)

        # Create an ApiCache if instructed to
        self._cache = ApiCache(cache_file_name, update_cache) if cache_file_name else None

    @MultiRequest.error_handling
    def get_bin_test(self, hashes):
        """Test hashes against a list of known software applications.

        Known hashes will return a dictionary of information.
        Unknown hashes will return nothing.

        Args:
            hashes: list of string hashes.
        Returns:
            A dict with the hash as key and the shadowserver report as value.
        """
        all_responses = {}

        if self._cache:
            api_name = 'shadowserver-bin-test'
            # Pull everything already cached, then only query ShadowServer
            # for the hashes that were missing.
            all_responses = self._cache.bulk_lookup(api_name, hashes)
            hashes = [key for key in hashes if key not in all_responses.keys()]
            # Cached entries with fewer than 2 fields represent negative or
            # error results: drop them from the output (they remain cached).
            all_responses = dict([(key, val) for key, val in all_responses.items() if len(val) >= 2])

        # The endpoint accepts a batch of hashes, one per line.
        HASHES_PER_REQ = 25
        hash_chunks = ['\n'.join(hashes[pos:pos + HASHES_PER_REQ]) for pos in range(0, len(hashes), HASHES_PER_REQ)]

        responses = self._requests.multi_post(self.BINTEST_URL, data=hash_chunks, to_json=False, send_as_file=True)
        for response in responses:
            if response is not None and 200 == response.status_code:
                # Each response line is '<hash> <json metadata>' for a known
                # hash, or just '<hash>' when the hash is unknown.
                response_lines = response.text.split('\n')
                for line in response_lines:
                    # Set an initial val.
                    val = {}

                    # There is just a key, no value. This means the hash was unknown to ShadowServer.
                    index_of_first_space = line.find(' ')
                    if -1 == index_of_first_space:
                        index_of_first_space = len(line)
                    key = line[:index_of_first_space].lower()

                    # The response only has a JSON body if the hash was known.
                    json_text = line[index_of_first_space + 1:]
                    if len(json_text):
                        try:
                            val = simplejson.loads(json_text)
                            # A very short response indicates an error?
                            if len(val.keys()) >= 2:
                                all_responses[key] = val

                        except ValueError:
                            # Sometimes ShadowServer returns invalid data. Silently skip it.
                            pass

                    if self._cache:
                        # Negative results are cached too (as {}), so repeat
                        # lookups of unknown hashes skip the network.
                        self._cache.cache_value(api_name, key, val)

        return all_responses
87 |
--------------------------------------------------------------------------------
/threat_intel/alexaranking.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # AlexaRankingsAPI makes calls to the Alexa Ranking API
4 | #
5 | from threat_intel.util.api_cache import ApiCache
6 | from threat_intel.util.http import MultiRequest
7 | import xml.etree.ElementTree as ET
8 | from xml.etree.ElementTree import ParseError
9 |
10 |
class AlexaRankingApi(object):

    """Queries the Alexa ranking API for domain popularity information."""

    BASE_URL = u'https://data.alexa.com/data?cli=10'

    def __init__(self, resources_per_req=10, cache_file_name=None,
                 update_cache=True, req_timeout=None):
        """Establishes basic HTTP params and loads a cache.

        Args:
            resources_per_req: Maximum number of resources (hashes, URLs)
                to be send in a single request
            cache_file_name: String file name of cache.
            update_cache: Determines whether cache should be written out
                back to the disk when closing it.
                Default is `True`.
            req_timeout: Maximum number of seconds to wait without reading
                a response byte before deciding an error has occurred.
                Default is None.
        """
        self._resources_per_req = resources_per_req
        self._requests = MultiRequest(req_timeout=req_timeout)

        # Create an ApiCache if instructed to
        self._cache = ApiCache(cache_file_name,
                               update_cache) if cache_file_name else None

    @MultiRequest.error_handling
    def get_alexa_rankings(self, domains):
        """Retrieves the most recent Alexa rankings for a set of domains.

        Args:
            domains: list of string domains.
        Returns:
            A dict with the domain as key and its parsed Alexa attributes
            dict as value.
        """
        api_name = 'alexa_rankings'

        (all_responses, domains) = self._bulk_cache_lookup(api_name, domains)
        responses = self._request_reports(domains)

        for domain, response in zip(domains, responses):
            xml_response = self._extract_response_xml(domain, response)
            if self._cache:
                # Cache the parsed, JSON-serializable attributes dict rather
                # than the raw Response object: the on-disk cache cannot
                # serialize a Response, and cache hits would otherwise come
                # back in a different shape than fresh lookups.
                self._cache.cache_value(api_name, domain, xml_response)
            all_responses[domain] = xml_response

        return all_responses

    def _request_reports(self, domains):
        """Sends one GET request per domain to the Alexa ranking endpoint.

        Args:
            domains: list of string domains.
        Returns:
            A list of HTTP responses, one per domain, in input order.
        """
        params = [{'url': domain} for domain in domains]
        responses = self._requests.multi_get(
            self.BASE_URL, query_params=params, to_json=False)
        return responses

    def _extract_response_xml(self, domain, response):
        """Extract XML content of an HTTP response into dictionary format.

        Args:
            domain: string domain the response corresponds to.
            response: HTTP Response object whose body is Alexa's XML report.
        Returns:
            A dictionary: {'attributes': {alexa-ranking key: value, ...}};
            the 'domain' attribute is always present, the ranking attributes
            only when the body parsed as XML and contained them.
        """
        attributes = {}
        # Maps each interesting XML tag to the XML attribute holding its value.
        alexa_keys = {'POPULARITY': 'TEXT', 'REACH': 'RANK', 'RANK': 'DELTA'}
        try:
            xml_root = ET.fromstring(response._content)
            # 'SD//*' selects every descendant of the <SD> elements; it makes
            # explicit the implicit trailing '*' that ElementPath appends to
            # the ambiguous 'SD//' form used previously.
            for xml_child in xml_root.findall('SD//*'):
                if xml_child.tag in alexa_keys and \
                        alexa_keys[xml_child.tag] in xml_child.attrib:
                    attributes[xml_child.tag.lower(
                    )] = xml_child.attrib[alexa_keys[xml_child.tag]]
        except ParseError:
            # Skip ill-formatted XML and return no Alexa attributes
            pass
        attributes['domain'] = domain
        return {'attributes': attributes}

    def _bulk_cache_lookup(self, api_name, keys):
        """Performs a bulk cache lookup and returns a tuple with the results
        found and the keys missing in the cache. If the cache is not
        configured it will return an empty dictionary of found results and
        the initial list of keys.

        Args:
            api_name: a string name of the API.
            keys: an enumerable of string keys.
        Returns:
            A tuple: (responses found, missing keys).
        """
        if self._cache:
            responses = self._cache.bulk_lookup(api_name, keys)
            missing_keys = [key for key in keys if key not in responses]
            return (responses, missing_keys)

        return ({}, keys)
115 |
--------------------------------------------------------------------------------
/tests/util/api_cache_test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | from six.moves import builtins
4 | import simplejson
5 | import testify as T
6 | from mock import mock_open
7 | from mock import patch
8 |
9 | from threat_intel.util.api_cache import ApiCache
10 |
11 |
def assert_cache_written(mock_write, patched_open):
    """Assert that exactly one cache write happened and return its payload.

    Scans the mocked open() call history for the first write() call and
    decodes its JSON argument; returns None when no write was recorded.
    """
    T.assert_equal(mock_write.call_count, 1)

    for name, args, kwargs in patched_open.mock_calls:
        if name == '().write':
            return simplejson.loads(args[0])

    return None
22 |
23 |
def assert_cache_not_written(mock_write):
    """Assert that no write() call was ever issued on the mocked open()."""
    T.assert_falsey(mock_write.called)
27 |
28 |
class ApiCacheFileIOTest(T.TestCase):

    """Allows for setting and retrieving results of API calls.

    All file I/O is mocked with mock_open, so no real files are touched.
    """

    @T.setup
    def setup_filename(self):
        # Never actually created: open() is patched in the helpers below.
        self._file_name = '/tmp/any_name_will_do'

    def _open_cache(self, initial_contents=None, update_cache=True):
        """Creates an ApiCache object, mocking the contents of the cache on disk.

        Args:
            initial_contents: A dict containing the initial contents of the cache
            update_cache: Specifies whether ApiCache should write out the
                          cache file when closing it
        Returns:
            ApiCache
        """
        if not initial_contents:
            initial_contents = {}

        # Serve the serialized contents as the "file" ApiCache reads on init.
        file_contents = simplejson.dumps(initial_contents)
        mock_read = mock_open(read_data=file_contents)
        with patch.object(builtins, 'open', mock_read, create=True):
            api_cache = ApiCache(self._file_name, update_cache=update_cache)
            return api_cache

    def _close_cache(self, api_cache, cache_written=True):
        """Closes an ApiCache and reads the final contents that were written to disk.

        Args:
            api_cache: An ApiCache instance
            cache_written: Specifies whether it should test that the cache
                           was written out to the cache file or whether to
                           test that it was not written out
        Returns:
            A dict representing the contents of the cache that was written
            out to the cache file or `None` in case cache was not expected
            to be written out
        """
        # Patch open() again so close() writes into the mock, not the disk.
        mock_write = mock_open()
        with patch.object(builtins, 'open', mock_write, create=True) as patched_open:
            api_cache.close()

        if cache_written:
            return assert_cache_written(mock_write, patched_open)

        return assert_cache_not_written(mock_write)

    def test_create_cache(self):
        # Opening and closing without modification should round-trip the
        # initial contents unchanged.
        initial_contents = {
            'banana': {
                'apple': ['pear', 'panda'],
                'sumo': False,
                'rebel_base_count': 42
            },
            'skiddo': 'Fo Sure',
            'pi': 3.1415
        }

        api_cache = self._open_cache(initial_contents)
        final_contents = self._close_cache(api_cache)
        T.assert_equal(initial_contents, final_contents)

    def test_persist_objects(self):
        # Values stored via cache_value should be readable back and written
        # out verbatim on close.
        contents_to_load = {
            'api1': {
                'key1': 'value1',
                'key2': 11,
                'key3': {'some': 'dict'},
                'key4': ['a', 'list']
            },
            'api2': {
                'key1': 'value42',
                'key4': 'lavash bread'
            }
        }

        # Open an empty cache
        api_cache = self._open_cache()

        # Load the cache
        for api_name in contents_to_load.keys():
            for key in contents_to_load[api_name]:
                api_cache.cache_value(api_name, key, contents_to_load[api_name][key])

        # Verify the cache
        for api_name in contents_to_load.keys():
            for key in contents_to_load[api_name]:
                expected_val = contents_to_load[api_name][key]
                actual_val = api_cache.lookup_value(api_name, key)
                T.assert_equal(expected_val, actual_val)

        # Close the cache
        final_contents = self._close_cache(api_cache)
        T.assert_equal(contents_to_load, final_contents)

    def test_do_not_update_cache(self):
        # With update_cache=False, closing must not write anything to disk.
        initial_contents = {
            'api1': {
                'bingo': 'woohoo'
            },
            'api2': {
                'bongo': 'boo'
            }
        }
        api_cache = self._open_cache(initial_contents, False)
        final_contents = self._close_cache(api_cache, cache_written=False)
        T.assert_equal(None, final_contents)
138 |
--------------------------------------------------------------------------------
/tests/virustotal_test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | import testify as T
4 | from mock import patch
5 | from mock import ANY
6 |
7 | from threat_intel.virustotal import VirusTotalApi
8 |
9 |
class VirusTotalApiTest(T.TestCase):

    """Tests requesting reports from VirusTotalApi."""

    @T.setup
    def setup_vt(self):
        # The key is never validated: every test mocks out the HTTP layer.
        self.vt = VirusTotalApi('test_key')

    def _test_api_call(self, call, endpoint, request, expected_query_params, api_response, expected_result):
        """
        Tests a VirusTotalApi call by mocking out the HTTP request.

        Args:
            call: function in VirusTotalApi to call.
            endpoint: endpoint of VirusTotal API that is hit (appended to base url)
            request: call arguments
            expected_query_params: query parameters that should be passed to API
            api_response: the expected response by the API
            expected_result: what call should return (given the api response provided)
        """
        with patch.object(self.vt, '_requests') as request_mock:
            request_mock.multi_get.return_value = api_response
            result = call(request)
            param_list = [self.vt.BASE_DOMAIN + endpoint.format(param) for param in expected_query_params]
            request_mock.multi_get.assert_called_with(param_list, file_download=ANY)
            T.assert_equal(result, expected_result)

    def test_get_file_reports(self):
        self._test_api_call(call=self.vt.get_file_reports,
                            endpoint='files/{}',
                            request=['file1', 'file2'],
                            expected_query_params=['file1', 'file2'],
                            api_response=[{'data': {'id': 'file1'}}, {'data': {'id': 'file2'}}],
                            expected_result={'file1': {'data': {'id': 'file1'}},
                                             'file2': {'data': {'id': 'file2'}}})

    def test_get_file_behaviour(self):
        self._test_api_call(call=self.vt.get_file_behaviour,
                            endpoint='files/{}/behaviours',
                            request=['file1', 'file2'],
                            expected_query_params=['file1', 'file2'],
                            api_response=[{'data': {'id': 'file1'}}, {'data': {'id': 'file2'}}],
                            expected_result={'file1': {'data': {'id': 'file1'}},
                                             'file2': {'data': {'id': 'file2'}}})

    def test_get_file_download(self):
        self._test_api_call(call=self.vt.get_file_download,
                            endpoint='files/{}/download',
                            request=['file1', 'file2'],
                            expected_query_params=['file1', 'file2'],
                            api_response=[{'data': {'id': 'file1'}}, {'data': {'id': 'file2'}}],
                            expected_result={'file1': {'data': {'id': 'file1'}},
                                             'file2': {'data': {'id': 'file2'}}})

    def test_get_domain_reports(self):
        self._test_api_call(call=self.vt.get_domain_reports,
                            endpoint='domains/{}',
                            request=['domain1', 'domain2'],
                            expected_query_params=['domain1', 'domain2'],
                            api_response=[{}, {}],
                            expected_result={'domain1': {},
                                             'domain2': {}})

    def test_get_url_reports(self):
        self._test_api_call(call=self.vt.get_url_reports,
                            endpoint='urls/{}',
                            request=['url1', 'url2'],
                            expected_query_params=['url1', 'url2'],
                            api_response=[{'data': {'id': 'url1'}}, {'data': {'id': 'url2'}}],
                            expected_result={'url1': {'data': {'id': 'url1'}},
                                             'url2': {'data': {'id': 'url2'}}})

    def test_get_ip_reports(self):
        self._test_api_call(call=self.vt.get_ip_reports,
                            endpoint='ip_addresses/{}',
                            request=['ip1', 'ip2'],
                            expected_query_params=['ip1', 'ip2'],
                            api_response=[{}, {}],
                            expected_result={'ip1': {},
                                             'ip2': {}})

    def test_get_file_contacted_domains(self):
        self._test_api_call(call=self.vt.get_file_contacted_domains,
                            endpoint='files/{}/contacted_domains',
                            request=['domain1', 'domain2'],
                            expected_query_params=['domain1', 'domain2'],
                            api_response=[{'data': {'id': 'domain1'}}, {'data': {'id': 'domain2'}}],
                            expected_result={'domain1': {'data': {'id': 'domain1'}},
                                             'domain2': {'data': {'id': 'domain2'}}})

    def test_get_file_contacted_ips(self):
        self._test_api_call(call=self.vt.get_file_contacted_ips,
                            endpoint='files/{}/contacted_ips',
                            request=['file1', 'file2'],
                            expected_query_params=['file1', 'file2'],
                            api_response=[{'data': {'id': 'file1'}}, {'data': {'id': 'file2'}}],
                            expected_result={'file1': {'data': {'id': 'file1'}},
                                             'file2': {'data': {'id': 'file2'}}})

    def test_get_file_contacted_urls(self):
        self._test_api_call(call=self.vt.get_file_contacted_urls,
                            endpoint='files/{}/contacted_urls',
                            request=['file1', 'file2'],
                            expected_query_params=['file1', 'file2'],
                            api_response=[{'data': {'id': 'file1'}}, {'data': {'id': 'file2'}}],
                            expected_result={'file1': {'data': {'id': 'file1'}},
                                             'file2': {'data': {'id': 'file2'}}})

    def test_get_file_itw_urls(self):
        self._test_api_call(call=self.vt.get_file_itw_urls,
                            endpoint='files/{}/itw_urls',
                            request=['file1', 'file2'],
                            expected_query_params=['file1', 'file2'],
                            api_response=[{'data': {'id': 'file1'}}, {'data': {'id': 'file2'}}],
                            expected_result={'file1': {'data': {'id': 'file1'}},
                                             'file2': {'data': {'id': 'file2'}}})

    def test_get_domain_communicating_files(self):
        self._test_api_call(call=self.vt.get_domain_communicating_files,
                            endpoint='domains/{}/communicating_files',
                            request=['domain1', 'domain2'],
                            expected_query_params=['domain1', 'domain2'],
                            api_response=[{'data': {'id': 'domain1'}}, {'data': {'id': 'domain2'}}],
                            expected_result={'domain1': {'data': {'id': 'domain1'}},
                                             'domain2': {'data': {'id': 'domain2'}}})

    def test_get_domain_referrer_files(self):
        self._test_api_call(call=self.vt.get_domain_referrer_files,
                            endpoint='domains/{}/referrer_files',
                            request=['domain1', 'domain2'],
                            expected_query_params=['domain1', 'domain2'],
                            api_response=[{'data': {'id': 'domain1'}}, {'data': {'id': 'domain2'}}],
                            expected_result={'domain1': {'data': {'id': 'domain1'}},
                                             'domain2': {'data': {'id': 'domain2'}}})

    # NOTE: the original file defined test_get_domain_reports a second time
    # here; the duplicate shadowed the first definition so one of the two
    # never ran. The duplicate has been removed.

    def test_get_file_clusters(self):
        self._test_api_call(call=self.vt.get_file_clusters,
                            endpoint='feeds/file-behaviours/{}',
                            request=['time1', 'time2'],
                            expected_query_params=['time1', 'time2'],
                            api_response=[{'data': {'id': 'time1'}}, {'data': {'id': 'time2'}}],
                            expected_result={'time1': {'data': {'id': 'time1'}},
                                             'time2': {'data': {'id': 'time2'}}})
161 |
--------------------------------------------------------------------------------
/tests/util/http_test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import logging
3 | from itertools import chain
4 |
5 | import testify as T
6 | from mock import MagicMock
7 | from requests.models import Response
8 | from requests_futures import sessions
9 |
10 | from threat_intel.exceptions import InvalidRequestError
11 | from threat_intel.util.http import MultiRequest
12 |
13 |
class MultiRequestTest(T.TestCase):

    """Tests MultiRequest's batching, retry and error-handling behavior with mocked HTTP sessions."""

    def mock_ok_responses(self, number_of_responses):
        """Creates `number_of_responses` responses, all with the "200 OK" HTTP status code."""
        responses = [Response() for _ in range(number_of_responses)]
        for response in responses:
            response.status_code = 200
            response._content = u'{"Director": "Alejandro González Iñárritu"}'.encode('utf-8')
        return responses

    def mock_forbidden_response(self, response):
        """Mocks a forbidden response by changing its status code to 403 and the content to indicate the error."""
        response.status_code = 403
        response._content = u'Forbidden'.encode('utf-8')

    def mock_unsuccessful_response(self, response):
        """Mocks an unsuccessful response by changing its status code to 500 and the content to indicate the error."""
        response.status_code = 500
        response._content = u'Internal Server Error'.encode('utf-8')

    def mock_not_found_response(self, response):
        """Mocks a 404 response by changing its status code."""
        response.status_code = 404
        response._content = u'Not Found'.encode('utf-8')

    def mock_unsuccessful_responses(self, responses):
        """Mocks unsuccessful responses by changing their status code to 500 and the content to indicate the error."""
        for response in responses:
            self.mock_unsuccessful_response(response)

    def mock_json_convertion_error(self, response):
        """Mocks the exception raised in case a response cannot be converted to JSON.
        Based on http://docs.python-requests.org/en/master/user/quickstart/#json-response-content
        """
        response.json = MagicMock(side_effect=ValueError('No JSON object could be decoded'))
        response._content = u'This is not JSON'.encode('utf-8')
        response.request = MagicMock()
        # this is necessary for the log message referencing the URL
        response.request.response = response

    def mock_request_futures(self, responses):
        """Mocks session.request method call returning `responses`.

        NOTE(review): this replaces FuturesSession.get/post class attributes
        without restoring them, so the patch leaks between tests; tolerated
        here only because every test re-patches before issuing requests.
        """
        mock_responder = MagicMock(name='requests_session')
        mock_responder.return_value.result = MagicMock(side_effect=responses)
        sessions.FuturesSession.get = mock_responder
        sessions.FuturesSession.post = mock_responder
        return mock_responder

    def test_multi_get_none_response(self):
        """Tests the behavior of the `multi_get()` method when one of the responses is `None`."""
        number_of_requests = 10
        query_params = [{'Jim Bridger': 'Will Poulter'}] * number_of_requests
        responses = self.mock_ok_responses(number_of_requests)
        responses[3] = None
        self.mock_request_futures(responses)

        actual_responses = MultiRequest(max_retry=1).multi_get('example.com', query_params)

        T.assert_equals(10, len(actual_responses))
        T.assert_is(actual_responses[3], None)

    def test_multi_get_access_forbidden(self):
        """Tests the exception handling in the cases when a request returns "403 Forbidden"."""
        number_of_requests = 20
        query_params = [{'Hugh Glass': 'Leonardo DiCaprio'}] * number_of_requests
        responses = self.mock_ok_responses(number_of_requests)
        self.mock_forbidden_response(responses[13])
        self.mock_request_futures(responses)

        with T.assert_raises_such_that(InvalidRequestError, lambda e: T.assert_equal(str(e), 'Access forbidden')):
            MultiRequest().multi_get('example.com', query_params)

    def test_multi_get_max_retry(self):
        """Tests the case when the number of the maximum retries is reached, due to the unsuccessful responses.
        Request is repeated 3 times (based on `max_retry`), each time there is only one successful response.
        Eventually the call to `multi_get` returns the responses among which one is unsuccessful (`None`).
        """
        number_of_requests = 4
        query_params = [{'John Fitzgerald': 'Tom Hardy'}] * number_of_requests
        responses_to_calls = [
            self.mock_ok_responses(number_of_requests),
            self.mock_ok_responses(number_of_requests - 1),
            self.mock_ok_responses(number_of_requests - 2),
        ]
        # mock unsuccessful responses to the first call
        self.mock_unsuccessful_responses(responses_to_calls[0][0:3])
        # mock unsuccessful responses to the second call
        self.mock_unsuccessful_responses(responses_to_calls[1][1:3])
        # mock unsuccessful response to the third call
        self.mock_unsuccessful_response(responses_to_calls[2][1])
        get_mock = self.mock_request_futures(chain.from_iterable(responses_to_calls))

        actual_responses = MultiRequest(max_retry=3).multi_get('example.com', query_params)

        T.assert_equal(get_mock.call_count, 9)
        T.assert_is(actual_responses[2], None)

    def test_multi_get_response_to_json(self):
        """Tests the exception handling in the cases when the response was supposed to return JSON but did not."""
        number_of_requests = 5
        query_params = [{'Andrew Henry': 'Domhnall Gleeson'}] * number_of_requests
        responses = self.mock_ok_responses(number_of_requests)
        self.mock_json_convertion_error(responses[3])
        self.mock_request_futures(responses)
        logging.warning = MagicMock()

        actual_responses = MultiRequest().multi_get('example.com', query_params)

        T.assert_equals(5, len(actual_responses))
        T.assert_is(actual_responses[3], None)
        # BUG FIX: the original called `logging.warning.called_once_with(...)`,
        # which is not a Mock assertion method -- it merely creates a child
        # mock attribute and always "passes". Assert the warning really fired
        # for the one response that failed JSON conversion.
        T.assert_equal(logging.warning.call_count, 1)

    def test_multi_get_retry_only_unsuccessful_requests(self):
        """Tests whether only the unsuccessful requests are passed to the consecutive request calls.
        3 unsuccessful responses to the first request batch and then 2 unsuccessful responses to the second.
        The third (and the last) returns successful responses only.
        """
        responses_to_calls = [
            self.mock_ok_responses(10),
            self.mock_ok_responses(3),
            self.mock_ok_responses(2),
        ]
        # mock unsuccessful responses to the first call
        unsuccessful_responses_first_call = [
            responses_to_calls[0][2],
            responses_to_calls[0][3],
            responses_to_calls[0][5],
        ]
        self.mock_unsuccessful_responses(unsuccessful_responses_first_call)
        # mock unsuccessful responses to the second call
        unsuccessful_responses_second_call = [
            responses_to_calls[1][0],
            responses_to_calls[1][2],
        ]
        self.mock_unsuccessful_responses(unsuccessful_responses_second_call)
        mock_get = self.mock_request_futures(chain.from_iterable(responses_to_calls))

        query_params = [
            {'Max Rockatansky': 'Tom Hardy'},
            {'Imperator Furiosa': 'Charlize Theron'},
            {'Nux': 'Nicholas Hoult'},
            {'Immortan Joe': 'Hugh Keays-Byrne'},
            {'Slit': 'Josh Helman'},
            {'Rictus Erectus': 'Nathan Jones'},
            {'Toast the Knowing': 'Zoë Kravitz'},
            {'The Splendid Angharad': 'Rosie Huntington-Whiteley'},
            {'Capable': 'Riley Keough'},
            {'The Dag': 'Abbey Lee'},
        ]

        MultiRequest().multi_get('example.com', query_params)
        T.assert_equal(mock_get.call_count, 15)  # 10 + 3 + 2
        call_params = [kwargs['params'] for args, kwargs in mock_get.call_args_list]
        # Assert retries
        call_params_keys = [list(cp.keys())[0] for cp in call_params]
        T.assert_equal(call_params_keys.count('Nux'), 3)
        T.assert_equal(call_params_keys.count('Immortan Joe'), 2)
        T.assert_equal(call_params_keys.count('Rictus Erectus'), 3)

    def test_multi_get_drop_404s(self):
        """With drop_404s=True a "404 Not Found" response becomes `None` instead of triggering retries."""
        responses_to_calls = self.mock_ok_responses(3)
        self.mock_not_found_response(responses_to_calls[1])
        query_params = [{'Hugh Glass': 'Leonardo DiCaprio'}] * 3
        get_mock = self.mock_request_futures(responses_to_calls)
        result = MultiRequest(drop_404s=True).multi_get('example.org', query_params)
        T.assert_equal(get_mock.call_count, 3)
        T.assert_is(result[1], None)
184 |
--------------------------------------------------------------------------------
/threat_intel/opendns.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # InvestigateApi makes calls to the OpenDNS Investigate API.
4 | #
5 | from warnings import warn
6 |
7 | import simplejson
8 | from six.moves import range
9 |
10 | from threat_intel.util.api_cache import ApiCache
11 | from threat_intel.util.error_messages import write_error_message
12 | from threat_intel.util.error_messages import write_exception
13 | from threat_intel.util.http import MultiRequest
14 |
15 |
16 | def _cached_by_domain(api_name):
17 | """A caching wrapper for functions that take a list of domains as
18 | parameters.
19 |
20 | Raises:
21 | ResponseError - if the response received from the endpoint is
22 | not valid.
23 | """
24 |
25 | def wrapped(func):
26 | def decorated(self, domains):
27 | if not self._cache:
28 | return func(self, domains)
29 |
30 | all_responses = self._cache.bulk_lookup(api_name, domains)
31 | domains = list(set(domains) - set(all_responses))
32 |
33 | if domains:
34 | response = func(self, domains)
35 |
36 | if not response:
37 | raise ResponseError("No response for uncached domains")
38 |
39 | for domain in response:
40 | self._cache.cache_value(api_name, domain, response[domain])
41 | all_responses[domain] = response[domain]
42 |
43 | return all_responses
44 | return decorated
45 | return wrapped
46 |
47 |
class InvestigateApi(object):

    """Calls the OpenDNS investigate API.

    Applies rate limits and issues parallel requests.
    """

    BASE_URL = u'https://investigate.api.umbrella.com/'

    # TODO: consider moving this to a config file
    MAX_DOMAINS_IN_POST = 1000

    def __init__(self, api_key, cache_file_name=None, update_cache=True, req_timeout=None):
        """
        Args:
            api_key: OpenDNS Investigate API token (sent as a Bearer auth header)
            cache_file_name: path to a JSON cache file; caching is disabled when None
            update_cache: whether the cache is written back to disk when closed
            req_timeout: per-request timeout passed through to MultiRequest
        """
        auth_header = {'Authorization': 'Bearer {0}'.format(api_key)}
        self._requests = MultiRequest(
            default_headers=auth_header, max_requests=12, rate_limit=30,
            req_timeout=req_timeout, drop_404s=True,
        )

        # Create an ApiCache if instructed to
        self._cache = ApiCache(cache_file_name, update_cache) if cache_file_name else None

    @classmethod
    def _to_url(cls, url_path):
        """Prepends the API base URL to `url_path`."""
        try:
            return u'{0}{1}'.format(cls.BASE_URL, url_path)
        except Exception as e:
            write_error_message(url_path)
            write_exception(e)
            # Bare `raise` keeps the original traceback; `raise e` would
            # re-raise from this frame instead.
            raise

    @classmethod
    def _to_urls(cls, fmt_url_path, url_path_args):
        """Builds full URLs by formatting `fmt_url_path` with each of `url_path_args`."""
        url_paths = []
        for path_arg in url_path_args:
            try:
                url_paths.append(fmt_url_path.format(path_arg))
            except Exception as e:
                write_error_message(path_arg)
                write_exception(e)
                # Preserve the original traceback (was `raise e`).
                raise

        return [cls._to_url(url_path) for url_path in url_paths]

    @MultiRequest.error_handling
    def _multi_post(self, url_path, domains):
        """POSTs `domains` to `url_path` in batches of at most MAX_DOMAINS_IN_POST.

        Returns:
            A single dict merging the per-batch response dicts.
        """
        data = [simplejson.dumps(domains[pos:pos + self.MAX_DOMAINS_IN_POST]) for pos in range(0, len(domains), self.MAX_DOMAINS_IN_POST)]
        # multi_post() returns list of dictionaries, so they need to be merged into one dict
        all_responses = self._requests.multi_post(self._to_url(url_path), data=data)
        responses = {}
        for r in all_responses:
            responses.update(r)
        return responses

    @_cached_by_domain(api_name='opendns-categorization')
    def categorization(self, domains):
        """Calls categorization end point and adds an 'is_suspicious' key to each response.

        Args:
            domains: An enumerable of domains
        Returns:
            A dict of {domain: categorization_result}
        """
        url_path = u'domains/categorization/?showLabels'
        return self._multi_post(url_path, domains)

    @_cached_by_domain(api_name='opendns-domain_score')
    def domain_score(self, domains):
        """Calls domain scores endpoint.

        This method is deprecated since OpenDNS Investigate API
        endpoint is also deprecated.
        """
        warn(
            'OpenDNS Domain Scores endpoint is deprecated. Use '
            'InvestigateApi.categorization() instead', DeprecationWarning,
        )
        url_path = 'domains/score/'
        return self._multi_post(url_path, domains)

    @MultiRequest.error_handling
    def _multi_get(self, cache_api_name, fmt_url_path, url_params, query_params=None):
        """Makes multiple GETs to an OpenDNS endpoint.

        Args:
            cache_api_name: string api_name for caching
            fmt_url_path: format string for building URL paths
            url_params: An enumerable of strings used in building URLs
            query_params - None / dict / list of dicts containing query params
        Returns:
            A dict of {url_param: api_result}
        """
        all_responses = {}

        if self._cache:
            all_responses = self._cache.bulk_lookup(cache_api_name, url_params)
            # Only fetch what the cache could not answer
            # (membership test on the dict itself; .keys() was redundant).
            url_params = [key for key in url_params if key not in all_responses]

        if url_params:
            urls = self._to_urls(fmt_url_path, url_params)
            responses = self._requests.multi_get(urls, query_params)
            for url_param, response in zip(url_params, responses):
                if self._cache:
                    self._cache.cache_value(cache_api_name, url_param, response)
                all_responses[url_param] = response

        return all_responses

    def security(self, domains):
        """Calls security end point and adds an 'is_suspicious' key to each response.

        Args:
            domains: An enumerable of strings
        Returns:
            A dict of {domain: security_result}
        """
        api_name = 'opendns-security'
        fmt_url_path = u'security/name/{0}.json'
        return self._multi_get(api_name, fmt_url_path, domains)

    def whois_emails(self, emails):
        """Calls WHOIS Email end point

        Args:
            emails: An enumerable of string Emails
        Returns:
            A dict of {email: domain_result}
        """
        api_name = 'opendns-whois-emails'
        fmt_url_path = u'whois/emails/{0}'
        return self._multi_get(api_name, fmt_url_path, emails)

    def whois_nameservers(self, nameservers):
        """Calls WHOIS Nameserver end point

        Args:
            nameservers: An enumerable of nameservers
        Returns:
            A dict of {nameserver: domain_result}
        """
        api_name = 'opendns-whois-nameservers'
        fmt_url_path = u'whois/nameservers/{0}'
        return self._multi_get(api_name, fmt_url_path, nameservers)

    def whois_domains(self, domains):
        """Calls WHOIS domain end point

        Args:
            domains: An enumerable of domains
        Returns:
            A dict of {domain: domain_result}
        """
        api_name = 'opendns-whois-domain'
        fmt_url_path = u'whois/{0}'
        return self._multi_get(api_name, fmt_url_path, domains)

    def whois_domains_history(self, domains):
        """Calls WHOIS domain history end point

        Args:
            domains: An enumerable of domains
        Returns:
            A dict of {domain: domain_history_result}
        """
        api_name = 'opendns-whois-domain-history'
        fmt_url_path = u'whois/{0}/history'
        return self._multi_get(api_name, fmt_url_path, domains)

    def cooccurrences(self, domains):
        """Get the domains related to input domains.

        Args:
            domains: an enumerable of strings domain names
        Returns:
            An enumerable of string domain names
        """
        api_name = 'opendns-cooccurrences'
        fmt_url_path = u'recommendations/name/{0}.json'
        return self._multi_get(api_name, fmt_url_path, domains)

    def domain_tag(self, domains):
        """Get the date range when a domain is part of OpenDNS block list.

        Args:
            domains: an enumerable of strings domain names
        Returns:
            An enumerable of string with period, category, and url
        """
        api_name = 'opendns-domain_tag'
        fmt_url_path = u'domains/{0}/latest_tags'
        return self._multi_get(api_name, fmt_url_path, domains)

    def related_domains(self, domains):
        """Get list of domain names that have been seen requested around the
        same time (up to 60 seconds before or after) to the given domain name.

        Args:
            domains: an enumerable of strings domain names
        Returns:
            An enumerable of [domain name, scores]
        """
        api_name = 'opendns-related_domains'
        fmt_url_path = u'links/name/{0}.json'
        return self._multi_get(api_name, fmt_url_path, domains)

    def rr_history(self, ips):
        """Get the domains related to input ips.

        Args:
            ips: an enumerable of strings as ips
        Returns:
            An enumerable of resource records and features
        """
        api_name = 'opendns-rr_history'
        fmt_url_path = u'dnsdb/ip/a/{0}.json'
        return self._multi_get(api_name, fmt_url_path, ips)

    def dns_rr(self, ips):
        """Get the DNS A resource records for the input domain names.

        NOTE(review): despite its name, the `ips` parameter takes domain
        names (the endpoint is dnsdb/name/a/...); the parameter name is
        kept for backward compatibility with existing callers.

        Args:
            ips: an enumerable of strings as domain names
        Returns:
            An enumerable of resource records and features
        """
        api_name = 'opendns-dns_rr'
        fmt_url_path = u'dnsdb/name/a/{0}.json'
        return self._multi_get(api_name, fmt_url_path, ips)

    def latest_malicious(self, ips):
        """Get a list of malicious domains related to input ips.

        Args:
            ips: an enumerable of strings as ips
        Returns:
            An enumerable of strings for the malicious domains
        """
        api_name = 'opendns-latest_malicious'
        fmt_url_path = u'ips/{0}/latest_domains'
        return self._multi_get(api_name, fmt_url_path, ips)

    def sample(self, hashes):
        """Get the information about a sample based on its hash.

        Args:
            hashes: an enumerable of strings as hashes
        Returns:
            An enumerable of arrays which contains the information
            about the original samples
        """
        api_name = 'opendns-sample'
        fmt_url_path = u'sample/{0}'
        return self._multi_get(api_name, fmt_url_path, hashes)

    def search(self, patterns, start=30, limit=1000, include_category=False):
        """Performs pattern searches against the Investigate database.

        Args:
            patterns: An enumerable of RegEx domain patterns to search for
            start: How far back results extend from in days (max is 30)
            limit: Number of results to show (max is 1000)
            include_category: Include OpenDNS security categories
        Returns:
            An enumerable of matching domain strings
        """
        api_name = 'opendns-patterns'
        fmt_url_path = u'search/{0}'
        start = '-{0}days'.format(start)
        include_category = str(include_category).lower()
        query_params = {
            'start': start,
            'limit': limit,
            'includecategory': include_category,
        }
        return self._multi_get(api_name, fmt_url_path, patterns, query_params)

    def risk_score(self, domains):
        """Performs Umbrella risk score analysis on the input domains

        Args:
            domains: an enumerable of domains
        Returns:
            An enumerable of associated domain risk scores
        """
        api_name = 'opendns-risk_score'
        fmt_url_path = u'domains/risk-score/{0}'
        return self._multi_get(api_name, fmt_url_path, domains)
335 |
336 |
class ResponseError(Exception):

    """Signals that an endpoint returned an empty or otherwise invalid response."""
340 |
--------------------------------------------------------------------------------
/tests/opendns_test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | from six.moves import builtins
4 | import testify as T
5 | from mock import ANY
6 | from mock import mock_open
7 | from mock import patch
8 |
9 | from threat_intel.opendns import InvestigateApi
10 | from threat_intel.opendns import ResponseError
11 | from threat_intel.util.api_cache import ApiCache
12 | from threat_intel.util.http import MultiRequest
13 |
14 |
15 | class InvestigateApiTest(T.TestCase):
16 |
17 | """Tests requesting reports from OpenDNS."""
18 |
    @T.setup
    def setup_opendns(self):
        # Runs before every test. The key is never validated because each
        # test mocks out the HTTP layer before issuing requests.
        self.opendns = InvestigateApi('test_key')
22 |
23 | def _patch_and_assert_categorization(self, all_responses, expected_responses, domains, expected_url, expected_data):
24 | with patch.object(MultiRequest, 'multi_post', autospec=True, return_value=all_responses) as patched_multi_post:
25 | actual_responses = self.opendns.categorization(domains)
26 |
27 | patched_multi_post.assert_called_with(ANY, expected_url, data=expected_data)
28 | assert expected_responses == actual_responses
29 |
30 | def test_categorization(self):
31 | domains = ['yellowstone.org', 'zion.org', 'sequoia.org', 'greatsanddunes.org']
32 | all_responses = [
33 | {
34 | u'yellowstone.org': {
35 | u'content_categories': [u'National Parks'],
36 | u'security_categories': [],
37 | u'status': 1
38 | },
39 | u'zion.org': {
40 | u'content_categories': [u'National Parks'],
41 | u'security_categories': [],
42 | u'status': 1
43 | },
44 | u'sequoia.org': {
45 | u'content_categories': [u'National Parks'],
46 | u'security_categories': [],
47 | u'status': 1
48 | },
49 | u'greatsanddunes.org': {
50 | u'content_categories': [u'National Parks'],
51 | u'security_categories': [],
52 | u'status': 1
53 | }
54 | }
55 | ]
56 |
57 | expected_url = u'https://investigate.api.umbrella.com/domains/categorization/?showLabels'
58 | expected_data = ['["yellowstone.org", "zion.org", "sequoia.org", "greatsanddunes.org"]']
59 | expected_responses = all_responses[0]
60 |
61 | self._patch_and_assert_categorization(all_responses, expected_responses, domains, expected_url, expected_data)
62 |
63 | def test_categorization_domains_limit(self):
64 | self.opendns.MAX_DOMAINS_IN_POST = 2
65 | domains = [
66 | 'northyorkmoors.org.uk', 'peakdistrict.org.uk',
67 | 'cairngorms.org.uk', 'pembrokeshirecoast.org.uk',
68 | 'northumberland.org.uk']
69 | all_responses = [
70 | {
71 | u'northyorkmoors.org.uk': {
72 | u'content_categories': [u'National Parks'],
73 | u'security_categories': [],
74 | u'status': 1
75 | },
76 | u'peakdistrict.org.uk': {
77 | u'content_categories': [u'National Parks'],
78 | u'security_categories': [],
79 | u'status': 1
80 | },
81 | },
82 | {
83 | u'cairngorms.org.uk': {
84 | u'content_categories': [u'National Parks'],
85 | u'security_categories': [],
86 | u'status': 1
87 | },
88 | u'pembrokeshirecoast.org.uk': {
89 | u'content_categories': [u'National Parks'],
90 | u'security_categories': [],
91 | u'status': 1
92 | },
93 | },
94 | {
95 | u'northumberland.org.uk': {
96 | u'content_categories': [u'National Parks'],
97 | u'security_categories': [],
98 | u'status': 1
99 | }
100 | }
101 | ]
102 |
103 | expected_data = [
104 | '["northyorkmoors.org.uk", "peakdistrict.org.uk"]',
105 | '["cairngorms.org.uk", "pembrokeshirecoast.org.uk"]',
106 | '["northumberland.org.uk"]']
107 | expected_responses = {
108 | u'northyorkmoors.org.uk': {
109 | u'content_categories': [u'National Parks'],
110 | u'security_categories': [],
111 | u'status': 1
112 | },
113 | u'peakdistrict.org.uk': {
114 | u'content_categories': [u'National Parks'],
115 | u'security_categories': [],
116 | u'status': 1
117 | },
118 | u'cairngorms.org.uk': {
119 | u'content_categories': [u'National Parks'],
120 | u'security_categories': [],
121 | u'status': 1
122 | },
123 | u'pembrokeshirecoast.org.uk': {
124 | u'content_categories': [u'National Parks'],
125 | u'security_categories': [],
126 | u'status': 1
127 | },
128 | u'northumberland.org.uk': {
129 | u'content_categories': [u'National Parks'],
130 | u'security_categories': [],
131 | u'status': 1
132 | }
133 | }
134 |
135 | self._patch_and_assert_categorization(all_responses, expected_responses, domains, ANY, expected_data)
136 |
137 | def test_categorization_response_error(self):
138 | """Tests whether the ResponseError is raised when the response
139 | returned from the actual API call is empty.
140 | """
141 | domains = ['yosemite.gov', 'joushuatree.gov', 'deathvalley.gov']
142 | # empty responses should raise an error
143 | all_responses = [{}]
144 |
145 | # mock cache file
146 | mock_read = mock_open(read_data="{}")
147 |
148 | with patch.object(
149 | builtins, 'open', mock_read, create=True
150 | ), patch.object(
151 | ApiCache, 'bulk_lookup', autospec=True, return_value={}
152 | ), patch.object(
153 | MultiRequest, 'multi_post', autospec=True, return_value=all_responses
154 | ):
155 | i = InvestigateApi('hocus pocus', 'cache.json')
156 | with T.assert_raises(ResponseError):
157 | i.categorization(domains)
158 |
159 | def _test_api_call_get(self, call, endpoint, request, expected_url_params,
160 | api_response, expected_result, expected_query_params=None):
161 | """
162 | Tests a OpenDNS call by mocking out the HTTP GET request.
163 |
164 | Args:
165 | call: function in OpenDNSApi to call.
166 | endpoint: endpoint of OpenDNS API that is hit (appended to base url)
167 | request: call arguments
168 | expected_url_params: URL parameters that should be passed to API
169 | api_response: the expected response by the API
170 | expected_result: what call should return (given the api response provided)
171 | expected_query_params: query parameters that should be passed to API
172 | """
173 | with patch.object(self.opendns, '_requests') as request_mock:
174 | request_mock.multi_get.return_value = api_response
175 | result = call(request)
176 |
177 | url = self.opendns._to_url(endpoint.format(expected_url_params))
178 | request_mock.multi_get.assert_called_with([url], expected_query_params)
179 | T.assert_equal(result, expected_result)
180 |
181 | def test_security(self):
182 | self._test_api_call_get(call=self.opendns.security,
183 | endpoint=u'security/name/{0}.json',
184 | request=['domain'],
185 | expected_url_params='domain',
186 | api_response={},
187 | expected_result={})
188 |
189 | def test_whois_emails(self):
190 | self._test_api_call_get(call=self.opendns.whois_emails,
191 | endpoint=u'whois/emails/{0}',
192 | request=['admin@dns.com'],
193 | expected_url_params='admin@dns.com',
194 | api_response={},
195 | expected_result={})
196 |
197 | def test_whois_nameservers(self):
198 | self._test_api_call_get(call=self.opendns.whois_nameservers,
199 | endpoint=u'whois/nameservers/{0}',
200 | request=['ns.dns.com'],
201 | expected_url_params='ns.dns.com',
202 | api_response={},
203 | expected_result={})
204 |
205 | def test_whois_domains(self):
206 | self._test_api_call_get(call=self.opendns.whois_domains,
207 | endpoint=u'whois/{0}',
208 | request=['google.com'],
209 | expected_url_params='google.com',
210 | api_response={},
211 | expected_result={})
212 |
213 | def test_whois_domains_history(self):
214 | self._test_api_call_get(call=self.opendns.whois_domains_history,
215 | endpoint=u'whois/{0}/history',
216 | request=['5esb.biz'],
217 | expected_url_params='5esb.biz',
218 | api_response={},
219 | expected_result={})
220 |
221 | def test_coocurrences(self):
222 | self._test_api_call_get(call=self.opendns.cooccurrences,
223 | endpoint=u'recommendations/name/{0}.json',
224 | request=['domain'],
225 | expected_url_params='domain',
226 | api_response={},
227 | expected_result={})
228 |
229 | def test_rr_history(self):
230 | self._test_api_call_get(call=self.opendns.rr_history,
231 | endpoint=u'dnsdb/ip/a/{0}.json',
232 | request=['8.8.8.8'],
233 | expected_url_params='8.8.8.8',
234 | api_response={},
235 | expected_result={})
236 |
237 | def test_latest_malicious(self):
238 | self._test_api_call_get(call=self.opendns.latest_malicious,
239 | endpoint=u'ips/{0}/latest_domains',
240 | request=['8.8.8.8'],
241 | expected_url_params='8.8.8.8',
242 | api_response={},
243 | expected_result={})
244 |
245 | def test_domain_tag(self):
246 | self._test_api_call_get(call=self.opendns.domain_tag,
247 | endpoint=u'domains/{0}/latest_tags',
248 | request=['domain'],
249 | expected_url_params='domain',
250 | api_response={},
251 | expected_result={})
252 |
253 | def test_dns_rr(self):
254 | self._test_api_call_get(call=self.opendns.dns_rr,
255 | endpoint=u'dnsdb/name/a/{0}.json',
256 | request=['domain'],
257 | expected_url_params='domain',
258 | api_response={},
259 | expected_result={})
260 |
261 | def test_related_domains(self):
262 | self._test_api_call_get(call=self.opendns.related_domains,
263 | endpoint=u'links/name/{0}.json',
264 | request=['domain'],
265 | expected_url_params='domain',
266 | api_response={},
267 | expected_result={})
268 |
269 | def test_sample(self):
270 | self._test_api_call_get(call=self.opendns.sample,
271 | endpoint=u'sample/{0}',
272 | request=['0492d93195451e41f568f68e7704eb0812bc2b19'],
273 | expected_url_params='0492d93195451e41f568f68e7704eb0812bc2b19',
274 | api_response={},
275 | expected_result={})
276 |
277 | def test_search(self):
278 | self._test_api_call_get(call=self.opendns.search,
279 | endpoint=u'search/{0}',
280 | request=['pattern'],
281 | expected_url_params='pattern',
282 | api_response={},
283 | expected_result={},
284 | expected_query_params={'start': '-30days',
285 | 'includecategory': 'false',
286 | 'limit': 1000})
287 |
288 | def test_risk_score(self):
289 | self._test_api_call_get(call=self.opendns.risk_score,
290 | endpoint=u'domains/risk-score/{0}',
291 | request=['domain'],
292 | expected_url_params='domain',
293 | api_response={},
294 | expected_result={})
295 |
--------------------------------------------------------------------------------
/threat_intel/virustotal.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # VirusTotalApi makes calls to the VirusTotal API.
4 | #
5 | from six.moves import range
6 |
7 | from threat_intel.util.api_cache import ApiCache
8 | from threat_intel.util.http import MultiRequest
9 |
10 |
class VirusTotalApi(object):
    """Client for the VirusTotal v3 REST API with optional local caching.

    Each public ``get_*`` method first consults the JSON file cache (when
    one was configured) and only issues HTTP requests for the resources
    that were not found in it.
    """

    BASE_DOMAIN = u'https://www.virustotal.com/api/v3/'

    def __init__(self, api_key, cache_file_name=None, update_cache=True, req_timeout=None):
        """Establishes basic HTTP params and loads a cache.

        Args:
            api_key: VirusTotal API key
            cache_file_name: String file name of cache.
            update_cache: Determines whether cache should be written out back to the disk when closing it.
                Default is `True`.
            req_timeout: Maximum number of seconds to wait without reading a response byte before deciding an error has occurred.
                Default is None.
        """
        self._requests = MultiRequest(req_timeout=req_timeout, default_headers={'x-apikey': api_key}, drop_404s=True)

        # Create an ApiCache if instructed to
        self._cache = ApiCache(cache_file_name, update_cache) if cache_file_name else None

    @MultiRequest.error_handling
    def get_file_reports(self, file_hash_list):
        """Retrieves the most recent reports for a set of md5, sha1, and/or sha2 hashes.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        api_name = 'virustotal-file-reports'
        api_endpoint = 'files/{}'

        # Delegate to the shared helper instead of duplicating the
        # cache-lookup / fetch / merge sequence inline.
        return self._extract_all_responses(file_hash_list, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_file_behaviour(self, file_hash_list):
        """Retrieves a report about the behaviour of a md5, sha1, and/or sha2 hash of
        a file when executed in a sandboxed environment (Cuckoo sandbox).

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        api_name = 'virustotal-file-behaviour'
        api_endpoint = 'files/{}/behaviours'

        return self._extract_all_responses(file_hash_list, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_file_download(self, file_hash_list):
        """Retrieves a file from its a md5, sha1, and/or sha2 hash.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            a base64encoded string of the file
        """
        api_name = 'virustotal-file-download'
        api_endpoint = 'files/{}/download'
        return self._extract_all_responses(file_hash_list, api_endpoint, api_name, file_download=True)

    @MultiRequest.error_handling
    def get_file_contacted_domains(self, file_hash_list):
        """Retrieves a report about the contacted domains of a md5, sha1, and/or sha2 hash of
        file, when it is executed.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        api_name = 'virustotal-file-contacted-domains'
        api_endpoint = 'files/{}/contacted_domains'

        return self._extract_all_responses(file_hash_list, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_file_contacted_ips(self, file_hash_list):
        """Retrieves a report about the contacted ip addresses of a md5, sha1,
        and/or sha2 hash of file, when it is executed.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        api_name = 'virustotal-file-contacted-ips'
        api_endpoint = 'files/{}/contacted_ips'

        return self._extract_all_responses(file_hash_list, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_file_contacted_urls(self, file_hash_list):
        """Retrieves a report about the contacted urls of a md5, sha1,
        and/or sha2 hash of file, when it is executed.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        api_name = 'virustotal-file-contacted-urls'
        api_endpoint = 'files/{}/contacted_urls'

        return self._extract_all_responses(file_hash_list, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_file_itw_urls(self, file_hash_list):
        """Retrieves a report about the in the wild URLs from where the file
        with the hash has been downloaded.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        api_name = 'virustotal-file-itw-urls'
        api_endpoint = 'files/{}/itw_urls'

        return self._extract_all_responses(file_hash_list, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_domain_communicating_files(self, domain_list):
        """Retrieves a report about the files that communicate with this internet domain.

        Args:
            domain_list: list of string domains.
        Returns:
            A dict with the response id as key and the VT report as value.
        """
        api_name = 'virustotal-domain-communicating-files'
        api_endpoint = 'domains/{}/communicating_files'

        return self._extract_all_responses(domain_list, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_domain_referrer_files(self, domain_list):
        """Retrieves a report about the files containing the internet domain.

        Args:
            domain_list: list of string domains.
        Returns:
            A dict with the response id as key and the VT report as value.
        """
        api_name = 'virustotal-domain-referrer-files'
        api_endpoint = 'domains/{}/referrer_files'

        return self._extract_all_responses(domain_list, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_domain_reports(self, domain_list):
        """Retrieves the most recent VT info for a set of domains.

        Args:
            domain_list: list of string domains.
        Returns:
            A dict with the domain as key and the VT report as value.
        """
        api_name = 'virustotal-domain-reports'

        (all_responses, domain_list) = self._bulk_cache_lookup(api_name, domain_list)
        responses = self._request_reports(domain_list, 'domains/{}')

        # Responses are keyed (and cached) by the queried domain itself
        # rather than by the id embedded in the response, so the generic
        # _extract_response_chunks helper is deliberately not used here.
        for domain, response in zip(domain_list, responses):
            if self._cache:
                self._cache.cache_value(api_name, domain, response)
            all_responses[domain] = response

        return all_responses

    @MultiRequest.error_handling
    def get_feeds_url(self, time_frame):
        """Retrieves a live feed with the latest URLs submitted to VT.

        Args:
            time_frame: a list of timeframe strings in date format YYYYMMDDhhmm.
        Returns:
            A base64 encoded bzip2 compressed UTF-8 text file contains one JSON structure per line.
        """
        api_name = 'virustotal-url-distribution'
        all_responses = {}

        response = self._request_reports(time_frame, 'feeds/urls/{}', file_download=True)
        self._extract_response_chunks(all_responses, response, api_name)

        return all_responses

    @MultiRequest.error_handling
    def get_file_distribution(self, time_frame):
        """Retrieves a live feed with the latest hashes submitted to VT.

        Args:
            time_frame: A list of strings in format YYYYMMDDhhmm.
        Returns:
            A dict with the VT report.
        """
        all_responses = {}
        api_name = 'virustotal-file-distribution'

        response = self._request_reports(time_frame, 'feeds/files/{}')
        self._extract_response_chunks(all_responses, response, api_name)

        return all_responses

    @MultiRequest.error_handling
    def get_url_reports(self, url_hash_list):
        """Retrieves a scan report on a given URL.

        Args:
            url_hash_list: list of sha256 hashed urls.
        Returns:
            A dict with the URL hash as key and the VT report as value.
        """
        api_name = 'virustotal-url-reports'
        api_endpoint = 'urls/{}'

        return self._extract_all_responses(url_hash_list, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_ip_reports(self, ips):
        """Retrieves the most recent VT info for a set of ips.

        Args:
            ips: list of IPs.
        Returns:
            A dict with the IP as key and the VT report as value.
        """
        api_name = 'virustotal-ip-address-reports'

        (all_responses, ips) = self._bulk_cache_lookup(api_name, ips)
        responses = self._request_reports(ips, 'ip_addresses/{}')

        # Keyed by the queried IP (see get_domain_reports for rationale).
        for ip, response in zip(ips, responses):
            if self._cache:
                self._cache.cache_value(api_name, ip, response)
            all_responses[ip] = response

        return all_responses

    @MultiRequest.error_handling
    def get_file_search(self, query):
        """Performs advanced search on samples, matching certain binary/
        metadata/detection criteria.
        Possible queries: file size, file type, first or last submission to
        VT, number of positives, binary content, etc.

        Args:
            query: dictionary with search arguments
            Example: 'query': 'type:peexe size:90kb+ positives:5+ behaviour:"taskkill"'
        Returns:
            A dict with the VT report.
        """
        api_name = 'virustotal-file-search'
        api_endpoint = 'intelligence/search?query={}'

        return self._extract_all_responses(query, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_file_clusters(self, time_frame):
        """Retrieves file similarity clusters for a given time frame.

        Args:
            time_frame: a list of time frames for which we want the clustering details in YYYYMMDDhhmm format.
        Returns:
            A dict with the VT report.
        """
        api_name = 'virustotal-file-clusters'
        api_endpoint = 'feeds/file-behaviours/{}'

        return self._extract_all_responses(time_frame, api_endpoint, api_name)

    def _bulk_cache_lookup(self, api_name, keys):
        """Performs a bulk cache lookup and returns a tuple with the results
        found and the keys missing in the cache. If the cache is not
        configured it will return an empty dictionary of found results and
        the initial list of keys.

        Args:
            api_name: a string name of the API.
            keys: an enumerable of string keys.
        Returns:
            A tuple: (responses found, missing keys).
        """
        if self._cache:
            responses = self._cache.bulk_lookup(api_name, keys)
            missing_keys = [key for key in keys if key not in responses]
            return (responses, missing_keys)

        return ({}, keys)

    def _request_reports(self, ids, endpoint_name, file_download=False):
        """Sends multiples requests for the resources to a particular endpoint.

        Args:
            ids: list of the hash identifying the file.
            endpoint_name: VirusTotal endpoint URL suffix.
            file_download: boolean, whether a file download is expected
        Returns:
            A list of the responses.
        """
        # 'resource_id' instead of 'id' to avoid shadowing the builtin.
        urls = ['{}{}'.format(self.BASE_DOMAIN, endpoint_name.format(resource_id)) for resource_id in ids]
        return self._requests.multi_get(urls, file_download=file_download) if urls else []

    def _extract_cache_id(self, response):
        """Extracts the object hash from the response to be used to
        uniquely identify the result.

        Args:
            response: response object.
        Returns:
            A hash that uniquely identifies the result, or None when the
            response carries no data.
        """
        cache_id = None
        if isinstance(response['data'], list):
            if response['data']:
                # gets the first data items' id
                cache_id = response['data'][0]['id']
        else:
            cache_id = response['data']['id']
        # sandbox id output has an underscore as the separator
        if cache_id and '_' in cache_id:
            cache_id = cache_id.split('_')[0]
        return cache_id

    def _extract_all_responses(self, resources, api_endpoint, api_name, file_download=False):
        """ Aux function to extract all the API endpoint responses.

        Args:
            resources: list of string hashes.
            api_endpoint: endpoint path
            api_name: endpoint name
            file_download: boolean, whether a file download is expected
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        all_responses, resources = self._bulk_cache_lookup(api_name, resources)
        response_chunks = self._request_reports(resources, api_endpoint, file_download)
        self._extract_response_chunks(all_responses, response_chunks, api_name)

        return all_responses

    def _extract_response_chunks(self, all_responses, response_chunks, api_name):
        """Extracts and caches the responses from the response chunks in case
        of the responses for the requests containing multiple concatenated
        resources. Extracted responses are added to the already cached
        responses passed in the all_responses parameter.

        Args:
            all_responses: a dict containing already cached responses.
            response_chunks: a list with response chunks.
            api_name: a string name of the API.
        """
        for response_chunk in response_chunks:
            if not isinstance(response_chunk, list):
                response_chunk = [response_chunk]
            for response in response_chunk:
                if not response:
                    continue

                cache_id = self._extract_cache_id(response)
                if cache_id:
                    if self._cache:
                        self._cache.cache_value(api_name, cache_id, response)
                    all_responses[cache_id] = response
371 |
--------------------------------------------------------------------------------
/threat_intel/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 |
4 | Supported threat intelligence feeds.
5 |
6 | The package contains API wrappers for:
7 |
8 | * OpenDNS Investigate API
 9 | * VirusTotal API v3
10 | * ShadowServer API
11 |
12 |
13 | OpenDNS Investigate API
14 | =======================
15 |
16 | OpenDNS Investigate provides an API that allows querying for:
17 |
18 | * Domain categorization
19 | * Security information about a domain
20 | * Co-occurrences for a domain
21 | * Related domains for a domain
22 | * Domains related to an IP
23 | * Domain tagging dates for a domain
24 | * DNS RR history for a domain
25 | * WHOIS information
26 | - WHOIS information for an email
27 | - WHOIS information for a nameserver
28 | - Historical WHOIS information for a domain
29 | * Latest malicious domains for an IP
30 |
31 | To use the Investigate API wrapper import InvestigateApi class from threat_intel.opendns module:
32 |
33 | >>> from threat_intel import InvestigateApi
34 |
35 | To initialize the API wrapper you need the API key:
36 |
37 | >>> investigate = InvestigateApi("")
38 |
39 | You can also specify a file name where the API responses will be cached in a JSON file, to save you the bandwidth for the multiple calls
40 | about the same domains or IPs:
41 |
42 | >>> investigate = InvestigateApi("", cache_file_name="/tmp/cache.opendns.json")
43 |
44 |
45 | Domain categorization
46 | ---------------------
47 | Calls domains/categorization/?showLabels Investigate API endpoint. It takes a list (or any other Python enumerable) of domains and returns
48 | the categories associated with these domains by OpenDNS.
49 |
50 | >>> domains = ["google.com", "baidu.com", "bibikun.ru"]
51 | >>> investigate.categorization(domains)
52 |
53 | will result in:
54 | {
55 | "baidu.com": {"status": 1, "content_categories": ["Search Engines"], "security_categories": []},
56 | "google.com": {"status": 1, "content_categories": ["Search Engines"], "security_categories": []},
57 | "bibikun.ru": {"status": -1, "content_categories": [], "security_categories": ["Malware"]}
58 | }
59 |
60 |
61 | Security information about a domain
62 | -----------------------------------
63 | Calls security/name/ Investigate API endpoint. It takes any Python enumerable with domains, e.g. list, and returns security parameters
64 | associated with each domain.
65 |
66 | >>> domains = ["google.com", "baidu.com", "bibikun.ru"]
67 | >>> investigate.security(domains)
68 |
69 | will result in:
70 |
71 | {
72 | "baidu.com": {
73 | "found": true,
74 | "handlings": {
75 | "domaintagging": 0.00032008666962131285,
76 | "blocked": 0.00018876906157154347,
77 | "whitelisted": 0.00019697641207465407,
78 | "expired": 2.462205150933176e-05,
79 | "normal": 0.9992695458052232
80 | },
81 | "dga_score": 0,
82 | "rip_score": 0,
83 | ..
84 | }
85 | }
86 |
87 |
 88 | Co-occurrences of a domain
 89 | --------------------------
90 | Calls recommendations/name/ Investigate API endpoint. Use this method to find out related domains to the one given in a list, or any other
91 | Python enumerable.
92 |
93 | >>> domains = ["google.com", "baidu.com", "bibikun.ru"]
94 | >>> investigate.cooccurrences(domains)
95 |
96 | will result in:
97 |
98 | {
99 | "baidu.com": {
100 | "found": true,
101 | "pfs2": [
102 | ["www.howtoforge.de", 0.14108563836506008],
103 | ..
104 | }
105 |
106 |
107 | Related domains for a domain
108 | ----------------------------
109 |
110 | Calls links/name/ Investigate API endpoint. Use this method to find out a list of related domains (domains that have been frequently seen
111 | requested around a time window of 60 seconds, but that are not associated with the given domain) to the one given in a list, or any other
112 | Python enumerable.
113 |
114 | >>> domains = ["google.com", "baidu.com", "bibikun.ru"]
115 | >>> investigate.related_domains(domains)
116 |
117 | will result in:
118 |
119 | {
120 | "tb1": [
121 | ["t.co", 11.0],
122 | ]
123 |
124 | ..
125 |
126 | }
127 |
128 |
129 | Domain tagging dates for a domain
130 | ---------------------------------
131 |
132 | Calls domains/name/ Investigate API endpoint.
133 |
134 | Use this method to get the date range when the domain being queried was a part of the OpenDNS block list and how long a domain has been in
135 | this list
136 |
137 | >>> domains = ["google.com", "baidu.com", "bibikun.ru"]
138 | >>> investigate.domain_tag(domains)
139 |
140 | will result in:
141 |
142 | {
143 | 'category': u'Malware',
144 | 'url': None,
145 | 'period': {
146 | 'begin': u'2013-09-16',
147 | 'end': u'Current'
148 | }
149 |
150 | ..
151 |
152 | }
153 |
154 |
155 |
156 | DNS RR history for an IP
157 | ------------------------
158 | Calls dnsdb/ip/a/ Investigate API endpoint. Use this method to find out related domains to the IP addresses given in a list, or any other
159 | Python enumerable.
160 |
161 | >>> ips = ['8.8.8.8']
162 | >>> investigate.rr_history(ips)
163 |
164 | will result in:
165 |
166 | {
167 | "8.8.8.8": {
168 | "rrs": [
169 | {
170 | "name": "8.8.8.8",
171 | "type": "A",
172 | "class": "IN",
173 | "rr": "000189.com.",
174 | "ttl": 3600
175 | },
176 | {
177 | "name": "8.8.8.8",
178 | "type": "A",
179 | "class": "IN",
180 | "rr": "008.no-ip.net.",
181 | "ttl": 60
182 | },
183 | ..
184 | }
185 |
186 | WHOIS information for a domain
187 | ------------------------------
188 |
189 | WHOIS information for an email
190 | ------------------------------
191 |
192 | Calls `whois/emails/{email}` Investigate API endpoint.
193 |
194 | Use this method to see WHOIS information for the email address. (For now the OpenDNS API will only return at most 500 results)
195 |
196 | >>> emails = ["dns-admin@google.com"]
197 | >>> investigate.whois_emails(emails)
198 |
199 | will result in:
200 |
201 | {
202 | "dns-admin@google.com": {
203 | "totalResults": 500,
204 | "moreDataAvailable": true,
205 | "limit": 500,
206 | "domains": [
207 | {
208 | "domain": "0emm.com",
209 | "current": true
210 | },
211 | ..
212 | ]
213 | }
214 | }
215 |
216 | WHOIS information for a nameserver
217 | ----------------------------------
218 |
219 | Calls `whois/nameservers/{nameserver}` Investigate API endpoint.
220 |
221 | Use this method to see WHOIS information for the nameserver. (For now the OpenDNS API will only return at most 500 results)
222 |
223 | >>> nameservers = ["ns2.google.com"]
224 | >>> investigate.whois_nameservers(nameservers)
225 |
226 | will result in:
227 |
228 | {
229 | "ns2.google.com": {
230 | "totalResults": 500,
231 | "moreDataAvailable": true,
232 | "limit": 500,
233 | "domains": [
234 | {
235 | "domain": "46645.biz",
236 | "current": true
237 | },
238 | ..
239 | ]
240 | }
241 | }
242 |
243 | WHOIS information for a domain
244 | ------------------------------
245 |
246 | Calls `whois/{domain}` Investigate API endpoint.
247 |
248 | Use this method to see WHOIS information for the domain.
249 |
250 | >>> domains = ["google.com"]
251 | >>> investigate.whois_domains(domains)
252 |
253 | will result in:
254 |
255 | {
256 | "administrativeContactFax": null,
257 | "whoisServers": null,
258 | "addresses": [
259 | "1600 amphitheatre parkway",
260 | "please contact contact-admin@google.com, 1600 amphitheatre parkway",
261 | "2400 e. bayshore pkwy"
262 | ],
263 | ..
264 | }
265 |
266 | Historical WHOIS information for a domain
267 | -----------------------------------------
268 |
269 | Calls `whois/{domain}/history` Investigate API endpoint.
270 |
271 | Use this method to see historical WHOIS information for the domain.
272 |
273 | >>> domains = ["5esb.biz"]
274 | >>> investigate.whois_domains_history(domains)
275 |
276 | will result in:
277 |
278 | {
279 | '5esb.biz':[
280 | {
281 | u'registrantFaxExt':u'',
282 | u'administrativeContactPostalCode':u'656448',
283 | u'zoneContactCity':u'',
284 | u'addresses':[
285 | u'nan qu hua yuan xiao he'
286 | ],
287 | ..
288 | },
289 | ..
290 | ]
291 | }
292 |
293 | Latest malicious domains for an IP
294 | ----------------------------------
295 |
296 | Calls `ips/{ip}/latest_domains` Investigate API endpoint.
297 |
298 | Use this method to see whether the IP address has any malicious domains associated with it.
299 |
300 | >>> ips = ["8.8.8.8"]
301 | >>> investigate.latest_malicious(ips)
302 |
303 | will result in:
304 |
305 | {
306 | [
307 | '7ltd.biz',
308 | 'co0s.ru',
309 | 't0link.in',
310 | ]
311 |
312 | ..
313 | }
314 |
315 |
316 | VirusTotal API
317 | ==============
318 |
319 | VirusTotal provides an API that makes it possible to query for the reports about:
320 |
321 | * Domains
322 | * URLs
323 | * IPs
324 | * File hashes
325 | * File Upload
326 | * Live Feed
327 | * Advanced search
328 |
329 | To use the VirusTotal API wrapper import VirusTotalApi class from threat_intel.virustotal module:
330 |
331 | >>> from threat_intel import VirusTotalApi
332 |
333 | To initialize the API wrapper you need the API key:
334 |
335 | >>> vt = VirusTotalApi("")
336 |
337 | VirusTotal API calls allow to squeeze a list of file hashes or URLs into a single HTTP call. Depending on the API version you are using
338 | (public or private) you may need to tune the maximum number of the resources (file hashes or URLs) that could be passed in a single API
339 | call. You can do it with the resources_per_req parameter:
340 |
341 | >>> vt = VirusTotalApi("", resources_per_req=4)
342 |
343 | When using the public API your standard request rate allows you to put a maximum of 4 resources per request. With private API you are able to
344 | put up to 25 resources per call. That is also the default value if you don't pass the resources_per_req parameter.
345 |
346 | Of course when calling the API wrapper methods in the VirusTotalApi class you can pass as many resources as you want and the wrapper will
347 | take care of producing as many API calls as necessary to satisfy the request rate.
348 |
349 | Similarly to OpenDNS API wrapper, you can also specify the file name where the responses will be cached:
350 |
351 | >>> vt = VirusTotalApi("", cache_file_name="/tmp/cache.virustotal.json")
352 |
353 |
354 | #### Domain reports
355 |
356 | Calls domain/report VirusTotal API endpoint.
357 | Pass a list or any other Python enumerable containing the domains:
358 |
359 | >>> domains = ["google.com", "baidu.com", "bibikun.ru"]
360 | >>> vt.get_domain_reports(domains)
361 |
362 | will result in:
363 |
364 | {
365 | "baidu.com": {
366 | "undetected_referrer_samples": [
367 | {
368 | "positives": 0,
369 | "total": 56,
370 | "sha256": "e3c1aea1352362e4b5c008e16b03810192d12a4f1cc71245f5a75e796c719c69"
371 | }
372 | ],
373 | ..
374 | }
375 |
376 |
377 | #### URL report endpoint
378 |
379 | Calls 'url/report' VirusTotal API endpoint.
380 | Pass a list or any other Python enumerable containing the URL addresses:
381 |
382 | >>> urls = ["http://www.google.com", "http://www.yelp.com"]
383 | >>> vt.get_url_reports(urls)
384 |
385 | will result in:
386 |
387 | {
388 | "http://www.google.com": {
389 | "permalink": "https://www.virustotal.com/url/dd014af5ed6b38d9130e3f466f850e46d21b951199d53a18ef29ee9341614eaf/analysis/1423344006/",
390 | "resource": "http://www.google.com",
391 | "url": "http://www.google.com/",
392 | "response_code": 1,
393 | "scan_date": "2015-02-07 21:20:06",
394 | "scan_id": "dd014af5ed6b38d9130e3f466f850e46d21b951199d53a18ef29ee9341614eaf-1423344006",
395 | "verbose_msg": "Scan finished, scan information embedded in this object",
396 | "filescan_id": null,
397 | "positives": 0,
398 | "total": 62,
399 | "scans": {
400 | "CLEAN MX": {
401 | "detected": false,
402 | "result": "clean site"
403 | },
404 | ..
405 | }
406 |
407 |
408 | #### URL scan endpoint
409 |
410 | Calls url/scan VirusTotal API endpoint.
411 | Submit a url or any other Python enumerable containing the URL addresses:
412 |
413 | >>> urls = ["http://www.google.com", "http://www.yelp.com"]
414 | >>> vt.get_url_reports(urls)
415 |
416 |
417 | #### Hash report endpoint
418 |
419 | Calls file/report VirusTotal API endpoint.
420 | You can request the file reports passing a list of hashes (md5, sha1 or sha2):
421 |
422 | >>> file_hashes = [
423 | "99017f6eebbac24f351415dd410d522d",
424 | "88817f6eebbac24f351415dd410d522d"
425 | ]
426 |
427 | >>> vt.get_file_reports(file_hashes)
428 |
429 | will result in:
430 |
431 | {
432 | "88817f6eebbac24f351415dd410d522d": {
433 | "response_code": 0,
434 | "resource": "88817f6eebbac24f351415dd410d522d",
435 | "verbose_msg": "The requested resource is not among the finished, queued or pending scans"
436 | },
437 | "99017f6eebbac24f351415dd410d522d": {
438 | "scan_id": "52d3df0ed60c46f336c131bf2ca454f73bafdc4b04dfa2aea80746f5ba9e6d1c-1423261860",
439 | "sha1": "4d1740485713a2ab3a4f5822a01f645fe8387f92",
440 | }
441 |
442 |
443 | #### Hash rescan endpoint
444 |
445 | Calls `file/rescan` VirusTotal API endpoint. Use to rescan a previously submitted file.
446 | You can request the file reports passing a list of hashes (md5, sha1 or sha2):
447 |
448 |
449 | #### Hash behaviour endpoint
450 |
451 | Calls `file/behaviour` VirusTotal API endpoint. Use to get a report about the behaviour of the file when executed in a sandboxed
452 | environment (Cuckoo sandbox). You can request the file reports passing a list of hashes (md5, sha1 or sha2):
453 |
454 |
455 | >>> vt.get_file_behaviour(file_hashes)
456 |
457 |
458 | #### Hash network-traffic endpoint
459 |
460 | Calls `file/network-traffic` VirusTotal API endpoint. Use to get the dump of the network traffic generated by the file when executed.
461 | You can request the file reports passing a list of hashes (md5, sha1 or sha2):
462 |
463 | >>> vt.get_file_network_traffic(file_hashes)
464 |
465 |
466 | #### Hash download endpoint
467 |
468 | Calls `file/download` VirusTotal API endpoint. Use to download a file by its hash.
469 | You can request the file reports passing a list of hashes (md5, sha1 or sha2):
470 |
471 | >>> vt.get_file_download(file_hashes)
472 |
473 |
474 | #### URL live feed endpoint
475 |
476 | Calls `url/distribution` VirusTotal API endpoint. Use to get a live feed with the latest URLs submitted to VirusTotal.
477 |
478 | >>> vt.get_url_distribution()
479 |
480 |
481 | #### Hash live feed endpoint
482 |
483 | Calls `file/distribution` VirusTotal API endpoint. Use to get a live feed with the latest Hashes submitted to VirusTotal.
484 |
485 | >>> vt.get_file_distribution()
486 |
487 |
488 | #### Hash search endpoint
489 |
490 | Calls `file/search` VirusTotal API endpoint. Use to search for samples that match some binary/metadata/detection criteria.
491 |
492 | >>> vt.get_file_search()
493 |
494 |
495 | #### File date endpoint
496 |
497 | Calls `file/clusters` VirusTotal API endpoint. Use to list similarity clusters for a given time frame.
498 |
499 | >>> vt.get_file_clusters()
500 |
501 |
502 | ShadowServer API
503 | ----------------
504 | ShadowServer provides an API that allows testing the hashes against a list of known software applications.
505 |
506 | To use the ShadowServer API wrapper import ShadowServerApi class from threat_intel.shadowserver module:
507 |
508 | >>> from threat_intel import ShadowServerApi
509 |
510 | To use the API wrapper simply call the ShadowServerApi initializer:
511 |
512 | >>> ss = ShadowServerApi()
513 |
514 | You can also specify the file name where the API responses will be cached:
515 |
516 | >>> ss = ShadowServerApi(cache_file_name="/tmp/cache.shadowserver.json")
517 |
518 | To check whether the hashes are on the ShadowServer list of known hashes, call get_bin_test method and pass enumerable with the hashes you
519 | want to test:
520 |
521 | >>> file_hashes = [
522 | "99017f6eebbac24f351415dd410d522d",
523 | "88817f6eebbac24f351415dd410d522d"
524 | ]
525 |
526 | >>> ss.get_bin_test(file_hashes)
527 |
528 | """
529 | from __future__ import absolute_import
530 | from .exceptions import InvalidRequestError
531 |
532 | from .opendns import InvestigateApi
533 | from .shadowserver import ShadowServerApi
534 | from .virustotal import VirusTotalApi
535 |
536 | __all__ = ['InvalidRequestError', 'InvestigateApi', 'ShadowServerApi', 'VirusTotalApi']
537 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # threat_intel [](https://travis-ci.org/Yelp/threat_intel) [](https://pypi.python.org/pypi/threat_intel)
2 | Threat Intelligence APIs.
3 |
4 |
5 | ## Supported threat intelligence feeds
6 |
7 | The package contains API wrappers for:
8 |
9 | * Umbrella Investigate API
10 | * VirusTotal API v2.0
11 | * ShadowServer API
12 |
13 | ----
14 |
15 | ### Umbrella Investigate API
16 |
17 | [Umbrella Investigate](https://docs.umbrella.com/developer/investigate-api/) provides an API that
18 | allows querying for:
19 |
20 | * Domain categorization
21 | * Security information about a domain
22 | * Co-occurrences for a domain
23 | * Related domains for a domain
24 | * Domains related to an IP
25 | * Domain tagging dates for a domain
26 | * DNS RR history for a domain
27 | * WHOIS information
28 | - WHOIS information for an email
29 | - WHOIS information for a nameserver
30 | - Historical WHOIS information for a domain
31 | * Latest malicious domains for an IP
32 |
33 | To use the Investigate API wrapper import `InvestigateApi` class from `threat_intel.opendns` module:
34 |
35 | ```python
36 | from threat_intel.opendns import InvestigateApi
37 | ```
38 |
39 | To initialize the API wrapper you need the API key:
40 |
41 | ```python
42 | investigate = InvestigateApi("")
43 | ```
44 |
45 | You can also specify a file name where the API responses will be cached in a JSON file,
46 | to save you the bandwidth for the multiple calls about the same domains or IPs:
47 |
48 | ```python
49 | investigate = InvestigateApi("", cache_file_name="/tmp/cache.opendns.json")
50 | ```
51 |
52 | #### Domain categorization
53 |
54 | Calls `domains/categorization/?showLabels` Investigate API endpoint.
55 | It takes a list (or any other Python enumerable) of domains and returns
56 | the categories associated with this domains by Umbrella along with a [-1, 0, 1] score, where -1 is a malicious status.
57 |
58 | ```python
59 | domains = ["google.com", "baidu.com", "bibikun.ru"]
60 | investigate.categorization(domains)
61 | ```
62 |
63 | will result in:
64 |
65 | ```
66 | {
67 | "baidu.com": {"status": 1, "content_categories": ["Search Engines"], "security_categories": []},
68 | "google.com": {"status": 1, "content_categories": ["Search Engines"], "security_categories": []},
69 | "bibikun.ru": {"status": -1, "content_categories": [], "security_categories": ["Malware"]}
70 | }
71 | ```
72 |
73 | #### Security information about a domain
74 |
75 | Calls `security/name/` Investigate API endpoint.
76 | It takes any Python enumerable with domains, e.g. list, and returns several security parameters
77 | associated with each domain.
78 |
79 | ```python
80 | domains = ["google.com", "baidu.com", "bibikun.ru"]
81 | investigate.security(domains)
82 | ```
83 |
84 | will result in:
85 |
86 | ```
87 | {
88 | "baidu.com": {
89 | "found": true,
90 | "handlings": {
91 | "domaintagging": 0.00032008666962131285,
92 | "blocked": 0.00018876906157154347,
93 | "whitelisted": 0.00019697641207465407,
94 | "expired": 2.462205150933176e-05,
95 | "normal": 0.9992695458052232
96 | },
97 | "dga_score": 0,
98 | "rip_score": 0,
99 |
100 | ..
101 |
102 | }
103 | }
104 | ```
105 |
106 | #### Co-occurrences for a domain
107 |
108 | Calls `recommendations/name/` Investigate API endpoint.
109 | Use this method to find out a list of co-occurence domains (domains that are being accessed by the same users within a small window of time) to the one given in a list, or any other Python enumerable.
110 |
111 | ```python
112 | domains = ["google.com", "baidu.com", "bibikun.ru"]
113 | investigate.cooccurrences(domains)
114 | ```
115 |
116 | will result in:
117 |
118 | ```
119 | {
120 | "baidu.com": {
121 | "found": true,
122 | "pfs2": [
123 | ["www.howtoforge.de", 0.14108563836506008],
124 | }
125 |
126 | ..
127 |
128 | }
129 | ```
130 |
131 | #### Related domains for a domain
132 |
133 | Calls `links/name/` Investigate API endpoint.
134 | Use this method to find out a list of related domains (domains that have been frequently seen requested around a time window of 60 seconds, but that are not associated with the given domain) to the one given in a list, or any other Python enumerable.
135 |
136 | ```python
137 | domains = ["google.com", "baidu.com", "bibikun.ru"]
138 | investigate.related_domains(domains)
139 | ```
140 |
141 | will result in:
142 |
143 | ```
144 | {
145 | "tb1": [
146 | ["t.co", 11.0],
147 | ]
148 |
149 | ..
150 |
151 | }
152 | ```
153 |
154 | #### Domain tagging dates for a domain
155 |
156 | Calls `domains/name/` Investigate API endpoint.
157 |
158 | Use this method to get the date range when the domain being queried was a part of the Umbrella block list and how long a domain has been in this list
159 |
160 | ```python
161 | domains = ["google.com", "baidu.com", "bibikun.ru"]
162 | investigate.domain_tag(domains)
163 | ```
164 |
165 | will result in:
166 |
167 | ```
168 | {
169 | 'category': u'Malware',
170 | 'url': None,
171 | 'period': {
172 | 'begin': u'2013-09-16',
173 | 'end': u'Current'
174 | }
175 |
176 | ..
177 |
178 | }
179 | ```
180 |
181 | #### DNS RR history for a Domain
182 |
183 | Calls `dnsdb/name/a/` Investigate API endpoint.
184 | Use this method to find out related domains to domains given in a list, or any other Python enumerable.
185 |
186 | ```python
187 | domains = ["google.com", "baidu.com", "bibikun.ru"]
188 | investigate.dns_rr(domains)
189 | ```
190 |
191 | will result in:
192 |
193 | ```
194 | {
195 | 'features': {
196 | 'geo_distance_mean': 0.0,
197 | 'locations': [
198 | {
199 | 'lat': 59.89440155029297,
200 | 'lon': 30.26420021057129
201 | }
202 | ],
203 | 'rips': 1,
204 | 'is_subdomain': False,
205 | 'ttls_mean': 86400.0,
206 | 'non_routable': False,
207 | }
208 |
209 | ..
210 |
211 | }
212 | ```
213 |
214 | #### DNS RR history for an IP
215 |
216 | Calls `dnsdb/ip/a/` Investigate API endpoint.
217 | Use this method to find out related domains to the IP addresses given in a list, or any other Python enumerable.
218 |
219 | ```python
220 | ips = ['8.8.8.8']
221 | investigate.rr_history(ips)
222 | ```
223 |
224 | will result in:
225 |
226 | ```
227 | {
228 | "8.8.8.8": {
229 | "rrs": [
230 | {
231 | "name": "8.8.8.8",
232 | "type": "A",
233 | "class": "IN",
234 | "rr": "000189.com.",
235 | "ttl": 3600
236 | },
237 | {
238 | "name": "8.8.8.8",
239 | "type": "A",
240 | "class": "IN",
241 | "rr": "008.no-ip.net.",
242 | "ttl": 60
243 | },
244 | }
245 |
246 | ..
247 |
248 | }
249 | ```
250 |
251 | #### WHOIS information for a domain
252 |
253 | ##### WHOIS information for an email
254 |
255 | Calls `whois/emails/{email}` Investigate API endpoint.
256 |
257 | Use this method to see WHOIS information for the email address. For now the Umbrella API will only return at most 500 results.
258 |
259 | ```python
260 | emails = ["dns-admin@google.com"]
261 | investigate.whois_emails(emails)
262 | ```
263 |
264 | will result in:
265 |
266 | ```
267 | {
268 | "dns-admin@google.com": {
269 | "totalResults": 500,
270 | "moreDataAvailable": true,
271 | "limit": 500,
272 | "domains": [
273 | {
274 | "domain": "0emm.com",
275 | "current": true
276 | },
277 | ..
278 | ]
279 | }
280 | }
281 | ```
282 |
283 | ##### WHOIS information for a nameserver
284 |
285 | Calls `whois/nameservers/{nameserver}` Investigate API endpoint.
286 |
287 | Use this method to see WHOIS information for the nameserver. For now the Umbrella API will only return at most 500 results.
288 |
289 | ```python
290 | nameservers = ["ns2.google.com"]
291 | investigate.whois_nameservers(nameservers)
292 | ```
293 |
294 | will result in:
295 |
296 | ```
297 | {
298 | "ns2.google.com": {
299 | "totalResults": 500,
300 | "moreDataAvailable": true,
301 | "limit": 500,
302 | "domains": [
303 | {
304 | "domain": "46645.biz",
305 | "current": true
306 | },
307 | ..
308 | ]
309 | }
310 | }
311 | ```
312 |
313 | ##### WHOIS information for a domain
314 |
315 | Calls `whois/{domain}` Investigate API endpoint.
316 |
317 | Use this method to see WHOIS information for the domain.
318 |
319 | ```python
320 | domains = ["google.com"]
321 | investigate.whois_domains(domains)
322 | ```
323 |
324 | will result in:
325 |
326 | ```
327 | {
328 | "administrativeContactFax": null,
329 | "whoisServers": null,
330 | "addresses": [
331 | "1600 amphitheatre parkway",
332 | "please contact contact-admin@google.com, 1600 amphitheatre parkway",
333 | "2400 e. bayshore pkwy"
334 | ],
335 | ..
336 | }
337 | ```
338 |
339 | ##### Historical WHOIS information for a domain
340 |
341 | Calls `whois/{domain}/history` Investigate API endpoint.
342 |
343 | Use this method to see historical WHOIS information for the domain.
344 |
345 | ```python
346 | domains = ["5esb.biz"]
347 | investigate.whois_domains_history(domains)
348 | ```
349 |
350 | will result in:
351 |
352 | ```
353 | {
354 | '5esb.biz':[
355 | {
356 | u'registrantFaxExt':u'',
357 | u'administrativeContactPostalCode':u'656448',
358 | u'zoneContactCity':u'',
359 | u'addresses':[
360 | u'nan qu hua yuan xiao he'
361 | ],
362 | ..
363 | },
364 | ..
365 | ]
366 | }
367 | ```
368 |
369 | #### Latest malicious domains for an IP
370 |
371 | Calls `ips/{ip}/latest_domains` Investigate API endpoint.
372 |
373 | Use this method to see whether the IP address has any malicious domains associated with it.
374 |
375 | ```python
376 | ips = ["8.8.8.8"]
377 | investigate.latest_malicious(ips)
378 | ```
379 |
380 | will result in:
381 |
382 | ```
383 | {
384 | [
385 | '7ltd.biz',
386 | 'co0s.ru',
387 | 't0link.in',
388 | ]
389 |
390 | ..
391 | }
392 | ```
393 |
394 | ----
395 |
396 | ### VirusTotal API
397 |
398 | [VirusTotal](https://www.virustotal.com/) provides an
399 | [API](https://www.virustotal.com/en/documentation/public-api/) that makes it
400 | possible to query for the reports about:
401 |
402 | * Domains
403 | * URLs
404 | * IPs
405 | * File hashes
406 | * File Upload
407 | * Live Feed
408 | * Advanced search
409 |
410 | To use the VirusTotal API wrapper import `VirusTotalApi` class from `threat_intel.virustotal` module:
411 |
412 | ```python
413 | from threat_intel.virustotal import VirusTotalApi
414 | ```
415 |
416 | To initialize the API wrapper you need the API key:
417 |
418 | ```python
419 | vt = VirusTotalApi("")
420 | ```
421 |
422 | VirusTotal API calls allow to squeeze a list of file hashes or URLs into a single HTTP call.
423 | Depending on the API version you are using (public or private) you may need to tune the maximum number
424 | of the resources (file hashes or URLs) that could be passed in a single API call.
425 | You can do it with the `resources_per_req` parameter:
426 |
427 | ```python
428 | vt = VirusTotalApi("", resources_per_req=4)
429 | ```
430 |
431 | When using the public API your standard request rate allows you to put a maximum of 4 resources per request.
432 | With private API you are able to put up to 25 resources per call. That is also the default value if you
433 | don't pass the `resources_per_req` parameter.
434 |
435 | Of course when calling the API wrapper methods in the `VirusTotalApi` class you can pass as many resources
436 | as you want and the wrapper will take care of producing as many API calls as necessary to satisfy the request rate.
437 |
438 | You can also specify the file name where the responses will be cached:
439 |
440 | ```python
441 | vt = VirusTotalApi("", cache_file_name="/tmp/cache.virustotal.json")
442 | ```
443 |
444 | #### Domain report endpoint
445 |
446 | Calls `domain/report` VirusTotal API endpoint.
447 | Pass a list or any other Python enumerable containing the domains:
448 |
449 | ```python
450 | domains = ["google.com", "baidu.com", "bibikun.ru"]
451 | vt.get_domain_reports(domains)
452 | ```
453 |
454 | will result in:
455 |
456 | ```
457 | {
458 | "baidu.com": {
459 | "undetected_referrer_samples": [
460 | {
461 | "positives": 0,
462 | "total": 56,
463 | "sha256": "e3c1aea1352362e4b5c008e16b03810192d12a4f1cc71245f5a75e796c719c69"
464 | }
465 | ],
466 |
467 | ..
468 |
469 | }
470 | }
471 | ```
472 |
473 |
474 | #### URL report endpoint
475 |
476 | Calls `url/report` VirusTotal API endpoint.
477 | Pass a list or any other Python enumerable containing the URL addresses:
478 |
479 | ```python
480 | urls = ["http://www.google.com", "http://www.yelp.com"]
481 | vt.get_url_reports(urls)
482 | ```
483 |
484 | will result in:
485 |
486 | ```
487 | {
488 | "http://www.google.com": {
489 | "permalink": "https://www.virustotal.com/url/dd014af5ed6b38d9130e3f466f850e46d21b951199d53a18ef29ee9341614eaf/analysis/1423344006/",
490 | "resource": "http://www.google.com",
491 | "url": "http://www.google.com/",
492 | "response_code": 1,
493 | "scan_date": "2015-02-07 21:20:06",
494 | "scan_id": "dd014af5ed6b38d9130e3f466f850e46d21b951199d53a18ef29ee9341614eaf-1423344006",
495 | "verbose_msg": "Scan finished, scan information embedded in this object",
496 | "filescan_id": null,
497 | "positives": 0,
498 | "total": 62,
499 | "scans": {
500 | "CLEAN MX": {
501 | "detected": false,
502 | "result": "clean site"
503 | },
504 | }
505 | ..
506 |
507 | }
508 | ```
509 |
510 | #### URL scan endpoint
511 |
512 | Calls `url/scan` VirusTotal API endpoint.
513 | Submit a url or any other Python enumerable containing the URL addresses:
514 |
515 | ```python
516 | urls = ["http://www.google.com", "http://www.yelp.com"]
517 | vt.get_url_reports(urls)
518 | ```
519 |
520 | #### Hash report endpoint
521 |
522 | Calls `file/report` VirusTotal API endpoint.
523 | You can request the file reports passing a list of hashes (md5, sha1 or sha2):
524 |
525 | ```python
526 | file_hashes = [
527 | "99017f6eebbac24f351415dd410d522d",
528 | "88817f6eebbac24f351415dd410d522d"
529 | ]
530 |
531 | vt.get_file_reports(file_hashes)
532 | ```
533 |
534 | will result in:
535 |
536 | ```
537 | {
538 | "88817f6eebbac24f351415dd410d522d": {
539 | "response_code": 0,
540 | "resource": "88817f6eebbac24f351415dd410d522d",
541 | "verbose_msg": "The requested resource is not among the finished, queued or pending scans"
542 | },
543 | "99017f6eebbac24f351415dd410d522d": {
544 | "scan_id": "52d3df0ed60c46f336c131bf2ca454f73bafdc4b04dfa2aea80746f5ba9e6d1c-1423261860",
545 | "sha1": "4d1740485713a2ab3a4f5822a01f645fe8387f92",
546 | }
547 |
548 | ..
549 |
550 | }
551 | ```
552 |
553 | #### Hash rescan endpoint
554 |
555 | Calls `file/rescan` VirusTotal API endpoint. Use to rescan a previously submitted file.
556 | You can request the file reports passing a list of hashes (md5, sha1 or sha2):
557 |
558 | #### Hash behaviour endpoint
559 |
560 | Calls `file/behaviour` VirusTotal API endpoint. Use to get a report about the behaviour of the file when executed in a sandboxed environment (Cuckoo sandbox).
561 | You can request the file reports passing a list of hashes (md5, sha1 or sha2):
562 |
563 | ```python
564 | file_hashes = [
565 | "99017f6eebbac24f351415dd410d522d",
566 | "88817f6eebbac24f351415dd410d522d"
567 | ]
568 |
569 | vt.get_file_behaviour(file_hashes)
570 | ```
571 |
572 | #### Hash network-traffic endpoint
573 |
574 | Calls `file/network-traffic` VirusTotal API endpoint. Use to get the dump of the network traffic generated by the file when executed.
575 | You can request the file reports passing a list of hashes (md5, sha1 or sha2):
576 |
577 | ```python
578 | file_hashes = [
579 | "99017f6eebbac24f351415dd410d522d",
580 | "88817f6eebbac24f351415dd410d522d"
581 | ]
582 |
583 | vt.get_file_network_traffic(file_hashes)
584 | ```
585 |
586 | #### Hash download endpoint
587 |
588 | Calls `file/download` VirusTotal API endpoint. Use to download a file by its hash.
589 | You can request the file reports passing a list of hashes (md5, sha1 or sha2):
590 |
591 | ```python
592 | file_hashes = [
593 | "99017f6eebbac24f351415dd410d522d",
594 | "88817f6eebbac24f351415dd410d522d"
595 | ]
596 |
597 | vt.get_file_download(file_hashes)
598 | ```
599 |
600 | #### IP reports endpoint
601 |
602 | Calls `ip-address/report` VirusTotal API endpoint.
603 | Pass a list or any other Python enumerable containing the IP addresses:
604 |
605 | ```python
606 | ips = ['90.156.201.27', '198.51.132.80']
607 | vt.get_ip_reports(ips)
608 | ```
609 |
610 | will result in:
611 |
612 | ```
613 | {
614 | "90.156.201.27": {
615 | "asn": "25532",
616 | "country": "RU",
617 | "response_code": 1,
618 | "as_owner": ".masterhost autonomous system",
619 | "verbose_msg": "IP address found in dataset",
620 | "resolutions": [
621 | {
622 | "last_resolved": "2013-04-01 00:00:00",
623 | "hostname": "027.ru"
624 | },
625 | {
626 | "last_resolved": "2015-01-20 00:00:00",
627 | "hostname": "600volt.ru"
628 | },
629 |
630 | ..
631 |
632 | ],
633 | "detected_urls": [
634 | {
635 | "url": "http://shop.albione.ru/",
636 | "positives": 2,
637 | "total": 52,
638 | "scan_date": "2014-04-06 11:18:17"
639 | },
640 | {
641 | "url": "http://www.orlov.ru/",
642 | "positives": 3,
643 | "total": 52,
644 | "scan_date": "2014-03-05 09:13:31"
645 | }
646 | ],
647 | },
648 |
649 | "198.51.132.80": {
650 |
651 | ..
652 |
653 | }
654 | }
655 | ```
656 |
657 | #### URL live feed endpoint
658 |
659 | Calls `url/distribution` VirusTotal API endpoint. Use to get a live feed with the latest URLs submitted to VirusTotal.
660 |
661 | ```python
662 | vt.get_url_distribution()
663 | ```
664 |
665 | #### Hash live feed endpoint
666 |
667 | Calls `file/distribution` VirusTotal API endpoint. Use to get a live feed with the latest Hashes submitted to VirusTotal.
668 |
669 | ```python
670 | vt.get_file_distribution()
671 | ```
672 |
673 | #### Hash search endpoint
674 |
675 | Calls `file/search` VirusTotal API endpoint. Use to search for samples that match some binary/metadata/detection criteria.
676 |
677 | ```python
678 | vt.get_file_search()
679 | ```
680 |
681 | #### File date endpoint
682 |
683 | Calls `file/clusters` VirusTotal API endpoint. Use to list similarity clusters for a given time frame.
684 |
685 | ```python
686 | vt.get_file_clusters()
687 | ```
688 |
689 | ---
690 |
691 | ### ShadowServer API
692 |
693 | [ShadowServer](http://shadowserver.org/) provides an [API](http://bin-test.shadowserver.org/) that allows testing
694 | the hashes against a list of known software applications.
695 |
696 | To use the ShadowServer API wrapper import `ShadowServerApi` class from `threat_intel.shadowserver` module:
697 |
698 | ```python
699 | from threat_intel.shadowserver import ShadowServerApi
700 | ```
701 |
702 | To use the API wrapper simply call the `ShadowServerApi` initializer:
703 |
704 | ```python
705 | ss = ShadowServerApi()
706 | ```
707 |
708 | You can also specify the file name where the API responses will be cached:
709 |
710 | ```python
711 | ss = ShadowServerApi(cache_file_name="/tmp/cache.shadowserver.json")
712 | ```
713 |
714 | To check whether the hashes are on the ShadowServer list of known hashes,
715 | call `get_bin_test` method and pass enumerable with the hashes you want to test:
716 |
717 | ```python
718 | file_hashes = [
719 | "99017f6eebbac24f351415dd410d522d",
720 | "88817f6eebbac24f351415dd410d522d"
721 | ]
722 |
723 | ss.get_bin_test(file_hashes)
724 |
725 | ```
726 |
727 | ---
728 |
729 | ## Installation
730 |
731 | ### Install with `pip`
732 |
733 | ```shell
734 | $ pip install threat_intel
735 | ```
736 |
737 | ### Testing
738 | Go to town with `make`:
739 |
740 | ```shell
741 | $ sudo pip install tox
742 | $ make test
743 | ```
744 |
--------------------------------------------------------------------------------
/threat_intel/util/http.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Utilities for dealing with HTTP requests
3 | #
4 | # RateLimiter helps to only make a certain number of calls per second.
5 | # MultiRequest wraps requests-futures and issues multiple requests at once with an easy to use interface.
6 | # SSLAdapter helps force use of the highest possible version of TLS.
7 | #
8 | import logging
9 | import re
10 | import ssl
11 | import time
12 | from base64 import urlsafe_b64encode
13 | from collections import namedtuple
14 | from collections import OrderedDict
15 | from functools import partial
16 |
17 | from requests.adapters import HTTPAdapter
18 | from requests.exceptions import RequestException
19 | from requests_futures.sessions import FuturesSession
20 | from six.moves import range
21 | from urllib3.util.retry import Retry
22 |
23 | from threat_intel.exceptions import InvalidRequestError
24 | from threat_intel.util.error_messages import write_error_message
25 | from threat_intel.util.error_messages import write_exception
26 |
27 |
28 | # Pairs a zero-argument callable (a functools.partial wrapping a session verb)
28 | # with the URL it targets, so errors can be reported against the URL later.
28 | PreparedRequest = namedtuple('PreparedRequest', ('callable', 'url'))
29 |
30 |
31 | class SSLAdapter(HTTPAdapter):
32 |
33 | """Attempt to use the highest possible TLS version for HTTPS connections.
34 |
35 | By explictly controlling which TLS version is used when connecting, avoid the client offering only SSLv2 or SSLv3.
36 |
37 | The best version specifier to pass is `ssl.PROTOCOL_TLS`, as this will choose the highest available protocol
38 | compatible with both client and server. For details see the documentation for `ssl.wrap_socket`
39 | (https://docs.python.org/2/library/ssl.html#socket-creation).
40 |
41 | To use this class, mount it to a `requests.Session` and then make HTTPS using the session object.
42 |
43 | .. code-block:: python
44 | # Mount an SSLAdapter in a Session
45 | session = requests.Session()
46 | session.mount('https://', SSLAdapter())
47 |
48 | # Make a requests call through the session
49 | session.get('https://api.github.com/events')
50 |
51 | """
52 |
53 | def init_poolmanager(self, connections, maxsize, block=False, **pool_kwargs):
54 | """Called to initialize the HTTPAdapter when no proxy is used."""
55 | try:
56 | pool_kwargs['ssl_version'] = ssl.PROTOCOL_TLS
57 | except AttributeError:
58 | pool_kwargs['ssl_version'] = ssl.PROTOCOL_SSLv23
59 | return super(SSLAdapter, self).init_poolmanager(connections, maxsize, block, **pool_kwargs)
60 |
61 | def proxy_manager_for(self, proxy, **proxy_kwargs):
62 | """Called to initialize the HTTPAdapter when a proxy is used."""
63 | try:
64 | proxy_kwargs['ssl_version'] = ssl.PROTOCOL_TLS
65 | except AttributeError:
66 | proxy_kwargs['ssl_version'] = ssl.PROTOCOL_SSLv23
67 | return super(SSLAdapter, self).proxy_manager_for(proxy, **proxy_kwargs)
68 |
69 |
70 | class RateLimiter(object):
71 |
72 | """Limits how many calls can be made per second"""
73 |
74 | CallRecord = namedtuple('CallRecord', ['time', 'num_calls'])
75 |
76 | def __init__(self, calls_per_sec):
77 | self._max_calls_per_second = calls_per_sec
78 | self._call_times = []
79 | self._outstanding_calls = 0
80 |
81 | def make_calls(self, num_calls=1):
82 | """Adds appropriate sleep to avoid making too many calls.
83 |
84 | Args:
85 | num_calls: int the number of calls which will be made
86 | """
87 | self._cull()
88 | while self._outstanding_calls + num_calls > self._max_calls_per_second:
89 | time.sleep(0) # yield
90 | self._cull()
91 |
92 | self._call_times.append(self.CallRecord(time=time.time(), num_calls=num_calls))
93 | self._outstanding_calls += num_calls
94 |
95 | def _cull(self):
96 | """Remove calls more than 1 second old from the queue."""
97 | right_now = time.time()
98 |
99 | cull_from = -1
100 | for index in range(len(self._call_times)):
101 | if right_now - self._call_times[index].time >= 1.0:
102 | cull_from = index
103 | self._outstanding_calls -= self._call_times[index].num_calls
104 | else:
105 | break
106 |
107 | if cull_from > -1:
108 | self._call_times = self._call_times[cull_from + 1:]
109 |
110 |
111 | class AvailabilityLimiter(object):
112 |
113 | """Limits the total number of requests issued for a session."""
114 |
115 | def __init__(self, total_retries):
116 | """ Wrapper object for managing total session retry limit.
117 |
118 | Args:
119 | total_retries: Total request attempts to be made per sesssion.
120 | This is shared between all request objects.
121 | """
122 | self.total_retries = total_retries
123 |
124 | def map_with_retries(self, requests, responses_for_requests):
125 | """Provides session-based retry functionality
126 |
127 | :param requests: A collection of Request objects.
128 | :param responses_for_requests: Dictionary mapping of requests to responses
129 | :param max_retries: The maximum number of retries to perform per session
130 | :param args: Additional arguments to pass into a retry mapping call
131 |
132 |
133 | """
134 | retries = []
135 | response_futures = [preq.callable() for preq in requests]
136 |
137 | for request, response_future in zip(requests, response_futures):
138 | try:
139 | response = response_future.result()
140 | if response is not None and response.status_code == 403:
141 | logging.warning('Request to {} caused a 403 response status code.'.format(request.url))
142 | raise InvalidRequestError('Access forbidden')
143 | if response is not None:
144 | responses_for_requests[request] = response
145 | except RequestException as re:
146 | logging.error('An exception was raised for {}: {}'.format(request.url, re))
147 | if self.total_retries > 0:
148 | self.total_retries -= 1
149 | retries.append(request)
150 |
151 | # Recursively retry failed requests with the modified total retry count
152 | if retries:
153 | self.map_with_retries(retries, responses_for_requests)
154 |
155 |
156 | class MultiRequest(object):
157 |
158 | """Wraps requests-futures to make simultaneous HTTP requests.
159 |
160 | Can use a RateLimiter to limit # of outstanding requests.
161 | Can also use AvailabilityLimiter to limit total # of request issuance threshold.
162 | `multi_get` and `multi_post` try to be smart about how many requests to issue:
163 |
164 | * One url & one param - One request will be made.
165 | * Multiple url & one query param - Multiple requests will be made, with differing urls and the same query param.
166 | * Multiple url & multiple query params - Multiple requests will be made, with the same url and differing query params.
167 | """
168 |
169 | _VERB_GET = 'GET'
170 | _VERB_POST = 'POST'
171 |
172 | def __init__(
173 | self, default_headers=None, max_requests=10, rate_limit=0,
174 | req_timeout=None, max_retry=10, total_retry=100, drop_404s=False,
175 | ):
176 | """Create the MultiRequest.
177 |
178 | Args:
179 | default_headers - A dict of headers which will be added to every request
180 | max_requests - Maximum number of requests to issue at once
181 | rate_limit - Maximum number of requests to issue per second
182 | req_timeout - Maximum number of seconds to wait without reading a response byte before deciding an error has occurred
183 | max_retry - The total number of attempts to retry a single batch of requests
184 | total_retry - The total number of request retries that can be made through the entire session
185 | Note there is a difference between `max_retry` and `total_retry`:
186 | - `max_retry` refers to how many times a batch of requests will be re-issued collectively
187 | - `total_retry` refers to a limit on the total number of outstanding requests made
188 | Once the latter is exhausted, no failed request within the whole session will be retried.
189 | """
190 | self._default_headers = default_headers
191 | self._max_requests = max_requests
192 | self._req_timeout = req_timeout or 25.0
193 | self._max_retry = max_retry
194 | self._drop_404s = drop_404s
195 | self._rate_limiter = RateLimiter(rate_limit) if rate_limit else None
196 | self._availability_limiter = AvailabilityLimiter(total_retry) if total_retry else None
197 | self._session = FuturesSession(max_workers=max_requests)
198 | retries = Retry(total=0, status_forcelist=[500, 502, 503, 504], raise_on_status=True)
199 | self._session.mount(
200 | 'https://', SSLAdapter(
201 | max_retries=retries, pool_maxsize=max_requests, pool_connections=max_requests,
202 | ),
203 | )
204 |
205 | def multi_get(self, urls, query_params=None, to_json=True, file_download=False):
206 | """Issue multiple GET requests.
207 |
208 | Args:
209 | urls - A string URL or list of string URLs
210 | query_params - None, a dict, or a list of dicts representing the query params
211 | to_json - A boolean, should the responses be returned as JSON blobs
212 | file_download - A boolean, whether a file download is expected
213 |
214 | Returns:
215 | a list of dicts if to_json is set of requests.response otherwise.
216 | Raises:
217 | InvalidRequestError - Can not decide how many requests to issue.
218 | """
219 | return self._multi_request(
220 | MultiRequest._VERB_GET, urls, query_params,
221 | data=None, to_json=to_json, file_download=file_download,
222 | )
223 |
224 | def multi_post(self, urls, query_params=None, data=None, to_json=True, send_as_file=False):
225 | """Issue multiple POST requests.
226 |
227 | Args:
228 | urls - A string URL or list of string URLs
229 | query_params - None, a dict, or a list of dicts representing the query params
230 | data - None, a dict or string, or a list of dicts and strings representing the data body.
231 | to_json - A boolean, should the responses be returned as JSON blobs
232 | send_as_file - A boolean, should the data be sent as a file.
233 | Returns:
234 | a list of dicts if to_json is set of requests.response otherwise.
235 | Raises:
236 | InvalidRequestError - Can not decide how many requests to issue.
237 | """
238 | return self._multi_request(
239 | MultiRequest._VERB_POST, urls, query_params,
240 | data, to_json=to_json, send_as_file=send_as_file,
241 | )
242 |
243 | def _create_request(self, verb, url, query_params=None, data=None, send_as_file=False):
244 | """Helper method to create a single post/get requests.
245 |
246 | Args:
247 | verb - MultiRequest._VERB_POST or MultiRequest._VERB_GET
248 | url - A string URL
249 | query_params - None or a dict
250 | data - None or a string or a dict
251 | send_as_file - A boolean, should the data be sent as a file.
252 | Returns:
253 | requests.PreparedRequest
254 | Raises:
255 | InvalidRequestError - if an invalid verb is passed in.
256 | """
257 |
258 | # Prepare a set of kwargs to make it easier to avoid missing default params.
259 | kwargs = {
260 | 'headers': self._default_headers,
261 | 'params': query_params,
262 | 'timeout': self._req_timeout,
263 | }
264 |
265 | if MultiRequest._VERB_POST == verb:
266 | if send_as_file:
267 | kwargs['files'] = {'file': data}
268 | else:
269 | kwargs['data'] = data
270 | return PreparedRequest(partial(self._session.post, url, **kwargs), url)
271 | elif MultiRequest._VERB_GET == verb:
272 | return PreparedRequest(partial(self._session.get, url, **kwargs), url)
273 | else:
274 | raise InvalidRequestError('Invalid verb {0}'.format(verb))
275 |
276 | def _zip_request_params(self, urls, query_params, data):
277 | """Massages inputs and returns a list of 3-tuples zipping them up.
278 |
279 | This is all the smarts behind deciding how many requests to issue.
280 | It's fine for an input to have 0, 1, or a list of values.
281 | If there are two inputs each with a list of values, the cardinality of those lists much match.
282 |
283 | Args:
284 | urls - 1 string URL or a list of URLs
285 | query_params - None, 1 dict, or a list of dicts
286 | data - None, 1 dict or string, or a list of dicts or strings
287 | Returns:
288 | A list of 3-tuples (url, query_param, data)
289 | Raises:
290 | InvalidRequestError - if cardinality of lists does not match
291 | """
292 |
293 | # Everybody gets to be a list
294 | if not isinstance(urls, list):
295 | urls = [urls]
296 | if not isinstance(query_params, list):
297 | query_params = [query_params]
298 | if not isinstance(data, list):
299 | data = [data]
300 |
301 | # Counts must not mismatch
302 | url_count = len(urls)
303 | query_param_count = len(query_params)
304 | data_count = len(data)
305 |
306 | max_count = max(url_count, query_param_count, data_count)
307 |
308 | if (
309 | max_count > url_count > 1
310 | or max_count > query_param_count > 1
311 | or max_count > data_count > 1
312 | ):
313 | raise InvalidRequestError(
314 | 'Mismatched parameter count url_count:{0} query_param_count:{1} data_count:{2} max_count:{3}',
315 | url_count, query_param_count, data_count, max_count,
316 | )
317 |
318 | # Pad out lists
319 | if url_count < max_count:
320 | urls = urls * max_count
321 | if query_param_count < max_count:
322 | query_params = query_params * max_count
323 | if data_count < max_count:
324 | data = data * max_count
325 |
326 | return list(zip(urls, query_params, data))
327 |
    def _wait_for_response(self, requests):
        """Issues a batch of requests and waits for the responses.
        If some of the requests fail it will retry the failed ones up to `_max_retry` times.

        Args:
            requests - A list of requests
        Returns:
            A list of `requests.models.Response` objects, in the same order as
            the input requests; an entry is None when its request never
            received a successful response.
        Raises:
            InvalidRequestError - if any of the requests returns "403 Forbidden" response
        """
        failed_requests = []
        # Ordered mapping request -> response; fromkeys initializes every
        # value to None and preserves the caller's request order.
        responses_for_requests = OrderedDict.fromkeys(requests)

        for retry in range(self._max_retry):
            try:
                logging.debug('Try #{0}'.format(retry + 1))
                # NOTE(review): __init__ sets _availability_limiter to None when
                # total_retry is falsy; that would raise AttributeError here on
                # every iteration (swallowed by the broad except below) and the
                # method would return all-None responses — confirm intended.
                self._availability_limiter.map_with_retries(requests, responses_for_requests)

                failed_requests = []
                for request, response in responses_for_requests.items():
                    # Optionally treat 404s as final: log and do not retry them.
                    if self._drop_404s and response is not None and response.status_code == 404:
                        logging.warning('Request to {0} failed with status code 404, dropping.'.format(request.url))
                    elif not response:
                        # Falsy response: either None (no response at all) or a
                        # requests.Response with an error status (Response is
                        # falsy for 4xx/5xx status codes).
                        failed_requests.append((request, response))

                if not failed_requests:
                    break

                logging.warning('Try #{0}. Expected {1} successful response(s) but only got {2}.'.format(
                    retry + 1, len(requests), len(requests) - len(failed_requests),
                ))

                # retry only for the failed requests
                requests = [fr[0] for fr in failed_requests]
            except InvalidRequestError:
                # Fatal (e.g. 403 Forbidden) — propagate instead of retrying.
                raise
            except Exception as e:
                # log the exception for the informative purposes and pass to the next iteration
                logging.exception('Try #{0}. Exception occured: {1}. Retrying.'.format(retry + 1, e))
                pass

        if failed_requests:
            logging.warning('Still {0} failed request(s) after {1} retries:'.format(
                len(failed_requests), self._max_retry,
            ))
            for failed_request, failed_response in failed_requests:
                if failed_response is not None:
                    # in case response text does contain some non-ascii characters
                    failed_response_text = failed_response.text.encode('ascii', 'xmlcharrefreplace')
                    logging.warning('Request to {0} failed with status code {1}. Response text: {2}'.format(
                        failed_request.url, failed_response.status_code, failed_response_text,
                    ))
                else:
                    logging.warning('Request to {0} failed with None response.'.format(failed_request.url))

        return list(responses_for_requests.values())
385 |
386 | def _handle_file_download(self, response):
387 | name = None
388 | data = None
389 | try:
390 | name = re.findall('filename=(.+)', response.headers['content-disposition'])[0]
391 | data = urlsafe_b64encode(response.text.encode('utf-8')).decode('utf-8')
392 | except Exception:
393 | logging.exception('Unable to extract download data for {} '.format(response.request.url))
394 | return {'data': {'id': name, 'text': data}}
395 |
396 | def _convert_to_json(self, response):
397 | """Converts response to JSON.
398 | If the response cannot be converted to JSON then `None` is returned.
399 |
400 | Args:
401 | response - An object of type `requests.models.Response`
402 | Returns:
403 | Response in JSON format if the response can be converted to JSON. `None` otherwise.
404 | """
405 | try:
406 | return response.json()
407 | except ValueError:
408 | logging.warning('Expected response in JSON format from {0} but the actual response text is: {1}'.format(
409 | response.request.url, response.text,
410 | ))
411 | return None
412 |
413 | def _multi_request(self, verb, urls, query_params, data, to_json=True, send_as_file=False, file_download=False):
414 | """Issues multiple batches of simultaneous HTTP requests and waits for responses.
415 |
416 | Args:
417 | verb - MultiRequest._VERB_POST or MultiRequest._VERB_GET
418 | urls - A string URL or list of string URLs
419 | query_params - None, a dict, or a list of dicts representing the query params
420 | data - None, a dict or string, or a list of dicts and strings representing the data body.
421 | to_json - A boolean, should the responses be returned as JSON blobs
422 | Returns:
423 | If multiple requests are made - a list of dicts if to_json, a list of requests responses otherwise
424 | If a single request is made, the return is not a list
425 | Raises:
426 | InvalidRequestError - if no URL is supplied or if any of the requests returns 403 Access Forbidden response
427 | """
428 | if not urls:
429 | raise InvalidRequestError('No URL supplied')
430 |
431 | # Break the params into batches of request_params
432 | request_params = self._zip_request_params(urls, query_params, data)
433 | batch_of_params = [
434 | request_params[pos:pos + self._max_requests]
435 | for pos in range(0, len(request_params), self._max_requests)
436 | ]
437 |
438 | # Iteratively issue each batch, applying the rate limiter if necessary
439 | all_responses = []
440 | for param_batch in batch_of_params:
441 | if self._rate_limiter:
442 | self._rate_limiter.make_calls(num_calls=len(param_batch))
443 |
444 | prepared_requests = [
445 | self._create_request(
446 | verb, url, query_params=query_param, data=datum, send_as_file=send_as_file,
447 | ) for url, query_param, datum in param_batch
448 | ]
449 |
450 | responses = self._wait_for_response(prepared_requests)
451 | for response in responses:
452 | if response and not file_download:
453 | all_responses.append(self._convert_to_json(response) if to_json else response)
454 | elif file_download:
455 | all_responses.append(self._handle_file_download(response))
456 | else:
457 | all_responses.append(None)
458 |
459 | return all_responses
460 |
461 | def post_file(self, url, file, to_json=True):
462 | request = self._create_request(MultiRequest._VERB_POST, url)
463 | return request
464 |
465 | @classmethod
466 | def error_handling(cls, fn):
467 | """Decorator to handle errors"""
468 | def wrapper(*args, **kwargs):
469 | try:
470 | result = fn(*args, **kwargs)
471 | return result
472 | except InvalidRequestError as e:
473 | write_exception(e)
474 |
475 | if hasattr(e, 'request'):
476 | write_error_message('request {0}'.format(repr(e.request)))
477 | if hasattr(e, 'response'):
478 | write_error_message('response {0}'.format(repr(e.response)))
479 |
480 | raise e
481 | return wrapper
482 |
--------------------------------------------------------------------------------