├── .github └── workflows │ └── codeql-analysis.yml ├── .gitignore ├── .travis.yml ├── Code_Of_Conduct.md ├── Dockerfile ├── LICENSE.txt ├── README.md ├── images ├── README.md ├── machinae-square.jpg ├── machinae.jpg ├── robot-plainer.jpg └── t-machinae.jpg ├── machinae.yml ├── pylintrc ├── requirements.txt ├── setup.py └── src └── machinae ├── __init__.py ├── cmd.py ├── outputs.py ├── sites ├── __init__.py ├── base.py ├── csv.py ├── html.py ├── ipwhois.py ├── json.py └── rss.py └── utils.py /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [master, ] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: [master] 9 | schedule: 10 | - cron: '0 2 * * 6' 11 | 12 | jobs: 13 | analyze: 14 | name: Analyze 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v2 20 | with: 21 | # We must fetch at least the immediate parents so that if this is 22 | # a pull request then we can checkout the head. 23 | fetch-depth: 2 24 | 25 | # If this run was triggered by a pull request event, then checkout 26 | # the head of the pull request instead of the merge commit. 27 | - run: git checkout HEAD^2 28 | if: ${{ github.event_name == 'pull_request' }} 29 | 30 | # Initializes the CodeQL tools for scanning. 31 | - name: Initialize CodeQL 32 | uses: github/codeql-action/init@v1 33 | # Override language selection by uncommenting this and choosing your languages 34 | with: 35 | languages: python 36 | 37 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 38 | # If this step fails, then you should remove it and run the build manually (see below) 39 | - name: Autobuild 40 | uses: github/codeql-action/autobuild@v1 41 | 42 | # ℹ️ Command-line programs to run using the OS shell. 
43 | # 📚 https://git.io/JvXDl 44 | 45 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 46 | # and modify them (or add more) to build your code if your project 47 | # uses a compiled language 48 | 49 | #- run: | 50 | # make bootstrap 51 | # make release 52 | 53 | - name: Perform CodeQL Analysis 54 | uses: github/codeql-action/analyze@v1 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | .python-version 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | *.pot 47 | 48 | # Django stuff: 49 | *.log 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | 57 | #Vim 58 | *.swp 59 | 60 | # Divshot 61 | .divshot-cache/ 62 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.6" 5 | - "3.7" 6 | - "3.8" 7 | - "3.9-dev" 8 | 9 | before_install: 10 | - pip install pandoc 11 | - pip install pylint 12 | script: 13 | - pip install -r requirements.txt 14 | - pylint src/machinae/*.py 15 | - pylint src/machinae/sites/*.py 16 | 17 | deploy: 18 | provider: pypi 19 | user: billfordx 20 | password: 21 | secure: PyOibJ0cErm9yCOfgWvToefrnCrwt3iw7H4eU7hdg4x73DXyqVRNHaJDTvfiVWJyJSNRxPe2r80v7VzUKO24Lqgp7FEpf+4dNbEJtJJEis93vYxOerYXthO/VUIh3yk7ULq9YIAn+65XgNRUk/YllebvOpHLnwNh8FQn63HesDVkCrcuiNFjALqC3SNKcg8vQxrBJzXo+f36a45BgZiQ20qZ8czechXKhi1UVWdQ8ezS/+4YAZcdudD3A0+qnfPd0ve0zfpIrm7ZsyQ9jyDXtnWw7QlOLOuQcT3o4OH9WHrtxjrFONtjg4zZnT9gygxUycgWz2NNVqVWx57ZkZImjAVaf8p7Ym/0DKLuMix2f+K5iMtVlKtYnb8ZKCj4UuaNrNmHrbDj7PasckezKbQF+TwMW9UoG54qh3q1fa+l13rZ3kTcjxg1Wn5RXv6/aw/i+3TGHW2hO0eWxAjgRl741NAzZDVuh0PAenYK8DETT2ZUIlU3VnzCbzi6jTunwV8UsToERHyla3GuiykTlmIOb/3THYIs+n7kffH89b1GlOj/+joLWL12AY5dG4zrhv2VYqt+erJ65K34/nJLk19S+KPqIpYKn/dj1cGzE3y2awiADR4nJbDH87BioqjTQ1fV8bxwPmyl0bGEzOoH9DQnFy/hAc6E9RNWkDIJKUUOEH0= 22 | on: 23 | tags: true 24 | branch: master 25 | -------------------------------------------------------------------------------- /Code_Of_Conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## 
Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at billford+gitmach@billford.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | RUN pip3 install machinae 4 | 5 | #make sure you have a machinae.yml file to build with 6 | COPY machinae.yml /etc 7 | 8 | ENTRYPOINT ["/usr/local/bin/machinae"] 9 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Hurricane Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/HurricaneLabs/machinae.svg?branch=master)](https://travis-ci.org/HurricaneLabs/machinae) 2 | [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/2344/badge)](https://bestpractices.coreinfrastructure.org/projects/2344) 3 | [![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=HurricaneLabs_machinae&metric=alert_status)](https://sonarcloud.io/dashboard?id=HurricaneLabs_machinae) 4 | 5 | ![Machinae Logo](images/machinae.jpg) 6 | 7 | 8 | Machinae Security Intelligence Collector 9 | ======================================== 10 | 11 | Machinae is a tool for collecting intelligence from public sites/feeds about 12 | various security-related pieces of data: IP addresses, domain names, URLs, 13 | email addresses, file hashes and SSL fingerprints. It was inspired by 14 | [Automater][1], another excellent tool for collecting information. The Machinae 15 | project was born from wishing to improve Automater in 4 areas: 16 | 17 | 1. Codebase - Bring Automater to python3 compatibility while making the code 18 | more pythonic 19 | 2. Configuration - Use a more human readable configuration format (YAML) 20 | 3. Inputs - Support JSON parsing out-of-the-box without the need to write 21 | regular expressions, but still support regex scraping when needed 22 | 4. 
Outputs - Support additional output types, including JSON, while making 23 | extraneous output optional 24 | 25 | 26 | Installation 27 | ------------ 28 | 29 | Machinae can be installed using pip3: 30 | 31 | pip3 install machinae 32 | 33 | Or, if you're feeling adventurous, can be installed directly from github: 34 | 35 | pip3 install git+https://github.com/HurricaneLabs/machinae.git 36 | 37 | You will need to have whatever dependencies are required on your system for 38 | compiling Python modules (on Debian based systems, `python3-dev`), as well as 39 | the libyaml development package (on Debian based systems, `libyaml-dev`). 40 | 41 | You'll also want to grab the [latest configuration file][2] and place it in 42 | `/etc/machinae.yml`. 43 | 44 | 45 | Configuration File 46 | ------------------ 47 | 48 | Machinae supports a simple configuration merging system to allow you to make 49 | adjustments to the configuration without modifying the machinae.yml we provide 50 | you, making configuration updates a snap. This is done by finding a system-wide 51 | default configuration (default `/etc/machinae.yml`), merging into that a 52 | system-wide local configuration (`/etc/machinae.local.yml`) and finally a 53 | per-user local configuration (`~/.machinae.yml`). The system-wide configuration 54 | can also be located in the current working directory, can be set using the 55 | `MACHINAE_CONFIG` environment variable, or of course by using the `-c` or 56 | `--config` command line options. Configuration merging can be disabled by 57 | passing the `--nomerge` option, which will cause Machinae to only load the 58 | default system-wide configuration (or the one passed on the command line). 59 | 60 | As an example of this, say you'd like to enable the Fortinet Category site, 61 | which is disabled by default. You could modify `/etc/machinae.yml`, but these 62 | changes would be overwritten by an update. 
Instead, you can put the following 63 | in either `/etc/machinae.local.yml` or `~/.machinae.yml`: 64 | 65 | fortinet_classify: 66 | default: true 67 | 68 | Or, conversely, to disable a site, such as Virus Total pDNS: 69 | 70 | vt_ip: 71 | default: false 72 | vt_domain: 73 | default: false 74 | 75 | 76 | Usage 77 | ----- 78 | 79 | Machinae usage is very similar to Automater: 80 | 81 | usage: machinae [-h] [-c CONFIG] [--nomerge] [-d DELAY] [-f FILE] [-i INFILE] [-v] 82 | [-o {D,J,N,S}] [-O {ipv4,ipv6,fqdn,email,sslfp,hash,url}] [-q] 83 | [-s SITES] [-a AUTH] [-H HTTP_PROXY] 84 | [--dump-config | --detect-otype] 85 | ... 86 | 87 | - See above for details on the `-c`/`--config` and `--nomerge` options. 88 | 89 | - Machinae supports a `-d`/`--delay` option, like Automater. However, Machinae 90 | uses 0 by default. 91 | 92 | - Machinae output is controlled by two arguments: 93 | - `-o` controls the output format, and can be followed by a single character 94 | to indicate the desired type of output: 95 | - *N* is the default output ("Normal") 96 | - *D* is the default output, but dot characters are replaced 97 | - *J* is JSON output 98 | - `-f`/`--file` specifies the file where output should be written. The default 99 | is "-" for stdout. 100 | 101 | - Machinae will attempt to auto-detect the type of target passed in (Machinae 102 | refers to targets as "observables" and the type as "otype"). This detection can 103 | be overridden with the `-O`/`--otype` option. The choices are listed in the 104 | usage 105 | 106 | - By default, Machinae operates in verbose mode. In this mode, it will output 107 | status information about the services it is querying on the console as they are 108 | queried. This output will always be written to stdout, regardless of the output 109 | setting. 
To disable verbose mode, use `-q` 110 | 111 | - By default, Machinae will run through all services in the configuration that 112 | apply to each target's otype *and* are not marked as "default: false". To modify 113 | this behavior, you can: 114 | - Pass a comma separated list of sites to run (use the top level key from the 115 | configuration). 116 | - Pass the special keyword `all` to run through all services *including* those 117 | marked as "default: false" 118 | 119 | Note that in both cases, `otype` validation is still applied. 120 | 121 | - Machinae supports passing an HTTP proxy on the command line using the 122 | `-H`/`--http-proxy` argument. If no proxy is specified, machinae will search the 123 | standard `HTTP_PROXY` and `HTTPS_PROXY` environment variables, as well as the 124 | less standard `http_proxy` and `https_proxy` environment variables. 125 | 126 | - Lastly, a list of targets should be passed. All arguments other than the 127 | options listed above will be interpreted as targets. 
128 | 129 | 130 | Out-of-the-Box Data Sources 131 | --------------------------- 132 | 133 | Machinae comes with out-of-the-box support for the following data sources: 134 | 135 | - IPVoid 136 | - URLVoid 137 | - URL Unshortener (http://www.toolsvoid.com/unshorten-url) 138 | - Malc0de 139 | - SANS 140 | - FreeGeoIP (freegeoip.io) 141 | - Fortinet Category 142 | - VirusTotal pDNS (via web scrape - commented out) 143 | - VirusTotal pDNS (via JSON API) 144 | - VirusTotal URL Report (via JSON API) 145 | - VirusTotal File Report (via JSON API) 146 | - Reputation Authority 147 | - ThreatExpert 148 | - VxVault 149 | - ProjectHoneypot 150 | - McAfee Threat Intelligence 151 | - StopForumSpam 152 | - Cymru MHR 153 | - ICSI Certificate Notary 154 | - TotalHash (disabled by default) 155 | - DomainTools Parsed Whois (Requires API key) 156 | - DomainTools Reverse Whois (Requires API key) 157 | - DomainTools Reputation 158 | - IP WHOIS (Using RIR REST interfaces) 159 | - Hacked IP 160 | - Metadefender Cloud (Requires API key) 161 | - GreyNoise (Requires API key) 162 | - IBM XForce (Requires API key) 163 | 164 | With additional data sources on the way. 165 | 166 | HTTP Basic Authentication and Configuration 167 | ------------------------------------------- 168 | 169 | Machinae supports HTTP Basic Auth for sites that require it through the `--auth/-a` 170 | flag. You will need to create a YAML file with your credentials, which will include 171 | a key to the site that requires the credentials and a list of two items, username 172 | and password or API key. For example, for the included PassiveTotal site this might 173 | look like: 174 | 175 | passivetotal: ['myemail@example.com', 'my_api_key'] 176 | 177 | Inside the site configuration under `request` you will see a key such as: 178 | 179 | json: 180 | request: 181 | url: '...' 182 | auth: passivetotal 183 | 184 | The `auth: passivetotal` points to the key inside the authentication config passed 185 | via the command line. 
186 | 187 | ### Disabled by default 188 | 189 | The following sites are disabled by default 190 | 191 | - Fortinet Category (`fortinet_classify`) 192 | - Telize Geo IP (`telize`) 193 | - TotalHash (`totalhash_ip`) 194 | - DomainTools Parsed Whois (`domaintools_parsed_whois`) 195 | - DomainTools Reverse Whois (`domaintools_reverse_whois`) 196 | - DomainTools Reputation (`domaintools_reputation`) 197 | - PassiveTotal Passive DNS (`passivetotal_pdns`) 198 | - PassiveTotal Whois (`passivetotal_whois`) 199 | - PassiveTotal SSL Certificate History (`passivetotal_sslcert`) 200 | - PassiveTotal Host Attribute Components (`passivetotal_components`) 201 | - PassiveTotal Host Attribute Trackers (`passivetotal_trackers`) 202 | - MaxMind GeoIP2 Passive Insight (`maxmind`) 203 | - FraudGuard (`fraudguard`) 204 | - Shodan (`shodan`) 205 | - Hacked IP 206 | - Metadefender Cloud (Requires API key) 207 | - GreyNoise (Requires API key) 208 | - IBM XForce (Requires API key) 209 | 210 | Output Formats 211 | -------------- 212 | 213 | Machinae comes with a limited set of output formats: normal, normal with dot 214 | escaping, and JSON. We plan to add additional output formats in the future. 
215 | 216 | 217 | Adding additional sites 218 | ----------------------- 219 | 220 | *** COMING SOON *** 221 | 222 | 223 | Known Issues 224 | ------------ 225 | 226 | - Some ISP's on IPvoid contain double-encoded HTML entities, which are not 227 | double-decoded 228 | 229 | 230 | Upcoming Features 231 | ----------------- 232 | 233 | - Add IDS rule search functionality (VRT/ET) 234 | - Add "More info" link for sites 235 | - Add "dedup" option to parser settings 236 | - Add option for per-otype request settings 237 | - Add custom per-site output for error codes 238 | 239 | 240 | Version History 241 | --------------- 242 | 243 | ### Version 1.4.9 (2020-11-25) ### 244 | - Fix bug in JSON as_time processing when time is an epoch time, but str type 245 | 246 | ### Version 1.4.1 (2018-08-31) ### 247 | - New Features 248 | - Automatically Defangs output 249 | - MISP Support (example added to machinae.yml) 250 | 251 | ### Version 1.4.0 (2016-04-20) ### 252 | - New features 253 | - "-a"/"--auth" option for passing an auth config file 254 | - Thanks johannestaas for the submission 255 | - "-H"/"--http-proxy" option, and environment support, for HTTP proxies 256 | - New sites 257 | - Passivetotal (various forms, thanks johannestaas) 258 | - MaxMind 259 | - FraudGuard 260 | - Shodan 261 | - Updated sites 262 | - FreeGeoIP (replaced freegeoip.net with freegeoip.io) 263 | 264 | ### Version 1.3.4 (2016-04-01) ### 265 | - Bug fixes 266 | - Convert exceptions to str when outputting to JSON 267 | - Should actually close #14 268 | 269 | ### Version 1.3.3 (2016-03-28) ### 270 | - Bug fixes 271 | - Correctly handle error results when outputting to JSON 272 | - Closes #14 273 | - Thanks Den1al for the bug report 274 | 275 | ### Version 1.3.2 (2016-03-10) ### 276 | - New features 277 | - "Short" output mode - simply output yes/no/error for each site 278 | - "-i"/"--infile" option for passing a file with list of targets 279 | 280 | ### Version 1.3.1 (2016-03-08) ### 281 | 282 | - New 
features 283 | - Prepend "http://" to URL targets when not starting with http:// or https:// 284 | 285 | ### Version 1.3.0 (2016-03-07) ### 286 | 287 | - New sites 288 | - Cymon.io - Threat intel aggregator/tracker by eSentire 289 | - New features 290 | - Support simple paginated responses 291 | - Support url encoding 'target' in request URL 292 | - Support url decoding values in results 293 | 294 | ### Version 1.2.0 (2016-02-16) ### 295 | 296 | - New features 297 | - Support for sites returning multiple JSON documents 298 | - Ability to specify time format for relative time parameters 299 | - Ability to parse Unix timestamps in results and display in ISO-8601 format 300 | - Ability to specify status codes to ignore per-API 301 | - New sites 302 | - DNSDB - FarSight Security Passive DNS Data base (premium) 303 | 304 | ### Version 1.1.2 (2015-11-26) ### 305 | 306 | - New sites 307 | - Telize (premium) - GeoIP site (premium) 308 | - Freegeoip - GeoIP site (free) 309 | - CIF - CIFv2 API support, from csirtgadgets.org 310 | - New features 311 | - Ability to specify labels for single-line multimatch JSON outputs 312 | - Ability to specify relative time parameters using relatime library 313 | 314 | ### Version 1.0.1 (2015-10-13) ### 315 | 316 | - Fixed a false-positive bug with Spamhaus (Github#10) 317 | 318 | ### Version 1.0.0 (2015-07-02) ### 319 | 320 | - Initial release 321 | 322 | 323 | License Info 324 | ------------ 325 | 326 | The MIT License (MIT) 327 | 328 | Copyright (c) 2015 Hurricane Labs LLC 329 | 330 | Permission is hereby granted, free of charge, to any person obtaining a copy 331 | of this software and associated documentation files (the "Software"), to deal 332 | in the Software without restriction, including without limitation the rights 333 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 334 | copies of the Software, and to permit persons to whom the Software is 335 | furnished to do so, subject to the following conditions: 
336 | 337 | The above copyright notice and this permission notice shall be included in 338 | all copies or substantial portions of the Software. 339 | 340 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 341 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 342 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 343 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 344 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 345 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 346 | THE SOFTWARE. 347 | 348 | 349 | [1]: https://github.com/1aN0rmus/TekDefense-Automater 350 | [2]: https://github.com/HurricaneLabs/machinae/raw/master/machinae.yml 351 | -------------------------------------------------------------------------------- /images/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /images/machinae-square.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HurricaneLabs/machinae/9ef3e6ce1a8d4ad00107ca206e72bf8dc09878f1/images/machinae-square.jpg -------------------------------------------------------------------------------- /images/machinae.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HurricaneLabs/machinae/9ef3e6ce1a8d4ad00107ca206e72bf8dc09878f1/images/machinae.jpg -------------------------------------------------------------------------------- /images/robot-plainer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HurricaneLabs/machinae/9ef3e6ce1a8d4ad00107ca206e72bf8dc09878f1/images/robot-plainer.jpg 
-------------------------------------------------------------------------------- /images/t-machinae.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HurricaneLabs/machinae/9ef3e6ce1a8d4ad00107ca206e72bf8dc09878f1/images/t-machinae.jpg -------------------------------------------------------------------------------- /machinae.yml: -------------------------------------------------------------------------------- 1 | ipwhois: 2 | name: IP Whois 3 | otypes: 4 | - ipv4 5 | ipwhois: 6 | results: 7 | - key: '@' 8 | multi_match: 9 | keys: 10 | - asn 11 | - asn_cidr 12 | - asn_date 13 | - asn_registry 14 | - asn_country_code 15 | pretty_name: ASN Information 16 | - key: nets 17 | multi_match: 18 | keys: 19 | - cidr 20 | - handle 21 | - name 22 | - range 23 | pretty_name: Network Information 24 | - key: nets 25 | multi_match: 26 | keys: 27 | - description 28 | - key: created 29 | regex: '(\d+-\d+-\d+)T' 30 | - key: updated 31 | regex: '(\d+-\d+-\d+)T' 32 | pretty_name: Registration Info 33 | - key: nets 34 | multi_match: 35 | keys: 36 | - city 37 | - state 38 | - postal_code 39 | - country 40 | pretty_name: Registration Locality 41 | # For when we use RWS 42 | - key: nets 43 | multi_match: 44 | keys: 45 | - key: abuse_emails 46 | split: "\n" 47 | pretty_name: Abuse Email 48 | - key: nets 49 | multi_match: 50 | keys: 51 | - key: tech_emails 52 | split: "\n" 53 | pretty_name: Tech Email 54 | # For when we fall back to regular whois 55 | - key: nets 56 | multi_match: 57 | keys: 58 | - key: emails 59 | split: "\n" 60 | pretty_name: Contacts 61 | spamhaus_ip: 62 | name: Spamhaus Zen BL 63 | default: False 64 | otypes: 65 | - ipv4 66 | webscraper: 67 | request: 68 | url: 'http://www.spamhaus.org/query/ip/{target}' 69 | method: get 70 | strip_comments: true 71 | results: 72 | - regex: '\S+ is (listed in the \w+)' 73 | values: 74 | - spamhaus_zenbl 75 | pretty_name: Spamhaus Zen BL 76 | spamhaus_domain: 77 | name: 
Spamhaus Domain BL 78 | default: False 79 | otypes: 80 | - fqdn 81 | webscraper: 82 | request: 83 | url: 'http://www.spamhaus.org/query/domain/{target}' 84 | method: get 85 | results: 86 | - regex: '\S+ is (listed in the \w+)' 87 | values: 88 | - spamhaus_dbl 89 | pretty_name: Spamhaus DBL 90 | ipvoid: 91 | name: IPVoid 92 | default: False 93 | otypes: 94 | - ipv4 95 | json: 96 | request: 97 | url: 'https://endpoint.apivoid.com/iprep/v1/pay-as-you-go/' 98 | params: 99 | key: 100 | ip: '{target}' 101 | method: get 102 | results: 103 | - key: data.report.blacklists.detections 104 | pretty_name: Number of detections 105 | - key: data.report.blacklists.detection_rate 106 | pretty_name: IP Void Detection Rate 107 | - key: data.report.blacklists.engines 108 | pretty_name: Engines 109 | multi_match: 110 | keys: 111 | - engine 112 | - reference 113 | onlyif: detected 114 | 115 | urlvoid: 116 | name: URLVoid 117 | otypes: 118 | - fqdn 119 | webscraper: 120 | request: 121 | url: 'http://www.urlvoid.com/scan/{target}' 122 | method: get 123 | results: 124 | - regex: 'Analysis Date<\/td>(.+?)<\/td>' 125 | values: urlvoid_analysis_date 126 | pretty_name: Last Analysis 127 | - regex: '(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}).{5,30}Find\swebsites\shosted\shere' 128 | values: urlvoid_ip 129 | pretty_name: IP from URLVoid 130 | - regex: '\/>(.+?)<\/td><\/i>' 131 | values: urlvoid_blacklist 132 | pretty_name: Blacklist from URL Void 133 | - regex: 'Domain\s1st\sRegistered.+\(.+)\<\/td\>' 134 | values: urlvoid_domain_age 135 | pretty_name: Domain Age from URL Void 136 | - regex: 'latitude\s/\slongitude.+\(.+)\<\/td\>' 137 | values: urlvoid_location 138 | pretty_name: Geo Coordinates from URLVoid 139 | - regex: 'alt="flag"\s/>\s\(\w+\)\s+([\w\s]+)' 140 | values: urlvoid_country_code 141 | pretty_name: Country from URLVoid 142 | unshorten: 143 | name: URL Unshorten 144 | otypes: 145 | - fqdn 146 | - url 147 | webscraper: 148 | request: 149 | url: http://www.toolsvoid.com/unshorten-url 150 | 
method: post 151 | data: 152 | urladdr: '{target}' 153 | results: 154 | - regex: 'class="myarea">(.*?))\d{1,3})' 205 | values: 206 | - AbuseIPReports 207 | pretty_name: 'AbuseIPDB reports' 208 | 209 | - regex: '((?<=most\srecent\sreport\swas\s)\d{1,3}\s\w+\s\w+)' 210 | values: 211 | - Last_seen 212 | pretty_name: 'Last seen' 213 | 214 | RansomwareTracker: 215 | name: RansomwareTracker 216 | otypes: 217 | - ipv4 218 | webscraper: 219 | request: 220 | url: 'https://ransomwaretracker.abuse.ch/host/{target}' 221 | method: get 222 | results: 223 | - regex: '((?<=Host\sStatus:)\w+)' 224 | values: 225 | - Active 226 | pretty_name: 'Host Status' 227 | - regex: '((?<=\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})' 228 | values: 229 | - Last_seen 230 | pretty_name: 'Last seen' 231 | - regex: '((?<=Malware:)\w+)' 232 | values: 233 | - ransomwareType 234 | pretty_name: 'Ransomware Type' 235 | 236 | sans: 237 | name: SANS 238 | otypes: 239 | - ipv4 240 | webscraper: 241 | request: 242 | url: 'https://isc.sans.edu/api/ip/{target}' 243 | method: get 244 | results: 245 | - regex: 'attacks>(\d+)<' 246 | values: 247 | - sans_attacks 248 | pretty_name: SANS attacks 249 | - regex: 'count>(\d+)<' 250 | values: 251 | - sans_count 252 | pretty_name: SANS count 253 | - regex: 'count>(\d+)<' 254 | values: 255 | - sans_count 256 | pretty_name: SANS count 257 | - regex: 'maxdate>(\d{4}-\d{2}-\d{2})<' 258 | values: 259 | - sans_maxdate 260 | pretty_name: SANS maxdate 261 | - regex: 'mindate>(\d{4}-\d{2}-\d{2})<' 262 | values: 263 | - sans_mindate 264 | pretty_name: SANS mindate 265 | telize: 266 | name: Telize GeoIP 267 | default: False 268 | otypes: 269 | - ipv4 270 | json: 271 | request: 272 | url: 'https://telize-v1.p.rapidapi.com/location/{target}' 273 | method: get 274 | headers: 275 | x-rapidapi-host: telize-v1.p.rapidapi.com 276 | x-rapidapi-key: 277 | Accept: application/json 278 | results: 279 | - key: continent_code 280 | pretty_name: GeoIP Continent Code 
281 | - key: country_code 282 | pretty_name: GeoIP Country Code 283 | - key: country 284 | pretty_name: GeoIP Country 285 | - key: region_code 286 | pretty_name: GeoIP Region Code 287 | - key: region 288 | pretty_name: GeoIP Region 289 | - key: city 290 | pretty_name: GeoIP City 291 | - key: postal_code 292 | pretty_name: GeoIP Zip Code 293 | - key: latitude 294 | pretty_name: GeoIP Latitude 295 | - key: longitude 296 | pretty_name: GeoIP Longitude 297 | - key: timezone 298 | pretty_name: GeoIP Timezone 299 | - key: offset 300 | pretty_name: GeoIP UTC Offset 301 | - key: asn 302 | pretty_name: GeoIP ASN 303 | - key: isp 304 | pretty_name: GeoIP ISP 305 | maxmind: 306 | name: MaxMind GeoIP2 Precision 307 | default: False 308 | otypes: 309 | - ipv4 310 | json: 311 | request: 312 | url: https://geoip.maxmind.com/geoip/v2.1/insights/{target} 313 | auth: maxmind 314 | results: 315 | - key: country.iso_code 316 | pretty_name: MaxMind Country Code 317 | - key: country.names.en 318 | pretty_name: MaxMind Country 319 | - key: subdivisions 320 | multi_match: 321 | keys: 322 | - iso_code 323 | pretty_name: MaxMind Region Code 324 | - key: subdivisions 325 | multi_match: 326 | keys: 327 | - names.en 328 | pretty_name: MaxMind Region 329 | - key: city.names.en 330 | pretty_name: MaxMind City 331 | - key: postal.code 332 | pretty_name: MaxMind Zip Code 333 | - key: location.latitude 334 | pretty_name: MaxMind Latitude 335 | - key: location.longitude 336 | pretty_name: MaxMind Longitude 337 | - key: location.time_zone 338 | pretty_name: MaxMind Timezone 339 | freegeoip: 340 | name: freegeoip.io 341 | default: true 342 | otypes: 343 | - ipv4 344 | # - fqdn 345 | json: 346 | request: 347 | url: https://freegeoip.io/json/{target} 348 | results: 349 | - key: country_code 350 | pretty_name: GeoIP Country Code 351 | - key: country_name 352 | pretty_name: GeoIP Country 353 | # - key: region_code 354 | # pretty_name: GeoIP Region Code 355 | # - key: region_name 356 | # pretty_name: GeoIP 
Region 357 | - key: city 358 | pretty_name: GeoIP City 359 | # - key: zip_code 360 | # pretty_name: GeoIP Zip Code 361 | # - key: latitude 362 | # pretty_name: GeoIP Latitude 363 | # - key: longitude 364 | # pretty_name: GeoIP Longitude 365 | # - key: time_zone 366 | # pretty_name: GeoIP Timezone 367 | fortinet_classify: 368 | name: Fortinet Category 369 | default: True 370 | otypes: 371 | - ipv4 372 | - fqdn 373 | - url 374 | webscraper: 375 | request: 376 | url: 'https://www.fortiguard.com/webfilter?q={target}' 377 | method: get 378 | results: 379 | - regex: 'Category:\s(.+)<\/h4>\s' 380 | values: 381 | - fortinet_category 382 | pretty_name: Fortinet URL Category 383 | vt_ip: 384 | name: VirusTotal pDNS 385 | otypes: 386 | - ipv4 387 | json: 388 | request: 389 | url: https://www.virustotal.com/vtapi/v2/ip-address/report 390 | params: 391 | ip: '{target}' 392 | apikey: 308211ef74a1044ea98134424b3d20769451d25beda0b808a8b61036badc0ea1 393 | method: get 394 | results: 395 | - key: resolutions 396 | multi_match: 397 | keys: 398 | - key: last_resolved 399 | regex: '(\d{4}\-\d{1,2}\-\d{1,2})' 400 | - hostname 401 | onlyif: 402 | key: last_resolved 403 | maxage: '-30d' 404 | pretty_name: pDNS data from VirusTotal 405 | - key: detected_urls 406 | multi_match: 407 | keys: 408 | - key: scan_date 409 | regex: '(\d{4}\-\d{1,2}\-\d{1,2})' 410 | - key: url 411 | regex: '(http.{1,70}/)' 412 | onlyif: 413 | key: scan_date 414 | maxage: '-30d' 415 | pretty_name: pDNS malicious URLs from VirusTotal 416 | # vt_ip: 417 | # name: VirusTotal pDNS 418 | # otypes: 419 | # - ip 420 | # webscraper: 421 | # request: 422 | # url: 'https://www.virustotal.com/en/ip-address/{target}/information/' 423 | # method: get 424 | # headers: 425 | # Accept: 'text/html, application/xhtml+xml, */*' 426 | # Accept-Language: 'en-US' 427 | # Accept-Encoding: 'gzip, deflate' 428 | # DNT: 1 429 | # Connection: 'Keep-Alive' 430 | # results: 431 | # - regex: 
'(\d{4}\-\d{1,2}\-\d{1,2})\s+<.{30,70}/en/domain/(.{1,80})/information' 432 | # values: 433 | # - vt_pdns_date 434 | # - vt_pdns_domain 435 | # pretty_name: 'pDNS data from VirusTotal' 436 | # - regex: '(\d{4}\-\d{1,2}\-\d{1,2}).{1,20}\s+<.{10,80}/en/url/.{1,100}/analysis/.{1,5}\s+(http.{1,70}/)' 437 | # values: 438 | # - vt_pdns_date 439 | # - vt_pdns_url 440 | # pretty_name: 'pDNS malicious URLs from VirusTotal' 441 | vt_domain: 442 | name: VirusTotal pDNS 443 | otypes: 444 | - fqdn 445 | json: 446 | request: 447 | url: https://www.virustotal.com/vtapi/v2/domain/report 448 | params: 449 | domain: '{target}' 450 | apikey: 308211ef74a1044ea98134424b3d20769451d25beda0b808a8b61036badc0ea1 451 | method: get 452 | results: 453 | - key: resolutions 454 | multi_match: 455 | keys: 456 | - key: last_resolved 457 | regex: '(\d{4}\-\d{1,2}\-\d{1,2})' 458 | - ip_address 459 | pretty_name: pDNS data from VirusTotal 460 | - key: Websense ThreatSeeker category 461 | pretty_name: Websense ThreatSeeker category 462 | - key: Webutation domain info.Safety score 463 | pretty_name: Webutation Safety score 464 | # vt_domain: 465 | # name: VirusTotal pDNS 466 | # otypes: 467 | # - fqdn 468 | # webscraper: 469 | # request: 470 | # url: 'https://www.virustotal.com/en/domain/{target}/information/' 471 | # method: get 472 | # headers: 473 | # Accept: 'text/html, application/xhtml+xml, */*' 474 | # Accept-Language: 'en-US' 475 | # Accept-Encoding: 'gzip, deflate' 476 | # DNT: 1 477 | # Connection: 'Keep-Alive' 478 | # results: 479 | # - regex: '(\d{4}\-\d{1,2}\-\d{1,2})\s+<.{30,70}/en/ip-address/(.{1,80})/information' 480 | # values: 481 | # - vt_pdns_date 482 | # - vt_pdns_ip 483 | # pretty_name: 'pDNS data from VirusTotal' 484 | # - regex: '(\d{4}\-\d{1,2}\-\d{1,2}).{1,20}\s+<.{10,80}/en/url/.{1,100}/analysis/.{1,5}\s+(http.{1,70}/)' 485 | # values: 486 | # - vt_pdns_date 487 | # - vt_pdns_url 488 | # pretty_name: 'pDNS malicious URLs from VirusTotal' 489 | vt_url: 490 | name: VirusTotal
URL Report 491 | otypes: 492 | - url 493 | json: 494 | request: 495 | url: https://www.virustotal.com/vtapi/v2/url/report 496 | method: get 497 | params: 498 | apikey: 308211ef74a1044ea98134424b3d20769451d25beda0b808a8b61036badc0ea1 499 | resource: '{target}' 500 | results: 501 | - key: scan_date 502 | pretty_name: Date submitted 503 | - key: positives 504 | pretty_name: Detected scanners 505 | - key: total 506 | pretty_name: Total scanners 507 | - key: scans 508 | pretty_name: URL Scanner 509 | multi_match: 510 | keys: 511 | - '@' 512 | - result 513 | onlyif: detected 514 | vt_hash: 515 | name: VirusTotal File Report 516 | otypes: 517 | - hash 518 | - hash.sha1 519 | - 'hash.sha256' 520 | json: 521 | request: 522 | url: https://www.virustotal.com/vtapi/v2/file/report 523 | method: get 524 | params: 525 | apikey: 308211ef74a1044ea98134424b3d20769451d25beda0b808a8b61036badc0ea1 526 | resource: '{target}' 527 | results: 528 | - key: scan_date 529 | pretty_name: Date submitted 530 | - key: positives 531 | pretty_name: Detected engines 532 | - key: total 533 | pretty_name: Total engines 534 | - key: scans 535 | pretty_name: Scans 536 | multi_match: 537 | keys: 538 | - '@' 539 | - result 540 | onlyif: detected 541 | reputation_authority: 542 | name: Reputation Authority 543 | otypes: 544 | - fqdn 545 | - ipv4 546 | webscraper: 547 | request: 548 | url: 'http://www.reputationauthority.org/lookup.php?ip={target}' 549 | method: get 550 | results: 551 | - regex: '>(\d{1,3}\/\d{1,3})' 552 | values: 553 | - ra_score 554 | pretty_name: Reputation Authority Score 555 | threatexpert: 556 | name: ThreatExpert 557 | otypes: 558 | - hash 559 | webscraper: 560 | request: 561 | url: 'http://www.threatexpert.com/report.aspx?md5={target}' 562 | method: get 563 | results: 564 | - regex: 'Submission\sreceived.\s(.+)' 565 | values: 566 | - threatexpert_date 567 | pretty_name: Hash found at ThreatExpert 568 | - regex: '1">(.{5,100})\s*(\d+-\d+)\s*\[D\]\s*(.*?)\s*\s*(.*?) 
582 | - regex: '>(\d{2}\-\d{2})<' 583 | values: 584 | - vxvault_date 585 | pretty_name: Date found at VXVault 586 | - regex: '\[D\].{2,40}\Wphp\?id.{2,10}>(.{5,100})([a-zA-Z\s]+)' 601 | values: 602 | - php_activity_type 603 | pretty_name: ProjectHoneyPot activity type 604 | - regex: '>First Received From.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])[a-zA-Z0-9><"&:,()=;\s\t/]+Number Received' 605 | values: 606 | - php_first_mail 607 | pretty_name: ProjectHoneyPot first mail received 608 | - regex: '>Last Received From.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])[a-zA-Z0-9><":,()=;\s\t/]+Number Received' 609 | values: 610 | - php_last_mail 611 | pretty_name: ProjectHoneyPot last mail received 612 | - regex: '>Number Received.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)]+[a-zA-Z\)])' 613 | values: 614 | - php_total_mail 615 | pretty_name: ProjectHoneyPot total mail received 616 | - regex: '>Spider First Seen.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])' 617 | values: 618 | - php_first_spider 619 | pretty_name: ProjectHoneyPot spider first seen 620 | - regex: '>Spider Last Seen.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)]+[a-zA-Z])' 621 | values: 622 | - php_last_spider 623 | pretty_name: ProjectHoneyPot spider last seen 624 | - regex: '>Spider Sightings.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(]+[a-zA-Z\)])' 625 | values: 626 | - php_spider_sightings 627 | pretty_name: ProjectHoneyPot total spider sightings 628 | - regex: '>User-Agents.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9\-\(\),\s]+[a-zA-Z\)])' 629 | values: 630 | - php_user_agents 631 | pretty_name: ProjectHoneyPot user-agent sightings 632 | - regex: '>First Post On.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])' 633 | values: 634 | - php_first_post 635 | pretty_name: ProjectHoneyPot first form post 636 | - regex: '>Last Post On.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])' 637 | values: 638 | - php_last_post 639 | pretty_name: ProjectHoneyPot last form post 640 | - regex: '>Form 
Posts.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)]+[a-zA-Z\)])' 641 | values: 642 | - php_form_posts 643 | pretty_name: ProjectHoneyPot total form posts 644 | - regex: '>First Rule-Break On.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])' 645 | values: 646 | - php_first_rulebreak 647 | pretty_name: ProjectHoneyPot first rule break 648 | - regex: '>Last Rule-Break On.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])' 649 | values: 650 | - php_last_rulebreak 651 | pretty_name: ProjectHoneyPot last rule break 652 | - regex: '>Rule Breaks.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)]+[a-zA-Z\)])' 653 | values: 654 | - php_total_rulebreaks 655 | pretty_name: ProjectHoneyPot total rule breaks 656 | - regex: 'Dictionary Attacks[a-zA-Z0-9><":,()=;\s\t/]+>First Received From.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])' 657 | values: 658 | - php_first_dictionary_attack 659 | pretty_name: ProjectHoneyPot first dictionary attack 660 | - regex: 'Dictionary Attacks[a-zA-Z0-9><"&:,()=;\s\t/]+>Last Received From.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])' 661 | values: 662 | - php_last_dictionary_attack 663 | pretty_name: ProjectHoneyPot last dictionary attack 664 | - regex: '>Dictionary Attacks.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)]+[a-zA-Z\)])' 665 | values: 666 | - php_total_dictionary_attacks 667 | pretty_name: ProjectHoneyPot total dictionary attacks 668 | - regex: '>First Bad Host Appearance.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])' 669 | values: 670 | - php_first_bad_host 671 | pretty_name: ProjectHoneyPot first bad host 672 | - regex: '>Last Bad Host Appearance.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])' 673 | values: 674 | - php_last_bad_host 675 | pretty_name: ProjectHoneyPot last bad host 676 | - regex: '>Bad Host Appearances.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)\-]+[a-zA-Z\)])' 677 | values: 678 | - php_total_bad_host 679 | pretty_name: ProjectHoneyPot total bad hosts 680 | - regex: '>Harvester First 
Seen.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])' 681 | values: 682 | - php_first_harvester 683 | pretty_name: ProjectHoneyPot harvester first seen 684 | - regex: '>Harvester Last Seen.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)]+[a-zA-Z])' 685 | values: 686 | - php_last_harvester 687 | pretty_name: ProjectHoneyPot harvester last seen 688 | - regex: '>Harvester Sightings.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\(\s]+[a-zA-Z\)])' 689 | values: 690 | - php_total_harvester 691 | pretty_name: ProjectHoneyPot total harvester sightings 692 | - regex: '(?:>Harvester Results(?:.+[\n\s].+[\n\s]+)\s{2,}|(?:))(?!\s)([0-9a-zA-Z.\s:,()-]+)\s{2,}' 693 | values: 694 | - php_harvester_results 695 | pretty_name: ProjectHoneyPot harvester results 696 | mcafee_threat_domain: 697 | name: McAfee Threat 698 | otypes: 699 | - fqdn 700 | webscraper: 701 | request: 702 | url: 'https://www.mcafee.com/threat-intelligence/domain/default.aspx?domain={target}' 703 | method: get 704 | results: 705 | - regex: 'ctl00_breadcrumbContent_imgRisk"[^\r\n]+title="([A-Za-z]+)"' 706 | values: 707 | - mcafee_risk 708 | pretty_name: McAfee Web Risk 709 | - regex: '
  • [\n\s]*Web\sCategory:[\n\s]*([A-Z][A-Za-z\s/,]+?)[\n\s]*
  • ' 710 | values: 711 | - mcafee_category 712 | pretty_name: McAfee Web Category 713 | - regex: '
  • [\n\s]*Last\sSeen:[\n\s]*([0-9\-]+)[\n\s]*
  • ' 714 | values: 715 | - mcafee_last_seen 716 | pretty_name: McAfee Last Seen 717 | mcafee_threat_ip: 718 | name: McAfee Threat 719 | otypes: 720 | - ipv4 721 | webscraper: 722 | request: 723 | url: 'https://www.mcafee.com/threat-intelligence/ip/default.aspx?ip={target}' 724 | method: get 725 | results: 726 | - regex: 'ctl00_breadcrumbContent_imgRisk"[^\r\n]+src="/img/Threat_IP/rep_([a-z]+)\.png"' 727 | values: 728 | - mcafee_risk 729 | pretty_name: McAfee Web Risk 730 | - regex: 'ctl00_breadcrumbContent_imgRisk1"[^\r\n]+src="/img/Threat_IP/rep_([a-z]+)\.png"' 731 | values: 732 | - mcafee_risk 733 | pretty_name: McAfee Email Risk 734 | - regex: 'ctl00_breadcrumbContent_imgRisk2"[^\r\n]+src="/img/Threat_IP/rep_([a-z]+)\.png"' 735 | values: 736 | - mcafee_risk 737 | pretty_name: McAfee Network Risk 738 | - regex: '
  • [\n\s]*Web\sCategory:[\n\s]*([A-Z][A-Za-z\s/,]+?)[\n\s]*
  • ' 739 | values: 740 | - mcafee_category 741 | pretty_name: McAfee Web Category 742 | stopforumspam: 743 | name: StopForumSpam 744 | otypes: 745 | - email 746 | webscraper: 747 | request: 748 | url: 'http://www.stopforumspam.com/search/{target}' 749 | method: get 750 | results: 751 | - regex: '>Found (0*[1-9]\d*) entries' 752 | values: 753 | - sfs_spam_count 754 | pretty_name: Spam email count 755 | cymru_mhr: 756 | name: Cymru MHR 757 | otypes: 758 | - hash 759 | - hash.sha1 760 | webscraper: 761 | request: 762 | url: 'https://hash.cymru.com/cgi-bin/bulkmhr.cgi' 763 | method: post 764 | data: 765 | action: do_whois 766 | bulk_paste: '{target}' 767 | submit_paste: Submit 768 | results: 769 | - regex: '[a-f0-9]+\s(\d+)\s(\d+)' 770 | values: 771 | - cymru_mhr_detect_time 772 | - cymru_mhr_detect_pct 773 | pretty_name: Cymru MHR Detection Percent 774 | icsi_notary: 775 | name: ICSI Certificate Notary 776 | otypes: 777 | - sslfp 778 | dns: 779 | request: 780 | query: '{target_stripped}.notary.icsi.berkeley.edu' 781 | rrtype: txt 782 | results: 783 | - regex: 'version=1 first_seen=(\d+) last_seen=(\d+) times_seen=(\d+) validated=(\d+)' 784 | values: 785 | - icsi_first_seen 786 | - icsi_last_seen 787 | - icsi_times_seen 788 | - icsi_validated 789 | pretty_name: ICSI Notary Results 790 | totalhash_ip: 791 | name: TotalHash 792 | default: false 793 | otypes: 794 | - ip 795 | webscraper: 796 | request: 797 | url: 'https://totalhash.com/network/dnsrr:*{target}*%20or%20ip:{target}' 798 | method: get 799 | results: 800 | - regex: '/analysis/(\w{40}).+(\d{4}\-\d{1,2}\-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})' 801 | values: 802 | - thip_hash 803 | - thip_date 804 | pretty_name: Totalhash 805 | domaintools_parsed_whois: 806 | name: DomainTools Whois 807 | default: false 808 | otypes: 809 | - fqdn 810 | json: 811 | request: 812 | url: 'https://api.domaintools.com/v1/{target}/whois/parsed' 813 | method: get 814 | params: 815 | api_username: 816 | api_key: 817 | results: 818 | - key: 
response.parsed_whois.contacts 819 | multi_match: 820 | keys: 821 | - '@' 822 | - name 823 | - country 824 | - email 825 | onlyif: name 826 | pretty_name: Whois Contacts 827 | - key: response.parsed_whois.created_date 828 | pretty_name: Domain registered 829 | regex: '(\d{4}\-\d{1,2}\-\d{1,2})' 830 | - key: response.parsed_whois.updated_date 831 | pretty_name: Whois updated 832 | regex: '(\d{4}\-\d{1,2}\-\d{1,2})' 833 | - key: response.parsed_whois.expired_date 834 | pretty_name: Domain expiration 835 | regex: '(\d{4}\-\d{1,2}\-\d{1,2})' 836 | - key: response.parsed_whois.name_servers 837 | pretty_name: Name Servers 838 | #match_all: true 839 | - key: response.parsed_whois.registrar 840 | pretty_name: Registrar Info 841 | multi_match: 842 | keys: 843 | - name 844 | - abuse_contact_phone 845 | - abuse_contact_email 846 | - url 847 | domaintools_reverse_whois: 848 | name: DomainTools Reverse Whois 849 | default: false 850 | otypes: 851 | - email 852 | json: 853 | request: 854 | url: 'https://api.domaintools.com/v1/reverse-whois/' 855 | method: get 856 | params: 857 | terms: '{target}' 858 | mode: purchase 859 | api_username: 860 | api_key: 861 | results: 862 | - key: response.domains 863 | match_all: true 864 | pretty_name: Registered domain 865 | - key: response.domain_count.current 866 | pretty_name: Currently active registered domains 867 | - key: response.domain_count.historic 868 | pretty_name: All registered domains 869 | domaintools_reputation: 870 | name: DomainTools Reputation 871 | default: false 872 | otypes: 873 | - fqdn 874 | json: 875 | request: 876 | url: 'https://api.domaintools.com/v1/reputation/' 877 | method: get 878 | params: 879 | domain: '{target}' 880 | include_reasons: 'true' 881 | api_username: 882 | api_key: 883 | results: 884 | - key: response.risk_score 885 | pretty_name: Risk Score 886 | - key: response.reasons 887 | pretty_name: Reasons 888 | dnsdb_ip: 889 | name: Farsight DNSDB 890 | default: False 891 | otypes: 892 | - ipv4 893 | - ipv6
894 | json: 895 | multi_json: true 896 | request: 897 | url: 'https://api.dnsdb.info/lookup/rdata/ip/{target}' 898 | method: get 899 | headers: 900 | Accept: application/json 901 | X-Api-Key: 902 | results: 903 | - key: '@' 904 | multi_match: 905 | keys: 906 | - rrname 907 | - rrtype 908 | - key: time_first 909 | format: as_time 910 | - key: time_last 911 | format: as_time 912 | labels: 913 | - Record Name 914 | - Record Type 915 | - First Seen 916 | - Last Seen 917 | dnsdb_fqdn: 918 | name: Farsight DNSDB 919 | default: False 920 | otypes: 921 | - fqdn 922 | json: 923 | multi_json: true 924 | request: 925 | url: 'https://api.dnsdb.info/lookup/rrset/name/{target}' 926 | method: get 927 | ignored_status_codes: 928 | - 404 929 | params: 930 | time_last_after: 931 | relatime: '-7d' 932 | timezone: UTC 933 | format: as_epoch 934 | headers: 935 | Accept: application/json 936 | X-Api-Key: 937 | results: 938 | - key: '@' 939 | multi_match: 940 | keys: 941 | - rrtype 942 | - key: rdata 943 | # format: as_list 944 | - key: time_last 945 | format: as_time 946 | labels: 947 | - Record Type 948 | - Record Data 949 | - Last Seen 950 | onlyif: 951 | key: rrtype 952 | regex: "^(A|AAAA|MX|SPF|TXT)$" 953 | cif: 954 | name: Collective Intelligence Framework 955 | default: false 956 | otypes: 957 | - ipv4 958 | - fqdn 959 | - email 960 | - hash 961 | json: 962 | request: 963 | url: 'https://cif/observables' 964 | method: get 965 | params: 966 | nolog: 1 967 | confidence: 75 968 | observable: '{target}' 969 | reporttime: 970 | relatime: '-2d' 971 | timezone: UTC 972 | reporttimeend: 973 | relatime: 'now' 974 | timezone: UTC 975 | headers: 976 | Accept: application/vnd.cif.v2+json 977 | Authorization: Token token= 978 | verify_ssl: False 979 | results: 980 | - key: '@' 981 | multi_match: 982 | keys: 983 | - asn 984 | - cc 985 | labels: 986 | - AS Number 987 | - Country Code 988 | - key: '@' 989 | multi_match: 990 | keys: 991 | - key: reporttime 992 | regex: '^(\d+-\d+-\d+)T' 993 | - 
confidence 994 | - key: tags 995 | format: as_list 996 | - provider 997 | - description 998 | labels: 999 | - Report Date 1000 | - Confidence 1001 | - Tags 1002 | - Provider 1003 | - Description 1004 | 1005 | threatcrowd_ip_report: 1006 | name: ThreatCrowd IP Report 1007 | default: True 1008 | otypes: 1009 | - ipv4 1010 | json: 1011 | paginated: false 1012 | request: 1013 | url: 'https://www.threatcrowd.org/searchApi/v2/ip/report/?ip={target}' 1014 | method: get 1015 | ignored_status_codes: 1016 | - 404 1017 | results: 1018 | - key: 'resolutions' 1019 | pretty_name: Passive DNS 1020 | multi_match: 1021 | keys: 1022 | - domain 1023 | - last_resolved 1024 | labels: 1025 | - Domain 1026 | - Last Resolved 1027 | onlyif: 1028 | key: last_resolved 1029 | maxage: '-30d' 1030 | - key: 'hashes' 1031 | pretty_name: Known Malware Hash 1032 | match_all: true 1033 | 1034 | passivetotal_pdns: 1035 | name: PassiveTotal Passive DNS 1036 | default: False 1037 | otypes: 1038 | - fqdn 1039 | - ipv4 1040 | json: 1041 | request: 1042 | url: 'https://api.passivetotal.org/v2/dns/passive' 1043 | auth: passivetotal 1044 | params: 1045 | query: '{target}' 1046 | method: get 1047 | headers: 1048 | Accept: application/json 1049 | ignored_status_codes: 1050 | - 401 1051 | results: 1052 | - key: results 1053 | format: as_list 1054 | pretty_name: Results 1055 | multi_match: 1056 | keys: 1057 | - key: resolve 1058 | - key: queryValue 1059 | pretty_name: Query Value 1060 | 1061 | passivetotal_whois: 1062 | name: PassiveTotal Whois 1063 | default: False 1064 | otypes: 1065 | - fqdn 1066 | json: 1067 | request: 1068 | url: 'https://api.passivetotal.org/v2/whois' 1069 | auth: passivetotal 1070 | params: 1071 | query: '{target}' 1072 | method: get 1073 | headers: 1074 | Accept: application/json 1075 | ignored_status_codes: 1076 | - 401 1077 | results: 1078 | - key: registryUpdatedAt 1079 | pretty_name: Registry Updated At 1080 | - key: domain 1081 | pretty_name: Domain 1082 | - key: billing 1083 | 
pretty_name: Billing 1084 | - key: zone 1085 | pretty_name: Zone 1086 | - key: nameServers 1087 | pretty_name: Name Servers 1088 | - key: registered 1089 | pretty_name: Registered 1090 | - key: lastLoadedAt 1091 | pretty_name: Last Loaded At 1092 | - key: whoisServer 1093 | pretty_name: Whois Server 1094 | - key: contactEmail 1095 | pretty_name: Contact Email 1096 | - key: admin 1097 | pretty_name: Admin 1098 | - key: expiresAt 1099 | pretty_name: Expires At 1100 | - key: registrar 1101 | pretty_name: Registrar 1102 | - key: tech 1103 | pretty_name: Tech 1104 | - key: registrant 1105 | pretty_name: Registrant 1106 | 1107 | passivetotal_sslcert: 1108 | name: PassiveTotal SSL Certificate History 1109 | default: False 1110 | otypes: 1111 | - ipv4 1112 | json: 1113 | request: 1114 | url: 'https://api.passivetotal.org/v2/ssl-certificate/history' 1115 | auth: passivetotal 1116 | params: 1117 | query: '{target}' 1118 | method: get 1119 | headers: 1120 | Accept: application/json 1121 | ignored_status_codes: 1122 | - 401 1123 | results: 1124 | - key: results 1125 | multi_match: 1126 | keys: 1127 | - key: sha1 1128 | pretty_name: Sha1 1129 | - key: firstSeen 1130 | pretty_name: First Seen 1131 | - key: ipAddresses 1132 | pretty_name: Ip Addresses 1133 | - key: lastSeen 1134 | pretty_name: Last Seen 1135 | pretty_name: Results 1136 | 1137 | passivetotal_components: 1138 | name: PassiveTotal Components 1139 | default: False 1140 | otypes: 1141 | - fqdn 1142 | json: 1143 | request: 1144 | url: 'https://api.passivetotal.org/v2/host-attributes/components' 1145 | auth: passivetotal 1146 | params: 1147 | query: '{target}' 1148 | method: get 1149 | headers: 1150 | Accept: application/json 1151 | ignored_status_codes: 1152 | - 401 1153 | results: 1154 | - key: results 1155 | multi_match: 1156 | keys: 1157 | - key: category 1158 | pretty_name: Category 1159 | - key: hostname 1160 | pretty_name: Hostname 1161 | - key: lastSeen 1162 | pretty_name: Last Seen 1163 | - key: firstSeen 1164 
| pretty_name: First Seen 1165 | - key: label 1166 | pretty_name: Label 1167 | pretty_name: Results 1168 | 1169 | passivetotal_trackers: 1170 | name: PassiveTotal Trackers 1171 | default: False 1172 | otypes: 1173 | - fqdn 1174 | json: 1175 | request: 1176 | url: 'https://api.passivetotal.org/v2/host-attributes/trackers' 1177 | auth: passivetotal 1178 | params: 1179 | query: '{target}' 1180 | method: get 1181 | headers: 1182 | Accept: application/json 1183 | ignored_status_codes: 1184 | - 401 1185 | results: 1186 | - key: results 1187 | multi_match: 1188 | keys: 1189 | - key: hostname 1190 | pretty_name: Hostname 1191 | - key: attributeType 1192 | pretty_name: Type 1193 | - key: attributeValue 1194 | pretty_name: Value 1195 | - key: lastSeen 1196 | pretty_name: Last Seen 1197 | - key: firstSeen 1198 | pretty_name: First Seen 1199 | pretty_name: Results 1200 | fraudguard: 1201 | name: FraudGuard 1202 | default: False 1203 | otypes: 1204 | - ipv4 1205 | json: 1206 | request: 1207 | url: https://api.fraudguard.io/ip/{target} 1208 | auth: fraudguard 1209 | results: 1210 | - key: isocode 1211 | pretty_name: FraudGuard Country Code 1212 | - key: country 1213 | pretty_name: FraudGuard Country 1214 | - key: state 1215 | pretty_name: FraudGuard State 1216 | - key: city 1217 | pretty_name: FraudGuard City 1218 | - key: discover_date 1219 | pretty_name: FraudGuard Discovery Date 1220 | - key: threat 1221 | pretty_name: FraudGuard Threat Type 1222 | - key: risk_level 1223 | pretty_name: FraudGuard Risk Level 1224 | shodan: 1225 | name: Shodan 1226 | default: False 1227 | otypes: 1228 | - ipv4 1229 | json: 1230 | request: 1231 | url: https://api.shodan.io/shodan/host/{target} 1232 | params: 1233 | key: 1234 | results: 1235 | - key: '@' 1236 | multi_match: 1237 | keys: 1238 | - asn 1239 | - org 1240 | - city 1241 | - region 1242 | - country_code 1243 | - postal_code 1244 | pretty_name: Shodan Organization 1245 | - key: hostnames 1246 | match_all: true 1247 | pretty_name: Shodan 
Hostnames 1248 | - key: isp 1249 | pretty_name: Shodan ISP 1250 | - key: data 1251 | multi_match: 1252 | keys: 1253 | - timestamp 1254 | - transport 1255 | - port 1256 | - product 1257 | - version 1258 | pretty_name: Shodan Ports 1259 | - key: data 1260 | multi_match: 1261 | keys: 1262 | - transport 1263 | - port 1264 | - ssl.versions 1265 | onlyif: ssl.versions 1266 | pretty_name: Shodan SSL Versions 1267 | - key: data 1268 | multi_match: 1269 | keys: 1270 | - transport 1271 | - port 1272 | - ssl.cert.subject.CN 1273 | - ssl.cert.fingerprint.sha256 1274 | onlyif: ssl.cert.fingerprint.sha256 1275 | pretty_name: Shodan SSL Certs 1276 | ipinfoio: 1277 | name: ipinfo.io 1278 | default: False 1279 | otypes: 1280 | - ipv4 1281 | - ipv6 1282 | json: 1283 | request: 1284 | url: https://ipinfo.io/{target} 1285 | headers: 1286 | Accept: application/json 1287 | results: 1288 | - key: hostname 1289 | pretty_name: ipinfo.io hostname 1290 | - key: city 1291 | pretty_name: ipinfo.io city 1292 | - key: region 1293 | pretty_name: ipinfo.io region 1294 | - key: country 1295 | pretty_name: ipinfo.io country 1296 | - key: loc 1297 | pretty_name: ipinfo.io geolocation 1298 | - key: org 1299 | pretty_name: ipinfo.io organization 1300 | - key: postal 1301 | pretty_name: ipinfo.io postal code 1302 | xforce-malware: 1303 | name: IBM XForce Malware Report 1304 | default: False 1305 | otypes: 1306 | - ipv4 1307 | json: 1308 | request: 1309 | url: https://api.xforce.ibmcloud.com/ipr/malware/{target} 1310 | auth: xforce 1311 | results: 1312 | - key: type 1313 | pretty_name: malware type 1314 | - key: md5 1315 | pretty_name: md5 1316 | - key: domain 1317 | pretty_name: domain name 1318 | - key: firstseen 1319 | pretty_name: first seen 1320 | - key: lastseen 1321 | pretty_name: last seen 1322 | hackedip: 1323 | name: Hacked IP 1324 | default: False 1325 | otypes: 1326 | - ipv4 1327 | json: 1328 | request: 1329 | url: http://www.hackedip.com/api.php?ip={target} 1330 | results: 1331 | - key: '@' 
1332 | format: as_list 1333 | pretty_name: Hacked IP Threat List 1334 | metadefender_hash: 1335 | name: MetaDefender File Report 1336 | default: False 1337 | otypes: 1338 | - hash 1339 | - hash.sha1 1340 | - hash.sha256 1341 | json: 1342 | request: 1343 | url: https://api.metadefender.com/v2/hash/{target} 1344 | method: get 1345 | headers: 1346 | apikey: 1347 | results: 1348 | - key: scan_results.start_time 1349 | pretty_name: Date submitted 1350 | - key: scan_results.total_detected_avs 1351 | pretty_name: Detected engines 1352 | - key: scan_results.total_avs 1353 | pretty_name: Total engines 1354 | - key: scan_results.scan_details 1355 | pretty_name: Scans 1356 | multi_match: 1357 | keys: 1358 | - '@' 1359 | - threat_found 1360 | onlyif: scan_result_i 1361 | # misp: 1362 | # name: MISP 1363 | # default: true 1364 | # otypes: 1365 | # - ipv4 1366 | # - url 1367 | # - email 1368 | # - fqdn 1369 | # - hash 1370 | # - hash.sha1 1371 | # - hash.sha256 1372 | # json: 1373 | # request: 1374 | # url: https://***YOUR_MISP_HERE***/events/restSearch/download/{target}/null/null/null/null/7 1375 | # method: get 1376 | # headers: 1377 | # Authorization: ***YOUR_APIKEY_HERE*** 1378 | # results: 1379 | # - key: response 1380 | # pretty_name: MISP Events 1381 | # multi_match: 1382 | # keys: 1383 | # - Event.date 1384 | # - Event.id 1385 | # - Event.info 1386 | greynoise: 1387 | # This entry is for the GreyNoise *community* API 1388 | name: GreyNoise 1389 | otypes: 1390 | - ipv4 1391 | json: 1392 | request: 1393 | url: https://api.greynoise.io/v3/community/{target} 1394 | # headers: 1395 | # key: ***YOUR_APIKEY_HERE*** 1396 | # you can get this from https://viz.greynoise.io/account/ 1397 | ignored_status_codes: 1398 | - 404 1399 | results: 1400 | - key: noise 1401 | pretty_name: GreyNoise Known Scanner 1402 | - key: riot 1403 | pretty_name: GreyNoise Rule-It-OuT 1404 | - key: classification 1405 | pretty_name: GreyNoise Classification 1406 | - key: name 1407 | pretty_name: 
GreyNoise Name 1408 | greynoise_ent: 1409 | # This entry is for the GreyNoise *enterprise* API 1410 | name: GreyNoise 1411 | default: False 1412 | otypes: 1413 | - ipv4 1414 | json: 1415 | request: 1416 | url: https://enterprise.api.greynoise.io/v2/noise/context/{target} 1417 | headers: 1418 | key: YOUR_APIKEY_HERE 1419 | ignored_status_codes: 1420 | - 404 1421 | results: 1422 | - key: seen 1423 | pretty_name: GreyNoise Known Scanner 1424 | - key: actor 1425 | pretty_name: GreyNoise Actor 1426 | - key: tags 1427 | pretty_name: GreyNoise Reason 1428 | - key: metadata.category 1429 | pretty_name: GreyNoise Category 1430 | - key: first_seen 1431 | pretty_name: GreyNoise First Seen 1432 | - key: last_seen 1433 | pretty_name: GreyNoise Last Seen 1434 | - key: raw_data.web.useragents 1435 | pretty_name: GreyNoise User-agent 1436 | - key: raw_data.scan 1437 | multi_match: 1438 | keys: 1439 | - port 1440 | - protocol 1441 | pretty_name: GreyNoise Observations 1442 | macvendors: 1443 | name: MACVendors 1444 | default: true 1445 | otypes: 1446 | - mac 1447 | webscraper: 1448 | request: 1449 | url: 'https://api.macvendors.com/{target}' 1450 | method: get 1451 | results: 1452 | - regex: '(.+)' 1453 | values: 1454 | - vendor 1455 | pretty_name: Mac Address Vendor 1456 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 
14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=1 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | 29 | # List of plugins (as comma separated values of python modules names) to load, 30 | # usually to register additional checkers. 31 | load-plugins= 32 | 33 | # Pickle collected data for later comparisons. 34 | persistent=yes 35 | 36 | # Specify a configuration file. 37 | #rcfile= 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". 
If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 63 | disable=print-statement, 64 | parameter-unpacking, 65 | unpacking-in-except, 66 | old-raise-syntax, 67 | backtick, 68 | long-suffix, 69 | old-ne-operator, 70 | old-octal-literal, 71 | import-star-module-level, 72 | non-ascii-bytes-literal, 73 | raw-checker-failed, 74 | bad-inline-option, 75 | locally-disabled, 76 | locally-enabled, 77 | file-ignored, 78 | suppressed-message, 79 | useless-suppression, 80 | deprecated-pragma, 81 | use-symbolic-message-instead, 82 | apply-builtin, 83 | basestring-builtin, 84 | buffer-builtin, 85 | cmp-builtin, 86 | coerce-builtin, 87 | execfile-builtin, 88 | file-builtin, 89 | long-builtin, 90 | raw_input-builtin, 91 | reduce-builtin, 92 | standarderror-builtin, 93 | unicode-builtin, 94 | xrange-builtin, 95 | coerce-method, 96 | delslice-method, 97 | getslice-method, 98 | setslice-method, 99 | no-absolute-import, 100 | old-division, 101 | dict-iter-method, 102 | dict-view-method, 103 | next-method-called, 104 | metaclass-assignment, 105 | indexing-exception, 106 | raising-string, 107 | reload-builtin, 108 | oct-method, 109 | hex-method, 110 | nonzero-method, 111 | cmp-method, 112 | input-builtin, 113 | round-builtin, 114 | intern-builtin, 115 | unichr-builtin, 116 | map-builtin-not-iterating, 117 | zip-builtin-not-iterating, 118 | range-builtin-not-iterating, 119 | filter-builtin-not-iterating, 120 | using-cmp-argument, 121 | eq-without-hash, 122 | div-method, 123 | idiv-method, 124 | rdiv-method, 125 | exception-message-attribute, 126 | invalid-str-codec, 127 | sys-max-int, 128 | bad-python3-import, 129 | deprecated-string-function, 130 | deprecated-str-translate-call, 131 | deprecated-itertools-function, 132 | deprecated-types-field, 133 | next-method-defined, 134 | dict-items-not-iterating, 135 | dict-keys-not-iterating, 136 | dict-values-not-iterating, 137 | 
deprecated-operator-function, 138 | deprecated-urllib-function, 139 | xreadlines-attribute, 140 | deprecated-sys-function, 141 | exception-escape, 142 | comprehension-escape, 143 | line-too-long, 144 | missing-docstring, 145 | invalid-name, 146 | unused-argument, 147 | inconsistent-return-statements, 148 | arguments-differ, 149 | protected-access, 150 | too-many-locals, 151 | too-many-branches, 152 | not-context-manager, 153 | unexpected-keyword-arg, 154 | no-member, 155 | cyclic-import, 156 | anomalous-backslash-in-string, 157 | import-outside-toplevel, 158 | no-else-continue, 159 | super-with-arguments 160 | 161 | 162 | # Enable the message, report, category or checker with the given id(s). You can 163 | # either give multiple identifier separated by comma (,) or put this option 164 | # multiple time (only on the command line, not in the configuration file where 165 | # it should appear only once). See also the "--disable" option for examples. 166 | enable=c-extension-no-member 167 | 168 | 169 | [REPORTS] 170 | 171 | # Python expression which should return a note less than 10 (10 is the highest 172 | # note). You have access to the variables errors warning, statement which 173 | # respectively contain the number of errors / warnings messages and the total 174 | # number of statements analyzed. This is used by the global evaluation report 175 | # (RP0004). 176 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 177 | 178 | # Template used to display messages. This is a python new-style format string 179 | # used to format the message information. See doc for all details. 180 | #msg-template= 181 | 182 | # Set the output format. Available formats are text, parseable, colorized, json 183 | # and msvs (visual studio). You can also give a reporter class, e.g. 184 | # mypackage.mymodule.MyReporterClass. 185 | output-format=text 186 | 187 | # Tells whether to display a full report or only the messages. 
188 | reports=no 189 | 190 | # Activate the evaluation score. 191 | score=yes 192 | 193 | 194 | [REFACTORING] 195 | 196 | # Maximum number of nested blocks for function / method body 197 | max-nested-blocks=5 198 | 199 | # Complete name of functions that never returns. When checking for 200 | # inconsistent-return-statements if a never returning function is called then 201 | # it will be considered as an explicit return statement and no message will be 202 | # printed. 203 | never-returning-functions=sys.exit 204 | 205 | 206 | [BASIC] 207 | 208 | # Naming style matching correct argument names. 209 | argument-naming-style=snake_case 210 | 211 | # Regular expression matching correct argument names. Overrides argument- 212 | # naming-style. 213 | #argument-rgx= 214 | 215 | # Naming style matching correct attribute names. 216 | attr-naming-style=snake_case 217 | 218 | # Regular expression matching correct attribute names. Overrides attr-naming- 219 | # style. 220 | #attr-rgx= 221 | 222 | # Bad variable names which should always be refused, separated by a comma. 223 | bad-names=foo, 224 | bar, 225 | baz, 226 | toto, 227 | tutu, 228 | tata 229 | 230 | # Naming style matching correct class attribute names. 231 | class-attribute-naming-style=any 232 | 233 | # Regular expression matching correct class attribute names. Overrides class- 234 | # attribute-naming-style. 235 | #class-attribute-rgx= 236 | 237 | # Naming style matching correct class names. 238 | class-naming-style=PascalCase 239 | 240 | # Regular expression matching correct class names. Overrides class-naming- 241 | # style. 242 | #class-rgx= 243 | 244 | # Naming style matching correct constant names. 245 | const-naming-style=UPPER_CASE 246 | 247 | # Regular expression matching correct constant names. Overrides const-naming- 248 | # style. 249 | #const-rgx= 250 | 251 | # Minimum line length for functions/classes that require docstrings, shorter 252 | # ones are exempt. 
253 | docstring-min-length=-1 254 | 255 | # Naming style matching correct function names. 256 | function-naming-style=snake_case 257 | 258 | # Regular expression matching correct function names. Overrides function- 259 | # naming-style. 260 | #function-rgx= 261 | 262 | # Good variable names which should always be accepted, separated by a comma. 263 | good-names=i, 264 | j, 265 | k, 266 | ex, 267 | Run, 268 | _ 269 | 270 | # Include a hint for the correct naming format with invalid-name. 271 | include-naming-hint=no 272 | 273 | # Naming style matching correct inline iteration names. 274 | inlinevar-naming-style=any 275 | 276 | # Regular expression matching correct inline iteration names. Overrides 277 | # inlinevar-naming-style. 278 | #inlinevar-rgx= 279 | 280 | # Naming style matching correct method names. 281 | method-naming-style=snake_case 282 | 283 | # Regular expression matching correct method names. Overrides method-naming- 284 | # style. 285 | #method-rgx= 286 | 287 | # Naming style matching correct module names. 288 | module-naming-style=snake_case 289 | 290 | # Regular expression matching correct module names. Overrides module-naming- 291 | # style. 292 | #module-rgx= 293 | 294 | # Colon-delimited sets of names that determine each other's naming style when 295 | # the name regexes allow several styles. 296 | name-group= 297 | 298 | # Regular expression which should only match function or class names that do 299 | # not require a docstring. 300 | no-docstring-rgx=^_ 301 | 302 | # List of decorators that produce properties, such as abc.abstractproperty. Add 303 | # to this list to register other decorators that produce valid properties. 304 | # These decorators are taken in consideration only for invalid-name. 305 | property-classes=abc.abstractproperty 306 | 307 | # Naming style matching correct variable names. 308 | variable-naming-style=snake_case 309 | 310 | # Regular expression matching correct variable names. Overrides variable- 311 | # naming-style. 
312 | #variable-rgx= 313 | 314 | 315 | [FORMAT] 316 | 317 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 318 | expected-line-ending-format= 319 | 320 | # Regexp for a line that is allowed to be longer than the limit. 321 | ignore-long-lines=^\s*(# )??$ 322 | 323 | # Number of spaces of indent required inside a hanging or continued line. 324 | indent-after-paren=4 325 | 326 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 327 | # tab). 328 | indent-string=' ' 329 | 330 | # Maximum number of characters on a single line. 331 | max-line-length=100 332 | 333 | # Maximum number of lines in a module. 334 | max-module-lines=1000 335 | 336 | # List of optional constructs for which whitespace checking is disabled. `dict- 337 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 338 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 339 | # `empty-line` allows space-only lines. 340 | no-space-check=trailing-comma, 341 | dict-separator 342 | 343 | # Allow the body of a class to be on the same line as the declaration if body 344 | # contains single statement. 345 | single-line-class-stmt=no 346 | 347 | # Allow the body of an if to be on the same line as the test if there is no 348 | # else. 349 | single-line-if-stmt=no 350 | 351 | 352 | [LOGGING] 353 | 354 | # Logging modules to check that the string format arguments are in logging 355 | # function parameter format. 356 | logging-modules=logging 357 | 358 | 359 | [VARIABLES] 360 | 361 | # List of additional names supposed to be defined in builtins. Remember that 362 | # you should avoid to define new builtins when possible. 363 | additional-builtins= 364 | 365 | # Tells whether unused global variables should be treated as a violation. 366 | allow-global-unused-variables=yes 367 | 368 | # List of strings which can identify a callback function by name. A callback 369 | # name must start or end with one of those strings. 
370 | callbacks=cb_, 371 | _cb 372 | 373 | # A regular expression matching the name of dummy variables (i.e. expected to 374 | # not be used). 375 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 376 | 377 | # Argument names that match this expression will be ignored. Default to name 378 | # with leading underscore. 379 | ignored-argument-names=_.*|^ignored_|^unused_ 380 | 381 | # Tells whether we should check for unused import in __init__ files. 382 | init-import=no 383 | 384 | # List of qualified module names which can have objects that can redefine 385 | # builtins. 386 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 387 | 388 | 389 | [TYPECHECK] 390 | 391 | # List of decorators that produce context managers, such as 392 | # contextlib.contextmanager. Add to this list to register other decorators that 393 | # produce valid context managers. 394 | contextmanager-decorators=contextlib.contextmanager 395 | 396 | # List of members which are set dynamically and missed by pylint inference 397 | # system, and so shouldn't trigger E1101 when accessed. Python regular 398 | # expressions are accepted. 399 | generated-members= 400 | 401 | # Tells whether missing members accessed in mixin class should be ignored. A 402 | # mixin class is detected if its name ends with "mixin" (case insensitive). 403 | ignore-mixin-members=yes 404 | 405 | # Tells whether to warn about missing members when the owner of the attribute 406 | # is inferred to be None. 407 | ignore-none=yes 408 | 409 | # This flag controls whether pylint should warn about no-member and similar 410 | # checks whenever an opaque object is returned when inferring. The inference 411 | # can return multiple potential results while evaluating a Python object, but 412 | # some branches might not be evaluated, which results in partial inference. 
In 413 | # that case, it might be useful to still emit no-member and other checks for 414 | # the rest of the inferred objects. 415 | ignore-on-opaque-inference=yes 416 | 417 | # List of class names for which member attributes should not be checked (useful 418 | # for classes with dynamically set attributes). This supports the use of 419 | # qualified names. 420 | ignored-classes=optparse.Values,thread._local,_thread._local 421 | 422 | # List of module names for which member attributes should not be checked 423 | # (useful for modules/projects where namespaces are manipulated during runtime 424 | # and thus existing member attributes cannot be deduced by static analysis. It 425 | # supports qualified module names, as well as Unix pattern matching. 426 | ignored-modules= 427 | 428 | # Show a hint with possible names when a member name was not found. The aspect 429 | # of finding the hint is based on edit distance. 430 | missing-member-hint=yes 431 | 432 | # The minimum edit distance a name should have in order to be considered a 433 | # similar match for a missing member name. 434 | missing-member-hint-distance=1 435 | 436 | # The total number of similar names that should be taken in consideration when 437 | # showing a hint for a missing member. 438 | missing-member-max-choices=1 439 | 440 | 441 | [SIMILARITIES] 442 | 443 | # Ignore comments when computing similarities. 444 | ignore-comments=yes 445 | 446 | # Ignore docstrings when computing similarities. 447 | ignore-docstrings=yes 448 | 449 | # Ignore imports when computing similarities. 450 | ignore-imports=no 451 | 452 | # Minimum lines number of a similarity. 453 | min-similarity-lines=4 454 | 455 | 456 | [MISCELLANEOUS] 457 | 458 | # List of note tags to take in consideration, separated by a comma. 459 | notes=FIXME, 460 | XXX, 461 | TODO 462 | 463 | 464 | [SPELLING] 465 | 466 | # Limits count of emitted suggestions for spelling mistakes. 
467 | max-spelling-suggestions=4 468 | 469 | # Spelling dictionary name. Available dictionaries: none. To make it working 470 | # install python-enchant package.. 471 | spelling-dict= 472 | 473 | # List of comma separated words that should not be checked. 474 | spelling-ignore-words= 475 | 476 | # A path to a file that contains private dictionary; one word per line. 477 | spelling-private-dict-file= 478 | 479 | # Tells whether to store unknown words to indicated private dictionary in 480 | # --spelling-private-dict-file option instead of raising a message. 481 | spelling-store-unknown-words=no 482 | 483 | 484 | [IMPORTS] 485 | 486 | # Allow wildcard imports from modules that define __all__. 487 | allow-wildcard-with-all=no 488 | 489 | # Analyse import fallback blocks. This can be used to support both Python 2 and 490 | # 3 compatible code, which means that the block might have code that exists 491 | # only in one or another interpreter, leading to false positives when analysed. 492 | analyse-fallback-blocks=no 493 | 494 | # Deprecated modules which should not be used, separated by a comma. 495 | deprecated-modules=optparse,tkinter.tix 496 | 497 | # Create a graph of external dependencies in the given file (report RP0402 must 498 | # not be disabled). 499 | ext-import-graph= 500 | 501 | # Create a graph of every (i.e. internal and external) dependencies in the 502 | # given file (report RP0402 must not be disabled). 503 | import-graph= 504 | 505 | # Create a graph of internal dependencies in the given file (report RP0402 must 506 | # not be disabled). 507 | int-import-graph= 508 | 509 | # Force import order to recognize a module as part of the standard 510 | # compatibility libraries. 511 | known-standard-library= 512 | 513 | # Force import order to recognize a module as part of a third party library. 514 | known-third-party=enchant 515 | 516 | 517 | [DESIGN] 518 | 519 | # Maximum number of arguments for function / method. 
520 | max-args=5 521 | 522 | # Maximum number of attributes for a class (see R0902). 523 | max-attributes=7 524 | 525 | # Maximum number of boolean expressions in an if statement. 526 | max-bool-expr=5 527 | 528 | # Maximum number of branch for function / method body. 529 | max-branches=12 530 | 531 | # Maximum number of locals for function / method body. 532 | max-locals=15 533 | 534 | # Maximum number of parents for a class (see R0901). 535 | max-parents=10 536 | 537 | # Maximum number of public methods for a class (see R0904). 538 | max-public-methods=20 539 | 540 | # Maximum number of return / yield for function / method body. 541 | max-returns=6 542 | 543 | # Maximum number of statements in function / method body. 544 | max-statements=50 545 | 546 | # Minimum number of public methods for a class (see R0903). 547 | min-public-methods=0 548 | 549 | 550 | [CLASSES] 551 | 552 | # List of method names used to declare (i.e. assign) instance attributes. 553 | defining-attr-methods=__init__, 554 | __new__, 555 | setUp 556 | 557 | # List of member names, which should be excluded from the protected access 558 | # warning. 559 | exclude-protected=_asdict, 560 | _fields, 561 | _replace, 562 | _source, 563 | _make 564 | 565 | # List of valid names for the first argument in a class method. 566 | valid-classmethod-first-arg=cls 567 | 568 | # List of valid names for the first argument in a metaclass class method. 569 | valid-metaclass-classmethod-first-arg=cls 570 | 571 | 572 | [EXCEPTIONS] 573 | 574 | # Exceptions that will emit a warning when being caught. Defaults to 575 | # "Exception". 
576 | overgeneral-exceptions=Exception 577 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | defang 2 | relatime 3 | urllib3 4 | stopit 5 | feedparser 6 | beautifulsoup4 7 | python_dateutil 8 | python_magic 9 | PyYAML 10 | requests 11 | tzlocal 12 | pytz 13 | dnspython3 14 | ipwhois 15 | html5lib 16 | 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | from setuptools import setup, find_packages 3 | 4 | 5 | #this should hopefully allow us to have a more pypi friendly, always up to date readme 6 | readMeDir = path.abspath(path.dirname(__file__)) 7 | with open(path.join(readMeDir, 'README.md'), encoding='utf-8') as readFile: 8 | long_desc = readFile.read() 9 | 10 | 11 | VERSION = '1.4.11' 12 | 13 | setup( 14 | name='machinae', 15 | version=VERSION, 16 | author='Steve McMaster', 17 | author_email='mcmaster@hurricanelabs.com', 18 | package_dir={'': 'src'}, 19 | packages=find_packages('src'), 20 | include_package_data=True, 21 | zip_safe=False, 22 | url='http://hurricanelabs.github.io/machinae/', 23 | description='Machinae Security Intelligence Collector', 24 | long_description=long_desc, 25 | long_description_content_type='text/markdown', 26 | install_requires=[ 27 | 'dnspython3', 28 | 'ipwhois<0.11', 29 | 'requests', 30 | 'stopit', 31 | 'pyyaml', 32 | 'beautifulsoup4', 33 | 'html5lib', 34 | 'relatime', 35 | 'tzlocal', 36 | 'python-magic', 37 | 'feedparser', 38 | 'defang', 39 | ], 40 | entry_points={ 41 | 'console_scripts': [ 42 | 'machinae = machinae.cmd:main', 43 | ] 44 | }, 45 | classifiers=[ 46 | 'License :: OSI Approved :: MIT License', 47 | 'Programming Language :: Python :: 3 :: Only', 48 | 'Development Status :: 5 - Production/Stable', 49 | ], 50 | 
#pylint: disable=no-else-return,too-many-return-statements
def get_target_type(target):
    """Best-effort classification of an observable string.

    Returns one of: "ipv4", "ipv6", "hash" (MD5), "hash.sha1",
    "hash.sha256", "hash.sha512", "url", "email", "sslfp", "mac", or the
    fallback "fqdn" when nothing else matches.

    The checks run in a fixed order because some observables match more
    than one pattern (e.g. "http://user@host" must be detected as a URL
    before the very loose e-mail pattern gets a chance).
    """
    # IP addresses first: ipaddress accepts both families and tells us which.
    try:
        addr = ipaddress.ip_address(target)
        return "ipv4" if addr.version == 4 else "ipv6"
    except ValueError:
        pass

    # All patterns are raw strings: the original mac pattern used "[-:\.]"
    # in a plain string, an invalid escape sequence (DeprecationWarning
    # since Python 3.6, SyntaxError in future versions).
    # Hash lengths: MD5=32, SHA-1=40, SHA-256=64, SHA-512=128 hex chars.
    patterns = (
        (r"^[a-f0-9]{32}$", "hash"),
        (r"^[a-f0-9]{40}$", "hash.sha1"),
        (r"^[a-f0-9]{64}$", "hash.sha256"),
        (r"^[a-f0-9]{128}$", "hash.sha512"),
        (r"^https?://", "url"),
        (r"^.*?@.*?$", "email"),
        (r"^(?:[a-f0-9]{2}:){19}[a-f0-9]{2}$", "sslfp"),
        (r"^([0-9a-fA-F]{2}[-:.]){5}([0-9a-fA-F]{2})$", "mac"),
    )
    for (pattern, otype) in patterns:
        if re.match(pattern, target, re.I):
            return otype

    # Anything else is assumed to be a hostname.
    return "fqdn"
def dict_merge(d1, d2):
    """Recursively merge two mappings; values from ``d2`` take precedence.

    Nested mappings (anything with an ``items`` method) are merged
    recursively; plain values from ``d2`` overwrite those in ``d1``.
    Neither input is modified: a new dict is returned (nested mappings
    taken only from ``d2`` are shallow-copied).
    """
    merged = d1.copy()
    for key, value in d2.items():
        if hasattr(value, "items"):
            if key in merged and hasattr(merged[key], "items"):
                # Both sides are mappings: merge them depth-first.
                merged[key] = dict_merge(merged[key], value)
            else:
                # Only d2 has a mapping here: copy so callers can't
                # mutate d2 through the result.
                merged[key] = value.copy()
        else:
            merged[key] = value
    return merged
"--version", action="version", version="%(prog)s "+ __version__) 44 | 45 | modes = ap.add_mutually_exclusive_group() 46 | modes.add_argument("--dump-config", dest="mode", 47 | action="store_const", const="dump_config") 48 | modes.add_argument("--detect-otype", dest="mode", 49 | action="store_const", const="detect_otype") 50 | modes.add_argument("--list-sites", dest="mode", 51 | action="store_const", const="list_sites") 52 | args = ap.parse_args() 53 | self.args = args 54 | 55 | @property 56 | def conf(self): 57 | if self._conf is None: 58 | path = None 59 | if self.args.config: 60 | path = self.args.config 61 | else: 62 | for possible_path in default_config_locations: 63 | if possible_path is None: 64 | continue 65 | if os.path.exists(possible_path): 66 | path = possible_path 67 | break 68 | 69 | if path: 70 | with open(path, "r") as f: 71 | conf = utils.safe_load(f) 72 | else: 73 | conf = {} 74 | 75 | if not self.args.nomerge: 76 | local_path = "/etc/machinae.local.yml" 77 | if os.path.exists(local_path): 78 | with open(local_path, "r") as f: 79 | local_conf = utils.safe_load(f) 80 | conf = dict_merge(conf, local_conf) 81 | 82 | local_path = os.path.expanduser("~/.machinae.yml") 83 | if os.path.exists(local_path): 84 | with open(local_path, "r") as f: 85 | local_conf = utils.safe_load(f) 86 | conf = dict_merge(conf, local_conf) 87 | 88 | self._conf = conf 89 | return self._conf 90 | 91 | @property 92 | #pylint: disable=too-many-locals, too-many-branches 93 | def results(self): 94 | creds = None 95 | if self.args.auth and os.path.isfile(self.args.auth): 96 | with open(self.args.auth) as auth_f: 97 | creds = utils.safe_load(auth_f.read()) 98 | 99 | proxies = {} 100 | if self.args.http_proxy: 101 | proxies["http"] = self.args.http_proxy 102 | proxies["https"] = self.args.http_proxy 103 | else: 104 | if "HTTP_PROXY" in os.environ: 105 | proxies["http"] = os.environ["HTTP_PROXY"] 106 | elif "http_proxy" in os.environ: 107 | proxies["http"] = os.environ["http_proxy"] 
108 | if "HTTPS_PROXY" in os.environ: 109 | proxies["https"] = os.environ["HTTPS_PROXY"] 110 | elif "https_proxy" in os.environ: 111 | proxies["https"] = os.environ["https_proxy"] 112 | 113 | if "http" in proxies: 114 | print("HTTP Proxy: {http}".format(**proxies), file=sys.stderr) 115 | if "https" in proxies: 116 | print("HTTPS Proxy: {https}".format(**proxies), file=sys.stderr) 117 | 118 | for target_info in self.targets: 119 | (target, otype, _) = target_info 120 | 121 | target_results = list() 122 | #pylint: disable=unused-variable 123 | for (site_name, site_conf) in self.sites.items(): 124 | if otype.lower() not in map(lambda x: x.lower(), site_conf["otypes"]): 125 | continue 126 | 127 | site_conf["target"] = target 128 | site_conf["verbose"] = self.args.verbose 129 | scraper = Site.from_conf(site_conf, creds=creds, proxies=proxies) # , verbose=self.verbose) 130 | 131 | try: 132 | with stopit.SignalTimeout(15, swallow_exc=False): 133 | run_results = list() 134 | for r in scraper.run(): 135 | if "value" not in r: 136 | r = {"value": r, "pretty_name": None} 137 | run_results.append(Result(r["value"], r["pretty_name"])) 138 | except stopit.TimeoutException: 139 | target_results.append(ErrorResult(target_info, site_conf, "Timeout")) 140 | #pylint: disable=broad-except 141 | #Will be cleaned up in upcoming refactor 142 | except Exception as e: 143 | target_results.append(ErrorResult(target_info, site_conf, e)) 144 | else: 145 | target_results.append(SiteResults(site_conf, run_results)) 146 | 147 | yield ResultSet(target_info, target_results) 148 | 149 | @property 150 | def sites(self): 151 | if self._sites is None: 152 | if self.args.sites.lower() == "all": 153 | sites = self._conf.keys() 154 | elif self.args.sites.lower() == "default": 155 | sites = [k for (k, v) in self.conf.items() if v.get("default", True)] 156 | else: 157 | sites = self.args.sites.lower().split(",") 158 | self._sites = OrderedDict([(k, v) for (k, v) in self.conf.items() if k in sites]) 159 | 
return copy.deepcopy(self._sites) 160 | 161 | @property 162 | def targets(self): 163 | targets = list() 164 | if self.args.infile: 165 | with open(self.args.infile, "r") as f: 166 | targets.extend([line.strip() for line in f.readlines()]) 167 | 168 | targets.extend(self.args.targets) 169 | 170 | for target in targets: 171 | (otype, otype_detected) = self.detect_otype(target) 172 | if otype == "url" and not (target.startswith("http://") or target.startswith("https://")): 173 | target = "http://{0}".format(target) 174 | yield TargetInfo(target, otype, otype_detected) 175 | 176 | def detect_otype(self, target): 177 | if self.args.otype: 178 | return (self.args.otype, False) 179 | return (get_target_type(target), True) 180 | 181 | def run(self): 182 | fmt = self.args.output.upper() 183 | dest = self.args.file 184 | 185 | if not self.conf: 186 | sys.stderr.write("Warning: operating without a config file. This is probably not what " 187 | "you want. To correct this, fetch a copy of the default " 188 | "configuration file from https://github.com/hurricanelabs/machinae " 189 | "and place it in /etc/machinae.yml or ~/.machinae.yml and run again." 
class MachinaeOutput:
    """Base class for output formatters: buffer management plus the
    factory that maps the ``-o`` flag to a concrete formatter."""

    @staticmethod
    #pylint: disable=redefined-builtin, inconsistent-return-statements
    #Will be cleaned up in upcoming refactor
    def get_formatter(format):
        """Return a formatter instance for a one-letter code (case
        insensitive): N=Normal, J=Json, D=DotEscaped, S=Short.
        Unknown codes yield None, matching the original behavior."""
        formatters = {
            "N": NormalOutput,
            "J": JsonOutput,
            "D": DotEscapedOutput,
            "S": ShortOutput,
        }
        formatter_cls = formatters.get(format.upper())
        if formatter_cls is not None:
            return formatter_cls()

    @staticmethod
    def escape(text):
        """Base implementation: plain string conversion; subclasses
        override to defang/escape dangerous characters."""
        return str(text)

    #pylint: disable=attribute-defined-outside-init
    #Will be cleaned up in upcoming refactor
    def init_buffer(self):
        """(Re)create the in-memory buffer the report is written into."""
        self._buffer = io.StringIO()

    def print(self, line, lf=True):
        """Append ``line`` to the buffer, with a trailing newline unless
        ``lf`` is False."""
        self._buffer.write(line)
        if lf:
            self._buffer.write("\n")
class NormalOutput(MachinaeOutput):
    """Default human-readable formatter (``-o N``)."""

    def output_header(self, target, otype, otype_detected):
        """Print the banner identifying the observable being reported on."""
        self.print("*" * 80)
        self.print("* Information for {0}".format(self.escape(target)))
        self.print("* Observable type: {0} (Auto-detected: {1})".format(otype, otype_detected))
        self.print("*" * 80)
        #This needs to be refactored so the site from args is available here. No time currently, will do though
        self.print("Not seeing what you expect? Likely not a valid site. Try running with --list-sites")

    def run(self, result_sets: object):
        """Render every ResultSet into the buffer and return the report.

        Each item is either an ErrorResult (has ``error_info``) or a
        SiteResults whose resultset holds (value-dict, pretty_name) pairs.
        """
        self.init_buffer()
        #pylint: disable=too-many-nested-blocks
        for row in result_sets:
            (target, otype, otype_detected) = row.target_info

            self.output_header(target, otype, otype_detected)
            self.print("")

            for item in row.results:
                site = item.site_info
                if hasattr(item, "error_info"):
                    self.print("[!] Error from {0}: {1}".format(site["name"], item.error_info))
                    continue

                if not item.resultset:
                    self.print("[-] No {0} Results".format(site["name"]))
                else:
                    self.print("[+] {0} Results".format(site["name"]))
                    for result in item.resultset:
                        labels = getattr(result[0], "labels", None)
                        if len(result[0].values()) > 1 or labels is not None:
                            values = map(repr, result[0].values())
                            values = map(self.escape, values)
                            if labels is not None:
                                values = zip(labels, values)
                                values = ["{0}: {1}".format(label, value) for (label, value) in values]
                            # Bug fix: join exactly once. ``values`` may be a
                            # single-pass map iterator; the original joined it
                            # here and then AGAIN inside the format() below,
                            # where the exhausted iterator produced "()".
                            joined = ", ".join(values)
                            output = joined

                            if result[1] is not None:
                                output = "({0})".format(joined)
                            output = defang(output)
                        else:
                            output = self.escape(list(result[0].values())[0])
                            output = defang(output)
                            if result[1] is not None:
                                output = "{1}: {0}".format(output, result[1])
                                output = defang(output)
                        self.print(" [-] {0}".format(output))

        return self._buffer.getvalue()
".": " DOT ", 91 | ".": "[.]", 92 | "@": " AT ", 93 | "http://": "hxxp://", 94 | "https://": "hxxps://", 95 | } 96 | 97 | def output_header(self, target, otype, otype_detected): 98 | super().output_header(target, otype, otype_detected) 99 | self.print("* These characters are escaped in the output below:") 100 | for (find, replace) in self.escapes.items(): 101 | self.print("* '{0}' replaced with '{1}'".format(find, replace)) 102 | self.print("* Do not click any links you find below") 103 | self.print("*" * 80) 104 | 105 | @classmethod 106 | def escape(cls, text): 107 | text = super(DotEscapedOutput, cls).escape(text) 108 | for (find, replace) in cls.escapes.items(): 109 | text = text.replace(find, replace) 110 | return text 111 | 112 | #pylint: disable=no-self-use, unused-variable 113 | #Will be cleaned up in upcoming refactor 114 | class JsonGenerator(MachinaeOutput): 115 | def run(self, result_sets): 116 | records = list() 117 | for row in result_sets: 118 | (target, otype, otype_detected) = row.target_info 119 | 120 | for item in row.results: 121 | output = dict() 122 | output["site"] = item.site_info["name"] 123 | output["results"] = dict() 124 | output["observable"] = target 125 | output["observable type"] = otype 126 | output["observable type detected"] = otype_detected 127 | 128 | if hasattr(item, "error_info"): 129 | output["results"] = {"error_info": str(item.error_info)} 130 | elif item.resultset: 131 | for result in item.resultset: 132 | if result.pretty_name not in output["results"]: 133 | output["results"][result.pretty_name] = list() 134 | values = list(result.value.values()) 135 | if len(values) == 1: 136 | output["results"][result.pretty_name].append(values[0]) 137 | elif len(values) > 1: 138 | output["results"][result.pretty_name].append(values) 139 | for (k, v) in output["results"].items(): 140 | if len(v) == 1: 141 | output["results"][k] = v[0] 142 | records.append(output) 143 | return records 144 | 145 | 146 | class JsonOutput(JsonGenerator): 147 
class ShortOutput(MachinaeOutput):
    """Emit one summary line per site: Yes, No, or Error."""

    def run(self, result_sets):
        self.init_buffer()

        for result_set in result_sets:
            (observable, _, _) = result_set.target_info
            self.print("[+] {0}".format(observable))

            for site_result in result_set.results:
                site_name = site_result.site_info["name"]
                if hasattr(site_result, "error_info"):
                    status = "Error"
                elif site_result.resultset:
                    status = "Yes"
                else:
                    status = "No"
                self.print(" {0}: {1}".format(site_name, status))

        return self._buffer.getvalue()
safe="") 35 | ) 36 | 37 | if "format" in target_conf: 38 | target = target_conf["format"] % (target,) 39 | 40 | kwargs["target"] = target 41 | 42 | self._kwargs = kwargs 43 | 44 | kwargs = property(kwargs_getter, kwargs_setter) 45 | 46 | @staticmethod 47 | def from_conf(conf, *args, **kwargs): 48 | from . import csv, html, rss, json, ipwhois 49 | if "webscraper" in conf: 50 | site_conf = conf.pop("webscraper") 51 | scraper = html.Webscraper(site_conf, *args, **kwargs) 52 | elif "tablescraper" in conf: 53 | site_conf = conf.pop("tablescraper") 54 | scraper = html.TableScraper(site_conf, *args, **kwargs) 55 | elif "json" in conf: 56 | site_conf = conf.pop("json") 57 | scraper = json.JsonApi(site_conf, *args, **kwargs) 58 | elif "csv" in conf: 59 | site_conf = conf.pop("csv") 60 | scraper = csv.CsvSite(site_conf, *args, **kwargs) 61 | elif "rss" in conf: 62 | site_conf = conf.pop("rss") 63 | scraper = rss.RssSite(site_conf, *args, **kwargs) 64 | elif "ipwhois" in conf: 65 | site_conf = conf.pop("ipwhois") 66 | scraper = ipwhois.IpWhois(site_conf, *args, **kwargs) 67 | # elif "dns" in conf: 68 | # scraper = DnsSite(conf["dns"], *args, **kwargs) 69 | # elif "ipwhois" in conf: 70 | # scraper = IpWhois(conf["ipwhois"], *args, **kwargs) 71 | else: 72 | raise NotImplementedError(conf.keys()) 73 | scraper.kwargs = conf.copy() 74 | return scraper 75 | 76 | def get_content(self): 77 | raise NotImplementedError 78 | #pylint: disable=no-member 79 | def __iter__(self): 80 | for _ in self.run(): 81 | yield _ 82 | -------------------------------------------------------------------------------- /src/machinae/sites/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import gzip 4 | import io 5 | import warnings 6 | import zipfile 7 | from collections import OrderedDict 8 | 9 | import magic 10 | import pytz 11 | import relatime 12 | import requests 13 | from tzlocal import get_localzone 14 | try: 
class HttpSite(Site):
    """Base class for sites reached over HTTP via a shared requests session."""

    @property
    def url(self):
        """The request URL with target kwargs substituted in.

        Fix: removed a leftover debug ``print`` that dumped the URL to
        stdout on every property access (and formatted the URL twice).
        """
        return self.conf["request"]["url"].format(**self.kwargs)

    @property
    def session(self):
        """Lazily-created requests.Session with UA header and proxies applied."""
        if self._session is None:
            self._session = requests.Session()
            self._session.headers.update({"User-Agent": "Vor/1.0 (Like CIF/2.0)"})
            if self.proxies:
                self._session.proxies = self.proxies
        return self._session

    @staticmethod
    def unzip_content(r, *args, **kwargs):
        """Response hook: transparently unpack zip/gzip bodies in place.

        Sniffs the payload with libmagic; for zip archives only the first
        member is extracted. Writes the decompressed bytes back onto the
        response's private ``_content`` so downstream code sees plain data.
        """
        content = r.content

        mime = magic.from_buffer(content, mime=True)

        if mime == "application/zip":
            zip_buffer = io.BytesIO(content)
            with zipfile.ZipFile(zip_buffer) as zf:
                fn = zf.namelist()[0]
                with zf.open(fn) as f:
                    r._content = f.read()
        elif mime == "application/x-gzip":
            gz_buffer = io.BytesIO(content)
            with gzip.GzipFile(fileobj=gz_buffer) as gz:
                r._content = gz.read()
        else:
            r._content = content

        return r

    def _req(self, conf, url=None):
        """Build and send one HTTP request described by the *conf* dict.

        Returns the requests Response, or None when no URL is configured.
        Fixes over the previous version:
        - the ``conf`` parameter is no longer rebound to a param sub-dict
          (which made the later data/auth/decompress lookups read from the
          wrong dict whenever a dict-valued param was present);
        - params are iterated over a snapshot instead of popping keys out
          of the dict mid-iteration;
        - POST data is copied before formatting so the shared site config
          is not mutated (the params branch already copied).
        """
        if url is None:
            url = conf.get("url", "")
        if url == "":
            return
        url = url.format(**self.kwargs)
        method = conf.get("method", "get").upper()

        kwargs = dict()
        headers = conf.get("headers", {})
        if headers:
            kwargs["headers"] = headers
        verify_ssl = conf.get("verify_ssl", True)

        # GET params. Dict-valued params describe a relative-time expression;
        # they are replaced by the formatted timestamp, or dropped entirely
        # when they carry no "relatime" key.
        params = conf.get("params", {}).copy()
        for (k, v) in list(params.items()):
            if hasattr(v, "items"):
                time_conf = params.pop(k)
                if "relatime" in time_conf:
                    dt = relatime.timeParser(time_conf["relatime"],
                                             timezone=str(get_localzone()))
                    target_tz = pytz.timezone(time_conf.get("timezone", "UTC"))
                    dt = dt.astimezone(target_tz)
                    dt = dt.replace(tzinfo=None)
                    time_format = time_conf.get("format", "%Y-%m-%dT%H:%M:%S.%fZ")
                    if time_format.lower() == "as_epoch":
                        params[k] = str(int(dt.timestamp()))
                    else:
                        params[k] = dt.strftime(time_format)
            else:
                params[k] = str(v).format(**self.kwargs)
        if params:
            kwargs["params"] = params

        # POST data: copy before formatting so repeated calls don't
        # re-format (or clobber) the shared configuration dict.
        data = conf.get("data", {}).copy()
        for (k, v) in data.items():
            data[k] = v.format(**self.kwargs)
        if data:
            kwargs["data"] = data

        # HTTP Basic Auth, looked up by credential name.
        if conf.get("auth") and self.creds and self.creds.get(conf["auth"]):
            kwargs["auth"] = tuple(self.creds[conf["auth"]])

        # Auto decompress zip/gzip bodies via the response hook.
        if conf.get("decompress", False):
            kwargs["hooks"] = {"response": self.unzip_content}

        raw_req = requests.Request(method, url, **kwargs)
        req = self.session.prepare_request(raw_req)
        if self.kwargs.get("verbose", False):
            print("[.] Requesting {0} ({1})".format(req.url, req.method))
        with warnings.catch_warnings():
            if not verify_ssl:
                warnings.simplefilter("ignore", exceptions.InsecureRequestWarning)
            return self.session.send(req, verify=verify_ssl)

    def get_content(self, conf=None, url=None):
        """Issue the configured request and raise for non-ignored HTTP errors."""
        if conf is None:
            conf = self.conf["request"]

        r = self._req(conf, url)
        ignored_status_codes = [int(sc) for sc in conf.get("ignored_status_codes", [])]
        if r.status_code not in ignored_status_codes:
            r.raise_for_status()
        return r

    #pylint: disable=no-self-use
    #Will be cleaned up in upcoming refactor
    def build_result(self, parser, result_dict):
        """Merge parser defaults into *result_dict* and apply key remapping.

        Defaults are applied both before and after the "map" step
        (preserved from the original implementation). When the parser has
        a "pretty_name", the result is wrapped as {value, pretty_name}.
        """
        defaults_dict = parser.get("defaults", {})

        result = OrderedDict()
        result.update(defaults_dict)
        result.update(result_dict)

        # Drop entries produced by unmatched/None keys.
        result.pop(None, None)

        if "map" in parser:
            for (old, new) in parser["map"].items():
                if new is None:
                    result.pop(old)
                elif old in result:
                    result[new] = result.pop(old)

        # NOTE(review): defaults intentionally re-applied after mapping,
        # overwriting mapped values of the same name — confirm intent.
        if "defaults" in parser:
            for (k, v) in parser["defaults"].items():
                result[k] = v

        if "pretty_name" in parser:
            result = OrderedDict([
                ("value", result),
                ("pretty_name", parser["pretty_name"])
            ])

        # Propagate labels attached by multi_match parsing.
        if hasattr(result_dict, "labels"):
            result.labels = result_dict.labels

        return result
class CsvSite(HttpSite):
    """Site whose response body is CSV, optionally with a custom delimiter.

    ``conf["pattern"]`` doubles as the delimiter and, when longer than one
    character, as a regex that gets rewritten to a literal ``|`` before the
    csv module parses the body.
    """

    # Effective delimiter cache; set to "|" once a multi-char pattern has
    # been substituted out of the body. Ordering matters: run/get_content
    # must assign this before the ``dialect`` property is evaluated.
    _delim = None

    @property
    def dialect(self):
        # No custom pattern configured: plain Excel-style CSV.
        if "pattern" not in self.conf:
            return "excel"

        class DelimDialect(csv.excel):
            # Resolved while the class body executes, so it sees the
            # (possibly rewritten) delimiter from ``self.delim``.
            delimiter = str(self.delim)
            skipinitialspace = True

        return DelimDialect()

    @property
    def delim(self):
        # Prefer the rewritten delimiter; fall back to the configured
        # pattern, defaulting to a plain comma.
        return self._delim or self.conf.get("pattern", ",")

    def get_content(self):
        """Fetch the body and return a csv.reader over it."""
        r = super(CsvSite, self).get_content()
        body = r.text

        # Multi-character patterns are treated as a regex and collapsed to
        # a single "|" so the csv module can split on one character.
        if len(self.delim) > 1:
            body = re.sub(self.conf["pattern"], "|", body)
            self._delim = "|"

        buf = io.StringIO(body)
        csvfile = csv.reader(buf, dialect=self.dialect)

        return csvfile

    def run(self):
        """Yield one result dict per CSV row accepted by each configured parser."""
        r = self._req(self.conf["request"])

        # Duplicates the delimiter rewrite from get_content because this
        # path performs its own request.
        body = r.text
        if len(self.delim) > 1:
            body = re.sub(self.conf["pattern"], "|", body)
            self._delim = "|"

        buf = io.StringIO(body)
        csvfile = csv.reader(buf, dialect=self.dialect)

        for (lineno, row) in enumerate(csvfile):
            for parser in self.conf["results"]:
                # Parsers may restrict themselves to a window of line numbers.
                start = parser.get("start", 1)
                stop = parser.get("end", None)

                # raise ValueError(start, stop)
                #pylint: disable=len-as-condition
                # Skip rows before the window, blank rows, and "#" comments.
                if lineno < start or len(row) == 0 or row[0].startswith("#"):
                    continue
                elif stop is not None and lineno > stop:
                    break

                # Optional per-parser row filter: regex against one column.
                if "match" in parser:
                    rex = re.compile(parser["match"]["regex"])
                    col = int(parser["match"]["column"])
                    if not rex.search(row[col]):
                        continue

                row = [item.strip() for item in row]
                result_dict = dict(zip(parser["values"], row))
                yield self.build_result(parser, result_dict)
def html_unescape(content):
    """Decode HTML entities in *content*.

    Uses the Python 3 ``html`` module when available, falling back to the
    Python 2 ``HTMLParser`` API otherwise.
    """
    try:
        import html
    except ImportError:
        import HTMLParser
        return HTMLParser.HTMLParser().unescape(content)
    return html.unescape(content)
class Webscraper(HtmlSite):
    """Extract results from a page body using per-parser regexes."""

    def run(self):
        page = self.get_html()

        if "results" not in self.conf:
            raise Exception("No parsing configuration found")

        for parser in self.conf["results"]:
            # Case-insensitive match; each regex group maps positionally
            # onto the parser's configured value names.
            pattern = re.compile(parser["regex"], flags=re.I)
            for match in pattern.finditer(page):
                result_dict = dict(zip(parser["values"], match.groups()))
                yield self.build_result(parser, result_dict)
-------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import datetime 4 | import json 5 | import re 6 | import urllib.parse 7 | from collections import OrderedDict 8 | 9 | from dateutil.parser import parse 10 | 11 | from relatime import timeParser 12 | 13 | from .base import HttpSite 14 | 15 | class JsonApi(HttpSite): 16 | @staticmethod 17 | def get_value(data, key, default=None): 18 | if key == "@" or data is None: 19 | return data 20 | ret = data 21 | key_parts = key.split(".") 22 | for key_part in key_parts: 23 | if key_part not in ret: 24 | return default 25 | ret = ret[key_part] 26 | return ret 27 | 28 | def get_json(self, url=None): 29 | r = self.get_content(url=url) 30 | 31 | ignored_status_codes = [int(sc) for sc in self.conf["request"].get("ignored_status_codes", [])] 32 | if r.status_code in ignored_status_codes: 33 | return [] 34 | 35 | if not self.conf.get("multi_json", False): 36 | return r.json() 37 | 38 | if r.status_code in ignored_status_codes: 39 | return [] 40 | 41 | results = list() 42 | for json_line in r.text.split("\n"): 43 | if not json_line: 44 | break 45 | results.append(json.loads(json_line)) 46 | return results 47 | 48 | def run(self): 49 | data = self.get_json() 50 | 51 | if hasattr(data, "items"): 52 | next_url = None 53 | if self.conf.get("paginated", False): 54 | next_url = data.get("next", None) 55 | 56 | data = [data] 57 | while next_url: 58 | next_data = self.get_json(url=next_url) 59 | next_url = next_data.get("next", None) 60 | data.append(next_data) 61 | 62 | if "results" not in self.conf: 63 | return 64 | 65 | for row in data: 66 | for parser in self.conf["results"]: 67 | for _ in self.parse_dict(row, parser): 68 | yield _ 69 | 70 | @classmethod 71 | def get_result_dicts(cls, data, parser, mm_key=None, onlyif=None): 72 | if not hasattr(parser, "items"): 73 | parser = {"key": parser} 74 | 75 | if "key" not in parser: 76 | yield data 77 | return 78 | 
79 | key = parser["key"] 80 | rex = None 81 | if "regex" in parser: 82 | rex = re.compile(parser["regex"], flags=re.I) 83 | 84 | if key == "@" and mm_key is not None: 85 | yield {key: mm_key} 86 | return 87 | 88 | values = cls.get_value(data, key) 89 | if values is None: 90 | return 91 | 92 | if not parser.get("match_all", False): 93 | values = [values] 94 | 95 | for val in values: 96 | result_dict = OrderedDict() 97 | 98 | if rex: 99 | m = rex.search(val) 100 | if not m: 101 | return 102 | #pylint: disable=len-as-condition 103 | if len(m.groups()) > 0: 104 | val = m.groups() 105 | if len(val) == 1: 106 | val = val[0] 107 | 108 | urldecode = str(parser.get("urldecode", False)).lower() 109 | if urldecode in ("1", "yes", "true"): 110 | val = urllib.parse.unquote(val) 111 | elif urldecode == "twice": 112 | val = urllib.parse.unquote( 113 | urllib.parse.unquote(val) 114 | ) 115 | 116 | if "format" in parser: 117 | if parser["format"] == "as_list": 118 | val = ", ".join(map(str, val)) 119 | elif parser["format"] == "as_time": 120 | try: 121 | dt = datetime.datetime.fromtimestamp(float(val)) 122 | #pylint: disable=bare-except 123 | #Will be cleaned up in future refactor -- I hate mcmaster 124 | except: 125 | dt = parse(val) 126 | val = dt.isoformat() 127 | result_dict[key] = val 128 | 129 | yield result_dict 130 | 131 | @classmethod 132 | def multi_match_generator(cls, data, parser, mm_key): 133 | if not hasattr(data, "items"): 134 | # Is a list, process each list item 135 | for item in data: 136 | for _ in cls.multi_match_generator(item, parser, mm_key="@"): 137 | yield _ 138 | 139 | return 140 | 141 | onlyif = parser.get("onlyif", None) 142 | if onlyif is not None and not hasattr(onlyif, "items"): 143 | onlyif = {"key": onlyif} 144 | 145 | # Decide how to iterate on the data 146 | # Options are: 147 | # Return result_dict per match in dict (if: data is dict) 148 | # Return one result_dict for whole dict (if: data is dict) 149 | if mm_key == "@" or 
parser.get("match_all", False): 150 | # Treat the entire data as a single match 151 | # Returns a single result_dict 152 | data = [(None, data)] 153 | else: 154 | # Each matching key is a separate result_dict 155 | data = data.items() 156 | 157 | for (k, v) in data: 158 | if onlyif is not None: 159 | if not hasattr(onlyif, "items"): 160 | onlyif = {"key": onlyif} 161 | value = cls.get_value(v, onlyif["key"], None) 162 | 163 | if value is None: 164 | continue 165 | elif "regex" in onlyif: 166 | rex = re.compile(onlyif["regex"], re.I) 167 | if not rex.search(value): 168 | continue 169 | # Check for maxage key in onlyif. If it exists, parse it as Splunk relative time syntax and compare to parsed input "value" 170 | elif "maxage" in onlyif: 171 | age = parse(value) 172 | if not onlyif["maxage"].startswith("-"): # Assume we want dates in the past 173 | print('\033[91m' + 'WARNING: maxage must be prepended with "-" Please correct this in your configuration file.' + '\033[0m') 174 | onlyif["maxage"] = "-%s" % onlyif["maxage"] 175 | ageout = timeParser(onlyif["maxage"]).replace(tzinfo=None) 176 | if age < ageout: 177 | continue 178 | else: 179 | if not bool(value): 180 | continue 181 | result_dict = OrderedDict() 182 | for mm_parser in parser["keys"]: 183 | for mm_result_dict in cls.get_result_dicts(v, mm_parser, mm_key=k, onlyif=onlyif): 184 | result_dict.update(mm_result_dict) 185 | 186 | if result_dict: 187 | result_dict.labels = parser.get("labels", None) 188 | yield result_dict 189 | 190 | def parse_dict(self, data, parser): 191 | if not hasattr(parser, "items"): 192 | parser = {"key": parser} 193 | 194 | if "multi_match" in parser: 195 | target = self.get_value(data, parser["key"]) 196 | if target is None: 197 | return 198 | result_iter = self.multi_match_generator(target, parser["multi_match"], parser["key"]) 199 | else: 200 | result_iter = self.get_result_dicts(data, parser) 201 | 202 | for result_dict in result_iter: 203 | yield self.build_result(parser, 
class RssSite(HttpSite):
    """Parse an RSS/Atom feed and extract regex matches from entry fields."""

    def get_content(self):
        """Fetch the feed body and return it parsed by feedparser."""
        r = super(RssSite, self).get_content()
        return feedparser.parse(r.text)

    def run(self):
        """Yield one result per feed entry matching all fields of a parser.

        Each parser maps entry field names to {regex, values}; an entry is
        accepted only when every field's regex matches.

        Fix: removed a leftover debug ``print(parser)`` that dumped every
        field configuration to stdout inside the hot loop.
        """
        r = self._req(self.conf["request"])
        body = r.text
        rss = feedparser.parse(body)
        parser = None

        for entry in rss.entries:
            for parser1 in self.conf["results"]:
                result_dict = dict()
                for (key, parser) in parser1.items():
                    rex = re.compile(parser["regex"])
                    fieldnames = parser["values"]
                    if not isinstance(fieldnames, list):
                        fieldnames = [fieldnames]
                    rss_value = getattr(entry, key)
                    m = rex.search(rss_value)
                    if m:
                        result_dict.update(dict(zip(fieldnames, m.groups())))
                    else:
                        # Any non-matching field rejects the whole entry.
                        result_dict = None
                        break

                if result_dict is None:
                    continue

                # NOTE(review): passes the LAST per-field conf (``parser``),
                # not ``parser1`` — looks suspicious but preserved; confirm
                # which dict build_result is meant to receive.
                yield self.build_result(parser, result_dict)
def listsites(conf):
    """Render a plain-text table of configured sites (key, name, otypes, default)."""
    row_fmt = '{0:40}{1:40}{2:40}{3}'
    lines = [row_fmt.format('SITE', 'NAME', 'OTYPES', 'DEFAULT')]
    for (key, site) in conf.items():
        # A site is enabled by default unless its config says otherwise.
        enabled = str(site["default"]) if "default" in site else 'True'
        lines.append(row_fmt.format(key,
                                    site["name"],
                                    ', '.join(site["otypes"]),
                                    enabled))
    return '\n'.join(lines) + '\n'