├── .github
│   └── workflows
│       └── codeql-analysis.yml
├── .gitignore
├── .travis.yml
├── Code_Of_Conduct.md
├── Dockerfile
├── LICENSE.txt
├── README.md
├── images
│   ├── README.md
│   ├── machinae-square.jpg
│   ├── machinae.jpg
│   ├── robot-plainer.jpg
│   └── t-machinae.jpg
├── machinae.yml
├── pylintrc
├── requirements.txt
├── setup.py
└── src
    └── machinae
        ├── __init__.py
        ├── cmd.py
        ├── outputs.py
        ├── sites
        │   ├── __init__.py
        │   ├── base.py
        │   ├── csv.py
        │   ├── html.py
        │   ├── ipwhois.py
        │   ├── json.py
        │   └── rss.py
        └── utils.py
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | name: "CodeQL"
2 |
3 | on:
4 | push:
5 | branches: [master, ]
6 | pull_request:
7 | # The branches below must be a subset of the branches above
8 | branches: [master]
9 | schedule:
10 | - cron: '0 2 * * 6'
11 |
12 | jobs:
13 | analyze:
14 | name: Analyze
15 | runs-on: ubuntu-latest
16 |
17 | steps:
18 | - name: Checkout repository
19 | uses: actions/checkout@v2
20 | with:
21 | # We must fetch at least the immediate parents so that if this is
22 | # a pull request then we can checkout the head.
23 | fetch-depth: 2
24 |
25 | # If this run was triggered by a pull request event, then checkout
26 | # the head of the pull request instead of the merge commit.
27 | - run: git checkout HEAD^2
28 | if: ${{ github.event_name == 'pull_request' }}
29 |
30 | # Initializes the CodeQL tools for scanning.
31 | - name: Initialize CodeQL
32 | uses: github/codeql-action/init@v1
33 | # Override language selection by uncommenting this and choosing your languages
34 | with:
35 | languages: python
36 |
37 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
38 | # If this step fails, then you should remove it and run the build manually (see below)
39 | - name: Autobuild
40 | uses: github/codeql-action/autobuild@v1
41 |
42 | # ℹ️ Command-line programs to run using the OS shell.
43 | # 📚 https://git.io/JvXDl
44 |
45 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
46 | # and modify them (or add more) to build your code if your project
47 | # uses a compiled language
48 |
49 | #- run: |
50 | # make bootstrap
51 | # make release
52 |
53 | - name: Perform CodeQL Analysis
54 | uses: github/codeql-action/analyze@v1
55 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | .python-version
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .cache
41 | nosetests.xml
42 | coverage.xml
43 |
44 | # Translations
45 | *.mo
46 | *.pot
47 |
48 | # Django stuff:
49 | *.log
50 |
51 | # Sphinx documentation
52 | docs/_build/
53 |
54 | # PyBuilder
55 | target/
56 |
57 | # Vim
58 | *.swp
59 |
60 | # Divshot
61 | .divshot-cache/
62 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | python:
4 | - "3.6"
5 | - "3.7"
6 | - "3.8"
7 | - "3.9-dev"
8 |
9 | before_install:
10 | - pip install pandoc
11 | - pip install pylint
12 | script:
13 | - pip install -r requirements.txt
14 | - pylint src/machinae/*.py
15 | - pylint src/machinae/sites/*.py
16 |
17 | deploy:
18 | provider: pypi
19 | user: billfordx
20 | password:
21 | secure: PyOibJ0cErm9yCOfgWvToefrnCrwt3iw7H4eU7hdg4x73DXyqVRNHaJDTvfiVWJyJSNRxPe2r80v7VzUKO24Lqgp7FEpf+4dNbEJtJJEis93vYxOerYXthO/VUIh3yk7ULq9YIAn+65XgNRUk/YllebvOpHLnwNh8FQn63HesDVkCrcuiNFjALqC3SNKcg8vQxrBJzXo+f36a45BgZiQ20qZ8czechXKhi1UVWdQ8ezS/+4YAZcdudD3A0+qnfPd0ve0zfpIrm7ZsyQ9jyDXtnWw7QlOLOuQcT3o4OH9WHrtxjrFONtjg4zZnT9gygxUycgWz2NNVqVWx57ZkZImjAVaf8p7Ym/0DKLuMix2f+K5iMtVlKtYnb8ZKCj4UuaNrNmHrbDj7PasckezKbQF+TwMW9UoG54qh3q1fa+l13rZ3kTcjxg1Wn5RXv6/aw/i+3TGHW2hO0eWxAjgRl741NAzZDVuh0PAenYK8DETT2ZUIlU3VnzCbzi6jTunwV8UsToERHyla3GuiykTlmIOb/3THYIs+n7kffH89b1GlOj/+joLWL12AY5dG4zrhv2VYqt+erJ65K34/nJLk19S+KPqIpYKn/dj1cGzE3y2awiADR4nJbDH87BioqjTQ1fV8bxwPmyl0bGEzOoH9DQnFy/hAc6E9RNWkDIJKUUOEH0=
22 | on:
23 | tags: true
24 | branch: master
25 |
--------------------------------------------------------------------------------
/Code_Of_Conduct.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at billford+gitmach@billford.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3
2 |
3 | RUN pip3 install machinae
4 |
5 | # Make sure you have a machinae.yml file to build with
6 | COPY machinae.yml /etc
7 |
8 | ENTRYPOINT ["/usr/local/bin/machinae"]
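# Hypothetical build-and-run sequence (image tag and target are illustrative):
#   docker build -t machinae .
#   docker run --rm machinae 8.8.8.8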
9 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Hurricane Labs
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [Build Status](https://travis-ci.org/HurricaneLabs/machinae)
2 | [CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/2344)
3 | [SonarCloud](https://sonarcloud.io/dashboard?id=HurricaneLabs_machinae)
4 |
5 | 
6 |
7 |
8 | Machinae Security Intelligence Collector
9 | ========================================
10 |
11 | Machinae is a tool for collecting intelligence from public sites/feeds about
12 | various security-related pieces of data: IP addresses, domain names, URLs,
13 | email addresses, file hashes and SSL fingerprints. It was inspired by
14 | [Automater][1], another excellent tool for collecting information. The Machinae
15 | project was born from a desire to improve Automater in four areas:
16 |
17 | 1. Codebase - Bring Automater to python3 compatibility while making the code
18 | more pythonic
19 | 2. Configuration - Use a more human readable configuration format (YAML)
20 | 3. Inputs - Support JSON parsing out-of-the-box without the need to write
21 | regular expressions, but still support regex scraping when needed
22 | 4. Outputs - Support additional output types, including JSON, while making
23 | extraneous output optional
24 |
25 |
26 | Installation
27 | ------------
28 |
29 | Machinae can be installed using pip3:
30 |
31 | pip3 install machinae
32 |
33 | Or, if you're feeling adventurous, it can be installed directly from GitHub:
34 |
35 | pip3 install git+https://github.com/HurricaneLabs/machinae.git
36 |
37 | You will need to have whatever dependencies are required on your system for
38 | compiling Python modules (on Debian based systems, `python3-dev`), as well as
39 | the libyaml development package (on Debian based systems, `libyaml-dev`).
40 |
41 | You'll also want to grab the [latest configuration file][2] and place it in
42 | `/etc/machinae.yml`.
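
If `curl` is available, something like this will fetch it (adjust the
destination path to taste):

    curl -L -o /etc/machinae.yml https://github.com/HurricaneLabs/machinae/raw/master/machinae.yml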
43 |
44 |
45 | Configuration File
46 | ------------------
47 |
48 | Machinae supports a simple configuration merging system to allow you to make
49 | adjustments to the configuration without modifying the machinae.yml we provide
50 | you, making configuration updates a snap. This is done by finding a system-wide
51 | default configuration (default `/etc/machinae.yml`), merging into that a
52 | system-wide local configuration (`/etc/machinae.local.yml`) and finally a
53 | per-user local configuration (`~/.machinae.yml`). The system-wide configuration
54 | can also be located in the current working directory, can be set using the
55 | `MACHINAE_CONFIG` environment variable, or of course by using the `-c` or
56 | `--config` command line options. Configuration merging can be disabled by
57 | passing the `--nomerge` option, which will cause Machinae to only load the
58 | default system-wide configuration (or the one passed on the command line).
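
For instance, either of the following (the path and target are illustrative)
points Machinae at an alternate configuration:

    MACHINAE_CONFIG=~/machinae.yml machinae example.com
    machinae -c ~/machinae.yml --nomerge example.com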
59 |
60 | As an example of this, say you'd like to enable the Fortinet Category site,
61 | which is disabled by default. You could modify `/etc/machinae.yml`, but these
62 | changes would be overwritten by an update. Instead, you can put the following
63 | in either `/etc/machinae.local.yml` or `~/.machinae.yml`:
64 |
65 | fortinet_classify:
66 | default: true
67 |
68 | Or, conversely, to disable a site, such as Virus Total pDNS:
69 |
70 | vt_ip:
71 | default: false
72 | vt_domain:
73 | default: false
74 |
75 |
76 | Usage
77 | -----
78 |
79 | Machinae usage is very similar to Automater:
80 |
81 | usage: machinae [-h] [-c CONFIG] [--nomerge] [-d DELAY] [-f FILE] [-i INFILE] [-v]
82 | [-o {D,J,N,S}] [-O {ipv4,ipv6,fqdn,email,sslfp,hash,url}] [-q]
83 | [-s SITES] [-a AUTH] [-H HTTP_PROXY]
84 | [--dump-config | --detect-otype]
85 | ...
86 |
87 | - See above for details on the `-c`/`--config` and `--nomerge` options.
88 |
89 | - Machinae supports a `-d`/`--delay` option, like Automater. However, Machinae
90 | uses 0 by default.
91 |
92 | - Machinae output is controlled by two arguments:
93 | - `-o` controls the output format, and can be followed by a single character
94 | to indicate the desired type of output:
95 | - *N* is the default output ("Normal")
96 | - *D* is the default output, but dot characters are replaced
97 | - *J* is JSON output
- *S* is "short" output, which simply reports yes/no/error for each site
98 | - `-f`/`--file` specifies the file where output should be written. The default
99 | is "-" for stdout.
100 |
101 | - Machinae will attempt to auto-detect the type of target passed in (Machinae
102 | refers to targets as "observables" and the type as "otype"). This detection can
103 | be overridden with the `-O`/`--otype` option. The choices are listed in the
104 | usage message above.
105 |
106 | - By default, Machinae operates in verbose mode. In this mode, it will output
107 | status information about the services it is querying on the console as they are
108 | queried. This output will always be written to stdout, regardless of the output
109 | setting. To disable verbose mode, use `-q`.
110 |
111 | - By default, Machinae will run through all services in the configuration that
112 | apply to each target's otype *and* are not marked as "default: false". To modify
113 | this behavior, you can:
114 | - Pass a comma-separated list of sites to run via `-s` (use the top level key from the
115 | configuration).
116 | - Pass the special keyword `all` to run through all services *including* those
117 | marked as "default: false"
118 |
119 | Note that in both cases, `otype` validation is still applied.
120 |
121 | - Machinae supports passing an HTTP proxy on the command line using the
122 | `-H`/`--http-proxy` argument. If no proxy is specified, machinae will search the
123 | standard `HTTP_PROXY` and `HTTPS_PROXY` environment variables, as well as the
124 | less standard `http_proxy` and `https_proxy` environment variables.
125 |
126 | - Lastly, a list of targets should be passed. All arguments other than the
127 | options listed above will be interpreted as targets.
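
Putting these together, a hypothetical invocation that quietly queries two
specific sites and writes JSON results to a file might look like:

    machinae -q -s ipwhois,sans -o J -f results.json 198.51.100.7

(`ipwhois` and `sans` are top-level keys from the bundled machinae.yml; the
IP address is a placeholder.)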
128 |
129 |
130 | Out-of-the-Box Data Sources
131 | ---------------------------
132 |
133 | Machinae comes with out-of-the-box support for the following data sources:
134 |
135 | - IPVoid
136 | - URLVoid
137 | - URL Unshortener (http://www.toolsvoid.com/unshorten-url)
138 | - Malc0de
139 | - SANS
140 | - FreeGeoIP (freegeoip.io)
141 | - Fortinet Category
142 | - VirusTotal pDNS (via web scrape - commented out)
143 | - VirusTotal pDNS (via JSON API)
144 | - VirusTotal URL Report (via JSON API)
145 | - VirusTotal File Report (via JSON API)
146 | - Reputation Authority
147 | - ThreatExpert
148 | - VxVault
149 | - ProjectHoneypot
150 | - McAfee Threat Intelligence
151 | - StopForumSpam
152 | - Cymru MHR
153 | - ICSI Certificate Notary
154 | - TotalHash (disabled by default)
155 | - DomainTools Parsed Whois (Requires API key)
156 | - DomainTools Reverse Whois (Requires API key)
157 | - DomainTools Reputation
158 | - IP WHOIS (Using RIR REST interfaces)
159 | - Hacked IP
160 | - Metadefender Cloud (Requires API key)
161 | - GreyNoise (Requires API key)
162 | - IBM XForce (Requires API key)
163 |
164 | With additional data sources on the way.
165 |
166 | HTTP Basic Authentication and Configuration
167 | -------------------------------------------
168 |
169 | Machinae supports HTTP Basic Auth for sites that require it through the `--auth/-a`
170 | flag. You will need to create a YAML file with your credentials, keyed by the
171 | site that requires them; each value is a two-item list of username, then
172 | password or API key. For example, for the included PassiveTotal site this might
173 | look like:
174 |
175 | passivetotal: ['myemail@example.com', 'my_api_key']
176 |
177 | Inside the site configuration under `request` you will see a key such as:
178 |
179 | json:
180 | request:
181 | url: '...'
182 | auth: passivetotal
183 |
184 | The `auth: passivetotal` points to the key inside the authentication config passed
185 | via the command line.
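
For example, a hypothetical run against one of the PassiveTotal sites using
such a credentials file (the path is illustrative):

    machinae -a ~/machinae-auth.yml -s passivetotal_pdns example.com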
186 |
187 | ### Disabled by default
188 |
189 | The following sites are disabled by default:
190 |
191 | - Fortinet Category (`fortinet_classify`)
192 | - Telize Geo IP (`telize`)
193 | - TotalHash (`totalhash_ip`)
194 | - DomainTools Parsed Whois (`domaintools_parsed_whois`)
195 | - DomainTools Reverse Whois (`domaintools_reverse_whois`)
196 | - DomainTools Reputation (`domaintools_reputation`)
197 | - PassiveTotal Passive DNS (`passivetotal_pdns`)
198 | - PassiveTotal Whois (`passivetotal_whois`)
199 | - PassiveTotal SSL Certificate History (`passivetotal_sslcert`)
200 | - PassiveTotal Host Attribute Components (`passivetotal_components`)
201 | - PassiveTotal Host Attribute Trackers (`passivetotal_trackers`)
202 | - MaxMind GeoIP2 Passive Insight (`maxmind`)
203 | - FraudGuard (`fraudguard`)
204 | - Shodan (`shodan`)
205 | - Hacked IP
206 | - Metadefender Cloud (Requires API key)
207 | - GreyNoise (Requires API key)
208 | - IBM XForce (Requires API key)
209 |
210 | Output Formats
211 | --------------
212 |
213 | Machinae comes with a limited set of output formats: normal, normal with dot
214 | escaping, and JSON. We plan to add additional output formats in the future.
215 |
216 |
217 | Adding additional sites
218 | -----------------------
219 |
220 | *** COMING SOON ***
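
In the meantime, the bundled machinae.yml is the best reference. A minimal
site definition follows this general shape (a sketch only; the site name,
URL, and result keys below are placeholders):

    my_site:
      name: My Site
      default: true
      otypes:
        - ipv4
      json:
        request:
          url: 'https://api.example.com/lookup/{target}'
          method: get
        results:
          - key: risk_score
            pretty_name: Example Risk Score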
221 |
222 |
223 | Known Issues
224 | ------------
225 |
226 | - Some ISPs on IPVoid contain double-encoded HTML entities, which are not
227 | double-decoded
228 |
229 |
230 | Upcoming Features
231 | -----------------
232 |
233 | - Add IDS rule search functionality (VRT/ET)
234 | - Add "More info" link for sites
235 | - Add "dedup" option to parser settings
236 | - Add option for per-otype request settings
237 | - Add custom per-site output for error codes
238 |
239 |
240 | Version History
241 | ---------------
242 |
243 | ### Version 1.4.9 (2020-11-25) ###
244 | - Fix bug in JSON as_time processing when time is an epoch time, but str type
245 |
246 | ### Version 1.4.1 (2018-08-31) ###
247 | - New Features
248 | - Automatically Defangs output
249 | - MISP Support (example added to machinae.yml)
250 |
251 | ### Version 1.4.0 (2016-04-20) ###
252 | - New features
253 | - "-a"/"--auth" option for passing an auth config file
254 | - Thanks johannestaas for the submission
255 | - "-H"/"--http-proxy" option, and environment support, for HTTP proxies
256 | - New sites
257 | - Passivetotal (various forms, thanks johannestaas)
258 | - MaxMind
259 | - FraudGuard
260 | - Shodan
261 | - Updated sites
262 | - FreeGeoIP (replaced freegeoip.net with freegeoip.io)
263 |
264 | ### Version 1.3.4 (2016-04-01) ###
265 | - Bug fixes
266 | - Convert exceptions to str when outputting to JSON
267 | - Should actually close #14
268 |
269 | ### Version 1.3.3 (2016-03-28) ###
270 | - Bug fixes
271 | - Correctly handle error results when outputting to JSON
272 | - Closes #14
273 | - Thanks Den1al for the bug report
274 |
275 | ### Version 1.3.2 (2016-03-10) ###
276 | - New features
277 | - "Short" output mode - simply output yes/no/error for each site
278 | - "-i"/"--infile" option for passing a file with list of targets
279 |
280 | ### Version 1.3.1 (2016-03-08) ###
281 |
282 | - New features
283 | - Prepend "http://" to URL targets when not starting with http:// or https://
284 |
285 | ### Version 1.3.0 (2016-03-07) ###
286 |
287 | - New sites
288 | - Cymon.io - Threat intel aggregator/tracker by eSentire
289 | - New features
290 | - Support simple paginated responses
291 | - Support url encoding 'target' in request URL
292 | - Support url decoding values in results
293 |
294 | ### Version 1.2.0 (2016-02-16) ###
295 |
296 | - New features
297 | - Support for sites returning multiple JSON documents
298 | - Ability to specify time format for relative time parameters
299 | - Ability to parse Unix timestamps in results and display in ISO-8601 format
300 | - Ability to specify status codes to ignore per-API
301 | - New sites
302 | - DNSDB - Farsight Security Passive DNS database (premium)
303 |
304 | ### Version 1.1.2 (2015-11-26) ###
305 |
306 | - New sites
307 | - Telize - GeoIP site (premium)
308 | - Freegeoip - GeoIP site (free)
309 | - CIF - CIFv2 API support, from csirtgadgets.org
310 | - New features
311 | - Ability to specify labels for single-line multimatch JSON outputs
312 | - Ability to specify relative time parameters using relatime library
313 |
314 | ### Version 1.0.1 (2015-10-13) ###
315 |
316 | - Fixed a false-positive bug with Spamhaus (GitHub #10)
317 |
318 | ### Version 1.0.0 (2015-07-02) ###
319 |
320 | - Initial release
321 |
322 |
323 | License Info
324 | ------------
325 |
326 | The MIT License (MIT)
327 |
328 | Copyright (c) 2015 Hurricane Labs LLC
329 |
330 | Permission is hereby granted, free of charge, to any person obtaining a copy
331 | of this software and associated documentation files (the "Software"), to deal
332 | in the Software without restriction, including without limitation the rights
333 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
334 | copies of the Software, and to permit persons to whom the Software is
335 | furnished to do so, subject to the following conditions:
336 |
337 | The above copyright notice and this permission notice shall be included in
338 | all copies or substantial portions of the Software.
339 |
340 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
341 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
342 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
343 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
344 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
345 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
346 | THE SOFTWARE.
347 |
348 |
349 | [1]: https://github.com/1aN0rmus/TekDefense-Automater
350 | [2]: https://github.com/HurricaneLabs/machinae/raw/master/machinae.yml
351 |
--------------------------------------------------------------------------------
/images/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/images/machinae-square.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HurricaneLabs/machinae/9ef3e6ce1a8d4ad00107ca206e72bf8dc09878f1/images/machinae-square.jpg
--------------------------------------------------------------------------------
/images/machinae.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HurricaneLabs/machinae/9ef3e6ce1a8d4ad00107ca206e72bf8dc09878f1/images/machinae.jpg
--------------------------------------------------------------------------------
/images/robot-plainer.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HurricaneLabs/machinae/9ef3e6ce1a8d4ad00107ca206e72bf8dc09878f1/images/robot-plainer.jpg
--------------------------------------------------------------------------------
/images/t-machinae.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HurricaneLabs/machinae/9ef3e6ce1a8d4ad00107ca206e72bf8dc09878f1/images/t-machinae.jpg
--------------------------------------------------------------------------------
/machinae.yml:
--------------------------------------------------------------------------------
1 | ipwhois:
2 | name: IP Whois
3 | otypes:
4 | - ipv4
5 | ipwhois:
6 | results:
7 | - key: '@'
8 | multi_match:
9 | keys:
10 | - asn
11 | - asn_cidr
12 | - asn_date
13 | - asn_registry
14 | - asn_country_code
15 | pretty_name: ASN Information
16 | - key: nets
17 | multi_match:
18 | keys:
19 | - cidr
20 | - handle
21 | - name
22 | - range
23 | pretty_name: Network Information
24 | - key: nets
25 | multi_match:
26 | keys:
27 | - description
28 | - key: created
29 | regex: '(\d+-\d+-\d+)T'
30 | - key: updated
31 | regex: '(\d+-\d+-\d+)T'
32 | pretty_name: Registration Info
33 | - key: nets
34 | multi_match:
35 | keys:
36 | - city
37 | - state
38 | - postal_code
39 | - country
40 | pretty_name: Registration Locality
41 | # For when we use RWS
42 | - key: nets
43 | multi_match:
44 | keys:
45 | - key: abuse_emails
46 | split: "\n"
47 | pretty_name: Abuse Email
48 | - key: nets
49 | multi_match:
50 | keys:
51 | - key: tech_emails
52 | split: "\n"
53 | pretty_name: Tech Email
54 | # For when we fall back to regular whois
55 | - key: nets
56 | multi_match:
57 | keys:
58 | - key: emails
59 | split: "\n"
60 | pretty_name: Contacts
61 | spamhaus_ip:
62 | name: Spamhaus Zen BL
63 | default: False
64 | otypes:
65 | - ipv4
66 | webscraper:
67 | request:
68 | url: 'http://www.spamhaus.org/query/ip/{target}'
69 | method: get
70 | strip_comments: true
71 | results:
72 | - regex: '\S+ is (listed in the \w+)'
73 | values:
74 | - spamhaus_zenbl
75 | pretty_name: Spamhaus Zen BL
76 | spamhaus_domain:
77 | name: Spamhaus Domain BL
78 | default: False
79 | otypes:
80 | - fqdn
81 | webscraper:
82 | request:
83 | url: 'http://www.spamhaus.org/query/domain/{target}'
84 | method: get
85 | results:
86 | - regex: '\S+ is (listed in the \w+)'
87 | values:
88 | - spamhaus_dbl
89 | pretty_name: Spamhaus DBL
90 | ipvoid:
91 | name: IPVoid
92 | default: False
93 | otypes:
94 | - ipv4
95 | json:
96 | request:
97 | url: 'https://endpoint.apivoid.com/iprep/v1/pay-as-you-go/'
98 | params:
99 | key:
100 | ip: '{target}'
101 | method: get
102 | results:
103 | - key: data.report.blacklists.detections
104 | pretty_name: Number of detections
105 | - key: data.report.blacklists.detection_rate
106 | pretty_name: IP Void Detection Rate
107 | - key: data.report.blacklists.engines
108 | pretty_name: Engines
109 | multi_match:
110 | keys:
111 | - engine
112 | - reference
113 | onlyif: detected
114 |
115 | urlvoid:
116 | name: URLVoid
117 | otypes:
118 | - fqdn
119 | webscraper:
120 | request:
121 | url: 'http://www.urlvoid.com/scan/{target}'
122 | method: get
123 | results:
124 | - regex: 'Analysis Date<\/td>(.+?)<\/td>'
125 | values: urlvoid_analysis_date
126 | pretty_name: Last Analysis
127 | - regex: '(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}).{5,30}Find\swebsites\shosted\shere'
128 | values: urlvoid_ip
129 | pretty_name: IP from URLVoid
130 | - regex: '\/>(.+?)<\/td> | <\/i>'
131 | values: urlvoid_blacklist
132 | pretty_name: Blacklist from URL Void
133 | - regex: 'Domain\s1st\sRegistered.+\ | (.+)\<\/td\>'
134 | values: urlvoid_domain_age
135 | pretty_name: Domain Age from URL Void
136 | - regex: 'latitude\s/\slongitude.+\ | (.+)\<\/td\>'
137 | values: urlvoid_location
138 | pretty_name: Geo Coordinates from URLVoid
139 | - regex: 'alt="flag"\s/>\s\(\w+\)\s+([\w\s]+) | '
140 | values: urlvoid_country_code
141 | pretty_name: Country from URLVoid
142 | unshorten:
143 | name: URL Unshorten
144 | otypes:
145 | - fqdn
146 | - url
147 | webscraper:
148 | request:
149 | url: http://www.toolsvoid.com/unshorten-url
150 | method: post
151 | data:
152 | urladdr: '{target}'
153 | results:
154 | - regex: 'class="myarea">(.*?))\d{1,3})'
205 | values:
206 | - AbuseIPReports
207 | pretty_name: 'AbuseIPDB reports'
208 |
209 | - regex: '((?<=most\srecent\sreport\swas\s)\d{1,3}\s\w+\s\w+)'
210 | values:
211 | - Last_seen
212 | pretty_name: 'Last seen'
213 |
214 | RansomwareTracker:
215 | name: RansomwareTracker
216 | otypes:
217 | - ipv4
218 | webscraper:
219 | request:
220 | url: 'https://ransomwaretracker.abuse.ch/host/{target}'
221 | method: get
222 | results:
223 | - regex: '((?<=Host\sStatus:)\w+)'
224 | values:
225 | - Active
226 | pretty_name: 'Host Status'
227 | - regex: '((?<= | \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2} | )\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})'
228 | values:
229 | - Last_seen
230 | pretty_name: 'Last seen'
231 | - regex: '((?<=Malware: | )\w+)'
232 | values:
233 | - ransomwareType
234 | pretty_name: 'Ransomware Type'
235 |
236 | sans:
237 | name: SANS
238 | otypes:
239 | - ipv4
240 | webscraper:
241 | request:
242 | url: 'https://isc.sans.edu/api/ip/{target}'
243 | method: get
244 | results:
245 | - regex: 'attacks>(\d+)<'
246 | values:
247 | - sans_attacks
248 | pretty_name: SANS attacks
249 | - regex: 'count>(\d+)<'
250 | values:
251 | - sans_count
252 | pretty_name: SANS count
253 | - regex: 'count>(\d+)<'
254 | values:
255 | - sans_count
256 | pretty_name: SANS count
257 | - regex: 'maxdate>(\d{4}-\d{2}-\d{2})<'
258 | values:
259 | - sans_maxdate
260 | pretty_name: SANS maxdate
261 | - regex: 'mindate>(\d{4}-\d{2}-\d{2})<'
262 | values:
263 | - sans_mindate
264 | pretty_name: SANS mindate
265 | telize:
266 | name: Telize GeoIP
267 | default: False
268 | otypes:
269 | - ipv4
270 | json:
271 | request:
272 | url: 'https://telize-v1.p.rapidapi.com/location/{target}'
273 | method: get
274 | headers:
275 | x-rapidapi-host: telize-v1.p.rapidapi.com
276 | x-rapidapi-key:
277 | Accept: application/json
278 | results:
279 | - key: continent_code
280 | pretty_name: GeoIP Continent Code
281 | - key: country_code
282 | pretty_name: GeoIP Country Code
283 | - key: country
284 | pretty_name: GeoIP Country
285 | - key: region_code
286 | pretty_name: GeoIP Region Code
287 | - key: region
288 | pretty_name: GeoIP Region
289 | - key: city
290 | pretty_name: GeoIP City
291 | - key: postal_code
292 | pretty_name: GeoIP Zip Code
293 | - key: latitude
294 | pretty_name: GeoIP Latitude
295 | - key: longitude
296 | pretty_name: GeoIP Longitude
297 | - key: timezone
298 | pretty_name: GeoIP Timezone
299 | - key: offset
300 | pretty_name: GeoIP UTC Offset
301 | - key: asn
302 | pretty_name: GeoIP ASN
303 | - key: isp
304 | pretty_name: GeoIP ISP
305 | maxmind:
306 | name: MaxMind GeoIP2 Precision
307 | default: False
308 | otypes:
309 | - ipv4
310 | json:
311 | request:
312 | url: https://geoip.maxmind.com/geoip/v2.1/insights/{target}
313 | auth: maxmind
314 | results:
315 | - key: country.iso_code
316 | pretty_name: MaxMind Country Code
317 | - key: country.names.en
318 | pretty_name: MaxMind Country
319 | - key: subdivisions
320 | multi_match:
321 | keys:
322 | - iso_code
323 | pretty_name: MaxMind Region Code
324 | - key: subdivisions
325 | multi_match:
326 | keys:
327 | - names.en
328 | pretty_name: MaxMind Region
329 | - key: city.names.en
330 | pretty_name: MaxMind City
331 | - key: postal.code
332 | pretty_name: MaxMind Zip Code
333 | - key: location.latitude
334 | pretty_name: MaxMind Latitude
335 | - key: location.longitude
336 | pretty_name: MaxMind Longitude
337 | - key: location.time_zone
338 | pretty_name: MaxMind Timezone
339 | freegeoip:
340 | name: freegeoip.io
341 | default: true
342 | otypes:
343 | - ipv4
344 | # - fqdn
345 | json:
346 | request:
347 | url: https://freegeoip.io/json/{target}
348 | results:
349 | - key: country_code
350 | pretty_name: GeoIP Country Code
351 | - key: country_name
352 | pretty_name: GeoIP Country
353 | # - key: region_code
354 | # pretty_name: GeoIP Region Code
355 | # - key: region_name
356 | # pretty_name: GeoIP Region
357 | - key: city
358 | pretty_name: GeoIP City
359 | # - key: zip_code
360 | # pretty_name: GeoIP Zip Code
361 | # - key: latitude
362 | # pretty_name: GeoIP Latitude
363 | # - key: longitude
364 | # pretty_name: GeoIP Longitude
365 | # - key: time_zone
366 | # pretty_name: GeoIP Timezone
367 | fortinet_classify:
368 | name: Fortinet Category
369 | default: True
370 | otypes:
371 | - ipv4
372 | - fqdn
373 | - url
374 | webscraper:
375 | request:
376 | url: 'https://www.fortiguard.com/webfilter?q={target}'
377 | method: get
378 | results:
379 | - regex: 'Category:\s(.+)<\/h4>\s'
380 | values:
381 | - fortinet_category
382 | pretty_name: Fortinet URL Category
383 | vt_ip:
384 | name: VirusTotal pDNS
385 | otypes:
386 | - ipv4
387 | json:
388 | request:
389 | url: https://www.virustotal.com/vtapi/v2/ip-address/report
390 | params:
391 | ip: '{target}'
392 | apikey: 308211ef74a1044ea98134424b3d20769451d25beda0b808a8b61036badc0ea1
393 | method: get
394 | results:
395 | - key: resolutions
396 | multi_match:
397 | keys:
398 | - key: last_resolved
399 | regex: '(\d{4}\-\d{1,2}\-\d{1,2})'
400 | - hostname
401 | onlyif:
402 | key: last_resolved
403 | maxage: '-30d'
404 | pretty_name: pDNS data from VirusTotal
405 | - key: detected_urls
406 | multi_match:
407 | keys:
408 | - key: scan_date
409 | regex: '(\d{4}\-\d{1,2}\-\d{1,2})'
410 | - key: url
411 | regex: '(http.{1,70}/)'
412 | onlyif:
413 | key: scan_date
414 | maxage: '-30d'
415 | pretty_name: pDNS malicious URLs from VirusTotal
416 | # vt_ip:
417 | # name: VirusTotal pDNS
418 | # otypes:
419 | # - ip
420 | # webscraper:
421 | # request:
422 | # url: 'https://www.virustotal.com/en/ip-address/{target}/information/'
423 | # method: get
424 | # headers:
425 | # Accept: 'text/html, application/xhtml+xml, */*'
426 | # Accept-Language: 'en-US'
427 | # Accept-Encoding: 'gzip, deflate'
428 | # DNT: 1
429 | # Connection: 'Keep-Alive'
430 | # results:
431 | # - regex: '(\d{4}\-\d{1,2}\-\d{1,2})\s+<.{30,70}/en/domain/(.{1,80})/information'
432 | # values:
433 | # - vt_pdns_date
434 | # - vt_pdns_domain
435 | # pretty_name: 'pDNS data from VirusTotal'
436 | # - regex: '(\d{4}\-\d{1,2}\-\d{1,2}).{1,20}\s+<.{10,80}/en/url/.{1,100}/analysis/.{1,5}\s+(http.{1,70}/)'
437 | # values:
438 | # - vt_pdns_date
439 | # - vt_pdns_url
440 | # pretty_name: 'pDNS malicious URLs from VirusTotal'
441 | vt_domain:
442 | name: VirusTotal pDNS
443 | otypes:
444 | - fqdn
445 | json:
446 | request:
447 | url: https://www.virustotal.com/vtapi/v2/domain/report
448 | params:
449 | domain: '{target}'
450 | apikey: 308211ef74a1044ea98134424b3d20769451d25beda0b808a8b61036badc0ea1
451 | method: get
452 | results:
453 | - key: resolutions
454 | multi_match:
455 | keys:
456 | - key: last_resolved
457 | regex: '(\d{4}\-\d{1,2}\-\d{1,2})'
458 | - ip_address
459 | pretty_name: pDNS data from VirusTotal
460 | - key: Websense ThreatSeeker category
461 | pretty_name: Websense ThreatSeeker category
462 | - key: Webutation domain info.Safety score
463 | pretty_name: Webutation Safety score
464 | # vt_domain:
465 | # name: VirusTotal pDNS
466 | # otypes:
467 | # - fqdn
468 | # webscraper:
469 | # request:
470 | # url: 'https://www.virustotal.com/en/domain/{target}/information/'
471 | # method: get
472 | # headers:
473 | # Accept: 'text/html, application/xhtml+xml, */*'
474 | # Accept-Language: 'en-US'
475 | # Accept-Encoding: 'gzip, deflate'
476 | # DNT: 1
477 | # Connection: 'Keep-Alive'
478 | # results:
479 | # - regex: '(\d{4}\-\d{1,2}\-\d{1,2})\s+<.{30,70}/en/ip-address/(.{1,80})/information'
480 | # values:
481 | # - vt_pdns_date
482 | # - vt_pdns_ip
483 | # pretty_name: 'pDNS data from VirusTotal'
484 | # - regex: '(\d{4}\-\d{1,2}\-\d{1,2}).{1,20}\s+<.{10,80}/en/url/.{1,100}/analysis/.{1,5}\s+(http.{1,70}/)'
485 | # values:
486 | # - vt_pdns_date
487 | # - vt_pdns_url
488 | # pretty_name: 'pDNS malicious URLs from VirusTotal'
489 | vt_url:
490 | name: VirusTotal URL Report
491 | otypes:
492 | - url
493 | json:
494 | request:
495 | url: https://www.virustotal.com/vtapi/v2/url/report
496 | method: get
497 | params:
498 | apikey: 308211ef74a1044ea98134424b3d20769451d25beda0b808a8b61036badc0ea1
499 | resource: '{target}'
500 | results:
501 | - key: scan_date
502 | pretty_name: Date submitted
503 | - key: positives
504 | pretty_name: Detected scanners
505 | - key: total
506 | pretty_name: Total scanners
507 | - key: scans
508 | pretty_name: URL Scanner
509 | multi_match:
510 | keys:
511 | - '@'
512 | - result
513 | onlyif: detected
514 | vt_hash:
515 | name: VirusTotal File Report
516 | otypes:
517 | - hash
518 | - hash.sha1
519 | - 'hash.sha256'
520 | json:
521 | request:
522 | url: https://www.virustotal.com/vtapi/v2/file/report
523 | method: get
524 | params:
525 | apikey: 308211ef74a1044ea98134424b3d20769451d25beda0b808a8b61036badc0ea1
526 | resource: '{target}'
527 | results:
528 | - key: scan_date
529 | pretty_name: Date submitted
530 | - key: positives
531 | pretty_name: Detected engines
532 | - key: total
533 | pretty_name: Total engines
534 | - key: scans
535 | pretty_name: Scans
536 | multi_match:
537 | keys:
538 | - '@'
539 | - result
540 | onlyif: detected
541 | reputation_authority:
542 | name: Reputation Authority
543 | otypes:
544 | - fqdn
545 | - ipv4
546 | webscraper:
547 | request:
548 | url: 'http://www.reputationauthority.org/lookup.php?ip={target}'
549 | method: get
550 | results:
551 | - regex: '>(\d{1,3}\/\d{1,3})'
552 | values:
553 | - ra_score
554 | pretty_name: Reputation Authority Score
555 | threatexpert:
556 | name: ThreatExpert
557 | otypes:
558 | - hash
559 | webscraper:
560 | request:
561 | url: 'http://www.threatexpert.com/report.aspx?md5={target}'
562 | method: get
563 | results:
564 | - regex: 'Submission\sreceived.\s(.+)'
565 | values:
566 | - threatexpert_date
567 | pretty_name: Hash found at ThreatExpert
568 | - regex: '1">(.{5,100})\s* | (\d+-\d+)\s*\[D\]\s*(.*?)\s*\s*(.*?)
582 | - regex: '>(\d{2}\-\d{2})<'
583 | values:
584 | - vxvault_date
585 | pretty_name: Date found at VXVault
586 | - regex: '\[D\].{2,40}\Wphp\?id.{2,10}>(.{5,100})([a-zA-Z\s]+)'
601 | values:
602 | - php_activity_type
603 | pretty_name: ProjectHoneyPot activity type
604 | - regex: '>First Received From.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])[a-zA-Z0-9><"&:,()=;\s\t/]+Number Received'
605 | values:
606 | - php_first_mail
607 | pretty_name: ProjectHoneyPot first mail received
608 | - regex: '>Last Received From.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])[a-zA-Z0-9><":,()=;\s\t/]+Number Received'
609 | values:
610 | - php_last_mail
611 | pretty_name: ProjectHoneyPot last mail received
612 | - regex: '>Number Received.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)]+[a-zA-Z\)])'
613 | values:
614 | - php_total_mail
615 | pretty_name: ProjectHoneyPot total mail received
616 | - regex: '>Spider First Seen.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])'
617 | values:
618 | - php_first_spider
619 | pretty_name: ProjectHoneyPot spider first seen
620 | - regex: '>Spider Last Seen.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)]+[a-zA-Z])'
621 | values:
622 | - php_last_spider
623 | pretty_name: ProjectHoneyPot spider last seen
624 | - regex: '>Spider Sightings.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(]+[a-zA-Z\)])'
625 | values:
626 | - php_spider_sightings
627 | pretty_name: ProjectHoneyPot total spider sightings
628 | - regex: '>User-Agents.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9\-\(\),\s]+[a-zA-Z\)])'
629 | values:
630 | - php_user_agents
631 | pretty_name: ProjectHoneyPot user-agent sightings
632 | - regex: '>First Post On.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])'
633 | values:
634 | - php_first_post
635 | pretty_name: ProjectHoneyPot first form post
636 | - regex: '>Last Post On.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])'
637 | values:
638 | - php_last_post
639 | pretty_name: ProjectHoneyPot last form post
640 | - regex: '>Form Posts.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)]+[a-zA-Z\)])'
641 | values:
642 | - php_form_posts
643 | pretty_name: ProjectHoneyPot total form posts
644 | - regex: '>First Rule-Break On.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])'
645 | values:
646 | - php_first_rulebreak
647 | pretty_name: ProjectHoneyPot first rule break
648 | - regex: '>Last Rule-Break On.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])'
649 | values:
650 | - php_last_rulebreak
651 | pretty_name: ProjectHoneyPot last rule break
652 | - regex: '>Rule Breaks.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)]+[a-zA-Z\)])'
653 | values:
654 | - php_total_rulebreaks
655 | pretty_name: ProjectHoneyPot total rule breaks
656 | - regex: 'Dictionary Attacks[a-zA-Z0-9><":,()=;\s\t/]+>First Received From.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])'
657 | values:
658 | - php_first_dictionary_attack
659 | pretty_name: ProjectHoneyPot first dictionary attack
660 | - regex: 'Dictionary Attacks[a-zA-Z0-9><"&:,()=;\s\t/]+>Last Received From.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])'
661 | values:
662 | - php_last_dictionary_attack
663 | pretty_name: ProjectHoneyPot last dictionary attack
664 | - regex: '>Dictionary Attacks.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)]+[a-zA-Z\)])'
665 | values:
666 | - php_total_dictionary_attacks
667 | pretty_name: ProjectHoneyPot total dictionary attacks
668 | - regex: '>First Bad Host Appearance.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])'
669 | values:
670 | - php_first_bad_host
671 | pretty_name: ProjectHoneyPot first bad host
672 | - regex: '>Last Bad Host Appearance.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])'
673 | values:
674 | - php_last_bad_host
675 | pretty_name: ProjectHoneyPot last bad host
676 | - regex: '>Bad Host Appearances.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)\-]+[a-zA-Z\)])'
677 | values:
678 | - php_total_bad_host
679 | pretty_name: ProjectHoneyPot total bad hosts
680 | - regex: '>Harvester First Seen.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s]+[a-zA-Z])'
681 | values:
682 | - php_first_harvester
683 | pretty_name: ProjectHoneyPot harvester first seen
684 | - regex: '>Harvester Last Seen.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\s\(\)]+[a-zA-Z])'
685 | values:
686 | - php_last_harvester
687 | pretty_name: ProjectHoneyPot harvester last seen
688 | - regex: '>Harvester Sightings.+[\n\r\t\s]+.+[\n\r\t\s]+([a-zA-Z0-9,\(\s]+[a-zA-Z\)])'
689 | values:
690 | - php_total_harvester
691 | pretty_name: ProjectHoneyPot total harvester sightings
692 | - regex: '(?:>Harvester Results(?:.+[\n\s].+[\n\s]+)\s{2,}|(?:))(?!\s)([0-9a-zA-Z.\s:,()-]+)\s{2,}'
693 | values:
694 | - php_harvester_results
695 | pretty_name: ProjectHoneyPot harvester results
696 | mcafee_threat_domain:
697 | name: McAfee Threat
698 | otypes:
699 | - fqdn
700 | webscraper:
701 | request:
702 | url: 'https://www.mcafee.com/threat-intelligence/domain/default.aspx?domain={target}'
703 | method: get
704 | results:
705 | - regex: 'ctl00_breadcrumbContent_imgRisk"[^\r\n]+title="([A-Za-z]+)"'
706 | values:
707 | - mcafee_risk
708 | pretty_name: McAfee Web Risk
709 | - regex: '[\n\s]*Web\sCategory:[\n\s]*([A-Z][A-Za-z\s/,]+?)[\n\s]*'
710 | values:
711 | - mcafee_category
712 | pretty_name: McAfee Web Category
713 | - regex: '[\n\s]*Last\sSeen:[\n\s]*([0-9\-]+)[\n\s]*'
714 | values:
715 | - mcafee_last_seen
716 | pretty_name: McAfee Last Seen
717 | mcafee_threat_ip:
718 | name: McAfee Threat
719 | otypes:
720 | - ipv4
721 | webscraper:
722 | request:
723 | url: 'https://www.mcafee.com/threat-intelligence/ip/default.aspx?ip={target}'
724 | method: get
725 | results:
726 | - regex: 'ctl00_breadcrumbContent_imgRisk"[^\r\n]+src="/img/Threat_IP/rep_([a-z]+)\.png"'
727 | values:
728 | - mcafee_risk
729 | pretty_name: McAfee Web Risk
730 | - regex: 'ctl00_breadcrumbContent_imgRisk1"[^\r\n]+src="/img/Threat_IP/rep_([a-z]+)\.png"'
731 | values:
732 | - mcafee_risk
733 | pretty_name: McAfee Email Risk
734 | - regex: 'ctl00_breadcrumbContent_imgRisk2"[^\r\n]+src="/img/Threat_IP/rep_([a-z]+)\.png"'
735 | values:
736 | - mcafee_risk
737 | pretty_name: McAfee Network Risk
738 | - regex: '[\n\s]*Web\sCategory:[\n\s]*([A-Z][A-Za-z\s/,]+?)[\n\s]*'
739 | values:
740 | - mcafee_category
741 | pretty_name: McAfee Web Category
742 | stopforumspam:
743 | name: StopForumSpam
744 | otypes:
745 | - email
746 | webscraper:
747 | request:
748 | url: 'http://www.stopforumspam.com/search/{target}'
749 | method: get
750 | results:
751 | - regex: '>Found (0*[1-9]\d*) entries'
752 | values:
753 | - sfs_spam_count
754 | pretty_name: Spam email count
755 | cymru_mhr:
756 | name: Cymru MHR
757 | otypes:
758 | - hash
759 | - hash.sha1
760 | webscraper:
761 | request:
762 | url: 'https://hash.cymru.com/cgi-bin/bulkmhr.cgi'
763 | method: post
764 | data:
765 | action: do_whois
766 | bulk_paste: '{target}'
767 | submit_paste: Submit
768 | results:
769 | - regex: '[a-f0-9]+\s(\d+)\s(\d+)'
770 | values:
771 | - cymru_mhr_detect_time
772 | - cymru_mhr_detect_pct
773 | pretty_name: Cymru MHR Detection Percent
774 | icsi_notary:
775 | name: ICSI Certificate Notary
776 | otypes:
777 | - sslfp
778 | dns:
779 | request:
780 | query: '{target_stripped}.notary.icsi.berkeley.edu'
781 | rrtype: txt
782 | results:
783 | - regex: 'version=1 first_seen=(\d+) last_seen=(\d+) times_seen=(\d+) validated=(\d+)'
784 | values:
785 | - icsi_first_seen
786 | - icsi_last_seen
787 | - icsi_times_seen
788 | - icsi_validated
789 | pretty_name: ICSI Notary Results
790 | totalhash_ip:
791 | name: TotalHash
792 | default: false
793 | otypes:
794 | - ip
795 | webscraper:
796 | request:
797 | url: 'https://totalhash.com/network/dnsrr:*{target}*%20or%20ip:{target}'
798 | method: get
799 | results:
800 | - regex: '/analysis/(\w{40}).+(\d{4}\-\d{1,2}\-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})'
801 | values:
802 | - thip_hash
803 | - thip_date
804 | pretty_name: Totalhash
805 | domaintools_parsed_whois:
806 | name: DomainTools Whois
807 | default: false
808 | otypes:
809 | - fqdn
810 | json:
811 | request:
812 | url: 'https://api.domaintools.com/v1/{target}/whois/parsed'
813 | method: get
814 | params:
815 | api_username:
816 | api_key:
817 | results:
818 | - key: response.parsed_whois.contacts
819 | multi_match:
820 | keys:
821 | - '@'
822 | - name
823 | - country
824 | - email
825 | onlyif: name
826 | pretty_name: Whois Contacts
827 | - key: response.parsed_whois.created_date
828 | pretty_name: Domain registered
829 | regex: '(\d{4}\-\d{1,2}\-\d{1,2})'
830 | - key: response.parsed_whois.updated_date
831 | pretty_name: Whois updated
832 | regex: '(\d{4}\-\d{1,2}\-\d{1,2})'
833 | - key: response.parsed_whois.expired_date
834 | pretty_name: Domain expiration
835 | regex: '(\d{4}\-\d{1,2}\-\d{1,2})'
836 | - key: response.parsed_whois.name_servers
837 | pretty_name: Name Servers
838 | #match_all: true
839 | - key: response.parsed_whois.registrar
840 | pretty_name: Registrar Info
841 | multi_match:
842 | keys:
843 | - name
844 | - abuse_contact_phone
845 | - abuse_contact_email
846 | - url
847 | domaintools_reverse_whois:
848 | name: DomainTools Reverse Whois
849 | default: false
850 | otypes:
851 | - email
852 | json:
853 | request:
854 | url: 'https://api.domaintools.com/v1/reverse-whois/'
855 | method: get
856 | params:
857 | terms: '{target}'
858 | mode: purchase
859 | api_username:
860 | api_key:
861 | results:
862 | - key: response.domains
863 | match_all: true
864 | pretty_name: Registered domain
865 | - key: response.domain_count.current
866 | pretty_name: Currently active registered domains
867 | - key: response.domain_count.historic
868 | pretty_name: All registered domains
869 | domaintools_reputation:
870 | name: DomainTools Reputation
871 | default: false
872 | otypes:
873 | - fqdn
874 | json:
875 | request:
876 | url: 'https://api.domaintools.com/v1/reputation/'
877 | method: get
878 | params:
879 | domain: '{target}'
880 | include_reasons: 'true'
881 | api_username:
882 | api_key:
883 | results:
884 | - key: response.risk_score
885 | pretty_name: Risk Score
886 | - key: response.reasons
887 | pretty_name: Reasons
888 | dnsdb_ip:
889 | name: Farsight DNSDB
890 | default: False
891 | otypes:
892 | - ipv4
893 | - ipv6
894 | json:
895 | multi_json: true
896 | request:
897 | url: 'https://api.dnsdb.info/lookup/rdata/ip/{target}'
898 | method: get
899 | headers:
900 | Accept: application/json
901 | X-Api-Key:
902 | results:
903 | - key: '@'
904 | multi_match:
905 | keys:
906 | - rrname
907 | - rrtype
908 | - key: time_first
909 | format: as_time
910 | - key: time_last
911 | format: as_time
912 | labels:
913 | - Record Name
914 | - Record Type
915 | - First Seen
916 | - Last Seen
917 | dnsdb_fqdn:
918 | name: Farsight DNSDB
919 | default: False
920 | otypes:
921 | - fqdn
922 | json:
923 | multi_json: true
924 | request:
925 | url: 'https://api.dnsdb.info/lookup/rrset/name/{target}'
926 | method: get
927 | ignored_status_codes:
928 | - 404
929 | params:
930 | time_last_after:
931 | relatime: '-7d'
932 | timezone: UTC
933 | format: as_epoch
934 | headers:
935 | Accept: application/json
936 | X-Api-Key:
937 | results:
938 | - key: '@'
939 | multi_match:
940 | keys:
941 | - rrtype
942 | - key: rdata
943 | # format: as_list
944 | - key: time_last
945 | format: as_time
946 | labels:
947 | - Record Type
948 | - Record Data
949 | - Last Seen
950 | onlyif:
951 | key: rrtype
952 | regex: "^(A|AAAA|MX|SPF|TXT)$"
953 | cif:
954 | name: Collective Intelligence Framework
955 | default: false
956 | otypes:
957 | - ipv4
958 | - fqdn
959 | - email
960 | - hash
961 | json:
962 | request:
963 | url: 'https://cif/observables'
964 | method: get
965 | params:
966 | nolog: 1
967 | confidence: 75
968 | observable: '{target}'
969 | reporttime:
970 | relatime: '-2d'
971 | timezone: UTC
972 | reporttimeend:
973 | relatime: 'now'
974 | timezone: UTC
975 | headers:
976 | Accept: application/vnd.cif.v2+json
977 | Authorization: Token token=
978 | verify_ssl: False
979 | results:
980 | - key: '@'
981 | multi_match:
982 | keys:
983 | - asn
984 | - cc
985 | labels:
986 | - AS Number
987 | - Country Code
988 | - key: '@'
989 | multi_match:
990 | keys:
991 | - key: reporttime
992 | regex: '^(\d+-\d+-\d+)T'
993 | - confidence
994 | - key: tags
995 | format: as_list
996 | - provider
997 | - description
998 | labels:
999 | - Report Date
1000 | - Confidence
1001 | - Tags
1002 | - Provider
1003 | - Description
1004 |
1005 | threatcrowd_ip_report:
1006 | name: ThreatCrowd IP Report
1007 | default: True
1008 | otypes:
1009 | - ipv4
1010 | json:
1011 | paginated: false
1012 | request:
1013 | url: 'https://www.threatcrowd.org/searchApi/v2/ip/report/?ip={target}'
1014 | method: get
1015 | ignored_status_codes:
1016 | - 404
1017 | results:
1018 | - key: 'resolutions'
1019 | pretty_name: Passive DNS
1020 | multi_match:
1021 | keys:
1022 | - domain
1023 | - last_resolved
1024 | labels:
1025 | - Domain
1026 | - Last Resolved
1027 | onlyif:
1028 | key: last_resolved
1029 | maxage: '-30d'
1030 | - key: 'hashes'
1031 | pretty_name: Known Malware Hash
1032 | match_all: true
1033 |
1034 | passivetotal_pdns:
1035 | name: PassiveTotal Passive DNS
1036 | default: False
1037 | otypes:
1038 | - fqdn
1039 | - ipv4
1040 | json:
1041 | request:
1042 | url: 'https://api.passivetotal.org/v2/dns/passive'
1043 | auth: passivetotal
1044 | params:
1045 | query: '{target}'
1046 | method: get
1047 | headers:
1048 | Accept: application/json
1049 | ignored_status_codes:
1050 | - 401
1051 | results:
1052 | - key: results
1053 | format: as_list
1054 | pretty_name: Results
1055 | multi_match:
1056 | keys:
1057 | - key: resolve
1058 | - key: queryValue
1059 | pretty_name: Query Value
1060 |
1061 | passivetotal_whois:
1062 | name: PassiveTotal Whois
1063 | default: False
1064 | otypes:
1065 | - fqdn
1066 | json:
1067 | request:
1068 | url: 'https://api.passivetotal.org/v2/whois'
1069 | auth: passivetotal
1070 | params:
1071 | query: '{target}'
1072 | method: get
1073 | headers:
1074 | Accept: application/json
1075 | ignored_status_codes:
1076 | - 401
1077 | results:
1078 | - key: registryUpdatedAt
1079 | pretty_name: Registry Updated At
1080 | - key: domain
1081 | pretty_name: Domain
1082 | - key: billing
1083 | pretty_name: Billing
1084 | - key: zone
1085 | pretty_name: Zone
1086 | - key: nameServers
1087 | pretty_name: Name Servers
1088 | - key: registered
1089 | pretty_name: Registered
1090 | - key: lastLoadedAt
1091 | pretty_name: Last Loaded At
1092 | - key: whoisServer
1093 | pretty_name: Whois Server
1094 | - key: contactEmail
1095 | pretty_name: Contact Email
1096 | - key: admin
1097 | pretty_name: Admin
1098 | - key: expiresAt
1099 | pretty_name: Expires At
1100 | - key: registrar
1101 | pretty_name: Registrar
1102 | - key: tech
1103 | pretty_name: Tech
1104 | - key: registrant
1105 | pretty_name: Registrant
1106 |
1107 | passivetotal_sslcert:
1108 | name: PassiveTotal SSL Certificate History
1109 | default: False
1110 | otypes:
1111 | - ipv4
1112 | json:
1113 | request:
1114 | url: 'https://api.passivetotal.org/v2/ssl-certificate/history'
1115 | auth: passivetotal
1116 | params:
1117 | query: '{target}'
1118 | method: get
1119 | headers:
1120 | Accept: application/json
1121 | ignored_status_codes:
1122 | - 401
1123 | results:
1124 | - key: results
1125 | multi_match:
1126 | keys:
1127 | - key: sha1
1128 | pretty_name: Sha1
1129 | - key: firstSeen
1130 | pretty_name: First Seen
1131 | - key: ipAddresses
1132 | pretty_name: Ip Addresses
1133 | - key: lastSeen
1134 | pretty_name: Last Seen
1135 | pretty_name: Results
1136 |
1137 | passivetotal_components:
1138 | name: PassiveTotal Components
1139 | default: False
1140 | otypes:
1141 | - fqdn
1142 | json:
1143 | request:
1144 | url: 'https://api.passivetotal.org/v2/host-attributes/components'
1145 | auth: passivetotal
1146 | params:
1147 | query: '{target}'
1148 | method: get
1149 | headers:
1150 | Accept: application/json
1151 | ignored_status_codes:
1152 | - 401
1153 | results:
1154 | - key: results
1155 | multi_match:
1156 | keys:
1157 | - key: category
1158 | pretty_name: Category
1159 | - key: hostname
1160 | pretty_name: Hostname
1161 | - key: lastSeen
1162 | pretty_name: Last Seen
1163 | - key: firstSeen
1164 | pretty_name: First Seen
1165 | - key: label
1166 | pretty_name: Label
1167 | pretty_name: Results
1168 |
1169 | passivetotal_trackers:
1170 | name: PassiveTotal Trackers
1171 | default: False
1172 | otypes:
1173 | - fqdn
1174 | json:
1175 | request:
1176 | url: 'https://api.passivetotal.org/v2/host-attributes/trackers'
1177 | auth: passivetotal
1178 | params:
1179 | query: '{target}'
1180 | method: get
1181 | headers:
1182 | Accept: application/json
1183 | ignored_status_codes:
1184 | - 401
1185 | results:
1186 | - key: results
1187 | multi_match:
1188 | keys:
1189 | - key: hostname
1190 | pretty_name: Hostname
1191 | - key: attributeType
1192 | pretty_name: Type
1193 | - key: attributeValue
1194 | pretty_name: Value
1195 | - key: lastSeen
1196 | pretty_name: Last Seen
1197 | - key: firstSeen
1198 | pretty_name: First Seen
1199 | pretty_name: Results
1200 | fraudguard:
1201 | name: FraudGuard
1202 | default: False
1203 | otypes:
1204 | - ipv4
1205 | json:
1206 | request:
1207 | url: https://api.fraudguard.io/ip/{target}
1208 | auth: fraudguard
1209 | results:
1210 | - key: isocode
1211 | pretty_name: FraudGuard Country Code
1212 | - key: country
1213 | pretty_name: FraudGuard Country
1214 | - key: state
1215 | pretty_name: FraudGuard State
1216 | - key: city
1217 | pretty_name: FraudGuard City
1218 | - key: discover_date
1219 | pretty_name: FraudGuard Discovery Date
1220 | - key: threat
1221 | pretty_name: FraudGuard Threat Type
1222 | - key: risk_level
1223 | pretty_name: FraudGuard Risk Level
1224 | shodan:
1225 | name: Shodan
1226 | default: False
1227 | otypes:
1228 | - ipv4
1229 | json:
1230 | request:
1231 | url: https://api.shodan.io/shodan/host/{target}
1232 | params:
1233 | key:
1234 | results:
1235 | - key: '@'
1236 | multi_match:
1237 | keys:
1238 | - asn
1239 | - org
1240 | - city
1241 | - region
1242 | - country_code
1243 | - postal_code
1244 | pretty_name: Shodan Organization
1245 | - key: hostnames
1246 | match_all: true
1247 | pretty_name: Shodan Hostnames
1248 | - key: isp
1249 | pretty_name: Shodan ISP
1250 | - key: data
1251 | multi_match:
1252 | keys:
1253 | - timestamp
1254 | - transport
1255 | - port
1256 | - product
1257 | - version
1258 | pretty_name: Shodan Ports
1259 | - key: data
1260 | multi_match:
1261 | keys:
1262 | - transport
1263 | - port
1264 | - ssl.versions
1265 | onlyif: ssl.versions
1266 | pretty_name: Shodan SSL Versions
1267 | - key: data
1268 | multi_match:
1269 | keys:
1270 | - transport
1271 | - port
1272 | - ssl.cert.subject.CN
1273 | - ssl.cert.fingerprint.sha256
1274 | onlyif: ssl.cert.fingerprint.sha256
1275 | pretty_name: Shodan SSL Certs
1276 | ipinfoio:
1277 | name: ipinfo.io
1278 | default: False
1279 | otypes:
1280 | - ipv4
1281 | - ipv6
1282 | json:
1283 | request:
1284 | url: https://ipinfo.io/{target}
1285 | headers:
1286 | Accept: application/json
1287 | results:
1288 | - key: hostname
1289 | pretty_name: ipinfo.io hostname
1290 | - key: city
1291 | pretty_name: ipinfo.io city
1292 | - key: region
1293 | pretty_name: ipinfo.io region
1294 | - key: country
1295 | pretty_name: ipinfo.io country
1296 | - key: loc
1297 | pretty_name: ipinfo.io geolocation
1298 | - key: org
1299 | pretty_name: ipinfo.io organization
1300 | - key: postal
1301 | pretty_name: ipinfo.io postal code
1302 | xforce-malware:
1303 | name: IBM XForce Malware Report
1304 | default: False
1305 | otypes:
1306 | - ipv4
1307 | json:
1308 | request:
1309 | url: https://api.xforce.ibmcloud.com/ipr/malware/{target}
1310 | auth: xforce
1311 | results:
1312 | - key: type
1313 | pretty_name: malware type
1314 | - key: md5
1315 | pretty_name: md5
1316 | - key: domain
1317 | pretty_name: domain name
1318 | - key: firstseen
1319 | pretty_name: first seen
1320 | - key: lastseen
1321 | pretty_name: last seen
1322 | hackedip:
1323 | name: Hacked IP
1324 | default: False
1325 | otypes:
1326 | - ipv4
1327 | json:
1328 | request:
1329 | url: http://www.hackedip.com/api.php?ip={target}
1330 | results:
1331 | - key: '@'
1332 | format: as_list
1333 | pretty_name: Hacked IP Threat List
1334 | metadefender_hash:
1335 | name: MetaDefender File Report
1336 | default: False
1337 | otypes:
1338 | - hash
1339 | - hash.sha1
1340 | - hash.sha256
1341 | json:
1342 | request:
1343 | url: https://api.metadefender.com/v2/hash/{target}
1344 | method: get
1345 | headers:
1346 | apikey:
1347 | results:
1348 | - key: scan_results.start_time
1349 | pretty_name: Date submitted
1350 | - key: scan_results.total_detected_avs
1351 | pretty_name: Detected engines
1352 | - key: scan_results.total_avs
1353 | pretty_name: Total engines
1354 | - key: scan_results.scan_details
1355 | pretty_name: Scans
1356 | multi_match:
1357 | keys:
1358 | - '@'
1359 | - threat_found
1360 | onlyif: scan_result_i
1361 | # misp:
1362 | # name: MISP
1363 | # default: true
1364 | # otypes:
1365 | # - ipv4
1366 | # - url
1367 | # - email
1368 | # - fqdn
1369 | # - hash
1370 | # - hash.sha1
1371 | # - hash.sha256
1372 | # json:
1373 | # request:
1374 | # url: https://***YOUR_MISP_HERE***/events/restSearch/download/{target}/null/null/null/null/7
1375 | # method: get
1376 | # headers:
1377 | # Authorization: ***YOUR_APIKEY_HERE***
1378 | # results:
1379 | # - key: response
1380 | # pretty_name: MISP Events
1381 | # multi_match:
1382 | # keys:
1383 | # - Event.date
1384 | # - Event.id
1385 | # - Event.info
1386 | greynoise:
1387 | # This entry is for the GreyNoise *community* API
1388 | name: GreyNoise
1389 | otypes:
1390 | - ipv4
1391 | json:
1392 | request:
1393 | url: https://api.greynoise.io/v3/community/{target}
1394 | # headers:
1395 | # key: ***YOUR_APIKEY_HERE***
1396 | # you can get this from https://viz.greynoise.io/account/
1397 | ignored_status_codes:
1398 | - 404
1399 | results:
1400 | - key: noise
1401 | pretty_name: GreyNoise Known Scanner
1402 | - key: riot
1403 | pretty_name: GreyNoise Rule-It-OuT
1404 | - key: classification
1405 | pretty_name: GreyNoise Classification
1406 | - key: name
1407 | pretty_name: GreyNoise Name
1408 | greynoise_ent:
1409 | # This entry is for the GreyNoise *enterprise* API
1410 | name: GreyNoise
1411 | default: False
1412 | otypes:
1413 | - ipv4
1414 | json:
1415 | request:
1416 | url: https://enterprise.api.greynoise.io/v2/noise/context/{target}
1417 | headers:
1418 | key: YOUR_APIKEY_HERE
1419 | ignored_status_codes:
1420 | - 404
1421 | results:
1422 | - key: seen
1423 | pretty_name: GreyNoise Known Scanner
1424 | - key: actor
1425 | pretty_name: GreyNoise Actor
1426 | - key: tags
1427 | pretty_name: GreyNoise Reason
1428 | - key: metadata.category
1429 | pretty_name: GreyNoise Category
1430 | - key: first_seen
1431 | pretty_name: GreyNoise First Seen
1432 | - key: last_seen
1433 | pretty_name: GreyNoise Last Seen
1434 | - key: raw_data.web.useragents
1435 | pretty_name: GreyNoise User-agent
1436 | - key: raw_data.scan
1437 | multi_match:
1438 | keys:
1439 | - port
1440 | - protocol
1441 | pretty_name: GreyNoise Observations
1442 | macvendors:
1443 | name: MACVendors
1444 | default: true
1445 | otypes:
1446 | - mac
1447 | webscraper:
1448 | request:
1449 | url: 'https://api.macvendors.com/{target}'
1450 | method: get
1451 | results:
1452 | - regex: '(.+)'
1453 | values:
1454 | - vendor
1455 | pretty_name: Mac Address Vendor
1456 |
--------------------------------------------------------------------------------
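
The YAML above is the heart of machinae: every top-level key defines one site, with `otypes` naming the observable types it accepts, a fetcher block (`json`, `webscraper`, `tablescraper`, `csv`, `rss`, or `ipwhois`) describing the request, and `results` describing how to pull fields out of the response. A minimal sketch of loading and filtering one such definition, using a hypothetical `demo_site` entry:

    import yaml

    # Hypothetical site definition in the same shape as the entries above
    doc = """
    demo_site:
      name: Demo Site
      default: true
      otypes:
        - ipv4
      json:
        request:
          url: 'https://example.invalid/ip/{target}'
        results:
          - key: risk
            pretty_name: Demo Risk
    """
    conf = yaml.safe_load(doc)
    # Mirrors the otype filter applied per-target in cmd.py
    otypes = [o.lower() for o in conf["demo_site"]["otypes"]]
    print("ipv4" in otypes)  # True
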
/pylintrc:
--------------------------------------------------------------------------------
1 | [MASTER]
2 |
3 | # A comma-separated list of package or module names from where C extensions may
4 | # be loaded. Extensions are loaded into the active Python interpreter and may
5 | # run arbitrary code.
6 | extension-pkg-whitelist=
7 |
8 | # Add files or directories to the blacklist. They should be base names, not
9 | # paths.
10 | ignore=CVS
11 |
12 | # Add files or directories matching the regex patterns to the blacklist. The
13 | # regex matches against base names, not paths.
14 | ignore-patterns=
15 |
16 | # Python code to execute, usually for sys.path manipulation such as
17 | # pygtk.require().
18 | #init-hook=
19 |
20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
21 | # number of processors available to use.
22 | jobs=1
23 |
24 | # Control the amount of potential inferred values when inferring a single
25 | # object. This can help the performance when dealing with large functions or
26 | # complex, nested conditions.
27 | limit-inference-results=100
28 |
29 | # List of plugins (as comma separated values of python modules names) to load,
30 | # usually to register additional checkers.
31 | load-plugins=
32 |
33 | # Pickle collected data for later comparisons.
34 | persistent=yes
35 |
36 | # Specify a configuration file.
37 | #rcfile=
38 |
39 | # When enabled, pylint would attempt to guess common misconfiguration and emit
40 | # user-friendly hints instead of false-positive error messages.
41 | suggestion-mode=yes
42 |
43 | # Allow loading of arbitrary C extensions. Extensions are imported into the
44 | # active Python interpreter and may run arbitrary code.
45 | unsafe-load-any-extension=no
46 |
47 |
48 | [MESSAGES CONTROL]
49 |
50 | # Only show warnings with the listed confidence levels. Leave empty to show
51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
52 | confidence=
53 |
54 | # Disable the message, report, category or checker with the given id(s). You
55 | # can either give multiple identifiers separated by comma (,) or put this
56 | # option multiple times (only on the command line, not in the configuration
57 | # file where it should appear only once). You can also use "--disable=all" to
58 | # disable everything first and then reenable specific checks. For example, if
59 | # you want to run only the similarities checker, you can use "--disable=all
60 | # --enable=similarities". If you want to run only the classes checker, but have
61 | # no Warning level messages displayed, use "--disable=all --enable=classes
62 | # --disable=W".
63 | disable=print-statement,
64 | parameter-unpacking,
65 | unpacking-in-except,
66 | old-raise-syntax,
67 | backtick,
68 | long-suffix,
69 | old-ne-operator,
70 | old-octal-literal,
71 | import-star-module-level,
72 | non-ascii-bytes-literal,
73 | raw-checker-failed,
74 | bad-inline-option,
75 | locally-disabled,
76 | locally-enabled,
77 | file-ignored,
78 | suppressed-message,
79 | useless-suppression,
80 | deprecated-pragma,
81 | use-symbolic-message-instead,
82 | apply-builtin,
83 | basestring-builtin,
84 | buffer-builtin,
85 | cmp-builtin,
86 | coerce-builtin,
87 | execfile-builtin,
88 | file-builtin,
89 | long-builtin,
90 | raw_input-builtin,
91 | reduce-builtin,
92 | standarderror-builtin,
93 | unicode-builtin,
94 | xrange-builtin,
95 | coerce-method,
96 | delslice-method,
97 | getslice-method,
98 | setslice-method,
99 | no-absolute-import,
100 | old-division,
101 | dict-iter-method,
102 | dict-view-method,
103 | next-method-called,
104 | metaclass-assignment,
105 | indexing-exception,
106 | raising-string,
107 | reload-builtin,
108 | oct-method,
109 | hex-method,
110 | nonzero-method,
111 | cmp-method,
112 | input-builtin,
113 | round-builtin,
114 | intern-builtin,
115 | unichr-builtin,
116 | map-builtin-not-iterating,
117 | zip-builtin-not-iterating,
118 | range-builtin-not-iterating,
119 | filter-builtin-not-iterating,
120 | using-cmp-argument,
121 | eq-without-hash,
122 | div-method,
123 | idiv-method,
124 | rdiv-method,
125 | exception-message-attribute,
126 | invalid-str-codec,
127 | sys-max-int,
128 | bad-python3-import,
129 | deprecated-string-function,
130 | deprecated-str-translate-call,
131 | deprecated-itertools-function,
132 | deprecated-types-field,
133 | next-method-defined,
134 | dict-items-not-iterating,
135 | dict-keys-not-iterating,
136 | dict-values-not-iterating,
137 | deprecated-operator-function,
138 | deprecated-urllib-function,
139 | xreadlines-attribute,
140 | deprecated-sys-function,
141 | exception-escape,
142 | comprehension-escape,
143 | line-too-long,
144 | missing-docstring,
145 | invalid-name,
146 | unused-argument,
147 | inconsistent-return-statements,
148 | arguments-differ,
149 | protected-access,
150 | too-many-locals,
151 | too-many-branches,
152 | not-context-manager,
153 | unexpected-keyword-arg,
154 | no-member,
155 | cyclic-import,
156 | anomalous-backslash-in-string,
157 | import-outside-toplevel,
158 | no-else-continue,
159 | super-with-arguments
160 |
161 |
162 | # Enable the message, report, category or checker with the given id(s). You can
163 | # either give multiple identifiers separated by commas (,) or put this option
164 | # multiple times (only on the command line, not in the configuration file where
165 | # it should appear only once). See also the "--disable" option for examples.
166 | enable=c-extension-no-member
167 |
168 |
169 | [REPORTS]
170 |
171 | # Python expression which should return a note less than 10 (10 is the highest
172 | # note). You have access to the variables errors, warning, and statement, which
173 | # respectively contain the number of errors / warnings messages and the total
174 | # number of statements analyzed. This is used by the global evaluation report
175 | # (RP0004).
176 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
177 |
178 | # Template used to display messages. This is a python new-style format string
179 | # used to format the message information. See doc for all details.
180 | #msg-template=
181 |
182 | # Set the output format. Available formats are text, parseable, colorized, json
183 | # and msvs (visual studio). You can also give a reporter class, e.g.
184 | # mypackage.mymodule.MyReporterClass.
185 | output-format=text
186 |
187 | # Tells whether to display a full report or only the messages.
188 | reports=no
189 |
190 | # Activate the evaluation score.
191 | score=yes
192 |
193 |
194 | [REFACTORING]
195 |
196 | # Maximum number of nested blocks for function / method body
197 | max-nested-blocks=5
198 |
199 | # Complete name of functions that never returns. When checking for
200 | # inconsistent-return-statements if a never returning function is called then
201 | # it will be considered as an explicit return statement and no message will be
202 | # printed.
203 | never-returning-functions=sys.exit
204 |
205 |
206 | [BASIC]
207 |
208 | # Naming style matching correct argument names.
209 | argument-naming-style=snake_case
210 |
211 | # Regular expression matching correct argument names. Overrides argument-
212 | # naming-style.
213 | #argument-rgx=
214 |
215 | # Naming style matching correct attribute names.
216 | attr-naming-style=snake_case
217 |
218 | # Regular expression matching correct attribute names. Overrides attr-naming-
219 | # style.
220 | #attr-rgx=
221 |
222 | # Bad variable names which should always be refused, separated by a comma.
223 | bad-names=foo,
224 | bar,
225 | baz,
226 | toto,
227 | tutu,
228 | tata
229 |
230 | # Naming style matching correct class attribute names.
231 | class-attribute-naming-style=any
232 |
233 | # Regular expression matching correct class attribute names. Overrides class-
234 | # attribute-naming-style.
235 | #class-attribute-rgx=
236 |
237 | # Naming style matching correct class names.
238 | class-naming-style=PascalCase
239 |
240 | # Regular expression matching correct class names. Overrides class-naming-
241 | # style.
242 | #class-rgx=
243 |
244 | # Naming style matching correct constant names.
245 | const-naming-style=UPPER_CASE
246 |
247 | # Regular expression matching correct constant names. Overrides const-naming-
248 | # style.
249 | #const-rgx=
250 |
251 | # Minimum line length for functions/classes that require docstrings; shorter
252 | # ones are exempt.
253 | docstring-min-length=-1
254 |
255 | # Naming style matching correct function names.
256 | function-naming-style=snake_case
257 |
258 | # Regular expression matching correct function names. Overrides function-
259 | # naming-style.
260 | #function-rgx=
261 |
262 | # Good variable names which should always be accepted, separated by a comma.
263 | good-names=i,
264 | j,
265 | k,
266 | ex,
267 | Run,
268 | _
269 |
270 | # Include a hint for the correct naming format with invalid-name.
271 | include-naming-hint=no
272 |
273 | # Naming style matching correct inline iteration names.
274 | inlinevar-naming-style=any
275 |
276 | # Regular expression matching correct inline iteration names. Overrides
277 | # inlinevar-naming-style.
278 | #inlinevar-rgx=
279 |
280 | # Naming style matching correct method names.
281 | method-naming-style=snake_case
282 |
283 | # Regular expression matching correct method names. Overrides method-naming-
284 | # style.
285 | #method-rgx=
286 |
287 | # Naming style matching correct module names.
288 | module-naming-style=snake_case
289 |
290 | # Regular expression matching correct module names. Overrides module-naming-
291 | # style.
292 | #module-rgx=
293 |
294 | # Colon-delimited sets of names that determine each other's naming style when
295 | # the name regexes allow several styles.
296 | name-group=
297 |
298 | # Regular expression which should only match function or class names that do
299 | # not require a docstring.
300 | no-docstring-rgx=^_
301 |
302 | # List of decorators that produce properties, such as abc.abstractproperty. Add
303 | # to this list to register other decorators that produce valid properties.
304 | # These decorators are taken into consideration only for invalid-name.
305 | property-classes=abc.abstractproperty
306 |
307 | # Naming style matching correct variable names.
308 | variable-naming-style=snake_case
309 |
310 | # Regular expression matching correct variable names. Overrides variable-
311 | # naming-style.
312 | #variable-rgx=
313 |
314 |
315 | [FORMAT]
316 |
317 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
318 | expected-line-ending-format=
319 |
320 | # Regexp for a line that is allowed to be longer than the limit.
321 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
322 |
323 | # Number of spaces of indent required inside a hanging or continued line.
324 | indent-after-paren=4
325 |
326 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
327 | # tab).
328 | indent-string=' '
329 |
330 | # Maximum number of characters on a single line.
331 | max-line-length=100
332 |
333 | # Maximum number of lines in a module.
334 | max-module-lines=1000
335 |
336 | # List of optional constructs for which whitespace checking is disabled. `dict-
337 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
338 | # `trailing-comma` allows a space between comma and closing bracket: (a, ).
339 | # `empty-line` allows space-only lines.
340 | no-space-check=trailing-comma,
341 | dict-separator
342 |
343 | # Allow the body of a class to be on the same line as the declaration if body
344 | # contains single statement.
345 | single-line-class-stmt=no
346 |
347 | # Allow the body of an if to be on the same line as the test if there is no
348 | # else.
349 | single-line-if-stmt=no
350 |
351 |
352 | [LOGGING]
353 |
354 | # Logging modules to check that the string format arguments are in logging
355 | # function parameter format.
356 | logging-modules=logging
357 |
358 |
359 | [VARIABLES]
360 |
361 | # List of additional names supposed to be defined in builtins. Remember that
362 | # you should avoid defining new builtins when possible.
363 | additional-builtins=
364 |
365 | # Tells whether unused global variables should be treated as a violation.
366 | allow-global-unused-variables=yes
367 |
368 | # List of strings which can identify a callback function by name. A callback
369 | # name must start or end with one of those strings.
370 | callbacks=cb_,
371 | _cb
372 |
373 | # A regular expression matching the name of dummy variables (i.e. expected to
374 | # not be used).
375 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
376 |
377 | # Argument names that match this expression will be ignored. Default to name
378 | # with leading underscore.
379 | ignored-argument-names=_.*|^ignored_|^unused_
380 |
381 | # Tells whether we should check for unused import in __init__ files.
382 | init-import=no
383 |
384 | # List of qualified module names which can have objects that can redefine
385 | # builtins.
386 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
387 |
388 |
389 | [TYPECHECK]
390 |
391 | # List of decorators that produce context managers, such as
392 | # contextlib.contextmanager. Add to this list to register other decorators that
393 | # produce valid context managers.
394 | contextmanager-decorators=contextlib.contextmanager
395 |
396 | # List of members which are set dynamically and missed by pylint inference
397 | # system, and so shouldn't trigger E1101 when accessed. Python regular
398 | # expressions are accepted.
399 | generated-members=
400 |
401 | # Tells whether missing members accessed in mixin class should be ignored. A
402 | # mixin class is detected if its name ends with "mixin" (case insensitive).
403 | ignore-mixin-members=yes
404 |
405 | # Tells whether to warn about missing members when the owner of the attribute
406 | # is inferred to be None.
407 | ignore-none=yes
408 |
409 | # This flag controls whether pylint should warn about no-member and similar
410 | # checks whenever an opaque object is returned when inferring. The inference
411 | # can return multiple potential results while evaluating a Python object, but
412 | # some branches might not be evaluated, which results in partial inference. In
413 | # that case, it might be useful to still emit no-member and other checks for
414 | # the rest of the inferred objects.
415 | ignore-on-opaque-inference=yes
416 |
417 | # List of class names for which member attributes should not be checked (useful
418 | # for classes with dynamically set attributes). This supports the use of
419 | # qualified names.
420 | ignored-classes=optparse.Values,thread._local,_thread._local
421 |
422 | # List of module names for which member attributes should not be checked
423 | # (useful for modules/projects where namespaces are manipulated during runtime
424 | # and thus existing member attributes cannot be deduced by static analysis). It
425 | # supports qualified module names, as well as Unix pattern matching.
426 | ignored-modules=
427 |
428 | # Show a hint with possible names when a member name was not found. The aspect
429 | # of finding the hint is based on edit distance.
430 | missing-member-hint=yes
431 |
432 | # The minimum edit distance a name should have in order to be considered a
433 | # similar match for a missing member name.
434 | missing-member-hint-distance=1
435 |
436 | # The total number of similar names that should be taken into consideration when
437 | # showing a hint for a missing member.
438 | missing-member-max-choices=1
439 |
440 |
441 | [SIMILARITIES]
442 |
443 | # Ignore comments when computing similarities.
444 | ignore-comments=yes
445 |
446 | # Ignore docstrings when computing similarities.
447 | ignore-docstrings=yes
448 |
449 | # Ignore imports when computing similarities.
450 | ignore-imports=no
451 |
452 | # Minimum lines number of a similarity.
453 | min-similarity-lines=4
454 |
455 |
456 | [MISCELLANEOUS]
457 |
458 | # List of note tags to take into consideration, separated by a comma.
459 | notes=FIXME,
460 | XXX,
461 | TODO
462 |
463 |
464 | [SPELLING]
465 |
466 | # Limits count of emitted suggestions for spelling mistakes.
467 | max-spelling-suggestions=4
468 |
469 | # Spelling dictionary name. Available dictionaries: none. To make it work,
470 | # install the python-enchant package.
471 | spelling-dict=
472 |
473 | # List of comma separated words that should not be checked.
474 | spelling-ignore-words=
475 |
476 | # A path to a file that contains private dictionary; one word per line.
477 | spelling-private-dict-file=
478 |
479 | # Tells whether to store unknown words to indicated private dictionary in
480 | # --spelling-private-dict-file option instead of raising a message.
481 | spelling-store-unknown-words=no
482 |
483 |
484 | [IMPORTS]
485 |
486 | # Allow wildcard imports from modules that define __all__.
487 | allow-wildcard-with-all=no
488 |
489 | # Analyse import fallback blocks. This can be used to support both Python 2 and
490 | # 3 compatible code, which means that the block might have code that exists
491 | # only in one or another interpreter, leading to false positives when analysed.
492 | analyse-fallback-blocks=no
493 |
494 | # Deprecated modules which should not be used, separated by a comma.
495 | deprecated-modules=optparse,tkinter.tix
496 |
497 | # Create a graph of external dependencies in the given file (report RP0402 must
498 | # not be disabled).
499 | ext-import-graph=
500 |
501 | # Create a graph of every (i.e. internal and external) dependencies in the
502 | # given file (report RP0402 must not be disabled).
503 | import-graph=
504 |
505 | # Create a graph of internal dependencies in the given file (report RP0402 must
506 | # not be disabled).
507 | int-import-graph=
508 |
509 | # Force import order to recognize a module as part of the standard
510 | # compatibility libraries.
511 | known-standard-library=
512 |
513 | # Force import order to recognize a module as part of a third party library.
514 | known-third-party=enchant
515 |
516 |
517 | [DESIGN]
518 |
519 | # Maximum number of arguments for function / method.
520 | max-args=5
521 |
522 | # Maximum number of attributes for a class (see R0902).
523 | max-attributes=7
524 |
525 | # Maximum number of boolean expressions in an if statement.
526 | max-bool-expr=5
527 |
528 | # Maximum number of branch for function / method body.
529 | max-branches=12
530 |
531 | # Maximum number of locals for function / method body.
532 | max-locals=15
533 |
534 | # Maximum number of parents for a class (see R0901).
535 | max-parents=10
536 |
537 | # Maximum number of public methods for a class (see R0904).
538 | max-public-methods=20
539 |
540 | # Maximum number of return / yield for function / method body.
541 | max-returns=6
542 |
543 | # Maximum number of statements in function / method body.
544 | max-statements=50
545 |
546 | # Minimum number of public methods for a class (see R0903).
547 | min-public-methods=0
548 |
549 |
550 | [CLASSES]
551 |
552 | # List of method names used to declare (i.e. assign) instance attributes.
553 | defining-attr-methods=__init__,
554 | __new__,
555 | setUp
556 |
557 | # List of member names, which should be excluded from the protected access
558 | # warning.
559 | exclude-protected=_asdict,
560 | _fields,
561 | _replace,
562 | _source,
563 | _make
564 |
565 | # List of valid names for the first argument in a class method.
566 | valid-classmethod-first-arg=cls
567 |
568 | # List of valid names for the first argument in a metaclass class method.
569 | valid-metaclass-classmethod-first-arg=cls
570 |
571 |
572 | [EXCEPTIONS]
573 |
574 | # Exceptions that will emit a warning when being caught. Defaults to
575 | # "Exception".
576 | overgeneral-exceptions=Exception
577 |
--------------------------------------------------------------------------------
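
Nothing exotic in the pylintrc: the long disable list is mostly Python 2/3 compatibility checks that no longer apply, plus a handful of warnings the codebase also suppresses inline with `#pylint` comments. Running it should just be a matter of something like `pylint --rcfile=pylintrc src/machinae` from the repo root (pylint also picks up a root-level pylintrc automatically).
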
/requirements.txt:
--------------------------------------------------------------------------------
1 | defang
2 | relatime
3 | urllib3
4 | stopit
5 | feedparser
6 | beautifulsoup4
7 | python_dateutil
8 | python_magic
9 | PyYAML
10 | requests
11 | tzlocal
12 | pytz
13 | dnspython3
14 | ipwhois
15 | html5lib
16 |
17 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from os import path
2 | from setuptools import setup, find_packages
3 |
4 |
5 | # Read README.md as the long description so the PyPI page stays up to date
6 | readMeDir = path.abspath(path.dirname(__file__))
7 | with open(path.join(readMeDir, 'README.md'), encoding='utf-8') as readFile:
8 | long_desc = readFile.read()
9 |
10 |
11 | VERSION = '1.4.11'
12 |
13 | setup(
14 | name='machinae',
15 | version=VERSION,
16 | author='Steve McMaster',
17 | author_email='mcmaster@hurricanelabs.com',
18 | package_dir={'': 'src'},
19 | packages=find_packages('src'),
20 | include_package_data=True,
21 | zip_safe=False,
22 | url='http://hurricanelabs.github.io/machinae/',
23 | description='Machinae Security Intelligence Collector',
24 | long_description=long_desc,
25 | long_description_content_type='text/markdown',
26 | install_requires=[
27 | 'dnspython3',
28 | 'ipwhois<0.11',
29 | 'requests',
30 | 'stopit',
31 | 'pyyaml',
32 | 'beautifulsoup4',
33 | 'html5lib',
34 | 'relatime',
35 | 'tzlocal',
36 | 'python-magic',
37 | 'feedparser',
38 | 'defang',
39 | ],
40 | entry_points={
41 | 'console_scripts': [
42 | 'machinae = machinae.cmd:main',
43 | ]
44 | },
45 | classifiers=[
46 | 'License :: OSI Approved :: MIT License',
47 | 'Programming Language :: Python :: 3 :: Only',
48 | 'Development Status :: 5 - Production/Stable',
49 | ],
50 | bugtrack_url='https://github.com/HurricaneLabs/machinae/issues',
51 | )
52 |
--------------------------------------------------------------------------------
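
Two details in setup.py are worth noting. The console_scripts entry point is what creates the `machinae` command on install, so `pip install -e .` followed by `machinae --help` is a quick way to verify the wiring. And the `ipwhois<0.11` pin matters: the IpWhois site class further down calls `lookup_rws()`, which later ipwhois releases dropped (the code also catches AttributeError and falls back to a plain `lookup()`). The `__version__` in src/machinae/__init__.py should track VERSION here, since `machinae --version` reports the former.
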
/src/machinae/__init__.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import re
3 | import socket
4 | import ipaddress
5 |
6 | __version__ = "1.4.11"  # keep in sync with VERSION in setup.py
7 | TargetInfo = collections.namedtuple("TargetInfo", ("target", "otype", "otype_detected"))
8 | ErrorResult = collections.namedtuple("ErrorResult", ("target_info", "site_info", "error_info"))
9 | ResultSet = collections.namedtuple("ResultSet", ("target_info", "results"))
10 | SiteResults = collections.namedtuple("SiteResults", ("site_info", "resultset"))
11 | Result = collections.namedtuple("Result", ("value", "pretty_name"))
12 |
13 | #pylint: disable=no-else-return,too-many-return-statements
14 | def get_target_type(target):
15 | try:
16 | getVer = ipaddress.ip_address(target)
17 | if getVer.version == 4:
18 | return "ipv4"
19 | elif getVer.version == 6:
20 | return "ipv6"
21 | except ValueError:
22 | pass
23 |
24 | #pylint: disable=no-else-return
25 | # Hashes
26 | if re.match("^[a-f0-9]{32}$", target, re.I):
27 | # MD5
28 | return "hash"
29 | elif re.match("^[a-f0-9]{40}$", target, re.I):
30 | # SHA-1
31 | return "hash.sha1"
32 | elif re.match("^[a-f0-9]{64}$", target, re.I):
33 | # SHA-256
34 | return "hash.sha256"
35 | elif re.match("^[a-f0-9]{128}$", target, re.I):
36 | # SHA-512
37 | return "hash.sha512"
38 |
39 | # URL
40 | elif re.match("^https?://", target, re.I):
41 | return "url"
42 |
43 | # Email Addresses
44 | elif re.match("^.*?@.*?$", target, re.I):
45 | return "email"
46 |
47 | # SSL fingerprints
48 | elif re.match("^(?:[a-f0-9]{2}:){19}[a-f0-9]{2}$", target, flags=re.I):
49 | return "sslfp"
50 |
51 | # Mac Addresses
52 |     elif re.match(r"^([0-9a-fA-F][0-9a-fA-F][-:\.]){5}([0-9a-fA-F][0-9a-fA-F])$", target, re.I):
53 | return "mac"
54 |
55 | return "fqdn"
56 |
57 |
58 | # d2 takes precedence
59 | def dict_merge(d1, d2):
60 | d3 = d1.copy()
61 | for key in d2:
62 | if key in d3 and hasattr(d3[key], "items") and hasattr(d2[key], "items"):
63 | d3[key] = dict_merge(d3[key], d2[key])
64 | elif hasattr(d2[key], "items"):
65 | d3[key] = d2[key].copy()
66 | else:
67 | d3[key] = d2[key]
68 | return d3
69 |
--------------------------------------------------------------------------------
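
get_target_type classifies an observable purely by shape, falling through to "fqdn" when nothing else matches, and dict_merge recursively overlays its second argument onto its first. Both are easy to exercise directly (outputs shown in comments follow from the code above):

    from machinae import dict_merge, get_target_type

    print(get_target_type("8.8.8.8"))                           # ipv4
    print(get_target_type("d41d8cd98f00b204e9800998ecf8427e"))  # hash (MD5, by length)
    print(get_target_type("https://example.com/x"))             # url
    print(get_target_type("example.com"))                       # fqdn (the fallback)

    print(dict_merge({"a": 1, "b": {"c": 1}}, {"b": {"c": 2}}))
    # {'a': 1, 'b': {'c': 2}} -- d2 wins on conflicts, merged recursively
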
/src/machinae/cmd.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import copy
3 | import os
4 | import sys
5 | from collections import OrderedDict
6 | import stopit
7 | from machinae import __version__
8 |
9 | from . import dict_merge, get_target_type, outputs, utils
10 | from . import ErrorResult, Result, ResultSet, SiteResults, TargetInfo
11 | from .sites import Site
12 |
13 |
14 | default_config_locations = (
15 | "machinae.yml",
16 | "/etc/machinae.yml",
17 | os.path.expanduser(os.getenv("MACHINAE_CONFIG", "")),
18 | )
19 |
20 |
21 | class MachinaeCommand:
22 | _conf = None
23 | _sites = None
24 |
25 | def __init__(self, args=None):
26 | if args is None:
27 | ap = argparse.ArgumentParser()
28 | ap.add_argument("-c", "--config", default=None)
29 | ap.add_argument("--nomerge", default=False, action="store_true")
30 |
31 | ap.add_argument("-d", "--delay", default=0)
32 | ap.add_argument("-f", "--file", default="-")
33 | ap.add_argument("-i", "--infile", default=None)
34 | ap.add_argument("-o", dest="output", default="N", choices=("D", "J", "N", "S"))
35 | ap.add_argument("-O", "--otype",
36 | choices=("ipv4", "ipv6", "fqdn", "email", "sslfp", "hash", "url", "mac")
37 | )
38 | ap.add_argument("-q", "--quiet", dest="verbose", default=True, action="store_false")
39 | ap.add_argument("-s", "--sites", default="default")
40 | ap.add_argument("-a", "--auth")
41 | ap.add_argument("-H", "--http-proxy", dest="http_proxy")
42 | ap.add_argument("targets", nargs=argparse.REMAINDER)
43 | ap.add_argument("-v", "--version", action="version", version="%(prog)s "+ __version__)
44 |
45 | modes = ap.add_mutually_exclusive_group()
46 | modes.add_argument("--dump-config", dest="mode",
47 | action="store_const", const="dump_config")
48 | modes.add_argument("--detect-otype", dest="mode",
49 | action="store_const", const="detect_otype")
50 | modes.add_argument("--list-sites", dest="mode",
51 | action="store_const", const="list_sites")
52 | args = ap.parse_args()
53 | self.args = args
54 |
55 | @property
56 | def conf(self):
57 | if self._conf is None:
58 | path = None
59 | if self.args.config:
60 | path = self.args.config
61 | else:
62 | for possible_path in default_config_locations:
63 | if possible_path is None:
64 | continue
65 | if os.path.exists(possible_path):
66 | path = possible_path
67 | break
68 |
69 | if path:
70 | with open(path, "r") as f:
71 | conf = utils.safe_load(f)
72 | else:
73 | conf = {}
74 |
75 | if not self.args.nomerge:
76 | local_path = "/etc/machinae.local.yml"
77 | if os.path.exists(local_path):
78 | with open(local_path, "r") as f:
79 | local_conf = utils.safe_load(f)
80 | conf = dict_merge(conf, local_conf)
81 |
82 | local_path = os.path.expanduser("~/.machinae.yml")
83 | if os.path.exists(local_path):
84 | with open(local_path, "r") as f:
85 | local_conf = utils.safe_load(f)
86 | conf = dict_merge(conf, local_conf)
87 |
88 | self._conf = conf
89 | return self._conf
90 |
91 | @property
92 | #pylint: disable=too-many-locals, too-many-branches
93 | def results(self):
94 | creds = None
95 | if self.args.auth and os.path.isfile(self.args.auth):
96 | with open(self.args.auth) as auth_f:
97 | creds = utils.safe_load(auth_f.read())
98 |
99 | proxies = {}
100 | if self.args.http_proxy:
101 | proxies["http"] = self.args.http_proxy
102 | proxies["https"] = self.args.http_proxy
103 | else:
104 | if "HTTP_PROXY" in os.environ:
105 | proxies["http"] = os.environ["HTTP_PROXY"]
106 | elif "http_proxy" in os.environ:
107 | proxies["http"] = os.environ["http_proxy"]
108 | if "HTTPS_PROXY" in os.environ:
109 | proxies["https"] = os.environ["HTTPS_PROXY"]
110 | elif "https_proxy" in os.environ:
111 | proxies["https"] = os.environ["https_proxy"]
112 |
113 | if "http" in proxies:
114 | print("HTTP Proxy: {http}".format(**proxies), file=sys.stderr)
115 | if "https" in proxies:
116 | print("HTTPS Proxy: {https}".format(**proxies), file=sys.stderr)
117 |
118 | for target_info in self.targets:
119 | (target, otype, _) = target_info
120 |
121 | target_results = list()
122 | #pylint: disable=unused-variable
123 | for (site_name, site_conf) in self.sites.items():
124 | if otype.lower() not in map(lambda x: x.lower(), site_conf["otypes"]):
125 | continue
126 |
127 | site_conf["target"] = target
128 | site_conf["verbose"] = self.args.verbose
129 | scraper = Site.from_conf(site_conf, creds=creds, proxies=proxies) # , verbose=self.verbose)
130 |
131 | try:
132 | with stopit.SignalTimeout(15, swallow_exc=False):
133 | run_results = list()
134 | for r in scraper.run():
135 | if "value" not in r:
136 | r = {"value": r, "pretty_name": None}
137 | run_results.append(Result(r["value"], r["pretty_name"]))
138 | except stopit.TimeoutException:
139 | target_results.append(ErrorResult(target_info, site_conf, "Timeout"))
140 | #pylint: disable=broad-except
141 | #Will be cleaned up in upcoming refactor
142 | except Exception as e:
143 | target_results.append(ErrorResult(target_info, site_conf, e))
144 | else:
145 | target_results.append(SiteResults(site_conf, run_results))
146 |
147 | yield ResultSet(target_info, target_results)
148 |
149 | @property
150 | def sites(self):
151 | if self._sites is None:
152 | if self.args.sites.lower() == "all":
153 |             sites = self.conf.keys()
154 | elif self.args.sites.lower() == "default":
155 | sites = [k for (k, v) in self.conf.items() if v.get("default", True)]
156 | else:
157 | sites = self.args.sites.lower().split(",")
158 | self._sites = OrderedDict([(k, v) for (k, v) in self.conf.items() if k in sites])
159 | return copy.deepcopy(self._sites)
160 |
161 | @property
162 | def targets(self):
163 | targets = list()
164 | if self.args.infile:
165 | with open(self.args.infile, "r") as f:
166 | targets.extend([line.strip() for line in f.readlines()])
167 |
168 | targets.extend(self.args.targets)
169 |
170 | for target in targets:
171 | (otype, otype_detected) = self.detect_otype(target)
172 | if otype == "url" and not (target.startswith("http://") or target.startswith("https://")):
173 | target = "http://{0}".format(target)
174 | yield TargetInfo(target, otype, otype_detected)
175 |
176 | def detect_otype(self, target):
177 | if self.args.otype:
178 | return (self.args.otype, False)
179 | return (get_target_type(target), True)
180 |
181 | def run(self):
182 | fmt = self.args.output.upper()
183 | dest = self.args.file
184 |
185 | if not self.conf:
186 | sys.stderr.write("Warning: operating without a config file. This is probably not what "
187 | "you want. To correct this, fetch a copy of the default "
188 | "configuration file from https://github.com/hurricanelabs/machinae "
189 | "and place it in /etc/machinae.yml or ~/.machinae.yml and run again."
190 | "\n")
191 |
192 | if self.args.mode == "dump_config":
193 | output = utils.dump(self.conf)
194 | elif self.args.mode == "detect_otype":
195 | target_dict = OrderedDict()
196 | for target_info in self.targets:
197 | target_dict.update({target_info.target: target_info.otype})
198 | output = utils.dump(target_dict)
199 | elif self.args.mode == "list_sites":
200 | output = utils.listsites(self.conf)
201 | else:
202 | output = outputs.MachinaeOutput.get_formatter(fmt).run(self.results)
203 |
204 | if dest == "-":
205 | ofile = sys.stdout
206 | else:
207 | ofile = open(dest, "w")
208 |
209 | ofile.write(output)
210 |
211 | if dest != "-":
212 | ofile.close()
213 |
214 |
215 | def main():
216 | try:
217 | cmd = MachinaeCommand()
218 | cmd.run()
219 | except KeyboardInterrupt:
220 | pass
221 |
222 |
223 | if __name__ == "__main__":
224 | main()
225 |
--------------------------------------------------------------------------------
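
Config resolution in MachinaeCommand.conf is worth spelling out: the first existing path among ./machinae.yml, /etc/machinae.yml, and $MACHINAE_CONFIG is loaded, then (unless --nomerge is given) /etc/machinae.local.yml and ~/.machinae.yml are overlaid via dict_merge, so local files win on conflicting keys. A hypothetical two-line ~/.machinae.yml that just flips one site on behaves like this:

    from machinae import dict_merge

    # Equivalent of a ~/.machinae.yml containing:
    #   fraudguard:
    #     default: true
    base = {"fraudguard": {"name": "FraudGuard", "default": False}}
    override = {"fraudguard": {"default": True}}
    print(dict_merge(base, override))
    # {'fraudguard': {'name': 'FraudGuard', 'default': True}}
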
/src/machinae/outputs.py:
--------------------------------------------------------------------------------
1 | import io
2 | import json
3 | from defang import defang
4 |
5 | class MachinaeOutput:
6 | @staticmethod
7 | #pylint: disable=no-else-return, redefined-builtin, inconsistent-return-statements
8 | #Will be cleaned up in upcoming refactor
9 | def get_formatter(format):
10 | if format.upper() == "N":
11 | return NormalOutput()
12 | elif format.upper() == "J":
13 | return JsonOutput()
14 | elif format.upper() == "D":
15 | return DotEscapedOutput()
16 | elif format.upper() == "S":
17 | return ShortOutput()
18 |
19 | @staticmethod
20 | def escape(text):
21 | return str(text)
22 |
23 | #pylint: disable=attribute-defined-outside-init
24 | #Will be cleaned up in upcoming refactor
25 | def init_buffer(self):
26 | self._buffer = io.StringIO()
27 |
28 | def print(self, line, lf=True):
29 | self._buffer.write(line)
30 | if lf:
31 | self._buffer.write("\n")
32 |
33 |
34 | class NormalOutput(MachinaeOutput):
35 | def output_header(self, target, otype, otype_detected):
36 | self.print("*" * 80)
37 | self.print("* Information for {0}".format(self.escape(target)))
38 | self.print("* Observable type: {0} (Auto-detected: {1})".format(otype, otype_detected))
39 | self.print("*" * 80)
40 | #This needs to be refactored so the site from args is available here. No time currently, will do though
41 | self.print("Not seeing what you expect? Likely not a valid site. Try running with --list-sites")
42 |
43 | def run(self, result_sets: object):
44 | self.init_buffer()
45 | #pylint: disable=too-many-nested-blocks
46 | for row in result_sets:
47 | (target, otype, otype_detected) = row.target_info
48 |
49 | self.output_header(target, otype, otype_detected)
50 | self.print("")
51 |
52 | for item in row.results:
53 | site = item.site_info
54 | if hasattr(item, "error_info"):
55 | self.print("[!] Error from {0}: {1}".format(site["name"], item.error_info))
56 | continue
57 |
58 | if not item.resultset:
59 | self.print("[-] No {0} Results".format(site["name"]))
60 | else:
61 | self.print("[+] {0} Results".format(site["name"]))
62 | for result in item.resultset:
63 | labels = getattr(result[0], "labels", None)
64 | if len(result[0].values()) > 1 or labels is not None:
65 | values = map(repr, result[0].values())
66 | values = map(self.escape, values)
67 | if labels is not None:
68 | values = zip(labels, values)
69 | values = ["{0}: {1}".format(label, value) for (label, value) in values]
70 | output = ", ".join(values)
71 |
72 | if result[1] is not None:
73 | output = "({0})".format(", ".join(values))
74 | output = defang(output)
75 | else:
76 | output = self.escape(list(result[0].values())[0])
77 | output = defang(output)
78 | if result[1] is not None:
79 | output = "{1}: {0}".format(output, result[1])
80 | output = defang(output)
81 | self.print(" [-] {0}".format(output))
82 |
83 | return self._buffer.getvalue()
84 |
85 |
86 | class DotEscapedOutput(NormalOutput):
87 | escapes = {
88 | # ".": "\u2024",
89 | # ".": "",
90 | # ".": " DOT ",
91 | ".": "[.]",
92 | "@": " AT ",
93 | "http://": "hxxp://",
94 | "https://": "hxxps://",
95 | }
96 |
97 | def output_header(self, target, otype, otype_detected):
98 | super().output_header(target, otype, otype_detected)
99 | self.print("* These characters are escaped in the output below:")
100 | for (find, replace) in self.escapes.items():
101 | self.print("* '{0}' replaced with '{1}'".format(find, replace))
102 | self.print("* Do not click any links you find below")
103 | self.print("*" * 80)
104 |
105 | @classmethod
106 | def escape(cls, text):
107 | text = super(DotEscapedOutput, cls).escape(text)
108 | for (find, replace) in cls.escapes.items():
109 | text = text.replace(find, replace)
110 | return text
111 |
112 | #pylint: disable=no-self-use, unused-variable
113 | #Will be cleaned up in upcoming refactor
114 | class JsonGenerator(MachinaeOutput):
115 | def run(self, result_sets):
116 | records = list()
117 | for row in result_sets:
118 | (target, otype, otype_detected) = row.target_info
119 |
120 | for item in row.results:
121 | output = dict()
122 | output["site"] = item.site_info["name"]
123 | output["results"] = dict()
124 | output["observable"] = target
125 | output["observable type"] = otype
126 | output["observable type detected"] = otype_detected
127 |
128 | if hasattr(item, "error_info"):
129 | output["results"] = {"error_info": str(item.error_info)}
130 | elif item.resultset:
131 | for result in item.resultset:
132 | if result.pretty_name not in output["results"]:
133 | output["results"][result.pretty_name] = list()
134 | values = list(result.value.values())
135 | if len(values) == 1:
136 | output["results"][result.pretty_name].append(values[0])
137 | elif len(values) > 1:
138 | output["results"][result.pretty_name].append(values)
139 | for (k, v) in output["results"].items():
140 | if len(v) == 1:
141 | output["results"][k] = v[0]
142 | records.append(output)
143 | return records
144 |
145 |
146 | class JsonOutput(JsonGenerator):
147 | def run(self, result_sets):
148 | self.init_buffer()
149 |
150 | for record in super().run(result_sets):
151 | self.print(json.dumps(record))
152 |
153 | return self._buffer.getvalue()
154 |
155 |
156 | class ShortOutput(MachinaeOutput):
157 | def run(self, result_sets):
158 | self.init_buffer()
159 |
160 | for row in result_sets:
161 | (target, otype, otype_detected) = row.target_info
162 | self.print("[+] {0}".format(target))
163 |
164 | for item in row.results:
165 | site = item.site_info
166 | if hasattr(item, "error_info"):
167 | self.print(" {0}: Error".format(site["name"]))
168 | elif not item.resultset:
169 | self.print(" {0}: No".format(site["name"]))
170 | else:
171 | self.print(" {0}: Yes".format(site["name"]))
172 |
173 | return self._buffer.getvalue()
174 |
--------------------------------------------------------------------------------
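
Because the formatters only consume the namedtuples defined in __init__.py, the output layer can be exercised without any network traffic. A minimal sketch (the site name and result values are made up):

    from machinae import Result, ResultSet, SiteResults, TargetInfo
    from machinae.outputs import MachinaeOutput

    ti = TargetInfo("198.51.100.7", "ipv4", True)
    res = Result({"country": "US"}, "GeoIP Country")  # hypothetical result
    rs = ResultSet(ti, [SiteResults({"name": "Demo Site"}, [res])])

    print(MachinaeOutput.get_formatter("J").run([rs]))
    # {"site": "Demo Site", "results": {"GeoIP Country": "US"}, "observable": "198.51.100.7", ...}
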
/src/machinae/sites/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import urllib.parse
4 |
5 | #pylint: disable=useless-object-inheritance
6 | class Site(object):
7 | _session = None
8 | _kwargs = None
9 |
10 | def __init__(self, conf, creds=None, proxies=None):
11 | self.conf = conf
12 | self.creds = creds
13 | self.proxies = proxies
14 |
15 | def kwargs_getter(self):
16 | return self._kwargs
17 |
18 | def kwargs_setter(self, kwargs):
19 | if "target" in kwargs:
20 | target = kwargs.pop("target")
21 | if "target" in self.conf.get("request", {}):
22 | target_conf = self.conf["request"]["target"]
23 |
24 | # PTR-style
25 | ptr_style = str(target_conf.get("ptr", False)).lower()
26 | if ptr_style in ("1", "yes", "true"):
27 | target = ".".join(reversed(target.split(".")))
28 |
29 | urlencode = str(target_conf.get("urlencode", False)).lower()
30 | if urlencode in ("1", "yes", "true"):
31 | target = urllib.parse.quote(target)
32 | elif urlencode == "twice":
33 | target = urllib.parse.quote(
34 | urllib.parse.quote(target, safe="")
35 | )
36 |
37 | if "format" in target_conf:
38 | target = target_conf["format"] % (target,)
39 |
40 | kwargs["target"] = target
41 |
42 | self._kwargs = kwargs
43 |
44 | kwargs = property(kwargs_getter, kwargs_setter)
45 |
46 | @staticmethod
47 | def from_conf(conf, *args, **kwargs):
48 | from . import csv, html, rss, json, ipwhois
49 | if "webscraper" in conf:
50 | site_conf = conf.pop("webscraper")
51 | scraper = html.Webscraper(site_conf, *args, **kwargs)
52 | elif "tablescraper" in conf:
53 | site_conf = conf.pop("tablescraper")
54 | scraper = html.TableScraper(site_conf, *args, **kwargs)
55 | elif "json" in conf:
56 | site_conf = conf.pop("json")
57 | scraper = json.JsonApi(site_conf, *args, **kwargs)
58 | elif "csv" in conf:
59 | site_conf = conf.pop("csv")
60 | scraper = csv.CsvSite(site_conf, *args, **kwargs)
61 | elif "rss" in conf:
62 | site_conf = conf.pop("rss")
63 | scraper = rss.RssSite(site_conf, *args, **kwargs)
64 | elif "ipwhois" in conf:
65 | site_conf = conf.pop("ipwhois")
66 | scraper = ipwhois.IpWhois(site_conf, *args, **kwargs)
67 | # elif "dns" in conf:
68 | # scraper = DnsSite(conf["dns"], *args, **kwargs)
69 | # elif "ipwhois" in conf:
70 | # scraper = IpWhois(conf["ipwhois"], *args, **kwargs)
71 | else:
72 | raise NotImplementedError(conf.keys())
73 | scraper.kwargs = conf.copy()
74 | return scraper
75 |
76 | def get_content(self):
77 | raise NotImplementedError
78 | #pylint: disable=no-member
79 | def __iter__(self):
80 | for _ in self.run():
81 | yield _
82 |
--------------------------------------------------------------------------------
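
Site.from_conf pops the fetcher key to pick the scraper class, then assigns the leftover keys (target, verbose, ...) to scraper.kwargs, whose setter applies the optional ptr/urlencode/format target transforms. Building one by hand, with a deliberately unreachable placeholder URL:

    from machinae.sites import Site

    conf = {
        "json": {"request": {"url": "https://example.invalid/ip/{target}"}},
        "target": "1.2.3.4",
    }
    scraper = Site.from_conf(conf)   # a sites.json.JsonApi instance
    print(scraper.kwargs["target"])  # 1.2.3.4 (no transforms configured)
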
/src/machinae/sites/base.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import gzip
4 | import io
5 | import warnings
6 | import zipfile
7 | from collections import OrderedDict
8 |
9 | import magic
10 | import pytz
11 | import relatime
12 | import requests
13 | from tzlocal import get_localzone
14 | try:
15 | #pylint: disable=ungrouped-imports
16 | from requests.packages.urllib3 import exceptions
17 | except ImportError:
18 | # Apparently, some linux distros strip the packages out of requests
19 | # I'm not going to tell you what I think of that, just going to deal with it
20 | from urllib3 import exceptions
21 |
22 | from . import Site
23 |
24 |
25 | class HttpSite(Site):
26 | @property
27 | def url(self):
28 |         # Render the request URL template with the collected kwargs
29 | return self.conf["request"]["url"].format(**self.kwargs)
30 |
31 | @property
32 | def session(self):
33 | if self._session is None:
34 | self._session = requests.Session()
35 | self._session.headers.update({"User-Agent": "Vor/1.0 (Like CIF/2.0)"})
36 | if self.proxies:
37 | self._session.proxies = self.proxies
38 | return self._session
39 |
40 | @staticmethod
41 | def unzip_content(r, *args, **kwargs):
42 | content = r.content
43 |
44 | mime = magic.from_buffer(content, mime=True)
45 |
46 | if mime == "application/zip":
47 | zip_buffer = io.BytesIO(content)
48 | with zipfile.ZipFile(zip_buffer) as zf:
49 | fn = zf.namelist()[0]
50 | with zf.open(fn) as f:
51 | r._content = f.read()
52 | elif mime == "application/x-gzip":
53 | gz_buffer = io.BytesIO(content)
54 | with gzip.GzipFile(fileobj=gz_buffer) as gz:
55 | r._content = gz.read()
56 | else:
57 | r._content = content
58 |
59 | return r
60 |
61 | def _req(self, conf, url=None):
62 | if url is None:
63 | url = conf.get("url", "")
64 | if url == "":
65 | return
66 | url = url.format(**self.kwargs)
67 | method = conf.get("method", "get").upper()
68 |
69 | kwargs = dict()
70 | headers = conf.get("headers", {})
71 | if headers:
72 | kwargs["headers"] = headers
73 | verify_ssl = conf.get("verify_ssl", True)
74 |
75 | # GET params
76 | params = conf.get("params", {}).copy()
77 |         for (k, v) in list(params.items()):  # snapshot; we pop inside the loop
78 |             if hasattr(v, "items"):
79 |                 param_conf = params.pop(k)  # don't clobber the request conf
80 |                 if "relatime" in param_conf:
81 |                     dt = relatime.timeParser(param_conf["relatime"], timezone=str(get_localzone()))
82 |                     target_tz = pytz.timezone(param_conf.get("timezone", "UTC"))
83 |                     dt = dt.astimezone(target_tz)
84 |                     dt = dt.replace(tzinfo=None)
85 |                     time_format = param_conf.get("format", "%Y-%m-%dT%H:%M:%S.%fZ")
86 |                     if time_format.lower() == "as_epoch":
87 |                         params[k] = str(int(dt.timestamp()))
88 |                     else:
89 |                         params[k] = dt.strftime(time_format)
90 |             else:
91 |                 params[k] = str(v).format(**self.kwargs)
92 | if params:
93 | kwargs["params"] = params
94 |
95 | # POST data
96 | data = conf.get("data", {})
97 | for (k, v) in data.items():
98 | data[k] = v.format(**self.kwargs)
99 | if data:
100 | kwargs["data"] = data
101 |
102 | # HTTP Basic Auth
103 | if conf.get("auth") and self.creds and self.creds.get(conf["auth"]):
104 | kwargs["auth"] = tuple(self.creds[conf["auth"]])
105 |
106 | # Auto decompress
107 | if conf.get("decompress", False):
108 | kwargs["hooks"] = {"response": self.unzip_content}
109 |
110 | raw_req = requests.Request(method, url, **kwargs)
111 | req = self.session.prepare_request(raw_req)
112 | if self.kwargs.get("verbose", False):
113 | print("[.] Requesting {0} ({1})".format(req.url, req.method))
114 | with warnings.catch_warnings():
115 | if not verify_ssl:
116 | warnings.simplefilter("ignore", exceptions.InsecureRequestWarning)
117 | return self.session.send(req, verify=verify_ssl)
118 |
119 | def get_content(self, conf=None, url=None):
120 | if conf is None:
121 | conf = self.conf["request"]
122 |
123 | r = self._req(conf, url)
124 | ignored_status_codes = [int(sc) for sc in conf.get("ignored_status_codes", [])]
125 | if r.status_code not in ignored_status_codes:
126 | r.raise_for_status()
127 | return r
128 | #pylint: disable=no-self-use
129 | #Will be cleaned up in upcoming refactor
130 | def build_result(self, parser, result_dict):
131 | defaults_dict = parser.get("defaults", {})
132 |
133 | result = OrderedDict()
134 | result.update(defaults_dict)
135 | result.update(result_dict)
136 |
137 | result.pop(None, None)
138 |
139 | if "map" in parser:
140 | for (old, new) in parser["map"].items():
141 | if new is None:
142 | result.pop(old)
143 | elif old in result:
144 | result[new] = result.pop(old)
145 |
146 | # fmt = dict()
147 | # for (k, v) in result.items():
148 | # fk = "<{0}>".format(k)
149 | # fmt[fk] = str(v)
150 | #
151 | # for (k, v) in result.items():
152 | # for (find, replace) in fmt.items():
153 | # try:
154 | # result[k] = v.replace(find, replace)
155 | # except AttributeError:
156 | # pass
157 |
158 | if "defaults" in parser:
159 | for (k, v) in parser["defaults"].items():
160 | result[k] = v
161 |
162 | if "pretty_name" in parser:
163 | result = OrderedDict([
164 | ("value", result),
165 | ("pretty_name", parser["pretty_name"])
166 | ])
167 |
168 | if hasattr(result_dict, "labels"):
169 | result.labels = result_dict.labels
170 |
171 | return result
172 |
--------------------------------------------------------------------------------
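
The auth handling in HttpSite._req is the piece that ties back to the -a credentials file: a site's `auth` name is looked up in that mapping and the pair becomes HTTP Basic Auth, so the file just maps names to [user, key] pairs. The lookup in isolation (values are placeholders):

    # What `kwargs["auth"] = tuple(self.creds[conf["auth"]])` does:
    creds = {"passivetotal": ["analyst@example.invalid", "API_KEY_HERE"]}  # from the -a YAML file
    request_conf = {"auth": "passivetotal"}
    print(tuple(creds[request_conf["auth"]]))
    # ('analyst@example.invalid', 'API_KEY_HERE')
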
/src/machinae/sites/csv.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import csv
4 | import io
5 | import re
6 |
7 | from .base import HttpSite
8 |
9 |
10 | class CsvSite(HttpSite):
11 | _delim = None
12 |
13 | @property
14 | def dialect(self):
15 | if "pattern" not in self.conf:
16 | return "excel"
17 |
18 | class DelimDialect(csv.excel):
19 | delimiter = str(self.delim)
20 | skipinitialspace = True
21 |
22 | return DelimDialect()
23 |
24 | @property
25 | def delim(self):
26 | return self._delim or self.conf.get("pattern", ",")
27 |
28 | def get_content(self):
29 | r = super(CsvSite, self).get_content()
30 | body = r.text
31 |
32 | if len(self.delim) > 1:
33 | body = re.sub(self.conf["pattern"], "|", body)
34 | self._delim = "|"
35 |
36 | buf = io.StringIO(body)
37 | csvfile = csv.reader(buf, dialect=self.dialect)
38 |
39 | return csvfile
40 |
41 | def run(self):
42 | r = self._req(self.conf["request"])
43 |
44 | body = r.text
45 | if len(self.delim) > 1:
46 | body = re.sub(self.conf["pattern"], "|", body)
47 | self._delim = "|"
48 |
49 | buf = io.StringIO(body)
50 | csvfile = csv.reader(buf, dialect=self.dialect)
51 |
52 | for (lineno, row) in enumerate(csvfile):
53 | for parser in self.conf["results"]:
54 | start = parser.get("start", 1)
55 | stop = parser.get("end", None)
56 |
57 | # raise ValueError(start, stop)
58 | #pylint: disable=len-as-condition
59 | if lineno < start or len(row) == 0 or row[0].startswith("#"):
60 | continue
61 | elif stop is not None and lineno > stop:
62 | break
63 |
64 | if "match" in parser:
65 | rex = re.compile(parser["match"]["regex"])
66 | col = int(parser["match"]["column"])
67 | if not rex.search(row[col]):
68 | continue
69 |
70 | row = [item.strip() for item in row]
71 | result_dict = dict(zip(parser["values"], row))
72 | yield self.build_result(parser, result_dict)
73 |
--------------------------------------------------------------------------------
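
CsvSite treats a multi-character `pattern` as a regex: every match in the body is rewritten to a literal "|", and the csv dialect then splits on that. The transformation in isolation (pattern and row are made up):

    import re

    pattern = r"\s*;\s*"  # hypothetical multi-character delimiter pattern
    row = "198.51.100.7  ;  botnet ; 2021-01-01"
    print(re.sub(pattern, "|", row))  # 198.51.100.7|botnet|2021-01-01
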
/src/machinae/sites/html.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import re
4 | from collections import Counter
5 |
6 | from bs4 import BeautifulSoup, Comment
7 |
8 | from .base import HttpSite
9 |
10 |
11 | def html_unescape(content):
12 | try:
13 | import html
14 | return html.unescape(content)
15 | except ImportError:
16 | import HTMLParser
17 | html_parser = HTMLParser.HTMLParser()
18 | return html_parser.unescape(content)
19 |
20 |
21 | class HtmlSite(HttpSite):
22 | def get_html(self):
23 | r = super(HtmlSite, self).get_content()
24 | body = r.text
25 |
26 | cleanup = self.conf["request"].get("cleanup", {})
27 |
28 | strip_comments = str(cleanup.get("strip_comments", False)).lower()
29 | if strip_comments in ("1", "yes", "true"):
30 | soup = BeautifulSoup(r.text, "html5lib")
31 | for comment in soup.find_all(text=lambda _: isinstance(_, Comment)):
32 | comment.extract()
33 | body = str(soup)
34 |
35 | return html_unescape(body)
36 |
37 |
38 | class TableScraper(HtmlSite):
39 | @staticmethod
40 | def compare_rows(row1, row2):
41 | row1 = [cell.strip().lower() for cell in row1]
42 | row2 = [cell.strip().lower() for cell in row2]
43 | #pylint: disable=superfluous-parens
44 | #I believe these to actually be unnecessary,er,superfluous but will have to test
45 | return (Counter(row1) == Counter(row2))
46 |
47 | @staticmethod
48 | def get_row_contents(row):
49 | return [cell.get_text().strip() for cell in row.find_all(["td", "th"])]
50 |
51 | @classmethod
52 | def find_table(cls, html, headers):
53 | soup = BeautifulSoup(html, "html5lib")
54 | for table in soup.find_all("table"):
55 | cells = cls.get_row_contents(table.find("tr"))
56 | if cls.compare_rows(cells, headers):
57 | return (table, cells)
58 | raise ValueError("No matching table found")
59 |
60 | def run(self):
61 | body = self.get_html()
62 |
63 | for parser in self.conf["results"]:
64 | (table, columns) = self.find_table(body, parser["map"].keys())
65 | for row in table.find_all("tr"):
66 | cells = self.get_row_contents(row)
67 | if self.compare_rows(cells, columns):
68 | continue
69 | result_dict = dict(zip(columns, cells))
70 | yield self.build_result(parser, result_dict)
71 |
72 |
73 | class Webscraper(HtmlSite):
74 | def run(self):
75 | body = self.get_html()
76 |
77 | if "results" not in self.conf:
78 | raise Exception("No parsing configuration found")
79 | for parser in self.conf["results"]:
80 | rex = re.compile(parser["regex"], flags=re.I)
81 | for match in rex.finditer(body):
82 | result_dict = dict()
83 | for (k, v) in zip(parser["values"], match.groups()):
84 | result_dict[k] = v
85 | yield self.build_result(parser, result_dict)
86 |
--------------------------------------------------------------------------------
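
TableScraper.find_table matches a table by comparing the text of its first row against the configured headers as a case-insensitive multiset, so column order doesn't matter. It can be exercised on a literal snippet (requires beautifulsoup4 and html5lib, both in requirements.txt):

    from machinae.sites.html import TableScraper

    html = ("<table><tr><th>IP</th><th>Risk</th></tr>"
            "<tr><td>198.51.100.7</td><td>high</td></tr></table>")
    table, columns = TableScraper.find_table(html, ["risk", "ip"])
    print(columns)  # ['IP', 'Risk']
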
/src/machinae/sites/ipwhois.py:
--------------------------------------------------------------------------------
1 | from ipaddress import ip_address, summarize_address_range
2 |
3 | import ipwhois
4 |
5 | from .json import JsonApi
6 |
7 |
8 | class IpWhois(JsonApi):
9 | @staticmethod
10 | def get_cidr(network):
11 | networks = [str(net) for net in summarize_address_range(
12 | ip_address(network["start_address"]),
13 | ip_address(network["end_address"])
14 | )]
15 | if len(networks) == 1:
16 | networks = networks[0]
17 | return networks
18 |
19 | def get_json(self):
20 | obj = ipwhois.IPWhois(self.kwargs["target"])
21 |         try:
22 |             # lookup_rws is only present in older releases of the
23 |             # ipwhois library; it was removed upstream
24 |             return obj.lookup_rws()
25 |         except AttributeError:
26 |             # Fall back to a plain whois lookup rather than parsing
27 |             # the more cumbersome RDAP output
28 |             return obj.lookup()
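29 | 
30 | # Example: get_cidr() collapses a registry range into CIDR notation;
31 | # for {"start_address": "10.0.0.0", "end_address": "10.0.1.255"},
32 | # summarize_address_range yields exactly one network, so the helper
33 | # returns the bare string "10.0.0.0/23" rather than a one-element list.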
--------------------------------------------------------------------------------
/src/machinae/sites/json.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import datetime
4 | import json
5 | import re
6 | import urllib.parse
7 | from collections import OrderedDict
8 |
9 | from dateutil.parser import parse
10 |
11 | from relatime import timeParser
12 |
13 | from .base import HttpSite
14 |
15 | class JsonApi(HttpSite):
16 | @staticmethod
17 | def get_value(data, key, default=None):
18 | if key == "@" or data is None:
19 | return data
20 | ret = data
21 | key_parts = key.split(".")
22 | for key_part in key_parts:
23 | if key_part not in ret:
24 | return default
25 | ret = ret[key_part]
26 | return ret
27 |
28 | def get_json(self, url=None):
29 | r = self.get_content(url=url)
30 |
31 | ignored_status_codes = [int(sc) for sc in self.conf["request"].get("ignored_status_codes", [])]
32 | if r.status_code in ignored_status_codes:
33 | return []
34 |
35 | if not self.conf.get("multi_json", False):
36 | return r.json()
37 |
38 |         # multi_json responses are newline-delimited JSON, one
39 |         # document per line
40 |
41 | results = list()
42 | for json_line in r.text.split("\n"):
43 |             if not json_line:  # skip blank lines instead of truncating
44 |                 continue
45 | results.append(json.loads(json_line))
46 | return results
47 |
48 | def run(self):
49 | data = self.get_json()
50 |
51 | if hasattr(data, "items"):
52 | next_url = None
53 | if self.conf.get("paginated", False):
54 | next_url = data.get("next", None)
55 |
56 | data = [data]
57 | while next_url:
58 | next_data = self.get_json(url=next_url)
59 | next_url = next_data.get("next", None)
60 | data.append(next_data)
61 |
62 | if "results" not in self.conf:
63 | return
64 |
65 | for row in data:
66 | for parser in self.conf["results"]:
67 | for _ in self.parse_dict(row, parser):
68 | yield _
69 |
70 | @classmethod
71 | def get_result_dicts(cls, data, parser, mm_key=None, onlyif=None):
72 | if not hasattr(parser, "items"):
73 | parser = {"key": parser}
74 |
75 | if "key" not in parser:
76 | yield data
77 | return
78 |
79 | key = parser["key"]
80 | rex = None
81 | if "regex" in parser:
82 | rex = re.compile(parser["regex"], flags=re.I)
83 |
84 | if key == "@" and mm_key is not None:
85 | yield {key: mm_key}
86 | return
87 |
88 | values = cls.get_value(data, key)
89 | if values is None:
90 | return
91 |
92 | if not parser.get("match_all", False):
93 | values = [values]
94 |
95 | for val in values:
96 | result_dict = OrderedDict()
97 |
98 | if rex:
99 | m = rex.search(val)
100 | if not m:
101 | return
102 | #pylint: disable=len-as-condition
103 | if len(m.groups()) > 0:
104 | val = m.groups()
105 | if len(val) == 1:
106 | val = val[0]
107 |
108 | urldecode = str(parser.get("urldecode", False)).lower()
109 | if urldecode in ("1", "yes", "true"):
110 | val = urllib.parse.unquote(val)
111 | elif urldecode == "twice":
112 | val = urllib.parse.unquote(
113 | urllib.parse.unquote(val)
114 | )
115 |
116 | if "format" in parser:
117 | if parser["format"] == "as_list":
118 | val = ", ".join(map(str, val))
119 | elif parser["format"] == "as_time":
120 | try:
121 | dt = datetime.datetime.fromtimestamp(float(val))
122 |                         #pylint: disable=bare-except
123 |                         #Fall back to dateutil for non-epoch values; narrow this in a future refactor
124 | except:
125 | dt = parse(val)
126 | val = dt.isoformat()
127 | result_dict[key] = val
128 |
129 | yield result_dict
130 |
131 | @classmethod
132 | def multi_match_generator(cls, data, parser, mm_key):
133 | if not hasattr(data, "items"):
134 | # Is a list, process each list item
135 | for item in data:
136 | for _ in cls.multi_match_generator(item, parser, mm_key="@"):
137 | yield _
138 |
139 | return
140 |
141 | onlyif = parser.get("onlyif", None)
142 | if onlyif is not None and not hasattr(onlyif, "items"):
143 | onlyif = {"key": onlyif}
144 |
145 |         # Decide how to iterate over the data. Options:
146 |         #  - one result_dict per matching key in the dict, or
147 |         #  - a single result_dict for the whole dict (when mm_key is
148 |         #    "@" or match_all is set)
149 | if mm_key == "@" or parser.get("match_all", False):
150 | # Treat the entire data as a single match
151 | # Returns a single result_dict
152 | data = [(None, data)]
153 | else:
154 | # Each matching key is a separate result_dict
155 | data = data.items()
156 |
157 | for (k, v) in data:
158 | if onlyif is not None:
159 |                 # onlyif was already normalized to a dict above, so
160 |                 # its "key" entry is safe to read here
161 | value = cls.get_value(v, onlyif["key"], None)
162 |
163 | if value is None:
164 | continue
165 | elif "regex" in onlyif:
166 | rex = re.compile(onlyif["regex"], re.I)
167 | if not rex.search(value):
168 | continue
169 | # Check for maxage key in onlyif. If it exists, parse it as Splunk relative time syntax and compare to parsed input "value"
170 | elif "maxage" in onlyif:
171 | age = parse(value)
172 | if not onlyif["maxage"].startswith("-"): # Assume we want dates in the past
173 |                         print('\033[91m' + 'WARNING: maxage must be prefixed with "-". Please correct this in your configuration file.' + '\033[0m')
174 | onlyif["maxage"] = "-%s" % onlyif["maxage"]
175 | ageout = timeParser(onlyif["maxage"]).replace(tzinfo=None)
176 | if age < ageout:
177 | continue
178 | else:
179 | if not bool(value):
180 | continue
181 | result_dict = OrderedDict()
182 | for mm_parser in parser["keys"]:
183 | for mm_result_dict in cls.get_result_dicts(v, mm_parser, mm_key=k, onlyif=onlyif):
184 | result_dict.update(mm_result_dict)
185 |
186 | if result_dict:
187 | result_dict.labels = parser.get("labels", None)
188 | yield result_dict
189 |
190 | def parse_dict(self, data, parser):
191 | if not hasattr(parser, "items"):
192 | parser = {"key": parser}
193 |
194 | if "multi_match" in parser:
195 | target = self.get_value(data, parser["key"])
196 | if target is None:
197 | return
198 | result_iter = self.multi_match_generator(target, parser["multi_match"], parser["key"])
199 | else:
200 | result_iter = self.get_result_dicts(data, parser)
201 |
202 | for result_dict in result_iter:
203 | yield self.build_result(parser, result_dict)
204 |
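205 | # Example (hypothetical data): get_value() walks dotted paths, so with
206 | #   data = {"whois": {"registrar": {"name": "Example Corp"}}}
207 | # get_value(data, "whois.registrar.name") returns "Example Corp", and a
208 | # missing segment falls back to the supplied default. A parser such as
209 | #   {"key": "answers", "multi_match": {"keys": ["ip", "ttl"]}}
210 | # emits one merged result dict per element of data["answers"].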
--------------------------------------------------------------------------------
/src/machinae/sites/rss.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import re
4 |
5 | import feedparser
6 |
7 | from .base import HttpSite
8 |
9 |
10 | class RssSite(HttpSite):
11 | def get_content(self):
12 | r = super(RssSite, self).get_content()
13 | return feedparser.parse(r.text)
14 |
15 | def run(self):
16 | r = self._req(self.conf["request"])
17 | body = r.text
18 | rss = feedparser.parse(body)
19 |         parser = None  # holds the last field parser for build_result below
20 |
21 | for entry in rss.entries:
22 | for parser1 in self.conf["results"]:
23 | result_dict = dict()
24 | for (key, parser) in parser1.items():
25 |                 # each key names an RSS entry attribute to parse
26 | rex = re.compile(parser["regex"])
27 | fieldnames = parser["values"]
28 | if not isinstance(fieldnames, list):
29 | fieldnames = [fieldnames]
30 | rss_value = getattr(entry, key)
31 | m = rex.search(rss_value)
32 | if m:
33 | result_dict.update(dict(zip(fieldnames, m.groups())))
34 | else:
35 | result_dict = None
36 | break
37 |
38 | if result_dict is None:
39 | continue
40 |
41 | yield self.build_result(parser, result_dict)
42 |
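43 | # Example (hypothetical results entry):
44 | #   - title:
45 | #       regex: '(\S+) is listed as (malware|phishing)'
46 | #       values: [url, threat_type]
47 | # Each feed entry's "title" attribute is matched against the regex and
48 | # the captured groups are zipped into "values"; entries where any
49 | # configured field fails to match are skipped entirely.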
--------------------------------------------------------------------------------
/src/machinae/utils.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | import yaml
3 |
4 | class MachinaeLoader(yaml.SafeLoader):
5 | #pylint: disable=arguments-differ,too-many-ancestors
6 | def construct_mapping(self, node):
7 | self.flatten_mapping(node)
8 | return OrderedDict(self.construct_pairs(node))
9 |
10 |
11 | MachinaeLoader.add_constructor(
12 | yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
13 | MachinaeLoader.construct_mapping)
14 |
15 |
16 | #pylint: disable=too-many-ancestors
17 | class MachinaeDumper(yaml.Dumper):
18 | def represent_dict(self, data):
19 | return self.represent_mapping('tag:yaml.org,2002:map', data, False)
20 |
21 | def represent_list(self, data):
22 | return self.represent_sequence('tag:yaml.org,2002:seq', data, False)
23 |
24 |
25 | MachinaeDumper.add_representer(
26 | OrderedDict,
27 | MachinaeDumper.represent_dict)
28 |
29 | MachinaeDumper.add_representer(
30 | list,
31 | MachinaeDumper.represent_list)
32 |
33 | #Override the built-in PyYAML safe_load so site configs and results
34 | #load as OrderedDicts and preserve key order
35 | def safe_load(*args, **kwargs):
36 | kwargs["Loader"] = MachinaeLoader
37 | return yaml.load(*args, **kwargs)
38 |
39 |
40 | def dump(*args, **kwargs):
41 | kwargs["Dumper"] = MachinaeDumper
42 | return yaml.dump(*args, **kwargs)
43 |
44 |
45 | def listsites(conf):
46 | rstr = '{0:40}{1:40}{2:40}{3}'.format('SITE', 'NAME', 'OTYPES', 'DEFAULT')
47 | rstr += '\n'
48 | for key in conf:
49 | d = 'True'
50 | if "default" in conf[key].keys():
51 | d = str(conf[key]["default"])
52 | rstr += '{0:40}{1:40}{2:40}{3}'.format(key,
53 | conf[key]["name"],
54 | ', '.join(conf[key]["otypes"]),
55 | d)
56 | rstr += '\n'
57 | return rstr
58 |
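59 | # Example: these wrappers keep YAML mappings ordered end to end:
60 | #   conf = safe_load(open("machinae.yml"))  # every mapping is an OrderedDict
61 | #   print(dump(conf))                       # block style, key order preserved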
--------------------------------------------------------------------------------