├── .bumpversion.cfg ├── .editorconfig ├── .github └── ISSUE_TEMPLATE.md ├── .gitignore ├── .gitlab-ci.yml ├── .travis.yml ├── AUTHORS.rst ├── CHANGELOG.md ├── CONTRIBUTING.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── detectem ├── __init__.py ├── cli.py ├── core.py ├── data │ └── wordpress.jl ├── exceptions.py ├── matchers.py ├── plugin.py ├── plugins │ ├── __init__.py │ ├── angular.py │ ├── angularjs.py │ ├── apache.py │ ├── backbonejs.py │ ├── crayon-syntax-highlighter.py │ ├── d3js.py │ ├── emberjs.py │ ├── generic │ │ ├── __init__.py │ │ └── wordpress.py │ ├── ghost.py │ ├── helpers.py │ ├── iis.py │ ├── joomla.py │ ├── jquery.py │ ├── jqueryui.py │ ├── knockoutjs.py │ ├── lightbox.py │ ├── modernizr.py │ ├── momentjs.py │ ├── mootools.py │ ├── nginx.py │ ├── php.py │ ├── phusion-passenger.py │ ├── piwik.py │ ├── react.py │ ├── requirejs.py │ ├── ssl.py │ ├── underscorejs.py │ ├── vue.py │ ├── w3-total-cache.py │ ├── webpack.py │ ├── wordpress.py │ └── wp-super-cache.py ├── response.py ├── results.py ├── script.lua ├── settings.py ├── splash.py ├── utils.py └── ws.py ├── docs ├── Makefile ├── assets │ └── browser_js_console.png ├── conf.py ├── generic.rst ├── index.rst ├── installation.rst ├── matchers.rst ├── matchers │ ├── body.rst │ ├── dom.rst │ ├── header.rst │ ├── url.rst │ └── xpath.rst ├── plugin_development.rst ├── references.rst └── requirements.txt ├── extras └── docker │ ├── Dockerfile-alternate │ └── docker-compose.yml ├── requirements ├── base.txt ├── devel.txt └── tests.txt ├── scripts ├── add_new_plugin.py ├── get_shodan_banners.py └── get_software_hashes.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── integration │ ├── __init__.py │ └── test_core.py ├── plugins │ ├── __init__.py │ ├── fixtures │ │ ├── angular.yml │ │ ├── angularjs.yml │ │ ├── apache.yml │ │ ├── backbonejs.yml │ │ ├── d3js.yml │ │ ├── ember.yml │ │ ├── ghost.yml │ │ ├── iis.yml │ │ ├── joomla.yml │ │ ├── jquery.yml │ 
│ ├── jqueryui.yml │ │ ├── knockoutjs.yml │ │ ├── lightbox.yml │ │ ├── modernizr.yml │ │ ├── momentjs.yml │ │ ├── mootools.yml │ │ ├── nginx.yml │ │ ├── php.yml │ │ ├── phusion-passenger.yml │ │ ├── piwik.yml │ │ ├── react.yml │ │ ├── requirejs.yml │ │ ├── ssl.yml │ │ ├── underscorejs.yml │ │ ├── vue.yml │ │ ├── w3-total-cache.yml │ │ ├── webpack.yml │ │ ├── wordpress.yml │ │ └── wp-super-cache.yml │ ├── test_common.py │ ├── test_generic.py │ └── utils.py ├── splash │ ├── __init__.py │ ├── test_docker_manager.py │ ├── test_remote_manager.py │ └── test_splash_manager.py ├── test_cli.py ├── test_core.py ├── test_exceptions.py ├── test_matchers.py ├── test_response.py ├── test_results.py ├── test_utils.py └── test_ws.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.7.3 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version="{current_version}" 8 | replace = version="{new_version}" 9 | 10 | [bumpversion:file:detectem/__init__.py] 11 | search = __version__ = "{current_version}" 12 | replace = __version__ = "{new_version}" 13 | 14 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.py] 14 | # isort configuration 15 | multi_line_output=3 16 | include_trailing_comma=true 17 | lines_between_types=1 18 | 19 | [*.lua] 20 | indent_size = 2 21 | 22 | [*.yml] 23 | indent_size = 2 24 | 25 | [LICENSE] 26 | insert_final_newline = false 27 | 28 | [Makefile] 29 | indent_style = tab 30 | 31 | [.bumpversion.cfg] 32 | trim_trailing_whitespace = false 33 | indent_style = tab 34 | 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * detectem version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 15 | ``` 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | env/ 9 | build/ 10 | develop-eggs/ 11 | dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | 24 | # PyInstaller 25 | # Usually these files are written by a python script from a template 26 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
27 | *.manifest 28 | *.spec 29 | 30 | # Installer logs 31 | pip-log.txt 32 | pip-delete-this-directory.txt 33 | 34 | # Unit test / coverage reports 35 | htmlcov/ 36 | .tox/ 37 | .coverage 38 | .coverage.* 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | *,cover 43 | .hypothesis/ 44 | 45 | # Translations 46 | *.mo 47 | *.pot 48 | 49 | # Django stuff: 50 | *.log 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | _build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # pyenv python configuration file 60 | .python-version 61 | .mypy_cache 62 | settings.json 63 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: python:3.6.2-alpine 2 | 3 | test: 4 | script: 5 | - apk add --update build-base libxml2-dev libxslt-dev 6 | - pip install -r requirements.txt 7 | - pip install -r tests/requirements.txt 8 | - py.test tests 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 3.6 4 | - 3.7 5 | install: 6 | - pip install -r requirements/base.txt 7 | - pip install -r requirements/devel.txt 8 | - pip install -r requirements/tests.txt 9 | script: 10 | - pytest 11 | deploy: 12 | provider: pypi 13 | distributions: "sdist bdist_wheel" 14 | on: 15 | tags: true 16 | skip_existing: true 17 | user: "__token__" 18 | password: 19 | secure: 
upF9ChYklNX9P1WQmydWjfVF8MZyYTj7WRiF9UeG/JLLgWE18MzeLmhibzvLy8qE1d1PBjdBY45m8PKnNGfvT5UCDFyVcYoodiLpHZ9MgXCYCCoKDLJBm0SA0CkmbJ3M29zLIGzIEJrwDhFihQBdHYMIh4tRbMi/QwfeHoy/R7KXyeBxyhAe1tZAns6u6i9fgP9ECwK112cznNCf759Rapu6CAzV9wpzM/FYTzttcyiPuAoD7AflfRZS4s+MUqQXC3EHqrV6girzai7yiWSfUP3+XFME7wZVPgDORo9MBvG67UpIIy+seZ4Y11p+gr6qJmVWjRSePwW5zevOTJXXGyaLuUkFHp1xrAXC16pufqau8NFQYo5e8P771c3T3Fn+uQvQ9onyYW2lSXhHJOqrVA07jni8HuYKuh8PgCxld4zdQYrUpW32PxQBdV9pyuBapAJmmad0Pj8MSkPI8+9ffma/P1Cz9RXPJxTqV0WqWaNy+mODZ1NIQPW2LHqImoTGML/k97pmRth77BGNDGnNqcs9x1UL7TCXTif/BKhm4Cfb3O5HZDJBsyetMDHdUHwMYrqYFKiSf4i841RGk5fruVksdJwu3EWCPy18MY7rV7HwbEtsV9FiZ8b2f5Osk8rjZ9VDYu8rDe2iz9tZfKxJM3HjcqB+RLmULXCEYWRFn3Y= 20 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Claudio Salazar 9 | * Sebastian Mancilla 10 | 11 | Contributors 12 | ------------ 13 | 14 | * Rodrigo Tobar 15 | * Tomas Hernandez 16 | * Alexei Volynshchikov 17 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Change Log 2 | ========== 3 | 4 | All notable changes to this project will be documented in this file. 5 | 6 | The format is based on [Keep a Changelog](http://keepachangelog.com/) 7 | and this project adheres to [Semantic Versioning](http://semver.org/). 
8 | 9 | 0.7.3 - 2020-07-02 10 | ------------------ 11 | ## Added 12 | - Support for input file (1st approach) 13 | 14 | ## Fixed 15 | - Remove tests from deliverables 16 | - Handling of JS errors 17 | 18 | 19 | 0.7.2 - 2020-06-29 20 | ------------------ 21 | ## Added 22 | - Meaningful message in test failure 23 | 24 | ## Updated 25 | - Start splash container listening only locally 26 | 27 | ## Fixed 28 | - Some plugins 29 | 30 | 0.7.1 - 2019-12-03 31 | ------------------ 32 | ## Added 33 | - Update dependencies 34 | - Include black and isort in tests 35 | 36 | 0.7.0 - 2018-09-26 37 | ------------------ 38 | ## Added 39 | - Improve Javascript support 40 | - Add updated documentation 41 | - Add generic plugins 42 | - Add multiple scripts 43 | - Many minor improvements 44 | 45 | 0.6.1 - 2017-09-07 46 | ------------------ 47 | ## Fixed 48 | - Regression in the web service 49 | 50 | 0.6.0 - 2017-08-31 51 | ------------------ 52 | ### Added 53 | - Options to set Splash timeout 54 | - Detect software on inline scripts 55 | - Support for custom plugins 56 | - A from_url field to the output metadata 57 | 58 | ### Changed 59 | - Better error handling 60 | - Do not run JS matchers when site uses Content Security Policy 61 | - Disable loading images on Splash 62 | - Use names when listing hints 63 | - Sort detected plugins by name and version 64 | - Remove duplicated plugins from detection results 65 | 66 | 0.5.2 - 2017-03-24 67 | ------------------ 68 | ### Changed 69 | - Refactor of result types 70 | - Blacklist in HAR results 71 | - Output improvement 72 | 73 | 0.5.1 - 2017-03-23 74 | ------------------ 75 | ### Added 76 | - Concept of hints 77 | - Documentation about modular matchers 78 | - Two Joomla plugins 79 | 80 | ### Changed 81 | - Plugin interface to make it more flexible 82 | 83 | 0.5.0 - 2017-03-20 84 | ------------------ 85 | ### Added 86 | - Concept of indicators 87 | - Documentation about modular matchers 88 | 89 | ### Fixed 90 | - Tests 91 | 92 | 0.4.5 - 
2017-03-15 93 | ------------------ 94 | 95 | 0.4.4 - 2017-03-15 96 | ------------------ 97 | 98 | ### Fixed 99 | - Header detection 100 | 101 | 0.4.3 - 2017-03-15 102 | ------------------ 103 | ### Added 104 | - Documentation for the project 105 | - MooTools plugins 106 | - Add modular plugins support 107 | - Angular.js plugin 108 | 109 | ### Changed 110 | - Improved add_new_plugin to be easy to create a plugin 111 | - Refactored core components 112 | 113 | 0.4.2 - 2017-02-13 114 | ------------------ 115 | ### Added 116 | - A new plugin 117 | - Javascript support through LUA script 118 | - Tests for new javascript feature 119 | - New d3.js plugin 120 | 121 | ### Changed 122 | - Replace spaces by dash in plugin names 123 | - contact-form-7 location 124 | 125 | ### Fixed 126 | - Tests 127 | 128 | 0.4.1 - 2017-02-13 129 | ------------------ 130 | ### Added 131 | - jquery_migrate plugin 132 | - Better error handling 133 | 134 | 0.4.0 - 2017-02-12 135 | ------------------ 136 | ### Added 137 | - Plugin metadata 138 | - Javascript support through LUA script 139 | - Tests for new javascript feature 140 | - New d3.js plugin 141 | 142 | ### Changed 143 | - Updated requirements file 144 | 145 | 0.3.0 - 2016-12-27 146 | ------------------ 147 | ### Added 148 | - Web service 149 | - Support to configure Splash from environment variables 150 | 151 | 0.2.0 - 2016-12-21 152 | ------------------ 153 | ### Added 154 | - Some new plugins 155 | 156 | ### Changed 157 | - Updated to use docker 2.0 library 158 | - Improved docker decorator 159 | 160 | 0.1.3 - 2016-11-18 161 | ------------------ 162 | ### Added 163 | - Some new plugins 164 | 165 | ### Changed 166 | - Updated add_new_plugin script with latest changes 167 | 168 | ### Fixed 169 | - Response body decoding 170 | 171 | 0.1.2 - 2016-11-16 172 | ------------------ 173 | ### Added 174 | - Some attributes to Plugin interface 175 | - Some new plugins 176 | - Script to create plugins faster 177 | 178 | ### Fixed 179 | - jQuery 
plugin 180 | 181 | 0.1.1 - 2016-11-15 182 | ------------------ 183 | ### Fixed 184 | - Travis setup and setup.py 185 | 186 | 0.1.0 - 2016-11-15 187 | ------------------ 188 | ### Added 189 | - Initial version 190 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contributing 5 | ============ 6 | 7 | Contributions are welcome, and they are greatly appreciated! Every 8 | little bit helps, and credit will always be given. 9 | 10 | You can contribute in many ways: 11 | 12 | Types of Contributions 13 | ---------------------- 14 | 15 | Report Bugs 16 | ~~~~~~~~~~~ 17 | 18 | Report bugs at https://github.com/alertot/detectem/issues. 19 | 20 | If you are reporting a bug, please include: 21 | 22 | * Your operating system name and version. 23 | * Any details about your local setup that might be helpful in troubleshooting. 24 | * Detailed steps to reproduce the bug. 25 | 26 | Fix Bugs 27 | ~~~~~~~~ 28 | 29 | Look through the GitHub issues for bugs. Anything tagged with "bug" 30 | and "help wanted" is open to whoever wants to implement it. 31 | 32 | Implement Features 33 | ~~~~~~~~~~~~~~~~~~ 34 | 35 | Look through the GitHub issues for features. Anything tagged with "enhancement" 36 | and "help wanted" is open to whoever wants to implement it. 37 | 38 | Write Documentation 39 | ~~~~~~~~~~~~~~~~~~~ 40 | 41 | detectem could always use more documentation, whether as part of the 42 | official detectem docs, in docstrings, or even on the web in blog posts, 43 | articles, and such. 44 | 45 | Submit Feedback 46 | ~~~~~~~~~~~~~~~ 47 | 48 | The best way to send feedback is to file an issue at https://github.com/alertot/detectem/issues. 49 | 50 | If you are proposing a feature: 51 | 52 | * Explain in detail how it would work. 53 | * Keep the scope as narrow as possible, to make it easier to implement. 
54 | * Remember that this is a volunteer-driven project, and that contributions 55 | are welcome :) 56 | 57 | Get Started! 58 | ------------ 59 | 60 | Ready to contribute? Here's how to set up `detectem` for local development. 61 | 62 | 1. Fork the `detectem` repo on GitHub. 63 | 2. Clone your fork locally:: 64 | 65 | $ git clone git@github.com:your_name_here/detectem.git 66 | 67 | 3. Install your local copy into a virtualenv:: 68 | 69 | $ mkvirtualenv detectem 70 | $ cd detectem/ 71 | $ pip install -e . 72 | $ pip install -r requirements/devel.txt 73 | $ pip install -r requirements/tests.txt 74 | 75 | 4. You must include `black` and `isort` in your IDE. 76 | 77 | 5. Create a branch for local development:: 78 | 79 | $ git checkout -b name-of-your-bugfix-or-feature 80 | 81 | Now you can make your changes locally. 82 | 83 | 6. When you're done making changes, check that your changes pass the tests, including testing other Python versions with tox:: 84 | 85 | $ pytest . 86 | $ tox 87 | 88 | 7. Commit your changes and push your branch to GitHub:: 89 | 90 | $ git add . 91 | $ git commit -m "Your detailed description of your changes." 92 | $ git push origin name-of-your-bugfix-or-feature 93 | 94 | 8. Submit a pull request through the GitHub website. 95 | 96 | Pull Request Guidelines 97 | ----------------------- 98 | 99 | Before you submit a pull request, check that it meets these guidelines: 100 | 101 | 1. The pull request should include tests. 102 | 2. If the pull request adds functionality, the docs should be updated. Put 103 | your new functionality into a function with a docstring, and add the 104 | feature to the list in README.rst. 105 | 3. The pull request should work for Python 3.6. Check 106 | https://travis-ci.org/alertot/detectem/pull_requests 107 | and make sure that the tests pass for all supported Python versions. 
108 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 0.1.0 (2016-11-08) 6 | ------------------ 7 | 8 | * First release on PyPI. 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Claudio Salazar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | include AUTHORS.rst 3 | 4 | include CONTRIBUTING.rst 5 | include HISTORY.rst 6 | include LICENSE 7 | include README.rst 8 | include requirements/*.txt 9 | include detectem/script.lua 10 | 11 | recursive-exclude tests/* 12 | recursive-exclude * __pycache__ 13 | recursive-exclude * *.py[co] 14 | 15 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 16 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean clean-test clean-pyc clean-build docs help 2 | .DEFAULT_GOAL := help 3 | define BROWSER_PYSCRIPT 4 | import os, webbrowser, sys 5 | try: 6 | from urllib import pathname2url 7 | except: 8 | from urllib.request import pathname2url 9 | 10 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 11 | endef 12 | export BROWSER_PYSCRIPT 13 | 14 | define PRINT_HELP_PYSCRIPT 15 | import re, sys 16 | 17 | for line in sys.stdin: 18 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 19 | if match: 20 | target, help = match.groups() 21 | print("%-20s %s" % (target, help)) 22 | endef 23 | export PRINT_HELP_PYSCRIPT 24 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 25 | 26 | help: 27 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 28 | 29 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 30 | 31 | 32 | clean-build: ## remove build artifacts 33 | rm -fr build/ 34 | rm -fr dist/ 35 | rm -fr .eggs/ 36 | find . -name '*.egg-info' -exec rm -fr {} + 37 | find . -name '*.egg' -exec rm -f {} + 38 | 39 | clean-pyc: ## remove Python file artifacts 40 | find . -name '*.pyc' -exec rm -f {} + 41 | find . -name '*.pyo' -exec rm -f {} + 42 | find . 
-name '*~' -exec rm -f {} + 43 | find . -name '__pycache__' -exec rm -fr {} + 44 | 45 | clean-test: ## remove test and coverage artifacts 46 | rm -fr .tox/ 47 | rm -f .coverage 48 | rm -fr htmlcov/ 49 | 50 | lint: ## check style with flake8 51 | flake8 detectem tests 52 | 53 | test: ## run tests quickly with the default Python 54 | py.test 55 | 56 | 57 | test-all: ## run tests on every Python version with tox 58 | tox 59 | 60 | coverage: ## check code coverage quickly with the default Python 61 | coverage run --source detectem -m pytest 62 | 63 | coverage report -m 64 | coverage html 65 | $(BROWSER) htmlcov/index.html 66 | 67 | release: clean ## package and upload a release 68 | python setup.py sdist upload 69 | python setup.py bdist_wheel upload 70 | 71 | dist: clean ## builds source and wheel package 72 | python setup.py sdist 73 | python setup.py bdist_wheel 74 | ls -l dist 75 | 76 | install: clean ## install the package to the active Python's site-packages 77 | python setup.py install 78 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | detectem 2 | ======== 3 | 4 | **Python 3.6 or greater is required.** 5 | 6 | .. image:: https://img.shields.io/pypi/v/detectem.svg 7 | :target: https://pypi.python.org/pypi/detectem 8 | 9 | .. image:: https://img.shields.io/travis/alertot/detectem.svg 10 | :target: https://travis-ci.org/alertot/detectem 11 | 12 | detectem is a specialized software detector. 13 | Let's see it in action. 14 | 15 | .. 
code-block:: bash 16 | 17 | $ det http://domain.tld 18 | [{'name': 'phusion-passenger', 'version': '4.0.10'}, 19 | {'name': 'apache-mod_bwlimited', 'version': '1.4'}, 20 | {'name': 'apache-mod_fcgid', 'version': '2.3.9'}, 21 | {'name': 'jquery', 'version': '1.11.3'}, 22 | {'name': 'crayon-syntax-highlighter', 'version': '2.7.2_beta'}] 23 | 24 | 25 | Using a series of indicators, it's able to detect software running on a site 26 | and accurately extract its version information. 27 | It uses Splash_ API 28 | to render the website and start the detection routine. 29 | It does full analysis on requests, responses and even on the DOM_! 30 | 31 | There are two important articles to read: 32 | 33 | * `Reasons to create detectem `_ 34 | * `Introduction to detectem `_ 35 | 36 | 37 | Features 38 | -------- 39 | 40 | * Detect software in modern web technologies. 41 | * Browser support provided by Splash_. 42 | * Analysis on requests made and responses received by the browser. 43 | * Get software information from the DOM. 44 | * Great performance (less than 10 seconds to get a fingerprint). 45 | * Plugin system to add new software easily. 46 | * Test suite to ensure plugin result integrity. 47 | * Continuous development to support new features. 48 | 49 | 50 | Installation 51 | ------------ 52 | 53 | 1. Install Docker_ and add your user to the docker group, so that you can avoid using sudo. 54 | 55 | 2. Pull the image:: 56 | 57 | $ docker pull scrapinghub/splash 58 | 59 | 3. Create a virtual environment with Python >= 3.6. 60 | 61 | 4. Install detectem:: 62 | 63 | $ pip install detectem 64 | 65 | 5. Run it against some URL:: 66 | 67 | $ det http://domain.tld 68 | 69 | 70 | Other installation method 71 | ------------------------- 72 | 73 | detectem as Docker Container 74 | ------------------------------ 75 | 76 | Let's see it in action. 77 | 78 | .. 
code-block:: bash 79 | 80 | $ docker-compose run --rm detectem http://domain.tld 81 | [{'name': 'phusion-passenger', 'version': '4.0.10'}, 82 | {'name': 'apache-mod_bwlimited', 'version': '1.4'}, 83 | {'name': 'apache-mod_fcgid', 'version': '2.3.9'}, 84 | {'name': 'jquery', 'version': '1.11.3'}, 85 | {'name': 'crayon-syntax-highlighter', 'version': '2.7.2_beta'}] 86 | 87 | But first of all, we must do the following: 88 | 89 | 90 | Installation 91 | ------------ 92 | 93 | 1. Install the latest `Docker CE Stable version`_. 94 | 95 | 2. Add your user to the docker group and log out:: 96 | 97 | $ sudo usermod -aG docker $USER 98 | 99 | 3. Make sure you have logged out to apply the changes, then log in again. 100 | 101 | 4. Install `Docker Compose`_. 102 | 103 | 5. Download the docker-compose build files to your workspace. 104 | 105 | `Dockerfile-alternate`_ 106 | `docker-compose.yml`_ 107 | 108 | 6. Build the required docker images for detectem in the same directory used in the 109 | previous step:: 110 | 111 | $ docker-compose up -d 112 | 113 | 7. Run detectem against some URL:: 114 | 115 | $ docker-compose run --rm detectem http://domain.tld 116 | 117 | 118 | Documentation 119 | ------------- 120 | 121 | The documentation is at `ReadTheDocs `_. 122 | 123 | .. _Docker: http://docker.io 124 | .. _Splash: https://github.com/scrapinghub/splash 125 | .. _DOM: https://en.wikipedia.org/wiki/Document_Object_Model 126 | .. _`Docker CE Stable version`: https://www.docker.com/community-edition 127 | .. _`Docker compose`: https://docs.docker.com/compose/install/ 128 | .. _Dockerfile-alternate: extras/docker/Dockerfile-alternate 129 | .. 
_docker-compose.yml: extras/docker/docker-compose.yml 130 | -------------------------------------------------------------------------------- /detectem/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = """Claudio Salazar""" 2 | __email__ = "csalazar@spect.cl" 3 | __version__ = "0.7.3" 4 | -------------------------------------------------------------------------------- /detectem/cli.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import sys 4 | import tempfile 5 | from collections import namedtuple 6 | from multiprocessing import Process, Queue, current_process 7 | from operator import attrgetter 8 | from urllib.parse import urlparse 9 | 10 | import click 11 | import click_log 12 | 13 | from detectem.core import Detector 14 | from detectem.exceptions import DockerStartError, NoPluginsError, SplashError 15 | from detectem.plugin import load_plugins 16 | from detectem.response import get_response 17 | from detectem.settings import ( 18 | CMD_OUTPUT, 19 | JSON_OUTPUT, 20 | NUMBER_OF_SPLASH_INSTANCES, 21 | SPLASH_MAX_RETRIES, 22 | SPLASH_TIMEOUT, 23 | ) 24 | from detectem.splash import get_splash_manager 25 | from detectem.utils import create_printer 26 | 27 | # Set up logging 28 | logger = logging.getLogger("detectem") 29 | click_log.ColorFormatter.colors["info"] = dict(fg="green") 30 | click_log.basic_config(logger) 31 | 32 | 33 | TaskItem = namedtuple("TaskItem", ["args", "retries"]) 34 | 35 | 36 | @click.command() 37 | @click.option( 38 | "--timeout", 39 | default=SPLASH_TIMEOUT, 40 | type=click.INT, 41 | help="Timeout for Splash (in seconds).", 42 | ) 43 | @click.option( 44 | "--format", 45 | default=CMD_OUTPUT, 46 | type=click.Choice([CMD_OUTPUT, JSON_OUTPUT]), 47 | help="Set the format of the results.", 48 | ) 49 | @click.option( 50 | "--metadata", 51 | default=False, 52 | is_flag=True, 53 | help="Include this flag to return 
plugin metadata.", 54 | ) 55 | @click.option("--list-plugins", is_flag=True, help="List registered plugins") 56 | @click.option("--save-har", is_flag=True, help="Save har to file") 57 | @click.option("-i", "--input-file", type=click.File("r"), help="Read URLs from file") 58 | @click_log.simple_verbosity_option(logger, default="error") 59 | @click.argument("input_url", required=False) 60 | def main(timeout, format, metadata, list_plugins, save_har, input_file, input_url): 61 | # Gather urls 62 | urls = [] 63 | if input_file: 64 | urls += input_file.read().splitlines() 65 | if input_url: 66 | urls.append(input_url) 67 | 68 | # Check that `urls` contains valid URLs 69 | if not all(map(lambda u: urlparse(u).scheme in ["http", "https"], urls)): 70 | raise click.BadParameter("Check that all provided URLs are valid URLS") 71 | 72 | OPTIONS_WITHOUT_URLS = [list_plugins] 73 | 74 | # Exit if neither urls were defined nor an option that works without urls 75 | if not urls and not any(OPTIONS_WITHOUT_URLS): 76 | click.echo(click.get_current_context().get_help()) 77 | sys.exit(1) 78 | 79 | printer = create_printer(format) 80 | 81 | # --list-plugins option 82 | if list_plugins: 83 | try: 84 | printer(get_plugins(metadata)) 85 | except NoPluginsError as e: 86 | printer(str(e)) 87 | finally: 88 | sys.exit(1) 89 | 90 | # Create queues 91 | task_queue = Queue() 92 | result_queue = Queue() 93 | 94 | # Init splash manager 95 | splash_manager = get_splash_manager() 96 | logger.info(f"[+] Using {splash_manager.__class__.__name__} as Splash manager") 97 | 98 | # Change number of instances if there are fewer urls to analyze 99 | n_instances = NUMBER_OF_SPLASH_INSTANCES 100 | if n_instances > len(urls): 101 | n_instances = len(urls) 102 | 103 | logger.info(f"[+] Using {n_instances} Splash instances") 104 | logger.info(f"[+] Setting up Splash manager") 105 | splash_manager.setup(n_instances) 106 | 107 | # Number of available instances could be different to `n_instances` because of issues 
starting instances 108 | n_available_instances = splash_manager.get_number_of_available_instances() 109 | if n_available_instances != n_instances: 110 | logger.info(f"[+] Only {n_available_instances} instances are going to be used") 111 | 112 | logger.info(f"[+] Setting up done") 113 | 114 | # Create pool of workers 115 | processes = [ 116 | Process( 117 | target=process_url_worker, 118 | args=(splash_manager, task_queue, result_queue), 119 | ) 120 | for _ in range(n_available_instances) 121 | ] 122 | 123 | # Start the workers 124 | for p in processes: 125 | p.start() 126 | 127 | # Send the provided urls to the input queue 128 | for url in urls: 129 | task_queue.put(TaskItem(args=[url, timeout, metadata, save_har], retries=0)) 130 | 131 | # Wait until processing on all workers is done 132 | for p in processes: 133 | p.join() 134 | 135 | # Process results 136 | results = [] 137 | while not result_queue.empty(): 138 | result = result_queue.get() 139 | results.append(result) 140 | 141 | printer(results) 142 | 143 | splash_manager.teardown() 144 | 145 | 146 | def process_url_worker(splash_manager, task_queue, result_queue): 147 | process_name = current_process().name 148 | 149 | with splash_manager.sem: 150 | task_item: TaskItem 151 | 152 | for task_item in iter(task_queue.get, "STOP"): 153 | args = task_item.args 154 | url = args[0] 155 | 156 | # Get a Splash instance from pool of Splash servers 157 | with splash_manager.assign_instance() as (container_name, splash_url): 158 | result = None 159 | 160 | logger.info( 161 | f"[+] Processing {url} @ {process_name} [retry: {task_item.retries} | instance: {container_name}]" 162 | ) 163 | 164 | try: 165 | result = get_detection_results(*args + [splash_url]) 166 | except SplashError as e: 167 | # Handle limit of retries 168 | retries = task_item.retries + 1 169 | 170 | if retries == SPLASH_MAX_RETRIES: 171 | result = { 172 | "url": url, 173 | "error": "Maximum number of retries reached.", 174 | } 175 | else: 176 | # Put back 
in `task_queue` with incremented `retries` 177 | task_queue.put(TaskItem(args=task_item.args, retries=retries)) 178 | 179 | # Notify error to the manager 180 | if splash_manager.handles_errors: 181 | splash_manager.handle_error(container_name) 182 | except (NoPluginsError, DockerStartError) as e: 183 | result = {"url": url, "error": str(e)} 184 | 185 | if result: 186 | result_queue.put(result) 187 | 188 | # Finish if there aren't any more tasks in the queue 189 | if task_queue.empty(): 190 | logger.info(f"[+] Processing is done @ {process_name}") 191 | return 192 | 193 | 194 | def get_detection_results( 195 | url, 196 | timeout, 197 | metadata=False, 198 | save_har=False, 199 | splash_url="", 200 | ): 201 | """Return results from detector. 202 | 203 | This function prepares the environment loading the plugins, 204 | getting the response and passing it to the detector. 205 | 206 | In case of errors, it raises exceptions to be handled externally. 207 | 208 | """ 209 | plugins = load_plugins() 210 | if not plugins: 211 | raise NoPluginsError("No plugins found") 212 | 213 | logger.debug("[+] Starting detection with %(n)d plugins", {"n": len(plugins)}) 214 | 215 | response = get_response(url, plugins, timeout, splash_url) 216 | 217 | # Save HAR 218 | if save_har: 219 | fd, path = tempfile.mkstemp(suffix=".har") 220 | logger.info(f"Saving HAR file to {path}") 221 | 222 | with open(fd, "w") as f: 223 | json.dump(response["har"], f) 224 | 225 | det = Detector(response, plugins, url) 226 | softwares = det.get_results(metadata=metadata) 227 | 228 | output = {"url": url, "softwares": softwares} 229 | 230 | return output 231 | 232 | 233 | def get_plugins(metadata): 234 | """Return the registered plugins. 235 | 236 | Load and return all registered plugins. 
class HarProcessor:
    """Process the HAR list returned by Splash,
    adding some useful markers for matcher application.
    """

    @staticmethod
    def _set_entry_type(entry, entry_type):
        """ Set entry type (detectem internal metadata) """
        entry.setdefault("detectem", {})["type"] = entry_type

    @staticmethod
    def _get_location(entry):
        """Return the `Location` header value if it's present in ``entry``.

        Header names are compared case-insensitively, since HTTP field
        names are case-insensitive. Returns ``None`` when the response has
        no such header (or no headers at all).
        """
        headers = entry["response"].get("headers", [])

        for header in headers:
            if header["name"].lower() == "location":
                return header["value"]

        return None

    @classmethod
    def _script_to_har_entry(cls, script, url):
        """ Return a HAR-like entry wrapping an embedded script """
        entry = {
            "request": {"url": url},
            "response": {"url": url, "content": {"text": script}},
        }

        cls._set_entry_type(entry, INLINE_SCRIPT_ENTRY)

        return entry

    def mark_entries(self, entries):
        """Mark one entry as main entry and the rest as resource entry.

        Main entry is the entry that contain response's body
        of the requested URL. An empty ``entries`` list is left untouched.
        """
        if not entries:
            return

        for entry in entries:
            self._set_entry_type(entry, RESOURCE_ENTRY)

        # If first entry doesn't have a redirect, set it as main entry
        main_entry = entries[0]
        main_location = self._get_location(main_entry)
        if not main_location:
            self._set_entry_type(main_entry, MAIN_ENTRY)
            return

        # Resolve redirected URL and see if it's in the rest of entries
        main_url = urllib.parse.urljoin(get_url(main_entry), main_location)
        for entry in entries[1:]:
            if get_url(entry) == main_url:
                self._set_entry_type(entry, MAIN_ENTRY)
                break
        else:
            # In fail case, set the first entry
            self._set_entry_type(main_entry, MAIN_ENTRY)

    def prepare(self, response, url):
        """Return the list of entries to analyze for ``response``.

        The ``"har"`` list of ``response`` is marked with entry types and
        every item of ``response["scripts"]`` is appended to that same list
        as an inline-script entry attributed to ``url``.
        """
        har = response.get("har", [])
        if har:
            self.mark_entries(har)

        # Detect embed scripts and add them to HAR list
        for script in response.get("scripts", []):
            har.append(self._script_to_har_entry(script, url))

        return har
""" 110 | return entry["detectem"]["type"] 111 | 112 | def get_hints(self, plugin): 113 | """ Return plugin hints from ``plugin``. """ 114 | hints = [] 115 | 116 | for hint_name in getattr(plugin, "hints", []): 117 | hint_plugin = self._plugins.get(hint_name) 118 | if hint_plugin: 119 | hint_result = Result( 120 | name=hint_plugin.name, 121 | homepage=hint_plugin.homepage, 122 | from_url=self.requested_url, 123 | type=HINT_TYPE, 124 | plugin=plugin.name, 125 | ) 126 | hints.append(hint_result) 127 | 128 | logger.debug(f"{plugin.name} & hint {hint_result.name} detected") 129 | else: 130 | logger.error(f"{plugin.name} hints an invalid plugin: {hint_name}") 131 | 132 | return hints 133 | 134 | def process_from_splash(self): 135 | """ Add softwares found in the DOM """ 136 | for software in self._softwares_from_splash: 137 | plugin = self._plugins.get(software["name"]) 138 | 139 | # Determine if it's a version or presence result 140 | try: 141 | additional_data = {"version": software["version"]} 142 | except KeyError: 143 | additional_data = {"type": INDICATOR_TYPE} 144 | 145 | self._results.add_result( 146 | Result( 147 | name=plugin.name, 148 | homepage=plugin.homepage, 149 | from_url=self.requested_url, 150 | plugin=plugin.name, 151 | **additional_data, 152 | ) 153 | ) 154 | 155 | for hint in self.get_hints(plugin): 156 | self._results.add_result(hint) 157 | 158 | def _get_matchers_for_entry(self, plugin, entry): 159 | grouped_matchers = plugin.get_grouped_matchers() 160 | 161 | def remove_group(group): 162 | if group in grouped_matchers: 163 | del grouped_matchers[group] 164 | 165 | if self._get_entry_type(entry) == MAIN_ENTRY: 166 | remove_group("body") 167 | remove_group("url") 168 | else: 169 | remove_group("header") 170 | remove_group("xpath") 171 | 172 | remove_group("dom") 173 | 174 | return grouped_matchers 175 | 176 | def apply_plugin_matchers(self, plugin, entry): 177 | data_list = [] 178 | grouped_matchers = self._get_matchers_for_entry(plugin, entry) 179 
| 180 | for matcher_type, matchers in grouped_matchers.items(): 181 | klass = MATCHERS[matcher_type] 182 | plugin_match = klass.get_info(entry, *matchers) 183 | if plugin_match.name or plugin_match.version or plugin_match.presence: 184 | data_list.append(plugin_match) 185 | 186 | return get_most_complete_pm(data_list) 187 | 188 | def process_har(self): 189 | """ Detect plugins present in the page. """ 190 | hints = [] 191 | 192 | version_plugins = self._plugins.with_version_matchers() 193 | generic_plugins = self._plugins.with_generic_matchers() 194 | 195 | for entry in self.har: 196 | for plugin in version_plugins: 197 | pm = self.apply_plugin_matchers(plugin, entry) 198 | if not pm: 199 | continue 200 | 201 | # Set name if matchers could detect modular name 202 | if pm.name: 203 | name = "{}-{}".format(plugin.name, pm.name) 204 | else: 205 | name = plugin.name 206 | 207 | if pm.version: 208 | self._results.add_result( 209 | Result( 210 | name=name, 211 | version=pm.version, 212 | homepage=plugin.homepage, 213 | from_url=get_url(entry), 214 | plugin=plugin.name, 215 | ) 216 | ) 217 | elif pm.presence: 218 | # Try to get version through file hashes 219 | version = get_version_via_file_hashes(plugin, entry) 220 | if version: 221 | self._results.add_result( 222 | Result( 223 | name=name, 224 | version=version, 225 | homepage=plugin.homepage, 226 | from_url=get_url(entry), 227 | plugin=plugin.name, 228 | ) 229 | ) 230 | else: 231 | self._results.add_result( 232 | Result( 233 | name=name, 234 | homepage=plugin.homepage, 235 | from_url=get_url(entry), 236 | type=INDICATOR_TYPE, 237 | plugin=plugin.name, 238 | ) 239 | ) 240 | hints += self.get_hints(plugin) 241 | 242 | for plugin in generic_plugins: 243 | pm = self.apply_plugin_matchers(plugin, entry) 244 | if not pm: 245 | continue 246 | 247 | plugin_data = plugin.get_information(entry) 248 | 249 | # Only add to results if it's a valid result 250 | if "name" in plugin_data: 251 | self._results.add_result( 252 | Result( 
def extract_named_group(text, named_group, matchers, return_presence=False):
    """Return the value of ``named_group`` found in ``text``
    by the first matcher of ``matchers`` that provides it.

    A matcher is either a regular expression string (searched with
    ``re.DOTALL``) or a callable that receives ``text``; the first truthy
    value returned by a callable is returned as is.

    A regular expression that matches but declares no named group at all
    only signals presence. In that case the string ``"presence"`` is
    returned, but only when ``return_presence=True`` and no matcher
    returned a value. A regular expression that declares other named groups
    than ``named_group`` is ignored.

    Returns ``None`` when nothing was found.

    """
    matched_without_group = False

    for matcher in matchers:
        if isinstance(matcher, str):
            found = re.search(matcher, text, flags=re.DOTALL)
            if not found:
                continue

            groups = found.groupdict()
            if named_group in groups:
                return groups[named_group]

            # Don't return presence right away: a following matcher
            # could still provide the named group.
            if not groups:
                matched_without_group = True
        elif callable(matcher):
            value = matcher(text)
            if value:
                return value

    if return_presence and matched_without_group:
        return "presence"

    return None
class HeaderMatcher:
    """Matcher applied to the response headers of an entry.

    Every matcher is a ``(header name, matcher)`` tuple; the matcher is
    applied to the value of each response header with that name.
    """

    @classmethod
    def _get_matches(cls, headers, *matchers):
        """Yield ``(header value, matcher)`` for every header named by a matcher.

        Header names are compared case-insensitively, since HTTP field
        names are case-insensitive.

        Raises ``ValueError`` when a matcher can't be unpacked into a
        ``(name, matcher)`` pair.
        """
        for matcher in matchers:
            try:
                matcher_name, matcher_value = matcher
            except ValueError as err:
                raise ValueError("Header matcher value must be a tuple") from err

            wanted = matcher_name.lower()
            for header in headers:
                if header["name"].lower() == wanted:
                    yield header["value"], matcher_value

    @classmethod
    def get_info(cls, entry, *matchers):
        """Return a ``PluginMatch`` built from the response headers of ``entry``.

        An entry whose response has no ``headers`` item yields an empty match.
        """
        name = None
        version = None
        presence = False
        headers = entry["response"].get("headers", [])

        for hstring, hmatcher in cls._get_matches(headers, *matchers):
            # Avoid overriding
            if not name:
                name = extract_name(hstring, hmatcher)

            if not version:
                version = extract_version(hstring, hmatcher)
                # "presence" is a marker, not a real version
                if version == "presence":
                    presence = True
                    version = None

        return PluginMatch(name=name, version=version, presence=presence)
PluginMatch(name=name, version=version, presence=presence) 170 | -------------------------------------------------------------------------------- /detectem/plugin.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import inspect 3 | import logging 4 | import re 5 | from importlib.util import find_spec, module_from_spec 6 | 7 | from zope.interface import Attribute, Interface, implementer 8 | from zope.interface.exceptions import BrokenImplementation 9 | from zope.interface.verify import verifyObject 10 | 11 | from detectem.settings import PLUGIN_PACKAGES 12 | 13 | logger = logging.getLogger("detectem") 14 | 15 | LANGUAGE_TAGS = [ 16 | "php", 17 | "python", 18 | "ruby", 19 | "perl", 20 | "node.js", 21 | "javascript", 22 | "asp.net", 23 | "java", 24 | "go", 25 | "ruby on rails", 26 | "cfml", 27 | ] 28 | FRAMEWORK_TAGS = [ 29 | "django", 30 | "angular", 31 | "backbone", 32 | "react", 33 | "symfony", 34 | "bootstrap", 35 | "vue", 36 | "laravel", 37 | "woltlab", 38 | "knockout", 39 | "ember", 40 | ] 41 | PRODUCT_TAGS = [ 42 | "wordpress", 43 | "mysql", 44 | "jquery", 45 | "mootools", 46 | "apache", 47 | "iis", 48 | "nginx", 49 | "ssl", 50 | "joomla!", 51 | "drupal", 52 | "underscore.js", 53 | "marionette.js", 54 | "moment timezone", 55 | "moment.js", 56 | "devtools", 57 | "teamcity", 58 | "google code prettyfy", 59 | "solr", 60 | "postgresql", 61 | "octopress", 62 | "k2", 63 | "sobi 2", 64 | "sobipro", 65 | "virtuemart", 66 | "tomcat", 67 | "coldfusion", 68 | "jekill", 69 | "less", 70 | "windows server", 71 | "mysql", 72 | "waf", 73 | "webpack", 74 | ] 75 | CATEGORY_TAGS = [ 76 | "cms", 77 | "seo", 78 | "blog", 79 | "advertising networks", 80 | "analytics", 81 | "wiki", 82 | "document management system", 83 | "miscellaneous", 84 | "message board", 85 | "angular", 86 | "js framework", 87 | "web framework", 88 | "visualization", 89 | "graphics", 90 | "web server", 91 | "wiki", 92 | "editor", 93 | "ecommerce", 94 | 
"accounting", 95 | "database manager", 96 | "photo gallery", 97 | "issue tracker", 98 | "mobile framework", 99 | "slider", 100 | "accounting", 101 | "programming language", 102 | "hosting panel", 103 | "lms", 104 | "js graphic", 105 | "exhibit", 106 | "marketing automation", 107 | "search engine", 108 | "documentation tool", 109 | "database", 110 | "template engine", 111 | "module bundler", 112 | ] 113 | 114 | HARDWARE_TAGS = ["router", "hmi"] 115 | PLUGIN_TAGS = ( 116 | LANGUAGE_TAGS + FRAMEWORK_TAGS + PRODUCT_TAGS + CATEGORY_TAGS + HARDWARE_TAGS 117 | ) 118 | 119 | 120 | class PluginCollection(object): 121 | def __init__(self): 122 | self._plugins = {} 123 | 124 | def __len__(self): 125 | return len(self._plugins) 126 | 127 | def add(self, ins): 128 | self._plugins[ins.name] = ins 129 | 130 | def get(self, name): 131 | return self._plugins.get(name) 132 | 133 | def get_all(self): 134 | return self._plugins.values() 135 | 136 | def with_version_matchers(self): 137 | return [p for p in self._plugins.values() if p.is_version] 138 | 139 | def with_dom_matchers(self): 140 | return [p for p in self._plugins.values() if p.is_dom] 141 | 142 | def with_generic_matchers(self): 143 | return [p for p in self._plugins.values() if p.is_generic] 144 | 145 | 146 | class _PluginLoader: 147 | def __init__(self): 148 | self.plugins = PluginCollection() 149 | 150 | def _full_class_name(self, ins): 151 | return "{}.{}".format(ins.__class__.__module__, ins.__class__.__name__) 152 | 153 | def _get_plugin_module_paths(self, plugin_dir): 154 | """ Return a list of every module in `plugin_dir`. 
""" 155 | filepaths = [ 156 | fp 157 | for fp in glob.glob("{}/**/*.py".format(plugin_dir), recursive=True) 158 | if not fp.endswith("__init__.py") 159 | ] 160 | rel_paths = [re.sub(plugin_dir.rstrip("/") + "/", "", fp) for fp in filepaths] 161 | module_paths = [rp.replace("/", ".").replace(".py", "") for rp in rel_paths] 162 | 163 | return module_paths 164 | 165 | def _is_plugin_ok(self, instance): 166 | """Return `True` if: 167 | 1. Plugin meets plugin interface. 168 | 2. Is not already registered in the plugin collection. 169 | 3. Have accepted tags. 170 | 171 | Otherwise, return `False` and log warnings. 172 | 173 | """ 174 | try: 175 | verifyObject(IPlugin, instance) 176 | except BrokenImplementation: 177 | logger.warning( 178 | "Plugin '%(name)s' doesn't provide the plugin interface", 179 | {"name": self._full_class_name(instance)}, 180 | ) 181 | return False 182 | 183 | # Check if the plugin is already registered 184 | reg = self.plugins.get(instance.name) 185 | if reg: 186 | logger.warning( 187 | "Plugin '%(name)s' by '%(instance)s' is already provided by '%(reg)s'", 188 | { 189 | "name": instance.name, 190 | "instance": self._full_class_name(instance), 191 | "reg": self._full_class_name(reg), 192 | }, 193 | ) 194 | return False 195 | 196 | for tag in instance.tags: 197 | if tag not in PLUGIN_TAGS: 198 | logger.warning( 199 | "Invalid tag '%(tag)s' in '%(instance)s'", 200 | {"tag": tag, "instance": self._full_class_name(instance)}, 201 | ) 202 | return False 203 | 204 | return True 205 | 206 | def load_plugins(self, plugins_package): 207 | """ Load plugins from `plugins_package` module. 
""" 208 | try: 209 | # Resolve directory in the filesystem 210 | plugin_dir = find_spec(plugins_package).submodule_search_locations[0] 211 | except ImportError: 212 | logger.error( 213 | "Could not load plugins package '%(pkg)s'", {"pkg": plugins_package} 214 | ) 215 | return 216 | 217 | for module_path in self._get_plugin_module_paths(plugin_dir): 218 | # Load the module dynamically 219 | spec = find_spec("{}.{}".format(plugins_package, module_path)) 220 | m = module_from_spec(spec) 221 | spec.loader.exec_module(m) 222 | 223 | # Get classes from module and extract the plugin classes 224 | classes = inspect.getmembers(m, predicate=inspect.isclass) 225 | for _, klass in classes: 226 | # Avoid imports processing 227 | if klass.__module__ != spec.name: 228 | continue 229 | 230 | # Avoid classes not ending in Plugin 231 | if not klass.__name__.endswith("Plugin"): 232 | continue 233 | 234 | instance = klass() 235 | if self._is_plugin_ok(instance): 236 | self.plugins.add(instance) 237 | 238 | 239 | def load_plugins(): 240 | """ Return the list of plugin instances. """ 241 | loader = _PluginLoader() 242 | 243 | for pkg in PLUGIN_PACKAGES: 244 | loader.load_plugins(pkg) 245 | 246 | return loader.plugins 247 | 248 | 249 | class IPlugin(Interface): 250 | name = Attribute(""" Name to identify the plugin. """) 251 | homepage = Attribute(""" Plugin homepage. """) 252 | tags = Attribute(""" Tags to categorize plugins """) 253 | matchers = Attribute(""" List of matchers """) 254 | 255 | 256 | @implementer(IPlugin) 257 | class Plugin: 258 | """Class used by normal plugins. 259 | It implements :class:`~IPlugin`. 260 | 261 | """ 262 | 263 | ptype = "normal" 264 | 265 | def get_matchers(self, matcher_type): 266 | return [m[matcher_type] for m in self.matchers if matcher_type in m] 267 | 268 | def get_grouped_matchers(self): 269 | """Return dictionary of matchers (not empty ones) 270 | with matcher type as key and matcher list as value. 
class AngularPlugin(Plugin):
    """Detect Angular through the `ng-version` attribute and DOM globals."""

    name = "angular"
    homepage = "https://angular.io/"
    tags = ["angular", "js framework"]
    matchers = [
        # The attribute value is captured in the `version` named group
        {"xpath": ("//app-root/@ng-version", r"(?P<version>[0-9a-z\.-]+)")},
        {
            "dom": (
                "window.getAllAngularRootElements",
                'window.getAllAngularRootElements()[0].attributes["ng-version"].value',
            )
        },
        # Presence only: no version expression is provided
        {"dom": ("window.ng && window.ng.coreTokens", None)},
    ]
class ApachePlugin(Plugin):
    """Detect the Apache HTTP server from the `Server` response header."""

    name = "apache"
    homepage = "http://httpd.apache.org/"
    tags = ["web server", "apache"]

    matchers = [{"header": ("Server", r"Apache/(?P<version>[0-9\.]+)")}]


class ApacheCoyotePlugin(Plugin):
    """Detect Apache-Coyote from the `Server` response header."""

    name = "apache-coyote"
    homepage = "http://httpd.apache.org/"
    tags = ["apache"]

    matchers = [{"header": ("Server", r"Apache-Coyote/(?P<version>[0-9\.]+)")}]


class ApacheModbwlimitedPlugin(Plugin):
    """Detect mod_bwlimited from the `Server` response header."""

    name = "apache-mod_bwlimited"
    homepage = "http://cpanel.com/"  # It comes with cpanel
    tags = ["apache"]

    matchers = [{"header": ("Server", r"mod_bwlimited/(?P<version>[0-9\.]+)")}]


class ApacheModfcgidPlugin(Plugin):
    """Detect mod_fcgid from the `Server` response header."""

    name = "apache-mod_fcgid"
    homepage = "https://httpd.apache.org/mod_fcgid/"
    tags = ["apache"]

    matchers = [{"header": ("Server", r"mod_fcgid/(?P<version>[0-9\.]+)")}]
r"^//\s+Backbone\.js (?P[0-9\.]+)"}, 11 | {"url": r"/backbone\.?js/(?P[0-9\.]+)/backbone(-min)?\.js"}, 12 | {"url": r"/backbone-(?P[0-9\.]+)(\.min)?\.js"}, 13 | {"dom": ("window.Backbone", "window.Backbone.VERSION")}, 14 | ] 15 | hints = ["underscore.js"] 16 | -------------------------------------------------------------------------------- /detectem/plugins/crayon-syntax-highlighter.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class CrayonSyntaxHighlighterPlugin(Plugin): 5 | name = "crayon-syntax-highlighter" 6 | homepage = "https://wordpress.org/plugins-wp/crayon-syntax-highlighter/" 7 | tags = ["wordpress"] 8 | 9 | matchers = [ 10 | {"dom": ("window.CrayonSyntaxSettings", "window.CrayonSyntaxSettings.version")} 11 | ] 12 | -------------------------------------------------------------------------------- /detectem/plugins/d3js.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class D3JSPlugin(Plugin): 5 | name = "d3.js" 6 | homepage = "https://d3js.org" 7 | vendor = "Mike Bostock" 8 | tags = ["javascript", "graphics", "visualization"] 9 | 10 | matchers = [ 11 | {"body": r"// https://d3js.org Version (?P[0-9\.]+)\. 
Copyright"}, 12 | {"url": r"[dD]3(\.js)?/(?P[0-9\.]+)/d3(\.min)?\.js"}, 13 | {"dom": ("window.d3", "window.d3.version")}, 14 | ] 15 | -------------------------------------------------------------------------------- /detectem/plugins/emberjs.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class EmberJSPlugin(Plugin): 5 | name = "ember" 6 | homepage = "http://emberjs.com" 7 | tags = ["ember", "javascript", "js framework"] 8 | 9 | matchers = [{"dom": ("window.Ember", "window.Ember.VERSION")}] 10 | -------------------------------------------------------------------------------- /detectem/plugins/generic/__init__.py: -------------------------------------------------------------------------------- 1 | from .wordpress import WordpressGenericPlugin 2 | -------------------------------------------------------------------------------- /detectem/plugins/generic/wordpress.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pkgutil 3 | import re 4 | 5 | from detectem.plugin import GenericPlugin 6 | from detectem.utils import get_url 7 | 8 | 9 | class WordpressGenericPlugin(GenericPlugin): 10 | name = "wordpress_generic" 11 | homepage = "https://wordpress.org/plugins/%s/" 12 | tags = ["wordpress"] 13 | plugins = {} 14 | 15 | def __init__(self, *args, **kwargs): 16 | super().__init__(*args, **kwargs) 17 | 18 | wordpress_data = pkgutil.get_data("detectem", "data/wordpress.jl") 19 | for line in wordpress_data.splitlines(): 20 | data = json.loads(line) 21 | self.plugins[data["name"]] = data["vendor"] 22 | 23 | matchers = [{"url": "/wp-content/plugins/"}] 24 | 25 | def get_information(self, entry): 26 | name_match = re.findall("/wp-content/plugins/([^/]+)/", get_url(entry)) 27 | # There are weird cases with malformed plugins urls 28 | if not name_match: 29 | return {} 30 | 31 | name = name_match[0].lower() 32 | homepage = self.homepage % 
def meta_generator(name):
    """Return the XPath selecting the content of the generator meta tag.

    The expression targets the ``content`` attribute of a ``<meta>`` whose
    ``name`` matches "generator" (case-insensitive ``re:test``) and whose
    ``content`` contains ``name``.
    """
    condition = 're:test(@name,"generator","i") and contains(@content, "{}")'.format(
        name
    )
    return "//meta[" + condition + "]/@content"
class JqueryMigratePlugin(Plugin):
    """Detect jQuery Migrate from the banner comment of its script."""

    name = "jquery-migrate"
    homepage = "https://github.com/jquery/jquery-migrate"
    tags = ["javascript", "jquery"]

    # `*` is escaped so the banner opening `/*!` is matched literally
    # instead of being read as a quantifier over `/`.
    matchers = [
        {"body": r"/\*! jQuery Migrate v(?P<version>[0-9\.]+) \| \(c\) jQuery"}
    ]
class KnockoutJSPlugin(Plugin):
    """Detect Knockout from its banner comment, URL, DOM global or script tag."""

    name = "knockoutjs"
    homepage = "http://knockoutjs.com/"
    vendor = "Steve Sanderson"
    tags = ["javascript", "knockout", "js framework"]

    matchers = [
        {"body": r"^//\s+Knockout\.js (?P<version>[0-9\.]+)"},
        {"url": r"/knockout(.js)?/(?P<version>[0-9\.]+)/knockout(-min)?\.js"},
        {"url": r"/knockout-(?P<version>[0-9\.]+|latest)(\.min)?\.js"},
        {"dom": ("window.ko", "window.ko.version")},
        # Presence only: no regular expression is applied to the node
        {"xpath": ('//script[@data-requiremodule="knockout"]', None)},
    ]
-------------------------------------------------------------------------------- /detectem/plugins/modernizr.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class ModernizrPlugin(Plugin): 5 | name = "modernizr" 6 | homepage = "http://www.modernizr.com/" 7 | tags = ["javascript"] 8 | 9 | matchers = [ 10 | {"body": r"/\* Modernizr (?P[0-9\.]+) \(Custom Build\)"}, 11 | {"url": r"/modernizr/(?P[0-9\.]+)/modernizr(\.min)?\.js"}, 12 | {"url": r"/modernizr-(?P[0-9\.]+)(\.min)?\.js"}, 13 | {"dom": ("window.Modernizr", "window.Modernizr._version")}, 14 | ] 15 | -------------------------------------------------------------------------------- /detectem/plugins/momentjs.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class MomentJSPlugin(Plugin): 5 | name = "moment.js" 6 | homepage = "http://momentjs.com/" 7 | tags = ["javascript"] 8 | 9 | matchers = [ 10 | {"body": r"//! moment\.js\s+//! 
version : (?P[0-9\.]+)"}, 11 | {"url": r"/moment\.js/(?P[0-9\.]+)/moment(\.min)?\.js"}, 12 | {"dom": ("window.moment", "window.moment.version")}, 13 | ] 14 | -------------------------------------------------------------------------------- /detectem/plugins/mootools.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class MooToolsCorePlugin(Plugin): 5 | name = "mootools-core" 6 | homepage = "https://mootools.net/core" 7 | tags = ["javascript", "mootools"] 8 | 9 | matchers = [{"dom": ("window.MooTools", "window.MooTools.version")}] 10 | 11 | 12 | class MooToolsMorePlugin(Plugin): 13 | name = "mootools-more" 14 | homepage = "https://mootools.net/more" 15 | tags = ["javascript", "mootools"] 16 | 17 | matchers = [ 18 | { 19 | "dom": ( 20 | "window.MooTools && window.MooTools.More", 21 | "window.MooTools.More.version", 22 | ) 23 | } 24 | ] 25 | -------------------------------------------------------------------------------- /detectem/plugins/nginx.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class NginxPlugin(Plugin): 5 | name = "nginx" 6 | homepage = "https://www.nginx.com/" 7 | vendor = "NGINX" 8 | tags = ["web server", "nginx"] 9 | 10 | matchers = [ 11 | {"header": ("Server", r"nginx/(?P[0-9\.]+)")}, 12 | {"header": ("Server", r"nginx\s*")}, 13 | ] 14 | -------------------------------------------------------------------------------- /detectem/plugins/php.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class PhpPlugin(Plugin): 5 | name = "php" 6 | homepage = "http://php.net/" 7 | tags = ["php"] 8 | 9 | matchers = [{"header": ("X-Powered-By", r"PHP/(?P[0-9\.]+)")}] 10 | -------------------------------------------------------------------------------- /detectem/plugins/phusion-passenger.py: 
-------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class PhusionPassengerPlugin(Plugin): 5 | name = "phusion-passenger" 6 | homepage = "https://www.phusionpassenger.com/" 7 | tags = ["web server"] 8 | 9 | matchers = [{"header": ("Server", r"Phusion_Passenger/(?P[0-9\.]+)")}] 10 | -------------------------------------------------------------------------------- /detectem/plugins/piwik.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class PiwikPlugin(Plugin): 5 | name = "piwik" 6 | homepage = "https://matomo.org/" 7 | vendor = "Matomo" 8 | tags = ["analytics"] 9 | 10 | matchers = [{"body": r"/\*!!\s+ \* Piwik - free/libre analytics platform"}] 11 | file_hashes = { 12 | "/piwik.js": { 13 | "2.0.2": "a3dc8ef0fea499626ae53bc8e1a1d5def45bf3c3ea4c90aae38325bcd40a4198", 14 | "2.0.3": "a3dc8ef0fea499626ae53bc8e1a1d5def45bf3c3ea4c90aae38325bcd40a4198", 15 | "2.1.0": "36e634c0b665c18a45fb01afc067d8da014295c25fe62445f9ee46a7936a7551", 16 | "2.10.0": "14a4d7d5ec8a8ed2bcf6861bd418ad5c015cbd38a33d4e777a4e82b15aaba416", 17 | "2.11.0": "c507d83a495dabd4562d9e8d7a89295a0c817bdfe3f355e5409d52a4387591e9", 18 | "2.11.1": "c507d83a495dabd4562d9e8d7a89295a0c817bdfe3f355e5409d52a4387591e9", 19 | "2.11.2": "03a9ee60740e86308067e0dcda878a1e9087e437c926bcd114fee1fc66352223", 20 | "2.12.0": "5133454dc113dd1149879a08349aac88aaff835a963faf945f848dfd66e64530", 21 | "2.12.1": "5133454dc113dd1149879a08349aac88aaff835a963faf945f848dfd66e64530", 22 | "2.13.0": "c9bef9c3b566f387eaff6d62107de48a951515c4549b27dafe70f21ad8c62b25", 23 | "2.13.1": "c9bef9c3b566f387eaff6d62107de48a951515c4549b27dafe70f21ad8c62b25", 24 | "2.14.0": "6dae32a01833cd0ce2f55c5bd910ffa21a032b6227eb42701386ae8181a06f54", 25 | "2.14.1": "6dae32a01833cd0ce2f55c5bd910ffa21a032b6227eb42701386ae8181a06f54", 26 | "2.14.2": 
"9fbda8a59fbfc183b5ef3f5190d543574bc6dd1468f80a81fce74e5c212171f2", 27 | "2.14.3": "9fbda8a59fbfc183b5ef3f5190d543574bc6dd1468f80a81fce74e5c212171f2", 28 | "2.15.0": "02e66e19e5d2b0957f948fba33c867652f7607a1c27676745b48263f40d03e3e", 29 | "2.16.0": "5af2a36db66a4d78269adf19d3e1485f71ed9b45220026bab21d3595b5ab3d97", 30 | "2.16.1": "4ca8f7722320d5e59ac553dc60baf881d5fddc53eef14a442c8f69bc2b481a4a", 31 | "2.16.2": "d3049c2dd205f92b69e0938521ab7e2a2258276e693afc965095d84f70d8b336", 32 | "2.16.3": "a569ed96e0068f4a12783f58bad7ba46644fb5cf571fed1634956a5ab4ce6792", 33 | "2.16.4": "a569ed96e0068f4a12783f58bad7ba46644fb5cf571fed1634956a5ab4ce6792", 34 | "2.16.5": "a569ed96e0068f4a12783f58bad7ba46644fb5cf571fed1634956a5ab4ce6792", 35 | "2.17.0": "a569ed96e0068f4a12783f58bad7ba46644fb5cf571fed1634956a5ab4ce6792", 36 | "2.17.1": "714576ef1d7b58980b7658ae9b8b4d74a223fba87934dc442db4098873e179a3", 37 | "2.2.0": "4baa6799598d2bbcb9e01626d2dcc11d46e2d1045f05fb49f557a0ff82b96c2a", 38 | "2.2.1": "4baa6799598d2bbcb9e01626d2dcc11d46e2d1045f05fb49f557a0ff82b96c2a", 39 | "2.2.2": "4baa6799598d2bbcb9e01626d2dcc11d46e2d1045f05fb49f557a0ff82b96c2a", 40 | "2.3.0": "90df3ecfd311b43c73ddcf659091b1339df53b13af62f03b9e12286856cd2d46", 41 | "2.4.0": "396765e89a8163ef75e94fa0e11ae32233c19ef0e08a70b2d7780ca9802c3dd0", 42 | "2.4.1": "396765e89a8163ef75e94fa0e11ae32233c19ef0e08a70b2d7780ca9802c3dd0", 43 | "2.5.0": "664e1545be52000a249d20d0e1e98c93d819b862760ee6200d09950c85d521ec", 44 | "2.6.0": "664e1545be52000a249d20d0e1e98c93d819b862760ee6200d09950c85d521ec", 45 | "2.6.1": "664e1545be52000a249d20d0e1e98c93d819b862760ee6200d09950c85d521ec", 46 | "2.7.0": "136efb353a418331df2b85ca05e9afbbca5a33db2225d2215d7bca983264c61d", 47 | "2.8.0": "bfc3d18460a6b969f473d9f5067457c13de349943352dee71e14615e4f3b5fab", 48 | "2.8.1": "bfc3d18460a6b969f473d9f5067457c13de349943352dee71e14615e4f3b5fab", 49 | "2.8.2": "bfc3d18460a6b969f473d9f5067457c13de349943352dee71e14615e4f3b5fab", 50 | "2.8.3": 
"bfc3d18460a6b969f473d9f5067457c13de349943352dee71e14615e4f3b5fab", 51 | "2.9.0": "bfc3d18460a6b969f473d9f5067457c13de349943352dee71e14615e4f3b5fab", 52 | "2.9.1": "14a4d7d5ec8a8ed2bcf6861bd418ad5c015cbd38a33d4e777a4e82b15aaba416", 53 | "3.0.0": "4f51df044b76eabafab2fbf420871d472c8f3a629da79ec5fac75c530d79f266", 54 | "3.0.1": "4f51df044b76eabafab2fbf420871d472c8f3a629da79ec5fac75c530d79f266", 55 | "3.0.2": "0d1a1c3b8255cc84090979079ca6d6e7a3391339c8b89e26a2b5de3994726d46", 56 | "3.0.3": "0d1a1c3b8255cc84090979079ca6d6e7a3391339c8b89e26a2b5de3994726d46", 57 | "3.0.4": "af256878a3ed52614189b6e2031e5c9cfd5aa57491a48b13905836fb8217069e", 58 | "3.1.0": "fc4d5552e532b1f510808810b230b193c4aaf7a6b26375750dde03aeb2f1a302", 59 | "3.1.1": "fc4d5552e532b1f510808810b230b193c4aaf7a6b26375750dde03aeb2f1a302", 60 | "3.2.0": "fc4d5552e532b1f510808810b230b193c4aaf7a6b26375750dde03aeb2f1a302", 61 | "3.2.1": "8fbe1031e8234fab32983f4e5afbc30831720db278418b5a4a48e50ad7611d15", 62 | "3.3.0": "420f9f744643ee9e73f716e92d9136d92ad459b10748fe1a2f94fcafbfd6508d", 63 | "3.4.0": "dc7fea63642f28330bb86d1f02c7bef24122d5b889400c2e421f76ce2fce9725", 64 | "3.5.0": "c7d392694a1257cc4052e24f1f02e9bbd1431ab0d27b64c3d9a76b13f539130b", 65 | "3.5.1": "8b73bdb35d8412d8be46a0046e3da0081ed1169c11d50fcb6bde65b7fb6c5dda", 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /detectem/plugins/react.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class ReactPlugin(Plugin): 5 | name = "react" 6 | homepage = "https://facebook.github.io/react/" 7 | tags = ["javascript", "react"] 8 | vendor = "Facebook" 9 | 10 | matchers = [ 11 | {"body": r" \* React v(?P[0-9\.]+)"}, 12 | {"url": r"/react/(?P[0-9\.]+)/react(-with-addons)?(\.min)?\.js"}, 13 | {"url": r"/react(-with-addons)?-(?P[0-9\.]+)(\.min)?\.js"}, 14 | {"dom": ("window.React", "window.React.version")}, 15 | {"xpath": 
("//div[@data-reactid]", None)}, 16 | ] 17 | -------------------------------------------------------------------------------- /detectem/plugins/requirejs.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class RequireJSPlugin(Plugin): 5 | name = "require.js" 6 | homepage = "http://requirejs.org/" 7 | tags = ["javascript"] 8 | 9 | matchers = [ 10 | {"body": r"\* @license RequireJS (?P[0-9\.]+)"}, 11 | {"url": r"/require\.js/(?P[0-9\.]+)/require(\.min)?\.js"}, 12 | ] 13 | -------------------------------------------------------------------------------- /detectem/plugins/ssl.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class ModSSLPlugin(Plugin): 5 | name = "modssl" 6 | homepage = "http://www.modssl.org/" 7 | tags = ["ssl"] 8 | 9 | matchers = [{"header": ("Server", r"mod_ssl/(?P[0-9\.]+)")}] 10 | 11 | 12 | class OpenSSLPlugin(Plugin): 13 | name = "openssl" 14 | homepage = "https://www.openssl.org/" 15 | tags = ["ssl"] 16 | 17 | matchers = [{"header": ("Server", r"OpenSSL/(?P[\w\.]+)")}] 18 | -------------------------------------------------------------------------------- /detectem/plugins/underscorejs.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class UnderscoreJSPlugin(Plugin): 5 | name = "underscore.js" 6 | homepage = "http://underscorejs.org/" 7 | vendor = "Jeremy Ashkenas" 8 | tags = ["javascript"] 9 | 10 | matchers = [ 11 | {"body": r"^//\s+Underscore\.js (?P[0-9\.]+)"}, 12 | {"url": r"/underscore\.?js/(?P[0-9\.]+)/underscore(-min)?\.js"}, 13 | {"url": r"/underscore-(?P[0-9\.]+)(\.min)?\.js"}, 14 | ] 15 | -------------------------------------------------------------------------------- /detectem/plugins/vue.py: -------------------------------------------------------------------------------- 1 | from 
detectem.plugin import Plugin 2 | 3 | 4 | class VuePlugin(Plugin): 5 | name = "vue" 6 | homepage = "https://vuejs.org" 7 | vendor = "Evan You" 8 | tags = ["vue", "js framework"] 9 | matchers = [ 10 | {"url": r"/vue@(?P[0-9a-z\.-]+)"}, 11 | {"dom": ("window.Vue", "window.Vue.version")}, 12 | {"xpath": ("//*[contains(local-name(@*),'data-v-')]", None)}, 13 | ] 14 | -------------------------------------------------------------------------------- /detectem/plugins/w3-total-cache.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class W3TotalCachePlugin(Plugin): 5 | name = "w3-total-cache" 6 | vendor = "Frederick Townes" 7 | homepage = "https://wordpress.org/plugins/w3-total-cache/" 8 | tags = ["wordpress"] 9 | 10 | matchers = [ 11 | {"header": ("X-Powered-By", r"W3 Total Cache/(?P[0-9\.]+)")}, 12 | { 13 | "xpath": ( 14 | "//comment()[contains(.,'Performance optimized by W3 Total Cache')]", 15 | None, 16 | ) 17 | }, 18 | ] 19 | -------------------------------------------------------------------------------- /detectem/plugins/webpack.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | 4 | class WebpackPlugin(Plugin): 5 | name = "webpack" 6 | homepage = "https://webpack.js.org/" 7 | tags = ["webpack", "module bundler"] 8 | 9 | matchers = [{"dom": ("window.webpackJsonp", None)}] 10 | -------------------------------------------------------------------------------- /detectem/plugins/wordpress.py: -------------------------------------------------------------------------------- 1 | from detectem.plugin import Plugin 2 | 3 | from .helpers import meta_generator 4 | 5 | 6 | class WordpressPlugin(Plugin): 7 | name = "wordpress" 8 | homepage = "https://wordpress.org/" 9 | tags = ["wordpress"] 10 | 11 | matchers = [ 12 | {"url": r"/wp-includes/js/wp-embed.min.js\?ver=(?P[0-9\.]+)"}, 13 | {"xpath": 
def is_url_allowed(url):
    """Return ``True`` if ``url`` does not point to a blacklisted resource.

    A URL is rejected when its path, query or fragment contains one of the
    blacklisted font/image file extensions, or when it mentions the Google
    Fonts host.

    The extension patterns are not applied to the ``scheme://host`` prefix:
    a host name such as ``www.giffy.com`` contains ``.gif`` but says nothing
    about the type of the requested resource.

    :param str url: URL of the requested resource.
    :rtype: bool

    """
    extension_blacklist = [
        r"\.ttf",
        r"\.woff",
        r"\.png",
        r"\.jpe?g",
        r"\.gif",
        r"\.svg",
        r"\.otf",
    ]
    host_blacklist = [r"fonts\.googleapis\.com"]

    # Drop the leading ``scheme://authority`` so that extensions are only
    # looked up in the part of the URL that names the resource. URLs without
    # a scheme are left untouched and checked in full.
    resource = re.sub(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://[^/?#]*", "", url, count=1)

    if any(re.search(pattern, resource) for pattern in extension_blacklist):
        return False

    # The host pattern is searched in the whole URL.
    return not any(re.search(pattern, url) for pattern in host_blacklist)
def get_charset(response):
    """Return charset from ``response`` or default charset.

    The charset is read from the ``charset`` parameter of the ``mimeType``
    value. Whitespace around the parameter, upper-case spelling, surrounding
    quotes and further ``;``-separated parameters are tolerated.

    ``DEFAULT_CHARSET`` is returned when ``mimeType`` is missing or empty,
    carries no charset, or names a charset Python has no codec for. The last
    case matters because the result is handed to ``bytes.decode``, which
    raises ``LookupError`` for unknown encodings.

    :param dict response: mapping that may hold a ``mimeType`` key.
    :rtype: str

    """
    import codecs

    # ``or ""`` also covers an explicit ``None`` value.
    mimetype = response.get("mimeType") or ""

    m = re.search(
        r";\s*charset\s*=\s*[\"']?([^\s\"';]+)", mimetype, flags=re.IGNORECASE
    )
    if not m:
        return DEFAULT_CHARSET

    charset = m.group(1)
    try:
        codecs.lookup(charset)
    except LookupError:
        return DEFAULT_CHARSET

    return charset
def get_splash_error(json_data):
    """Build a readable error message from a Splash error response.

    The message starts from ``json_data["description"]``. When
    ``json_data["info"]["error"]`` is present it is refined:

    * errors starting with ``http`` are reported as a failed request with
      that error code;
    * errors starting with ``network`` are translated to the name of the
      matching Qt network error when known;
    * any other error is appended to the description.

    A missing ``description``, an ``info`` value that is not a mapping and a
    non-string error are tolerated instead of raising.

    :param dict json_data: decoded JSON body of the Splash response.
    :rtype: str

    """
    msg = json_data.get("description", "Unknown Splash error")

    info = json_data.get("info")
    if not isinstance(info, dict) or "error" not in info:
        return msg

    error = info["error"]
    if not isinstance(error, str):
        # Nothing to classify; report the raw value next to the description.
        return "{0}: {1}".format(msg, error)

    if error.startswith("http"):
        return "Request to site failed with error code {0}".format(error)

    if error.startswith("network"):
        # see http://doc.qt.io/qt-5/qnetworkreply.html
        qt_errors = {
            "network1": "ConnectionRefusedError",
            "network2": "RemoteHostClosedError",
            "network3": "HostNotFoundError",
            "network4": "TimeoutError",
            "network5": "OperationCanceledError",
            "network6": "SslHandshakeFailedError",
        }
        reason = qt_errors.get(error, "error code {0}".format(error))
        return "Request to site failed with {0}".format(reason)

    return "{0}: {1}".format(msg, error)
class ResultCollection:
    """Accumulate results grouped by software name."""

    def __init__(self):
        # Maps a result name to every result added under that name.
        self._results = defaultdict(list)

    def add_result(self, rt):
        """Store ``rt`` under its ``name``."""
        self._results[rt.name].append(rt)

    def _normalize_results(self):
        """Keep, for every name, only the results of its best available type.

        Types are tried in the order version, indicator, hint, generic; the
        first one present wins and its de-duplicated results are kept.
        """
        type_priority = (VERSION_TYPE, INDICATOR_TYPE, HINT_TYPE, GENERIC_TYPE)
        norm_results = defaultdict(list)

        for p_name, p_results in self._results.items():
            # Sets drop results that compare equal.
            by_type = defaultdict(set)
            for rt in p_results:
                by_type[rt.type].add(rt)

            selected = []
            for result_type in type_priority:
                if result_type not in by_type:
                    continue

                selected = list(by_type[result_type])
                if result_type == VERSION_TYPE:
                    assert len(selected) >= 1
                else:
                    assert len(selected) == 1
                break

            norm_results[p_name] = selected

        return norm_results

    def get_results(self, normalize=True):
        """Return all results as a flat list, normalized unless disabled."""
        grouped = self._normalize_results() if normalize else self._results

        flat = []
        for group in grouped.values():
            flat.extend(group)
        return flat
splash.response_body_enabled = true 4 | 5 | local url = splash.args.url 6 | splash:go(url) 7 | assert(splash:wait(5)) 8 | 9 | local detectFunction = [[ 10 | detect = function(){ 11 | var rs = []; 12 | 13 | softwareData.forEach(function(s) { 14 | var matchers = s.matchers; 15 | var presenceFlag = false; 16 | 17 | for (var i in matchers) { 18 | var check_statement = matchers[i].check_statement 19 | var version_statement = matchers[i].version_statement 20 | 21 | if (eval(check_statement)){ 22 | if (!version_statement){ 23 | presenceFlag = true; 24 | continue; 25 | } 26 | 27 | var version = eval(version_statement); 28 | if (version) { 29 | var vRegex = /^([\d\.]+)/g; 30 | var matches = vRegex.exec(version); 31 | if (matches) { 32 | rs.push({'name': s.name, 'version': matches[0]}); 33 | } 34 | } 35 | } 36 | } 37 | 38 | if (presenceFlag) { 39 | rs.push({'name': s.name}) 40 | } 41 | 42 | }); 43 | 44 | return rs; 45 | } 46 | ]] 47 | splash:runjs('softwareData = $js_data;') 48 | splash:runjs(detectFunction) 49 | 50 | local softwares = {} 51 | local scripts = {} 52 | local errors = {} 53 | 54 | local ok, res = pcall(splash.evaljs, self, 'detect()') 55 | if ok then 56 | softwares = res 57 | else 58 | errors['evaljs'] = res 59 | end 60 | 61 | local ok, res = pcall(splash.select_all, self, 'script') 62 | if ok then 63 | if res then 64 | for _, s in ipairs(res) do 65 | scripts[#scripts+1] = s.node.innerHTML 66 | end 67 | end 68 | else 69 | errors['select_all'] = res 70 | end 71 | 72 | return { 73 | har = splash:har(), 74 | softwares=softwares, 75 | scripts=scripts, 76 | errors=errors, 77 | } 78 | end 79 | -------------------------------------------------------------------------------- /detectem/settings.py: -------------------------------------------------------------------------------- 1 | from environs import Env 2 | 3 | env = Env() 4 | env.read_env() 5 | 6 | DEBUG = env.bool("DEBUG", False) 7 | PLUGIN_PACKAGES = env.list("DET_PLUGIN_PACKAGES", "detectem.plugins") 8 | 9 | # 
class SplashManagerInterface(abc.ABC):
    """Base class for objects that hand out Splash instances to processes.

    Subclasses fill ``self._instances`` in ``setup``. Each entry maps an
    instance name to a dict holding at least ``url`` and ``in_use``.
    """

    handles_errors = True

    def __init__(self, *args, **kwargs):
        self.manager = multiprocessing.Manager()
        self.lock = multiprocessing.Lock()
        # Not used inside this class; presumably callers acquire it to bound
        # the number of concurrent users — TODO confirm against the callers.
        self.sem = multiprocessing.Semaphore(NUMBER_OF_SPLASH_INSTANCES)

        # State
        self._instances = self.manager.dict()

    def get_number_of_available_instances(self):
        """Return the number of registered instances."""
        return len(self._instances)

    @abc.abstractmethod
    def setup(self, n_instances: int):
        ...

    @abc.abstractmethod
    def teardown(self):
        ...

    @abc.abstractmethod
    def handle_error(self, instance_name: str):
        ...

    @contextmanager
    def assign_instance(self) -> Iterator:
        """Context manager that gets a not "in use" Splash instance,
        sets it as "in use" to not be used by any other process,
        requests the instance HTTP endpoint to run the garbage collector,
        yields a tuple `(instance_name, instance_url)` to be used in code
        and then sets the instance as not "in use".

        The instance is released even when the body of the ``with``
        statement raises, so a failed request cannot leave it marked as
        "in use" forever.

        """
        instance_name: str = ""
        instance_data: dict = {}

        with self.lock:
            # NOTE(review): when every instance is busy the loop falls
            # through and the last instance is handed out anyway; with no
            # instance registered the ``url`` lookup below raises KeyError.
            for instance_name, instance_data in self._instances.items():
                if instance_data["in_use"]:
                    continue

                # Set instance in use and update it in multiprocessing's way:
                # the nested dict must be reassigned for the change to be
                # propagated through the manager proxy.
                instance_data["in_use"] = True
                self._instances[instance_name] = instance_data

                break

        url: str = instance_data["url"]

        try:
            # Clean container (call garbage collector); best effort only.
            try:
                requests.post(f"{url}/_gc")
            except requests.exceptions.RequestException:
                pass

            yield (instance_name, url)
        finally:
            # Set container as not in use
            with self.lock:
                # Update dict multiprocessing's way
                instance_data = self._instances[instance_name]
                instance_data["in_use"] = False
                self._instances[instance_name] = instance_data
def docker_error(method: Callable):
    """Decorator to catch docker exceptions.

    Calls ``method`` and returns its result. Any
    ``docker.errors.DockerException`` raised by it is re-raised as
    ``DockerStartError``, keeping the original exception as its cause.
    Other exceptions propagate unchanged.

    The wrapper keeps the name and docstring of ``method``.
    """
    import functools

    @functools.wraps(method)
    def run_method(*args, **kwargs):
        try:
            return method(*args, **kwargs)
        except docker.errors.DockerException as e:
            raise DockerStartError(f"Docker error: {e}") from e

    return run_method
175 | 176 | ``n_instances`` could be equal to NUMBER_OF_SPLASH_INSTANCES or lower 177 | since it's also determined by the number of URLs to analyze. 178 | 179 | It also checks that the target docker image exists 180 | and there weren't any issues creating the containers. 181 | 182 | """ 183 | # Check base image 184 | try: 185 | self.docker_cli.images.get(DOCKER_SPLASH_IMAGE) 186 | except docker.errors.ImageNotFound: 187 | raise DockerStartError( 188 | f"Docker image {DOCKER_SPLASH_IMAGE} not found." 189 | f"Please install it or set an image using DOCKER_SPLASH_IMAGE environment variable." 190 | ) 191 | 192 | for container_index in range(n_instances): 193 | container_name = f"splash-detectem-{container_index}" 194 | port = 8050 + container_index 195 | 196 | self._instances[container_name] = { 197 | "url": f"http://localhost:{port}", 198 | "in_use": False, 199 | "errors": 0, 200 | } 201 | 202 | try: 203 | container: Container = self.docker_cli.containers.get(container_name) 204 | except docker.errors.NotFound: 205 | # If not found, create it 206 | container: Container = self.docker_cli.containers.create( 207 | name=container_name, 208 | image=DOCKER_SPLASH_IMAGE, 209 | ports={ 210 | "8050/tcp": ("127.0.0.1", port), 211 | }, 212 | command=f"--max-timeout {SPLASH_MAX_TIMEOUT}", 213 | ) 214 | 215 | if container.status != "running": 216 | container.start() 217 | 218 | try: 219 | self._wait_container(container_name) 220 | except DockerStartError: 221 | # If the container didn't start it's probable to be a unrecuperable error 222 | # We stop it and delete the container to be recreated next run 223 | container.stop() 224 | container.remove() 225 | # Also it's not available to send work to 226 | del self._instances[container_name] 227 | continue 228 | 229 | @docker_error 230 | def teardown(self): 231 | for container_name in self._instances: 232 | container: Container = self.docker_cli.containers.get(container_name) 233 | container.stop() 234 | 235 | @docker_error 236 | def 
def get_most_complete_pm(pms):
    """Return the plugin match carrying the longest version string.

    Among matches with equally long versions the first one wins. When no
    match has a version, the last match with a truthy ``presence`` is
    returned. ``None`` is returned for an empty ``pms`` or when nothing
    qualifies.
    """
    if not pms:
        return None

    best_versioned = None
    last_present = None

    for candidate in pms:
        if not candidate.version:
            # Version-less matches only count as presence fallbacks.
            if candidate.presence:
                last_present = candidate
            continue

        if not best_versioned or len(candidate.version) > len(
            best_versioned.version
        ):
            best_versioned = candidate

    return best_versioned or last_present
def json_printer(data): 37 | print(json.dumps(data)) 38 | 39 | return json_printer 40 | 41 | 42 | def get_url(entry): 43 | """ Return URL from response if it was received otherwise requested URL. """ 44 | try: 45 | return entry["response"]["url"] 46 | except KeyError: 47 | return entry["request"]["url"] 48 | 49 | 50 | def get_response_body(entry): 51 | return entry["response"]["content"]["text"] 52 | 53 | 54 | def get_version_via_file_hashes(plugin, entry): 55 | file_hashes = getattr(plugin, "file_hashes", {}) 56 | if not file_hashes: 57 | return 58 | 59 | url = get_url(entry) 60 | body = get_response_body(entry).encode("utf-8") 61 | for file, hash_dict in file_hashes.items(): 62 | if file not in url: 63 | continue 64 | 65 | m = hashlib.sha256() 66 | m.update(body) 67 | h = m.hexdigest() 68 | 69 | for version, version_hash in hash_dict.items(): 70 | if h == version_hash: 71 | return version 72 | -------------------------------------------------------------------------------- /detectem/ws.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | from detectem.cli import get_detection_results 5 | from detectem.exceptions import NoPluginsError, SplashError 6 | from detectem.settings import DEBUG, SPLASH_TIMEOUT 7 | 8 | try: 9 | import bottle 10 | from bottle import post, request, run 11 | except ImportError: 12 | print("[+] Install bottle to use the web service") 13 | sys.exit(0) 14 | 15 | 16 | @post("/detect") 17 | def do_detection(): 18 | # Url is mandatory 19 | url = request.forms.get("url") 20 | if not url: 21 | return json.dumps({"error": "You must provide `url` parameter."}) 22 | 23 | # metadata is optional 24 | metadata = request.forms.get("metadata", "0") 25 | metadata = bool(metadata == "1") 26 | 27 | # timeout is optional 28 | timeout = request.forms.get("timeout", type=int) 29 | if not timeout: 30 | timeout = SPLASH_TIMEOUT 31 | 32 | try: 33 | result = get_detection_results(url, 
timeout=timeout, metadata=metadata) 34 | except (SplashError, NoPluginsError) as e: 35 | result = {"error": e.msg} 36 | 37 | return json.dumps(result) 38 | 39 | 40 | def main(): 41 | bottle.debug(DEBUG) 42 | run(host="0.0.0.0", port=5723) 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = detectem 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/assets/browser_js_console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alertot/detectem/bc5f073575643c4c95a778ef576a5f0cbb1d3852/docs/assets/browser_js_console.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # detectem documentation build configuration file, created by 5 | # sphinx-quickstart on Mon Feb 13 17:20:46 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 
9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | "sphinx.ext.autodoc", 35 | "sphinx.ext.doctest", 36 | "sphinx.ext.intersphinx", 37 | "sphinx.ext.viewcode", 38 | "sphinxcontrib.zopeext.autointerface", 39 | ] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ["_templates"] 43 | 44 | # The suffix(es) of source filenames. 45 | # You can specify multiple suffix as a list of string: 46 | # 47 | # source_suffix = ['.rst', '.md'] 48 | source_suffix = ".rst" 49 | 50 | # The master toctree document. 51 | master_doc = "index" 52 | 53 | # General information about the project. 54 | project = "detectem" 55 | copyright = "2018, alertot SpA" 56 | author = "Claudio Salazar" 57 | 58 | # The version info for the project you're documenting, acts as replacement for 59 | # |version| and |release|, also used in various other places throughout the 60 | # built documents. 61 | # 62 | # The short X.Y version. 63 | version = "0.7" 64 | # The full version, including alpha/beta/rc tags. 
65 | release = "0.7" 66 | 67 | # The language for content autogenerated by Sphinx. Refer to documentation 68 | # for a list of supported languages. 69 | # 70 | # This is also used if you do content translation via gettext catalogs. 71 | # Usually you set "language" from the command line for these cases. 72 | language = None 73 | 74 | # List of patterns, relative to source directory, that match files and 75 | # directories to ignore when looking for source files. 76 | # This patterns also effect to html_static_path and html_extra_path 77 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 78 | 79 | # The name of the Pygments (syntax highlighting) style to use. 80 | pygments_style = "sphinx" 81 | 82 | # If true, `todo` and `todoList` produce output, else they produce nothing. 83 | todo_include_todos = False 84 | 85 | # -- Options for HTML output ---------------------------------------------- 86 | 87 | # The theme to use for HTML and HTML Help pages. See the documentation for 88 | # a list of builtin themes. 89 | # 90 | html_theme = "sphinx_rtd_theme" 91 | 92 | # Theme options are theme-specific and customize the look and feel of a theme 93 | # further. For a list of options available for each theme, see the 94 | # documentation. 95 | # 96 | # html_theme_options = {} 97 | 98 | # Add any paths that contain custom static files (such as style sheets) here, 99 | # relative to this directory. They are copied after the builtin static files, 100 | # so a file named "default.css" will overwrite the builtin "default.css". 101 | html_static_path = ["_static"] 102 | 103 | # -- Options for HTMLHelp output ------------------------------------------ 104 | 105 | # Output file base name for HTML help builder. 106 | htmlhelp_basename = "detectemdoc" 107 | 108 | # -- Options for LaTeX output --------------------------------------------- 109 | 110 | latex_elements = { 111 | # The paper size ('letterpaper' or 'a4paper'). 
112 | # 113 | # 'papersize': 'letterpaper', 114 | # The font size ('10pt', '11pt' or '12pt'). 115 | # 116 | # 'pointsize': '10pt', 117 | # Additional stuff for the LaTeX preamble. 118 | # 119 | # 'preamble': '', 120 | # Latex figure (float) alignment 121 | # 122 | # 'figure_align': 'htbp', 123 | } 124 | 125 | # Grouping the document tree into LaTeX files. List of tuples 126 | # (source start file, target name, title, 127 | # author, documentclass [howto, manual, or own class]). 128 | latex_documents = [ 129 | (master_doc, "detectem.tex", "detectem Documentation", "Claudio Salazar", "manual") 130 | ] 131 | 132 | # -- Options for manual page output --------------------------------------- 133 | 134 | # One entry per manual page. List of tuples 135 | # (source start file, name, description, authors, manual section). 136 | man_pages = [(master_doc, "detectem", "detectem Documentation", [author], 1)] 137 | 138 | # -- Options for Texinfo output ------------------------------------------- 139 | 140 | # Grouping the document tree into Texinfo files. List of tuples 141 | # (source start file, target name, title, author, 142 | # dir menu entry, description, category) 143 | texinfo_documents = [ 144 | ( 145 | master_doc, 146 | "detectem", 147 | "detectem Documentation", 148 | author, 149 | "detectem", 150 | "One line description of project.", 151 | "Miscellaneous", 152 | ) 153 | ] 154 | 155 | # Example configuration for intersphinx: refer to the Python standard library. 156 | intersphinx_mapping = {"https://docs.python.org/": None} 157 | -------------------------------------------------------------------------------- /docs/generic.rst: -------------------------------------------------------------------------------- 1 | .. _generic_plugin: 2 | 3 | .. |wgp| replace:: :class:`WordpressGenericPlugin ` 4 | 5 | Generic plugin 6 | ============== 7 | 8 | A generic plugin is a plugin that detects the presence of multiple softwares. 
9 | It's ideal for software that uses plugins whose detection can be automated. 10 | For this walkthrough we will take |wgp| as an example. 11 | 12 | Please verify the code for an exact working example 13 | since here we show it in pieces, highlighting only the important parts. 14 | 15 | 16 | Basics 17 | ^^^^^^ 18 | 19 | What defines a generic plugin? 20 | The first difference is that it subclasses :class:`GenericPlugin <detectem.plugin.GenericPlugin>` 21 | to set the plugin type (``ptype`` attribute) to ``'generic'``, 22 | which lets us know that it's a generic plugin. 23 | For organization purposes, they live in the ``detectem.plugins.generic`` module. 24 | 25 | 26 | .. code:: python 27 | 28 | from detectem.plugin import GenericPlugin 29 | 30 | class WordpressGenericPlugin(GenericPlugin): 31 | [...] 32 | 33 | 34 | In this class of plugins, as far as we've seen during development until now, 35 | there's no way to extract the version reliably, 36 | so we're going to use ``indicators``. 37 | 38 | What's a proper indicator for a generic plugin? 39 | In Wordpress, every Wordpress plugin is located at the following path: 40 | ``/wp-content/plugins/<plugin_name>/``. 41 | The mission of our generic plugin is to discover 42 | as many Wordpress plugins as possible, 43 | so we are going to use a URL matcher that matches the Wordpress plugin directory. 44 | 45 | .. code:: python 46 | 47 | from detectem.plugin import GenericPlugin 48 | 49 | class WordpressGenericPlugin(GenericPlugin): 50 | indicators = [ 51 | {'url': '/wp-content/plugins/'} 52 | ] 53 | 54 | 55 | Then, if a website loads a resource from the directory ``/wp-content/plugins/``, 56 | our |wgp| will match. 57 | 58 | 59 | Data extraction 60 | ^^^^^^^^^^^^^^^ 61 | 62 | After matching, 63 | detectem will call a method named ``get_information(entry)`` on the plugin. 64 | This method extracts information from the matching *HAR* ``entry`` 65 | and returns a dictionary with at least ``name`` and ``homepage`` keys 66 | to be displayed in detectem results. 
67 | 68 | In the case of |wgp|, 69 | we extract the plugin name from ``entry``'s URL 70 | and build the plugin's homepage URL dynamically. 71 | 72 | .. code:: python 73 | 74 | def get_information(self, entry): 75 | name = re.findall('/wp-content/plugins/([^/]+)/', get_url(entry))[0] 76 | homepage = self.homepage % name 77 | 78 | return { 79 | 'name': name, 80 | 'homepage': homepage, 81 | } 82 | 83 | 84 | Despite using an ``indicator``, 85 | the generic plugin could return version data from ``get_information(entry)`` 86 | since the ``indicator`` is just a signal to execute the generic plugin logic. 87 | 88 | The actual implementation of WordpressGenericPlugin returns more data 89 | and verifies the plugin name against the public repository of Wordpress plugins. 90 | Please check out the source code to see the working implementation. 91 | 92 | 93 | Priority 94 | ^^^^^^^^ 95 | 96 | As seen with |wgp|, 97 | it returns the same data as an ``indicator`` would do, 98 | so they share the same priority. 99 | 100 | That makes it possible to create plugins to detect Wordpress plugins 101 | where we can extract the version and they will prevail over generic results 102 | because they have higher priority. 103 | A good example is the plugin ``crayon-syntax-highlighter``, 104 | whose correct detection will return the plugin version 105 | and that information will be returned instead of generic information. 106 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. _detectem: 2 | 3 | Welcome to detectem's documentation! 4 | ==================================== 5 | 6 | This documentation contains everything you need to know about detectem. 7 | 8 | detectem is a passive software detector. 9 | Let's see it in action. 10 | 11 | .. 
code-block:: bash 12 | 13 | $ det http://domain.tld 14 | [{'name': 'phusion-passenger', 'version': '4.0.10'}, 15 | {'name': 'apache-mod_bwlimited', 'version': '1.4'}, 16 | {'name': 'apache-mod_fcgid', 'version': '2.3.9'}, 17 | {'name': 'jquery', 'version': '1.11.3'}, 18 | {'name': 'crayon-syntax-highlighter', 'version': '2.7.2'}] 19 | 20 | 21 | Using a series of indicators, it's able to detect software running on a site 22 | and in most cases accurately extract its version information. 23 | It uses the Splash_ API 24 | to render the website and start the detection routine. 25 | It does full analysis on requests, responses and even on the DOM_! 26 | 27 | There are two important articles to read: 28 | 29 | * `Reasons to create detectem `_ 30 | * `Introduction to detectem `_ 31 | 32 | 33 | Features 34 | -------- 35 | 36 | * Detect software in modern web technologies. 37 | * Browser support provided by Splash_. 38 | * Analysis on requests made and responses received by the browser. 39 | * Get software information from the DOM. 40 | * Match by file fingerprints. 41 | * Great performance (less than 10 seconds to get a fingerprint). 42 | * Plugin system to add new software easily. 43 | * Test suite to ensure plugin result integrity. 44 | * Continuous development to support new features. 45 | 46 | 47 | Contributing 48 | ------------ 49 | 50 | It's easy to contribute. 51 | If you want to add a new plugin, follow the :ref:`plugin_dev` guide 52 | and make your pull request at the official repository. 53 | 54 | 55 | Documentation 56 | ------------- 57 | 58 | .. toctree:: 59 | :maxdepth: 1 60 | 61 | installation 62 | matchers 63 | plugin_development 64 | 65 | 66 | .. _DOM: https://en.wikipedia.org/wiki/Document_Object_Model 67 | .. 
_Splash: https://github.com/scrapinghub/splash 68 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | 1. Install Docker_ and add your user to the docker group, so that you don't need to use sudo. 5 | 6 | 2. Pull the image:: 7 | 8 | $ docker pull scrapinghub/splash 9 | 10 | 3. Create a virtual environment with Python >= 3.6. 11 | 12 | 4. Install detectem:: 13 | 14 | $ pip install detectem 15 | 16 | 5. Run it against some URL:: 17 | 18 | $ det http://domain.tld 19 | 20 | 21 | .. _Docker: http://docker.io 22 | -------------------------------------------------------------------------------- /docs/matchers.rst: -------------------------------------------------------------------------------- 1 | .. _matchers: 2 | 3 | Matchers 4 | ======== 5 | 6 | Matchers are in charge of extracting software information. 7 | *detectem* has different matchers according to their target. 8 | 9 | .. toctree:: 10 | :glob: 11 | :maxdepth: 1 12 | 13 | matchers/* 14 | 15 | 16 | Most matchers use an argument called ``extractor``. 17 | Depending on its value, it could extract: 18 | 19 | Presence 20 | ~~~~~~~~ 21 | 22 | If ``extractor`` doesn't have a named parameter or doesn't exist, 23 | the matcher only checks plugin presence. 24 | 25 | 26 | Version extraction 27 | ~~~~~~~~~~~~~~~~~~ 28 | 29 | For these cases the ``extractor`` has ``version`` 30 | as the named parameter for the regular expression. 31 | 32 | 33 | Name extraction 34 | ~~~~~~~~~~~~~~~ 35 | 36 | Some projects like AngularJS_ have modules that could be included 37 | to add functionality. 38 | The issue is that both the core library and the module 39 | have the same signature for the version, 40 | so the software module needs to be determined too. 41 | 42 | For these cases ``extractor`` has ``name`` 43 | as the named parameter for the regular expression. 44 | 45 | 46 | .. 
_AngularJS: https://angularjs.org/ 47 | -------------------------------------------------------------------------------- /docs/matchers/body.rst: -------------------------------------------------------------------------------- 1 | .. _body_matcher: 2 | 3 | Body Matcher 4 | ============ 5 | 6 | ====== === 7 | Format ``extractor=string`` 8 | Type string 9 | Scope All requests/responses except first one 10 | ====== === 11 | 12 | It applies a regular expression to the raw text of the response body. 13 | Its scope is every response body except the first one 14 | since matching against it is highly prone to false positives. 15 | To select data from the first response, you should use an XPath matcher. 16 | 17 | It's usually used to extract data from comments. 18 | 19 | 20 | Example 21 | ^^^^^^^ 22 | 23 | A website ``X`` uses a library called ``yobu`` and loads it from 24 | ``https://cdn.tld/yobu.js``. 25 | As you see, no version could be extracted using a URL matcher. 26 | However, the response body contains some valuable information: 27 | 28 | .. code-block:: javascript 29 | 30 | //! yobu v1.2.3 31 | [...] 32 | 33 | 34 | Then, it's the perfect fit for a body matcher. 35 | Let's create a plugin to detect ``yobu``. 36 | 37 | 38 | .. code-block:: python 39 | 40 | from detectem.plugin import Plugin 41 | 42 | class YobuPlugin(Plugin): 43 | name = 'yobu' 44 | matchers = [ 45 | {'body': r'//! yobu v(?P<version>[0-9\.]+)'}, 46 | ] 47 | 48 | Then, when you run detectem on ``X``, 49 | it will detect the presence of ``yobu`` and its version ``1.2.3``. 50 | -------------------------------------------------------------------------------- /docs/matchers/dom.rst: -------------------------------------------------------------------------------- 1 | .. _dom_matcher: 2 | 3 | DOM Matcher 4 | =========== 5 | 6 | ====== === 7 | Format ``(check_statement=string:required, extractor=string:optional)`` 8 | Type tuple 9 | Scope DOM 10 | ====== === 11 | 12 | It operates on the DOM_ loaded by the browser. 
13 | When there are some data representation issues (minification, bundles, etc.) 14 | it's better to access the objects already loaded in the browser's DOM_ 15 | than to try to parse them with regular expressions. 16 | 17 | This matcher is useful to extract information 18 | from objects loaded in the DOM 19 | that could contain version information under some attribute. 20 | 21 | 22 | Example 23 | ^^^^^^^ 24 | 25 | A website ``X`` uses software called ``yobu``. 26 | It's loaded as part of a bundle file called ``assets.js`` 27 | that groups every *Javascript* file used by the website. 28 | Moreover, this file is obfuscated and 29 | its content changes every time there's a change 30 | because of an internal build process. 31 | 32 | There is no way to use a regular expression here. 33 | However, in the browser's *Javascript* console 34 | you can see that there's a ``Yobu`` object. 35 | 36 | 37 | .. image:: ../assets/browser_js_console.png 38 | 39 | 40 | We will use a DOM matcher to extract that data. 41 | The first element of the tuple is a ``check_statement`` written in Javascript. 42 | What should this statement give us? 43 | It will assert that the target object exists in the DOM 44 | to continue with version extraction. 45 | 46 | The second element is an ``extractor`` statement written in Javascript 47 | and it will try to access the attribute where version data lies. 48 | Finally, we are ready with our new matcher: 49 | 50 | 51 | .. code-block:: python 52 | 53 | from detectem.plugin import Plugin 54 | 55 | 56 | class YobuPlugin(Plugin): 57 | name = 'yobu' 58 | matchers = [ 59 | {'dom': ('window.Yobu', 'window.Yobu.version')}, 60 | ] 61 | 62 | 63 | Then, when you run detectem on ``X``, 64 | it will detect the presence of ``yobu`` and its version ``1.2.3``. 
65 | 66 | Notes 67 | ^^^^^ 68 | 69 | The plugins use ``window`` as a prefix because 70 | the check statement then won't raise any error if the object doesn't exist, 71 | it's easier to emulate the browser in our test suite, and it avoids side effects 72 | in the presence of iframes. 73 | 74 | .. _DOM: https://en.wikipedia.org/wiki/Document_Object_Model 75 | .. _D3.js: https://d3js.org/ 76 | -------------------------------------------------------------------------------- /docs/matchers/header.rst: -------------------------------------------------------------------------------- 1 | .. _header_matcher: 2 | 3 | Header Matcher 4 | ============== 5 | 6 | ====== === 7 | Format ``(header=string:required, extractor=string:optional)`` 8 | Type tuple 9 | Scope First response 10 | ====== === 11 | 12 | It operates on response headers. 13 | As you could expect, it works only on the first response 14 | since it contains the headers sent by the website's server. 15 | 16 | It's used to extract data exposed by the web server software 17 | and its stack. 18 | You could also dive into ``Set-Cookie`` headers 19 | to extract cookie information. 20 | 21 | 22 | Example 23 | ^^^^^^^ 24 | 25 | A website ``X`` uses ``Apache HTTPd Server``. 26 | The response contains the following headers: 27 | 28 | .. code-block:: bash 29 | 30 | [...] 31 | Server: Apache/2.4.25 32 | [...] 33 | 34 | We will use a header matcher to extract Apache's version. 35 | First, we need to decide which header to look for. 36 | In this case, it's the header ``Server``. 37 | 38 | 39 | .. code-block:: python 40 | 41 | from detectem.plugin import Plugin 42 | 43 | 44 | class ApachePlugin(Plugin): 45 | name = 'apache' 46 | matchers = [ 47 | {'header': ('Server', r'Apache/(?P<version>[0-9\.]+)')}, 48 | ] 49 | 50 | Then, when you run detectem on ``X``, 51 | it will detect the presence of ``Apache`` and its version ``2.4.25``. 
52 | -------------------------------------------------------------------------------- /docs/matchers/url.rst: -------------------------------------------------------------------------------- 1 | .. _url_matcher: 2 | 3 | URL matcher 4 | =========== 5 | 6 | ====== === 7 | Format ``extractor=string`` 8 | Type string 9 | Scope All requests/responses except first one 10 | ====== === 11 | 12 | It operates on request/response URLs made by the browser when loading a website. 13 | The *scope* for this matcher is every request/response URL 14 | except the first one, 15 | since that one is usually the URL of the website being analyzed. 16 | 17 | 18 | Example 19 | ^^^^^^^ 20 | 21 | A website ``X`` uses a library called ``yobu`` and loads it from 22 | ``https://cdn.tld/yobu-1.2.3.js``. 23 | As you see, the version is present in the URL 24 | and we can extract it using a URL matcher. 25 | Let's create a plugin to detect ``yobu``. 26 | 27 | 28 | .. code-block:: python 29 | 30 | from detectem.plugin import Plugin 31 | 32 | class YobuPlugin(Plugin): 33 | name = 'yobu' 34 | matchers = [ 35 | {'url': r'/yobu-(?P<version>[0-9\.]+)\.js'}, 36 | ] 37 | 38 | Then, when you run detectem on ``X``, 39 | it will detect the presence of ``yobu`` and its version ``1.2.3``. 40 | -------------------------------------------------------------------------------- /docs/matchers/xpath.rst: -------------------------------------------------------------------------------- 1 | .. _xpath_matcher: 2 | 3 | XPath Matcher 4 | ============= 5 | 6 | ====== === 7 | Format ``(xpath=string:required, extractor=string:optional)`` 8 | Type tuple 9 | Scope First response 10 | ====== === 11 | 12 | It operates on the first response. 13 | Since regular expressions are unsuitable 14 | for use on the first response body, 15 | it's better to use XPaths that are context-aware. 16 | 17 | This matcher is useful to extract version information 18 | from meta tags, tag attributes or HTML comments. 
19 | Javascript embedded scripts or inline declarations 20 | aren't available to the XPath matcher because of embedded inline split. 21 | 22 | 23 | Example 24 | ^^^^^^^ 25 | 26 | A website ``X`` uses software called ``yobu``. 27 | It doesn't load any resource that could help 28 | identify the version of ``yobu`` 29 | but it adds a meta tag to the page 30 | that contains its version. 31 | It looks like: 32 | 33 | .. 34 | [...] 35 | <meta name="generator" content="yobu 1.2.3" /> 36 | [...] 37 | 38 | We will use an XPath matcher to extract that data. 39 | The first element of the tuple is an XPath. 40 | What should this XPath give us? 41 | A string to which we can apply our version extractor string. 42 | In this case, our goal is to get ``yobu 1.2.3``. 43 | 44 | An XPath capable of doing this is: 45 | ``//meta[@name='generator']/@content``. 46 | That is enough but as this case is so common, 47 | we've added a helper named ``meta_generator`` 48 | that works very well in this scenario. 49 | In this case, it should be called ``meta_generator('yobu')``. 50 | 51 | The second element is our well-known version extractor string. 52 | Finally, we are ready with our new matcher: 53 | 54 | 55 | .. code-block:: python 56 | 57 | from detectem.plugin import Plugin 58 | from detectem.plugins.helpers import meta_generator 59 | 60 | 61 | class YobuPlugin(Plugin): 62 | name = 'yobu' 63 | matchers = [ 64 | {'xpath': (meta_generator('yobu'), r'(?P<version>[0-9\.]+)')}, 65 | ] 66 | 67 | Then, when you run detectem on ``X``, 68 | it will detect the presence of ``yobu`` and its version ``1.2.3``. 69 | -------------------------------------------------------------------------------- /docs/plugin_development.rst: -------------------------------------------------------------------------------- 1 | .. _plugin_dev: 2 | 3 | Plugin development 4 | ================== 5 | 6 | A plugin is the component in charge of detecting one piece of software and its version. 
7 | Since a piece of software could have many different signatures, 8 | every plugin has associated test files to ensure version integrity 9 | and to allow adding new signatures without breaking the working ones. 10 | 11 | Let's see how to write your own plugin. 12 | 13 | Requirements 14 | ^^^^^^^^^^^^ 15 | 16 | The plugin has to: 17 | 18 | * Be compliant with the :class:`IPlugin <detectem.plugin.IPlugin>` interface. 19 | * Be a subclass of :class:`Plugin <detectem.plugin.Plugin>`. 20 | * Have a test file at ``tests/plugins/fixtures/<plugin_name>.yml``. 21 | 22 | To make it faster, there's a script called ``add_new_plugin.py`` 23 | which creates both the plugin and the test file. 24 | 25 | 26 | .. code-block:: bash 27 | 28 | $ python scripts/add_new_plugin.py --matcher=url example 29 | 30 | Created plugin file at detectem/detectem/plugins/example.py 31 | Created test file at detectem/tests/plugins/fixtures/example.yml 32 | 33 | 34 | Plugin file 35 | ^^^^^^^^^^^ 36 | 37 | We're creating an example plugin 38 | for a fictitious piece of software called *examplelib*. 39 | We can detect it easily since it's included as an external library 40 | and its *URL* contains the version. 41 | So we will use the :ref:`url_matcher` for this case. 42 | 43 | 44 | .. code-block:: python 45 | 46 | from detectem.plugin import Plugin 47 | 48 | 49 | class ExamplePlugin(Plugin): 50 | name = 'example' 51 | homepage = 'http://example.org' 52 | matchers = [ 53 | {'url': r'/examplelib\.v(?P<version>[0-9\.]+)-min\.js$'}, 54 | ] 55 | 56 | Review the :ref:`matchers <matchers>` page to learn about the available matchers 57 | to write your own plugin. 58 | 59 | 60 | Test file 61 | ^^^^^^^^^ 62 | 63 | This is the test file for our example plugin: 64 | 65 | .. code-block:: yaml 66 | 67 | - plugin: example 68 | matches: 69 | - url: http://domain.tld/examplelib.v1.1.3-min.js 70 | version: 1.1.3 71 | 72 | 73 | Then running the test is simple: 74 | 75 | 76 | .. 
code-block:: bash 77 | 78 | $ pytest tests/plugins/test_common.py --plugin example 79 | 80 | When you need to support a new signature 81 | that isn't covered by the current signatures, 82 | you must modify your plugin file 83 | and add a new test to the list to see 84 | that your changes don't break previously detected versions. 85 | 86 | 87 | References 88 | ^^^^^^^^^^ 89 | 90 | .. autointerface:: detectem.plugin.IPlugin 91 | 92 | .. autoclass:: detectem.plugin.Plugin 93 | -------------------------------------------------------------------------------- /docs/references.rst: -------------------------------------------------------------------------------- 1 | Class references 2 | ================ 3 | 4 | detectem.plugin module 5 | ---------------------- 6 | 7 | .. autointerface:: detectem.plugin.IPlugin 8 | 9 | .. autoclass:: detectem.plugin.Plugin 10 | :show-inheritance: 11 | 12 | .. autoclass:: detectem.plugin.GenericPlugin 13 | :show-inheritance: 14 | 15 | 16 | detectem.plugins.generic module 17 | ------------------------------- 18 | 19 | .. 
autoclass:: detectem.plugins.generic.WordpressGenericPlugin 20 | :show-inheritance: 21 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==2.2.2 2 | sphinx_rtd_theme==0.4.3 3 | sphinxcontrib-zopeext==0.2.3 4 | -------------------------------------------------------------------------------- /extras/docker/Dockerfile-alternate: -------------------------------------------------------------------------------- 1 | FROM python:3.6.2-alpine3.6 2 | 3 | RUN apk add --update build-base libxml2-dev libxslt-dev 4 | 5 | RUN pip install detectem 6 | 7 | USER nobody 8 | 9 | ENTRYPOINT ["det"] 10 | -------------------------------------------------------------------------------- /extras/docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.1' 2 | 3 | services: 4 | splash: 5 | image: scrapinghub/splash:3.0 6 | 7 | detectem: 8 | image: detectem:latest 9 | build: 10 | context: . 
def create_plugin_file(name, matcher):
    """Write a skeleton plugin module for *name* into ``PLUGIN_DIRECTORY``.

    Args:
        name: Plugin name; used for the ``name`` attribute, the class name
            prefix and the file name (``<name>.py``).
        matcher: Matcher type used as the key of the sample matcher entry.

    Raises:
        FileExistsError: If ``<name>.py`` already exists in the plugin
            directory.
    """
    # ``str.title`` keeps separators ("w3-total-cache" -> "W3-Total-Cache"),
    # which is not a valid class name, so keep only alphanumeric characters.
    class_prefix = "".join(ch for ch in name.title() if ch.isalnum())

    # The generated matcher is a raw string with a named ``version`` group;
    # the backslash is doubled here so this template has no invalid escape.
    plugin_template = """
from detectem.plugin import Plugin


class {title}Plugin(Plugin):
    name = "{name}"
    homepage = ""
    tags = []
    matchers = [
        {{"{matcher}": r"Plugin signature v(?P<version>[0-9\\.]+)"}},
    ]
""".format(
        name=name, title=class_prefix, matcher=matcher
    ).lstrip()

    plugin_filename = name + ".py"
    plugin_filepath = os.path.join(PLUGIN_DIRECTORY, plugin_filename)

    try:
        # Mode "x" creates the file exclusively, so an existing plugin can
        # never be overwritten between an existence check and the write.
        with open(plugin_filepath, mode="x") as f:
            f.write(plugin_template)
    except FileExistsError:
        raise FileExistsError("Plugin file already exists.") from None

    print("Created plugin file at {}".format(plugin_filepath))
@click.command()
@click.option("--filter", default=None, type=str, help="Filter by header")
@click.option("--stats", default=False, is_flag=True, help="Include stats")
@click.option("--show-names", default=False, is_flag=True, help="Show header names")
@click.argument("query")
def main(filter, stats, show_names, query):
    """Search Shodan for *query* and print the headers found in each match.

    Args:
        filter: Header name; when set, only ``(server, value)`` pairs for
            that header are collected and printed. The parameter name is
            dictated by the ``--filter`` option, hence the builtin shadowing.
        stats: When true, print the number of matches and of printed items.
        show_names: When true, print the set of header names seen instead
            of the headers themselves; takes precedence over *filter*.
        query: Search expression passed to ``api.search``.
    """
    counter = 0
    filtered_header = set()
    api = shodan.Shodan(SHODAN_API_KEY)

    try:
        result = api.search(query)
    except shodan.exception.APIError as err:
        # A failed search is an error: report why on stderr and exit with a
        # non-zero status so callers and shell pipelines can detect it.
        print("[-] API connection error: {}".format(err), file=sys.stderr)
        sys.exit(1)

    matches = result["matches"]
    for match in matches:
        server = "{}:{}".format(match["ip_str"], match["port"])
        hd = get_headers(match["data"])
        if not hd:
            # No parsable header block in this banner.
            continue

        if show_names:
            # Updating a set from a dict adds its keys (the header names).
            filtered_header.update(hd)
        elif filter:
            value = hd.get(filter)
            if value:
                filtered_header.add((server, value))
        else:
            pprint.pprint(hd, width=160)
            counter += 1

    if filtered_header:
        pprint.pprint(filtered_header, width=160)

    if stats:
        print("\n--- Stats ---")
        print("[+] n_matches: {}".format(len(matches)))
        # ``counter`` is only incremented in the unfiltered mode; otherwise
        # the number of collected entries is what was printed.
        print("[+] n_printed: {}".format(counter or len(filtered_header)))


if __name__ == "__main__":
    main()
def get_files_from_github(user_and_repo, filedir, regex, prefer_tags):
    """Download the matching releases/tags of a repository and extract them.

    Args:
        user_and_repo: Repository in ``user/repository`` form.
        filedir: Directory where every downloaded zipball is extracted.
        regex: Pattern applied with ``re.match`` to each release/tag name;
            its first group becomes the version key.
        prefer_tags: When true, the releases endpoint is skipped and tags
            are listed directly.

    Returns:
        Dictionary mapping each version to the directory that holds its
        extracted files.

    Raises:
        ValueError: If the API answer is not a list, or if there are no
            releases/tags at all.
    """
    github_url = f"https://api.github.com/repos/{user_and_repo}"
    args = f"?per_page={N_RESULTS}"
    json_data = None

    # Determine the right url: releases first (unless tags are preferred),
    # falling back to tags when the releases listing is empty.
    if not prefer_tags:
        url = f"{github_url}/releases{args}"
        json_data = requests.get(url).json()

    if not json_data:
        url = f"{github_url}/tags{args}"
        json_data = requests.get(url).json()

    logger.debug(f"[+] Using {url} as base url.")

    # A JSON object instead of a list cannot be iterated as releases/tags;
    # fail with the same exception type used for the "nothing found" case.
    if not isinstance(json_data, list):
        raise ValueError(f"Unexpected response for {user_and_repo}: {json_data}")

    # Get all the releases/tags available
    results = list(json_data)
    last_page = json_data
    for n_page in range(2, 1000):
        # A page shorter than the page size is the last one.
        if len(last_page) < N_RESULTS:
            break

        logger.debug(f"[+] Requesting page {n_page} of releases/tags ..")
        last_page = requests.get(f"{url}&page={n_page}").json()
        results += last_page

    if not results:
        raise ValueError(f"No releases/tags for {user_and_repo}")

    directories = {}
    for result in results:
        name = result.get("name")
        # ``re.match`` needs a string; skip entries without a usable name.
        if not isinstance(name, str):
            continue

        m = re.match(regex, name)
        if not m:
            continue

        version = m.groups()[0]

        logger.debug(f"[+] Downloading zip file for {version} version ..")

        # Download zip file and extract it inside ``filedir``
        zip_url = result["zipball_url"]
        response = requests.get(zip_url, allow_redirects=True)
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            # The first archive member is used as the extracted directory.
            output_dir = f"{filedir}/{z.namelist()[0]}"
            z.extractall(path=filedir)

        directories[version] = output_dir

    return directories


def _hash_target_file(directories, filepath):
    """Return ``{version: sha256 hex digest}`` of *filepath* in each directory."""
    logger.debug("[+] Creating hashes ..")

    hashes = {}
    for version, path in directories.items():
        target_file = os.path.join(path, filepath)
        if not os.path.exists(target_file):
            logger.error(f"version {version} not contains target file")
            continue

        # NOTE(review): content is read as text and re-encoded as UTF-8, as
        # before, so digests stay identical to previously generated ones.
        with open(target_file) as f:
            content = f.read().encode("utf-8")

        hashes[version] = hashlib.sha256(content).hexdigest()

    return hashes


@click.command()
@click.option("--github", default=None, type=str, help="user/repository")
@click.option(
    "--directory",
    default=None,
    type=str,
    help="local directory containing version directories",
)
@click.option(
    "--regex", default=FILE_REGEX, type=str, help="regex to select the releases"
)
@click.option("--prefer-tags", is_flag=True, help="prefer tags over releases")
@click_log.simple_verbosity_option(logger, default="error")
@click.argument("filepath", type=str)
def main(github, directory, regex, filepath, prefer_tags):
    """Print the SHA-256 of *filepath* for every available software version.

    Versions come either from a GitHub repository (``--github``), whose
    releases/tags are downloaded to a temporary directory, or from the
    sub-directories of a local ``--directory``.
    """
    if github:
        with tempfile.TemporaryDirectory() as filedir:
            logger.debug(f"[+] Using {filedir} as temporary directory.")

            directories = get_files_from_github(github, filedir, regex, prefer_tags)
            # Hash inside the ``with`` block: the extracted files are removed
            # as soon as the temporary directory context exits.
            hashes = _hash_target_file(directories, filepath)
    else:
        directories = {f.name: f.path for f in os.scandir(directory) if f.is_dir()}
        hashes = _hash_target_file(directories, filepath)

    pprint.pprint({filepath: hashes})


if __name__ == "__main__":
    main()
def get_requirements(name):
    """Return the lines of ``requirements/<name>.txt`` next to this file.

    Args:
        name: Base name of the requirements file, without the ``.txt``
            extension.

    Returns:
        List with one string per line of the file.
    """
    requirements_file = Path(__file__).parent.joinpath(f"requirements/{name}.txt")
    # ``read_text`` opens and closes the file itself, so no handle is leaked.
    return requirements_file.read_text(encoding="utf-8").splitlines()
def tree():
    """Build a nested mapping whose missing keys become new ``tree()`` mappings.

    Because the default factory is this very function, arbitrarily deep
    paths can be assigned without creating the intermediate levels first.
    """
    nested = defaultdict(tree)
    return nested
{"url": "http://domain.tld/"}, 22 | "response": { 23 | "headers": [{"name": "Location", "value": "/new/default.html"}] 24 | }, 25 | }, 26 | {"request": {"url": "http://domain.tld/new/default.html"}, "response": {}}, 27 | ] 28 | HAR_URL_REDIRECT_ABS = [ 29 | { 30 | "request": {"url": "http://domain.tld/"}, 31 | "response": { 32 | "headers": [{"name": "Location", "value": "http://other-domain.tld/"}] 33 | }, 34 | }, 35 | {"request": {"url": "http://other-domain.tld/"}, "response": {}}, 36 | ] 37 | 38 | URL = "http://domain.tld/" 39 | 40 | FOO_PLUGIN = { 41 | "name": "foo", 42 | "homepage": "foo", 43 | "version_matchers": { 44 | "url": r"foo.*-(?P[0-9\.]+)\.js", 45 | "header": ("FooHeader", r"Foo.* v(?P[0-9\.]+)"), 46 | "body": r"Foo.* v(?P[0-9\.]+)", 47 | "xpath": (meta_generator("foo-min"), r"(?P[0-9\.]+)"), 48 | }, 49 | "presence_matchers": { 50 | "url": r"foo.*\.js", 51 | "header": ("FooHeader", "Foo"), 52 | "body": "Foo", 53 | "xpath": "//meta[@name='generator']", 54 | }, 55 | "name_matchers": { 56 | "url": r"foo-(?P\w+)-.*\.js", 57 | "header": ("FooHeader", r"Foo-(?P\w+)"), 58 | "body": r"Foo-(?P\w+)", 59 | "xpath": (meta_generator("foo-min"), r"foo-(?P\w+)"), 60 | }, 61 | } 62 | 63 | FOO_RESULTS = [ 64 | [{"name": "foo", "version": "1.1"}], 65 | [{"name": "foo"}], 66 | [{"name": "foo-min", "version": "1.1"}], 67 | ] 68 | 69 | MATCHER_SOURCES = ["version_matchers", "presence_matchers", "name_matchers"] 70 | 71 | def test_convert_inline_script_to_har_entry(self): 72 | script = "Inline script" 73 | 74 | d = Detector({"har": [], "softwares": [], "scripts": [script]}, None, self.URL) 75 | e = d.har[0] 76 | 77 | assert e["request"]["url"] == self.URL 78 | assert e["response"]["content"]["text"] == script 79 | 80 | @pytest.mark.parametrize( 81 | "scripts,n_entries", [([], 0), (["script1", "script2"], 2)] 82 | ) 83 | def test_add_inline_scripts_to_har(self, scripts, n_entries): 84 | d = Detector({"har": [], "softwares": [], "scripts": scripts}, None, self.URL) 85 | 
assert len(d.har) == n_entries 86 | 87 | def _create_plugin(self, template, source, matchers): 88 | class TestPlugin(Plugin): 89 | name = template["name"] 90 | homepage = template["homepage"] 91 | 92 | p = TestPlugin() 93 | g = [{m: template[source][m]} for m in matchers] 94 | p.matchers = g 95 | 96 | return p 97 | 98 | def _create_detector(self, har, plugins): 99 | pc = PluginCollection() 100 | for p in plugins: 101 | pc.add(p) 102 | return Detector({"har": har, "softwares": []}, pc, self.URL) 103 | 104 | @pytest.mark.parametrize("sources,result", zip(MATCHER_SOURCES, FOO_RESULTS)) 105 | def test_match_from_headers(self, sources, result): 106 | har = [ 107 | { 108 | "request": {"url": self.URL}, 109 | "response": { 110 | "url": self.URL, 111 | "headers": [{"name": "FooHeader", "value": "Foo-min v1.1"}], 112 | }, 113 | } 114 | ] 115 | p = self._create_plugin(self.FOO_PLUGIN, sources, ["header"]) 116 | d = self._create_detector(har, [p]) 117 | 118 | assert d.get_results() == result 119 | 120 | @pytest.mark.parametrize("sources", MATCHER_SOURCES) 121 | def test_match_from_headers_ignores_resource_entries(self, sources): 122 | har = [ 123 | { 124 | "request": {"url": self.URL}, 125 | "response": {"url": self.URL, "headers": []}, 126 | }, 127 | { 128 | "request": {"url": "http://foo.org/lib/foo.js"}, 129 | "response": { 130 | "url": "http://foo.org/lib/foo.js", 131 | "headers": [{"name": "FooHeader", "value": "Foo-min v1.1"}], 132 | }, 133 | }, 134 | ] 135 | p = self._create_plugin(self.FOO_PLUGIN, sources, ["header"]) 136 | d = self._create_detector(har, [p]) 137 | 138 | assert not d.get_results() 139 | 140 | @pytest.mark.parametrize("sources,result", zip(MATCHER_SOURCES, FOO_RESULTS)) 141 | def test_match_from_body(self, sources, result): 142 | har = [ 143 | { 144 | "request": {"url": self.URL}, 145 | "response": {"url": self.URL, "content": {"text": "Main content"}}, 146 | }, 147 | { 148 | "request": {"url": "http://foo.org/lib/foo.js"}, 149 | "response": { 150 | 
"url": "http://foo.org/lib/foo.js", 151 | "content": {"text": "Plugin Foo-min v1.1"}, 152 | }, 153 | }, 154 | ] 155 | p = self._create_plugin(self.FOO_PLUGIN, sources, ["body"]) 156 | d = self._create_detector(har, [p]) 157 | 158 | assert d.get_results() == result 159 | 160 | @pytest.mark.parametrize("sources", MATCHER_SOURCES) 161 | def test_match_from_body_excludes_main_entry(self, sources): 162 | har = [ 163 | { 164 | "request": {"url": self.URL}, 165 | "response": { 166 | "url": self.URL, 167 | "content": {"text": "About Foo-min v1.1"}, 168 | }, 169 | } 170 | ] 171 | p = self._create_plugin(self.FOO_PLUGIN, sources, ["body"]) 172 | d = self._create_detector(har, [p]) 173 | 174 | assert not d.get_results() 175 | 176 | @pytest.mark.parametrize("sources,result", zip(MATCHER_SOURCES, FOO_RESULTS)) 177 | def test_match_from_url(self, sources, result): 178 | har = [ 179 | {"request": {"url": self.URL}, "response": {"url": self.URL}}, 180 | { 181 | "request": {"url": "http://foo.org/lib/foo-min-1.1.js"}, 182 | "response": {"url": "http://foo.org/lib/foo-min-1.1.js"}, 183 | }, 184 | ] 185 | p = self._create_plugin(self.FOO_PLUGIN, sources, ["url"]) 186 | d = self._create_detector(har, [p]) 187 | 188 | assert d.get_results() == result 189 | 190 | @pytest.mark.parametrize("sources,result", zip(MATCHER_SOURCES, FOO_RESULTS)) 191 | def test_match_from_xpath(self, sources, result): 192 | har = [ 193 | { 194 | "request": {"url": self.URL}, 195 | "response": { 196 | "url": self.URL, 197 | "content": { 198 | "text": '' 199 | }, 200 | }, 201 | } 202 | ] 203 | p = self._create_plugin(self.FOO_PLUGIN, sources, ["xpath"]) 204 | d = self._create_detector(har, [p]) 205 | 206 | assert d.get_results() == result 207 | 208 | def test_get_hints_with_valid_hint(self): 209 | class TestPlugin(Plugin): 210 | name = "test" 211 | homepage = "test" 212 | 213 | class BlaPlugin(Plugin): 214 | name = "bla" 215 | hints = ["test"] 216 | 217 | detector = self._create_detector(None, [TestPlugin()]) 
218 | 219 | hints = detector.get_hints(BlaPlugin()) 220 | assert hints 221 | 222 | def test_get_hints_with_invalid_hint(self): 223 | class BlaPlugin(Plugin): 224 | name = "bla" 225 | hints = ["test"] 226 | 227 | detector = self._create_detector(None, []) 228 | hints = detector.get_hints(BlaPlugin()) 229 | assert not hints 230 | -------------------------------------------------------------------------------- /tests/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alertot/detectem/bc5f073575643c4c95a778ef576a5f0cbb1d3852/tests/plugins/__init__.py -------------------------------------------------------------------------------- /tests/plugins/fixtures/angular.yml: -------------------------------------------------------------------------------- 1 | - plugin: angular 2 | matches: 3 | - xpath: 4 | version: 6.1.2 5 | - dom: "window.ng = {'coreTokens': {}}" 6 | presence: true 7 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/angularjs.yml: -------------------------------------------------------------------------------- 1 | - plugin: angularjs 2 | matches: 3 | - body: | 4 | /* 5 | AngularJS v1.5.6 6 | (c) 2010-2016 Google, Inc. 
http://angularjs.org 7 | version: 1.5.6 8 | - url: https://ajax.googleapis.com/ajax/libs/angularjs/1.5.6/angular.min.js 9 | version: 1.5.6 10 | - url: http://www.domain.tld/angular-route.min.js 11 | name: route 12 | - dom: "window.angular = {version: {full: '1.5.9'}}" 13 | version: 1.5.9 14 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/apache.yml: -------------------------------------------------------------------------------- 1 | - plugin: apache 2 | matches: 3 | - header: 4 | name: Server 5 | value: Apache/2.2.16 (Debian) mod_ssl/2.2.16 OpenSSL/0.9.8o 6 | version: 2.2.16 7 | - plugin: apache-coyote 8 | matches: 9 | - header: 10 | name: Server 11 | value: Apache-Coyote/1.1 12 | version: '1.1' 13 | - plugin: apache-mod_bwlimited 14 | matches: 15 | - header: 16 | name: Server 17 | value: Apache Phusion_Passenger/4.0.10 mod_bwlimited/1.4 mod_fcgid/2.3.9 18 | version: '1.4' 19 | - plugin: apache-mod_fcgid 20 | matches: 21 | - header: 22 | name: Server 23 | value: Apache Phusion_Passenger/4.0.10 mod_bwlimited/1.4 mod_fcgid/2.3.9 24 | version: '2.3.9' 25 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/backbonejs.yml: -------------------------------------------------------------------------------- 1 | - plugin: backbone.js 2 | matches: 3 | - body: // Backbone.js 1.3.3 4 | version: 1.3.3 5 | - url: https://cdnjs.cloudflare.com/ajax/libs/backbone.js/1.1.1/backbone-min.js 6 | version: 1.1.1 7 | - url: /static/js/libs/backbone/backbone-1.1.2.js 8 | version: 1.1.2 9 | - dom: "window.Backbone = {VERSION: '1.0.0'}" 10 | version: 1.0.0 11 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/d3js.yml: -------------------------------------------------------------------------------- 1 | - plugin: d3.js 2 | matches: 3 | - body: // https://d3js.org Version 4.5.0. Copyright 2017 Mike Bostock. 
4 | version: 4.5.0 5 | - url: https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.17/d3.min.js 6 | version: 3.5.17 7 | - url: https://fastcdn.org/D3.js/3.5.6/d3.min.js 8 | version: 3.5.6 9 | - dom: "window.d3 = {version: '4.0.0'}" 10 | version: 4.0.0 11 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/ember.yml: -------------------------------------------------------------------------------- 1 | - plugin: ember 2 | matches: 3 | - dom: "window.Ember = {'VERSION': '3.10.2'}" 4 | version: 3.10.2 5 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/ghost.yml: -------------------------------------------------------------------------------- 1 | - plugin: ghost 2 | matches: 3 | - xpath: '' 4 | version: '0.11' 5 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/iis.yml: -------------------------------------------------------------------------------- 1 | - plugin: iis 2 | matches: 3 | - header: 4 | name: Server 5 | value: Microsoft-IIS/6.0 6 | version: '6.0' 7 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/joomla.yml: -------------------------------------------------------------------------------- 1 | - plugin: joomla! 
2 | matches: 3 | - body: 4 | presence: true 5 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/jquery.yml: -------------------------------------------------------------------------------- 1 | - plugin: jquery 2 | matches: 3 | - url: http://ajax.googleapis.com/ajax/libs/jquery/1.8.3/jquery.min.js 4 | version: 1.8.3 5 | - url: https://cdnjs.cloudflare.com/ajax/libs/jquery/3.2.1/jquery.slim.min.js 6 | version: 3.2.1 7 | - url: https://code.jquery.com/jquery-3.2.1.slim.min.js 8 | version: 3.2.1 9 | - url: https://code.jquery.com/jquery-1.11.3.min.js 10 | version: 1.11.3 11 | - url: https://code.jquery.com/jquery-1.11.4.js 12 | version: 1.11.4 13 | - body: /*! jQuery v1.12.4 | (c) jQuery Foundation | jquery.org/license */ 14 | version: 1.12.4 15 | - body: /* jQuery v1.11.3 | (c) 2005, 2015 jQuery Foundation, Inc. | jquery.org/license */ 16 | version: 1.11.3 17 | - body: \* jQuery JavaScript Library v1.4.4 18 | version: 1.4.4 19 | - body: /*! jQuery v1.7.2 jquery.com | jquery.org/license */ 20 | version: 1.7.2 21 | - body: "/*! jQuery v3.2.1 \ 22 | -ajax,-ajax/jsonp,-ajax/load,-ajax/parseXML,-ajax/script,-ajax/var/location,\ 23 | -ajax/var/nonce,-ajax/var/rquery,-ajax/xhr,-manipulation/_evalUrl,-event/ajax,\ 24 | -effects,-effects/Tween,-effects/animatedSelector \ 25 | | (c) JS Foundation and other contributors" 26 | version: 3.2.1 27 | - dom: 'window.jQuery = function() { return {jquery: "1.0.0"} }' 28 | version: 1.0.0 29 | 30 | - plugin: jquery-colorbox 31 | matches: 32 | - body: // ColorBox v1.3.15 - a full featured, light-weight, customizable lightbox based on jQuery 1.3+ 33 | version: 1.3.15 34 | 35 | - plugin: jquery-migrate 36 | matches: 37 | - body: /*! 
jQuery Migrate v1.4.1 | (c) jQuery Foundation 38 | version: 1.4.1 39 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/jqueryui.yml: -------------------------------------------------------------------------------- 1 | - plugin: jqueryui 2 | matches: 3 | - body: | 4 | /*! 5 | * jQuery UI Core 1.11.4 6 | * http://jqueryui.com 7 | version: 1.11.4 8 | - body: /*! jQuery UI - v1.11.1 - 2014-08-13 9 | version: 1.11.1 10 | - url: https://code.jquery.com/ui/1.11.1/jquery-ui.min.js 11 | version: 1.11.1 12 | - url: https://cdnjs.cloudflare.com/ajax/libs/jqueryui/1.12.1/jquery-ui.js 13 | version: 1.12.1 14 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/knockoutjs.yml: -------------------------------------------------------------------------------- 1 | - plugin: knockoutjs 2 | matches: 3 | - body: // Knockout.js 3.4.2 4 | version: 3.4.2 5 | - url: https://cdnjs.cloudflare.com/ajax/libs/knockout/3.4.0/knockout-min.js 6 | version: 3.4.0 7 | - url: http://ajax.aspnetcdn.com/ajax/knockout/knockout-3.4.2.js 8 | version: 3.4.2 9 | - dom: "window.ko = {'version': '3.4.0' }" 10 | version: 3.4.0 11 | - xpath: | 12 | 14 | presence: true 15 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/lightbox.yml: -------------------------------------------------------------------------------- 1 | - plugin: lightbox 2 | matches: 3 | - body: | 4 | /*! 
5 | * Lightbox v2.9.0 6 | * by Lokesh Dhakar 7 | * 8 | * More info: 9 | * http://lokeshdhakar.com/projects/lightbox2/ 10 | version: 2.9.0 11 | - url: https://cdnjs.cloudflare.com/ajax/libs/lightbox2/2.9.0/js/lightbox.js 12 | version: 2.9.0 13 | - url: https://cdnjs.cloudflare.com/ajax/libs/lightbox2/2.9.0/js/lightbox.min.js 14 | version: 2.9.0 15 | - url: https://www.freecodecamp.org/bower_components/lightbox2/dist/css/lightbox.css 16 | presence: true 17 | 18 | - plugin: prettyphoto 19 | matches: 20 | - body: (function($){$.prettyPhoto={version:'3.0.1'}; 21 | version: 3.0.1 22 | - body: | 23 | (function($) { 24 | $.prettyPhoto = {version: '3.1.6'};} 25 | version: 3.1.6 26 | - body: e.prettyPhoto={version:"3.1.6"} 27 | version: 3.1.6 28 | - url: https://cdnjs.cloudflare.com/ajax/libs/prettyPhoto/3.1.6/js/jquery.prettyPhoto.min.js 29 | version: 3.1.6 30 | - url: https://cdnjs.cloudflare.com/ajax/libs/prettyPhoto/3.1.6/css/prettyPhoto.min.css 31 | version: 3.1.6 32 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/modernizr.yml: -------------------------------------------------------------------------------- 1 | - plugin: modernizr 2 | matches: 3 | - body: /* Modernizr 2.8.3 (Custom Build) | MIT & BSD 4 | version: 2.8.3 5 | - url: https://cdnjs.cloudflare.com/ajax/libs/modernizr/2.8.3/modernizr.js 6 | version: 2.8.3 7 | - url: https://cdnjs.cloudflare.com/ajax/libs/modernizr/2.8.3/modernizr.min.js 8 | version: 2.8.3 9 | - url: js/modernizr-2.0.6.js 10 | version: 2.0.6 11 | - dom: "window.Modernizr = {_version: '3.3.1'}" 12 | version: 3.3.1 13 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/momentjs.yml: -------------------------------------------------------------------------------- 1 | - plugin: moment.js 2 | matches: 3 | - body: | 4 | //! moment.js 5 | //! 
version : 2.18.1 6 | version: 2.18.1 7 | - url: //cdnjs.cloudflare.com/ajax/libs/moment.js/2.8.2/moment.min.js 8 | version: 2.8.2 9 | - dom: "window.moment = {version: '2.5.1'}" 10 | version: 2.5.1 11 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/mootools.yml: -------------------------------------------------------------------------------- 1 | - plugin: mootools-core 2 | matches: 3 | - dom: "window.MooTools = {version: '1.0.0'}" 4 | version: 1.0.0 5 | 6 | - plugin: mootools-more 7 | matches: 8 | - dom: "window.MooTools = {More: {version: '1.0.0'}}" 9 | version: 1.0.0 10 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/nginx.yml: -------------------------------------------------------------------------------- 1 | - plugin: nginx 2 | matches: 3 | - header: 4 | name: Server 5 | value: nginx/1.1.19 6 | version: 1.1.19 7 | - header: 8 | name: Server 9 | value: nginx 10 | presence: true 11 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/php.yml: -------------------------------------------------------------------------------- 1 | - plugin: php 2 | matches: 3 | - header: 4 | name: X-Powered-By 5 | value: PHP/5.4.45 6 | version: 5.4.45 7 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/phusion-passenger.yml: -------------------------------------------------------------------------------- 1 | - plugin: phusion-passenger 2 | matches: 3 | - header: 4 | name: Server 5 | value: Apache Phusion_Passenger/4.0.10 mod_bwlimited/1.4 mod_fcgid/2.3.9 6 | version: 4.0.10 7 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/piwik.yml: -------------------------------------------------------------------------------- 1 | - plugin: piwik 2 | matches: 3 | - body: | 4 | /*!! 
5 | * Piwik - free/libre analytics platform 6 | presence: true 7 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/react.yml: -------------------------------------------------------------------------------- 1 | - plugin: react 2 | matches: 3 | - body: | 4 | /** 5 | * React v15.6.1 6 | * 7 | * Copyright 2013-present, Facebook, Inc. 8 | version: 15.6.1 9 | - url: https://cdnjs.cloudflare.com/ajax/libs/react/15.6.1/react-with-addons.js 10 | version: 15.6.1 11 | - url: https://cdnjs.cloudflare.com/ajax/libs/react/15.6.1/react.js 12 | version: 15.6.1 13 | - url: https://fb.me/react-0.14.3.js 14 | version: 0.14.3 15 | - url: https://fb.me/react-with-addons-0.14.3.min.js 16 | version: 0.14.3 17 | - dom: "window.React = {version: '15.1.0'}" 18 | version: 15.1.0 19 | - xpath:
20 | presence: true 21 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/requirejs.yml: -------------------------------------------------------------------------------- 1 | - plugin: require.js 2 | matches: 3 | - body: | 4 | * @license RequireJS 2.3.5 Copyright jQuery Foundation and other contributors. 5 | version: 2.3.5 6 | - url: https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.22/require.js 7 | version: 2.1.22 8 | - url: https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.5/require.min.js 9 | version: 2.3.5 10 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/ssl.yml: -------------------------------------------------------------------------------- 1 | - plugin: modssl 2 | matches: 3 | - header: 4 | name: Server 5 | value: Apache/2.2.16 (Debian) mod_ssl/2.2.16 OpenSSL/0.9.8o 6 | version: 2.2.16 7 | 8 | - plugin: openssl 9 | matches: 10 | - header: 11 | name: Server 12 | value: Apache/2.2.16 (Debian) mod_ssl/2.2.16 OpenSSL/0.9.8o 13 | version: 0.9.8o 14 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/underscorejs.yml: -------------------------------------------------------------------------------- 1 | - plugin: underscore.js 2 | matches: 3 | - body: // Underscore.js 1.5.1 4 | version: 1.5.1 5 | - url: https://cdn.jsdelivr.net/underscorejs/1.8.3/underscore-min.js 6 | version: 1.8.3 7 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/vue.yml: -------------------------------------------------------------------------------- 1 | - plugin: vue 2 | matches: 3 | - xpath:
4 | presence: true 5 | - url: https://cdn.jsdelivr.net/npm/vue@2.6.0 6 | version: 2.6.0 7 | - dom: "window.Vue = {'version': '2.6.10' }" 8 | version: 2.6.10 9 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/w3-total-cache.yml: -------------------------------------------------------------------------------- 1 | - plugin: w3-total-cache 2 | matches: 3 | - header: 4 | name: X-Powered-By 5 | value: W3 Total Cache/0.9.6 6 | version: 0.9.6 7 | - xpath: 8 | presence: true 9 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/webpack.yml: -------------------------------------------------------------------------------- 1 | - plugin: webpack 2 | matches: 3 | - dom: "window.webpackJsonp = []" 4 | presence: true 5 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/wordpress.yml: -------------------------------------------------------------------------------- 1 | - plugin: crayon-syntax-highlighter 2 | matches: 3 | - dom: "window.CrayonSyntaxSettings = {version: '1.0.0'}" 4 | version: 1.0.0 5 | - plugin: wordpress 6 | matches: 7 | - url: http://domain.tld/wp-includes/js/wp-embed.min.js?ver=1.2.3 8 | version: 1.2.3 9 | - xpath: '' 10 | version: 4.9.1 11 | - url: http://www.domain.tld/wp-content/plugins/bla 12 | presence: true 13 | -------------------------------------------------------------------------------- /tests/plugins/fixtures/wp-super-cache.yml: -------------------------------------------------------------------------------- 1 | - plugin: wp-super-cache 2 | matches: 3 | - xpath: | 4 | 5 | " 6 | 7 | presence: true 8 | -------------------------------------------------------------------------------- /tests/plugins/test_common.py: -------------------------------------------------------------------------------- 1 | import os 2 | from importlib.util import find_spec 3 | 4 | import dukpy 5 | import pytest 6 | 7 
class TestCommonMatches:
    """Data-driven checks of every plugin against its YAML fixture entries.

    Test cases are generated in ``pytest_generate_tests`` from the fixture
    files found under ``tests/plugins/fixtures/``; each case is a
    ``(plugin, yaml_dict)`` pair where ``yaml_dict`` is one item of the
    fixture's ``matches`` list.
    """

    # Matcher types that a fixture entry may use as its input key
    FIELDS = list(MATCHERS)

    def pytest_generate_tests(self, metafunc):
        """Parametrize the test functions with ``(plugin, yaml_dict)`` cases.

        ``test_dom_matches`` only receives entries that contain a ``dom``
        key; every other test function only receives entries without it.
        When the ``plugin`` command line option is set, only fixtures of
        that plugin are collected.

        Collection fails when a selected fixture names a plugin that is not
        among the loaded plugins.
        """
        fname = metafunc.function.__name__
        cases = []
        all_plugins = load_plugins()

        # These do not depend on the package being processed
        plugin = metafunc.config.getoption("plugin", None)
        only_dom_matches = fname == "test_dom_matches"

        for plugin_package in PLUGIN_PACKAGES:
            package = plugin_package.split(".")[0]
            package_dir = find_spec(package).submodule_search_locations[0]
            test_dir = os.path.join(package_dir, os.pardir, "tests")

            data = load_from_yaml(test_dir, "plugins/fixtures/")

            # Entry is the full plugin test file evaluated as a dictionary
            for entry in data:
                # Each yaml_dict is an entry in matches
                for yaml_dict in entry["matches"]:
                    # Keep dom entries only for the dom test, and vice versa
                    if only_dom_matches != ("dom" in yaml_dict):
                        continue

                    target_plugin_name = entry["plugin"]

                    # Case if plugin was provided by developer
                    if plugin and plugin != target_plugin_name:
                        continue

                    p = all_plugins.get(target_plugin_name)
                    if not p:
                        # Fail here instead of passing ``None`` to the tests,
                        # which would only surface as an AttributeError later
                        pytest.fail(
                            f"Plugin name `{target_plugin_name}` extracted from fixture doesn't exist. "
                            "Verify that both plugin and fixture file refer to the same name."
                        )

                    cases.append([p, yaml_dict])

        metafunc.parametrize("plugin,yaml_dict", cases)

    def _get_plugin_match(self, plugin, yaml_dict):
        """Run the matcher selected by ``yaml_dict`` and return its result.

        The matcher type is the first key of ``yaml_dict`` that is listed in
        ``FIELDS``; an ``IndexError`` is raised when there is none.
        """
        field = [k for k in yaml_dict if k in self.FIELDS][0]
        har_entry = create_har_entry(field, yaml_dict)

        matchers = plugin.get_matchers(field)
        matcher_class = MATCHERS[field]

        return matcher_class.get_info(har_entry, *matchers)

    def test_matches(self, plugin, yaml_dict):
        """Compare the matcher output with the values expected by the fixture."""
        pm = self._get_plugin_match(plugin, yaml_dict)

        # More than one value could be asserted, then we need to create this dict
        asserter = {
            k: v for k, v in yaml_dict.items() if k in ["version", "name", "presence"]
        }
        assert pm == create_pm(**asserter)

    def test_dom_matches(self, plugin, yaml_dict):
        """Evaluate the plugin's dom matchers against the fixture's JS code.

        The JS snippet is executed in a ``dukpy`` interpreter with an empty
        ``window`` object. The first matcher that is present and has a
        version statement must yield the fixture's ``version``; a present
        matcher without a version statement requires ``presence`` to be
        truthy in the fixture.
        """
        was_asserted = False  # At least one assert was done
        js_code = yaml_dict["dom"]

        interpreter = dukpy.JSInterpreter()
        # Create window browser object
        interpreter.evaljs("window = {};")
        interpreter.evaljs(js_code)

        for matcher in plugin.get_matchers("dom"):
            check_statement, version_statement = matcher

            is_present = interpreter.evaljs(check_statement)
            if is_present is None:
                continue

            if version_statement:
                version = interpreter.evaljs(version_statement)
                assert yaml_dict["version"] == version

                was_asserted = True
                break

            assert yaml_dict["presence"]
            was_asserted = True

        assert was_asserted
def create_har_entry(field, yaml_dict=None, value=None):
    """Build a fake HAR entry carrying the content for one matcher type.

    The content is ``yaml_dict[field]`` when that lookup works, otherwise
    ``value``. It must be truthy.

    ``url`` content is stored as both the request and the response URL,
    ``body`` and ``xpath`` content as the response text, and ``header``
    content as the single element of the response header list. Any other
    ``field`` returns the entry untouched.
    """
    entry = tree()

    try:
        payload = yaml_dict[field]
    except (TypeError, KeyError):
        # No dict given, or the dict lacks the field: use the explicit value
        payload = value

    assert payload

    if field == "url":
        for side in ("request", "response"):
            entry[side]["url"] = payload
    elif field in ("body", "xpath"):
        # Both matcher types read the response text
        entry["response"]["content"]["text"] = payload
    elif field == "header":
        entry["response"]["headers"] = [payload]

    return entry
class TestDockerSplashManager:
    """Tests for ``DockerSplashManager``.

    Several tests call ``setup``/``teardown`` and query ``docker_cli``
    without mocking them, so they presumably need a reachable Docker
    daemon. Those tests release the containers in a ``finally`` clause so
    that a failed assertion does not leave them behind.
    """

    def test_init(self):
        """A new manager has an instance registry and handles errors."""
        dm = DockerSplashManager()
        assert hasattr(dm, "_instances")
        assert dm.handles_errors

    def test_docker_cli(self):
        """Two reads of ``docker_cli`` give equal clients."""
        dm = DockerSplashManager()
        cli_1 = dm.docker_cli
        cli_2 = dm.docker_cli

        assert cli_1 == cli_2

    def test_docker_cli_with_exception(self):
        """A ``DockerException`` from ``from_env`` becomes ``DockerStartError``."""
        dm = DockerSplashManager()

        with patch("detectem.splash.docker.from_env", side_effect=DockerException()):
            with pytest.raises(DockerStartError):
                dm.docker_cli

    def test_wait_container_valid_case(self):
        """``_wait_container`` returns ``None`` when ``requests.get`` is mocked."""
        dm = DockerSplashManager()

        # Set manager metadata
        container_name = "c-1"
        dm._instances[container_name] = {"url": "http://localhost"}

        # Mock requests response
        class PingResponse:
            status_code = 200

        with patch.object(requests, "get", return_value=lambda u: PingResponse()):
            assert dm._wait_container(container_name) is None

    def test_wait_container_with_exception(self):
        """Without a mocked ``requests.get`` the wait raises ``DockerStartError``."""
        dm = DockerSplashManager()

        # Set manager metadata
        container_name = "c-1"
        dm._instances[container_name] = {"url": "http://localhost"}

        with pytest.raises(DockerStartError):
            dm._wait_container(container_name)

    def test_setup_with_inexistent_docker_image(self):
        """``setup`` raises ``DockerStartError`` for an unknown image name."""
        dm = DockerSplashManager()

        with patch("detectem.splash.DOCKER_SPLASH_IMAGE", "inexistent_1_2_3"):
            with pytest.raises(DockerStartError):
                dm.setup(n_instances=3)

    def test_setup_valid_case(self):
        """``setup`` registers the requested number of running containers."""
        dm = DockerSplashManager()
        n_instances = 3

        with patch("detectem.splash.NUMBER_OF_SPLASH_INSTANCES", n_instances):
            dm.setup(n_instances)
            try:
                assert len(dm._instances) == n_instances

                # Test that containers are ready
                for container_name, container_data in dm._instances.items():
                    assert not container_data["in_use"]
                    assert not container_data["errors"]

                    c = dm.docker_cli.containers.get(container_name)
                    assert c.status == "running"
            finally:
                # Stop the containers even when an assertion above failed
                dm.teardown()

    def test_teardown(self):
        """After ``teardown`` every registered container has exited."""
        dm = DockerSplashManager()
        n_instances = 3

        with patch("detectem.splash.NUMBER_OF_SPLASH_INSTANCES", n_instances):
            dm.setup(n_instances=n_instances)
            dm.teardown()

        # Test that containers are stopped
        for container_name in dm._instances:
            c = dm.docker_cli.containers.get(container_name)
            assert c.status == "exited"

    def test_handle_error_normal_case(self):
        """A single ``handle_error`` call increments the error counter."""
        dm = DockerSplashManager()
        container_name = "c-1"
        dm._instances[container_name] = {"url": "http://localhost", "errors": 0}

        dm.handle_error(container_name)

        assert dm._instances[container_name]["errors"] == 1

    def test_handle_error_with_restart(self):
        """The third ``handle_error`` call resets the counter and restarts."""
        dm = DockerSplashManager()
        dm.setup(n_instances=3)

        try:
            container_name = list(dm._instances.keys())[0]

            dm.handle_error(container_name)
            dm.handle_error(container_name)
            assert dm._instances[container_name]["errors"] == 2

            dm.handle_error(container_name)
            assert dm._instances[container_name]["errors"] == 0

            # Check that container was restarted
            events = dm.docker_cli.events(
                decode=True, since=datetime.utcnow() - timedelta(seconds=1)
            )
            try:
                for event in events:
                    if event.get("status") == "restart":
                        assert event["Actor"]["Attributes"]["name"] == container_name
                        break
            finally:
                # Always release the event stream
                events.close()
        finally:
            dm.teardown()

    def test_setup_with_container_not_starting(self):
        """A container that fails to start is not counted as available."""
        dm = DockerSplashManager()
        n_instances = 3

        def _conditional_mock(*args, **kwargs):
            # Raise exception for only one container
            if args[0] == "splash-detectem-0":
                raise DockerStartError()

        with patch("detectem.splash.NUMBER_OF_SPLASH_INSTANCES", n_instances):
            with patch.object(dm, "_wait_container", _conditional_mock):
                dm.setup(n_instances)
                try:
                    assert dm.get_number_of_available_instances() == 2
                finally:
                    dm.teardown()
class TestSplashManagerInterface:
    """Tests for behaviour implemented by ``SplashManagerInterface`` itself."""

    def test_assign_instance_valid_case(self):
        """``assign_instance`` marks the instance in use only inside the block."""

        class TestManager(SplashManagerInterface):
            # Minimal stand-ins that simply hand back the receiver

            def handle_error(self):
                return self

            def setup(self):
                return self

            def teardown(self):
                return self

        manager = TestManager()

        # Set manager metadata
        container_name = "c-1"
        url = "http://localhost"
        manager._instances[container_name] = {"url": url, "in_use": False}

        # Mock requests response
        class PingResponse:
            status_code = 200

        fake_get = patch.object(requests, "get", return_value=lambda u: PingResponse())
        with fake_get:
            with manager.assign_instance() as (assigned_name, assigned_url):
                assert assigned_name == container_name
                assert assigned_url == url
                assert manager._instances[container_name]["in_use"]

            # Leaving the block must release the instance
            assert not manager._instances[container_name]["in_use"]
def test_get_detection_ok(mocker):
    """``get_detection_results`` returns the URL and the detector's results.

    ``get_response`` and ``Detector`` are patched in ``detectem.cli`` so the
    test neither fetches the page nor runs the real detection.
    """

    class FakeDetector:
        """Stand-in detector that always reports the same results."""

        def __init__(self, *args):
            pass

        def get_results(self, **kwargs):
            return [1, 2, 3]

    mocker.patch("detectem.cli.get_response", return_value=1)
    # Calling the patched ``Detector`` yields a real instance, so
    # ``get_results`` is invoked as a bound method like on the real class
    mocker.patch("detectem.cli.Detector", return_value=FakeDetector())

    rs = get_detection_results("http://domain.tld", timeout=30, metadata=True)
    assert rs == {"url": "http://domain.tld", "softwares": [1, 2, 3]}
def res_text(text):
    """Return a minimal HAR entry whose response text is ``text``."""
    content = {"text": text}
    return {"response": {"content": content}}


def req_res_url(url):
    """Return a minimal HAR entry with ``url`` as request and response URL."""
    return {side: {"url": url} for side in ("request", "response")}


def res_server_header(value):
    """Return a minimal HAR entry with a single ``Server`` response header."""
    server_header = {"name": "Server", "value": value}
    return {"response": {"headers": [server_header]}}
class TestMatchers:
    """Check version, presence and name extraction for every matcher class.

    NOTE(review): the named groups (``version`` / ``name``) and the ``<a>``
    markup in the XPath cases are reconstructed; confirm them against
    ``detectem.matchers`` and the upstream test file.
    """

    # A named group is required: ``(?P[...])`` is not a valid pattern
    version_re = r"foo-(?P<version>[\d\.]+)"
    presence_re = "foo"
    name_re = r"foo-(?P<name>\w+)"

    @pytest.mark.parametrize(
        "matcher_class,entry,matcher,version",
        [
            (UrlMatcher, req_res_url("http://d.tld/foo-1.1"), version_re, "1.1"),
            (UrlMatcher, req_res_url("http://d.tld/foo"), version_re, None),
            (BodyMatcher, res_text("foo-1.1"), version_re, "1.1"),
            (BodyMatcher, res_text("bar-1.1"), version_re, None),
            (
                HeaderMatcher,
                res_server_header("foo-1.1"),
                ("Server", version_re),
                "1.1",
            ),
            (HeaderMatcher, res_server_header("bar-1.1"), ("Server", version_re), None),
            (
                # ``//a/text()`` needs an <a> element to select from
                XPathMatcher,
                res_text("<a>foo-1.1</a>"),
                ("//a/text()", version_re),
                "1.1",
            ),
            (
                XPathMatcher,
                res_text("<a>bar-1.1</a>"),
                ("//a/text()", version_re),
                None,
            ),
        ],
    )  # yapf: disable
    def test_get_version(self, matcher_class, entry, matcher, version):
        """``get_info`` reports the captured version, or none without a match."""
        assert matcher_class.get_info(entry, matcher) == create_pm(version=version)

    @pytest.mark.parametrize(
        "matcher_class,entry,matcher,presence",
        [
            (UrlMatcher, req_res_url("http://d.tld/foo"), presence_re, True),
            (UrlMatcher, req_res_url("http://d.tld/bar"), presence_re, False),
            (BodyMatcher, res_text("foo"), presence_re, True),
            (BodyMatcher, res_text("bar"), presence_re, False),
            (HeaderMatcher, res_server_header("foo"), ("Server", presence_re), True),
            (HeaderMatcher, res_server_header("bar"), ("Server", presence_re), False),
            (XPathMatcher, res_text("<a>foo</a>"), ("//a/text()", presence_re), True),
            (XPathMatcher, res_text("<a>bar</a>"), ("//a/text()", presence_re), False),
        ],
    )  # yapf: disable
    def test_get_presence(self, matcher_class, entry, matcher, presence):
        """``get_info`` reports presence for a pattern without named groups."""
        assert matcher_class.get_info(entry, matcher) == create_pm(presence=presence)

    @pytest.mark.parametrize(
        "matcher_class,entry,matcher,name",
        [
            (UrlMatcher, req_res_url("http://d.tld/foo-core"), name_re, "core"),
            (UrlMatcher, req_res_url("http://d.tld/bar-core"), name_re, None),
            (BodyMatcher, res_text("foo-core"), name_re, "core"),
            (BodyMatcher, res_text("bar-core"), name_re, None),
            (HeaderMatcher, res_server_header("foo-core"), ("Server", name_re), "core"),
            (HeaderMatcher, res_server_header("bar-core"), ("Server", name_re), None),
            (
                XPathMatcher,
                res_text("<a>foo-core</a>"),
                ("//a/text()", name_re),
                "core",
            ),
            (XPathMatcher, res_text("<a>bar-core</a>"), ("//a/text()", name_re), None),
        ],
    )  # yapf: disable
    def test_get_name(self, matcher_class, entry, matcher, name):
        """``get_info`` reports the captured name, or none without a match."""
        assert matcher_class.get_info(entry, matcher) == create_pm(name=name)
@pytest.mark.parametrize(
    "url,result",
    [
        ("http://domain.tld/font.ttf", False),
        ("http://domain.tld/index.html", True),
    ],
)
def test_is_url_allowed(url, result):
    """``is_url_allowed`` rejects the font URL and accepts the HTML page."""
    allowed = is_url_allowed(url)
    assert allowed == result
def test_get_response(monkeypatch):
    """``get_response`` returns the HAR and softwares from a 200 reply.

    ``requests.get`` and ``requests.post`` are replaced, and
    ``SETUP_SPLASH`` is switched off, so nothing is actually requested.
    """

    class TestResponse:
        # Canned successful reply
        status_code = 200

        def json(self):
            return {"har": {}, "softwares": [], "scripts": {}}

    def _fake_get(url, timeout=None):
        return TestResponse()

    def _fake_post(v):
        return v

    monkeypatch.setattr(requests, "get", _fake_get)
    monkeypatch.setattr(requests, "post", _fake_post)
    monkeypatch.setattr(detectem.settings, "SETUP_SPLASH", False)

    response = get_response("http://domain.tld", PluginCollection())

    assert response
    for expected_key in ("har", "softwares"):
        assert expected_key in response
"content": {"text": "blab", "mimeType": "image/gif"} 141 | }, 142 | } 143 | ] 144 | } 145 | }, 146 | 0, 147 | ), 148 | ( 149 | { 150 | "log": { 151 | "entries": [ 152 | { 153 | "request": {"url": "http://domain.tld/"}, 154 | "response": {"content": {"text": "blab"}}, 155 | } 156 | ] 157 | } 158 | }, 159 | 1, 160 | ), 161 | ], 162 | ) 163 | def test_get_valid_har(har_data, result_len): 164 | assert len(get_valid_har(har_data)) == result_len 165 | 166 | 167 | def test_get_evaljs_error(): 168 | json_data = { 169 | "errors": { 170 | "evaljs": "ScriptError(" 171 | "{" 172 | "'js_error_type': 'ReferenceError', " 173 | "'message': 'JS error: \"ReferenceError: Can\\'t find variable: softwareData\"', " # noqa: E501 174 | "'js_error': \"ReferenceError: Can't find variable: softwareData\", " 175 | "'js_error_message': \"Can't find variable: softwareData\", " 176 | "'splash_method': 'evaljs', " 177 | "'type': 'JS_ERROR'" 178 | "},)" 179 | } 180 | } 181 | message = get_evaljs_error(json_data) 182 | assert message == "ReferenceError: Can't find variable: softwareData" 183 | -------------------------------------------------------------------------------- /tests/test_results.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from detectem.results import Result, ResultCollection 4 | from detectem.settings import GENERIC_TYPE, HINT_TYPE, INDICATOR_TYPE 5 | 6 | 7 | class TestResultCollection: 8 | @staticmethod 9 | def _assert_results(detected, results): 10 | c = ResultCollection() 11 | for d in detected: 12 | c.add_result(d) 13 | assert set(c.get_results()) == set(results) 14 | 15 | @pytest.mark.parametrize( 16 | "detected,results", 17 | [ 18 | ( 19 | [ 20 | Result("pluginA", "1.1"), 21 | Result("pluginB", "3.8.7"), 22 | Result("pluginC", "4.0"), 23 | ], 24 | [ 25 | Result("pluginA", "1.1"), 26 | Result("pluginB", "3.8.7"), 27 | Result("pluginC", "4.0"), 28 | ], 29 | ), 30 | ( 31 | [ 32 | Result("pluginA", "1.3"), 33 | 
Result("pluginA", "1.2"), 34 | Result("pluginA", "1.1"), 35 | ], 36 | [ 37 | Result("pluginA", "1.1"), 38 | Result("pluginA", "1.2"), 39 | Result("pluginA", "1.3"), 40 | ], 41 | ), 42 | ( 43 | [ 44 | Result("pluginA", "1.1"), 45 | Result("pluginC", type=HINT_TYPE), 46 | Result("pluginB", type=INDICATOR_TYPE), 47 | Result("pluginD", type=GENERIC_TYPE), 48 | ], 49 | [ 50 | Result("pluginA", "1.1"), 51 | Result("pluginB", type=INDICATOR_TYPE), 52 | Result("pluginC", type=HINT_TYPE), 53 | Result("pluginD", type=GENERIC_TYPE), 54 | ], 55 | ), 56 | ], 57 | ) 58 | def test_get_all_detected_plugins(self, detected, results): 59 | self._assert_results(detected, results) 60 | 61 | @pytest.mark.parametrize( 62 | "detected,results", 63 | [ 64 | ( 65 | [ 66 | Result("pluginA", "1.1"), 67 | Result("pluginA", "1.2"), 68 | Result("pluginA", "1.1"), 69 | ], 70 | [Result("pluginA", "1.1"), Result("pluginA", "1.2")], 71 | ), 72 | ( 73 | [ 74 | Result("pluginA", "1.1"), 75 | Result("pluginA", type=INDICATOR_TYPE), 76 | Result("pluginA", type=HINT_TYPE), 77 | ], 78 | [Result("pluginA", "1.1")], 79 | ), 80 | ( 81 | [Result("pluginB", type=HINT_TYPE), Result("pluginB", type=HINT_TYPE)], 82 | [Result("pluginB", type=HINT_TYPE)], 83 | ), 84 | ( 85 | [ 86 | Result("pluginB", type=INDICATOR_TYPE), 87 | Result("pluginB", type=INDICATOR_TYPE), 88 | ], 89 | [Result("pluginB", type=INDICATOR_TYPE)], 90 | ), 91 | ( 92 | [ 93 | Result("pluginB", type=INDICATOR_TYPE), 94 | Result("pluginB", type=HINT_TYPE), 95 | ], 96 | [Result("pluginB", type=INDICATOR_TYPE)], 97 | ), 98 | ( 99 | [ 100 | Result("pluginB", type=INDICATOR_TYPE), 101 | Result("pluginB", type=GENERIC_TYPE), 102 | ], 103 | [Result("pluginB", type=INDICATOR_TYPE)], 104 | ), 105 | ], 106 | ) 107 | def test_remove_duplicated_results(self, detected, results): 108 | self._assert_results(detected, results) 109 | -------------------------------------------------------------------------------- /tests/test_utils.py: 
@pytest.mark.parametrize(
    "entry,result",
    [
        # Only the request URL is available
        ({"request": {"url": "http://a"}}, "http://a"),
        # The response URL wins when both are present
        (
            {"request": {"url": "http://a"}, "response": {"url": "http://b"}},
            "http://b",
        ),
    ],
)
def test_get_url(entry, result):
    """``get_url`` returns the expected URL for each HAR entry shape."""
    url = get_url(entry)
    assert url == result
-------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36,py37,py38 3 | 4 | [testenv] 5 | setenv = 6 | PYTHONPATH = {toxinidir}:{toxinidir}/detectem 7 | deps = 8 | -r{toxinidir}/requirements/base.txt 9 | -r{toxinidir}/requirements/devel.txt 10 | -r{toxinidir}/requirements/tests.txt 11 | commands = 12 | pip install -U pip 13 | py.test --basetemp={envtmpdir} 14 | --------------------------------------------------------------------------------