├── .dockerignore ├── .githooks ├── check-branch-name.py └── pre-commit.linux.sample ├── .github ├── local-test │ └── run-test.sh ├── logo │ ├── blue │ │ ├── logo_desc │ │ │ ├── 192x192.png │ │ │ ├── 256x256.png │ │ │ ├── 320x320.png │ │ │ ├── 480x480.png │ │ │ └── 512x512.png │ │ └── logo_no_desc │ │ │ ├── 192x192.png │ │ │ ├── 256x256.png │ │ │ ├── 320x320.png │ │ │ ├── 480x480.png │ │ │ └── 512x512.png │ └── white │ │ ├── logo_desc │ │ ├── 192x192.png │ │ ├── 256x256.png │ │ ├── 320x320.png │ │ ├── 480x480.png │ │ └── 512x512.png │ │ └── logo_no_desc │ │ ├── 192x192.png │ │ ├── 256x256.png │ │ ├── 320x320.png │ │ ├── 480x480.png │ │ └── 512x512.png └── workflows │ ├── formatpythoncode.yml │ ├── pythonpackage.yml │ └── pythonpublish.yml ├── .gitignore ├── Dockerfile.test ├── LICENSE ├── Makefile ├── README.md ├── docs ├── Makefile ├── _static │ └── .gitkeep ├── conf.py ├── index.rst └── make.bat ├── requirements.txt ├── richkit ├── __init__.py ├── analyse │ ├── __init__.py │ ├── analyse.py │ ├── segment.py │ └── util.py ├── lookup │ ├── __init__.py │ ├── geo.py │ └── util.py ├── retrieve │ ├── __init__.py │ ├── cert_sh.py │ ├── ctlogs.py │ ├── data │ │ ├── .gitkeep │ │ └── categories_list.txt │ ├── dns.py │ ├── symantec.py │ ├── urlvoid.py │ ├── whois.py │ └── x509.py └── test │ ├── __init__.py │ ├── analyse │ ├── __init__.py │ └── test_analyse.py │ ├── lookup │ ├── __init__.py │ ├── test_geo.py │ └── test_util.py │ └── retrieve │ ├── __init__.py │ ├── test_ctlogs.py │ ├── test_dns.py │ ├── test_symantec.py │ ├── test_urlvoid.py │ ├── test_whois.py │ └── test_x509.py └── setup.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .pytest_cache 2 | .githooks 3 | .docs 4 | .github/logo 5 | .github/workflows 6 | 7 | 8 | .env 9 | .venv 10 | env/ 11 | venv/ 12 | ENV/ 13 | env.bak/ 14 | venv.bak/ 15 | 16 | -------------------------------------------------------------------------------- /.githooks/check-branch-name.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import re 3 | import sys 4 | 5 | 6 | def check(name): 7 | """Check a git branch name against gitflow naming conventions. 8 | 9 | This is most likely the function you are looking for. 10 | 11 | """ 12 | if name in ( # First level only branches 13 | 'master', 14 | 'develop', 15 | ): 16 | return True 17 | elif len(name.split('/')) == 2: 18 | # some have two levels separated by / 19 | return checkSecondLevel(name) 20 | else: 21 | # Default 22 | print(f'Error: Did not recognise "{name}" as a valid branch.') 23 | return False 24 | 25 | 26 | def checkLen(string, min_len, max_len): 27 | if len(string) < min_len: 28 | print( 29 | f'Error: {string} is too short' 30 | f' (it is {len(string)}, minimum is {min_len})' 31 | ) 32 | return False 33 | if len(string) > max_len: 34 | print( 35 | f'Error: {string} is too long' 36 | f' (it is {len(string)}, maximum is {max_len})' 37 | ) 38 | return False 39 | else: 40 | return True 41 | 42 | 43 | def checkSecondLevel(name): 44 | """Checks the name to be a valid gitflow branch name containing a `/`. 45 | 46 | This is intended for internal use, and assumes a single `/` to be 47 | present in `name`. 48 | 49 | """ 50 | category, label = name.split('/') 51 | 52 | if category in ( # valid categories 53 | 'feature', 54 | 'hotfix', 55 | ): 56 | return checkLabel(label) 57 | elif category in ( # Not currently validating release branch names 58 | 'release', 59 | ): 60 | return True 61 | else: 62 | print(f'Error: Did not recognise "{category}" as a valid category') 63 | return False 64 | 65 | 66 | def checkLabel(label): 67 | """Checks the label to have a description of one or more words 68 | (lowercase alphanumerics), joined by a dash (`-`), followed by an 69 | issue reference. 
70 | 71 | Example: word-and-numb3r-#1 72 | 73 | """ 74 | # Description 75 | desc_re = r'(?P<description>[a-z0-9]+(?:-[a-z0-9]+)*)' # one or more words 76 | desc_re = r'^' + desc_re # must be at beginning 77 | m = re.search(desc_re, label) 78 | if not m: 79 | print( 80 | f'Error: No valid description in "{label}"' 81 | f' (Expected it to start with lowercase alphanumeric and dashes' 82 | f' like this: ex4mple-description)' 83 | ) 84 | return False 85 | 86 | if not checkLen(m.groupdict()['description'], 10, 25): 87 | return False 88 | 89 | # Issue reference 90 | issue_re = r'(?P<issue>#[0-9]+)' # hashtag and integer 91 | issue_re = issue_re + r'$' # must be at end 92 | if not re.search(issue_re, label): 93 | print( 94 | f'Error: No issue reference in "{label}"' 95 | f' (Expected it to in like this: ...-#1)' 96 | ) 97 | return False 98 | 99 | # Dash separator 100 | label_re = desc_re + r'-' + issue_re 101 | if not re.search(label_re, label): 102 | print( 103 | f'Error: Missing dash between description and issue reference ' 104 | f' in "{label}"' 105 | ) 106 | return False 107 | 108 | return True # no problems found 109 | 110 | 111 | if __name__ == "__main__": 112 | 113 | parser = argparse.ArgumentParser( 114 | description='Validate branch name according to gitflow', 115 | ) 116 | parser.add_argument( 117 | '-t', '--test', dest='test', action='store_const', 118 | const=True, default=False, 119 | help='Run the built in tests and exit', 120 | ) 121 | parser.add_argument( 122 | 'name', metavar='NAME', type=str, 123 | help='The branch name to check' 124 | ) 125 | args = parser.parse_args() 126 | 127 | if not args.test: 128 | success = check(args.name) 129 | sys.exit(not success) 130 | 131 | print('Starting built-in self-testing') 132 | print('Expect error messages, but not AssertionError\'s') 133 | assert check('master') 134 | assert check('develop') 135 | assert not check('random') # no custom at top level 136 | assert not check('alkshjdg') # no custom at top level 137 | assert not 
check('master/asdasdasdasdasdasd') # nothing below master 138 | assert not check('develop/asdasdasdasdasdas') # nothing below develop 139 | assert check('feature/some-feature-#9') # good 140 | assert not check('feature/2-shrt-fe#1') # too short 141 | assert not check('feature/very-long-description-here-#1') # too long 142 | print('Done - either all tests passed or you disable `assert`') 143 | -------------------------------------------------------------------------------- /.githooks/pre-commit.linux.sample: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## Would be nice to have linting before commit 4 | if ! [ -x "$(command -v autopep8)" ] || [ "$(pip3 list | 5 | cut -d " " -f 1 | 6 | grep -xF "$package_name" | grep autopep8)" != "autopep8" ] 7 | then 8 | echo 'autopep8 is NOT installed, linting test may fail on CI ... ' 9 | echo 'consider to install autopep8, you may use following commands: ' 10 | echo 'Debian: [ sudo apt-get install -y python-autopep8 ] ' 11 | echo 'MacOS: [ brew install autopep8 ]' 12 | echo 'You may consider to install it into virtual environment of your project:' 13 | echo 'source venv/bin/activate' 14 | echo 'pip3 install autopep8' 15 | echo 'autopep8 should be available in your system, to do not face with linting problem.' 16 | exit 1 17 | else 18 | echo 'Linting...' 19 | echo 'Going to root directory of the project' 20 | cd ../richkit 21 | autopep8 --in-place --recursive --max-line-length=100 --exclude docs/source/conf.py,venv,__pycache__,old,build,dist . 22 | fi 23 | 24 | python3 .githooks/check-branch-name.py "$(git rev-parse --abbrev-ref HEAD)" 25 | exit $? 
-------------------------------------------------------------------------------- /.github/local-test/run-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | NC='\033[0m' 4 | RED='\033[0;31m' 5 | ORANGE='\033[0;33m' 6 | GREEN='\033[0;32m' 7 | 8 | if [ "$MAXMIND_LICENSE_KEY" = "" ] ; then 9 | echo "${ORANGE} Warning: Environment variable for MAXMINDDB could not be found, proceeding without it, check README file " 10 | fi 11 | # change directory to /richkit 12 | 13 | cd /richkit 14 | 15 | echo "${GREEN}1. Checking flake8 linting ... " 16 | # test that number of violations does not increase 17 | FLAKE8_ERROR_CNT=$(flake8 . -qq --count --exit-zero --max-complexity=10 --max-line-length=127 --exclude venv,__pycache__,docs/source/conf.py,old,build,dist) 18 | FLAKE8_ERROR_LIMIT=25 19 | if [ "$FLAKE8_ERROR_CNT" -gt "$FLAKE8_ERROR_LIMIT" ] ; then 20 | echo "${RED}Failed because the number of errors from flake8 increased (This: $FLAKE8_ERROR_CNT Previously: $FLAKE8_ERROR_LIMIT)" 1>&2 21 | false 22 | exit 1 23 | fi 24 | echo "${ORANGE}Number of validation errors from flake8 is: $FLAKE8_ERROR_CNT (Limit is: $FLAKE8_ERROR_LIMIT)" 25 | 26 | 27 | echo "${GREEN}2. Testing module .... 
" 28 | echo "${NC}" 29 | coverage run --source=richkit -m pytest -Werror /richkit/richkit 30 | 31 | -------------------------------------------------------------------------------- /.github/logo/blue/logo_desc/192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_desc/192x192.png -------------------------------------------------------------------------------- /.github/logo/blue/logo_desc/256x256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_desc/256x256.png -------------------------------------------------------------------------------- /.github/logo/blue/logo_desc/320x320.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_desc/320x320.png -------------------------------------------------------------------------------- /.github/logo/blue/logo_desc/480x480.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_desc/480x480.png -------------------------------------------------------------------------------- /.github/logo/blue/logo_desc/512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_desc/512x512.png -------------------------------------------------------------------------------- /.github/logo/blue/logo_no_desc/192x192.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_no_desc/192x192.png -------------------------------------------------------------------------------- /.github/logo/blue/logo_no_desc/256x256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_no_desc/256x256.png -------------------------------------------------------------------------------- /.github/logo/blue/logo_no_desc/320x320.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_no_desc/320x320.png -------------------------------------------------------------------------------- /.github/logo/blue/logo_no_desc/480x480.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_no_desc/480x480.png -------------------------------------------------------------------------------- /.github/logo/blue/logo_no_desc/512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_no_desc/512x512.png -------------------------------------------------------------------------------- /.github/logo/white/logo_desc/192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_desc/192x192.png 
-------------------------------------------------------------------------------- /.github/logo/white/logo_desc/256x256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_desc/256x256.png -------------------------------------------------------------------------------- /.github/logo/white/logo_desc/320x320.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_desc/320x320.png -------------------------------------------------------------------------------- /.github/logo/white/logo_desc/480x480.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_desc/480x480.png -------------------------------------------------------------------------------- /.github/logo/white/logo_desc/512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_desc/512x512.png -------------------------------------------------------------------------------- /.github/logo/white/logo_no_desc/192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_no_desc/192x192.png -------------------------------------------------------------------------------- /.github/logo/white/logo_no_desc/256x256.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_no_desc/256x256.png -------------------------------------------------------------------------------- /.github/logo/white/logo_no_desc/320x320.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_no_desc/320x320.png -------------------------------------------------------------------------------- /.github/logo/white/logo_no_desc/480x480.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_no_desc/480x480.png -------------------------------------------------------------------------------- /.github/logo/white/logo_no_desc/512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_no_desc/512x512.png -------------------------------------------------------------------------------- /.github/workflows/formatpythoncode.yml: -------------------------------------------------------------------------------- 1 | name: Format python code 2 | on: push 3 | jobs: 4 | autopep8: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v1 8 | - name: autopep8 9 | uses: peter-evans/autopep8@v1.1.0 10 | with: 11 | args: --recursive --in-place --aggressive --aggressive **/*.py 12 | - name: Create Pull Request 13 | uses: peter-evans/create-pull-request@v1 14 | with: 15 | token: ${{ secrets.GITHUB_TOKEN }} 16 | commit-message: autopep8 action fixes 17 | author-email: mrturkmen06@users.noreply.github.com 18 | author-name: Ahmet Turkmen 19 | title: Fixes by autopep8 action 20 | 
body: This is an auto-generated PR with fixes by autopep8. 21 | labels: autopep8, automated pr 22 | branch: autopep8-patches -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - 'README.md' 7 | - 'LICENCE' 8 | jobs: 9 | lint: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python ${{ matrix.python-version }} 14 | uses: actions/setup-python@v1 15 | with: 16 | python-version: 3.7 17 | - name: Install flake8 18 | run: pip install flake8 19 | - name: Check for syntax errors or undefined names 20 | run: | 21 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 22 | - name: Lint with flake8 23 | run: | 24 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 25 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 26 | - name: Check that number of pep8 violations is not going up 27 | run: | 28 | # test that number of violations does not increase 29 | FLAKE8_ERROR_CNT=$(flake8 . 
-qq --count --exit-zero --max-complexity=10 --max-line-length=127 --exclude venv,__pycache__,docs/source/conf.py,old,build,dist) 30 | FLAKE8_ERROR_LIMIT=25 31 | if [ "$FLAKE8_ERROR_CNT" -gt "$FLAKE8_ERROR_LIMIT" ] ; then 32 | echo "Failed because the number of errors from flake8 increased (This: $FLAKE8_ERROR_CNT Previously: $FLAKE8_ERROR_LIMIT)" 1>&2 33 | false 34 | fi 35 | echo "Number of validation errors from flake8 is: $FLAKE8_ERROR_CNT (Limit is: $FLAKE8_ERROR_LIMIT)" 36 | 37 | formalities: 38 | runs-on: ubuntu-latest 39 | steps: 40 | - uses: actions/checkout@v1 41 | - name: Set up Python ${{ matrix.python-version }} 42 | uses: actions/setup-python@v1 43 | with: 44 | python-version: 3.7 45 | - name: Extract branch name 46 | shell: bash 47 | run: echo "::set-env name=BRANCH_NAME::$(echo ${GITHUB_REF#refs/heads/})" 48 | - name: Check branch name 49 | run: | 50 | echo "Checking ${BRANCH_NAME}..." 51 | python3 .githooks/check-branch-name.py "$BRANCH_NAME" 52 | 53 | test: 54 | runs-on: ${{ matrix.os }} 55 | strategy: 56 | max-parallel: 4 57 | matrix: 58 | python-version: [3.7] 59 | os: [windows-latest,ubuntu-latest, macOS-latest] 60 | 61 | steps: 62 | - uses: actions/checkout@v1 63 | - name: Set up Python ${{ matrix.python-version }} 64 | uses: actions/setup-python@v1 65 | with: 66 | python-version: ${{ matrix.python-version }} 67 | 68 | - name: Install dependencies 69 | run: | 70 | python -m pip install --upgrade pip 71 | pip install -r requirements.txt 72 | pip install coverage pytest sphinx 73 | 74 | - name: Test with pytest 75 | env: 76 | MAXMIND_LICENSE_KEY: ${{ secrets.MAXMIND_LICENSE_KEY }} 77 | run: | 78 | coverage run --source=richkit -m pytest -Werror --ignore src/python-whois 79 | - name: Coverage report 80 | run: | 81 | coverage report --fail-under=79 82 | 83 | - name: Doctest 84 | env: 85 | MAXMIND_LICENSE_KEY: ${{ secrets.MAXMIND_LICENSE_KEY }} 86 | run: | 87 | python -m doctest -v README.md 88 | cd docs 89 | make doctest 90 | - name: Documentation 
coverage 91 | env: 92 | MAXMIND_LICENSE_KEY: "DUMMY: A valid license is not needed here" 93 | run: | 94 | cd docs 95 | make coverage 96 | python -c "from pathlib import Path; print(Path('_build/coverage/python.txt').read_text())" # this prints dosctring coverage report 97 | - name: Build documentation 98 | run: | 99 | cd docs 100 | make html 101 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*.*.*' 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Set up Python 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: '3.7' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PIP_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PIP_TOKEN }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | ## Initialize release process 28 | - name: Checkout code 29 | uses: actions/checkout@master 30 | - name: Create Release 31 | id: create_release 32 | uses: actions/create-release@v1.0.0 33 | env: 34 | GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }} 35 | with: 36 | tag_name: ${{ github.ref }} 37 | release_name: Release ${{ github.ref }} 38 | draft: false 39 | prerelease: false 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .pytest_cache/ 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 
| parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | .idea/* 28 | .idea 29 | 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | .DS_Store 53 | richkit/test/.DS_Store 54 | # Local data 55 | richkit/retrieve/data/*.txt 56 | richkit/test/analyse/data/*.csv 57 | # Translations 58 | *.mo 59 | *.pot 60 | categories_list.txt 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # celery beat schedule file 87 | celerybeat-schedule 88 | 89 | # SageMath parsed files 90 | *.sage.py 91 | 92 | # Environments 93 | .env 94 | .venv 95 | env/ 96 | venv/ 97 | ENV/ 98 | env.bak/ 99 | venv.bak/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | .idea/* 114 | richkit/lookup/data/* 115 | 116 | # caches for of resources fetched from Internet, used in richkit 117 | richkit/retrieve/data/categorized_urls.txt -------------------------------------------------------------------------------- /Dockerfile.test: -------------------------------------------------------------------------------- 1 | FROM ubuntu 2 | 3 | # provide 
environment variable as MAXMIND_LICENSE_KEY 4 | # when you run docker image see readme 5 | 6 | # git is required to fetch given requirement in the requirements.txt file 7 | # for unmerged whois library 8 | 9 | RUN apt-get update && apt-get install -y python3 python3-pip git 10 | 11 | COPY requirements.txt /richkit/requirements.txt 12 | 13 | COPY richkit /richkit/richkit 14 | 15 | 16 | RUN pip3 install -r /richkit/requirements.txt 17 | 18 | RUN pip3 install coverage pytest sphinx flake8 19 | 20 | COPY .github/local-test/run-test.sh /richkit/richkit/run-test.sh 21 | 22 | CMD ["/richkit/richkit/run-test.sh"] 23 | 24 | 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Aalborg University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all prep-dev venv clean lint test docker-test 2 | 3 | # virtual environment for development 4 | VENV_NAME?=venv 5 | VENV_ACTIVATE=. $(VENV_NAME)/bin/activate 6 | PYTHON=${VENV_NAME}/bin/python3 7 | # help messages for make, it runs in `make` or `make all` 8 | all: 9 | @echo "\033[92m make prep-dev \033[0m" 10 | @echo "---> Prepares dev environment, use only once" 11 | @echo "\033[92m make test \033[0m" 12 | @echo "---> Runs test cases in virtual environment" 13 | @echo "\033[92m make lint \033[0m" 14 | @echo "---> Linting project with autopep8" 15 | @echo "\033[92m make clean \033[0m" 16 | @echo "---> Cleans project cache and other stuffs" 17 | @echo "\033[92m make docker-test \033[0m" 18 | @echo "---> Runs test cases in docker environment" 19 | 20 | 21 | prep-dev: 22 | python3 -m pip install virtualenv ## virtual environment for development purposes 23 | make venv 24 | 25 | venv: $(VENV_NAME)/bin/activate 26 | $(VENV_NAME)/bin/activate: requirements.txt 27 | test -d $(VENV_NAME) || virtualenv -p python3 $(VENV_NAME) 28 | ${PYTHON} -m pip install -U pip setuptools 29 | ${PYTHON} -m pip install -U autopep8 coverage isort 30 | ${PYTHON} -m pip install -U -r requirements.txt 31 | touch $(VENV_NAME)/bin/activate 32 | 33 | clean: 34 | rm -rf $(VENV_NAME) *.eggs *.egg-info dist build docs/_build .cache .coverage 35 | rm -rf .pytest* # cache file for Intellij PyCharm 36 | 37 | sort: venv 38 | isort -rc . --skip_glob docs/* 39 | 40 | 41 | lint: venv 42 | autopep8 --in-place --recursive --max-line-length=100 --exclude docs/source/conf.py,venv,__pycache__,old,build,dist . 43 | 44 | test: venv 45 | coverage run --source=richkit -m pytest -Werror --ignore src/python-whois 46 | 47 | docker-test: clean 48 | docker build -t richkit-docker-test -f Dockerfile.test . 
49 | docker run -e MAXMIND_LICENSE_KEY=$MAXMIND_LICENSE_KEY richkit-docker-test -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

Richkit

4 |
5 |

6 |

7 | 8 | 9 | GitHub release 10 | 11 | 12 | licence 13 | 14 |
15 | 16 | issues 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
25 | 26 |
27 | 28 | Richkit is a python3 package that provides tools taking a domain name as input, and returns additional information on that domain. It can be an analysis of the domain itself, looked up from databases, retrieved from other services, or some combination thereof. 29 | 30 | The purpose of richkit is to provide a reusable library of domain name-related analysis, lookups, and retrieval functions, that are shared within the Network Security research group at Aalborg University, and also available to the public for reuse and modification. 31 | 32 | Documentation can be found at https://richkit.readthedocs.io/en/latest/. 33 | 34 | 35 | ## Requirements 36 | 37 | - `Python >= 3.5` 38 | 39 | ## Installation 40 | 41 | In order to install richkit just type in the terminal `pip install richkit` 42 | 43 | ## Usage 44 | 45 | The following code can be used to retrieve the TLD and the URL category, respectively. 46 | 47 | - Retrieving effective top level domain of a given url: 48 | 49 | ```python3 50 | >>> from richkit.analyse import tld 51 | >>> urls = ["www.aau.dk","www.github.com","www.google.com"] 52 | >>> 53 | >>> for url in urls: 54 | ... print(tld(url)) 55 | dk 56 | com 57 | com 58 | 59 | ``` 60 | 61 | - Retrieving category of a given url: 62 | 63 | ```python3 64 | >>> from richkit.retrieve.symantec import fetch_from_internet 65 | >>> from richkit.retrieve.symantec import LocalCategoryDB 66 | >>> 67 | >>> urls = ["www.aau.dk","www.github.com","www.google.com"] 68 | >>> 69 | >>> local_db = LocalCategoryDB() 70 | >>> for url in urls: 71 | ... url_category=local_db.get_category(url) 72 | ... if url_category=='': 73 | ... url_category=fetch_from_internet(url) 74 | ... print(url_category) 75 | Education 76 | Technology/Internet 77 | Search Engines/Portals 78 | 79 | ``` 80 | 81 | ## Modules 82 | 83 | Richkit defines a set of functions categorized by the following modules: 84 | 85 | - `richkit.analyse`: This module provides functions that can be applied to a domain name. 
Similarly to `richkit.lookup`, and in contrast to `richkit.retrieve`, this is done without disclosing the domain name to third parties and breaching confidentiality. 86 | 87 | - `richkit.lookup`: This module provides the ability to look up domain names in local resources, i.e. the domain name cannot be sent off to third parties. The module might fetch resources, such as lists or databases, but this must be done in a way that keeps the domain name confidential. Contrast this with `richkit.retrieve`. 88 | 89 | - `richkit.retrieve`: This module provides the ability to retrieve data on domain names of any sort. It comes without the "confidentiality contract" of `richkit.lookup`. 90 | 91 | ## Run Tests on Docker 92 | 93 | In order to prevent any problems related to the environment, we are providing `Dockerfile.test` file which basically constructs a docker image to run tests of Richkit. 94 | 95 | - The only thing to add is just `MAXMIND_LICENSE_KEY` in `.github/local-test/run-test.sh` at line 3. It is required to pass the test cases for `lookup` module. 96 | 97 | Commands to test them in Docker environment. 98 | 99 | - `docker build -t richkit-test -f Dockerfile.test . ` : Builds required image to run test cases 100 | 101 | - `docker run -e MAXMIND_LICENSE_KEY=" " richkit-test ` : Runs `run-test.sh` file in Docker image. 102 | 103 | 104 | ## Contributing 105 | 106 | Contributions are most welcome. 107 | 108 | We use the [gitflow](https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow) 109 | branching strategy, so if you plan to push a branch to this repository 110 | please follow that. Note that we test branch names with 111 | `.githooks/check-branch-name.py`. The git pre-commit hook can be used 112 | to automatically check this on commit. 
A ready-to-use example hook is 113 | available for Linux, and can be enabled like 114 | this (assuming `python>=3.6` and `bash`): 115 | 116 | ln -s $(pwd)/.githooks/pre-commit.linux.sample $(pwd)/.git/hooks/pre-commit 117 | 118 | ## Credits 119 | 120 | - Logo designed by [independenthand](https://www.behance.net/independenthand) 121 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/docs/_static/.gitkeep -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options.
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'richkit' 21 | copyright = '2019, <<>>' 22 | author = '<<>>' 23 | master_doc = 'index' 24 | 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.coverage', 34 | 'sphinx.ext.doctest', 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 43 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 44 | 45 | 46 | # -- Options for HTML output ------------------------------------------------- 47 | 48 | # The theme to use for HTML and HTML Help pages. See the documentation for 49 | # a list of builtin themes. 50 | # 51 | html_theme = 'alabaster' 52 | 53 | # Add any paths that contain custom static files (such as style sheets) here, 54 | # relative to this directory. 
They are copied after the builtin static files, 55 | # so a file named "default.css" will overwrite the builtin "default.css". 56 | html_static_path = ['_static'] 57 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to richkit's documentation! 2 | ================================== 3 | 4 | .. automodule:: richkit 5 | 6 | .. toctree:: 7 | :caption: Contents: 8 | 9 | Modules 10 | ======= 11 | 12 | The functionality is organised in the following modules. 13 | 14 | Analysis 15 | -------- 16 | .. automodule:: richkit.analyse 17 | :members: 18 | 19 | Lookup 20 | ------ 21 | .. automodule:: richkit.lookup 22 | :members: 23 | 24 | .. automodule:: richkit.lookup.geo 25 | :members: 26 | 27 | Retrieve 28 | -------- 29 | .. automodule:: richkit.retrieve 30 | :members: 31 | 32 | .. automodule:: richkit.retrieve.dns 33 | :members: 34 | 35 | .. automodule:: richkit.retrieve.symantec 36 | :members: fetch_from_internet, LocalCategoryDB 37 | 38 | Indices and tables 39 | ================== 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` 43 | * :ref:`search` 44 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dnspython 2 | maxminddb 3 | numpy==1.17.2 4 | scikit-learn==0.21.3 5 | langid==1.1.6 6 | bs4==0.0.1 7 | lxml==4.4.1 8 | requests==2.22.0 9 | # when this branch is merged into upstream and released 10 | -e git://github.com/aau-network-security/pywhois.git@release-for-richkit#egg=python-whois 11 | # replace the line with: 12 | # python-whois 13 | -------------------------------------------------------------------------------- /richkit/__init__.py: -------------------------------------------------------------------------------- 1 | """richkit is the Domain Enrichment Kit 2 | 3 | See the `README 4 | `_ 5 | for a general introduction. 6 | 7 | """ 8 | 9 | __all__ = [ 10 | 'analyse', 11 | 'lookup', 12 | 'retrieve', 13 | ] 14 | -------------------------------------------------------------------------------- /richkit/analyse/__init__.py: -------------------------------------------------------------------------------- 1 | """Analysis and computations on domain names. 2 | 3 | This module provides functions that can be applied to a domain 4 | name. Similarly to `richkit.lookup`, and in contrast to `richkit.retrieve`, 5 | this is done without disclosing the domain name to third parties and 6 | breaching confidentiality. 7 | 8 | .. note:: For this entire module, we adopt the notion of effective 9 | Top-Level Domains (eTLD), effective Second-Level Domain 10 | (e2LD), etc. 
"Effective" refers to the practice where the 11 | public sufffic is considered the effective TLD, and counted 12 | as one label. The `list of public suffixes 13 | `_, maintained by Mozilla, 14 | is used as the definitive truth on what public suffixes 15 | exists. 16 | 17 | """ 18 | 19 | from richkit.analyse import analyse 20 | 21 | # aka tld 22 | 23 | 24 | def tld(domain): 25 | """ 26 | Returns the Effective Top-Level Domain (eTLD) (aka Public Suffix). 27 | 28 | The eTLD is extracted from the domain, 29 | 30 | :param domain: Domain (string) 31 | 32 | """ 33 | return analyse.get_tld(domain) 34 | 35 | 36 | def sld(domain): 37 | """ 38 | Returns the Effective Second-Level Domain (2LD) (aka Apex Domain). 39 | 40 | The 2LD, aka the Apex Domain, is extracted from the domain, using 41 | the `list of public suffixes `_ 42 | maintained by Mozilla 43 | 44 | :param domain: Domain (string) 45 | 46 | """ 47 | return analyse.get_sld(domain) 48 | 49 | 50 | def sl_label(domain): 51 | """ 52 | Returns the Effective 2-level label. 53 | 54 | :param domain: Domain (string) 55 | 56 | """ 57 | return analyse.get_2l_label(domain) 58 | 59 | 60 | def nld(domain, n): 61 | """ 62 | Returns the Effective N'th-Level Domain (eNLD). 63 | 64 | :param domain: Domain (string) 65 | :param n: N'th-Level (int) 66 | 67 | Usage: 68 | 69 | from richkit.analyse import nld 70 | 71 | ## returns second level domain ... 72 | print(nld("www.google.com", 2)) 73 | 74 | ## returns top level domain 75 | print(nld("www.google.com",1)) 76 | 77 | """ 78 | return analyse.get_nld(domain, n) 79 | 80 | 81 | def n_label(domain, n): 82 | """ 83 | Returns the Effective N'th-level label. 84 | 85 | :param domain: Domain (string) 86 | :param n: N'th-Level (int) 87 | 88 | """ 89 | return analyse.get_n_label(domain, n) 90 | 91 | 92 | def depth(domain): 93 | """ 94 | Returns the effective depth of the domain, 95 | 96 | The depth is the number of labels in the domain. 97 | 98 | :Example: `google.co.uk` is "effectively a 2LD. 
`google` is one 99 | label. The public suffix of `co.uk` is considered one 100 | label effectively. With effectively two labels, the 101 | effective depth is two. 102 | 103 | :param domain: Domain (string) 104 | 105 | """ 106 | domain_name_features = analyse.get_domain_name_features(domain) 107 | return domain_name_features.get("num_tokens", "") 108 | 109 | 110 | def length(domain): 111 | """ 112 | Returns the sum of count of characters for all labels. 113 | 114 | :param domain: Domain (string) 115 | 116 | """ 117 | domain_name_features = analyse.get_domain_name_features(domain) 118 | return domain_name_features.get("len_domain", "") 119 | 120 | 121 | def language(domain): 122 | """ 123 | Returns the best gues for the language of the domain. 124 | 125 | :param domain: Domain (string) 126 | 127 | """ 128 | return analyse.get_language(domain) 129 | 130 | 131 | def entropy(s): 132 | """ 133 | Returns the entropy of characters in s. 134 | 135 | :param s: Domain (string) 136 | 137 | """ 138 | return analyse.get_entropy_2ld(s) 139 | 140 | 141 | def ratio_vowels(s): 142 | """ 143 | Returns the ratio vowels to all characters in s. 144 | 145 | :param s: Domain (string) 146 | 147 | """ 148 | return analyse.get_ratio_vowels_2ld(s) 149 | 150 | 151 | def number_vowels(s): 152 | """ 153 | Returns the number vowels to all characters in s. 154 | 155 | :param s: Domain (string) 156 | 157 | """ 158 | return analyse.get_num_of_vowels_2ld(s) 159 | 160 | 161 | def ratio_consonants(s): 162 | """ 163 | Returns the ratio consonants to all characters in s. 164 | 165 | :param s: Domain (string) 166 | 167 | """ 168 | return analyse.get_ratio_consonants_2ld(s) 169 | 170 | 171 | def number_consonants(s): 172 | """ 173 | Returns the number consonants to all characters in s. 174 | 175 | :param s: Domain (string) 176 | 177 | """ 178 | return analyse.get_num_of_consonants_2ld(s) 179 | 180 | 181 | def ratio_numerics(s): 182 | """ 183 | Returns the ratio numeric characters to all characters in s. 
184 | 185 | :param s: Domain (string) 186 | 187 | """ 188 | return analyse.get_radio_numeric_2ld(s) 189 | 190 | 191 | def number_numerics(s): 192 | """ 193 | Returns the number numeric characters to all characters in s. 194 | 195 | :param s: Domain (string) 196 | 197 | """ 198 | return analyse.get_num_numeric_2ld(s) 199 | 200 | 201 | def ratio_specials(s): 202 | """ 203 | Returns the ratio special characters to all characters in s. 204 | The default special character list is "~`!@#$%^&*()_={}[]:>;',;',;',;', 3] 44 | cls.WORDS = {} 45 | for item in lines: 46 | cls.WORDS[item] = None 47 | # cls.WORDS = set(lines) 48 | 49 | def __init__(self): 50 | 51 | # check if the class has been initialised 52 | if self.__class__.count > 0: 53 | return 54 | else: 55 | self.__class__.count += 1 56 | 57 | if path.exists(WordMatcher.MASTERFILE): 58 | WordMatcher.load_words() 59 | 60 | if WordMatcher.WORDS is None: 61 | WordMatcher.fetch_words() 62 | WordMatcher.load_words() 63 | 64 | def get_num_of_words(self, domain): 65 | num = 0 66 | for word in WordMatcher.WORDS: 67 | if word in domain: 68 | num += 1 69 | return num 70 | 71 | 72 | def load_alexa(limit=None, is_test=False): 73 | """ 74 | Reads top @limit number of popular domains based on alexa.com 75 | 76 | """ 77 | alexa_domains = set() 78 | alexa_top_1m = data_folder 79 | if not path.exists(alexa_top_1m): 80 | if is_test: 81 | alexa_top_1m = fetch_alexa_data(url=top_100_alexa) 82 | else: 83 | alexa_top_1m = fetch_alexa_data() 84 | with open(alexa_top_1m) as f: 85 | for line in f: 86 | line = line.strip() 87 | sline = line.split(',') 88 | 89 | if limit and int(sline[0]) > limit: 90 | break 91 | 92 | """ 93 | sometimes the Alexa list contains full URLs, e.g. 
94 | example.com/path; need to get rid of that for later matching 95 | """ 96 | domain = (sline[1].split('/'))[0] 97 | 98 | """ 99 | we want only the 2LD+TLD, else we do not know later against what we 100 | need to match 101 | """ 102 | sld_domain = get_2ld(domain) 103 | alexa_domains.add(sld_domain) 104 | alexa_domains.add(domain) 105 | alexa_slds = set([get_2ld(el) for el in alexa_domains]) 106 | 107 | return alexa_slds 108 | 109 | 110 | def load_words(path_to_data=data_folder, is_test=False): 111 | if not path.exists(path_to_data): 112 | if is_test: 113 | path_to_data = fetch_alexa_data(url=top_100_alexa) 114 | else: 115 | path_to_data = fetch_alexa_data() 116 | 117 | lines = read_local(path_to_data) 118 | 119 | # strip whitespaces 120 | # only words with more than three letters are considered 121 | lines = [ln for ln in (ln.strip() for ln in lines) if len(ln) > 3] 122 | words = set(lines) 123 | return words 124 | 125 | 126 | def read_local(path_to_data=data_folder): 127 | if path.exists(path_to_data): 128 | f = open(path_to_data, 'r', encoding="utf8") 129 | lines = f.readlines() 130 | f.close() 131 | else: 132 | lines = [] 133 | return lines 134 | 135 | 136 | def fetch_alexa_data(path_to_data=data_folder, url=top_1m_alexa): 137 | 138 | response = requests.get(url, stream=True) 139 | if response.status_code == 200: 140 | with open(path_to_data, 'wb+') as file: 141 | file.write(response.content) 142 | else: 143 | logger.error('Error while downloading the TOP 1M URL list status code : %s', 144 | str(response.status_code)) 145 | return path_to_data 146 | 147 | 148 | class TldMatcher(object): 149 | # use class vars for lazy loading 150 | MASTERURL = "https://publicsuffix.org/list/effective_tld_names.dat" 151 | MASTERFILE = temp_directory + "/effective_tld_names.dat" 152 | 153 | TLDS = None 154 | No_TLDS = None 155 | count = 0 156 | 157 | @classmethod 158 | def fetch_tlds(cls, url=None): 159 | url = url or cls.MASTERURL 160 | 161 | response = requests.get(url, 
stream=True) 162 | if response.status_code == 200: 163 | with open(cls.MASTERFILE, 'wb') as file: 164 | file.write(response.content) 165 | else: 166 | 167 | logger.error('Error while downloading the Public Suffix List status code %s ', 168 | str(response.status_code)) 169 | 170 | @classmethod 171 | def load_tlds(cls): 172 | try: 173 | f = open(cls.MASTERFILE, 'r', encoding="utf8") 174 | lines = f.readlines() 175 | except FileNotFoundError as e: 176 | logger.exception('File not readable, not found %s', e) 177 | f.close() 178 | f.close() 179 | 180 | # strip comments and blank lines 181 | stripped_lines = [ln for ln in (ln.strip() for ln in lines) if len(ln) and ln[:2] != '//'] 182 | 183 | excluded_lines = [ln.strip('!') for ln in (ln.strip() 184 | for ln in lines) if len(ln) and ln[:1] == '!'] 185 | 186 | cls.TLDS = set(stripped_lines) 187 | cls.No_TLDS = set(excluded_lines) 188 | 189 | def __init__(self): 190 | 191 | # check if the class has been initialised 192 | if self.__class__.count > 0: 193 | return 194 | else: 195 | self.__class__.count += 1 196 | 197 | if path.exists(TldMatcher.MASTERFILE): 198 | TldMatcher.load_tlds() 199 | 200 | if TldMatcher.TLDS is None: 201 | TldMatcher.fetch_tlds() 202 | TldMatcher.load_tlds() 203 | 204 | def get_tld(self, url): 205 | best_match = None 206 | chunks = url.split('.') 207 | 208 | for start in range(len(chunks) - 1, -1, -1): 209 | test = '.'.join(chunks[start:]) 210 | startest = '.'.join(['*'] + chunks[start + 1:]) 211 | 212 | if test in TldMatcher.TLDS or startest in TldMatcher.TLDS: 213 | best_match = test 214 | 215 | # return an Error since is not clear on the PS List which is the TLD of the domain marked with '!' 
216 | if best_match in TldMatcher.No_TLDS: 217 | raise NotImplementedError() 218 | 219 | return best_match 220 | 221 | def get_2ld(self, url): 222 | urls = url.split('.') 223 | tlds = self.get_tld(url).split('.') 224 | return urls[-1 - len(tlds)] 225 | 226 | def get_nld(self, url, n): 227 | urls = url.split('.') 228 | tlds = self.get_tld(url).split('.') 229 | return urls[-n - len(tlds)] 230 | 231 | 232 | tldmatch = TldMatcher() 233 | 234 | 235 | def get_2ld(domain): 236 | """ 237 | Finds 2LD for given FQDN 238 | """ 239 | sdomain = domain.split('.') 240 | 241 | tld = tldmatch.get_tld(domain) 242 | index = 2 243 | 244 | if tld: 245 | num_tld_Levels = len(tld.split('.')) 246 | index = num_tld_Levels + 1 247 | 248 | if len(sdomain) < index: 249 | return domain 250 | else: 251 | return '.'.join(sdomain[-index:]) 252 | -------------------------------------------------------------------------------- /richkit/lookup/__init__.py: -------------------------------------------------------------------------------- 1 | """Confidentiality-aware look-ups for data on domain names. 2 | 3 | This modules provides the ability to look up domain names in local 4 | resources, i.e. the domain name cannot be sent of to third 5 | parties. The module might fetch resources, such as lists or 6 | databasese, but this must be done in a way that keeps the domain name 7 | confidential. 
Contrast this with `richkit.retrieve`.""" 8 | 9 | from richkit.lookup import geo 10 | 11 | 12 | def country(ip_address): 13 | """ 14 | Return the country code of a given IP Address 15 | 16 | :param ip_address: IP Address (string) 17 | """ 18 | return geo.get_country(ip_address) 19 | 20 | 21 | def asn(ip_address): 22 | """ 23 | Return the Autonomous System Number of a given IP Address 24 | 25 | :param ip_address: IP Address (string) 26 | """ 27 | return geo.get_asn(ip_address) 28 | 29 | 30 | def registered_country(ip_address): 31 | """ 32 | Return the registered country code of a given IP Address 33 | 34 | :param ip_address: IP Address (string) 35 | """ 36 | return geo.get_registered_country(ip_address) 37 | 38 | 39 | def maxmindb_licence_key(license_key): 40 | """ 41 | Return license key for MaxMind DB 42 | Retrieve license key for usage of MaxMindDb 43 | 44 | If it is not present print warning 45 | """ 46 | 47 | return geo.get_license_key(license_key) 48 | -------------------------------------------------------------------------------- /richkit/lookup/geo.py: -------------------------------------------------------------------------------- 1 | from richkit.lookup.util import MaxMindDB 2 | import os 3 | 4 | 5 | def get_license_key(license_key='MAXMIND_LICENSE_KEY'): 6 | """ 7 | @param license_key: Name of environment variable 8 | @return: license of MaxMindDB from environent variables as string 9 | @return: in case of error, returns Exception, more specifically KeyError 10 | """ 11 | try: 12 | maxmind_db_license = os.environ[license_key] 13 | return maxmind_db_license 14 | except Exception: 15 | print("\nWARNING: No MAXMIND LICENSE KEY Found in environment variables") 16 | print("\nUsage of lookup module might be affected due to no MaxMind DB License".strip()) 17 | print("\nMore info ? 
Check here: https://github.com/aau-network-security/richkit/wiki/Retrieve-and-configure" 18 | "-licence-key".strip()) 19 | print("Proceeding anyway...") 20 | return 'NOLICENSEKEYFOUND' 21 | 22 | 23 | def get_country(ip_address): 24 | """ 25 | Return the country code of a given IP address 26 | 27 | :param ip_address: IP Address (string) 28 | 29 | """ 30 | 31 | try: 32 | country_code_db = MaxMindDB(( 33 | "https://download.maxmind.com/app/geoip_download?" 34 | "edition_id=GeoLite2-Country&" 35 | "license_key={license_key}&" 36 | "suffix=tar.gz" 37 | ).format( 38 | license_key=get_license_key(), 39 | ), "cc" 40 | ) 41 | result = country_code_db.get_data(ip_address) 42 | country_code = str(result['country']['iso_code']) 43 | except: 44 | country_code = None 45 | return country_code 46 | 47 | 48 | def get_registered_country(ip_address): 49 | """ 50 | Return the registered country code of a given IP address 51 | 52 | :param ip_address: IP Address (string) 53 | 54 | """ 55 | try: 56 | country_code_db = MaxMindDB(( 57 | "https://download.maxmind.com/app/geoip_download?" 58 | "edition_id=GeoLite2-Country&" 59 | "license_key={license_key}&" 60 | "suffix=tar.gz" 61 | ).format( 62 | license_key=get_license_key(), 63 | ), "cc" 64 | ) 65 | result = country_code_db.get_data(ip_address) 66 | country_code = str(result['registered_country']['iso_code']) 67 | except: 68 | country_code = None 69 | return country_code 70 | 71 | 72 | def get_asn(ip_address): 73 | """ 74 | Return the ASN of a given IP address 75 | 76 | :param ip_address: IP Address (string) 77 | 78 | """ 79 | try: 80 | country_code_db = MaxMindDB(( 81 | "https://download.maxmind.com/app/geoip_download?" 
82 | "edition_id=GeoLite2-ASN&" 83 | "license_key={license_key}&" 84 | "suffix=tar.gz" 85 | ).format( 86 | license_key=get_license_key(), 87 | ), "asn" 88 | ) 89 | result = country_code_db.get_data(ip_address) 90 | asn = str('AS' + str(result['autonomous_system_number'])) 91 | except: 92 | asn = None 93 | return asn 94 | -------------------------------------------------------------------------------- /richkit/lookup/util.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | import subprocess 4 | import time 5 | from datetime import datetime, timedelta 6 | import logging 7 | from pathlib import Path 8 | import maxminddb 9 | 10 | """ 11 | Lookups in the MaxMind GeoLite2 databases. 12 | 13 | A license key is required as per [#GeoLite2_CCPA_GDPR]_: 14 | 15 | #. Sign up for a MaxMind account (no purchase required): https://www.maxmind.com/en/geolite2/signup 16 | #. Set your password and create a license key: https://www.maxmind.com/en/accounts/current/license-key 17 | #. Setup your download mechanism by using our GeoIP Update program or creating a direct download script: https://dev.maxmind.com/geoip/geoipupdate/#Direct_Downloads 18 | 19 | .. rubric:: Footnotes 20 | 21 | .. 
logger = logging.getLogger(__name__)
# NOTE(review): the data directory is derived from the current working
# directory, so it only ends up inside the package when the process is
# started from within a "richkit" checkout -- confirm this is intended.
directory = os.getcwd().split("richkit")
maxmind_directory = directory[0] + "/richkit/richkit/lookup/data"
Path(maxmind_directory).mkdir(parents=True, exist_ok=True)


class MaxMindDB:
    """
    Download, extract and query a MaxMind GeoLite2 database.

    A MaxMind license key is required to download the databases, see
    https://blog.maxmind.com/2019/12/18/significant-changes-to-accessing-and-using-geolite2-databases/

    :param url: Download URL of the database archive (string)
    :param query: Key into ``helpers``, i.e. ``"asn"`` or ``"cc"`` (string)
    """

    # Lookup table of constants, one entry per database, structured as:
    # [directory prefix used by MaxMind, file name of the extracted DB,
    #  path the downloaded archive is stored at]
    helpers = {
        "asn": ['GeoLite2-ASN_', 'GeoLite2-ASN.mmdb', str(Path(maxmind_directory, "asn.tar.gz"))],
        "cc": ['GeoLite2-Country_', 'GeoLite2-Country.mmdb', str(Path(maxmind_directory, "cc.tar.gz"))]
    }

    def __init__(self, url, query):
        self.MASTERURL = url
        self.query = query
        self.path_db = maxmind_directory
        if self.get_db_path() is None:
            self.get_db()
        # weeks = 1 because the database is updated once a week: a copy
        # built more than one week ago is removed and downloaded again.
        if self.get_age() > timedelta(weeks=1):
            os.remove(self.get_db_path())
            self.get_db()

    def get_db(self):
        """
        Download the database archive (tar.gz) from ``MASTERURL`` and unpack it.

        :raises Exception: if the server does not answer with status 200.
            Exceptions raised by ``requests.get`` are logged and re-raised.
        """
        archive = self.helpers[self.query][2]
        logger.debug('Downloading the ' + archive + ' DB ... ')
        try:
            response = requests.get(self.MASTERURL, stream=True)
        except Exception as e:
            logger.error('Reraising Exception raised by requests.get ({})'.format(e))
            raise

        if response.status_code != 200:
            # Name the database that was actually requested ("ASN" or "CC")
            # instead of always reporting the ASN database.
            msg = (
                'Error while downloading the {} DB '
                '(Status Code={}): {}'
            ).format(
                self.query.upper(),
                response.status_code,
                response.text,
            )
            logger.error(msg)
            raise Exception(msg)

        with open(archive, 'wb') as file:
            file.write(response.content)
        self.unpack()

    def unpack(self):
        """
        Extract the downloaded archive into the database directory.

        :raises Exception: if the archive does not exist
        :raises subprocess.CalledProcessError: if ``tar`` exits with an error
        """
        archive = self.helpers[self.query][2]
        if not os.path.exists(archive):
            msg = 'Error extract DB on get_db '
            logger.error(msg)
            raise Exception(msg)
        # Wait for tar to finish and fail loudly if it does not succeed,
        # instead of starting it in the background and sleeping.
        subprocess.run(['tar', '-xzf', archive], cwd=self.path_db, check=True)

    def get_db_path(self):
        """
        Return the path of the newest extracted database, or None.

        Only directories below ``path_db`` that start with the MaxMind prefix
        and still contain the database file are considered, so a directory
        whose database was removed is ignored.
        """
        prefix, db_name = self.helpers[self.query][:2]
        candidates = sorted(
            (entry for entry in os.listdir(self.path_db) if entry.startswith(prefix)),
            reverse=True,
        )
        for entry in candidates:
            db_path = Path(self.path_db, entry, db_name)
            if db_path.is_file():
                return str(db_path)
        return None

    def open_db(self):
        """
        Open the newest database and return the reader.

        The caller is responsible for closing the returned reader.
        """
        return maxminddb.open_database(self.get_db_path())

    def get_data(self, ip_address):
        """
        Return the database record for an IP address.

        :param ip_address: IP Address (string)
        """
        # The reader is closed again as soon as the record has been read.
        with self.open_db() as reader:
            return reader.get(ip_address)

    def get_age(self):
        """
        Return the time elapsed since the database was built (timedelta).
        """
        with self.open_db() as reader:
            build_epoch = reader.metadata().build_epoch
        return datetime.now() - datetime.fromtimestamp(build_epoch)
domain names. 2 | 3 | This module provides the ability to retrieve data on domain names of 4 | any sort. It comes without the "confidentiality contract" of 5 | `richkit.lookup`. 6 | 7 | """ 8 | from richkit.retrieve import symantec 9 | from richkit.retrieve import dns 10 | 11 | 12 | def symantec_category(domain): 13 | """ 14 | Returns the category from Symantec's BlueCoat service. 15 | :param domain: 16 | :return: 17 | """ 18 | return symantec.fetch_from_internet(domain) 19 | 20 | 21 | def dns_a(domain): 22 | """ 23 | Return the A Records of a given domain 24 | :param domain: domain (string) 25 | :return: IP Addresses (list) 26 | """ 27 | return dns.get_a_record(domain) 28 | 29 | 30 | def dns_ptr(ip_address): 31 | """ 32 | Return the PTR record of a given IP address 33 | :param ip_address: IP Address (string) 34 | :return: domains (list) 35 | """ 36 | return dns.get_ptr_record(ip_address) 37 | -------------------------------------------------------------------------------- /richkit/retrieve/cert_sh.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import logging 4 | from richkit.retrieve.x509 import X509 5 | from datetime import datetime 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class DomainCertificates: 11 | """ 12 | This class provides the functions to get certificates of a given domain. 
13 | The website used to get them is crt.sh 14 | """ 15 | 16 | # Website used to retrieve the certificates belonging a domain 17 | crtSH_url = "https://crt.sh/{}" 18 | 19 | def __init__(self, domain): 20 | """ 21 | Get the certificate features from the given domain 22 | :param domain: domain to analyze 23 | """ 24 | self.domain = domain 25 | self.certificates = self.get_certificates(self.domain) 26 | self.certificates_features = None 27 | 28 | def get_certificates(self, domain): 29 | """ 30 | Make a request and get the response content of the given domain 31 | :param domain: the choosen domain 32 | """ 33 | try: 34 | r = requests.get(self.crtSH_url.format("?q=" + domain + "&output=json")) 35 | if r.status_code != 200: 36 | raise Exception("Server not available") 37 | content = r.content.decode('utf-8') 38 | if len(r.text) == 2: # It's 2 when the domain is not found 39 | raise Exception("Domain not found") 40 | return json.loads(content) 41 | except Exception as e: 42 | logger.error('Error while retrieving certificates: %s', e) 43 | return None 44 | 45 | def get_all(self): 46 | """ 47 | Get the list of certificates for the given domain and the certificate features for each of them 48 | """ 49 | certs_features = [] 50 | for cert in self.certificates: 51 | # filter out all the rows containing @ because they are email 52 | # example: https://crt.sh/?id=34083306 53 | cf = X509(cert.get('id')) 54 | not_before = cert.get('not_before') 55 | not_after = cert.get('not_after') 56 | not_before_obj = datetime.strptime(not_before, "%Y-%m-%dT%H:%M:%S") 57 | not_after_obj = datetime.strptime(not_after, "%Y-%m-%dT%H:%M:%S") 58 | validity = (not_after_obj.date() - not_before_obj.date()).days 59 | features = dict({ 60 | 'ID': cert.get('id'), 61 | 'Issuer': cert.get('issuer_name'), 62 | 'Algorithm': cf.algorithm, 63 | 'ValidationL': cf.policy_list, 64 | 'NotBefore': not_before, 65 | 'NotAfter': not_after, 66 | 'Validity': validity, # days 67 | 'SANFeatures': cf.certificates_features 
import logging

from dns import resolver
from dns import reversename

from richkit.retrieve.cert_sh import DomainCertificates
from richkit.retrieve.x509 import X509

logger = logging.getLogger(__name__)


# --- richkit/retrieve/ctlogs.py ---------------------------------------------

def get_logs(domain):
    """
    Get a list of certificates with all the features

    :param domain: Input domain
    :return: whatever ``DomainCertificates(domain).get_all()`` returns, or
        None when the lookup raised
    """
    try:
        return DomainCertificates(domain).get_all()
    except Exception as ex:
        # Best-effort API: report the failure the same way the DNS helpers
        # below do and hand back an explicit None.
        logger.error(ex)
        return None


def get_certificates(domain):
    """
    Get just the list of certificates of the domain

    :param domain: Input domain
    :return: result of ``DomainCertificates(domain).get_certificates_list()``,
        or None when the lookup raised
    """
    try:
        return DomainCertificates(domain).get_certificates_list()
    except Exception as ex:
        logger.error(ex)
        return None


def get_certificates_features(cert_id):
    """
    Get the certificate features by certificate ID

    :param cert_id: crt.sh certificate ID
    :return: the ``certificates_features`` attribute of the ``X509`` object
        built for ``cert_id``, or None when building it raised
    """
    try:
        return X509(cert_id).certificates_features
    except Exception as ex:
        logger.error(ex)
        return None


# --- richkit/retrieve/dns.py ------------------------------------------------

def get_a_record(domain):
    """
    Return the A record list of a given domain

    :param domain: domain (string)
    :return: IP Addresses (list), or None when the query raised
    """
    try:
        # NOTE(review): dnspython 2.x deprecates resolver.query() in favour of
        # resolver.resolve(); confirm the pinned version before switching.
        return [record.to_text() for record in resolver.query(domain, 'A')]
    except Exception as ex:
        logger.error(ex)
        return None


def get_ptr_record(ip_address):
    """
    Return the PTR record of a given IP Address

    :param ip_address: IP Address (string)
    :return: domains list, or None when the query raised
    """
    try:
        addr = reversename.from_address(ip_address)
        return [record.to_text() for record in resolver.query(addr, 'PTR')]
    except Exception as ex:
        logger.error(ex)
        return None
"""Symantec Web Service

This is generated to get categories of given urls, normally it fetches
category from symantec web service then saves it to local file which
is called `categorized_urls` under `richkit/retrieve/data/`


How to use:

    >>> # Import necessary functions and make a call as demonstrated given below
    >>> from richkit.retrieve.symantec import fetch_from_internet
    >>> from richkit.retrieve.symantec import LocalCategoryDB
    >>>
    >>> urls = ["www.aau.dk","www.github.com","www.google.com"]
    >>>
    >>> local_db = LocalCategoryDB()
    >>> for url in urls:
    ...     url_category=local_db.get_category(url)
    ...     if url_category=='':
    ...         url_category=fetch_from_internet(url)
    ...     print(url_category)
    Education
    Technology/Internet
    Search Engines/Portals

"""
import ast
import json
import logging
import os
import re
from json import dumps
from pathlib import Path
from xml.etree.ElementTree import fromstring

import requests
from requests.exceptions import HTTPError
from requests.exceptions import InvalidURL

logger = logging.getLogger(__name__)

# Configuration
# Get one here: http://www1.k9webprotection.com/get-k9-web-protection-free
categories_url = "https://gitlab.com/snippets/1740321/raw"
data_path = Path(os.path.dirname(__file__), 'data')
categories_file_path = data_path / "categories_list.txt"
categorized_urls_file = data_path / "categorized_urls.txt"

k9License = 'Replace_by_your_own_license'


class LocalCategoryDB():
    """In-memory view of the local ``url,category`` cache file."""

    def __init__(self):
        self.url_to_category = read_categorized_file()

    def get_category(self, url):
        """Return the cached category of ``url`` or ``''`` when unknown."""
        return self.url_to_category.get(url, '')


def fetch_categories(categories_url=categories_url,
                     local_categories_path=categories_file_path):
    """Fetch categories and create local cache

    :param categories_url: URL returning a JSON list of objects that carry
        a ``num`` and a ``name`` key
    :param local_categories_path: file the mapping is dumped to as JSON
    :return: dict mapping two-digit lowercase hex codes to category names,
        or None when no URL is given or the download failed
    """
    if not categories_url:
        return None
    try:
        resp = requests.get(categories_url)
        # Without this call a 4xx/5xx answer never surfaces as HTTPError.
        resp.raise_for_status()
        d = {'%02x' % c['num']: c['name'] for c in resp.json()}
    except HTTPError as e:
        logger.error('Cannot fetch categories, HTTP error: %s\n' % e)
        return None
    except InvalidURL as e:
        logger.error('Cannot fetch categories, URL error: %s\n' % e)
        return None
    try:
        with open(local_categories_path, 'w') as f:
            f.write(dumps(d))
    except Exception as e:
        # The cache is only an optimisation: still return what was fetched.
        logger.error('Cannot save categories: %s\n' % e)
    return d


def load_categories(name):
    """Load categories from a cache file

    :param name: path of the cache file
    :return: the cached dict, ``{}`` when the file does not exist, None when
        ``name`` is falsy
    :raises SystemExit: when the file exists but cannot be read
    """
    if not name:
        return None
    try:
        with open(name, 'r') as f:
            data = f.read()
    except FileNotFoundError:
        return {}
    except OSError as er:
        # The original intent was to terminate the process here.
        logger.error('Cannot read categories cache %s: %s\n' % (name, er))
        raise SystemExit(1)
    # literal_eval only accepts Python literals, so the file content is
    # never executed.
    return ast.literal_eval(data)


def check_local_categories_file_exists(categories_file_path=categories_file_path):
    """Return the category mapping, downloading it when the cache is empty."""
    webCats = load_categories(categories_file_path)
    if webCats == {}:
        webCats = fetch_categories(categories_url, categories_file_path)
    return webCats


def _chunks(s):
    """Split ``s`` into consecutive two-character pieces."""
    # Original: https://github.com/allfro/sploitego/blob/master/src/sploitego/webtools/bluecoat.py
    return [s[i:i + 2] for i in range(0, len(s), 2)]


def write_to_local_file(text, categorized_urls_file=categorized_urls_file):
    """Append one line to the local cache of categorized URLs.

    Called when a URL was not found locally and had to be resolved through
    the web service.
    """
    with open(categorized_urls_file, 'a') as file:
        file.write(text + "\n")


def fetch_from_internet(url, categories_file_path=categories_file_path,
                        categorized_urls_file=categorized_urls_file):
    """Ask the web service for the category of ``url`` and cache the answer.

    :param url: host name to categorize
    :param categories_file_path: cache file of the code -> category mapping
    :param categorized_urls_file: cache file receiving ``url,category``
    :return: category name (``'Unknown'`` for an unmapped code) or ``''``
        when the service gave no usable answer
    """
    hostname = url
    port = '80'
    webservice_endpoint = 'http://sp.cwfservice.net/1/R/%s/K9-00006/0/GET/HTTP/%s/%s///' % (
        k9License, hostname, port)
    r = requests.get(webservice_endpoint)
    if r.status_code != 200:
        return ''

    e = fromstring(r.text)
    domc = e.find('DomC')
    dirc = e.find('DirC')
    # DomC wins over DirC, as in the original if/elif.
    node = domc if domc is not None else dirc
    if node is None:
        logger.error('Cannot get category for %s\n' % hostname)
        return ''

    cats = _chunks(node.text or '')
    # Load the mapping once, and from the path the caller asked for.
    categories = check_local_categories_file_exists(categories_file_path)
    if not cats or not categories:
        logger.error('Cannot get category for %s\n' % hostname)
        return ''

    # Only the first reported category code is used.
    result = categories.get(cats[0].lower(), 'Unknown')
    if domc is None:
        logger.debug('%s,%s\n' % (hostname, result))
    result = re.sub('\n', '', result)
    write_to_local_file(url + "," + result, categorized_urls_file)
    return result


def read_categorized_file(file_path=categorized_urls_file):
    """Read the ``url,category`` cache into a dict.

    The file is created empty when it does not exist yet. Lines without a
    comma are skipped.
    """
    url_to_category = dict()
    if not os.path.exists(file_path):
        open(file_path, 'w').close()
        return url_to_category

    with open(file_path, "r") as ins:
        for line in ins:
            pair = line.replace('\n', '').split(',')
            if len(pair) < 2:
                continue
            url_to_category[pair[0]] = pair[1]
    return url_to_category


def check_for_local(url):
    """Return the locally cached category of ``url`` or ``''``."""
    # read_categorized_file() already maps url -> category; the previous
    # implementation split the dict keys on ',' and could never match.
    return read_categorized_file().get(url, '')


def get_index(category):
    """Return the code of ``category`` in the mapping, or None if absent."""
    for k, v in check_local_categories_file_exists().items():
        if v == category:
            return k
    return None
import logging
import re

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# RFC 6793 specifies 32 bit integer. The convention, of unknown origin,
# is to prefix "AS" to the decimal form integer. \d{1,10} is a rough
# approximation of 4,294,967,295
ASN_REGEX = re.compile('AS\\d{1,10}')


class URLVoid(object):
    """Scrape the URLVoid report of a domain and expose its fields.

    The report table is fetched once in the constructor and kept in
    ``self.value`` as a ``{label: text}`` dict. Every getter returns None
    when its field is missing.
    """

    def __init__(self, domain):
        self.domain = domain
        self.value = self.urlvoid_parser()

    def urlvoid_parser(self):
        """
        Parses URLVOID table with BeautifulSoup

        :return: dictionary which contains urlvoid response (empty when the
            report table could not be found or parsed)
        """
        url = "https://www.urlvoid.com/scan/" + self.domain
        res = requests.get(url)
        value = {}
        try:
            table = BeautifulSoup(res.text, "lxml").find(
                "table",
                class_="table table-custom table-striped"
            )
        except ModuleNotFoundError as me:
            logger.error("Opps ! Error : %s", me)
            return value
        if table is None:
            logger.error("URLVoid: report table not found for %s", self.domain)
            return value
        for tr in table.find_all("tr"):
            cells = tr.find_all("td")
            if len(cells) < 2:
                # Not a label/value row.
                continue
            value[cells[0].text] = cells[1].text.replace("\xa0", "")
        return value

    def _field(self, key, message):
        """Return ``self.value[key]``; log ``message`` and return None if absent."""
        try:
            return self.value[key]
        except KeyError as ke:
            logger.error(message, ke)
            return None

    def get_last_analysis_date(self):
        """

        :return: Last analysis time of given domain on URLVOID
        """
        result = self._field(
            "Last Analysis", 'Error while retrieving value; %s')
        if result is None:
            return None
        # Drops the last 9 characters of the cell text.
        # NOTE(review): presumably a trailing link label - confirm.
        return result[:-9]

    def domain_registration_date(self):
        """

        :return: Registration time of domain
        """
        return self._field(
            "Domain Registration",
            ' DRD: Error while retrieving value; %s ')

    def blacklist_status(self):
        """

        :return: Blacklist status among 36 services or more which are enable
                 in URLVOID itself.
        """
        return self._field(
            "Blacklist Status",
            ' Blacklist status: Error while retrieving value; %s ')

    def get_asn(self):
        """

        :return: ASN Number
        """
        result = self._field("ASN", 'ASN: Error while retrieving value; %s ')
        if result is None:
            return None
        m = ASN_REGEX.search(result)
        if m is None:
            logger.error(
                "Failed to parse ASN for {} from \"{}\"".format(
                    self.domain,
                    result,
                )
            )
            return None
        return m.group()

    def get_server_location(self):
        """

        :return: Server location of domain which exists on URLVOID
        """
        return self._field(
            "Server Location",
            ' Server Location : Error while retrieving value; %s ')

    def get_ip_address(self):
        """

        :return: IP address of given domain via URLVOID service
        """
        return self._field(
            "IP Address", ' IP Address: Error while retrieving value; %s ')

    def get_detection_rate(self):
        """

        :return: Returns detection rate in percentage, or None when the
            blacklist status is missing or not of the form ``"x/y"``.
        """
        status = self.blacklist_status()
        if status is None:
            return None
        try:
            parts = status.split("/")
            return int(parts[0]) / int(parts[1]) * 100
        except (IndexError, ValueError, ZeroDivisionError) as err:
            logger.error(
                'Detection rate : Error while retrieving value; %s ', err)
            return None
import logging
import statistics
import time

import requests
import whois

from richkit.analyse import tld, sld, sl_label, depth, length

logger = logging.getLogger(__name__)


# --- richkit/retrieve/whois.py ----------------------------------------------

def get_whois_info(domain):
    """Retrieve the WHOIS information for a domain name

    :param domain: Domain name
    :type domain: str
    :return: WHOIS information of given domain name
    :rtype: dict (Actually a subclass of whois.parser.WhoisEntry, which
            itself is a subclass of `dict`)

    """
    return whois.whois(domain)


# --- richkit/retrieve/x509.py -----------------------------------------------

class X509:
    """
    This class provides functions to extract certificate features from crt.sh
    The only needed parameter is the crt.sh ID of the certificate, it's possible to
    get it just making a request on crt.sh by listing all the certificates for a specific domain
    """

    # Website used to retrieve the certificates belonging a domain
    crtSH_url = "https://crt.sh/{}"

    # Download attempts made before giving up, and pause between them (s).
    max_attempts = 5
    retry_delay = 10

    def __init__(self, cert_id):
        """
        Get the Subject Alternative Name features from the given certificate
        :param cert_id: unique ID given by crt.sh per certificate
        """
        self.cert_id = cert_id
        self.algorithm = None
        self.policy_list = None
        self.certificates_features = None
        self.get_certificate_features()

    def get_certificate_info(self, cert_id):
        """
        Make a request and get the response content of the given ID
        :param cert_id: crt.sh ID of the certificate
        :return: response as text
        :raises Exception: when crt.sh reports that the certificate does not
            exist or the ID is invalid
        """
        r = requests.get(self.crtSH_url.format("?id=" + cert_id))
        # NOTE(review): the marker literals below (and the row separator and
        # *_index prefixes in get_certificate_features) are reproduced as
        # they appear in this listing; they look like they lost markup in
        # text extraction - verify against the real crt.sh response.
        if "\n\nCertificate not found " in r.text:
            raise Exception("Certificate not found")
        if "\n\nInvalid value:" in r.text:
            raise Exception("Certificate not found")
        return r.text

    def get_certificate_features(self):
        """
        Parse the response content to get the certificate features

        Fills ``self.algorithm``, ``self.policy_list`` and
        ``self.certificates_features``.

        :raises Exception: the last download error, when every attempt failed
        """
        text = None
        last_error = Exception("Certificate not found")
        for attempt in range(self.max_attempts):
            try:
                text = self.get_certificate_info(str(self.cert_id))
                break
            except Exception as ex:
                last_error = ex
                logger.warning(
                    "crt.sh request for %s failed (attempt %d/%d): %s",
                    self.cert_id, attempt + 1, self.max_attempts, ex)
                # No point in sleeping after the final attempt.
                if attempt < self.max_attempts - 1:
                    time.sleep(self.retry_delay)
        if text is None:
            # Previously this path crashed on an unbound ``text_list``.
            raise last_error
        text_list = text.split('\n')

        sans = SANList()  # Used to store the SANs
        policy_list = []  # Used to store the policies in order to get the Validation Level

        algo_index = '        Signature Algorithm:'
        san_index = \
            '                DNS:'

        san_index_email = \
            '                email:'

        policy_index = \
            '            ' \
            '    Policy: '
        for row in text_list:
            # Get Signature Algorithm
            if algo_index in row:
                self.algorithm = row[len(algo_index) + 6:]

            # Get SANs
            if san_index in row:
                sans.append(row[len(san_index):])
            if san_index_email in row:
                sans.append(row[len(san_index_email):])

            if policy_index in row:
                policy_list.append(row[len(policy_index):])

        san_list = sans.get_sans()

        # Calculating the LCS
        apex = [sld(san) for san in san_list]
        lcs_num = get_lcs_apex(apex)

        self.policy_list = policy_list
        self.certificates_features = dict({
            'san_list': san_list,
            'DomainCount': len(san_list),
            'UniqueApexCount': unique_apex(san_list),
            'UniqueSLDCount': unique_sld(san_list),
            'ShortestSAN': sans.min(),
            'LongestSAN': sans.max(),
            'SANsMean': sans.mean(),
            'MinSubLabels': sans.min_labels(),
            'MaxSubLabels': sans.max_labels(),
            'MeanSubLabels': sans.mean_labels(),
            'UniqueTLDsCount': unique_tld(san_list),
            'UniqueTLDsDomainCount': sans.uniqueTLDsDomainCount(),
            'ApexLCS': None,  # Don't need to implement
            'LenApexLCS': lcs_num,
            'LenApexLCSNorm': sans.lenApexLCSNorm(lcs_num),
        })


def unique_apex(sans):
    """
    Number of unique apex/root domains covered by the certificate
    :param sans: List of Subject Alternative Name
    """
    return len({sld(san) for san in sans})


def unique_tld(sans):
    """
    Number of unique TLDs covered by the certificate
    :param sans: List of Subject Alternative Name
    """
    return len({tld(san) for san in sans})
def unique_sld(sans):
    """
    Number of unique effective 2-level label domains covered by the certificate
    :param sans: List of Subject Alternative Name
    """
    return len({sl_label(san) for san in sans})


def get_lcs_apex(apex):
    """
    Largest pairwise LCS length among the entries of an array
    :param apex: apex array
    :return: the greatest ``lcs(a, b)`` over all pairs of distinct positions,
        0 when there are fewer than two entries
    """
    lcs_num = 0
    # lcs() is symmetric, so each unordered pair is evaluated once.
    for index, first in enumerate(apex):
        for second in apex[index + 1:]:
            lcs_num = max(lcs_num, lcs(first, second))
    return lcs_num


def lcs(x, y):
    """
    Length of the longest common subsequence (LCS) of two strings

    The characters have to appear in the same order in both strings but do
    not have to be adjacent.

    :param x: First string
    :param y: Second string
    :return: LCS length
    """
    # Classic dynamic programme, keeping only the previous row.
    previous = [0] * (len(y) + 1)
    for cx in x:
        current = [0]
        for j, cy in enumerate(y, start=1):
            if cx == cy:
                current.append(previous[j - 1] + 1)
            else:
                current.append(max(previous[j], current[j - 1]))
        previous = current
    return previous[-1]


class SANList:
    """
    This class provides the functions to extract features from the SAN list

    Every statistic returns 0 for an empty list.
    """

    def __init__(self):
        self.sans = []

    def append(self, san):
        self.sans.append(san)

    def get_sans(self):
        return self.sans

    def min(self):
        """Length of the shortest SAN as measured by ``length``."""
        if not self.sans:
            return 0
        # length() yields a string; convert before comparing so that "9"
        # is not considered larger than "10".
        return min(int(length(row)) for row in self.sans)

    def max(self):
        """Length of the longest SAN as measured by ``length``."""
        if not self.sans:
            return 0
        return max(int(length(row)) for row in self.sans)

    def mean(self):
        """Mean number of characters per SAN."""
        if not self.sans:
            return 0
        # NOTE(review): uses plain len() while min()/max() use length();
        # confirm which measure is intended.
        return statistics.mean([len(row) for row in self.sans])

    def min_labels(self):
        if not self.sans:
            return 0
        return min(int(depth(row)) - 2 for row in self.sans)

    def max_labels(self):
        if not self.sans:
            return 0
        return max(int(depth(row)) - 2 for row in self.sans)

    def mean_labels(self):
        if not self.sans:
            return 0
        # NOTE(review): unlike min_labels()/max_labels() this does not
        # subtract 2 from the depth; confirm whether that is intended.
        return statistics.mean([int(depth(row)) for row in self.sans])

    def uniqueTLDsDomainCount(self):
        """Number of unique TLDs divided by the number of SANs."""
        if not self.sans:
            return 0
        return unique_tld(self.sans) / len(self.sans)

    def lenApexLCSNorm(self, lcs):
        """``lcs`` (an LCS length) divided by the number of SANs."""
        if not self.sans:
            return 0
        return lcs / len(self.sans)
import logging
import os
import tempfile
import unittest
from os import path

import requests

from richkit import analyse

logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    datefmt='%m-%d %H:%M',
                    level=logging.DEBUG)
logger = logging.getLogger(__name__)


class TestEffect2LD():
    """Downloads and parses the Public Suffix List test vectors."""

    temp_directory = tempfile.mkdtemp()
    MASTERURL = "https://raw.githubusercontent.com/publicsuffix/list/master/tests/test_psl.txt"
    # Joined as a path so the file lands inside the temporary directory.
    MASTERFILE = os.path.join(temp_directory, 'correct_test.txt')
    test = None

    @classmethod
    def fetch_tlds(cls, url=None):
        """Download the test list from ``url`` (default MASTERURL) to MASTERFILE."""
        url = url or cls.MASTERURL

        # grab master list
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            with open(cls.MASTERFILE, 'wb') as file:
                file.write(response.content)
        else:
            logger.error('Error while downloading the Test List status code: %s',
                         response.status_code)

    @classmethod
    def load_tlds(cls):
        """Load MASTERFILE into ``cls.test``; leave it untouched if unreadable."""
        try:
            with open(cls.MASTERFILE, 'r', encoding="utf8") as f:
                lines = f.readlines()
        except FileNotFoundError as e:
            logger.error("File not readable, not found %s", e)
            return

        # strip comments and blank lines
        stripped = (ln.strip() for ln in lines)
        cls.test = {ln for ln in stripped if len(ln) and ln[:2] != '//'}

    def load(self):
        """Load the test list, downloading it first when it is not cached."""
        if path.exists(TestEffect2LD.MASTERFILE):
            TestEffect2LD.load_tlds()

        if TestEffect2LD.test is None:
            TestEffect2LD.fetch_tlds()
            TestEffect2LD.load_tlds()

    def get_tests(self):
        """Return the argument list of every test line as ``"input,expected"``."""
        test_list = []
        for i in TestEffect2LD.test:
            # Keep what stands between the parentheses of the call.
            parser = i[i.find("(")+1:i.find(")")]
            test_list.append(parser.replace(" ", "").replace("null", "'None'"))
        return test_list


class TestAnalyse(unittest.TestCase):

    def setUp(self):
        self.domain = {
            'www.google.co.uk': {
                'num_tokens': 4,
                'len2ld': 12,
                'len_domain': 13,
                'domain_tld': "co.uk",
                'domain_sld': "google.co.uk",
                'second_label': "google",
                'language': "en",
                'nld': "www.google.co.uk",
                'n_label': "www",
                'entropy': 2.8553885422075336,
                'num_words_2ld': 0,
                'vowels': 5,
                'ratio_vowels': 0.4166666666666667,
                'num_of_consonants_2ld': 5,
                'ratio_consonants_2ld': 0.4166666666666667,
                'num_of_special_2ld': 0,
                'ratio_special_2ld': 0.0,
                'num_numeric_2ld': 0,
                'radio_numeric_2ld': 0.0,
                # following values are smaller than expected due to top 100 alexa which is expected
                'n_grams_2ld': 27.33635144637163,
                'n_grams_2ld_alexa': 27.33081895777167
            },
            'www.intranet.es.aau.dk': {
                'num_tokens': 5,
                'len2ld': 6,
                'len_domain': 18,
                'domain_tld': "dk",
                'domain_sld': 'aau.dk',
                'second_label': "aau",
                'language': "en",
                'nld': "es.aau.dk",
                'n_label': "es",
                'entropy': 2.2516291673878226,
                'num_words_2ld': 0,
                'vowels': 3,
                'ratio_vowels': 0.5,
                'num_of_consonants_2ld': 2,
                'ratio_consonants_2ld': 0.3333333333333333,
                'num_of_special_2ld': 0,
                'ratio_special_2ld': 0.0,
                'num_numeric_2ld': 0,
                'radio_numeric_2ld': 0.0,
                # this is 0.0 because of gathering top 100 alexa db, written for just ensuring test functions running correctly
                'n_grams_2ld': 0.0,
                'n_grams_2ld_alexa': 0.0
            }
        }
        self.data_path = "data/"

    def tearDown(self):
        """
        Removes the file after test is done.
        Could be modified in future according to need
        """
        if os.path.isfile('top-1m.csv'):
            os.remove('top-1m.csv')

    def _check(self, func, key, convert=None):
        """Assert ``func(domain) == expected[key]`` for every sample domain.

        ``convert`` is applied to the expected value first (several analyse
        functions return their number as a string).
        """
        for domain, expected in self.domain.items():
            wanted = expected[key]
            if convert is not None:
                wanted = convert(wanted)
            with self.subTest(domain=domain):
                self.assertEqual(func(domain), wanted)

    def test_tld(self):
        self._check(analyse.tld, 'domain_tld')

    def test_sld(self):
        self._check(analyse.sld, 'domain_sld')

    def test_sl_label(self):
        self._check(analyse.sl_label, 'second_label')

    def test_nld(self):
        self._check(lambda d: analyse.nld(d, 3), 'nld')

    def test_n_label(self):
        self._check(lambda d: analyse.n_label(d, 3), 'n_label')

    def test_depth(self):
        self._check(analyse.depth, 'num_tokens', str)

    def test_length(self):
        self._check(analyse.length, 'len_domain', str)

    def test_language(self):
        self._check(analyse.language, 'language')

    def test_entropy(self):
        self._check(analyse.entropy, 'entropy', str)

    def test_ratio_vowels(self):
        self._check(analyse.ratio_vowels, 'ratio_vowels', str)

    def test_number_vowels(self):
        self._check(analyse.number_vowels, 'vowels', str)

    def test_ratio_consonants(self):
        self._check(analyse.ratio_consonants, 'ratio_consonants_2ld', str)

    def test_number_consonants(self):
        self._check(analyse.number_consonants, 'num_of_consonants_2ld', str)

    def test_ratio_numerics(self):
        self._check(analyse.ratio_numerics, 'radio_numeric_2ld', str)

    def test_number_numerics(self):
        self._check(analyse.number_numerics, 'num_numeric_2ld', str)

    def test_ratio_specials(self):
        self._check(analyse.ratio_specials, 'ratio_special_2ld', str)

    def test_number_specials(self):
        self._check(analyse.number_specials, 'num_of_special_2ld', str)

    def test_number_words(self):
        self._check(analyse.number_words, 'num_words_2ld', str)

    def test_get_grams_alexa_2ld(self):
        self._check(lambda d: analyse.n_grams_alexa(d, is_test=True),
                    'n_grams_2ld_alexa')

    def test_get_grams_dict_2ld(self):
        self._check(lambda d: analyse.n_grams_dict(d, is_test=True),
                    'n_grams_2ld')

    def test_correctly_tlds(self):
        tests = TestEffect2LD()
        tests.load()
        test_list = tests.get_tests()

        # Test skipped for the following list
        # Punycode are not handled by this library
        list_punycode_tests = [
            'xn--85x722f.xn--55qx5d.cn',
            'xn--85x722f.xn--fiqs8s',
            'xn--55qx5d.cn',
            'shishi.xn--55qx5d.cn',
            'www.xn--85x722f.xn--fiqs8s',
            'www.xn--85x722f.xn--55qx5d.cn',
            'shishi.xn--fiqs8s'
        ]

        # Test skipped for obvious invalid domains
        list_test_error = [
            '公司.cn',
            '中国',
            'biz',
            'jp',
            'us',
            'com',
            'a.b.example.example',
            'b.example.example',
            'example.example',
            '.example.com',
            '.com',
        ]

        # Test skipped for the following domains list
        # They start with exclamation point on the Public Suffix list
        list_esclamation_point = [
            'www.ck',
            'www.city.kobe.jp',
            'www.www.ck',
            'city.kobe.jp'
        ]

        skipped = set(list_punycode_tests)
        skipped.update(list_test_error, list_esclamation_point)

        for case in test_list:
            values = case.split(',')
            domain = values[0].replace("'", "")
            expected = values[1].replace("'", "")
            if expected == "None":
                expected = None

            if domain in skipped:
                continue
            self.assertEqual(analyse.sld(domain), expected)
class LookupTestCase(unittest.TestCase):
    """Tests for the helpers exposed by ``richkit.lookup``.

    The lookups for ``8.8.8.8`` are compared against fixed expected values.
    After every test, each entry directly under ``util.maxmind_directory``
    is removed again.
    """

    def tearDown(self):
        """Remove every entry found directly under ``util.maxmind_directory``."""
        for el in Path(util.maxmind_directory).glob('*'):
            rm_recursive(el)

    def test_country(self):
        """``lookup.country`` is expected to return ``'US'`` for 8.8.8.8."""
        country = lookup.country("8.8.8.8")
        self.assertEqual(country, 'US')

    def test_asn(self):
        """``lookup.asn`` is expected to return ``'AS15169'`` for 8.8.8.8."""
        asn = lookup.asn("8.8.8.8")
        self.assertEqual(asn, 'AS15169')

    def test_registered_country(self):
        """``lookup.registered_country`` is expected to return ``'US'``."""
        registered_country = lookup.registered_country("8.8.8.8")
        self.assertEqual(registered_country, 'US')

    def test_maxmindb_licence_key(self):
        """Check the license key lookup for a set and an unset variable.

        NOTE(review): this assumes ``lookup.maxmindb_licence_key`` returns
        the value of the named environment variable, and the string
        ``'NOLICENSEKEYFOUND'`` when the variable is missing - confirm
        against the implementation.
        """
        env_name = "TEST_LICENSE_KEY"
        test_license_key = "LICENSEKEY"
        os.environ[env_name] = test_license_key
        # Do not leak the fake key into tests that run afterwards.
        self.addCleanup(os.environ.pop, env_name, None)

        license_key = lookup.maxmindb_licence_key(env_name)
        non_existing_license_key = lookup.maxmindb_licence_key("NON-EXISTING")

        # assertTrue(a, b) would use ``b`` as the failure message and pass for
        # any truthy ``a``; compare the values instead.
        self.assertEqual(license_key, test_license_key)
        # Compare by value: identity of equal strings is an interpreter
        # implementation detail.
        self.assertEqual(non_existing_license_key, 'NOLICENSEKEYFOUND')
class MaxMindDBTestCase(unittest.TestCase):
    """Tests for ``MaxMindDB``: construction, database path lookup,
    download and unpacking.

    ``setUp`` reads the ``MAXMIND_LICENSE_KEY`` environment variable, so
    every test errors with ``KeyError`` when it is not set.
    """

    @staticmethod
    def _licensed_url():
        """Return the GeoLite2-Country download URL with the license key
        taken from the ``MAXMIND_LICENSE_KEY`` environment variable."""
        return (
            "https://download.maxmind.com/app/geoip_download?"
            "edition_id=GeoLite2-Country&"
            "license_key={license_key}&"
            "suffix=tar.gz"
        ).format(
            license_key=os.environ['MAXMIND_LICENSE_KEY'],
        )

    def setUp(self):
        """Silence logging and point ``MaxMindDB.MASTERURL`` at the real URL."""
        # Remove the logging for tests, but only for the duration of one
        # test: re-enable it afterwards so other test modules are unaffected.
        logging.disable(logging.CRITICAL)
        self.addCleanup(logging.disable, logging.NOTSET)

        MaxMindDB.MASTERURL = self._licensed_url()

    def tearDown(self):
        """Delete everything found directly under ``util.maxmind_directory``."""
        for el in Path(util.maxmind_directory).glob('*'):
            rm_recursive(el)

    def test_init(self):
        """Constructing a ``MaxMindDB`` yields an object."""
        obj = MaxMindDB(MaxMindDB.MASTERURL, "cc")
        self.assertIsNotNone(obj)

    def test_get_db_path(self):
        """``get_db_path`` with zero, one and two database folders present."""
        s = StubMaxMindDB()

        # No db present
        p = s.get_db_path()
        self.assertIsNone(p)

        # Single db present
        folder = Path(s.path_db, 'GeoLite2-Country_DUMMYFOLDER_1970')
        folder.mkdir()
        db_file = Path(folder, 'GeoLite2-Country.mmdb')
        db_file.touch()
        p = MaxMindDB.get_db_path(s)

        self.assertEqual(p, str(db_file))

        # Two dbs present: the file in the ..._2040 folder is expected back
        folder = Path(s.path_db, 'GeoLite2-Country_DUMMYFOLDER_2040')
        folder.mkdir()
        db_file = Path(folder, 'GeoLite2-Country.mmdb')
        db_file.touch()
        p = MaxMindDB.get_db_path(s)

        self.assertEqual(p, str(db_file))

    def test_get_db(self):
        """``get_db`` for an unresolvable host, a broken URL and the real URL."""

        # When DNS fails:
        MaxMindDB.MASTERURL = (
            "https://this_domain_does_not_exist.local"
            "/download/geoip/database/GeoLite2-Country.tar.gz"
        )

        with self.assertRaises(ConnectionError):
            MaxMindDB(MaxMindDB.MASTERURL, "cc").get_db()

        # When URL is bad: corrupt the *real* download URL (the unresolvable
        # one above contains no '?', so replacing on it changed nothing) and
        # call get_db on an instance. Calling it on the class only raised
        # TypeError for the missing ``self``, which made the check vacuous.
        # NOTE(review): assumes get_db raises for a URL the server rejects -
        # confirm against the implementation.
        MaxMindDB.MASTERURL = self._licensed_url().replace(
            '?', "THIS_URL_IS_WRONG")

        with self.assertRaises(Exception):
            StubMaxMindDB().get_db()

        # When all is fine:
        MaxMindDB.MASTERURL = self._licensed_url()
        s = StubMaxMindDB()
        s.get_db()
        # Check if file is present
        p = s.get_db_path()
        s_age = s.get_age()  # get_age() function is tested over here
        self.assertIsNotNone(p, "get_db did not a path to the db")
        self.assertTrue(Path(p).exists())
        # Presumably a timedelta (the previous check read ``.microseconds``,
        # which is legitimately 0 now and then); only require a sane age.
        self.assertGreaterEqual(s_age.total_seconds(), 0)

    def test_extracted_db(self):
        """``unpack`` is expected to raise on a stub that downloaded nothing."""
        s = StubMaxMindDB()
        # When fail to extract the DB
        with self.assertRaises(Exception):
            s.unpack()
available") 42 | self.assertIsNotNone(obj) 43 | 44 | def test_init_certificate(self): 45 | obj = X509("12345678") 46 | if not obj.certificates_features: 47 | self.skipTest("Server not available") 48 | self.assertIsNotNone(obj) 49 | 50 | def test_domain_error(self): 51 | with self.assertRaises(Exception): 52 | DomainCertificates("this_domain_does_not_exist.com") 53 | 54 | def test_certificate_error(self): 55 | with self.assertRaises(Exception): 56 | X509("this_id_does_not_exist.com") 57 | 58 | def test_get_all_certificate(self): 59 | 60 | for k, v in self.domains.items(): 61 | certs = ct.get_logs(k) 62 | print(certs) 63 | if certs is None: 64 | self.skipTest("Server not available") 65 | 66 | for cert in certs: 67 | for vx in v["certs"]: 68 | if str(cert["ID"]) == str(vx["ID"]): 69 | assert cert["Algorithm"] == vx["Algorithm"] 70 | assert cert["SANFeatures"]["DomainCount"] == vx["SANFeatures"]["DomainCount"] 71 | 72 | def test_get_certificate_features(self): 73 | 74 | for k, v in self.domains.items(): 75 | for cert in v["certs"]: 76 | cert_features = ct.get_certificates_features(cert["ID"]) 77 | if not cert_features: 78 | continue 79 | assert cert_features.get('DomainCount') == cert["SANFeatures"]["DomainCount"] 80 | -------------------------------------------------------------------------------- /richkit/test/retrieve/test_dns.py: -------------------------------------------------------------------------------- 1 | from richkit.retrieve import dns 2 | 3 | import unittest 4 | 5 | 6 | class DNSTestCase(unittest.TestCase): 7 | # Since A record change every time, just checking whether we are retrieving a record or not 8 | def setUp(self): 9 | self.test_urls = ["www.google.co.uk", "www.cloudflare.com", "www.intranet.es.aau.dk"] 10 | self.test_ips = ["8.8.8.8", "8.8.4.4", "1.1.1.1"] 11 | 12 | def test_a_record(self): 13 | for url in self.test_urls: 14 | instance = dns.get_a_record(url) 15 | self.assertIsNot(instance[0], None) 16 | 17 | # Since PTR record change every time, 
class SymantecTestCase(unittest.TestCase):
    """Tests for the category helpers in ``richkit.retrieve.symantec``.

    The tests write ``CAT_URLS_FILE`` and ``CATEGORIES_FILE_PATH`` into the
    current working directory; both are removed once the class has finished.
    """

    @staticmethod
    def _remove_if_present(file_path):
        """Delete *file_path*; a file that does not exist is not an error."""
        try:
            os.unlink(file_path)
        except FileNotFoundError:
            pass

    @classmethod
    def tearDownClass(cls):
        """
        Removes created resources during test phase
        """
        # Only remove the files named by this module. Globbing '*.txt' would
        # also delete unrelated files (e.g. requirements.txt) when the tests
        # are started from the repository root.
        # NOTE(review): if the library writes other files by default, add
        # them here.
        for created in (CAT_URLS_FILE, CATEGORIES_FILE_PATH):
            cls._remove_if_present(created)

    def test_read_categorized_file(self):
        """``read_categorized_file`` with a missing, an empty and a filled file."""
        cat_urls_file_path = Path(CAT_URLS_FILE)
        # Read with missing file
        self._remove_if_present(CAT_URLS_FILE)
        self.assertIsInstance(read_categorized_file(), dict)

        # Read with empty file
        cat_urls_file_path.touch()

        d = read_categorized_file(CAT_URLS_FILE)
        self.assertIsInstance(d, dict)
        self.assertEqual(len(d), 0)

        # Read something already in file
        with open(CAT_URLS_FILE, 'w') as fd:
            fd.writelines([
                'www.example.com,Example'
            ])
        d = read_categorized_file(CAT_URLS_FILE)
        self.assertIsInstance(d, dict)
        self.assertEqual(len(d), 1)
        self.assertEqual(d['www.example.com'], 'Example')

    def test_fetch_categories(self):
        """Fetching ``categories_url`` must give a non-empty result."""
        # make sure that categories url is accessible and fetched correctly
        categories = fetch_categories(categories_url, CATEGORIES_FILE_PATH)
        self.assertNotEqual(categories, {})

    def test_load_categories(self):
        """``load_categories`` gives ``{}`` exactly when the file is absent."""
        # Whether the file exists depends on which tests ran before this one,
        # so both situations are accepted.
        if os.path.isfile(CATEGORIES_FILE_PATH):
            self.assertNotEqual(load_categories(CATEGORIES_FILE_PATH), {})
        else:
            self.assertEqual(load_categories(CATEGORIES_FILE_PATH), {})

    def test_fetch_from_internet(
            self,
            categories_file_path=CATEGORIES_FILE_PATH,
            categorized_url_path=CAT_URLS_FILE
    ):
        """Each listed URL must be reported under its expected category."""
        # CAT_URLS_FILE has to be gone afterwards, otherwise the dictionary
        # length checked in test_read_categorized_file would be different.
        # Registering the removal up front makes it happen even when an
        # assertion below fails.
        self.addCleanup(self._remove_if_present, CAT_URLS_FILE)

        domain_categories = {
            "Search Engines/Portals": [
                "www.bing.com",
                "www.google.com",
                "www.yandex.com"
            ],
            "Social Networking": [
                "www.facebook.com",
                "www.twitter.com"
            ]
        }
        for category, url_list in domain_categories.items():
            for url in url_list:
                # subTest reports every failing URL instead of stopping at
                # the first one.
                with self.subTest(url=url):
                    fetched_category = fetch_from_internet(
                        url, categories_file_path, categorized_url_path
                    )
                    self.assertEqual(fetched_category, category)
| for k, v in self.test_urls.items(): 55 | instance = URLVoid(k) 56 | domain_asn = instance.get_asn() 57 | self.assertEqual(domain_asn, v["ASN"]) 58 | 59 | class StubURLVoid(URLVoid): 60 | def __init__(self, asn): 61 | self.domain = None 62 | self.value = {'ASN': asn} 63 | 64 | self.assertIsNone(StubURLVoid('AZ1 Not a valid ASN').get_asn()) 65 | self.assertEqual(StubURLVoid('AS1').get_asn(), 'AS1') 66 | self.assertEqual(StubURLVoid('AS1 Random-Test-Text').get_asn(), 'AS1') 67 | self.assertEqual(StubURLVoid('AS1234567890').get_asn(), 'AS1234567890') 68 | # Strictly speaking, the below tests are correct, but covering them 69 | # is deemed unnecessary complex: 70 | # self.assertIsNone( 71 | # StubURLVoid('AS12345678901').get_asn(), 72 | # ("Failed to reject ASN of 10 decimal digits (One more digit that" 73 | # "possible with RFC 6793)"), 74 | # ) 75 | # self.assertIsNone( 76 | # StubURLVoid('AS4294967295').get_asn(), 77 | # "Failed to reject ASN 0xFFFFFFFF + 0x1 (RFC 6793 max value + 1)", 78 | # ) 79 | 80 | def test_blacklist_status(self): 81 | for k, v in self.test_urls.items(): 82 | instance = URLVoid(k) 83 | blacklist_status = instance.blacklist_status() 84 | self.assertTrue(re.match(r'[0]/\d*', blacklist_status)) 85 | 86 | 87 | if __name__ == '__main__': 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /richkit/test/retrieve/test_whois.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datetime import datetime 3 | from richkit.retrieve import whois 4 | 5 | 6 | class WhoisTestCase(unittest.TestCase): 7 | 8 | # .dk domains give unknownTld exception ! 
"""Packaging script for richkit."""
import setuptools
from os import path

# Resolve README.md relative to this script, not the current working
# directory, so the build works no matter where it is invoked from.
root = path.abspath(path.dirname(__file__))
with open(path.join(root, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

setuptools.setup(
    name='richkit',
    description='Domain enrichment kit ',
    version='1.1.1',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/aau-network-security/richkit',
    # find_packages matches *dotted package names*; a path such as
    # 'richkit/test' never matches anything, so the tests were being shipped.
    packages=setuptools.find_packages(
        exclude=['docs', 'richkit.test', 'richkit.test.*'],
    ),
    project_urls={
        'Bug Reports': 'https://github.com/aau-network-security/richkit/issues',
        'Funding': 'https://donate.pypi.org',
        'Source': 'https://github.com/aau-network-security/richkit',
    },
    # NOTE(review): 'pytest' and 'coverage' are test tools rather than
    # runtime requirements. They are left here for backward compatibility;
    # consider moving them to extras_require.
    install_requires=['maxminddb',
                      'numpy==1.17.2',
                      'scikit-learn==0.21.3',
                      'langid==1.1.6',
                      'bs4==0.0.1',
                      'lxml==4.4.1',
                      'requests==2.22.0',
                      'pytest',
                      'dnspython',
                      'coverage'],
    python_requires='>=3.5',
    # The author metadata field is a single string, not a list.
    author='Ahmet Turkmen, Gian Marco Mennecozzi, Egon Kidmose',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
    ],

)