├── .dockerignore
├── .githooks
    ├── check-branch-name.py
    └── pre-commit.linux.sample
├── .github
    ├── local-test
    │   └── run-test.sh
    ├── logo
    │   ├── blue
    │   │   ├── logo_desc
    │   │   │   ├── 192x192.png
    │   │   │   ├── 256x256.png
    │   │   │   ├── 320x320.png
    │   │   │   ├── 480x480.png
    │   │   │   └── 512x512.png
    │   │   └── logo_no_desc
    │   │   │   ├── 192x192.png
    │   │   │   ├── 256x256.png
    │   │   │   ├── 320x320.png
    │   │   │   ├── 480x480.png
    │   │   │   └── 512x512.png
    │   └── white
    │   │   ├── logo_desc
    │   │       ├── 192x192.png
    │   │       ├── 256x256.png
    │   │       ├── 320x320.png
    │   │       ├── 480x480.png
    │   │       └── 512x512.png
    │   │   └── logo_no_desc
    │   │       ├── 192x192.png
    │   │       ├── 256x256.png
    │   │       ├── 320x320.png
    │   │       ├── 480x480.png
    │   │       └── 512x512.png
    └── workflows
    │   ├── formatpythoncode.yml
    │   ├── pythonpackage.yml
    │   └── pythonpublish.yml
├── .gitignore
├── Dockerfile.test
├── LICENSE
├── Makefile
├── README.md
├── docs
    ├── Makefile
    ├── _static
    │   └── .gitkeep
    ├── conf.py
    ├── index.rst
    └── make.bat
├── requirements.txt
├── richkit
    ├── __init__.py
    ├── analyse
    │   ├── __init__.py
    │   ├── analyse.py
    │   ├── segment.py
    │   └── util.py
    ├── lookup
    │   ├── __init__.py
    │   ├── geo.py
    │   └── util.py
    ├── retrieve
    │   ├── __init__.py
    │   ├── cert_sh.py
    │   ├── ctlogs.py
    │   ├── data
    │   │   ├── .gitkeep
    │   │   └── categories_list.txt
    │   ├── dns.py
    │   ├── symantec.py
    │   ├── urlvoid.py
    │   ├── whois.py
    │   └── x509.py
    └── test
    │   ├── __init__.py
    │   ├── analyse
    │       ├── __init__.py
    │       └── test_analyse.py
    │   ├── lookup
    │       ├── __init__.py
    │       ├── test_geo.py
    │       └── test_util.py
    │   └── retrieve
    │       ├── __init__.py
    │       ├── test_ctlogs.py
    │       ├── test_dns.py
    │       ├── test_symantec.py
    │       ├── test_urlvoid.py
    │       ├── test_whois.py
    │       └── test_x509.py
└── setup.py


/.dockerignore:
--------------------------------------------------------------------------------
 1 | .pytest_cache
 2 | .githooks
 3 | docs
 4 | .github/logo
 5 | .github/workflows
 6 | 
 7 | 
 8 | .env
 9 | .venv
10 | env/
11 | venv/
12 | ENV/
13 | env.bak/
14 | venv.bak/
15 | 
16 | 


--------------------------------------------------------------------------------
/.githooks/check-branch-name.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import re
  3 | import sys
  4 | 
  5 | 
  6 | def check(name):
  7 |     """Check a git branch name against gitflow naming conventions.
  8 | 
  9 |     This is most likely the function you are looking for.
 10 | 
 11 |     """
 12 |     if name in (  # First level only branches
 13 |             'master',
 14 |             'develop',
 15 |     ):
 16 |         return True
 17 |     elif len(name.split('/')) == 2:
 18 |         # some have two levels separated by /
 19 |         return checkSecondLevel(name)
 20 |     else:
 21 |         # Default
 22 |         print(f'Error: Did not recognise "{name}" as a valid branch.')
 23 |         return False
 24 | 
 25 | 
 26 | def checkLen(string, min_len, max_len):
 27 |     if len(string) < min_len:
 28 |         print(
 29 |             f'Error: {string} is too short'
 30 |             f' (it is {len(string)}, minimum is {min_len})'
 31 |         )
 32 |         return False
 33 |     if len(string) > max_len:
 34 |         print(
 35 |             f'Error: {string} is too long'
 36 |             f' (it is {len(string)}, maximum is {max_len})'
 37 |         )
 38 |         return False
 39 |     else:
 40 |         return True
 41 | 
 42 | 
 43 | def checkSecondLevel(name):
 44 |     """Checks the name to be a valid gitflow branch name containing a `/`.
 45 | 
 46 |     This is intended for internal use, and asumes a single `/` to be
 47 |     present in `name`.
 48 | 
 49 |     """
 50 |     category, label = name.split('/')
 51 | 
 52 |     if category in (  # valid categories
 53 |             'feature',
 54 |             'hotfix',
 55 |     ):
 56 |         return checkLabel(label)
 57 |     elif category in (  # Not currently validating release branch names
 58 |             'release',
 59 |     ):
 60 |         return True
 61 |     else:
 62 |         print(f'Error: Did not recognise "{category}" as a valid category')
 63 |         return False
 64 | 
 65 | 
 66 | def checkLabel(label):
 67 |     """Checks the label to have a description of one or more words
 68 |     (lowercase alphanumerics), joined by a dash (`-`), followed by an
 69 |     issue reference.
 70 | 
 71 |     Example: word-and-numb3r-#1
 72 | 
 73 |     """
 74 |     # Description
 75 |     desc_re = r'(?P<description>[a-z0-9]+(?:-[a-z0-9]+)*)'  # one or more words
 76 |     desc_re = r'^' + desc_re  # must be at begining
 77 |     m = re.search(desc_re, label)
 78 |     if not m:
 79 |         print(
 80 |             f'Error: No valid description in "{label}"'
 81 |             f' (Expected it to start with lowercase alphanumeric and dashes'
 82 |             f' like this: ex4mple-description)'
 83 |         )
 84 |         return False
 85 | 
 86 |     if not checkLen(m.groupdict()['description'], 10, 25):
 87 |         return False
 88 | 
 89 |     # Issue reference
 90 |     issue_re = r'(?P<issue>#[0-9]+)'  # hashtag and integer
 91 |     issue_re = issue_re + r'$'  # must be at end
 92 |     if not re.search(issue_re, label):
 93 |         print(
 94 |             f'Error: No issue reference in "{label}"'
 95 |             f' (Expected it to in like this: ...-#1)'
 96 |         )
 97 |         return False
 98 | 
 99 |     # Dash seperator
100 |     label_re = desc_re + r'-' + issue_re
101 |     if not re.search(label_re, label):
102 |         print(
103 |             f'Error: Missing dash between description and issue reference '
104 |             f' in "{label}"'
105 |         )
106 |         return False
107 | 
108 |     return True  # no problems found
109 | 
110 | 
if __name__ == "__main__":
    # Command line entry point: validate NAME, or run the self-tests with -t.

    parser = argparse.ArgumentParser(
        description='Validate branch name according to gitflow',
    )
    parser.add_argument(
        '-t', '--test', dest='test', action='store_true',
        help='Run the built in tests and exit',
    )
    parser.add_argument(
        # Optional so that `--test` can be used without a dummy branch name
        'name', metavar='NAME', type=str, nargs='?',
        help='The branch name to check (not needed with --test)'
    )
    args = parser.parse_args()

    if not args.test:
        if args.name is None:
            # Same usage error argparse gave when NAME was mandatory
            parser.error('the following arguments are required: NAME')
        # Exit status 0 for a valid name, 1 for an invalid one
        sys.exit(0 if check(args.name) else 1)

    print('Starting built-in self-testing')
    print('Expect error messages, but not AssertionError\'s')
    assert check('master')
    assert check('develop')
    assert not check('random')  # no custom at top level
    assert not check('alkshjdg')  # no custom at top level
    assert not check('master/asdasdasdasdasdasd')  # nothing below master
    assert not check('develop/asdasdasdasdasdas')  # nothing below develop
    assert check('feature/some-feature-#9')  # good
    assert not check('feature/2-shrt-fe#1')  # too short
    assert not check('feature/very-long-description-here-#1')  # too long
    print('Done - either all tests passed or you disable `assert`')
143 | 


--------------------------------------------------------------------------------
/.githooks/pre-commit.linux.sample:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | ## Would be nice to have linting before commit
 4 | if ! [ -x "$(command -v autopep8)" ] || [ "$(pip3 list |
 5 |                                           cut -d " " -f 1 |
 6 |                                           grep -xF "$package_name" | grep autopep8)" != "autopep8" ]
 7 | then
 8 |     echo 'autopep8 is NOT installed, linting test may fail on CI ... '
 9 |     echo 'consider to install autopep8, you may use following commands: '
10 |     echo 'Debian: [ sudo apt-get install -y python-autopep8 ] '
11 |     echo 'MacOS: [ brew install autopep8 ]'
12 |     echo 'You may consider to install it into virtual environment of your project:'
13 |     echo 'source venv/bin/activate'
14 |     echo 'pip3 install autopep8'
15 |     echo 'autopep8 should be available in your system, to do not face with linting problem.'
16 |     exit 1
17 | else
18 |     echo 'Linting...'
19 |     echo 'Going to root directory of the project'
20 |     cd ../richkit
21 |     autopep8 --in-place --recursive --max-line-length=100 --exclude docs/source/conf.py,venv,__pycache__,old,build,dist .
22 | fi
23 | 
24 | python3 .githooks/check-branch-name.py "$(git rev-parse --abbrev-ref HEAD)"
25 | exit $?


--------------------------------------------------------------------------------
/.github/local-test/run-test.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env sh
 2 | 
 3 | NC='\033[0m'
 4 | RED='\033[0;31m'
 5 | ORANGE='\033[0;33m'
 6 | GREEN='\033[0;32m'
 7 | 
 8 | if [ "$MAXMIND_LICENSE_KEY" = "" ] ; then
 9 |    echo "${ORANGE} Warning: Environment variable for MAXMINDDB could not be found, proceeding without it, check README file "
10 | fi
11 | # change directory to /richkit
12 | 
13 | cd /richkit
14 | 
15 | echo "${GREEN}1. Checking flake8 linting ... "
16 |  # test that number of violations does not increase
17 | FLAKE8_ERROR_CNT=$(flake8 . -qq --count --exit-zero --max-complexity=10 --max-line-length=127 --exclude venv,__pycache__,docs/source/conf.py,old,build,dist)
18 | FLAKE8_ERROR_LIMIT=25
19 | if [ "$FLAKE8_ERROR_CNT" -gt "$FLAKE8_ERROR_LIMIT" ] ; then
20 |   echo "${RED}Failed because the number of errors from flake8 increased (This: $FLAKE8_ERROR_CNT Previously: $FLAKE8_ERROR_LIMIT)" 1>&2
21 |   false
22 |   exit 1
23 | fi
24 | echo "${ORANGE}Number of validation errors from flake8 is: $FLAKE8_ERROR_CNT (Limit is: $FLAKE8_ERROR_LIMIT)"
25 | 
26 | 
27 | echo "${GREEN}2. Testing module .... "
28 | echo "${NC}"
29 | coverage run --source=richkit -m pytest -Werror /richkit/richkit
30 | 
31 | 


--------------------------------------------------------------------------------
/.github/logo/blue/logo_desc/192x192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_desc/192x192.png


--------------------------------------------------------------------------------
/.github/logo/blue/logo_desc/256x256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_desc/256x256.png


--------------------------------------------------------------------------------
/.github/logo/blue/logo_desc/320x320.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_desc/320x320.png


--------------------------------------------------------------------------------
/.github/logo/blue/logo_desc/480x480.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_desc/480x480.png


--------------------------------------------------------------------------------
/.github/logo/blue/logo_desc/512x512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_desc/512x512.png


--------------------------------------------------------------------------------
/.github/logo/blue/logo_no_desc/192x192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_no_desc/192x192.png


--------------------------------------------------------------------------------
/.github/logo/blue/logo_no_desc/256x256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_no_desc/256x256.png


--------------------------------------------------------------------------------
/.github/logo/blue/logo_no_desc/320x320.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_no_desc/320x320.png


--------------------------------------------------------------------------------
/.github/logo/blue/logo_no_desc/480x480.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_no_desc/480x480.png


--------------------------------------------------------------------------------
/.github/logo/blue/logo_no_desc/512x512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/blue/logo_no_desc/512x512.png


--------------------------------------------------------------------------------
/.github/logo/white/logo_desc/192x192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_desc/192x192.png


--------------------------------------------------------------------------------
/.github/logo/white/logo_desc/256x256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_desc/256x256.png


--------------------------------------------------------------------------------
/.github/logo/white/logo_desc/320x320.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_desc/320x320.png


--------------------------------------------------------------------------------
/.github/logo/white/logo_desc/480x480.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_desc/480x480.png


--------------------------------------------------------------------------------
/.github/logo/white/logo_desc/512x512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_desc/512x512.png


--------------------------------------------------------------------------------
/.github/logo/white/logo_no_desc/192x192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_no_desc/192x192.png


--------------------------------------------------------------------------------
/.github/logo/white/logo_no_desc/256x256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_no_desc/256x256.png


--------------------------------------------------------------------------------
/.github/logo/white/logo_no_desc/320x320.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_no_desc/320x320.png


--------------------------------------------------------------------------------
/.github/logo/white/logo_no_desc/480x480.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_no_desc/480x480.png


--------------------------------------------------------------------------------
/.github/logo/white/logo_no_desc/512x512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/.github/logo/white/logo_no_desc/512x512.png


--------------------------------------------------------------------------------
/.github/workflows/formatpythoncode.yml:
--------------------------------------------------------------------------------
 1 | name: Format python code
 2 | on: push
 3 | jobs:
 4 |   autopep8:
 5 |     runs-on: ubuntu-latest
 6 |     steps:
 7 |       - uses: actions/checkout@v1
 8 |       - name: autopep8
 9 |         uses: peter-evans/autopep8@v1.1.0
10 |         with:
11 |           args: --recursive --in-place --aggressive --aggressive **/*.py
12 |       - name: Create Pull Request
13 |         uses: peter-evans/create-pull-request@v1
14 |         with:
15 |           token: ${{ secrets.GITHUB_TOKEN }}
16 |           commit-message: autopep8 action fixes
17 |           author-email: mrturkmen06@users.noreply.github.com
18 |           author-name: Ahmet Turkmen
19 |           title: Fixes by autopep8 action
20 |           body: This is an auto-generated PR with fixes by autopep8.
21 |           labels: autopep8, automated pr
22 |           branch: autopep8-patches


--------------------------------------------------------------------------------
/.github/workflows/pythonpackage.yml:
--------------------------------------------------------------------------------
  1 | name: Python package
  2 | 
  3 | on: 
  4 |   push:
  5 |     paths-ignore:
  6 |       - 'README.md'
  7 |       - 'LICENCE'
  8 | jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v1
    # This job has no build matrix, so the Python version is spelled out
    - name: Set up Python 3.7
      uses: actions/setup-python@v1
      with:
        python-version: 3.7
    - name: Install flake8
      run: pip install flake8
    - name: Check for syntax errors or undefined names
      run: |
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
    - name: Lint with flake8
      run: |
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127
    - name: Check that number of pep8 violations is not going up
      run: |
        # test that number of violations does not increase
        FLAKE8_ERROR_CNT=$(flake8 . -qq --count --exit-zero --max-complexity=10 --max-line-length=127 --exclude venv,__pycache__,docs/source/conf.py,old,build,dist)
        FLAKE8_ERROR_LIMIT=25
        if [ "$FLAKE8_ERROR_CNT" -gt "$FLAKE8_ERROR_LIMIT" ] ; then
            echo "Failed because the number of errors from flake8 increased (This: $FLAKE8_ERROR_CNT Previously: $FLAKE8_ERROR_LIMIT)" 1>&2
            false
        fi
        echo "Number of validation errors from flake8 is: $FLAKE8_ERROR_CNT (Limit is: $FLAKE8_ERROR_LIMIT)"
 36 | 
 37 |   formalities:
 38 |     runs-on: ubuntu-latest
 39 |     steps:
 40 |     - uses: actions/checkout@v1
 41 |     - name: Set up Python ${{ matrix.python-version }}
 42 |       uses: actions/setup-python@v1
 43 |       with:
 44 |         python-version: 3.7
 45 |     - name: Extract branch name
 46 |       shell: bash
 47 |       run: echo "::set-env name=BRANCH_NAME::$(echo ${GITHUB_REF#refs/heads/})"
 48 |     - name: Check branch name
 49 |       run: |
 50 |         echo "Checking ${BRANCH_NAME}..."
 51 |         python3 .githooks/check-branch-name.py "$BRANCH_NAME"
 52 | 
 53 |   test:
 54 |     runs-on: ${{ matrix.os }}
 55 |     strategy:
 56 |       max-parallel: 4
 57 |       matrix:
 58 |         python-version: [3.7]
 59 |         os: [windows-latest,ubuntu-latest, macOS-latest]
 60 | 
 61 |     steps:
 62 |     - uses: actions/checkout@v1
 63 |     - name: Set up Python ${{ matrix.python-version }}
 64 |       uses: actions/setup-python@v1
 65 |       with:
 66 |         python-version: ${{ matrix.python-version }}
 67 | 
 68 |     - name: Install dependencies
 69 |       run: |
 70 |         python -m pip install --upgrade pip
 71 |         pip install -r requirements.txt
 72 |         pip install coverage pytest sphinx
 73 | 
 74 |     - name: Test with pytest
 75 |       env:
 76 |         MAXMIND_LICENSE_KEY: ${{ secrets.MAXMIND_LICENSE_KEY }}
 77 |       run: |
 78 |         coverage run --source=richkit -m pytest -Werror --ignore src/python-whois
 79 |     - name: Coverage report
 80 |       run: |
 81 |         coverage report --fail-under=79
 82 |  
 83 |     - name: Doctest
 84 |       env:
 85 |         MAXMIND_LICENSE_KEY: ${{ secrets.MAXMIND_LICENSE_KEY }}
 86 |       run: |
 87 |         python -m doctest -v README.md
 88 |         cd docs
 89 |         make doctest
 90 |     - name: Documentation coverage
 91 |       env:
 92 |         MAXMIND_LICENSE_KEY: "DUMMY: A valid license is not needed here"
 93 |       run: |
 94 |         cd docs
 95 |         make coverage
 96 |         python -c "from pathlib import Path; print(Path('_build/coverage/python.txt').read_text())" # this prints dosctring coverage report
 97 |     - name: Build documentation
 98 |       run: |
 99 |         cd docs
100 |         make html
101 | 


--------------------------------------------------------------------------------
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Python Package
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - '*.*.*'
 7 | jobs:
 8 |   deploy:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |     - uses: actions/checkout@v1
12 |     - name: Set up Python
13 |       uses: actions/setup-python@v1
14 |       with:
15 |         python-version: '3.7'
16 |     - name: Install dependencies
17 |       run: |
18 |         python -m pip install --upgrade pip
19 |         pip install setuptools wheel twine
20 |     - name: Publish
21 |       env:
22 |         TWINE_USERNAME: ${{ secrets.PIP_USERNAME }}
23 |         TWINE_PASSWORD: ${{ secrets.PIP_TOKEN }}
24 |       run: |
25 |         python setup.py sdist bdist_wheel
26 |         twine upload  dist/*
27 |   ## Initialize release process
28 |     - name: Checkout code
29 |       uses: actions/checkout@master
30 |     - name: Create Release
31 |       id: create_release
32 |       uses: actions/create-release@v1.0.0
33 |       env:
34 |         GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
35 |       with:
36 |         tag_name: ${{ github.ref }}
37 |         release_name: Release ${{ github.ref }}
38 |         draft: false
39 |         prerelease: false
40 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | .pytest_cache/
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | .idea/*
 28 | .idea
 29 | 
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | .DS_Store
 53 | richkit/test/.DS_Store
 54 | # Local data
 55 | richkit/retrieve/data/*.txt
 56 | richkit/test/analyse/data/*.csv
 57 | # Translations
 58 | *.mo
 59 | *.pot
 60 | categories_list.txt
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | db.sqlite3
 65 | 
 66 | # Flask stuff:
 67 | instance/
 68 | .webassets-cache
 69 | 
 70 | # Scrapy stuff:
 71 | .scrapy
 72 | 
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # pyenv
 84 | .python-version
 85 | 
 86 | # celery beat schedule file
 87 | celerybeat-schedule
 88 | 
 89 | # SageMath parsed files
 90 | *.sage.py
 91 | 
 92 | # Environments
 93 | .env
 94 | .venv
 95 | env/
 96 | venv/
 97 | ENV/
 98 | env.bak/
 99 | venv.bak/
100 | 
101 | # Spyder project settings
102 | .spyderproject
103 | .spyproject
104 | 
105 | # Rope project settings
106 | .ropeproject
107 | 
108 | # mkdocs documentation
109 | /site
110 | 
111 | # mypy
112 | .mypy_cache/
113 | .idea/*
114 | richkit/lookup/data/*
115 | 
116 | # caches for of resources fetched from Internet, used in richkit
117 | richkit/retrieve/data/categorized_urls.txt


--------------------------------------------------------------------------------
/Dockerfile.test:
--------------------------------------------------------------------------------
 1 | FROM ubuntu
 2 | 
 3 | # provide environment variable as MAXMIND_LICENSE_KEY
 4 | # when you run docker image see readme
 5 | 
 6 | # git is required to fetch given requirement in the requirements.txt file
 7 | # for unmerged whois library
 8 | 
 9 | RUN apt-get update && apt-get install -y python3 python3-pip git
10 | 
11 | COPY requirements.txt /richkit/requirements.txt
12 | 
13 | COPY richkit /richkit/richkit
14 | 
15 | 
16 | RUN pip3 install -r /richkit/requirements.txt
17 | 
18 | RUN pip3 install coverage pytest sphinx flake8
19 | 
20 | COPY .github/local-test/run-test.sh /richkit/richkit/run-test.sh
21 | 
22 | CMD ["/richkit/richkit/run-test.sh"]
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Aalborg University
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: all prep-dev venv clean lint test docker-test
 2 | 
 3 | # virtual environment for development
 4 | VENV_NAME?=venv
 5 | VENV_ACTIVATE=. $(VENV_NAME)/bin/activate
 6 | PYTHON=${VENV_NAME}/bin/python3
 7 | # help messages for make, it runs in `make` or `make all`
 8 | all:
 9 | 	@echo "\033[92m make prep-dev \033[0m"
10 | 	@echo "---> Prepares dev environment, use only once"
11 | 	@echo "\033[92m make test \033[0m"
12 | 	@echo "---> Runs test cases in virtual environment"
13 | 	@echo "\033[92m make lint \033[0m"
14 | 	@echo "---> Linting project with autopep8"
15 | 	@echo "\033[92m make clean \033[0m"
16 | 	@echo "---> Cleans project cache and other stuffs"
17 | 	@echo "\033[92m make docker-test \033[0m"
18 | 	@echo "---> Runs test cases in docker environment"
19 | 
20 | 
21 | prep-dev:
22 | 	python3 -m pip install virtualenv  ## virtual environment for development purposes
23 | 	make venv
24 | 
25 | venv: $(VENV_NAME)/bin/activate
26 | $(VENV_NAME)/bin/activate: requirements.txt
27 | 		test -d $(VENV_NAME) || virtualenv -p python3 $(VENV_NAME)
28 | 		${PYTHON} -m pip install -U pip setuptools
29 | 		${PYTHON} -m pip install -U autopep8  coverage  isort
30 | 		${PYTHON} -m pip install -U -r requirements.txt
31 | 		touch $(VENV_NAME)/bin/activate
32 | 
33 | clean:
34 | 	rm -rf $(VENV_NAME) *.eggs *.egg-info dist build docs/_build .cache .coverage
35 | 	rm -rf .pytest*  # cache file for Intellij PyCharm
36 | 
37 | sort: venv
38 | 	isort -rc . --skip_glob docs/*
39 | 
40 | 
41 | lint: venv
42 | 	autopep8 --in-place --recursive --max-line-length=100 --exclude docs/source/conf.py,venv,__pycache__,old,build,dist .
43 | 
44 | test: venv
45 | 	coverage run --source=richkit -m pytest -Werror --ignore src/python-whois
46 | 
47 | docker-test: clean
48 | 	docker build -t richkit-docker-test -f Dockerfile.test .
49 | 	docker run -e MAXMIND_LICENSE_KEY=$MAXMIND_LICENSE_KEY richkit-docker-test


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 | <img src=".github/logo/blue/logo_no_desc/256x256.png"  />
  3 | <h1>Richkit </h1>
  4 | </div>
  5 | <p align="center"> 
  6 | <div align="center">
  7 |    <!-- todo github actions build status  -->
  8 |   <a href="https://img.shields.io/pypi/pyversions/richkit">
  9 |     <img src="https://img.shields.io/pypi/pyversions/richkit" alt="GitHub release">
 10 |   </a>
 11 |    <a href="https://github.com/aau-network-security/richkit/blob/master/LICENSE">
 12 |     <img src="https://img.shields.io/pypi/l/richkit" alt="licence">
 13 |   </a>
 14 |   <div align ="center">
 15 |   <a href="https://github.com/aau-network-security/richkit/issues">
 16 |   <img src=https://img.shields.io/github/issues/aau-network-security/richkit?style=flat-square alt="issues">
 17 |   
 18 |   </a>
 19 |   <a href="https://github.com/aau-network-security/richkit/network/members">
 20 |   <img src=https://img.shields.io/github/forks/aau-network-security/richkit >
 21 |   </a>
 22 |   <a href="https://github.com/aau-network-security/richkit/stargazers">
 23 |   <img src=https://img.shields.io/github/stars/aau-network-security/richkit></a>
 24 |   </div>
 25 | 
 26 |  </div>
 27 | 
 28 | Richkit is a python3 package that provides tools that take a domain name as input and return additional information on that domain. It can be an analysis of the domain itself, looked up from databases, retrieved from other services, or some combination thereof.
 29 | 
 30 | The purpose of richkit is to provide a reusable library of domain name-related analysis, lookup, and retrieval functions that are shared within the Network Security research group at Aalborg University, and also available to the public for reuse and modification.
 31 | 
 32 | Documentation can be found at https://richkit.readthedocs.io/en/latest/.
 33 | 
 34 | 
 35 | ## Requirements
 36 | 
 37 |  - `Python >= 3.5` 
 38 | 
 39 | ## Installation
 40 | 
 41 | In order to install richkit, just type `pip install richkit` in the terminal.
 42 | 
 43 | ## Usage
 44 | 
 45 | The following code snippets retrieve the TLD and the URL category, respectively.
 46 | 
 47 | - Retrieving the effective top-level domain of a given URL:
 48 | 
 49 |     ```python3
 50 |     >>> from richkit.analyse import tld
 51 |     >>> urls = ["www.aau.dk","www.github.com","www.google.com"]
 52 |     >>>
 53 |     >>> for url in urls:
 54 |     ...     print(tld(url))
 55 |     dk
 56 |     com
 57 |     com
 58 |     
 59 |     ```
 60 | 
 61 | - Retrieving the category of a given URL:
 62 | 
 63 |     ```python3
 64 |     >>> from richkit.retrieve.symantec import fetch_from_internet
 65 |     >>> from richkit.retrieve.symantec import LocalCategoryDB
 66 |     >>>
 67 |     >>> urls = ["www.aau.dk","www.github.com","www.google.com"]
 68 |     >>>
 69 |     >>> local_db = LocalCategoryDB()
 70 |     >>> for url in urls:
 71 |     ...     url_category=local_db.get_category(url)
 72 |     ...     if url_category=='':
 73 |     ...         url_category=fetch_from_internet(url)
 74 |     ...     print(url_category)
 75 |     Education
 76 |     Technology/Internet
 77 |     Search Engines/Portals
 78 |     
 79 |     ```
 80 | 
 81 | ## Modules
 82 | 
 83 | Richkit defines a set of functions categorized by the following modules:
 84 | 
 85 | - `richkit.analyse`: This module provides functions that can be applied to a domain  name. Similarly to `richkit.lookup`, and in contrast to `richkit.retrieve`, this is done without disclosing the domain name to third parties and breaching confidentiality.
 86 | 
 87 | - `richkit.lookup`: This module provides the ability to look up domain names in local resources, i.e. the domain name cannot be sent off to third parties. The module might fetch resources, such as lists or databases, but this must be done in a way that keeps the domain name confidential. Contrast this with `richkit.retrieve`.
 88 | 
 89 | - `richkit.retrieve`: This module provides the ability to retrieve data on domain names of any sort. It comes without the "confidentiality contract" of `richkit.lookup`.
 90 | 
 91 | ## Run Tests on Docker 
 92 | 
 93 | To avoid environment-related problems, we provide a `Dockerfile.test` file, which builds a Docker image for running the Richkit tests.
 94 | 
 95 |  - The only thing to add is `MAXMIND_LICENSE_KEY` in `.github/local-test/run-test.sh` at line 3. It is required to pass the test cases for the `lookup` module.
 96 | 
 97 | Commands to run the tests in a Docker environment:
 98 | 
 99 | - `docker build -t richkit-test -f Dockerfile.test .`: Builds the image required to run the test cases.
100 | 
101 | - `docker run -e MAXMIND_LICENSE_KEY="<licence-key>" richkit-test`: Runs the `run-test.sh` file in the Docker image.
102 | 
103 | 
104 | ## Contributing
105 | 
106 | Contributions are most welcome.
107 | 
108 | We use the [gitflow](https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow)
109 | branching strategy, so if you plan to push a branch to this repository
110 | please follow that. Note that we test branch names with
111 | `.githooks/check-branch-name.py`. The git pre-commit hook can be used
112 | to automatically check this on commit. A sample hook that can be used
113 | as-is is available for Linux, and can be enabled like
114 | this (assuming `python>=3.6` and `bash`):
115 | 
116 |     ln -s $(pwd)/.githooks/pre-commit.linux.sample $(pwd)/.git/hooks/pre-commit
117 | 
118 | ## Credits 
119 | 
120 | -  Logo designed by [independenthand](https://www.behance.net/independenthand)
121 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/_static/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/docs/_static/.gitkeep


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('..'))
16 | 
17 | 
18 | # -- Project information -----------------------------------------------------
19 | 
20 | project = 'richkit'
21 | copyright = '2019, <<<AUTHORS>>>'
22 | author = '<<<AUTHORS>>>'
23 | master_doc = 'index'
24 | 
25 | 
26 | # -- General configuration ---------------------------------------------------
27 | 
28 | # Add any Sphinx extension module names here, as strings. They can be
29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
30 | # ones.
31 | extensions = [
32 |     'sphinx.ext.autodoc',
33 |     'sphinx.ext.coverage',
34 |     'sphinx.ext.doctest',
35 | ]
36 | 
37 | # Add any paths that contain templates here, relative to this directory.
38 | templates_path = ['_templates']
39 | 
40 | # List of patterns, relative to source directory, that match files and
41 | # directories to ignore when looking for source files.
42 | # This pattern also affects html_static_path and html_extra_path.
43 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
44 | 
45 | 
46 | # -- Options for HTML output -------------------------------------------------
47 | 
48 | # The theme to use for HTML and HTML Help pages.  See the documentation for
49 | # a list of builtin themes.
50 | #
51 | html_theme = 'alabaster'
52 | 
53 | # Add any paths that contain custom static files (such as style sheets) here,
54 | # relative to this directory. They are copied after the builtin static files,
55 | # so a file named "default.css" will overwrite the builtin "default.css".
56 | html_static_path = ['_static']
57 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | Welcome to richkit's documentation!
 2 | ==================================
 3 | 
 4 | .. automodule:: richkit
 5 | 
 6 | .. toctree::
 7 |    :caption: Contents:
 8 | 
 9 | Modules
10 | =======
11 | 
12 | The functionality is organised in the following modules.
13 | 
14 | Analysis
15 | --------
16 | .. automodule:: richkit.analyse
17 |                 :members:
18 | 
19 | Lookup
20 | ------
21 | .. automodule:: richkit.lookup
22 |                 :members:
23 | 
24 | .. automodule:: richkit.lookup.geo
25 |                 :members:
26 | 
27 | Retrieve
28 | --------
29 | .. automodule:: richkit.retrieve
30 |                 :members:
31 | 
32 | .. automodule:: richkit.retrieve.dns
33 |                 :members:
34 | 
35 | .. automodule:: richkit.retrieve.symantec
36 |                 :members: fetch_from_internet, LocalCategoryDB
37 | 
38 | Indices and tables
39 | ==================
40 | 
41 | * :ref:`genindex`
42 | * :ref:`modindex`
43 | * :ref:`search`
44 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | dnspython
 2 | maxminddb
 3 | numpy==1.17.2
 4 | scikit-learn==0.21.3
 5 | langid==1.1.6
 6 | bs4==0.0.1
 7 | lxml==4.4.1
 8 | requests==2.22.0
 9 | # TODO: once this branch is merged upstream and released, replace the
10 | # editable install below with plain `python-whois`.
11 | # Uses git+https because GitHub no longer serves unauthenticated git://.
12 | -e git+https://github.com/aau-network-security/pywhois.git@release-for-richkit#egg=python-whois
13 | 


--------------------------------------------------------------------------------
/richkit/__init__.py:
--------------------------------------------------------------------------------
 1 | """richkit is the Domain Enrichment Kit
 2 | 
 3 | See the `README
 4 | <https://github.com/aau-network-security/richkit/blob/master/README.md>`_
 5 | for a general introduction.
 6 | 
 7 | """
 8 | 
 9 | __all__ = [
10 |     'analyse',
11 |     'lookup',
12 |     'retrieve',
13 | ]
14 | 


--------------------------------------------------------------------------------
/richkit/analyse/__init__.py:
--------------------------------------------------------------------------------
  1 | """Analysis and computations on domain names.
  2 | 
  3 | This module provides functions that can be applied to a domain
  4 | name. Similarly to `richkit.lookup`, and in contrast to `richkit.retrieve`,
  5 | this is done without disclosing the domain name to third parties and
  6 | breaching confidentiality.
  7 | 
  8 | .. note:: For this entire module, we adopt the notion of effective
  9 |           Top-Level Domains (eTLD), effective Second-Level Domain
 10 |           (e2LD), etc. "Effective" refers to the practice where the
 11 |           public suffix is considered the effective TLD, and counted
 12 |           as one label. The `list of public suffixes
 13 |           <https://publicsuffix.org/list/>`_, maintained by Mozilla,
 14 |           is used as the definitive truth on what public suffixes
 15 |           exists.
 16 | 
 17 | """
 18 | 
 19 | from richkit.analyse import analyse
 20 | 
 21 | # aka tld
 22 | 
 23 | 
 24 | def tld(domain):
 25 |     """
 26 |     Returns the Effective Top-Level Domain (eTLD) (aka Public Suffix).
 27 | 
 28 |     The eTLD is extracted from the domain by `analyse.get_tld`.
 29 | 
 30 |     :param domain: Domain (string)
 31 |     :return: the eTLD, or None if `analyse.get_tld` could not determine it
 32 |     """
 33 |     return analyse.get_tld(domain)
 34 | 
 35 | 
 36 | def sld(domain):
 37 |     """
 38 |     Returns the Effective Second-Level Domain (2LD) (aka Apex Domain).
 39 | 
 40 |     The 2LD, aka the Apex Domain, is extracted from the domain, using
 41 |     the `list of public suffixes <https://publicsuffix.org/list/>`_
 42 |     maintained by Mozilla
 43 | 
 44 |     :param domain: Domain (string)
 45 | 
 46 |     """
 47 |     return analyse.get_sld(domain)
 48 | 
 49 | 
 50 | def sl_label(domain):
 51 |     """
 52 |     Returns the Effective 2-level label.
 53 | 
 54 |     :param domain: Domain (string)
 55 | 
 56 |     """
 57 |     return analyse.get_2l_label(domain)
 58 | 
 59 | 
 60 | def nld(domain, n):
 61 |     """
 62 |     Returns the Effective N'th-Level Domain (eNLD).
 63 | 
 64 |     :param domain: Domain (string)
 65 |     :param n: N'th-Level (int)
 66 | 
 67 |     Usage: 
 68 | 
 69 |       from richkit.analyse import nld
 70 | 
 71 |       ## returns second level domain ... 
 72 |       print(nld("www.google.com", 2))
 73 | 
 74 |       ## returns top level domain
 75 |       print(nld("www.google.com",1))
 76 | 
 77 |     """
 78 |     return analyse.get_nld(domain, n)
 79 | 
 80 | 
 81 | def n_label(domain, n):
 82 |     """
 83 |     Returns the Effective N'th-level label.
 84 | 
 85 |     :param domain: Domain (string)
 86 |     :param n: N'th-Level (int)
 87 | 
 88 |     """
 89 |     return analyse.get_n_label(domain, n)
 90 | 
 91 | 
 92 | def depth(domain):
 93 |     """
 94 |     Returns the effective depth of the domain.
 95 | 
 96 |     The depth is the number of labels in the domain.
 97 | 
 98 |     :Example: `google.co.uk` is effectively a 2LD: `google` is one label
 99 |               and the public suffix `co.uk` counts as one label, so the
100 |               effective depth is two.
101 | 
102 |     NOTE(review): the code returns the count of all dot-separated labels
103 |     as a string ("3" for `google.co.uk`) -- confirm which is intended.
104 |     :param domain: Domain (string)
105 |     """
106 |     domain_name_features = analyse.get_domain_name_features(domain)
107 |     return domain_name_features.get("num_tokens", "")
108 | 
109 | 
110 | def length(domain):
111 |     """
112 |     Returns the sum of count of characters for all labels.
113 | 
114 |     :param domain: Domain (string)
115 | 
116 |     """
117 |     domain_name_features = analyse.get_domain_name_features(domain)
118 |     return domain_name_features.get("len_domain", "")
119 | 
120 | 
121 | def language(domain):
122 |     """
123 |     Returns the best guess for the language of the domain.
124 | 
125 |     :param domain: Domain (string)
126 |     :return: language as a string ("" on IndexError/ValueError)
127 |     """
128 |     return analyse.get_language(domain)
129 | 
130 | 
131 | def entropy(s):
132 |     """
133 |     Returns the entropy of characters in s.
134 | 
135 |     :param s: Domain (string)
136 | 
137 |     """
138 |     return analyse.get_entropy_2ld(s)
139 | 
140 | 
141 | def ratio_vowels(s):
142 |     """
143 |     Returns the ratio vowels to all characters in s.
144 | 
145 |     :param s: Domain (string)
146 | 
147 |     """
148 |     return analyse.get_ratio_vowels_2ld(s)
149 | 
150 | 
151 | def number_vowels(s):
152 |     """
153 |     Returns the number of vowels (a, e, i, o, u, y) in the 2LD of s.
154 | 
155 |     :param s: Domain (string)
156 |     :return: count as a string
157 |     """
158 |     return analyse.get_num_of_vowels_2ld(s)
159 | 
160 | 
161 | def ratio_consonants(s):
162 |     """
163 |     Returns the ratio consonants to all characters in s.
164 | 
165 |     :param s: Domain (string)
166 | 
167 |     """
168 |     return analyse.get_ratio_consonants_2ld(s)
169 | 
170 | 
171 | def number_consonants(s):
172 |     """
173 |     Returns the number of consonants in the 2LD of s.
174 | 
175 |     :param s: Domain (string)
176 |     :return: count as a string
177 |     """
178 |     return analyse.get_num_of_consonants_2ld(s)
179 | 
180 | 
181 | def ratio_numerics(s):
182 |     """
183 |     Returns the ratio numeric characters to all characters in s.
184 | 
185 |     :param s: Domain (string)
186 | 
187 |     """
188 |     return analyse.get_radio_numeric_2ld(s)
189 | 
190 | 
191 | def number_numerics(s):
192 |     """
193 |     Returns the number of numeric characters in s.
194 | 
195 |     :param s: Domain (string)
196 |     :return: count as a string
197 |     """
198 |     return analyse.get_num_numeric_2ld(s)
199 | 
200 | 
201 | def ratio_specials(s):
202 |     """
203 |     Returns the ratio special characters to all characters in s.
204 |     The default special character list is "~`!@#$%^&*()_={}[]:>;',</?*-+"
205 | 
206 |     :param s: Domain (string)
207 | 
208 |     """
209 |     return analyse.get_ratio_special_2ld(s)
210 | 
211 | 
212 | def number_specials(s):
213 |     """
214 |     Returns the number of special characters in the 2LD of s.
215 |     The default special character list is "~`!@#$%^&*()_={}[]:>;',</?*-+".
216 | 
217 |     :param s: Domain (string)
218 |     :return: count as a string
219 |     """
220 |     return analyse.get_num_of_special_2ld(s)
221 | 
222 | 
223 | def number_words(s):
224 |     """
225 |     Returns the number of English word found in s.
226 | 
227 |     :param s: Domain (string)
228 | 
229 |     """
230 |     return analyse.get_num_words_2ld(s)
231 | 
232 | 
233 | def n_grams_alexa(domain, is_test):
234 |     """
235 |     Returns similarity to distribution of N-grams in Alexa Top 1M.
236 | 
237 |     :param domain: Domain (string)
238 |     :param is_test: Checks whether function called from a test case  or not
239 | 
240 |     """
241 |     return analyse.get_grams_alexa_2ld(domain, is_test=is_test)
242 | 
243 | 
244 | def n_grams_dict(domain, is_test):
245 |     """
246 |     Returns similarity to distribution of N-grams in English dictionary.
247 | 
248 |     :param domain: Domain (string)
249 |     :param is_test: forwarded to `analyse.get_grams_dict_2ld` (test-mode flag)
250 |     """
251 |     return analyse.get_grams_dict_2ld(domain, is_test)
252 | 


--------------------------------------------------------------------------------
/richkit/analyse/analyse.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | from collections import Counter
  3 | import langid
  4 | import numpy as np
  5 | from richkit.analyse.segment import segment
  6 | from sklearn.feature_extraction.text import CountVectorizer
  7 | from richkit.analyse.util import WordMatcher
  8 | from richkit.analyse.util import load_alexa
  9 | from richkit.analyse.util import load_words
 10 | from richkit.analyse.util import TldMatcher
 11 | 
 12 | 
 13 | def entropy(s):
 14 |     p, lns = Counter(s), float(len(s))
 15 |     return -sum(count/lns * math.log(count/lns, 2) for count in list(p.values()))
 16 | 
 17 | 
 18 | def get_tld(domain):
 19 |     """
 20 |     Get the Effective Top-Level Domain (eTLD) (not the label)
 21 | 
 22 |     :param domain: Domain (string)
 23 |     :return: Effective Top-Level Domain (eTLD)
 24 |     """
 25 |     tldmatch = TldMatcher()
 26 |     try:
 27 |         tld = tldmatch.get_tld(domain.lower())
 28 |     except:
 29 |         tld = None
 30 |     return tld
 31 | 
 32 | 
 33 | def get_sld(domain):
 34 |     """
 35 |     Get the Effective Second-Level Domain (not the label)
 36 | 
 37 |     :param domain: Domain (string)
 38 |     :return: Effective Second-Level Domain
 39 |     """
 40 |     tld = get_tld(domain.lower())
 41 |     if tld is not None:
 42 |         tldmatch = TldMatcher()
 43 |         try:
 44 |             sld = tldmatch.get_2ld(domain.lower())
 45 |             return '.'.join([sld, tld])
 46 |         except:
 47 |             return None  # return None if the SLD does not exist
 48 |     else:
 49 |         return None  # return None if the TLD does not exist
 50 | 
 51 | 
 52 | def get_2l_label(domain):
 53 |     """
 54 |     Get the Effective 2-level label.
 55 | 
 56 |     :param domain: Domain (string)
 57 |     :return: Effective Second-Level label
 58 |     """
 59 |     tldmatch = TldMatcher()
 60 |     try:
 61 |         sld = tldmatch.get_2ld(domain)
 62 |     except:
 63 |         sld = domain.split(".")[-2]
 64 |     return sld
 65 | 
 66 | 
 67 | def get_nld(domain, n):
 68 |     """
 69 |     Get the Effective N'th-level Domain.
 70 | 
 71 |     :param domain: Domain (string)
 72 |     :param n: Label number (int)
 73 |     :return: Effective N'th-level Domain
 74 |     """
 75 | 
 76 |     if abs(n) == 1:
 77 |         nld = get_tld(domain)
 78 |     elif len(domain.split('.')) <= n:
 79 |         nld = None
 80 |     else:
 81 |         nld = ""
 82 |         try:
 83 |             for i in range(1, abs(n)):
 84 |                 nld = '.'.join([get_n_label(domain, i+1), nld])
 85 |             nld = nld+get_tld(domain)
 86 |         except IndexError:
 87 |             nld = None
 88 |     return nld
 89 | 
 90 | 
 91 | def get_n_label(domain, n):
 92 |     """
 93 |     Get the Effective N'th-level label.
 94 | 
 95 |     :param domain: Domain (string)
 96 |     :param n: Label number (int)
 97 |     :return: Effective N'th-level label
 98 |     """
 99 | 
100 |     if abs(n) == 1:
101 |         n_label = get_tld(domain)
102 |     elif abs(n) == 2:
103 |         n_label = get_2l_label(domain)
104 |     else:
105 |         tldmatch = TldMatcher()
106 |         try:
107 |             n_label = tldmatch.get_nld(domain, abs(n) - 1)
108 |         except IndexError:
109 |             n_label = None
110 |         except Exception:
111 |             n_label = domain.split(".")[-abs(n) - 1]
112 |     return n_label
113 | 
114 | 
115 | def get_domain_name_features(domain):
116 |     """
117 |     Returns domain name features within dictionary
118 |     includes num_tokens, len2ld, len_domain
119 | 
120 |     :param: domain
121 |     :return: dict
122 | 
123 |     """
124 |     domain_array = domain.split('.')
125 |     num_tokens = len(domain_array)
126 |     len2ld = len(get_sld(domain))
127 |     len_domain = sum([len(el) for el in domain_array])
128 |     domain_name_features = {
129 |         "num_tokens": str(num_tokens),
130 |         "len2ld": str(len2ld),
131 |         "len_domain": str(len_domain)
132 |     }
133 |     return domain_name_features
134 | 
135 | 
136 | def get_language(domain):
137 |     """
138 |     :param: domain
139 |     """
140 |     try:
141 |         language = langid.classify(" ".join(segment(get_tld(domain))))[0]
142 |     except IndexError:
143 |         language = ""
144 |     except ValueError:
145 |         language = ""
146 |     return str(language)
147 | 
148 | 
149 | def get_entropy_2ld(domain):
150 |     """
151 |     :param domain:
152 |     :return: entropy of second level domain
153 |     """
154 |     return str(entropy(get_sld(domain)))
155 | 
156 | 
157 | def get_grams_alexa_2ld(domain, analyzer='char', ngram_range=(3, 5), is_test=False):
158 |     """
159 |     :param : domain, analyzer, ngram_range
160 |     :return: grams of second level domain
161 |     """
162 | 
163 |     alexa_slds = load_alexa(is_test=is_test)
164 |     alexa_vc = CountVectorizer(analyzer=analyzer,
165 |                                ngram_range=ngram_range,
166 |                                min_df=1e-4,
167 |                                max_df=1.0)
168 |     counts_matrix = alexa_vc.fit_transform(alexa_slds)
169 |     alexa_counts = np.log10(counts_matrix.sum(axis=0).getA1())
170 |     grams_alexa2ld = ngram_count(get_sld(domain), alexa_counts, alexa_vc)
171 | 
172 |     return float(grams_alexa2ld)
173 | 
174 | 
175 | def get_grams_dict_2ld(domain, is_test=False):
176 |     """
177 | 
178 |     :param domain:
179 |     :return: grams_dict_2ld
180 |     """
181 |     words = load_words(is_test=is_test)
182 |     dict_vc = CountVectorizer(analyzer='char',
183 |                               ngram_range=(3, 5),
184 |                               min_df=1e-5,
185 |                               max_df=1.0)
186 |     counts_matrix = dict_vc.fit_transform(words)
187 |     dict_counts = np.log10(counts_matrix.sum(axis=0).getA1())
188 |     grams_dict2ld = ngram_count(get_sld(domain), dict_counts, dict_vc)
189 | 
190 |     return float(grams_dict2ld)
191 | 
192 | 
193 | def get_num_words_2ld(domain):
194 |     """
195 |     NOTE(review): counts words in get_tld(domain), not in the 2LD -- confirm.
196 |     :param domain:
197 |     :return: num of words from WordMatcher Object, as a string
198 |     """
199 |     word_matcher = WordMatcher()
200 |     return str(word_matcher.get_num_of_words(get_tld(domain)))
201 | 
202 | 
203 | def get_num_of_vowels_2ld(domain):
204 |     """
205 | 
206 |     :param domain:
207 |     :return: number of counts:  vowels in 2ld
208 |     """
209 | 
210 |     sld = get_sld(domain)
211 |     vowels = list("aeiouy")
212 |     return str(sum([sld.count(c) for c in vowels]))
213 | 
214 | 
215 | def get_ratio_vowels_2ld(domain):
216 |     """
217 | 
218 |     :param domain:
219 |     :return: ratio of vowels in scope of 2ld
220 |     """
221 |     return str(float(get_num_of_vowels_2ld(domain)) / float(len(get_sld(domain))))
222 | 
223 | 
224 | def get_num_of_consonants_2ld(domain):
225 |     """
226 | 
227 |     :param domain:
228 |     :return: number of consonants in scope of 2ld
229 |     """
230 |     consonants = list("bcdfghjklmnpqrstvwxz")
231 | 
232 |     return str(sum([get_sld(domain).count(c) for c in consonants]))
233 | 
234 | 
235 | def get_ratio_consonants_2ld(domain):
236 |     """
237 |     :param domain:
238 |     :return:  ratio of consonants
239 |     """
240 |     return str(float(get_num_of_consonants_2ld(domain)) / float(len(get_sld(domain))))
241 | 
242 | 
243 | def get_num_of_special_2ld(domain, special=list("~`!@#$%^&*()_={}[]:>;',</?*-+")):
244 |     """
245 | 
246 |     :param domain:
247 |     :param special: special character list, default is "~`!@#$%^&*()_={}[]:>;',</?*-+"
248 |     :return: total special character in 2ld.
249 |     """
250 |     return str(sum([get_sld(domain).count(c) for c in special]))
251 | 
252 | 
253 | def get_ratio_special_2ld(domain):
254 |     """
255 | 
256 |     :param domain:
257 |     :return: ratio of special characters in 2ld
258 |     """
259 |     return str(float(get_num_of_special_2ld(domain)) / float(len(get_sld(domain))))
260 | 
261 | 
262 | def ngram_count(domain, counts, counts_vc):
263 |     """
264 |     :param domain:
265 |     :param counts:
266 |     :param counts_vc: count vectorizer from sklearn
267 |     :return: calculates ngram_count from given count vectorizer and counts
268 |     """
269 |     match = counts * counts_vc.transform([domain]).T
270 |     return str(match[0])
271 | 
272 | 
273 | def get_num_numeric_2ld(domain):
274 |     """
275 |     Counts the digits in the whole string passed in (get_sld is not applied).
276 |     :param domain:
277 |     :return: number of digit characters, as a string
278 |     """
279 |     return str(len([c for c in domain if c.isdigit()]))
280 | 
281 | 
282 | def get_radio_numeric_2ld(domain):
283 |     """
284 |     NOTE(review): digits are counted over the whole domain but divided by the 2LD length.
285 |     :param domain:
286 |     :return: ratio of numeric characters to the length of the 2ld, as a string
287 |     """
288 |     return str(float(get_num_numeric_2ld(domain)) / float(len(get_sld(domain))))
289 | 


--------------------------------------------------------------------------------
/richkit/analyse/segment.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import requests
 3 | import os
 4 | from richkit.analyse.util import temp_directory
 5 | import logging
 6 | 
 7 | logger = logging.getLogger(__name__)
 8 | 
 9 | 
10 | class OneGramDist(dict):
11 |     URL = "https://gist.githubusercontent.com/mrturkmencom/d9d5f8bc35be8efd81c447f70ca99fbf/raw/cfa317d7bce53ba55ca8f9bf27aa3170038f99cf/one-grams.txt"
12 |     FILEPATH = temp_directory + "/one-grams.txt"
13 | 
14 |     @classmethod
15 |     def fetch_one_grams(cls, url=None):
16 |         """
17 | 
18 |         :param url: Fetching one groms file from given URL
19 |         """
20 |         url = url or cls.URL
21 |         logger.info('Fetching one gram file from gist ...')
22 |         response = requests.get(url, stream=True)
23 |         if response.status_code == 200:
24 |             with open(cls.FILEPATH, 'wb') as file:
25 |                 file.write(response.content)
26 |         else:
27 |             logger.error('Error while downloading the One Gram file ...')
28 | 
29 |     def __init__(self, filename):
30 |         self.gramCount = 0
31 | 
32 |         for line in open(filename):
33 |             (word, count) = line[:-1].split('\t')
34 |             self[word] = int(count)
35 |             self.gramCount += self[word]
36 | 
37 |     def __call__(self, key):
38 |         if key in self:
39 |             return float(self[key]) / self.gramCount
40 |         else:
41 |             return 1.0 / (self.gramCount * 10 ** (len(key) - 2))
42 | 
43 | 
44 | if os.path.exists(temp_directory + "/one-grams.txt"):
45 |     singleWordProb = OneGramDist(temp_directory + "/one-grams.txt")
46 | else:
47 |     OneGramDist.fetch_one_grams()
48 | 
49 | singleWordProb = OneGramDist(temp_directory + "/one-grams.txt")
50 | 
51 | 
52 | def word_seq_fitness(words):
53 |     return sum(math.log10(singleWordProb(w)) for w in words)
54 | 
55 | 
56 | def memoize(f):
57 |     """
58 |     Cache the results of f, keyed by its positional arguments.
59 |     :param f: function to wrap; its arguments are used as dict keys
60 |     :return: wrapper whose ``cache`` attribute exposes the result dict
61 |     """
62 |     cache = {}
63 | 
64 |     def memoizedFunction(*args):
65 |         if args not in cache:
66 |             cache[args] = f(*args)
67 |         return cache[args]
68 | 
69 |     memoizedFunction.cache = cache
70 |     return memoizedFunction
71 | 
72 | 
73 | @memoize
74 | def segment(word):
75 |     """
76 |     Split word into the sequence of pieces with the best word_seq_fitness.
77 |     :param word: string to segment; it is lower-cased before splitting
78 |     :return: list of pieces, or [] when word is empty
79 |     """
80 |     if not word:
81 |         return []
82 |     word = word.lower()  # change to lower case
83 |     allSegmentations = [[first] + segment(rest) for (first, rest) in splitPairs(word)]
84 |     return max(allSegmentations, key=word_seq_fitness)
85 | 
86 | 
87 | def splitPairs(word, maxLen=20):
88 |     return [(word[:i + 1], word[i + 1:]) for i in range(max(len(word), maxLen))]
89 | 
90 | 
91 | @memoize
92 | def segment_with_prob(word):
93 |     segmented = segment(word)
94 |     return word_seq_fitness(segmented), segmented
95 | 


--------------------------------------------------------------------------------
/richkit/analyse/util.py:
--------------------------------------------------------------------------------
  1 | from os import path
  2 | from pathlib import Path
  3 | import requests
  4 | import tempfile
  5 | import logging
  6 | 
  7 | data_folder = Path("top-1m.csv").absolute()
  8 | 
  9 | logger = logging.getLogger(__name__)
 10 | temp_directory = tempfile.mkdtemp()
 11 | top_1m_alexa = "https://github.com/mozilla/cipherscan/blob/master/top1m/top-1m.csv?raw=true"
 12 | top_100_alexa = "https://gist.githubusercontent.com/mrturkmencom/98e33d97e6b8d07efabc1fda91946a21/raw/847e1c680d816bbac06ee5034e20b56d2ddfd78d/top-100.csv"
 13 | 
 14 | 
 15 | class WordMatcher(object):
 16 |     # use class vars for lazy loading
 17 |     MASTERURL = "http://www.greenteapress.com/thinkpython/code/words.txt"
 18 |     MASTERFILE = temp_directory + "/words.txt"
 19 |     WORDS = None
 20 |     count = 0
 21 | 
 22 |     @classmethod
 23 |     def fetch_words(cls, url=None):
 24 |         url = url or cls.MASTERURL
 25 | 
 26 |         logger.info('Fetching word list from server ...')
 27 |         response = requests.get(url, stream=True)
 28 |         if response.status_code == 200:
 29 |             with open(cls.MASTERFILE, 'wb') as file:
 30 |                 file.write(response.content)
 31 |         else:
 32 |             logger.error('Error while downloading the word list response code %s ',
 33 |                          str(response.status_code))
 34 | 
 35 |     @classmethod
 36 |     def load_words(cls):
 37 |         f = open(cls.MASTERFILE, 'r', encoding="utf8")
 38 |         lines = f.readlines()
 39 |         f.close()
 40 | 
 41 |         # strip whitespaces
 42 |         # only words with more than three letters are considered
 43 |         lines = [ln for ln in (ln.strip() for ln in lines) if len(ln) > 3]
 44 |         cls.WORDS = {}
 45 |         for item in lines:
 46 |             cls.WORDS[item] = None
 47 |         # cls.WORDS = set(lines)
 48 | 
 49 |     def __init__(self):
 50 | 
 51 |         # check if the class has been initialised
 52 |         if self.__class__.count > 0:
 53 |             return
 54 |         else:
 55 |             self.__class__.count += 1
 56 | 
 57 |         if path.exists(WordMatcher.MASTERFILE):
 58 |             WordMatcher.load_words()
 59 | 
 60 |         if WordMatcher.WORDS is None:
 61 |             WordMatcher.fetch_words()
 62 |             WordMatcher.load_words()
 63 | 
 64 |     def get_num_of_words(self, domain):
 65 |         num = 0
 66 |         for word in WordMatcher.WORDS:
 67 |             if word in domain:
 68 |                 num += 1
 69 |         return num
 70 | 
 71 | 
 72 | def load_alexa(limit=None, is_test=False):
 73 |     """
 74 |     Reads top @limit number of popular domains based on alexa.com
 75 | 
 76 |     """
 77 |     alexa_domains = set()
 78 |     alexa_top_1m = data_folder
 79 |     if not path.exists(alexa_top_1m):
 80 |         if is_test:
 81 |             alexa_top_1m = fetch_alexa_data(url=top_100_alexa)
 82 |         else:
 83 |             alexa_top_1m = fetch_alexa_data()
 84 |     with open(alexa_top_1m) as f:
 85 |         for line in f:
 86 |             line = line.strip()
 87 |             sline = line.split(',')
 88 | 
 89 |             if limit and int(sline[0]) > limit:
 90 |                 break
 91 | 
 92 |             """
 93 |             sometimes the Alexa list contains full URLs, e.g.
 94 |             example.com/path; need to get rid of that for later matching
 95 |             """
 96 |             domain = (sline[1].split('/'))[0]
 97 | 
 98 |             """
 99 |             we want only the 2LD+TLD, else we do not know later against what we
100 |             need to match
101 |             """
102 |             sld_domain = get_2ld(domain)
103 |             alexa_domains.add(sld_domain)
104 |             alexa_domains.add(domain)
105 |     alexa_slds = set([get_2ld(el) for el in alexa_domains])
106 | 
107 |     return alexa_slds
108 | 
109 | 
def load_words(path_to_data=data_folder, is_test=False):
    """
    Read the lines of a local data file as a set of words.

    When ``path_to_data`` does not exist the Alexa list is downloaded and
    read instead.

    :param path_to_data: file to read
    :param is_test: download from ``top_100_alexa`` instead of the default
        location when the file has to be fetched
    :return: set of the stripped lines that are longer than three characters
    """
    if not path.exists(path_to_data):
        path_to_data = fetch_alexa_data(url=top_100_alexa) if is_test else fetch_alexa_data()

    words = set()
    for raw_line in read_local(path_to_data):
        entry = raw_line.strip()
        # only entries with more than three letters are considered
        if len(entry) > 3:
            words.add(entry)
    return words
124 | 
125 | 
def read_local(path_to_data=data_folder):
    """
    Read all lines of a local UTF-8 text file.

    :param path_to_data: file to read
    :return: list of lines including their line endings, or an empty list
        when the file does not exist
    """
    if not path.exists(path_to_data):
        return []
    # the context manager closes the file even when reading fails
    with open(path_to_data, 'r', encoding="utf8") as f:
        return f.readlines()
134 | 
135 | 
def fetch_alexa_data(path_to_data=data_folder, url=top_1m_alexa):
    """
    Download a domain list and store it locally.

    :param path_to_data: file the downloaded content is written to
    :param url: download location
    :return: ``path_to_data``, also when the download failed; a non-200
        response is logged and nothing is written
    """
    response = requests.get(url, stream=True)
    if response.status_code != 200:
        logger.error('Error while downloading the TOP 1M URL list status code : %s',
                     str(response.status_code))
        return path_to_data

    with open(path_to_data, 'wb+') as target:
        target.write(response.content)
    return path_to_data
146 | 
147 | 
class TldMatcher(object):
    """
    Match host names against the Public Suffix List.

    The list lives in class attributes, so it is loaded once and shared by
    all instances.
    """
    # use class vars for lazy loading
    MASTERURL = "https://publicsuffix.org/list/effective_tld_names.dat"
    MASTERFILE = temp_directory + "/effective_tld_names.dat"

    TLDS = None
    No_TLDS = None
    count = 0

    @classmethod
    def fetch_tlds(cls, url=None):
        """
        Download the Public Suffix List and store it as ``MASTERFILE``.

        A non-200 response is logged and leaves the file untouched.

        :param url: download location, defaults to ``MASTERURL``
        """
        url = url or cls.MASTERURL

        response = requests.get(url, stream=True)
        if response.status_code == 200:
            with open(cls.MASTERFILE, 'wb') as file:
                file.write(response.content)
        else:

            logger.error('Error while downloading the Public Suffix List status code %s ',
                         str(response.status_code))

    @classmethod
    def load_tlds(cls):
        """
        Read ``MASTERFILE`` into ``TLDS`` and ``No_TLDS``.

        ``TLDS`` receives every line that is neither blank nor a ``//``
        comment; ``No_TLDS`` receives the ``!`` exception rules with the
        ``!`` removed.

        :raises FileNotFoundError: when ``MASTERFILE`` does not exist
        """
        try:
            with open(cls.MASTERFILE, 'r', encoding="utf8") as f:
                lines = [ln.strip() for ln in f]
        except FileNotFoundError as e:
            # There is nothing to parse, so report the real cause instead of
            # failing later on names that were never bound.
            logger.exception('File not readable, not found %s', e)
            raise

        # strip comments and blank lines
        stripped_lines = [ln for ln in lines if len(ln) and ln[:2] != '//']

        excluded_lines = [ln.strip('!') for ln in lines if len(ln) and ln[:1] == '!']

        cls.TLDS = set(stripped_lines)
        cls.No_TLDS = set(excluded_lines)

    def __init__(self):
        """Load the shared suffix list on first instantiation."""
        # check if the class has been initialised
        if self.__class__.count > 0:
            return

        if path.exists(TldMatcher.MASTERFILE):
            TldMatcher.load_tlds()

        if TldMatcher.TLDS is None:
            TldMatcher.fetch_tlds()
            TldMatcher.load_tlds()

        # Only mark the class as initialised once loading succeeded, so a
        # failed first attempt is retried by the next instantiation.
        self.__class__.count += 1

    def get_tld(self, url):
        """
        Return the longest suffix of ``url`` that is on the suffix list.

        :param url: dot-separated host name
        :return: matching suffix, or ``None`` when no suffix matches
        :raises NotImplementedError: when the match is an exception rule
        """
        best_match = None
        chunks = url.split('.')

        # walk from the last label towards the first, so that the last hit
        # is the longest matching suffix
        for start in range(len(chunks) - 1, -1, -1):
            test = '.'.join(chunks[start:])
            # wildcard form of the same suffix, e.g. "*.ck" for "www.ck"
            startest = '.'.join(['*'] + chunks[start + 1:])

            if test in TldMatcher.TLDS or startest in TldMatcher.TLDS:
                best_match = test

        # return an Error since is not clear on the PS List which is the TLD of the domain marked with '!'
        if best_match in TldMatcher.No_TLDS:
            raise NotImplementedError()

        return best_match

    def get_2ld(self, url):
        """
        Return the label directly in front of the public suffix of ``url``.

        :param url: dot-separated host name
        :return: single label (string)
        :raises AttributeError: when ``get_tld`` finds no suffix
        """
        urls = url.split('.')
        tlds = self.get_tld(url).split('.')
        return urls[-1 - len(tlds)]

    def get_nld(self, url, n):
        """
        Return the ``n``-th label in front of the public suffix of ``url``.

        :param url: dot-separated host name
        :param n: distance from the suffix, 1 being the label next to it
        :return: single label (string)
        :raises AttributeError: when ``get_tld`` finds no suffix
        """
        urls = url.split('.')
        tlds = self.get_tld(url).split('.')
        return urls[-n - len(tlds)]
230 | 
231 | 
# Shared module-level matcher used by get_2ld(); constructing it here means
# the Public Suffix List is loaded (and downloaded when it is not on disk
# yet) while this module is being imported.
tldmatch = TldMatcher()
233 | 
234 | 
def get_2ld(domain):
    """
    Finds 2LD for given FQDN

    :param domain: dot-separated host name
    :return: the public suffix of ``domain`` plus the label in front of it,
        or ``domain`` unchanged when it has too few labels
    """
    labels = domain.split('.')
    suffix = tldmatch.get_tld(domain)

    # without a known suffix fall back to keeping the last two labels
    wanted = len(suffix.split('.')) + 1 if suffix else 2

    if len(labels) >= wanted:
        return '.'.join(labels[-wanted:])
    return domain
252 | 


--------------------------------------------------------------------------------
/richkit/lookup/__init__.py:
--------------------------------------------------------------------------------
 1 | """Confidentiality-aware look-ups for data on domain names.
 2 | 
 3 | This module provides the ability to look up domain names in local
 4 | resources, i.e. the domain name cannot be sent off to third
 5 | parties. The module might fetch resources, such as lists or
 6 | databases, but this must be done in a way that keeps the domain name
 7 | confidential. Contrast this with `richkit.retrieve`."""
 8 | 
 9 | from richkit.lookup import geo
10 | 
11 | 
def country(ip_address):
    """
    Return the country code of a given IP Address

    Delegates to `richkit.lookup.geo.get_country`.

    :param ip_address: IP Address (string)
    :return: result of `geo.get_country`, i.e. the ISO country code as a
        string, or None when the lookup fails
    """
    return geo.get_country(ip_address)
19 | 
20 | 
def asn(ip_address):
    """
    Return the Autonomous System Number of a given IP Address

    Delegates to `richkit.lookup.geo.get_asn`.

    :param ip_address: IP Address (string)
    :return: result of `geo.get_asn`, i.e. the number prefixed with "AS" as
        a string, or None when the lookup fails
    """
    return geo.get_asn(ip_address)
28 | 
29 | 
def registered_country(ip_address):
    """
    Return the registered country code of a given IP Address

    Delegates to `richkit.lookup.geo.get_registered_country`.

    :param ip_address: IP Address (string)
    :return: result of `geo.get_registered_country`, i.e. the ISO country
        code as a string, or None when the lookup fails
    """
    return geo.get_registered_country(ip_address)
37 | 
38 | 
def maxmindb_licence_key(license_key):
    """
    Return the license key for the MaxMind DB.

    Delegates to `richkit.lookup.geo.get_license_key`, which reads the key
    from the environment and prints a warning when it is not present.

    :param license_key: name of the environment variable holding the key
    :return: result of `geo.get_license_key`
    """

    return geo.get_license_key(license_key)
48 | 


--------------------------------------------------------------------------------
/richkit/lookup/geo.py:
--------------------------------------------------------------------------------
 1 | from richkit.lookup.util import MaxMindDB
 2 | import os
 3 | 
 4 | 
 5 | def get_license_key(license_key='MAXMIND_LICENSE_KEY'):
 6 |     """
 7 |     @param license_key: Name of environment variable
 8 |     @return: license of MaxMindDB from environent variables as string
 9 |     @return: in case of error, returns Exception, more specifically KeyError
10 |     """
11 |     try:
12 |         maxmind_db_license = os.environ[license_key]
13 |         return maxmind_db_license
14 |     except Exception:
15 |         print("\nWARNING: No MAXMIND LICENSE KEY Found in environment variables")
16 |         print("\nUsage of lookup module might be affected due to no MaxMind DB License".strip())
17 |         print("\nMore info ? Check here: https://github.com/aau-network-security/richkit/wiki/Retrieve-and-configure"
18 |               "-licence-key".strip())
19 |         print("Proceeding anyway...")
20 |         return 'NOLICENSEKEYFOUND'
21 | 
22 | 
def get_country(ip_address):
    """
    Return the country code of a given IP address

    :param ip_address: IP Address (string)
    :return: ISO country code as a string, or None when the database cannot
        be obtained or holds no country for the address

    """

    try:
        country_code_db = MaxMindDB((
            "https://download.maxmind.com/app/geoip_download?"
            "edition_id=GeoLite2-Country&"
            "license_key={license_key}&"
            "suffix=tar.gz"
        ).format(
            license_key=get_license_key(),
        ), "cc"
        )
        result = country_code_db.get_data(ip_address)
        country_code = str(result['country']['iso_code'])
    except Exception:
        # Best effort: any lookup failure yields None. A bare "except" would
        # also swallow KeyboardInterrupt and SystemExit.
        country_code = None
    return country_code
46 | 
47 | 
def get_registered_country(ip_address):
    """
    Return the registered country code of a given IP address

    :param ip_address: IP Address (string)
    :return: ISO country code as a string, or None when the database cannot
        be obtained or holds no registered country for the address

    """
    try:
        country_code_db = MaxMindDB((
            "https://download.maxmind.com/app/geoip_download?"
            "edition_id=GeoLite2-Country&"
            "license_key={license_key}&"
            "suffix=tar.gz"
        ).format(
            license_key=get_license_key(),
        ), "cc"
        )
        result = country_code_db.get_data(ip_address)
        country_code = str(result['registered_country']['iso_code'])
    except Exception:
        # Best effort: any lookup failure yields None. A bare "except" would
        # also swallow KeyboardInterrupt and SystemExit.
        country_code = None
    return country_code
70 | 
71 | 
def get_asn(ip_address):
    """
    Return the ASN of a given IP address

    :param ip_address: IP Address (string)
    :return: the autonomous system number prefixed with "AS" as a string,
        or None when the database cannot be obtained or holds no entry for
        the address

    """
    try:
        asn_db = MaxMindDB((
            "https://download.maxmind.com/app/geoip_download?"
            "edition_id=GeoLite2-ASN&"
            "license_key={license_key}&"
            "suffix=tar.gz"
        ).format(
            license_key=get_license_key(),
        ), "asn"
        )
        result = asn_db.get_data(ip_address)
        asn = 'AS' + str(result['autonomous_system_number'])
    except Exception:
        # Best effort: any lookup failure yields None. A bare "except" would
        # also swallow KeyboardInterrupt and SystemExit.
        asn = None
    return asn
94 | 


--------------------------------------------------------------------------------
/richkit/lookup/util.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | import os
  3 | import subprocess
  4 | import time
  5 | from datetime import datetime, timedelta
  6 | import logging
  7 | from pathlib import Path
  8 | import maxminddb
  9 | 
 10 | """
 11 | Lookups in the MaxMind GeoLite2 databases.
 12 | 
 13 | A license key is required as per [#GeoLite2_CCPA_GDPR]_:
 14 | 
 15 | #. Sign up for a MaxMind account (no purchase required): https://www.maxmind.com/en/geolite2/signup
 16 | #. Set your password and create a license key: https://www.maxmind.com/en/accounts/current/license-key
 17 | #. Setup your download mechanism by using our GeoIP Update program or creating a direct download script: https://dev.maxmind.com/geoip/geoipupdate/#Direct_Downloads
 18 | 
 19 | .. rubric:: Footnotes
 20 | 
 21 | .. [#GeoLite2_CCPA_GDPR] https://blog.maxmind.com/2019/12/18/significant-changes-to-accessing-and-using-geolite2-databases/
 22 | """
 23 | 
logger = logging.getLogger(__name__)
# NOTE(review): the data directory is derived from the current working
# directory (the text in front of the first "richkit" in it), not from the
# location of this file - confirm this is intended.
directory = os.getcwd().split("richkit")
maxmind_directory = directory[0] + "/richkit/richkit/lookup/data"
# The directory is created as a side effect of importing this module.
Path(maxmind_directory).mkdir(parents=True, exist_ok=True)
 28 | 
 29 | 
 30 | class MaxMindDB:
 31 |     """
 32 |     This class provides functions to download, extract and get data from MaxMind DBs
 33 |     """
 34 | 
 35 |     # Dict to lookup const's, structured like this:
 36 |     # name given by MaxMind, name of the extracted DB, directory of the downloaded file from MaxMind
 37 |     helpers = {
 38 |         "asn": ['GeoLite2-ASN_', 'GeoLite2-ASN.mmdb', str(Path(maxmind_directory, "asn.tar.gz"))],
 39 |         "cc": ['GeoLite2-Country_', 'GeoLite2-Country.mmdb', str(Path(maxmind_directory, "cc.tar.gz"))]
 40 |     }
 41 | 
 42 |     def __init__(self, url, query):
 43 |         self.MASTERURL = url
 44 |         self.query = query
 45 |         self.path_db = maxmind_directory
 46 |         if MaxMindDB.get_db_path(self) is None:
 47 |             MaxMindDB.get_db(self)
 48 |         #  weeks = 1 because the database is updated once a week.
 49 |         #  if it is downloaded more than one week ago, it will be removed and updated
 50 | 
 51 |         if self.get_age() > timedelta(weeks=1):
 52 |             os.remove(self.get_db_path())
 53 |             MaxMindDB.get_db(self)
 54 | 
 55 |     def get_db(self):
 56 |         """
 57 |         Download the MaxMind database in zip format from the MaxMind website
 58 | 
 59 |         """
 60 |         logger.debug('Downloading the '+self.helpers[self.query][2]+' DB ... ')
 61 |         try:
 62 |             response = requests.get(self.MASTERURL, stream=True)
 63 |         except Exception as e:
 64 |             logger.error('Reraising Exception raised by requests.get ({})'.format(e))
 65 |             raise e
 66 | 
 67 |         if response.status_code == 200:
 68 |             with open(self.helpers[self.query][2], 'wb') as file:
 69 |                 file.write(response.content)
 70 |         else:
 71 |             msg = (
 72 |                 'Error while downloading the ASN DB '
 73 |                 '(Status Code={}): {}'
 74 |             ).format(
 75 |                 response.status_code,
 76 |                 response.text,
 77 |             )
 78 |             logger.error(msg)
 79 |             raise Exception(msg)
 80 |         self.unpack()
 81 | 
 82 |     def unpack(self):
 83 |         """
 84 |         Extract MaxMind DB
 85 |         """
 86 |         if os.path.exists(self.helpers[self.query][2]):
 87 |             subprocess.Popen(['tar', '-xzf', self.helpers[self.query][2]], cwd=maxmind_directory)
 88 |             time.sleep(2)
 89 |         else:
 90 |             msg = 'Error extract DB on get_db '
 91 |             logger.error(msg)
 92 |             raise Exception(msg)
 93 | 
 94 |     def get_db_path(self):
 95 |         """
 96 |         Return the ASN Database path if exists
 97 | 
 98 |         """
 99 |         filtered_dir = [x for x in os.listdir(
100 |             self.path_db) if x.startswith(self.helpers[self.query][0])]
101 |         sorted_dir = sorted(filtered_dir, reverse=True)
102 |         if sorted_dir:
103 |             return str(Path(
104 |                 maxmind_directory,
105 |                 sorted_dir[0],
106 |                 self.helpers[self.query][1],
107 |             ))
108 |         else:
109 |             return None
110 | 
111 |     def open_db(self):
112 |         country_code_db_path = self.get_db_path()
113 |         reader = maxminddb.open_database(country_code_db_path)
114 |         return reader
115 | 
116 |     def get_data(self, ip_address):
117 |         reader = self.open_db()
118 |         return reader.get(ip_address)
119 | 
120 |     def get_age(self):
121 |         reader = self.open_db()
122 |         delta = datetime.now() - datetime.fromtimestamp(
123 |             reader.metadata().build_epoch
124 |         )
125 |         return delta
126 | 


--------------------------------------------------------------------------------
/richkit/retrieve/__init__.py:
--------------------------------------------------------------------------------
 1 | """Retrieval of data on domain names.
 2 | 
 3 | This module provides the ability to retrieve data on domain names of
 4 | any sort. It comes without the "confidentiality contract" of
 5 | `richkit.lookup`.
 6 | 
 7 | """
 8 | from richkit.retrieve import symantec
 9 | from richkit.retrieve import dns
10 | 
11 | 
def symantec_category(domain):
    """
    Returns the category from Symantec's BlueCoat service.

    Delegates to `richkit.retrieve.symantec.fetch_from_internet`.

    :param domain: domain to categorise, passed on unchanged
    :return: result of `symantec.fetch_from_internet`
    """
    return symantec.fetch_from_internet(domain)
19 | 
20 | 
def dns_a(domain):
    """
    Return the A Records of a given domain

    Delegates to `richkit.retrieve.dns.get_a_record`.

    :param domain: domain (string)
    :return: IP Addresses (list), or None when the query fails
    """
    return dns.get_a_record(domain)
28 | 
29 | 
def dns_ptr(ip_address):
    """
    Return the PTR record of a given IP address

    Delegates to `richkit.retrieve.dns.get_ptr_record`.

    :param ip_address: IP Address (string)
    :return: domains (list), or None when the query fails
    """
    return dns.get_ptr_record(ip_address)
37 | 


--------------------------------------------------------------------------------
/richkit/retrieve/cert_sh.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import logging
 4 | from richkit.retrieve.x509 import X509
 5 | from datetime import datetime
 6 | 
 7 | logger = logging.getLogger(__name__)
 8 | 
 9 | 
class DomainCertificates:
    """
    This class provides the functions to get certificates of a given domain.
    The website used to get them is crt.sh
    """

    # Website used to retrieve the certificates belonging a domain
    crtSH_url = "https://crt.sh/{}"

    def __init__(self, domain):
        """
        Get the certificate features from the given domain
        :param domain: domain to analyze
        """
        self.domain = domain
        self.certificates = self.get_certificates(self.domain)
        self.certificates_features = None

    def get_certificates(self, domain):
        """
        Make a request and get the response content of the given domain
        :param domain: the choosen domain
        :return: decoded JSON answer of crt.sh, or None when the request
            fails or the domain is unknown (the error is logged)
        """
        try:
            r = requests.get(self.crtSH_url.format("?q=" + domain + "&output=json"))
            if r.status_code != 200:
                raise Exception("Server not available")
            content = r.content.decode('utf-8')
            if len(r.text) == 2:        # It's 2 when the domain is not found
                raise Exception("Domain not found")
            return json.loads(content)
        except Exception as e:
            logger.error('Error while retrieving certificates: %s', e)
            return None

    @staticmethod
    def _validity_days(not_before, not_after):
        """Return the number of days between two crt.sh timestamps."""
        timestamp_format = "%Y-%m-%dT%H:%M:%S"
        start = datetime.strptime(not_before, timestamp_format)
        end = datetime.strptime(not_after, timestamp_format)
        return (end.date() - start.date()).days

    def get_all(self):
        """
        Get the list of certificates for the given domain and the certificate features for each of them
        :return: list with one feature dict per certificate; empty when no
            certificates could be retrieved
        """
        certs_features = []
        # certificates is None when the download failed; treat it as empty
        for cert in self.certificates or []:
            cf = X509(cert.get('id'))
            not_before = cert.get('not_before')
            not_after = cert.get('not_after')
            features = {
                'ID': cert.get('id'),
                'Issuer': cert.get('issuer_name'),
                'Algorithm': cf.algorithm,
                'ValidationL': cf.policy_list,
                'NotBefore': not_before,
                'NotAfter': not_after,
                'Validity': self._validity_days(not_before, not_after),  # days
                'SANFeatures': cf.certificates_features
            }
            certs_features.append(features)
        self.certificates_features = certs_features
        return certs_features

    def get_certificates_list(self):
        """
        Get the list of certificates for the given domain
        :return: list with one dict per certificate holding its ID, issuer
            and validity period; empty when no certificates could be
            retrieved
        """
        certs_features = []
        # certificates is None when the download failed; treat it as empty
        for cert in self.certificates or []:
            not_before = cert.get('not_before')
            not_after = cert.get('not_after')
            features = {
                'ID': cert.get('id'),
                'Issuer': cert.get('issuer_name'),
                'NotBefore': not_before,
                'NotAfter': not_after,
                'Validity': self._validity_days(not_before, not_after),  # days
            }
            certs_features.append(features)
        self.certificates_features = certs_features
        return certs_features
96 | 


--------------------------------------------------------------------------------
/richkit/retrieve/ctlogs.py:
--------------------------------------------------------------------------------
 1 | from richkit.retrieve.cert_sh import DomainCertificates
 2 | from richkit.retrieve.x509 import X509
 3 | 
 4 | 
 5 | def get_logs(domain):
 6 |     """
 7 |     Get a list of certificates with all the features
 8 |     :param domain: Input domain
 9 |     """
10 |     try:
11 |         certs = DomainCertificates(domain)
12 |         return certs.get_all()
13 |     except Exception as e:
14 |         print(e)
15 | 
16 | 
def get_certificates(domain):
    """
    Get just the list of certificates of the domain
    :param domain: Input domain
    :return: result of `DomainCertificates.get_certificates_list`, or None
        when an exception occurred (the exception is printed)
    """
    listing = None
    try:
        listing = DomainCertificates(domain).get_certificates_list()
    except Exception as error:
        print(error)
    return listing
27 | 
28 | 
def get_certificates_features(cert_id):
    """
    Get the certificate features by certificate ID
    :param cert_id: crt.sh certificate ID
    :return: the `certificates_features` attribute of the `X509` object
        built for the ID, or None when an exception occurred (the exception
        is printed)
    """
    features = None
    try:
        features = X509(cert_id).certificates_features
    except Exception as error:
        print(error)
    return features
39 | 


--------------------------------------------------------------------------------
/richkit/retrieve/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/richkit/retrieve/data/.gitkeep


--------------------------------------------------------------------------------
/richkit/retrieve/data/categories_list.txt:
--------------------------------------------------------------------------------
1 | {"01": "Adult/Mature Content", "03": "Pornography", "04": "Sex Education", "05": "Intimate Apparel/Swimsuit", "06": "Nudity", "07": "Extreme", "09": "Scam/Questionable/Illegal", "0b": "Gambling", "0e": "Violence/Hate/Racism", "0f": "Weapons", "10": "Abortion", "11": "Hacking", "12": "Phishing", "14": "Entertainment", "15": "Business/Economy", "16": "Alternative Spirituality/Belief", "17": "Alcohol", "18": "Tobacco", "19": "Controlled Substances", "1a": "Child Pornography", "1b": "Education", "1d": "Charitable Organizations", "1e": "Art/Culture", "1f": "Financial Services", "20": "Brokerage/Trading", "21": "Games", "22": "Government/Legal", "23": "Military", "24": "Political/Social Advocacy", "25": "Health", "26": "Technology/Internet", "28": "Search Engines/Portals", "2b": "Malicious Sources/Malnets", "2c": "Malicious Outbound Data/Botnets", "2d": "Job Search/Careers", "2e": "News/Media", "2f": "Personals/Dating", "31": "Reference", "32": "Mixed Content/Potentially Adult", "33": "Chat (IM)/SMS", "34": "Email", "35": "Newsgroups/Forums", "36": "Religion", "37": "Social Networking", "38": "File Storage/Sharing", "39": "Remote Access Tools", "3a": "Shopping", "3b": "Auctions", "3c": "Real Estate", "3d": "Society/Daily Living", "3f": "Personal Sites", "40": "Restaurants/Dining/Food", "41": "Sports/Recreation", "42": "Travel", "43": "Vehicles", "44": "Humor/Jokes", "47": "Software Downloads", "53": "Peer-to-Peer (P2P)", "54": "Audio/Video Clips", "55": "Office/Business Applications", "56": "Proxy Avoidance", "57": "For Kids", "58": "Web Ads/Analytics", "59": "Web Hosting", "5a": "Uncategorized", "5c": "Suspicious", "5d": "Sexual Expression", "5f": "Translation", "60": "Non-Viewable/Infrastructure", "61": "Content Servers", "62": "Placeholders", "65": "Spam", "66": "Potentially Unwanted Software", "67": "Dynamic DNS Host", "6a": "E-Card/Invitations", "6b": "Informational", "6c": "Computer/Information Security", "6d": "Internet Connected Devices", "6e": "Internet 
Telephony", "6f": "Online Meetings", "70": "Media Sharing", "71": "Radio/Audio Streams", "72": "TV/Video Streams", "76": "Piracy/Copyright Concerns", "79": "Marijuana"}


--------------------------------------------------------------------------------
/richkit/retrieve/dns.py:
--------------------------------------------------------------------------------
 1 | from dns import resolver
 2 | from dns import reversename
 3 | import logging
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | 
 8 | def get_a_record(domain):
 9 |     """
10 |     Return the A record list of a given domain
11 |     :param domain: domain (string)
12 |     :return: IP Addresses (list)
13 |     """
14 |     try:
15 |         a_record = []
16 |         result = resolver.query(domain, 'A')
17 |         for ip in result:
18 |             a_record.append(ip.to_text())
19 |         return a_record
20 |     except Exception as ex:
21 |         logger.error(ex)
22 |         return None
23 | 
24 | 
def get_ptr_record(ip_address):
    """
    Return the PTR record of a given IP Address
    :param ip_address: IP Address (string)
    :return: domains list, or None when the query raised an exception (the
        exception is logged)
    """
    try:
        reverse_name = reversename.from_address(ip_address)
        answers = resolver.query(reverse_name, 'PTR')
        return [rdata.to_text() for rdata in answers]
    except Exception as ex:
        logger.error(ex)
        return None
41 | 


--------------------------------------------------------------------------------
/richkit/retrieve/symantec.py:
--------------------------------------------------------------------------------
  1 | """Symantec Web Service
  2 | 
  3 | This is generated to get categories of given urls, normally it fetches
  4 | category from symantec web service then saves it to local file which
  5 | is called `categorized_urls` under `richkit/retrieve/data/`
  6 | 
  7 | 
  8 | How to use:
  9 | 
 10 |     >>> # Import the necessary functions and make a call as demonstrated below
 11 |     >>> from richkit.retrieve.symantec import fetch_from_internet
 12 |     >>> from richkit.retrieve.symantec import LocalCategoryDB
 13 |     >>>
 14 |     >>> urls = ["www.aau.dk","www.github.com","www.google.com"]
 15 |     >>>
 16 |     >>> local_db = LocalCategoryDB()
 17 |     >>> for url in urls:
 18 |     ...     url_category=local_db.get_category(url)
 19 |     ...     if url_category=='':
 20 |     ...         url_category=fetch_from_internet(url)
 21 |     ...     print(url_category)
 22 |     Education
 23 |     Technology/Internet
 24 |     Search Engines/Portals
 25 | 
 26 | """
 27 | import ast
 28 | import json
 29 | import os
 30 | from json import dumps
 31 | import re
 32 | from pathlib import Path
 33 | import logging
 34 | import requests
 35 | from xml.etree.ElementTree import fromstring
 36 | from requests.exceptions import HTTPError
 37 | from requests.exceptions import InvalidURL
 38 | logger = logging.getLogger(__name__)
 39 | 
 40 | """
 41 | Configuration
 42 | Get one here: http://www1.k9webprotection.com/get-k9-web-protection-free
 43 | """
# Download location of the category list used by fetch_categories().
categories_url = "https://gitlab.com/snippets/1740321/raw"
# The "data" directory next to this module holds the local cache files.
data_path = Path(os.path.dirname(__file__), 'data')
categories_file_path = data_path / "categories_list.txt"
categorized_urls_file = data_path / "categorized_urls.txt"

# Placeholder value; fetch_from_internet() inserts it into the web service URL.
k9License = 'Replace_by_your_own_license'
 50 | 
 51 | 
 52 | class LocalCategoryDB():
 53 |     def __init__(self):
 54 | 
 55 |         self.url_to_category = read_categorized_file()
 56 | 
 57 |     def get_category(self, url):
 58 |         if url in self.url_to_category:
 59 |             return self.url_to_category[url]
 60 |         else:
 61 |             return ''
 62 | 
 63 | 
 64 | def fetch_categories(categories_url=categories_url, local_categories_path=categories_file_path):
 65 |     """Fetch categories and create local cache """
 66 |     if not categories_url:
 67 |         return None
 68 |     try:
 69 |         resp = requests.get(categories_url)
 70 |         data = resp.json()
 71 |         d = dict([('%02x' % c['num'], c['name']) for c in data])
 72 |     except HTTPError as e:
 73 |         logger.error('Cannot fetch categories, HTTP error: %s\n' % str(e.code))
 74 |     except InvalidURL as e:
 75 |         logger.error('Cannot fetch categories, URL error: %s\n' % str(e.reason))
 76 |     try:
 77 |         f = open(local_categories_path, 'w')
 78 |         f.write(dumps(d))
 79 |         f.close()
 80 |     except Exception as e:
 81 |         f.close()
 82 |         logger.error('Cannot save categories: %s\n' % e)
 83 |     return d
 84 | 
 85 | 
 86 | #
 87 | def load_categories(name):
 88 |     """Load categories from a cache file"""
 89 |     if not name:
 90 |         return None
 91 |     d = {}
 92 |     try:
 93 |         f = open(name, 'r')
 94 |         data = f.read()
 95 |         d = ast.literal_eval(data)
 96 |         f.close()
 97 |     except FileNotFoundError as e:
 98 |         return {}
 99 |     except OSError as er:
100 |         f.close()
101 |         os.exit(1)
102 |     return d
103 | 
104 | 
def check_local_categories_file_exists(categories_file_path=categories_file_path):
    """Return the category table, downloading it when the local cache is empty.

    :param categories_file_path: path of the local cache file
    :return: whatever ``load_categories`` returned unless that was an empty
        dict, in which case the result of ``fetch_categories``
    """
    cached = load_categories(categories_file_path)
    if cached != {}:
        return cached
    return fetch_categories(categories_url, categories_file_path)
110 | 
111 | 
def _chunks(s):
    """Split ``s`` into consecutive pieces of two items (the last may be shorter)."""
    # Original: https://github.com/allfro/sploitego/blob/master/src/sploitego/webtools/bluecoat.py
    pieces = []
    for start in range(0, len(s), 2):
        pieces.append(s[start:start + 2])
    return pieces
115 | 
116 | 
# Used after an API lookup to append the freshly obtained record to the
# local categorized-URLs file.
def write_to_local_file(text, categorized_urls_file=categorized_urls_file):
    """Append ``text`` followed by a newline to ``categorized_urls_file``."""
    record = text + "\n"
    with open(categorized_urls_file, 'a') as handle:
        handle.write(record)
121 | 
122 | 
def fetch_from_internet(url, categories_file_path=categories_file_path, categorized_urls_file=categorized_urls_file):
    """Ask the web service for the category of ``url`` and cache the answer.

    The first two-character code found in the ``DomC`` element (or, when
    that element is absent, in ``DirC``) is translated through the category
    table and the resulting ``url,category`` pair is appended to
    ``categorized_urls_file``.

    :param url: host name to categorise
    :param categories_file_path: local cache of the category table
    :param categorized_urls_file: file the result is appended to
    :return: category name, ``'Unknown'`` when the code is not in the table,
        or ``''`` when the service gave no usable answer
    """
    hostname = url
    port = '80'
    webservice_endpoint = 'http://sp.cwfservice.net/1/R/%s/K9-00006/0/GET/HTTP/%s/%s///' % (
        k9License, hostname, port)
    r = requests.get(webservice_endpoint)
    if r.status_code != 200:
        return ''

    e = fromstring(r.text)
    domc = e.find('DomC')
    dirc = e.find('DirC')
    # DomC takes precedence over DirC, as in the original branch order.
    node = domc if domc is not None else dirc
    # An element without text yields no codes instead of raising.
    cats = _chunks(node.text or '') if node is not None else []
    if not cats:
        logger.error('Cannot get category for %s\n' % hostname)
        return ''

    # Load the table once and honour the caller-supplied cache path in both
    # branches (the DomC branch used to ignore it).
    categories = check_local_categories_file_exists(categories_file_path) or {}
    # Only the first code is reported.
    result = re.sub('\n', '', categories.get(cats[0].lower(), 'Unknown'))
    logger.debug('%s,%s\n' % (hostname, result))
    write_to_local_file(url + "," + result, categorized_urls_file)
    return result
150 | 
151 | 
def read_categorized_file(file_path=categorized_urls_file):
    """Read the ``url,category`` file into a dict.

    The file is created empty when it does not exist yet. Each line is split
    at its first comma, so a category name that itself contains a comma is
    kept whole. Lines without a comma (blank or malformed) are skipped.

    :param file_path: path of the categorized-URLs file
    :return: dict mapping URL to category; later lines override earlier ones
    """
    url_to_category = dict()
    if not os.path.exists(file_path):
        open(file_path, 'w').close()
        return url_to_category

    with open(file_path, "r") as ins:
        for line in ins:
            url, separator, category = line.replace('\n', '').partition(',')
            if not separator:
                continue
            url_to_category[url] = category
    return url_to_category
163 | 
164 | 
def check_for_local(url):
    """Return the locally stored category of ``url``, or '' when there is none.

    ``read_categorized_file`` returns a dict mapping URL to category, so the
    lookup is direct. The previous implementation iterated that dict as if
    it were a list of ``url,category`` lines and therefore never found a
    match.

    :param url: URL to look up
    :return: category name or ''
    """
    return read_categorized_file().get(url, '')
184 | 
185 | 
def get_index(category):
    """Return the first code whose category name equals ``category``, else None."""
    table = check_local_categories_file_exists()
    return next((code for code, label in table.items() if label == category), None)
190 | 


--------------------------------------------------------------------------------
/richkit/retrieve/urlvoid.py:
--------------------------------------------------------------------------------
  1 | from bs4 import BeautifulSoup
  2 | import logging
  3 | import re
  4 | import requests
  5 | 
  6 | logger = logging.getLogger(__name__)
  7 | 
# RFC 6793 defines AS numbers as 32-bit integers. By convention (of unknown
# origin) they are written as "AS" followed by the decimal number.
# \d{1,10} only bounds the digit count (4,294,967,295 has ten digits); it
# does not enforce the numeric maximum.
ASN_REGEX = re.compile('AS\\d{1,10}')
 12 | 
 13 | 
 14 | class URLVoid(object):
 15 | 
 16 |     def __init__(self, domain):
 17 |         self.domain = domain
 18 |         self.value = self.urlvoid_parser()
 19 | 
 20 |     def urlvoid_parser(self):
 21 |         """
 22 |         Parses URLVOID table with beatifulsoup
 23 |         :return: dictionary which contains urlvoid response
 24 |         """
 25 |         url = "https://www.urlvoid.com/scan/" + self.domain
 26 |         res = requests.get(url)
 27 |         text = res.text
 28 |         try:
 29 |             soup = BeautifulSoup(
 30 |                 text, "lxml"
 31 |             ).find(
 32 |                 "table",
 33 |                 class_="table table-custom table-striped"
 34 |             )
 35 |             all_tr = soup.find_all("tr")
 36 |             value = {tr.find_all("td")[0].text:
 37 |                      tr.find_all("td")[1].text.replace("\xa0", "")
 38 |                      for tr in all_tr}
 39 |         except ModuleNotFoundError as me:
 40 |             logger.error("Opps ! Error : %s", me)
 41 |         return value
 42 | 
 43 |     def get_last_analysis_date(self):
 44 |         """
 45 | 
 46 |         :return: Last analysis time of given domain on URLVOID
 47 |         """
 48 |         try:
 49 |             result = self.value["Last Analysis"][:-9]
 50 |         except KeyError as ke:
 51 |             logger.error('Error while retrieving value', ke)
 52 |         return result
 53 | 
 54 |     def domain_registration_date(self):
 55 |         """
 56 | 
 57 |         :return: Registration time of domain
 58 |         """
 59 |         try:
 60 |             result = self.value["Domain Registration"]
 61 |         except KeyError as ke:
 62 |             logger.error(' DRD: Error while retrieving value; %s ', ke)
 63 |         return result
 64 | 
 65 |     def blacklist_status(self):
 66 |         """
 67 | 
 68 |         :return: Blacklist status among 36 services or more which are enable
 69 |         in URLVOID itself.
 70 |         """
 71 |         try:
 72 |             result = self.value["Blacklist Status"]
 73 |         except KeyError as ke:
 74 |             logger.error(
 75 |                 ' Blacklist status: Error while retrieving value; %s ', ke)
 76 |         return result
 77 | 
 78 |     def get_asn(self):
 79 |         """
 80 | 
 81 |         :return: ASN Number
 82 |         """
 83 |         try:
 84 |             result = self.value["ASN"]
 85 |         except KeyError as ke:
 86 |             logger.error('ASN: Error while retrieving value; %s ', ke)
 87 |         m = ASN_REGEX.search(result)
 88 |         if m is None:
 89 |             logger.error(
 90 |                 "Failed to parse ASN for {} from \"{}\"".format(
 91 |                     self.domain,
 92 |                     result,
 93 |                 )
 94 |             )
 95 |             return None
 96 |         else:
 97 |             return m.group()
 98 | 
 99 |     def get_server_location(self):
100 |         """
101 | 
102 |         :return: Server location of domain which exists on URLVOID
103 |         """
104 |         try:
105 |             result = self.value["Server Location"]
106 |         except KeyError as ke:
107 |             logger.error(
108 |                 ' Server Location : Error while retrieving value; %s ', ke)
109 |         return result
110 | 
111 |     def get_ip_address(self):
112 |         """
113 | 
114 |         :return: IP address of given domain via URLVOID service
115 |         """
116 |         try:
117 |             result = self.value["IP Address"]
118 |         except KeyError as ke:
119 |             logger.error(' IP Address: Error while retrieving value; %s ', ke)
120 |         return result
121 | 
122 |     def get_detection_rate(self):
123 |         """
124 | 
125 |         :return: Returns detection rate in percentage.
126 |         """
127 |         try:
128 |             parts = self.blacklist_status().split("/")
129 |             result = int(parts[0]) / int(parts[1]) * 100
130 |         except IndexError as ie:
131 |             logger.error(
132 |                 'Detection rate : Error while retrieving value; %s ', ie)
133 |         return result
134 | 


--------------------------------------------------------------------------------
/richkit/retrieve/whois.py:
--------------------------------------------------------------------------------
 1 | import whois
 2 | import logging
 3 | 
 4 | logger = logging.getLogger(__name__)
 5 | 
 6 | 
 7 | def get_whois_info(domain):
 8 |     """Retrive a WHOIS information for a domain name
 9 | 
10 |     :param domain: Domain name
11 |     :type domain: str
12 |     :return: WHOIS information of given domain name
13 |     :rtype: dict (Actually a subclass of whois.parser.WhoisEntry, which
14 |     itself is a subclass of `dict`)
15 | 
16 |     """
17 |     result = whois.whois(domain)
18 | 
19 |     return result
20 | 


--------------------------------------------------------------------------------
/richkit/retrieve/x509.py:
--------------------------------------------------------------------------------
  1 | from richkit.analyse import tld, sld, sl_label, depth, length
  2 | import statistics
  3 | import requests
  4 | import logging
  5 | import time
  6 | 
  7 | logger = logging.getLogger(__name__)
  8 | 
  9 | 
 10 | class X509:
 11 |     """
 12 |     This class provides functions to extract certificate features from crt.sh
 13 |     The only needed parameter is the crt.sh ID of the certificate, it's possible to
 14 |     get it just making a request on crt.sh by listing all the certificates for a specific domain
 15 |     """
 16 | 
 17 |     # Website used to retrieve the certificates belonging a domain
 18 |     crtSH_url = "https://crt.sh/{}"
 19 | 
 20 |     def __init__(self, cert_id):
 21 |         """
 22 |         Get the Subject Alternative Name features from the given certificate
 23 |         :param cert_id: unique ID given by crt.sh per certificate
 24 |         """
 25 |         self.cert_id = cert_id
 26 |         self.algorithm = None
 27 |         self.policy_list = None
 28 |         self.certificates_features = None
 29 |         self.get_certificate_features()
 30 | 
 31 |     def get_certificate_info(self, cert_id):
 32 |         """
 33 |         Make a request and get the response content of the given ID
 34 |         :param cert_id: crt.sh ID of the certificate
 35 |         :return: response as text or None in case an Exception raised
 36 |         """
 37 |         try:
 38 |             r = requests.get(self.crtSH_url.format("?id=" + cert_id))
 39 |             if "<BR><BR>Certificate not found </BODY>" in r.text:
 40 |                 raise Exception("Certificate not found")
 41 |             if "<BR><BR>Invalid value:" in r.text:
 42 |                 raise Exception("Certificate not found")
 43 |             return r.text
 44 |         except Exception as e:
 45 |             raise e
 46 | 
 47 |     def get_certificate_features(self):
 48 |         """
 49 |         Parse the response content to get the certificate features
 50 |         """
 51 |         text = None
 52 |         for _ in range(5):
 53 |             if text is not None:
 54 |                 break
 55 |             try:
 56 |                 text = self.get_certificate_info(str(self.cert_id))
 57 |                 text_list = text.split('<BR>')
 58 |             except:
 59 |                 time.sleep(10)
 60 | 
 61 |         sans = SANList()           # Used to store the  SANs
 62 |         policy_list = []        # Used to store the policies in order to get the Validation Level
 63 | 
 64 |         algo_index = '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Signature&nbsp;Algorithm:'
 65 |         san_index = \
 66 |             '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;DNS:'
 67 | 
 68 |         san_index_email = \
 69 |             '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;email:'
 70 | 
 71 |         policy_index = \
 72 |             '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;' \
 73 |             '&nbsp;&nbsp;&nbsp;&nbsp;Policy:&nbsp;'
 74 |         for row in text_list:
 75 |             # Get Signature Algorithm
 76 |             if algo_index in row:
 77 |                 self.algorithm = row[len(algo_index) + 6:]
 78 | 
 79 |             # Get SANs
 80 |             if san_index in row:
 81 |                 sans.append(row[len(san_index):])
 82 |             if san_index_email in row:
 83 |                 sans.append(row[len(san_index_email):])
 84 | 
 85 |             if policy_index in row:
 86 |                 policy_list.append(row[len(policy_index):])
 87 | 
 88 |         # Calculating the LCS
 89 |         apex = [sld(san) for san in sans.get_sans()]
 90 |         lcs_num = get_lcs_apex(apex)
 91 | 
 92 |         self.policy_list = policy_list
 93 |         self.certificates_features = dict({
 94 |             'san_list': sans.get_sans(),
 95 |             'DomainCount': len(sans.get_sans()),
 96 |             'UniqueApexCount': unique_apex(sans.get_sans()),
 97 |             'UniqueSLDCount': unique_sld(sans.get_sans()),
 98 |             'ShortestSAN': sans.min(),
 99 |             'LongestSAN': sans.max(),
100 |             'SANsMean': sans.mean(),
101 |             'MinSubLabels': sans.min_labels(),
102 |             'MaxSubLabels': sans.max_labels(),
103 |             'MeanSubLabels': sans.mean_labels(),
104 |             'UniqueTLDsCount': unique_tld(sans.get_sans()),
105 |             'UniqueTLDsDomainCount': sans.uniqueTLDsDomainCount(),
106 |             'ApexLCS': None,        # Don't need to implement
107 |             'LenApexLCS': lcs_num,
108 |             'LenApexLCSNorm': sans.lenApexLCSNorm(lcs_num),
109 |         })
110 | 
111 | 
def unique_apex(sans):
    """
    Count the distinct values ``sld()`` yields over the given names
    (the unique apex/root domains covered by the certificate)
    :param sans: List of Subject Alternative Name
    """
    return len({sld(name) for name in sans})
119 | 
120 | 
def unique_tld(sans):
    """
    Count the distinct values ``tld()`` yields over the given names
    (the unique TLDs covered by the certificate)
    :param sans: List of Subject Alternative Name
    """
    return len({tld(name) for name in sans})
128 | 
129 | 
def unique_sld(sans):
    """
    Count the distinct values ``sl_label()`` yields over the given names
    (the unique effective 2-level labels covered by the certificate)
    :param sans: List of Subject Alternative Name
    """
    return len({sl_label(name) for name in sans})
137 | 
138 | 
def get_lcs_apex(apex):
    """
    Largest ``lcs()`` value over every pair of distinct positions in the list
    :param apex: apex array
    :return: the largest pairwise value, 0 when there are fewer than two entries
    """
    best = 0
    for position, first in enumerate(apex):
        # Compare against every other entry; equal strings at other
        # positions are still compared.
        for second in apex[:position] + apex[position + 1:]:
            best = max(best, lcs(first, second))
    return best
154 | 
155 | 
def lcs(x, y):
    """
    Length of the longest common subsequence of two strings, computed with
    the classic dynamic-programming recurrence
    :param x: First string
    :param y: Second string
    :return: length of the common subsequence
    """
    width = len(y)
    # Only the previous table row is needed to compute the next one.
    previous = [0] * (width + 1)
    for char_x in x:
        current = [0]
        for col, char_y in enumerate(y, start=1):
            if char_x == char_y:
                current.append(previous[col - 1] + 1)
            else:
                current.append(max(previous[col], current[col - 1]))
        previous = current
    return previous[width]
177 | 
178 | 
class SANList:
    """
    This class provides the functions to extract features from the SAN list

    Every statistic returns 0 when the list is empty.
    """

    def __init__(self):
        self.sans = []

    def append(self, san):
        """Add one Subject Alternative Name to the list."""
        self.sans.append(san)

    def get_sans(self):
        """Return the underlying list of names (not a copy)."""
        return self.sans

    def min(self):
        """Smallest ``length()`` value over the names."""
        if not self.sans:
            return 0
        # Convert before comparing: length() yields strings, which would
        # otherwise be ordered lexicographically ("10" < "9").
        return min(int(length(row)) for row in self.sans)

    def max(self):
        """Largest ``length()`` value over the names."""
        if not self.sans:
            return 0
        return max(int(length(row)) for row in self.sans)

    def mean(self):
        """Mean number of characters per name."""
        if not self.sans:
            return 0
        return statistics.mean([len(row) for row in self.sans])

    def min_labels(self):
        """Smallest ``depth()`` value minus 2 over the names."""
        if not self.sans:
            return 0
        return min(int(depth(row)) - 2 for row in self.sans)

    def max_labels(self):
        """Largest ``depth()`` value minus 2 over the names."""
        if not self.sans:
            return 0
        return max(int(depth(row)) - 2 for row in self.sans)

    def mean_labels(self):
        """Mean ``depth()`` value over the names."""
        if not self.sans:
            return 0
        # NOTE(review): unlike min_labels/max_labels no 2 is subtracted
        # here; confirm whether that difference is intended.
        return statistics.mean([int(depth(row)) for row in self.sans])

    def uniqueTLDsDomainCount(self):
        """Number of unique TLDs divided by the number of names."""
        if not self.sans:
            return 0
        return unique_tld(self.sans) / len(self.sans)

    def lenApexLCSNorm(self, lcs):
        """Given LCS length divided by the number of names."""
        if not self.sans:
            return 0
        return lcs / len(self.sans)
232 | 


--------------------------------------------------------------------------------
/richkit/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/richkit/test/__init__.py


--------------------------------------------------------------------------------
/richkit/test/analyse/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/richkit/test/analyse/__init__.py


--------------------------------------------------------------------------------
/richkit/test/analyse/test_analyse.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import unittest
  3 | 
  4 | 
  5 | from richkit import analyse
  6 | from os import path
  7 | import requests
  8 | import tempfile
  9 | import logging
 10 | import os
 11 | 
# Configure root logging at DEBUG level with timestamped records; this runs
# when the test module is imported.
logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    datefmt='%m-%d %H:%M',
                    level=logging.DEBUG)
logger = logging.getLogger(__name__)
 16 | 
 17 | 
 18 | class TestEffect2LD():
 19 |     temp_directory = tempfile.mkdtemp()
 20 |     MASTERURL = "https://raw.githubusercontent.com/publicsuffix/list/master/tests/test_psl.txt"
 21 |     MASTERFILE = temp_directory + 'correct_test.txt'
 22 |     test = None
 23 | 
 24 |     @classmethod
 25 |     def fetch_tlds(cls, url=None):
 26 |         url = url or cls.MASTERURL
 27 | 
 28 |         # grab master list
 29 |         response = requests.get(url, stream=True)
 30 |         if response.status_code == 200:
 31 |             with open(cls.MASTERFILE, 'wb') as file:
 32 |                 file.write(response.content)
 33 |         else:
 34 |             logger.error('Error while downloading the Test List status code: %s',
 35 |                          response.status_code)
 36 | 
 37 |     @classmethod
 38 |     def load_tlds(cls):
 39 |         try:
 40 |             f = open(cls.MASTERFILE, 'r', encoding="utf8")
 41 |             lines = f.readlines()
 42 |         except FileNotFoundError as e:
 43 | 
 44 |             logger.error("File not readable, not found %s", e)
 45 |             f.close()
 46 |         f.close()
 47 | 
 48 |         # strip comments and blank lines
 49 |         lines = [ln for ln in (ln.strip() for ln in lines) if len(ln) and ln[:2] != '//']
 50 | 
 51 |         cls.test = set(lines)
 52 | 
 53 |     def load(self):
 54 | 
 55 |         if path.exists(TestEffect2LD.MASTERFILE):
 56 |             TestEffect2LD.load_tlds()
 57 | 
 58 |         if TestEffect2LD.test is None:
 59 |             TestEffect2LD.fetch_tlds()
 60 |             TestEffect2LD.load_tlds()
 61 | 
 62 |     def get_tests(self):
 63 |         test_list = []
 64 |         for i in TestEffect2LD.test:
 65 |             parser = i[i.find("(")+1:i.find(")")]
 66 |             test_list.append(parser.replace(" ", "").replace("null", "'None'"))
 67 |         return test_list
 68 | 
 69 | 
 70 | class TestAnalyse(unittest.TestCase):
 71 | 
 72 |     def setUp(self):
 73 |         self.domain = {
 74 |             'www.google.co.uk': {
 75 |                 'num_tokens': 4,
 76 |                 'len2ld': 12,
 77 |                 'len_domain': 13,
 78 |                 'domain_tld': "co.uk",
 79 |                 'domain_sld': "google.co.uk",
 80 |                 'second_label': "google",
 81 |                 'language': "en",
 82 |                 'nld': "www.google.co.uk",
 83 |                 'n_label': "www",
 84 |                 'entropy': 2.8553885422075336,
 85 |                 'num_words_2ld': 0,
 86 |                 'vowels': 5,
 87 |                 'ratio_vowels': 0.4166666666666667,
 88 |                 'num_of_consonants_2ld': 5,
 89 |                 'ratio_consonants_2ld': 0.4166666666666667,
 90 |                 'num_of_special_2ld': 0,
 91 |                 'ratio_special_2ld': 0.0,
 92 |                 'num_numeric_2ld': 0,
 93 |                 'radio_numeric_2ld': 0.0,
 94 |                 # following values are smaller than expected due to top 100 alexa which is expected
 95 |                 'n_grams_2ld': 27.33635144637163,
 96 |                 'n_grams_2ld_alexa': 27.33081895777167
 97 |             },
 98 |             'www.intranet.es.aau.dk': {
 99 |                 'num_tokens': 5,
100 |                 'len2ld': 6,
101 |                 'len_domain': 18,
102 |                 'domain_tld': "dk",
103 |                 'domain_sld': 'aau.dk',
104 |                 'second_label': "aau",
105 |                 'language': "en",
106 |                 'nld': "es.aau.dk",
107 |                 'n_label': "es",
108 |                 'entropy': 2.2516291673878226,
109 |                 'num_words_2ld': 0,
110 |                 'vowels': 3,
111 |                 'ratio_vowels': 0.5,
112 |                 'num_of_consonants_2ld': 2,
113 |                 'ratio_consonants_2ld': 0.3333333333333333,
114 |                 'num_of_special_2ld': 0,
115 |                 'ratio_special_2ld': 0.0,
116 |                 'num_numeric_2ld': 0,
117 |                 'radio_numeric_2ld': 0.0,
118 |                 # this is 0.0 because of gathering top 100 alexa db, written for just ensuring test functions running correctly
119 |                 'n_grams_2ld': 0.0,
120 |                 'n_grams_2ld_alexa':  0.0
121 |             }
122 |         }
123 |         self.data_path = "data/"
124 | 
125 |     def tearDown(self):
126 |         """
127 |             Removes the file after test is done.
128 |             Could be modified in future according to need
129 |         """
130 |         if os.path.isfile('top-1m.csv'):
131 |             os.remove('top-1m.csv')
132 | 
133 |     def test_tld(self):
134 |         for k, v in self.domain.items():
135 |             domain_tld = analyse.tld(k)
136 |             self.assertEqual(domain_tld, v['domain_tld'])
137 | 
138 |     def test_sld(self):
139 |         for k, v in self.domain.items():
140 |             domain_sld = analyse.sld(k)
141 |             self.assertEqual(domain_sld, v['domain_sld'])
142 | 
143 |     def test_sl_label(self):
144 |         for k, v in self.domain.items():
145 |             domain_sld = analyse.sl_label(k)
146 |             self.assertEqual(domain_sld, v['second_label'])
147 | 
148 |     def test_nld(self):
149 |         for k, v in self.domain.items():
150 |             nld3 = analyse.nld(k, 3)
151 |             self.assertEqual(nld3, v['nld'])
152 | 
153 |     def test_n_label(self):
154 |         for k, v in self.domain.items():
155 |             n_label3 = analyse.n_label(k, 3)
156 |             self.assertEqual(n_label3, v['n_label'])
157 | 
158 |     def test_depth(self):
159 |         for k, v in self.domain.items():
160 |             domain_depth = analyse.depth(k)
161 |             self.assertEqual(domain_depth, str(v['num_tokens']))
162 | 
163 |     def test_length(self):
164 |         for k, v in self.domain.items():
165 |             domain_length = analyse.length(k)
166 |             self.assertEqual(domain_length, str(v['len_domain']))
167 | 
168 |     def test_language(self):
169 |         for k, v in self.domain.items():
170 |             domain_language = analyse.language(k)
171 |             self.assertEqual(domain_language, v['language'])
172 | 
173 |     def test_entropy(self):
174 |         for k, v in self.domain.items():
175 |             domain_entropy = analyse.entropy(k)
176 |             self.assertEqual(domain_entropy, str(v['entropy']))
177 | 
178 |     def test_ratio_vowels(self):
179 |         for k, v in self.domain.items():
180 |             domain_ratio_vowels = analyse.ratio_vowels(k)
181 |             self.assertEqual(domain_ratio_vowels, str(v['ratio_vowels']))
182 | 
183 |     def test_number_vowels(self):
184 |         for k, v in self.domain.items():
185 |             domain_number_vowels = analyse.number_vowels(k)
186 |             self.assertEqual(domain_number_vowels, str(v['vowels']))
187 | 
188 |     def test_ratio_consonants(self):
189 |         for k, v in self.domain.items():
190 |             domain_ratio_consonants = analyse.ratio_consonants(k)
191 |             self.assertEqual(domain_ratio_consonants, str(v['ratio_consonants_2ld']))
192 | 
193 |     def test_number_consonants(self):
194 |         for k, v in self.domain.items():
195 |             domain_number_consonants = analyse.number_consonants(k)
196 |             self.assertEqual(domain_number_consonants, str(v['num_of_consonants_2ld']))
197 | 
198 |     def test_ratio_numerics(self):
199 |         for k, v in self.domain.items():
200 |             domain_ratio_numerics = analyse.ratio_numerics(k)
201 |             self.assertEqual(domain_ratio_numerics, str(v['radio_numeric_2ld']))
202 | 
203 |     def test_number_numerics(self):
204 |         for k, v in self.domain.items():
205 |             domain_number_numerics = analyse.number_numerics(k)
206 |             self.assertEqual(domain_number_numerics, str(v['num_numeric_2ld']))
207 | 
208 |     def test_ratio_specials(self):
209 |         for k, v in self.domain.items():
210 |             domain_ratio_specials = analyse.ratio_specials(k)
211 |             self.assertEqual(domain_ratio_specials, str(v['ratio_special_2ld']))
212 | 
213 |     def test_number_specials(self):
214 |         for k, v in self.domain.items():
215 |             domain_number_specials = analyse.number_specials(k)
216 |             self.assertEqual(domain_number_specials, str(v['num_of_special_2ld']))
217 | 
218 |     def test_number_words(self):
219 |         for k, v in self.domain.items():
220 |             domain_number_words = analyse.number_words(k)
221 |             self.assertEqual(domain_number_words, str(v['num_words_2ld']))
222 | 
223 |     def test_get_grams_alexa_2ld(self):
224 |         for k, v in self.domain.items():
225 |             alexa_grams_2ld = analyse.n_grams_alexa(k, is_test=True)
226 |             self.assertEqual(alexa_grams_2ld, v['n_grams_2ld_alexa'])
227 | 
228 |     def test_get_grams_dict_2ld(self):
229 |         for k, v in self.domain.items():
230 |             grams_dict_2ld = analyse.n_grams_dict(k, is_test=True)
231 |             self.assertEqual(grams_dict_2ld, v['n_grams_2ld'])
232 | 
233 |     def test_correctly_tlds(self):
234 |         tests = TestEffect2LD()
235 |         tests.load()
236 |         test_list = tests.get_tests()
237 | 
238 |         # Test skipped for the following list
239 |         # Punycode are not handled by this library
240 |         list_punycode_tests = [
241 |             'xn--85x722f.xn--55qx5d.cn',
242 |             'xn--85x722f.xn--fiqs8s',
243 |             'xn--55qx5d.cn',
244 |             'shishi.xn--55qx5d.cn',
245 |             'www.xn--85x722f.xn--fiqs8s',
246 |             'www.xn--85x722f.xn--55qx5d.cn',
247 |             'shishi.xn--fiqs8s'
248 |         ]
249 | 
250 |         # Test skipped for obvious invalid domains
251 |         list_test_error = [
252 |             '公司.cn',
253 |             '中国',
254 |             'biz',
255 |             'jp',
256 |             'us',
257 |             'com',
258 |             'a.b.example.example',
259 |             'b.example.example',
260 |             'example.example',
261 |             '.example.com',
262 |             '.com',
263 |         ]
264 | 
265 |         # Test skipped for the following domains list
266 |         # They start with esclamation point on the Public Suffix list
267 |         list_esclamation_point = [
268 |             'www.ck',
269 |             'www.city.kobe.jp',
270 |             'www.www.ck',
271 |             'city.kobe.jp'
272 |         ]
273 | 
274 |         for i in test_list:
275 |             values = i.split(',')
276 |             input = values[0].replace("'", "")
277 |             expected = values[1].replace("'", "")
278 |             if expected == "None":
279 |                 expected = None
280 | 
281 |             if input in list_punycode_tests or \
282 |                input in list_test_error or \
283 |                input in list_esclamation_point:
284 |                 continue
285 |             else:
286 |                 self.assertEqual(analyse.sld(input), expected)
287 | 


--------------------------------------------------------------------------------
/richkit/test/lookup/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/richkit/test/lookup/__init__.py


--------------------------------------------------------------------------------
/richkit/test/lookup/test_geo.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from pathlib import Path
 3 | from richkit.lookup import util
 4 | from richkit import lookup
 5 | import unittest
 6 | 
 7 | 
 8 | def rm_recursive(pth):
 9 |     pth = Path(pth)
10 |     # Recurse
11 |     for child in pth.glob('*'):
12 |         if child.is_file():
13 |             child.unlink()
14 |         else:
15 |             rm_recursive(child)
16 |     # Handle current pth
17 |     if pth.is_file():
18 |         pth.unlink()
19 |     else:
20 |         pth.rmdir()
21 | 
22 | 
class LookupTestCase(unittest.TestCase):
    """Tests for the lookup helpers exposed by ``richkit.lookup``."""

    def tearDown(self):
        # Empty the MaxMind directory after every test so that no test
        # depends on files left behind by a previous one.
        for el in Path(util.maxmind_directory).glob('*'):
            rm_recursive(el)

    def test_country(self):
        country = lookup.country("8.8.8.8")
        self.assertEqual(country, 'US')

    def test_asn(self):
        asn = lookup.asn("8.8.8.8")
        self.assertEqual(asn, 'AS15169')

    def test_registered_country(self):
        registered_country = lookup.registered_country("8.8.8.8")
        self.assertEqual(registered_country, 'US')

    def test_maxmindb_licence_key(self):
        test_license_key = "LICENSEKEY"
        os.environ["TEST_LICENSE_KEY"] = test_license_key
        # Do not leak the fake key into the environment of later tests.
        self.addCleanup(os.environ.pop, "TEST_LICENSE_KEY", None)

        license_key = lookup.maxmindb_licence_key("TEST_LICENSE_KEY")
        non_existing_license_key = lookup.maxmindb_licence_key("NON-EXISTING")

        # assertEqual, not assertTrue: the second argument of assertTrue is
        # only the failure message, so nothing was being compared before.
        self.assertEqual(license_key, test_license_key)
        # Compare by value; identity of equal strings is an interpreter
        # implementation detail.
        self.assertEqual(non_existing_license_key, 'NOLICENSEKEYFOUND')
47 | 


--------------------------------------------------------------------------------
/richkit/test/lookup/test_util.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | from datetime import datetime
  3 | 
  4 | from richkit.lookup import util
  5 | from richkit.lookup.util import MaxMindDB
  6 | import os
  7 | import unittest
  8 | import logging
  9 | from pathlib import Path
 10 | from requests.exceptions import ConnectionError
 11 | 
 12 | 
 13 | def rm_recursive(pth):
 14 |     pth = Path(pth)
 15 |     # Recurse
 16 |     for child in pth.glob('*'):
 17 |         if child.is_file():
 18 |             child.unlink()
 19 |         else:
 20 |             rm_recursive(child)
 21 |     # Handle current pth
 22 |     if pth.is_file():
 23 |         pth.unlink()
 24 |     else:
 25 |         pth.rmdir()
 26 | 
 27 | 
 28 | class StubMaxMindDB(MaxMindDB):
 29 |     """Stub with minimal __init__, to not hit error there."""
 30 | 
 31 |     def __init__(self):
 32 |         self.path_db = util.maxmind_directory
 33 |         self.query = "cc"
 34 | 
 35 | 
 36 | class MaxMindDBTestCase(unittest.TestCase):
 37 | 
 38 |     def setUp(self):
 39 |         # Remove the logging for tests
 40 |         logging.disable(logging.CRITICAL)
 41 | 
 42 |         MaxMindDB.MASTERURL = (
 43 |             "https://download.maxmind.com/app/geoip_download?"
 44 |             "edition_id=GeoLite2-Country&"
 45 |             "license_key={license_key}&"
 46 |             "suffix=tar.gz"
 47 |         ).format(
 48 |             license_key=os.environ['MAXMIND_LICENSE_KEY'],
 49 |         )
 50 | 
 51 |     def tearDown(self):
 52 |         # deletes the files after test is done
 53 |         for el in Path(util.maxmind_directory).glob('*'):
 54 |             rm_recursive(el)
 55 | 
 56 |     def test_init(self):
 57 |         obj = MaxMindDB(MaxMindDB.MASTERURL, "cc")
 58 |         self.assertIsNotNone(obj)
 59 | 
 60 |     def test_get_db_path(self):
 61 |         s = StubMaxMindDB()
 62 | 
 63 |         # No db present
 64 |         p = s.get_db_path()
 65 |         self.assertIsNone(p)
 66 | 
 67 |         # Single db present
 68 |         folder = Path(s.path_db, 'GeoLite2-Country_DUMMYFOLDER_1970')
 69 |         folder.mkdir()
 70 |         db_file = Path(folder, 'GeoLite2-Country.mmdb')
 71 |         db_file.touch()
 72 |         p = MaxMindDB.get_db_path(s)
 73 | 
 74 |         self.assertEqual(p, str(db_file))
 75 | 
 76 |         # Two dbs present
 77 |         folder = Path(s.path_db, 'GeoLite2-Country_DUMMYFOLDER_2040')
 78 |         folder.mkdir()
 79 |         db_file = Path(folder, 'GeoLite2-Country.mmdb')
 80 |         db_file.touch()
 81 |         p = MaxMindDB.get_db_path(s)
 82 | 
 83 |         self.assertEqual(p, str(db_file))
 84 | 
 85 |     def test_get_db(self):
 86 | 
 87 |         # When DNS fails:
 88 |         MaxMindDB.MASTERURL = (
 89 |             "https://this_domain_does_not_exist.local"
 90 |             "/download/geoip/database/GeoLite2-Country.tar.gz"
 91 |         )
 92 | 
 93 |         with self.assertRaises(ConnectionError):
 94 |             MaxMindDB(MaxMindDB.MASTERURL, "cc").get_db()
 95 | 
 96 |         # When URL is bad
 97 |         MaxMindDB.MASTERURL = MaxMindDB.MASTERURL.replace(
 98 |             '?', "THIS_URL_IS_WRONG")
 99 | 
100 |         with self.assertRaises(Exception):
101 |             MaxMindDB.get_db()
102 | 
103 |         # When all is fine:
104 |         self.setUp()
105 |         s = StubMaxMindDB()
106 |         s.get_db()
107 |         # Check if file is present
108 |         p = s.get_db_path()
109 |         s_age = s.get_age()  # get_age() function is tested over here
110 |         self.assertIsNotNone(p, "get_db did not a path to the db")
111 |         self.assertTrue(Path(p).exists())
112 |         self.assertTrue(s_age.microseconds)
113 | 
114 |     def test_extracted_db(self):
115 |         s = StubMaxMindDB()
116 |         # When fail to extract the DB
117 |         with self.assertRaises(Exception):
118 |             s.unpack()
119 | 


--------------------------------------------------------------------------------
/richkit/test/retrieve/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aau-network-security/richkit/2575eeeeaf2151f0a4c42d4bea9caa9fcc1350c7/richkit/test/retrieve/__init__.py


--------------------------------------------------------------------------------
/richkit/test/retrieve/test_ctlogs.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import richkit.retrieve.ctlogs as ct
 3 | from richkit.retrieve.cert_sh import DomainCertificates
 4 | from richkit.retrieve.x509 import X509
 5 | 
 6 | 
 7 | class TestCTLogs(unittest.TestCase):
 8 | 
 9 |     def setUp(self):
10 |         self.domains = {
11 |             'example.com': {
12 |                 'certs': [
13 |                     {
14 |                         "ID": "987119772",
15 |                         "Algorithm": "sha256WithRSAEncryption",
16 |                         "SANFeatures": {
17 |                             "DomainCount": 8,
18 |                         }
19 |                     },
20 |                     {
21 |                         "ID": "984858191",
22 |                         "Algorithm": "sha256WithRSAEncryption",
23 |                         "SANFeatures": {
24 |                             "DomainCount": 8,
25 |                         }
26 |                     },
27 |                     {
28 |                         "ID": "24560621",
29 |                         "Algorithm": "sha256WithRSAEncryption",
30 |                         "SANFeatures": {
31 |                             "DomainCount": 4,
32 |                         }
33 |                     },
34 |                 ]
35 |             }
36 |         }
37 | 
38 |     def test_init_domain(self):
39 |         obj = DomainCertificates("example.com")
40 |         if not obj.certificates:
41 |             self.skipTest("Server not available")
42 |         self.assertIsNotNone(obj)
43 | 
44 |     def test_init_certificate(self):
45 |         obj = X509("12345678")
46 |         if not obj.certificates_features:
47 |             self.skipTest("Server not available")
48 |         self.assertIsNotNone(obj)
49 | 
50 |     def test_domain_error(self):
51 |         with self.assertRaises(Exception):
52 |             DomainCertificates("this_domain_does_not_exist.com")
53 | 
54 |     def test_certificate_error(self):
55 |         with self.assertRaises(Exception):
56 |             X509("this_id_does_not_exist.com")
57 | 
58 |     def test_get_all_certificate(self):
59 | 
60 |         for k, v in self.domains.items():
61 |             certs = ct.get_logs(k)
62 |             print(certs)
63 |             if certs is None:
64 |                 self.skipTest("Server not available")
65 | 
66 |             for cert in certs:
67 |                 for vx in v["certs"]:
68 |                     if str(cert["ID"]) == str(vx["ID"]):
69 |                         assert cert["Algorithm"] == vx["Algorithm"]
70 |                         assert cert["SANFeatures"]["DomainCount"] == vx["SANFeatures"]["DomainCount"]
71 | 
72 |     def test_get_certificate_features(self):
73 | 
74 |         for k, v in self.domains.items():
75 |             for cert in v["certs"]:
76 |                 cert_features = ct.get_certificates_features(cert["ID"])
77 |                 if not cert_features:
78 |                     continue
79 |                 assert cert_features.get('DomainCount') == cert["SANFeatures"]["DomainCount"]
80 | 


--------------------------------------------------------------------------------
/richkit/test/retrieve/test_dns.py:
--------------------------------------------------------------------------------
 1 | from richkit.retrieve import dns
 2 | 
 3 | import unittest
 4 | 
 5 | 
 6 | class DNSTestCase(unittest.TestCase):
 7 |     # Since A record change every time, just checking whether we are retrieving a record or not
 8 |     def setUp(self):
 9 |         self.test_urls = ["www.google.co.uk", "www.cloudflare.com", "www.intranet.es.aau.dk"]
10 |         self.test_ips = ["8.8.8.8", "8.8.4.4", "1.1.1.1"]
11 | 
12 |     def test_a_record(self):
13 |         for url in self.test_urls:
14 |             instance = dns.get_a_record(url)
15 |             self.assertIsNot(instance[0], None)
16 | 
17 |     # Since PTR record change every time, just checking whether we are retrieving a record or not
18 |     def test_ptr_record(self):
19 |         for url in self.test_ips:
20 |             instance = dns.get_ptr_record(url)
21 |             self.assertIsNot(instance[0], None)
22 | 
23 | 
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
26 | 


--------------------------------------------------------------------------------
/richkit/test/retrieve/test_symantec.py:
--------------------------------------------------------------------------------
 1 | from richkit.retrieve.symantec import read_categorized_file
 2 | from richkit.retrieve.symantec import fetch_from_internet
 3 | from richkit.retrieve.symantec import fetch_categories
 4 | from richkit.retrieve.symantec import load_categories
 5 | from richkit.retrieve.symantec import categories_url
 6 | from pathlib import Path
 7 | import unittest
 8 | import os
 9 | 
10 | 
# Relative file names used by the tests below; the files are created in,
# and removed from, the current working directory.
CAT_URLS_FILE = 'categorized_urls.txt'
CATEGORIES_FILE_PATH = 'categories_list.txt'
13 | 
14 | 
class SymantecTestCase(unittest.TestCase):
    """Tests for the Symantec categorisation helpers."""

    @staticmethod
    def _remove_if_present(file_path):
        """Delete ``file_path``, ignoring the case where it does not exist."""
        try:
            Path(file_path).unlink()
        except FileNotFoundError:
            pass

    @classmethod
    def tearDownClass(cls):
        """
        Removes created resources during test phase
        """
        # Remove only the files named by this module's constants. Globbing
        # '*.txt' would also delete unrelated files (e.g. requirements.txt)
        # when the tests are run from the repository root.
        for file_name in (CAT_URLS_FILE, CATEGORIES_FILE_PATH):
            cls._remove_if_present(file_name)

    def test_read_categorized_file(self):
        cat_urls_file_path = Path(CAT_URLS_FILE)
        # Read with missing file
        self._remove_if_present(cat_urls_file_path)
        self.assertIsInstance(read_categorized_file(), dict)

        # Read with empty file
        cat_urls_file_path.touch()

        d = read_categorized_file(CAT_URLS_FILE)
        self.assertIsInstance(d, dict)
        self.assertEqual(len(d), 0)

        # Read something already in file
        with open(CAT_URLS_FILE, 'w') as fd:
            fd.writelines([
                'www.example.com,Example'
            ])
        d = read_categorized_file(CAT_URLS_FILE)
        self.assertIsInstance(d, dict)
        self.assertEqual(len(d), 1)
        self.assertEqual(d['www.example.com'], 'Example')

    def test_fetch_categories(self):
        # make sure that categories url is accessible and fetched correctly
        categories = fetch_categories(categories_url, CATEGORIES_FILE_PATH)
        self.assertNotEqual(categories, {})

    def test_load_categories(self):
        if os.path.isfile(CATEGORIES_FILE_PATH):
            self.assertNotEqual(load_categories(CATEGORIES_FILE_PATH), {})
        else:
            self.assertEqual(load_categories(CATEGORIES_FILE_PATH), {})

    def test_fetch_from_internet(
            self,
            categories_file_path=CATEGORIES_FILE_PATH,
            categorized_url_path=CAT_URLS_FILE
    ):
        # Remove the categorized-URL file afterwards, even when an assertion
        # below fails; otherwise the entry counts checked in
        # test_read_categorized_file could be thrown off.
        self.addCleanup(self._remove_if_present, categorized_url_path)

        domain_categories = {
            "Search Engines/Portals": [
                "www.bing.com",
                "www.google.com",
                "www.yandex.com"
            ],
            "Social Networking": [
                "www.facebook.com",
                "www.twitter.com"
            ]
        }
        for category, url_list in domain_categories.items():
            for url in url_list:
                fetched_category = fetch_from_internet(
                    url, categories_file_path, categorized_url_path
                )
                self.assertEqual(fetched_category, category)
88 | 
89 | 
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
92 | 


--------------------------------------------------------------------------------
/richkit/test/retrieve/test_urlvoid.py:
--------------------------------------------------------------------------------
 1 | from richkit.retrieve.urlvoid import URLVoid
 2 | 
 3 | import unittest
 4 | import re
 5 | 
 6 | 
 7 | class URLVoidTestCase(unittest.TestCase):
 8 |     test_urls = {
 9 |         "google.co.uk": {
10 |             "domain_registration": "1999-02-14",
11 |             "blacklist_status": "0/36",
12 |             "ASN": "AS15169",
13 |             "server_location": " (US) United States",
14 |             "detection_rate": 0,
15 |             "ip_address": "172.217.19.227",
16 |             "a_record": ['172.217.19.195', '172.217.17.67'],
17 |             "ptr_record": [
18 |                 'ams16s30-in-f67.1e100.net.',
19 |                 'ams16s31-in-f3.1e100.net.',
20 |                 'ams16s30-in-f3.1e100.net.'
21 |             ]
22 |         },
23 |         "facebook.com": {
24 |             "domain_registration": "1997-03-29",
25 |             "blacklist_status": "0/36",
26 |             "ASN": "AS32934",
27 |             "server_location": " (US) United States",
28 |             "detection_rate": 0,
29 |             "ip_address": "157.240.21.35",
30 |             "a_record": ['31.13.72.36'],
31 |             "ptr_record": ['edge-star-mini-shv-01-arn2.facebook.com.']
32 |         },
33 |     }
34 | 
35 |     def test_domain_registration_date(self):
36 |         for k, v in self.test_urls.items():
37 |             instance = URLVoid(k)
38 |             domain_registration = instance.domain_registration_date()[:-15]
39 |             self.assertEqual(domain_registration, v["domain_registration"])
40 | 
41 |     def test_get_detection_rate(self):
42 |         for k, v in self.test_urls.items():
43 |             instance = URLVoid(k)
44 |             domain_detection_rate = instance.get_detection_rate()
45 |             self.assertEqual(domain_detection_rate, v["detection_rate"])
46 | 
47 |     def test_get_server_location(self):
48 |         for k, v in self.test_urls.items():
49 |             instance = URLVoid(k)
50 |             domain_server_location = instance.get_server_location()
51 |             self.assertEqual(domain_server_location, v["server_location"])
52 | 
53 |     def test_get_asn(self):
54 |         for k, v in self.test_urls.items():
55 |             instance = URLVoid(k)
56 |             domain_asn = instance.get_asn()
57 |             self.assertEqual(domain_asn, v["ASN"])
58 | 
59 |         class StubURLVoid(URLVoid):
60 |             def __init__(self, asn):
61 |                 self.domain = None
62 |                 self.value = {'ASN': asn}
63 | 
64 |         self.assertIsNone(StubURLVoid('AZ1 Not a valid ASN').get_asn())
65 |         self.assertEqual(StubURLVoid('AS1').get_asn(), 'AS1')
66 |         self.assertEqual(StubURLVoid('AS1 Random-Test-Text').get_asn(), 'AS1')
67 |         self.assertEqual(StubURLVoid('AS1234567890').get_asn(), 'AS1234567890')
68 |         # Strictly speaking, the below tests are correct, but covering them
69 |         # is deemed unnecessary complex:
70 |         # self.assertIsNone(
71 |         #     StubURLVoid('AS12345678901').get_asn(),
72 |         #     ("Failed to reject ASN of 10 decimal digits (One more digit that"
73 |         #      "possible with RFC 6793)"),
74 |         # )
75 |         # self.assertIsNone(
76 |         #     StubURLVoid('AS4294967295').get_asn(),
77 |         #     "Failed to reject ASN 0xFFFFFFFF + 0x1 (RFC 6793 max value + 1)",
78 |         # )
79 | 
80 |     def test_blacklist_status(self):
81 |         for k, v in self.test_urls.items():
82 |             instance = URLVoid(k)
83 |             blacklist_status = instance.blacklist_status()
84 |             self.assertTrue(re.match(r'[0]/\d*', blacklist_status))
85 | 
86 | 
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
89 | 


--------------------------------------------------------------------------------
/richkit/test/retrieve/test_whois.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from datetime import datetime
 3 | from richkit.retrieve import whois
 4 | 
 5 | 
 6 | class WhoisTestCase(unittest.TestCase):
 7 | 
 8 |     # .dk domains give unknownTld exception !
 9 |     def test_get_whois_info(self):
10 |         # last updated field skipped since it could be None
11 | 
12 |         d = "www.google.com"
13 |         w = whois.get_whois_info(d)
14 |         self.assertTrue(len(w['registrar']) > 0)
15 |         # .com uses "thin" WHOIS, so we get expiry from both registry
16 |         # and registrar;
17 |         self.assertTrue(len(w['expiration_date']) == 2)
18 |         self.assertIsInstance(w['expiration_date'][0], datetime)
19 |         self.assertIsInstance(w['expiration_date'][1], datetime)
20 | 
21 |         d = "www.cloudflare.com"
22 |         w = whois.get_whois_info(d)
23 |         self.assertTrue('registrar' in w)
24 |         self.assertTrue(len(w['registrar']) > 0)
25 |         self.assertTrue('expiration_date' in w)
26 |         # .com uses "thin" WHOIS, so we get expiry from both registry
27 |         # and registrar, but they are equal here, so only one is returned;
28 |         self.assertIsInstance(w['expiration_date'], datetime)
29 | 
30 | 
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
33 | 


--------------------------------------------------------------------------------
/richkit/test/retrieve/test_x509.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from richkit.retrieve.x509 import unique_apex, unique_sld, unique_tld, get_lcs_apex
 3 | 
 4 | 
 5 | class Test_x509(unittest.TestCase):
 6 | 
 7 |     def setUp(self):
 8 |         self.sans = ['*.google.com', 'mail.google.com',
 9 |                      'example.com', 'test.example.dk', 'test_domain.co.uk']
10 | 
11 |     def test_unique_apex(self):
12 |         assert unique_apex(self.sans) == 4
13 | 
14 |     def test_unique_tld(self):
15 |         assert unique_tld(self.sans) == 3
16 | 
17 |     def test_unique_sld(self):
18 |         assert unique_sld(self.sans) == 3
19 | 
20 |     def test_lcs(self):
21 |         assert get_lcs_apex(self.sans) == 11
22 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | from os import path
 3 | root = path.curdir
 4 | with open(path.join(root, 'README.md'), encoding='utf-8') as f:
 5 |     long_description = f.read()
 6 | 
 7 | setuptools.setup(
 8 |     name='richkit',
 9 |     description='Domain enrichment kit ',
10 |     version='1.1.1',
11 |     long_description=long_description,
12 |     long_description_content_type='text/markdown',
13 |     url='https://github.com/aau-network-security/richkit',
14 |     packages=setuptools.find_packages(exclude=['docs', 'richkit/test']),
15 |     project_urls={
16 |                 'Bug Reports': 'https://github.com/aau-network-security/richkit/issues',
17 |                 'Funding': 'https://donate.pypi.org',
18 |                 'Source': 'https://github.com/aau-network-security/richkit',
19 |     },
20 |     install_requires=['maxminddb',
21 |                       'numpy==1.17.2',
22 |                       'scikit-learn==0.21.3',
23 |                       'langid==1.1.6',
24 |                       'bs4==0.0.1',
25 |                       'lxml==4.4.1',
26 |                       'requests==2.22.0',
27 |                       'pytest',
28 |                       'dnspython',
29 |                       'coverage'],
30 |     python_requires='>=3.5',
31 |     author=['Ahmet Turkmen', 'Gian Marco Mennecozzi ', 'Egon Kidmose'],
32 |     classifiers=[
33 |         'Development Status :: 4 - Beta',
34 |         'Intended Audience :: Developers',
35 |         'License :: OSI Approved :: MIT License',
36 |         'Programming Language :: Python :: 3.5',
37 |         'Programming Language :: Python :: 3.6',
38 |         'Programming Language :: Python :: 3.7',
39 |     ],
40 | 
41 | )
42 | 


--------------------------------------------------------------------------------