├── certstream_analytics
├── __init__.py
├── reporters
│ ├── __init__.py
│ └── base.py
├── transformers
│ ├── __init__.py
│ └── base.py
├── storages
│ ├── __init__.py
│ ├── base.py
│ └── elasticsearch_storage.py
├── analysers
│ ├── __init__.py
│ ├── base.py
│ ├── domain_matching.py
│ └── common_domain_analyser.py
└── stream.py
├── .coveragerc
├── tests
├── opendns-top-domains.txt
├── test_stream.py
├── test_elasticsearch.py
├── test_reporter.py
├── samples.json
└── test_domain_matching_analyser.py
├── setup.cfg
├── .gitmodules
├── LICENSE
├── scripts
├── sundry
│ ├── generate_features.py
│ ├── isolation_forest.py
│ ├── elliptic_envelope.py
│ ├── lof.py
│ └── certstream-domain-features.ipynb
└── replay.py
├── .travis.yml
├── setup.py
├── .gitignore
├── bin
└── domain_matching.py
├── README.md
└── pylintrc
/certstream_analytics/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source=certstream-analytics
3 |
--------------------------------------------------------------------------------
/certstream_analytics/reporters/__init__.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=missing-docstring
2 | from .base import Reporter, FileReporter
3 |
--------------------------------------------------------------------------------
/tests/opendns-top-domains.txt:
--------------------------------------------------------------------------------
1 | google.com
2 | facebook.com
3 | bankofamerica.com
4 | apple.com
5 | www.net.cn
6 | discover.com
7 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal = 1
3 |
4 | [tool:pytest]
5 | pep8maxlinelength = 120
6 |
7 | [pep8]
8 | max-line-length = 120
9 |
--------------------------------------------------------------------------------
/certstream_analytics/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=missing-docstring
2 | from .base import Transformer, PassthroughTransformer, CertstreamTransformer
3 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "data/opendns/public-domain-lists"]
2 | path = data/opendns/public-domain-lists
3 | url = https://github.com/opendns/public-domain-lists.git
4 |
--------------------------------------------------------------------------------
/certstream_analytics/storages/__init__.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=missing-docstring
2 | from .base import Storage
3 | from .elasticsearch_storage import ElasticsearchStorage
4 |
--------------------------------------------------------------------------------
/certstream_analytics/storages/base.py:
--------------------------------------------------------------------------------
1 | """
2 | Save certstream data into various storages, streaming or not.
3 | """
4 | from abc import ABCMeta, abstractmethod
5 |
6 |
7 | # pylint: disable=no-init,too-few-public-methods
class Storage(metaclass=ABCMeta):
    """
    Define the template of all storage classes.

    The class is created with the ``ABCMeta`` metaclass (Python 3 syntax), so
    a subclass that does not override ``save`` cannot be instantiated.
    """

    @abstractmethod
    def save(self, record):
        """
        Persist a single record. Concrete storages decide where it goes.
        """
--------------------------------------------------------------------------------
/certstream_analytics/analysers/__init__.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=missing-docstring
2 | from .base import Analyser, Debugger
3 | from .domain_matching import AhoCorasickDomainMatching
4 | from .domain_matching import DomainMatchingOption, DomainMatching
5 | from .common_domain_analyser import WordSegmentation
6 | from .common_domain_analyser import BulkDomainMarker
7 | from .common_domain_analyser import FeaturesGenerator
8 | from .common_domain_analyser import IDNADecoder
9 | from .common_domain_analyser import HomoglyphsDecoder
10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Huy Do
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/certstream_analytics/reporters/base.py:
--------------------------------------------------------------------------------
1 | """
2 | Report the analysis result somewhere.
3 | """
4 | import json
5 | from abc import ABCMeta, abstractmethod
6 |
7 |
8 | # pylint: disable=no-init,too-few-public-methods
class Reporter(metaclass=ABCMeta):
    """
    Define the template of all reporter classes.

    The class is created with the ``ABCMeta`` metaclass (Python 3 syntax), so
    a subclass that does not override ``publish`` cannot be instantiated.
    """

    @abstractmethod
    def publish(self, report):
        """
        Send a single report to wherever the concrete reporter publishes to.
        """
20 |
21 |
class FileReporter(Reporter):
    """
    Simply print the report to a file, one JSON document per line.
    """
    def __init__(self, path):
        """
        Open ``path`` in append mode.

        Note that an exception will be raised if the path is not valid or
        writable.
        """
        # Define the attribute before opening so that __del__ stays safe even
        # when open() raises and the object is only partially initialised.
        self.fhandler = None
        self.fhandler = open(path, 'a')

    def __del__(self):
        fhandler = getattr(self, 'fhandler', None)

        if fhandler is not None:
            fhandler.close()

    def publish(self, report):
        """
        This is a very basic reporter that will only print out the record it
        receives to a plain text file. Empty reports are ignored.
        """
        if not report:
            return

        # Flush after every report so that anything reading the file sees it
        # immediately instead of when the handle is eventually closed.
        print(json.dumps(report), file=self.fhandler, flush=True)
44 |
--------------------------------------------------------------------------------
/scripts/sundry/generate_features.py:
--------------------------------------------------------------------------------
1 | '''
2 | Generate features for outlier detection.
3 | '''
4 |
5 | import json
6 | import sys
7 |
8 | from certstream_analytics.analysers import WordSegmentation
9 | from certstream_analytics.analysers import IDNADecoder
10 | from certstream_analytics.analysers import FeaturesGenerator
11 |
def main(max_count=None):
    '''
    Run every record through the IDNA decoder, the word segmenter and the
    features generator, then print the enriched record as one JSON line.

    The records are assumed to be stored one JSON document per line in the
    file passed in as the first parameter of the script. Lines that are not
    valid JSON are skipped.

    :param max_count: stop after printing this many records. A falsy value
                      (the default) means no limit.
    '''
    segmenter = WordSegmentation()
    decoder = IDNADecoder()
    generator = FeaturesGenerator()

    with open(sys.argv[1]) as fhandle:
        count = 0

        for line in fhandle:
            try:
                record = json.loads(line.strip())
            except json.decoder.JSONDecodeError:
                continue

            record = decoder.run(record)
            record = segmenter.run(record)
            record = generator.run(record)

            print(json.dumps(record))
            count += 1

            # Use >= here: comparing with > lets one extra record through
            if max_count and count >= max_count:
                break
39 |
40 |
41 | if __name__ == '__main__':
42 | main()
43 |
--------------------------------------------------------------------------------
/tests/test_stream.py:
--------------------------------------------------------------------------------
1 | '''
2 | Test consuming the data from the great certstream.
3 | '''
4 | import time
5 | import unittest
6 |
7 | from certstream_analytics.analysers import Debugger
8 | from certstream_analytics.transformers import CertstreamTransformer
9 | from certstream_analytics.stream import CertstreamAnalytics
10 |
11 |
class CertstreamTest(unittest.TestCase):
    '''
    Check that records flow from certstream into the analysis pipeline.
    '''
    # How long the engine is left running before it is stopped, passed
    # straight to time.sleep
    DEFAULT_DELAY = 30

    def setUp(self):
        '''
        Build the engine under test from a certstream transformer and a
        counting debugger analyser.
        '''
        self.debugger = Debugger()
        self.transformer = CertstreamTransformer()

        engine_options = {
            'transformer': self.transformer,
            'analysers': self.debugger,
        }
        self.engine = CertstreamAnalytics(**engine_options)

    def test_consume(self):
        '''
        Run the engine for a while and expect the debugger to have counted
        at least one record.
        '''
        self.engine.start()

        # Give the stream some time to deliver records
        time.sleep(CertstreamTest.DEFAULT_DELAY)

        self.engine.stop()

        # Something should have reached the debugger by now
        processed = self.debugger.count
        self.assertTrue(processed, 'Consuming data from certstream successfully')
40 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | os:
3 | - linux
4 | python:
5 | - '3.7'
6 | before_install:
7 | - sudo apt-get install -y libenchant-dev
8 | - sudo apt-get install -y apt-transport-https
9 | - wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add -
10 | - echo "deb https://artifacts.elastic.co/packages/6.x/apt stable main" | sudo tee -a /etc/apt/sources.list.d/elastic-6.x.list
11 | - sudo apt-get update && sudo apt-get remove -y elasticsearch
12 | - sudo apt-get install -y elasticsearch
13 | - sudo chown -R elasticsearch:elasticsearch /etc/default/elasticsearch
14 | - sudo service elasticsearch start
15 | install:
16 | - pip install --upgrade pytest
17 | - pip install pytest-pep8 pytest-cov
18 | - pip install codecov
19 | - pip install elasticsearch_dsl certstream pyahocorasick tldextract wordsegment pyenchant idna confusable-homoglyphs
20 | - pip install git+https://github.com/casics/nostril.git
21 | - pip install -e .[tests]
22 | before_script:
23 | - sleep 10
24 | - sudo systemctl -l status elasticsearch
25 | - curl 'http://localhost:9200'
26 | script:
27 | - pytest --pep8 -m pep8 certstream_analytics/
28 | - PYTHONPATH=$PWD:$PYTHONPATH pytest --cov=./ tests/
29 | after_script:
30 | - curl 'http://localhost:9200/_cat/indices?v'
31 | after_success:
32 | - codecov
33 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | '''
2 | Standard Python setup script.
3 | '''
4 |
5 | from setuptools import setup, find_packages
6 |
# Read the README explicitly as UTF-8 so the build does not depend on the
# platform default encoding.
with open('README.md', 'r', encoding='utf-8') as fh:
    long_description = fh.read()

# Shared by `tests_require` and the `tests` extra so that
# `pip install -e .[tests]` (used by the CI configuration) installs them.
TESTS_REQUIRE = [
    'coverage',
    'nose',
    'pytest-pep8',
    'pytest-cov',
    'codecov'
]

setup(
    name='certstream-analytics',
    version='0.1.7',
    description='certstream + analytics',
    url='https://github.com/huydhn/certstream-analytics',
    author='Huy Do',
    author_email='huydhn@gmail.com',
    license='MIT',
    long_description=long_description,
    long_description_content_type='text/markdown',
    install_requires=[
        'elasticsearch_dsl',
        'certstream',
        'pyahocorasick',
        'tldextract',
        'wordsegment',
        'pyenchant',
        'idna',
        'confusable_homoglyphs'
    ],
    tests_require=TESTS_REQUIRE,
    extras_require={
        'tests': TESTS_REQUIRE,
    },
    dependency_links=[
        'https://github.com/casics/nostril/tarball/master'
    ],
    packages=find_packages(),
    scripts=['bin/domain_matching.py'],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
48 |
--------------------------------------------------------------------------------
/scripts/sundry/isolation_forest.py:
--------------------------------------------------------------------------------
1 | '''
2 | Apply the isolation forest method to separate our outliers.
3 | '''
4 | import json
5 | import sys
6 | import numpy as np
7 |
8 | from sklearn.ensemble import IsolationForest
9 | from sklearn.preprocessing import scale
10 |
11 |
def main():
    '''
    Print the domains that the isolation forest flags as outliers.

    The input is the file passed in as the first parameter of the script, one
    JSON record per line. The feature vectors are taken from the output of
    the FeaturesGenerator analyser and the domain names from the keys of the
    WordSegmentation analyser output.

    The procedure contains two simple steps:
    - Scale the data to the standard distribution with mean 0 and unit variance.
      This might be too simplistic.
    - Apply the isolation forest. The contamination level is set manually.
    '''
    domains = []
    raw = []

    with open(sys.argv[1]) as fhandle:
        for line in fhandle:
            record = json.loads(line.strip())

            for analyser in record['analysers']:
                if analyser['analyser'] == 'FeaturesGenerator':
                    raw.extend(analyser['output'])

                if analyser['analyser'] == 'WordSegmentation':
                    # Iterating a dict yields its keys, i.e. the domain names
                    domains.extend(analyser['output'])

            # Every feature vector must pair up with exactly one domain,
            # otherwise the indices used below would point at the wrong name.
            if len(raw) != len(domains):
                print(record)
                sys.exit(0)

    x_samples = scale(np.array(raw))

    try:
        engine = IsolationForest(behaviour='new', contamination=0.015)
    except TypeError:
        # Newer scikit-learn releases no longer accept the `behaviour`
        # argument; fall back to the constructor without it.
        engine = IsolationForest(contamination=0.015)

    y_samples = engine.fit_predict(x_samples)

    for index, y_sample in enumerate(y_samples):
        # fit_predict labels the outliers with -1
        if y_sample == -1:
            print(domains[index])
45 |
46 |
47 | if __name__ == '__main__':
48 | main()
49 |
--------------------------------------------------------------------------------
/scripts/sundry/elliptic_envelope.py:
--------------------------------------------------------------------------------
1 | '''
2 | Apply the elliptic envelope method to separate our outliers.
3 | '''
4 | import json
5 | import sys
6 | import numpy as np
7 |
8 | from sklearn.covariance import EllipticEnvelope
9 | from sklearn.preprocessing import scale
10 |
11 |
def main():
    '''
    Print the domains that the elliptic envelope flags as outliers.

    Records are read, one JSON document per line, from the file given as the
    first parameter of the script. Two steps follow:
    - Scale the data to the standard distribution with mean 0 and unit variance.
      This might be too simplistic.
    - Apply the elliptic envelope. The contamination level is set manually.
    '''
    names = []
    features = []

    with open(sys.argv[1]) as source:
        for raw_line in source:
            record = json.loads(raw_line.strip())

            for result in record['analysers']:
                kind = result['analyser']

                if kind == 'FeaturesGenerator':
                    features.extend(result['output'])

                if kind == 'WordSegmentation':
                    names.extend(result['output'].keys())

            # Bail out as soon as features and domains stop pairing up
            if len(features) != len(names):
                print(record)
                sys.exit(0)

    scaled = scale(np.array(features))

    detector = EllipticEnvelope(contamination=0.015, support_fraction=1.0)
    labels = detector.fit_predict(scaled)

    # The outliers are the samples labelled -1
    for name, label in zip(names, labels):
        if label == -1:
            print(name)
45 |
46 |
47 | if __name__ == '__main__':
48 | main()
49 |
--------------------------------------------------------------------------------
/scripts/sundry/lof.py:
--------------------------------------------------------------------------------
1 | '''
2 | Apply the local outlier factor method to separate our outliers.
3 | '''
4 | import json
5 | import sys
6 | import numpy as np
7 |
8 | from sklearn.neighbors import LocalOutlierFactor
9 | from sklearn.preprocessing import scale
10 |
11 |
def main():
    '''
    Print the domains that the local outlier factor flags as outliers.

    Records are read, one JSON document per line, from the file given as the
    first parameter of the script. Two steps follow:
    - Scale the data to the standard distribution with mean 0 and unit variance.
      This might be too simplistic.
    - Apply the local outlier factor. The contamination level is set manually.

    This method does not seem to work in our case cause I suspect it treats groups
    of several outliers as clusters.
    '''
    names = []
    features = []

    with open(sys.argv[1]) as source:
        for raw_line in source:
            record = json.loads(raw_line.strip())

            for result in record['analysers']:
                kind = result['analyser']

                if kind == 'FeaturesGenerator':
                    features.extend(result['output'])

                if kind == 'WordSegmentation':
                    names.extend(result['output'].keys())

            # Bail out as soon as features and domains stop pairing up
            if len(features) != len(names):
                print(record)
                sys.exit(0)

    scaled = scale(np.array(features))

    # Need to check the appropriate value for n_neighbors
    detector = LocalOutlierFactor(contamination=0.015)
    labels = detector.fit_predict(scaled)

    # The outliers are the samples labelled -1
    for name, label in zip(names, labels):
        if label == -1:
            print(name)
49 |
50 |
51 | if __name__ == '__main__':
52 | main()
53 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | nohup.*
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 | *.txt
108 | .idea
109 |
--------------------------------------------------------------------------------
/tests/test_elasticsearch.py:
--------------------------------------------------------------------------------
1 | '''
2 | Save some dummy records into Elasticsearch.
3 | '''
4 | import os
5 | import json
6 | import time
7 | import unittest
8 |
9 | from elasticsearch import Elasticsearch
10 | from elasticsearch_dsl import Search, Q
11 |
12 | from certstream_analytics.transformers import CertstreamTransformer
13 | from certstream_analytics.storages import ElasticsearchStorage
14 |
15 |
class ElasticsearchTest(unittest.TestCase):
    '''
    Check that transformed certstream records end up in Elasticsearch.
    '''
    def setUp(self):
        '''
        Create the transformer, the storage under test and a search handle
        over the certstream indices. The host comes from ELASTICSEARCH_HOST
        and falls back to localhost:9200.
        '''
        elasticsearch_host = os.getenv('ELASTICSEARCH_HOST', 'localhost:9200')

        self.transformer = CertstreamTransformer()
        self.storage = ElasticsearchStorage(hosts=[elasticsearch_host])

        client = Elasticsearch(elasticsearch_host)
        self.search = Search(using=client, index='certstream-*')

    def test_save(self):
        '''
        Save every sample record, then look each one up by its first domain.
        '''
        here = os.path.dirname(os.path.realpath(__file__))
        samples_path = os.path.join(here, 'samples.json')

        with open(samples_path) as fhandle:
            samples = json.load(fhandle)

        for sample in samples:
            self.storage.save(self.transformer.apply(sample))

        # Give Elasticsearch a few seconds to index what was just saved
        time.sleep(5)

        for sample in samples:
            all_domains = sample['data']['leaf_cert']['all_domains']

            # Look for the record by its first domain name
            lookup = Q('multi_match', query=all_domains[0], fields=['domain', 'san'])
            response = self.search.query(lookup).execute()

            self.assertGreaterEqual(response.hits.total, 1,
                                    'The record has been indexed in Elasticsearch')
            self.assertIn(response.hits[0].domain, sample['data']['leaf_cert']['all_domains'],
                          'The correct record is returned')
58 |
--------------------------------------------------------------------------------
/certstream_analytics/analysers/base.py:
--------------------------------------------------------------------------------
1 | """
2 | Analyse the certificate data from certstream.
3 | """
4 | import json
5 | import logging
6 | from abc import ABCMeta, abstractmethod
7 |
8 |
9 | # pylint: disable=no-init,too-few-public-methods
class Analyser(metaclass=ABCMeta):
    """
    Define the template of all analyser classes.

    The class is created with the ``ABCMeta`` metaclass (Python 3 syntax), so
    a subclass that does not override ``run`` cannot be instantiated.
    """

    @abstractmethod
    def run(self, record):
        """
        In normal cases, an analyser will process the record, save the result
        into the record, and then return the updated record so that the next
        analyser can choose what to do next. Therefore, the structure of the
        record comes from CertstreamTransformer class as follows:

        {
            # These fields are extracted from certstream
            cert_index: INTEGER,
            seen: TIMESTAMP,
            chain: [
                ORGANIZATION
            ],
            not_before: TIMESTAMP,
            not_after: TIMESTAMP,
            all_domains: [
                SAN
            ],

            # This is a placeholder field which is used later by the
            # analysers. Each analyser will append its result here.
            analysers: [
                {
                    analyser: ANALYSER NAME,
                    output: ANYTHING GOES HERE,
                },
            ],
        }
        """
47 |
48 |
class Debugger(Analyser):
    """
    A dummy analyser for debugging: it logs and counts what passes through.
    """
    def __init__(self):
        """
        Start with no record seen so far.
        """
        self.count = 0

    def run(self, record):
        '''
        Log the record as JSON at INFO level, bump the counter and append the
        running count to the record as this analyser's output.
        '''
        logging.info(json.dumps(record))

        # One more record has gone through
        self.count += 1

        entry = {
            'analyser': type(self).__name__,
            'output': self.count,
        }
        # Create the result list on first use, then add our entry to it
        record.setdefault('analysers', []).append(entry)

        return record
77 |
--------------------------------------------------------------------------------
/certstream_analytics/transformers/base.py:
--------------------------------------------------------------------------------
1 | """
2 | Transform the certificate data from certstream before passing it to the
3 | processing pipeline.
4 | """
5 | from abc import ABCMeta, abstractmethod
6 |
7 |
8 | # pylint: disable=no-init,too-few-public-methods
class Transformer(metaclass=ABCMeta):
    """
    Define the template of all transformer classes.

    The class is created with the ``ABCMeta`` metaclass (Python 3 syntax), so
    a subclass that does not override ``apply`` cannot be instantiated.
    """

    @abstractmethod
    def apply(self, raw):
        """
        Turn a raw message into whatever the processing pipeline consumes.
        """
20 |
21 |
class PassthroughTransformer(Transformer):
    """
    A no-op transformer: whatever comes in goes out untouched.
    """
    def apply(self, raw):
        """
        Return the raw message as it is, without copying or changing it.
        """
        return raw
31 |
32 |
class CertstreamTransformer(Transformer):
    """
    Transform data from certstream into something readily consumable by the
    processing pipeline.
    """
    def apply(self, raw):
        """
        The format of the message from certstream can be found at their github
        documentation.

        So far, we are only interested in the domain names, the timestamps, and
        probably the content of the subject. So the returned structure is as
        follows:

        {
            # These fields are extracted from certstream
            cert_index: INTEGER,
            seen: TIMESTAMP,
            chain: [
                ORGANIZATION
            ],
            not_before: TIMESTAMP,
            not_after: TIMESTAMP,
            all_domains: [
                SAN
            ],

            # This is a placeholder field which is used later by the
            # analysers. Each analyser will append its result here.
            analysers: [
                {
                    analyser: ANALYSER NAME,
                    output: ANYTHING GOES HERE,
                },
            ],
        }

        None is returned when the message carries no domain name at all, which
        includes messages without a data or leaf_cert section.
        """
        data = (raw or {}).get('data') or {}
        leaf_cert = data.get('leaf_cert') or {}

        # Without at least one domain there is nothing for the analysers to
        # look at, so the message is dropped instead of raising a KeyError.
        if not leaf_cert.get('all_domains'):
            return None

        filtered = {
            'cert_index': data['cert_index'],
            'seen': data['seen'],
            # NOTE: nothing is extracted into the chain yet, it stays empty
            'chain': [],

            # The analyser result will be stored here later on
            'analysers': [],
        }

        interested_fields = ['not_before', 'not_after', 'all_domains']
        filtered.update({k: leaf_cert[k] for k in interested_fields})

        return filtered
86 |
--------------------------------------------------------------------------------
/certstream_analytics/storages/elasticsearch_storage.py:
--------------------------------------------------------------------------------
1 | """
2 | Save certstream data into Elasticsearch so that it can be queried by Kibana
3 | later on.
4 | """
5 | from datetime import datetime
6 | from elasticsearch_dsl import connections, analyzer
7 | from elasticsearch_dsl import Document, Date, Text, Keyword
8 |
9 | from .base import Storage
10 |
11 | ANALYZER = analyzer('standard_analyzer',
12 | tokenizer='standard_tokenizer',
13 | filter=['lowercase'])
14 |
15 |
16 | # pylint: disable=too-few-public-methods
class ElasticsearchStorage(Storage):
    """
    An experimental Elasticsearch storage to keep and index the received
    records.
    """
    class Record(Document):
        """
        An Elasticsearch record as it is.
        """
        # When the record was saved, set by save() below
        timestamp = Date(default_timezone='UTC')

        # As reported by certstream
        seen = Date(default_timezone='UTC')

        # The domain time to live
        not_before = Date(default_timezone='UTC')
        not_after = Date(default_timezone='UTC')

        # The domain and its alternative names
        domain = Text(analyzer=ANALYZER, fields={'raw': Keyword()})
        san = Text(analyzer=ANALYZER, fields={'raw': Keyword()})

        # The issuer
        chain = Text(analyzer=ANALYZER, fields={'raw': Keyword()})

        class Index:
            """
            Use daily indices.
            """
            name = 'certstream-*'

        # pylint: disable=arguments-differ
        def save(self, **kwargs):
            """
            Stamp the record with the current UTC time and save it into the
            daily index of that day.
            """
            # pylint: disable=import-outside-toplevel
            from datetime import timezone

            # Use an aware UTC time: a naive local time would be read back as
            # UTC and would also pick the daily index by the local calendar.
            self.timestamp = datetime.now(timezone.utc)
            # Override the index to go to the proper timeslot
            kwargs['index'] = self.timestamp.strftime('certstream-%Y.%m.%d')

            return super().save(**kwargs)

    def __init__(self, hosts, timeout=10):
        """
        Create the default Elasticsearch connection.

        :param hosts: the Elasticsearch hosts to connect to (required)
        :param timeout: the connection timeout, 10 by default
        """
        connections.create_connection(hosts=hosts, timeout=timeout)

    def save(self, record):
        """
        Save the certstream record in Elasticsearch. An empty record, for
        example the None a transformer returns for a message without domains,
        is ignored.
        """
        if not record:
            return

        elasticsearch_record = ElasticsearchStorage.Record(meta={'id': record['cert_index']})

        # In milliseconds
        elasticsearch_record.seen = int(record['seen'] * 1000)
        elasticsearch_record.not_before = int(record['not_before'] * 1000)
        elasticsearch_record.not_after = int(record['not_after'] * 1000)

        # Elasticsearch will parse and index the domain and all its alternative names
        elasticsearch_record.domain = record['all_domains'][0]
        elasticsearch_record.san = record['all_domains'][1:]

        elasticsearch_record.save()
80 |
--------------------------------------------------------------------------------
/tests/test_reporter.py:
--------------------------------------------------------------------------------
1 | '''
2 | Various tests for the reporter module.
3 | '''
4 | import json
5 | import tempfile
6 | import unittest
7 |
8 | from certstream_analytics.reporters import FileReporter
9 |
10 |
class FileReporterTest(unittest.TestCase):
    '''
    Test the file-based reporter.
    '''
    def setUp(self):
        '''
        Create a temporary file so that the test can write its reports into it.
        '''
        self.tmp = tempfile.NamedTemporaryFile()
        self.reporter = FileReporter(path=self.tmp.name)

    def test_report(self):
        '''
        Dump all the test reports to our temporary file, then read them back
        and compare them with what was published.
        '''
        cases = [
            {
                'report': {
                    'all_domains': ['store.google.com', 'google.com'],
                    'analysers': [
                        {
                            'analyser': 'AhoCorasickDomainMatching',
                            'domain': 'store.google.com',
                            'match': 'google',
                        },
                    ],
                },
                'description': 'Report an exact match domain',
            },

            {
                'report': {
                    'all_domains': ['www.facebook.com.msg40.site'],
                    'analysers': [
                        {
                            'analyser': 'AhoCorasickDomainMatching',
                            'domain': 'www.facebook.com.msg40.site',
                            'match': 'facebook',
                        },
                    ],
                },
                'description': 'Report a phishing domain with a sub-domain match',
            },

            {
                'report': {
                    'all_domains': ['login-appleid.apple.com.managesuppport.co'],
                    'analysers': [
                        {
                            'analyser': 'AhoCorasickDomainMatching',
                            'domain': 'login-appleid.apple.com.managesuppport.co',
                            'match': 'apple',
                        },
                    ],
                },
                'description': 'Report a phishing domain with a partial string match',
            },

            {
                'report': {},
                'description': 'Report nothing and thus will be ignored',
            },
        ]

        for case in cases:
            self.reporter.publish(case['report'])

        # The reporter writes through a buffered handle, so push everything
        # to disk before reading the file back through a second handle.
        self.reporter.fhandler.flush()

        with open(self.tmp.name) as fhandler:
            lines = fhandler.readlines()

        # Empty reports are ignored by the reporter, so they leave no line
        expected = [case for case in cases if case['report']]

        self.assertEqual(len(lines), len(expected),
                         'Every non-empty report is written on its own line')

        for line, case in zip(lines, expected):
            got = json.loads(line)
            self.assertDictEqual(got, case['report'], case['description'])
84 |
--------------------------------------------------------------------------------
/bin/domain_matching.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | A simple utility to query certstream and match its records with a list
4 | of known domains (from OpenDNS). This script can also save the data into
5 | downstream storage for further processing, for example, Elasticsearch.
6 | """
7 | import argparse
8 | import logging
9 | import signal
10 | import sys
11 | import time
12 |
13 | from certstream_analytics.analysers import AhoCorasickDomainMatching
14 | from certstream_analytics.analysers import WordSegmentation
15 | from certstream_analytics.analysers import DomainMatching, DomainMatchingOption
16 | from certstream_analytics.analysers import BulkDomainMarker
17 | from certstream_analytics.analysers import IDNADecoder
18 | from certstream_analytics.analysers import HomoglyphsDecoder
19 | from certstream_analytics.analysers import FeaturesGenerator
20 | from certstream_analytics.transformers import CertstreamTransformer
21 | from certstream_analytics.reporters import FileReporter
22 | from certstream_analytics.storages import ElasticsearchStorage
23 | from certstream_analytics.stream import CertstreamAnalytics
24 |
25 | DONE = False
26 |
27 |
28 | # pylint: disable=unused-argument
def exit_gracefully(signum, stack):
    """
    Signal-handler style callback: raise the module-level DONE flag.

    Both arguments are accepted and ignored.
    """
    # pylint: disable=global-statement
    global DONE
    DONE = True
36 |
37 |
def init_analysers(domains_file, include_tld, matching_option):
    """
    Initialize all the analysers for matching domains. The list includes:

    - IDNA
    - Homoglyphs
    - AhoCorasick
    - Word segmentation
    - Bulk domains
    - Meta domain matching
    - Features generator

    :param domains_file: path to a text file with one known domain per line
    :param include_tld: passed through to DomainMatching
    :param matching_option: passed through to DomainMatching as its option
    :return: the analysers, in the order they are meant to be executed
    """
    with open(domains_file) as fhandle:
        # Drop blank lines so that an empty string never becomes a pattern
        domains = [line.strip() for line in fhandle if line.strip()]

    # Initialize all analysers. Note that their order is important cause they
    # will be executed in that order
    return [
        IDNADecoder(),
        HomoglyphsDecoder(greedy=False),
        AhoCorasickDomainMatching(domains=domains),
        WordSegmentation(),
        BulkDomainMarker(),
        DomainMatching(include_tld=include_tld, option=matching_option),
        FeaturesGenerator(),
    ]
63 |
64 |
def run():
    """
    A simple utility to query certstream and match its records to a list of
    known domains from OpenDNS.

    The command line is parsed, the processing pipeline (transformer,
    analysers, optional file reporter, optional Elasticsearch storage) is
    assembled and started, and the function then polls the module-level DONE
    flag once per second until a signal handler sets it.
    """
    epilog = '''
examples:
\033[1;33m/usr/bin/domain_matching.py --elasticsearch-host elasticsearch:9200\033[0m

\033[1;33m/usr/bin/domain_matching.py --dump-location certstream.txt\033[0m

\033[1;33m/usr/bin/domain_matching.py --domains opendns-top-domains.txt\033[0m

Consume data from Certstream and does its magic.
'''
    parser = argparse.ArgumentParser(description=__doc__, epilog=epilog,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # The list of domains is opened unconditionally below, so it has to be
    # provided; let argparse report a missing value as a usage error instead
    # of failing later with a TypeError from open(None)
    parser.add_argument('--domains', required=True,
                        help='the list of domains to match with (e.g. opendns-top-domains.txt)')

    parser.add_argument('--elasticsearch-host',
                        help='set the Elasticsearch host to store the records from Certstream')

    parser.add_argument('--dump-location',
                        help='where to dump the records from Certstream')

    try:
        args = parser.parse_args()
    # pylint: disable=broad-except
    except Exception as error:
        logging.error(error)
        # some errors occur when parsing the arguments, show the usage
        parser.print_help()
        # then quit
        sys.exit(1)

    transformer = CertstreamTransformer()
    analysers = init_analysers(domains_file=args.domains,
                               include_tld=True,
                               matching_option=DomainMatchingOption.ORDER_MATCH)
    # Both the reporter and the storage are optional and only created when
    # the corresponding command line option is given
    reporter = FileReporter(path=args.dump_location) if args.dump_location else None
    storage = ElasticsearchStorage(hosts=[args.elasticsearch_host]) if args.elasticsearch_host else None

    engine = CertstreamAnalytics(transformer=transformer,
                                 storages=storage,
                                 analysers=analysers,
                                 reporters=reporter)
    # The engine consumes certstream in its own thread, so this call returns
    # immediately
    engine.start()

    while not DONE:
        time.sleep(1)

    engine.stop()
119 |
120 |
if __name__ == '__main__':
    # Install the same handler for both termination signals so that the
    # polling loop in run() can finish and stop the engine cleanly
    for termination_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(termination_signal, exit_gracefully)

    run()
127 |
--------------------------------------------------------------------------------
/certstream_analytics/stream.py:
--------------------------------------------------------------------------------
1 | """
2 | All hail [certstream](https://github.com/CaliDog/certstream-python)!!
3 |
4 | This module consumes the feed of certificates from certstream and does
5 | the heavy lifting.
6 | """
7 | import sys
8 | import threading
9 | import certstream
10 |
11 | from certstream_analytics.analysers import Analyser
12 | from certstream_analytics.reporters import Reporter
13 | from certstream_analytics.storages import Storage
14 |
15 |
class CertstreamAnalytics():
    """
    Consume the feed of certificates from certstream, transform the data,
    save it into various storages, analyse it, and report the result.
    """

    def __init__(self, transformer=None, storages=None, analysers=None, reporters=None):
        """
        This is the entry point of the whole module. It consumes data from
        certstream, transforms it using a transformer, saves it into the
        configured storages (e.g. Elasticsearch), runs the user-defined
        analysers, and publishes the outcome through the reporters.

        The transformer can be None or any object with an apply(message)
        method. It transforms the raw message from certstream; when it is
        None the raw message is used as it is.

        storages, analysers, and reporters can each be None, a single
        instance, or a list / tuple of instances of Storage, Analyser, and
        Reporter respectively. A TypeError is raised when any item has the
        wrong type.

        Each analyser receives the output of the previous one, so their
        order matters. The reporters publish the final result somewhere, for
        example, into a file.
        """
        self.transformer = transformer

        self.analysers = []
        self.reporters = []
        self.storages = []

        def _init_member(member, value, kind):
            """
            Store value (a single item or a list / tuple of items) into the
            attribute called member and check that every item is an instance
            of kind.
            """
            if value:
                if isinstance(value, (list, tuple)):
                    setattr(self, member, value)
                else:
                    getattr(self, member).append(value)

            for type_check in getattr(self, member):
                if not isinstance(type_check, kind):
                    raise TypeError('Invalid {} type: {}'.format(member, type(type_check).__name__))

        _init_member('analysers', analysers, Analyser)
        _init_member('reporters', reporters, Reporter)
        _init_member('storages', storages, Storage)

        # The stream is considered stopped until start() is called
        self.stopped = True
        self.thread = None

    def start(self):
        """
        Start consuming data from certstream in a background thread. This
        call does not block.
        """
        # Clear the flag here, before the worker thread exists, and not in
        # the worker itself. Otherwise a stop() issued right after start()
        # could still see stopped == True, return without doing anything, and
        # leave the worker running
        self.stopped = False

        # Run the stream in a separate thread
        self.thread = threading.Thread(target=self._consume)
        # So that it will be killed when the main thread stop
        self.thread.daemon = True
        self.thread.start()

    def stop(self):
        """
        Stop consuming data from certstream and wait for the worker thread.

        Nothing happens when the stream is not running. Otherwise the stopped
        flag is raised and the worker thread is joined; the worker notices
        the flag in _callback, i.e. when the next message is delivered.
        """
        if self.stopped:
            return

        self.stopped = True
        self.thread.join()

    def _consume(self):
        """
        Body of the worker thread: listen for certstream events and hand
        every message to _callback.
        """
        # pylint: disable=unnecessary-lambda
        certstream.listen_for_events(lambda m, c: self._callback(m, c),
                                     url='wss://certstream.calidog.io')

    # pylint: disable=unused-argument
    def _callback(self, message, context):
        """
        Process one message from certstream.

        Only messages of type certificate_update are processed; heartbeats
        and any other type are ignored. A processed message goes through the
        transformer, is saved into every storage, is piped through the chain
        of analysers, and the final result is published by every reporter.
        """
        if self.stopped:
            # stop() has been requested; sys.exit() raises SystemExit inside
            # the worker thread (presumably ending the listening loop)
            sys.exit()

        if message['message_type'] != 'certificate_update':
            return

        if self.transformer:
            # Apply the user-defined transformation. The structure of the raw
            # message is described at https://github.com/CaliDog/certstream-python/
            transformed_message = self.transformer.apply(message)
        else:
            transformed_message = message

        if self.storages and transformed_message:
            # Save the message into a more permanent storage. May be we should
            # support multiple storages in parallel here
            for storage in self.storages:
                storage.save(transformed_message)

        # Note that the order of analysers is extremely important cause the
        # output of an analyser becomes the input of the next analyser
        for analyser in self.analysers:
            if not transformed_message:
                # An analyser (or the transformer) dropped the record
                break

            transformed_message = analyser.run(transformed_message)

        if self.reporters and transformed_message:
            # and report the final result
            for reporter in self.reporters:
                reporter.publish(transformed_message)
138 |
--------------------------------------------------------------------------------
/scripts/replay.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | '''
3 | Replay a stream of records from certstream to test the processing pipeline.
4 | '''
5 | import argparse
6 | import json
7 | import logging
8 | import sys
9 |
10 | from certstream_analytics.analysers import AhoCorasickDomainMatching
11 | from certstream_analytics.analysers import WordSegmentation
12 | from certstream_analytics.analysers import DomainMatching, DomainMatchingOption
13 | from certstream_analytics.analysers import BulkDomainMarker
14 | from certstream_analytics.analysers import IDNADecoder
15 | from certstream_analytics.analysers import HomoglyphsDecoder
16 | from certstream_analytics.analysers import FeaturesGenerator
17 | from certstream_analytics.reporters import FileReporter
18 | from certstream_analytics.storages import ElasticsearchStorage
19 |
20 |
# Factories of the reporters selectable with --report. Each one takes the
# report location and returns the reporter
SUPPORTED_REPORTERS = {
    'file': lambda location: FileReporter(path=location),
}

# Factories of the storages selectable with --storage. Each one takes the
# storage host and returns the storage
SUPPORTED_STORAGES = {
    'elasticsearch': lambda host: ElasticsearchStorage(hosts=[host]),
}
28 |
29 |
def init_analysers(domains_file, include_tld, matching_option):
    '''
    Build the ordered chain of analysers used for matching domains.

    The file at domains_file is read line by line, trailing whitespace is
    stripped from every line, and the resulting list of known domains is
    handed to the Aho-Corasick matcher. include_tld and matching_option are
    passed through unchanged to the DomainMatching meta analyser.

    The returned list contains, in execution order:

    - IDNA decoder
    - Homoglyphs decoder
    - Aho-Corasick matching
    - Word segmentation
    - Bulk domain marker
    - Meta domain matching
    - Features generator
    '''
    with open(domains_file) as source:
        known_domains = list(map(str.rstrip, source))

    # The analysers are executed in list order, so the sequence below matters
    pipeline = [
        IDNADecoder(),
        HomoglyphsDecoder(greedy=False),
        AhoCorasickDomainMatching(domains=known_domains),
        WordSegmentation(),
        BulkDomainMarker(),
        DomainMatching(include_tld=include_tld, option=matching_option),
        FeaturesGenerator(),
    ]
    return pipeline
55 |
56 |
def run():
    '''
    A simple utility to replay certstream and match the records to a list of
    known domains from OpenDNS. It also generates several features for each
    domain such as the domain length.

    Every line of the replay file is parsed as JSON (lines that are not valid
    JSON are skipped), saved into the storage when one is selected, piped
    through the chain of analysers, and finally published by the reporter.
    '''
    epilog = '''
examples:
\033[1;33m/usr/bin/replay.py --replay certstream.txt\033[0m

\033[1;33m/usr/bin/replay.py --storage-host elasticsearch:9200 --storage elasticsearch\033[0m

\033[1;33m/usr/bin/replay.py --report-location report.txt --report file\033[0m

\033[1;33m/usr/bin/replay.py --domains opendns-top-domains.txt\033[0m

Replay data from certstream.
'''
    parser = argparse.ArgumentParser(description=__doc__, epilog=epilog,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # Both files are opened unconditionally below, so they have to be
    # provided; let argparse report a missing value as a usage error instead
    # of failing later with a TypeError from open(None)
    parser.add_argument('--replay', required=True,
                        help='the list of records from certstream (one per line)')
    parser.add_argument('--domains', required=True,
                        help='the list of domains to match with (opendns-top-domains.txt)')

    parser.add_argument('--storage-host', default='localhost:9200',
                        help='set the storage host')
    parser.add_argument('-s', '--storage',
                        help='choose the storage type (elasticsearch)')

    parser.add_argument('--report-location',
                        help='where to save the report to?')
    parser.add_argument('-r', '--report', default='file',
                        help='choose the reporter type')

    try:
        args = parser.parse_args()
    # pylint: disable=broad-except
    except Exception as error:
        logging.error(error)
        # some errors occur when parsing the arguments, show the usage
        parser.print_help()
        # then quit
        sys.exit(1)

    if args.report and args.report not in SUPPORTED_REPORTERS:
        error = 'Report type \033[1;31m{}\033[0m is not supported. The list of supported reporters includes: {}' \
            .format(args.report, list(SUPPORTED_REPORTERS.keys()))

        logging.error(error)
        # Encounter an unsupported reporter type
        sys.exit(1)

    if args.storage and args.storage not in SUPPORTED_STORAGES:
        error = 'Storage type \033[1;31m{}\033[0m is not supported. The list of supported storages includes: {}' \
            .format(args.storage, list(SUPPORTED_STORAGES.keys()))

        logging.error(error)
        # Encounter an unsupported storage type
        sys.exit(1)

    analysers = init_analysers(domains_file=args.domains,
                               include_tld=True,
                               matching_option=DomainMatchingOption.ORDER_MATCH)

    # Both are optional; keep the names bound so that the replay loop below
    # never fails with a NameError when one of them is not selected
    reporter = None
    storage = None

    if args.report:
        reporter = SUPPORTED_REPORTERS[args.report](args.report_location)

    if args.storage:
        storage = SUPPORTED_STORAGES[args.storage](args.storage_host)

    with open(args.replay) as fhandler:
        for raw in fhandler:
            try:
                record = json.loads(raw)
            except json.decoder.JSONDecodeError:
                # Best effort: a line that is not valid JSON is skipped
                continue

            if storage is not None:
                storage.save(record)

            for analyser in analysers:
                # The output of an analyser is the input of the next one
                record = analyser.run(record)

            if reporter is not None:
                reporter.publish(record)
144 |
if __name__ == '__main__':
    # Only replay the records when executed as a script, not on import
    run()
147 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Certstream + Analytics
2 |
3 | [Build status](https://travis-ci.org/huydhn/certstream-analytics)
4 | [Code coverage](http://codecov.io/gh/huydhn/certstream-analytics?branch=master)
5 |
6 |
7 | # Installation
8 |
9 | The package can be installed from
10 | [PyPI](https://pypi.org/project/certstream-analytics)
11 |
12 | ```
13 | pip install certstream-analytics
14 | ```
15 |
16 | # Quick usage
17 |
18 | ```python
19 | bin/domain_matching.py --domains domains.txt --dump-location certstream.txt
20 |
21 | # The file domains.txt contains the list of domains that we want to monitor
22 | # for matches (domains with similar names). For examples, a file with only
23 | # two entries:
24 | #
25 | # gmail.com
26 | # facebook.com
27 | #
28 | # will match any domains that contains gmail or facebook keywords.
29 | #
30 | # All the records consumed from certstream will be kept in certstream.txt
31 | ```
32 |
33 | # API
34 |
35 | ```python
36 | import time
37 |
38 | from certstream_analytics.analysers import WordSegmentation
39 | from certstream_analytics.analysers import IDNADecoder
40 | from certstream_analytics.analysers import HomoglyphsDecoder
41 |
42 | from certstream_analytics.transformers import CertstreamTransformer
43 | from certstream_analytics.storages import ElasticsearchStorage
44 | from certstream_analytics.stream import CertstreamAnalytics
45 |
46 | done = False
47 |
48 | # These analysers will be run in the same order
49 | analyser = [
50 | IDNADecoder(),
51 | HomoglyphsDecoder(),
52 | WordSegmentation(),
53 | ]
54 |
55 | # The following fields are filtered out and indexed:
56 | # - String: domain
57 | # - List: SAN
58 | # - List: Trust chain
59 | # - Timestamp: Not before
60 | # - Timestamp: Not after
61 | # - Timestamp: Seen
62 | transformer = CertstreamTransformer()
63 |
64 | # Indexed the data in Elasticsearch
65 | storage = ElasticsearchStorage(hosts=['localhost:9200'])
66 |
67 | consumer = CertstreamAnalytics(transformer=transformer,
68 | storages=storage,
69 | analysers=analyser)
70 | # The consumer is run in another thread so this function is non-blocking
71 | consumer.start()
72 |
73 | while not done:
74 | time.sleep(1)
75 |
76 | consumer.stop()
77 | ```
78 |
79 | ## IDNA decoder
80 | This analyser decodes IDNA domain names into Unicode for further processing
81 | downstream. Normally, it will be the very first analyser to be run. If
82 | the analyser encounters a malformed IDNA domain string, it will keep the
83 | domain as it is.
84 |
85 | ```python
86 | from certstream_analytics.analysers import IDNADecoder
87 |
88 | decoder = IDNADecoder()
89 |
90 | # Just an example dummy record
91 | record = {
92 | 'all_domains': [
93 | 'xn--f1ahbgpekke1h.xn--p1ai',
94 | ]
95 | }
96 |
97 | # The domain name will now become 'укрэмпужск.рф'
98 | print(decoder.run(record))
99 | ```
100 |
101 | ## Homoglyphs decoder
102 | There are lots of phishing websites that utilize [homoglyphs](https://en.wikipedia.org/wiki/Homoglyph)
103 | to lure the victims. Some common examples include 'l' and 'i' or the
104 | Unicode character RHO '𝞀' and 'p'. The homoglyphs decoder uses the excellent
105 | [confusable_homoglyphs](https://github.com/vhf/confusable_homoglyphs) to
106 | generate all potential alternative domain names in ASCII.
107 |
108 | ```python
109 | from certstream_analytics.analysers import HomoglyphsDecoder
110 |
111 | # If the greedy flag is set, all alternative domains will be returned
112 | decoder = HomoglyphsDecoder(greedy=False)
113 |
114 | # Just an example dummy record
115 | record = {
116 | 'all_domains': [
117 | # MATHEMATICAL MONOSPACE SMALL P
118 | '*.𝗉aypal.com',
119 |
120 | # MATHEMATICAL SAN-SERIF BOLD SMALL RHO
121 | '*.𝗉ay𝞀al.com',
122 | ]
123 | }
124 |
125 | # The domain name will now be converted to '*.paypal.com' with the ASCII
126 | # character p
127 | print(decoder.run(record))
128 | ```
129 |
130 | ## Aho-Corasick
131 | A domain and its SAN from Certstream will be compared against a list of
132 | most popular [domains](https://github.com/opendns/public-domain-lists)
133 | (from OpenDNS) using the Aho-Corasick algorithm. This is a simple check to
134 | remove some of the most obvious phishing domains, for example, *www.facebook.com.msg40.site*
135 | will match with *facebook* because *facebook* is in the above list of most
136 | popular domains (I wonder how long it is going to last).
137 |
138 | ```python
139 | from certstream_analytics.analysers import AhoCorasickDomainMatching
140 | from certstream_analytics.reporters import FileReporter
141 |
142 | # Print the list of matching domains
143 | reporter = FileReporter('matching-results.txt')
144 |
145 | with open('opendns-top-domains.txt') as fhandle:
146 | domains = [line.rstrip() for line in fhandle]
147 |
148 | # The list of domains to match against
149 | domain_matching_analyser = AhoCorasickDomainMatching(domains)
150 |
151 | consumer = CertstreamAnalytics(transformer=transformer,
152 | analysers=domain_matching_analyser,
153 | reporters=reporter)
154 |
155 | # Need to think about what to do with the matching result
156 | consumer.start()
157 |
158 | while not done:
159 | time.sleep(1)
160 |
161 | consumer.stop()
162 | ```
163 |
164 | ## Word segmentation
165 | In order to improve the accuracy of the matching algorithm, we segment
166 | the domains into English words using
167 | [wordsegment](https://github.com/grantjenks/python-wordsegment).
168 |
169 | ```python
170 | from certstream_analytics.analysers import WordSegmentation
171 |
172 | wordsegmentation = WordSegmentation()
173 |
174 | # Just an example dummy record
175 | record = {
176 | 'all_domains': [
177 | 'login-appleid.apple.com.managesupport.co',
178 | ]
179 | }
180 |
181 | # The returned output is as follows:
182 | #
183 | # {
184 | # 'analyser': 'WordSegmentation',
185 | # 'output': {
186 | # 'login-appleid.apple.com.managesupport.co': [
187 | # 'login',
188 | # 'apple',
189 | # 'id',
190 | # 'apple',
191 | # 'com',
192 | # 'manage',
193 | # 'support',
194 | # 'co'
195 | # ],
196 | # },
197 | # }
198 | print(wordsegmentation.run(record))
199 | ```
200 |
201 | ## Features generator
202 | A list of features for each domain will also be generated so that they
203 | can be used for classification jobs further downstream. The list
204 | includes:
205 |
206 | - The number of dot-separated fields in the domain, for example, www.google.com has 3.
207 | - The overall length of the domain in characters.
208 | - The length of the longest dot-separated field.
209 | - The length of the TLD, e.g. .online (6) or .download (8) is longer than .com (3).
210 | - The randomness level of the domain. The [Nostril](https://github.com/casics/nostril)
211 | package is used to check how many of the words returned by the WordSegmentation
212 | analyser are nonsense.
213 |
--------------------------------------------------------------------------------
/certstream_analytics/analysers/domain_matching.py:
--------------------------------------------------------------------------------
1 | """
2 | Verify the domain against the list of most popular domains from OpenDNS
3 | (https://github.com/opendns/public-domain-lists). Let's see how useful
4 | it is to prevent phishing domains.
5 | """
6 | from enum import Enum
7 |
8 | import json
9 | import logging
10 | import re
11 | import tldextract
12 | import ahocorasick
13 | import wordsegment
14 |
15 | from .base import Analyser
16 | from .common_domain_analyser import BulkDomainMarker
17 | from .common_domain_analyser import WordSegmentation
18 |
19 |
20 | # pylint: disable=too-few-public-methods
class AhoCorasickDomainMatching(Analyser):
    """
    The domain and its SAN will be compared against the list of domains, for
    example, the most popular domains from OpenDNS.
    """
    # Get this number from the histogram of the length of all top domains
    MIN_MATCHING_LENGTH = 3

    # Some domains that don't work too well with tldextract and generate too
    # many FPs
    EXCLUDED_DOMAINS = {
        'www': 1,
        'web': 1,
    }

    # Some common domain parts that cause too many FP
    IGNORED_PARTS = r'^(autodiscover\.|cpanel\.)'

    def __init__(self, domains):
        """
        Use Aho-Corasick to find the matching domain so we construct its Trie
        here.

        domains is an iterable of domain names. Only the registered domain
        label of each entry is added to the Trie; self.domains maps that
        label back to the entry it comes from (the last one wins when several
        entries share a label).
        """
        self.automaton = ahocorasick.Automaton()
        self.domains = {}

        for index, domain in enumerate(domains):
            # Processing only the domain part. All sub-domains or TLDs will
            # be ignored, for example:
            # - www.google.com becomes google
            # - www.google.co.uk becomes google
            # - del.icio.us becomes icio
            ext = tldextract.extract(domain)

            # An empty label (blank line, or an entry that is only a public
            # suffix) carries no information and cannot be matched
            if not ext.domain or ext.domain in AhoCorasickDomainMatching.EXCLUDED_DOMAINS:
                continue

            self.automaton.add_word(ext.domain, (index, ext.domain))
            self.domains[ext.domain] = domain

        self.automaton.make_automaton()

    def run(self, record):
        """
        Use Aho-Corasick to find the matching domain.

        record['all_domains'] is scanned in order and the scan stops at the
        first domain with a match. When there is a match, an entry with the
        name of this analyser and an output of the form
        {checked_domain: [matching_known_domain]} is appended to
        record['analysers'] (which is created when missing). The record is
        always returned.

        Tricky situation #1: When the string (domain) in the Trie is too short,
        it could match many domains, for example, g.co or t.co. So they need
        to be ignored somehow. Looking at the histogram of the length of all
        domains in the list, there are only less than 100 domains with the
        length of 2 or less. So we choose to ignore those. Also, we will
        prefer longer match than a shorter one for now.
        """
        if 'analysers' not in record:
            record['analysers'] = []

        results = {}
        # Check the domain and all its SAN
        for domain in record['all_domains']:
            # Remove wildcard
            domain = re.sub(r'^\*\.', '', domain)

            # Remove some FP-prone parts
            domain = re.sub(AhoCorasickDomainMatching.IGNORED_PARTS, '', domain)

            # Similar to all domains in the list, the TLD will be stripped off.
            # The fields are read by name instead of slicing the result so
            # that it does not matter whether tldextract returns a tuple
            ext = tldextract.extract(domain)
            haystack = '.'.join((ext.subdomain, ext.domain))

            # The match will be a tuple in the following format: (5, (0, 'google'))
            matches = [m[1][1] for m in self.automaton.iter(haystack)
                       if len(m[1][1]) >= AhoCorasickDomainMatching.MIN_MATCHING_LENGTH]

            if matches:
                # Stable sort: among the longest matches the last one found
                # is selected
                matches.sort(key=len)

                match = matches[-1]
                # We only keep the the longest match of the first matching domain
                # for now. The value is always a list so that downstream
                # analysers can iterate over it
                results[domain] = [self.domains.get(match, match)]
                break

        if results:
            record['analysers'].append({
                'analyser': type(self).__name__,
                'output': results,
            })

        return record
109 |
110 |
class DomainMatchingOption(Enum):
    """
    How strictly the words of a known domain have to appear in a candidate
    domain for the two to be reported as a match.
    """
    # The words of the known domain only have to be a subset of the words of
    # the candidate, e.g. applefake.it matches apple.com because ['apple'] is
    # a subset of ['apple', 'fake']
    SUBSET_MATCH = 0

    # Same idea, but the check uses `in` instead of issubset so that the
    # order of the words is preserved
    ORDER_MATCH = 1
121 |
122 |
class DomainMatching(Analyser):
    """
    This is the first example of the new group of meta analysers which are used
    to combine the result of other analysers.
    """
    def __init__(self, include_tld=True, option=DomainMatchingOption.ORDER_MATCH):
        """
        Load the wordsegment data and remember the matching options.

        include_tld controls whether the TLD of a known domain takes part in
        the comparison. option is a DomainMatchingOption; it selects the
        container (set or list) used to compare the words, a KeyError is
        raised for any other value.
        """
        wordsegment.load()

        # Save the matching option here so we can refer to it later
        self.include_tld = include_tld

        self.option = {
            DomainMatchingOption.SUBSET_MATCH: set,
            DomainMatchingOption.ORDER_MATCH: list,
        }[option]

    def run(self, record):
        """
        Combine the outputs of the AhoCorasickDomainMatching and
        WordSegmentation analysers found in record['analysers'].

        Note that a meta-analyser will need to run after other analysers have
        finished so that their outputs are available. The record is returned
        untouched when it has no 'analysers' entry or when one of the two
        outputs is missing or empty. Otherwise, when there is a match, an
        entry with the name of this analyser is appended to
        record['analysers'].
        """
        if 'analysers' not in record:
            return record

        analysers = {
            AhoCorasickDomainMatching.__name__: {},
            WordSegmentation.__name__: {},
            BulkDomainMarker.__name__: {},
        }

        for analyser in record['analysers']:
            name = analyser['analyser']

            if name not in analysers:
                continue

            if name == BulkDomainMarker.__name__ and analyser['output']:
                # Skip bulk record and deal with it later, with such large
                # number of SAN name, it's bound to be a match
                continue

            analysers[name] = analyser['output']

        # Check that all outputs are there before continuing
        if not analysers[AhoCorasickDomainMatching.__name__] or not analysers[WordSegmentation.__name__]:
            return record

        results = self._match(analysers[AhoCorasickDomainMatching.__name__],
                              analysers[WordSegmentation.__name__])

        if results:
            record['analysers'].append({
                'analyser': type(self).__name__,
                'output': results,
            })

        # DEBUG: dump the whole record. Serialising it is only worth doing
        # when INFO messages of the root logger are actually enabled
        if logging.getLogger().isEnabledFor(logging.INFO):
            logging.info(json.dumps(record))

        return record

    def _match(self, ahocorasick_output, segmentation_output):
        """
        Use internally by the run function to combine AhoCorasick and WordSegmentation
        results.

        Both arguments are dictionaries keyed by the checked domain. The
        result maps a checked domain to the list of known domains whose words
        appear in it; domains without such a match are left out.
        """
        results = {}
        # Check all the matching domains reported by AhoCorasick analyser
        for match, domains in ahocorasick_output.items():
            # The result of AhoCorasick matcher is a list of matching domains, for example,
            #
            #     {
            #         'analyser': 'AhoCorasickDomainMatching',
            #         'output': {
            #             'login-appleid.apple.com.managesupport.co': ['apple.com', 'support.com'],
            #         },
            #     },
            #
            if match not in segmentation_output:
                continue

            if isinstance(domains, str):
                # A bare string would otherwise be iterated character by
                # character
                domains = [domains]

            phish = self.option(segmentation_output[match])
            match_ext = tldextract.extract(match)

            for domain in domains:
                # The fields are read by name instead of slicing the result
                # so that it does not matter whether tldextract returns a
                # tuple
                ext = tldextract.extract(domain)

                # This record is from a legitimate source, for example, agrosupport.zendesk.com
                # will match with zendesk.com. In our case, we don't really care about this so
                # it will be ignored and not reported as a match.
                if (ext.domain, ext.suffix) == (match_ext.domain, match_ext.suffix):
                    continue

                # Intuitively, it will be more accurate if we choose to include the TLD here.
                # For example, if both 'apple' and 'com' appear in the matching domain, it's
                # very likely that something phishing is going on here. On the other hand,
                # if only 'apple' occurs, we are not so sure and it's better left for more
                # advance analysers to have their says in that
                parts = [ext.subdomain, ext.domain]
                if self.include_tld:
                    parts.append(ext.suffix)

                tmp = []
                for part in parts:
                    for token in part.split('.'):
                        tmp.extend(wordsegment.segment(token))

                legit = self.option(tmp)

                # NOTE(review): in the ordered case the leading dot means the
                # words of the known domain are never found at the very
                # beginning of the candidate, and the end of the last word is
                # not anchored - confirm that both are intended
                if (isinstance(phish, set) and legit.issubset(phish)) or \
                   (isinstance(phish, list) and '.{}'.format('.'.join(legit)) in '.'.join(phish)):
                    # Found a possible phishing domain
                    if match not in results:
                        results[match] = []

                    results[match].append(domain)

        return results
240 |
--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
1 | [MASTER]
2 |
3 | # Specify a configuration file.
4 | #rcfile=
5 |
6 | # Python code to execute, usually for sys.path manipulation such as
7 | # pygtk.require().
8 | #init-hook=
9 |
10 | # Add files or directories to the blacklist. They should be base names, not
11 | # paths.
12 | ignore=CVS
13 |
14 | # Pickle collected data for later comparisons.
15 | persistent=yes
16 |
17 | # List of plugins (as comma separated values of python modules names) to load,
18 | # usually to register additional checkers.
19 | load-plugins=
20 |
21 | # Use multiple processes to speed up Pylint.
22 | jobs=4
23 |
24 | # Allow loading of arbitrary C extensions. Extensions are imported into the
25 | # active Python interpreter and may run arbitrary code.
26 | unsafe-load-any-extension=no
27 |
28 | # A comma-separated list of package or module names from where C extensions may
29 | # be loaded. Extensions are loading into the active Python interpreter and may
30 | # run arbitrary code
31 | extension-pkg-whitelist=
32 |
33 |
34 | [MESSAGES CONTROL]
35 |
36 | # Only show warnings with the listed confidence levels. Leave empty to show
37 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
38 | confidence=
39 |
40 | # Enable the message, report, category or checker with the given id(s). You can
41 | # either give multiple identifier separated by comma (,) or put this option
42 | # multiple time. See also the "--disable" option for examples.
43 | #enable=
44 |
45 | # Disable the message, report, category or checker with the given id(s). You
46 | # can either give multiple identifiers separated by comma (,) or put this
47 | # option multiple times (only on the command line, not in the configuration
48 | # file where it should appear only once).You can also use "--disable=all" to
49 | # disable everything first and then reenable specific checks. For example, if
50 | # you want to run only the similarities checker, you can use "--disable=all
51 | # --enable=similarities". If you want to run only the classes checker, but have
52 | # no Warning level messages displayed, use "--disable=all --enable=classes
53 | # --disable=W"
54 |
55 | disable=fixme,locally-disabled
56 |
57 | [REPORTS]
58 |
59 | # Set the output format. Available formats are text, parseable, colorized, msvs
60 | # (visual studio) and html. You can also give a reporter class, eg
61 | # mypackage.mymodule.MyReporterClass.
62 | output-format=parseable
63 |
64 | # Put messages in a separate file for each module / package specified on the
65 | # command line instead of printing them on stdout. Reports (if any) will be
66 | # written in a file name "pylint_global.[txt|html]".
67 | files-output=no
68 |
69 | # Tells whether to display a full report or only the messages
70 | reports=yes
71 |
72 | # Python expression which should return a note less than 10 (10 is the highest
73 | # note). You have access to the variables errors warning, statement which
74 | # respectively contain the number of errors / warnings messages and the total
75 | # number of statements analyzed. This is used by the global evaluation report
76 | # (RP0004).
77 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
78 |
79 | # Template used to display messages. This is a python new-style format string
80 | # used to format the message information. See doc for all details
81 | #msg-template=
82 |
83 |
84 | [LOGGING]
85 |
86 | # Logging modules to check that the string format arguments are in logging
87 | # function parameter format
88 | logging-modules=logging
89 |
90 |
91 | [MISCELLANEOUS]
92 |
93 | # List of note tags to take in consideration, separated by a comma.
94 | notes=FIXME,XXX,TODO
95 |
96 |
97 | [SIMILARITIES]
98 |
99 | # Minimum lines number of a similarity.
100 | min-similarity-lines=4
101 |
102 | # Ignore comments when computing similarities.
103 | ignore-comments=yes
104 |
105 | # Ignore docstrings when computing similarities.
106 | ignore-docstrings=yes
107 |
108 | # Ignore imports when computing similarities.
109 | ignore-imports=no
110 |
111 |
112 | [VARIABLES]
113 |
114 | # Tells whether we should check for unused import in __init__ files.
115 | init-import=no
116 |
117 | # A regular expression matching the name of dummy variables (i.e. expectedly
118 | # not used).
119 | dummy-variables-rgx=_$|dummy
120 |
121 | # List of additional names supposed to be defined in builtins. Remember that
122 | # you should avoid defining new builtins when possible.
123 | additional-builtins=
124 |
125 | # List of strings which can identify a callback function by name. A callback
126 | # name must start or end with one of those strings.
127 | callbacks=cb_,_cb
128 |
129 |
130 | [FORMAT]
131 |
132 | # Maximum number of characters on a single line.
133 | max-line-length=120
134 |
135 | # Regexp for a line that is allowed to be longer than the limit.
136 | ignore-long-lines=^\s*(# )??$
137 |
138 | # Allow the body of an if to be on the same line as the test if there is no
139 | # else.
140 | single-line-if-stmt=no
141 |
142 | # List of optional constructs for which whitespace checking is disabled
143 | no-space-check=trailing-comma,dict-separator
144 |
145 | # Maximum number of lines in a module
146 | max-module-lines=1000
147 |
148 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
149 | # tab).
150 | indent-string=' '
151 |
152 | # Number of spaces of indent required inside a hanging or continued line.
153 | indent-after-paren=4
154 |
155 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
156 | expected-line-ending-format=
157 |
158 |
159 | [BASIC]
160 |
161 | # List of builtins function names that should not be used, separated by a comma
162 | bad-functions=map,filter,input
163 |
164 | # Good variable names which should always be accepted, separated by a comma
165 | good-names=i,j,k,ex,Run,_
166 |
167 | # Bad variable names which should always be refused, separated by a comma
168 | bad-names=foo,bar,baz,toto,tutu,tata
169 |
170 | # Colon-delimited sets of names that determine each other's naming style when
171 | # the name regexes allow several styles.
172 | name-group=
173 |
174 | # Include a hint for the correct naming format with invalid-name
175 | include-naming-hint=no
176 |
177 | # Regular expression matching correct function names
178 | function-rgx=[a-z_][a-z0-9_]{2,30}$
179 |
180 | # Naming hint for function names
181 | function-name-hint=[a-z_][a-z0-9_]{2,30}$
182 |
183 | # Regular expression matching correct variable names
184 | variable-rgx=[a-z_][a-z0-9_]{2,30}$
185 |
186 | # Naming hint for variable names
187 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$
188 |
189 | # Regular expression matching correct constant names
190 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
191 |
192 | # Naming hint for constant names
193 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
194 |
195 | # Regular expression matching correct attribute names
196 | attr-rgx=[a-z_][a-z0-9_]{2,30}$
197 |
198 | # Naming hint for attribute names
199 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$
200 |
201 | # Regular expression matching correct argument names
202 | argument-rgx=[a-z_][a-z0-9_]{2,30}$
203 |
204 | # Naming hint for argument names
205 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$
206 |
207 | # Regular expression matching correct class attribute names
208 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
209 |
210 | # Naming hint for class attribute names
211 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
212 |
213 | # Regular expression matching correct inline iteration names
214 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
215 |
216 | # Naming hint for inline iteration names
217 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
218 |
219 | # Regular expression matching correct class names
220 | class-rgx=[A-Z_][a-zA-Z0-9]+$
221 |
222 | # Naming hint for class names
223 | class-name-hint=[A-Z_][a-zA-Z0-9]+$
224 |
225 | # Regular expression matching correct module names
226 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
227 |
228 | # Naming hint for module names
229 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
230 |
231 | # Regular expression matching correct method names
232 | method-rgx=[a-z_][a-z0-9_]{2,30}$
233 |
234 | # Naming hint for method names
235 | method-name-hint=[a-z_][a-z0-9_]{2,30}$
236 |
237 | # Regular expression which should only match function or class names that do
238 | # not require a docstring.
239 | no-docstring-rgx=__.*__
240 |
241 | # Minimum line length for functions/classes that require docstrings, shorter
242 | # ones are exempt.
243 | docstring-min-length=-1
244 |
245 | # List of decorators that define properties, such as abc.abstractproperty.
246 | property-classes=abc.abstractproperty
247 |
248 |
249 | [TYPECHECK]
250 |
251 | # Tells whether missing members accessed in mixin class should be ignored. A
252 | # mixin class is detected if its name ends with "mixin" (case insensitive).
253 | ignore-mixin-members=yes
254 |
255 | # List of module names for which member attributes should not be checked
256 | # (useful for modules/projects where namespaces are manipulated during runtime
257 | # and thus existing member attributes cannot be deduced by static analysis).
258 | ignored-modules=
259 |
260 | # List of class names for which member attributes should not be checked
261 | # (useful for classes with attributes dynamically set).
262 | ignored-classes=SQLObject, optparse.Values, thread._local, _thread._local
263 |
264 | # List of members which are set dynamically and missed by pylint inference
265 | # system, and so shouldn't trigger E1101 when accessed. Python regular
266 | # expressions are accepted.
267 | generated-members=REQUEST,acl_users,aq_parent
268 |
269 | # List of decorators that create context managers from functions, such as
270 | # contextlib.contextmanager.
271 | contextmanager-decorators=contextlib.contextmanager
272 |
273 |
274 | [SPELLING]
275 |
276 | # Spelling dictionary name. Available dictionaries: none. To make it work,
277 | # install the python-enchant package.
278 | spelling-dict=
279 |
280 | # List of comma separated words that should not be checked.
281 | spelling-ignore-words=
282 |
283 | # A path to a file that contains private dictionary; one word per line.
284 | spelling-private-dict-file=
285 |
286 | # Tells whether to store unknown words to indicated private dictionary in
287 | # --spelling-private-dict-file option instead of raising a message.
288 | spelling-store-unknown-words=no
289 |
290 |
291 | [DESIGN]
292 |
293 | # Maximum number of arguments for function / method
294 | max-args=5
295 |
296 | # Argument names that match this expression will be ignored. Default to name
297 | # with leading underscore
298 | ignored-argument-names=_.*
299 |
300 | # Maximum number of locals for function / method body
301 | max-locals=15
302 |
303 | # Maximum number of return / yield for function / method body
304 | max-returns=6
305 |
306 | # Maximum number of branches for function / method body
307 | max-branches=12
308 |
309 | # Maximum number of statements in function / method body
310 | max-statements=50
311 |
312 | # Maximum number of parents for a class (see R0901).
313 | max-parents=7
314 |
315 | # Maximum number of attributes for a class (see R0902).
316 | max-attributes=7
317 |
318 | # Minimum number of public methods for a class (see R0903).
319 | min-public-methods=2
320 |
321 | # Maximum number of public methods for a class (see R0904).
322 | max-public-methods=20
323 |
324 |
325 | [CLASSES]
326 |
327 | # List of method names used to declare (i.e. assign) instance attributes.
328 | defining-attr-methods=__init__,__new__,setUp
329 |
330 | # List of valid names for the first argument in a class method.
331 | valid-classmethod-first-arg=cls
332 |
333 | # List of valid names for the first argument in a metaclass class method.
334 | valid-metaclass-classmethod-first-arg=mcs
335 |
336 | # List of member names, which should be excluded from the protected access
337 | # warning.
338 | exclude-protected=_asdict,_fields,_replace,_source,_make
339 |
340 |
341 | [IMPORTS]
342 |
343 | # Deprecated modules which should not be used, separated by a comma
344 | deprecated-modules=regsub,TERMIOS,Bastion,rexec
345 |
346 | # Create a graph of every (i.e. internal and external) dependencies in the
347 | # given file (report RP0402 must not be disabled)
348 | import-graph=
349 |
350 | # Create a graph of external dependencies in the given file (report RP0402 must
351 | # not be disabled)
352 | ext-import-graph=
353 |
354 | # Create a graph of internal dependencies in the given file (report RP0402 must
355 | # not be disabled)
356 | int-import-graph=
357 |
358 |
359 | [EXCEPTIONS]
360 |
361 | # Exceptions that will emit a warning when being caught. Defaults to
362 | # "Exception"
363 | overgeneral-exceptions=Exception
364 |
--------------------------------------------------------------------------------
/certstream_analytics/analysers/common_domain_analyser.py:
--------------------------------------------------------------------------------
1 | """
2 | The list of basic analysers includes:
3 | - WordSegmentation
4 | - IDNADecoder
5 | - HomoglyphsDecoder
6 | - FeaturesGenerator (generate various features for further downstream processing)
7 | - BulkDomainMarker
8 | """
9 | import re
10 | import tldextract
11 | import wordsegment
12 | from nostril import nonsense
13 | import idna
14 | from confusable_homoglyphs import confusables
15 |
16 | from .base import Analyser
17 |
18 |
19 | # pylint: disable=too-few-public-methods
class WordSegmentation(Analyser):
    """
    Perform word segmentation of all the SAN domains as an attempt to make sense
    of their names. For example, both arch.mappleonline.com and
    apple-verifyupdate.serveftp.com domains have 'apple' inside but only the
    second one is an actual Apple phishing page. Intuitively, a good word
    segmentation algorithm will return:

    - arch + mapple + online + com
    - apple + verify + update + serve + ftp + com

    Thus, it's much easier to spot the second phishing domain.

    Implementation-wise, there are several existing packages around to do this,
    for example:

    - https://github.com/grantjenks/python-wordsegment
    - https://github.com/keredson/wordninja

    This analyser uses the wordsegment package.
    """
    # Some common stop words that are in the list of most popular domains.
    # Only the keys matter, the dictionary is used for membership tests.
    STOPWORDS = {
        'app': 1,
        'inc': 1,
        'box': 1,
        'health': 1,
        'home': 1,
        'space': 1,
        'cars': 1,
        'nature': 1,
    }

    def __init__(self):
        """
        Load the wordsegment data so that segment() can be called later.
        """
        wordsegment.load()

    def run(self, record):
        """
        Segment every domain in record['all_domains'] into words.

        The leading wildcard ('*.') is removed from each domain first. When at
        least one domain was processed, an entry is appended to
        record['analysers'] with this class name as 'analyser' and a mapping of
        (wildcard-free) domain to its list of words as 'output'.

        Returns the same record, updated in place.
        """
        if 'analysers' not in record:
            record['analysers'] = []

        results = {}
        # Check the domain and all its SAN
        for domain in record['all_domains']:
            # Remove wild card
            domain = re.sub(r'^\*\.', '', domain)

            ext = tldextract.extract(domain)

            words = []
            # Every part of the domain is segmented, the TLD included, so that
            # .co.uk becomes ['co', 'uk']. The three parts are read by name
            # rather than by slicing the extraction result, so that any extra
            # non-string field of the result can never reach str.split().
            for part in (ext.subdomain, ext.domain, ext.suffix):
                for token in part.split('.'):
                    segmented = [w for w in wordsegment.segment(token) if w not in WordSegmentation.STOPWORDS]

                    if segmented:
                        words.extend(segmented)
                    elif token:
                        # For some IDNA domain like xn--wgbfq3d.xn--ngbc5azd, the segmentation
                        # won't work and an empty array is returned. So we choose to just keep
                        # the original token
                        words.append(token)

            results[domain] = words

        if results:
            record['analysers'].append({
                'analyser': type(self).__name__,
                'output': results,
            })

        return record
99 |
100 |
class BulkDomainMarker(Analyser):
    """
    Flag records that carry a large number of SAN domains. Such certificates
    usually bundle completely unrelated domains, probably as the result of some
    bulk registration process. Benign or not, they are still suspicious and
    probably spam. The similarity among these domains could be verified as
    well: a lower similarity score means the domains are totally unrelated.
    """
    # Take a histogram here and find out the suitable value for this
    THRESHOLD = 15

    def __init__(self, threshold=THRESHOLD):
        """
        Remember the number of domains from which a record counts as bulk.
        """
        self.threshold = threshold

    def run(self, record):
        """
        Append a boolean verdict to record['analysers'].

        The verdict is True when the number of entries in record['all_domains']
        is greater than or equal to the configured threshold. The same record
        is returned, updated in place.
        """
        record.setdefault('analysers', [])

        verdict = {
            'analyser': type(self).__name__,
            'output': len(record['all_domains']) >= self.threshold,
        }
        record['analysers'].append(verdict)

        return record
135 |
136 |
class IDNADecoder(Analyser):
    """
    Convert every IDNA-encoded domain of a record back to Unicode.
    """
    def run(self, record):
        """
        Replace record['all_domains'] with the decoded version of each domain.

        A leading wildcard ('*.') is taken off before decoding and put back
        afterwards. Domains that cannot be decoded are kept unchanged.
        """
        unicode_domains = []

        for name in record['all_domains']:
            has_wildcard = re.match(r'^\*\.', name) is not None

            # The wildcard interferes with the IDNA module, so decode the
            # domain without it
            bare = re.sub(r'^\*\.', '', name) if has_wildcard else name

            try:
                bare = idna.decode(bare)
            except (idna.core.InvalidCodepoint, UnicodeError):
                # Fail to decode the domain, just keep it as it is for now
                pass

            unicode_domains.append('*.{}'.format(bare) if has_wildcard else bare)

        record['all_domains'] = unicode_domains
        return record
173 |
174 |
class HomoglyphsDecoder(Analyser):
    """
    Convert domains whose names include some suspicious homoglyphs to ASCII.
    This should run right after the IDNA conversion and before the other
    analysers so that they can benefit from it.
    """
    def __init__(self, greedy=False):
        """
        We rely on the confusable-homoglyphs at https://github.com/vhf/confusable_homoglyphs
        to do its magic.

        If the greedy flag is set, all alternative domains will be returned. Otherwise, only
        the first one will be available.
        """
        self.greedy = greedy

    @staticmethod
    def is_latin(alt):
        """
        Return True when every character of the string is an ASCII letter
        (a-z or A-Z), cause in our specific case we only care about Latin
        characters. An empty string is considered Latin.
        """
        # Every character is checked because an alternative returned by the
        # confusable_homoglyphs library can be longer than one character.
        return all('a' <= alt_c <= 'z' or 'A' <= alt_c <= 'Z' for alt_c in alt)

    def run(self, record):
        """
        Replace record['all_domains'] with the ASCII alternatives of each domain.

        For every character with known homoglyphs, the candidates are the
        character itself (when it is Latin) followed by its lower-cased Latin
        homoglyphs. Characters without any Latin candidate are kept as they
        are. All combinations are generated, which is a bit of a brute force;
        unless the greedy flag is set only the first combination of each
        domain is kept. A leading wildcard ('*.') is preserved.
        """
        decoded = []

        for domain in record['all_domains']:
            wildcard = False

            if re.match(r'^\*\.', domain):
                wildcard = True
                # Remove wild card to simplify the domain name a bit and we'll put it back later
                domain = re.sub(r'^\*\.', '', domain)

            # is_confusable returns False instead of an empty list when nothing
            # in the domain is confusable, so fall back to an empty list to
            # keep the domain unchanged rather than failing on iteration
            confusable = confusables.is_confusable(domain, greedy=True) or []
            hg_map = {hg['character']: hg for hg in confusable}
            decoded_domain_c = []

            for domain_c in domain:
                # Confusable homoglyphs could not find any homoglyphs for this character
                # so we decide to keep the original character as it is
                if domain_c not in hg_map:
                    decoded_domain_c.append([domain_c])
                    continue

                found = []
                hglyph = hg_map[domain_c]

                if hglyph['alias'] == 'LATIN':
                    # The character is Latin already, so it is the first candidate
                    found.append(hglyph['character'])

                for alt in hglyph['homoglyphs']:
                    if HomoglyphsDecoder.is_latin(alt['c']):
                        found.append(alt['c'].lower())

                # If nothing is found, we keep the original character
                if not found:
                    found.append(hglyph['character'])

                decoded_domain_c.append(found)

            for alt in self._generate_alternatives(decoded_domain_c):
                if wildcard:
                    alt = '*.{}'.format(alt)

                decoded.append(alt)

                if not self.greedy:
                    break

        record['all_domains'] = decoded
        return record

    def _generate_alternatives(self, alt_characters, index=0, current=''):
        """
        Generate all alternative ASCII names of a domain using the list of all
        alternative characters.

        alt_characters holds one list of candidate strings per position, index
        is the position being expanded and current is the name built so far.
        """
        if index == len(alt_characters):
            yield current

        else:
            for alt_c in alt_characters[index]:
                yield from self._generate_alternatives(alt_characters,
                                                       index + 1,
                                                       current + alt_c)
276 |
277 |
class FeaturesGenerator(Analyser):
    """
    Generate features to detect outliers in the stream. In our case, the
    outliers are the 'suspicious' phishing domains.
    """
    # Words shorter than this are never passed to the nostril package
    NOSTRIL_LENGTH_LIMIT = 6

    def run(self, record):
        """
        Append one feature vector per segmented domain to record['analysers'].

        The domains and their words are read from the output of the first
        WordSegmentation entry found in record['analysers']. The output stays
        empty when there is no such entry. Each feature vector contains, in
        order:
            - The number of domain parts, for example, www.google.com is 3.
            - The overall length in characters.
            - The length of the longest domain part.
            - The length of the last domain part, e.g. .online or .download is
              longer than .com.
            - The randomness level of the domain, i.e. the ratio of words that
              are flagged as nonsense (0.0 when the domain has no word).

        Returns the same record, updated in place.
        """
        if 'analysers' not in record:
            record['analysers'] = []

        x_samples = []

        for analyser in record['analysers']:
            if analyser['analyser'] != 'WordSegmentation':
                continue

            for domain, segments in analyser['output'].items():
                # Remove wildcard domain
                domain = re.sub(r'^\*\.', '', domain)

                parts = domain.split('.')

                randomness_count = 0
                # The nostril package which we are using to detect non-sense words
                # in the domain only returns a boolean verdict so may be we need to
                # think of how we want to quantify this
                for word in segments:
                    try:
                        if len(word) >= FeaturesGenerator.NOSTRIL_LENGTH_LIMIT and nonsense(word):
                            randomness_count += 1
                    except ValueError:
                        # The word could not be evaluated, it is not counted
                        continue

                # A domain without any word (an empty domain for instance)
                # must not cause a division by zero
                randomness = randomness_count / len(segments) if segments else 0.0

                x_samples.append([
                    len(parts),
                    len(domain),
                    # split() always returns at least one part
                    max(len(part) for part in parts),
                    len(parts[-1]),
                    randomness,
                ])

            # Only the first WordSegmentation output is used
            break

        record['analysers'].append({
            'analyser': type(self).__name__,
            'output': x_samples,
        })

        return record
353 |
--------------------------------------------------------------------------------
/tests/samples.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "data": {
4 | "cert_index": 447858050,
5 | "cert_link": "http://ct.googleapis.com/rocketeer/ct/v1/get-entries?start=447858050&end=447858050",
6 | "chain": [
7 | {
8 | "as_der": "REDACT",
9 | "extensions": {
10 | "authorityInfoAccess": "OCSP - URI:http://ocsp.comodoca.com\nCA Issuers - URI:http://crt.comodoca.com/COMODORSAAddTrustCA.crt\n",
11 | "authorityKeyIdentifier": "keyid:BB:AF:7E:02:3D:FA:A6:F1:3C:84:8E:AD:EE:38:98:EC:D9:32:32:D4\n",
12 | "basicConstraints": "CA:TRUE",
13 | "certificatePolicies": "Policy: 2.23.140.1.2.1\nPolicy: 1.3.6.1.4.1.6449.1.2.2.52",
14 | "crlDistributionPoints": "Full Name:\n URI:http://crl.comodoca.com/COMODORSACertificationAuthority.crl",
15 | "extendedKeyUsage": "TLS Web server authentication, TLS Web client authentication",
16 | "keyUsage": "Digital Signature, Key Cert Sign, C R L Sign",
17 | "subjectKeyIdentifier": "7E:03:5A:65:41:6B:A7:7E:0A:E1:B8:9D:08:EA:1D:8E:1D:6A:C7:65"
18 | },
19 | "fingerprint": "76:4D:2F:A5:9E:D1:23:F9:C9:55:70:C4:03:C9:2F:EF:33:8E:A7:45",
20 | "not_after": 1747526399,
21 | "not_before": 1431907200,
22 | "serial_number": "F01D4BEE7B7CA37B3C0566AC05972458",
23 | "subject": {
24 | "C": "US",
25 | "CN": "cPanel, Inc. Certification Authority",
26 | "L": "Houston",
27 | "O": "cPanel, Inc.",
28 | "OU": null,
29 | "ST": "TX",
30 | "aggregated": "/C=US/CN=cPanel, Inc. Certification Authority/L=Houston/O=cPanel, Inc./ST=TX"
31 | }
32 | },
33 | {
34 | "as_der": "REDACT",
35 | "extensions": {
36 | "basicConstraints": "CA:TRUE",
37 | "keyUsage": "Key Cert Sign, C R L Sign",
38 | "subjectKeyIdentifier": "BB:AF:7E:02:3D:FA:A6:F1:3C:84:8E:AD:EE:38:98:EC:D9:32:32:D4"
39 | },
40 | "fingerprint": "AF:E5:D2:44:A8:D1:19:42:30:FF:47:9F:E2:F8:97:BB:CD:7A:8C:B4",
41 | "not_after": 2147471999,
42 | "not_before": 1263859200,
43 | "serial_number": "4CAAF9CADB636FE01FF74ED85B03869D",
44 | "subject": {
45 | "C": "GB",
46 | "CN": "COMODO RSA Certification Authority",
47 | "L": "Salford",
48 | "O": "COMODO CA Limited",
49 | "OU": null,
50 | "ST": "Greater Manchester",
51 | "aggregated": "/C=GB/CN=COMODO RSA Certification Authority/L=Salford/O=COMODO CA Limited/ST=Greater Manchester"
52 | }
53 | }
54 | ],
55 | "leaf_cert": {
56 | "all_domains": [
57 | "firmyfarmaceutyczneeu.redirect.rejestracjadomen.pl",
58 | "www.firmyfarmaceutyczneeu.redirect.rejestracjadomen.pl"
59 | ],
60 | "as_der": "REDACT",
61 | "extensions": {
62 | "authorityInfoAccess": "OCSP - URI:http://ocsp.comodoca.com\nCA Issuers - URI:http://crt.comodoca.com/cPanelIncCertificationAuthority.crt\n",
63 | "authorityKeyIdentifier": "keyid:7E:03:5A:65:41:6B:A7:7E:0A:E1:B8:9D:08:EA:1D:8E:1D:6A:C7:65\n",
64 | "basicConstraints": "CA:FALSE",
65 | "certificatePolicies": "Policy: 2.23.140.1.2.1\nPolicy: 1.3.6.1.4.1.6449.1.2.2.52\n CPS: https://secure.comodo.com/CPS",
66 | "crlDistributionPoints": "Full Name:\n URI:http://crl.comodoca.com/cPanelIncCertificationAuthority.crl",
67 | "ctlPoisonByte": true,
68 | "extendedKeyUsage": "TLS Web server authentication, TLS Web client authentication",
69 | "keyUsage": "Digital Signature, Key Encipherment",
70 | "subjectAltName": "DNS:www.firmyfarmaceutyczneeu.redirect.rejestracjadomen.pl, DNS:firmyfarmaceutyczneeu.redirect.rejestracjadomen.pl",
71 | "subjectKeyIdentifier": "A6:F3:1B:BD:CB:A6:E0:95:E4:EA:86:C5:9D:FE:BC:9E:B1:C4:0B:FD"
72 | },
73 | "fingerprint": "32:8B:E0:CA:41:25:E0:EB:CD:92:29:7F:F3:17:3C:06:2C:3C:1F:D0",
74 | "not_after": 1546473599,
75 | "not_before": 1538611200,
76 | "serial_number": "DA28422511646C0552500F3DEE0AC20",
77 | "subject": {
78 | "C": null,
79 | "CN": "firmyfarmaceutyczneeu.redirect.rejestracjadomen.pl",
80 | "L": null,
81 | "O": null,
82 | "OU": null,
83 | "ST": null,
84 | "aggregated": "/CN=firmyfarmaceutyczneeu.redirect.rejestracjadomen.pl"
85 | }
86 | },
87 | "seen": 1538635262.355275,
88 | "source": {
89 | "name": "Google 'Rocketeer' log",
90 | "url": "ct.googleapis.com/rocketeer/"
91 | },
92 | "update_type": "PrecertLogEntry"
93 | },
94 | "message_type": "certificate_update"
95 | },
96 | {
97 | "data": {
98 | "cert_index": 447858049,
99 | "cert_link": "http://ct.googleapis.com/rocketeer/ct/v1/get-entries?start=447858049&end=447858049",
100 | "chain": [
101 | {
102 | "as_der": "REDACT",
103 | "extensions": {
104 | "authorityInfoAccess": "CA Issuers - URI:http://apps.identrust.com/roots/dstrootcax3.p7c\nOCSP - URI:http://isrg.trustid.ocsp.identrust.com\n",
105 | "authorityKeyIdentifier": "keyid:C4:A7:B1:A4:7B:2C:71:FA:DB:E1:4B:90:75:FF:C4:15:60:85:89:10\n",
106 | "basicConstraints": "CA:TRUE",
107 | "certificatePolicies": "Policy: 1.3.6.1.4.1.44947.1.1.1\n CPS: http://cps.root-x1.letsencrypt.org",
108 | "crlDistributionPoints": "Full Name:\n URI:http://crl.identrust.com/DSTROOTCAX3CRL.crl",
109 | "keyUsage": "Digital Signature, Key Cert Sign, C R L Sign",
110 | "subjectKeyIdentifier": "A8:4A:6A:63:04:7D:DD:BA:E6:D1:39:B7:A6:45:65:EF:F3:A8:EC:A1"
111 | },
112 | "fingerprint": "E6:A3:B4:5B:06:2D:50:9B:33:82:28:2D:19:6E:FE:97:D5:95:6C:CB",
113 | "not_after": 1615999246,
114 | "not_before": 1458232846,
115 | "serial_number": "A0141420000015385736A0B85ECA708",
116 | "subject": {
117 | "C": "US",
118 | "CN": "Let's Encrypt Authority X3",
119 | "L": null,
120 | "O": "Let's Encrypt",
121 | "OU": null,
122 | "ST": null,
123 | "aggregated": "/C=US/CN=Let's Encrypt Authority X3/O=Let's Encrypt"
124 | }
125 | },
126 | {
127 | "as_der": "REDACT",
128 | "extensions": {
129 | "basicConstraints": "CA:TRUE",
130 | "keyUsage": "Key Cert Sign, C R L Sign",
131 | "subjectKeyIdentifier": "C4:A7:B1:A4:7B:2C:71:FA:DB:E1:4B:90:75:FF:C4:15:60:85:89:10"
132 | },
133 | "fingerprint": "DA:C9:02:4F:54:D8:F6:DF:94:93:5F:B1:73:26:38:CA:6A:D7:7C:13",
134 | "not_after": 1633010475,
135 | "not_before": 970348339,
136 | "serial_number": "44AFB080D6A327BA893039862EF8406B",
137 | "subject": {
138 | "C": null,
139 | "CN": "DST Root CA X3",
140 | "L": null,
141 | "O": "Digital Signature Trust Co.",
142 | "OU": null,
143 | "ST": null,
144 | "aggregated": "/CN=DST Root CA X3/O=Digital Signature Trust Co."
145 | }
146 | }
147 | ],
148 | "leaf_cert": {
149 | "all_domains": [
150 | "rundschleifmaschinen-service.de",
151 | "www.rundschleifmaschinen-service.de"
152 | ],
153 | "as_der": "REDACT",
154 | "extensions": {
155 | "authorityInfoAccess": "CA Issuers - URI:http://cert.int-x3.letsencrypt.org/\nOCSP - URI:http://ocsp.int-x3.letsencrypt.org\n",
156 | "authorityKeyIdentifier": "keyid:A8:4A:6A:63:04:7D:DD:BA:E6:D1:39:B7:A6:45:65:EF:F3:A8:EC:A1\n",
157 | "basicConstraints": "CA:FALSE",
158 | "certificatePolicies": "Policy: 1.3.6.1.4.1.44947.1.1.1\n CPS: http://cps.letsencrypt.org\n User Notice: is Certificate may only be relied upon by Relying Parties and only in accordance with the Certificate Policy found at https://letsencrypt.org/repository/",
159 | "ctlSignedCertificateTimestamp": "BIHyAPAAdQBvU3asMfAxGdiZAKRRFf93FRwR2QLBACkGjbIImjfZEwAAAWYyuWCoAAAEAwBGMEQCIHAfF-WDz1YkPCONYN0aXohfUPFrhiKG61tXfDilc3dUAiB0oHYT0e5eCKi5k9mEzRpqC-NdvhEtr8qKBlxEoiQsGwB3ACk8UZZUyDlluqpQ_FgH1Ldvv1h6KXLcpMMM9OVFR_R4AAABZjK5YkwAAAQDAEgwRgIhAKlrVU0Na8GF1AT7lCpeUJMchwfHnFsjswnpultsgKQhAiEAuPvplxBQsMHbioLdPsNRQSr-xUHV2g7yZkUnKqZHbnQ=",
160 | "extendedKeyUsage": "TLS Web server authentication, TLS Web client authentication",
161 | "keyUsage": "Digital Signature, Key Encipherment",
162 | "subjectAltName": "DNS:www.rundschleifmaschinen-service.de, DNS:rundschleifmaschinen-service.de",
163 | "subjectKeyIdentifier": "E3:45:2E:7F:5C:8D:B4:17:CC:B8:73:09:E3:DA:F7:F3:F9:ED:F2:15"
164 | },
165 | "fingerprint": "9A:3A:AF:F8:DC:A4:18:4B:B6:46:61:F7:29:46:FA:42:9B:CA:9C:71",
166 | "not_after": 1546221701,
167 | "not_before": 1538445701,
168 | "serial_number": "3428B7C70A67819D5B9E7A13D2B9B8C778F",
169 | "subject": {
170 | "C": null,
171 | "CN": "rundschleifmaschinen-service.de",
172 | "L": null,
173 | "O": null,
174 | "OU": null,
175 | "ST": null,
176 | "aggregated": "/CN=rundschleifmaschinen-service.de"
177 | }
178 | },
179 | "seen": 1538635262.353125,
180 | "source": {
181 | "name": "Google 'Rocketeer' log",
182 | "url": "ct.googleapis.com/rocketeer/"
183 | },
184 | "update_type": "X509LogEntry"
185 | },
186 | "message_type": "certificate_update"
187 | },
188 | {
189 | "data": {
190 | "cert_index": 447857993,
191 | "cert_link": "http://ct.googleapis.com/rocketeer/ct/v1/get-entries?start=447857993&end=447857993",
192 | "chain": [
193 | {
194 | "as_der": "REDACT",
195 | "extensions": {
196 | "authorityInfoAccess": "CA Issuers - URI:http://apps.identrust.com/roots/dstrootcax3.p7c\nOCSP - URI:http://isrg.trustid.ocsp.identrust.com\n",
197 | "authorityKeyIdentifier": "keyid:C4:A7:B1:A4:7B:2C:71:FA:DB:E1:4B:90:75:FF:C4:15:60:85:89:10\n",
198 | "basicConstraints": "CA:TRUE",
199 | "certificatePolicies": "Policy: 1.3.6.1.4.1.44947.1.1.1\n CPS: http://cps.root-x1.letsencrypt.org",
200 | "crlDistributionPoints": "Full Name:\n URI:http://crl.identrust.com/DSTROOTCAX3CRL.crl",
201 | "keyUsage": "Digital Signature, Key Cert Sign, C R L Sign",
202 | "subjectKeyIdentifier": "A8:4A:6A:63:04:7D:DD:BA:E6:D1:39:B7:A6:45:65:EF:F3:A8:EC:A1"
203 | },
204 | "fingerprint": "E6:A3:B4:5B:06:2D:50:9B:33:82:28:2D:19:6E:FE:97:D5:95:6C:CB",
205 | "not_after": 1615999246,
206 | "not_before": 1458232846,
207 | "serial_number": "A0141420000015385736A0B85ECA708",
208 | "subject": {
209 | "C": "US",
210 | "CN": "Let's Encrypt Authority X3",
211 | "L": null,
212 | "O": "Let's Encrypt",
213 | "OU": null,
214 | "ST": null,
215 | "aggregated": "/C=US/CN=Let's Encrypt Authority X3/O=Let's Encrypt"
216 | }
217 | },
218 | {
219 | "as_der": "REDACT",
220 | "extensions": {
221 | "basicConstraints": "CA:TRUE",
222 | "keyUsage": "Key Cert Sign, C R L Sign",
223 | "subjectKeyIdentifier": "C4:A7:B1:A4:7B:2C:71:FA:DB:E1:4B:90:75:FF:C4:15:60:85:89:10"
224 | },
225 | "fingerprint": "DA:C9:02:4F:54:D8:F6:DF:94:93:5F:B1:73:26:38:CA:6A:D7:7C:13",
226 | "not_after": 1633010475,
227 | "not_before": 970348339,
228 | "serial_number": "44AFB080D6A327BA893039862EF8406B",
229 | "subject": {
230 | "C": null,
231 | "CN": "DST Root CA X3",
232 | "L": null,
233 | "O": "Digital Signature Trust Co.",
234 | "OU": null,
235 | "ST": null,
236 | "aggregated": "/CN=DST Root CA X3/O=Digital Signature Trust Co."
237 | }
238 | }
239 | ],
240 | "leaf_cert": {
241 | "all_domains": [
242 | "www.runaflohmarkt.de"
243 | ],
244 | "as_der": "REDACT",
245 | "extensions": {
246 | "authorityInfoAccess": "CA Issuers - URI:http://cert.int-x3.letsencrypt.org/\nOCSP - URI:http://ocsp.int-x3.letsencrypt.org\n",
247 | "authorityKeyIdentifier": "keyid:A8:4A:6A:63:04:7D:DD:BA:E6:D1:39:B7:A6:45:65:EF:F3:A8:EC:A1\n",
248 | "basicConstraints": "CA:FALSE",
249 | "certificatePolicies": "Policy: 1.3.6.1.4.1.44947.1.1.1\n CPS: http://cps.letsencrypt.org\n User Notice: is Certificate may only be relied upon by Relying Parties and only in accordance with the Certificate Policy found at https://letsencrypt.org/repository/",
250 | "ctlSignedCertificateTimestamp": "BIHxAO8AdQBvU3asMfAxGdiZAKRRFf93FRwR2QLBACkGjbIImjfZEwAAAWX9AeXwAAAEAwBGMEQCICIkjDXPcRgbcoKuh8Ciu_1sIVVKj_oGb-bzc8zPyhF2AiAhCQMKgrBcxZpZpGgOEgyBxIX6WqJFDOGamrWW-I55IAB2ACk8UZZUyDlluqpQ_FgH1Ldvv1h6KXLcpMMM9OVFR_R4AAABZf0B56wAAAQDAEcwRQIhAPNKe7X7XqNZF7H4NOWW-DtSvx1jVWxqsZVnknCjrkjrAiBTIKM-qsi4QMFHbTRfxz4tiRvI14vCXDAbyoLgbp6BKw==",
251 | "extendedKeyUsage": "TLS Web server authentication, TLS Web client authentication",
252 | "keyUsage": "Digital Signature, Key Encipherment",
253 | "subjectAltName": "DNS:www.runaflohmarkt.de",
254 | "subjectKeyIdentifier": "7C:82:16:CB:31:94:C6:C5:5C:72:A1:37:CA:AE:B9:9B:3D:73:3E:9B"
255 | },
256 | "fingerprint": "AD:5E:3D:91:50:46:7E:C6:D9:30:FD:65:11:8B:CE:81:FF:29:49:B9",
257 | "not_after": 1545320484,
258 | "not_before": 1537544484,
259 | "serial_number": "36434086EFE2BB58A2068BBA9F2E96B7898",
260 | "subject": {
261 | "C": null,
262 | "CN": "www.runaflohmarkt.de",
263 | "L": null,
264 | "O": null,
265 | "OU": null,
266 | "ST": null,
267 | "aggregated": "/CN=www.runaflohmarkt.de"
268 | }
269 | },
270 | "seen": 1538635262.249552,
271 | "source": {
272 | "name": "Google 'Rocketeer' log",
273 | "url": "ct.googleapis.com/rocketeer/"
274 | },
275 | "update_type": "X509LogEntry"
276 | },
277 | "message_type": "certificate_update"
278 | }
279 | ]
280 |
--------------------------------------------------------------------------------
/tests/test_domain_matching_analyser.py:
--------------------------------------------------------------------------------
1 | '''
2 | Common domain matching analyser.
3 | '''
4 | import copy
5 | import os
6 | import unittest
7 |
8 | from certstream_analytics.analysers import AhoCorasickDomainMatching
9 | from certstream_analytics.analysers import WordSegmentation
10 | from certstream_analytics.analysers import DomainMatching, DomainMatchingOption
11 | from certstream_analytics.analysers import BulkDomainMarker
12 | from certstream_analytics.analysers import IDNADecoder
13 | from certstream_analytics.analysers import HomoglyphsDecoder
14 |
15 |
class DomainMatchingTest(unittest.TestCase):
    '''
    Test all the common domain matching analysers.

    Every test is table-driven: each case holds the input record under
    ``data``, the expected result under ``expected`` and a human readable
    ``description`` used as the assertion message. Each case runs inside its
    own ``subTest`` so that one failing case does not hide the others.
    '''
    def test_ahocorasick(self):
        '''
        Compare some mock domains against the list of most popular domains
        using Aho-Corasick algorithm.
        '''
        # Load the mock list of common domains for testing.
        current_dir = os.path.dirname(os.path.realpath(__file__))

        with open(os.path.join(current_dir, 'opendns-top-domains.txt')) as fhandle:
            domains = [line.rstrip() for line in fhandle]

        ahocorasick_analyser = AhoCorasickDomainMatching(domains)

        cases = [
            {
                'data': {
                    'all_domains': [
                        'store.google.com',
                        'google.com',
                    ],
                },
                'expected': [
                    {
                        'analyser': 'AhoCorasickDomainMatching',
                        'output': {
                            'store.google.com': ['google.com'],
                        },
                    },
                ],
                'description': 'An exact match domain',
            },

            {
                'data': {
                    'all_domains': [
                        'www.facebook.com.msg40.site',
                    ],
                },
                'expected': [
                    {
                        'analyser': 'AhoCorasickDomainMatching',
                        'output': {
                            'www.facebook.com.msg40.site': ['facebook.com'],
                        },
                    },
                ],
                'description': 'A sample phishing domain with a sub-domain match',
            },

            {
                'data': {
                    'all_domains': [
                        'login-appleid.apple.com.managesuppport.co',
                    ],
                },
                'expected': [
                    {
                        'analyser': 'AhoCorasickDomainMatching',
                        'output': {
                            'login-appleid.apple.com.managesuppport.co': ['apple.com'],
                        },
                    },
                ],
                'description': 'A sample phishing domain with a partial string match',
            },

            {
                'data': {
                    'all_domains': [
                        'socket.io',
                    ],
                },
                'expected': [],
                'description': 'A non-matching domain (not in the list of most popular domains)',
            },

            {
                'data': {
                    'all_domains': [
                        'www.foobar2000.com',
                    ],
                },
                'expected': [],
                'description': 'A non-matching domain (excluded pattern)',
            },

            {
                'data': {
                    'all_domains': [
                        'autodiscover.blablabla.com',
                    ],
                },
                'expected': [],
                'description': 'Match a ignored pattern',
            },
        ]

        for case in cases:
            with self.subTest(description=case['description']):
                got = ahocorasick_analyser.run(case['data'])
                self.assertListEqual(got['analysers'], case['expected'], case['description'])

    def test_wordsegmentation(self):
        '''
        Try to segment some domains and check the result.
        '''
        wordsegmentation = WordSegmentation()

        cases = [
            {
                'data': {
                    'all_domains': [
                        'store.google.com',
                        'google.com',
                    ],
                },
                'expected': [
                    {
                        'analyser': 'WordSegmentation',
                        'output': {
                            'store.google.com': ['store', 'google', 'com'],
                            'google.com': ['google', 'com'],
                        },
                    },
                ],
                'description': 'A legit domain',
            },

            {
                'data': {
                    'all_domains': [
                        'www.facebook.com.msg40.site',
                    ],
                },
                'expected': [
                    {
                        'analyser': 'WordSegmentation',
                        'output': {
                            'www.facebook.com.msg40.site': ['www', 'facebook', 'com', 'msg40', 'site'],
                        },
                    },
                ],
                'description': 'Word segmentation using the domain separator (dot)',
            },

            {
                'data': {
                    'all_domains': [
                        'login-appleid.apple.com.managesuppport.co',
                    ],
                },
                'expected': [
                    {
                        'analyser': 'WordSegmentation',
                        'output': {
                            'login-appleid.apple.com.managesuppport.co': [
                                'login',
                                'apple',
                                'id',
                                'apple',
                                'com',
                                'manage',
                                'suppport',
                                'co'
                            ],
                        },
                    },
                ],
                'description': 'Word segmentation using dictionary',
            },

            {
                'data': {
                    'all_domains': [
                        'arch.mappleonline.com',
                    ],
                },
                'expected': [
                    {
                        'analyser': 'WordSegmentation',
                        'output': {
                            'arch.mappleonline.com': ['arch', 'm', 'apple', 'online', 'com'],
                        },
                    },
                ],
                'description': 'Failed to segment the word correctly',
            },

            {
                'data': {
                    'all_domains': [
                        'www.freybrothersinc.com',
                    ],
                },
                'expected': [
                    {
                        'analyser': 'WordSegmentation',
                        'output': {
                            'www.freybrothersinc.com': ['www', 'frey', 'brothers', 'com'],
                        },
                    },
                ],
                'description': 'Ignore certain stop words (inc) when doing segmentation',
            },
        ]

        for case in cases:
            with self.subTest(description=case['description']):
                got = wordsegmentation.run(case['data'])
                self.assertListEqual(got['analysers'], case['expected'], case['description'])

    def test_domain_matching(self):
        '''
        Combine the result of all domain matching analysers into one.

        Each case is run against every (include_tld, option) combination and
        ``expected`` maps that combination to the entries the analyser should
        add on top of the ones already present in the input record.
        '''
        # The first option decides if the TLD is included in the match
        options = [
            (True, DomainMatchingOption.SUBSET_MATCH),
            (False, DomainMatchingOption.SUBSET_MATCH),
            (True, DomainMatchingOption.ORDER_MATCH),
            (False, DomainMatchingOption.ORDER_MATCH),
        ]

        analysers = {o: DomainMatching(include_tld=o[0], option=o[1]) for o in options}

        cases = [
            {
                'data': {
                    'all_domains': [
                        'store.google.com',
                        'google.com',
                    ],

                    'analysers': [
                        {
                            'analyser': 'AhoCorasickDomainMatching',
                            'output': {
                                'store.google.com': ['google.com'],
                            },
                        },

                        {
                            'analyser': 'WordSegmentation',
                            'output': {
                                'store.google.com': ['store', 'google', 'com'],
                                'google.com': ['google', 'com'],
                            },
                        },
                    ],
                },
                'expected': {
                    (True, DomainMatchingOption.SUBSET_MATCH): [],
                    (False, DomainMatchingOption.SUBSET_MATCH): [],
                    (True, DomainMatchingOption.ORDER_MATCH): [],
                    (False, DomainMatchingOption.ORDER_MATCH): [],
                },
                'description': 'A legit domain so it will be skipped (no match reported)',
            },

            {
                'data': {
                    'all_domains': [
                        'login-appleid.managesuppport.com',
                    ],

                    'analysers': [
                        {
                            'analyser': 'AhoCorasickDomainMatching',
                            'output': {
                                'login-appleid.managesuppport.com': ['apple.com'],
                            },
                        },

                        {
                            'analyser': 'WordSegmentation',
                            'output': {
                                'login-appleid.managesuppport.com': [
                                    'login',
                                    'apple',
                                    'id',
                                    'manage',
                                    'suppport'
                                ],
                            },
                        },
                    ],
                },
                'expected': {
                    (True, DomainMatchingOption.SUBSET_MATCH): [],
                    (False, DomainMatchingOption.SUBSET_MATCH): [
                        {
                            'analyser': 'DomainMatching',
                            'output': {
                                'login-appleid.managesuppport.com': ['apple.com']
                            },
                        },
                    ],
                    (True, DomainMatchingOption.ORDER_MATCH): [],
                    (False, DomainMatchingOption.ORDER_MATCH): [
                        {
                            'analyser': 'DomainMatching',
                            'output': {
                                'login-appleid.managesuppport.com': ['apple.com']
                            },
                        },
                    ],
                },
                'description': 'Find a matching phishing domain',
            },

            {
                'data': {
                    'all_domains': [
                        'djunprotected.com',
                        'www.djunprotected.com'
                    ],

                    'analysers': [
                        {
                            'analyser': 'AhoCorasickDomainMatching',
                            'output': {
                                'djunprotected.com': ['ted.com']
                            }
                        },

                        {
                            'analyser': 'WordSegmentation',
                            'output': {
                                'djunprotected.com': ['dj', 'unprotected', 'com'],
                                'www.djunprotected.com': ['www', 'dj', 'unprotected', 'com']
                            }
                        },
                    ],
                },
                'expected': {
                    (True, DomainMatchingOption.SUBSET_MATCH): [],
                    (False, DomainMatchingOption.SUBSET_MATCH): [],
                    (True, DomainMatchingOption.ORDER_MATCH): [],
                    (False, DomainMatchingOption.ORDER_MATCH): [],
                },
                # No option expects a match here, so the message must not
                # claim that a phishing domain is found.
                'description': 'Skip a substring match (ted.com) that the word segmentation does not support',
            },
        ]

        for case in cases:
            for option, analyser in analysers.items():
                with self.subTest(description=case['description'], option=option):
                    expected = copy.deepcopy(case['data']['analysers'])
                    expected.extend(case['expected'][option])

                    # Give every analyser its own copy of the record. The same
                    # case is fed to four analysers; presumably run() appends
                    # to record['analysers'] in place (TODO confirm against
                    # DomainMatching.run), which would otherwise leak the
                    # output of one option into the input of the next.
                    record = copy.deepcopy(case['data'])

                    got = analyser.run(record)
                    self.assertListEqual(got['analysers'], expected,
                                         '{} ({})'.format(case['description'], option))

    def test_bulk_domain_marker(self):
        '''
        Test the bulk domain analyser.
        '''
        bulky = BulkDomainMarker()

        cases = [
            {
                'data': {
                    'all_domains': [
                        'store.google.com',
                        'google.com',
                    ],
                },
                'expected': [
                    {'analyser': 'BulkDomainMarker', 'output': False}
                ],
                'description': 'Not a bulk record',
            },
            {
                'data': {
                    'all_domains': [
                        'a.com',
                        'b.com',
                        'c.com',
                        'd.com',
                        'e.com',
                        'f.com',
                        'g.com',
                        'h.com',
                        'i.com',
                        'j.com',
                        'k.com',
                        'l.com',
                        'm.com',
                        'n.com',
                        'o.com',
                    ],
                },
                'expected': [
                    {'analyser': 'BulkDomainMarker', 'output': True}
                ],
                'description': 'Mark a bulk record',
            },
        ]

        for case in cases:
            with self.subTest(description=case['description']):
                got = bulky.run(case['data'])
                self.assertListEqual(got['analysers'], case['expected'], case['description'])

    def test_idn_decoder(self):
        '''
        Test the IDNA decoder.

        The decoder is checked through the ``all_domains`` list of the record
        it returns.
        '''
        decoder = IDNADecoder()

        cases = [
            {
                'data': {
                    'all_domains': [
                        'store.google.com',
                        'google.com',
                    ],
                },
                'expected': [
                    'store.google.com',
                    'google.com',
                ],
                'description': 'There is no domain in IDNA format',
            },
            {
                'data': {
                    'all_domains': [
                        'xn--f1ahbgpekke1h.xn--p1ai',
                        'tigrobaldai.lt'
                    ],
                },
                'expected': [
                    'укрэмпужск.рф',
                    'tigrobaldai.lt'
                ],
                'description': 'Convert some domains in IDNA format',
            },
            {
                'data': {
                    'all_domains': [
                        'xn--foobar.xn--me',
                    ],
                },
                'expected': [
                    'xn--foobar.xn--me',
                ],
                'description': 'Handle an invalid IDNA string',
            },
            {
                'data': {
                    'all_domains': [
                        '*.xn---35-5cd3cln6a9bzb.xn--p1ai',
                        '*.nl-dating-vidkid.com',
                    ],
                },
                'expected': [
                    '*.отмычка-35.рф',
                    '*.nl-dating-vidkid.com',
                ],
                'description': 'Handle an invalid code point',
            },
        ]

        for case in cases:
            with self.subTest(description=case['description']):
                got = decoder.run(case['data'])
                self.assertListEqual(got['all_domains'], case['expected'], case['description'])

    def test_homoglyphs_decoder(self):
        '''
        Test the homoglyphs decoder.

        Each case builds its own decoder because the ``greedy`` flag is part
        of the case; the result is checked through ``all_domains``.
        '''
        cases = [
            {
                'data': {
                    'all_domains': [
                        'store.google.com',
                        '*.google.com',
                    ],
                },
                'greedy': False,
                'expected': [
                    'store.google.com',
                    '*.google.com',
                ],
                'description': 'Normal domains in ASCII',
            },
            {
                'data': {
                    'all_domains': [
                        'store.google.com',
                        '*.google.com',
                    ],
                },
                'greedy': True,
                'expected': [
                    'store.google.com',
                    'store.google.corn',
                    'store.googie.com',
                    'store.googie.corn',
                    '*.google.com',
                    '*.google.corn',
                    '*.googie.com',
                    '*.googie.corn'
                ],
                'description': 'Normal domains in ASCII with a greedy decoder',
            },
            {
                'data': {
                    'all_domains': [
                        'укрэмпужск.рф',
                        'tigrobaldai.lt',
                    ],
                },
                'greedy': False,
                'expected': [
                    'yкpэмпyжcк.pф',
                    'tigrobaldai.lt',
                ],
                'description': 'Normal domains in Unicode',
            },
            {
                'data': {
                    'all_domains': [
                        'укрэмпужск.рф',
                        'tigrobaldai.lt',
                    ],
                },
                'greedy': True,
                'expected': [
                    'yкpэмпyжcк.pф',
                    'tigrobaldai.lt',
                    'tigrobaldai.it',
                    'tigrobaidai.lt',
                    'tigrobaidai.it',
                ],
                'description': 'Normal domains in Unicode with a greedy decoder',
            },
            {
                'data': {
                    'all_domains': [
                        # MATHEMATICAL MONOSPACE SMALL P 1D699
                        '*.𝗉aypal.com',

                        # MATHEMATICAL SANS-SERIF BOLD SMALL RHO
                        'phishing.𝗉ay𝞀al.com',
                    ],
                },
                'greedy': False,
                'expected': [
                    '*.paypal.com',
                    'phishing.paypal.com',
                ],
                'description': 'Phishing example in confusable homoglyphs'
            },
            {
                'data': {
                    'all_domains': [
                        # MATHEMATICAL MONOSPACE SMALL P 1D699
                        '*.𝗉aypal.com',

                        # MATHEMATICAL SANS-SERIF BOLD SMALL RHO
                        'phishing.𝗉ay𝞀al.com',
                    ],
                },
                'greedy': True,
                'expected': [
                    '*.paypal.com',
                    '*.paypal.corn',
                    '*.paypai.com',
                    '*.paypai.corn',
                    'phishing.paypal.com',
                    'phishing.paypal.corn',
                    'phishing.paypai.com',
                    'phishing.paypai.corn',
                ],
                'description': 'Phishing example in confusable homoglyphs with a greedy decoder'
            },
        ]

        for case in cases:
            with self.subTest(description=case['description'], greedy=case['greedy']):
                decoder = HomoglyphsDecoder(greedy=case['greedy'])

                got = decoder.run(case['data'])
                self.assertListEqual(got['all_domains'], case['expected'], case['description'])
601 |
--------------------------------------------------------------------------------
/scripts/sundry/certstream-domain-features.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 145,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import json\n",
10 | "import numpy as np\n",
11 | "\n",
12 | "domains = []\n",
13 | "features = []\n",
14 | "\n",
15 | "with open('domain-matching.20181014.decoded') as f:\n",
16 | " for line in f:\n",
17 | " record = json.loads(line.strip()) \n",
18 | " \n",
19 | " domains.extend(list(record['analysers'][0]['output'].keys()))\n",
20 | " features.extend(record['analysers'][-1]['output'])"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 146,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "data": {
30 | "text/plain": [
31 | "14004"
32 | ]
33 | },
34 | "execution_count": 146,
35 | "metadata": {},
36 | "output_type": "execute_result"
37 | }
38 | ],
39 | "source": [
40 | "len(domains)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 147,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/plain": [
51 | "14004"
52 | ]
53 | },
54 | "execution_count": 147,
55 | "metadata": {},
56 | "output_type": "execute_result"
57 | }
58 | ],
59 | "source": [
60 | "len(features)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 148,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "columns = ['NumberOfParts', 'Length', 'LongestPart', 'TLD', 'Randomness']"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 149,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "import pandas as pd\n",
79 | "\n",
80 | "df = pd.DataFrame(data=features, columns=columns, index=domains)"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 150,
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "data": {
90 | "text/plain": [
91 | "(14004, 5)"
92 | ]
93 | },
94 | "execution_count": 150,
95 | "metadata": {},
96 | "output_type": "execute_result"
97 | }
98 | ],
99 | "source": [
100 | "df.shape"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 151,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "data": {
110 | "text/html": [
111 | "
\n",
112 | "\n",
125 | "
\n",
126 | " \n",
127 | " \n",
128 | " | \n",
129 | " NumberOfParts | \n",
130 | " Length | \n",
131 | " LongestPart | \n",
132 | " TLD | \n",
133 | " Randomness | \n",
134 | "
\n",
135 | " \n",
136 | " \n",
137 | " \n",
138 | " | www.sawyerrshousegivebackafrica.co.uk | \n",
139 | " 4 | \n",
140 | " 37 | \n",
141 | " 27 | \n",
142 | " 2 | \n",
143 | " 0.0 | \n",
144 | "
\n",
145 | " \n",
146 | "
\n",
147 | "
"
148 | ],
149 | "text/plain": [
150 | " NumberOfParts Length LongestPart \\\n",
151 | "www.sawyerrshousegivebackafrica.co.uk 4 37 27 \n",
152 | "\n",
153 | " TLD Randomness \n",
154 | "www.sawyerrshousegivebackafrica.co.uk 2 0.0 "
155 | ]
156 | },
157 | "execution_count": 151,
158 | "metadata": {},
159 | "output_type": "execute_result"
160 | }
161 | ],
162 | "source": [
163 | "df.sample()"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 152,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "data": {
173 | "text/plain": [
174 | "count 6.000000\n",
175 | "mean 2334.000000\n",
176 | "std 2431.878862\n",
177 | "min 5.000000\n",
178 | "25% 109.000000\n",
179 | "50% 2355.000000\n",
180 | "75% 4507.250000\n",
181 | "max 4704.000000\n",
182 | "Name: NumberOfParts, dtype: float64"
183 | ]
184 | },
185 | "execution_count": 152,
186 | "metadata": {},
187 | "output_type": "execute_result"
188 | }
189 | ],
190 | "source": [
191 | "df['NumberOfParts'].value_counts().describe()"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 153,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "data": {
201 | "text/plain": [
202 | ""
203 | ]
204 | },
205 | "execution_count": 153,
206 | "metadata": {},
207 | "output_type": "execute_result"
208 | },
209 | {
210 | "data": {
211 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmAAAAEvCAYAAADijX30AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFqtJREFUeJzt3X+s1nX9//HHxTmAID+OKJyji/yWWrp+bpWIsFzHzwEVTbDDam0tMGeSy5FlM1v+SKHPFpUzvksZG9NWm4IebB03NCgPTM2VU8ppjYoNGueiEA5g6PEcru8ffjvLj/Xhh16vwzncbn/Bm+t6X8/3c+xwP9d1uK5KrVarBQCAYkYM9gAAAMcbAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUFjjYA/wvzl48GD6+4fmJyU1NFSG7OxDlZ2XZ+fl2Xl5dl7eUN35yJENh33bYzrA+vtr2bPnH4M9xlFpaho7ZGcfquy8PDsvz87Ls/PyhurOJ08ef9i39RIkAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQ2DH9WZD1Nm7CmIwZXb8VHMlnQh2JA6/2Zf/eA3U5NwBQf8d1gI0Z3Zj/c2PnYI9xxLb+95zsH+whAICj5iVIAIDCBBgAQGECDACgMAEGAFCYAAMAKEyAAQAUJsAAAAo7rt8HjPJOntiYEaPG1O389Xrz24O9B7Krp68u5wbg+CPAKGrEqDHJrRMHe4wjNuLWniT7BnsMAIYJL0ECABQmwAAAChNgAACFCTAAgMIEGABAYf4XJAxz45pGZszIE+p2/nq99ceB117J/j2v1eXcAINNgMEwN2bkCfnAvR8Y7DGO2O8+/7vsjwADhicvQQIAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADACjssAOsv78/c+fOzRe/+MUkybZt2zJ//vy0tbVl8eLF6e3tTZL09vZm8eLFaWtry/z587N9+/aBc9xzzz1pa2vL7Nmzs3Hjxrf5UgAAhobDDrD77rsvZ5xxxsDvly1blgULFuSxxx7LhAkTsmbNmiTJ6tWrM2HChDz22GNZsGBBli1bliTZsmVLOjs709nZmZUrV+a2225Lf3//23w5AADHvsMKsO7u7vzqV79Ke3t7kqRWq+Wpp57K7NmzkyTz5s3L+vXrkyQbNmzIvHnzkiSzZ8/Ok08+mVqtlvXr12fOnDkZNWpUpk6dmtNPPz2bN2+uxzUBABzTDivAli5dmhtuuCEjRrx+8927d2fChAlpbGxMkrS0tKRarSZJqtVqTj311CRJY2Njxo8fn927d6daraalpWXgnM3NzQP3AQA4njQe6ga//OUvM2nSpLz//e/Pr3/96xIzDWhoqKSpaWzRxxwq7KU8Oy/Pzt+soWGEvRRm5+UdDzs/ZIA988wz2bBhQ7q6uvLqq69m//79WbJkSfbu3Zu+vr40Njamu7s7zc3NSV5/ZmvHjh1paWlJX19f9u3bl5NOOinNzc3p7u4eOG+1Wh24z3/S31/Lnj3/eIuX+J9Nnjy+bueut3rupZ7svDw7H16amsbaS2F2Xt5Q3fmRfL095EuQX/3qV9PV1ZUNGzbk+9//fs4777x873vfy7Rp07Ju3bokSUdHR1pbW5Mkra2t6ejoSJKsW7cu5513XiqVSlpbW9PZ2Zne3t5s27YtW7duzQc/+MGjuT4AgCHtqN8H7IYbbsiqVavS1taWPXv2ZP78+UmS9vb27NmzJ21tbVm1alW+9rWvJUnOOuusXHzxxbnkkkty1VVX5eabb05DQ8PbcxUAAEPIIV+C/FfTpk3LtGnTkiRTp04deOuJfzV69Ojcdddd//b+
ixYtyqJFi45iTACA4cM74QMAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoLBDBtirr76a9vb2fPKTn8ycOXNy1113JUm2bduW+fPnp62tLYsXL05vb2+SpLe3N4sXL05bW1vmz5+f7du3D5zrnnvuSVtbW2bPnp2NGzfW6ZIAAI5thwywUaNG5d57783PfvazrF27Nhs3bsyzzz6bZcuWZcGCBXnssccyYcKErFmzJkmyevXqTJgwIY899lgWLFiQZcuWJUm2bNmSzs7OdHZ2ZuXKlbntttvS399f36sDADgGHTLAKpVKTjzxxCRJX19f+vr6UqlU8tRTT2X27NlJknnz5mX9+vVJkg0bNmTevHlJktmzZ+fJJ59MrVbL+vXrM2fOnIwaNSpTp07N6aefns2bN9frugAAjlmH9TNg/f39ufzyy3P++efn/PPPz9SpUzNhwoQ0NjYmSVpaWlKtVpMk1Wo1p556apKksbEx48ePz+7du1OtVtPS0jJwzubm5oH7AAAcTxoP50YNDQ15+OGHs3fv3lx77bX585//XO+5/v/jVtLUNLbIYw019lKenZdn52/W0DDCXgqz8/KOh50fVoD904QJEzJt2rQ8++yz2bt3b/r6+tLY2Jju7u40Nzcnef2ZrR07dqSlpSV9fX3Zt29fTjrppDQ3N6e7u3vgXNVqdeA+/0l/fy179vzjKC7r8EyePL5u5663eu6lnuy8PDsfXpqaxtpLYXZe3lDd+ZF8vT3kS5AvvfRS9u7dmyR55ZVX8sQTT+SMM87ItGnTsm7duiRJR0dHWltbkyStra3p6OhIkqxbty7nnXdeKpVKWltb09nZmd7e3mzbti1bt27NBz/4wSO+OACAoe6Qz4Dt3LkzN954Y/r7+1Or1XLRRRflE5/4RM4888x85StfyZ133plzzjkn8+fPT5K0t7fnhhtuSFtbWyZOnJgf/OAHSZKzzjorF198cS655JI0NDTk5ptvTkNDQ32vDgDgGHTIADv77LOzdu3aNx2fOnXqwFtP/KvRo0cPvFfY/7Ro0aIsWrToKMYEABg+vBM+AEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKO2SA7dixI5/73OdyySWXZM6cObn33nuTJHv27MnChQsza9asLFy4MD09PUmSWq2WO+64I21tbbnsssvy/PPPD5yro6Mjs2bNyqxZs9LR0VGnSwIAOLYdMsAaGhpy44035pFHHsn999+fn/70p9myZUtWrFiR6dOn59FHH8306dOzYsWKJElXV1e2bt2aRx99NLfffntuvfXWJK8H2/Lly/PAAw9k9erVWb58+UC0AQAcTw4ZYFOmTMn73ve+JMm4cePy7ne/O9VqNevXr8/cuXOTJHPnzs0vfvGLJBk4XqlU8uEPfzh79+7Nzp07s2nTpsyYMSNNTU2ZOHFiZsyYkY0bN9bx0gAAjk1H9DNg27dvzwsvvJAPfehD
2bVrV6ZMmZIkmTx5cnbt2pUkqVaraWlpGbhPS0tLqtXqm443NzenWq2+HdcAADCkNB7uDV9++eVcd911uemmmzJu3Lg3/FmlUkmlUnnbh2toqKSpaezbft7hwF7Ks/Py7PzNGhpG2Ethdl7e8bDzwwqw1157Ldddd10uu+yyzJo1K0ly8sknZ+fOnZkyZUp27tyZSZMmJXn9ma3u7u6B+3Z3d6e5uTnNzc15+umnB45Xq9Wce+65/+vj9vfXsmfPP474og7X5Mnj63bueqvnXurJzsuz8+GlqWmsvRRm5+UN1Z0fydfbQ74EWavV8s1vfjPvfve7s3DhwoHjra2tWbt2bZJk7dq1ufDCC99wvFar5dlnn8348eMzZcqUzJw5M5s2bUpPT096enqyadOmzJw580ivDQBgyDvkM2C//e1v8/DDD+c973lPLr/88iTJ9ddfn6uvvjqLFy/OmjVrctppp+XOO+9MklxwwQV5/PHH09bWljFjxmTp0qVJkqampnzpS19Ke3t7kuTaa69NU1NTva4LAOCYdcgA++hHP5o//OEP//bP/vmeYP+qUqnklltu+be3b29vHwgwAIDjlXfCBwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYYcMsG984xuZPn16Lr300oFje/bsycKFCzNr1qwsXLgwPT09SZJarZY77rgjbW1tueyyy/L8888P3KejoyOzZs3KrFmz0tHRUYdLAQAYGg4ZYFdccUVWrlz5hmMrVqzI9OnT8+ijj2b69OlZsWJFkqSrqytbt27No48+mttvvz233nprkteDbfny5XnggQeyevXqLF++fCDaAACON4cMsI997GOZOHHiG46tX78+c+fOTZLMnTs3v/jFL95wvFKp5MMf/nD27t2bnTt3ZtOmTZkxY0aampoyceLEzJgxIxs3bqzD5QAAHPuO6mfAdu3alSlTpiRJJk+enF27diVJqtVqWlpaBm7X0tKSarX6puPNzc2pVqtvZW4AgCGr8a2eoFKppFKpvB2zvElDQyVNTWPrcu6hzl7Ks/Py7PzNGhpG2Ethdl7e8bDzowqwk08+OTt37syUKVOyc+fOTJo0Kcnrz2x1d3cP3K67uzvNzc1pbm7O008/PXC8Wq3m3HPPPeTj9PfXsmfPP45mxMMyefL4up273uq5l3qy8/LsfHhpahprL4XZeXlDdedH8vX2qF6CbG1tzdq1a5Mka9euzYUXXviG47VaLc8++2zGjx+fKVOmZObMmdm0aVN6enrS09OTTZs2ZebMmUfz0AAAQ94hnwG7/vrr8/TTT2f37t35+Mc/ni9/+cu5+uqrs3jx4qxZsyannXZa7rzzziTJBRdckMcffzxtbW0ZM2ZMli5dmiRpamrKl770pbS3tydJrr322jQ1NdXxsgAAjl2HDLDvf//7//b4vffe+6ZjlUolt9xyy7+9fXt7+0CAAQAcz7wTPgBAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKCwxsEeAGC4OWncyDSOOaFu5588eXxdztt34JXs3v9aXc4NvJEA
A3ibNY45IS+cfc5gj3HEznnxhUSAQRFeggQAKEyAAQAUJsAAAAoTYAAAhQkwAIDCBBgAQGECDACgMAEGAFCYAAMAKEyAAQAUVjzAurq6Mnv27LS1tWXFihWlHx4AYNAVDbD+/v58+9vfzsqVK9PZ2Zmf//zn2bJlS8kRAAAGXdEP4968eXNOP/30TJ06NUkyZ86crF+/PmeeeWbJMQAYZiZOGJNRo+v3T9rkyePrct7eV/vSs/dAXc7Nsa1ogFWr1bS0tAz8vrm5OZs3by45AgDD0KjRjfm/12wY7DGO2LV3tw72CAySogF2pEaObKjbdx3/tPW/59T1/PVS773U1a09gz3BURnKO//d53832CMclaG883NefGGwRzgqQ3nnQzVmhvLO62m476Xoz4A1Nzenu7t74PfVajXNzc0lRwAAGHRFA+wDH/hAtm7dmm3btqW3tzednZ1pbR2a37EAABytoi9BNjY25uabb85VV12V/v7+fOpTn8pZZ51VcgQAgEFXqdVqtcEeAgDgeOKd8AEAChNgAACFCTAAgMIEGABAYQKsDr7+9a8P9gjHhT/96U958skn8/LLL7/heFdX1yBNNPxt3rx54NMrtmzZklWrVuXxxx8f5KmOH7/5zW+yatWqbNq0abBHGbaee+657N+/P0nyyiuv5K677so111yT7373u9m3b98gTzc83XfffdmxY8dgj1Gc/wX5Fl1zzTVvOvbrX/8606ZNS5LcfffdpUc6Ltx33335yU9+kjPOOCMvvvhibrrppvzXf/1XkmTevHnp6OgY5AmHn+XLl6erqyt9fX2ZMWNGnnvuuUybNi1PPPFEZs6cmUWLFg32iMNOe3t71qxZkyR54IEH8pOf/CRtbW3ZtGlTWltbc/XVVw/yhMPPnDlz8vDDD6exsTHf+ta3csIJJ2T27Nl56qmn8uKLL2b58uWDPeKw85GPfCRjxozJO9/5zsyZMycXX3xxJk2aNNhj1d0x/VFEQ0G1Ws0ZZ5yR+fPnp1KppFar5fe//32uvPLKwR5tWFu9enUeeuihnHjiidm+fXuuu+66/PWvf83nP//5+J6iPtatW5e1a9emt7c3M2bMSFdXV8aNG5cvfOELmT9/vgCrg76+voFf33///Vm1alUmTZqUK6+8Mp/+9KcFWB0cPHgwjY2v/9P4+9//fuCbuY9+9KO5/PLLB3O0YWvq1Kl56KGH8sQTT+SRRx7JD3/4w7zvfe/LpZdemra2towbN26wR6wLL0G+RQ8++GDe//735+6778748eMzbdq0jB49Oueee27OPffcwR5v2Dp48GBOPPHEJMk73vGO/PjHP05XV1e+853vCLA6aWhoSENDw8B3qv/8onjCCSdkxAhfSurh4MGD6enpye7du1Or1QaeFRg7dmwaGhoGebrh6ayzzsqDDz6YJDn77LPzu9+9/jmqf/nLXwbCjLdXpVLJiBEjMnPmzCxdujQbN27MZz/72WzcuHHglY3hyN+mt2jEiBFZsGBBLrrooixdujSnnHJK+vv7B3usYe/kk0/OCy+8kHPOOSdJcuKJJ+aee+7JTTfdlD/+8Y+DPN3wNHLkyBw4cCBjxozJQw89NHB83759AqxO9u/fnyuuuCK1Wi2VSiU7d+7MlClT8vLLL/tGo06WLFmSJUuW5Ec/+lFOOumkfOYzn0lLS0tOPfXULFmyZLDHG5b+59/lkSNH5sILL8yFF16YAwcODNJU9ednwN5mv/rVr/LMM8/k+uuvH+xRhrXu7u40NDRk8uTJb/qz3/72t/nIRz4yCFMNb729vRk1atSbjr/00kv529/+lve+972DMNXx6cCBA/n73/+eqVOnDvYow9b+/fuzffv29PX1paWlJaeccspgjzRs/eUvf8m73vWuwR6jOAEGAFCY1w0AAAoTYAAAhQkwAIDCBBgAQGECDACgsP8HRsWezpLvOk4AAAAASUVORK5CYII=\n",
212 | "text/plain": [
213 | ""
214 | ]
215 | },
216 | "metadata": {},
217 | "output_type": "display_data"
218 | }
219 | ],
220 | "source": [
221 | "import matplotlib.pyplot as plt\n",
222 | "import seaborn\n",
223 | "\n",
224 | "seaborn.set_style(\"darkgrid\")\n",
225 | "\n",
226 | "plt.figure(figsize=(10,5))\n",
227 | "df['NumberOfParts'].value_counts().plot(kind='bar')"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 154,
233 | "metadata": {},
234 | "outputs": [
235 | {
236 | "data": {
237 | "text/plain": [
238 | ""
239 | ]
240 | },
241 | "execution_count": 154,
242 | "metadata": {},
243 | "output_type": "execute_result"
244 | },
245 | {
246 | "data": {
247 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlMAAAEvCAYAAABhSUTPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAADlhJREFUeJzt3V1o3Qf9x/HPabJqa5fG1jyoRP9uTipVvHASM0Qws43SSTO1InrhClIFoWyVSX1EhA3ROYp6FQbSXQhitamsgz1kSoulgsp8wAcYdtDJmkA0a+u21pzmfzEs7v8fnCzftCenfb3ucnJyzgeaX86b8zs9p7GwsLAQAACWZFW7BwAAdDIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUNB9Oe/swoULaTZ9eg2tdXU1/K4Ay87fFl6Oa67pWtT1LmtMNZsLmZt79nLeJR2qt3et3xVg2fnbwsvR13ftoq7nNB8AQIGYAgAoEFMAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFl/Wz+Xhp63rWZM0r/FP8X4v9TKSryXPn5nP29HPtngHAf/EIvgKseUV3/mfv4XbPoAM8+c1tOdvuEQC8iNN8AAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAWLjqlms5nx8fF85jOfSZKcPHkyO3bsyJYtW3L77bfn/Pnzl2wkAMBKteiYuv/++3P99ddf/Pqee+7JbbfdlkceeSQ9PT05cODAJRkIALCSLSqmTp06lV/84hf56Ec/miRZWFjI8ePHMzY2liS59dZbMzU1delWAgCsUIuKqbvvvjt33nlnVq164er//Oc/09PTk+7u7iTJ4OBgpqenL91KAIAVqrvVFX7+859nw4YNedvb3pZf/epXpTvr6mqkt3dt6TbgaucYgqXr6lrlGGLZtYyp3/72t3nsscdy5MiRnDt3LmfPns1dd92V06dPZ35+Pt3d3Tl16lQGBgZa3lmzuZC5uWeXZfiVpK/v2nZPoIM4hmDpenvXOoZYtMU+Prc8zff5z38+R44cyWOPPZZ777037373u/Od73wnw8PDeeihh5IkBw8ezOjoaG0xAEAHWvL7TN155535wQ9+kC1btmRubi47duxYzl0AAB2h5Wm+/zY8PJzh4eEkydDQkLdDAACuet4BHQCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgQEwBABR0t7rCuXPn8slPfjLnz59Ps9nM2NhYdu/enZMnT2bPnj2Zm5vL5s2b861vfSurV6++HJsBAFaMls9MrV69Ovv378/PfvazTE5O5ujRo3n88cdzzz335LbbbssjjzySnp6eHDhw4HLsBQBYUVrGVKPRyKte9aokyfz8fObn59NoNHL8+PGMjY0lSW699dZMTU1d2qUAACvQol4z1Ww2s3379tx000256aabMjQ0lJ6ennR3v3CWcHBwMNPT05d0KADAStTyNVNJ0tXVlUOHDuX06dP53Oc+l7/97W9LurOurkZ6e9cu6WeBFziGYOm6ulY5hlh2i4qp/+jp6cnw8HAef/zxnD59OvPz8+nu7s6pU6cyMDDQ8uebzYXMzT27
5LFXqr6+a9s9gQ7iGIKl6+1d6xhi0Rb7+NzyNN8//vGPnD59Okny/PPP59ixY7n++uszPDychx56KEly8ODBjI6OFuYCAHSmls9MzczMZO/evWk2m1lYWMgHPvCBvO9978ub3/zm3HHHHdm3b1/e+ta3ZseOHZdjLwDAitIypjZt2pTJycn/d/nQ0JC3QwAArnreAR0AoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUdLe6wtNPP50vfOELmZ2dTaPRyMc+9rF86lOfytzcXO644478/e9/z+tf//rs27cv69evvxybAQBWjJbPTHV1dWXv3r158MEH86Mf/Sg//OEP88QTT2RiYiIjIyN5+OGHMzIykomJicuxFwBgRWkZU/39/dm8eXOSZN26dbnuuusyPT2dqampjI+PJ0nGx8fz6KOPXtqlAAAr0Mt6zdRTTz2VP//5z3nHO96R2dnZ9Pf3J0n6+voyOzt7SQYCAKxkLV8z9R//+te/snv37nzpS1/KunXrXvS9RqORRqPR8ja6uhrp7V378lcCFzmGYOm6ulY5hlh2i4qpf//739m9e3c+9KEPZevWrUmSjRs3ZmZmJv39/ZmZmcmGDRta3k6zuZC5uWdri69AfX3XtnsCHcQxBEvX27vWMcSiLfbxueVpvoWFhXz5y1/Oddddl507d168fHR0NJOTk0mSycnJ3HzzzUucCgDQuVo+M/Wb3/wmhw4dylve8pZs3749SbJnz57s2rUrt99+ew4cOJDXve512bdv3yUfCwCw0rSMqRtvvDF//etfX/J7+/fvX/ZBAACdxDugAwAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgIKWMfXFL34xIyMjueWWWy5eNjc3l507d2br1q3ZuXNnnnnmmUs6EgBgpWoZUx/+8Idz3333veiyiYmJjIyM5OGHH87IyEgmJiYu2UAAgJWsZUy9613vyvr161902dTUVMbHx5Mk4+PjefTRRy/NOgCAFW5Jr5manZ1Nf39/kqSvry+zs7PLOgoAoFN0V2+g0Wik0Wgs6rpdXY309q6t3iVc1RxDsHRdXascQyy7JcXUxo0bMzMzk/7+/szMzGTDhg2L+rlmcyFzc88u5S6vaH1917Z7Ah3EMQRL19u71jHEoi328XlJp/lGR0czOTmZJJmcnMzNN9+8lJsBAOh4LWNqz549+fjHP54TJ07kve99b3784x9n165d+eUvf5mtW7fm2LFj2bVr1+XYCgCw4rQ8zXfvvfe+5OX79+9f9jEAAJ3GO6ADABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgoLvdAwC4NDau786q1WvaPWPF6eu7
tt0TVpwL55/L7DPz7Z7RscQUwBVq1eo1ydfXt3sGHWDV159JcqbdMzqW03wAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQEEppo4cOZKxsbFs2bIlExMTy7UJAKBjLDmmms1mvvGNb+S+++7L4cOH88ADD+SJJ55Yzm0AACvekmPq97//fd74xjdmaGgoq1evzrZt2zI1NbWc2wAAVrwlx9T09HQGBwcvfj0wMJDp6ellGQUA0Cm6L+edXXNNV/r6rr2cd9kxnvzmtnZPoEM4hnhZvv5MuxfQIfxtWbolPzM1MDCQU6dOXfx6eno6AwMDyzIKAKBTLDmm3v72t+fJJ5/MyZMnc/78+Rw+fDijo6PLuQ0AYMVb8mm+7u7ufO1rX8unP/3pNJvNfOQjH8kNN9ywnNsAAFa8xsLCwkK7RwAAdCrvgA4AUCCmAAAKxBQAQIGYAgAoEFO03e9+97ucPXs2SfL888/nu9/9bj772c/m29/+ds6cOdPmdUCnuv/++/P000+3ewZXAf+bj7bbtm1bDh06lO7u7nz1q1/NK1/5yoyNjeX48eP5y1/+ku9///vtngh0oHe+851Zs2ZN3vCGN2Tbtm354Ac/mA0bNrR7Flegy/pxMvBSLly4kO7uF34V//jHP+bgwYNJkhtvvDHbt29v5zSggw0NDeWnP/1pjh07lgcffDDf+973snnz5txyyy3ZsmVL1q1b1+6JXCGc5qPtbrjhhvzkJz9JkmzatCl/+MMfkiQnTpy4GFkAL1ej0ciqVavynve8J3fffXeOHj2aT3ziEzl69Gje//73t3seVxCn+Wi7M2fO5K677sqvf/3rvPrVr86f/vSnDA4O5rWvfW2+8pWvZNOmTe2eCHSg8fHxTE5OvuT3nnvuuaxZs+YyL+JKJaZYMc6ePZunnnoq8/PzGRwczGte85p2TwI62IkTJ/KmN72p3TO4CogpAIACr5kCACgQUwAABWIKAKBATAEAFIgpAICC/wWjtKP6Gu64AgAAAABJRU5ErkJggg==\n",
248 | "text/plain": [
249 | ""
250 | ]
251 | },
252 | "metadata": {},
253 | "output_type": "display_data"
254 | }
255 | ],
256 | "source": [
257 | "plt.figure(figsize=(10,5))\n",
258 | "df.loc[df['NumberOfParts'] > 5]['NumberOfParts'].value_counts().plot(kind='bar')"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 156,
264 | "metadata": {},
265 | "outputs": [
266 | {
267 | "data": {
268 | "text/html": [
269 | "\n",
270 | "\n",
283 | "
\n",
284 | " \n",
285 | " \n",
286 | " | \n",
287 | " NumberOfParts | \n",
288 | " Length | \n",
289 | " LongestPart | \n",
290 | " TLD | \n",
291 | " Randomness | \n",
292 | "
\n",
293 | " \n",
294 | " \n",
295 | " \n",
296 | " | waws-prod-blu-43680001.state.p.azurewebsites.windows.net | \n",
297 | " 6 | \n",
298 | " 56 | \n",
299 | " 22 | \n",
300 | " 3 | \n",
301 | " 0.0 | \n",
302 | "
\n",
303 | " \n",
304 | "
\n",
305 | "
"
306 | ],
307 | "text/plain": [
308 | " NumberOfParts Length \\\n",
309 | "waws-prod-blu-43680001.state.p.azurewebsites.wi... 6 56 \n",
310 | "\n",
311 | " LongestPart TLD \\\n",
312 | "waws-prod-blu-43680001.state.p.azurewebsites.wi... 22 3 \n",
313 | "\n",
314 | " Randomness \n",
315 | "waws-prod-blu-43680001.state.p.azurewebsites.wi... 0.0 "
316 | ]
317 | },
318 | "execution_count": 156,
319 | "metadata": {},
320 | "output_type": "execute_result"
321 | }
322 | ],
323 | "source": [
324 | "df.loc[df['NumberOfParts'] > 5].sample()"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 157,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "data": {
334 | "text/html": [
335 | "\n",
336 | "\n",
349 | "
\n",
350 | " \n",
351 | " \n",
352 | " | \n",
353 | " NumberOfParts | \n",
354 | " Length | \n",
355 | "
\n",
356 | " \n",
357 | " \n",
358 | " \n",
359 | " | NumberOfParts | \n",
360 | " 1.000000 | \n",
361 | " 0.604539 | \n",
362 | "
\n",
363 | " \n",
364 | " | Length | \n",
365 | " 0.604539 | \n",
366 | " 1.000000 | \n",
367 | "
\n",
368 | " \n",
369 | "
\n",
370 | "
"
371 | ],
372 | "text/plain": [
373 | " NumberOfParts Length\n",
374 | "NumberOfParts 1.000000 0.604539\n",
375 | "Length 0.604539 1.000000"
376 | ]
377 | },
378 | "execution_count": 157,
379 | "metadata": {},
380 | "output_type": "execute_result"
381 | }
382 | ],
383 | "source": [
384 | "df[['NumberOfParts', 'Length']].corr()"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 158,
390 | "metadata": {},
391 | "outputs": [
392 | {
393 | "data": {
394 | "text/plain": [
395 | ""
396 | ]
397 | },
398 | "execution_count": 158,
399 | "metadata": {},
400 | "output_type": "execute_result"
401 | },
402 | {
403 | "data": {
404 | "image/png": "\n",
405 | "text/plain": [
406 | ""
407 | ]
408 | },
409 | "metadata": {},
410 | "output_type": "display_data"
411 | }
412 | ],
413 | "source": [
414 | "plt.figure(figsize=(10,5))\n",
415 | "seaborn.boxplot(data=df, x='NumberOfParts', y='Length')"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 159,
421 | "metadata": {},
422 | "outputs": [
423 | {
424 | "data": {
425 | "text/html": [
426 | "\n",
427 | "\n",
440 | "
\n",
441 | " \n",
442 | " \n",
443 | " | \n",
444 | " NumberOfParts | \n",
445 | " Length | \n",
446 | " LongestPart | \n",
447 | " TLD | \n",
448 | " Randomness | \n",
449 | "
\n",
450 | " \n",
451 | " \n",
452 | " \n",
453 | "
\n",
454 | "
"
455 | ],
456 | "text/plain": [
457 | "Empty DataFrame\n",
458 | "Columns: [NumberOfParts, Length, LongestPart, TLD, Randomness]\n",
459 | "Index: []"
460 | ]
461 | },
462 | "execution_count": 159,
463 | "metadata": {},
464 | "output_type": "execute_result"
465 | }
466 | ],
467 | "source": [
468 | "df.loc[df['NumberOfParts'] == 1]"
469 | ]
470 | },
471 | {
472 | "cell_type": "code",
473 | "execution_count": 160,
474 | "metadata": {},
475 | "outputs": [
476 | {
477 | "data": {
478 | "text/plain": [
479 | ""
480 | ]
481 | },
482 | "execution_count": 160,
483 | "metadata": {},
484 | "output_type": "execute_result"
485 | },
486 | {
487 | "data": {
488 | "image/png": "\n",
489 | "text/plain": [
490 | ""
491 | ]
492 | },
493 | "metadata": {},
494 | "output_type": "display_data"
495 | }
496 | ],
497 | "source": [
498 | "plt.figure(figsize=(10,5))\n",
499 | "seaborn.violinplot(data=df, x='NumberOfParts', y='Length')"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": 161,
505 | "metadata": {},
506 | "outputs": [
507 | {
508 | "data": {
509 | "text/plain": [
510 | ""
511 | ]
512 | },
513 | "execution_count": 161,
514 | "metadata": {},
515 | "output_type": "execute_result"
516 | },
517 | {
518 | "data": {
519 | "image/png": "\n",
520 | "text/plain": [
521 | ""
522 | ]
523 | },
524 | "metadata": {},
525 | "output_type": "display_data"
526 | }
527 | ],
528 | "source": [
529 | "plt.figure(figsize=(10,5))\n",
530 | "df['TLD'].value_counts().plot(kind='bar')"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 162,
536 | "metadata": {},
537 | "outputs": [
538 | {
539 | "data": {
540 | "text/plain": [
541 | ""
542 | ]
543 | },
544 | "execution_count": 162,
545 | "metadata": {},
546 | "output_type": "execute_result"
547 | },
548 | {
549 | "data": {
550 | "image/png": "\n",
551 | "text/plain": [
552 | ""
553 | ]
554 | },
555 | "metadata": {},
556 | "output_type": "display_data"
557 | }
558 | ],
559 | "source": [
560 | "plt.figure(figsize=(10,5))\n",
561 | "df.loc[df['TLD'] > 3]['TLD'].value_counts().plot(kind='bar')"
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": 163,
567 | "metadata": {},
568 | "outputs": [
569 | {
570 | "data": {
571 | "text/html": [
572 | "\n",
573 | "\n",
586 | "
\n",
587 | " \n",
588 | " \n",
589 | " | \n",
590 | " NumberOfParts | \n",
591 | " Length | \n",
592 | " LongestPart | \n",
593 | " TLD | \n",
594 | " Randomness | \n",
595 | "
\n",
596 | " \n",
597 | " \n",
598 | " \n",
599 | " | emil.engineering | \n",
600 | " 2 | \n",
601 | " 16 | \n",
602 | " 11 | \n",
603 | " 11 | \n",
604 | " 0.0 | \n",
605 | "
\n",
606 | " \n",
607 | "
\n",
608 | "
"
609 | ],
610 | "text/plain": [
611 | " NumberOfParts Length LongestPart TLD Randomness\n",
612 | "emil.engineering 2 16 11 11 0.0"
613 | ]
614 | },
615 | "execution_count": 163,
616 | "metadata": {},
617 | "output_type": "execute_result"
618 | }
619 | ],
620 | "source": [
621 | "df.loc[df['TLD'] == 11].sample()"
622 | ]
623 | },
624 | {
625 | "cell_type": "code",
626 | "execution_count": null,
627 | "metadata": {},
628 | "outputs": [],
629 | "source": []
630 | }
631 | ],
632 | "metadata": {
633 | "kernelspec": {
634 | "display_name": "Python 3",
635 | "language": "python",
636 | "name": "python3"
637 | },
638 | "language_info": {
639 | "codemirror_mode": {
640 | "name": "ipython",
641 | "version": 3
642 | },
643 | "file_extension": ".py",
644 | "mimetype": "text/x-python",
645 | "name": "python",
646 | "nbconvert_exporter": "python",
647 | "pygments_lexer": "ipython3",
648 | "version": "3.6.7"
649 | }
650 | },
651 | "nbformat": 4,
652 | "nbformat_minor": 2
653 | }
654 |
--------------------------------------------------------------------------------