├── certstream_analytics ├── __init__.py ├── reporters │ ├── __init__.py │ └── base.py ├── transformers │ ├── __init__.py │ └── base.py ├── storages │ ├── __init__.py │ ├── base.py │ └── elasticsearch_storage.py ├── analysers │ ├── __init__.py │ ├── base.py │ ├── domain_matching.py │ └── common_domain_analyser.py └── stream.py ├── .coveragerc ├── tests ├── opendns-top-domains.txt ├── test_stream.py ├── test_elasticsearch.py ├── test_reporter.py ├── samples.json └── test_domain_matching_analyser.py ├── setup.cfg ├── .gitmodules ├── LICENSE ├── scripts ├── sundry │ ├── generate_features.py │ ├── isolation_forest.py │ ├── elliptic_envelope.py │ ├── lof.py │ └── certstream-domain-features.ipynb └── replay.py ├── .travis.yml ├── setup.py ├── .gitignore ├── bin └── domain_matching.py ├── README.md └── pylintrc /certstream_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source=certstream-analytics 3 | -------------------------------------------------------------------------------- /certstream_analytics/reporters/__init__.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | from .base import Reporter, FileReporter 3 | -------------------------------------------------------------------------------- /tests/opendns-top-domains.txt: -------------------------------------------------------------------------------- 1 | google.com 2 | facebook.com 3 | bankofamerica.com 4 | apple.com 5 | www.net.cn 6 | discover.com 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | 4 | [tool:pytest] 5 | 
# pylint: disable=no-init,too-few-public-methods
class Storage(metaclass=ABCMeta):
    """
    Define the template of all storage classes.

    The class is declared with ``metaclass=ABCMeta`` (the Python 3 spelling)
    so that the abstract ``save`` method is actually enforced: the base class
    and any subclass that forgets to implement ``save`` cannot be instantiated.
    """

    @abstractmethod
    def save(self, record):
        """
        Persist a single record.

        :param record: the record to store. Its exact structure is decided by
            the concrete storage and by whatever feeds it.
        """
# pylint: disable=no-init,too-few-public-methods
class Reporter(metaclass=ABCMeta):
    """
    Define the template of all reporter classes.

    Declared with ``metaclass=ABCMeta`` so that ``publish`` is enforced as an
    abstract method under Python 3.
    """

    @abstractmethod
    def publish(self, report):
        """
        Publish one report.

        :param report: the report to publish, as produced by the caller.
        """


class FileReporter(Reporter):
    """
    Simply print the report to a file, one JSON document per line.
    """
    def __init__(self, path):
        """
        Open the report file in append mode.

        Note that an exception will be raised if the path is not valid or writable.

        :param path: location of the file the reports are appended to.
        """
        # Define the attribute before opening the file so that __del__ stays
        # safe even when open() raises.
        self.fhandler = None
        self.fhandler = open(path, 'a')

    def __del__(self):
        # getattr() covers instances whose __init__ never ran to completion
        fhandler = getattr(self, 'fhandler', None)

        if fhandler is not None:
            fhandler.close()

    def publish(self, report):
        """
        This is a very basic reporter that will only print out the record it receives
        to a plain text file.

        Empty reports (``None``, ``{}``) are ignored. Each line is flushed
        right away so that readers of the file see it without having to wait
        for the reporter to be destroyed.

        :param report: a JSON-serialisable report.
        """
        if not report:
            return

        print(json.dumps(report), file=self.fhandler, flush=True)
def main(max_count=None):
    '''
    Generate the features of every record and print them as JSON lines.

    The record is assumed to be stored in a JSON file passed in as the first
    parameter of the script, one JSON document per line. Lines that are not
    valid JSON are skipped.

    :param max_count: stop after printing this many records; a falsy value
        (the default) means no limit.
    '''
    if len(sys.argv) < 2:
        sys.exit('usage: generate_features.py RECORDS_FILE')

    # The analysers are applied to each record in this order
    pipeline = [
        IDNADecoder(),
        WordSegmentation(),
        FeaturesGenerator(),
    ]

    with open(sys.argv[1]) as fhandle:
        count = 0

        for line in fhandle:
            try:
                record = json.loads(line.strip())
            except json.JSONDecodeError:
                continue

            for analyser in pipeline:
                record = analyser.run(record)

            print(json.dumps(record))
            count += 1

            # >= so that exactly max_count records are printed, not one more
            if max_count and count >= max_count:
                break
31 | ''' 32 | self.engine.start() 33 | 34 | # Wait a bit 35 | time.sleep(CertstreamTest.DEFAULT_DELAY) 36 | 37 | self.engine.stop() 38 | # We should see some data coming already 39 | self.assertTrue(self.debugger.count, 'Consuming data from certstream successfully') 40 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | os: 3 | - linux 4 | python: 5 | - '3.7' 6 | before_install: 7 | - sudo apt-get install -y libenchant-dev 8 | - sudo apt-get install -y apt-transport-https 9 | - wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add - 10 | - echo "deb https://artifacts.elastic.co/packages/6.x/apt stable main" | sudo tee -a /etc/apt/sources.list.d/elastic-6.x.list 11 | - sudo apt-get update && sudo apt-get remove -y elasticsearch 12 | - sudo apt-get install -y elasticsearch 13 | - sudo chown -R elasticsearch:elasticsearch /etc/default/elasticsearch 14 | - sudo service elasticsearch start 15 | install: 16 | - pip install --upgrade pytest 17 | - pip install pytest-pep8 pytest-cov 18 | - pip install codecov 19 | - pip install elasticsearch_dsl certstream pyahocorasick tldextract wordsegment pyenchant idna confusable-homoglyphs 20 | - pip install git+https://github.com/casics/nostril.git 21 | - pip install -e .[tests] 22 | before_script: 23 | - sleep 10 24 | - sudo systemctl -l status elasticsearch 25 | - curl 'http://localhost:9200' 26 | script: 27 | - pytest --pep8 -m pep8 certstream_analytics/ 28 | - PYTHONPATH=$PWD:$PYTHONPATH pytest --cov=./ tests/ 29 | after_script: 30 | - curl 'http://localhost:9200/_cat/indices?v' 31 | after_success: 32 | - codecov 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Standard Python setup script. 
def main():
    '''
    Print the domains that the isolation forest marks as outliers.

    The input is the file named by the first command-line argument, one JSON
    record per line. The feature vectors are taken from the output of the
    FeaturesGenerator analyser and the domain names from the keys of the
    WordSegmentation analyser output.

    The procedure contains two simple steps:
    - Scale the data to the standard distribution with mean 0 and unit variance.
      This might be too simplistic.
    - Apply the isolation forest. The contamination level is set manually.

    The script exits with a non-zero status, after dumping the offending
    record on stderr, as soon as the number of feature vectors and the number
    of domains disagree.
    '''
    domains = []
    raw = []

    with open(sys.argv[1]) as fhandle:
        for line in fhandle:
            record = json.loads(line.strip())

            for analyser in record['analysers']:
                if analyser['analyser'] == 'FeaturesGenerator':
                    raw.extend(analyser['output'])

                if analyser['analyser'] == 'WordSegmentation':
                    domains.extend(analyser['output'].keys())

            if len(raw) != len(domains):
                # Keep stdout clean (it carries the list of outliers) and
                # signal the failure to the caller.
                print(record, file=sys.stderr)
                sys.exit(1)

    if not raw:
        # Nothing to fit on
        return

    x_samples = scale(np.array(raw))

    try:
        engine = IsolationForest(behaviour='new', contamination=0.015)
    except TypeError:
        # The transitional 'behaviour' argument is rejected by newer
        # scikit-learn releases; fall back to the plain constructor.
        engine = IsolationForest(contamination=0.015)

    y_samples = engine.fit_predict(x_samples)

    # -1 is the label of an outlier
    for domain, y_sample in zip(domains, y_samples):
        if y_sample == -1:
            print(domain)
def main():
    '''
    Print the domains that the local outlier factor marks as outliers.

    The input is the file named by the first command-line argument, one JSON
    record per line. The feature vectors are taken from the output of the
    FeaturesGenerator analyser and the domain names from the keys of the
    WordSegmentation analyser output.

    The procedure contains two simple steps:
    - Scale the data to the standard distribution with mean 0 and unit variance.
      This might be too simplistic.
    - Apply the local outlier factor. The contamination level is set manually.

    This method does not seem to work in our case cause I suspect it treats groups
    of several outliers as clusters.

    The script exits with a non-zero status, after dumping the offending
    record on stderr, as soon as the number of feature vectors and the number
    of domains disagree.
    '''
    domains = []
    raw = []

    with open(sys.argv[1]) as fhandle:
        for line in fhandle:
            record = json.loads(line.strip())

            for analyser in record['analysers']:
                if analyser['analyser'] == 'FeaturesGenerator':
                    raw.extend(analyser['output'])

                if analyser['analyser'] == 'WordSegmentation':
                    domains.extend(analyser['output'].keys())

            if len(raw) != len(domains):
                # Keep stdout clean (it carries the list of outliers) and
                # signal the failure to the caller.
                print(record, file=sys.stderr)
                sys.exit(1)

    if not raw:
        # Nothing to fit on
        return

    x_samples = scale(np.array(raw))

    # Need to check the appropriate value for n_neighbors
    engine = LocalOutlierFactor(contamination=0.015)
    y_samples = engine.fit_predict(x_samples)

    # -1 is the label of an outlier
    for domain, y_sample in zip(domains, y_samples):
        if y_sample == -1:
            print(domain)
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | *.txt 108 | .idea 109 | -------------------------------------------------------------------------------- /tests/test_elasticsearch.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Save some dummy records into Elasticsearch. 3 | ''' 4 | import os 5 | import json 6 | import time 7 | import unittest 8 | 9 | from elasticsearch import Elasticsearch 10 | from elasticsearch_dsl import Search, Q 11 | 12 | from certstream_analytics.transformers import CertstreamTransformer 13 | from certstream_analytics.storages import ElasticsearchStorage 14 | 15 | 16 | class ElasticsearchTest(unittest.TestCase): 17 | ''' 18 | Test the way we save data into Elasticsearch. 
# pylint: disable=no-init,too-few-public-methods
class Analyser(metaclass=ABCMeta):
    """
    Define the template of all analyser classes.

    Declared with ``metaclass=ABCMeta`` so that ``run`` is enforced as an
    abstract method under Python 3.
    """

    @abstractmethod
    def run(self, record):
        """
        In normal cases, an analyser will process the record, save the result
        into the record, and then return the updated record so that the next
        analyser can choose what to do next. Therefore, the structure of the
        record comes from CertstreamTransformer class as follows:

        {
            # These fields are extracted from certstream
            cert_index: INTEGER,
            seen: TIMESTAMP,
            chain: [
                ORGANIZATION
            ],
            not_before: TIMESTAMP,
            not_after: TIMESTAMP,
            all_domains: [
                SAN
            ],

            # This is a place holder field which are used later by the
            # analysers. Each analyser will append its result here.
            analysers: [
                {
                    analyser: ANALYSER NAME,
                    output: ANYTHING GOES HERE,
                },
            ],
        }
        """


class Debugger(Analyser):
    """
    A dummy analyser for debugging.
    """
    def __init__(self):
        """
        Keep track of the number of records so far for debugging purpose.
        """
        self.count = 0

    def run(self, record):
        """
        This is a dummy analyser that will only print out the record it processes.

        The record is logged at INFO level, the counter is incremented, and
        the running count is appended to the ``analysers`` list of the record
        (the list is created when the record does not have one yet).

        :param record: the record to log; it is modified in place.
        :return: the same record, with the debugger entry appended.
        """
        logging.info(json.dumps(record))

        # Update the number of records so far
        self.count += 1

        record.setdefault('analysers', []).append({
            'analyser': type(self).__name__,
            'output': self.count,
        })

        return record
# pylint: disable=no-init,too-few-public-methods
class Transformer(metaclass=ABCMeta):
    """
    Define the template of all transformer classes.

    Declared with ``metaclass=ABCMeta`` so that ``apply`` is enforced as an
    abstract method under Python 3.
    """

    @abstractmethod
    def apply(self, raw):
        """
        Transform one raw message.

        :param raw: the raw message to transform.
        """


class PassthroughTransformer(Transformer):
    """
    A dummy transformer that doesn't do anything.
    """
    def apply(self, raw):
        """
        Return the raw message untouched.
        """
        return raw


class CertstreamTransformer(Transformer):
    """
    Transform data from certstream into something readily consumable by the
    processing pipeline.
    """
    def apply(self, raw):
        """
        The format of the message from certstream can be found at their github
        documentation.

        So far, we are only interested in the domain names, the timestamps, and
        probably the content of the subject. So the returned structure is as
        follows:

        {
            # These fields are extracted from certstream
            cert_index: INTEGER,
            seen: TIMESTAMP,
            chain: [
                ORGANIZATION
            ],
            not_before: TIMESTAMP,
            not_after: TIMESTAMP,
            all_domains: [
                SAN
            ],

            # This is a place holder field which are used later by the
            # analysers. Each analyser will append its result here.
            analysers: [
                {
                    analyser: ANALYSER NAME,
                    output: ANYTHING GOES HERE,
                },
            ],
        }

        :param raw: a certstream message; ``raw['data']`` must provide
            ``cert_index``, ``seen`` and ``leaf_cert``.
        :return: the filtered record, or None when the certificate does not
            list any domain.
        :raises KeyError: when one of the expected fields is missing.
        """
        data = raw['data']

        filtered = {
            'cert_index': data['cert_index'],
            'seen': data['seen'],
            # NOTE(review): the chain is always left empty here even though
            # the structure above advertises it -- confirm whether it should
            # be populated from the message.
            'chain': [],

            # The analyser result will be stored here later on
            'analysers': [],
        }

        leaf_cert = data['leaf_cert']

        if not leaf_cert['all_domains']:
            # A certificate without any domain is of no use downstream
            return None

        for field in ('not_before', 'not_after', 'all_domains'):
            filtered[field] = leaf_cert[field]

        return filtered
# pylint: disable=arguments-differ
def save(self, **kwargs):
    """
    Stamp the record with the current time and save it into the daily
    index matching that time.

    The timestamp is taken in UTC: the field is declared with a UTC default
    timezone, so a naive local time would be stored (and routed to a daily
    index) with the wrong offset on any host that does not run in UTC.

    :param kwargs: forwarded to the parent ``save``; ``index`` is always
        overridden with the daily index name.
    :return: whatever the parent ``save`` returns.
    """
    # pylint: disable=import-outside-toplevel
    from datetime import timezone

    self.timestamp = datetime.now(timezone.utc)
    # Override the index to go to the proper timeslot
    kwargs['index'] = self.timestamp.strftime('certstream-%Y.%m.%d')

    return super().save(**kwargs)
class FileReporterTest(unittest.TestCase):
    '''
    Test the file-based reporter.
    '''
    def setUp(self):
        '''
        Create a temporary file so that the test can write its reports into it.
        '''
        self.tmp = tempfile.NamedTemporaryFile()
        # Make sure the temporary file is removed whatever the outcome
        self.addCleanup(self.tmp.close)

        self.reporter = FileReporter(path=self.tmp.name)

    def test_report(self):
        '''
        Dump all the test reports to our temporary file and read them back.
        '''
        cases = [
            {
                'report': {
                    'all_domains': ['store.google.com', 'google.com'],
                    'analysers': [
                        {
                            'analyser': 'AhoCorasickDomainMatching',
                            'domain': 'store.google.com',
                            'match': 'google',
                        },
                    ],
                },
                'description': 'Report an exact match domain',
            },

            {
                'report': {
                    'all_domains': ['www.facebook.com.msg40.site'],
                    'analysers': [
                        {
                            'analyser': 'AhoCorasickDomainMatching',
                            'domain': 'www.facebook.com.msg40.site',
                            'match': 'facebook',
                        },
                    ],
                },
                'description': 'Report a phishing domain with a sub-domain match',
            },

            {
                'report': {
                    'all_domains': ['login-appleid.apple.com.managesuppport.co'],
                    'analysers': [
                        {
                            'analyser': 'AhoCorasickDomainMatching',
                            'domain': 'login-appleid.apple.com.managesuppport.co',
                            'match': 'apple',
                        },
                    ],
                },
                'description': 'Report a phishing domain with a partial string match',
            },

            {
                'report': {},
                'description': 'Report nothing and thus will be ignored',
            },
        ]

        for case in cases:
            self.reporter.publish(case['report'])

        # Closing the handle forces every buffered line onto the disk,
        # otherwise the file may still be empty when it is read back and the
        # assertions below would never run.
        self.reporter.fhandler.close()

        with open(self.tmp.name) as fhandler:
            lines = fhandler.readlines()

        # Empty reports are ignored by the reporter
        expected = [case for case in cases if case['report']]

        self.assertEqual(len(lines), len(expected),
                         'Every non-empty report is written exactly once')

        for line, case in zip(lines, expected):
            self.assertDictEqual(json.loads(line), case['report'], case['description'])
# pylint: disable=unused-argument
def exit_gracefully(signum, stack):
    """
    Signal handler that asks the main loop to stop by raising the DONE flag.

    :param signum: the signal number (unused).
    :param stack: the interrupted stack frame (unused).
    """
    # pylint: disable=global-statement
    global DONE
    DONE = True


def init_analysers(domains_file, include_tld, matching_option):
    """
    Initialize all the analysers for matching domains. The list includes:

    - IDNA
    - Homoglyphs
    - AhoCorasick
    - Word segmentation
    - Bulk domains
    - Meta domain matching
    - Features generator

    :param domains_file: path to a text file with one known domain per line;
        blank lines are ignored.
    :param include_tld: forwarded to DomainMatching.
    :param matching_option: forwarded to DomainMatching as its option.
    :return: the list of analysers, in execution order.
    """
    with open(domains_file) as fhandle:
        stripped = (line.rstrip() for line in fhandle)
        # A blank line would otherwise end up as an empty-string domain
        domains = [domain for domain in stripped if domain]

    # Initialize all analysers. Note that their order is important cause they
    # will be executed in that order
    return [
        IDNADecoder(),
        HomoglyphsDecoder(greedy=False),
        AhoCorasickDomainMatching(domains=domains),
        WordSegmentation(),
        BulkDomainMarker(),
        DomainMatching(include_tld=include_tld, option=matching_option),
        FeaturesGenerator(),
    ]


def run():
    """
    A simple utility to query certstream and match its records to a list of
    known domains from OpenDNS.

    The engine is started, then the function idles until the DONE flag is
    raised (see exit_gracefully) and finally stops the engine.
    """
    epilog = '''
examples:
\033[1;33m/usr/bin/domain_matching.py --elasticsearch-host elasticsearch:9200\033[0m

\033[1;33m/usr/bin/domain_matching.py --dump-location certstream.txt\033[0m

\033[1;33m/usr/bin/domain_matching.py --domains opendns-top-domains.txt\033[0m

Consume data from Certstream and does its magic.
'''
    parser = argparse.ArgumentParser(description=__doc__, epilog=epilog,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # The list of domains is opened unconditionally by init_analysers, so
    # let argparse report a missing value instead of failing later on
    # open(None).
    parser.add_argument('--domains', required=True,
                        help='the list of domains to match with (e.g. opendns-top-domains.txt)')

    parser.add_argument('--elasticsearch-host',
                        help='set the Elasticsearch host to store the records from Certstream')

    parser.add_argument('--dump-location',
                        help='where to dump the records from Certstream')

    # On invalid arguments argparse prints the usage and exits by itself
    # (SystemExit), so there is nothing to catch here.
    args = parser.parse_args()

    transformer = CertstreamTransformer()
    analysers = init_analysers(domains_file=args.domains,
                               include_tld=True,
                               matching_option=DomainMatchingOption.ORDER_MATCH)
    reporter = FileReporter(path=args.dump_location) if args.dump_location else None
    storage = ElasticsearchStorage(hosts=[args.elasticsearch_host]) if args.elasticsearch_host else None

    engine = CertstreamAnalytics(transformer=transformer,
                                 storages=storage,
                                 analysers=analysers,
                                 reporters=reporter)
    engine.start()

    # Idle until a signal handler flips the flag
    while not DONE:
        time.sleep(1)

    engine.stop()
class CertstreamAnalytics():
    """
    Consume the feed of certificates from certstream, transform the data, and
    save it into various storages.
    """

    def __init__(self, transformer=None, storages=None, analysers=None, reporters=None):
        """
        This is the entry point of the whole module. It consumes data from
        certstream, transforms it using a transformer, saves it into the
        configured storages (e.g. Elasticsearch), runs the user-defined
        analysers and publishes the final result through the reporters.

        :param transformer: None or an object with an apply(message) method
            which transforms the raw message from certstream.
        :param storages: None, a Storage or a list/tuple of Storage.
        :param analysers: None, an Analyser or a list/tuple of Analyser. They
            are run in the given order, each one receiving the output of the
            previous one.
        :param reporters: None, a Reporter or a list/tuple of Reporter which
            publish the final record somewhere.
        :raises TypeError: if a storage, analyser or reporter is not an
            instance of the expected base class.
        """
        self.transformer = transformer

        self.analysers = []
        self.reporters = []
        self.storages = []

        def _init_member(member, value, kind):
            """
            Normalize a single item or a list/tuple of items into a list,
            type-check every item and store the list under the given
            attribute name.
            """
            if not value:
                return

            # Copy the sequence so later changes to the caller's list do not
            # silently alter the pipeline
            items = list(value) if isinstance(value, (list, tuple)) else [value]

            for item in items:
                if not isinstance(item, kind):
                    raise TypeError('Invalid {} type: {}'.format(member, type(item).__name__))

            setattr(self, member, items)

        _init_member('analysers', analysers, Analyser)
        _init_member('reporters', reporters, Reporter)
        _init_member('storages', storages, Storage)

        self.stopped = True
        self.thread = None

    def start(self):
        """
        Start consuming data from certstream in a background thread. This
        call does not block.
        """
        # Mark the consumer as running before the thread is scheduled.
        # Otherwise a stop() issued right after start() could still see the
        # initial stopped flag and return without stopping anything
        self.stopped = False

        # Run the stream in a separate thread
        self.thread = threading.Thread(target=self._consume)
        # So that it will be killed when the main thread stops
        self.thread.daemon = True
        self.thread.start()

    def stop(self):
        """
        Stop consuming data from certstream and wait for the consumer thread.

        The thread only notices the request the next time its callback is
        invoked, so this call can block until that happens.
        """
        if self.stopped:
            return

        self.stopped = True

        if self.thread is not None:
            self.thread.join()

    def _consume(self):
        """
        Thread body: listen to certstream and hand every event to _callback.
        """
        self.stopped = False
        # pylint: disable=unnecessary-lambda
        certstream.listen_for_events(lambda m, c: self._callback(m, c),
                                     url='wss://certstream.calidog.io')

    # pylint: disable=unused-argument
    def _callback(self, message, context):
        """
        Handle one event from certstream: transform it, save it, analyse it
        and report the result.

        :param message: the raw event, a dict with a 'message_type' key
        :param context: the certstream context (unused)
        """
        if self.stopped:
            # Terminate the consumer thread, stop() is waiting for it
            sys.exit()

        # Only certificate updates are processed; heartbeats and any other
        # kind of event are ignored
        if message['message_type'] != 'certificate_update':
            return

        if self.transformer:
            # Apply the user-defined transformation. The structure of the raw
            # message is described at https://github.com/CaliDog/certstream-python/
            record = self.transformer.apply(message)
        else:
            record = message

        if not record:
            # Nothing left to save, analyse or report
            return

        # Save the message into a more permanent storage. Maybe we should
        # support multiple storages in parallel here
        for storage in self.storages:
            storage.save(record)

        # Note that the order of analysers is extremely important because the
        # output of an analyser becomes the input of the next one
        for analyser in self.analysers:
            record = analyser.run(record)

            if not record:
                # The record has been dropped by the analyser
                return

        # and report the final result
        for reporter in self.reporters:
            reporter.publish(record)
4 | ''' 5 | import argparse 6 | import json 7 | import logging 8 | import sys 9 | 10 | from certstream_analytics.analysers import AhoCorasickDomainMatching 11 | from certstream_analytics.analysers import WordSegmentation 12 | from certstream_analytics.analysers import DomainMatching, DomainMatchingOption 13 | from certstream_analytics.analysers import BulkDomainMarker 14 | from certstream_analytics.analysers import IDNADecoder 15 | from certstream_analytics.analysers import HomoglyphsDecoder 16 | from certstream_analytics.analysers import FeaturesGenerator 17 | from certstream_analytics.reporters import FileReporter 18 | from certstream_analytics.storages import ElasticsearchStorage 19 | 20 | 21 | SUPPORTED_REPORTERS = { 22 | 'file': lambda location: FileReporter(path=location) 23 | } 24 | 25 | SUPPORTED_STORAGES = { 26 | 'elasticsearch': lambda host: ElasticsearchStorage(hosts=[host]) 27 | } 28 | 29 | 30 | def init_analysers(domains_file, include_tld, matching_option): 31 | ''' 32 | Initialize all the analysers for matching domains. The list includes: 33 | 34 | - IDNA 35 | - Homoglyphs 36 | - AhoCorasick 37 | - Word segmentation 38 | - Bulk domains 39 | - Meta domain matching 40 | ''' 41 | with open(domains_file) as fhandle: 42 | domains = [line.rstrip() for line in fhandle] 43 | 44 | # Initialize all analysers. Note that their order is important cause they 45 | # will be executed in that order 46 | return [ 47 | IDNADecoder(), 48 | HomoglyphsDecoder(greedy=False), 49 | AhoCorasickDomainMatching(domains=domains), 50 | WordSegmentation(), 51 | BulkDomainMarker(), 52 | DomainMatching(include_tld=include_tld, option=matching_option), 53 | FeaturesGenerator(), 54 | ] 55 | 56 | 57 | def run(): 58 | ''' 59 | A simple utility to replay certstream and match the records to a list of 60 | known domains from OpenDNS. It also generates several features for each 61 | domain such as the domain length. 
62 | ''' 63 | epilog = ''' 64 | examples: 65 | \033[1;33m/usr/bin/replay.py --replay certstream.txt\033[0m 66 | 67 | \033[1;33m/usr/bin/replay.py --storage-host elasticsearch:9200 --storage elasticsearch\033[0m 68 | 69 | \033[1;33m/usr/bin/replay.py --report-location report.txt --report file\033[0m 70 | 71 | \033[1;33m/usr/bin/replay.py --domains opendns-top-domains.txt\033[0m 72 | 73 | Replay data from certstream. 74 | ''' 75 | parser = argparse.ArgumentParser(description=__doc__, epilog=epilog, 76 | formatter_class=argparse.RawDescriptionHelpFormatter) 77 | 78 | parser.add_argument('--replay', 79 | help='the list of records from certstream (one per line)') 80 | parser.add_argument('--domains', 81 | help='the list of domains to match with (opendns-top-domains.txt)') 82 | 83 | parser.add_argument('--storage-host', default='localhost:9200', 84 | help='set the storage host') 85 | parser.add_argument('-s', '--storage', 86 | help='choose the storage type (elasticsearch)') 87 | 88 | parser.add_argument('--report-location', 89 | help='where to save the report to?') 90 | parser.add_argument('-r', '--report', default='file', 91 | help='choose the reporter type') 92 | 93 | try: 94 | args = parser.parse_args() 95 | # pylint: disable=broad-except 96 | except Exception as error: 97 | logging.error(error) 98 | # some errors occur when parsing the arguments, show the usage 99 | parser.print_help() 100 | # then quit 101 | sys.exit(1) 102 | 103 | if args.report and args.report not in SUPPORTED_REPORTERS: 104 | error = 'Report type \033[1;31m{}\033[0m is not supported. The list of supported reporters includes: {}' \ 105 | .format(args.report, list(SUPPORTED_REPORTERS.keys())) 106 | 107 | logging.error(error) 108 | # Encounter an unsupported storage type 109 | sys.exit(1) 110 | 111 | if args.storage and args.storage not in SUPPORTED_STORAGES: 112 | error = 'Storage type \033[1;31m{}\033[0m is not supported. 
The list of supported storages includes: {}' \ 113 | .format(args.storage, list(SUPPORTED_STORAGES.keys())) 114 | 115 | logging.error(error) 116 | # Encounter an unsupported storage type 117 | sys.exit(1) 118 | 119 | analysers = init_analysers(domains_file=args.domains, 120 | include_tld=True, 121 | matching_option=DomainMatchingOption.ORDER_MATCH) 122 | 123 | if args.report: 124 | reporter = SUPPORTED_REPORTERS[args.report](args.report_location) 125 | 126 | if args.storage: 127 | storage = SUPPORTED_STORAGES[args.storage](args.storage_host) 128 | 129 | with open(args.replay) as fhandler: 130 | for raw in fhandler: 131 | try: 132 | record = json.loads(raw) 133 | except json.decoder.JSONDecodeError: 134 | continue 135 | 136 | if args.storage: 137 | storage.save(record) 138 | 139 | for analyser in analysers: 140 | # Run something here 141 | record = analyser.run(record) 142 | 143 | reporter.publish(record) 144 | 145 | if __name__ == '__main__': 146 | run() 147 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Certstream + Analytics 2 | 3 | [![Build Status](https://travis-ci.org/huydhn/certstream-analytics.svg?branch=master)](https://travis-ci.org/huydhn/certstream-analytics) 4 | [![codecov.io](https://codecov.io/gh/huydhn/certstream-analytics/master.svg)](http://codecov.io/gh/huydhn/certstream-analytics?branch=master) 5 | 6 | 7 | # Installation 8 | 9 | The package can be installed from 10 | [PyPI](https://pypi.org/project/certstream-analytics) 11 | 12 | ``` 13 | pip install certstream-analytics 14 | ``` 15 | 16 | # Quick usage 17 | 18 | ```python 19 | bin/domain_matching.py --domains domains.txt --dump-location certstream.txt 20 | 21 | # The file domains.txt contains the list of domains that we want to monitor 22 | # for matches (domains with similar names). 
For example, a file with only 23 | # two entries: 24 | # 25 | # gmail.com 26 | # facebook.com 27 | # 28 | # will match any domain that contains the gmail or facebook keywords. 29 | # 30 | # All the records consumed from certstream will be kept in certstream.txt 31 | ``` 32 | 33 | # API 34 | 35 | ```python 36 | import time 37 | 38 | from certstream_analytics.analysers import WordSegmentation 39 | from certstream_analytics.analysers import IDNADecoder 40 | from certstream_analytics.analysers import HomoglyphsDecoder 41 | 42 | from certstream_analytics.transformers import CertstreamTransformer 43 | from certstream_analytics.storages import ElasticsearchStorage 44 | from certstream_analytics.stream import CertstreamAnalytics 45 | 46 | done = False 47 | 48 | # These analysers will be run in the same order 49 | analyser = [ 50 | IDNADecoder(), 51 | HomoglyphsDecoder(), 52 | WordSegmentation(), 53 | ] 54 | 55 | # The following fields are filtered out and indexed: 56 | # - String: domain 57 | # - List: SAN 58 | # - List: Trust chain 59 | # - Timestamp: Not before 60 | # - Timestamp: Not after 61 | # - Timestamp: Seen 62 | transformer = CertstreamTransformer() 63 | 64 | # Index the data in Elasticsearch 65 | storage = ElasticsearchStorage(hosts=['localhost:9200']) 66 | 67 | consumer = CertstreamAnalytics(transformer=transformer, 68 | storages=storage, 69 | analysers=analyser) 70 | # The consumer is run in another thread so this function is non-blocking 71 | consumer.start() 72 | 73 | while not done: 74 | time.sleep(1) 75 | 76 | consumer.stop() 77 | ``` 78 | 79 | ## IDNA decoder 80 | This analyser decodes IDNA domain names into Unicode for further processing 81 | downstream. Normally, it will be the very first analyser to be run. If 82 | the analyser encounters a malformed IDNA domain string, it will keep the 83 | domain as it is.
84 | 85 | ```python 86 | from certstream_analytics.analysers import IDNADecoder 87 | 88 | decoder = IDNADecoder() 89 | 90 | # Just an example dummy record 91 | record = { 92 | 'all_domains': [ 93 | 'xn--f1ahbgpekke1h.xn--p1ai', 94 | ] 95 | } 96 | 97 | # The domain name will now become 'укрэмпужск.рф' 98 | print(decoder.run(record)) 99 | ``` 100 | 101 | ## Homoglyphs decoder 102 | There are lots of phishing websites that utilize [homoglyphs](https://en.wikipedia.org/wiki/Homoglyph) 103 | to lure the victims. Some common examples include 'l' and 'i' or the 104 | Unicode character RHO '𝞀' and 'p'. The homoglyphs decoder uses the excellent 105 | [confusable_homoglyphs](https://github.com/vhf/confusable_homoglyphs) to 106 | generate all potential alternative domain names in ASCII. 107 | 108 | ```python 109 | from certstream_analytics.analysers import HomoglyphsDecoder 110 | 111 | # If the greedy flag is set, all alternative domains will be returned 112 | decoder = HomoglyphsDecoder(greedy=False) 113 | 114 | # Just an example dummy record 115 | record = { 116 | 'all_domains': [ 117 | # MATHEMATICAL MONOSPACE SMALL P 118 | '*.𝗉aypal.com', 119 | 120 | # MATHEMATICAL SANS-SERIF BOLD SMALL RHO 121 | '*.𝗉ay𝞀al.com', 122 | ] 123 | } 124 | 125 | # The domain name will now be converted to '*.paypal.com' with the ASCII 126 | # character p 127 | print(decoder.run(record)) 128 | ``` 129 | 130 | ## Aho-Corasick 131 | A domain and its SAN from Certstream will be compared against a list of 132 | most popular [domains](https://github.com/opendns/public-domain-lists) 133 | (from OpenDNS) using the Aho-Corasick algorithm. This is a simple check to 134 | remove some of the most obvious phishing domains, for example, *www.facebook.com.msg40.site* 135 | will match with *facebook* because *facebook* is in the above list of most 136 | popular domains (I wonder how long it is going to last).
137 | 138 | ```python 139 | from certstream_analytics.analysers import AhoCorasickDomainMatching 140 | from certstream_analytics.reporters import FileReporter 141 | 142 | # Print the list of matching domains 143 | reporter = FileReporter('matching-results.txt') 144 | 145 | with open('opendns-top-domains.txt') as fhandle: 146 | domains = [line.rstrip() for line in fhandle] 147 | 148 | # The list of domains to match against 149 | domain_matching_analyser = AhoCorasickDomainMatching(domains) 150 | 151 | consumer = CertstreamAnalytics(transformer=transformer, 152 | analysers=domain_matching_analyser, 153 | reporters=reporter) 154 | 155 | # Need to think about what to do with the matching result 156 | consumer.start() 157 | 158 | while not done: 159 | time.sleep(1) 160 | 161 | consumer.stop() 162 | ``` 163 | 164 | ## Word segmentation 165 | In order to improve the accuracy of the matching algorithm, we segment 166 | the domains into English words using 167 | [wordsegment](https://github.com/grantjenks/python-wordsegment). 168 | 169 | ```python 170 | from certstream_analytics.analysers import WordSegmentation 171 | 172 | wordsegmentation = WordSegmentation() 173 | 174 | # Just an example dummy record 175 | record = { 176 | 'all_domains': [ 177 | 'login-appleid.apple.com.managesupport.co', 178 | ] 179 | } 180 | 181 | # The returned output is as follows: 182 | # 183 | # { 184 | # 'analyser': 'WordSegmentation', 185 | # 'output': { 186 | # 'login-appleid.apple.com.managesupport.co': [ 187 | # 'login', 188 | # 'apple', 189 | # 'id', 190 | # 'apple', 191 | # 'com', 192 | # 'manage', 193 | # 'support', 194 | # 'co' 195 | # ], 196 | # }, 197 | # 198 | print(wordsegmentation.run(record)) 199 | ``` 200 | 201 | ## Features generator 202 | A list of features for each domain will also be generated so that they 203 | can be used for classification jobs further downstream. The list 204 | includes: 205 | 206 | - The number of dot-separated fields in the domain, for example, www.google.com has 3.
# pylint: disable=too-few-public-methods
class AhoCorasickDomainMatching(Analyser):
    """
    The domain and its SAN will be compared against the list of domains, for
    example, the most popular domains from OpenDNS.
    """
    # Get this number from the histogram of the length of all top domains
    MIN_MATCHING_LENGTH = 3

    # Some domains that don't work too well with tldextract and generate too
    # many FPs
    EXCLUDED_DOMAINS = {
        'www': 1,
        'web': 1,
    }

    # Some common domain parts that cause too many FP
    IGNORED_PARTS = r'^(autodiscover\.|cpanel\.)'

    def __init__(self, domains):
        """
        Use Aho-Corasick to find the matching domain so we construct its Trie
        here.

        :param domains: an iterable of domain names to match against
        """
        self.automaton = ahocorasick.Automaton()
        # Map the registered label (e.g. google) back to the original entry
        # (e.g. www.google.com)
        self.domains = {}

        for index, domain in enumerate(domains):
            # Processing only the domain part. All sub-domains or TLDs will
            # be ignored, for example:
            # - www.google.com becomes google
            # - www.google.co.uk becomes google
            # - del.icio.us becomes icio
            label = tldextract.extract(domain).domain

            # An empty label (blank entry) carries nothing to match on
            if not label or label in AhoCorasickDomainMatching.EXCLUDED_DOMAINS:
                continue

            self.automaton.add_word(label, (index, label))
            self.domains[label] = domain

        self.automaton.make_automaton()

    def run(self, record):
        """
        Use Aho-Corasick to find the matching domain.

        Tricky situation #1: When the string (domain) in the Trie is too short,
        it could match many domains, for example, g.co or t.co. So they need
        to be ignored somehow. Looking at the histogram of the length of all
        domains in the list, there are only less than 100 domains with the
        length of 2 or less. So we choose to ignore those. Also, we will
        prefer longer match than a shorter one for now.

        :param record: a dict with the list of names under 'all_domains'
        :return: the same record; when a match is found, an entry
            {'analyser': ..., 'output': {name: [matching domain]}} is appended
            to record['analysers']
        """
        if 'analysers' not in record:
            record['analysers'] = []

        results = {}
        # Check the domain and all its SAN
        for domain in record.get('all_domains') or []:
            # Remove wildcard
            domain = re.sub(r'^\*\.', '', domain)

            # Remove some FP-prone parts
            domain = re.sub(AhoCorasickDomainMatching.IGNORED_PARTS, '', domain)

            # Similar to all domains in the list, the TLD will be stripped off.
            # Use the named fields so this does not depend on the result being
            # a sliceable tuple
            ext = tldextract.extract(domain)
            haystack = '{}.{}'.format(ext.subdomain, ext.domain)

            # Each hit is a tuple in the following format: (5, (0, 'google'))
            matches = [hit[1][1] for hit in self.automaton.iter(haystack)
                       if len(hit[1][1]) >= AhoCorasickDomainMatching.MIN_MATCHING_LENGTH]

            if not matches:
                continue

            # We only keep the longest match of the first matching domain
            # for now (the last one found wins a tie)
            match = sorted(matches, key=len)[-1]

            # The output is always a list so that consumers can iterate over it
            results[domain] = [self.domains.get(match, match)]
            break

        if results:
            record['analysers'].append({
                'analyser': type(self).__name__,
                'output': results,
            })

        return record


class DomainMatchingOption(Enum):
    """
    Control how strict we want to do our matching.
    """
    # For example applefake.it will match with apple.com because ['apple'] is
    # a subset of ['apple', 'fake']
    SUBSET_MATCH = 0

    # Similar but the tokens must appear consecutively, in the same order
    ORDER_MATCH = 1


class DomainMatching(Analyser):
    """
    This is the first example of the new group of meta analysers which are used
    to combine the result of other analysers.
    """
    def __init__(self, include_tld=True, option=DomainMatchingOption.ORDER_MATCH):
        """
        Load the wordsegment data and remember the matching options.

        :param include_tld: whether the TLD of the legitimate domain must also
            appear in the suspicious name
        :param option: a DomainMatchingOption selecting the matching strategy
        """
        wordsegment.load()

        # Save the matching option here so we can refer to it later
        self.include_tld = include_tld

        # The container type used to hold the tokens decides the strategy:
        # a set for subset matching, a list for in-order matching
        self.option = {
            DomainMatchingOption.SUBSET_MATCH: set,
            DomainMatchingOption.ORDER_MATCH: list,
        }[option]

    def run(self, record):
        """
        Note that a meta-analyser will need to run after other analysers have
        finished so that their outputs are available.

        :param record: the record carrying the outputs of the previous
            analysers under record['analysers']
        :return: the same record, with this analyser's output appended when a
            possible phishing domain is found
        """
        if 'analysers' not in record:
            return record

        analysers = {
            AhoCorasickDomainMatching.__name__: {},
            WordSegmentation.__name__: {},
            BulkDomainMarker.__name__: {},
        }

        for analyser in record['analysers']:
            name = analyser['analyser']

            if name not in analysers:
                continue

            if name == BulkDomainMarker.__name__ and analyser['output']:
                # Skip bulk record and deal with it later, with such large
                # number of SAN name, it's bound to be a match
                continue

            analysers[name] = analyser['output']

        matched = analysers[AhoCorasickDomainMatching.__name__]
        segmented = analysers[WordSegmentation.__name__]

        # Check that all outputs are there before continuing
        if not matched or not segmented:
            return record

        results = self._match(matched, segmented)

        if results:
            record['analysers'].append({
                'analyser': type(self).__name__,
                'output': results,
            })

            # DEBUG
            logging.info(json.dumps(record))

        return record

    @staticmethod
    def _contains(legit, phish):
        """
        Tell whether the tokens of a legitimate domain occur in the tokens of
        a suspicious name.

        :param legit: the tokens of the legitimate domain (set or list)
        :param phish: the tokens of the suspicious name, same type as legit
        :return: True for a set when legit is a subset of phish, for a list
            when legit appears in phish as a run of consecutive whole tokens
        """
        if not legit:
            # Nothing to look for, never report that as a match
            return False

        if isinstance(phish, set):
            return legit.issubset(phish)

        # Delimit both sides so that a run starting at the very first token is
        # found and a token is never matched against a prefix of a longer one
        needle = '.{}.'.format('.'.join(legit))
        haystack = '.{}.'.format('.'.join(phish))

        return needle in haystack

    def _match(self, ahocorasick_output, segmentation_output):
        """
        Use internally by the run function to combine AhoCorasick and WordSegmentation
        results.

        :param ahocorasick_output: maps a scanned name to the list of domains
            it matched
        :param segmentation_output: maps a scanned name to its list of words
        :return: maps a scanned name to the domains it is likely imitating
        """
        results = {}
        # Check all the matching domains reported by AhoCorasick analyser
        for suspect, domains in ahocorasick_output.items():
            # The result of AhoCorasick matcher is a list of matching domains, for example,
            #
            #     {
            #         'analyser': 'AhoCorasickDomainMatching',
            #         'output': {
            #             'login-appleid.apple.com.managesuppport.co': ['apple.com', 'support.com'],
            #         },
            #     },
            #
            if suspect not in segmentation_output:
                continue

            phish = self.option(segmentation_output[suspect])
            suspect_ext = tldextract.extract(suspect)

            for domain in domains:
                ext = tldextract.extract(domain)

                # This record is from a legitimate source, for example, agrosupport.zendesk.com
                # will match with zendesk.com. In our case, we don't really care about this so
                # it will be ignored and not reported as a match.
                if (ext.domain, ext.suffix) == (suspect_ext.domain, suspect_ext.suffix):
                    continue

                # Intuitively, it will be more accurate if we choose to include the TLD here.
                # For example, if both 'apple' and 'com' appear in the matching domain, it's
                # very likely that something phishing is going on here. On the other hand,
                # if only 'apple' occurs, we are not so sure and it's better left for more
                # advance analysers to have their says in that
                parts = [ext.subdomain, ext.domain]
                if self.include_tld:
                    parts.append(ext.suffix)

                tokens = []
                for part in parts:
                    for token in part.split('.'):
                        tokens.extend(wordsegment.segment(token))

                if self._contains(self.option(tokens), phish):
                    # Found a possible phishing domain
                    results.setdefault(suspect, []).append(domain)

        return results
Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 38 | confidence= 39 | 40 | # Enable the message, report, category or checker with the given id(s). You can 41 | # either give multiple identifier separated by comma (,) or put this option 42 | # multiple time. See also the "--disable" option for examples. 43 | #enable= 44 | 45 | # Disable the message, report, category or checker with the given id(s). You 46 | # can either give multiple identifiers separated by comma (,) or put this 47 | # option multiple times (only on the command line, not in the configuration 48 | # file where it should appear only once).You can also use "--disable=all" to 49 | # disable everything first and then reenable specific checks. For example, if 50 | # you want to run only the similarities checker, you can use "--disable=all 51 | # --enable=similarities". If you want to run only the classes checker, but have 52 | # no Warning level messages displayed, use"--disable=all --enable=classes 53 | # --disable=W" 54 | 55 | disable=fixme,locally-disabled 56 | 57 | [REPORTS] 58 | 59 | # Set the output format. Available formats are text, parseable, colorized, msvs 60 | # (visual studio) and html. You can also give a reporter class, eg 61 | # mypackage.mymodule.MyReporterClass. 62 | output-format=parseable 63 | 64 | # Put messages in a separate file for each module / package specified on the 65 | # command line instead of printing them on stdout. Reports (if any) will be 66 | # written in a file name "pylint_global.[txt|html]". 67 | files-output=no 68 | 69 | # Tells whether to display a full report or only the messages 70 | reports=yes 71 | 72 | # Python expression which should return a note less than 10 (10 is the highest 73 | # note). You have access to the variables errors warning, statement which 74 | # respectively contain the number of errors / warnings messages and the total 75 | # number of statements analyzed. This is used by the global evaluation report 76 | # (RP0004). 
77 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 78 | 79 | # Template used to display messages. This is a python new-style format string 80 | # used to format the message information. See doc for all details 81 | #msg-template= 82 | 83 | 84 | [LOGGING] 85 | 86 | # Logging modules to check that the string format arguments are in logging 87 | # function parameter format 88 | logging-modules=logging 89 | 90 | 91 | [MISCELLANEOUS] 92 | 93 | # List of note tags to take in consideration, separated by a comma. 94 | notes=FIXME,XXX,TODO 95 | 96 | 97 | [SIMILARITIES] 98 | 99 | # Minimum lines number of a similarity. 100 | min-similarity-lines=4 101 | 102 | # Ignore comments when computing similarities. 103 | ignore-comments=yes 104 | 105 | # Ignore docstrings when computing similarities. 106 | ignore-docstrings=yes 107 | 108 | # Ignore imports when computing similarities. 109 | ignore-imports=no 110 | 111 | 112 | [VARIABLES] 113 | 114 | # Tells whether we should check for unused import in __init__ files. 115 | init-import=no 116 | 117 | # A regular expression matching the name of dummy variables (i.e. expectedly 118 | # not used). 119 | dummy-variables-rgx=_$|dummy 120 | 121 | # List of additional names supposed to be defined in builtins. Remember that 122 | # you should avoid to define new builtins when possible. 123 | additional-builtins= 124 | 125 | # List of strings which can identify a callback function by name. A callback 126 | # name must start or end with one of those strings. 127 | callbacks=cb_,_cb 128 | 129 | 130 | [FORMAT] 131 | 132 | # Maximum number of characters on a single line. 133 | max-line-length=120 134 | 135 | # Regexp for a line that is allowed to be longer than the limit. 136 | ignore-long-lines=^\s*(# )??$ 137 | 138 | # Allow the body of an if to be on the same line as the test if there is no 139 | # else. 
140 | single-line-if-stmt=no 141 | 142 | # List of optional constructs for which whitespace checking is disabled 143 | no-space-check=trailing-comma,dict-separator 144 | 145 | # Maximum number of lines in a module 146 | max-module-lines=1000 147 | 148 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 149 | # tab). 150 | indent-string=' ' 151 | 152 | # Number of spaces of indent required inside a hanging or continued line. 153 | indent-after-paren=4 154 | 155 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 156 | expected-line-ending-format= 157 | 158 | 159 | [BASIC] 160 | 161 | # List of builtins function names that should not be used, separated by a comma 162 | bad-functions=map,filter,input 163 | 164 | # Good variable names which should always be accepted, separated by a comma 165 | good-names=i,j,k,ex,Run,_ 166 | 167 | # Bad variable names which should always be refused, separated by a comma 168 | bad-names=foo,bar,baz,toto,tutu,tata 169 | 170 | # Colon-delimited sets of names that determine each other's naming style when 171 | # the name regexes allow several styles. 
172 | name-group= 173 | 174 | # Include a hint for the correct naming format with invalid-name 175 | include-naming-hint=no 176 | 177 | # Regular expression matching correct function names 178 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 179 | 180 | # Naming hint for function names 181 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 182 | 183 | # Regular expression matching correct variable names 184 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 185 | 186 | # Naming hint for variable names 187 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 188 | 189 | # Regular expression matching correct constant names 190 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 191 | 192 | # Naming hint for constant names 193 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 194 | 195 | # Regular expression matching correct attribute names 196 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 197 | 198 | # Naming hint for attribute names 199 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 200 | 201 | # Regular expression matching correct argument names 202 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 203 | 204 | # Naming hint for argument names 205 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 206 | 207 | # Regular expression matching correct class attribute names 208 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 209 | 210 | # Naming hint for class attribute names 211 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 212 | 213 | # Regular expression matching correct inline iteration names 214 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 215 | 216 | # Naming hint for inline iteration names 217 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 218 | 219 | # Regular expression matching correct class names 220 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 221 | 222 | # Naming hint for class names 223 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 224 | 225 | # Regular expression matching correct module names 226 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 227 | 228 | # Naming hint for module names 229 | 
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 230 | 231 | # Regular expression matching correct method names 232 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 233 | 234 | # Naming hint for method names 235 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 236 | 237 | # Regular expression which should only match function or class names that do 238 | # not require a docstring. 239 | no-docstring-rgx=__.*__ 240 | 241 | # Minimum line length for functions/classes that require docstrings, shorter 242 | # ones are exempt. 243 | docstring-min-length=-1 244 | 245 | # List of decorators that define properties, such as abc.abstractproperty. 246 | property-classes=abc.abstractproperty 247 | 248 | 249 | [TYPECHECK] 250 | 251 | # Tells whether missing members accessed in mixin class should be ignored. A 252 | # mixin class is detected if its name ends with "mixin" (case insensitive). 253 | ignore-mixin-members=yes 254 | 255 | # List of module names for which member attributes should not be checked 256 | # (useful for modules/projects where namespaces are manipulated during runtime 257 | # and thus existing member attributes cannot be deduced by static analysis 258 | ignored-modules= 259 | 260 | # List of classes names for which member attributes should not be checked 261 | # (useful for classes with attributes dynamically set). 262 | ignored-classes=SQLObject, optparse.Values, thread._local, _thread._local 263 | 264 | # List of members which are set dynamically and missed by pylint inference 265 | # system, and so shouldn't trigger E1101 when accessed. Python regular 266 | # expressions are accepted. 267 | generated-members=REQUEST,acl_users,aq_parent 268 | 269 | # List of decorators that create context managers from functions, such as 270 | # contextlib.contextmanager. 271 | contextmanager-decorators=contextlib.contextmanager 272 | 273 | 274 | [SPELLING] 275 | 276 | # Spelling dictionary name. Available dictionaries: none. To make it working 277 | # install python-enchant package. 
278 | spelling-dict= 279 | 280 | # List of comma separated words that should not be checked. 281 | spelling-ignore-words= 282 | 283 | # A path to a file that contains private dictionary; one word per line. 284 | spelling-private-dict-file= 285 | 286 | # Tells whether to store unknown words to indicated private dictionary in 287 | # --spelling-private-dict-file option instead of raising a message. 288 | spelling-store-unknown-words=no 289 | 290 | 291 | [DESIGN] 292 | 293 | # Maximum number of arguments for function / method 294 | max-args=5 295 | 296 | # Argument names that match this expression will be ignored. Default to name 297 | # with leading underscore 298 | ignored-argument-names=_.* 299 | 300 | # Maximum number of locals for function / method body 301 | max-locals=15 302 | 303 | # Maximum number of return / yield for function / method body 304 | max-returns=6 305 | 306 | # Maximum number of branch for function / method body 307 | max-branches=12 308 | 309 | # Maximum number of statements in function / method body 310 | max-statements=50 311 | 312 | # Maximum number of parents for a class (see R0901). 313 | max-parents=7 314 | 315 | # Maximum number of attributes for a class (see R0902). 316 | max-attributes=7 317 | 318 | # Minimum number of public methods for a class (see R0903). 319 | min-public-methods=2 320 | 321 | # Maximum number of public methods for a class (see R0904). 322 | max-public-methods=20 323 | 324 | 325 | [CLASSES] 326 | 327 | # List of method names used to declare (i.e. assign) instance attributes. 328 | defining-attr-methods=__init__,__new__,setUp 329 | 330 | # List of valid names for the first argument in a class method. 331 | valid-classmethod-first-arg=cls 332 | 333 | # List of valid names for the first argument in a metaclass class method. 334 | valid-metaclass-classmethod-first-arg=mcs 335 | 336 | # List of member names, which should be excluded from the protected access 337 | # warning. 
338 | exclude-protected=_asdict,_fields,_replace,_source,_make 339 | 340 | 341 | [IMPORTS] 342 | 343 | # Deprecated modules which should not be used, separated by a comma 344 | deprecated-modules=regsub,TERMIOS,Bastion,rexec 345 | 346 | # Create a graph of every (i.e. internal and external) dependencies in the 347 | # given file (report RP0402 must not be disabled) 348 | import-graph= 349 | 350 | # Create a graph of external dependencies in the given file (report RP0402 must 351 | # not be disabled) 352 | ext-import-graph= 353 | 354 | # Create a graph of internal dependencies in the given file (report RP0402 must 355 | # not be disabled) 356 | int-import-graph= 357 | 358 | 359 | [EXCEPTIONS] 360 | 361 | # Exceptions that will emit a warning when being caught. Defaults to 362 | # "Exception" 363 | overgeneral-exceptions=Exception 364 | -------------------------------------------------------------------------------- /certstream_analytics/analysers/common_domain_analyser.py: -------------------------------------------------------------------------------- 1 | """ 2 | The list of basic analysers includes: 3 | - WordSegmentation 4 | - IDNADecoder 5 | - HomoglyphsDecoder 6 | - FeaturesGenerator (generate various features for further downstream processing) 7 | - BulkDomainMarker 8 | """ 9 | import re 10 | import tldextract 11 | import wordsegment 12 | from nostril import nonsense 13 | import idna 14 | from confusable_homoglyphs import confusables 15 | 16 | from .base import Analyser 17 | 18 | 19 | # pylint: disable=too-few-public-methods 20 | class WordSegmentation(Analyser): 21 | """ 22 | Perform word segmentation of all the SAN domains as an attempt to make sense 23 | of their names. For example, both arch.mappleonline.com and apple-verifyupdate.serveftp.com 24 | domains have 'apple' inside but only the second one is an actual Apple phishing 25 | page. 
Intuitively, a good word segmentation algorithm will return: 26 | 27 | - arch + mapple + online + com 28 | - apple + verify + update + serve + ftp + com 29 | 30 | Thus, it's much easier to spot the second phishing domain. 31 | 32 | Implementation-wise, there are several existing packages around to do this, for 33 | example: 34 | 35 | - https://github.com/grantjenks/python-wordsegment 36 | - https://github.com/keredson/wordninja 37 | 38 | Let's see what they can do, take it away! 39 | """ 40 | # Some common stop words that are in the list of most popular domains 41 | STOPWORDS = { 42 | 'app': 1, 43 | 'inc': 1, 44 | 'box': 1, 45 | 'health': 1, 46 | 'home': 1, 47 | 'space': 1, 48 | 'cars': 1, 49 | 'nature': 1, 50 | } 51 | 52 | def __init__(self): 53 | """ 54 | Just load the wordsegment package, whatever it is. 55 | """ 56 | wordsegment.load() 57 | 58 | def run(self, record): 59 | """ 60 | Apply word segment to all the SAN domain names. Let's see if it makes 61 | any sense. 62 | """ 63 | if 'analysers' not in record: 64 | record['analysers'] = [] 65 | 66 | results = {} 67 | # Check the domain and all its SAN 68 | for domain in record['all_domains']: 69 | # Remove wild card 70 | domain = re.sub(r'^\*\.', '', domain) 71 | 72 | # The TLD will be stripped off cause it does not contribute anything here 73 | ext = tldextract.extract(domain) 74 | 75 | words = [] 76 | # We choose to segment the TLD here as well, for example, .co.uk 77 | # will become ['co', 'uk']. Let see if this works out. 78 | for part in ext[:]: 79 | for token in part.split('.'): 80 | segmented = [w for w in wordsegment.segment(token) if w not in WordSegmentation.STOPWORDS] 81 | 82 | if segmented: 83 | words.extend(segmented) 84 | elif token: 85 | # For some IDNA domain like xn--wgbfq3d.xn--ngbc5azd, the segmentation 86 | # won't work and an empty array is returned. 
So we choose to just keep 87 | # the original token 88 | words.append(token) 89 | 90 | results[domain] = words 91 | 92 | if results: 93 | record['analysers'].append({ 94 | 'analyser': type(self).__name__, 95 | 'output': results, 96 | }) 97 | 98 | return record 99 | 100 | 101 | class BulkDomainMarker(Analyser): 102 | """ 103 | Mark the record that has tons of SAN domains in it. Most of the time, they are 104 | completely unrelated domains and probably the result of some bulk registration 105 | process. Benign or not, they are still suspicious and probably spam. We can also 106 | verify the similarity among these domains. A lower similarity score means these 107 | domains are totally unrelated. 108 | """ 109 | # Take a histogram here and find out the suitable value for this 110 | THRESHOLD = 15 111 | 112 | def __init__(self, threshold=THRESHOLD): 113 | """ 114 | Set the threshold to mark the record as a bulk record. 115 | """ 116 | self.threshold = threshold 117 | 118 | def run(self, record): 119 | """ 120 | See if the record is a bulk record. We will just use the threshold as 121 | the indicator for now. So if a record has more SAN names than the 122 | threshold, it is a bulk record. 123 | """ 124 | if 'analysers' not in record: 125 | record['analysers'] = [] 126 | 127 | is_bulked = len(record['all_domains']) >= self.threshold 128 | 129 | record['analysers'].append({ 130 | 'analyser': type(self).__name__, 131 | 'output': is_bulked, 132 | }) 133 | 134 | return record 135 | 136 | 137 | class IDNADecoder(Analyser): 138 | """ 139 | Decode all domains in IDNA format. 140 | """ 141 | def run(self, record): 142 | """ 143 | Check if a domain in the list is in IDNA format and convert it back to 144 | Unicode. 
145 | """ 146 | decoded = [] 147 | 148 | for domain in record['all_domains']: 149 | wildcard = False 150 | 151 | try: 152 | if re.match(r'^\*\.', domain): 153 | wildcard = True 154 | # Remove wildcard cause it interfere with the IDNA module 155 | # and we'll put it back later 156 | domain = re.sub(r'^\*\.', '', domain) 157 | 158 | domain = idna.decode(domain) 159 | 160 | except idna.core.InvalidCodepoint: 161 | # Fail to decode the domain, just keep it as it is for now 162 | pass 163 | except UnicodeError: 164 | pass 165 | finally: 166 | if wildcard: 167 | domain = '*.{}'.format(domain) 168 | 169 | decoded.append(domain) 170 | 171 | record['all_domains'] = decoded 172 | return record 173 | 174 | 175 | class HomoglyphsDecoder(Analyser): 176 | """ 177 | Smartly convert domains whose names include some suspicious homoglyphs to 178 | ASCII. This will probably need to be right done after IDNA conversion and 179 | before other analysers so that they can get benefits from it. 180 | """ 181 | def __init__(self, greedy=False): 182 | """ 183 | We rely on the confusable-homoglyphs at https://github.com/vhf/confusable_homoglyphs 184 | to do its magic. 185 | 186 | If the greedy flag is set, all alternative domains will be returned. Otherwise, only 187 | the first one will be available. 
188 | """ 189 | self.greedy = greedy 190 | 191 | @staticmethod 192 | def is_latin(alt): 193 | """ 194 | Check if a string is in Latin cause, in our specific case, we will 195 | only care about Latin characters 196 | """ 197 | lower_s = range(ord('a'), ord('z') + 1) 198 | upper_s = range(ord('A'), ord('Z') + 1) 199 | 200 | # We need to check the length of the homoglyph here cause 201 | # confusable_homoglyphs library nicely returns multi-character 202 | # match as well, for example, 'rn' has an alternative of 'm' 203 | for alt_c in alt: 204 | if ord(alt_c) not in lower_s and ord(alt_c) not in upper_s: 205 | return False 206 | 207 | return True 208 | 209 | def run(self, record): 210 | """ 211 | Using the confusable-homoglyphs, we are going to generate all alternatives ASCII 212 | names of a domain. It's a bit of a brute force though. 213 | """ 214 | decoded = [] 215 | 216 | for domain in record['all_domains']: 217 | wildcard = False 218 | 219 | if re.match(r'^\*\.', domain): 220 | wildcard = True 221 | # Remove wild card to simplify the domain name a bit and we'll put it back later 222 | domain = re.sub(r'^\*\.', '', domain) 223 | 224 | hg_map = {hg['character']: hg for hg in confusables.is_confusable(domain, greedy=True)} 225 | decoded_domain_c = [] 226 | 227 | for domain_c in domain: 228 | # Confusable homoglyphs could not find any homoglyphs for this character 229 | # so we decide to keep the original character as it is 230 | if domain_c not in hg_map: 231 | decoded_domain_c.append([domain_c]) 232 | continue 233 | 234 | found = [] 235 | hglyph = hg_map[domain_c] 236 | 237 | if hglyph['alias'] == 'LATIN': 238 | # The character is Latin, we don't need to do anything here 239 | found.append(hglyph['character']) 240 | 241 | for alt in hglyph['homoglyphs']: 242 | if HomoglyphsDecoder.is_latin(alt['c']): 243 | found.append(alt['c'].lower()) 244 | 245 | # If nothing is found, we keep the original character 246 | if not found: 247 | found.append(hglyph['character']) 248 | 
249 | decoded_domain_c.append(found) 250 | 251 | for alt in self._generate_alternatives(decoded_domain_c): 252 | if wildcard: 253 | alt = '*.{}'.format(alt) 254 | 255 | decoded.append(alt) 256 | 257 | if not self.greedy: 258 | break 259 | 260 | record['all_domains'] = decoded 261 | return record 262 | 263 | def _generate_alternatives(self, alt_characters, index=0, current=''): 264 | """ 265 | Generate all alternative ASCII names of a domain using the list of all 266 | alternative characters. 267 | """ 268 | if index == len(alt_characters): 269 | yield current 270 | 271 | else: 272 | for alt_c in alt_characters[index]: 273 | yield from self._generate_alternatives(alt_characters, 274 | index + 1, 275 | current + alt_c) 276 | 277 | 278 | class FeaturesGenerator(Analyser): 279 | """ 280 | Generate features to detect outliers in the stream. In our case, the outliers is 281 | the 'suspicious' phishing domains. 282 | """ 283 | NOSTRIL_LENGTH_LIMIT = 6 284 | 285 | # pylint: disable=invalid-name 286 | def run(self, record): 287 | """ 288 | The list of features will be: 289 | - The number of domain parts, for example, www.google.com is 3. 290 | - The overall length in characters. 291 | - The length of the longest domain part. 292 | - The length of the TLD, e.g. .online or .download is longer than .com. 293 | - The randomness level of the domain. 
294 | """ 295 | if 'analysers' not in record: 296 | record['analysers'] = [] 297 | 298 | x_samples = [] 299 | Y_samples = [] 300 | 301 | for analyser in record['analysers']: 302 | if analyser['analyser'] != 'WordSegmentation': 303 | continue 304 | 305 | for domain, segments in analyser['output'].items(): 306 | # Remove wildcard domain 307 | domain = re.sub(r'^\*\.', '', domain) 308 | 309 | parts = domain.split('.') 310 | 311 | x = [] 312 | # Compute the number of domain parts 313 | x.append(len(parts)) 314 | 315 | # Compute the length of the whole domain 316 | x.append(len(domain)) 317 | 318 | longest = '' 319 | # Compute the length of the longest domain parts 320 | for part in parts: 321 | if len(part) > len(longest): 322 | longest = part 323 | 324 | x.append(len(longest)) 325 | 326 | # Compute the length of the TLD 327 | x.append(len(parts[-1])) 328 | 329 | randomness_count = 0 330 | # The nostril package which we are using to detect non-sense words 331 | # in the domain only returns a boolean verdict so may be we need to 332 | # think of how we want to quantify this 333 | for w in segments: 334 | try: 335 | if len(w) >= FeaturesGenerator.NOSTRIL_LENGTH_LIMIT and nonsense(w): 336 | randomness_count += 1 337 | except ValueError: 338 | continue 339 | 340 | x.append(randomness_count / len(segments)) 341 | 342 | x_samples.append(x) 343 | Y_samples.append('usual_suspect' in record) 344 | 345 | break 346 | 347 | record['analysers'].append({ 348 | 'analyser': type(self).__name__, 349 | 'output': x_samples, 350 | }) 351 | 352 | return record 353 | -------------------------------------------------------------------------------- /tests/samples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "data": { 4 | "cert_index": 447858050, 5 | "cert_link": "http://ct.googleapis.com/rocketeer/ct/v1/get-entries?start=447858050&end=447858050", 6 | "chain": [ 7 | { 8 | "as_der": "REDACT", 9 | "extensions": { 10 | 
"authorityInfoAccess": "OCSP - URI:http://ocsp.comodoca.com\nCA Issuers - URI:http://crt.comodoca.com/COMODORSAAddTrustCA.crt\n", 11 | "authorityKeyIdentifier": "keyid:BB:AF:7E:02:3D:FA:A6:F1:3C:84:8E:AD:EE:38:98:EC:D9:32:32:D4\n", 12 | "basicConstraints": "CA:TRUE", 13 | "certificatePolicies": "Policy: 2.23.140.1.2.1\nPolicy: 1.3.6.1.4.1.6449.1.2.2.52", 14 | "crlDistributionPoints": "Full Name:\n URI:http://crl.comodoca.com/COMODORSACertificationAuthority.crl", 15 | "extendedKeyUsage": "TLS Web server authentication, TLS Web client authentication", 16 | "keyUsage": "Digital Signature, Key Cert Sign, C R L Sign", 17 | "subjectKeyIdentifier": "7E:03:5A:65:41:6B:A7:7E:0A:E1:B8:9D:08:EA:1D:8E:1D:6A:C7:65" 18 | }, 19 | "fingerprint": "76:4D:2F:A5:9E:D1:23:F9:C9:55:70:C4:03:C9:2F:EF:33:8E:A7:45", 20 | "not_after": 1747526399, 21 | "not_before": 1431907200, 22 | "serial_number": "F01D4BEE7B7CA37B3C0566AC05972458", 23 | "subject": { 24 | "C": "US", 25 | "CN": "cPanel, Inc. Certification Authority", 26 | "L": "Houston", 27 | "O": "cPanel, Inc.", 28 | "OU": null, 29 | "ST": "TX", 30 | "aggregated": "/C=US/CN=cPanel, Inc. 
Certification Authority/L=Houston/O=cPanel, Inc./ST=TX" 31 | } 32 | }, 33 | { 34 | "as_der": "REDACT", 35 | "extensions": { 36 | "basicConstraints": "CA:TRUE", 37 | "keyUsage": "Key Cert Sign, C R L Sign", 38 | "subjectKeyIdentifier": "BB:AF:7E:02:3D:FA:A6:F1:3C:84:8E:AD:EE:38:98:EC:D9:32:32:D4" 39 | }, 40 | "fingerprint": "AF:E5:D2:44:A8:D1:19:42:30:FF:47:9F:E2:F8:97:BB:CD:7A:8C:B4", 41 | "not_after": 2147471999, 42 | "not_before": 1263859200, 43 | "serial_number": "4CAAF9CADB636FE01FF74ED85B03869D", 44 | "subject": { 45 | "C": "GB", 46 | "CN": "COMODO RSA Certification Authority", 47 | "L": "Salford", 48 | "O": "COMODO CA Limited", 49 | "OU": null, 50 | "ST": "Greater Manchester", 51 | "aggregated": "/C=GB/CN=COMODO RSA Certification Authority/L=Salford/O=COMODO CA Limited/ST=Greater Manchester" 52 | } 53 | } 54 | ], 55 | "leaf_cert": { 56 | "all_domains": [ 57 | "firmyfarmaceutyczneeu.redirect.rejestracjadomen.pl", 58 | "www.firmyfarmaceutyczneeu.redirect.rejestracjadomen.pl" 59 | ], 60 | "as_der": "REDACT", 61 | "extensions": { 62 | "authorityInfoAccess": "OCSP - URI:http://ocsp.comodoca.com\nCA Issuers - URI:http://crt.comodoca.com/cPanelIncCertificationAuthority.crt\n", 63 | "authorityKeyIdentifier": "keyid:7E:03:5A:65:41:6B:A7:7E:0A:E1:B8:9D:08:EA:1D:8E:1D:6A:C7:65\n", 64 | "basicConstraints": "CA:FALSE", 65 | "certificatePolicies": "Policy: 2.23.140.1.2.1\nPolicy: 1.3.6.1.4.1.6449.1.2.2.52\n CPS: https://secure.comodo.com/CPS", 66 | "crlDistributionPoints": "Full Name:\n URI:http://crl.comodoca.com/cPanelIncCertificationAuthority.crl", 67 | "ctlPoisonByte": true, 68 | "extendedKeyUsage": "TLS Web server authentication, TLS Web client authentication", 69 | "keyUsage": "Digital Signature, Key Encipherment", 70 | "subjectAltName": "DNS:www.firmyfarmaceutyczneeu.redirect.rejestracjadomen.pl, DNS:firmyfarmaceutyczneeu.redirect.rejestracjadomen.pl", 71 | "subjectKeyIdentifier": "A6:F3:1B:BD:CB:A6:E0:95:E4:EA:86:C5:9D:FE:BC:9E:B1:C4:0B:FD" 72 | }, 73 | 
"fingerprint": "32:8B:E0:CA:41:25:E0:EB:CD:92:29:7F:F3:17:3C:06:2C:3C:1F:D0", 74 | "not_after": 1546473599, 75 | "not_before": 1538611200, 76 | "serial_number": "DA28422511646C0552500F3DEE0AC20", 77 | "subject": { 78 | "C": null, 79 | "CN": "firmyfarmaceutyczneeu.redirect.rejestracjadomen.pl", 80 | "L": null, 81 | "O": null, 82 | "OU": null, 83 | "ST": null, 84 | "aggregated": "/CN=firmyfarmaceutyczneeu.redirect.rejestracjadomen.pl" 85 | } 86 | }, 87 | "seen": 1538635262.355275, 88 | "source": { 89 | "name": "Google 'Rocketeer' log", 90 | "url": "ct.googleapis.com/rocketeer/" 91 | }, 92 | "update_type": "PrecertLogEntry" 93 | }, 94 | "message_type": "certificate_update" 95 | }, 96 | { 97 | "data": { 98 | "cert_index": 447858049, 99 | "cert_link": "http://ct.googleapis.com/rocketeer/ct/v1/get-entries?start=447858049&end=447858049", 100 | "chain": [ 101 | { 102 | "as_der": "REDACT", 103 | "extensions": { 104 | "authorityInfoAccess": "CA Issuers - URI:http://apps.identrust.com/roots/dstrootcax3.p7c\nOCSP - URI:http://isrg.trustid.ocsp.identrust.com\n", 105 | "authorityKeyIdentifier": "keyid:C4:A7:B1:A4:7B:2C:71:FA:DB:E1:4B:90:75:FF:C4:15:60:85:89:10\n", 106 | "basicConstraints": "CA:TRUE", 107 | "certificatePolicies": "Policy: 1.3.6.1.4.1.44947.1.1.1\n CPS: http://cps.root-x1.letsencrypt.org", 108 | "crlDistributionPoints": "Full Name:\n URI:http://crl.identrust.com/DSTROOTCAX3CRL.crl", 109 | "keyUsage": "Digital Signature, Key Cert Sign, C R L Sign", 110 | "subjectKeyIdentifier": "A8:4A:6A:63:04:7D:DD:BA:E6:D1:39:B7:A6:45:65:EF:F3:A8:EC:A1" 111 | }, 112 | "fingerprint": "E6:A3:B4:5B:06:2D:50:9B:33:82:28:2D:19:6E:FE:97:D5:95:6C:CB", 113 | "not_after": 1615999246, 114 | "not_before": 1458232846, 115 | "serial_number": "A0141420000015385736A0B85ECA708", 116 | "subject": { 117 | "C": "US", 118 | "CN": "Let's Encrypt Authority X3", 119 | "L": null, 120 | "O": "Let's Encrypt", 121 | "OU": null, 122 | "ST": null, 123 | "aggregated": "/C=US/CN=Let's Encrypt Authority 
X3/O=Let's Encrypt" 124 | } 125 | }, 126 | { 127 | "as_der": "REDACT", 128 | "extensions": { 129 | "basicConstraints": "CA:TRUE", 130 | "keyUsage": "Key Cert Sign, C R L Sign", 131 | "subjectKeyIdentifier": "C4:A7:B1:A4:7B:2C:71:FA:DB:E1:4B:90:75:FF:C4:15:60:85:89:10" 132 | }, 133 | "fingerprint": "DA:C9:02:4F:54:D8:F6:DF:94:93:5F:B1:73:26:38:CA:6A:D7:7C:13", 134 | "not_after": 1633010475, 135 | "not_before": 970348339, 136 | "serial_number": "44AFB080D6A327BA893039862EF8406B", 137 | "subject": { 138 | "C": null, 139 | "CN": "DST Root CA X3", 140 | "L": null, 141 | "O": "Digital Signature Trust Co.", 142 | "OU": null, 143 | "ST": null, 144 | "aggregated": "/CN=DST Root CA X3/O=Digital Signature Trust Co." 145 | } 146 | } 147 | ], 148 | "leaf_cert": { 149 | "all_domains": [ 150 | "rundschleifmaschinen-service.de", 151 | "www.rundschleifmaschinen-service.de" 152 | ], 153 | "as_der": "REDACT", 154 | "extensions": { 155 | "authorityInfoAccess": "CA Issuers - URI:http://cert.int-x3.letsencrypt.org/\nOCSP - URI:http://ocsp.int-x3.letsencrypt.org\n", 156 | "authorityKeyIdentifier": "keyid:A8:4A:6A:63:04:7D:DD:BA:E6:D1:39:B7:A6:45:65:EF:F3:A8:EC:A1\n", 157 | "basicConstraints": "CA:FALSE", 158 | "certificatePolicies": "Policy: 1.3.6.1.4.1.44947.1.1.1\n CPS: http://cps.letsencrypt.org\n User Notice: is Certificate may only be relied upon by Relying Parties and only in accordance with the Certificate Policy found at https://letsencrypt.org/repository/", 159 | "ctlSignedCertificateTimestamp": "BIHyAPAAdQBvU3asMfAxGdiZAKRRFf93FRwR2QLBACkGjbIImjfZEwAAAWYyuWCoAAAEAwBGMEQCIHAfF-WDz1YkPCONYN0aXohfUPFrhiKG61tXfDilc3dUAiB0oHYT0e5eCKi5k9mEzRpqC-NdvhEtr8qKBlxEoiQsGwB3ACk8UZZUyDlluqpQ_FgH1Ldvv1h6KXLcpMMM9OVFR_R4AAABZjK5YkwAAAQDAEgwRgIhAKlrVU0Na8GF1AT7lCpeUJMchwfHnFsjswnpultsgKQhAiEAuPvplxBQsMHbioLdPsNRQSr-xUHV2g7yZkUnKqZHbnQ=", 160 | "extendedKeyUsage": "TLS Web server authentication, TLS Web client authentication", 161 | "keyUsage": "Digital Signature, Key Encipherment", 162 | 
"subjectAltName": "DNS:www.rundschleifmaschinen-service.de, DNS:rundschleifmaschinen-service.de", 163 | "subjectKeyIdentifier": "E3:45:2E:7F:5C:8D:B4:17:CC:B8:73:09:E3:DA:F7:F3:F9:ED:F2:15" 164 | }, 165 | "fingerprint": "9A:3A:AF:F8:DC:A4:18:4B:B6:46:61:F7:29:46:FA:42:9B:CA:9C:71", 166 | "not_after": 1546221701, 167 | "not_before": 1538445701, 168 | "serial_number": "3428B7C70A67819D5B9E7A13D2B9B8C778F", 169 | "subject": { 170 | "C": null, 171 | "CN": "rundschleifmaschinen-service.de", 172 | "L": null, 173 | "O": null, 174 | "OU": null, 175 | "ST": null, 176 | "aggregated": "/CN=rundschleifmaschinen-service.de" 177 | } 178 | }, 179 | "seen": 1538635262.353125, 180 | "source": { 181 | "name": "Google 'Rocketeer' log", 182 | "url": "ct.googleapis.com/rocketeer/" 183 | }, 184 | "update_type": "X509LogEntry" 185 | }, 186 | "message_type": "certificate_update" 187 | }, 188 | { 189 | "data": { 190 | "cert_index": 447857993, 191 | "cert_link": "http://ct.googleapis.com/rocketeer/ct/v1/get-entries?start=447857993&end=447857993", 192 | "chain": [ 193 | { 194 | "as_der": "REDACT", 195 | "extensions": { 196 | "authorityInfoAccess": "CA Issuers - URI:http://apps.identrust.com/roots/dstrootcax3.p7c\nOCSP - URI:http://isrg.trustid.ocsp.identrust.com\n", 197 | "authorityKeyIdentifier": "keyid:C4:A7:B1:A4:7B:2C:71:FA:DB:E1:4B:90:75:FF:C4:15:60:85:89:10\n", 198 | "basicConstraints": "CA:TRUE", 199 | "certificatePolicies": "Policy: 1.3.6.1.4.1.44947.1.1.1\n CPS: http://cps.root-x1.letsencrypt.org", 200 | "crlDistributionPoints": "Full Name:\n URI:http://crl.identrust.com/DSTROOTCAX3CRL.crl", 201 | "keyUsage": "Digital Signature, Key Cert Sign, C R L Sign", 202 | "subjectKeyIdentifier": "A8:4A:6A:63:04:7D:DD:BA:E6:D1:39:B7:A6:45:65:EF:F3:A8:EC:A1" 203 | }, 204 | "fingerprint": "E6:A3:B4:5B:06:2D:50:9B:33:82:28:2D:19:6E:FE:97:D5:95:6C:CB", 205 | "not_after": 1615999246, 206 | "not_before": 1458232846, 207 | "serial_number": "A0141420000015385736A0B85ECA708", 208 | "subject": { 209 | 
"C": "US", 210 | "CN": "Let's Encrypt Authority X3", 211 | "L": null, 212 | "O": "Let's Encrypt", 213 | "OU": null, 214 | "ST": null, 215 | "aggregated": "/C=US/CN=Let's Encrypt Authority X3/O=Let's Encrypt" 216 | } 217 | }, 218 | { 219 | "as_der": "REDACT", 220 | "extensions": { 221 | "basicConstraints": "CA:TRUE", 222 | "keyUsage": "Key Cert Sign, C R L Sign", 223 | "subjectKeyIdentifier": "C4:A7:B1:A4:7B:2C:71:FA:DB:E1:4B:90:75:FF:C4:15:60:85:89:10" 224 | }, 225 | "fingerprint": "DA:C9:02:4F:54:D8:F6:DF:94:93:5F:B1:73:26:38:CA:6A:D7:7C:13", 226 | "not_after": 1633010475, 227 | "not_before": 970348339, 228 | "serial_number": "44AFB080D6A327BA893039862EF8406B", 229 | "subject": { 230 | "C": null, 231 | "CN": "DST Root CA X3", 232 | "L": null, 233 | "O": "Digital Signature Trust Co.", 234 | "OU": null, 235 | "ST": null, 236 | "aggregated": "/CN=DST Root CA X3/O=Digital Signature Trust Co." 237 | } 238 | } 239 | ], 240 | "leaf_cert": { 241 | "all_domains": [ 242 | "www.runaflohmarkt.de" 243 | ], 244 | "as_der": "REDACT", 245 | "extensions": { 246 | "authorityInfoAccess": "CA Issuers - URI:http://cert.int-x3.letsencrypt.org/\nOCSP - URI:http://ocsp.int-x3.letsencrypt.org\n", 247 | "authorityKeyIdentifier": "keyid:A8:4A:6A:63:04:7D:DD:BA:E6:D1:39:B7:A6:45:65:EF:F3:A8:EC:A1\n", 248 | "basicConstraints": "CA:FALSE", 249 | "certificatePolicies": "Policy: 1.3.6.1.4.1.44947.1.1.1\n CPS: http://cps.letsencrypt.org\n User Notice: is Certificate may only be relied upon by Relying Parties and only in accordance with the Certificate Policy found at https://letsencrypt.org/repository/", 250 | "ctlSignedCertificateTimestamp": "BIHxAO8AdQBvU3asMfAxGdiZAKRRFf93FRwR2QLBACkGjbIImjfZEwAAAWX9AeXwAAAEAwBGMEQCICIkjDXPcRgbcoKuh8Ciu_1sIVVKj_oGb-bzc8zPyhF2AiAhCQMKgrBcxZpZpGgOEgyBxIX6WqJFDOGamrWW-I55IAB2ACk8UZZUyDlluqpQ_FgH1Ldvv1h6KXLcpMMM9OVFR_R4AAABZf0B56wAAAQDAEcwRQIhAPNKe7X7XqNZF7H4NOWW-DtSvx1jVWxqsZVnknCjrkjrAiBTIKM-qsi4QMFHbTRfxz4tiRvI14vCXDAbyoLgbp6BKw==", 251 | "extendedKeyUsage": 
"TLS Web server authentication, TLS Web client authentication", 252 | "keyUsage": "Digital Signature, Key Encipherment", 253 | "subjectAltName": "DNS:www.runaflohmarkt.de", 254 | "subjectKeyIdentifier": "7C:82:16:CB:31:94:C6:C5:5C:72:A1:37:CA:AE:B9:9B:3D:73:3E:9B" 255 | }, 256 | "fingerprint": "AD:5E:3D:91:50:46:7E:C6:D9:30:FD:65:11:8B:CE:81:FF:29:49:B9", 257 | "not_after": 1545320484, 258 | "not_before": 1537544484, 259 | "serial_number": "36434086EFE2BB58A2068BBA9F2E96B7898", 260 | "subject": { 261 | "C": null, 262 | "CN": "www.runaflohmarkt.de", 263 | "L": null, 264 | "O": null, 265 | "OU": null, 266 | "ST": null, 267 | "aggregated": "/CN=www.runaflohmarkt.de" 268 | } 269 | }, 270 | "seen": 1538635262.249552, 271 | "source": { 272 | "name": "Google 'Rocketeer' log", 273 | "url": "ct.googleapis.com/rocketeer/" 274 | }, 275 | "update_type": "X509LogEntry" 276 | }, 277 | "message_type": "certificate_update" 278 | } 279 | ] 280 | -------------------------------------------------------------------------------- /tests/test_domain_matching_analyser.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Common domain matching analyser. 3 | ''' 4 | import copy 5 | import os 6 | import unittest 7 | 8 | from certstream_analytics.analysers import AhoCorasickDomainMatching 9 | from certstream_analytics.analysers import WordSegmentation 10 | from certstream_analytics.analysers import DomainMatching, DomainMatchingOption 11 | from certstream_analytics.analysers import BulkDomainMarker 12 | from certstream_analytics.analysers import IDNADecoder 13 | from certstream_analytics.analysers import HomoglyphsDecoder 14 | 15 | 16 | class DomainMatchingTest(unittest.TestCase): 17 | ''' 18 | Test all the common domain matching analysers. 19 | ''' 20 | def test_ahocorasick(self): 21 | ''' 22 | Compare some mock domains against the list of most popular domains 23 | using Aho-Corasick algorithm. 
24 | ''' 25 | # Load the mock list of common domains for testing. 26 | current_dir = os.path.dirname(os.path.realpath(__file__)) 27 | 28 | with open(os.path.join(current_dir, 'opendns-top-domains.txt')) as fhandle: 29 | domains = [line.rstrip() for line in fhandle] 30 | 31 | ahocorasick_analyser = AhoCorasickDomainMatching(domains) 32 | 33 | cases = [ 34 | { 35 | 'data': { 36 | 'all_domains': [ 37 | 'store.google.com', 38 | 'google.com', 39 | ], 40 | }, 41 | 'expected': [ 42 | { 43 | 'analyser': 'AhoCorasickDomainMatching', 44 | 'output': { 45 | 'store.google.com': ['google.com'], 46 | }, 47 | }, 48 | ], 49 | 'description': 'An exact match domain', 50 | }, 51 | 52 | { 53 | 'data': { 54 | 'all_domains': [ 55 | 'www.facebook.com.msg40.site', 56 | ], 57 | }, 58 | 'expected': [ 59 | { 60 | 'analyser': 'AhoCorasickDomainMatching', 61 | 'output': { 62 | 'www.facebook.com.msg40.site': ['facebook.com'], 63 | }, 64 | }, 65 | ], 66 | 'description': 'A sample phishing domain with a sub-domain match', 67 | }, 68 | 69 | { 70 | 'data': { 71 | 'all_domains': [ 72 | 'login-appleid.apple.com.managesuppport.co', 73 | ], 74 | }, 75 | 'expected': [ 76 | { 77 | 'analyser': 'AhoCorasickDomainMatching', 78 | 'output': { 79 | 'login-appleid.apple.com.managesuppport.co': ['apple.com'], 80 | }, 81 | }, 82 | ], 83 | 'description': 'A sample phishing domain with a partial string match', 84 | }, 85 | 86 | { 87 | 'data': { 88 | 'all_domains': [ 89 | 'socket.io', 90 | ], 91 | }, 92 | 'expected': [], 93 | 'description': 'A non-matching domain (not in the list of most popular domains)', 94 | }, 95 | 96 | { 97 | 'data': { 98 | 'all_domains': [ 99 | 'www.foobar2000.com', 100 | ], 101 | }, 102 | 'expected': [], 103 | 'description': 'A non-matching domain (excluded pattern)', 104 | }, 105 | 106 | { 107 | 'data': { 108 | 'all_domains': [ 109 | 'autodiscover.blablabla.com', 110 | ], 111 | }, 112 | 'expected': [], 113 | 'description': 'Match a ignored pattern', 114 | }, 115 | ] 116 | 117 | for case in 
cases: 118 | got = ahocorasick_analyser.run(case['data']) 119 | self.assertListEqual(got['analysers'], case['expected'], case['description']) 120 | 121 | def test_wordsegmentation(self): 122 | ''' 123 | Try to segment some domains and check the result. 124 | ''' 125 | wordsegmentation = WordSegmentation() 126 | 127 | cases = [ 128 | { 129 | 'data': { 130 | 'all_domains': [ 131 | 'store.google.com', 132 | 'google.com', 133 | ], 134 | }, 135 | 'expected': [ 136 | { 137 | 'analyser': 'WordSegmentation', 138 | 'output': { 139 | 'store.google.com': ['store', 'google', 'com'], 140 | 'google.com': ['google', 'com'], 141 | }, 142 | }, 143 | ], 144 | 'description': 'A legit domain', 145 | }, 146 | 147 | { 148 | 'data': { 149 | 'all_domains': [ 150 | 'www.facebook.com.msg40.site', 151 | ], 152 | }, 153 | 'expected': [ 154 | { 155 | 'analyser': 'WordSegmentation', 156 | 'output': { 157 | 'www.facebook.com.msg40.site': ['www', 'facebook', 'com', 'msg40', 'site'], 158 | }, 159 | }, 160 | ], 161 | 'description': 'Word segmentation using the domain separator (dot)', 162 | }, 163 | 164 | { 165 | 'data': { 166 | 'all_domains': [ 167 | 'login-appleid.apple.com.managesuppport.co', 168 | ], 169 | }, 170 | 'expected': [ 171 | { 172 | 'analyser': 'WordSegmentation', 173 | 'output': { 174 | 'login-appleid.apple.com.managesuppport.co': [ 175 | 'login', 176 | 'apple', 177 | 'id', 178 | 'apple', 179 | 'com', 180 | 'manage', 181 | 'suppport', 182 | 'co' 183 | ], 184 | }, 185 | }, 186 | ], 187 | 'description': 'Word segmentation using dictionary', 188 | }, 189 | 190 | { 191 | 'data': { 192 | 'all_domains': [ 193 | 'arch.mappleonline.com', 194 | ], 195 | }, 196 | 'expected': [ 197 | { 198 | 'analyser': 'WordSegmentation', 199 | 'output': { 200 | 'arch.mappleonline.com': ['arch', 'm', 'apple', 'online', 'com'], 201 | }, 202 | }, 203 | ], 204 | 'description': 'Failed to segment the word correctly', 205 | }, 206 | 207 | { 208 | 'data': { 209 | 'all_domains': [ 210 | 'www.freybrothersinc.com', 211 
| ], 212 | }, 213 | 'expected': [ 214 | { 215 | 'analyser': 'WordSegmentation', 216 | 'output': { 217 | 'www.freybrothersinc.com': ['www', 'frey', 'brothers', 'com'], 218 | }, 219 | }, 220 | ], 221 | 'description': 'Ignore certain stop words (inc) when doing segmentation', 222 | }, 223 | ] 224 | 225 | for case in cases: 226 | got = wordsegmentation.run(case['data']) 227 | self.assertListEqual(got['analysers'], case['expected'], case['description']) 228 | 229 | def test_domain_matching(self): 230 | ''' 231 | Combine the result of all domain matching analysers into one. 232 | ''' 233 | # The first option decides if the TLD is included in the match 234 | options = [ 235 | (True, DomainMatchingOption.SUBSET_MATCH), 236 | (False, DomainMatchingOption.SUBSET_MATCH), 237 | (True, DomainMatchingOption.ORDER_MATCH), 238 | (False, DomainMatchingOption.ORDER_MATCH), 239 | ] 240 | 241 | analysers = {o: DomainMatching(include_tld=o[0], option=o[1]) for o in options} 242 | 243 | cases = [ 244 | { 245 | 'data': { 246 | 'all_domains': [ 247 | 'store.google.com', 248 | 'google.com', 249 | ], 250 | 251 | 'analysers': [ 252 | { 253 | 'analyser': 'AhoCorasickDomainMatching', 254 | 'output': { 255 | 'store.google.com': ['google.com'], 256 | }, 257 | }, 258 | 259 | { 260 | 'analyser': 'WordSegmentation', 261 | 'output': { 262 | 'store.google.com': ['store', 'google', 'com'], 263 | 'google.com': ['google', 'com'], 264 | }, 265 | }, 266 | ], 267 | }, 268 | 'expected': { 269 | (True, DomainMatchingOption.SUBSET_MATCH): [], 270 | (False, DomainMatchingOption.SUBSET_MATCH): [], 271 | (True, DomainMatchingOption.ORDER_MATCH): [], 272 | (False, DomainMatchingOption.ORDER_MATCH): [], 273 | }, 274 | 'description': 'A legit domain so it will be skipped (no match reported)', 275 | }, 276 | 277 | { 278 | 'data': { 279 | 'all_domains': [ 280 | 'login-appleid.managesuppport.com', 281 | ], 282 | 283 | 'analysers': [ 284 | { 285 | 'analyser': 'AhoCorasickDomainMatching', 286 | 'output': { 287 | 
'login-appleid.managesuppport.com': ['apple.com'], 288 | }, 289 | }, 290 | 291 | { 292 | 'analyser': 'WordSegmentation', 293 | 'output': { 294 | 'login-appleid.managesuppport.com': [ 295 | 'login', 296 | 'apple', 297 | 'id', 298 | 'manage', 299 | 'suppport' 300 | ], 301 | }, 302 | }, 303 | ], 304 | }, 305 | 'expected': { 306 | (True, DomainMatchingOption.SUBSET_MATCH): [], 307 | (False, DomainMatchingOption.SUBSET_MATCH): [ 308 | { 309 | 'analyser': 'DomainMatching', 310 | 'output': { 311 | 'login-appleid.managesuppport.com': ['apple.com'] 312 | }, 313 | }, 314 | ], 315 | (True, DomainMatchingOption.ORDER_MATCH): [], 316 | (False, DomainMatchingOption.ORDER_MATCH): [ 317 | { 318 | 'analyser': 'DomainMatching', 319 | 'output': { 320 | 'login-appleid.managesuppport.com': ['apple.com'] 321 | }, 322 | }, 323 | ], 324 | }, 325 | 'description': 'Find a matching phishing domain', 326 | }, 327 | 328 | { 329 | 'data': { 330 | 'all_domains': [ 331 | 'djunprotected.com', 332 | 'www.djunprotected.com' 333 | ], 334 | 335 | 'analysers': [ 336 | { 337 | 'analyser': 'AhoCorasickDomainMatching', 338 | 'output': { 339 | 'djunprotected.com': ['ted.com'] 340 | } 341 | }, 342 | 343 | { 344 | 'analyser': 'WordSegmentation', 345 | 'output': { 346 | 'djunprotected.com': ['dj', 'unprotected', 'com'], 347 | 'www.djunprotected.com': ['www', 'dj', 'unprotected', 'com'] 348 | } 349 | }, 350 | ], 351 | }, 352 | 'expected': { 353 | (True, DomainMatchingOption.SUBSET_MATCH): [], 354 | (False, DomainMatchingOption.SUBSET_MATCH): [], 355 | (True, DomainMatchingOption.ORDER_MATCH): [], 356 | (False, DomainMatchingOption.ORDER_MATCH): [], 357 | }, 358 | 'description': 'Find a matching phishing domain', 359 | }, 360 | ] 361 | 362 | for case in cases: 363 | for option, analyser in analysers.items(): 364 | expected = copy.deepcopy(case['data']['analysers']) 365 | expected.extend(case['expected'][option]) 366 | 367 | got = analyser.run(case['data']) 368 | self.assertListEqual(got['analysers'], expected, 
369 | '{} ({})'.format(case['description'], option)) 370 | 371 | def test_bulk_domain_marker(self): 372 | ''' 373 | Test the bulk domain analyser. 374 | ''' 375 | bulky = BulkDomainMarker() 376 | 377 | cases = [ 378 | { 379 | 'data': { 380 | 'all_domains': [ 381 | 'store.google.com', 382 | 'google.com', 383 | ], 384 | }, 385 | 'expected': [ 386 | {'analyser': 'BulkDomainMarker', 'output': False} 387 | ], 388 | 'description': 'Not a bulk record', 389 | }, 390 | { 391 | 'data': { 392 | 'all_domains': [ 393 | 'a.com', 394 | 'b.com', 395 | 'c.com', 396 | 'd.com', 397 | 'e.com', 398 | 'f.com', 399 | 'g.com', 400 | 'h.com', 401 | 'i.com', 402 | 'j.com', 403 | 'k.com', 404 | 'l.com', 405 | 'm.com', 406 | 'n.com', 407 | 'o.com', 408 | ], 409 | }, 410 | 'expected': [ 411 | {'analyser': 'BulkDomainMarker', 'output': True} 412 | ], 413 | 'description': 'Mark a bulk record', 414 | }, 415 | ] 416 | 417 | for case in cases: 418 | got = bulky.run(case['data']) 419 | self.assertListEqual(got['analysers'], case['expected'], case['description']) 420 | 421 | def test_idn_decoder(self): 422 | ''' 423 | Test the IDNA decoder. 
424 | ''' 425 | decoder = IDNADecoder() 426 | 427 | cases = [ 428 | { 429 | 'data': { 430 | 'all_domains': [ 431 | 'store.google.com', 432 | 'google.com', 433 | ], 434 | }, 435 | 'expected': [ 436 | 'store.google.com', 437 | 'google.com', 438 | ], 439 | 'description': 'There is no domain in IDNA format', 440 | }, 441 | { 442 | 'data': { 443 | 'all_domains': [ 444 | 'xn--f1ahbgpekke1h.xn--p1ai', 445 | 'tigrobaldai.lt' 446 | ], 447 | }, 448 | 'expected': [ 449 | 'укрэмпужск.рф', 450 | 'tigrobaldai.lt' 451 | ], 452 | 'description': 'Convert some domains in IDNA format', 453 | }, 454 | { 455 | 'data': { 456 | 'all_domains': [ 457 | 'xn--foobar.xn--me', 458 | ], 459 | }, 460 | 'expected': [ 461 | 'xn--foobar.xn--me', 462 | ], 463 | 'description': 'Handle an invalid IDNA string', 464 | }, 465 | { 466 | 'data': { 467 | 'all_domains': [ 468 | '*.xn---35-5cd3cln6a9bzb.xn--p1ai', 469 | '*.nl-dating-vidkid.com', 470 | ], 471 | }, 472 | 'expected': [ 473 | '*.отмычка-35.рф', 474 | '*.nl-dating-vidkid.com', 475 | ], 476 | 'description': 'Handle an invalid code point', 477 | }, 478 | ] 479 | 480 | for case in cases: 481 | got = decoder.run(case['data']) 482 | self.assertListEqual(got['all_domains'], case['expected'], case['description']) 483 | 484 | def test_homoglyphs_decoder(self): 485 | ''' 486 | Test the homoglyphs decoder. 
487 | ''' 488 | cases = [ 489 | { 490 | 'data': { 491 | 'all_domains': [ 492 | 'store.google.com', 493 | '*.google.com', 494 | ], 495 | }, 496 | 'greedy': False, 497 | 'expected': [ 498 | 'store.google.com', 499 | '*.google.com', 500 | ], 501 | 'description': 'Normal domains in ASCII', 502 | }, 503 | { 504 | 'data': { 505 | 'all_domains': [ 506 | 'store.google.com', 507 | '*.google.com', 508 | ], 509 | }, 510 | 'greedy': True, 511 | 'expected': [ 512 | 'store.google.com', 513 | 'store.google.corn', 514 | 'store.googie.com', 515 | 'store.googie.corn', 516 | '*.google.com', 517 | '*.google.corn', 518 | '*.googie.com', 519 | '*.googie.corn' 520 | ], 521 | 'description': 'Normal domains in ASCII with a greedy decoder', 522 | }, 523 | { 524 | 'data': { 525 | 'all_domains': [ 526 | 'укрэмпужск.рф', 527 | 'tigrobaldai.lt', 528 | ], 529 | }, 530 | 'greedy': False, 531 | 'expected': [ 532 | 'yкpэмпyжcк.pф', 533 | 'tigrobaldai.lt', 534 | ], 535 | 'description': 'Normal domains in Unicode', 536 | }, 537 | { 538 | 'data': { 539 | 'all_domains': [ 540 | 'укрэмпужск.рф', 541 | 'tigrobaldai.lt', 542 | ], 543 | }, 544 | 'greedy': True, 545 | 'expected': [ 546 | 'yкpэмпyжcк.pф', 547 | 'tigrobaldai.lt', 548 | 'tigrobaldai.it', 549 | 'tigrobaidai.lt', 550 | 'tigrobaidai.it', 551 | ], 552 | 'description': 'Normal domains in Unicode with a greedy decoder', 553 | }, 554 | { 555 | 'data': { 556 | 'all_domains': [ 557 | # MATHEMATICAL MONOSPACE SMALL P 1D699 558 | '*.𝗉aypal.com', 559 | 560 | # MATHEMATICAL SAN-SERIF BOLD SMALL RHO 561 | 'phishing.𝗉ay𝞀al.com', 562 | ], 563 | }, 564 | 'greedy': False, 565 | 'expected': [ 566 | '*.paypal.com', 567 | 'phishing.paypal.com', 568 | ], 569 | 'description': 'Phishing example in confusable homoglyphs' 570 | }, 571 | { 572 | 'data': { 573 | 'all_domains': [ 574 | # MATHEMATICAL MONOSPACE SMALL P 1D699 575 | '*.𝗉aypal.com', 576 | 577 | # MATHEMATICAL SAN-SERIF BOLD SMALL RHO 578 | 'phishing.𝗉ay𝞀al.com', 579 | ], 580 | }, 581 | 'greedy': True, 582 | 
'expected': [ 583 | '*.paypal.com', 584 | '*.paypal.corn', 585 | '*.paypai.com', 586 | '*.paypai.corn', 587 | 'phishing.paypal.com', 588 | 'phishing.paypal.corn', 589 | 'phishing.paypai.com', 590 | 'phishing.paypai.corn', 591 | ], 592 | 'description': 'Phishing example in confusable homoglyphs with a greedy decoder' 593 | }, 594 | ] 595 | 596 | for case in cases: 597 | decoder = HomoglyphsDecoder(greedy=case['greedy']) 598 | 599 | got = decoder.run(case['data']) 600 | self.assertListEqual(got['all_domains'], case['expected'], case['description']) 601 | -------------------------------------------------------------------------------- /scripts/sundry/certstream-domain-features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 145, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "import numpy as np\n", 11 | "\n", 12 | "domains = []\n", 13 | "features = []\n", 14 | "\n", 15 | "with open('domain-matching.20181014.decoded') as f:\n", 16 | " for line in f:\n", 17 | " record = json.loads(line.strip()) \n", 18 | " \n", 19 | " domains.extend(list(record['analysers'][0]['output'].keys()))\n", 20 | " features.extend(record['analysers'][-1]['output'])" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 146, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "14004" 32 | ] 33 | }, 34 | "execution_count": 146, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "len(domains)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 147, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/plain": [ 51 | "14004" 52 | ] 53 | }, 54 | "execution_count": 147, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "len(features)" 61 | ] 62 | }, 63 | { 64 | "cell_type": 
"code", 65 | "execution_count": 148, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "columns = ['NumberOfParts', 'Length', 'LongestPart', 'TLD', 'Randomness']" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 149, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "import pandas as pd\n", 79 | "\n", 80 | "df = pd.DataFrame(data=features, columns=columns, index=domains)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 150, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "(14004, 5)" 92 | ] 93 | }, 94 | "execution_count": 150, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "df.shape" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 151, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/html": [ 111 | "
\n", 112 | "\n", 125 | "\n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | "
NumberOfPartsLengthLongestPartTLDRandomness
www.sawyerrshousegivebackafrica.co.uk4372720.0
\n", 147 | "
" 148 | ], 149 | "text/plain": [ 150 | " NumberOfParts Length LongestPart \\\n", 151 | "www.sawyerrshousegivebackafrica.co.uk 4 37 27 \n", 152 | "\n", 153 | " TLD Randomness \n", 154 | "www.sawyerrshousegivebackafrica.co.uk 2 0.0 " 155 | ] 156 | }, 157 | "execution_count": 151, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "df.sample()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 152, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "count 6.000000\n", 175 | "mean 2334.000000\n", 176 | "std 2431.878862\n", 177 | "min 5.000000\n", 178 | "25% 109.000000\n", 179 | "50% 2355.000000\n", 180 | "75% 4507.250000\n", 181 | "max 4704.000000\n", 182 | "Name: NumberOfParts, dtype: float64" 183 | ] 184 | }, 185 | "execution_count": 152, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "df['NumberOfParts'].value_counts().describe()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 153, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/plain": [ 202 | "" 203 | ] 204 | }, 205 | "execution_count": 153, 206 | "metadata": {}, 207 | "output_type": "execute_result" 208 | }, 209 | { 210 | "data": { 211 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAmAAAAEvCAYAAADijX30AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFqtJREFUeJzt3X+s1nX9//HHxTmAID+OKJyji/yWWrp+bpWIsFzHzwEVTbDDam0tMGeSy5FlM1v+SKHPFpUzvksZG9NWm4IebB03NCgPTM2VU8ppjYoNGueiEA5g6PEcru8ffjvLj/Xhh16vwzncbn/Bm+t6X8/3c+xwP9d1uK5KrVarBQCAYkYM9gAAAMcbAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUFjjYA/wvzl48GD6+4fmJyU1NFSG7OxDlZ2XZ+fl2Xl5dl7eUN35yJENh33bYzrA+vtr2bPnH4M9xlFpaho7ZGcfquy8PDsvz87Ls/PyhurOJ08ef9i39RIkAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQ2DH9WZD1Nm7CmIwZXb8VHMlnQh2JA6/2Zf/eA3U5NwBQf8d1gI0Z3Zj/c2PnYI9xxLb+95zsH+whAICj5iVIAIDCBBgAQGECDACgMAEGAFCYAAMAKEyAAQAUJsAAAAo7rt8HjPJOntiYEaPG1O389Xrz24O9B7Krp68u5wbg+CPAKGrEqDHJrRMHe4wjNuLWniT7BnsMAIYJL0ECABQmwAAAChNgAACFCTAAgMIEGABAYf4XJAxz45pGZszIE+p2/nq99ceB117J/j2v1eXcAINNgMEwN2bkCfnAvR8Y7DGO2O8+/7vsjwADhicvQQIAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADACjssAOsv78/c+fOzRe/+MUkybZt2zJ//vy0tbVl8eLF6e3tTZL09vZm8eLFaWtry/z587N9+/aBc9xzzz1pa2vL7Nmzs3Hjxrf5UgAAhobDDrD77rsvZ5xxxsDvly1blgULFuSxxx7LhAkTsmbNmiTJ6tWrM2HChDz22GNZsGBBli1bliTZsmVLOjs709nZmZUrV+a2225Lf3//23w5AADHvsMKsO7u7vzqV79Ke3t7kqRWq+Wpp57K7NmzkyTz5s3L+vXrkyQbNmzIvHnzkiSzZ8/Ok08+mVqtlvXr12fOnDkZNWpUpk6dmtNPPz2bN2+uxzUBABzTDivAli5dmhtuuCEjRrx+8927d2fChAlpbGxMkrS0tKRarSZJqtVqTj311CRJY2Njxo8fn927d6daraalpWXgnM3NzQP3AQA4njQe6ga//OUvM2nSpLz//e/Pr3/96xIzDWhoqKSpaWzRxxwq7KU8Oy/Pzt+soWGEvRRm5+UdDzs/ZIA988wz2bBhQ7q6uvLqq69m//79WbJkSfbu3Zu+vr40Njamu7s7zc3NSV5/ZmvHjh1paWlJX19f9u3bl5NOOinNzc3p7u4eOG+1Wh24z3/S31/Lnj3/eIuX+J9Nnjy+bueut3rupZ7svDw7H16amsbaS2F2Xt5Q3fmRfL095EuQX/3qV9PV1ZUNGzbk+9//fs4777x873vfy7Rp07Ju3bokSUdHR1pbW5Mkra2t6ejoSJKsW7cu5513XiqVSlpbW9PZ2Zne3t5s27YtW7duzQc/+MGjuT4AgCHtqN8H7IYbbsiqVavS1taWPXv2ZP78+UmS9vb27NmzJ21tbVm1alW+9rWvJUnOOuusXHzxxbnkkkty1VVX5eabb05DQ8PbcxUAAEPIIV+C/FfTpk3LtGnTkiRTp04deOuJfzV69Ojcdddd//b+ixYtyqJFi45iTACA4cM
74QMAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoLBDBtirr76a9vb2fPKTn8ycOXNy1113JUm2bduW+fPnp62tLYsXL05vb2+SpLe3N4sXL05bW1vmz5+f7du3D5zrnnvuSVtbW2bPnp2NGzfW6ZIAAI5thwywUaNG5d57783PfvazrF27Nhs3bsyzzz6bZcuWZcGCBXnssccyYcKErFmzJkmyevXqTJgwIY899lgWLFiQZcuWJUm2bNmSzs7OdHZ2ZuXKlbntttvS399f36sDADgGHTLAKpVKTjzxxCRJX19f+vr6UqlU8tRTT2X27NlJknnz5mX9+vVJkg0bNmTevHlJktmzZ+fJJ59MrVbL+vXrM2fOnIwaNSpTp07N6aefns2bN9frugAAjlmH9TNg/f39ufzyy3P++efn/PPPz9SpUzNhwoQ0NjYmSVpaWlKtVpMk1Wo1p556apKksbEx48ePz+7du1OtVtPS0jJwzubm5oH7AAAcTxoP50YNDQ15+OGHs3fv3lx77bX585//XO+5/v/jVtLUNLbIYw019lKenZdn52/W0DDCXgqz8/KOh50fVoD904QJEzJt2rQ8++yz2bt3b/r6+tLY2Jju7u40Nzcnef2ZrR07dqSlpSV9fX3Zt29fTjrppDQ3N6e7u3vgXNVqdeA+/0l/fy179vzjKC7r8EyePL5u5663eu6lnuy8PDsfXpqaxtpLYXZe3lDd+ZF8vT3kS5AvvfRS9u7dmyR55ZVX8sQTT+SMM87ItGnTsm7duiRJR0dHWltbkyStra3p6OhIkqxbty7nnXdeKpVKWltb09nZmd7e3mzbti1bt27NBz/4wSO+OACAoe6Qz4Dt3LkzN954Y/r7+1Or1XLRRRflE5/4RM4888x85StfyZ133plzzjkn8+fPT5K0t7fnhhtuSFtbWyZOnJgf/OAHSZKzzjorF198cS655JI0NDTk5ptvTkNDQ32vDgDgGHTIADv77LOzdu3aNx2fOnXqwFtP/KvRo0cPvFfY/7Ro0aIsWrToKMYEABg+vBM+AEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKE2AAAIUJMACAwgQYAEBhAgwAoDABBgBQmAADAChMgAEAFCbAAAAKO2SA7dixI5/73OdyySWXZM6cObn33nuTJHv27MnChQsza9asLFy4MD09PUmSWq2WO+64I21tbbnsssvy/PPPD5yro6Mjs2bNyqxZs9LR0VGnSwIAOLYdMsAaGhpy44035pFHHsn999+fn/70p9myZUtWrFiR6dOn59FHH8306dOzYsWKJElXV1e2bt2aRx99NLfffntuvfXWJK8H2/Lly/PAAw9k9erVWb58+UC0AQAcTw4ZYFOmTMn73ve+JMm4cePy7ne/O9VqNevXr8/cuXOTJHPnzs0vfvGLJBk4XqlU8uEPfzh79+7Nzp07s2nTpsyYMSNNTU2ZOHFiZsyYkY0bN9bx0gAAjk1H9DNg27dvzwsvvJAPfehD2bVrV6ZMmZIkmTx5cnb
t2pUkqVaraWlpGbhPS0tLqtXqm443NzenWq2+HdcAADCkNB7uDV9++eVcd911uemmmzJu3Lg3/FmlUkmlUnnbh2toqKSpaezbft7hwF7Ks/Py7PzNGhpG2Ethdl7e8bDzwwqw1157Ldddd10uu+yyzJo1K0ly8sknZ+fOnZkyZUp27tyZSZMmJXn9ma3u7u6B+3Z3d6e5uTnNzc15+umnB45Xq9Wce+65/+vj9vfXsmfPP474og7X5Mnj63bueqvnXurJzsuz8+GlqWmsvRRm5+UN1Z0fydfbQ74EWavV8s1vfjPvfve7s3DhwoHjra2tWbt2bZJk7dq1ufDCC99wvFar5dlnn8348eMzZcqUzJw5M5s2bUpPT096enqyadOmzJw580ivDQBgyDvkM2C//e1v8/DDD+c973lPLr/88iTJ9ddfn6uvvjqLFy/OmjVrctppp+XOO+9MklxwwQV5/PHH09bWljFjxmTp0qVJkqampnzpS19Ke3t7kuTaa69NU1NTva4LAOCYdcgA++hHP5o//OEP//bP/vmeYP+qUqnklltu+be3b29vHwgwAIDjlXfCBwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYYcMsG984xuZPn16Lr300oFje/bsycKFCzNr1qwsXLgwPT09SZJarZY77rgjbW1tueyyy/L8888P3KejoyOzZs3KrFmz0tHRUYdLAQAYGg4ZYFdccUVWrlz5hmMrVqzI9OnT8+ijj2b69OlZsWJFkqSrqytbt27No48+mttvvz233nprkteDbfny5XnggQeyevXqLF++fCDaAACON4cMsI997GOZOHHiG46tX78+c+fOTZLMnTs3v/jFL95wvFKp5MMf/nD27t2bnTt3ZtOmTZkxY0aampoyceLEzJgxIxs3bqzD5QAAHPuO6mfAdu3alSlTpiRJJk+enF27diVJqtVqWlpaBm7X0tKSarX6puPNzc2pVqtvZW4AgCGr8a2eoFKppFKpvB2zvElDQyVNTWPrcu6hzl7Ks/Py7PzNGhpG2Ethdl7e8bDzowqwk08+OTt37syUKVOyc+fOTJo0Kcnrz2x1d3cP3K67uzvNzc1pbm7O008/PXC8Wq3m3HPPPeTj9PfXsmfPP45mxMMyefL4up273uq5l3qy8/LsfHhpahprL4XZeXlDdedH8vX2qF6CbG1tzdq1a5Mka9euzYUXXviG47VaLc8++2zGjx+fKVOmZObMmdm0aVN6enrS09OTTZs2ZebMmUfz0AAAQ94hnwG7/vrr8/TTT2f37t35+Mc/ni9/+cu5+uqrs3jx4qxZsyannXZa7rzzziTJBRdckMcffzxtbW0ZM2ZMli5dmiRpamrKl770pbS3tydJrr322jQ1NdXxsgAAjl2HDLDvf//7//b4vffe+6ZjlUolt9xyy7+9fXt7+0CAAQAcz7wTPgBAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKAwAQYAUJgAAwAoTIABABQmwAAAChNgAACFCTAAgMIEGABAYQIMAKCwxsEeAGC4OWncyDSOOaFu5588eXxdztt34JXs3v9aXc4NvJEAA3ibNY45IS+cfc5gj3H
EznnxhUSAQRFeggQAKEyAAQAUJsAAAAoTYAAAhQkwAIDCBBgAQGECDACgMAEGAFCYAAMAKEyAAQAUVjzAurq6Mnv27LS1tWXFihWlHx4AYNAVDbD+/v58+9vfzsqVK9PZ2Zmf//zn2bJlS8kRAAAGXdEP4968eXNOP/30TJ06NUkyZ86crF+/PmeeeWbJMQAYZiZOGJNRo+v3T9rkyePrct7eV/vSs/dAXc7Nsa1ogFWr1bS0tAz8vrm5OZs3by45AgDD0KjRjfm/12wY7DGO2LV3tw72CAySogF2pEaObKjbdx3/tPW/59T1/PVS773U1a09gz3BURnKO//d53832CMclaG883NefGGwRzgqQ3nnQzVmhvLO62m476Xoz4A1Nzenu7t74PfVajXNzc0lRwAAGHRFA+wDH/hAtm7dmm3btqW3tzednZ1pbR2a37EAABytoi9BNjY25uabb85VV12V/v7+fOpTn8pZZ51VcgQAgEFXqdVqtcEeAgDgeOKd8AEAChNgAACFCTAAgMIEGABAYQKsDr7+9a8P9gjHhT/96U958skn8/LLL7/heFdX1yBNNPxt3rx54NMrtmzZklWrVuXxxx8f5KmOH7/5zW+yatWqbNq0abBHGbaee+657N+/P0nyyiuv5K677so111yT7373u9m3b98gTzc83XfffdmxY8dgj1Gc/wX5Fl1zzTVvOvbrX/8606ZNS5LcfffdpUc6Ltx33335yU9+kjPOOCMvvvhibrrppvzXf/1XkmTevHnp6OgY5AmHn+XLl6erqyt9fX2ZMWNGnnvuuUybNi1PPPFEZs6cmUWLFg32iMNOe3t71qxZkyR54IEH8pOf/CRtbW3ZtGlTWltbc/XVVw/yhMPPnDlz8vDDD6exsTHf+ta3csIJJ2T27Nl56qmn8uKLL2b58uWDPeKw85GPfCRjxozJO9/5zsyZMycXX3xxJk2aNNhj1d0x/VFEQ0G1Ws0ZZ5yR+fPnp1KppFar5fe//32uvPLKwR5tWFu9enUeeuihnHjiidm+fXuuu+66/PWvf83nP//5+J6iPtatW5e1a9emt7c3M2bMSFdXV8aNG5cvfOELmT9/vgCrg76+voFf33///Vm1alUmTZqUK6+8Mp/+9KcFWB0cPHgwjY2v/9P4+9//fuCbuY9+9KO5/PLLB3O0YWvq1Kl56KGH8sQTT+SRRx7JD3/4w7zvfe/LpZdemra2towbN26wR6wLL0G+RQ8++GDe//735+6778748eMzbdq0jB49Oueee27OPffcwR5v2Dp48GBOPPHEJMk73vGO/PjHP05XV1e+853vCLA6aWhoSENDw8B3qv/8onjCCSdkxAhfSurh4MGD6enpye7du1Or1QaeFRg7dmwaGhoGebrh6ayzzsqDDz6YJDn77LPzu9+9/jmqf/nLXwbCjLdXpVLJiBEjMnPmzCxdujQbN27MZz/72WzcuHHglY3hyN+mt2jEiBFZsGBBLrrooixdujSnnHJK+vv7B3usYe/kk0/OCy+8kHPOOSdJcuKJJ+aee+7JTTfdlD/+8Y+DPN3wNHLkyBw4cCBjxozJQw89NHB83759AqxO9u/fnyuuuCK1Wi2VSiU7d+7MlClT8vLLL/tGo06WLFmSJUuW5Ec/+lFOOumkfOYzn0lLS0tOPfXULFmyZLDHG5b+59/lkSNH5sILL8yFF16YAwcODNJU9ednwN5mv/rVr/LMM8/k+uuvH+xRhrXu7u40NDRk8uTJb/qz3/72t/nIRz4yCFMNb729vRk1atSbjr/00kv529/+lve+972DMNXx6cCBA/n73/+eqVOnDvYow9b+/fuzffv29PX1paWlJaeccspgjzRs/eUvf8m73vWuwR6jOAEGAFCY1w0AAAoTYAAAhQkwAIDCBBgAQGECDACgsP8HRsWezpLvOk4AAAAASUVORK5CYII=\n", 212 | "text/plain": [ 213 | "
" 214 | ] 215 | }, 216 | "metadata": {}, 217 | "output_type": "display_data" 218 | } 219 | ], 220 | "source": [ 221 | "import matplotlib.pyplot as plt\n", 222 | "import seaborn\n", 223 | "\n", 224 | "seaborn.set_style(\"darkgrid\")\n", 225 | "\n", 226 | "plt.figure(figsize=(10,5))\n", 227 | "df['NumberOfParts'].value_counts().plot(kind='bar')" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 154, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "" 239 | ] 240 | }, 241 | "execution_count": 154, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | }, 245 | { 246 | "data": { 247 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlMAAAEvCAYAAABhSUTPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAADlhJREFUeJzt3V1o3Qf9x/HPabJqa5fG1jyoRP9uTipVvHASM0Qws43SSTO1InrhClIFoWyVSX1EhA3ROYp6FQbSXQhitamsgz1kSoulgsp8wAcYdtDJmkA0a+u21pzmfzEs7v8fnCzftCenfb3ucnJyzgeaX86b8zs9p7GwsLAQAACWZFW7BwAAdDIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUNB9Oe/swoULaTZ9eg2tdXU1/K4Ay87fFl6Oa67pWtT1LmtMNZsLmZt79nLeJR2qt3et3xVg2fnbwsvR13ftoq7nNB8AQIGYAgAoEFMAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFl/Wz+Xhp63rWZM0r/FP8X4v9TKSryXPn5nP29HPtngHAf/EIvgKseUV3/mfv4XbPoAM8+c1tOdvuEQC8iNN8AAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAWLjqlms5nx8fF85jOfSZKcPHkyO3bsyJYtW3L77bfn/Pnzl2wkAMBKteiYuv/++3P99ddf/Pqee+7JbbfdlkceeSQ9PT05cODAJRkIALCSLSqmTp06lV/84hf56Ec/miRZWFjI8ePHMzY2liS59dZbMzU1delWAgCsUIuKqbvvvjt33nlnVq164er//Oc/09PTk+7u7iTJ4OBgpqenL91KAIAVqrvVFX7+859nw4YNedvb3pZf/epXpTvr6mqkt3dt6TbgaucYgqXr6lrlGGLZtYyp3/72t3nsscdy5MiRnDt3LmfPns1dd92V06dPZ35+Pt3d3Tl16lQGBgZa3lmzuZC5uWeXZfiVpK/v2nZPoIM4hmDpenvXOoZYtMU+Prc8zff5z38+R44cyWOPPZZ777037373u/Od73wnw8PDeeihh5IkBw8ezOjoaG0xAEAHWvL7TN155535wQ9+kC1btmRubi47duxYzl0AAB2h5Wm+/zY8PJzh4eEkydDQkLdDAACuet4BHQCgQEwBABSIKQCAAjEFAFAg
pgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgQEwBABR0t7rCuXPn8slPfjLnz59Ps9nM2NhYdu/enZMnT2bPnj2Zm5vL5s2b861vfSurV6++HJsBAFaMls9MrV69Ovv378/PfvazTE5O5ujRo3n88cdzzz335LbbbssjjzySnp6eHDhw4HLsBQBYUVrGVKPRyKte9aokyfz8fObn59NoNHL8+PGMjY0lSW699dZMTU1d2qUAACvQol4z1Ww2s3379tx000256aabMjQ0lJ6ennR3v3CWcHBwMNPT05d0KADAStTyNVNJ0tXVlUOHDuX06dP53Oc+l7/97W9LurOurkZ6e9cu6WeBFziGYOm6ulY5hlh2i4qp/+jp6cnw8HAef/zxnD59OvPz8+nu7s6pU6cyMDDQ8uebzYXMzT275LFXqr6+a9s9gQ7iGIKl6+1d6xhi0Rb7+NzyNN8//vGPnD59Okny/PPP59ixY7n++uszPDychx56KEly8ODBjI6OFuYCAHSmls9MzczMZO/evWk2m1lYWMgHPvCBvO9978ub3/zm3HHHHdm3b1/e+ta3ZseOHZdjLwDAitIypjZt2pTJycn/d/nQ0JC3QwAArnreAR0AoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUdLe6wtNPP50vfOELmZ2dTaPRyMc+9rF86lOfytzcXO644478/e9/z+tf//rs27cv69evvxybAQBWjJbPTHV1dWXv3r158MEH86Mf/Sg//OEP88QTT2RiYiIjIyN5+OGHMzIykomJicuxFwBgRWkZU/39/dm8eXOSZN26dbnuuusyPT2dqampjI+PJ0nGx8fz6KOPXtqlAAAr0Mt6zdRTTz2VP//5z3nHO96R2dnZ9Pf3J0n6+voyOzt7SQYCAKxkLV8z9R//+te/snv37nzpS1/KunXrXvS9RqORRqPR8ja6uhrp7V378lcCFzmGYOm6ulY5hlh2i4qpf//739m9e3c+9KEPZevWrUmSjRs3ZmZmJv39/ZmZmcmGDRta3k6zuZC5uWdri69AfX3XtnsCHcQxBEvX27vWMcSiLfbxueVpvoWFhXz5y1/Oddddl507d168fHR0NJOTk0mSycnJ3HzzzUucCgDQuVo+M/Wb3/wmhw4dylve8pZs3749SbJnz57s2rUrt99+ew4cOJDXve512bdv3yUfCwCw0rSMqRtvvDF//etfX/J7+/fvX/ZBAACdxDugAwAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgQEwB
ABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgIKWMfXFL34xIyMjueWWWy5eNjc3l507d2br1q3ZuXNnnnnmmUs6EgBgpWoZUx/+8Idz3333veiyiYmJjIyM5OGHH87IyEgmJiYu2UAAgJWsZUy9613vyvr161902dTUVMbHx5Mk4+PjefTRRy/NOgCAFW5Jr5manZ1Nf39/kqSvry+zs7PLOgoAoFN0V2+g0Wik0Wgs6rpdXY309q6t3iVc1RxDsHRdXascQyy7JcXUxo0bMzMzk/7+/szMzGTDhg2L+rlmcyFzc88u5S6vaH1917Z7Ah3EMQRL19u71jHEoi328XlJp/lGR0czOTmZJJmcnMzNN9+8lJsBAOh4LWNqz549+fjHP54TJ07kve99b3784x9n165d+eUvf5mtW7fm2LFj2bVr1+XYCgCw4rQ8zXfvvfe+5OX79+9f9jEAAJ3GO6ADABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQIGYAgAoEFMAAAViCgCgoLvdAwC4NDau786q1WvaPWPF6eu7tt0TVpwL55/L7DPz7Z7RscQUwBVq1eo1ydfXt3sGHWDV159JcqbdMzqW03wAAAViCgCgQEwBABSIKQCAAjEFAFAgpgAACsQUAECBmAIAKBBTAAAFYgoAoEBMAQAUiCkAgAIxBQBQIKYAAArEFABAgZgCACgQUwAABWIKAKBATAEAFIgpAIACMQUAUCCmAAAKxBQAQEEppo4cOZKxsbFs2bIlExMTy7UJAKBjLDmmms1mvvGNb+S+++7L4cOH88ADD+SJJ55Yzm0AACvekmPq97//fd74xjdmaGgoq1evzrZt2zI1NbWc2wAAVrwlx9T09HQGBwcvfj0wMJDp6ellGQUA0Cm6L+edXXNNV/r6rr2cd9kxnvzmtnZPoEM4hnhZvv5MuxfQIfxtWbolPzM1MDCQU6dOXfx6eno6AwMDyzIKAKBTLDmm3v72t+fJJ5/MyZMnc/78+Rw+fDijo6PLuQ0AYMVb8mm+7u7ufO1rX8unP/3pNJvNfOQjH8kNN9ywnNsAAFa8xsLCwkK7RwAAdCrvgA4AUCCmAAAKxBQAQIGYAgAoEFO03e9+97ucPXs2SfL888/nu9/9bj772c/m29/+ds6cOdPmdUCnuv/++/P000+3ewZXAf+bj7bbtm1bDh06lO7u7nz1q1/NK1/5yoyNjeX48eP5y1/+ku9///vtngh0oHe+851Zs2ZN3vCGN2Tbtm354Ac/mA0bNrR7Flegy/pxMvBSLly4kO7uF34V//jHP+bgwYNJkhtvvDHbt29v5zSggw0NDeWnP/1pjh07lgcffDDf+973snnz5txyyy3ZsmVL1q1b1+6JXCGc5qPtbrjhhvzkJz9JkmzatCl/+MMfkiQnTpy4GFkAL1ej0ciqVavynve8J3fffXeOHj2aT3ziEzl69Gje//73t3seVxCn+Wi7M2fO5K677sqvf/3rvPrVr86f/vSnDA4O5rWvfW2+8pWvZNOmTe2eCHSg8fHxTE5OvuT3nnvuuaxZs+YyL+JKJaZYMc6ePZunnnoq8/PzGRwczGte85p2TwI62IkTJ/KmN72p3TO4CogpAIACr5kCACgQUwAABWIKAKBATAEAFIgpAICC/wWjtKP6Gu64AgAAAABJRU5ErkJggg==\n", 248 | "text/plain": [ 249 | "
" 250 | ] 251 | }, 252 | "metadata": {}, 253 | "output_type": "display_data" 254 | } 255 | ], 256 | "source": [ 257 | "plt.figure(figsize=(10,5))\n", 258 | "df.loc[df['NumberOfParts'] > 5]['NumberOfParts'].value_counts().plot(kind='bar')" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 156, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/html": [ 269 | "
\n", 270 | "\n", 283 | "\n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | "
NumberOfPartsLengthLongestPartTLDRandomness
waws-prod-blu-43680001.state.p.azurewebsites.windows.net6562230.0
\n", 305 | "
" 306 | ], 307 | "text/plain": [ 308 | " NumberOfParts Length \\\n", 309 | "waws-prod-blu-43680001.state.p.azurewebsites.wi... 6 56 \n", 310 | "\n", 311 | " LongestPart TLD \\\n", 312 | "waws-prod-blu-43680001.state.p.azurewebsites.wi... 22 3 \n", 313 | "\n", 314 | " Randomness \n", 315 | "waws-prod-blu-43680001.state.p.azurewebsites.wi... 0.0 " 316 | ] 317 | }, 318 | "execution_count": 156, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "df.loc[df['NumberOfParts'] > 5].sample()" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 157, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/html": [ 335 | "
\n", 336 | "\n", 349 | "\n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | "
NumberOfPartsLength
NumberOfParts1.0000000.604539
Length0.6045391.000000
\n", 370 | "
" 371 | ], 372 | "text/plain": [ 373 | " NumberOfParts Length\n", 374 | "NumberOfParts 1.000000 0.604539\n", 375 | "Length 0.604539 1.000000" 376 | ] 377 | }, 378 | "execution_count": 157, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "df[['NumberOfParts', 'Length']].corr()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 158, 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "data": { 394 | "text/plain": [ 395 | "" 396 | ] 397 | }, 398 | "execution_count": 158, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | }, 402 | { 403 | "data": { 404 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmcAAAFACAYAAAD589sCAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xt4lNWBx/HfTIYggdDMYC7gohAsFQOL7NOKaIm5GC4qcjMU9HEFdFmf1gaNQqMIi6h4edDS9bZEsRCXBTcRQ6nKxVyablWwyqpA0G4BATWJTWa4JAKZzOwfKdEgwkBm5rwz+X7+CWcyefPLkCf55bzve47N7/f7BQAAAEuwmw4AAACAb1DOAAAALIRyBgAAYCGUMwAAAAuhnAEAAFgI5QwAAMBCKGcAAAAWQjkDAACwEMoZAACAhThMB+gIn8+nlhY2OAAAANbXpUtMQM+L6HLW0uKXx9NkOgYAAMAZJSbGB/Q8TmsCAABYCOUMAADAQihnAAAAFkI5AwAAsBDKGQAAgIVQzgAAACyEcgYAAGAhIStn9913n0aMGKHrr7++7TGPx6MZM2Zo1KhRmjFjhg4ePChJ8vv9evjhh5WTk6Nx48Zpx44doYoFAABgaSErZ5MmTdKLL77Y7rHCwkKNGDFCmzZt0ogRI1RYWChJqqqq0t69e7Vp0yY99NBDWrhwYahiAQAAWFrIdgj4yU9+ogMHDrR7rKysTC+//LIkacKECbrllls0Z84clZWVacKECbLZbLrssst06NAh1dXVKSkpKVTxAACdVGVlmcrLNwflWB6PW5KUkODs8LGysnKUkZHd4eMg8oV1+6b6+vq2wpWYmKj6+npJUm1trVJSUtqel5KSotra2jOWs5gYmxIS4kIXGAAQdeLiYuVwBOfE0Ylydv75vTp8rLi4WH6nQZLBvTVtNptsNluHjsHemgCAs3X55SN1+eUjg3KsBQsK/v52cVCOx++06GbJvTV79eqluro6SVJdXZ1cLpckKTk5WTU1NW3Pq6mpUXJycjijAQAAWEJYy1lWVpZKS0slSaWlpcrOzm73uN/v1//+7/8qPj6e680AAECnFLLTmvn5+dq6davcbrfS09P1y1/+UrNmzdJdd92lkpIS9enTR0uXLpUkXX311frDH/6gnJwcdevWTYsXB2d6GAAAINLY/H6/33SIc9Xc3ML5eQCAMSeuOVu06DHDSRAJLHnNGQAAAE6PcgYAAGAhlDMAAAALoZwBAABYCOUMAADAQihnAAAAFkI5AwAAsBDKGQAAgIVQzgAAACyEcgYAAGAhlDMAAAA
LoZwBQJRwuxs0f/6v5HY3mI4CoAMoZwAQJYqLV6u6eoeKi9eYjgKgAyhnABAF3O4GVVS8Jb/fr4qKzcyeARGMcgYAUaC4eLV8Pp8kyefzMXsGRDDKGQBEgaqqSnm9XkmS1+tVVVWF4UQAzhXlDACiQHp6hhwOhyTJ4XAoPT3TcCIA54pyBgBRIDd3muz21h/pdrtdublTDScCcK4oZwAQBZxOlzIzr5HNZlNmZo6cTpfpSADOkcN0AABAcOTmTtP+/fuYNQMiHOUMAKKE0+nSQw89bjoGgA7itCYAAICFUM4AAAAshHIGAABgIZQzAAAAC6GcAQAAWAjlDAAAwEIoZwAAABZCOQMAALAQyhkAAICFUM4AAAAshHIGAABgIZQzAAAAC6GcAQAAWAjlDAAAwEIoZwAAABZCOQMAALAQyhkAAICFUM4AAAAshHIGAABgIZQzAAAAC6GcAQAAWAjlDAAAwEIcJj7pihUrVFxcLJvNpoEDB+rRRx9VXV2d8vPz5fF4lJaWpieeeEKxsbEm4gEAABgT9pmz2tpaFRUV6dVXX9Xvf/97tbS06PXXX9eSJUs0ffp0bd68WT179lRJSUm4owEAABhn5LRmS0uLjh49Kq/Xq6NHjyoxMVHvvvuuRo8eLUmaOHGiysrKTEQDAAAwKuynNZOTkzVz5kxlZmaqa9euuuqqq5SWlqaePXvK4WiNk5KSotra2jMeKybGpoSEuFBHBgDglByO1jkOfhchmMJezg4ePKiysjKVlZUpPj5es2fP1h//+MdzOlZLi18eT1OQEwIAEBiv1ydJ/C5CQBIT4wN6XtjL2dtvv61/+Id/kMvlkiSNGjVKH3zwgQ4dOiSv1yuHw6GamholJyeHOxoAAIBxYb/mrE+fPvrwww/19ddfy+/365133tHFF1+s4cOHa+PGjZKk1157TVlZWeGOBgAAYFzYZ86GDh2q0aNHa+LEiXI4HBo0aJB+9rOfKSMjQ3fffbeWLl2qQYMGKTc3N9zRAAAAjLP5/X6/6RDnqrm5hfP8ANAJvPRSofbu3W06xnfs2dOaqX//VMNJ2uvXL1UzZ84yHQMnsew1ZwAAnK29e3fr012fKiEuyXSUdmJ8XSVJdfs8hpN8w9NUZzoCOohyBiAkXnzxP/Tmm+s1btwNmj79X03HQRRIiEtS5iVTTcewvIpda0xHQAextyaAkHjzzfWSpPXrf2c4CQBEFsoZgKB78cX/aDdesWKZoSQAEHkoZwCC7sSs2QnMngFA4ChnAAAAFkI5A4Ao4XY3aP78X8ntbjAdBUAHUM4ABN3YsePajceNu8FQks6luHi1qqt3qLiYu/WASEY5AxB0t99+R7sxS2mEntvdoIqKt+T3+1VRsZnZMyCCUc4AhMSJ2TNmzcKjuHi1fD6fJMnn8zF7BkQwyhmAkLj99jv06quvM2sWJlVVlfJ6vZIkr9erqqoKw4kAnCvKGQBEgfT0DDkcrZu+OBwOpadnGk4E4FxRzgAgCuTmTpPd3voj3W63KzeXbY6ASEU5A4Ao4HS6lJl5jWw2mzIzc+R0ukxHAnCO2PgcAKJEbu407d+/j1kzIMJRzgAgSjidLj300OOmYwDoIE5rAgAAWAjlDAAAwEIoZwAAABZCOQMAALAQyhkAAICFUM4AAAAshHIGAABgIZQzAAAAC6GcAQAAWAjlDAAAwEIoZwAAABZCOQMAABHD7W7Q/Pm/ktvdYDpKyFDOAABAxCguXq3q6h0qLl5jOkrIUM4AAEBEcLsbVFHxlvx+vyoqNkft7BnlDAAARITi4tXy+XySJJ/PF7WzZ5QzAAAQEaqqKuX1eiVJXq9XVVUVhhOFBuUMAABEhPT0DDkcDkmSw+FQenqm4UShQTkDAAARITd3muz21upit9uVmzvVcKLQoJwBAICI4HS6lJl5jWw2mzIzc+R0ukxHCgmH6QBAOEyefF3bv1999XWDSQAAHZGbO0379++L2lkziXIGAAAiiNPp0kM
PPW46RkhxWhNR79uzZqcaAwBgJZQzAAAAC6GcAQAAWAjlDAAAwEIoZwAAABZCOUPUGz362pPG3BAAALAuI+Xs0KFDysvL05gxYzR27Fht27ZNHo9HM2bM0KhRozRjxgwdPHjQRDREodzcaSeNo3dtHABA5DNSzh555BGNHDlSGzZs0Lp16zRgwAAVFhZqxIgR2rRpk0aMGKHCwkIT0RCFnE6XRo++VjabTaNHXxe1K0oDAKJD2MvZ4cOH9d577+nGG2+UJMXGxqpnz54qKyvThAkTJEkTJkzQW2+9Fe5oiGK5udM0aFAas2YAAMsL+w4BBw4ckMvl0n333addu3YpLS1N8+bNU319vZKSkiRJiYmJqq+vP+OxYmJsSkiIC3VkRIGEhDg9/fTTpmMAOEcOB5dInw2Hw87vxwgW9nLm9Xq1c+dOzZ8/X0OHDtXDDz/8nVOYNptNNpvtjMdqafHL42kKVVQAgEV4vT7TESKK1+vj96MFJSbGB/S8sJezlJQUpaSkaOjQoZKkMWPGqLCwUL169VJdXZ2SkpJUV1cnl4vrggBEv8rKMpWXbw7KsTwetyQpIcHZ4WNlZeUoIyO7w8cBcPbCPk+cmJiolJQU7d69W5L0zjvvaMCAAcrKylJpaakkqbS0VNnZ/FAAgLPhdrvldrtNxwDQQWGfOZOk+fPn695771Vzc7P69u2rRx99VD6fT3fddZdKSkrUp08fLV261EQ0AAirjIzsoM1QLVhQIElatOixoBwPgBlGytmgQYO0du3a7zy+cuVKA2kAAACsI6BytmfPHi1fvlxffPGFvF5v2+NFRUUhCwYAANAZBVTOZs+eralTp2rKlCmy27mdGZHH7W7QU089rvz8X7EILQDA0gIqZw6HQzfddFOoswAhU1y8WtXVO1RcvEazZv3cdBwAAL7XaafBPB6PPB6PMjMztWrVKtXV1bU95vF4wpUR6BC3u0EVFW/J7/eromKz3O4G05E6Bbe7QfPn/4rXGwDO0mlnziZNmiSbzSa/3y9JWr58edv7bDabysrKQpsOCILi4tXy+VoXsPT5fMyehQmzlQBwbk5bzsrLyyVJx44dU9euXdu979ixY6FLBQRRVVVl240sXq9XVVUVlIUQO3m2Mjd3Ktf6AUCAArq6f+rU724WfarHACtKT8+Qw9H6d4jD4VB6eqbhRNGvuHi1mpubJUnNzc0qLl5jOBEARI7TlrOvvvpK27dv19GjR7Vz507t2LFDO3bs0JYtW/T111+HKyPQIbm509ruMrbb7crN5Q+LUKuqqmy7HMLv96uqqsJwIgCIHKc9rfk///M/Wrt2rWpqavToo4+2Pd69e3fl5+eHPBwQDE6nS5mZ12jTpjeVmZnD6bUw6Nevv6qrd7SN+/dPNZgGACLLacvZxIkTNXHiRG3cuFGjR48OVyYg6HJzp2n//n3MmoXJt4uZJO3cud1QEgCIPAGtc/bFF1/ot7/9bbvHevToocGDB2vQoEEhCQYEk9Pp0kMPPW46BgAAZxRQOdu+fbu2b9+uzMzWC6krKir0ox/9SGvWrNGYMWP0L//yLyENCSDS2UwHAICIEdDdmjU1NVq7dq0KCgpUUFCgtWvXqqGhQatWrdJrr70W6owAIsygQZeedgwA+H4BlbP6+nrFxsa2jbt06aK//e1vOu+889o9DgCStHv37pPGfzWUBAAiT0CnNceNG6cpU6YoOztbUuvitNdff72ampo0YMCAkAYEEHkcDoe+vU71iXXmAESOysoyLV++LCjHOn78WNti4FbicDgUG9v1zE8MwG23/asyMrKDcqyAfmL+4he/0MiRI7Vt2zZJ0oMPPqghQ4ZIkp588smgBAEQPRobj5x2DAD4fgH/OZuWlqbk5GS1tLRIar2Ds0+fPiELBgTThg1v6IUXntUdd9ypnJyxpuNEvV69zld9/d/axuefn2gwDYBzkZGRHbSZIJydgMrZyy+/rGeeeUbnn39+20rrkrR+/fq
QBQOC6cUXn5MkLVv2LOUsDJqaGtuNmTkDgMAFVM6Kioq0YcMGOZ3OUOcBgm7DhjfabSW0efObFLQQO3l7N7Z7A4DABXS3ZkpKiuLj40OdBQiJE7NmJyxb9qyhJAAAnFlAM2d9+/bVLbfcooyMjHZLZ8yYMSNkwYBgOTFr9n1jAACsJKBy1qdPH/Xp00fNzc1qbm4OdSYgxFitHgBgXQGVszvvvFNS63Uj3bp1C2kgINgGDbpU1dU7240BALCqgK4527Ztm6699lqNHdt6EfWuXbu0cOHCUOYCgmbv3r0njfeYCQIAQAACKmeLFy/W8uXLlZCQIEm65JJL9Oc//zmkwYBgufTSwe3GaWmDv+eZAACYF1A5k6TevXu3/0B7wB8KGFVdvaPdeOfOHd/zTAAAzAvomrPevXvrgw8+kM1mU3Nzs4qKithTExHj5AVRTx4DAGAlAU1/LVy4UKtWrVJtba3S09NVXV2tf/u3fwt1NiAobDbbaccAAFhJQDNnLpfrOxucr1ixQtOnTw9FJiCoWOcMABBJzvnCsRUrVgQxBhA6LpfrpHEvQ0kAADizcy5nzD4gUhw8ePCkscdQEgAAzuycyxnX7SBStLS0nHYMAICVnPaas2HDhp2yhPn9fh07dixkoQAAADqr05azbdu2hSsHAAAA1IHTmgAAAAi+gJbSAADAJI/HLU/TV6rYtcZ0FMvzNNUp1sNNe5GMmTMAAAALYebMALe7QU899bjy838lp9N15g8AgE4uIcGp44dsyrxkqukollexa40SEhJMx0AHUM4MKC5ererqHSouXqNZs35uOg7QprKyTOXlmzt8HLvdLp/P1268YEHBOR8vKytHGRnZHc4FAJGA05ph5nY3qKLiLfn9flVUbJbb3WA6EhB0gwalnXYMAPh+zJyFWXHx6rYZBZ/Px+wZLCUjIztoM1S5uePk8/mUlJSsRYseC8oxAaAzYOYszKqqKuX1eiVJXq9XVVUVhhMBoTFoUJrS0obo+edfMh0FACIK5SzM0tMz5HC0Tlg6HA6lp2caTgQAAKzE2GnNlpYWTZ48WcnJyVq2bJn279+v/Px8eTwepaWl6YknnlBsbKypeCGTmztNFRVvSWq9SDo3lzuPgEjz0kuF2rt3t+kY37FnT2umjtx8EQr9+qVq5sxZpmMAEcNYOSsqKtKAAQN05MgRSdKSJUs0ffp0XXfddVqwYIFKSkp00003mYoXMk6nS5mZ12jTpjeVmZnDUhpABNq7d7f+uuNjpZgOcpK4v79t3PGx0RzfVmM6ABCBjJSzmpoaVVZW6o477tCKFSvk9/v17rvv6sknn5QkTZw4Uc8880xUljNJuvTSIdq48Q0NHjzEdBRLC9ayDqfCsg7oqBRJt8lmOoblLRcr1SO4OsNaoUbK2eLFizVnzhw1NjZKktxut3r27Nl2LVZKSopqa2vPeJyYGJsSEuLO+DyreeGFZyVJy5Y9o2uvHW04jXXFxcXK4ej4ZZHdunXT119/3W7ckePGxcVG5PdduJ14jaPxtQrG92Vn4nDYO/x9wGt+doLxmlvVihXLVF29Q7/7XYlmz77LdJyQCHs5q6iokMvl0uDBg7Vly5YOHaulxS+PpylIycLjww+3tZ3KPXLkiKqq3tY//uNlhlNZ0+WXj9Tll48MyrEmT76u7d//+Z8lHT5epH3fmeD1ti4ZE42v1YmvDYHxen0d/j7gNT87wXjNrcjtbtDGjRvk9/u1YcObuuGGGyNq9iwxMT6g54X9T5EPPvhA5eXlysrKUn5+vt5991098sgjOnToUNsSEzU1NUpOTg53tLB48sn26z0tWfKooSSdy3nnnSdJGjLkHw0nAQCcq1OtFRqNwl7O7rnnHlVVVam8vFxPPfWUrrjiCj355JMaPny4Nm7cKEl67bXXlJWVFe5oYdHYeOS0Y4TGgAE/VFraEC1cSBkGgEjVWdYKtcxJ/Dlz5ui3v/2tcnJy5PF4lJubazpSSHTrFnf
aMQAAOLXOslao0e2bhg8fruHDh0uS+vbtq5KSjl8LZHWXXjpY77+/tW2clsYdmwAABKKzrBVqmZmzzmL79o/ajT/++ENDSQAAiCwn1gq12WxRvVYoG5+HmcPh0LFj7ccAACAwubnTtH//vqidNZMoZ2HHDQEAAJw7p9Olhx563HSMkOK0Zpj17t2n3bhPnwsMJQEAAFbEzFmY9evXX19++UXb+KKL+htMAwCRw9NUp4pd1lrX6mhz604353XpbjjJNzxNdUpSgukY6ADKWZht3brlpPG7hpIAQOTo1y/VdIRT2rOnQZKUdKF1zoIkKcGyrxcCQzkLs5YW72nHAIDvmjlzlukIp7RgQYEkadGix87wTCBwXHMGAABgIZQzAAAAC6GcAQAAWAjlDAAAwEIoZwAAABbC3ZoAcJY8Hrf+Jmm5/KajWN6Xks73uE3HACIKM2cAAAAWwswZAJylhASnunx+QLfJZjqK5S2XX90TnKZjABGFmTMAAAALoZwBAABYCKc1gQj30kuF2rt3t+kY37FnT2umE9vbWEW/fqmW3QoIACTKGRDx9u7dre2ffiQlmE5ykpjWN9vrPjKb49s8pgMAwJlRzoBokCD5MnymU1ievZIrOSBVVpapvHxzUI4VzBnirKwcZWRkd/g4iHyUMwAAzpHTyZ2oCD7KGQCgU8nIyGaGCpbGHD8AAICFUM4AAAAshHIGAABgIZQzAAAAC6GcAQAAWAh3ayKoWK3+7LBafeSqUeum3lZy5O9vexhN0V6NpAGmQwARhnKGoNq7d7f2fPKhLuzRYjpKOz+QTZLU8vkHhpN8Y9+RGNMRcI769Us1HeGU6v7+R0hyf+vkGyDrvl6AVVHOEHQX9mjRAz8+cuYndnIP/9lK8xs4G1ad7TwxM7xo0WOGkwDoCMoZEOE8HrfkYWuigHgkT6zbdAoAOC1+mgMAAFgIM2dAhEtIcOrA8f1sfB4Ae6VdCQnshQjA2pg5AwAAsBDKGQAAgIVwWjNAlZVlKi/fHJJjd2TtraysHGVkZAcxDQAAMImZszBLSxty2jEAAOjcmDkLUEZGdtBmqCZPvq7t36xHBAAAvo1yZsCJ2TKKGYLGiuucHf372/OMpmjPIynJdAgAOD3KGRDhrLo1zon9TPsnWShfknVfLwA4gXKGoPJ43Go4HMPWRAH47HCMXJ6Or1bPVkIAEF0sdh4EAACgc2PmDEGVkOBUfOMeNj4PwMN/7qEYVqsHAJwk7OXsyy+/1Ny5c1VfXy+bzaYpU6bo1ltvlcfj0d13363PP/9cF1xwgZYuXaof/OAH4Y4HAABgVNhPa8bExKigoEBvvPGGXnnlFf3Xf/2X/u///k+FhYUaMWKENm3apBEjRqiwsDDc0QAAAIwL+8xZUlKSkpJa72Xv0aOHUlNTVVtbq7KyMr388suSpAkTJuiWW27RnDlzwh0PAMIqmLuPnLhDtiO7jpzA7iOAOUavOTtw4ICqq6s1dOhQ1dfXt5W2xMRE1dfXn/HjY2JsSkiIC3XMoHM4WicsIzH7mTgcdrWYDhFBHA57VH4fSNH9fR5McXGxba9VR/Xq5ZKkoBwvLi6W/zvAEGPlrLGxUXl5ebr//vvVo0f7ZRdsNptsNtsZj9HS4pfH0xSqiCHj9fokKSKzn8mJrw2B8Xp9Ufl9IEX393kwXX75SF1++UjTMU6J/zsguBIT4wN6npGlNJqbm5WXl6dx48Zp1KhRkqRevXqprq5OklRXVyeXy2UiGgAAgFFhL2d+v1/z5s1TamqqZsyY0fZ4VlaWSktLJUmlpaXKzuZaBwAA0PmE/bTm+++/r3Xr1mngwIEaP368JCk/P1+zZs3SXXfdpZKSEvXp00dLly4NdzQAAADjwl7OfvzjH+uTTz455ftWrlwZ5jQIhX1HrLd908Hjrdcw/iDWbzjJN/YdiVF/0yEAAJbDDgEIKqtuKn3w70sMuC6wTr7+su7rBQAwJ6rL2UsvFWrv3t2mY3xHMNc
iCqZ+/VI7vIk2m3ADANAxUV3O9u7dre27PpEvzlp3ftp8rS/7R/u+MpzkG/amBtMRAACAorycSZIvzqWjl15vOoblnbfz96YjAAAAGVrnDAAAAKdGOQMAALAQyhkAAICFUM4AAAAshHIGAABgIVF9t6bH45a9qZ47EQNgb6qXxxPV3w4AAEQEZs4AAAAsJKqnShISnNp3yMs6ZwE4b+fvlZDgNB0DAIBOj5kzAAAAC6GcAQAAWEhUn9aUWveMtNoNAbbmryVJ/i7dDCf5RuvemommYwAA0OlFdTnr1y/VdIRT2rNntySp/4VWKkOJln29AADoTKK6nM2cOct0hFNasKBAkrRo0WOGkwAAAKuJ6nIG4OxUVpapvHxzUI51Yob4xB8jHZGVlaOMjOwOHwcAIgHlDEBIOJ0szQIA54JyBqBNRkY2M1QAYBhLaQAAAFgI5QwAAMBCOK0Jy+LidABAZ0Q5Q6fAxekAgEhBOYNlcXE6AKAz4pozAAAAC6GcAQAAWAjlDAAAwEIoZwAAABZCOQMAALAQyhkAAICFUM4AAAAsxOb3+/2mQ5yr5uYWeTxNYflcoVitvn//1A4fi9XqAQCIDImJ8QE9j0VoDWC1egAA8H2YOQMAAAiDQGfOuOYMAADAQihnAAAAFkI5AwAAsBDKGQAAgIVQzgAAACyEcgYAAGAhlDMAAAALoZwBAABYCOUMAADAQihnAAAAFkI5AwAAsJCI3lsTAAAg2jBzBgAAYCGUMwAAAAuhnAEAAFgI5QwAAMBCKGcAAAAWQjkDAACwEMoZAACAhThMB+hMvvzyS82dO1f19fWy2WyaMmWKbr31VtOxotqxY8d088036/jx42ppadHo0aOVl5dnOlan0NLSosmTJys5OVnLli0zHSfqZWVlqXv37rLb7YqJidHatWtNR4p6hw4d0gMPPKBPP/1UNptNixcv1rBhw0zHilq7d+/W3Xff3Tbev3+/8vLyNH36dHOhQoRyFkYxMTEqKChQWlqajhw5osmTJ+uqq67SxRdfbDpa1IqNjdXKlSvVvXt3NTc366abblJ6erouu+wy09GiXlFRkQYMGKAjR46YjtJprFy5Ui6Xy3SMTuORRx7RyJEj9e///u86fvy4jh49ajpSVEtNTdW6desktf7xl56erpycHMOpQoPTmmGUlJSktLQ0SVKPHj2Umpqq2tpaw6mim81mU/fu3SVJXq9XXq9XNpvNcKroV1NTo8rKSt14442mowAhcfjwYb333ntt3+OxsbHq2bOn4VSdxzvvvKO+ffvqggsuMB0lJChnhhw4cEDV1dUaOnSo6ShRr6WlRePHj9eVV16pK6+8ktc8DBYvXqw5c+bIbudHTDjddtttmjRpkl555RXTUaLegQMH5HK5dN9992nChAmaN2+empqaTMfqNF5//XVdf/31pmOEDD85DWhsbFReXp7uv/9+9ejRw3ScqBcTE6N169bpD3/4gz766CN9+umnpiNFtYqKCrlcLg0ePNh0lE5l9erVeu211/TCCy9o1apVeu+990xHimper1c7d+7UtGnTVFpaqm7duqmwsNB0rE7h+PHjKi8v15gxY0xHCRnKWZg1NzcrLy9P48aN06hRo0zH6VR69uyp4cOH649//KPpKFHtgw8+UHl5ubKyspSfn693331X9957r+lYUS85OVmS1KtXL+Xk5Oijjz4ynCi6paSkKCUlpW0mfsyYMdq5c6fhVJ1DVVWV0tLSdP7555uOEjKUszDy+/2aN2+eUlNTNWPGDNNxOoWGhgbGThCSAAAGfklEQVQdOnRIknT06FG9/fbbSk1NNZwqut1zzz2qqqpSeXm5nnrqKV1xxRVasmSJ6VhRrampqe3Gi6amJv3pT3/SD3/4Q8OpoltiYqJSUlK0e/duSa3XQA0YMMBwqs7h9ddf13XXXWc6Rkhxt2YYvf/++1q3bp0GDhyo8ePHS5Ly8/N19dVXG04Wverq6lRQUKCWlhb5/X6
NGTNGmZmZpmMBQVVfX69f/OIXklqvsbz++uuVnp5uOFX0mz9/vu699141Nzerb9++evTRR01HinpNTU16++23tWjRItNRQsrm9/v9pkMAAACgFac1AQAALIRyBgAAYCGUMwAAAAuhnAEAAFgI5QwAAMBCWEoDgFE/+tGPNGPGDBUUFEiSli9frqamJv3yl7/s8LELCgqUkZHRoZXEa2pq9OCDD+qvf/2rfD6fMjIyNHfuXMXGxkpqXQ7nL3/5iyZPnqxdu3Zp69atio+Pl91u14IFCzRs2LCAP9eWLVvUpUsX/dM//dM55wUQ+Zg5A2BUbGysNm3apIaGBtNR2vF6vfL7/brzzjt1zTXXaNOmTdq4caOampr061//WpL01Vdf6eOPP9b69es1ffp0SdLcuXO1bt063XPPPVqwYMFZfb6tW7dq27ZtofhyAEQQZs4AGOVwOPSzn/1MK1eu1N13393ufSfPfA0bNkzbtm3Tli1b9PTTTys+Pl6ffvqpxo4dq4EDB6qoqEjHjh3Ts88+qwsvvFCS9Pbbb6uwsFCNjY0qKChQZmamWlpatGTJEm3dulXHjx/XzTffrKlTp2rLli36zW9+o549e2rPnj1auHChunbtqsmTJ0tq3af1/vvvV3Z2tvLy8jRz5kzV1tZq/Pjxmj9/frvsP/nJT7Rv3z5J0n//93/rlVdeUXNzsy666CI98cQT6tatmwoKChQbG6vq6molJydr27Ztstvt+t3vfqf58+frq6++0rPPPiu73a74+HitWrUq1P8dACyAcgbAuJtvvlk33HCDbr/99oA/ZteuXXrjjTeUkJCg7Oxs5ebmqqSkRCtXrtTLL7+sefPmSZI+//xzlZSUaN++ffrnf/5nXXnllSotLVV8fLxeffVVHT9+XFOnTtVVV10lSdq5c6fWr1+vvn37qqioSGlpae0+b48ePdS7d2999tlnev7553XHHXdo3bp1kqSSkpK255WXl2vgwIGSpJycHE2ZMkWS9Otf/1olJSW65ZZbJEm1tbVas2aNYmJi9PTTTysuLk633XabJGncuHFavny5kpOT27YhAxD9KGcAjOvRo4fGjx+voqIinXfeeQF9zJAhQ5SUlCRJuvDCC9vK1cCBA7Vly5a2540dO1Z2u139+vVT3759tXv3bv3pT3/SJ598oo0bN0qSDh8+rM8++0xdunTRkCFD1Ldv33P+Wp544gk9//zzcrlceuSRRyRJf/nLX7R06VIdPnxYjY2N+ulPf9r2/DFjxigmJuaUxxo2bJgKCgo0duxY5eTknHMmAJGFcgbAEm699VZNmjRJkyZNanssJiZGPp9PkuTz+dTc3Nz2vhMX5EuS3W5vG9vtdrW0tLS9z2aztfs8NptNfr9fDzzwgEaOHNnufVu2bFFcXFzb+OKLL24rcCccOXJEX375pS666CLV19d/5+uYO3fud25AKCgo0HPPPadLLrlEa9eu1datW9ve161bt+95RaRFixbpww8/VGVlpSZPnqxXX31VTqfze58PIDpwQwAAS0hISNCYMWPanRq84IILtGPHDkmtpwm/Xc4CtWHDBvl8Pu3bt0/79+9X//799dOf/lSrV69uO96ePXvU1NT0nY8dMWKEvv76a5WWlkpq3VT8scce08SJE09bqk7W2NioxMRENTc3a/369d/7vO7du6uxsbFtvG/fPg0dOlSzZ8+W0+lUTU1NwJ8TQORi5gyAZcycObPdRe9TpkzRz3/+c91www0aOXJku1mtQPXu3Vs33nijGhsb9eCDD6pr167Kzc3V559/rkmTJsnv98vpdOq55577zsfabDY9++yzevDBB/Xcc8/J5/Pp6quvVn5+/lllmD17tnJzc+VyuTR06NB2BezbMjMzlZeXp7KyMs2fP18rVqzQZ599Jr/fryuuuEKXXHLJWX/9ACKPze/3+02HAAAAQCtOawIAAFgI5QwAAMBCKGcAAAAWQjkDAACwEMoZAACAhVDOAAA
ALIRyBgAAYCH/D9Nb5JFvyswVAAAAAElFTkSuQmCC\n", 405 | "text/plain": [ 406 | "
" 407 | ] 408 | }, 409 | "metadata": {}, 410 | "output_type": "display_data" 411 | } 412 | ], 413 | "source": [ 414 | "plt.figure(figsize=(10,5))\n", 415 | "seaborn.boxplot(data=df, x='NumberOfParts', y='Length')" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 159, 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "data": { 425 | "text/html": [ 426 | "
\n", 427 | "\n", 440 | "\n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | "
NumberOfPartsLengthLongestPartTLDRandomness
\n", 454 | "
" 455 | ], 456 | "text/plain": [ 457 | "Empty DataFrame\n", 458 | "Columns: [NumberOfParts, Length, LongestPart, TLD, Randomness]\n", 459 | "Index: []" 460 | ] 461 | }, 462 | "execution_count": 159, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "df.loc[df['NumberOfParts'] == 1]" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 160, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "data": { 478 | "text/plain": [ 479 | "" 480 | ] 481 | }, 482 | "execution_count": 160, 483 | "metadata": {}, 484 | "output_type": "execute_result" 485 | }, 486 | { 487 | "data": { 488 | "image/png": "\n", 489 | "text/plain": [ 490 | "
" 491 | ] 492 | }, 493 | "metadata": {}, 494 | "output_type": "display_data" 495 | } 496 | ], 497 | "source": [ 498 | "plt.figure(figsize=(10,5))\n", 499 | "seaborn.violinplot(data=df, x='NumberOfParts', y='Length')" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 161, 505 | "metadata": {}, 506 | "outputs": [ 507 | { 508 | "data": { 509 | "text/plain": [ 510 | "" 511 | ] 512 | }, 513 | "execution_count": 161, 514 | "metadata": {}, 515 | "output_type": "execute_result" 516 | }, 517 | { 518 | "data": { 519 | "image/png": "\n", 520 | "text/plain": [ 521 | "
" 522 | ] 523 | }, 524 | "metadata": {}, 525 | "output_type": "display_data" 526 | } 527 | ], 528 | "source": [ 529 | "plt.figure(figsize=(10,5))\n", 530 | "df['TLD'].value_counts().plot(kind='bar')" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 162, 536 | "metadata": {}, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "text/plain": [ 541 | "" 542 | ] 543 | }, 544 | "execution_count": 162, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | }, 548 | { 549 | "data": { 550 | "image/png": "\n", 551 | "text/plain": [ 552 | "
" 553 | ] 554 | }, 555 | "metadata": {}, 556 | "output_type": "display_data" 557 | } 558 | ], 559 | "source": [ 560 | "plt.figure(figsize=(10,5))\n", 561 | "df.loc[df['TLD'] > 3]['TLD'].value_counts().plot(kind='bar')" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 163, 567 | "metadata": {}, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/html": [ 572 | "
\n", 573 | "\n", 586 | "\n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | "
NumberOfPartsLengthLongestPartTLDRandomness
emil.engineering21611110.0
\n", 608 | "
" 609 | ], 610 | "text/plain": [ 611 | " NumberOfParts Length LongestPart TLD Randomness\n", 612 | "emil.engineering 2 16 11 11 0.0" 613 | ] 614 | }, 615 | "execution_count": 163, 616 | "metadata": {}, 617 | "output_type": "execute_result" 618 | } 619 | ], 620 | "source": [ 621 | "df.loc[df['TLD'] == 11].sample()" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": {}, 628 | "outputs": [], 629 | "source": [] 630 | } 631 | ], 632 | "metadata": { 633 | "kernelspec": { 634 | "display_name": "Python 3", 635 | "language": "python", 636 | "name": "python3" 637 | }, 638 | "language_info": { 639 | "codemirror_mode": { 640 | "name": "ipython", 641 | "version": 3 642 | }, 643 | "file_extension": ".py", 644 | "mimetype": "text/x-python", 645 | "name": "python", 646 | "nbconvert_exporter": "python", 647 | "pygments_lexer": "ipython3", 648 | "version": "3.6.7" 649 | } 650 | }, 651 | "nbformat": 4, 652 | "nbformat_minor": 2 653 | } 654 | --------------------------------------------------------------------------------