├── .coveragerc ├── .github ├── .codecov.yml ├── pull_request_template.md └── workflows │ ├── codecov.yml │ ├── codeql-analysis.yml │ ├── disabled │ ├── codecov.yml │ └── qa-gates.yml │ ├── pylint.yml │ └── python-app.yml ├── .gitignore ├── .idea ├── .gitignore ├── credentialLeakDB.iml ├── dbnavigator.xml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── CHANGELOG ├── CONTRIBUTING.md ├── EER.png ├── README.md ├── SECURITY.md ├── __init__.py ├── api ├── __init__.py ├── enrichment.py ├── main.py └── models.py ├── config.SAMPLE.py ├── db.sql ├── lib ├── __init__.py ├── basecollector │ ├── __init__.py │ └── collector.py ├── baseenricher │ ├── __init__.py │ └── enricher.py ├── baseoutput │ ├── __init__.py │ └── output.py ├── baseparser │ ├── __init__.py │ └── parser.py ├── db │ ├── __init__.py │ └── db.py └── helpers.py ├── models ├── __init__.py ├── idf.py ├── indf.py └── outdf.py ├── modules ├── __init__.py ├── collectors │ ├── __init__.py │ ├── parser.py │ ├── sample.csv │ ├── spycloud.py │ ├── spycloud │ │ ├── __init__.py │ │ └── collector.py │ └── test_leaks │ │ ├── COMB │ │ └── test_data.txt │ │ └── README.md ├── enrichers │ ├── __init__.py │ ├── abuse_contact.py │ ├── external_email.py │ ├── ldap.py │ ├── ldap_lib.py │ └── vip.py ├── filters │ ├── __init__.py │ ├── deduper.py │ └── filter.py ├── output │ ├── __init__.py │ └── db.py └── parsers │ ├── __init__.py │ └── spycloud.py ├── requirements.txt ├── sonar-project.properties └── tests ├── README.md ├── __init__.py ├── fixtures ├── data.csv ├── data_anonymized_spycloud.csv └── vips.txt ├── lib ├── __init__.py ├── basecollector │ ├── __init__.py │ └── test_collector.py ├── baseenricher │ ├── __init__.py │ └── test_enricher.py ├── baseoutput │ ├── __init__.py │ └── test_output.py ├── baseparser │ ├── __init__.py │ └── test_parser.py ├── test_helpers.py └── test_logger.py ├── modules ├── __init__.py └── enrichers │ ├── __init__.py │ └── 
test_external_email.py ├── test_collector_spycloud.py ├── test_deduper.py ├── test_enrichment.py ├── test_filter.py ├── test_main.py └── test_parser_spycloud.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = venv/* 3 | -------------------------------------------------------------------------------- /.github/.codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: yes 3 | branch: main 4 | 5 | coverage: 6 | precision: 2 7 | round: down 8 | range: "70...100" 9 | 10 | parsers: 11 | gcov: 12 | branch_detection: 13 | conditional: yes 14 | loop: yes 15 | method: no 16 | macro: no 17 | 18 | comment: 19 | layout: "reach,diff,flags,files,footer" 20 | behavior: default 21 | require_changes: no 22 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Pull request template 2 | 3 | The model we are using is to have a fork of the repo, edit on the fork, then 4 | make a pull request. The PR needs then to be reviewed and 5 | merged into the ``main`` branch. 6 | It will be sent through a couple of quality checks and if it does 7 | not meet those, the PR needs to be adapted. 8 | 9 | # Description 10 | New tool, Bug fixing, or Improvement? 11 | Please include a summary of the change and which issue is fixed. Also include relevant motivation and context. 12 | 13 | ## Related issue 14 | 15 | ## Check list 16 | - [ ] Related issue / work item is attached 17 | - [ ] Unit-tests are written (if applicable) 18 | - [ ] Documentation is updated (if applicable) 19 | - [ ] Changes are tested, tests pass, no code linting errors and no high/critical vulnerabilities identified in codebase. 20 | 21 | ## Testing 22 | - [ ] Did you write new unit tests for this change? 
23 | - [ ] Did you write new integration tests for this change? 24 | Include the test commands you ran locally to test this change 25 | e.g.: 26 | ```bash 27 | pytest -v 28 | ``` 29 | 30 | ## Monitoring 31 | - [ ] Will this change be covered by our existing monitoring? (no new canaries/metrics/dashboards/alarms are required) 32 | - [ ] Will this change have no (or positive) effect on resources and/or limits? 33 | (including CPU, memory, AWS resources, calls to other services) 34 | - [ ] Can this change be deployed to Prod without triggering any alarms? 35 | 36 | ## Rollout 37 | - [ ] Can this change be merged immediately into the pipeline upon approval? 38 | - [ ] Are all dependent changes already deployed to Prod? 39 | - [ ] Can this change be rolled back without any issues after deployment to Prod? 40 | 41 | 42 | 43 | 44 | This is the template we use in our projects. 45 | -------------------------------------------------------------------------------- /.github/workflows/codecov.yml: -------------------------------------------------------------------------------- 1 | name: Codecov 2 | on: [push] 3 | jobs: 4 | run: 5 | runs-on: ubuntu-latest 6 | env: 7 | PORT: 8080 8 | DBHOST: localhost 9 | DBUSER: credentialleakdb 10 | DBPASSWORD: 1234testForUnitTesting 11 | DBNAME: credentialleakdb 12 | VIPLIST: tests/fixtures/vips.txt 13 | steps: 14 | - uses: actions/checkout@v2 15 | with: 16 | fetch-depth: 0 17 | - name: Set up Python 3.9 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.9 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install flake8 pytest pytest-cov 25 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 26 | - name: set up demo database 27 | run: | 28 | sudo apt install postgresql 29 | # sudo -u postgres pg_ctlcluster 12 main start 30 | sudo service postgresql start 31 | # sudo -u postgres pg_ctlcluster 12 main start 32 | sudo -u postgres createdb credentialleakdb 33 | sudo 
-u postgres createuser -s $DBUSER 34 | sudo -u postgres psql $DBNAME < db.sql 35 | sudo -u postgres psql -c "ALTER role $DBUSER WITH PASSWORD '$DBPASSWORD'" 36 | - name: prepare environment and mocking 37 | run: | 38 | cp config.SAMPLE.py api/config.py 39 | echo "PORT=$PORT" > ENV 40 | echo "DBHOST=$DBHOST" >> ENV 41 | echo "DBUSER=$DBUSER" >> ENV 42 | echo "DBPASSWORD=$DBPASSWORD" >> ENV 43 | echo "DBNAME=$DBNAME" >> ENV 44 | - name: Generate coverage report 45 | run: | 46 | pip install pytest 47 | pip install pytest-cov 48 | pip install -r requirements.txt 49 | python -m pytest -vv --cov=./ --cov-report=term --cov-report=xml tests/ 50 | - name: Upload coverage to Codecov 51 | uses: codecov/codecov-action@v1 52 | with: 53 | token: ${{ secrets.CODECOV_TOKEN }} 54 | files: ./coverage.xml 55 | directory: ./coverage/reports/ 56 | flags: unittests 57 | env_vars: OS,PYTHON 58 | name: codecov-umbrella 59 | fail_ci_if_error: true 60 | path_to_write_report: ./coverage/codecov_report.txt 61 | verbose: true 62 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '40 10 * * 0' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | language: [ 'python' ] 32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 33 | # Learn more: 34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v2 39 | 40 | # Initializes the CodeQL tools for scanning. 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v1 43 | with: 44 | languages: ${{ matrix.language }} 45 | # If you wish to specify custom queries, you can do so here or in a config file. 46 | # By default, queries listed here will override any specified in a config file. 47 | # Prefix the list here with "+" to use these queries and those in the config file. 48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 49 | 50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 51 | # If this step fails, then you should remove it and run the build manually (see below) 52 | - name: Autobuild 53 | uses: github/codeql-action/autobuild@v1 54 | 55 | # ℹ️ Command-line programs to run using the OS shell. 
56 | # 📚 https://git.io/JvXDl 57 | 58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 59 | # and modify them (or add more) to build your code if your project 60 | # uses a compiled language 61 | 62 | #- run: | 63 | # make bootstrap 64 | # make release 65 | 66 | - name: Perform CodeQL Analysis 67 | uses: github/codeql-action/analyze@v1 68 | -------------------------------------------------------------------------------- /.github/workflows/disabled/codecov.yml: -------------------------------------------------------------------------------- 1 | name: Codecov run 2 | on: [push] 3 | jobs: 4 | run: 5 | runs-on: ${{ matrix.os }} 6 | strategy: 7 | matrix: 8 | os: [ubuntu-latest, debian-latest] 9 | env: 10 | OS: ${{ matrix.os }} 11 | PYTHON: '3.7' 12 | steps: 13 | - uses: actions/checkout@master 14 | - name: Setup Python 15 | uses: actions/setup-python@master 16 | with: 17 | python-version: 3.7 18 | - name: Generate coverage report 19 | run: | 20 | pip install pytest 21 | pip install pytest-cov 22 | pytest --cov=./ --cov-report=xml 23 | - name: Upload coverage to Codecov 24 | uses: codecov/codecov-action@v1 25 | with: 26 | token: ${{ secrets.CODECOV_TOKEN }} 27 | files: ./coverage1.xml,./coverage2.xml 28 | directory: ./coverage/reports/ 29 | flags: unittests 30 | env_vars: OS,PYTHON 31 | name: codecov-umbrella 32 | fail_ci_if_error: true 33 | path_to_write_report: ./coverage/codecov_report.txt 34 | verbose: true -------------------------------------------------------------------------------- /.github/workflows/disabled/qa-gates.yml: -------------------------------------------------------------------------------- 1 | name: QA Gates 2 | # on: 3 | # push: 4 | # branches: 5 | # - master 6 | # pull_request: 7 | # types: [opened, synchronize, reopened] 8 | on: 9 | push: 10 | branches: [ main, release/* ] 11 | pull_request: 12 | branches: [ main ] 13 | jobs: 14 | qa-gates: 15 | name: SonarCloud 16 | runs-on: ubuntu-latest 17 | steps: 18 | - 
uses: actions/checkout@v2 19 | with: 20 | fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis 21 | - name: SonarCloud Scan 22 | uses: SonarSource/sonarcloud-github-action@master 23 | env: 24 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any 25 | SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: Pylint 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | env: 8 | VIPLIST: tests/fixtures/vips.txt 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.8 15 | uses: actions/setup-python@v1 16 | with: 17 | python-version: 3.8 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install pylint 22 | pip install -r requirements.txt 23 | - name: prepare environment and mocking 24 | run: | 25 | cp config.SAMPLE.py api/config.py 26 | - name: Analysing the code with pylint 27 | run: | 28 | export PYTHONPATH=$(pwd) && pylint --suggestion-mode=y --extension-pkg-whitelist='pydantic' -E -d C0301 -d E0611 api models tests modules lib 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: flake8 and pytest 5 | 6 | on: 7 | push: 8 | branches: [ main, develop, re-write-modules, release/* ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | env: 17 | PORT: 8080 18 | DBHOST: localhost 19 | 
DBUSER: credentialleakdb 20 | DBPASSWORD: 1234testForUnitTesting 21 | DBNAME: credentialleakdb 22 | VIPLIST: tests/fixtures/vips.txt 23 | steps: 24 | - uses: actions/checkout@v2 25 | with: 26 | fetch-depth: 0 27 | - name: Set up Python 3.9 28 | uses: actions/setup-python@v2 29 | with: 30 | python-version: 3.9 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install flake8 pytest pytest-cov 35 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 36 | - name: set up demo database 37 | run: | 38 | sudo apt install postgresql 39 | # sudo -u postgres pg_ctlcluster 12 main start 40 | sudo service postgresql start 41 | # sudo -u postgres pg_ctlcluster 12 main start 42 | sudo -u postgres createdb credentialleakdb 43 | sudo -u postgres createuser -s $DBUSER 44 | sudo -u postgres psql $DBNAME < db.sql 45 | sudo -u postgres psql -c "ALTER role $DBUSER WITH PASSWORD '$DBPASSWORD'" 46 | - name: prepare environment and mocking 47 | run: | 48 | cp config.SAMPLE.py api/config.py 49 | echo "PORT=$PORT" > ENV 50 | echo "DBHOST=$DBHOST" >> ENV 51 | echo "DBUSER=$DBUSER" >> ENV 52 | echo "DBPASSWORD=$DBPASSWORD" >> ENV 53 | echo "DBNAME=$DBNAME" >> ENV 54 | - name: Lint with flake8 55 | run: | 56 | # stop the build if there are Python syntax errors or undefined names 57 | flake8 . --count --select=E9,F63,F7,F82 --ignore=E251 --show-source --statistics --exclude venv 58 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 59 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --ignore=E251 --statistics --exclude venv 60 | - name: Start the RESTful API server 61 | run: | 62 | uvicorn --host 127.0.0.1 --port 8080 --reload api.main:app & 63 | # uvicorn --env-file ENV --host 127.0.0.1 --port 8080 --reload api.main:app & 64 | 65 | - name: Test with pytest 66 | run: | 67 | python -m pytest -vv --cov=./ --cov-report=term --cov-report=xml tests/ 68 | 69 | - name: Validate coverage report exists 70 | run: | 71 | ls -lha 72 | 73 | - name: Snyk Security Scan 74 | uses: snyk/actions/python@master 75 | # continue-on-errormsg: true 76 | env: 77 | SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} 78 | with: 79 | # https://support.snyk.io/hc/en-us/articles/360003812578-Our-full-CLI-reference 80 | # args: --command=pipenv run --severity-threshold=high --fail-on=all --file=*req*.txt --dev --org=digits2 --debug 81 | args: --command=python --severity-threshold=high --fail-on=all --file=requirements.txt --package-manager=pip --dev --org=digits2 --debug --skip-unresolved 82 | 83 | - name: SonarCloud Scan 84 | uses: SonarSource/sonarcloud-github-action@master 85 | env: 86 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any 87 | SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} 88 | 89 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # config secrets etc. 
2 | ENV 3 | .env 4 | config.py 5 | api/config.py 6 | 7 | # python venvs 8 | .venv 9 | venv 10 | 11 | # editor 12 | .idea/ 13 | 14 | # real data directories, don't upload to github ;-) 15 | modules/collectors/real_data 16 | real_data 17 | data/ 18 | VIPs.txt 19 | 20 | poetry.lock 21 | 22 | 23 | *.swp 24 | */*.swp 25 | 26 | 27 | # test caches from pytest 28 | .pytest_cache 29 | */.pytest_cache 30 | __pycache__/ 31 | */__pycache__/ 32 | old 33 | cache 34 | .coverage 35 | coverage.xml 36 | 37 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/credentialLeakDB.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 19 | -------------------------------------------------------------------------------- /.idea/dbnavigator.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 
| 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 14 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 0.6 * complete re-write. Refactor everything. 2 | * make a structure of collectors, parsers, enrichers, filters, output modules 3 | * re-wrote spycloud parsing completely 4 | * re-wrote the output / postgresql storing of the data 5 | * lots of unit tests . Brings coverage > 80% 6 | 7 | 0.5 * things sort of work, but feels buggy 8 | 9 | 0.4 * added endpoints for /leak and /leak_data 10 | * fixed the Answer format for all endpoints. 11 | * Added autocommit 12 | * Minor bugs. 13 | 14 | 0.3 * moved to public github.com/EC-DIGIT-CSIRC/credentialLeakDB.git 15 | * refactored code so that the Parser is now abstract, implement basic Spycloud parser 16 | * refactored DB insert 17 | * first API version, though still lots of bugs 18 | 0.2 moved to an internal proj. 
in github after OK from @ddurvaux 19 | 0.1 initial import 20 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Please create a fork of this project, then make your changes and then send a 2 | pull request (merge request in gitlab's lingo). -------------------------------------------------------------------------------- /EER.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/EER.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # credentialleakDB 2 | 3 | [![Pylint](https://github.com/EC-DIGIT-CSIRC/credentialLeakDB/actions/workflows/pylint.yml/badge.svg)](https://github.com/EC-DIGIT-CSIRC/credentialLeakDB/actions/workflows/pylint.yml) 4 | [![flak8 and pytest](https://github.com/EC-DIGIT-CSIRC/credentialLeakDB/actions/workflows/python-app.yml/badge.svg)](https://github.com/EC-DIGIT-CSIRC/credentialLeakDB/actions/workflows/python-app.yml) 5 | [![CodeQL](https://github.com/EC-DIGIT-CSIRC/credentialLeakDB/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/EC-DIGIT-CSIRC/credentialLeakDB/actions/workflows/codeql-analysis.yml) 6 | [![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=digits2_credentialLeakDB&metric=alert_status&token=cee9c8232570fa1000ab4770feb571fd3e85ff39)](https://sonarcloud.io/dashboard?id=digits2_credentialLeakDB) 7 | [![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=digits2_credentialLeakDB&metric=sqale_rating&token=cee9c8232570fa1000ab4770feb571fd3e85ff39)](https://sonarcloud.io/dashboard?id=digits2_credentialLeakDB) 8 | [![Reliability 
Rating](https://sonarcloud.io/api/project_badges/measure?project=digits2_credentialLeakDB&metric=reliability_rating&token=cee9c8232570fa1000ab4770feb571fd3e85ff39)](https://sonarcloud.io/dashboard?id=digits2_credentialLeakDB) 9 | [![Security Rating](https://sonarcloud.io/api/project_badges/measure?project=digits2_credentialLeakDB&metric=security_rating&token=cee9c8232570fa1000ab4770feb571fd3e85ff39)](https://sonarcloud.io/dashboard?id=digits2_credentialLeakDB) 10 | [![codecov](https://codecov.io/gh/EC-DIGIT-CSIRC/credentialLeakDB/branch/main/graph/badge.svg?token=SS5F8EXQON)](https://codecov.io/gh/EC-DIGIT-CSIRC/credentialLeakDB) 11 | 12 | 13 | A database structure to store leaked credentials. 14 | 15 | Think: our own, internal [HaveIBeenPwned](https://haveibeenpwned.com/) database. 16 | 17 | ## Why? 18 | 19 | 1. To quickly find duplicates before sending it on to further process the data 20 | 2. To have a way to load diverse credential breaches into a common structure and do common queries on it 21 | 3. To quickly generate statistics on credential leaks 22 | 4. To have a well defined interface to pass on data to pass it on to other automation steps 23 | 24 | ## Documentation 25 | 26 | ### Installation 27 | 28 | #### Docker 29 | 30 | #### Via pip and venv 31 | 32 | ```bash 33 | git clone https://github.com/EC-DIGIT-CSIRC/credentialLeakDB.git 34 | cd credentialLeakDB 35 | # create a virtualenv 36 | virtualenv --python=python3.7 venv 37 | source venv/bin/activate 38 | pip install -r requirements.txt 39 | ``` 40 | 41 | Next, make sure the following files exist: 42 | * ``VIPs.txt`` ... a \n separated list of email addresses which you would consider VIPs. 43 | * api/config.py ... see below 44 | 45 | ### Database structure 46 | Search in Confluence for "credentialLeakDB" in the Automation space. 47 | 48 | SQL structure: [db.sql](db.sql) 49 | 50 | The EER diagram __intentionally__ got simplified a lot. 
If we are going to store billions of repeated ``text`` datatype records, we can 51 | go back to more normalization. For now, however, this seems to be enough. 52 | 53 | 54 | ![EER Diagram](EER.png) 55 | 56 | 57 | 58 | ### Meaning of the fields 59 | 60 | #### Table ``leak`` 61 | 62 | | Column | Type | Collation | Nullable | Description | 63 | |------------------ | ------------------------ | --------- | -------- | ----------------------------------------------------------------------------------------------------------------- | 64 | | ``id`` | integer | | not null | _primary key. Auto-generated_. | 65 | | ``breach_ts`` | timestamp with time zone | | | If known, the timestamp when the breach happened. | 66 | | ``source_publish_ts`` | timestamp with time zone | | | The timestamp according when the source (f.ex. Spycloud) published the data. | 67 | | ``ingestion_ts`` | timestamp with time zone | | not null | The timestamp when we ingested the data. | 68 | | ``summary`` | text | | not null | A short summary (slug) of the leak. Used for displaying it somewhere | 69 | | ``ticket_id`` | text | | | | 70 | | ``reporter_name`` | text | | | The name of the reporter where we got the notification from. E.g. CERT-eu, Spycloud, etc... Who sent us the data? | 71 | | ``source_name`` | text | | | The name of the source where this leak came from. Either the name of a collection or some other name. | 72 | 73 | ``` 74 | Indexes: 75 | "leak_pkey" PRIMARY KEY, btree (id) 76 | Referenced by: 77 | TABLE "leak_data" CONSTRAINT "leak_data_leak_id_fkey" FOREIGN KEY (leak_id) REFERENCES leak(id) 78 | ``` 79 | 80 | #### Table ``leak_data`` 81 | 82 | | Column | Type | Collation | Nullable | Description 83 | --------------------- | ------- | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- 84 | ``id`` | integer | | not null | _primary key, auto-generated_. 
| 85 | ``leak_id`` | integer | | not null | references a ``leak(id)`` | 86 | ``email`` | text | | not null | The email address associated with the leak. | 87 | ``password`` | text | | not null | Either the encrypted or unencrypted password. If the unencrypted password is available, that is what is going to be in this field. | 88 | ``password_plain`` | text | | | The plaintext password, if known. | 89 | ``password_hashed`` | text | | | The hashed password, if known. | 90 | ``hash_algo`` | text | | | If we can determine the hashing algo and the password_hashed field is set, for example "md5" or "sha1" | 91 | ``ticket_id`` | text | | | References the ticket systems' ticket ID associated with handling this credential leak . This ticket could contain infos on how we contacted the affected user. | 92 | ``email_verified`` | boolean | | | If the email address was verified if it does exist and is active | 93 | ``password_verified_ok`` | boolean | | | Was that password still valid / active? | 94 | ``ip`` | inet | | | IP address of the client PC in case of a password stealer. | 95 | ``domain`` | text | | | Domain address of the user's email address. | 96 | ``browser`` | text | | | If the password was leaked via a password stealer malware, then the browser of the user goes here. Otherwise empty. | 97 | ``malware_name`` | text | | | If the password was leaked via a password stealer malware, then the malware name goes here. Otherwise empty. | 98 | ``infected_machine`` | text | | | If the password was leaked via a password stealer malware, then the infected (Windows) PC name (some ID for the machine) goes here. | 99 | ``dg`` | text | | not null | The affected DG (in other organisations, this would be called "department") 100 | ``count_seen`` | integer | | | How often did we already see this unique combination (leak, email, password, domain). I.e. this is a duplicate counter. 
| 101 | 102 | ``` 103 | Indexes: 104 | "leak_data_pkey" PRIMARY KEY, btree (id) 105 | "constr_unique_leak_data_leak_id_email_password_domain" UNIQUE CONSTRAINT, btree (leak_id, email, password, domain) 106 | "idx_leak_data_unique_leak_id_email_password_domain" UNIQUE, btree (leak_id, email, password, domain) 107 | "idx_leak_data_dg" btree (dg) 108 | "idx_leak_data_email" btree (upper(email)) 109 | "idx_leak_data_email_password_machine" btree (email, password, infected_machine) 110 | "idx_leak_data_malware_name" btree (malware_name) 111 | Foreign-key constraints: 112 | "leak_data_leak_id_fkey" FOREIGN KEY (leak_id) REFERENCES leak(id) 113 | ``` 114 | 115 | 116 | # Usage of the API 117 | 118 | Here is how to use the API endpoints: you can start the server (follow the instructions below) and go to ``$servername/docs`` where $servername is of course the domain / IP address you installed it under. The ``docs/`` endpoint hosts a swagger / OpenAPI 3 119 | 120 | ## GET parameters 121 | 122 | These are pretty self-explanatory thanks to the swagger UI. 123 | 124 | ## POST and PUT 125 | 126 | For HTTP POST (a.k.a INSERT into DB) you will need to provide the following JSON info: 127 | 128 | ### leak object 129 | ```json 130 | { 131 | "id": 0, 132 | "ticket_id": "string", 133 | "summary": "string", 134 | "reporter_name": "string", 135 | "source_name": "string", 136 | "breach_ts": "2021-03-29T12:21:56.370Z", 137 | "source_publish_ts": "2021-03-29T12:21:56.370Z" 138 | } 139 | 140 | ``` 141 | 142 | The ``id`` field *only* needs to be filled out when PUTing data there (a.k.a UPDATE statement). Otherwise please leave it out when POSTing a new leak_data row. 143 | The id is the internal automatically generated primary key (ID) and will be assigned. So when you use the ``HTTP POST /leak`` endpoint, please leave out ``id``. 
The answer will be a JSON array with a dict with the id inside, such as: 144 | 145 | ```json 146 | { 147 | "meta": { 148 | "version": "0.5", 149 | "duration": 0.006, 150 | "count": 1 151 | }, 152 | "data": [ 153 | { 154 | "id": 18 155 | } 156 | ], 157 | "error": null 158 | } 159 | ``` 160 | 161 | Meaning: the version of the API was 0.5, the query duration was 0.006 sec (6 millisec), one answer. The ``data`` array contains one element: id=18. Meaning, the ID of the inserted leak object was 18. You can now reference this in the leak_data object insertion. 162 | 163 | ### leak_data object 164 | 165 | Same as the leak object, here the ``id`` field *only* needs to be filled out when PUTing data there (a.k.a UPDATE statement). Otherwise please leave it out when POSTing a new leak_data row. **Note well**: the leak_id field needs to be filled out in this case. You **first** have to create leak object and then afterwards the leak_data object. 166 | 167 | ```json 168 | { 169 | "id": 0, 170 | "leak_id": 0, 171 | "email": "user@example.com", 172 | "password": "string", 173 | "password_plain": "string", 174 | "password_hashed": "string", 175 | "hash_algo": "string", 176 | "ticket_id": "string", 177 | "email_verified": true, 178 | "password_verified_ok": true, 179 | "ip": "string", 180 | "domain": "string", 181 | "browser": "string", 182 | "malware_name": "string", 183 | "infected_machine": "string", 184 | "dg": "string" 185 | } 186 | ``` 187 | 188 | ## ``import/csv/`` endpoint 189 | 190 | Also pretty self-explanatory. You need to first create a leak object, give it's ID as a GET-style parameter and upload the CSV in spycloud format via the Form. 191 | 192 | 193 | ## Installation 194 | 195 | 1. Install git and checkout this repository: 196 | ```bash 197 | apt install git 198 | git clone ... 199 | cd credentialLeakDB 200 | ``` 201 | 202 | 3. 
Install Postgresql: 203 | ```bash 204 | # in Ubuntu: 205 | apt install postgresql-12 206 | # alternatively, if you are in Debian 10, you can also use postgresql-11, both work: 207 | # apt install postgresql-11 208 | ``` 209 | 210 | 2. as user postgres: 211 | ```bash 212 | sudo su - postgres 213 | createdb credentialleakdb 214 | createuser credentialleakdb 215 | psql -c "ALTER ROLE credentialleakdb WITH PASSWORD ''" template1 216 | ``` 217 | 218 | 3. create the DB: 219 | ```psql -u credentialleakdb credentialleakdb < db.sql``` 220 | 221 | 5. set the env vars: 222 | ```bash 223 | export PORT=8080 224 | export DBNAME=credentialleakdb 225 | export DBUSER=credentialleakdb 226 | export DBPASSWORD=... ... 227 | export DBHOST=localhost 228 | ``` 229 | 5. Create a virtual environment if it does not exist yet: 230 | ```bash 231 | virtualenv --python=python3.7 venv 232 | source venv/bin/activate 233 | pip install -r requirements.txt 234 | ``` 235 | 5. start the program from the main directory: 236 | ```bash 237 | export PYTHONPATH=$(pwd); uvicorn --reload --host 0.0.0.0 --port $PORT api.main:app 238 | ``` 239 | 240 | ## Configuration. 241 | 242 | Please copy the file ``config.SAMPLE.py`` to ``api/config.py`` and adjust accordingly. 243 | Here you can set API keys etc. 244 | 245 | 246 | 247 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | All bug reports should please go to ec-digit-csirc@ec.europa.eu. Thanks. 6 | Pull requests welcome! 
7 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/__init__.py -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/api/__init__.py -------------------------------------------------------------------------------- /api/enrichment.py: -------------------------------------------------------------------------------- 1 | """ 2 | Enrichment code 3 | 4 | Author: Aaron Kaplan 5 | License: see LICENSE 6 | 7 | This basically just pulls in the enricher classes. 8 | 9 | """ 10 | from modules.enrichers.ldap_lib import CEDQuery 11 | from modules.enrichers.ldap import LDAPEnricher 12 | from modules.enrichers.vip import VIPEnricher 13 | from modules.enrichers.external_email import ExternalEmailEnricher 14 | -------------------------------------------------------------------------------- /api/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | FastAPI based API on the credentialLeakDB 3 | 4 | Author: Aaron Kaplan 5 | License: see LICENSE 6 | 7 | """ 8 | 9 | # system / base packages 10 | from lib.helpers import getlogger, anonymize_password 11 | import os 12 | import shutil 13 | import time 14 | from pathlib import Path 15 | from tempfile import SpooledTemporaryFile 16 | from typing import List 17 | 18 | # database, ASGI, etc. 
import pandas as pd
import psycopg2
import psycopg2.extras
import uvicorn
from fastapi import FastAPI, HTTPException, File, UploadFile, Depends, Security, Response
from fastapi.security.api_key import APIKeyHeader, APIKey, Request
from pydantic import EmailStr

# packages from this code repo
from api.config import config
from lib.db.db import _get_db, _close_db, _connect_db, DSN
from models.idf import InternalDataFormat
from models.outdf import Leak, LeakData, Answer, AnswerMeta
from modules.collectors.parser import BaseParser  # XXX FIXME: this should be in lib, no? Or called "genericparser"
from modules.collectors.spycloud.collector import SpyCloudCollector
from modules.enrichers.abuse_contact import AbuseContactLookup
from modules.enrichers.external_email import ExternalEmailEnricher
from modules.enrichers.ldap import LDAPEnricher
from modules.enrichers.vip import VIPEnricher
from modules.filters.deduper import Deduper
from modules.filters.filter import Filter
from modules.output.db import PostgresqlOutput
from modules.parsers.spycloud import SpyCloudParser

###############################################################################
# API key stuff
API_KEYLEN = 32  # expected length (characters) of an API key
API_KEY_NAME = "x-api-key"  # name of the HTTP header that carries the API key
api_key_header = APIKeyHeader(name = API_KEY_NAME, auto_error = True)

VER = "0.6"  # API version string, reported in every AnswerMeta

logger = getlogger(__name__)

app = FastAPI(title = "CredentialLeakDB", version = VER, )  # root_path='/api/v1')


# ##############################################################################
# DB specific functions
@app.on_event('startup')
def get_db():
    """Return the DB connection from lib.db.db; also runs once as the FastAPI startup hook.

    NOTE(review): endpoints call this directly per-request; _get_db() presumably
    caches/reuses the connection -- confirm in lib/db/db.py.
    """
    return _get_db()


@app.on_event('shutdown')
def close_db():
    """Close the DB connection on application shutdown."""
    return _close_db()
def fetch_valid_api_keys() -> List[str]:
    """Fetch the list of valid API keys from a DB or a config file.

    :returns: List of strings - the API keys
    """
    return config['api_keys']


def is_valid_api_key(key: str) -> bool:
    """
    Validate a given key against the list of allowed API keys.

    :param key: the API key
    :returns: boolean: YES/NO
    """
    # NOTE(review): an additional source-IP allow-list (request.client.host) was
    # sketched here previously; re-introduce it as a parameter if ever needed.
    return key in fetch_valid_api_keys()


def validate_api_key_header(apikeyheader: str = Security(api_key_header)):
    """
    Validate if a given API key is present in the HTTP apikeyheader.

    :param apikeyheader: the required HTTP Header
    :returns: the apikey apikeyheader again, if it is valid. Otherwise, raise an HTTPException and return 403.
    """
    if not apikeyheader:
        raise HTTPException(status_code = 403,
                            detail = """need API key. Please get in contact with the admins of this
                            site in order get your API key.""")
    if is_valid_api_key(apikeyheader):
        return apikeyheader
    else:
        raise HTTPException(
            status_code = 403,  # HTTP FORBIDDEN
            detail = """Could not validate the provided credentials. Please get in contact with the admins of this
            site in order get your API key."""
        )


# ##############################################################################
# File uploading
async def store_file(orig_filename: str, _file: SpooledTemporaryFile,
                     upload_path=os.getenv('UPLOAD_PATH', default = '/tmp')) -> str:
    """
    Stores a SpooledTemporaryFile to a permanent location and returns the path to it

    :param orig_filename: the filename according to multipart
    :param _file: the SpooledTemporary File
    :param upload_path: where the uploaded file should be stored permanently
    :returns: full path to the stored file
    """
    # SECURITY FIX: orig_filename comes straight from the client's multipart
    # header. Keep only the basename so a crafted name such as
    # "../../etc/cron.d/evil" cannot escape upload_path (path traversal).
    safe_name = Path(orig_filename).name
    # Unfortunately we need to really shutil.copyfileobj() the file object to disk, even though we already have a
    # SpooledTemporaryFile object... this is needed for SpooledTemporaryFiles . Sucks. See here:
    # https://stackoverflow.com/questions/94153/how-do-i-persist-to-disk-a-temporary-file-using-python
    #
    # filepath syntax: <upload_path>/<orig_filename basename>
    # example: /tmp/Spycloud.csv
    path = "{}/{}".format(upload_path, safe_name)
    logger.info("storing %s ... to %s" % (orig_filename, path))
    _file.seek(0)
    with open(path, "w+b") as outfile:
        shutil.copyfileobj(_file._file, outfile)
    return path


async def check_file(filename: str) -> bool:
    """Validate an uploaded file before ingestion. Currently always accepts."""
    return True  # XXX FIXME Implement


# ====================================================
# API endpoints

@app.get("/ping",
         name = "Ping test",
         summary = "Run a ping test, to check if the service is running",
         tags = ["Tests"])
async def ping():
    """A simple ping / liveliness test endpoint. No API Key required."""
    return {"message": "pong"}
def _select_answer(sql: str, params: tuple, response: Response, notfound_404: bool = True) -> Answer:
    """Run a parameterized SELECT and wrap the result in an Answer object.

    Shared boilerplate of all read-only endpoints: time the query, run it with a
    RealDictCursor, optionally set HTTP 404 when no rows were found, and report
    any exception as Answer(success=False, ...).

    :param sql: the SELECT statement with %s placeholders
    :param params: the query parameters (empty tuple for none)
    :param response: the FastAPI Response (used to set the 404 status code)
    :param notfound_404: set HTTP 404 when the result set is empty
    :returns: an Answer object with meta info and the result rows (or [] on error)
    """
    t0 = time.time()
    db = get_db()
    try:
        # context manager closes the cursor even on exceptions
        with db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cur:
            cur.execute(sql, params)
            rows = cur.fetchall()
        if notfound_404 and len(rows) == 0:  # return 404 in case no data was found
            response.status_code = 404
        d = round(time.time() - t0, 3)
        return Answer(success = True, errormsg = None,
                      meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows)
    except Exception as ex:
        return Answer(success = False, errormsg = str(ex), data = [])


@app.get("/timeout_test",
         name = "A simple timeout test",
         summary = "Call this and the GET request will sleep for 5 seconds",
         tags = ["Tests"])
async def timeout_test():
    """A simple timeout/ liveliness test endpoint. No API Key required."""
    time.sleep(5)
    return {"message": "OK"}


@app.get("/", tags = ["Tests"])
async def root(api_key: APIKey = Depends(validate_api_key_header)):
    """A simple hello world endpoint. This one requires an API key."""
    return {"message": "Hello World"}  # , "root_path": request.scope.get("root_path")}


# ##############################################################################
# General API endpoints


@app.get('/user/{email}',
         tags = ["General queries"],
         status_code = 200,
         response_model = Answer)
async def get_user_by_email(email: EmailStr,
                            response: Response,
                            api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Get all credential leaks in the DB of a given user specified by his email address.

    # Parameters
    * email: string. The email address of the user (case insensitive).

    # Returns
    * A JSON Answer object with rows being an array of answers, or [] in case there was no data in the DB
    """
    sql = """SELECT * from leak_data where upper(email)=upper(%s)"""
    return _select_answer(sql, (email,), response)


@app.get('/user_and_password/{email}/{password}',
         tags = ["General queries"],
         status_code = 200,
         response_model = Answer)
async def get_user_by_email_and_password(email: EmailStr,
                                         password: str,
                                         response: Response,
                                         api_key: APIKey = Depends(validate_api_key_header)
                                         ) -> Answer:
    """
    Get all credential leaks in the DB of a given user given by the combination email + password.
    Note that both email and password must match (where email is case insensitive, the password *is case sensitive*).

    # Parameters
    * email: string. The email address of the user (**case insensitive**, since email is usually case insensitive).
    * password: string. The (hashed or plaintext) password (**note: this is case sensitive**)

    # Returns
    * A JSON Answer object with rows being an array of answers, or [] in case there was no data in the DB

    # Example
    ``foo@example.com`` and ``12345`` -->

    ``{ "meta": { ... }, "data": [ { "id": 14, "leak_id": 1, "email": "aaron@example.com", "password": "12345", ..., ],
    "errormsg": null }``

    """
    sql = """SELECT * from leak_data where upper(email)=upper(%s) and password=%s"""
    return _select_answer(sql, (email, password), response)


@app.get('/exists/by_email/{email}',
         tags = ["General queries"],
         status_code = 200,
         response_model = Answer)
async def check_user_by_email(email: EmailStr,
                              response: Response,
                              api_key: APIKey = Depends(validate_api_key_header)
                              ) -> Answer:
    """
    Check if a certain email address was present in any leak.

    # Parameters
    * email: string. The email address of the user (**case insensitive**, since email is usually case insensitive).

    # Returns
    * A JSON Answer object with rows being an array of answers, or [] in case there was no data in the DB

    # Example
    ``foo@example.com`` -->
    ``{ "meta": { "version": "0.5", "duration": 0.002, "count": 1 }, "data": [ { "count": 1 } ], "success": true,
    "errormsg": null }``
    """
    # count(*) always yields one row, so never map "empty" to HTTP 404 here
    sql = """SELECT count(*) from leak_data where upper(email)=upper(%s)"""
    return _select_answer(sql, (email,), response, notfound_404 = False)


@app.get('/exists/by_password/{password}',
         tags = ["General queries"],
         status_code = 200,
         response_model = Answer)
async def check_user_by_password(password: str,
                                 response: Response,
                                 api_key: APIKey = Depends(validate_api_key_header)
                                 ) -> Answer:
    """
    Check if a user exists with the given password (either plaintext or hashed) in the DB. If so, return the user.

    # Parameters
    * password: string. The password to be searched.

    # Returns
    * A JSON Answer object with rows being an array of answers, or [] in case there was no data in the DB

    # Example
    ``12345`` -->
    ``{ "meta": { ... }, "data": [ { "id": 14, "leak_id": 1, "email": "aaron@example.com", "password": "12345",
    ..., ], "errormsg": null }``
    """
    # can do better... use the hashid library?
    sql = """SELECT count(*) from leak_data where password=%s or password_plain=%s or password_hashed=%s"""
    return _select_answer(sql, (password, password, password), response, notfound_404 = False)


@app.get('/exists/by_domain/{domain}',
         tags = ["General queries"],
         status_code = 200,
         response_model = Answer)
async def check_by_domain(domain: str,
                          response: Response,
                          api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Check if a given domain appears in some leak.

    # Parameters
    * domain : string. The domain to search for (case insensitive).

    # Returns:
    A JSON Answer object with the count of occurrences in the data: field.
    """
    sql = """SELECT count(*) from leak_data where upper(domain)=upper(%s)"""
    return _select_answer(sql, (domain,), response, notfound_404 = False)


# ##############################################################################
# Reference data (reporter, source, etc) starts here
@app.get('/reporter',
         tags = ["Reference data"],
         status_code = 200,
         response_model = Answer)
async def get_reporters(response: Response,
                        api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Get all reporter_name entries (sorted, unique).

    # Parameters

    # Returns
    * A JSON Answer object with data containing an array of answers, or [] in case there was no data in the DB
    """
    sql = """SELECT distinct(reporter_name) from leak ORDER by reporter_name asc"""
    return _select_answer(sql, (), response)


@app.get('/source_name',
         tags = ["Reference data"],
         status_code = 200,
         response_model = Answer)
async def get_sources(response: Response,
                      api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Get all names of sources of leaks (sorted, unique) - i.e. "SpyCloud", "HaveIBeenPwned", etc..

    # Parameters

    # Returns
    * A JSON Answer object with data containing an array of answers, or [] in case there was no data in the DB
    """
    sql = """SELECT distinct(source_name) from leak ORDER by source_name asc"""
    return _select_answer(sql, (), response)


# ##############################################################################
# Leak table starts here

@app.get("/leak/all",
         tags = ["Leak"],
         status_code = 200,
         response_model = Answer)
async def get_all_leaks(response: Response,
                        api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """Fetch all leaks.

    # Parameters

    # Returns
    * A JSON Answer object with all leak (i.e. meta-data of leaks) data from the `leak` table.
    """
    return _select_answer("SELECT * from leak", (), response)
445 | """ 446 | 447 | t0 = time.time() 448 | sql = "SELECT * from leak" 449 | db = get_db() 450 | try: 451 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 452 | cur.execute(sql) 453 | rows = cur.fetchall() 454 | if len(rows) == 0: # return 404 in case no data was found 455 | response.status_code = 404 456 | t1 = time.time() 457 | d = round(t1 - t0, 3) 458 | return Answer(success = True, errormsg = None, 459 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 460 | except Exception as ex: 461 | return Answer(success = False, errormsg = str(ex), data = []) 462 | 463 | 464 | @app.get("/leak/by_ticket_id/{ticket_id}", 465 | tags = ["Leak"], 466 | status_code = 200, 467 | response_model = Answer) 468 | async def get_leak_by_ticket_id(ticket_id: str, 469 | response: Response, 470 | api_key: APIKey = Depends(validate_api_key_header) 471 | ) -> Answer: 472 | """Fetch a leak by its ticket system id""" 473 | t0 = time.time() 474 | sql = "SELECT * from leak WHERE ticket_id = %s" 475 | db = get_db() 476 | try: 477 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 478 | cur.execute(sql, (ticket_id,)) 479 | rows = cur.fetchall() 480 | if len(rows) == 0: # return 404 in case no data was found 481 | response.status_code = 404 482 | t1 = time.time() 483 | d = round(t1 - t0, 3) 484 | return Answer(success = True, errormsg = None, 485 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 486 | except Exception as ex: 487 | return Answer(success = False, errormsg = str(ex), data = []) 488 | 489 | 490 | @app.get("/leak/by_summary/{summary}", 491 | tags = ["Leak"], 492 | status_code = 200, 493 | response_model = Answer) 494 | async def get_leak_by_summary(summary: str, 495 | response: Response, 496 | api_key: APIKey = Depends(validate_api_key_header) 497 | ) -> Answer: 498 | """Fetch a leak by summary""" 499 | sql = "SELECT * from leak WHERE summary = %s" 500 | t0 = time.time() 501 | db = 
get_db() 502 | try: 503 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 504 | cur.execute(sql, (summary,)) 505 | rows = cur.fetchall() 506 | if len(rows) == 0: # return 404 in case no data was found 507 | response.status_code = 404 508 | t1 = time.time() 509 | d = round(t1 - t0, 3) 510 | return Answer(success = True, errormsg = None, 511 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 512 | except Exception as ex: 513 | return Answer(success = False, errormsg = str(ex), data = []) 514 | 515 | 516 | @app.get("/leak/by_reporter/{reporter}", 517 | tags = ["Leak"], 518 | status_code = 200, 519 | response_model = Answer) 520 | async def get_leak_by_reporter(reporter: str, 521 | response: Response, 522 | api_key: APIKey = Depends(validate_api_key_header) 523 | ) -> Answer: 524 | """Fetch a leak by its reporter. """ 525 | sql = "SELECT * from leak WHERE reporter_name = %s" 526 | t0 = time.time() 527 | db = get_db() 528 | try: 529 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 530 | cur.execute(sql, (reporter,)) 531 | rows = cur.fetchall() 532 | if len(rows) == 0: # return 404 in case no data was found 533 | response.status_code = 404 534 | t1 = time.time() 535 | d = round(t1 - t0, 3) 536 | return Answer(success = True, errormsg = None, 537 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 538 | except Exception as ex: 539 | return Answer(success = False, errormsg = str(ex), data = []) 540 | 541 | 542 | @app.get("/leak/by_source/{source_name}", 543 | tags = ["Leak"], 544 | status_code = 200, 545 | response_model = Answer) 546 | async def get_leak_by_source(source_name: str, 547 | response: Response, 548 | api_key: APIKey = Depends(validate_api_key_header) 549 | ) -> Answer: 550 | """Fetch all leaks by their source (i.e. *who* collected the leak data (spycloud, HaveIBeenPwned, etc.). 551 | 552 | # Parameters 553 | * source_name: string. 
The name of the source (case insensitive). 554 | 555 | # Returns 556 | * a JSON Answer object with all leaks for that given source_name. 557 | """ 558 | 559 | sql = "SELECT * from leak WHERE upper(source_name) = upper(%s)" 560 | t0 = time.time() 561 | db = get_db() 562 | try: 563 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 564 | cur.execute(sql, (source_name,)) 565 | rows = cur.fetchall() 566 | if len(rows) == 0: # return 404 in case no data was found 567 | response.status_code = 404 568 | t1 = time.time() 569 | d = round(t1 - t0, 3) 570 | return Answer(success = True, errormsg = None, 571 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 572 | except Exception as ex: 573 | return Answer(success = False, errormsg = str(ex), data = []) 574 | 575 | 576 | @app.get("/leak/{_id}", tags = ["Leak"], 577 | description = 'Get the leak info by its ID.', 578 | status_code = 200, 579 | response_model = Answer) 580 | async def get_leak_by_id(_id: int, 581 | response: Response, 582 | api_key: APIKey = Depends(validate_api_key_header) 583 | ) -> Answer: 584 | """Fetch a leak by its ID""" 585 | t0 = time.time() 586 | sql = "SELECT * from leak WHERE id = %s" 587 | db = get_db() 588 | try: 589 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 590 | cur.execute(sql, (_id,)) 591 | rows = cur.fetchall() 592 | if len(rows) == 0: # return 404 in case no data was found 593 | response.status_code = 404 594 | t1 = time.time() 595 | d = round(t1 - t0, 3) 596 | return Answer(success = True, errormsg = None, 597 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 598 | except Exception as ex: 599 | return Answer(success = False, errormsg = str(ex), data = []) 600 | 601 | 602 | @app.post("/leak/", 603 | tags = ["Leak"], 604 | description = "INSERT a new leak into the DB", 605 | status_code = 201, 606 | response_model = Answer) 607 | async def new_leak(leak: Leak, 608 | response: Response, 609 | 
api_key: APIKey = Depends(validate_api_key_header) 610 | ) -> Answer: 611 | """ 612 | INSERT a new leak into the leak table in the database. 613 | 614 | # Parameters 615 | * leak: a Leak object. Note that all fields must be set, except for leak.id 616 | # Returns 617 | * a JSON Answer object with the leak_id in the data: field 618 | 619 | """ 620 | sql = """INSERT into leak 621 | (summary, ticket_id, reporter_name, source_name, breach_ts, source_publish_ts, ingestion_ts) 622 | VALUES (%s, %s, %s, %s, %s, %s, now()) 623 | RETURNING id 624 | """ 625 | t0 = time.time() 626 | db = get_db() 627 | try: 628 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 629 | cur.execute(sql, (leak.summary, leak.ticket_id, leak.reporter_name, leak.source_name, leak.breach_ts, 630 | leak.source_publish_ts,)) 631 | rows = cur.fetchall() 632 | if len(rows) == 0: # return 400 in case the INSERT failed. 633 | response.status_code = 400 634 | t1 = time.time() 635 | d = round(t1 - t0, 3) 636 | return Answer(success = True, errormsg = None, 637 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 638 | except Exception as ex: 639 | return Answer(success = False, errormsg = str(ex), data = []) 640 | 641 | 642 | @app.put("/leak/", 643 | tags = ["Leak"], 644 | status_code = 200, 645 | response_model = Answer) 646 | async def update_leak(leak: Leak, 647 | response: Response, 648 | api_key: APIKey = Depends(validate_api_key_header) 649 | ) -> Answer: 650 | """ 651 | UPDATE an existing leak. 652 | 653 | # Parameters 654 | * leak: a Leak object. Note that all fields must be set in the Leak object. 655 | # Returns 656 | * a JSON Answer object with the ID of the updated leak. 
657 | """ 658 | sql = """UPDATE leak SET 659 | summary = %s, ticket_id = %s, reporter_name = %s, source_name = %s, 660 | breach_ts = %s, source_publish_ts = %s 661 | WHERE id = %s 662 | RETURNING id 663 | """ 664 | t0 = time.time() 665 | db = get_db() 666 | if not leak.id: 667 | return Answer(success = False, errormsg = "id %s not given. Please specify a leak.id you want to UPDATE", 668 | data = []) 669 | try: 670 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 671 | cur.execute(sql, (leak.summary, leak.ticket_id, leak.reporter_name, 672 | leak.source_name, leak.breach_ts, leak.source_publish_ts, leak.id)) 673 | rows = cur.fetchall() 674 | if len(rows) == 0: # return 400 in case the INSERT failed. 675 | response.status_code = 400 676 | t1 = time.time() 677 | d = round(t1 - t0, 3) 678 | return Answer(success = True, errormsg = None, 679 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 680 | except Exception as ex: 681 | return Answer(success = False, errormsg = str(ex), data = []) 682 | 683 | 684 | # ############################################################################################################ 685 | # Leak Data starts here 686 | 687 | @app.get("/leak_data/{leak_data_id}", 688 | tags = ["Leak Data"], 689 | status_code = 200, 690 | response_model = Answer) 691 | async def get_leak_data_by_id(leak_data_id: int, 692 | response: Response, 693 | api_key: APIKey = Depends(validate_api_key_header)) -> Answer: 694 | """ 695 | Fetch all leak data entries of a given id. 696 | 697 | # Parameters 698 | * leak_data_id: integer, the DB internal leak_data_id. 699 | 700 | # Returns 701 | * A JSON Answer object with the corresponding leak data (i.e. actual usernames, passwords) from the `leak_data` 702 | table which are contained within the specified leak (leak_data_id). 
703 | """ 704 | t0 = time.time() 705 | sql = "SELECT * from leak_data where id=%s" 706 | db = get_db() 707 | try: 708 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 709 | cur.execute(sql, (leak_data_id,)) 710 | rows = cur.fetchall() 711 | if len(rows) == 0: # return 404 in case no data was found 712 | response.status_code = 404 713 | t1 = time.time() 714 | d = round(t1 - t0, 3) 715 | return Answer(success = True, errormsg = None, 716 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 717 | except Exception as ex: 718 | return Answer(success = False, errormsg = str(ex), data = []) 719 | 720 | 721 | @app.get("/leak_data/by_ticket_id/{ticket_id}", 722 | tags = ["Leak Data"], 723 | status_code = 200, 724 | response_model = Answer) 725 | async def get_leak_data_by_ticket_id(ticket_id: str, 726 | response: Response, 727 | api_key: APIKey = Depends(validate_api_key_header) 728 | ) -> Answer: 729 | """Fetch a leak row (leak_data table) by its ticket system id 730 | 731 | # Parameters 732 | * ticket_id: string. The ticket system ID which references the leak_data row 733 | # Returns 734 | * a JSON Answer object with the leak data row or in data. 
735 | """ 736 | sql = "SELECT * from leak_data WHERE ticket_id = %s" 737 | t0 = time.time() 738 | db = get_db() 739 | try: 740 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 741 | cur.execute(sql, (ticket_id,)) 742 | rows = cur.fetchall() 743 | if len(rows) == 0: # return 404 in case no data was found 744 | response.status_code = 404 745 | t1 = time.time() 746 | d = round(t1 - t0, 3) 747 | return Answer(success = True, errormsg = None, 748 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 749 | except Exception as ex: 750 | return Answer(success = False, errormsg = str(ex), data = []) 751 | 752 | 753 | @app.post("/leak_data/", 754 | tags = ["Leak Data"], 755 | status_code = 201, 756 | response_model = Answer) 757 | async def new_leak_data(row: LeakData, 758 | response: Response, 759 | api_key: APIKey = Depends(validate_api_key_header) 760 | ) -> Answer: 761 | """ 762 | INSERT a new leak_data row into the leak_data table. 763 | 764 | # Parameters 765 | * row: a leakData object. If that data already exists, it will not be inserted again. 766 | # Returns 767 | * a JSON Answer object containing the ID of the inserted leak_data row. 
768 | """ 769 | sql = """INSERT into leak_data 770 | (leak_id, email, password, password_plain, password_hashed, hash_algo, ticket_id, 771 | email_verified, password_verified_ok, ip, domain, browser, malware_name, infected_machine, dg) 772 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 773 | ON CONFLICT ON CONSTRAINT constr_unique_leak_data_leak_id_email_password_domain DO UPDATE SET email=%s 774 | RETURNING id 775 | """ 776 | t0 = time.time() 777 | db = get_db() 778 | logger.debug(row) 779 | try: 780 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 781 | cur.execute(sql, (row.leak_id, row.email, row.password, row.password_plain, row.password_hashed, row.hash_algo, 782 | row.ticket_id, row.email_verified, row.password_verified_ok, row.ip, row.domain, row.browser, 783 | row.malware_name, row.infected_machine, row.dg, row.email)) 784 | rows = cur.fetchall() 785 | if len(rows) == 0: # return 400 in case the INSERT failed. 786 | response.status_code = 400 787 | t1 = time.time() 788 | d = round(t1 - t0, 3) 789 | return Answer(success = True, errormsg = None, 790 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 791 | except Exception as ex: 792 | return Answer(success = False, errormsg = str(ex), data = []) 793 | 794 | 795 | @app.put("/leak_data/", 796 | tags = ["Leak Data"], 797 | status_code = 200, 798 | response_model = Answer) 799 | async def update_leak_data(row: LeakData, 800 | request: Request, 801 | response: Response, 802 | api_key: APIKey = Depends(validate_api_key_header) 803 | ) -> Answer: 804 | """ 805 | UPDATE leak_data row in the leak_data table. 806 | 807 | # Parameters 808 | * row : a leakData object with all the relevant information. Please note that you **have to** supply all fields, 809 | even if you do not plan to update them. In other words: you might have to GET / the leak_data object first. 
@app.put("/leak_data/",
         tags = ["Leak Data"],
         status_code = 200,
         response_model = Answer)
async def update_leak_data(row: LeakData,
                           request: Request,
                           response: Response,
                           api_key: APIKey = Depends(validate_api_key_header)
                           ) -> Answer:
    """
    UPDATE a leak_data row in the leak_data table.

    # Parameters
      * row: a LeakData object with all the relevant information. Please note that you **have to** supply
        all fields, even if you do not plan to update them. In other words: you might have to GET the
        leak_data object first.
    # Returns
      * a JSON Answer object containing the ID of the updated leak_data row
        (data is empty and status is 400 when no row matched row.id).
    """
    sql = """UPDATE leak_data SET
               leak_id = %s, email = %s, password = %s, password_plain = %s, password_hashed = %s,
               hash_algo = %s, ticket_id = %s, email_verified = %s, password_verified_ok = %s,
               ip = %s, domain = %s, browser = %s, malware_name = %s, infected_machine = %s, dg = %s
             WHERE id = %s
             RETURNING id
          """
    params = (row.leak_id, row.email, row.password, row.password_plain, row.password_hashed, row.hash_algo,
              row.ticket_id, row.email_verified, row.password_verified_ok, row.ip, row.domain, row.browser,
              row.malware_name, row.infected_machine, row.dg, row.id)
    t0 = time.time()
    db = get_db()
    try:
        with db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cur:
            logger.debug("HTTP request: '%r'", request)
            # SECURITY FIX: the previous version logged cur.mogrify(sql, params) -- i.e. the fully
            # rendered SQL statement *including plaintext passwords* -- at debug level. Removed.
            cur.execute(sql, params)
            rows = cur.fetchall()
        db.commit()
        if len(rows) == 0:  # no row with that id -> nothing was updated
            response.status_code = 400
        t1 = time.time()
        d = round(t1 - t0, 3)
        return Answer(success = True, errormsg = None,
                      meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows)
    except Exception as ex:
        return Answer(success = False, errormsg = str(ex), data = [])


# ############################################################################################################
# CSV file importing

def enrich(item: InternalDataFormat, leak_id: str) -> InternalDataFormat:
    """Initial enricher chain. This SHOULD be configurable and a pipeline via a MQ.

    Runs the item through all enrichers, filling in any field that is not already set:
    VIP status, DG, active-account flag, external-user flag, credential type and abuse contact.
    On success marks the item as notify-able and clears any error state.
    """
    # attach the item to its leak
    item.leak_id = leak_id

    # VIP status
    if not item.is_vip:
        item.is_vip = VIPEnricher().is_vip(item.email)

    # DG: map the email address to its Directorate General; fall back to "Unknown"
    ldap_enricher = LDAPEnricher()
    if not item.dg:
        item.dg = ldap_enricher.email_to_dg(item.email) or "Unknown"

    # Active account or outdated?
    if not item.is_active_account:
        item.is_active_account = ldap_enricher.exists(item.email)

    # External address or internal?
    if not item.external_user:
        item.external_user = ExternalEmailEnricher().is_external_email(item.email)

    # credential type
    if not item.credential_type:
        item.credential_type = ["EU Login"]  # XXX FIXME! This is mock-up data!

    # Abuse contact / report to
    if not item.report_to:
        item.report_to = AbuseContactLookup().lookup(item.email)

    # all is good, we went through the pipeline
    item.notify = True
    item.needs_human_intervention = False
    item.error_msg = None
    return item
907 | """ 908 | # XXX FIXME!! need to implement / refactor existing code. 909 | # convert the idf to the DB row 910 | 911 | return idf 912 | 913 | 914 | def convert_to_output(idf: InternalDataFormat) -> LeakData: 915 | """Convert the internal data format to the output data format. 916 | 917 | ":returns LeakData 918 | """ 919 | output_data_entry = LeakData(**idf.dict()) # here the validation pydantic magic happens 920 | return output_data_entry 921 | 922 | 923 | @app.post("/import/csv/spycloud/{parent_ticket_id}", 924 | tags = ["CSV import"], 925 | status_code = 200, 926 | response_model = Answer) 927 | async def import_csv_spycloud(parent_ticket_id: str, 928 | response: Response, 929 | summary: str = None, 930 | _file: UploadFile = File(...), 931 | api_key: APIKey = Depends(validate_api_key_header)) -> Answer: 932 | """ 933 | Import a spycloud CSV file into the DB. Note that you do not need to specify a leak_id parameter here. 934 | The API will automatically create a leak object in the DB for you and link it. 935 | 936 | # Parameters 937 | * parent_ticket_id: a ticket ID which allows us to link the leak object to the ticket 938 | * summary: a summary string for the new leak object (if it's created) 939 | * _file: a file which must be uploaded via HTML forms/multipart. 940 | 941 | # Returns 942 | * a JSON Answer object where the data: field is the **deduplicated** CSV file (i.e. lines which were already 943 | imported as part of that leak (same username, same password, same domain) will not be returned. 944 | In other words, data: [] contains the rows from the CSV file which did not yet exist in the DB. 945 | """ 946 | 947 | t0 = time.time() 948 | 949 | if not parent_ticket_id: 950 | response.status_code = 400 951 | return Answer(success = False, 952 | errormsg = "Please specify a parent_ticket_id as a GET-style parameter in the URL. 
" 953 | "This is the parameter, needed to link the sub-issues against", data = []) 954 | if not summary: 955 | response.status_code = 400 956 | return Answer(success = False, 957 | errormsg = "Please specify a summary for the Leak object which needs to be created. ", data = []) 958 | 959 | # first check if the leak_id for that summary already exists and if it's already linked to the parent_ticket_id. 960 | sql = """SELECT id from leak where summary = %s and ticket_id=%s""" 961 | db = get_db() 962 | try: 963 | with db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cur: 964 | logger.debug(cur.mogrify(sql, (summary, parent_ticket_id))) 965 | cur.execute(sql, (summary, parent_ticket_id)) 966 | rows = cur.fetchall() 967 | nr_results = len(rows) 968 | if nr_results >= 1: 969 | # take the first one 970 | leak_id = rows[0]['id'] 971 | logger.info("Found existing leak object: %s" % leak_id) 972 | else: 973 | # nothing found, create one 974 | source_name = "SpyCloud" 975 | leak = Leak(ticket_id = parent_ticket_id, summary = summary, source_name = source_name) 976 | answer = await new_leak(leak, response = response, api_key = api_key) 977 | logger.info("Did not find existing leak object, creating one") 978 | if answer.success: 979 | leak_id = int(answer.data[0]['id']) 980 | logger.info("Created with id %s" % leak_id) 981 | else: 982 | logger.error("Could not create leak object for spycloud CSV file") 983 | return Answer(success = False, errormsg = "could not create leak object", data = []) 984 | except Exception as ex: 985 | return Answer(success = False, errormsg = str(ex), data = []) 986 | 987 | # okay, we found the leak, let's insert the CSV 988 | # noinspection PyTypeChecker 989 | file_on_disk = await store_file(_file.filename, _file.file) 990 | await check_file(file_on_disk) # XXX FIXME. 
Additional checks on the dumped file still missing 991 | 992 | collector = SpyCloudCollector() 993 | status, df = collector.collect(Path(file_on_disk)) 994 | if status != "OK": 995 | return Answer(success = False, errormsg = "Could not read input CSV file", data = []) 996 | 997 | p = SpyCloudParser() 998 | try: 999 | items = p.parse(df) 1000 | except Exception as ex: 1001 | return Answer(success = False, errormsg = str(ex), data = []) 1002 | 1003 | deduper = Deduper() 1004 | db_output = PostgresqlOutput() 1005 | filter = Filter() 1006 | 1007 | data = [] 1008 | for item in items: # FIXME: this pipeline could be done nicer with functools and reduce 1009 | # send it through the complete pipeline 1010 | item = filter.filter(item) 1011 | email = item.email 1012 | password = anonymize_password(item.password) 1013 | if not item: 1014 | logger.info("skipping item (%s, %s), It got filtered out by the filter." % (email, password)) 1015 | continue 1016 | try: 1017 | item = deduper.dedup(item) 1018 | if not item: 1019 | logger.info("skipping item (%s, %s), since it already existed in the DB." % (email, password)) 1020 | continue # next item 1021 | except Exception as ex: 1022 | logger.error("Could not deduplicate item (%s, %s). Skipping this row. Reason: %s" % (email, password, str(ex))) 1023 | continue 1024 | try: 1025 | item = enrich(item, leak_id = leak_id) 1026 | item.leak_id = leak_id 1027 | except Exception as ex: 1028 | errmsg = "Could not enrich item (%s, %s). Skipping this row. 
Reason: %s" % (email, password, str(ex),) 1029 | logger.error(errmsg) 1030 | item.error_msg = errmsg 1031 | item.needs_human_intervention = True 1032 | item.notify = False 1033 | if item.external_user: 1034 | item.notify = False 1035 | # after all is finished, convert to output format and return the (deduped) row 1036 | # convert to output format: 1037 | out_item = convert_to_output(item) 1038 | logger.info(out_item) 1039 | 1040 | # and finally, store it in the DB 1041 | if not item.needs_human_intervention: 1042 | try: 1043 | db_output.process(out_item) 1044 | except Exception as ex: 1045 | errmsg = "Could not store row. Skipping this row. Reason: %s" % str(ex) 1046 | logger.error(errmsg) 1047 | out_item.error_msg = errmsg 1048 | out_item.needs_human_intervention = True 1049 | out_item.notify = False 1050 | 1051 | data.append(out_item) 1052 | # done! Emit all the output items with the header 1053 | t1 = time.time() 1054 | d = round(t1 - t0, 3) 1055 | return Answer(success = True, errormsg = None, 1056 | meta = AnswerMeta(version = VER, duration = d, count = len(data)), 1057 | data = data) 1058 | 1059 | 1060 | # noinspection PyTypeChecker 1061 | @app.post("/import/csv/by_leak/{leak_id}", 1062 | tags = ["CSV import"], 1063 | status_code = 200, 1064 | response_model = Answer) 1065 | async def import_csv_with_leak_id(leak_id: int, 1066 | response: Response, 1067 | _file: UploadFile = File(...), 1068 | api_key: APIKey = Depends(validate_api_key_header) 1069 | ) -> Answer: 1070 | """ 1071 | Import a CSV file into the DB. You **need** to specify a ?leak_id= parameter so that the CSV file may be 1072 | linked to a leak_id. Failure to provide a leak_id will result in the file not being imported into the DB. 1073 | 1074 | # Parameters 1075 | * leak_id : int. As a GET parameter. This allows the DB to link the leak data (CSV file) to the leak_id entry in 1076 | in the leak table. 1077 | * _file: a file which must be uploaded via HTML forms/multipart. 
# noinspection PyTypeChecker
@app.post("/import/csv/by_leak/{leak_id}",
          tags = ["CSV import"],
          status_code = 200,
          response_model = Answer)
async def import_csv_with_leak_id(leak_id: int,
                                  response: Response,
                                  _file: UploadFile = File(...),
                                  api_key: APIKey = Depends(validate_api_key_header)
                                  ) -> Answer:
    """
    Import a CSV file into the DB. You **need** to specify a ?leak_id= parameter so that the CSV file may be
    linked to a leak_id. Failure to provide a leak_id will result in the file not being imported into the DB.

    # Parameters
      * leak_id: int. As a GET parameter. This allows the DB to link the leak data (CSV file) to the
        leak_id entry in the leak table.
      * _file: a file which must be uploaded via HTML forms/multipart.

    # Returns
      * a JSON Answer object where the data: field is the **deduplicated** CSV file (i.e. lines which were
        already imported as part of that leak (same username, same password, same domain) will not be
        returned. In other words, data: [] contains the rows from the CSV file which did not yet exist
        in the DB.
    """
    t0 = time.time()

    if not leak_id:
        return Answer(success = False, errormsg = "Please specify a leak_id GET-style parameter in the URL",
                      data = [])

    # first check if the leak_id exists
    sql = """SELECT count(*) from leak where id = %s"""
    db = get_db()
    try:
        cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor)
        cur.execute(sql, (leak_id,))
        nr_results = int(cur.fetchone()['count'])
        if nr_results != 1:
            response.status_code = 404
            return Answer(success = False, errormsg = "Leak ID %s not found" % leak_id, data = [])
    except Exception as ex:
        return Answer(success = False, errormsg = str(ex), data = [])

    # okay, we found the leak, let's insert the CSV
    file_on_disk = await store_file(_file.filename, _file.file)
    await check_file(file_on_disk)  # XXX FIXME. Additional checks on the dumped file still missing

    p = BaseParser()
    try:
        df = p.parse_file(Path(file_on_disk), leak_id = leak_id)
    except Exception as ex:
        return Answer(success = False, errormsg = str(ex), data = [])

    df = p.normalize_data(df, leak_id = leak_id)
    # After normalization, the df columns are:
    #   leak_id, email, password, password_plain, password_hashed, hash_algo, ticket_id, email_verified,
    #   password_verified_ok, ip, domain, browser, malware_name, infected_machine, dg

    # PERF: the SQL string and the cursor are loop-invariant -- hoisted out of the per-row loop
    # (the previous version re-created both on every iteration).
    sql = """
        INSERT into leak_data(
          leak_id, email, password, password_plain, password_hashed, hash_algo, ticket_id, email_verified,
          password_verified_ok, ip, domain, browser , malware_name, infected_machine, dg
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )
        ON CONFLICT ON CONSTRAINT constr_unique_leak_data_leak_id_email_password_domain
        DO UPDATE SET count_seen = leak_data.count_seen + 1
        RETURNING id
    """
    inserted_ids = []
    cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor)
    for r in df.reset_index().to_dict(orient = 'records'):
        try:
            cur.execute(sql, (r['leak_id'], r['email'], r['password'], r['password_plain'], r['password_hashed'],
                              r['hash_algo'], r['ticket_id'], r['email_verified'], r['password_verified_ok'],
                              r['ip'], r['domain'], r['browser'], r['malware_name'], r['infected_machine'],
                              r['dg']))
            inserted_ids.append(int(cur.fetchone()['id']))
        except Exception as ex:
            return Answer(success = False, errormsg = str(ex), data = [])
    # BUGFIX: the inserts were never committed (cf. the PUT endpoint, which commits).
    db.commit()
    t1 = time.time()
    d = round(t1 - t0, 3)

    # BUGFIX: an empty CSV used to render "WHERE id in ()" below -- a SQL syntax error. Short-circuit instead.
    if not inserted_ids:
        return Answer(success = True, errormsg = None,
                      meta = AnswerMeta(version = VER, duration = d, count = 0), data = [])

    # now get the data of all the IDs / dedup
    try:
        sql = """SELECT * from leak_data where id in %s"""
        cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor)
        cur.execute(sql, (tuple(inserted_ids),))
        data = cur.fetchall()
        return Answer(success = True, errormsg = None,
                      meta = AnswerMeta(version = VER, duration = d, count = len(inserted_ids)), data = data)
    except Exception as ex:
        return Answer(success = False, errormsg = str(ex), data = [])


# ############################################################################################################
# enrichers

@app.get('/enrich/email_to_dg/{email}',
         tags = ["Enricher"],
         status_code = 200,
         response_model = Answer)
async def enrich_dg_by_email(email: EmailStr,
                             response: Response,
                             api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Enricher endpoint: map an email address to its DG (Directorate General) via LDAP.

    # Parameters
      * email: the address to look up.
    # Returns
      * an Answer whose data is [{"dg": <name>}], or a 404 Answer ("not found") when the lookup fails.
    """
    t0 = time.time()
    le = LDAPEnricher()
    retval = le.email_to_dg(email)
    t1 = time.time()
    d = round(t1 - t0, 3)
    if not retval:
        response.status_code = 404
        return Answer(success = False, errormsg = "not found",
                      meta = AnswerMeta(version = VER, duration = d, count = 0), data = [])
    response.status_code = 200
    return Answer(success = True, errormsg = None, meta = AnswerMeta(version = VER, duration = d, count = 1),
                  data = [{"dg": retval}])


@app.get('/enrich/email_to_userid/{email}',
         tags = ["Enricher"],
         status_code = 200,
         response_model = Answer)
async def enrich_userid_by_email(email: EmailStr, response: Response,
                                 api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Enricher endpoint: map an email address to its user id ("ecMoniker") via LDAP.

    # Parameters
      * email: the address to look up.
    # Returns
      * an Answer whose data is [{"ecMoniker": <id>}], or a 404 Answer ("not found") when the lookup fails.
    """
    t0 = time.time()
    le = LDAPEnricher()
    retval = le.email_to_user_id(email)
    t1 = time.time()
    d = round(t1 - t0, 3)
    if not retval:
        response.status_code = 404
        return Answer(success = False, errormsg = "not found",
                      meta = AnswerMeta(version = VER, duration = d, count = 0), data = [])
    response.status_code = 200
    return Answer(success = True, errormsg = None, meta = AnswerMeta(version = VER, duration = d, count = 1),
                  data = [{"ecMoniker": retval}])
@app.get('/enrich/email_to_vip/{email}',
         tags = ["Enricher"],
         status_code = 200,
         response_model = Answer)
async def enrich_vip_via_email(email: EmailStr, response: Response,
                               api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Enricher endpoint: check whether an email address belongs to a VIP.

    # Parameters
      * email: the address to check.
    # Returns
      * an Answer whose data is [{"is_vip": <bool>}]. Always answers 200 -- a non-VIP is a valid result.
    """
    t0 = time.time()
    enr = VIPEnricher()
    retval = enr.is_vip(email)
    t1 = time.time()
    d = round(t1 - t0, 3)
    response.status_code = 200
    return Answer(success = True, errormsg = None, meta = AnswerMeta(version = VER, duration = d, count = 1),
                  data = [{"is_vip": retval}])


if __name__ == "__main__":
    db_conn = _connect_db(DSN)
    # BUGFIX: os.getenv() returns a *string* when the variable is set; uvicorn expects an int port.
    uvicorn.run(app, debug = True, port = int(os.getenv('PORT', default = 8080)))
5 | """ 6 | 7 | import datetime 8 | from enum import Enum 9 | from typing import Optional, Dict, List # Union 10 | 11 | from pydantic import BaseModel, EmailStr 12 | 13 | 14 | class Leak(BaseModel): 15 | id: Optional[int] 16 | ticket_id: Optional[str] 17 | summary: str 18 | reporter_name: Optional[str] 19 | source_name: Optional[str] 20 | breach_ts: Optional[datetime.datetime] 21 | source_publish_ts: Optional[datetime.datetime] 22 | 23 | 24 | class CredentialType(Enum): 25 | is_external = "External" 26 | is_proxy_login = "Proxy" 27 | is_EU_login = "EU Login" 28 | is_domain_login = "Domain" 29 | is_secem_login = "SECEM" 30 | 31 | 32 | class LeakData(BaseModel): 33 | id: Optional[int] 34 | leak_id: int 35 | email: EmailStr 36 | password: str 37 | password_plain: Optional[str] 38 | password_hashed: Optional[str] 39 | hash_algo: Optional[str] 40 | ticket_id: Optional[str] 41 | email_verified: Optional[bool] 42 | password_verified_ok: Optional[bool] 43 | ip: Optional[str] 44 | domain: Optional[str] 45 | target_domain: Optional[str] # new 46 | browser: Optional[str] 47 | malware_name: Optional[str] 48 | infected_machine: Optional[str] 49 | dg: Optional[str] 50 | is_vip: Optional[bool] 51 | credential_type: Optional[List[CredentialType]] 52 | report_to: Optional[List[str]] # the security contact to report this to, in case it's not the the user directly. 
53 | # 54 | # meta stuff and things for error reporting 55 | count_seen: Optional[int] = 1 56 | original_line: Optional[str] # the original CSV file in case of errors 57 | error_msg: Optional[str] 58 | notify: bool 59 | needs_human_intervention: bool 60 | 61 | 62 | class AnswerMeta(BaseModel): 63 | version: str 64 | duration: float 65 | count: int 66 | 67 | 68 | class Answer(BaseModel): 69 | meta: Optional[AnswerMeta] 70 | data: List[Dict] # Union[Dict,List] 71 | success: bool 72 | errormsg: Optional[str] = "" 73 | 74 | 75 | """ Example: 76 | Multiple answers: 77 | { "meta": { "version": "rel-1.0", "duration": 0.78, "count": 3 }, "data": [ , , ], "success": true, 78 | "errormsg": "all OK" } 79 | 80 | No data: 81 | { "meta": { "version": "rel-1.0", "duration": 0.78 , "count": 0 }, "data": [], "success": true, "errormsg": "all OK" } 82 | 83 | Single result: 84 | { "meta": { "version": "rel-1.0", "duration": 0.78 , "count": 1 }, "data": [ { "foo": "bar", "baz": 77 } ], 85 | "success": true, "errormsg": "all OK" } 86 | """ 87 | -------------------------------------------------------------------------------- /config.SAMPLE.py: -------------------------------------------------------------------------------- 1 | """Configuration stored here. 2 | To make this work, please copy it over to api/config.py (make sure you don't overwrite 3 | an existing file!!! 4 | Edit that file there and add a random string to the list. 5 | Communicate that random string to the API key user. 6 | 7 | Then reload the server (or it gets reloaded automatically). 
8 | """ 9 | 10 | 11 | config = { 12 | "api_keys": ["random-test-api-key", "another-example-api-key"] 13 | } 14 | -------------------------------------------------------------------------------- /db.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- PostgreSQL database dump 3 | -- 4 | 5 | -- Dumped from database version 11.10 6 | -- Dumped by pg_dump version 11.10 7 | 8 | SET statement_timeout = 0; 9 | SET lock_timeout = 0; 10 | SET idle_in_transaction_session_timeout = 0; 11 | SET client_encoding = 'UTF8'; 12 | SET standard_conforming_strings = on; 13 | SELECT pg_catalog.set_config('search_path', '', false); 14 | SET check_function_bodies = false; 15 | SET xmloption = content; 16 | SET client_min_messages = warning; 17 | SET row_security = off; 18 | 19 | SET default_tablespace = ''; 20 | 21 | SET default_with_oids = false; 22 | 23 | -- 24 | -- Name: leak; Type: TABLE; Schema: public; Owner: credentialleakdb 25 | -- 26 | 27 | CREATE TABLE public.leak ( 28 | id integer NOT NULL, 29 | breach_ts timestamp with time zone, 30 | source_publish_ts timestamp with time zone, 31 | ingestion_ts timestamp with time zone NOT NULL, 32 | summary text NOT NULL, 33 | ticket_id text, 34 | reporter_name text, 35 | source_name text 36 | ); 37 | 38 | 39 | ALTER TABLE public.leak OWNER TO credentialleakdb; 40 | 41 | -- 42 | -- Name: COLUMN leak.breach_ts; Type: COMMENT; Schema: public; Owner: credentialleakdb 43 | -- 44 | 45 | COMMENT ON COLUMN public.leak.breach_ts IS 'If known, the timestamp when the breach happened.'; 46 | 47 | 48 | -- 49 | -- Name: COLUMN leak.source_publish_ts; Type: COMMENT; Schema: public; Owner: credentialleakdb 50 | -- 51 | 52 | COMMENT ON COLUMN public.leak.source_publish_ts IS 'The timestamp according when the source (e.g. 
spycloud) published the data.'; 53 | 54 | 55 | -- 56 | -- Name: COLUMN leak.ingestion_ts; Type: COMMENT; Schema: public; Owner: credentialleakdb 57 | -- 58 | 59 | COMMENT ON COLUMN public.leak.ingestion_ts IS 'The timestamp when we ingested the data.'; 60 | 61 | 62 | -- 63 | -- Name: COLUMN leak.summary; Type: COMMENT; Schema: public; Owner: credentialleakdb 64 | -- 65 | 66 | COMMENT ON COLUMN public.leak.summary IS 'A short summary (slug) of the leak. Used for displaying it somewhere'; 67 | 68 | 69 | -- 70 | -- Name: COLUMN leak.reporter_name; Type: COMMENT; Schema: public; Owner: credentialleakdb 71 | -- 72 | 73 | COMMENT ON COLUMN public.leak.reporter_name IS 'The name of the reporter where we got the notification from. E.g. CERT-eu, Spycloud, etc... Who sent us the data?'; 74 | 75 | 76 | -- 77 | -- Name: COLUMN leak.source_name; Type: COMMENT; Schema: public; Owner: credentialleakdb 78 | -- 79 | 80 | COMMENT ON COLUMN public.leak.source_name IS 'The name of the source where this leak came from. 
Either the name of a collection or some other name.'; 81 | 82 | 83 | -- 84 | -- Name: leak_data; Type: TABLE; Schema: public; Owner: credentialleakdb 85 | -- 86 | 87 | CREATE TABLE public.leak_data ( 88 | id integer NOT NULL, 89 | leak_id integer NOT NULL, 90 | email text NOT NULL, 91 | password text NOT NULL, 92 | password_plain text, 93 | password_hashed text, 94 | hash_algo text, 95 | ticket_id text, 96 | email_verified boolean DEFAULT false, 97 | password_verified_ok boolean DEFAULT false, 98 | ip inet, 99 | domain text, 100 | target_domain text, 101 | browser text, 102 | malware_name text, 103 | infected_machine text, 104 | dg text NOT NULL, 105 | count_seen integer DEFAULT 1 106 | ); 107 | 108 | 109 | ALTER TABLE public.leak_data OWNER TO credentialleakdb; 110 | 111 | -- 112 | -- Name: COLUMN leak_data.password; Type: COMMENT; Schema: public; Owner: credentialleakdb 113 | -- 114 | 115 | COMMENT ON COLUMN public.leak_data.password IS 'Either the encrypted or unencrypted password. If the unencrypted password is available, that is what is going to be in this field.'; 116 | 117 | 118 | -- 119 | -- Name: COLUMN leak_data.hash_algo; Type: COMMENT; Schema: public; Owner: credentialleakdb 120 | -- 121 | 122 | COMMENT ON COLUMN public.leak_data.hash_algo IS 'If we can determine the hashing algo and the password_hashed field is set'; 123 | 124 | 125 | -- 126 | -- Name: COLUMN leak_data.malware_name; Type: COMMENT; Schema: public; Owner: credentialleakdb 127 | -- 128 | 129 | COMMENT ON COLUMN public.leak_data.malware_name IS 'If the password was leaked via a credential stealer malware, then the malware name goes here.'; 130 | 131 | 132 | -- 133 | -- Name: COLUMN leak_data.infected_machine; Type: COMMENT; Schema: public; Owner: credentialleakdb 134 | -- 135 | 136 | COMMENT ON COLUMN public.leak_data.infected_machine IS 'The infected machine (some ID for the machine)'; 137 | 138 | 139 | -- 140 | -- Name: COLUMN leak_data.dg; Type: COMMENT; Schema: public; Owner: 
credentialleakdb 141 | -- 142 | 143 | COMMENT ON COLUMN public.leak_data.dg IS 'The affected DG'; 144 | 145 | 146 | -- 147 | -- Name: leak_data_id_seq; Type: SEQUENCE; Schema: public; Owner: credentialleakdb 148 | -- 149 | 150 | CREATE SEQUENCE public.leak_data_id_seq 151 | AS integer 152 | START WITH 1 153 | INCREMENT BY 1 154 | NO MINVALUE 155 | NO MAXVALUE 156 | CACHE 1; 157 | 158 | 159 | ALTER TABLE public.leak_data_id_seq OWNER TO credentialleakdb; 160 | 161 | -- 162 | -- Name: leak_data_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: credentialleakdb 163 | -- 164 | 165 | ALTER SEQUENCE public.leak_data_id_seq OWNED BY public.leak_data.id; 166 | 167 | 168 | -- 169 | -- Name: leak_id_seq; Type: SEQUENCE; Schema: public; Owner: credentialleakdb 170 | -- 171 | 172 | CREATE SEQUENCE public.leak_id_seq 173 | AS integer 174 | START WITH 1 175 | INCREMENT BY 1 176 | NO MINVALUE 177 | NO MAXVALUE 178 | CACHE 1; 179 | 180 | 181 | ALTER TABLE public.leak_id_seq OWNER TO credentialleakdb; 182 | 183 | -- 184 | -- Name: leak_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: credentialleakdb 185 | -- 186 | 187 | ALTER SEQUENCE public.leak_id_seq OWNED BY public.leak.id; 188 | 189 | 190 | -- 191 | -- Name: leak id; Type: DEFAULT; Schema: public; Owner: credentialleakdb 192 | -- 193 | 194 | ALTER TABLE ONLY public.leak ALTER COLUMN id SET DEFAULT nextval('public.leak_id_seq'::regclass); 195 | 196 | 197 | -- 198 | -- Name: leak_data id; Type: DEFAULT; Schema: public; Owner: credentialleakdb 199 | -- 200 | 201 | ALTER TABLE ONLY public.leak_data ALTER COLUMN id SET DEFAULT nextval('public.leak_data_id_seq'::regclass); 202 | 203 | 204 | SELECT pg_catalog.setval('public.leak_id_seq', 1, true); 205 | -- 206 | -- Data for Name: leak; Type: TABLE DATA; Schema: public; Owner: credentialleakdb 207 | -- 208 | 209 | COPY public.leak (id, breach_ts, source_publish_ts, ingestion_ts, summary, ticket_id, reporter_name, source_name) FROM stdin; 210 | 1 2021-03-08 
13:58:41.179+01 2021-03-08 13:58:41.179+01 2021-03-06 23:40:20.116348+01 CIT0DAY-2 CSIRC-99999 aaron HaveIBennPwned 211 | 2 2021-03-06 23:40:47.266962+01 2021-03-06 23:40:47.266962+01 2021-03-06 23:40:47.266962+01 COMB CSIRC-102 aaron independen research 212 | 3 2021-03-06 23:41:10.245034+01 2021-03-06 23:41:10.245034+01 2021-03-06 23:41:10.245034+01 cit0day CSIRC-103 aaron HaveIBeenPwned 213 | \. 214 | 215 | 216 | -- 217 | -- Data for Name: leak_data; Type: TABLE DATA; Schema: public; Owner: credentialleakdb 218 | -- 219 | 220 | SELECT pg_catalog.setval('public.leak_data_id_seq', 1, true); 221 | 222 | COPY public.leak_data (id, leak_id, email, password, password_plain, password_hashed, hash_algo, ticket_id, email_verified, password_verified_ok, ip, domain, browser, malware_name, infected_machine, dg, count_seen) FROM stdin; 223 | 1 1 aaron@example.com 12345 12345 \N \N CISRC-199 f f 1.2.3.4 example.com Google Chrome \N local_laptop DIGIT 25 224 | 2 1 sarah@example.com 123456 123456 \N \N CISRC-199 f f 1.2.3.5 example.com Firefox \N sarahs_laptop DIGIT 8 225 | 3 1 ben@example.com ohk7do7gil6O ohk7do7gil6O 4aa7985dad6e1f02238c2e2afc521c4d3dd30650656cd07bf0b7cfd3cd1190b7 sha256 CISRC-199 f f 1.2.3.5 example.com Firefox \N WORKSTATION DIGIT 8 226 | 4 1 david@example.com 24b3f998468a9da4105e6c78f5444532cde99d53c011715754194c3b4f3e37b4 \N 24b3f998468a9da4105e6c78f5444532cde99d53c011715754194c3b4f3e37b4 sha256 CISRC-199 f f 8.8.8.8 example.com Firefox \N Macbook Pro DIGIT 8 227 | 5 2 lauri@example.com Vie5kuuwiroo Vie5kuuwiroo \N \N CISRC-200 t t 9.9.9.9 example.com Firefox \N Raspberry PI 3+ DIGIT 8 228 | 6 2 natasha@example.com 1235kuuwiroo 1235kuuwiroo \N \N CISRC-201 t t 9.9.9.9 example.com Firefox \N Raspberry PI 3+ DIGIT 2 229 | \. 
230 | 231 | 232 | -- 233 | -- Name: leak_data_id_seq; Type: SEQUENCE SET; Schema: public; Owner: credentialleakdb 234 | -- 235 | 236 | SELECT pg_catalog.setval('public.leak_data_id_seq', 7, true); 237 | 238 | 239 | -- 240 | -- Name: leak_id_seq; Type: SEQUENCE SET; Schema: public; Owner: credentialleakdb 241 | -- 242 | 243 | SELECT pg_catalog.setval('public.leak_id_seq', 4, true); 244 | 245 | 246 | -- 247 | -- Name: leak_data constr_unique_leak_data_leak_id_email_password_domain; Type: CONSTRAINT; Schema: public; Owner: credentialleakdb 248 | -- 249 | 250 | ALTER TABLE ONLY public.leak_data 251 | ADD CONSTRAINT constr_unique_leak_data_leak_id_email_password_domain UNIQUE (leak_id, email, password, domain); 252 | 253 | 254 | -- 255 | -- Name: leak_data leak_data_pkey; Type: CONSTRAINT; Schema: public; Owner: credentialleakdb 256 | -- 257 | 258 | ALTER TABLE ONLY public.leak_data 259 | ADD CONSTRAINT leak_data_pkey PRIMARY KEY (id); 260 | 261 | 262 | -- 263 | -- Name: leak leak_pkey; Type: CONSTRAINT; Schema: public; Owner: credentialleakdb 264 | -- 265 | 266 | ALTER TABLE ONLY public.leak 267 | ADD CONSTRAINT leak_pkey PRIMARY KEY (id); 268 | 269 | 270 | -- 271 | -- Name: idx_leak_data_dg; Type: INDEX; Schema: public; Owner: credentialleakdb 272 | -- 273 | 274 | CREATE INDEX idx_leak_data_dg ON public.leak_data USING btree (dg); 275 | 276 | 277 | -- 278 | -- Name: idx_leak_data_email; Type: INDEX; Schema: public; Owner: credentialleakdb 279 | -- 280 | 281 | CREATE INDEX idx_leak_data_email ON public.leak_data USING btree (upper(email)); 282 | 283 | 284 | -- 285 | -- Name: idx_leak_data_email_password_machine; Type: INDEX; Schema: public; Owner: credentialleakdb 286 | -- 287 | 288 | CREATE INDEX idx_leak_data_email_password_machine ON public.leak_data USING btree (email, password, infected_machine); 289 | 290 | 291 | -- 292 | -- Name: idx_leak_data_malware_name; Type: INDEX; Schema: public; Owner: credentialleakdb 293 | -- 294 | 295 | CREATE INDEX 
class BaseCollector:
    """Abstract collector interface: fetch raw leak data and hand it on as a pandas DataFrame.

    ``collect(input_source)`` does *not* yet return the internal data format (IDF).
    It returns a tuple of a status string ("OK" on success, otherwise an error
    message) and a pandas DataFrame (which may be empty in case of error).

    Examples:
        ("OK", pd.DataFrame(... my data ...))                      --> all ok, data is in the DF
        ("Could not parse CSV file: ...", pd.DataFrame())          --> error message and empty DF

    The role of the Collector is to
      1. fetch the data
      2. check if the data is complete
      3. put it into an internal format (here: a pandas DF) which a parser can process
      4. return it as pandas DF to the next processing step in the chain
      5. return errors in case it encountered errors in validation.
    """

    def __init__(self):
        pass

    def collect(self, input_file: str, **kwargs) -> (str, pd.DataFrame):
        """Read ``input_file`` as CSV and return ``(status, DataFrame)``.

        :param input_file: path of the CSV file to read
        :param kwargs: passed through unchanged to :func:`pandas.read_csv`
        :returns: tuple of a status string ("OK" on success) and the DataFrame
                  (empty on failure)
        """
        try:
            with open(input_file, "r") as handle:
                frame = pd.read_csv(handle, **kwargs)
        except Exception as ex:
            # best-effort: log with traceback, report the reason upstream via the status string
            logging.exception("could not parse CSV file. Reason: %r" % (str(ex),))
            return str(ex), pd.DataFrame()
        return "OK", frame
"""Very very lightweight DB abstraction"""

import os
import psycopg2
import psycopg2.extras

from fastapi import HTTPException
import logging


#################################
# DB functions

# Module-global connection handle: one shared psycopg2 connection per process.
db_conn = None
# libpq-style DSN assembled from the environment once at import time.
# NOTE(review): if DBPASSWORD is unset, this renders literally as "password=None"
# — confirm that an unset password is never a valid deployment state.
DSN = "host=%s dbname=%s user=%s password=%s" % (os.getenv('DBHOST', 'localhost'),
                                                 os.getenv('DBNAME', 'credentialleakdb'),
                                                 os.getenv('DBUSER', 'credentialleakdb'),
                                                 os.getenv('DBPASSWORD'))


def _get_db():
    """
    Open a new database connection if there is none yet for the
    current application context.

    Lazily connects on first use and caches the handle in the module-global
    ``db_conn``; subsequent calls return the cached connection.

    :returns: the DB handle."""
    global db_conn

    if not db_conn:
        db_conn = _connect_db(DSN)
    return db_conn


# noinspection PyUnresolvedReferences
def _close_db():
    """Closes the database again at the end of the request.

    Resets the module-global ``db_conn`` to None so the next ``_get_db()``
    call reconnects.

    :returns: None (the cleared connection handle)."""
    global db_conn

    logging.info('shutting down....')
    if db_conn:
        db_conn.close()
        db_conn = None
    return db_conn


def _connect_db(dsn: str):
    """Connects to the specific database.

    Autocommit is enabled on the session, so every statement commits
    immediately (no explicit transaction handling by callers).

    :param dsn: the database connection string.
    :returns: the DB connection handle
    :raises HTTPException: with status 500 when the connection cannot be established
    """
    try:
        conn = psycopg2.connect(dsn)
        conn.set_session(autocommit=True)
    except Exception as ex:
        raise HTTPException(status_code=500, detail="could not connect to the DB. Reason: %s" % (str(ex)))
    logging.info("connection to DB established")
    return conn
def peek_into_file(fname: Path) -> csv.Dialect:
    """Sniff the CSV dialect of *fname* for use with pandas.read_csv() / the csv module.

    Only the first line (at most 50 characters for the dialect sniff) is inspected.

    :param fname: a Path object for the filename
    :return: the csv.Dialect the sniffer detected
    """

    with fname.open(mode = 'r') as fp:
        sniffer = csv.Sniffer()
        first_line = fp.readline()
        logging.debug("has apikeyheader: %s", sniffer.has_header(first_line))
        fp.seek(0)
        detected = sniffer.sniff(fp.readline(50))
        logging.debug("delim: '%s'", detected.delimiter)
        logging.debug("quotechar: '%s'", detected.quotechar)
        logging.debug("doublequote: %s", detected.doublequote)
        logging.debug("escapechar: '%s'", detected.escapechar)
        logging.debug("lineterminator: %r", detected.lineterminator)
        logging.debug("quoting: %s", detected.quoting)
        logging.debug("skipinitialspace: %s", detected.skipinitialspace)
        # noinspection PyTypeChecker
        return detected


def anonymize_password(password: str) -> str:
    """Mask the middle of a password with "*" characters.

    Passwords shorter than 4 characters (or empty/None) are returned unchanged;
    otherwise the first character and the last two stay visible.

    :param password: str
    :returns anonymized password (str):
    """
    if not password or len(password) < 4:
        return password
    visible_head = password[:1]
    visible_tail = password[-2:]
    return visible_head + "*" * (len(password) - 3) + visible_tail
30 | # 31 | # meta stuff and things for error reporting 32 | count_seen: Optional[int] = 1 33 | original_line: Optional[str] 34 | error_msg: Optional[str] 35 | notify: Optional[bool] 36 | needs_human_intervention: Optional[bool] 37 | -------------------------------------------------------------------------------- /models/indf.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional, Union 3 | from pydantic import BaseModel, IPvAnyAddress 4 | 5 | 6 | class SpyCloudInputEntry(BaseModel): 7 | """The SpyCloud intput format - one entry.""" 8 | breach_title: str 9 | spycloud_publish_date: Optional[Union[str, datetime]] 10 | breach_date: Optional[Union[str, datetime]] 11 | email: str # mandatory 12 | domain: str # mandatory 13 | username: Optional[str] 14 | password: str 15 | salt: Optional[str] 16 | target_domain: Optional[str] 17 | target_url: Optional[str] 18 | password_plaintext: str = None 19 | sighting: Optional[int] 20 | severity: Optional[str] 21 | status: Optional[str] 22 | password_type: Optional[str] 23 | cc_number: Optional[str] 24 | infected_path: Optional[str] 25 | infected_machine_id: Optional[str] 26 | email_domain: str 27 | cc_expiration: Optional[str] 28 | cc_last_four: Optional[str] 29 | email_username: str 30 | user_browser: Optional[str] 31 | infected_time: Optional[Union[str, datetime]] 32 | ip_addresses: Optional[Union[str, IPvAnyAddress]] 33 | -------------------------------------------------------------------------------- /models/outdf.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from enum import Enum 3 | from typing import Optional, Dict, List # Union 4 | from pydantic import BaseModel, EmailStr 5 | 6 | 7 | class Leak(BaseModel): 8 | id: Optional[int] 9 | ticket_id: Optional[str] 10 | summary: str 11 | reporter_name: Optional[str] 12 | source_name: Optional[str] 13 | breach_ts: 
class CredentialType(Enum):
    """Closed set of credential categories a leaked credential may fall into."""
    is_external = "External"
    is_proxy_login = "Proxy"
    is_EU_login = "EU Login"
    is_domain_login = "Domain"
    is_secem_login = "SECEM"


class LeakData(BaseModel):
    """One leaked credential row in the output data format (what gets stored / reported)."""
    id: Optional[int]  # DB primary key (leak_data.id), set once stored
    leak_id: int  # foreign key to the leak this row belongs to
    email: EmailStr  # affected email address (validated)
    password: str  # the leaked password as found in the dump
    password_plain: Optional[str]
    password_hashed: Optional[str]
    hash_algo: Optional[str]  # hashing algorithm of password_hashed, if known
    ticket_id: Optional[str]  # ticketing-system reference
    email_verified: Optional[bool]
    password_verified_ok: Optional[bool]
    ip: Optional[str]
    domain: Optional[str]
    target_domain: Optional[str]  # new
    browser: Optional[str]
    malware_name: Optional[str]
    infected_machine: Optional[str]
    dg: Optional[str]  # Directorate-General, filled in by the LDAP enricher
    is_vip: Optional[bool]  # filled in by the VIP enricher
    credential_type: Optional[List[CredentialType]]
    report_to: Optional[List[str]]  # the security contact to report this to, in case it's not the user directly.
    #
    # meta stuff and things for error reporting
    count_seen: Optional[int] = 1  # how often this credential was observed
    original_line: Optional[str]  # the original CSV line, kept for error reporting
    error_msg: Optional[str]
    notify: bool  # whether a notification should go out for this row
    needs_human_intervention: bool


class AnswerMeta(BaseModel):
    """Metadata attached to an API :class:`Answer`."""
    version: str  # API version string
    duration: float  # processing duration (presumably seconds — confirm against producer)
    count: int  # number of entries in Answer.data


class Answer(BaseModel):
    """Generic API response envelope: payload plus success flag and optional error."""
    meta: Optional[AnswerMeta]
    data: List[Dict]  # Union[Dict,List]
    success: bool
    errormsg: Optional[str] = ""
class BaseParser:
    """Abstract parser: turns a leak dump file into a pandas DataFrame."""

    def __init__(self):
        pass

    def parse_file(self, fname: Path, leak_id: int = None, csv_dialect=None) -> pd.DataFrame:
        """Parse one CSV file (non-recursive) into a DataFrame.
        Overwrite this method in YOUR Parser subclass.

        :param fname: a Path object with the filename of the CSV file which should be parsed
        :param leak_id: the leak_id in the DB which is associated with that CSV dump file;
                        inserted as the first column of the result
        :param csv_dialect: csv.Dialect to use; sniffed from the file when omitted
        :return: the parsed DataFrame
        :raises Exception: re-raises whatever pandas.read_csv() raised
        """
        logger.info("Parsing file %s..." % fname)
        try:
            # use the caller-supplied dialect, otherwise try to guess it from the file
            dialect = csv_dialect if csv_dialect else peek_into_file(fname)
            frame = pd.read_csv(fname, dialect=dialect, error_bad_lines=False, warn_bad_lines=True)  # , usecols=range(2))
            logger.debug(frame.head())
            logger.debug(frame.info())
            logger.debug("Parsing file 2...")
            frame.insert(0, 'leak_id', leak_id)
            logger.debug(frame.head())
            logger.debug("parsed %s", fname)
            return frame
        except Exception as ex:
            logger.error("could not pandas.read_csv(%s). Reason: %s. Skipping file." % (fname, str(ex)))
            raise ex  # pass it on

    def normalize_data(self, df: pd.DataFrame, leak_id: int = None) -> pd.DataFrame:
        """
        Normalize the given data / data frame: replace every NaN by None so the
        DB layer gets proper NULLs.

        :param df: a pandas df with the leak_data
        :param leak_id: foreign key to the leak table (unused here)
        :return: a pandas df
        """
        return df.where(pd.notnull(df), None)
logging.debug("Parsing SPYCLOUD file %s...", fname) 25 | try: 26 | # df = pd.read_csv(fname, dialect=csv_dialect, header=1, error_bad_lines=False, warn_bad_lines=True) 27 | df = pd.read_csv(fname, error_bad_lines=False, warn_bad_lines=True) 28 | logging.debug(df) 29 | return df 30 | 31 | except Exception as ex: 32 | logging.error("could not pandas.read_csv(%s). Reason: %s. Skipping file." % (fname, str(ex))) 33 | return pd.DataFrame() 34 | 35 | def normalize_data(self, df: pd.DataFrame, leak_id=None) -> pd.DataFrame: 36 | """Bring the pandas DataFrame into an internal data format.""" 37 | 38 | """ Spycloud headers: 39 | breach_title, spycloud_publish_date, breach_date, email, domain, username, password, salt, target_domain, target_url, password_plaintext, sighting, severity, status, password_type, cc_number, infected_path, infected_machine_id, email_domain, cc_expiration, cc_last_four, email_username, user_browser, infected_time, ip_addresses 40 | map to: 41 | _, leak.source_publish_ts, leak.breach_ts, email, domain, _, password, _, target_domain, _, password_plain, _, _, _, hash_algo, _, _, infected_machine, _ , _, _, _, browser, _, ip 42 | """ 43 | mapping_tbl = collections.OrderedDict({ 44 | "breach_title": None, 45 | "spycloud_publish_date": None, 46 | "breach_date": None, 47 | "email": "email", 48 | "domain": None, 49 | "username": None, 50 | "password": "password", 51 | "salt": None, 52 | "target_domain": "target_domain", 53 | "target_url": None, 54 | "password_plaintext": "password_plain", 55 | "sighting": None, 56 | "severity": None, 57 | "status": None, 58 | "password_type": "hash_algo", 59 | "cc_number": None, 60 | "infected_path": None, 61 | "infected_machine_id": "infected_machine", 62 | "email_domain": "domain", 63 | "cc_expiration": None, 64 | "cc_last_four": None, 65 | "email_username": None, 66 | "user_browser": "browser", 67 | "infected_time": None, 68 | "ip_addresses": "ip" 69 | }) 70 | 71 | # This complexity sucks! need to get rid of it. 
No, itertools won't make it more understandable. 72 | retdf = pd.DataFrame() 73 | for i, r in df.iterrows(): # go over all df rows. Returns index, row 74 | # print(f"{i}:{r}") 75 | retrow = dict() # build up what we want to return 76 | for k, v in r.items(): # go over all key-val items in the row 77 | # print(f"{k}:{v}", file=sys.stderr) 78 | if k in mapping_tbl.keys(): 79 | map_to = mapping_tbl[k] 80 | if k == 'ip_addresses' and v == '-': 81 | v = None 82 | if map_to: 83 | # print(f"mapping {k} to {map_to}!") 84 | retrow[map_to] = v 85 | else: 86 | # don't map it 87 | pass 88 | logging.debug("retrow = %r" % retrow) 89 | retdf = retdf.append(pd.Series(retrow), ignore_index=True) 90 | # retdf[:,'leak_id'] = leak_id 91 | logging.debug("retdf: %s" % retdf) 92 | return retdf 93 | -------------------------------------------------------------------------------- /modules/collectors/spycloud/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/modules/collectors/spycloud/__init__.py -------------------------------------------------------------------------------- /modules/collectors/spycloud/collector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Spycloud collector 3 | 4 | This code implements a SpyCloud collector (inherits from BaseCollector) 5 | 6 | Upon running a SpyCloud parser on a CSV, the result will be a 7 | """ 8 | from pathlib import Path 9 | import logging 10 | import pandas as pd 11 | 12 | from lib.basecollector.collector import BaseCollector 13 | from lib.helpers import peek_into_file 14 | 15 | NaN_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', '', 'N/A', 16 | 'NA', 'NULL', 'NaN', 'n/a', 'null', '-'] 17 | 18 | 19 | class SpyCloudCollector(BaseCollector): 20 | def __init__(self): 21 | super().__init__() 22 | 23 | 
def collect(self, input_file: Path, **kwargs) -> (str, pd.DataFrame): 24 | try: 25 | dialect = peek_into_file(input_file) 26 | df = pd.read_csv(input_file, dialect=dialect, na_values=NaN_values, 27 | keep_default_na=False, error_bad_lines=False, warn_bad_lines=True) 28 | # XXX FIXME: need to collect the list of (pandas-) unparseable rows and present to user. 29 | # For now we simply fail on the whole file. Good enough for the moment. 30 | except pd.errors.ParserError as ex: 31 | logging.error("could not parse CSV file. Reason: %r" % (str(ex),)) 32 | return str(ex), pd.DataFrame() 33 | return "OK", df 34 | -------------------------------------------------------------------------------- /modules/collectors/test_leaks/COMB/test_data.txt: -------------------------------------------------------------------------------- 1 | 5hv 209@hotmail.com:Adam 2 | 5hv @bseomail.com:169818 3 | 5hv deniz@gmail.com:1234567 4 | 5hv lol@yahoo.com.au:5hvm 5 | 5hv sä±k iåÿ m@gmail.com:1234w 6 | 5hv!200@mail.ru:Ali4203642036 7 | 5hv!@aol.com:encuestas67 8 | 5hv!@fsf.com:gunther 9 | 5hv!@mail.ru:13371337 10 | 5hv!@mail.ru:13371337Р№ 11 | 5hv!@mail.ru:200029cs 12 | 5hv!@rambler.ru:13371337 13 | 5hv!@redi.com:http 14 | 5hv!@redi.com:http://www.javatpoint.com/RegisterAn.gif 15 | 5hv!@yandex.ru:200029cs 16 | 5hv!_bosha@mail.ru:5hvibosha 17 | 5hv!_saya@mail.ru:3331348s 18 | 5hv!eva1978@mail.ru:5hvm2005 19 | 5hv!ka_love@bk.ru:52m86m12 20 | 5hv#@mail.ru:facd4321 21 | 5hv$-5hv$2017@inbox.ru:1q1a1z 22 | 5hv$.a$da$da$.01@mail.ru:7010ckfdf 23 | 5hv$.a$da$da$.09@mail.ru:7010ckfdf 24 | 5hv$621#621@hotmail.com:hotgirl14 25 | 5hv$621@hotmail.com:hotgirl14 26 | 5hv$baev04@mail.ru:877505b 27 | 5hv$baev06@mail.ru:877505b 28 | 5hv$da15@inbox.ru:q1w2e3azsd 29 | 5hv$dad@gmail.com:internet 30 | 5hv$df$af@inbox.ru:053320107b 31 | 5hv$dfez$g@rambler.ru:7utIrccfiq 32 | 5hv$ev01@windowslive.com:ertek124578 33 | 5hv$h1976@mail.ru:bkmdbyf2010 34 | 5hv$h1978@mail.ru:bkmdbyf2010 35 | 5hv$h1981@mail.ru:0007 36 | 
5hv$h1983@mail.ru:0007 37 | 5hv$hevo@list.ru:galina2612 38 | 5hv$hkevich_lili@mail.ru:OfDtKm123 39 | 5hv$ik159@list.ru:5hvsik12345 40 | 5hv$ki03@mail.ru:qwerty123456 41 | 5hv$ki05@mail.ru:qwerty123456 42 | 5hv$ko762@gmail.com:ZADYMA2469 43 | 5hv$tra3000@yandex.ru:mega667 44 | 5hv'janelesley@yahoo.com:danthony12 45 | 5hv'sfamilydaycare@yahoo.com:zayas65 46 | 5hv&&monika11@aol.com:24crow 47 | 5hv&alex@myspace.com:alexander0 48 | 5hv&elizabeth@sbcglobal.net:5hv123 49 | 5hv&oreo@netzero.com:onorio1 50 | 5hv's11@mail.ru:111111ga 51 | 5hv's12@mail.ru:111111ga 52 | 5hv's13@mail.ru:111111ga 53 | 5hv'slen@yahoo.com:181818 54 | 5hv*litterprincesita@hotmail.com:zuricata16 55 | 5hv*litterprincessita@hotmail.com:zuricata16 56 | -------------------------------------------------------------------------------- /modules/collectors/test_leaks/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | These leaks are from public leaks and are truncated data! They are here only for testing. 4 | Email addresses have been pseudonymized for privacy reasons. They are not real email addresses. 
class AbuseContactLookup:
    """Looks up which abuse contact should receive a credential-leak report."""

    def lookup(self, email: str) -> List[str]:
        """Determine the abuse contact(s) for credential leaks, based on the email address.
        Example:
            lookup("example@jrc.it") --> "reports@jrc.it"

        :argument email: the email address
        :rtype string: string
        :returns email: the email address for the abuse contact
        """

        # Mapping: compiled domain regex -> list of contact addresses, or the
        # marker "DIRECT" meaning "send directly to the affected address".
        # Rules are evaluated top-down; the first matching rule wins.
        mapping_table = collections.OrderedDict({
            re.compile(r"example\.ec\.europa\.eu", re.X): ["ec-digit-csirc@ec.europa.eu"],  # example
            re.compile(r".*\.ec\.europa\.eu", re.X): "DIRECT",
            re.compile(r".*", re.X): "DIRECT"  # the default catch-all rule. Don't delete!
        })

        domain = email.split('@')[-1]
        for pattern, contact in mapping_table.items():
            if not re.match(pattern, domain):
                continue
            return [email] if contact == "DIRECT" else contact
        return [""]
    def email_to_user_id(self, email: str) -> Union[str, None]:
        """Return the userID of an email.

        Looks up the CED record for *email* and returns the first ``ecMoniker``
        attribute value, or None when no usable record was found.

        :param email: the email address to look up
        :returns: the user id (str) or None
        :raises Exception: re-raises any LDAP/CED query error after logging it
        """

        if self.simulate_ldap:
            return "Not connected to LDAP"
        try:
            results = self.ced.search_by_mail(email)
            # defensive chain: only trust a non-empty first value of 'ecMoniker'
            if results and results[0]['attributes'] and results[0]['attributes']['ecMoniker'] and \
                    results[0]['attributes']['ecMoniker'][0]:
                return results[0]['attributes']['ecMoniker'][0]
            else:
                return None
        except Exception as ex:
            logging.error("could not query LDAP/CED. Reason: %s" % str(ex))
            raise ex

    def email_to_status(self, email: str) -> str:
        """Return the active status.

        Returns the first ``recordStatus`` attribute of the CED record for *email*.
        NOTE(review): when no usable record is found this falls through and
        implicitly returns None, despite the ``-> str`` annotation — confirm
        that callers (e.g. ``exists()``) handle None.

        :param email: the email address to look up
        :raises Exception: re-raises any LDAP/CED query error after logging it
        """

        if self.simulate_ldap:
            return "Not connected to LDAP"

        try:
            results = self.ced.search_by_mail(email)
            if results and results[0]['attributes'] and results[0]['attributes']['recordStatus'] and \
                    results[0]['attributes']['recordStatus'][0]:
                return results[0]['attributes']['recordStatus'][0]
        except Exception as ex:
            logging.error("could not query LDAP/CED. Reason: %s" % str(ex))
            raise ex

    def exists(self, email: str) -> bool:
        """Check if a user exists.

        A user is considered existing when their record status is "A"
        (presumably "active" — confirm against the CED schema).

        :param email: the email address to check
        :returns: True when the record status is "A", False otherwise
        """

        if self.simulate_ldap:
            return False

        status = self.email_to_status(email)
        if status and status.upper() == "A":
            return True
        else:
            return False
""" 35 | try: 36 | ldap_server = Server(server, port = port, get_info = ALL) 37 | self.conn = Connection(ldap_server, user = user, password = password) 38 | self.is_connected = self.conn.bind() 39 | print("Connection = %s" % self.conn) 40 | logging.info("connect_ldap(): self.conn = %s" % (self.conn,)) 41 | logging.info("connect_ldap(): conn.bind() = %s" % (self.conn.bind(),)) 42 | except Exception as ex: 43 | logging.error("error connecting to CED. Reason: %s" % (str(ex))) 44 | self.is_connected = False 45 | return None 46 | 47 | def search_by_mail(self, email: str) -> List[dict]: 48 | attributes = ['cn', 'dg', 'uid', 'ecMoniker', 'employeeType', 'recordStatus', 'sn', 'givenName', 'mail'] 49 | if not self.is_connected: 50 | logging.error("Could not search via email. Not connected to LDAP.") 51 | raise Exception("Could not search via email. Not connected to LDAP.") 52 | try: 53 | self.conn.search(self.base_dn, "(mail=%s)" % (email,), attributes = attributes) 54 | except Exception as ex: 55 | logging.error("could not search LDAP. error: %s" % str(ex)) 56 | raise ex 57 | logging.info("search_by_mail(): %s" % (self.conn.entries,)) 58 | results = [] 59 | for entry in self.conn.entries: 60 | results.append(json.loads(entry.entry_to_json())) 61 | return results # yeah, a list comprehension would be more pythonic 62 | 63 | 64 | if __name__ == "__main__": 65 | ced = CEDQuery() 66 | email = sys.argv[1] 67 | print(ced.search_by_mail(email)) 68 | -------------------------------------------------------------------------------- /modules/enrichers/vip.py: -------------------------------------------------------------------------------- 1 | """VIP Enricher. Can determine if an email addr. is a VIP and needs to be treated specially.""" 2 | 3 | import os 4 | import logging 5 | from pathlib import Path 6 | 7 | from typing import List 8 | 9 | 10 | class VIPEnricher: 11 | """Can determine if an Email Address is a VIP. 
class Deduper:
    """The DB based deduper: drops entries which already exist in the leak_data table."""

    bloomf_loaded = False

    def __init__(self):
        pass

    def load_bf(self):
        """Mark the (future) bloom filter as loaded.

        XXX IMPROVEMENT: we might want to use bloomfilters here
        """
        self.bloomf_loaded = True

    def dedup(self, idf: InternalDataFormat) -> Union[None, InternalDataFormat]:
        """Deduplicate an IDF element based on its existence in the DB.
        FIXME: this is O(n^2) with n entries in the DB unless indexed properly. Think about indices or a bloom filter

        :param idf - internal data format element
        :returns: None if it already exists, otherwise the idf
        :raises Exception on DB problem

        """
        if not self.bloomf_loaded:
            # load_bf() flips the flag itself; the original set it a second time
            self.load_bf()
        # at the moment, we'll use postgresql

        conn = _get_db()
        sql = "SELECT count(*) from leak_data WHERE email=%s and password=%s"

        try:
            # context manager closes the cursor even on error (the original leaked it),
            # matching the style of PostgresqlOutput.process()
            with conn.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cur:
                cur.execute(sql, (idf.email, idf.password))
                rows = cur.fetchall()
                count = int(rows[0]['count'])
        except Exception as ex:
            logging.error("Deduper: could not select data from the DB. Reason: %s", ex)
            raise
        # row already exists -> drop it (return None)
        return None if count >= 1 else idf
class PostgresqlOutput(BaseOutput):
    """Output module which INSERTs (or upserts) a LeakData row into PostgreSQL."""

    dbconn = None

    def __init__(self):
        super().__init__()
        self.dbconn = _get_db()

    def process(self, data: LeakData) -> bool:
        """Store the output format data into Postgresql.

        On a duplicate (leak_id, email, password, domain) constraint hit, the
        existing row's count_seen is incremented instead of inserting a copy.

        :param data: the leak entry to persist; a falsy value is a no-op.
        :returns True on success
        :raises psycopg2.Error exception
        """

        sql = """
        INSERT into leak_data(
            leak_id, email, password, password_plain, password_hashed, hash_algo, ticket_id, email_verified,
            password_verified_ok, ip, domain, browser , malware_name, infected_machine, dg
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )
        ON CONFLICT ON CONSTRAINT constr_unique_leak_data_leak_id_email_password_domain
        DO UPDATE SET count_seen = leak_data.count_seen + 1
        RETURNING id
        """
        if data:
            # BUGFIX: the original passed data.password in the password_hashed
            # position, silently clobbering that column.
            values = (
                data.leak_id, data.email, data.password, data.password_plain, data.password_hashed,
                data.hash_algo, data.ticket_id, data.email_verified, data.password_verified_ok,
                data.ip, data.domain, data.browser, data.malware_name, data.infected_machine, data.dg)
            try:
                with self.dbconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                    # do NOT print/mogrify the full row: it contains plaintext passwords
                    cur.execute(sql, values)
                    leak_data_id = int(cur.fetchone()['id'])
                    logger.debug("leak_data_id: %s", leak_data_id)
            except psycopg2.Error as ex:
                logger.error("%s(): error: %s", self.process.__name__, ex.pgerror)
                raise
        return True
class SpyCloudParser(BaseParser):
    """Parses a SpyCloud CSV (given as a pandas DataFrame) into the internal data format."""

    def __init__(self):
        """init"""
        super().__init__()

    def parse(self, df: pd.DataFrame) -> List[InternalDataFormat]:
        """parse a pandas DF and return the data in the Internal Data Format.

        Rows which fail validation are still returned, flagged with
        needs_human_intervention=True, notify=False and an error_msg.

        :param df: the raw SpyCloud input data
        :returns: one InternalDataFormat item per input row
        :raises Exception: when a row cannot even be represented in the IDF
        """

        # First, map the various "empty" markers to None so that it fits nicely into the IDF
        df.replace({"-": None}, inplace = True)
        df.replace({"nan": None}, inplace = True)
        df.replace({np.nan: None}, inplace = True)
        df.replace({'breach_date': {'Unknown': None}}, inplace = True)

        # validate via pydantic
        items = []
        for row in df.reset_index().to_dict(orient = 'records'):
            logging.debug("row=%s", row)
            # pessimistic defaults: assume the row is broken until validation succeeds
            idf_dict = dict(email = None, password = None, notify = False, domain = None,
                            error_msg = "incomplete data", needs_human_intervention = True)
            idf_dict['original_line'] = str(row)
            try:
                input_data_item = parse_obj_as(SpyCloudInputEntry, row)  # here the validation magic happens
                idf_dict = input_data_item.dict()  # conversion between input format and internal df
                idf_dict['domain'] = input_data_item.email_domain  # map specific fields
            except Exception as ex:
                idf_dict['needs_human_intervention'] = True
                idf_dict['notify'] = False
                idf_dict['error_msg'] = str(ex)
                # BUGFIX: the original applied %r to repr(row), double-repr'ing it
                logging.error("could not parse CSV row. Original line: %r.\nReason: %s", row, ex)
                logging.debug("idf_dict = %s", idf_dict)
            else:
                # BUGFIX: the success path was logged at ERROR level in the original
                logging.debug("everything successfully converted")
                idf_dict['needs_human_intervention'] = False
                idf_dict['notify'] = True
                idf_dict['error_msg'] = None
            finally:
                try:
                    idf = InternalDataFormat(**idf_dict)  # another step of validation happens here
                    logging.debug("idf = %r", idf)
                except Exception as ex2:
                    logging.error("Exception in finally. idf_dict = %r", idf_dict)
                    raise ex2
                else:
                    items.append(idf)

        return items
sonar.organization=digits2 3 | # This is the name and version displayed in the SonarCloud UI. 4 | sonar.projectName=credentialLeakDB 5 | sonar.projectVersion=1.12.0 6 | # Path is relative to the sonar-project.properties file. Replace "\" by "/" on Windows. 7 | #sonar.sources=. 8 | # Encoding of the source code. Default is default system encoding 9 | #sonar.sourceEncoding=UTF-8 10 | sonar.coverage.exclusions=doc/**,venv/** 11 | sonar.exclusions=doc/**,tests/**,venv/** 12 | # duplications exclusions 13 | sonar.cpd.exclusions=doc/** 14 | # python coverage config 15 | sonar.python.coverage.reportPaths=/github/workspace/coverage.xml 16 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Unit tests and test data 2 | 3 | We use sonarcloud and codecov (post bash exploit :) ) for coverage testing. 4 | 5 | ## Codecoverage of the unit tests over time 6 | ![Coverage over time](https://codecov.io/gh/EC-DIGIT-CSIRC/credentialLeakDB/branch/main/graphs/commits.svg) 7 | 8 | ## Weak spots (sunburst diagram, tree) 9 | ![Weak spots](https://codecov.io/gh/EC-DIGIT-CSIRC/credentialLeakDB/branch/main/graphs/sunburst.svg) 10 | ![quadtree graph](https://codecov.io/gh/EC-DIGIT-CSIRC/credentialLeakDB/branch/main/graphs/tree.svg) 11 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/data.csv: -------------------------------------------------------------------------------- 1 | 
email,password,password_plain,password_hashed,hash_algo,ticket_id,email_verified,password_verified_ok,ip,domain,browser,malware_name,infected_machine,dg 2 | aaron@example.com,12345,12345,,,CISRC-199,f,f,1.2.3.4,example.com,Google Chrome,,local_laptop,DIGIT 3 | sarah@example.com,123456,123456,,,CISRC-199,f,f,1.2.3.5,example.com,Firefox,,sarahs_laptop,DIGIT 4 | peter@example.com,ohk7do7gil6O,ohk7do7gil6O,4aa7985dad6e1f02238c2e2afc521c4d3dd30650656cd07bf0b7cfd3cd1190b7,sha256,CISRC-199,f,f,1.2.3.5,example.com,Firefox,,WORKSTATION,DIGIT 5 | david@example.com,24b3f998468a9da4105e6c78f5444532cde99d53c011715754194c3b4f3e37b4,,24b3f998468a9da4105e6c78f5444532cde99d53c011715754194c3b4f3e37b4,sha256,CISRC-199,f,f,8.8.8.8,example.com,Firefox,,Macbook Pro,DIGIT 6 | lauri@example.com,Vie5kuuwiroo,Vie5kuuwiroo,,,CISRC-200,t,t,9.9.9.9,example.com,Firefox,,Raspberry PI 3+,DIGIT 7 | natasha@example.com,1235kuuwiroo,1235kuuwiroo,,,CISRC-201,t,t,9.9.9.9,example.com,Firefox,,Raspberry PI 3+,DIGIT 8 | -------------------------------------------------------------------------------- /tests/fixtures/data_anonymized_spycloud.csv: -------------------------------------------------------------------------------- 1 | breach_title,spycloud_publish_date,breach_date,email,domain,username,password,target_domain,target_url,password_plaintext,sighting,severity,password_type,email_username,user_browser,infected_time,email_domain,ip_addresses,infected_machine_id 2 | Freedom Fox Combo List,2020-06-25,Unknown,peter@example.com,example.com,-,12345,-,-,12345,1,High,plaintext,peter.petersson,-,-,example.com,-,- 3 | Freedom Fox Combo List,2020-06-25,Unknown,bob.inventedname@ec.europa.eu,europa.eu,-,123456,-,-,123456,1,High,plaintext,bob.inventedname,-,-,ec.europa.eu,-,- 4 | Freedom Fox Combo List,2020-06-25,Unknown,karen.inventedname@ec.europa.eu,europa.eu,-,reallyweakpassword,-,-,reallyweakpassword,1,High,plaintext,karen.inventedname,-,-,ec.europa.eu,-,- 5 | 
-------------------------------------------------------------------------------- /tests/fixtures/vips.txt: -------------------------------------------------------------------------------- 1 | aaron@example.com 2 | benoit@example.com 3 | sarah@example.com 4 | lauri@example.com -------------------------------------------------------------------------------- /tests/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/lib/__init__.py -------------------------------------------------------------------------------- /tests/lib/basecollector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/lib/basecollector/__init__.py -------------------------------------------------------------------------------- /tests/lib/basecollector/test_collector.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from lib.basecollector.collector import * 6 | 7 | 8 | class TestBaseCollector(unittest.TestCase): 9 | def test_collect(self): 10 | valid_csv_file = 'tests/fixtures/data.csv' 11 | invalid_csv_file = 'tests/fixtures/dataDOESNTEXIST.csv' 12 | 13 | tc = BaseCollector() 14 | df: pd.DataFrame 15 | status, df = tc.collect(valid_csv_file) 16 | assert status == "OK" 17 | assert not df.empty 18 | assert df.shape[0] > 1 19 | 20 | status, df = tc.collect(invalid_csv_file) 21 | assert status != "OK" 22 | assert df.empty 23 | -------------------------------------------------------------------------------- /tests/lib/baseenricher/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/lib/baseenricher/__init__.py -------------------------------------------------------------------------------- /tests/lib/baseenricher/test_enricher.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from lib.baseenricher.enricher import BaseEnricher 4 | from models.idf import InternalDataFormat 5 | 6 | 7 | class TestBaseEnricher(unittest.TestCase): 8 | def test_enrich(self): 9 | idf = InternalDataFormat(email="foo@example.com", password = "12345", notify = True) 10 | te = BaseEnricher() 11 | result = te.enrich(idf) 12 | assert result == idf -------------------------------------------------------------------------------- /tests/lib/baseoutput/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/lib/baseoutput/__init__.py -------------------------------------------------------------------------------- /tests/lib/baseoutput/test_output.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from lib.baseoutput.output import BaseOutput 4 | 5 | class TestBaseOutput(unittest.TestCase): 6 | def test_process(self): 7 | to = BaseOutput() 8 | assert to.process("test_outputfile.txt") 9 | -------------------------------------------------------------------------------- /tests/lib/baseparser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/lib/baseparser/__init__.py -------------------------------------------------------------------------------- /tests/lib/baseparser/test_parser.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from lib.baseparser.parser import BaseParser 6 | 7 | class TestBaseParser(unittest.TestCase): 8 | def test_parse(self): 9 | tp = BaseParser() 10 | df = pd.DataFrame() 11 | tp.parse(df) 12 | assert True # not very useful right now but the structure for the test case is here 13 | -------------------------------------------------------------------------------- /tests/lib/test_helpers.py: -------------------------------------------------------------------------------- 1 | from lib.helpers import anonymize_password 2 | 3 | def test_anonymize_password(): 4 | pass1 = "12345678" 5 | expected = "1*****78" 6 | assert anonymize_password(pass1) == expected 7 | 8 | pass2 = "123" 9 | expected = "123" 10 | assert anonymize_password(pass2) == expected 11 | 12 | pass3 = "12" 13 | expected = "12" 14 | assert anonymize_password(pass3) == expected 15 | 16 | pass4 = "" 17 | expected = "" 18 | assert anonymize_password(pass4) == expected 19 | 20 | pass5 = None 21 | expected = None 22 | assert anonymize_password(pass5) == expected 23 | -------------------------------------------------------------------------------- /tests/lib/test_logger.py: -------------------------------------------------------------------------------- 1 | from lib.helpers import getlogger 2 | 3 | 4 | logger = getlogger(__name__) 5 | 6 | 7 | class Foo: 8 | def __init__(self): 9 | pass 10 | 11 | def do_smthg(self): 12 | logger.info("bar") 13 | print("baz") 14 | 15 | 16 | def test_logger(): 17 | logger.info("starting up the class") 18 | 19 | f = Foo() 20 | f.do_smthg() 21 | logger.info("DONE") 22 | assert True 23 | 24 | 25 | if __name__ == "__main__": 26 | test_logger() 27 | -------------------------------------------------------------------------------- /tests/modules/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/modules/__init__.py -------------------------------------------------------------------------------- /tests/modules/enrichers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/modules/enrichers/__init__.py -------------------------------------------------------------------------------- /tests/modules/enrichers/test_external_email.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from modules.enrichers.external_email import ExternalEmailEnricher 4 | 5 | class TestExternalEmailEnricher(unittest.TestCase): 6 | def test_is_external(self): 7 | external_email = "foobar@example.com" 8 | tee = ExternalEmailEnricher() 9 | assert tee.is_external_email(external_email) 10 | 11 | internal_email = "foobar.example@ec.europa.eu" 12 | assert tee.is_internal_email(internal_email) 13 | -------------------------------------------------------------------------------- /tests/test_collector_spycloud.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pathlib import Path 4 | 5 | from modules.collectors.spycloud.collector import SpyCloudCollector 6 | 7 | 8 | class SpyCloudCollectorTest(unittest.TestCase): 9 | def test_collect(self): 10 | path = Path('tests/fixtures/data_anonymized_spycloud.csv') 11 | tc = SpyCloudCollector() 12 | statuscode, data = tc.collect(path) 13 | assert statuscode == "OK" 14 | assert data.iloc[0]['breach_title'] == 'Freedom Fox Combo List' 15 | assert data.iloc[0]['email'] == 'peter@example.com' 16 | -------------------------------------------------------------------------------- /tests/test_deduper.py: 
-------------------------------------------------------------------------------- 1 | from models.idf import InternalDataFormat 2 | 3 | from modules.filters.deduper import Deduper 4 | 5 | 6 | def test_load_bf(): 7 | dd = Deduper() 8 | assert not dd.bloomf_loaded 9 | dd.load_bf() 10 | assert dd.bloomf_loaded 11 | 12 | 13 | def test_dedup(): 14 | dd = Deduper() 15 | idf = InternalDataFormat(email="aaron@example.com", password="12345", 16 | notify=False, needs_human_intervention=False) 17 | idf2 = dd.dedup(idf) 18 | assert not idf2 19 | idf = InternalDataFormat(email="aaron999735@example.com", password="12345XXX", 20 | notify=False, needs_human_intervention=False) 21 | idf2 = dd.dedup(idf) 22 | assert idf2 23 | -------------------------------------------------------------------------------- /tests/test_enrichment.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pathlib import Path 3 | 4 | # from modules.enrichers.ldap import LDAPEnricher 5 | from modules.enrichers.external_email import ExternalEmailEnricher 6 | from modules.enrichers.abuse_contact import AbuseContactLookup 7 | from modules.enrichers.vip import VIPEnricher 8 | 9 | 10 | class TestVIPenrichment(unittest.TestCase): 11 | 12 | def test_load_vips(self): 13 | path = 'tests/fixtures/vips.txt' 14 | te = VIPEnricher(Path(path)) 15 | 16 | assert te.is_vip('AARON@example.com') 17 | assert te.is_vip('aaron@example.com') 18 | assert not te.is_vip('foobar-doesnotexist') 19 | 20 | def test_load_vips_invalid_path(self): 21 | path = 'tests/fixtures/vips.txt-doesnotexist' 22 | te = VIPEnricher(Path(path)) # will pass because there we catch the exception 23 | self.assertRaises(Exception, te.load_vips, path) 24 | 25 | 26 | class TestIsExternalEmail(unittest.TestCase): 27 | def test_is_internal(self): 28 | email = "foobar.example@ext.ec.europa.eu" 29 | te = ExternalEmailEnricher() 30 | assert te.is_internal_email(email) 31 | domain = "ec.europa.eu" 32 | assert 
te.is_internal_email(domain) 33 | 34 | def test_is_external(self): 35 | email = "aaron@example.com" 36 | te = ExternalEmailEnricher() 37 | assert te.is_external_email(email) 38 | 39 | 40 | class TestAbuseContactLookup(unittest.TestCase): 41 | def test_lookup(self): 42 | email = "aaron@example.com" 43 | te = AbuseContactLookup() 44 | assert email == te.lookup(email)[0] 45 | email = "aaron@example.ec.europa.eu" 46 | assert "ec-digit-csirc@ec.europa.eu" == te.lookup(email)[0] 47 | -------------------------------------------------------------------------------- /tests/test_filter.py: -------------------------------------------------------------------------------- 1 | from models.idf import InternalDataFormat 2 | 3 | from modules.filters.filter import Filter 4 | 5 | 6 | def test_filter(): 7 | fi = Filter() 8 | idf = InternalDataFormat(email = "aaron@example.com", password = "12345", notify = False, 9 | needs_human_intervention = False) 10 | idf2 = fi.filter(idf) 11 | assert idf2 == idf 12 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | from lib.helpers import getlogger 2 | 3 | import urllib.parse 4 | import uuid 5 | import unittest 6 | 7 | from fastapi.testclient import TestClient 8 | 9 | from lib.db.db import _connect_db as connect_db 10 | 11 | from api.main import * 12 | 13 | VALID_AUTH = {'x-api-key': 'random-test-api-key'} 14 | INVALID_AUTH = {'x-api-key': 'random-test-api-XXX'} 15 | 16 | logger = getlogger(__name__) 17 | client = TestClient(app) # , base_url='http://localhost:8080/') 18 | 19 | 20 | def test_ping(): 21 | response = client.get("/ping") 22 | assert response.status_code == 200 23 | assert response.json() == {"message": "pong"} 24 | 25 | 26 | class DBTestCases(unittest.TestCase): 27 | def test_get_db(self): 28 | assert get_db() is not None 29 | 30 | def test_close_db(self): 31 | get_db() # initialize connection 32 
| self.assertIsNone(close_db()) 33 | get_db() # re-initialize connection 34 | 35 | def test_connect_invalid_db(self): 36 | self.assertRaises(Exception, connect_db, 'SOME INVALID DSN') 37 | 38 | 39 | def test_fetch_valid_api_keys(): 40 | assert True 41 | 42 | 43 | class APIKeyTests(unittest.TestCase): 44 | """Test API key functions""" 45 | 46 | def test_validate_api_key_header(self): 47 | self.assertRaises(Exception, validate_api_key_header, "") 48 | 49 | def test_is_valid_api_key(self): 50 | assert is_valid_api_key(VALID_AUTH['x-api-key']) 51 | 52 | def test_is_INVALID_api_key(self): 53 | assert not is_valid_api_key(INVALID_AUTH['x-api-key']) 54 | 55 | def test_validate_api_key(self): 56 | assert True 57 | 58 | 59 | def test_root_auth(): 60 | response = client.get("/", headers = VALID_AUTH) 61 | assert response.status_code == 200 62 | assert response.json() == {"message": "Hello World"} 63 | 64 | 65 | # noinspection PyPep8Naming 66 | def test_root_INVALID_auth(): 67 | response = client.get("/", headers = INVALID_AUTH) 68 | assert response.status_code == 403 69 | 70 | 71 | def test_get_user_by_email(): 72 | email = urllib.parse.quote("aaron@example.com") 73 | response = client.get("/user/%s" % email, headers = VALID_AUTH) 74 | assert response.status_code == 200 75 | data = response.json() 76 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] >= 1 77 | 78 | 79 | # noinspection PyPep8Naming 80 | def test_get_nonexistent_user_by_INVALID_email(): 81 | email = urllib.parse.quote("aaron@doesnotexist.com") 82 | response = client.get("/user/%s" % email, headers = VALID_AUTH) 83 | assert response.status_code != 200 84 | data = response.json() 85 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] == 0 86 | 87 | 88 | def test_get_user_by_email_and_password(): 89 | email = urllib.parse.quote("aaron@example.com") 90 | passwd = "12345" 91 | response = client.get("/user_and_password/%s/%s" % (email, passwd), 
headers = VALID_AUTH) 92 | assert response.status_code == 200 93 | data = response.json() 94 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] >= 1 95 | 96 | 97 | # noinspection PyPep8Naming 98 | def test_get_nonexistent_user_by_email_and_INVALID_password(): 99 | email = urllib.parse.quote("aaron@example.com") 100 | passwd = "12345XXXXXXXXXX" 101 | response = client.get("/user_and_password/%s/%s" % (email, passwd), headers = VALID_AUTH) 102 | assert response.status_code == 404 103 | data = response.json() 104 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] == 0 105 | 106 | 107 | def test_check_user_by_email(): 108 | email = urllib.parse.quote("aaron@example.com") 109 | response = client.get("/exists/by_email/%s" % email, headers = VALID_AUTH) 110 | assert response.status_code == 200 111 | data = response.json() 112 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] >= 1 113 | 114 | 115 | # noinspection PyPep8Naming 116 | def test_check_nonexistent_user_by_INVALID_email(): 117 | email = urllib.parse.quote("aaron@doesnotexist.com") 118 | response = client.get("/exists/by_email/%s" % email, headers = VALID_AUTH) 119 | assert response.status_code == 200 120 | data = response.json() 121 | print(data) 122 | assert "meta" in response.text and "data" in response.text and data['data'][0]['count'] == 0 123 | 124 | 125 | def test_check_user_by_password(): 126 | password = "12345" 127 | response = client.get("/exists/by_password/%s" % password, headers = VALID_AUTH) 128 | assert response.status_code == 200 129 | data = response.json() 130 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] >= 1 131 | 132 | 133 | # noinspection PyPep8Naming 134 | def test_check_nonexistent_user_by_INVALID_password(): 135 | password = 'DOESNOTEXIST@59w47YTISJGw496UASGJSATARSASJKGJSAKGASRG' 136 | response = client.get("/exists/by_password/%s" % 
password, headers = VALID_AUTH) 137 | assert response.status_code == 200 138 | data = response.json() 139 | assert "meta" in response.text and "data" in response.text and data['data'][0]['count'] == 0 140 | 141 | 142 | def test_check_user_by_domain(): 143 | domain = "example.com" 144 | response = client.get("/exists/by_domain/%s" % domain, headers = VALID_AUTH) 145 | assert response.status_code == 200 146 | data = response.json() 147 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] >= 1 148 | 149 | 150 | # noinspection PyPep8Naming 151 | def test_check_nonexistent_user_by_INVALID_domain(): 152 | domain = "example.com-foobar-2esugksti2uwasgjskhsjhsa.net" 153 | response = client.get("/exists/by_domain/%s" % domain, headers = VALID_AUTH) 154 | assert response.status_code == 200 155 | data = response.json() 156 | assert "meta" in response.text and "data" in response.text and data['data'][0]['count'] == 0 157 | 158 | 159 | def test_get_reporters(): 160 | response = client.get("/reporter/", headers = VALID_AUTH) 161 | assert response.status_code == 200 162 | data = response.json() 163 | assert "meta" in response.text and \ 164 | "data" in response.text and \ 165 | data['meta']['count'] >= 1 and \ 166 | data['data'][0]['reporter_name'] == 'aaron' 167 | 168 | 169 | def test_get_sources(): 170 | response = client.get("/source_name/", headers = VALID_AUTH) 171 | assert response.status_code == 200 172 | data = response.json() 173 | answerset = set(i['source_name'] for i in data['data']) 174 | print(answerset) 175 | assert "meta" in response.text and \ 176 | "data" in response.text and \ 177 | data['meta']['count'] >= 1 and \ 178 | "HaveIBeenPwned" in answerset 179 | 180 | 181 | def test_new_leak(): 182 | test_data = { 183 | "ticket_id": "CSIRC-202", 184 | "summary": "a test leak, please ignore", 185 | "reporter_name": "aaron", 186 | "source_name": "spycloud", 187 | "breach_ts": "2021-03-24T16:08:33.405Z", 188 | "source_publish_ts": 
"2021-03-24T16:08:33.405Z" 189 | } 190 | response = client.post("/leak/", json = test_data, headers = VALID_AUTH) 191 | assert response.status_code == 201 192 | data = response.json() 193 | assert "meta" in response.text and \ 194 | "data" in response.text and \ 195 | data['meta']['count'] >= 1 and \ 196 | data['data'][0]['id'] >= 1 197 | return int(data['data'][0]['id']) 198 | 199 | 200 | def test_update_leak(): 201 | test_data = { 202 | "ticket_id": "CSIRC-202", 203 | "summary": "an UPDATE-able test leak, please ignore", 204 | "reporter_name": "aaron", 205 | "source_name": "spycloud", 206 | "breach_ts": "2021-01-01T00:00:00.000Z", 207 | "source_publish_ts": "2021-01-02T00:00:00.000Z", 208 | } 209 | response = client.post("/leak/", json = test_data, headers = VALID_AUTH) 210 | assert response.status_code == 201 211 | data = response.json() 212 | assert "meta" in response.text and \ 213 | "data" in response.text and \ 214 | data['meta']['count'] >= 1 and \ 215 | data['data'][0]['id'] >= 1 216 | _id = data['data'][0]['id'] 217 | 218 | # now UPDATE it 219 | test_data['summary'] = "We UPDATED the test leak now!" 220 | test_data['id'] = _id 221 | response = client.put('/leak/', json = test_data, headers = VALID_AUTH) 222 | assert response.status_code == 200 223 | 224 | # fetch the results and see if it's really updated 225 | response = client.get('/leak/%s' % (_id,), headers = VALID_AUTH) 226 | assert response.status_code == 200 227 | assert response.json()['data'][0]['summary'] == "We UPDATED the test leak now!" 
228 | 229 | # now try to fetch an invalid ID 230 | response = client.get('/leak/%s' % (_id + 10000,), headers = VALID_AUTH) 231 | assert response.status_code == 404 232 | 233 | 234 | # noinspection PyPep8Naming 235 | def test_update_INVALID_leak(): 236 | test_data = { 237 | "id": -1, 238 | "ticket_id": "CSIRC-202", 239 | "summary": "trying to update a leak which does NOT EXIST", 240 | "reporter_name": "aaron", 241 | "source_name": "spycloud", 242 | "breach_ts": "2021-01-01T00:00:00.000Z", 243 | "source_publish_ts": "2021-01-02T00:00:00.000Z", 244 | } 245 | response = client.put('/leak/', json = test_data, headers = VALID_AUTH) 246 | assert response.status_code == 400 247 | assert response.json()['data'] == [] 248 | 249 | 250 | # By summary 251 | def test_get_leak_by_summary(): 252 | summary = "COMB" 253 | response = client.get('/leak/by_summary/%s' % (summary,), headers = VALID_AUTH) 254 | assert response.status_code == 200 255 | data = response.json() 256 | assert data['meta']['count'] >= 1 257 | assert data['data'][0]['summary'] == summary 258 | assert data['data'][0]['reporter_name'] == 'aaron' 259 | 260 | 261 | # noinspection PyPep8Naming 262 | def test_get_leak_by_INVALID_summary(): 263 | summary = "COMB-XXX-DOESNETEXIST" 264 | response = client.get('/leak/by_summary/%s' % (summary,), headers = VALID_AUTH) 265 | assert response.status_code == 404 266 | data = response.json() 267 | assert data['meta']['count'] == 0 268 | 269 | 270 | # By ticket_id 271 | def test_get_leak_by_ticket_id(): 272 | ticket_id = "CSIRC-102" # we know that exists based on the db.sql import 273 | response = client.get('/leak/by_ticket_id/%s' % (ticket_id,), headers = VALID_AUTH) 274 | assert response.status_code == 200 275 | data = response.json() 276 | assert data['meta']['count'] >= 1 277 | assert data['data'][0]['summary'] == "COMB" 278 | 279 | 280 | # noinspection PyPep8Naming 281 | def test_get_leak_by_INVALID_ticket_id(): 282 | ticket_id = "COMB-XXX-DOESNETEXIST" 283 | response = 
client.get('/leak/by_ticket_id/%s' % (ticket_id,), headers = VALID_AUTH) 284 | assert response.status_code == 404 285 | data = response.json() 286 | assert data['meta']['count'] == 0 287 | 288 | 289 | def test_get_all_leaks(): 290 | response = client.get('/leak/all', headers = VALID_AUTH) 291 | assert response.status_code == 200 292 | data = response.json() 293 | assert data['meta']['count'] > 0 294 | 295 | 296 | def test_get_leak_by_reporter(): 297 | response = client.get('leak/by_reporter/%s' % ("aaron",), headers = VALID_AUTH) 298 | assert response.status_code == 200 299 | data = response.json() 300 | assert data['meta']['count'] > 0 301 | 302 | 303 | def test_get_leak_by_source(): 304 | response = client.get('leak/by_source/%s' % ("spycloud",), headers = VALID_AUTH) 305 | assert response.status_code == 200 306 | data = response.json() 307 | assert data['meta']['count'] > 0 308 | 309 | 310 | # ################################################################################# 311 | # leak_data 312 | 313 | def test_get_leak_data_by_leak(): 314 | leak_id = 1 # we know this exists by the db.sql INSERT 315 | response = client.get('/leak_data/%s' % (leak_id,), headers = VALID_AUTH) 316 | assert response.status_code == 200 317 | data = response.json() 318 | assert data['meta']['count'] >= 1 319 | assert data['data'][0]['email'] == 'aaron@example.com' 320 | 321 | 322 | # noinspection PyPep8Naming 323 | def test_get_leak_data_by_INVALID_leak(): 324 | leak_id = -1 # we know this does not exist 325 | response = client.get('/leak_data/%s' % (leak_id,), headers = VALID_AUTH) 326 | assert response.status_code == 404 327 | data = response.json() 328 | assert data['meta']['count'] == 0 329 | assert data['data'] == [] 330 | 331 | 332 | def test_get_leak_data_by_ticket_id(): 333 | ticket_id = 'CISRC-199' # we know this exists by the db.sql INSERT 334 | response = client.get('/leak_data/by_ticket_id/%s' % (ticket_id,), headers = VALID_AUTH) 335 | assert response.status_code == 200 
336 | data = response.json() 337 | assert data['meta']['count'] >= 1 338 | assert data['data'][0]['email'] == 'aaron@example.com' 339 | assert data['data'][1]['email'] == 'sarah@example.com' 340 | 341 | 342 | def insert_leak_data(d: dict) -> int: 343 | """ generic test function for INSERTing a leak_data row given by d. 344 | 345 | @:param d: a row as dict 346 | @:returns ID: ID of the newly inserted row 347 | @:rtype: int 348 | """ 349 | response = client.post("/leak_data/", json = d, headers = VALID_AUTH) 350 | print(response) 351 | print(response.text) 352 | assert response.status_code == 201 353 | data = response.json() 354 | print(data) 355 | assert "meta" in data and \ 356 | "data" in data and \ 357 | data['meta']['count'] >= 1 and \ 358 | data['data'][0]['id'] >= 1 359 | return data['data'][0]['id'] 360 | 361 | 362 | def test_new_leak_data(): 363 | """ INSERT a new leak_data row.""" 364 | test_data = { 365 | "leak_id": 1, 366 | "email": "aaron2@example.com", 367 | "password": "000000", 368 | "password_plain": "000000", 369 | "password_hashed": "d232105eb59a344df4b54db1c24009b1", 370 | "hash_algo": "md5", 371 | "ticket_id": "CSIRC-102", 372 | "email_verified": False, 373 | "password_verified_ok": False, 374 | "ip": "5.6.7.8", 375 | "domain": "example.com", 376 | "browser": "Chrome", 377 | "malware_name": "n/a", 378 | "infected_machine": "n/a", 379 | "dg": "DIGIT", 380 | "needs_human_intervention": False, 381 | "notify": False 382 | } 383 | _id = insert_leak_data(test_data) 384 | assert _id >= 0 385 | return _id 386 | 387 | 388 | def test_update_leak_data(): 389 | random_str = uuid.uuid4() 390 | test_data = { 391 | "leak_id": 1, 392 | "email": "aaron%s@example.com" % (random_str,), 393 | "password": "000000", 394 | "password_plain": "000000", 395 | "password_hashed": "d232105eb59a344df4b54db1c24009b1", 396 | "hash_algo": "md5", 397 | "ticket_id": "CSIRC-102", 398 | "email_verified": False, 399 | "password_verified_ok": False, 400 | "ip": "5.6.7.8", 401 | 
"domain": "example.com", 402 | "browser": "Chrome", 403 | "malware_name": "n/a", 404 | "infected_machine": "n/a", 405 | "dg": "DIGIT", 406 | "needs_human_intervention": False, 407 | "notify": False 408 | } 409 | # create my own leak_data row 410 | _id = insert_leak_data(test_data) 411 | 412 | # now UPDATE it 413 | random_str2 = uuid.uuid4() 414 | email2 = "aaron-%s@example.com" % random_str2 415 | 416 | test_data['id'] = _id 417 | test_data.update({"email": email2}) 418 | response = client.put('/leak_data/', json = test_data, headers = VALID_AUTH) 419 | assert response.status_code == 200 420 | print("after UPDATE: response = %r" % response.json()) 421 | 422 | # fetch the results and see if it's really updated 423 | response = client.get('/leak_data/%s' % (_id,), headers = VALID_AUTH) 424 | assert response.status_code == 200 425 | print("data: %r" % response.json()['data']) 426 | assert response.json()['data'][0]['email'] == email2 427 | 428 | 429 | def test_import_csv_with_leak_id(): 430 | _id = test_new_leak() 431 | fixtures_file = "./tests/fixtures/data.csv" 432 | f = open(fixtures_file, "rb") 433 | response = client.post('/import/csv/by_leak/%s' % (_id,), files = {"_file": f}, headers = VALID_AUTH) 434 | logger.info("response = %r" % response.text) 435 | assert 200 <= response.status_code < 300 436 | assert response.json()['meta']['count'] >= 0 437 | 438 | 439 | def test_check_file(): 440 | assert True # trivial check, not implemented yet actually in main.py 441 | 442 | 443 | def test_enrich_email_to_vip(): 444 | email_vip = "aaron@example.com" 445 | response = client.get('/enrich/email_to_vip/%s' % (email_vip,), headers = VALID_AUTH) 446 | assert response.status_code == 200 447 | data = response.json() 448 | assert data['meta']['count'] >= 1 449 | assert data['data'][0]['is_vip'] 450 | 451 | 452 | # noinspection PyPep8Naming 453 | def test_enrich_email_to_vip_INVALID(): 454 | email_vip = "aaron-invalid-does-not-exist@example.com" 455 | response = 
client.get('/enrich/email_to_vip/%s' % (email_vip,), headers = VALID_AUTH) 456 | assert response.status_code == 200 457 | data = response.json() 458 | assert data['meta']['count'] >= 1 459 | assert not data['data'][0]['is_vip'] 460 | 461 | 462 | class TestImportCSVSpycloud(unittest.TestCase): 463 | def test_import_csv_spycloud_invalid_ticket_id(self): 464 | fixtures_file = "./tests/fixtures/data_anonymized_spycloud.csv" 465 | f = open(fixtures_file, "rb") 466 | response = client.post('/import/csv/spycloud/?summary=test2', files = {"_file": f}, headers = VALID_AUTH) 467 | assert response.status_code >= 400 468 | 469 | def test_import_csv_spycloud(self): 470 | fixtures_file = "./tests/fixtures/data_anonymized_spycloud.csv" 471 | f = open(fixtures_file, "rb") 472 | response = client.post('/import/csv/spycloud/%s?summary=test2' % ("ticket99",), files = {"_file": f}, 473 | headers = VALID_AUTH) 474 | assert 200 <= response.status_code < 300 475 | assert response.json()['meta']['count'] >= 0 476 | 477 | 478 | class TestEnricherEmailToDG(unittest.TestCase): 479 | response = None 480 | 481 | def test_enrich_dg_by_email(self): 482 | email = "aaron@example.com" 483 | if not os.getenv('CED_SERVER'): 484 | with self.assertRaises(Exception): 485 | client.get('/enrich/email_to_dg/%s' % (email,), headers = VALID_AUTH) 486 | else: 487 | response = client.get('/enrich/email_to_dg/%s' % (email,), headers = VALID_AUTH) 488 | assert response.status_code == 200 489 | data = response.json() 490 | assert data['meta']['count'] >= 1 491 | assert data['data'][0]['dg'] 492 | -------------------------------------------------------------------------------- /tests/test_parser_spycloud.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pathlib import Path 3 | from modules.parsers.spycloud import SpyCloudParser 4 | from modules.collectors.spycloud.collector import SpyCloudCollector 5 | 6 | 7 | class 
SpyCloudParserTest(unittest.TestCase): 8 | def test_parse(self): 9 | path = 'tests/fixtures/data_anonymized_spycloud.csv' 10 | tc = SpyCloudCollector() 11 | statuscode, df = tc.collect(Path(path)) 12 | assert statuscode == "OK" 13 | tp = SpyCloudParser() 14 | idf = tp.parse(df) 15 | assert idf 16 | # print([ i for i in idf ]) 17 | for i in idf: 18 | if "error_msg" in i.dict() and i.error_msg: 19 | print("error_msg: %s" % i.error_msg) 20 | print("orig_line: %s" % i.original_line) 21 | --------------------------------------------------------------------------------