├── .coveragerc ├── .github ├── .codecov.yml ├── pull_request_template.md └── workflows │ ├── codecov.yml │ ├── codeql-analysis.yml │ ├── disabled │ ├── codecov.yml │ └── qa-gates.yml │ ├── pylint.yml │ └── python-app.yml ├── .gitignore ├── .idea ├── .gitignore ├── credentialLeakDB.iml ├── dbnavigator.xml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── CHANGELOG ├── CONTRIBUTING.md ├── EER.png ├── README.md ├── SECURITY.md ├── __init__.py ├── api ├── __init__.py ├── enrichment.py ├── main.py └── models.py ├── config.SAMPLE.py ├── db.sql ├── lib ├── __init__.py ├── basecollector │ ├── __init__.py │ └── collector.py ├── baseenricher │ ├── __init__.py │ └── enricher.py ├── baseoutput │ ├── __init__.py │ └── output.py ├── baseparser │ ├── __init__.py │ └── parser.py ├── db │ ├── __init__.py │ └── db.py └── helpers.py ├── models ├── __init__.py ├── idf.py ├── indf.py └── outdf.py ├── modules ├── __init__.py ├── collectors │ ├── __init__.py │ ├── parser.py │ ├── sample.csv │ ├── spycloud.py │ ├── spycloud │ │ ├── __init__.py │ │ └── collector.py │ └── test_leaks │ │ ├── COMB │ │ └── test_data.txt │ │ └── README.md ├── enrichers │ ├── __init__.py │ ├── abuse_contact.py │ ├── external_email.py │ ├── ldap.py │ ├── ldap_lib.py │ └── vip.py ├── filters │ ├── __init__.py │ ├── deduper.py │ └── filter.py ├── output │ ├── __init__.py │ └── db.py └── parsers │ ├── __init__.py │ └── spycloud.py ├── requirements.txt ├── sonar-project.properties └── tests ├── README.md ├── __init__.py ├── fixtures ├── data.csv ├── data_anonymized_spycloud.csv └── vips.txt ├── lib ├── __init__.py ├── basecollector │ ├── __init__.py │ └── test_collector.py ├── baseenricher │ ├── __init__.py │ └── test_enricher.py ├── baseoutput │ ├── __init__.py │ └── test_output.py ├── baseparser │ ├── __init__.py │ └── test_parser.py ├── test_helpers.py └── test_logger.py ├── modules ├── __init__.py └── enrichers │ ├── __init__.py │ └── 
test_external_email.py ├── test_collector_spycloud.py ├── test_deduper.py ├── test_enrichment.py ├── test_filter.py ├── test_main.py └── test_parser_spycloud.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = venv/* 3 | -------------------------------------------------------------------------------- /.github/.codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: yes 3 | branch: main 4 | 5 | coverage: 6 | precision: 2 7 | round: down 8 | range: "70...100" 9 | 10 | parsers: 11 | gcov: 12 | branch_detection: 13 | conditional: yes 14 | loop: yes 15 | method: no 16 | macro: no 17 | 18 | comment: 19 | layout: "reach,diff,flags,files,footer" 20 | behavior: default 21 | require_changes: no 22 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Pull request template 2 | 3 | The model we are using is to have a fork of the repo, edit on the fork, then 4 | make a pull request. The PR needs then to be reviewed and 5 | merged into the ``main`` branch. 6 | It will be sent through a couple of quality checks and if it does 7 | not meet those, the PR needs to be adapted. 8 | 9 | # Description 10 | New tool, Bug fixing, or Improvement? 11 | Please include a summary of the change and which issue is fixed. Also include relevant motivation and context. 12 | 13 | ## Related issue 14 | 15 | ## Check list 16 | - [ ] Related issue / work item is attached 17 | - [ ] Unit-tests are written (if applicable) 18 | - [ ] Documentation is updated (if applicable) 19 | - [ ] Changes are tested, tests pass, no code linting errors and no high/critical vulnerabilities identified in codebase. 20 | 21 | ## Testing 22 | - [ ] Did you write new unit tests for this change? 
23 | - [ ] Did you write new integration tests for this change? 24 | Include the test commands you ran locally to test this change 25 | e.g.: 26 | ```bash 27 | pytest -v 28 | ``` 29 | 30 | ## Monitoring 31 | - [ ] Will this change be covered by our existing monitoring? (no new canaries/metrics/dashboards/alarms are required) 32 | - [ ] Will this change have no (or positive) effect on resources and/or limits? 33 | (including CPU, memory, AWS resources, calls to other services) 34 | - [ ] Can this change be deployed to Prod without triggering any alarms? 35 | 36 | ## Rollout 37 | - [ ] Can this change be merged immediately into the pipeline upon approval? 38 | - [ ] Are all dependent changes already deployed to Prod? 39 | - [ ] Can this change be rolled back without any issues after deployment to Prod? 40 | 41 | 42 | 43 | 44 | This is the template we use in our projects. 45 | -------------------------------------------------------------------------------- /.github/workflows/codecov.yml: -------------------------------------------------------------------------------- 1 | name: Codecov 2 | on: [push] 3 | jobs: 4 | run: 5 | runs-on: ubuntu-latest 6 | env: 7 | PORT: 8080 8 | DBHOST: localhost 9 | DBUSER: credentialleakdb 10 | DBPASSWORD: 1234testForUnitTesting 11 | DBNAME: credentialleakdb 12 | VIPLIST: tests/fixtures/vips.txt 13 | steps: 14 | - uses: actions/checkout@v2 15 | with: 16 | fetch-depth: 0 17 | - name: Set up Python 3.9 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.9 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install flake8 pytest pytest-cov 25 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 26 | - name: set up demo database 27 | run: | 28 | sudo apt install postgresql 29 | # sudo -u postgres pg_ctlcluster 12 main start 30 | sudo service postgresql start 31 | # sudo -u postgres pg_ctlcluster 12 main start 32 | sudo -u postgres createdb credentialleakdb 33 | sudo 
-u postgres createuser -s $DBUSER 34 | sudo -u postgres psql $DBNAME < db.sql 35 | sudo -u postgres psql -c "ALTER role $DBUSER WITH PASSWORD '$DBPASSWORD'" 36 | - name: prepare environment and mocking 37 | run: | 38 | cp config.SAMPLE.py api/config.py 39 | echo "PORT=$PORT" > ENV 40 | echo "DBHOST=$DBHOST" >> ENV 41 | echo "DBUSER=$DBUSER" >> ENV 42 | echo "DBPASSWORD=$DBPASSWORD" >> ENV 43 | echo "DBNAME=$DBNAME" >> ENV 44 | - name: Generate coverage report 45 | run: | 46 | pip install pytest 47 | pip install pytest-cov 48 | pip install -r requirements.txt 49 | python -m pytest -vv --cov=./ --cov-report=term --cov-report=xml tests/ 50 | - name: Upload coverage to Codecov 51 | uses: codecov/codecov-action@v1 52 | with: 53 | token: ${{ secrets.CODECOV_TOKEN }} 54 | files: ./coverage.xml 55 | directory: ./coverage/reports/ 56 | flags: unittests 57 | env_vars: OS,PYTHON 58 | name: codecov-umbrella 59 | fail_ci_if_error: true 60 | path_to_write_report: ./coverage/codecov_report.txt 61 | verbose: true 62 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '40 10 * * 0' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | language: [ 'python' ] 32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 33 | # Learn more: 34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v2 39 | 40 | # Initializes the CodeQL tools for scanning. 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v1 43 | with: 44 | languages: ${{ matrix.language }} 45 | # If you wish to specify custom queries, you can do so here or in a config file. 46 | # By default, queries listed here will override any specified in a config file. 47 | # Prefix the list here with "+" to use these queries and those in the config file. 48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 49 | 50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 51 | # If this step fails, then you should remove it and run the build manually (see below) 52 | - name: Autobuild 53 | uses: github/codeql-action/autobuild@v1 54 | 55 | # ℹ️ Command-line programs to run using the OS shell. 
56 | # 📚 https://git.io/JvXDl 57 | 58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 59 | # and modify them (or add more) to build your code if your project 60 | # uses a compiled language 61 | 62 | #- run: | 63 | # make bootstrap 64 | # make release 65 | 66 | - name: Perform CodeQL Analysis 67 | uses: github/codeql-action/analyze@v1 68 | -------------------------------------------------------------------------------- /.github/workflows/disabled/codecov.yml: -------------------------------------------------------------------------------- 1 | name: Codecov run 2 | on: [push] 3 | jobs: 4 | run: 5 | runs-on: ${{ matrix.os }} 6 | strategy: 7 | matrix: 8 | os: [ubuntu-latest, debian-latest] 9 | env: 10 | OS: ${{ matrix.os }} 11 | PYTHON: '3.7' 12 | steps: 13 | - uses: actions/checkout@master 14 | - name: Setup Python 15 | uses: actions/setup-python@master 16 | with: 17 | python-version: 3.7 18 | - name: Generate coverage report 19 | run: | 20 | pip install pytest 21 | pip install pytest-cov 22 | pytest --cov=./ --cov-report=xml 23 | - name: Upload coverage to Codecov 24 | uses: codecov/codecov-action@v1 25 | with: 26 | token: ${{ secrets.CODECOV_TOKEN }} 27 | files: ./coverage1.xml,./coverage2.xml 28 | directory: ./coverage/reports/ 29 | flags: unittests 30 | env_vars: OS,PYTHON 31 | name: codecov-umbrella 32 | fail_ci_if_error: true 33 | path_to_write_report: ./coverage/codecov_report.txt 34 | verbose: true -------------------------------------------------------------------------------- /.github/workflows/disabled/qa-gates.yml: -------------------------------------------------------------------------------- 1 | name: QA Gates 2 | # on: 3 | # push: 4 | # branches: 5 | # - master 6 | # pull_request: 7 | # types: [opened, synchronize, reopened] 8 | on: 9 | push: 10 | branches: [ main, release/* ] 11 | pull_request: 12 | branches: [ main ] 13 | jobs: 14 | qa-gates: 15 | name: SonarCloud 16 | runs-on: ubuntu-latest 17 | steps: 18 | - 
uses: actions/checkout@v2 19 | with: 20 | fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis 21 | - name: SonarCloud Scan 22 | uses: SonarSource/sonarcloud-github-action@master 23 | env: 24 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any 25 | SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: Pylint 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | env: 8 | VIPLIST: tests/fixtures/vips.txt 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.8 15 | uses: actions/setup-python@v1 16 | with: 17 | python-version: 3.8 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install pylint 22 | pip install -r requirements.txt 23 | - name: prepare environment and mocking 24 | run: | 25 | cp config.SAMPLE.py api/config.py 26 | - name: Analysing the code with pylint 27 | run: | 28 | export PYTHONPATH=$(pwd) && pylint --suggestion-mode=y --extension-pkg-whitelist='pydantic' -E -d C0301 -d E0611 api models tests modules lib 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: flake8 and pytest 5 | 6 | on: 7 | push: 8 | branches: [ main, develop, re-write-modules, release/* ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | env: 17 | PORT: 8080 18 | DBHOST: localhost 19 | 
DBUSER: credentialleakdb 20 | DBPASSWORD: 1234testForUnitTesting 21 | DBNAME: credentialleakdb 22 | VIPLIST: tests/fixtures/vips.txt 23 | steps: 24 | - uses: actions/checkout@v2 25 | with: 26 | fetch-depth: 0 27 | - name: Set up Python 3.9 28 | uses: actions/setup-python@v2 29 | with: 30 | python-version: 3.9 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install flake8 pytest pytest-cov 35 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 36 | - name: set up demo database 37 | run: | 38 | sudo apt install postgresql 39 | # sudo -u postgres pg_ctlcluster 12 main start 40 | sudo service postgresql start 41 | # sudo -u postgres pg_ctlcluster 12 main start 42 | sudo -u postgres createdb credentialleakdb 43 | sudo -u postgres createuser -s $DBUSER 44 | sudo -u postgres psql $DBNAME < db.sql 45 | sudo -u postgres psql -c "ALTER role $DBUSER WITH PASSWORD '$DBPASSWORD'" 46 | - name: prepare environment and mocking 47 | run: | 48 | cp config.SAMPLE.py api/config.py 49 | echo "PORT=$PORT" > ENV 50 | echo "DBHOST=$DBHOST" >> ENV 51 | echo "DBUSER=$DBUSER" >> ENV 52 | echo "DBPASSWORD=$DBPASSWORD" >> ENV 53 | echo "DBNAME=$DBNAME" >> ENV 54 | - name: Lint with flake8 55 | run: | 56 | # stop the build if there are Python syntax errors or undefined names 57 | flake8 . --count --select=E9,F63,F7,F82 --ignore=E251 --show-source --statistics --exclude venv 58 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 59 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --ignore=E251 --statistics --exclude venv 60 | - name: Start the RESTful API server 61 | run: | 62 | uvicorn --host 127.0.0.1 --port 8080 --reload api.main:app & 63 | # uvicorn --env-file ENV --host 127.0.0.1 --port 8080 --reload api.main:app & 64 | 65 | - name: Test with pytest 66 | run: | 67 | python -m pytest -vv --cov=./ --cov-report=term --cov-report=xml tests/ 68 | 69 | - name: Validate coverage report exists 70 | run: | 71 | ls -lha 72 | 73 | - name: Snyk Security Scan 74 | uses: snyk/actions/python@master 75 | # continue-on-errormsg: true 76 | env: 77 | SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} 78 | with: 79 | # https://support.snyk.io/hc/en-us/articles/360003812578-Our-full-CLI-reference 80 | # args: --command=pipenv run --severity-threshold=high --fail-on=all --file=*req*.txt --dev --org=digits2 --debug 81 | args: --command=python --severity-threshold=high --fail-on=all --file=requirements.txt --package-manager=pip --dev --org=digits2 --debug --skip-unresolved 82 | 83 | - name: SonarCloud Scan 84 | uses: SonarSource/sonarcloud-github-action@master 85 | env: 86 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any 87 | SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} 88 | 89 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # config secrets etc. 
2 | ENV 3 | .env 4 | config.py 5 | api/config.py 6 | 7 | # python venvs 8 | .venv 9 | venv 10 | 11 | # editor 12 | .idea/ 13 | 14 | # real data directories, don't upload to github ;-) 15 | modules/collectors/real_data 16 | real_data 17 | data/ 18 | VIPs.txt 19 | 20 | poetry.lock 21 | 22 | 23 | *.swp 24 | */*.swp 25 | 26 | 27 | # test caches from pytest 28 | .pytest_cache 29 | */.pytest_cache 30 | __pycache__/ 31 | */__pycache__/ 32 | old 33 | cache 34 | .coverage 35 | coverage.xml 36 | 37 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/credentialLeakDB.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 19 | -------------------------------------------------------------------------------- /.idea/dbnavigator.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 
| 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 14 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 0.6 * complete re-write. Refactor everything. 2 | * make a structure of collectors, parsers, enrichers, filters, output modules 3 | * re-wrote spycloud parsing completely 4 | * re-wrote the output / postgresql storing of the data 5 | * lots of unit tests . Brings coverage > 80% 6 | 7 | 0.5 * things sort of work, but feels buggy 8 | 9 | 0.4 * added endpoints for /leak and /leak_data 10 | * fixed the Answer format for all endpoints. 11 | * Added autocommit 12 | * Minor bugs. 13 | 14 | 0.3 * moved to public github.com/EC-DIGIT-CSIRC/credentialLeakDB.git 15 | * refactored code so that the Parser is now abstract, implement basic Spycloud parser 16 | * refactored DB insert 17 | * first API version, though still lots of bugs 18 | 0.2 moved to an internal proj. 
in github after OK from @ddurvaux 19 | 0.1 initial import 20 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Please create a fork of this project, then make your changes and then send a 2 | pull request (merge request in gitlab's lingo). -------------------------------------------------------------------------------- /EER.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/EER.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # credentialleakDB 2 | 3 | [![Pylint](https://github.com/EC-DIGIT-CSIRC/credentialLeakDB/actions/workflows/pylint.yml/badge.svg)](https://github.com/EC-DIGIT-CSIRC/credentialLeakDB/actions/workflows/pylint.yml) 4 | [![flak8 and pytest](https://github.com/EC-DIGIT-CSIRC/credentialLeakDB/actions/workflows/python-app.yml/badge.svg)](https://github.com/EC-DIGIT-CSIRC/credentialLeakDB/actions/workflows/python-app.yml) 5 | [![CodeQL](https://github.com/EC-DIGIT-CSIRC/credentialLeakDB/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/EC-DIGIT-CSIRC/credentialLeakDB/actions/workflows/codeql-analysis.yml) 6 | [![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=digits2_credentialLeakDB&metric=alert_status&token=cee9c8232570fa1000ab4770feb571fd3e85ff39)](https://sonarcloud.io/dashboard?id=digits2_credentialLeakDB) 7 | [![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=digits2_credentialLeakDB&metric=sqale_rating&token=cee9c8232570fa1000ab4770feb571fd3e85ff39)](https://sonarcloud.io/dashboard?id=digits2_credentialLeakDB) 8 | [![Reliability 
Rating](https://sonarcloud.io/api/project_badges/measure?project=digits2_credentialLeakDB&metric=reliability_rating&token=cee9c8232570fa1000ab4770feb571fd3e85ff39)](https://sonarcloud.io/dashboard?id=digits2_credentialLeakDB) 9 | [![Security Rating](https://sonarcloud.io/api/project_badges/measure?project=digits2_credentialLeakDB&metric=security_rating&token=cee9c8232570fa1000ab4770feb571fd3e85ff39)](https://sonarcloud.io/dashboard?id=digits2_credentialLeakDB) 10 | [![codecov](https://codecov.io/gh/EC-DIGIT-CSIRC/credentialLeakDB/branch/main/graph/badge.svg?token=SS5F8EXQON)](https://codecov.io/gh/EC-DIGIT-CSIRC/credentialLeakDB) 11 | 12 | 13 | A database structure to store leaked credentials. 14 | 15 | Think: our own, internal [HaveIBeenPwned](https://haveibeenpwned.com/) database. 16 | 17 | ## Why? 18 | 19 | 1. To quickly find duplicates before sending it on to further process the data 20 | 2. To have a way to load diverse credential breaches into a common structure and do common queries on it 21 | 3. To quickly generate statistics on credential leaks 22 | 4. To have a well defined interface to pass on data to pass it on to other automation steps 23 | 24 | ## Documentation 25 | 26 | ### Installation 27 | 28 | #### Docker 29 | 30 | #### Via pip and venv 31 | 32 | ```bash 33 | git clone https://github.com/EC-DIGIT-CSIRC/credentialLeakDB.git 34 | cd credentialLeakDB 35 | # create a virtualenv 36 | virtualenv --python=python3.7 venv 37 | source venv/bin/activate 38 | pip install -r requirements.txt 39 | ``` 40 | 41 | Next, make sure the following files exist: 42 | * ``VIPs.txt`` ... a \n separated list of email addresses which you would consider VIPs. 43 | * api/config.py ... see below 44 | 45 | ### Database structure 46 | Search in Confluence for "credentialLeakDB" in the Automation space. 47 | 48 | SQL structure: [db.sql](db.sql) 49 | 50 | The EER diagram __intentionally__ got simplified a lot. 
If we are going to store billions of repeated ``text`` datatype records, we can 51 | go back to more normalization. For now, however, this seems to be enough. 52 | 53 | 54 | ![EER Diagram](EER.png) 55 | 56 | 57 | 58 | ### Meaning of the fields 59 | 60 | #### Table ``leak`` 61 | 62 | | Column | Type | Collation | Nullable | Description | 63 | |------------------ | ------------------------ | --------- | -------- | ----------------------------------------------------------------------------------------------------------------- | 64 | | ``id`` | integer | | not null | _primary key. Auto-generated_. | 65 | | ``breach_ts`` | timestamp with time zone | | | If known, the timestamp when the breach happened. | 66 | | ``source_publish_ts`` | timestamp with time zone | | | The timestamp according when the source (f.ex. Spycloud) published the data. | 67 | | ``ingestion_ts`` | timestamp with time zone | | not null | The timestamp when we ingested the data. | 68 | | ``summary`` | text | | not null | A short summary (slug) of the leak. Used for displaying it somewhere | 69 | | ``ticket_id`` | text | | | | 70 | | ``reporter_name`` | text | | | The name of the reporter where we got the notification from. E.g. CERT-eu, Spycloud, etc... Who sent us the data? | 71 | | ``source_name`` | text | | | The name of the source where this leak came from. Either the name of a collection or some other name. | 72 | 73 | ``` 74 | Indexes: 75 | "leak_pkey" PRIMARY KEY, btree (id) 76 | Referenced by: 77 | TABLE "leak_data" CONSTRAINT "leak_data_leak_id_fkey" FOREIGN KEY (leak_id) REFERENCES leak(id) 78 | ``` 79 | 80 | #### Table ``leak_data`` 81 | 82 | | Column | Type | Collation | Nullable | Description 83 | --------------------- | ------- | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- 84 | ``id`` | integer | | not null | _primary key, auto-generated_. 
| 85 | ``leak_id`` | integer | | not null | references a ``leak(id)`` | 86 | ``email`` | text | | not null | The email address associated with the leak. | 87 | ``password`` | text | | not null | Either the encrypted or unencrypted password. If the unencrypted password is available, that is what is going to be in this field. | 88 | ``password_plain`` | text | | | The plaintext password, if known. | 89 | ``password_hashed`` | text | | | The hashed password, if known. | 90 | ``hash_algo`` | text | | | If we can determine the hashing algo and the password_hashed field is set, for example "md5" or "sha1" | 91 | ``ticket_id`` | text | | | References the ticket systems' ticket ID associated with handling this credential leak . This ticket could contain infos on how we contacted the affected user. | 92 | ``email_verified`` | boolean | | | If the email address was verified if it does exist and is active | 93 | ``password_verified_ok`` | boolean | | | Was that password still valid / active? | 94 | ``ip`` | inet | | | IP address of the client PC in case of a password stealer. | 95 | ``domain`` | text | | | Domain address of the user's email address. | 96 | ``browser`` | text | | | If the password was leaked via a password stealer malware, then the browser of the user goes here. Otherwise empty. | 97 | ``malware_name`` | text | | | If the password was leaked via a password stealer malware, then the malware name goes here. Otherwise empty. | 98 | ``infected_machine`` | text | | | If the password was leaked via a password stealer malware, then the infected (Windows) PC name (some ID for the machine) goes here. | 99 | ``dg`` | text | | not null | The affected DG (in other organisations, this would be called "department") 100 | ``count_seen`` | integer | | | How often did we already see this unique combination (leak, email, password, domain). I.e. this is a duplicate counter. 
| 101 | 102 | ``` 103 | Indexes: 104 | "leak_data_pkey" PRIMARY KEY, btree (id) 105 | "constr_unique_leak_data_leak_id_email_password_domain" UNIQUE CONSTRAINT, btree (leak_id, email, password, domain) 106 | "idx_leak_data_unique_leak_id_email_password_domain" UNIQUE, btree (leak_id, email, password, domain) 107 | "idx_leak_data_dg" btree (dg) 108 | "idx_leak_data_email" btree (upper(email)) 109 | "idx_leak_data_email_password_machine" btree (email, password, infected_machine) 110 | "idx_leak_data_malware_name" btree (malware_name) 111 | Foreign-key constraints: 112 | "leak_data_leak_id_fkey" FOREIGN KEY (leak_id) REFERENCES leak(id) 113 | ``` 114 | 115 | 116 | # Usage of the API 117 | 118 | Here is how to use the API endpoints: you can start the server (follow the instructions below) and go to ``$servername/docs`` where $servername is of course the domain / IP address you installed it under. The ``docs/`` endpoint hosts a swagger / OpenAPI 3 119 | 120 | ## GET parameters 121 | 122 | These are pretty self-explanatory thanks to the swagger UI. 123 | 124 | ## POST and PUT 125 | 126 | For HTTP POST (a.k.a INSERT into DB) you will need to provide the following JSON info: 127 | 128 | ### leak object 129 | ```json 130 | { 131 | "id": 0, 132 | "ticket_id": "string", 133 | "summary": "string", 134 | "reporter_name": "string", 135 | "source_name": "string", 136 | "breach_ts": "2021-03-29T12:21:56.370Z", 137 | "source_publish_ts": "2021-03-29T12:21:56.370Z" 138 | } 139 | 140 | ``` 141 | 142 | The ``id`` field *only* needs to be filled out when PUTing data there (a.k.a UPDATE statement). Otherwise please leave it out when POSTing a new leak_data row. 143 | The id is the internal automatically generated primary key (ID) and will be assigned. So when you use the ``HTTP POST /leak`` endpoint, please leave out ``id``. 
The answer will be a JSON array with a dict with the id inside, such as: 144 | 145 | ```json 146 | { 147 | "meta": { 148 | "version": "0.5", 149 | "duration": 0.006, 150 | "count": 1 151 | }, 152 | "data": [ 153 | { 154 | "id": 18 155 | } 156 | ], 157 | "error": null 158 | } 159 | ``` 160 | 161 | Meaning: the version of the API was 0.5, the query duration was 0.006 sec (6 millisec), one answer. The ``data`` array contains one element: id=18. Meaning, the ID of the inserted leak object was 18. You can now reference this in the leak_data object insertion. 162 | 163 | ### leak_data object 164 | 165 | Same as the leak object, here the ``id`` field *only* needs to be filled out when PUTing data there (a.k.a UPDATE statement). Otherwise please leave it out when POSTing a new leak_data row. **Note well**: the leak_id field needs to be filled out in this case. You **first** have to create leak object and then afterwards the leak_data object. 166 | 167 | ```json 168 | { 169 | "id": 0, 170 | "leak_id": 0, 171 | "email": "user@example.com", 172 | "password": "string", 173 | "password_plain": "string", 174 | "password_hashed": "string", 175 | "hash_algo": "string", 176 | "ticket_id": "string", 177 | "email_verified": true, 178 | "password_verified_ok": true, 179 | "ip": "string", 180 | "domain": "string", 181 | "browser": "string", 182 | "malware_name": "string", 183 | "infected_machine": "string", 184 | "dg": "string" 185 | } 186 | ``` 187 | 188 | ## ``import/csv/`` endpoint 189 | 190 | Also pretty self-explanatory. You need to first create a leak object, give it's ID as a GET-style parameter and upload the CSV in spycloud format via the Form. 191 | 192 | 193 | ## Installation 194 | 195 | 1. Install git and checkout this repository: 196 | ```bash 197 | apt install git 198 | git clone ... 199 | cd credentialLeakDB 200 | ``` 201 | 202 | 3. 
Install Postgresql: 203 | ```bash 204 | # in Ubuntu: 205 | apt install postgresql-12 206 | # alternatively, if you are in Debian 10, you can also use postgresql-11, both work: 207 | # apt install postgresql-11 208 | ``` 209 | 210 | 2. as user postgres: 211 | ```bash 212 | sudo su - postgres 213 | createdb credentialleakdb 214 | createuser credentialleakdb 215 | psql -c "ALTER ROLE credentialleakdb WITH PASSWORD ''" template1 216 | ``` 217 | 218 | 3. create the DB: 219 | ```psql -u credentialleakdb credentialleakdb < db.sql``` 220 | 221 | 5. set the env vars: 222 | ```bash 223 | export PORT=8080 224 | export DBNAME=credentialleakdb 225 | export DBUSER=credentialleakdb 226 | export DBPASSWORD=... ... 227 | export DBHOST=localhost 228 | ``` 229 | 5. Create a virtual environment if it does not exist yet: 230 | ```bash 231 | virtualenv --python=python3.7 venv 232 | source venv/bin/activate 233 | pip install -r requirements.txt 234 | ``` 235 | 5. start the program from the main directory: 236 | ```bash 237 | export PYTHONPATH=$(pwd); uvicorn --reload --host 0.0.0.0 --port $PORT api.main:app 238 | ``` 239 | 240 | ## Configuration. 241 | 242 | Please copy the file ``config.SAMPLE.py`` to ``api/config.py`` and adjust accordingly. 243 | Here you can set API keys etc. 244 | 245 | 246 | 247 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | All bug reports should please go to ec-digit-csirc@ec.europa.eu. Thanks. 6 | Pull requests welcome! 
7 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/__init__.py -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/api/__init__.py -------------------------------------------------------------------------------- /api/enrichment.py: -------------------------------------------------------------------------------- 1 | """ 2 | Enrichment code 3 | 4 | Author: Aaron Kaplan 5 | License: see LICENSE 6 | 7 | This basically just pulls in the enricher classes. 8 | 9 | """ 10 | from modules.enrichers.ldap_lib import CEDQuery 11 | from modules.enrichers.ldap import LDAPEnricher 12 | from modules.enrichers.vip import VIPEnricher 13 | from modules.enrichers.external_email import ExternalEmailEnricher 14 | -------------------------------------------------------------------------------- /api/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | FastAPI based API on the credentialLeakDB 3 | 4 | Author: Aaron Kaplan 5 | License: see LICENSE 6 | 7 | """ 8 | 9 | # system / base packages 10 | from lib.helpers import getlogger, anonymize_password 11 | import os 12 | import shutil 13 | import time 14 | from pathlib import Path 15 | from tempfile import SpooledTemporaryFile 16 | from typing import List 17 | 18 | # database, ASGI, etc. 
import pandas as pd
import psycopg2
import psycopg2.extras
import uvicorn
from fastapi import FastAPI, HTTPException, File, UploadFile, Depends, Security, Response
from fastapi.security.api_key import APIKeyHeader, APIKey, Request
from pydantic import EmailStr

# packages from this code repo
from api.config import config
from lib.db.db import _get_db, _close_db, _connect_db, DSN
from models.idf import InternalDataFormat
from models.outdf import Leak, LeakData, Answer, AnswerMeta
from modules.collectors.parser import BaseParser  # XXX FIXME: this should be in lib, no? Or called "genericparser"
from modules.collectors.spycloud.collector import SpyCloudCollector
from modules.enrichers.abuse_contact import AbuseContactLookup
from modules.enrichers.external_email import ExternalEmailEnricher
from modules.enrichers.ldap import LDAPEnricher
from modules.enrichers.vip import VIPEnricher
from modules.filters.deduper import Deduper
from modules.filters.filter import Filter
from modules.output.db import PostgresqlOutput
from modules.parsers.spycloud import SpyCloudParser

###############################################################################
# API key stuff
API_KEYLEN = 32  # expected length (characters) of an API key
API_KEY_NAME = "x-api-key"  # name of the HTTP header that carries the API key
api_key_header = APIKeyHeader(name = API_KEY_NAME, auto_error = True)

VER = "0.6"  # API version string, reported in every AnswerMeta

logger = getlogger(__name__)

app = FastAPI(title = "CredentialLeakDB", version = VER, )  # root_path='/api/v1')


# ##############################################################################
# DB specific functions
@app.on_event('startup')
def get_db():
    """Return the DB connection from lib.db.db; also runs once as the FastAPI startup hook.

    NOTE(review): endpoints call this directly per-request; _get_db() presumably
    caches/reuses the connection -- confirm in lib/db/db.py.
    """
    return _get_db()


@app.on_event('shutdown')
def close_db():
    """Close the DB connection on application shutdown."""
    return _close_db()
def fetch_valid_api_keys() -> List[str]:
    """Fetch the list of valid API keys from a DB or a config file.

    :returns: List of strings - the API keys
    """
    return config['api_keys']


def is_valid_api_key(key: str) -> bool:
    """
    Validate a given key against the list of allowed API keys.

    :param key: the API key
    :returns: boolean: YES/NO
    """
    # NOTE(review): an additional source-IP allow-list (request.client.host) was
    # sketched here previously; re-introduce it as a parameter if ever needed.
    return key in fetch_valid_api_keys()


def validate_api_key_header(apikeyheader: str = Security(api_key_header)):
    """
    Validate if a given API key is present in the HTTP apikeyheader.

    :param apikeyheader: the required HTTP Header
    :returns: the apikey apikeyheader again, if it is valid. Otherwise, raise an HTTPException and return 403.
    """
    if not apikeyheader:
        raise HTTPException(status_code = 403,
                            detail = """need API key. Please get in contact with the admins of this
                            site in order get your API key.""")
    if is_valid_api_key(apikeyheader):
        return apikeyheader
    else:
        raise HTTPException(
            status_code = 403,  # HTTP FORBIDDEN
            detail = """Could not validate the provided credentials. Please get in contact with the admins of this
            site in order get your API key."""
        )


# ##############################################################################
# File uploading
async def store_file(orig_filename: str, _file: SpooledTemporaryFile,
                     upload_path=os.getenv('UPLOAD_PATH', default = '/tmp')) -> str:
    """
    Stores a SpooledTemporaryFile to a permanent location and returns the path to it

    :param orig_filename: the filename according to multipart
    :param _file: the SpooledTemporary File
    :param upload_path: where the uploaded file should be stored permanently
    :returns: full path to the stored file
    """
    # SECURITY FIX: orig_filename comes straight from the client's multipart
    # header. Keep only the basename so a crafted name such as
    # "../../etc/cron.d/evil" cannot escape upload_path (path traversal).
    safe_name = Path(orig_filename).name
    # Unfortunately we need to really shutil.copyfileobj() the file object to disk, even though we already have a
    # SpooledTemporaryFile object... this is needed for SpooledTemporaryFiles . Sucks. See here:
    # https://stackoverflow.com/questions/94153/how-do-i-persist-to-disk-a-temporary-file-using-python
    #
    # filepath syntax: <upload_path>/<orig_filename basename>
    # example: /tmp/Spycloud.csv
    path = "{}/{}".format(upload_path, safe_name)
    logger.info("storing %s ... to %s" % (orig_filename, path))
    _file.seek(0)
    with open(path, "w+b") as outfile:
        shutil.copyfileobj(_file._file, outfile)
    return path


async def check_file(filename: str) -> bool:
    """Validate an uploaded file before ingestion. Currently always accepts."""
    return True  # XXX FIXME Implement


# ====================================================
# API endpoints

@app.get("/ping",
         name = "Ping test",
         summary = "Run a ping test, to check if the service is running",
         tags = ["Tests"])
async def ping():
    """A simple ping / liveliness test endpoint. No API Key required."""
    return {"message": "pong"}
def _select_answer(sql: str, params: tuple, response: Response, notfound_404: bool = True) -> Answer:
    """Run a parameterized SELECT and wrap the result in an Answer object.

    Shared boilerplate of all read-only endpoints: time the query, run it with a
    RealDictCursor, optionally set HTTP 404 when no rows were found, and report
    any exception as Answer(success=False, ...).

    :param sql: the SELECT statement with %s placeholders
    :param params: the query parameters (empty tuple for none)
    :param response: the FastAPI Response (used to set the 404 status code)
    :param notfound_404: set HTTP 404 when the result set is empty
    :returns: an Answer object with meta info and the result rows (or [] on error)
    """
    t0 = time.time()
    db = get_db()
    try:
        # context manager closes the cursor even on exceptions
        with db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cur:
            cur.execute(sql, params)
            rows = cur.fetchall()
        if notfound_404 and len(rows) == 0:  # return 404 in case no data was found
            response.status_code = 404
        d = round(time.time() - t0, 3)
        return Answer(success = True, errormsg = None,
                      meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows)
    except Exception as ex:
        return Answer(success = False, errormsg = str(ex), data = [])


@app.get("/timeout_test",
         name = "A simple timeout test",
         summary = "Call this and the GET request will sleep for 5 seconds",
         tags = ["Tests"])
async def timeout_test():
    """A simple timeout/ liveliness test endpoint. No API Key required."""
    time.sleep(5)
    return {"message": "OK"}


@app.get("/", tags = ["Tests"])
async def root(api_key: APIKey = Depends(validate_api_key_header)):
    """A simple hello world endpoint. This one requires an API key."""
    return {"message": "Hello World"}  # , "root_path": request.scope.get("root_path")}


# ##############################################################################
# General API endpoints


@app.get('/user/{email}',
         tags = ["General queries"],
         status_code = 200,
         response_model = Answer)
async def get_user_by_email(email: EmailStr,
                            response: Response,
                            api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Get all credential leaks in the DB of a given user specified by his email address.

    # Parameters
    * email: string. The email address of the user (case insensitive).

    # Returns
    * A JSON Answer object with rows being an array of answers, or [] in case there was no data in the DB
    """
    sql = """SELECT * from leak_data where upper(email)=upper(%s)"""
    return _select_answer(sql, (email,), response)


@app.get('/user_and_password/{email}/{password}',
         tags = ["General queries"],
         status_code = 200,
         response_model = Answer)
async def get_user_by_email_and_password(email: EmailStr,
                                         password: str,
                                         response: Response,
                                         api_key: APIKey = Depends(validate_api_key_header)
                                         ) -> Answer:
    """
    Get all credential leaks in the DB of a given user given by the combination email + password.
    Note that both email and password must match (where email is case insensitive, the password *is case sensitive*).

    # Parameters
    * email: string. The email address of the user (**case insensitive**, since email is usually case insensitive).
    * password: string. The (hashed or plaintext) password (**note: this is case sensitive**)

    # Returns
    * A JSON Answer object with rows being an array of answers, or [] in case there was no data in the DB

    # Example
    ``foo@example.com`` and ``12345`` -->

    ``{ "meta": { ... }, "data": [ { "id": 14, "leak_id": 1, "email": "aaron@example.com", "password": "12345", ..., ],
    "errormsg": null }``

    """
    sql = """SELECT * from leak_data where upper(email)=upper(%s) and password=%s"""
    return _select_answer(sql, (email, password), response)


@app.get('/exists/by_email/{email}',
         tags = ["General queries"],
         status_code = 200,
         response_model = Answer)
async def check_user_by_email(email: EmailStr,
                              response: Response,
                              api_key: APIKey = Depends(validate_api_key_header)
                              ) -> Answer:
    """
    Check if a certain email address was present in any leak.

    # Parameters
    * email: string. The email address of the user (**case insensitive**, since email is usually case insensitive).

    # Returns
    * A JSON Answer object with rows being an array of answers, or [] in case there was no data in the DB

    # Example
    ``foo@example.com`` -->
    ``{ "meta": { "version": "0.5", "duration": 0.002, "count": 1 }, "data": [ { "count": 1 } ], "success": true,
    "errormsg": null }``
    """
    # count(*) always yields one row, so never map "empty" to HTTP 404 here
    sql = """SELECT count(*) from leak_data where upper(email)=upper(%s)"""
    return _select_answer(sql, (email,), response, notfound_404 = False)


@app.get('/exists/by_password/{password}',
         tags = ["General queries"],
         status_code = 200,
         response_model = Answer)
async def check_user_by_password(password: str,
                                 response: Response,
                                 api_key: APIKey = Depends(validate_api_key_header)
                                 ) -> Answer:
    """
    Check if a user exists with the given password (either plaintext or hashed) in the DB. If so, return the user.

    # Parameters
    * password: string. The password to be searched.

    # Returns
    * A JSON Answer object with rows being an array of answers, or [] in case there was no data in the DB

    # Example
    ``12345`` -->
    ``{ "meta": { ... }, "data": [ { "id": 14, "leak_id": 1, "email": "aaron@example.com", "password": "12345",
    ..., ], "errormsg": null }``
    """
    # can do better... use the hashid library?
    sql = """SELECT count(*) from leak_data where password=%s or password_plain=%s or password_hashed=%s"""
    return _select_answer(sql, (password, password, password), response, notfound_404 = False)


@app.get('/exists/by_domain/{domain}',
         tags = ["General queries"],
         status_code = 200,
         response_model = Answer)
async def check_by_domain(domain: str,
                          response: Response,
                          api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Check if a given domain appears in some leak.

    # Parameters
    * domain : string. The domain to search for (case insensitive).

    # Returns:
    A JSON Answer object with the count of occurrences in the data: field.
    """
    sql = """SELECT count(*) from leak_data where upper(domain)=upper(%s)"""
    return _select_answer(sql, (domain,), response, notfound_404 = False)


# ##############################################################################
# Reference data (reporter, source, etc) starts here
@app.get('/reporter',
         tags = ["Reference data"],
         status_code = 200,
         response_model = Answer)
async def get_reporters(response: Response,
                        api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Get all reporter_name entries (sorted, unique).

    # Parameters

    # Returns
    * A JSON Answer object with data containing an array of answers, or [] in case there was no data in the DB
    """
    sql = """SELECT distinct(reporter_name) from leak ORDER by reporter_name asc"""
    return _select_answer(sql, (), response)


@app.get('/source_name',
         tags = ["Reference data"],
         status_code = 200,
         response_model = Answer)
async def get_sources(response: Response,
                      api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Get all names of sources of leaks (sorted, unique) - i.e. "SpyCloud", "HaveIBeenPwned", etc..

    # Parameters

    # Returns
    * A JSON Answer object with data containing an array of answers, or [] in case there was no data in the DB
    """
    sql = """SELECT distinct(source_name) from leak ORDER by source_name asc"""
    return _select_answer(sql, (), response)


# ##############################################################################
# Leak table starts here

@app.get("/leak/all",
         tags = ["Leak"],
         status_code = 200,
         response_model = Answer)
async def get_all_leaks(response: Response,
                        api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """Fetch all leaks.

    # Parameters

    # Returns
    * A JSON Answer object with all leak (i.e. meta-data of leaks) data from the `leak` table.
    """
    return _select_answer("SELECT * from leak", (), response)
445 | """ 446 | 447 | t0 = time.time() 448 | sql = "SELECT * from leak" 449 | db = get_db() 450 | try: 451 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 452 | cur.execute(sql) 453 | rows = cur.fetchall() 454 | if len(rows) == 0: # return 404 in case no data was found 455 | response.status_code = 404 456 | t1 = time.time() 457 | d = round(t1 - t0, 3) 458 | return Answer(success = True, errormsg = None, 459 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 460 | except Exception as ex: 461 | return Answer(success = False, errormsg = str(ex), data = []) 462 | 463 | 464 | @app.get("/leak/by_ticket_id/{ticket_id}", 465 | tags = ["Leak"], 466 | status_code = 200, 467 | response_model = Answer) 468 | async def get_leak_by_ticket_id(ticket_id: str, 469 | response: Response, 470 | api_key: APIKey = Depends(validate_api_key_header) 471 | ) -> Answer: 472 | """Fetch a leak by its ticket system id""" 473 | t0 = time.time() 474 | sql = "SELECT * from leak WHERE ticket_id = %s" 475 | db = get_db() 476 | try: 477 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 478 | cur.execute(sql, (ticket_id,)) 479 | rows = cur.fetchall() 480 | if len(rows) == 0: # return 404 in case no data was found 481 | response.status_code = 404 482 | t1 = time.time() 483 | d = round(t1 - t0, 3) 484 | return Answer(success = True, errormsg = None, 485 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 486 | except Exception as ex: 487 | return Answer(success = False, errormsg = str(ex), data = []) 488 | 489 | 490 | @app.get("/leak/by_summary/{summary}", 491 | tags = ["Leak"], 492 | status_code = 200, 493 | response_model = Answer) 494 | async def get_leak_by_summary(summary: str, 495 | response: Response, 496 | api_key: APIKey = Depends(validate_api_key_header) 497 | ) -> Answer: 498 | """Fetch a leak by summary""" 499 | sql = "SELECT * from leak WHERE summary = %s" 500 | t0 = time.time() 501 | db = 
get_db() 502 | try: 503 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 504 | cur.execute(sql, (summary,)) 505 | rows = cur.fetchall() 506 | if len(rows) == 0: # return 404 in case no data was found 507 | response.status_code = 404 508 | t1 = time.time() 509 | d = round(t1 - t0, 3) 510 | return Answer(success = True, errormsg = None, 511 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 512 | except Exception as ex: 513 | return Answer(success = False, errormsg = str(ex), data = []) 514 | 515 | 516 | @app.get("/leak/by_reporter/{reporter}", 517 | tags = ["Leak"], 518 | status_code = 200, 519 | response_model = Answer) 520 | async def get_leak_by_reporter(reporter: str, 521 | response: Response, 522 | api_key: APIKey = Depends(validate_api_key_header) 523 | ) -> Answer: 524 | """Fetch a leak by its reporter. """ 525 | sql = "SELECT * from leak WHERE reporter_name = %s" 526 | t0 = time.time() 527 | db = get_db() 528 | try: 529 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 530 | cur.execute(sql, (reporter,)) 531 | rows = cur.fetchall() 532 | if len(rows) == 0: # return 404 in case no data was found 533 | response.status_code = 404 534 | t1 = time.time() 535 | d = round(t1 - t0, 3) 536 | return Answer(success = True, errormsg = None, 537 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 538 | except Exception as ex: 539 | return Answer(success = False, errormsg = str(ex), data = []) 540 | 541 | 542 | @app.get("/leak/by_source/{source_name}", 543 | tags = ["Leak"], 544 | status_code = 200, 545 | response_model = Answer) 546 | async def get_leak_by_source(source_name: str, 547 | response: Response, 548 | api_key: APIKey = Depends(validate_api_key_header) 549 | ) -> Answer: 550 | """Fetch all leaks by their source (i.e. *who* collected the leak data (spycloud, HaveIBeenPwned, etc.). 551 | 552 | # Parameters 553 | * source_name: string. 
The name of the source (case insensitive). 554 | 555 | # Returns 556 | * a JSON Answer object with all leaks for that given source_name. 557 | """ 558 | 559 | sql = "SELECT * from leak WHERE upper(source_name) = upper(%s)" 560 | t0 = time.time() 561 | db = get_db() 562 | try: 563 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 564 | cur.execute(sql, (source_name,)) 565 | rows = cur.fetchall() 566 | if len(rows) == 0: # return 404 in case no data was found 567 | response.status_code = 404 568 | t1 = time.time() 569 | d = round(t1 - t0, 3) 570 | return Answer(success = True, errormsg = None, 571 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 572 | except Exception as ex: 573 | return Answer(success = False, errormsg = str(ex), data = []) 574 | 575 | 576 | @app.get("/leak/{_id}", tags = ["Leak"], 577 | description = 'Get the leak info by its ID.', 578 | status_code = 200, 579 | response_model = Answer) 580 | async def get_leak_by_id(_id: int, 581 | response: Response, 582 | api_key: APIKey = Depends(validate_api_key_header) 583 | ) -> Answer: 584 | """Fetch a leak by its ID""" 585 | t0 = time.time() 586 | sql = "SELECT * from leak WHERE id = %s" 587 | db = get_db() 588 | try: 589 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 590 | cur.execute(sql, (_id,)) 591 | rows = cur.fetchall() 592 | if len(rows) == 0: # return 404 in case no data was found 593 | response.status_code = 404 594 | t1 = time.time() 595 | d = round(t1 - t0, 3) 596 | return Answer(success = True, errormsg = None, 597 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 598 | except Exception as ex: 599 | return Answer(success = False, errormsg = str(ex), data = []) 600 | 601 | 602 | @app.post("/leak/", 603 | tags = ["Leak"], 604 | description = "INSERT a new leak into the DB", 605 | status_code = 201, 606 | response_model = Answer) 607 | async def new_leak(leak: Leak, 608 | response: Response, 609 | 
api_key: APIKey = Depends(validate_api_key_header) 610 | ) -> Answer: 611 | """ 612 | INSERT a new leak into the leak table in the database. 613 | 614 | # Parameters 615 | * leak: a Leak object. Note that all fields must be set, except for leak.id 616 | # Returns 617 | * a JSON Answer object with the leak_id in the data: field 618 | 619 | """ 620 | sql = """INSERT into leak 621 | (summary, ticket_id, reporter_name, source_name, breach_ts, source_publish_ts, ingestion_ts) 622 | VALUES (%s, %s, %s, %s, %s, %s, now()) 623 | RETURNING id 624 | """ 625 | t0 = time.time() 626 | db = get_db() 627 | try: 628 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 629 | cur.execute(sql, (leak.summary, leak.ticket_id, leak.reporter_name, leak.source_name, leak.breach_ts, 630 | leak.source_publish_ts,)) 631 | rows = cur.fetchall() 632 | if len(rows) == 0: # return 400 in case the INSERT failed. 633 | response.status_code = 400 634 | t1 = time.time() 635 | d = round(t1 - t0, 3) 636 | return Answer(success = True, errormsg = None, 637 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 638 | except Exception as ex: 639 | return Answer(success = False, errormsg = str(ex), data = []) 640 | 641 | 642 | @app.put("/leak/", 643 | tags = ["Leak"], 644 | status_code = 200, 645 | response_model = Answer) 646 | async def update_leak(leak: Leak, 647 | response: Response, 648 | api_key: APIKey = Depends(validate_api_key_header) 649 | ) -> Answer: 650 | """ 651 | UPDATE an existing leak. 652 | 653 | # Parameters 654 | * leak: a Leak object. Note that all fields must be set in the Leak object. 655 | # Returns 656 | * a JSON Answer object with the ID of the updated leak. 
657 | """ 658 | sql = """UPDATE leak SET 659 | summary = %s, ticket_id = %s, reporter_name = %s, source_name = %s, 660 | breach_ts = %s, source_publish_ts = %s 661 | WHERE id = %s 662 | RETURNING id 663 | """ 664 | t0 = time.time() 665 | db = get_db() 666 | if not leak.id: 667 | return Answer(success = False, errormsg = "id %s not given. Please specify a leak.id you want to UPDATE", 668 | data = []) 669 | try: 670 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 671 | cur.execute(sql, (leak.summary, leak.ticket_id, leak.reporter_name, 672 | leak.source_name, leak.breach_ts, leak.source_publish_ts, leak.id)) 673 | rows = cur.fetchall() 674 | if len(rows) == 0: # return 400 in case the INSERT failed. 675 | response.status_code = 400 676 | t1 = time.time() 677 | d = round(t1 - t0, 3) 678 | return Answer(success = True, errormsg = None, 679 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 680 | except Exception as ex: 681 | return Answer(success = False, errormsg = str(ex), data = []) 682 | 683 | 684 | # ############################################################################################################ 685 | # Leak Data starts here 686 | 687 | @app.get("/leak_data/{leak_data_id}", 688 | tags = ["Leak Data"], 689 | status_code = 200, 690 | response_model = Answer) 691 | async def get_leak_data_by_id(leak_data_id: int, 692 | response: Response, 693 | api_key: APIKey = Depends(validate_api_key_header)) -> Answer: 694 | """ 695 | Fetch all leak data entries of a given id. 696 | 697 | # Parameters 698 | * leak_data_id: integer, the DB internal leak_data_id. 699 | 700 | # Returns 701 | * A JSON Answer object with the corresponding leak data (i.e. actual usernames, passwords) from the `leak_data` 702 | table which are contained within the specified leak (leak_data_id). 
703 | """ 704 | t0 = time.time() 705 | sql = "SELECT * from leak_data where id=%s" 706 | db = get_db() 707 | try: 708 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 709 | cur.execute(sql, (leak_data_id,)) 710 | rows = cur.fetchall() 711 | if len(rows) == 0: # return 404 in case no data was found 712 | response.status_code = 404 713 | t1 = time.time() 714 | d = round(t1 - t0, 3) 715 | return Answer(success = True, errormsg = None, 716 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 717 | except Exception as ex: 718 | return Answer(success = False, errormsg = str(ex), data = []) 719 | 720 | 721 | @app.get("/leak_data/by_ticket_id/{ticket_id}", 722 | tags = ["Leak Data"], 723 | status_code = 200, 724 | response_model = Answer) 725 | async def get_leak_data_by_ticket_id(ticket_id: str, 726 | response: Response, 727 | api_key: APIKey = Depends(validate_api_key_header) 728 | ) -> Answer: 729 | """Fetch a leak row (leak_data table) by its ticket system id 730 | 731 | # Parameters 732 | * ticket_id: string. The ticket system ID which references the leak_data row 733 | # Returns 734 | * a JSON Answer object with the leak data row or in data. 
735 | """ 736 | sql = "SELECT * from leak_data WHERE ticket_id = %s" 737 | t0 = time.time() 738 | db = get_db() 739 | try: 740 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 741 | cur.execute(sql, (ticket_id,)) 742 | rows = cur.fetchall() 743 | if len(rows) == 0: # return 404 in case no data was found 744 | response.status_code = 404 745 | t1 = time.time() 746 | d = round(t1 - t0, 3) 747 | return Answer(success = True, errormsg = None, 748 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 749 | except Exception as ex: 750 | return Answer(success = False, errormsg = str(ex), data = []) 751 | 752 | 753 | @app.post("/leak_data/", 754 | tags = ["Leak Data"], 755 | status_code = 201, 756 | response_model = Answer) 757 | async def new_leak_data(row: LeakData, 758 | response: Response, 759 | api_key: APIKey = Depends(validate_api_key_header) 760 | ) -> Answer: 761 | """ 762 | INSERT a new leak_data row into the leak_data table. 763 | 764 | # Parameters 765 | * row: a leakData object. If that data already exists, it will not be inserted again. 766 | # Returns 767 | * a JSON Answer object containing the ID of the inserted leak_data row. 
768 | """ 769 | sql = """INSERT into leak_data 770 | (leak_id, email, password, password_plain, password_hashed, hash_algo, ticket_id, 771 | email_verified, password_verified_ok, ip, domain, browser, malware_name, infected_machine, dg) 772 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 773 | ON CONFLICT ON CONSTRAINT constr_unique_leak_data_leak_id_email_password_domain DO UPDATE SET email=%s 774 | RETURNING id 775 | """ 776 | t0 = time.time() 777 | db = get_db() 778 | logger.debug(row) 779 | try: 780 | cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) 781 | cur.execute(sql, (row.leak_id, row.email, row.password, row.password_plain, row.password_hashed, row.hash_algo, 782 | row.ticket_id, row.email_verified, row.password_verified_ok, row.ip, row.domain, row.browser, 783 | row.malware_name, row.infected_machine, row.dg, row.email)) 784 | rows = cur.fetchall() 785 | if len(rows) == 0: # return 400 in case the INSERT failed. 786 | response.status_code = 400 787 | t1 = time.time() 788 | d = round(t1 - t0, 3) 789 | return Answer(success = True, errormsg = None, 790 | meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows) 791 | except Exception as ex: 792 | return Answer(success = False, errormsg = str(ex), data = []) 793 | 794 | 795 | @app.put("/leak_data/", 796 | tags = ["Leak Data"], 797 | status_code = 200, 798 | response_model = Answer) 799 | async def update_leak_data(row: LeakData, 800 | request: Request, 801 | response: Response, 802 | api_key: APIKey = Depends(validate_api_key_header) 803 | ) -> Answer: 804 | """ 805 | UPDATE leak_data row in the leak_data table. 806 | 807 | # Parameters 808 | * row : a leakData object with all the relevant information. Please note that you **have to** supply all fields, 809 | even if you do not plan to update them. In other words: you might have to GET / the leak_data object first. 
@app.put("/leak_data/",
         tags = ["Leak Data"],
         status_code = 200,
         response_model = Answer)
async def update_leak_data(row: LeakData,
                           request: Request,
                           response: Response,
                           api_key: APIKey = Depends(validate_api_key_header)
                           ) -> Answer:
    """
    UPDATE a leak_data row in the leak_data table.

    # Parameters
      * row: a LeakData object with all the relevant information. Please note that you **have to** supply
        all fields, even if you do not plan to update them. In other words: you might have to GET the
        leak_data object first.
    # Returns
      * a JSON Answer object containing the ID of the updated leak_data row
        (data is empty and status is 400 when no row matched row.id).
    """
    sql = """UPDATE leak_data SET
               leak_id = %s, email = %s, password = %s, password_plain = %s, password_hashed = %s,
               hash_algo = %s, ticket_id = %s, email_verified = %s, password_verified_ok = %s,
               ip = %s, domain = %s, browser = %s, malware_name = %s, infected_machine = %s, dg = %s
             WHERE id = %s
             RETURNING id
          """
    params = (row.leak_id, row.email, row.password, row.password_plain, row.password_hashed, row.hash_algo,
              row.ticket_id, row.email_verified, row.password_verified_ok, row.ip, row.domain, row.browser,
              row.malware_name, row.infected_machine, row.dg, row.id)
    t0 = time.time()
    db = get_db()
    try:
        with db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cur:
            logger.debug("HTTP request: '%r'", request)
            # SECURITY FIX: the previous version logged cur.mogrify(sql, params) -- i.e. the fully
            # rendered SQL statement *including plaintext passwords* -- at debug level. Removed.
            cur.execute(sql, params)
            rows = cur.fetchall()
        db.commit()
        if len(rows) == 0:  # no row with that id -> nothing was updated
            response.status_code = 400
        t1 = time.time()
        d = round(t1 - t0, 3)
        return Answer(success = True, errormsg = None,
                      meta = AnswerMeta(version = VER, duration = d, count = len(rows)), data = rows)
    except Exception as ex:
        return Answer(success = False, errormsg = str(ex), data = [])


# ############################################################################################################
# CSV file importing

def enrich(item: InternalDataFormat, leak_id: str) -> InternalDataFormat:
    """Initial enricher chain. This SHOULD be configurable and a pipeline via a MQ.

    Runs the item through all enrichers, filling in any field that is not already set:
    VIP status, DG, active-account flag, external-user flag, credential type and abuse contact.
    On success marks the item as notify-able and clears any error state.
    """
    # attach the item to its leak
    item.leak_id = leak_id

    # VIP status
    if not item.is_vip:
        item.is_vip = VIPEnricher().is_vip(item.email)

    # DG: map the email address to its Directorate General; fall back to "Unknown"
    ldap_enricher = LDAPEnricher()
    if not item.dg:
        item.dg = ldap_enricher.email_to_dg(item.email) or "Unknown"

    # Active account or outdated?
    if not item.is_active_account:
        item.is_active_account = ldap_enricher.exists(item.email)

    # External address or internal?
    if not item.external_user:
        item.external_user = ExternalEmailEnricher().is_external_email(item.email)

    # credential type
    if not item.credential_type:
        item.credential_type = ["EU Login"]  # XXX FIXME! This is mock-up data!

    # Abuse contact / report to
    if not item.report_to:
        item.report_to = AbuseContactLookup().lookup(item.email)

    # all is good, we went through the pipeline
    item.notify = True
    item.needs_human_intervention = False
    item.error_msg = None
    return item
907 | """ 908 | # XXX FIXME!! need to implement / refactor existing code. 909 | # convert the idf to the DB row 910 | 911 | return idf 912 | 913 | 914 | def convert_to_output(idf: InternalDataFormat) -> LeakData: 915 | """Convert the internal data format to the output data format. 916 | 917 | ":returns LeakData 918 | """ 919 | output_data_entry = LeakData(**idf.dict()) # here the validation pydantic magic happens 920 | return output_data_entry 921 | 922 | 923 | @app.post("/import/csv/spycloud/{parent_ticket_id}", 924 | tags = ["CSV import"], 925 | status_code = 200, 926 | response_model = Answer) 927 | async def import_csv_spycloud(parent_ticket_id: str, 928 | response: Response, 929 | summary: str = None, 930 | _file: UploadFile = File(...), 931 | api_key: APIKey = Depends(validate_api_key_header)) -> Answer: 932 | """ 933 | Import a spycloud CSV file into the DB. Note that you do not need to specify a leak_id parameter here. 934 | The API will automatically create a leak object in the DB for you and link it. 935 | 936 | # Parameters 937 | * parent_ticket_id: a ticket ID which allows us to link the leak object to the ticket 938 | * summary: a summary string for the new leak object (if it's created) 939 | * _file: a file which must be uploaded via HTML forms/multipart. 940 | 941 | # Returns 942 | * a JSON Answer object where the data: field is the **deduplicated** CSV file (i.e. lines which were already 943 | imported as part of that leak (same username, same password, same domain) will not be returned. 944 | In other words, data: [] contains the rows from the CSV file which did not yet exist in the DB. 945 | """ 946 | 947 | t0 = time.time() 948 | 949 | if not parent_ticket_id: 950 | response.status_code = 400 951 | return Answer(success = False, 952 | errormsg = "Please specify a parent_ticket_id as a GET-style parameter in the URL. 
" 953 | "This is the parameter, needed to link the sub-issues against", data = []) 954 | if not summary: 955 | response.status_code = 400 956 | return Answer(success = False, 957 | errormsg = "Please specify a summary for the Leak object which needs to be created. ", data = []) 958 | 959 | # first check if the leak_id for that summary already exists and if it's already linked to the parent_ticket_id. 960 | sql = """SELECT id from leak where summary = %s and ticket_id=%s""" 961 | db = get_db() 962 | try: 963 | with db.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cur: 964 | logger.debug(cur.mogrify(sql, (summary, parent_ticket_id))) 965 | cur.execute(sql, (summary, parent_ticket_id)) 966 | rows = cur.fetchall() 967 | nr_results = len(rows) 968 | if nr_results >= 1: 969 | # take the first one 970 | leak_id = rows[0]['id'] 971 | logger.info("Found existing leak object: %s" % leak_id) 972 | else: 973 | # nothing found, create one 974 | source_name = "SpyCloud" 975 | leak = Leak(ticket_id = parent_ticket_id, summary = summary, source_name = source_name) 976 | answer = await new_leak(leak, response = response, api_key = api_key) 977 | logger.info("Did not find existing leak object, creating one") 978 | if answer.success: 979 | leak_id = int(answer.data[0]['id']) 980 | logger.info("Created with id %s" % leak_id) 981 | else: 982 | logger.error("Could not create leak object for spycloud CSV file") 983 | return Answer(success = False, errormsg = "could not create leak object", data = []) 984 | except Exception as ex: 985 | return Answer(success = False, errormsg = str(ex), data = []) 986 | 987 | # okay, we found the leak, let's insert the CSV 988 | # noinspection PyTypeChecker 989 | file_on_disk = await store_file(_file.filename, _file.file) 990 | await check_file(file_on_disk) # XXX FIXME. 
Additional checks on the dumped file still missing 991 | 992 | collector = SpyCloudCollector() 993 | status, df = collector.collect(Path(file_on_disk)) 994 | if status != "OK": 995 | return Answer(success = False, errormsg = "Could not read input CSV file", data = []) 996 | 997 | p = SpyCloudParser() 998 | try: 999 | items = p.parse(df) 1000 | except Exception as ex: 1001 | return Answer(success = False, errormsg = str(ex), data = []) 1002 | 1003 | deduper = Deduper() 1004 | db_output = PostgresqlOutput() 1005 | filter = Filter() 1006 | 1007 | data = [] 1008 | for item in items: # FIXME: this pipeline could be done nicer with functools and reduce 1009 | # send it through the complete pipeline 1010 | item = filter.filter(item) 1011 | email = item.email 1012 | password = anonymize_password(item.password) 1013 | if not item: 1014 | logger.info("skipping item (%s, %s), It got filtered out by the filter." % (email, password)) 1015 | continue 1016 | try: 1017 | item = deduper.dedup(item) 1018 | if not item: 1019 | logger.info("skipping item (%s, %s), since it already existed in the DB." % (email, password)) 1020 | continue # next item 1021 | except Exception as ex: 1022 | logger.error("Could not deduplicate item (%s, %s). Skipping this row. Reason: %s" % (email, password, str(ex))) 1023 | continue 1024 | try: 1025 | item = enrich(item, leak_id = leak_id) 1026 | item.leak_id = leak_id 1027 | except Exception as ex: 1028 | errmsg = "Could not enrich item (%s, %s). Skipping this row. 
Reason: %s" % (email, password, str(ex),) 1029 | logger.error(errmsg) 1030 | item.error_msg = errmsg 1031 | item.needs_human_intervention = True 1032 | item.notify = False 1033 | if item.external_user: 1034 | item.notify = False 1035 | # after all is finished, convert to output format and return the (deduped) row 1036 | # convert to output format: 1037 | out_item = convert_to_output(item) 1038 | logger.info(out_item) 1039 | 1040 | # and finally, store it in the DB 1041 | if not item.needs_human_intervention: 1042 | try: 1043 | db_output.process(out_item) 1044 | except Exception as ex: 1045 | errmsg = "Could not store row. Skipping this row. Reason: %s" % str(ex) 1046 | logger.error(errmsg) 1047 | out_item.error_msg = errmsg 1048 | out_item.needs_human_intervention = True 1049 | out_item.notify = False 1050 | 1051 | data.append(out_item) 1052 | # done! Emit all the output items with the header 1053 | t1 = time.time() 1054 | d = round(t1 - t0, 3) 1055 | return Answer(success = True, errormsg = None, 1056 | meta = AnswerMeta(version = VER, duration = d, count = len(data)), 1057 | data = data) 1058 | 1059 | 1060 | # noinspection PyTypeChecker 1061 | @app.post("/import/csv/by_leak/{leak_id}", 1062 | tags = ["CSV import"], 1063 | status_code = 200, 1064 | response_model = Answer) 1065 | async def import_csv_with_leak_id(leak_id: int, 1066 | response: Response, 1067 | _file: UploadFile = File(...), 1068 | api_key: APIKey = Depends(validate_api_key_header) 1069 | ) -> Answer: 1070 | """ 1071 | Import a CSV file into the DB. You **need** to specify a ?leak_id= parameter so that the CSV file may be 1072 | linked to a leak_id. Failure to provide a leak_id will result in the file not being imported into the DB. 1073 | 1074 | # Parameters 1075 | * leak_id : int. As a GET parameter. This allows the DB to link the leak data (CSV file) to the leak_id entry in 1076 | in the leak table. 1077 | * _file: a file which must be uploaded via HTML forms/multipart. 
# noinspection PyTypeChecker
@app.post("/import/csv/by_leak/{leak_id}",
          tags = ["CSV import"],
          status_code = 200,
          response_model = Answer)
async def import_csv_with_leak_id(leak_id: int,
                                  response: Response,
                                  _file: UploadFile = File(...),
                                  api_key: APIKey = Depends(validate_api_key_header)
                                  ) -> Answer:
    """
    Import a CSV file into the DB. You **need** to specify a ?leak_id= parameter so that the CSV file may be
    linked to a leak_id. Failure to provide a leak_id will result in the file not being imported into the DB.

    # Parameters
      * leak_id: int. As a GET parameter. This allows the DB to link the leak data (CSV file) to the
        leak_id entry in the leak table.
      * _file: a file which must be uploaded via HTML forms/multipart.

    # Returns
      * a JSON Answer object where the data: field is the **deduplicated** CSV file (i.e. lines which were
        already imported as part of that leak (same username, same password, same domain) will not be
        returned. In other words, data: [] contains the rows from the CSV file which did not yet exist
        in the DB.
    """
    t0 = time.time()

    if not leak_id:
        return Answer(success = False, errormsg = "Please specify a leak_id GET-style parameter in the URL",
                      data = [])

    # first check if the leak_id exists
    sql = """SELECT count(*) from leak where id = %s"""
    db = get_db()
    try:
        cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor)
        cur.execute(sql, (leak_id,))
        nr_results = int(cur.fetchone()['count'])
        if nr_results != 1:
            response.status_code = 404
            return Answer(success = False, errormsg = "Leak ID %s not found" % leak_id, data = [])
    except Exception as ex:
        return Answer(success = False, errormsg = str(ex), data = [])

    # okay, we found the leak, let's insert the CSV
    file_on_disk = await store_file(_file.filename, _file.file)
    await check_file(file_on_disk)  # XXX FIXME. Additional checks on the dumped file still missing

    p = BaseParser()
    try:
        df = p.parse_file(Path(file_on_disk), leak_id = leak_id)
    except Exception as ex:
        return Answer(success = False, errormsg = str(ex), data = [])

    df = p.normalize_data(df, leak_id = leak_id)
    # After normalization, the df columns are:
    #   leak_id, email, password, password_plain, password_hashed, hash_algo, ticket_id, email_verified,
    #   password_verified_ok, ip, domain, browser, malware_name, infected_machine, dg

    # PERF: the SQL string and the cursor are loop-invariant -- hoisted out of the per-row loop
    # (the previous version re-created both on every iteration).
    sql = """
        INSERT into leak_data(
          leak_id, email, password, password_plain, password_hashed, hash_algo, ticket_id, email_verified,
          password_verified_ok, ip, domain, browser , malware_name, infected_machine, dg
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )
        ON CONFLICT ON CONSTRAINT constr_unique_leak_data_leak_id_email_password_domain
        DO UPDATE SET count_seen = leak_data.count_seen + 1
        RETURNING id
    """
    inserted_ids = []
    cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor)
    for r in df.reset_index().to_dict(orient = 'records'):
        try:
            cur.execute(sql, (r['leak_id'], r['email'], r['password'], r['password_plain'], r['password_hashed'],
                              r['hash_algo'], r['ticket_id'], r['email_verified'], r['password_verified_ok'],
                              r['ip'], r['domain'], r['browser'], r['malware_name'], r['infected_machine'],
                              r['dg']))
            inserted_ids.append(int(cur.fetchone()['id']))
        except Exception as ex:
            return Answer(success = False, errormsg = str(ex), data = [])
    # BUGFIX: the inserts were never committed (cf. the PUT endpoint, which commits).
    db.commit()
    t1 = time.time()
    d = round(t1 - t0, 3)

    # BUGFIX: an empty CSV used to render "WHERE id in ()" below -- a SQL syntax error. Short-circuit instead.
    if not inserted_ids:
        return Answer(success = True, errormsg = None,
                      meta = AnswerMeta(version = VER, duration = d, count = 0), data = [])

    # now get the data of all the IDs / dedup
    try:
        sql = """SELECT * from leak_data where id in %s"""
        cur = db.cursor(cursor_factory = psycopg2.extras.RealDictCursor)
        cur.execute(sql, (tuple(inserted_ids),))
        data = cur.fetchall()
        return Answer(success = True, errormsg = None,
                      meta = AnswerMeta(version = VER, duration = d, count = len(inserted_ids)), data = data)
    except Exception as ex:
        return Answer(success = False, errormsg = str(ex), data = [])


# ############################################################################################################
# enrichers

@app.get('/enrich/email_to_dg/{email}',
         tags = ["Enricher"],
         status_code = 200,
         response_model = Answer)
async def enrich_dg_by_email(email: EmailStr,
                             response: Response,
                             api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Enricher endpoint: map an email address to its DG (Directorate General) via LDAP.

    # Parameters
      * email: the address to look up.
    # Returns
      * an Answer whose data is [{"dg": <name>}], or a 404 Answer ("not found") when the lookup fails.
    """
    t0 = time.time()
    le = LDAPEnricher()
    retval = le.email_to_dg(email)
    t1 = time.time()
    d = round(t1 - t0, 3)
    if not retval:
        response.status_code = 404
        return Answer(success = False, errormsg = "not found",
                      meta = AnswerMeta(version = VER, duration = d, count = 0), data = [])
    response.status_code = 200
    return Answer(success = True, errormsg = None, meta = AnswerMeta(version = VER, duration = d, count = 1),
                  data = [{"dg": retval}])


@app.get('/enrich/email_to_userid/{email}',
         tags = ["Enricher"],
         status_code = 200,
         response_model = Answer)
async def enrich_userid_by_email(email: EmailStr, response: Response,
                                 api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Enricher endpoint: map an email address to its user id ("ecMoniker") via LDAP.

    # Parameters
      * email: the address to look up.
    # Returns
      * an Answer whose data is [{"ecMoniker": <id>}], or a 404 Answer ("not found") when the lookup fails.
    """
    t0 = time.time()
    le = LDAPEnricher()
    retval = le.email_to_user_id(email)
    t1 = time.time()
    d = round(t1 - t0, 3)
    if not retval:
        response.status_code = 404
        return Answer(success = False, errormsg = "not found",
                      meta = AnswerMeta(version = VER, duration = d, count = 0), data = [])
    response.status_code = 200
    return Answer(success = True, errormsg = None, meta = AnswerMeta(version = VER, duration = d, count = 1),
                  data = [{"ecMoniker": retval}])
@app.get('/enrich/email_to_vip/{email}',
         tags = ["Enricher"],
         status_code = 200,
         response_model = Answer)
async def enrich_vip_via_email(email: EmailStr, response: Response,
                               api_key: APIKey = Depends(validate_api_key_header)) -> Answer:
    """
    Enricher endpoint: check whether an email address belongs to a VIP.

    # Parameters
      * email: the address to check.
    # Returns
      * an Answer whose data is [{"is_vip": <bool>}]. Always answers 200 -- a non-VIP is a valid result.
    """
    t0 = time.time()
    enr = VIPEnricher()
    retval = enr.is_vip(email)
    t1 = time.time()
    d = round(t1 - t0, 3)
    response.status_code = 200
    return Answer(success = True, errormsg = None, meta = AnswerMeta(version = VER, duration = d, count = 1),
                  data = [{"is_vip": retval}])


if __name__ == "__main__":
    db_conn = _connect_db(DSN)
    # BUGFIX: os.getenv() returns a *string* when the variable is set; uvicorn expects an int port.
    uvicorn.run(app, debug = True, port = int(os.getenv('PORT', default = 8080)))
5 | """ 6 | 7 | import datetime 8 | from enum import Enum 9 | from typing import Optional, Dict, List # Union 10 | 11 | from pydantic import BaseModel, EmailStr 12 | 13 | 14 | class Leak(BaseModel): 15 | id: Optional[int] 16 | ticket_id: Optional[str] 17 | summary: str 18 | reporter_name: Optional[str] 19 | source_name: Optional[str] 20 | breach_ts: Optional[datetime.datetime] 21 | source_publish_ts: Optional[datetime.datetime] 22 | 23 | 24 | class CredentialType(Enum): 25 | is_external = "External" 26 | is_proxy_login = "Proxy" 27 | is_EU_login = "EU Login" 28 | is_domain_login = "Domain" 29 | is_secem_login = "SECEM" 30 | 31 | 32 | class LeakData(BaseModel): 33 | id: Optional[int] 34 | leak_id: int 35 | email: EmailStr 36 | password: str 37 | password_plain: Optional[str] 38 | password_hashed: Optional[str] 39 | hash_algo: Optional[str] 40 | ticket_id: Optional[str] 41 | email_verified: Optional[bool] 42 | password_verified_ok: Optional[bool] 43 | ip: Optional[str] 44 | domain: Optional[str] 45 | target_domain: Optional[str] # new 46 | browser: Optional[str] 47 | malware_name: Optional[str] 48 | infected_machine: Optional[str] 49 | dg: Optional[str] 50 | is_vip: Optional[bool] 51 | credential_type: Optional[List[CredentialType]] 52 | report_to: Optional[List[str]] # the security contact to report this to, in case it's not the the user directly. 
53 | # 54 | # meta stuff and things for error reporting 55 | count_seen: Optional[int] = 1 56 | original_line: Optional[str] # the original CSV file in case of errors 57 | error_msg: Optional[str] 58 | notify: bool 59 | needs_human_intervention: bool 60 | 61 | 62 | class AnswerMeta(BaseModel): 63 | version: str 64 | duration: float 65 | count: int 66 | 67 | 68 | class Answer(BaseModel): 69 | meta: Optional[AnswerMeta] 70 | data: List[Dict] # Union[Dict,List] 71 | success: bool 72 | errormsg: Optional[str] = "" 73 | 74 | 75 | """ Example: 76 | Multiple answers: 77 | { "meta": { "version": "rel-1.0", "duration": 0.78, "count": 3 }, "data": [ , , ], "success": true, 78 | "errormsg": "all OK" } 79 | 80 | No data: 81 | { "meta": { "version": "rel-1.0", "duration": 0.78 , "count": 0 }, "data": [], "success": true, "errormsg": "all OK" } 82 | 83 | Single result: 84 | { "meta": { "version": "rel-1.0", "duration": 0.78 , "count": 1 }, "data": [ { "foo": "bar", "baz": 77 } ], 85 | "success": true, "errormsg": "all OK" } 86 | """ 87 | -------------------------------------------------------------------------------- /config.SAMPLE.py: -------------------------------------------------------------------------------- 1 | """Configuration stored here. 2 | To make this work, please copy it over to api/config.py (make sure you don't overwrite 3 | an existing file!!! 4 | Edit that file there and add a random string to the list. 5 | Communicate that random string to the API key user. 6 | 7 | Then reload the server (or it gets reloaded automatically). 
8 | """ 9 | 10 | 11 | config = { 12 | "api_keys": ["random-test-api-key", "another-example-api-key"] 13 | } 14 | -------------------------------------------------------------------------------- /db.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- PostgreSQL database dump 3 | -- 4 | 5 | -- Dumped from database version 11.10 6 | -- Dumped by pg_dump version 11.10 7 | 8 | SET statement_timeout = 0; 9 | SET lock_timeout = 0; 10 | SET idle_in_transaction_session_timeout = 0; 11 | SET client_encoding = 'UTF8'; 12 | SET standard_conforming_strings = on; 13 | SELECT pg_catalog.set_config('search_path', '', false); 14 | SET check_function_bodies = false; 15 | SET xmloption = content; 16 | SET client_min_messages = warning; 17 | SET row_security = off; 18 | 19 | SET default_tablespace = ''; 20 | 21 | SET default_with_oids = false; 22 | 23 | -- 24 | -- Name: leak; Type: TABLE; Schema: public; Owner: credentialleakdb 25 | -- 26 | 27 | CREATE TABLE public.leak ( 28 | id integer NOT NULL, 29 | breach_ts timestamp with time zone, 30 | source_publish_ts timestamp with time zone, 31 | ingestion_ts timestamp with time zone NOT NULL, 32 | summary text NOT NULL, 33 | ticket_id text, 34 | reporter_name text, 35 | source_name text 36 | ); 37 | 38 | 39 | ALTER TABLE public.leak OWNER TO credentialleakdb; 40 | 41 | -- 42 | -- Name: COLUMN leak.breach_ts; Type: COMMENT; Schema: public; Owner: credentialleakdb 43 | -- 44 | 45 | COMMENT ON COLUMN public.leak.breach_ts IS 'If known, the timestamp when the breach happened.'; 46 | 47 | 48 | -- 49 | -- Name: COLUMN leak.source_publish_ts; Type: COMMENT; Schema: public; Owner: credentialleakdb 50 | -- 51 | 52 | COMMENT ON COLUMN public.leak.source_publish_ts IS 'The timestamp according when the source (e.g. 
spycloud) published the data.'; 53 | 54 | 55 | -- 56 | -- Name: COLUMN leak.ingestion_ts; Type: COMMENT; Schema: public; Owner: credentialleakdb 57 | -- 58 | 59 | COMMENT ON COLUMN public.leak.ingestion_ts IS 'The timestamp when we ingested the data.'; 60 | 61 | 62 | -- 63 | -- Name: COLUMN leak.summary; Type: COMMENT; Schema: public; Owner: credentialleakdb 64 | -- 65 | 66 | COMMENT ON COLUMN public.leak.summary IS 'A short summary (slug) of the leak. Used for displaying it somewhere'; 67 | 68 | 69 | -- 70 | -- Name: COLUMN leak.reporter_name; Type: COMMENT; Schema: public; Owner: credentialleakdb 71 | -- 72 | 73 | COMMENT ON COLUMN public.leak.reporter_name IS 'The name of the reporter where we got the notification from. E.g. CERT-eu, Spycloud, etc... Who sent us the data?'; 74 | 75 | 76 | -- 77 | -- Name: COLUMN leak.source_name; Type: COMMENT; Schema: public; Owner: credentialleakdb 78 | -- 79 | 80 | COMMENT ON COLUMN public.leak.source_name IS 'The name of the source where this leak came from. 
Either the name of a collection or some other name.'; 81 | 82 | 83 | -- 84 | -- Name: leak_data; Type: TABLE; Schema: public; Owner: credentialleakdb 85 | -- 86 | 87 | CREATE TABLE public.leak_data ( 88 | id integer NOT NULL, 89 | leak_id integer NOT NULL, 90 | email text NOT NULL, 91 | password text NOT NULL, 92 | password_plain text, 93 | password_hashed text, 94 | hash_algo text, 95 | ticket_id text, 96 | email_verified boolean DEFAULT false, 97 | password_verified_ok boolean DEFAULT false, 98 | ip inet, 99 | domain text, 100 | target_domain text, 101 | browser text, 102 | malware_name text, 103 | infected_machine text, 104 | dg text NOT NULL, 105 | count_seen integer DEFAULT 1 106 | ); 107 | 108 | 109 | ALTER TABLE public.leak_data OWNER TO credentialleakdb; 110 | 111 | -- 112 | -- Name: COLUMN leak_data.password; Type: COMMENT; Schema: public; Owner: credentialleakdb 113 | -- 114 | 115 | COMMENT ON COLUMN public.leak_data.password IS 'Either the encrypted or unencrypted password. If the unencrypted password is available, that is what is going to be in this field.'; 116 | 117 | 118 | -- 119 | -- Name: COLUMN leak_data.hash_algo; Type: COMMENT; Schema: public; Owner: credentialleakdb 120 | -- 121 | 122 | COMMENT ON COLUMN public.leak_data.hash_algo IS 'If we can determine the hashing algo and the password_hashed field is set'; 123 | 124 | 125 | -- 126 | -- Name: COLUMN leak_data.malware_name; Type: COMMENT; Schema: public; Owner: credentialleakdb 127 | -- 128 | 129 | COMMENT ON COLUMN public.leak_data.malware_name IS 'If the password was leaked via a credential stealer malware, then the malware name goes here.'; 130 | 131 | 132 | -- 133 | -- Name: COLUMN leak_data.infected_machine; Type: COMMENT; Schema: public; Owner: credentialleakdb 134 | -- 135 | 136 | COMMENT ON COLUMN public.leak_data.infected_machine IS 'The infected machine (some ID for the machine)'; 137 | 138 | 139 | -- 140 | -- Name: COLUMN leak_data.dg; Type: COMMENT; Schema: public; Owner: 
credentialleakdb 141 | -- 142 | 143 | COMMENT ON COLUMN public.leak_data.dg IS 'The affected DG'; 144 | 145 | 146 | -- 147 | -- Name: leak_data_id_seq; Type: SEQUENCE; Schema: public; Owner: credentialleakdb 148 | -- 149 | 150 | CREATE SEQUENCE public.leak_data_id_seq 151 | AS integer 152 | START WITH 1 153 | INCREMENT BY 1 154 | NO MINVALUE 155 | NO MAXVALUE 156 | CACHE 1; 157 | 158 | 159 | ALTER TABLE public.leak_data_id_seq OWNER TO credentialleakdb; 160 | 161 | -- 162 | -- Name: leak_data_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: credentialleakdb 163 | -- 164 | 165 | ALTER SEQUENCE public.leak_data_id_seq OWNED BY public.leak_data.id; 166 | 167 | 168 | -- 169 | -- Name: leak_id_seq; Type: SEQUENCE; Schema: public; Owner: credentialleakdb 170 | -- 171 | 172 | CREATE SEQUENCE public.leak_id_seq 173 | AS integer 174 | START WITH 1 175 | INCREMENT BY 1 176 | NO MINVALUE 177 | NO MAXVALUE 178 | CACHE 1; 179 | 180 | 181 | ALTER TABLE public.leak_id_seq OWNER TO credentialleakdb; 182 | 183 | -- 184 | -- Name: leak_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: credentialleakdb 185 | -- 186 | 187 | ALTER SEQUENCE public.leak_id_seq OWNED BY public.leak.id; 188 | 189 | 190 | -- 191 | -- Name: leak id; Type: DEFAULT; Schema: public; Owner: credentialleakdb 192 | -- 193 | 194 | ALTER TABLE ONLY public.leak ALTER COLUMN id SET DEFAULT nextval('public.leak_id_seq'::regclass); 195 | 196 | 197 | -- 198 | -- Name: leak_data id; Type: DEFAULT; Schema: public; Owner: credentialleakdb 199 | -- 200 | 201 | ALTER TABLE ONLY public.leak_data ALTER COLUMN id SET DEFAULT nextval('public.leak_data_id_seq'::regclass); 202 | 203 | 204 | SELECT pg_catalog.setval('public.leak_id_seq', 1, true); 205 | -- 206 | -- Data for Name: leak; Type: TABLE DATA; Schema: public; Owner: credentialleakdb 207 | -- 208 | 209 | COPY public.leak (id, breach_ts, source_publish_ts, ingestion_ts, summary, ticket_id, reporter_name, source_name) FROM stdin; 210 | 1 2021-03-08 
13:58:41.179+01 2021-03-08 13:58:41.179+01 2021-03-06 23:40:20.116348+01 CIT0DAY-2 CSIRC-99999 aaron HaveIBennPwned 211 | 2 2021-03-06 23:40:47.266962+01 2021-03-06 23:40:47.266962+01 2021-03-06 23:40:47.266962+01 COMB CSIRC-102 aaron independen research 212 | 3 2021-03-06 23:41:10.245034+01 2021-03-06 23:41:10.245034+01 2021-03-06 23:41:10.245034+01 cit0day CSIRC-103 aaron HaveIBeenPwned 213 | \. 214 | 215 | 216 | -- 217 | -- Data for Name: leak_data; Type: TABLE DATA; Schema: public; Owner: credentialleakdb 218 | -- 219 | 220 | SELECT pg_catalog.setval('public.leak_data_id_seq', 1, true); 221 | 222 | COPY public.leak_data (id, leak_id, email, password, password_plain, password_hashed, hash_algo, ticket_id, email_verified, password_verified_ok, ip, domain, browser, malware_name, infected_machine, dg, count_seen) FROM stdin; 223 | 1 1 aaron@example.com 12345 12345 \N \N CISRC-199 f f 1.2.3.4 example.com Google Chrome \N local_laptop DIGIT 25 224 | 2 1 sarah@example.com 123456 123456 \N \N CISRC-199 f f 1.2.3.5 example.com Firefox \N sarahs_laptop DIGIT 8 225 | 3 1 ben@example.com ohk7do7gil6O ohk7do7gil6O 4aa7985dad6e1f02238c2e2afc521c4d3dd30650656cd07bf0b7cfd3cd1190b7 sha256 CISRC-199 f f 1.2.3.5 example.com Firefox \N WORKSTATION DIGIT 8 226 | 4 1 david@example.com 24b3f998468a9da4105e6c78f5444532cde99d53c011715754194c3b4f3e37b4 \N 24b3f998468a9da4105e6c78f5444532cde99d53c011715754194c3b4f3e37b4 sha256 CISRC-199 f f 8.8.8.8 example.com Firefox \N Macbook Pro DIGIT 8 227 | 5 2 lauri@example.com Vie5kuuwiroo Vie5kuuwiroo \N \N CISRC-200 t t 9.9.9.9 example.com Firefox \N Raspberry PI 3+ DIGIT 8 228 | 6 2 natasha@example.com 1235kuuwiroo 1235kuuwiroo \N \N CISRC-201 t t 9.9.9.9 example.com Firefox \N Raspberry PI 3+ DIGIT 2 229 | \. 
230 | 231 | 232 | -- 233 | -- Name: leak_data_id_seq; Type: SEQUENCE SET; Schema: public; Owner: credentialleakdb 234 | -- 235 | 236 | SELECT pg_catalog.setval('public.leak_data_id_seq', 7, true); 237 | 238 | 239 | -- 240 | -- Name: leak_id_seq; Type: SEQUENCE SET; Schema: public; Owner: credentialleakdb 241 | -- 242 | 243 | SELECT pg_catalog.setval('public.leak_id_seq', 4, true); 244 | 245 | 246 | -- 247 | -- Name: leak_data constr_unique_leak_data_leak_id_email_password_domain; Type: CONSTRAINT; Schema: public; Owner: credentialleakdb 248 | -- 249 | 250 | ALTER TABLE ONLY public.leak_data 251 | ADD CONSTRAINT constr_unique_leak_data_leak_id_email_password_domain UNIQUE (leak_id, email, password, domain); 252 | 253 | 254 | -- 255 | -- Name: leak_data leak_data_pkey; Type: CONSTRAINT; Schema: public; Owner: credentialleakdb 256 | -- 257 | 258 | ALTER TABLE ONLY public.leak_data 259 | ADD CONSTRAINT leak_data_pkey PRIMARY KEY (id); 260 | 261 | 262 | -- 263 | -- Name: leak leak_pkey; Type: CONSTRAINT; Schema: public; Owner: credentialleakdb 264 | -- 265 | 266 | ALTER TABLE ONLY public.leak 267 | ADD CONSTRAINT leak_pkey PRIMARY KEY (id); 268 | 269 | 270 | -- 271 | -- Name: idx_leak_data_dg; Type: INDEX; Schema: public; Owner: credentialleakdb 272 | -- 273 | 274 | CREATE INDEX idx_leak_data_dg ON public.leak_data USING btree (dg); 275 | 276 | 277 | -- 278 | -- Name: idx_leak_data_email; Type: INDEX; Schema: public; Owner: credentialleakdb 279 | -- 280 | 281 | CREATE INDEX idx_leak_data_email ON public.leak_data USING btree (upper(email)); 282 | 283 | 284 | -- 285 | -- Name: idx_leak_data_email_password_machine; Type: INDEX; Schema: public; Owner: credentialleakdb 286 | -- 287 | 288 | CREATE INDEX idx_leak_data_email_password_machine ON public.leak_data USING btree (email, password, infected_machine); 289 | 290 | 291 | -- 292 | -- Name: idx_leak_data_malware_name; Type: INDEX; Schema: public; Owner: credentialleakdb 293 | -- 294 | 295 | CREATE INDEX 
class BaseCollector:
    """Abstract collector interface: fetch raw leak data and hand it on as a pandas DataFrame.

    ``collect(input_source)`` does *not* yet return the internal data format (IDF).
    It returns a tuple of a status string ("OK" on success, otherwise an error
    message) and a pandas DataFrame (which may be empty in case of error).

    Examples:
        ("OK", pd.DataFrame(... my data ...))                      --> all ok, data is in the DF
        ("Could not parse CSV file: ...", pd.DataFrame())          --> error message and empty DF

    The role of the Collector is to
      1. fetch the data
      2. check if the data is complete
      3. put it into an internal format (here: a pandas DF) which a parser can process
      4. return it as pandas DF to the next processing step in the chain
      5. return errors in case it encountered errors in validation.
    """

    def __init__(self):
        pass

    def collect(self, input_file: str, **kwargs) -> (str, pd.DataFrame):
        """Read ``input_file`` as CSV and return ``(status, DataFrame)``.

        :param input_file: path of the CSV file to read
        :param kwargs: passed through unchanged to :func:`pandas.read_csv`
        :returns: tuple of a status string ("OK" on success) and the DataFrame
                  (empty on failure)
        """
        try:
            with open(input_file, "r") as handle:
                frame = pd.read_csv(handle, **kwargs)
        except Exception as ex:
            # best-effort: log with traceback, report the reason upstream via the status string
            logging.exception("could not parse CSV file. Reason: %r" % (str(ex),))
            return str(ex), pd.DataFrame()
        return "OK", frame
"""Very very lightweight DB abstraction"""

import os
import psycopg2
import psycopg2.extras

from fastapi import HTTPException
import logging


#################################
# DB functions

# Module-global connection handle: one shared psycopg2 connection per process.
db_conn = None
# libpq-style DSN assembled from the environment once at import time.
# NOTE(review): if DBPASSWORD is unset, this renders literally as "password=None"
# — confirm that an unset password is never a valid deployment state.
DSN = "host=%s dbname=%s user=%s password=%s" % (os.getenv('DBHOST', 'localhost'),
                                                 os.getenv('DBNAME', 'credentialleakdb'),
                                                 os.getenv('DBUSER', 'credentialleakdb'),
                                                 os.getenv('DBPASSWORD'))


def _get_db():
    """
    Open a new database connection if there is none yet for the
    current application context.

    Lazily connects on first use and caches the handle in the module-global
    ``db_conn``; subsequent calls return the cached connection.

    :returns: the DB handle."""
    global db_conn

    if not db_conn:
        db_conn = _connect_db(DSN)
    return db_conn


# noinspection PyUnresolvedReferences
def _close_db():
    """Closes the database again at the end of the request.

    Resets the module-global ``db_conn`` to None so the next ``_get_db()``
    call reconnects.

    :returns: None (the cleared connection handle)."""
    global db_conn

    logging.info('shutting down....')
    if db_conn:
        db_conn.close()
        db_conn = None
    return db_conn


def _connect_db(dsn: str):
    """Connects to the specific database.

    Autocommit is enabled on the session, so every statement commits
    immediately (no explicit transaction handling by callers).

    :param dsn: the database connection string.
    :returns: the DB connection handle
    :raises HTTPException: with status 500 when the connection cannot be established
    """
    try:
        conn = psycopg2.connect(dsn)
        conn.set_session(autocommit=True)
    except Exception as ex:
        raise HTTPException(status_code=500, detail="could not connect to the DB. Reason: %s" % (str(ex)))
    logging.info("connection to DB established")
    return conn
def peek_into_file(fname: Path) -> csv.Dialect:
    """Sniff the CSV dialect of *fname* for use with pandas.read_csv() / the csv module.

    Only the first line (at most 50 characters for the dialect sniff) is inspected.

    :param fname: a Path object for the filename
    :return: the csv.Dialect the sniffer detected
    """

    with fname.open(mode = 'r') as fp:
        sniffer = csv.Sniffer()
        first_line = fp.readline()
        logging.debug("has apikeyheader: %s", sniffer.has_header(first_line))
        fp.seek(0)
        detected = sniffer.sniff(fp.readline(50))
        logging.debug("delim: '%s'", detected.delimiter)
        logging.debug("quotechar: '%s'", detected.quotechar)
        logging.debug("doublequote: %s", detected.doublequote)
        logging.debug("escapechar: '%s'", detected.escapechar)
        logging.debug("lineterminator: %r", detected.lineterminator)
        logging.debug("quoting: %s", detected.quoting)
        logging.debug("skipinitialspace: %s", detected.skipinitialspace)
        # noinspection PyTypeChecker
        return detected


def anonymize_password(password: str) -> str:
    """Mask the middle of a password with "*" characters.

    Passwords shorter than 4 characters (or empty/None) are returned unchanged;
    otherwise the first character and the last two stay visible.

    :param password: str
    :returns anonymized password (str):
    """
    if not password or len(password) < 4:
        return password
    visible_head = password[:1]
    visible_tail = password[-2:]
    return visible_head + "*" * (len(password) - 3) + visible_tail
30 | # 31 | # meta stuff and things for error reporting 32 | count_seen: Optional[int] = 1 33 | original_line: Optional[str] 34 | error_msg: Optional[str] 35 | notify: Optional[bool] 36 | needs_human_intervention: Optional[bool] 37 | -------------------------------------------------------------------------------- /models/indf.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional, Union 3 | from pydantic import BaseModel, IPvAnyAddress 4 | 5 | 6 | class SpyCloudInputEntry(BaseModel): 7 | """The SpyCloud intput format - one entry.""" 8 | breach_title: str 9 | spycloud_publish_date: Optional[Union[str, datetime]] 10 | breach_date: Optional[Union[str, datetime]] 11 | email: str # mandatory 12 | domain: str # mandatory 13 | username: Optional[str] 14 | password: str 15 | salt: Optional[str] 16 | target_domain: Optional[str] 17 | target_url: Optional[str] 18 | password_plaintext: str = None 19 | sighting: Optional[int] 20 | severity: Optional[str] 21 | status: Optional[str] 22 | password_type: Optional[str] 23 | cc_number: Optional[str] 24 | infected_path: Optional[str] 25 | infected_machine_id: Optional[str] 26 | email_domain: str 27 | cc_expiration: Optional[str] 28 | cc_last_four: Optional[str] 29 | email_username: str 30 | user_browser: Optional[str] 31 | infected_time: Optional[Union[str, datetime]] 32 | ip_addresses: Optional[Union[str, IPvAnyAddress]] 33 | -------------------------------------------------------------------------------- /models/outdf.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from enum import Enum 3 | from typing import Optional, Dict, List # Union 4 | from pydantic import BaseModel, EmailStr 5 | 6 | 7 | class Leak(BaseModel): 8 | id: Optional[int] 9 | ticket_id: Optional[str] 10 | summary: str 11 | reporter_name: Optional[str] 12 | source_name: Optional[str] 13 | breach_ts: 
class CredentialType(Enum):
    """Closed set of credential categories a leaked credential may fall into."""
    is_external = "External"
    is_proxy_login = "Proxy"
    is_EU_login = "EU Login"
    is_domain_login = "Domain"
    is_secem_login = "SECEM"


class LeakData(BaseModel):
    """One leaked credential row in the output data format (what gets stored / reported)."""
    id: Optional[int]  # DB primary key (leak_data.id), set once stored
    leak_id: int  # foreign key to the leak this row belongs to
    email: EmailStr  # affected email address (validated)
    password: str  # the leaked password as found in the dump
    password_plain: Optional[str]
    password_hashed: Optional[str]
    hash_algo: Optional[str]  # hashing algorithm of password_hashed, if known
    ticket_id: Optional[str]  # ticketing-system reference
    email_verified: Optional[bool]
    password_verified_ok: Optional[bool]
    ip: Optional[str]
    domain: Optional[str]
    target_domain: Optional[str]  # new
    browser: Optional[str]
    malware_name: Optional[str]
    infected_machine: Optional[str]
    dg: Optional[str]  # Directorate-General, filled in by the LDAP enricher
    is_vip: Optional[bool]  # filled in by the VIP enricher
    credential_type: Optional[List[CredentialType]]
    report_to: Optional[List[str]]  # the security contact to report this to, in case it's not the user directly.
    #
    # meta stuff and things for error reporting
    count_seen: Optional[int] = 1  # how often this credential was observed
    original_line: Optional[str]  # the original CSV line, kept for error reporting
    error_msg: Optional[str]
    notify: bool  # whether a notification should go out for this row
    needs_human_intervention: bool


class AnswerMeta(BaseModel):
    """Metadata attached to an API :class:`Answer`."""
    version: str  # API version string
    duration: float  # processing duration (presumably seconds — confirm against producer)
    count: int  # number of entries in Answer.data


class Answer(BaseModel):
    """Generic API response envelope: payload plus success flag and optional error."""
    meta: Optional[AnswerMeta]
    data: List[Dict]  # Union[Dict,List]
    success: bool
    errormsg: Optional[str] = ""
class BaseParser:
    """Abstract parser: turns a leak dump file into a pandas DataFrame."""

    def __init__(self):
        pass

    def parse_file(self, fname: Path, leak_id: int = None, csv_dialect=None) -> pd.DataFrame:
        """Parse one CSV file (non-recursive) into a DataFrame.
        Overwrite this method in YOUR Parser subclass.

        :param fname: a Path object with the filename of the CSV file which should be parsed
        :param leak_id: the leak_id in the DB which is associated with that CSV dump file;
                        inserted as the first column of the result
        :param csv_dialect: csv.Dialect to use; sniffed from the file when omitted
        :return: the parsed DataFrame
        :raises Exception: re-raises whatever pandas.read_csv() raised
        """
        logger.info("Parsing file %s..." % fname)
        try:
            # use the caller-supplied dialect, otherwise try to guess it from the file
            dialect = csv_dialect if csv_dialect else peek_into_file(fname)
            frame = pd.read_csv(fname, dialect=dialect, error_bad_lines=False, warn_bad_lines=True)  # , usecols=range(2))
            logger.debug(frame.head())
            logger.debug(frame.info())
            logger.debug("Parsing file 2...")
            frame.insert(0, 'leak_id', leak_id)
            logger.debug(frame.head())
            logger.debug("parsed %s", fname)
            return frame
        except Exception as ex:
            logger.error("could not pandas.read_csv(%s). Reason: %s. Skipping file." % (fname, str(ex)))
            raise ex  # pass it on

    def normalize_data(self, df: pd.DataFrame, leak_id: int = None) -> pd.DataFrame:
        """
        Normalize the given data / data frame: replace every NaN by None so the
        DB layer gets proper NULLs.

        :param df: a pandas df with the leak_data
        :param leak_id: foreign key to the leak table (unused here)
        :return: a pandas df
        """
        return df.where(pd.notnull(df), None)
logging.debug("Parsing SPYCLOUD file %s...", fname) 25 | try: 26 | # df = pd.read_csv(fname, dialect=csv_dialect, header=1, error_bad_lines=False, warn_bad_lines=True) 27 | df = pd.read_csv(fname, error_bad_lines=False, warn_bad_lines=True) 28 | logging.debug(df) 29 | return df 30 | 31 | except Exception as ex: 32 | logging.error("could not pandas.read_csv(%s). Reason: %s. Skipping file." % (fname, str(ex))) 33 | return pd.DataFrame() 34 | 35 | def normalize_data(self, df: pd.DataFrame, leak_id=None) -> pd.DataFrame: 36 | """Bring the pandas DataFrame into an internal data format.""" 37 | 38 | """ Spycloud headers: 39 | breach_title, spycloud_publish_date, breach_date, email, domain, username, password, salt, target_domain, target_url, password_plaintext, sighting, severity, status, password_type, cc_number, infected_path, infected_machine_id, email_domain, cc_expiration, cc_last_four, email_username, user_browser, infected_time, ip_addresses 40 | map to: 41 | _, leak.source_publish_ts, leak.breach_ts, email, domain, _, password, _, target_domain, _, password_plain, _, _, _, hash_algo, _, _, infected_machine, _ , _, _, _, browser, _, ip 42 | """ 43 | mapping_tbl = collections.OrderedDict({ 44 | "breach_title": None, 45 | "spycloud_publish_date": None, 46 | "breach_date": None, 47 | "email": "email", 48 | "domain": None, 49 | "username": None, 50 | "password": "password", 51 | "salt": None, 52 | "target_domain": "target_domain", 53 | "target_url": None, 54 | "password_plaintext": "password_plain", 55 | "sighting": None, 56 | "severity": None, 57 | "status": None, 58 | "password_type": "hash_algo", 59 | "cc_number": None, 60 | "infected_path": None, 61 | "infected_machine_id": "infected_machine", 62 | "email_domain": "domain", 63 | "cc_expiration": None, 64 | "cc_last_four": None, 65 | "email_username": None, 66 | "user_browser": "browser", 67 | "infected_time": None, 68 | "ip_addresses": "ip" 69 | }) 70 | 71 | # This complexity sucks! need to get rid of it. 
No, itertools won't make it more understandable. 72 | retdf = pd.DataFrame() 73 | for i, r in df.iterrows(): # go over all df rows. Returns index, row 74 | # print(f"{i}:{r}") 75 | retrow = dict() # build up what we want to return 76 | for k, v in r.items(): # go over all key-val items in the row 77 | # print(f"{k}:{v}", file=sys.stderr) 78 | if k in mapping_tbl.keys(): 79 | map_to = mapping_tbl[k] 80 | if k == 'ip_addresses' and v == '-': 81 | v = None 82 | if map_to: 83 | # print(f"mapping {k} to {map_to}!") 84 | retrow[map_to] = v 85 | else: 86 | # don't map it 87 | pass 88 | logging.debug("retrow = %r" % retrow) 89 | retdf = retdf.append(pd.Series(retrow), ignore_index=True) 90 | # retdf[:,'leak_id'] = leak_id 91 | logging.debug("retdf: %s" % retdf) 92 | return retdf 93 | -------------------------------------------------------------------------------- /modules/collectors/spycloud/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/modules/collectors/spycloud/__init__.py -------------------------------------------------------------------------------- /modules/collectors/spycloud/collector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Spycloud collector 3 | 4 | This code implements a SpyCloud collector (inherits from BaseCollector) 5 | 6 | Upon running a SpyCloud parser on a CSV, the result will be a 7 | """ 8 | from pathlib import Path 9 | import logging 10 | import pandas as pd 11 | 12 | from lib.basecollector.collector import BaseCollector 13 | from lib.helpers import peek_into_file 14 | 15 | NaN_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', '', 'N/A', 16 | 'NA', 'NULL', 'NaN', 'n/a', 'null', '-'] 17 | 18 | 19 | class SpyCloudCollector(BaseCollector): 20 | def __init__(self): 21 | super().__init__() 22 | 23 | 
def collect(self, input_file: Path, **kwargs) -> (str, pd.DataFrame): 24 | try: 25 | dialect = peek_into_file(input_file) 26 | df = pd.read_csv(input_file, dialect=dialect, na_values=NaN_values, 27 | keep_default_na=False, error_bad_lines=False, warn_bad_lines=True) 28 | # XXX FIXME: need to collect the list of (pandas-) unparseable rows and present to user. 29 | # For now we simply fail on the whole file. Good enough for the moment. 30 | except pd.errors.ParserError as ex: 31 | logging.error("could not parse CSV file. Reason: %r" % (str(ex),)) 32 | return str(ex), pd.DataFrame() 33 | return "OK", df 34 | -------------------------------------------------------------------------------- /modules/collectors/test_leaks/COMB/test_data.txt: -------------------------------------------------------------------------------- 1 | 5hv 209@hotmail.com:Adam 2 | 5hv @bseomail.com:169818 3 | 5hv deniz@gmail.com:1234567 4 | 5hv lol@yahoo.com.au:5hvm 5 | 5hv sä±k iåÿ m@gmail.com:1234w 6 | 5hv!200@mail.ru:Ali4203642036 7 | 5hv!@aol.com:encuestas67 8 | 5hv!@fsf.com:gunther 9 | 5hv!@mail.ru:13371337 10 | 5hv!@mail.ru:13371337Р№ 11 | 5hv!@mail.ru:200029cs 12 | 5hv!@rambler.ru:13371337 13 | 5hv!@redi.com:http 14 | 5hv!@redi.com:http://www.javatpoint.com/RegisterAn.gif 15 | 5hv!@yandex.ru:200029cs 16 | 5hv!_bosha@mail.ru:5hvibosha 17 | 5hv!_saya@mail.ru:3331348s 18 | 5hv!eva1978@mail.ru:5hvm2005 19 | 5hv!ka_love@bk.ru:52m86m12 20 | 5hv#@mail.ru:facd4321 21 | 5hv$-5hv$2017@inbox.ru:1q1a1z 22 | 5hv$.a$da$da$.01@mail.ru:7010ckfdf 23 | 5hv$.a$da$da$.09@mail.ru:7010ckfdf 24 | 5hv$621#621@hotmail.com:hotgirl14 25 | 5hv$621@hotmail.com:hotgirl14 26 | 5hv$baev04@mail.ru:877505b 27 | 5hv$baev06@mail.ru:877505b 28 | 5hv$da15@inbox.ru:q1w2e3azsd 29 | 5hv$dad@gmail.com:internet 30 | 5hv$df$af@inbox.ru:053320107b 31 | 5hv$dfez$g@rambler.ru:7utIrccfiq 32 | 5hv$ev01@windowslive.com:ertek124578 33 | 5hv$h1976@mail.ru:bkmdbyf2010 34 | 5hv$h1978@mail.ru:bkmdbyf2010 35 | 5hv$h1981@mail.ru:0007 36 | 
5hv$h1983@mail.ru:0007 37 | 5hv$hevo@list.ru:galina2612 38 | 5hv$hkevich_lili@mail.ru:OfDtKm123 39 | 5hv$ik159@list.ru:5hvsik12345 40 | 5hv$ki03@mail.ru:qwerty123456 41 | 5hv$ki05@mail.ru:qwerty123456 42 | 5hv$ko762@gmail.com:ZADYMA2469 43 | 5hv$tra3000@yandex.ru:mega667 44 | 5hv'janelesley@yahoo.com:danthony12 45 | 5hv'sfamilydaycare@yahoo.com:zayas65 46 | 5hv&&monika11@aol.com:24crow 47 | 5hv&alex@myspace.com:alexander0 48 | 5hv&elizabeth@sbcglobal.net:5hv123 49 | 5hv&oreo@netzero.com:onorio1 50 | 5hv's11@mail.ru:111111ga 51 | 5hv's12@mail.ru:111111ga 52 | 5hv's13@mail.ru:111111ga 53 | 5hv'slen@yahoo.com:181818 54 | 5hv*litterprincesita@hotmail.com:zuricata16 55 | 5hv*litterprincessita@hotmail.com:zuricata16 56 | -------------------------------------------------------------------------------- /modules/collectors/test_leaks/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | These leaks are from public leaks and are truncated data! They are here only for testing. 4 | Email addresses have been pseudonymized for privacy reasons. They are not real email addresses. 
class AbuseContactLookup:
    """Looks up which abuse contact should receive a credential-leak report."""

    def lookup(self, email: str) -> List[str]:
        """Determine the abuse contact(s) for credential leaks, based on the email address.
        Example:
            lookup("example@jrc.it") --> "reports@jrc.it"

        :argument email: the email address
        :rtype string: string
        :returns email: the email address for the abuse contact
        """

        # Mapping: compiled domain regex -> list of contact addresses, or the
        # marker "DIRECT" meaning "send directly to the affected address".
        # Rules are evaluated top-down; the first matching rule wins.
        mapping_table = collections.OrderedDict({
            re.compile(r"example\.ec\.europa\.eu", re.X): ["ec-digit-csirc@ec.europa.eu"],  # example
            re.compile(r".*\.ec\.europa\.eu", re.X): "DIRECT",
            re.compile(r".*", re.X): "DIRECT"  # the default catch-all rule. Don't delete!
        })

        domain = email.split('@')[-1]
        for pattern, contact in mapping_table.items():
            if not re.match(pattern, domain):
                continue
            return [email] if contact == "DIRECT" else contact
        return [""]
    def email_to_user_id(self, email: str) -> Union[str, None]:
        """Return the userID of an email.

        Looks up the CED record for *email* and returns the first ``ecMoniker``
        attribute value, or None when no usable record was found.

        :param email: the email address to look up
        :returns: the user id (str) or None
        :raises Exception: re-raises any LDAP/CED query error after logging it
        """

        if self.simulate_ldap:
            return "Not connected to LDAP"
        try:
            results = self.ced.search_by_mail(email)
            # defensive chain: only trust a non-empty first value of 'ecMoniker'
            if results and results[0]['attributes'] and results[0]['attributes']['ecMoniker'] and \
                    results[0]['attributes']['ecMoniker'][0]:
                return results[0]['attributes']['ecMoniker'][0]
            else:
                return None
        except Exception as ex:
            logging.error("could not query LDAP/CED. Reason: %s" % str(ex))
            raise ex

    def email_to_status(self, email: str) -> str:
        """Return the active status.

        Returns the first ``recordStatus`` attribute of the CED record for *email*.
        NOTE(review): when no usable record is found this falls through and
        implicitly returns None, despite the ``-> str`` annotation — confirm
        that callers (e.g. ``exists()``) handle None.

        :param email: the email address to look up
        :raises Exception: re-raises any LDAP/CED query error after logging it
        """

        if self.simulate_ldap:
            return "Not connected to LDAP"

        try:
            results = self.ced.search_by_mail(email)
            if results and results[0]['attributes'] and results[0]['attributes']['recordStatus'] and \
                    results[0]['attributes']['recordStatus'][0]:
                return results[0]['attributes']['recordStatus'][0]
        except Exception as ex:
            logging.error("could not query LDAP/CED. Reason: %s" % str(ex))
            raise ex

    def exists(self, email: str) -> bool:
        """Check if a user exists.

        A user is considered existing when their record status is "A"
        (presumably "active" — confirm against the CED schema).

        :param email: the email address to check
        :returns: True when the record status is "A", False otherwise
        """

        if self.simulate_ldap:
            return False

        status = self.email_to_status(email)
        if status and status.upper() == "A":
            return True
        else:
            return False
""" 35 | try: 36 | ldap_server = Server(server, port = port, get_info = ALL) 37 | self.conn = Connection(ldap_server, user = user, password = password) 38 | self.is_connected = self.conn.bind() 39 | print("Connection = %s" % self.conn) 40 | logging.info("connect_ldap(): self.conn = %s" % (self.conn,)) 41 | logging.info("connect_ldap(): conn.bind() = %s" % (self.conn.bind(),)) 42 | except Exception as ex: 43 | logging.error("error connecting to CED. Reason: %s" % (str(ex))) 44 | self.is_connected = False 45 | return None 46 | 47 | def search_by_mail(self, email: str) -> List[dict]: 48 | attributes = ['cn', 'dg', 'uid', 'ecMoniker', 'employeeType', 'recordStatus', 'sn', 'givenName', 'mail'] 49 | if not self.is_connected: 50 | logging.error("Could not search via email. Not connected to LDAP.") 51 | raise Exception("Could not search via email. Not connected to LDAP.") 52 | try: 53 | self.conn.search(self.base_dn, "(mail=%s)" % (email,), attributes = attributes) 54 | except Exception as ex: 55 | logging.error("could not search LDAP. error: %s" % str(ex)) 56 | raise ex 57 | logging.info("search_by_mail(): %s" % (self.conn.entries,)) 58 | results = [] 59 | for entry in self.conn.entries: 60 | results.append(json.loads(entry.entry_to_json())) 61 | return results # yeah, a list comprehension would be more pythonic 62 | 63 | 64 | if __name__ == "__main__": 65 | ced = CEDQuery() 66 | email = sys.argv[1] 67 | print(ced.search_by_mail(email)) 68 | -------------------------------------------------------------------------------- /modules/enrichers/vip.py: -------------------------------------------------------------------------------- 1 | """VIP Enricher. Can determine if an email addr. is a VIP and needs to be treated specially.""" 2 | 3 | import os 4 | import logging 5 | from pathlib import Path 6 | 7 | from typing import List 8 | 9 | 10 | class VIPEnricher: 11 | """Can determine if an Email Address is a VIP. 
class Deduper:
    """The DB based deduper: drops entries which already exist in the leak_data table."""

    bloomf_loaded = False

    def __init__(self):
        pass

    def load_bf(self):
        """Mark the (future) bloom filter as loaded.

        XXX IMPROVEMENT: we might want to use bloomfilters here
        """
        self.bloomf_loaded = True

    def dedup(self, idf: InternalDataFormat) -> Union[None, InternalDataFormat]:
        """Deduplicate an IDF element based on its existence in the DB.
        FIXME: this is O(n^2) with n entries in the DB unless indexed properly. Think about indices or a bloom filter

        :param idf - internal data format element
        :returns: None if it already exists, otherwise the idf
        :raises Exception on DB problem

        """
        if not self.bloomf_loaded:
            # load_bf() flips the flag itself; the original set it a second time
            self.load_bf()
        # at the moment, we'll use postgresql

        conn = _get_db()
        sql = "SELECT count(*) from leak_data WHERE email=%s and password=%s"

        try:
            # context manager closes the cursor even on error (the original leaked it),
            # matching the style of PostgresqlOutput.process()
            with conn.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cur:
                cur.execute(sql, (idf.email, idf.password))
                rows = cur.fetchall()
                count = int(rows[0]['count'])
        except Exception as ex:
            logging.error("Deduper: could not select data from the DB. Reason: %s", ex)
            raise
        # row already exists -> drop it (return None)
        return None if count >= 1 else idf
class PostgresqlOutput(BaseOutput):
    """Output module which INSERTs (or upserts) a LeakData row into PostgreSQL."""

    dbconn = None

    def __init__(self):
        super().__init__()
        self.dbconn = _get_db()

    def process(self, data: LeakData) -> bool:
        """Store the output format data into Postgresql.

        On a duplicate (leak_id, email, password, domain) constraint hit, the
        existing row's count_seen is incremented instead of inserting a copy.

        :param data: the leak entry to persist; a falsy value is a no-op.
        :returns True on success
        :raises psycopg2.Error exception
        """

        sql = """
        INSERT into leak_data(
            leak_id, email, password, password_plain, password_hashed, hash_algo, ticket_id, email_verified,
            password_verified_ok, ip, domain, browser , malware_name, infected_machine, dg
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )
        ON CONFLICT ON CONSTRAINT constr_unique_leak_data_leak_id_email_password_domain
        DO UPDATE SET count_seen = leak_data.count_seen + 1
        RETURNING id
        """
        if data:
            # BUGFIX: the original passed data.password in the password_hashed
            # position, silently clobbering that column.
            values = (
                data.leak_id, data.email, data.password, data.password_plain, data.password_hashed,
                data.hash_algo, data.ticket_id, data.email_verified, data.password_verified_ok,
                data.ip, data.domain, data.browser, data.malware_name, data.infected_machine, data.dg)
            try:
                with self.dbconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                    # do NOT print/mogrify the full row: it contains plaintext passwords
                    cur.execute(sql, values)
                    leak_data_id = int(cur.fetchone()['id'])
                    logger.debug("leak_data_id: %s", leak_data_id)
            except psycopg2.Error as ex:
                logger.error("%s(): error: %s", self.process.__name__, ex.pgerror)
                raise
        return True
class SpyCloudParser(BaseParser):
    """Parses a SpyCloud CSV (given as a pandas DataFrame) into the internal data format."""

    def __init__(self):
        """init"""
        super().__init__()

    def parse(self, df: pd.DataFrame) -> List[InternalDataFormat]:
        """parse a pandas DF and return the data in the Internal Data Format.

        Rows which fail validation are still returned, flagged with
        needs_human_intervention=True, notify=False and an error_msg.

        :param df: the raw SpyCloud input data
        :returns: one InternalDataFormat item per input row
        :raises Exception: when a row cannot even be represented in the IDF
        """

        # First, map the various "empty" markers to None so that it fits nicely into the IDF
        df.replace({"-": None}, inplace = True)
        df.replace({"nan": None}, inplace = True)
        df.replace({np.nan: None}, inplace = True)
        df.replace({'breach_date': {'Unknown': None}}, inplace = True)

        # validate via pydantic
        items = []
        for row in df.reset_index().to_dict(orient = 'records'):
            logging.debug("row=%s", row)
            # pessimistic defaults: assume the row is broken until validation succeeds
            idf_dict = dict(email = None, password = None, notify = False, domain = None,
                            error_msg = "incomplete data", needs_human_intervention = True)
            idf_dict['original_line'] = str(row)
            try:
                input_data_item = parse_obj_as(SpyCloudInputEntry, row)  # here the validation magic happens
                idf_dict = input_data_item.dict()  # conversion between input format and internal df
                idf_dict['domain'] = input_data_item.email_domain  # map specific fields
            except Exception as ex:
                idf_dict['needs_human_intervention'] = True
                idf_dict['notify'] = False
                idf_dict['error_msg'] = str(ex)
                # BUGFIX: the original applied %r to repr(row), double-repr'ing it
                logging.error("could not parse CSV row. Original line: %r.\nReason: %s", row, ex)
                logging.debug("idf_dict = %s", idf_dict)
            else:
                # BUGFIX: the success path was logged at ERROR level in the original
                logging.debug("everything successfully converted")
                idf_dict['needs_human_intervention'] = False
                idf_dict['notify'] = True
                idf_dict['error_msg'] = None
            finally:
                try:
                    idf = InternalDataFormat(**idf_dict)  # another step of validation happens here
                    logging.debug("idf = %r", idf)
                except Exception as ex2:
                    logging.error("Exception in finally. idf_dict = %r", idf_dict)
                    raise ex2
                else:
                    items.append(idf)

        return items
sonar.organization=digits2 3 | # This is the name and version displayed in the SonarCloud UI. 4 | sonar.projectName=credentialLeakDB 5 | sonar.projectVersion=1.12.0 6 | # Path is relative to the sonar-project.properties file. Replace "\" by "/" on Windows. 7 | #sonar.sources=. 8 | # Encoding of the source code. Default is default system encoding 9 | #sonar.sourceEncoding=UTF-8 10 | sonar.coverage.exclusions=doc/**,venv/** 11 | sonar.exclusions=doc/**,tests/**,venv/** 12 | # duplications exclusions 13 | sonar.cpd.exclusions=doc/** 14 | # python coverage config 15 | sonar.python.coverage.reportPaths=/github/workspace/coverage.xml 16 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Unit tests and test data 2 | 3 | We use sonarcloud and codecov (post bash exploit :) ) for coverage testing. 4 | 5 | ## Codecoverage of the unit tests over time 6 | ![Coverage over time](https://codecov.io/gh/EC-DIGIT-CSIRC/credentialLeakDB/branch/main/graphs/commits.svg) 7 | 8 | ## Weak spots (sunburst diagram, tree) 9 | ![Weak spots](https://codecov.io/gh/EC-DIGIT-CSIRC/credentialLeakDB/branch/main/graphs/sunburst.svg) 10 | ![quadtree graph](https://codecov.io/gh/EC-DIGIT-CSIRC/credentialLeakDB/branch/main/graphs/tree.svg) 11 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/data.csv: -------------------------------------------------------------------------------- 1 | 
email,password,password_plain,password_hashed,hash_algo,ticket_id,email_verified,password_verified_ok,ip,domain,browser,malware_name,infected_machine,dg 2 | aaron@example.com,12345,12345,,,CISRC-199,f,f,1.2.3.4,example.com,Google Chrome,,local_laptop,DIGIT 3 | sarah@example.com,123456,123456,,,CISRC-199,f,f,1.2.3.5,example.com,Firefox,,sarahs_laptop,DIGIT 4 | peter@example.com,ohk7do7gil6O,ohk7do7gil6O,4aa7985dad6e1f02238c2e2afc521c4d3dd30650656cd07bf0b7cfd3cd1190b7,sha256,CISRC-199,f,f,1.2.3.5,example.com,Firefox,,WORKSTATION,DIGIT 5 | david@example.com,24b3f998468a9da4105e6c78f5444532cde99d53c011715754194c3b4f3e37b4,,24b3f998468a9da4105e6c78f5444532cde99d53c011715754194c3b4f3e37b4,sha256,CISRC-199,f,f,8.8.8.8,example.com,Firefox,,Macbook Pro,DIGIT 6 | lauri@example.com,Vie5kuuwiroo,Vie5kuuwiroo,,,CISRC-200,t,t,9.9.9.9,example.com,Firefox,,Raspberry PI 3+,DIGIT 7 | natasha@example.com,1235kuuwiroo,1235kuuwiroo,,,CISRC-201,t,t,9.9.9.9,example.com,Firefox,,Raspberry PI 3+,DIGIT 8 | -------------------------------------------------------------------------------- /tests/fixtures/data_anonymized_spycloud.csv: -------------------------------------------------------------------------------- 1 | breach_title,spycloud_publish_date,breach_date,email,domain,username,password,target_domain,target_url,password_plaintext,sighting,severity,password_type,email_username,user_browser,infected_time,email_domain,ip_addresses,infected_machine_id 2 | Freedom Fox Combo List,2020-06-25,Unknown,peter@example.com,example.com,-,12345,-,-,12345,1,High,plaintext,peter.petersson,-,-,example.com,-,- 3 | Freedom Fox Combo List,2020-06-25,Unknown,bob.inventedname@ec.europa.eu,europa.eu,-,123456,-,-,123456,1,High,plaintext,bob.inventedname,-,-,ec.europa.eu,-,- 4 | Freedom Fox Combo List,2020-06-25,Unknown,karen.inventedname@ec.europa.eu,europa.eu,-,reallyweakpassword,-,-,reallyweakpassword,1,High,plaintext,karen.inventedname,-,-,ec.europa.eu,-,- 5 | 
-------------------------------------------------------------------------------- /tests/fixtures/vips.txt: -------------------------------------------------------------------------------- 1 | aaron@example.com 2 | benoit@example.com 3 | sarah@example.com 4 | lauri@example.com -------------------------------------------------------------------------------- /tests/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/lib/__init__.py -------------------------------------------------------------------------------- /tests/lib/basecollector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/lib/basecollector/__init__.py -------------------------------------------------------------------------------- /tests/lib/basecollector/test_collector.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from lib.basecollector.collector import * 6 | 7 | 8 | class TestBaseCollector(unittest.TestCase): 9 | def test_collect(self): 10 | valid_csv_file = 'tests/fixtures/data.csv' 11 | invalid_csv_file = 'tests/fixtures/dataDOESNTEXIST.csv' 12 | 13 | tc = BaseCollector() 14 | df: pd.DataFrame 15 | status, df = tc.collect(valid_csv_file) 16 | assert status == "OK" 17 | assert not df.empty 18 | assert df.shape[0] > 1 19 | 20 | status, df = tc.collect(invalid_csv_file) 21 | assert status != "OK" 22 | assert df.empty 23 | -------------------------------------------------------------------------------- /tests/lib/baseenricher/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/lib/baseenricher/__init__.py -------------------------------------------------------------------------------- /tests/lib/baseenricher/test_enricher.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from lib.baseenricher.enricher import BaseEnricher 4 | from models.idf import InternalDataFormat 5 | 6 | 7 | class TestBaseEnricher(unittest.TestCase): 8 | def test_enrich(self): 9 | idf = InternalDataFormat(email="foo@example.com", password = "12345", notify = True) 10 | te = BaseEnricher() 11 | result = te.enrich(idf) 12 | assert result == idf -------------------------------------------------------------------------------- /tests/lib/baseoutput/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/lib/baseoutput/__init__.py -------------------------------------------------------------------------------- /tests/lib/baseoutput/test_output.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from lib.baseoutput.output import BaseOutput 4 | 5 | class TestBaseOutput(unittest.TestCase): 6 | def test_process(self): 7 | to = BaseOutput() 8 | assert to.process("test_outputfile.txt") 9 | -------------------------------------------------------------------------------- /tests/lib/baseparser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/lib/baseparser/__init__.py -------------------------------------------------------------------------------- /tests/lib/baseparser/test_parser.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from lib.baseparser.parser import BaseParser 6 | 7 | class TestBaseParser(unittest.TestCase): 8 | def test_parse(self): 9 | tp = BaseParser() 10 | df = pd.DataFrame() 11 | tp.parse(df) 12 | assert True # not very useful right now but the structure for the test case is here 13 | -------------------------------------------------------------------------------- /tests/lib/test_helpers.py: -------------------------------------------------------------------------------- 1 | from lib.helpers import anonymize_password 2 | 3 | def test_anonymize_password(): 4 | pass1 = "12345678" 5 | expected = "1*****78" 6 | assert anonymize_password(pass1) == expected 7 | 8 | pass2 = "123" 9 | expected = "123" 10 | assert anonymize_password(pass2) == expected 11 | 12 | pass3 = "12" 13 | expected = "12" 14 | assert anonymize_password(pass3) == expected 15 | 16 | pass4 = "" 17 | expected = "" 18 | assert anonymize_password(pass4) == expected 19 | 20 | pass5 = None 21 | expected = None 22 | assert anonymize_password(pass5) == expected 23 | -------------------------------------------------------------------------------- /tests/lib/test_logger.py: -------------------------------------------------------------------------------- 1 | from lib.helpers import getlogger 2 | 3 | 4 | logger = getlogger(__name__) 5 | 6 | 7 | class Foo: 8 | def __init__(self): 9 | pass 10 | 11 | def do_smthg(self): 12 | logger.info("bar") 13 | print("baz") 14 | 15 | 16 | def test_logger(): 17 | logger.info("starting up the class") 18 | 19 | f = Foo() 20 | f.do_smthg() 21 | logger.info("DONE") 22 | assert True 23 | 24 | 25 | if __name__ == "__main__": 26 | test_logger() 27 | -------------------------------------------------------------------------------- /tests/modules/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/modules/__init__.py -------------------------------------------------------------------------------- /tests/modules/enrichers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC-DIGIT-CSIRC/credentialLeakDB/54bc1178b5255cb6fb6c4af830b4d31a4efa6459/tests/modules/enrichers/__init__.py -------------------------------------------------------------------------------- /tests/modules/enrichers/test_external_email.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from modules.enrichers.external_email import ExternalEmailEnricher 4 | 5 | class TestExternalEmailEnricher(unittest.TestCase): 6 | def test_is_external(self): 7 | external_email = "foobar@example.com" 8 | tee = ExternalEmailEnricher() 9 | assert tee.is_external_email(external_email) 10 | 11 | internal_email = "foobar.example@ec.europa.eu" 12 | assert tee.is_internal_email(internal_email) 13 | -------------------------------------------------------------------------------- /tests/test_collector_spycloud.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pathlib import Path 4 | 5 | from modules.collectors.spycloud.collector import SpyCloudCollector 6 | 7 | 8 | class SpyCloudCollectorTest(unittest.TestCase): 9 | def test_collect(self): 10 | path = Path('tests/fixtures/data_anonymized_spycloud.csv') 11 | tc = SpyCloudCollector() 12 | statuscode, data = tc.collect(path) 13 | assert statuscode == "OK" 14 | assert data.iloc[0]['breach_title'] == 'Freedom Fox Combo List' 15 | assert data.iloc[0]['email'] == 'peter@example.com' 16 | -------------------------------------------------------------------------------- /tests/test_deduper.py: 
-------------------------------------------------------------------------------- 1 | from models.idf import InternalDataFormat 2 | 3 | from modules.filters.deduper import Deduper 4 | 5 | 6 | def test_load_bf(): 7 | dd = Deduper() 8 | assert not dd.bloomf_loaded 9 | dd.load_bf() 10 | assert dd.bloomf_loaded 11 | 12 | 13 | def test_dedup(): 14 | dd = Deduper() 15 | idf = InternalDataFormat(email="aaron@example.com", password="12345", 16 | notify=False, needs_human_intervention=False) 17 | idf2 = dd.dedup(idf) 18 | assert not idf2 19 | idf = InternalDataFormat(email="aaron999735@example.com", password="12345XXX", 20 | notify=False, needs_human_intervention=False) 21 | idf2 = dd.dedup(idf) 22 | assert idf2 23 | -------------------------------------------------------------------------------- /tests/test_enrichment.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pathlib import Path 3 | 4 | # from modules.enrichers.ldap import LDAPEnricher 5 | from modules.enrichers.external_email import ExternalEmailEnricher 6 | from modules.enrichers.abuse_contact import AbuseContactLookup 7 | from modules.enrichers.vip import VIPEnricher 8 | 9 | 10 | class TestVIPenrichment(unittest.TestCase): 11 | 12 | def test_load_vips(self): 13 | path = 'tests/fixtures/vips.txt' 14 | te = VIPEnricher(Path(path)) 15 | 16 | assert te.is_vip('AARON@example.com') 17 | assert te.is_vip('aaron@example.com') 18 | assert not te.is_vip('foobar-doesnotexist') 19 | 20 | def test_load_vips_invalid_path(self): 21 | path = 'tests/fixtures/vips.txt-doesnotexist' 22 | te = VIPEnricher(Path(path)) # will pass because there we catch the exception 23 | self.assertRaises(Exception, te.load_vips, path) 24 | 25 | 26 | class TestIsExternalEmail(unittest.TestCase): 27 | def test_is_internal(self): 28 | email = "foobar.example@ext.ec.europa.eu" 29 | te = ExternalEmailEnricher() 30 | assert te.is_internal_email(email) 31 | domain = "ec.europa.eu" 32 | assert 
te.is_internal_email(domain) 33 | 34 | def test_is_external(self): 35 | email = "aaron@example.com" 36 | te = ExternalEmailEnricher() 37 | assert te.is_external_email(email) 38 | 39 | 40 | class TestAbuseContactLookup(unittest.TestCase): 41 | def test_lookup(self): 42 | email = "aaron@example.com" 43 | te = AbuseContactLookup() 44 | assert email == te.lookup(email)[0] 45 | email = "aaron@example.ec.europa.eu" 46 | assert "ec-digit-csirc@ec.europa.eu" == te.lookup(email)[0] 47 | -------------------------------------------------------------------------------- /tests/test_filter.py: -------------------------------------------------------------------------------- 1 | from models.idf import InternalDataFormat 2 | 3 | from modules.filters.filter import Filter 4 | 5 | 6 | def test_filter(): 7 | fi = Filter() 8 | idf = InternalDataFormat(email = "aaron@example.com", password = "12345", notify = False, 9 | needs_human_intervention = False) 10 | idf2 = fi.filter(idf) 11 | assert idf2 == idf 12 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | from lib.helpers import getlogger 2 | 3 | import urllib.parse 4 | import uuid 5 | import unittest 6 | 7 | from fastapi.testclient import TestClient 8 | 9 | from lib.db.db import _connect_db as connect_db 10 | 11 | from api.main import * 12 | 13 | VALID_AUTH = {'x-api-key': 'random-test-api-key'} 14 | INVALID_AUTH = {'x-api-key': 'random-test-api-XXX'} 15 | 16 | logger = getlogger(__name__) 17 | client = TestClient(app) # , base_url='http://localhost:8080/') 18 | 19 | 20 | def test_ping(): 21 | response = client.get("/ping") 22 | assert response.status_code == 200 23 | assert response.json() == {"message": "pong"} 24 | 25 | 26 | class DBTestCases(unittest.TestCase): 27 | def test_get_db(self): 28 | assert get_db() is not None 29 | 30 | def test_close_db(self): 31 | get_db() # initialize connection 32 
| self.assertIsNone(close_db()) 33 | get_db() # re-initialize connection 34 | 35 | def test_connect_invalid_db(self): 36 | self.assertRaises(Exception, connect_db, 'SOME INVALID DSN') 37 | 38 | 39 | def test_fetch_valid_api_keys(): 40 | assert True 41 | 42 | 43 | class APIKeyTests(unittest.TestCase): 44 | """Test API key functions""" 45 | 46 | def test_validate_api_key_header(self): 47 | self.assertRaises(Exception, validate_api_key_header, "") 48 | 49 | def test_is_valid_api_key(self): 50 | assert is_valid_api_key(VALID_AUTH['x-api-key']) 51 | 52 | def test_is_INVALID_api_key(self): 53 | assert not is_valid_api_key(INVALID_AUTH['x-api-key']) 54 | 55 | def test_validate_api_key(self): 56 | assert True 57 | 58 | 59 | def test_root_auth(): 60 | response = client.get("/", headers = VALID_AUTH) 61 | assert response.status_code == 200 62 | assert response.json() == {"message": "Hello World"} 63 | 64 | 65 | # noinspection PyPep8Naming 66 | def test_root_INVALID_auth(): 67 | response = client.get("/", headers = INVALID_AUTH) 68 | assert response.status_code == 403 69 | 70 | 71 | def test_get_user_by_email(): 72 | email = urllib.parse.quote("aaron@example.com") 73 | response = client.get("/user/%s" % email, headers = VALID_AUTH) 74 | assert response.status_code == 200 75 | data = response.json() 76 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] >= 1 77 | 78 | 79 | # noinspection PyPep8Naming 80 | def test_get_nonexistent_user_by_INVALID_email(): 81 | email = urllib.parse.quote("aaron@doesnotexist.com") 82 | response = client.get("/user/%s" % email, headers = VALID_AUTH) 83 | assert response.status_code != 200 84 | data = response.json() 85 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] == 0 86 | 87 | 88 | def test_get_user_by_email_and_password(): 89 | email = urllib.parse.quote("aaron@example.com") 90 | passwd = "12345" 91 | response = client.get("/user_and_password/%s/%s" % (email, passwd), 
headers = VALID_AUTH) 92 | assert response.status_code == 200 93 | data = response.json() 94 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] >= 1 95 | 96 | 97 | # noinspection PyPep8Naming 98 | def test_get_nonexistent_user_by_email_and_INVALID_password(): 99 | email = urllib.parse.quote("aaron@example.com") 100 | passwd = "12345XXXXXXXXXX" 101 | response = client.get("/user_and_password/%s/%s" % (email, passwd), headers = VALID_AUTH) 102 | assert response.status_code == 404 103 | data = response.json() 104 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] == 0 105 | 106 | 107 | def test_check_user_by_email(): 108 | email = urllib.parse.quote("aaron@example.com") 109 | response = client.get("/exists/by_email/%s" % email, headers = VALID_AUTH) 110 | assert response.status_code == 200 111 | data = response.json() 112 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] >= 1 113 | 114 | 115 | # noinspection PyPep8Naming 116 | def test_check_nonexistent_user_by_INVALID_email(): 117 | email = urllib.parse.quote("aaron@doesnotexist.com") 118 | response = client.get("/exists/by_email/%s" % email, headers = VALID_AUTH) 119 | assert response.status_code == 200 120 | data = response.json() 121 | print(data) 122 | assert "meta" in response.text and "data" in response.text and data['data'][0]['count'] == 0 123 | 124 | 125 | def test_check_user_by_password(): 126 | password = "12345" 127 | response = client.get("/exists/by_password/%s" % password, headers = VALID_AUTH) 128 | assert response.status_code == 200 129 | data = response.json() 130 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] >= 1 131 | 132 | 133 | # noinspection PyPep8Naming 134 | def test_check_nonexistent_user_by_INVALID_password(): 135 | password = 'DOESNOTEXIST@59w47YTISJGw496UASGJSATARSASJKGJSAKGASRG' 136 | response = client.get("/exists/by_password/%s" % 
password, headers = VALID_AUTH) 137 | assert response.status_code == 200 138 | data = response.json() 139 | assert "meta" in response.text and "data" in response.text and data['data'][0]['count'] == 0 140 | 141 | 142 | def test_check_user_by_domain(): 143 | domain = "example.com" 144 | response = client.get("/exists/by_domain/%s" % domain, headers = VALID_AUTH) 145 | assert response.status_code == 200 146 | data = response.json() 147 | assert "meta" in response.text and "data" in response.text and data['meta']['count'] >= 1 148 | 149 | 150 | # noinspection PyPep8Naming 151 | def test_check_nonexistent_user_by_INVALID_domain(): 152 | domain = "example.com-foobar-2esugksti2uwasgjskhsjhsa.net" 153 | response = client.get("/exists/by_domain/%s" % domain, headers = VALID_AUTH) 154 | assert response.status_code == 200 155 | data = response.json() 156 | assert "meta" in response.text and "data" in response.text and data['data'][0]['count'] == 0 157 | 158 | 159 | def test_get_reporters(): 160 | response = client.get("/reporter/", headers = VALID_AUTH) 161 | assert response.status_code == 200 162 | data = response.json() 163 | assert "meta" in response.text and \ 164 | "data" in response.text and \ 165 | data['meta']['count'] >= 1 and \ 166 | data['data'][0]['reporter_name'] == 'aaron' 167 | 168 | 169 | def test_get_sources(): 170 | response = client.get("/source_name/", headers = VALID_AUTH) 171 | assert response.status_code == 200 172 | data = response.json() 173 | answerset = set(i['source_name'] for i in data['data']) 174 | print(answerset) 175 | assert "meta" in response.text and \ 176 | "data" in response.text and \ 177 | data['meta']['count'] >= 1 and \ 178 | "HaveIBeenPwned" in answerset 179 | 180 | 181 | def test_new_leak(): 182 | test_data = { 183 | "ticket_id": "CSIRC-202", 184 | "summary": "a test leak, please ignore", 185 | "reporter_name": "aaron", 186 | "source_name": "spycloud", 187 | "breach_ts": "2021-03-24T16:08:33.405Z", 188 | "source_publish_ts": 
"2021-03-24T16:08:33.405Z" 189 | } 190 | response = client.post("/leak/", json = test_data, headers = VALID_AUTH) 191 | assert response.status_code == 201 192 | data = response.json() 193 | assert "meta" in response.text and \ 194 | "data" in response.text and \ 195 | data['meta']['count'] >= 1 and \ 196 | data['data'][0]['id'] >= 1 197 | return int(data['data'][0]['id']) 198 | 199 | 200 | def test_update_leak(): 201 | test_data = { 202 | "ticket_id": "CSIRC-202", 203 | "summary": "an UPDATE-able test leak, please ignore", 204 | "reporter_name": "aaron", 205 | "source_name": "spycloud", 206 | "breach_ts": "2021-01-01T00:00:00.000Z", 207 | "source_publish_ts": "2021-01-02T00:00:00.000Z", 208 | } 209 | response = client.post("/leak/", json = test_data, headers = VALID_AUTH) 210 | assert response.status_code == 201 211 | data = response.json() 212 | assert "meta" in response.text and \ 213 | "data" in response.text and \ 214 | data['meta']['count'] >= 1 and \ 215 | data['data'][0]['id'] >= 1 216 | _id = data['data'][0]['id'] 217 | 218 | # now UPDATE it 219 | test_data['summary'] = "We UPDATED the test leak now!" 220 | test_data['id'] = _id 221 | response = client.put('/leak/', json = test_data, headers = VALID_AUTH) 222 | assert response.status_code == 200 223 | 224 | # fetch the results and see if it's really updated 225 | response = client.get('/leak/%s' % (_id,), headers = VALID_AUTH) 226 | assert response.status_code == 200 227 | assert response.json()['data'][0]['summary'] == "We UPDATED the test leak now!" 
228 | 229 | # now try to fetch an invalid ID 230 | response = client.get('/leak/%s' % (_id + 10000,), headers = VALID_AUTH) 231 | assert response.status_code == 404 232 | 233 | 234 | # noinspection PyPep8Naming 235 | def test_update_INVALID_leak(): 236 | test_data = { 237 | "id": -1, 238 | "ticket_id": "CSIRC-202", 239 | "summary": "trying to update a leak which does NOT EXIST", 240 | "reporter_name": "aaron", 241 | "source_name": "spycloud", 242 | "breach_ts": "2021-01-01T00:00:00.000Z", 243 | "source_publish_ts": "2021-01-02T00:00:00.000Z", 244 | } 245 | response = client.put('/leak/', json = test_data, headers = VALID_AUTH) 246 | assert response.status_code == 400 247 | assert response.json()['data'] == [] 248 | 249 | 250 | # By summary 251 | def test_get_leak_by_summary(): 252 | summary = "COMB" 253 | response = client.get('/leak/by_summary/%s' % (summary,), headers = VALID_AUTH) 254 | assert response.status_code == 200 255 | data = response.json() 256 | assert data['meta']['count'] >= 1 257 | assert data['data'][0]['summary'] == summary 258 | assert data['data'][0]['reporter_name'] == 'aaron' 259 | 260 | 261 | # noinspection PyPep8Naming 262 | def test_get_leak_by_INVALID_summary(): 263 | summary = "COMB-XXX-DOESNETEXIST" 264 | response = client.get('/leak/by_summary/%s' % (summary,), headers = VALID_AUTH) 265 | assert response.status_code == 404 266 | data = response.json() 267 | assert data['meta']['count'] == 0 268 | 269 | 270 | # By ticket_id 271 | def test_get_leak_by_ticket_id(): 272 | ticket_id = "CSIRC-102" # we know that exists based on the db.sql import 273 | response = client.get('/leak/by_ticket_id/%s' % (ticket_id,), headers = VALID_AUTH) 274 | assert response.status_code == 200 275 | data = response.json() 276 | assert data['meta']['count'] >= 1 277 | assert data['data'][0]['summary'] == "COMB" 278 | 279 | 280 | # noinspection PyPep8Naming 281 | def test_get_leak_by_INVALID_ticket_id(): 282 | ticket_id = "COMB-XXX-DOESNETEXIST" 283 | response = 
client.get('/leak/by_ticket_id/%s' % (ticket_id,), headers = VALID_AUTH) 284 | assert response.status_code == 404 285 | data = response.json() 286 | assert data['meta']['count'] == 0 287 | 288 | 289 | def test_get_all_leaks(): 290 | response = client.get('/leak/all', headers = VALID_AUTH) 291 | assert response.status_code == 200 292 | data = response.json() 293 | assert data['meta']['count'] > 0 294 | 295 | 296 | def test_get_leak_by_reporter(): 297 | response = client.get('leak/by_reporter/%s' % ("aaron",), headers = VALID_AUTH) 298 | assert response.status_code == 200 299 | data = response.json() 300 | assert data['meta']['count'] > 0 301 | 302 | 303 | def test_get_leak_by_source(): 304 | response = client.get('leak/by_source/%s' % ("spycloud",), headers = VALID_AUTH) 305 | assert response.status_code == 200 306 | data = response.json() 307 | assert data['meta']['count'] > 0 308 | 309 | 310 | # ################################################################################# 311 | # leak_data 312 | 313 | def test_get_leak_data_by_leak(): 314 | leak_id = 1 # we know this exists by the db.sql INSERT 315 | response = client.get('/leak_data/%s' % (leak_id,), headers = VALID_AUTH) 316 | assert response.status_code == 200 317 | data = response.json() 318 | assert data['meta']['count'] >= 1 319 | assert data['data'][0]['email'] == 'aaron@example.com' 320 | 321 | 322 | # noinspection PyPep8Naming 323 | def test_get_leak_data_by_INVALID_leak(): 324 | leak_id = -1 # we know this does not exist 325 | response = client.get('/leak_data/%s' % (leak_id,), headers = VALID_AUTH) 326 | assert response.status_code == 404 327 | data = response.json() 328 | assert data['meta']['count'] == 0 329 | assert data['data'] == [] 330 | 331 | 332 | def test_get_leak_data_by_ticket_id(): 333 | ticket_id = 'CISRC-199' # we know this exists by the db.sql INSERT 334 | response = client.get('/leak_data/by_ticket_id/%s' % (ticket_id,), headers = VALID_AUTH) 335 | assert response.status_code == 200 
336 | data = response.json() 337 | assert data['meta']['count'] >= 1 338 | assert data['data'][0]['email'] == 'aaron@example.com' 339 | assert data['data'][1]['email'] == 'sarah@example.com' 340 | 341 | 342 | def insert_leak_data(d: dict) -> int: 343 | """ generic test function for INSERTing a leak_data row given by d. 344 | 345 | @:param d: a row as dict 346 | @:returns ID: ID of the newly inserted row 347 | @:rtype: int 348 | """ 349 | response = client.post("/leak_data/", json = d, headers = VALID_AUTH) 350 | print(response) 351 | print(response.text) 352 | assert response.status_code == 201 353 | data = response.json() 354 | print(data) 355 | assert "meta" in data and \ 356 | "data" in data and \ 357 | data['meta']['count'] >= 1 and \ 358 | data['data'][0]['id'] >= 1 359 | return data['data'][0]['id'] 360 | 361 | 362 | def test_new_leak_data(): 363 | """ INSERT a new leak_data row.""" 364 | test_data = { 365 | "leak_id": 1, 366 | "email": "aaron2@example.com", 367 | "password": "000000", 368 | "password_plain": "000000", 369 | "password_hashed": "d232105eb59a344df4b54db1c24009b1", 370 | "hash_algo": "md5", 371 | "ticket_id": "CSIRC-102", 372 | "email_verified": False, 373 | "password_verified_ok": False, 374 | "ip": "5.6.7.8", 375 | "domain": "example.com", 376 | "browser": "Chrome", 377 | "malware_name": "n/a", 378 | "infected_machine": "n/a", 379 | "dg": "DIGIT", 380 | "needs_human_intervention": False, 381 | "notify": False 382 | } 383 | _id = insert_leak_data(test_data) 384 | assert _id >= 0 385 | return _id 386 | 387 | 388 | def test_update_leak_data(): 389 | random_str = uuid.uuid4() 390 | test_data = { 391 | "leak_id": 1, 392 | "email": "aaron%s@example.com" % (random_str,), 393 | "password": "000000", 394 | "password_plain": "000000", 395 | "password_hashed": "d232105eb59a344df4b54db1c24009b1", 396 | "hash_algo": "md5", 397 | "ticket_id": "CSIRC-102", 398 | "email_verified": False, 399 | "password_verified_ok": False, 400 | "ip": "5.6.7.8", 401 | 
"domain": "example.com", 402 | "browser": "Chrome", 403 | "malware_name": "n/a", 404 | "infected_machine": "n/a", 405 | "dg": "DIGIT", 406 | "needs_human_intervention": False, 407 | "notify": False 408 | } 409 | # create my own leak_data row 410 | _id = insert_leak_data(test_data) 411 | 412 | # now UPDATE it 413 | random_str2 = uuid.uuid4() 414 | email2 = "aaron-%s@example.com" % random_str2 415 | 416 | test_data['id'] = _id 417 | test_data.update({"email": email2}) 418 | response = client.put('/leak_data/', json = test_data, headers = VALID_AUTH) 419 | assert response.status_code == 200 420 | print("after UPDATE: response = %r" % response.json()) 421 | 422 | # fetch the results and see if it's really updated 423 | response = client.get('/leak_data/%s' % (_id,), headers = VALID_AUTH) 424 | assert response.status_code == 200 425 | print("data: %r" % response.json()['data']) 426 | assert response.json()['data'][0]['email'] == email2 427 | 428 | 429 | def test_import_csv_with_leak_id(): 430 | _id = test_new_leak() 431 | fixtures_file = "./tests/fixtures/data.csv" 432 | f = open(fixtures_file, "rb") 433 | response = client.post('/import/csv/by_leak/%s' % (_id,), files = {"_file": f}, headers = VALID_AUTH) 434 | logger.info("response = %r" % response.text) 435 | assert 200 <= response.status_code < 300 436 | assert response.json()['meta']['count'] >= 0 437 | 438 | 439 | def test_check_file(): 440 | assert True # trivial check, not implemented yet actually in main.py 441 | 442 | 443 | def test_enrich_email_to_vip(): 444 | email_vip = "aaron@example.com" 445 | response = client.get('/enrich/email_to_vip/%s' % (email_vip,), headers = VALID_AUTH) 446 | assert response.status_code == 200 447 | data = response.json() 448 | assert data['meta']['count'] >= 1 449 | assert data['data'][0]['is_vip'] 450 | 451 | 452 | # noinspection PyPep8Naming 453 | def test_enrich_email_to_vip_INVALID(): 454 | email_vip = "aaron-invalid-does-not-exist@example.com" 455 | response = 
client.get('/enrich/email_to_vip/%s' % (email_vip,), headers = VALID_AUTH) 456 | assert response.status_code == 200 457 | data = response.json() 458 | assert data['meta']['count'] >= 1 459 | assert not data['data'][0]['is_vip'] 460 | 461 | 462 | class TestImportCSVSpycloud(unittest.TestCase): 463 | def test_import_csv_spycloud_invalid_ticket_id(self): 464 | fixtures_file = "./tests/fixtures/data_anonymized_spycloud.csv" 465 | f = open(fixtures_file, "rb") 466 | response = client.post('/import/csv/spycloud/?summary=test2', files = {"_file": f}, headers = VALID_AUTH) 467 | assert response.status_code >= 400 468 | 469 | def test_import_csv_spycloud(self): 470 | fixtures_file = "./tests/fixtures/data_anonymized_spycloud.csv" 471 | f = open(fixtures_file, "rb") 472 | response = client.post('/import/csv/spycloud/%s?summary=test2' % ("ticket99",), files = {"_file": f}, 473 | headers = VALID_AUTH) 474 | assert 200 <= response.status_code < 300 475 | assert response.json()['meta']['count'] >= 0 476 | 477 | 478 | class TestEnricherEmailToDG(unittest.TestCase): 479 | response = None 480 | 481 | def test_enrich_dg_by_email(self): 482 | email = "aaron@example.com" 483 | if not os.getenv('CED_SERVER'): 484 | with self.assertRaises(Exception): 485 | client.get('/enrich/email_to_dg/%s' % (email,), headers = VALID_AUTH) 486 | else: 487 | response = client.get('/enrich/email_to_dg/%s' % (email,), headers = VALID_AUTH) 488 | assert response.status_code == 200 489 | data = response.json() 490 | assert data['meta']['count'] >= 1 491 | assert data['data'][0]['dg'] 492 | -------------------------------------------------------------------------------- /tests/test_parser_spycloud.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pathlib import Path 3 | from modules.parsers.spycloud import SpyCloudParser 4 | from modules.collectors.spycloud.collector import SpyCloudCollector 5 | 6 | 7 | class 
SpyCloudParserTest(unittest.TestCase): 8 | def test_parse(self): 9 | path = 'tests/fixtures/data_anonymized_spycloud.csv' 10 | tc = SpyCloudCollector() 11 | statuscode, df = tc.collect(Path(path)) 12 | assert statuscode == "OK" 13 | tp = SpyCloudParser() 14 | idf = tp.parse(df) 15 | assert idf 16 | # print([ i for i in idf ]) 17 | for i in idf: 18 | if "error_msg" in i.dict() and i.error_msg: 19 | print("error_msg: %s" % i.error_msg) 20 | print("orig_line: %s" % i.original_line) 21 | --------------------------------------------------------------------------------