├── .DS_Store
├── .flake8
├── .github
    ├── pull_request_template.md
    └── workflows
    │   └── python-app.yml
├── .gitignore
├── CITATION.cff
├── Dockerfile
├── Dockerfile-test
├── LICENSE
├── README.md
├── bin
    ├── osd2f
    └── osd2f-decrypt-submissions
├── docs
    ├── adding_new_anonymizers.md
    ├── basic_authentication.md
    ├── deploying_as_a_container.md
    ├── deploying_to_azure.md
    ├── development.md
    ├── microsoft_authentication.md
    ├── protecting_downloads.md
    ├── stresstests.md
    ├── using_entry_encryption.md
    └── using_secret_stores.md
├── mockdata
    └── sample
    │   ├── README.md
    │   ├── sample-platform-russellrodney-3.tar
    │   ├── sample-platform-russellrodney-3.tar.gz
    │   ├── sample-platform-russellrodney-3.zip
    │   └── sample-platform-russellrodney-3
    │       ├── ads_clicked
    │           └── ads_clicked.json
    │       ├── comments
    │           └── comments.json
    │       ├── companies_followed
    │           └── companies_followed.json
    │       ├── engagement
    │           └── engagement.json
    │       ├── posts
    │           ├── posts_0.json
    │           └── posts_1.json
    │       ├── profile_interests
    │           └── profile_interests.json
    │       └── short_messages
    │           └── messages.json
├── osd2f
    ├── __init__.py
    ├── __main__.py
    ├── anonymizers
    │   ├── __init__.py
    │   └── sample_platform.py
    ├── cli.py
    ├── config.py
    ├── database
    │   ├── __init__.py
    │   ├── configuration.py
    │   ├── logs.py
    │   └── submissions.py
    ├── definitions
    │   ├── __init__.py
    │   ├── content_settings.py
    │   ├── security_settings.py
    │   └── submissions.py
    ├── javascript
    │   ├── file_upload.js
    │   ├── parsing
    │   │   ├── fileparser.js
    │   │   ├── jsonparsing.js
    │   │   └── objparsing.js
    │   ├── server_interaction.js
    │   ├── tests
    │   │   ├── fileparsing.test.js
    │   │   ├── jsonparsing.test.js
    │   │   └── objectparsing.test.js
    │   ├── visualization_components
    │   │   ├── consentConfirmation.vue
    │   │   ├── donationContainer.vue
    │   │   └── donationTable.vue
    │   └── visualize.js
    ├── logger.py
    ├── security
    │   ├── __init__.py
    │   ├── authorization
    │   │   ├── __init__.py
    │   │   ├── basic_auth.py
    │   │   ├── microsoft_msal.py
    │   │   └── not_confgured.py
    │   ├── download_encryption
    │   │   ├── __init__.py
    │   │   └── encrypted_zipfile.py
    │   ├── entry_encryption
    │   │   ├── __init__.py
    │   │   ├── file_decryption.py
    │   │   └── secure_entry_singleton.py
    │   └── secrets
    │   │   ├── __init__.py
    │   │   └── azure_keyvault.py
    ├── server.py
    ├── settings
    │   ├── .DS_Store
    │   ├── default_content_settings.yaml
    │   └── default_upload_settings.yaml
    ├── static
    │   ├── .DS_Store
    │   ├── favicon.ico
    │   ├── js
    │   │   ├── libarchive
    │   │   │   ├── wasm-gen
    │   │   │   │   ├── libarchive.js
    │   │   │   │   └── libarchive.wasm
    │   │   │   └── worker-bundle.js
    │   │   ├── main.js
    │   │   └── main.js.LICENSE.txt
    │   ├── keylock.png
    │   ├── skull_phone_cc.jpg
    │   └── study_cc.jpg
    ├── templates
    │   ├── blocks
    │   │   ├── bootstrap_scripts.html.jinja
    │   │   ├── circles_row.html.jinja
    │   │   ├── footer.html.jinja
    │   │   ├── head.html.jinja
    │   │   ├── jumbotron.html.jinja
    │   │   ├── navbar.html.jinja
    │   │   └── two_block_row.html.jinja
    │   └── formats
    │   │   ├── base.html.jinja
    │   │   ├── researcher_template.html.jinja
    │   │   ├── static_template.html.jinja
    │   │   └── upload_template.html.jinja
    └── utils.py
├── package-lock.json
├── package.json
├── requirements.txt
├── requirements_dev.txt
├── scripts
    ├── locally_decrypt_entries.py
    ├── locust_stress_testing.py
    └── sample_data_generator.py
├── setup.py
├── tests
    ├── __init__.py
    ├── anonymizer_module_test.py
    ├── content_configuration_test.py
    ├── create_app_test.py
    ├── db_interaction_test.py
    ├── download_data_protection_test.py
    ├── initialization_test.py
    ├── local_decryption_test.py
    ├── sample_anonymizer_test.py
    ├── sample_data_generator_test.py
    ├── security_authorization_test.py
    ├── security_entry_test.py
    ├── security_secrets_test.py
    └── utils_settings_test.py
└── webpack.config.js


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/.DS_Store


--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # Recommend matching the black line length (default 88),
3 | # rather than using the flake8 default of 79:
4 | max-line-length = 88
5 | extend-ignore =
6 |     # See https://github.com/PyCQA/pycodestyle/issues/373
7 |     E203,
8 | exclude = node_modules, static


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | # FEATURE ADDED OR BUG FIXED
 2 | 
 3 | ## Closes #issueNO
 4 | ____
 5 | Paragraph description of change
 6 | 
 7 | ## Assumptions
 8 | Any underlying assumptions for this change (e.g. filetypes, supported browsers, deploy environments)
 9 | 
10 | ## Usage / Minimal Example
11 | 
12 | Instructions on how to verify changes this PR by running code
13 | 
14 | **before**
15 | 
16 | ```bash
17 | echo "stuff breaks"
18 | osd2f
19 | 
20 | ```
21 | ```python
22 | ValueError: Stuff broke!
23 | ```
24 | **after**
25 | ```bash
26 | osd2f
27 | ```
28 | ```python
29 | Success!
30 | ```
31 | 
32 | ## Checklist
33 | - [ ] Added tests if appropriate (and it should always be)
34 | - [ ] Created new issues when required
35 | 


--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: Python application
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [main]
 9 |   pull_request:
10 |     branches: [main]
11 | 
12 | jobs:
13 |   development_build:
14 |     runs-on: ubuntu-latest
15 | 
16 |     steps:
17 |       - uses: actions/checkout@v4
18 |       - name: Set up Python 3.9
19 |         uses: actions/setup-python@v4
20 |         with:
21 |           python-version: 3.9
22 |       - name: Install dependencies
23 |         run: |
24 |           python -m pip install --upgrade pip
25 |           pip install -r requirements.txt
26 |           pip install -r requirements_dev.txt
27 |       - name: Lint with flake8
28 |         run: flake8 ./
29 |       - name: mypy
30 |         run: mypy ./osd2f/ --ignore-missing-imports
31 |       - name: Test with pytest
32 |         run: pytest ./
33 |       - name: Install & do a dry run
34 |         run: |
35 |           pip install -e ./
36 |           osd2f --dry-run
37 |       - name: Check config generation functionality
38 |         run: |
39 |           osd2f --generate-current-config cc.yaml
40 |           [ -s cc.yaml ] #check whether the file is not empty
41 |       - name: Run Jest
42 |         uses: stefanoeb/jest-action@1.0.3
43 | 
44 |   release_build:
45 |     runs-on: ubuntu-latest
46 | 
47 |     steps:
48 |       - uses: actions/checkout@v4
49 |       - name: Set up Python 3.9
50 |         uses: actions/setup-python@v4
51 |         with:
52 |           python-version: 3.9
53 |       - name: Install with plain pip
54 |         run: pip install ./
55 |       - name: Do a dry run
56 |         run: osd2f --dry-run
57 | 
58 |   docker_build:
59 |     runs-on: ubuntu-latest
60 | 
61 |     steps:
62 |       - name: checkout files
63 |         uses: actions/checkout@v4
64 |       - name: build Docker image
65 |         uses: docker/build-push-action@v4
66 |         with:
67 |           file: Dockerfile
68 |           push: false
69 |           load: ${{ github.event_name == 'pull_request' }}
70 |           context: .
71 | 
72 |   docker_test_build:
73 |     runs-on: ubuntu-latest
74 | 
75 |     steps:
76 |       - name: checkout files
77 |         uses: actions/checkout@v4
78 |       - name: build Docker image
79 |         uses: docker/build-push-action@v4
80 |         with:
81 |           file: Dockerfile-test
82 |           push: false
83 |           load: ${{ github.event_name == 'pull_request' }}
84 |           context: .
85 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Extension of : https://github.com/github/gitignore/blob/master/Python.gitignore
  2 | # License : CCO 1.0
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | share/python-wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | MANIFEST
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .nox/
 46 | .coverage
 47 | .coverage.*
 48 | .cache
 49 | nosetests.xml
 50 | coverage.xml
 51 | *.cover
 52 | *.py,cover
 53 | .hypothesis/
 54 | .pytest_cache/
 55 | cover/
 56 | 
 57 | # Translations
 58 | *.mo
 59 | *.pot
 60 | 
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | db.sqlite3
 65 | db.sqlite3-journal
 66 | 
 67 | # Flask stuff:
 68 | instance/
 69 | .webassets-cache
 70 | 
 71 | # Scrapy stuff:
 72 | .scrapy
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | .pybuilder/
 79 | target/
 80 | 
 81 | # Jupyter Notebook
 82 | .ipynb_checkpoints
 83 | 
 84 | # IPython
 85 | profile_default/
 86 | ipython_config.py
 87 | 
 88 | # pyenv
 89 | #   For a library or package, you might want to ignore these files since the code is
 90 | #   intended to run in multiple environments; otherwise, check them in:
 91 | # .python-version
 92 | 
 93 | # pipenv
 94 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 95 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 96 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 97 | #   install all needed dependencies.
 98 | #Pipfile.lock
 99 | 
100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101 | __pypackages__/
102 | 
103 | # Celery stuff
104 | celerybeat-schedule
105 | celerybeat.pid
106 | 
107 | # SageMath parsed files
108 | *.sage.py
109 | 
110 | # VScode stuff
111 | .vscode/
112 | 
113 | # Environments
114 | .env
115 | .venv
116 | env/
117 | venv/
118 | ENV/
119 | env.bak/
120 | venv.bak/
121 | 
122 | # Spyder project settings
123 | .spyderproject
124 | .spyproject
125 | 
126 | # Rope project settings
127 | .ropeproject
128 | 
129 | # mkdocs documentation
130 | /site
131 | 
132 | # mypy
133 | .mypy_cache/
134 | .dmypy.json
135 | dmypy.json
136 | 
137 | # Pyre type checker
138 | .pyre/
139 | 
140 | # pytype static type analyzer
141 | .pytype/
142 | 
143 | # Cython debug symbols
144 | cython_debug/
145 | 
146 | # frontend dependencies
147 | node_modules/
148 | 
149 | # Azure configs
150 | .azure/
151 | 
152 | # IDE specific things
153 | .vscode
154 | 
155 | # database files
156 | *.db 
157 | *-shm 
158 | *-wal


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: "If you use this software, please cite it as below."
 3 | authors:
 4 | - family-names: "Araujo"
 5 |   given-names: "Theo"
 6 | - family-names: "Ausloos"
 7 |   given-names: "Jef"
 8 | - family-names: "van Atteveldt"
 9 |   given-names: "Wouter"
10 | - family-names: "Loecherbach"
11 |   given-names: "Felicia"  
12 | - family-names: "Moeller"
13 |   given-names: "Judith"
14 | - family-names: "Ohme"
15 |   given-names: "Jakob"
16 | - family-names: "Trilling"
17 |   given-names: "Damian"
18 | - family-names: "van de Velde"
19 |   given-names: "Bob"
20 | - family-names: "de Vreese"
21 |   given-names: "Claes"
22 | - family-names: "Welbers"
23 |   given-names: "Kasper"
24 | title: "OSD2F: Open Source Data Donation Framework"
25 | doi: 10.31235/osf.io/xjk6t
26 | url: "https://github.com/uvacw/osd2f"
27 | 
28 | references:
29 |   - authors:
30 |       - family-names: "Araujo"
31 |         given-names: "Theo"
32 |       - family-names: "Ausloos"
33 |         given-names: "Jef"
34 |       - family-names: "van Atteveldt"
35 |         given-names: "Wouter"
36 |       - family-names: "Loecherbach"
37 |         given-names: "Felicia"  
38 |       - family-names: "Moeller"
39 |         given-names: "Judith"
40 |       - family-names: "Ohme"
41 |         given-names: "Jakob"
42 |       - family-names: "Trilling"
43 |         given-names: "Damian"
44 |       - family-names: "van de Velde"
45 |         given-names: "Bob"
46 |       - family-names: "de Vreese"
47 |         given-names: "Claes"
48 |       - family-names: "Welbers"
49 |         given-names: "Kasper"
50 |     doi: 10.31235/osf.io/xjk6t
51 |     journal: "Computational Communication Research"
52 |     title: "OSD2F: Open Source Data Donation Framework"
53 |     type: article
54 |     year: Forthcoming


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9.7-buster
 2 | 
 3 | EXPOSE 8000
 4 | 
 5 | ENV OSD2F_SECRET=""
 6 | ENV OSD2F_MODE="Development"
 7 | ENV OSD2F_DB_URL="sqlite://:memory:"
 8 | 
 9 | # make code available
10 | COPY ./ ./osd2f
11 | 
12 | # add build-secret to hypercorn config
13 | 
14 | WORKDIR /osd2f
15 | 
16 | # setup dependencies
17 | RUN pip install ./ 
18 | 
19 | # minimal check to make sure the install works
20 | RUN osd2f --dry-run
21 | 
22 | # set the default command for the container (i.e. running production)
23 | CMD [ "hypercorn", "osd2f.__main__:app", "-b", "0.0.0.0:8000" ]
24 | 


--------------------------------------------------------------------------------
/Dockerfile-test:
--------------------------------------------------------------------------------
 1 | # Tests for Python 3.9 compatibilty
 2 | 
 3 | FROM python:3.9.9-buster
 4 | 
 5 | EXPOSE 8000
 6 | ARG secret 
 7 | 
 8 | ENV OSD2F_SECRET=$secret
 9 | ENV OSD2F_MODE="Development"
10 | ENV OSD2F_DB_URL="sqlite://:memory:"
11 | 
12 | ## make code available
13 | COPY ./ ./osd2f
14 | 
15 | ## add build-secret to hypercorn config
16 | 
17 | WORKDIR /osd2f
18 | 
19 | ## setup dependencies
20 | RUN pip install ./ 
21 | RUN pip install -r requirements.txt
22 | RUN pip install -r requirements_dev.txt
23 | 
24 | ## run tests
25 | RUN flake8 ./
26 | RUN mypy ./osd2f/ --ignore-missing-imports
27 | RUN pytest ./
28 | 
29 | RUN osd2f --dry-run
30 | 
31 | # Tests for Python 3.8 compatibility
32 | 
33 | FROM python:3.8.12-buster
34 | 
35 | EXPOSE 8000
36 | ARG secret 
37 | 
38 | ENV OSD2F_SECRET=$secret
39 | ENV OSD2F_MODE="Development"
40 | ENV OSD2F_DB_URL="sqlite://:memory:"
41 | 
42 | ## make code available
43 | COPY ./ ./osd2f
44 | 
45 | ## add build-secret to hypercorn config
46 | 
47 | WORKDIR /osd2f
48 | 
49 | ## setup dependencies
50 | RUN pip install ./ 
51 | RUN pip install -r requirements.txt
52 | RUN pip install -r requirements_dev.txt
53 | 
54 | ## run tests
55 | RUN flake8 ./
56 | RUN mypy ./osd2f/ --ignore-missing-imports
57 | RUN pytest ./
58 | 
59 | RUN osd2f --dry-run
60 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ![Python application](https://github.com/uvacw/osd2f/workflows/Python%20application/badge.svg?branch=main)
  2 | <a href="https://github.com/psf/black"><img alt="Code style: black" src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
  3 | # OSD2F: Open Source Data Donation Framework (No longer maintained)
  4 | 
  5 | ## ⚠️ Update: this repository is archived ⚠️
  6 | 
  7 | This repository is being archived as is. The code can be reused by others as specified in the license, yet security updates and maintenance are not currently being done. Those interested in using the code must therefore consider performing any relevant security updates priot to using the tool. The OSD2F authors are now working on a new data donation infrastructure, which can be found here: [https://datadonation.eu](https://datadonation.eu). This infrastructure contains a stand-alone tool (PORT) which is actively maintained and updated.
  8 | 
  9 | 
 10 | 
 11 | ## Goal
 12 | 
 13 | Use OSD2F to run your own Data Donation service. The aim of this project is to facilitate 
 14 | scientists to collect data donations, by providing an easy-to-use web-based data donation 
 15 | platform. Here, scientists can instruct participants in their research to upload data 
 16 | exports from major online platforms (generally based on participants rights to their own
 17 | data under GDPR).
 18 | 
 19 | The App aims to be as export agnostic as possible while keeping things feasible to maintain.
 20 | You can specify the files and the whitelist of JSON fields through YAML configuration. 
 21 | As such it supports Data Donation Packages of arbitrary format in JSON files (although it assumes they are UTF-8 encoded). 
 22 | 
 23 | ## Using OSD2F locally
 24 | 
 25 | Installing the OSD2F locally is relatively simple by using pip's support for installation straight from 
 26 | VCS. However, we recommend local installation only in cases in which you want to familiarize yourself
 27 | with OSD2F and **never for production (real data collection) purposes**. 
 28 | 
 29 | ***Note:** There is a different set of instructions for development purposes in the [development docs](docs/development.md)*
 30 | 
 31 | ### Installation (not for development)
 32 | 
 33 | OSD2F requires python 3.8 or up, check your version by running:
 34 | 
 35 | ```bash
 36 | python --version
 37 | ```
 38 | should say something like:
 39 | > Python 3.8.0
 40 | 
 41 | *Note: it's recommended to use a virtual environment, please consult de [development docs](docs/development.md) for more information.*
 42 | 
 43 | ```bash
 44 | pip install git+https://github.com/uvacw/osd2f
 45 | ```
 46 | 
 47 | ### Running
 48 | 
 49 | ```bash
 50 | osd2f -h # see help
 51 | ```
 52 | 
 53 | ```bash
 54 | osd2f -m Testing # to run a testing instance
 55 | ```
 56 | 
 57 | You can configure the text content of the webpages. The easiest way to get started
 58 | is by generating a YAML file with the default values and editing it to your liking:
 59 | 
 60 | ```bash
 61 | osd2f --generate-current-config config.yaml
 62 | ```
 63 | 
 64 | You can start the server with this content configuration by passing a file-path 
 65 | via the CLI. 
 66 | 
 67 | ```bash
 68 | osd2f --content-configuration config.yaml # make sure you've edited it first
 69 | ```
 70 | 
 71 | ***Note**: OSD2F will store the configuration in the database. In development mode, the
 72 | most recently edited version is used between the database and the file.*
 73 | 
 74 | ## See also:
 75 | 
 76 | 1. [how to develop](docs/development.md)
 77 | 2. [Deploying to Azure](docs/deploying_to_azure.md)
 78 | 3. [Running stresstests](docs/stresstests.md)
 79 | 4. [Testing the researcher login with basic auth](docs/basic_authentication.md)
 80 | 5. [Using Microsoft Authentication via SSO](docs/microsoft_authentication.md)
 81 | 6. [Setting password on researcher downloads](docs/protecting_downloads.md)
 82 | 7. [Adding additional (server side) anonymizers](docs/adding_new_anonymizers.md)
 83 | 
 84 | ## Credits:
 85 | 
 86 | If you use this tool, please cite the paper:
 87 | 
 88 | *APA:*
 89 | 
 90 | Araujo, T., Ausloos, J., van Atteveldt, W., Loecherbach, F., Moeller, J., Ohme, J., Trilling, D., van de Velde, B., de Vreese, C., & Welbers, K. (Forthcoming). OSD2F: An Open-Source Data Donation Framework. *Computational Communication Research*, https://osf.io/preprints/socarxiv/xjk6t/
 91 | 
 92 | *Bibtex:* 
 93 | 
 94 | ```
 95 | @article{osd2f,
 96 |  title={OSD2F: An Open-Source Data Donation Framework},
 97 |  DOI={10.31235/osf.io/xjk6t},
 98 |  author={Araujo, Theo and Ausloos, Jef and {van Atteveldt}, Wouter and Loecherbach, Felicia and Moeller, Judith and Ohme, Jakob and Trilling, Damian and {van de Velde}, Bob and {de Vreese}, Claes and Welbers, Kasper},
 99 |  year={forthcoming},
100 |  journal = {Computational Communication Research}
101 | }
102 | ```
103 | 
104 | 
105 | This tool is inspired in earlier approaches that enable researchers to partner with individuals willing to donate their data for academic research, including [Web Historian](https://github.com/erickaakcire/webhistorian) (Menchen-Trevino, 2016), among others.
106 | 


--------------------------------------------------------------------------------
/bin/osd2f:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from osd2f import cli
3 | 
4 | cli.parse_and_run()


--------------------------------------------------------------------------------
/bin/osd2f-decrypt-submissions:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from scripts.locally_decrypt_entries import run_script
3 | 
4 | run_script()


--------------------------------------------------------------------------------
/docs/adding_new_anonymizers.md:
--------------------------------------------------------------------------------
  1 | # Adding new anonymizer functions
  2 | 
  3 | This guide explains how to add anonymization functions to the codebase. You will need
  4 | Python skill to create a new anonymizer.
  5 | 
  6 | ## What are anonymizers
  7 | 
  8 | Simply put, anonymizers are functions that run on the server *before* data is collected. They 
  9 | are meant for more complex processing of submission entries for privacy protecting purposes. For 
 10 | instance, they could be named entity recognition functions that filter out personal names using
 11 | (pretrained) machine learning models. 
 12 | 
 13 | Anonymizers are run server-side **before consent is given** and should never store data on disk. 
 14 | 
 15 | ## How do anonymizers work
 16 | 
 17 | When a user selects files to upload, the containing entries are parsed. This parsing removes fields
 18 | that are not on the whitelist, and 'flattens' the dictionaries using `.` notation. After this client-side
 19 | parsing, the entries are send to the server where the configured anonymizers are called.
 20 | 
 21 | For each file that has a configured anonymizer, each entry is provided to that anonymizer function. The
 22 | anonymizer function should return the entry after changing it's contents. These entries are then returned
 23 | to the client for respondents to inspect and provide consent on the real upload. 
 24 | 
 25 | ## Creating a new anonymizer
 26 | 
 27 | You can create an anonymizer by adding a file to the `osd2f/anonymizers` directory, importing it in the 
 28 | `osd2f/anonymizers/__init__.py` file and configuring the use of the anonymizer in the upload settings 
 29 | YAML file used for your deployment. 
 30 | 
 31 | ### Writing a new anonymizer
 32 | 
 33 | Anonymizers should have this signature: 
 34 | 
 35 | ```python
 36 | async def your_anonymizer(
 37 |     entry: typing.Dict[str, typing.Any], argument: str = "optional_string_argument"
 38 | ) -> typing.Dict[str, typing.Any]
 39 | ```
 40 | 
 41 | For example, we'll implement an anonymizer that goes through a text and removes any word after a 
 42 | given substring:
 43 | 
 44 | ```python
 45 | # in new file: osd2f/anonymizers/nextword_anonymizer.py
 46 | from typing import Any, Dict
 47 | 
 48 | async def nextword_anonymizer(entry: Dict[str,Any], substring: str):
 49 |     
 50 |     redacted_entry = {}
 51 |     for field, value in entry.items():
 52 |         # keep field values that are not a string as-is
 53 |         # in the redacted entry
 54 |         if type(value)!=str:
 55 |             redacted_entry[field] = value
 56 |             continue
 57 |         
 58 |         # naively split the value on spaces
 59 |         # and keep the words that do not come after
 60 |         # the substring
 61 |         previous_token = ""
 62 |         new_value = []
 63 |         for token in value.split(" "):
 64 |             if previous_token!=substring:
 65 |                 new_value.append(token)
 66 |             else:
 67 |                 new_value.append("<redacted>")
 68 |             previous_token = token
 69 | 
 70 |         redacted_entry[field] = " ".join(new_value)
 71 |     
 72 |     # make sure to return the redacted version of the entry
 73 |     return redacted_entry
 74 | 
 75 | ```
 76 | 
 77 | You can test the function:
 78 | 
 79 | ```python
 80 | from osd2f.anonymizers.nextword_anonymizer import nextword_anonymizer
 81 | 
 82 | fake_entry = {
 83 |     "text" : "mr Darcy was unamused, but so was mr bennet"
 84 | }
 85 | 
 86 | await nextword_anonymizer(fake_entry, "mr")
 87 | 
 88 | ```
 89 | outputs: 
 90 | ``` {'text': 'mr <redacted> was unamused, but so was mr <redacted>'} ```
 91 | 
 92 | 
 93 | ### Adding the anonymizer to imports
 94 | 
 95 | For OSD2F to recognize the new anonymizer, it needs to be added to the `osd2f/anonymizers/__init__.py` file, like so:
 96 | 
 97 | ```python
 98 | # in osd2f/anonymizers/__init__.py 
 99 | import re
100 | import typing
101 | 
102 | from .sample_platform import redact_text
103 | from .nextword_anonymizer import nextword_anonymizer # <- import the new anonymizer function
104 | from ..definitions import Submission, SubmissionList, UploadSettings
105 | from ..logger import logger
106 | 
107 | options: typing.Dict[str, typing.Callable[[typing.Dict, str], typing.Awaitable]] = {
108 |     redact_text.__name__: redact_text,  # noqa
109 |     nextword_anonymizer.__name__ : nextword_anonymizer # noqa <- add it to the options
110 | }
111 | 
112 | ...rest of the file...
113 | ```
114 | 
115 | 
116 | ### Configuring settings to use this anonymizer
117 | 
118 | Let's try to use this new anonymizer. First, we create an upload settings file:
119 | 
120 | ```yaml
121 | # in osd2f/settings/default_upload_settings.yaml
122 | files:
123 |   example.json:
124 |     anonymizers:
125 |       - nextword_anonymizer : "mr"
126 |     accepted_fields:
127 |       - text
128 |       - title
129 |       - number
130 | ```
131 | 
132 | Then we create a file to donate called `example.json`:
133 | 
134 | ```json
135 | [
136 |   {
137 |     "title": "mr Frogs day out",
138 |     "text": "mr Frog was driving on the windy road towards mr Toad",
139 |     "number": 100,
140 |     "other": "is not on the whitelist"
141 |   }
142 | ]
143 | ```
144 | 
145 | We start the OSD2F platform:
146 | 
147 | ```bash
148 | OSD2F_SECRET=secret \                                                              
149 | OSD2F_ENTRY_SECRET=TESTSECRET \
150 | OSD2F_MODE=Development \
151 | osd2f -vvv
152 | ```
153 | 
154 | We upload our `example.json` file on the [upload page](http://localhost:5000/upload)
155 | 
156 | Press the `inspect & edit` button and you will see the redacted result in the table! 


--------------------------------------------------------------------------------
/docs/basic_authentication.md:
--------------------------------------------------------------------------------
 1 | # Basic Authentication for easier testing / local installs
 2 | 
 3 | ## Important note
 4 | 
 5 | Basic authentication is not considered a 'safe' authorization mechanism by todays standards. 
 6 | This implementation serves to make login testing easier without requiring an OAuth platform
 7 | to be available. 
 8 | 
 9 | Some reasons why you should not use basic auth in production:
10 | 1. Passwords are send unencrypted, so any communication outside HTTPS leaks the password
11 | 2. Browsers tend to automatically store basic auth username-password combinations, and do
12 |    so in an insecure fashion
13 | 
14 | ## How does it work
15 | 
16 | Basic auth will prompt researchers for a username-password combination provided as an environment
17 | configuration. 
18 | 
19 | ```bash
20 | 
21 | OSD2F_BASIC_AUTH="username21;unguessablepassword" osd2f -m Development
22 | ```
23 | 
24 | Will start a (development) server that allows researchers to login by entering the username `username21` and
25 | password `unguessablepassword`. Needless to say, you must be very carefull about who knows the username and
26 | password.


--------------------------------------------------------------------------------
/docs/deploying_as_a_container.md:
--------------------------------------------------------------------------------
 1 | # Deploying OSD2F as a container
 2 | 
 3 | ## What is a container and why use it?
 4 | 
 5 | Containers, most populairly [docker containers](https://www.docker.com/resources/what-container) are ways
 6 | to package an application, making sure all dependencies and environment characteristics are wrapped
 7 | together. This makes containers ideal for deployment across different environments, without worrying
 8 | about OS compatibilities, libraries that need to be installed on servers etcetera. 
 9 | 
10 | The popularity of containers as a deployment model is clear in the broad support. PaaS offerings such
11 | as [Google app engine](https://cloud.google.com/appengine/docs/flexible), [Amazon ECS](https://aws.amazon.com/ecs/) 
12 | and [Microsoft Azure Container Instances](https://docs.microsoft.com/en-us/azure/container-instances/container-instances-quickstart)
13 | support running arbitrary containers without requiring any advanced cloud management skills.
14 | 
15 | For more advanced setups, the common deployment infrastructure is [Kubernetes (k8s)](https://kubernetes.io/), a container orchestration platform that combines containers and allows for their deployment across
16 | servers. 
17 | 
18 | ### TL;DR: 
19 | * containers make applications easy to move between servers
20 | * containers are widely supported by cloud providers
21 | 
22 | ## Creating an OSD2F container
23 | 
24 | ### Building a test container
25 | 
26 | If you want to test whether the code still works after modifications:
27 | 
28 | ```bash
29 | docker build -t osd2f-test -f Dockerfile-test ./
30 | ```
31 | If the build is succesfull, that means all tests have passed.
32 | You can access the container by running it:
33 | 
34 | ```bash
35 | docker run -it osd2f-test bash
36 | ```
37 | 
38 | This is slower to build and contains more dependencies that are normally only used for development. This
39 | is not the container specification that is meant for production deployments.
40 | 
41 | ### Building a container for deployment
42 | 
43 | ```bash
44 | docker build -t osd2f -f Dockerfile ./ 
45 | ```
46 | 
47 | Running the container (using port 8000), the `-p` flag sets the host port of you machine to refer to the port of the container. The `-e` flags are used to set environment variables. Note that production instances
48 | always require a session secret. The example here is not suited for production, as you should avoid allowing researcher access through basic authentication and the database is an in-memory database that
49 | will be reset to empty every time the container is restarted. 
50 | 
51 | ```bash
52 | docker run -it \
53 |     -e OSD2F_MODE="Production" \
54 |     -e OSD2F_BASIC_AUTH='user;pass' \
55 |     -e OSD2F_SECRET="a big secret here" \
56 |     -e OSD2F_DB_URL="sqlite://:memory:" \
57 |     -p 8000:8000 \
58 |     osd2f
59 | ```
60 | You should be able to reach the server now at http://localhost:8000/
61 | 
62 | ## Deploying containers to production
63 | 
64 | Container use in production is strongly related to the solution you will be using. Some deployment platforms enable you to upload the docker image through a CLI tool or as part of a CI/CD interface. Other systems such as [Kubernetes (k8s)](https://kubernetes.io/) require the docker image to be available in a repository.
65 | 
66 | You can push the container image to a repository of your choosing. The syntax ([as specified by Docker](https://docs.docker.com/docker-hub/repos/)) is the following: 
67 | 
68 | ```bash
69 | docker push <hub-user>/<repo-name>:<tag>
70 | ```
71 | 
72 | Whether and which repository to use depends on the the platform you choose to use for the deployment. Note that running the container on a single server will risk limited availability (downtime when this server experiences issues) and comes at considerable operational overhead (configuring security, keeping the systrem up-to-date, backing up data etcetera). 
73 | 
74 | 


--------------------------------------------------------------------------------
/docs/deploying_to_azure.md:
--------------------------------------------------------------------------------
  1 | # Deploying OSD2F to azure
  2 | 
  3 | ## disclaimer
  4 | 
  5 | This documentation is intended to demonstrate how to set up OSD2F as an Azure webapp service. It is oriented towards putting an interface out there to see, but is not set up for actual data collection. The needs and conditions of your specific project may impact the way the app should be configured. Consult you cloud engineer before applying the below steps to understand how to adapt them to your project.
  6 | 
  7 | # general preparations
  8 | 
  9 | ## Make sure you azure CLI client is logged in and selected the appropriate subscription. 
 10 | 
 11 | ```bash
 12 | az login
 13 | az account set --subscription <subscription-to-publish-to>
 14 | export AZURE_RESOURCE_GROUP=<your-resource-group>
 15 | export WEBAPPNAME="osd2f-test" # must be globally unique, e.g. unused on Azure
 16 | ```
 17 | 
 18 | Doublecheck with:
 19 | ```bash
 20 | az account show
 21 | ```
 22 | 
 23 | ## creating the webapp
 24 | 
 25 | Using webapp up will setup the webapp, the appservice and the plan required. The app won't work before we also apply the other commands. Make sure to be inside the OSD2F folder (locally) when running this command.
 26 | 
 27 | ```bash
 28 | # python 3.9 is in early access on Azure (2021-11-05),
 29 | # you can select it in the Settings > Configuration 
 30 | # panel of the App Service under `Minor version`
 31 | az webapp up  \
 32 |     --runtime 'python|3.8' \
 33 |     --location "West Europe" \
 34 |     --sku F1 \
 35 |     --verbose \
 36 |     --name $WEBAPPNAME
 37 | ```
 38 | 
 39 | Minor addition for security:
 40 | ```bash
 41 | az webapp identity assign --resource-group $AZURE_RESOURCE_GROUP --name $WEBAPPNAME 
 42 | ```
 43 | 
 44 | # setting up config with in-memory db
 45 | 
 46 | ## 1. Setup desired settings:
 47 | 
 48 | ```bash
 49 | az webapp config appsettings set --name  $WEBAPPNAME\
 50 |     --resource-group $AZURE_RESOURCE_GROUP \
 51 |     --settings \
 52 |         OSD2F_SECRET=$RANDOM$RANDOM$RANDOM$RANDOM \
 53 |         OSD2F_DB_URL="sqlite://:memory:"  \
 54 |         OSD2F_MODE="Production"
 55 | ```
 56 | Please note: 
 57 | 
 58 |  - **OSD2F_SECRET** : This will introduce a random secret that is different every time
 59 | this command is run. The secret is used by the server to maintain
 60 | sessions, so running this command will 'logout' any ongoing session.
 61 | - **OSD2F_DB_URL** : The database to use, the example has an in-memory database, see the next section for a setup with a proper database.
 62 | - **OSD2F_MODE** : the mode in which to run the server, should pretty much always be production for internet facing deployments. 
 63 | 
 64 | **NOTE**: deploying secrets in this way is not 'safe', anyone with 
 65 |           admin access to this resource group will be able to see
 66 |           the secret!
 67 | 
 68 | set the custom startup command. We use the hypercorn ASGI server middleware for performance reasons. 
 69 | 
 70 | ```bash
 71 | az webapp config set \
 72 |     --resource-group $AZURE_RESOURCE_GROUP \
 73 |     --name $WEBAPPNAME \
 74 |     --startup-file "python -m hypercorn osd2f.__main__:app -b 0.0.0.0"
 75 | ```
 76 | 
 77 | # setting up config with real database
 78 | 
 79 | ## 1. Create the database
 80 | 
 81 | We'll assume a Postgres database, but anything supported by Tortoise should work. 
 82 | ## 2. You can now formulate a connection string
 83 | 
 84 | test in locally (dont forget to whitelist your IP address in the database firewall rules):
 85 | 
 86 | ```bash
 87 | # you should have the admin user (db_user) password (db_pass) and database name (db_name)
 88 | db_user='postgres'; \
 89 | db_pass='YOUR-PASSWORD-HERE'; \
 90 | db_name="YOUR-DATABASE-NAME-HERE"; \
 91 | osd2f -db "postgres://$db_user@$db_name:$db_pass@$db_name.postgres.database.azure.com:5432/postgres?ssl=True"
 92 | ```
 93 | 
 94 | If you see an error related to "hba_config" it probably means access is incorrectly configured. Check:
 95 | 
 96 | - [ ] did you add the the database server name after the username? (e.g. `user@database:password` )
 97 | - [ ] if you are trying to connect from a local machine, did you whitelist your IP in the database security configuration?
 98 | ## 3. Set 'Allow access to Azure services' to 'Yes' 
 99 | 
100 | This will allow the webapp to connect. Do this in the security configuration of the database you want to connect to. 
101 | 
102 | ## 4. Setup desired settings:
103 | 
104 | ```bash
105 | az webapp config appsettings set --name  $WEBAPPNAME\
106 |     --settings \
107 |         OSD2F_SECRET=$RANDOM$RANDOM$RANDOM$RANDOM \
108 |         OSD2F_MODE="Production"
109 | ```
110 | Please note: 
111 | 
112 |  - **OSD2F_SECRET** : This will introduce a random secret that is different every time
113 | this command is run. The secret is used by the server to maintain
114 | sessions, so running this command will 'logout' any ongoing session.
115 | - **OSD2F_MODE** : the mode in which to run the server, should pretty much always be production for internet facing deployments. 
116 | 
117 | **NOTE**: deploying secrets in this way is not 'safe', anyone with 
118 |           admin access to this resource group will be able to see
119 |           the secret! Consider using a [secret store](./using_secret_stores.md)
120 | 
121 | set the custom startup command. We use the hypercorn ASGI server middleware for performance reasons. 
122 | 
123 | ## 5. Add the connection string
124 | ```bash 
125 | db_user='postgres'; \
126 | db_pass='YOUR-PASSWORD-HERE'; \
127 | db_name="YOUR-DATABASE-NAME-HERE"; \
128 | \
129 | az webapp config connection-string set \
130 |     --name $WEBAPPNAME \
131 |     -t PostgreSQL \
132 |     --settings custom1="postgres://$db_user@$db_name:$db_pass@$db_name.postgres.database.azure.com:5432/postgres?ssl=True"
133 | ```
134 | ## 6. We map the Azure protected database connection strings to the startup command of OSD2F.
135 | 
136 | ```bash
137 | az webapp config set \
138 |     --resource-group "<YOUR-RESOURCE-GROUP>"
139 |     --name $WEBAPPNAME
140 |     --startup-file 'OSD2F_DB_URL=$POSTGRESQLCONNSTR_custom1 python -m hypercorn osd2f.__main__:app -b 0.0.0.0'
141 | ```
142 | 
143 | ## HINT: Check the webapp settings 
144 | 
145 | In the app-service > settings > configurations tab, you can check whether the correct database URL string was received and, under general settings, whether the correct startup command was registered. 
146 | 
147 | # deploying the app 
148 | 
149 | Deploying the app uploads the source code and provisions the application. If you want changes to the code to go live, this is the command to run. 
150 | 
151 | ```bash
152 | az webapp up  \
153 |     --runtime 'python|3.8' \
154 |     --location "West Europe" \
155 |     --name $WEBAPPNAME
156 | ```
157 | 
158 | # updating the app
159 | 
160 | If at a certain point you need to update the app settings (e.g., change from SQLlite to Postgres), you will also need to include the ```resource-group``` parameter in the Azure commands. You can get the resource-group info from the app overview. 
161 | 
162 | Afterwards, you can define it as an environment variable:
163 | ```export RESOURCEGROUPNAME="includethenamehere"```
164 | 
165 | And include it along with the commands above, after the webapp name. For example:
166 | ```
167 | az webapp config connection-string set \
168 |     --name $WEBAPPNAME --resource-group $RESOURCEGROUPNAME\
169 |     ...
170 | ```
171 | 
172 | # applying new configurations (temporary method)
173 | 
174 | There is currently no configuration interface for the content of the app. You can update remote (e.g. Azure)
175 | webapp content configurations by locally creating a content-file and running the app with the remote
176 | database connection. 
177 | 
178 | For example:
179 | 
180 | ```bash
181 | OSD2F_DB_URL="<your database string>" \
182 | OSD2F_SECRET="arbitrary string" \
183 | osd2f \
184 | -m Production \
185 | -cc your_content_settings.yaml
186 | ```
187 | 
188 | 
189 | 


--------------------------------------------------------------------------------
/docs/development.md:
--------------------------------------------------------------------------------
  1 | # Development
  2 | 
  3 | ## Core assumptions
  4 | 
  5 | Choices in this codebase are based on some assumptions as to the uses of the
  6 | framework. Because of these assumptions some things are simple, whereas others
  7 | are harder. When contributing code, please make sure you keep these assumptions
  8 | in mind:
  9 | 
 10 | - **Functionality should be generic over many kinds of donation formats**. 
 11 | This means that the frontend, endpoints, anonymizers and configurations should be able to handle pretty arbitrary JSON*. Assumptions about existing fields, datatypes etcetera should be limited to
 12 |   - the default configuration file example
 13 |   - export source specific anonymizers
 14 | - **Configuration targets users with low technical expertise**, 
 15 | which means the selection of fields to include and anonymizers to use should be relatively easy to infer from the example configuration. It also means that we want to avoid making content decisions in code.
 16 | - **This framework is for collection, *not* analysis**. 
 17 | The intended use of this framework is to provide a participant facing data submission interface with good privacy guarantees. The researchers who administer the deployment can download the data to do analysis in their own environment. The entries submitted can therefore be treated as a 'black box'. This helps maintain flexibility (no database migrations for new donation types) and maintainability (changes in export formats can be upgraded via configuration only).
 18 | - **All content data is sensitive** and should never be in any logs or 
 19 |   (disk) storage UNLESS after the explicit consent step. This also means
 20 |   that AT NO POINT any of the JSON fields or values should be in a 
 21 |   `print()`, `logger.info()`, `logger.warning()`, `logger.critical()` or any other stdout/stderr statement. For local development and testing purposes,
 22 |   you can use `logger.debug()` to contain content information.
 23 | 
 24 | 
 25 | ## Installation for development
 26 | 
 27 | You can install this Python Package for local development purposes. To do 
 28 | so, we *strongly* advice using a virtual environment context. 
 29 | 
 30 | In addition, please note that OSD2F was written for Python `3.9`. Using
 31 | a virtual environment should make it easy to install this version without impacting your other Python projects.
 32 | 
 33 | 
 34 | ----
 35 | ##### Example using the popular [anaconda distribution of python](https://www.anaconda.com/)
 36 | 
 37 | ```bash 
 38 | conda create -n osd2f python=3.9 # only required once
 39 | conda activate osd2f # run at the start of each osd2f development session
 40 | ```
 41 | ----
 42 | 
 43 | ### 1. Clone the repository
 44 | 
 45 | You can clone the git repository so you can easily switch between branches.
 46 | 
 47 | ```bash
 48 | # get the repository
 49 | git clone git@github.com:uvacw/osd2f.git
 50 | # move to project ROOT
 51 | cd osd2f
 52 | # you know you're in the right place if it contains setup.py
 53 | ls setup.py
 54 | # shows: `setup.py`, if it says 'cannot access' you in the wrong place
 55 | ```
 56 | 
 57 | ### 2. Install the package in editable mode
 58 | 
 59 | For development purposes, you should install the package using the `-e` pip flag 
 60 | to ensure it is available in 'editable' mode ([see the docs](https://pip.pypa.io/en/stable/reference/pip_install/)).
 61 | 
 62 | ```bash
 63 | # at the repository root (osd2f/)
 64 | pip install -e ./
 65 | ```
 66 | 
 67 | ### 3. Install development requirements 
 68 | 
 69 | There are additional requirements for development purposes that 
 70 | mainly serve to ensure proper formatting and static analysis. Install
 71 | them seperately:
 72 | 
 73 | ```bash
 74 | # at the repository root (osd2f/)
 75 | pip install -r requirements_dev.txt
 76 | ```
 77 | 
 78 | ### 4. Run the code in development mode
 79 | 
 80 | While developing, it's probably nice to use development mode *and* set the
 81 | log level to DEBUG. You can do so by:
 82 | 
 83 | ```bash
 84 | osd2f -m Development -vvv 
 85 | ```
 86 | The server will now automatically reload when changes are detected. In addition, the settings `yaml` file will be reloaded for each request so
 87 | you can quickly iterate on it. 
 88 | 
 89 | ### javascript
 90 | 
 91 | If you are planning to touch the javascript part of the application, you
 92 | are recommended to install the npm packages
 93 | 
 94 | ```bash
 95 | npm i --also=dev
 96 | ```
 97 | 
 98 | During development, it's probably nice to have human readable javascript in the
 99 | browser (so you can use the build-in debuggers). Use `npm run development` to have webpack watch the javascript files and re-generate a human-readable `main.js` while you work. Once your javascript works well, use `npm run build` to generate the proper minified `main.js` to check in. 
100 | 
101 | 
102 | ## About fake data
103 | 
104 | Fake data is part of this repository to demonstrate potential donations. It allows you to play around with data
105 | that on it's service should be similar to real donations when testing your deployment, developing new anonymizes or
106 | visualizations. 
107 | 
108 | Fake data was generated using the 'faker' package implementation in [scripts/sample_data_generator.py](../scripts/sample_data_generator.py), using the command:
109 | 
110 | ```bash
111 | python scripts/sample_data_generator.py -o mockdata/sample --overwrite -i 2 -z -tz -t
112 | ```
113 | 
114 | More information about how to use this script, consult the help:
115 | 
116 | ```bash
117 | python scripts/sample_data_generator.py -h
118 | ```
119 | 
120 | ## Code style & checks
121 | 
122 | There are a number of checks to run in order to guarantee all tests pass, formatting is correct and typing is properly applied. You can run these manually:
123 | 
124 | ```bash
125 | flake8 ./ # formatting analysis
126 | mypy ./ # static analysis
127 | pytest ./ # unittests
128 | ```
129 | 
130 | You can opt to run `black` seperately to apply auto-formatting (`flake8-black` only checks, without corrections).
131 | 
132 | ```bash
133 | black ./
134 | ```
135 | 
136 | Note that most IDEs (e.g. PyCharms, VSCode, ...) allow you to automatically run these commands every time you save, commit or attempt to push the code. We especially advice you to run black on every save. 


--------------------------------------------------------------------------------
/docs/microsoft_authentication.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Authentication using MSAL
 2 | 
 3 | ## How does it work? 
 4 | 
 5 | The application is registered in the `App registrations` with access rights to *read* user
 6 | information (e.g. email). 
 7 | 
 8 | Using environment variables, the application is configured to accept only a specific set of
 9 | email addresses. 
10 | 
11 | Users trying to access `/researcher*` paths are redirected to Azure and asked to provide the
12 | application with read access to their information. 
13 | 
14 | The app uses the access information to check whether the user has an email in the authorized emails list. If so, it sets a session-cookie providing access to the `/researcher` page and downloads.
15 | 
16 | ## Configuring the app in Azure
17 | 
18 | 1. Go to `App registrations`
19 | 2. select `New registration`
20 | 3. Pick a Name
21 | 4. set `accounts in this organizational directory only (Single tenant)`
22 | 5. The `Redirect URI` should match the endpoint that requires authentication. 
23 |    For local testing, this could be `http://localhost:5000/login`. 
24 | 
25 | 
26 | ## Configuring the server
27 | 
28 | The server is configured by passing a serialized JSON object as the `MSAL_CONFIG` environment variable. The contents are something like this: 
29 | 
30 | ```json
31 | {
32 |     "client_id":"a-provided-client-id",     // Application (client) ID
33 |     "secret":"the-application-secret",      // a secret created when generating the app registration
34 |     "tenant_id":"azure-tenant-id",          // Directory (tenant) ID
35 |     "redirect_url": "localhost:5000/login"  // location microsoft should send users to after login in,
36 |                                             // must match an App registration entry
37 |     // users you want to provide access, note that they
38 |     // should be part of the active directory in the same tenant as 
39 |     // the application
40 |     "allowed_users":"allowed-user-one@azure.nl;allowed_user_two@somewhere.com"
41 | }
42 | ```
43 | 
44 | An example of running this locally would be: 
45 | 
46 | ```bash
47 | export MSAL_CONFIG='{"client_id":"a-provided-client-id",  "secret":"the-application-secret", "tenant_id":"azure-tenant-id", "allowed_users":"allowed-user-one@azure.nl;allowed_user_two@somewhere.com"}'
48 | export OSD2F_SECRET="a-safe-production-secret"
49 | 
50 | osd2f -m Development -db "sqlite://:memory:" -vv
51 | 
52 | ```
53 | 
54 | Note that changing the environment variable in a cloud environment might require restarting the service.
55 | 
56 | ## See Also
57 | 
58 | - Testing local researcher pages with [basic auth](/docs/basic_authentication.md)


--------------------------------------------------------------------------------
/docs/protecting_downloads.md:
--------------------------------------------------------------------------------
 1 | # Protecting downloads with passwords
 2 | 
 3 | To 'nudge' researchers to be carefull with respondent data, it is possible
 4 | to set a data-password. This will change researcher downloads from `.json` 
 5 | or `.csv` files to zipped versions of these files protected with the
 6 | specified password. 
 7 | 
 8 | Things to note:
 9 | 
10 | 1. The zipfiles use AES encryption, which is stronger, but not supported by default on 
11 | many operating systems. Use OS specific software that suppports this encryption, for example:
12 |   * Linux: [PeaZip](https://peazip.github.io/)
13 |   * Max OSX [The Unarchiver](https://theunarchiver.com/)
14 |   * Windows: [7zip](https://www.7-zip.org/)
15 | 
16 | 2. Long passwords help create better protected files, but never consider password protected
17 | zipfiles to be 'unbreakable'. They protect only to layman users, not motivated attackers.
18 | 
19 | 3. You can use a secret-manager to avoid putting the password direcly into CLI arguments or 
20 | environment variables.
21 | 
22 | ## How to enable password protected downloads:
23 | 
24 | 1. Using environment variables:
25 | ```bash
26 | # enable access to the researcher interface by 
27 | # setting basic authentication
28 | export OSD2F_BASIC_AUTH="admin;testpassword" 
29 | 
30 | # set the password
31 | export OSD2F_DATA_PASSWORD=<your password>
32 | 
33 | # start the server
34 | osd2f -m Development -vvv
35 | ```
36 | 
37 | 2. Using a CLI command
38 | ```bash
39 | # enable access to the researcher interface by 
40 | # setting basic authentication
41 | export OSD2F_BASIC_AUTH="admin;testpassword" 
42 | 
43 | osd2f --download-password <your-password> -m Development
44 | ```


--------------------------------------------------------------------------------
/docs/stresstests.md:
--------------------------------------------------------------------------------
 1 | # Running a stress test
 2 | 
 3 | ## Requiments
 4 | 
 5 | - `osd2f` is installed
 6 | - `requirments_dev.txt` dependencies are installed
 7 | 
 8 | ## In short
 9 | 
10 | Stresstest help you pinpoint the amount of traffic your server is able to
11 | handle. OSD2F provides a script for the popular Python load-test library 
12 | [locust](https://locust.io/). 
13 | 
14 | The files submitted are generated using the mock data generating scripts. 
15 | 
16 | ## How to run a stress test
17 | 
18 | To run a stresstest, you require a running instance of OSD2F, either locally 
19 | or on a reachable address. You can run the script from CLI (no interface)
20 | using:
21 | 
22 | ```bash
23 | locust \
24 |     --host http://localhost:5000 \
25 |     -f scripts/locust_stress_test.py \
26 |     --headless \
27 |     --users 100 \
28 |     -t 60sec
29 | ```
30 | 
31 | where:
32 | - `host` is the location of the server you want to stresstest
33 | - `-f` points to the stress test file
34 | - `headless` means no locust web interface is started
35 | - `users` is the amount of concurrent users simulated. The script 
36 |   assumes each user will send 20 logs for each 1 call to anonymization
37 |   and 1 call to submissions. For details, review the stresstest script.
38 | 
39 | ## important notes
40 | 
41 | - The data-sizes and ratio of logs/anomymization/submission calls should
42 |   be based on empirical observations in your sample. Current numbers may 
43 |   not reflect those for your population or use-case. 


--------------------------------------------------------------------------------
/docs/using_entry_encryption.md:
--------------------------------------------------------------------------------
 1 | # Entry encryption
 2 | 
 3 | ## What is it?
 4 | 
 5 | The collected data is stored in a database. This database should obviously be encrypted, which
 6 | is a standard feature cloud platforms provide (see [aws](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Overview.Encryption.html), [azure](https://docs.microsoft.com/en-us/azure/azure-sql/database/security-overview), [gcp](https://cloud.google.com/sql/faq#encryption) documentation for more details).
 7 | 
 8 | Entry encryption is a feature of OSD2F that works as an additional layer of protection. Donations are
 9 | stored in a per-row fashion, having a couple of metadata columns (including the `submission id`, `timestamp` and `filename`) apart from a 'blob' field that contains the actual donation called the `entry`. Entry encryption takes this potentially sensitive field and encrypts it before storing it in the database. By doing so, even if someone would get access to the database, the content of donations would be unusable.
10 | 
11 | ## How to enable entry encryption
12 | 
13 | You can enable entry encryption by providing a passphrase either through the commandline or as an environment variable.
14 | 
15 | ```bash
16 | OSD2F_SECRET=secret \
17 | OSD2F_BASIC_AUTH="user;pass" \
18 | OSD2F_ENTRY_SECRET=TESTSECRET \
19 | osd2f -db sqlite://encryption.db -m Development
20 | ```
21 | 
22 | Entries will now be encrypted before storage and decrypted before download (so researchers do not notice the difference). You can see the encryption by uploading some data and checking your database.
23 | 
24 | ### Disabling decryption on download & local decryption
25 | 
26 | In some use-cases, you might want to keep entries decrypted for downloads. This means the files downloaded by a researcher only contain readable metadata. 
27 | 
28 | You can do so by providing a cli flag or environment variable:
29 | 
30 | ```bash
31 | OSD2F_SECRET=secret \
32 | OSD2F_BASIC_AUTH="user;pass" \
33 | OSD2F_ENTRY_SECRET=TESTSECRET \
34 | OSD2F_ENTRY_DECRYPT_DISABLE=True \
35 | osd2f -db sqlite://encryption.db -m Development
36 | ```
37 | 
38 | If you go to the researcher page and download the `json` file of submissions, they will look something like this:
39 | 
40 | >{"db_id": 1, "submission_id": "test", "filename": "ads_clicked.json", "n_deleted_across_file": 0, "insert_timestamp": "2021-10-22T12:46:47.544248+00:00", "entry": {"encrypted": "gAAAAABhcrK3qYMvBJeTyQWm-d_mKABeiNsRP49-UTaRphxjecNtJDuidYeCNZ-pWUPTRRpfdIh_48iVEqC5QawHBjnp1iw11nAOlCUR4M9nkqbkn-BATurrGJ8OV7zxbdcU6sgzeGAW2Ntgky5o0e4ozV-o66t1AmF2Kp5bc4xa--UcejOBMZjyoItNI-fD12WJxRlUpK_kkSMkZsixjLtUS0ADzonjLw=="}}, {"db_id": 2, "submission_id": "test", "filename": "ads_clicked.json", "n_deleted_across_file": 0, "insert_timestamp": "2021-10-22T12:46:47.544498+00:00", "entry": {"encrypted": "gAAAAABhcrK...
41 | 
42 | Researchers can locally decrypt the entries if they have OSD2F installed. They can do so by running:
43 | 
44 | ```bash
45 | osd2f-decrypt-submissions osd2f_completed_submissions.json decrypted_submissions.json TESTSECRET
46 | ```
47 | 
48 | The `decrypted_submissions.json` file should look something like this:
49 | 
50 | > [{"db_id": 1, "submission_id": "test", "filename": "ads_clicked.json", "n_deleted_across_file": 0, "insert_timestamp": "2021-10-22T12:46:47.544248+00:00", "entry": {"activity": "click", "ad_title": "Organic global Graphical User Interface", "timestamp": 1628971624}}, {"db_id": 2, "submission_id": "test", "filename": "ads_clicked.json", "n_deleted_across_file": 0, "insert_timestamp": "2021-10-22T12:46:47.544498+00:00", "entry": {"activity": "expand", "ad_title": "Upgradable scalable throughput", "timestamp": 1589681049}}, {"db_id": 3, "submission_id": "test", "filename": "ads_clicked.json", "n_deleted_across_file": 0, "insert_timestamp": "2021-10-22T12:46:47.544694+00:00", "entry": {"activity": "watch", "ad_title": "Organized asynchronous challenge", "timestamp": 1625135602}}, ....
51 | 
52 | ## Notes
53 | 
54 | The entry encryption secret value supports the same secret store functionality as other setting fields, see [using secret stores](./using_secret_stores.md)
55 | 


--------------------------------------------------------------------------------
/docs/using_secret_stores.md:
--------------------------------------------------------------------------------
 1 | # Keeping configuration information secret
 2 | 
 3 | Parts of the configuration of an OSD2F deployment are sensitive. Knowing`OSD2F_SECRET` means 
 4 | you can impersonate other users. The `OSD2F_DB_URL` can include the username & password
 5 | for the database. 
 6 | 
 7 | Secret information should *never* be part of your repository. Generally, OSD2F accepts 
 8 | sensitive information via environment variables, which allows your deployment environment
 9 | to implement secret management. However, in some situations it is more convenient to 
10 | leverage a secret store, or 'keyvault' directly from the application. This document
11 | lists supported keystore solutions. 
12 | 
13 | ## General usage
14 | 
15 | When the OSD2F application is started, it looks through the environment variables
16 | and changes variables with known prefixes. It will substitute the environment
17 | variables with the corresponding keystore values in-memory. 
18 | 
19 | By using the appropriate prefix-format, any environment variable value can be
20 | retrieved on runtime from a secret store.
21 | 
22 | 
23 | ## Azure keyvault
24 | 
25 | OSD2F supports the Azure Keyvault solution provided by microsoft. It relies on contextual
26 | authentication through the default credentials in the environment. Azure keyvault references
27 | should follow the format:
28 | 
29 | > azure-keyvault::your-keyvault-location::name-of-key
30 | 
31 | For example, if the keyvault is called `osd2f-test`, it should have a location such as
32 | `https://osd2f-test.vault.azure.net/`. We store a database URL with the key name `OSD2F-DB-URL` (azure doesn't accept underscores in key names) and the value `sqlite://keyvault-test`. To use this key (locally), make sure the right credentials are set (e.g. `az login` to the appropriate subscription). Then start OSD2F:
33 | 
34 | ```bash
35 | # we use the normal env variable, but the value is the azure-keyvault specification
36 | # instead of the 'real' value we want to use. 
37 | export OSD2F_DB_URL='azure-keyvault::https://osd2f-test.vault.azure.net/::OSD2F-DB-URL' 
38 | osd2f -m Development
39 | ```
40 | 
41 | Observe that the application makes the expected `keyvault-test` sqlite database file. 
42 | 
43 | ### Requirements when deploying 
44 | 
45 | If you are deploying a OSD2F app, most likely to Azure, make sure the webapp has the `secret` `Get` and `Key` `Get` permissions. You can add these via the KeyVault Access policies or by issuing the command: 
46 | 
47 | ```bash
48 | export WEBAPP_ID="your webapp PRINCIPLE ID"
49 | export KEYVAULT_NAME="your keyvault name"
50 | 
51 | az keyvault set-policy \
52 |     --name $KEYVAULT_NAME \
53 |     --object-id $WEBAPP_ID \
54 |     --secret-permissions get \
55 |     --key-permissions get
56 | ```
57 | Note that this gives the webapp permission to *all* secrets in this keyvault. We recommend using
58 | separate keyvaults for separate applications or services.


--------------------------------------------------------------------------------
/mockdata/sample/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # THIS FOLDER CONTAINS MOCK-DATA
3 | 
4 | ## Data was generated using [faker](https://faker.readthedocs.io/en/master/)
5 | 
6 | ## Any similarity to real-world data is purely due to chance
7 |         


--------------------------------------------------------------------------------
/mockdata/sample/sample-platform-russellrodney-3.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/mockdata/sample/sample-platform-russellrodney-3.tar


--------------------------------------------------------------------------------
/mockdata/sample/sample-platform-russellrodney-3.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/mockdata/sample/sample-platform-russellrodney-3.tar.gz


--------------------------------------------------------------------------------
/mockdata/sample/sample-platform-russellrodney-3.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/mockdata/sample/sample-platform-russellrodney-3.zip


--------------------------------------------------------------------------------
/mockdata/sample/sample-platform-russellrodney-3/ads_clicked/ads_clicked.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "ad_title": "Expanded reciprocal matrix",
 4 |     "activity": "expand",
 5 |     "timestamp": 1581895542
 6 |   },
 7 |   {
 8 |     "ad_title": "Operative radical analyzer",
 9 |     "activity": "click",
10 |     "timestamp": 1610175305
11 |   },
12 |   {
13 |     "ad_title": "Visionary neutral adapter",
14 |     "activity": "watch",
15 |     "timestamp": 1644500427
16 |   },
17 |   {
18 |     "ad_title": "Front-line logistical contingency",
19 |     "activity": "expand",
20 |     "timestamp": 1609186981
21 |   },
22 |   {
23 |     "ad_title": "Synergized intermediate architecture",
24 |     "activity": "expand",
25 |     "timestamp": 1599028632
26 |   },
27 |   {
28 |     "ad_title": "Stand-alone object-oriented policy",
29 |     "activity": "watch",
30 |     "timestamp": 1638567540
31 |   },
32 |   {
33 |     "ad_title": "Ergonomic stable architecture",
34 |     "activity": "watch",
35 |     "timestamp": 1639132978
36 |   },
37 |   {
38 |     "ad_title": "Exclusive stable initiative",
39 |     "activity": "expand",
40 |     "timestamp": 1630291094
41 |   },
42 |   {
43 |     "ad_title": "Implemented local middleware",
44 |     "activity": "watch",
45 |     "timestamp": 1589384192
46 |   },
47 |   {
48 |     "ad_title": "Digitized demand-driven support",
49 |     "activity": "expand",
50 |     "timestamp": 1624937617
51 |   }
52 | ]


--------------------------------------------------------------------------------
/mockdata/sample/sample-platform-russellrodney-3/companies_followed/companies_followed.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "companies_followed": [
 3 |     {
 4 |       "company_name": "Kim-Suarez",
 5 |       "timestamp": 1638967278
 6 |     },
 7 |     {
 8 |       "company_name": "Mayer-Wallace",
 9 |       "timestamp": 1593287415
10 |     },
11 |     {
12 |       "company_name": "Brady, Robinson and Delgado",
13 |       "timestamp": 1621384091
14 |     },
15 |     {
16 |       "company_name": "Rocha, Ortega and Cook",
17 |       "timestamp": 1606350772
18 |     },
19 |     {
20 |       "company_name": "Jensen, Gonzalez and Santos",
21 |       "timestamp": 1628666782
22 |     },
23 |     {
24 |       "company_name": "Bradley-Evans",
25 |       "timestamp": 1600858011
26 |     },
27 |     {
28 |       "company_name": "Mora, Santos and Fischer",
29 |       "timestamp": 1616867040
30 |     },
31 |     {
32 |       "company_name": "Thompson Group",
33 |       "timestamp": 1588028299
34 |     },
35 |     {
36 |       "company_name": "Phillips-Winters",
37 |       "timestamp": 1609999846
38 |     },
39 |     {
40 |       "company_name": "Ellis, Edwards and Rodriguez",
41 |       "timestamp": 1584500614
42 |     },
43 |     {
44 |       "company_name": "Ruiz, Edwards and Chavez",
45 |       "timestamp": 1611792874
46 |     },
47 |     {
48 |       "company_name": "Romero LLC",
49 |       "timestamp": 1583736863
50 |     },
51 |     {
52 |       "company_name": "Hall-Solomon",
53 |       "timestamp": 1577894523
54 |     },
55 |     {
56 |       "company_name": "Gray, Sawyer and Foster",
57 |       "timestamp": 1652433971
58 |     },
59 |     {
60 |       "company_name": "Miller Group",
61 |       "timestamp": 1639524701
62 |     },
63 |     {
64 |       "company_name": "Evans Inc",
65 |       "timestamp": 1582404996
66 |     },
67 |     {
68 |       "company_name": "Anderson LLC",
69 |       "timestamp": 1592059402
70 |     },
71 |     {
72 |       "company_name": "Ramirez, Terry and Hardy",
73 |       "timestamp": 1624933766
74 |     },
75 |     {
76 |       "company_name": "Little-Miller",
77 |       "timestamp": 1633597087
78 |     },
79 |     {
80 |       "company_name": "Sawyer-Rice",
81 |       "timestamp": 1637290723
82 |     }
83 |   ]
84 | }


--------------------------------------------------------------------------------
/mockdata/sample/sample-platform-russellrodney-3/engagement/engagement.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "engagement_info": [
  3 |     {
  4 |       "timestamp": 1603583147,
  5 |       "engagement_type": "like",
  6 |       "object": "uuid5840629"
  7 |     },
  8 |     {
  9 |       "timestamp": 1604131197,
 10 |       "engagement_type": "click",
 11 |       "object": "uuid5498207"
 12 |     },
 13 |     {
 14 |       "timestamp": 1631666723,
 15 |       "engagement_type": "listen",
 16 |       "object": "uuid4948296"
 17 |     },
 18 |     {
 19 |       "timestamp": 1634712013,
 20 |       "engagement_type": "click",
 21 |       "object": "uuid1253992"
 22 |     },
 23 |     {
 24 |       "timestamp": 1651598806,
 25 |       "engagement_type": "like",
 26 |       "object": "uuid47263"
 27 |     },
 28 |     {
 29 |       "timestamp": 1614837215,
 30 |       "engagement_type": "listen",
 31 |       "object": "uuid7885513"
 32 |     },
 33 |     {
 34 |       "timestamp": 1613519961,
 35 |       "engagement_type": "like",
 36 |       "object": "uuid8145538"
 37 |     },
 38 |     {
 39 |       "timestamp": 1597582736,
 40 |       "engagement_type": "listen",
 41 |       "object": "uuid612408"
 42 |     },
 43 |     {
 44 |       "timestamp": 1604838142,
 45 |       "engagement_type": "listen",
 46 |       "object": "uuid1514152"
 47 |     },
 48 |     {
 49 |       "timestamp": 1639114959,
 50 |       "engagement_type": "click",
 51 |       "object": "uuid6109172"
 52 |     },
 53 |     {
 54 |       "timestamp": 1628743660,
 55 |       "engagement_type": "recommend",
 56 |       "object": "uuid6792778"
 57 |     },
 58 |     {
 59 |       "timestamp": 1638269231,
 60 |       "engagement_type": "recommend",
 61 |       "object": "uuid2059735"
 62 |     },
 63 |     {
 64 |       "timestamp": 1633797338,
 65 |       "engagement_type": "share",
 66 |       "object": "uuid6858714"
 67 |     },
 68 |     {
 69 |       "timestamp": 1593092623,
 70 |       "engagement_type": "share",
 71 |       "object": "uuid671610"
 72 |     },
 73 |     {
 74 |       "timestamp": 1633545231,
 75 |       "engagement_type": "click",
 76 |       "object": "uuid8293052"
 77 |     },
 78 |     {
 79 |       "timestamp": 1616812065,
 80 |       "engagement_type": "listen",
 81 |       "object": "uuid237794"
 82 |     },
 83 |     {
 84 |       "timestamp": 1599627588,
 85 |       "engagement_type": "click",
 86 |       "object": "uuid5748854"
 87 |     },
 88 |     {
 89 |       "timestamp": 1613363432,
 90 |       "engagement_type": "like",
 91 |       "object": "uuid6260369"
 92 |     },
 93 |     {
 94 |       "timestamp": 1596187832,
 95 |       "engagement_type": "recommend",
 96 |       "object": "uuid6008933"
 97 |     },
 98 |     {
 99 |       "timestamp": 1634398469,
100 |       "engagement_type": "like",
101 |       "object": "uuid845728"
102 |     }
103 |   ]
104 | }


--------------------------------------------------------------------------------
/mockdata/sample/sample-platform-russellrodney-3/profile_interests/profile_interests.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "profile_interests": [
 3 |     "Diverse solution-oriented moderator",
 4 |     "Stand-alone content-based orchestration",
 5 |     "Configurable zero tolerance collaboration",
 6 |     "Adaptive needs-based matrix",
 7 |     "Future-proofed cohesive migration",
 8 |     "Phased static projection",
 9 |     "Optimized disintermediate help-desk",
10 |     "Automated web-enabled access",
11 |     "Multi-lateral encompassing analyzer",
12 |     "Cloned reciprocal instruction set"
13 |   ]
14 | }


--------------------------------------------------------------------------------
/mockdata/sample/sample-platform-russellrodney-3/short_messages/messages.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "messages.collection": [
  3 |     {
  4 |       "id": "41235082236765",
  5 |       "message": "Enter own sure traditional white this. Point dark could gas mention speech. Reveal all laugh son right."
  6 |     },
  7 |     {
  8 |       "id": "149224703736780",
  9 |       "message": "Oil there support month away skin hold."
 10 |     },
 11 |     {
 12 |       "id": "23011685740814",
 13 |       "message": "Simply behavior watch teacher society staff role run. Avoid major off you ask expert wait."
 14 |     },
 15 |     {
 16 |       "id": "84976717333574",
 17 |       "message": "Piece moment young prepare. Possible then ground break her religious guess include. Skill here nothing huge work research note until. Trouble nor thank arm sport study note travel."
 18 |     },
 19 |     {
 20 |       "id": "14115030275390",
 21 |       "message": "Fly sell such produce however center. Century relate attorney television former threat movie. Professor book short nice. Father spend call anything receive above."
 22 |     },
 23 |     {
 24 |       "id": "53559994766831",
 25 |       "message": "Would against front. Behavior young voice her really community citizen. Guess building yard end color various."
 26 |     },
 27 |     {
 28 |       "id": "153916030071148",
 29 |       "message": "Such none local federal large already. Involve me technology hand environment happy. Enjoy ask point window. Military paper government most."
 30 |     },
 31 |     {
 32 |       "id": "12523935577497",
 33 |       "message": "Easy of want usually this give. Language travel much book situation very nor. Anything myself series protect sea upon."
 34 |     },
 35 |     {
 36 |       "id": "133967398893359",
 37 |       "message": "Last term natural game prepare give win myself. Develop strategy lay management sister."
 38 |     },
 39 |     {
 40 |       "id": "146973493929326",
 41 |       "message": "Speak why clear air happen TV. Peace research property right. Floor blue quality response attack."
 42 |     },
 43 |     {
 44 |       "id": "48749458458208",
 45 |       "message": "Cup thus image part by. Pay charge factor glass recent world. Human stop responsibility."
 46 |     },
 47 |     {
 48 |       "id": "131628757986948",
 49 |       "message": "Imagine method smile something modern nice price. Top nation teach site wish actually that. Lead four large. Cost human professional next someone try."
 50 |     },
 51 |     {
 52 |       "id": "74367885815084",
 53 |       "message": "Human enter indicate whose interest modern. School city this. Idea tough probably behind provide attack wide. Ability front through drop real."
 54 |     },
 55 |     {
 56 |       "id": "1382079388842",
 57 |       "message": "Option us meet create staff question. Anything firm great traditional avoid base. Since simple consider true."
 58 |     },
 59 |     {
 60 |       "id": "33199591687420",
 61 |       "message": "Either compare certainly return. Summer keep prevent save rather economic. Realize little now control."
 62 |     },
 63 |     {
 64 |       "id": "7841019827548",
 65 |       "message": "Laugh water likely able mean. Direction cell ask who. Great for speak industry you choice."
 66 |     },
 67 |     {
 68 |       "id": "68220694831320",
 69 |       "message": "Wish environment line author early. Gun edge part where woman church speech. But design test dog personal wall."
 70 |     },
 71 |     {
 72 |       "id": "133787513169096",
 73 |       "message": "Rest require population better near if."
 74 |     },
 75 |     {
 76 |       "id": "81257448652668",
 77 |       "message": "Five green example reach sometimes. White scene four thank able. Political quickly chance their own. North never collection professor quite dinner."
 78 |     },
 79 |     {
 80 |       "id": "795951252521",
 81 |       "message": "Table lot light red type rate treat training. Write budget government strong between leave."
 82 |     },
 83 |     {
 84 |       "id": "26294001537257",
 85 |       "message": "Out leave they heavy top or well style. Heavy current actually school. Approach step somebody capital might recognize husband. Read face ability well."
 86 |     },
 87 |     {
 88 |       "id": "46799692120291",
 89 |       "message": "Small attack game reason policy. Beat yes create word."
 90 |     },
 91 |     {
 92 |       "id": "39593755964345",
 93 |       "message": "Although someone eat room instead southern available just. National within only exist bit for relationship."
 94 |     },
 95 |     {
 96 |       "id": "60905984795612",
 97 |       "message": "Include process street real."
 98 |     },
 99 |     {
100 |       "id": "16454225837983",
101 |       "message": "Must world add soon along probably pay. Ok every fish item by. Necessary behavior stay trip."
102 |     },
103 |     {
104 |       "id": "143242729829108",
105 |       "message": "Fast see beat until action back. Standard compare beautiful bed part. Receive term card far debate. Figure music majority cut professional."
106 |     },
107 |     {
108 |       "id": "65886234678241",
109 |       "message": "Represent market deal out. Plan win worker generation painting resource. Natural case space show manager."
110 |     },
111 |     {
112 |       "id": "146373855137944",
113 |       "message": "Modern customer major half whom risk. Outside form occur occur kid may factor activity."
114 |     },
115 |     {
116 |       "id": "16128138399593",
117 |       "message": "Off upon modern single guy. Hospital own role."
118 |     },
119 |     {
120 |       "id": "103223191449550",
121 |       "message": "Happen any message perform scene find. Economy he technology bill toward remain feel."
122 |     },
123 |     {
124 |       "id": "2508562226437",
125 |       "message": "Whom civil majority stay subject billion far. Government group mind suggest pick. Life next pass herself campaign whether."
126 |     },
127 |     {
128 |       "id": "109328233623546",
129 |       "message": "Fast close head without building note. Purpose one firm among recent somebody."
130 |     },
131 |     {
132 |       "id": "124318851756247",
133 |       "message": "Car cell however what. Follow skill attention key significant support."
134 |     },
135 |     {
136 |       "id": "68123734318477",
137 |       "message": "Green capital line green know American turn behind. Under especially marriage rock owner simple mission. Politics idea middle pull."
138 |     },
139 |     {
140 |       "id": "21886972296761",
141 |       "message": "Ask sign subject production. Result power thing agreement hope table economy international."
142 |     },
143 |     {
144 |       "id": "149264325012834",
145 |       "message": "Until special building. Language throughout fill goal. Learn would wear side chance poor you south. Theory follow style."
146 |     },
147 |     {
148 |       "id": "67047309889302",
149 |       "message": "Official guy serve room to here reason. Heart film avoid old PM concern when."
150 |     },
151 |     {
152 |       "id": "113644532048929",
153 |       "message": "Store must prove. Even relationship affect information attention visit."
154 |     },
155 |     {
156 |       "id": "136208093342591",
157 |       "message": "Available allow very item strategy beyond."
158 |     },
159 |     {
160 |       "id": "43277280056510",
161 |       "message": "Surface her indicate image house. Animal entire many laugh against order store."
162 |     },
163 |     {
164 |       "id": "45741893036910",
165 |       "message": "Oil evening pay president check. Be whatever maybe exactly management believe deep."
166 |     },
167 |     {
168 |       "id": "53261547666450",
169 |       "message": "Development cultural listen huge. Important poor position mission explain. Parent time manager."
170 |     },
171 |     {
172 |       "id": "97998713613797",
173 |       "message": "Four near middle new son. Degree culture employee hold college PM account him. Indeed add especially talk front front their. Tonight particularly main defense tough skill present."
174 |     },
175 |     {
176 |       "id": "95880555845646",
177 |       "message": "Unit situation themselves smile me purpose. Get his stop return management he democratic service."
178 |     },
179 |     {
180 |       "id": "37898009817352",
181 |       "message": "Physical point week. Court smile thousand later."
182 |     },
183 |     {
184 |       "id": "64321022696164",
185 |       "message": "Goal less industry state. Agree summer change them head bar."
186 |     },
187 |     {
188 |       "id": "145830631368332",
189 |       "message": "Want model away question technology discussion main gas. Somebody staff mention car."
190 |     },
191 |     {
192 |       "id": "10442859844141",
193 |       "message": "Policy bank force law seven sell glass. Ahead memory sure."
194 |     },
195 |     {
196 |       "id": "161265853275001",
197 |       "message": "Forward garden support. Party president then enter professional. Care because job method consumer."
198 |     },
199 |     {
200 |       "id": "127661691475216",
201 |       "message": "Role yourself defense hand reality attorney race statement."
202 |     }
203 |   ]
204 | }


--------------------------------------------------------------------------------
/osd2f/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/__init__.py


--------------------------------------------------------------------------------
/osd2f/__main__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import os
 3 | 
 4 | from .logger import logger
 5 | from .server import create_app, start_app
 6 | 
 7 | if mode := os.environ.get("OSD2F_MODE"):
 8 |     assert mode in ("Development", "Testing", "Production")
 9 | else:
10 |     logger.critical("`OSD2F_MODE` must be set")
11 | 
12 | 
13 | if mode and __name__ == "__main__":
14 |     app = create_app(mode=mode)
15 |     start_app(app)
16 | elif mode:
17 |     app = create_app(mode=mode)
18 | 


--------------------------------------------------------------------------------
/osd2f/anonymizers/__init__.py:
--------------------------------------------------------------------------------
 1 | """Anonymizers
 2 | 
 3 | This sub-module contains functions that operate on individual entries
 4 | to do some form of anonymization, either by redacting (parts of) strings,
 5 | or by omitting entries entirely (e.g. returning None for some entries).
 6 | 
 7 | All anonymization functions should have the (entry, optional_string_param)
 8 | signature.
 9 | 
10 | Register
11 | 
12 | """
13 | 
14 | import re
15 | import typing
16 | 
17 | from .sample_platform import redact_text
18 | from ..definitions import Submission, SubmissionList, UploadSettings
19 | from ..logger import logger
20 | 
21 | options: typing.Dict[str, typing.Callable[[typing.Dict, str], typing.Awaitable]] = {
22 |     redact_text.__name__: redact_text  # noqa
23 | }
24 | 
25 | 
26 | async def apply(
27 |     file_entries: typing.List[typing.Dict[str, typing.Any]],
28 |     anonymizer: str,
29 |     optional_str_param: str = "",
30 | ) -> typing.List[typing.Dict[str, typing.Any]]:
31 |     if anonymizer not in options:
32 |         logger.warning(
33 |             f"Specified anonymizer {anonymizer} not found. "
34 |             f"Available anonymizers: {options}."
35 |         )
36 |         return []
37 | 
38 |     anonymized_entries = []
39 |     for entry in file_entries:
40 |         if entry is None:
41 |             continue
42 |         try:
43 |             processed_entry = await options[anonymizer](entry, optional_str_param)
44 |             anonymized_entries.append(processed_entry)
45 |         except:  # noqa
46 |             logger.warning(
47 |                 f"anonymizer `{anonymizer}` threw an error while parsing an entry"
48 |             )
49 |             continue
50 | 
51 |     return anonymized_entries
52 | 
53 | 
54 | async def anonymize_submission(submission: Submission, settings: UploadSettings):
55 |     for filename_pattern, setting in settings.files.items():
56 |         logger.debug(f"matching {filename_pattern} to {submission.filename}")
57 |         if not re.search(filename_pattern, submission.filename):
58 |             continue
59 |         # disregards settings for which no anonymizers are registered
60 |         if not setting.anonymizers:
61 |             continue
62 |         logger.debug(f"Applying {setting.anonymizers} to {submission.filename}")
63 |         # apply all anonymizers registered for file pattern
64 |         for anonymizer in setting.anonymizers:
65 |             function_name, arg = anonymizer.copy().popitem()
66 |             logger.debug(f"Applying {function_name} to {submission.filename}")
67 | 
68 |             submission.entries = await apply(
69 |                 file_entries=submission.entries,
70 |                 anonymizer=function_name,
71 |                 optional_str_param=arg,
72 |             )
73 |         # only match the first matching setting
74 |         break
75 |     return submission
76 | 
77 | 
78 | async def anonymize_submission_list(
79 |     submission_list: SubmissionList, settings: UploadSettings
80 | ) -> SubmissionList:
81 |     for i, submission in enumerate(submission_list.root):
82 |         logger.debug(f"at submission {i}")
83 |         await anonymize_submission(submission, settings)
84 |     return submission_list
85 | 


--------------------------------------------------------------------------------
/osd2f/cli.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import asyncio
  3 | import json
  4 | import logging
  5 | 
  6 | from osd2f import config
  7 | 
  8 | import yaml
  9 | 
 10 | from .config import Testing
 11 | from .database import initialize_database, stop_database
 12 | from .logger import logger
 13 | from .server import create_app, start_app
 14 | 
 15 | LOGFORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 16 | 
 17 | parser = argparse.ArgumentParser(
 18 |     prog="OSD2F webserver", usage="Start the webserver and collect data donations."
 19 | )
 20 | 
 21 | parser.add_argument(
 22 |     "-m",
 23 |     "--mode",
 24 |     action="store",
 25 |     default="Testing",
 26 |     help="Specify the mode to run in, defaults to 'Testing'",
 27 |     choices=[
 28 |         d
 29 |         for d in dir(config)
 30 |         if not d.startswith("_") and d[0] == d[0].upper() and d != "Config"
 31 |     ],
 32 | )
 33 | parser.add_argument(
 34 |     "-v",
 35 |     "--verbose",
 36 |     action="count",
 37 |     default=0,
 38 |     help="Verbosity of logging output, defaults to default=CRITICAL, "
 39 |     "v=WARNING, vv=INFO, vvv=DEBUG",
 40 | )
 41 | 
 42 | parser.add_argument(
 43 |     "-db",
 44 |     "--database-url",
 45 |     type=str,
 46 |     help="The database URL to use, overrides the `OSD2F_DB_URL` environment variable.",
 47 | )
 48 | 
 49 | parser.add_argument(
 50 |     "--secret",
 51 |     type=str,
 52 |     help="Overrides `OSD2F_SECRET` environment variable with"
 53 |     " an application secret for session security.",
 54 | )
 55 | 
 56 | parser.add_argument(
 57 |     "--download-password",
 58 |     type=str,
 59 |     help="Overrides `OSD2F_DATA_PASSWORD` environment variable "
 60 |     "for researcher download file password protection.",
 61 | )
 62 | 
 63 | parser.add_argument(
 64 |     "--entry-encryption-secret",
 65 |     type=str,
 66 |     help="Overrides `OSD2F_ENTRY_SECRET` environment variable. "
 67 |     "Encryption key for per-entry encryption/decryption for writing/reading "
 68 |     "from database.",
 69 | )
 70 | 
 71 | parser.add_argument(
 72 |     "--generate-current-config",
 73 |     type=str,
 74 |     help="Path to put an current content configuration YAML file.",
 75 | )
 76 | 
 77 | parser.add_argument(
 78 |     "-cc",
 79 |     "--content-configuration",
 80 |     type=str,
 81 |     help="A content configuration YAML file",
 82 | )
 83 | 
 84 | parser.add_argument(
 85 |     "--dry-run",
 86 |     action="store_true",
 87 |     help="test whether endpoints provide 200 code responses,"
 88 |     " just to make sure nothing broke.",
 89 | )
 90 | 
 91 | parser.add_argument(
 92 |     "--entry-decrypt-on-read-disabled",
 93 |     action="store_true",
 94 |     default=False,
 95 |     help="Keep entries downloaded through the researcher interface encrypted. "
 96 |     "Overrides the `OSD2F_ENTRY_DECRYPT_DISABLE` ENV variable",
 97 | )
 98 | 
 99 | 
100 | def parse_and_run():
101 |     args = parser.parse_args()
102 | 
103 |     if args.verbose == 0:
104 |         level = logging.CRITICAL
105 |     elif args.verbose == 1:
106 |         level = logging.WARNING
107 |     elif args.verbose == 2:
108 |         level = logging.INFO
109 |     elif args.verbose == 3:
110 |         level = logging.DEBUG
111 |     else:
112 |         print("UNKNOWN LOGLEVEL SPECIFIED")
113 |         level = logging.NOTSET
114 | 
115 |     logging.basicConfig(format=LOGFORMAT, level="WARNING")
116 |     logger.setLevel(level=level)
117 | 
118 |     logger.debug(
119 |         "If you see this, you are running with debug logging. "
120 |         "DO NOT DO THIS IN PRODUCTION."
121 |     )
122 | 
123 |     if args.content_configuration:
124 |         import osd2f.utils
125 | 
126 |         osd2f.utils.DISK_CONTENT_CONFIG_PATH = args.content_configuration
127 | 
128 |     app = create_app(
129 |         mode=args.mode,
130 |         database_url_override=args.database_url,
131 |         entry_secret_override=args.entry_encryption_secret,
132 |         entry_decrypt_disable=args.entry_decrypt_on_read_disabled,
133 |     )
134 |     if not args.dry_run and not args.generate_current_config:
135 |         start_app(app)
136 | 
137 |     elif args.generate_current_config:
138 |         from osd2f.utils import load_content_settings
139 | 
140 |         asyncio.run(app.startup())
141 |         settings = asyncio.run(load_content_settings(use_cache=False))
142 |         with open(args.generate_current_config, "w") as outputfile:
143 |             yaml.dump(settings.model_dump(by_alias=True), outputfile)
144 |         asyncio.run(app.shutdown())
145 | 
146 |     else:
147 |         asyncio.run(initialize_database(Testing.DB_URL))
148 |         tp = app.test_client()
149 |         assert asyncio.run(tp.get("/")).status_code == 200
150 |         assert asyncio.run(tp.get("/privacy")).status_code == 200
151 |         assert asyncio.run(tp.get("/upload")).status_code == 200
152 |         assert asyncio.run(tp.get("/static/js/main.js")).status_code == 200
153 |         assert asyncio.run(tp.get("/adv_anonymize_file")).status_code == 405
154 |         assert (
155 |             asyncio.run(
156 |                 tp.post(
157 |                     "/adv_anonymize_file",
158 |                     data=json.dumps(
159 |                         {
160 |                             "filename": "fn",
161 |                             "submission_id": "sid",
162 |                             "entries": [{}],
163 |                             "n_deleted": 0,
164 |                         }
165 |                     ),
166 |                 )
167 |             ).status_code
168 |             == 200
169 |         )
170 |         asyncio.run(stop_database())
171 | 


--------------------------------------------------------------------------------
/osd2f/config.py:
--------------------------------------------------------------------------------
 1 | import os as _os
 2 | import typing as _typing
 3 | 
 4 | from .security import translate_environment_vars
 5 | 
 6 | translate_environment_vars()  # resolve secrets in env variables on import
 7 | 
 8 | 
 9 | class Config:
10 |     DEBUG: bool = False
11 |     TESTING: bool = False
12 |     BIND: str = "127.0.0.1"
13 |     PORT: int = 5000
14 |     SECRET_KEY: _typing.Optional[str] = None
15 |     DATA_PASSWORD: str = _os.environ.get("OSD2F_DATA_PASSWORD", "")
16 |     ENTRY_SECRET: str = _os.environ.get("OSD2F_ENTRY_SECRET", "")
17 |     ENTRY_DECRYPT_DISABLE: bool = (
18 |         _os.environ.get("OSD2F_ENTRY_DECRYPT_DISABLE", "false").lower() == "true"
19 |     )
20 |     DB_URL = "sqlite://:memory:"
21 | 
22 |     # Allow for BIG submissions 4*16mb for
23 |     # in-memory anonymization.
24 |     # NOTE: protect POST endpoints with
25 |     #       xsrf tokens to avoid memory
26 |     #       based ddos attacks
27 |     MAX_CONTENT_LENGTH: int = 16777216 * 4
28 | 
29 |     SESSION_COOKIE_HTTPONLY = True
30 |     SESSION_COOKIE_SAMESITE = "Lax"
31 | 
32 | 
33 | class Testing(Config):
34 |     TESTING = True
35 | 
36 | 
37 | class Development(Config):
38 |     SESSION_COOKIE_SECURE = False
39 |     DEBUG = True
40 |     DB_URL = _os.environ.get("OSD2F_DB_URL", "sqlite://:memory:")
41 |     SECRET_KEY = "do not use in production"
42 | 
43 | 
44 | class Production(Config):
45 |     DEBUG = False
46 |     TESTING = False
47 |     BIND = "0.0.0.0"
48 |     PORT = 8000
49 |     SECRET_KEY = _os.environ.get("OSD2F_SECRET")
50 |     DB_URL = _os.environ.get("OSD2F_DB_URL", "")
51 |     SESSION_COOKIE_SECURE = True  # required HTTPS server
52 | 
53 | 
54 | # hypercorn
55 | bind = "0.0.0.0:8000"
56 | 


--------------------------------------------------------------------------------
/osd2f/database/__init__.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | 
 3 | from tortoise import Tortoise
 4 | 
 5 | from .configuration import *  # noqa
 6 | from .logs import *  # noqa
 7 | from .submissions import *  # noqa
 8 | 
 9 | 
10 | async def initialize_database(db_url: str):
11 |     await Tortoise.init(db_url=db_url, modules={"models": ["osd2f.database"]})
12 |     await Tortoise.generate_schemas(safe=True)
13 |     start_logworker()  # noqa
14 | 
15 | 
16 | async def stop_database():
17 |     await asyncio.sleep(0.1)  # to avoid start/stop race-conditions during tests
18 |     await Tortoise.close_connections()
19 |     stop_logworker()  # noqa
20 | 


--------------------------------------------------------------------------------
/osd2f/database/configuration.py:
--------------------------------------------------------------------------------
 1 | import typing
 2 | 
 3 | from tortoise import fields
 4 | from tortoise.models import Model
 5 | 
 6 | from ..definitions import ContentSettings, UploadSettings
 7 | 
 8 | 
 9 | class DBConfigurationBlobs(Model):
10 |     id = fields.IntField(pk=True)
11 |     insert_timestamp = fields.DatetimeField(auto_now_add=True)
12 |     insert_user = fields.CharField(index=True, max_length=150, null=False)
13 |     config_type = fields.CharField(index=True, max_length=50, null=False)
14 |     config_blob = fields.JSONField(null=False)
15 | 
16 |     class Meta:
17 |         table = "osd2f_config"
18 | 
19 | 
20 | async def get_content_config() -> typing.Optional[DBConfigurationBlobs]:
21 |     config_item = (
22 |         await DBConfigurationBlobs.filter(config_type="content")
23 |         .order_by("-insert_timestamp")
24 |         .first()
25 |     )
26 |     return config_item
27 | 
28 | 
29 | async def set_content_config(user: str, content: ContentSettings):
30 |     await DBConfigurationBlobs.create(
31 |         insert_user=user, config_type="content", config_blob=content.model_dump_json()
32 |     )
33 | 
34 | 
35 | async def set_upload_config(user: str, content: UploadSettings):
36 |     await DBConfigurationBlobs.create(
37 |         insert_user=user, config_type="upload", config_blob=content.model_dump_json()
38 |     )
39 | 


--------------------------------------------------------------------------------
/osd2f/database/logs.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import queue
  3 | import time
  4 | import typing
  5 | from logging.handlers import QueueHandler
  6 | 
  7 | from tortoise import fields
  8 | from tortoise.models import Model
  9 | 
 10 | from ..logger import logger
 11 | 
 12 | clientLogQueue: queue.SimpleQueue = queue.SimpleQueue()
 13 | 
 14 | 
 15 | class DBLog(Model):
 16 |     id = fields.IntField(pk=True)
 17 |     insert_timestamp = fields.DatetimeField(auto_now_add=True)
 18 |     log_level = fields.CharField(index=True, max_length=100, null=False)
 19 |     log_source = fields.CharField(index=True, max_length=100, null=False)
 20 |     log_position = fields.CharField(index=True, max_length=5000, null=False)
 21 |     log_sid = fields.CharField(index=True, max_length=100, null=True)
 22 |     user_agent_string = fields.CharField(max_length=5000, null=True)
 23 |     log_entry = fields.JSONField(null=True)
 24 | 
 25 |     class Meta:
 26 |         table = "osd2f_logs"
 27 | 
 28 | 
 29 | def start_logworker():
 30 |     async def logworker():
 31 |         stop = False
 32 |         while 1:
 33 |             try:
 34 |                 log = clientLogQueue.get_nowait()
 35 |                 if log != "STOP":
 36 |                     try:
 37 |                         if log is not None:
 38 |                             await background_insert_log(**log)
 39 |                     except Exception as e:
 40 |                         print("ERROR INSERTING LOG", e)
 41 |                 else:
 42 |                     stop = True
 43 |                     logger.info("Stopping server logging worker")
 44 | 
 45 |             except queue.Empty:
 46 |                 if not stop:
 47 |                     await asyncio.sleep(0.1)
 48 |                     continue
 49 |                 else:
 50 |                     return
 51 | 
 52 |     asyncio.get_running_loop().create_task(logworker())
 53 | 
 54 | 
 55 | def stop_logworker():
 56 |     clientLogQueue.put("STOP", block=True)
 57 |     time.sleep(0.2)
 58 | 
 59 | 
 60 | async def background_insert_log(
 61 |     log_source: str,
 62 |     log_level: str,
 63 |     log_position: str,
 64 |     log_sid: typing.Optional[str] = None,
 65 |     entry: typing.Optional[typing.Dict] = None,
 66 |     user_agent_string: typing.Optional[str] = None,
 67 | ):
 68 | 
 69 |     await DBLog(
 70 |         log_source=log_source,
 71 |         log_level=log_level,
 72 |         log_position=log_position,
 73 |         log_sid=log_sid,
 74 |         log_entry=entry,
 75 |         user_agent_string=user_agent_string,
 76 |     ).save()
 77 | 
 78 |     return
 79 | 
 80 | 
 81 | async def insert_log(
 82 |     log_source: str,
 83 |     log_level: str,
 84 |     log_position: str,
 85 |     log_sid: typing.Optional[str] = None,
 86 |     entry: typing.Optional[typing.Dict] = None,
 87 |     user_agent_string: typing.Optional[str] = None,
 88 | ):
 89 |     clientLogQueue.put(
 90 |         dict(
 91 |             log_source=log_source,
 92 |             log_level=log_level,
 93 |             log_position=log_position,
 94 |             log_sid=log_sid,
 95 |             entry=entry,
 96 |             user_agent_string=user_agent_string,
 97 |         )
 98 |     )
 99 | 
100 | 
101 | async def get_activity_logs():
102 |     logs = await DBLog.all()
103 |     data = [
104 |         {
105 |             "db_id": log.id,
106 |             "insert_timestamp": log.insert_timestamp.isoformat(),
107 |             "log_level": log.log_level,
108 |             "source": log.log_source,
109 |             "position": log.log_position,
110 |             "submission_id": log.log_sid,
111 |             "user-agent-string": log.user_agent_string,
112 |             "entry": log.log_entry,
113 |         }
114 |         for log in logs
115 |     ]
116 |     return data
117 | 
118 | 
119 | def add_database_logging() -> queue.SimpleQueue:
120 |     """Forward logger statements to the database.
121 | 
122 |     Uses a QueueHandler and an asyncronous worker to
123 |     insert logs from logger.debug/info/warning/critical
124 |     to the application database.
125 | 
126 |     NOTE: messages over 5000 characters are shortened
127 |     """
128 | 
129 |     async def async_log_worker(q: queue.SimpleQueue):
130 |         while 1:
131 |             try:
132 |                 m = q.get_nowait()
133 |                 if m == "stop-logging":
134 |                     break
135 |                 if m.msg == "stop-logging":
136 |                     break
137 |                 if len(m.msg) < 5000:
138 |                     await insert_log("server", m.levelname, m.msg)
139 |                 else:
140 |                     await insert_log("server", m.levelname, m.msg[:4997] + "...")
141 |             except queue.Empty:
142 |                 await asyncio.sleep(0.1)
143 | 
144 |     logQueue: queue.SimpleQueue = queue.SimpleQueue()
145 |     h = QueueHandler(logQueue)
146 |     h.setLevel(logger.level)
147 |     print(h.level)
148 |     logger.addHandler(h)
149 |     asyncio.get_running_loop().create_task(async_log_worker(logQueue))
150 | 
151 |     return logQueue
152 | 


--------------------------------------------------------------------------------
/osd2f/database/submissions.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Dict, List
  2 | 
  3 | from tortoise import Tortoise, fields
  4 | from tortoise.models import Model
  5 | 
  6 | from ..definitions import OutputSubmission, Submission, SubmissionList
  7 | from ..logger import logger
  8 | from ..security.entry_encryption.secure_entry_singleton import SecureEntry
  9 | 
 10 | 
 11 | class DBSubmission(Model):
 12 |     id = fields.IntField(pk=True)
 13 |     submission_id = fields.CharField(index=True, max_length=100)
 14 |     filename = fields.CharField(index=True, max_length=5000)
 15 |     n_deleted = fields.IntField()
 16 |     insert_timestamp = fields.DatetimeField(auto_now_add=True)
 17 |     update_timestamp = fields.DatetimeField(auto_now=True)
 18 |     entry: Dict[str, Any] = fields.JSONField()
 19 | 
 20 |     class Meta:
 21 |         table = "submissions"
 22 | 
 23 | 
 24 | async def insert_submission(submission: Submission):
 25 |     logger.debug(submission)
 26 |     for entry in submission.entries:
 27 |         await DBSubmission.create(
 28 |             submission_id=submission.submission_id,
 29 |             filename=submission.filename,
 30 |             entry=SecureEntry.write_entry_field(entry),
 31 |             n_deleted=submission.n_deleted,
 32 |         )
 33 | 
 34 | 
 35 | async def get_submissions() -> List[OutputSubmission]:
 36 |     submissions = await DBSubmission.all()
 37 |     submission_dict: List[OutputSubmission] = []
 38 | 
 39 |     for si in submissions:
 40 |         entry = SecureEntry.read_entry_field(si.entry)
 41 |         sub = OutputSubmission.model_validate(
 42 |             dict(
 43 |                 db_id=si.id,
 44 |                 submission_id=si.submission_id,
 45 |                 filename=si.filename,
 46 |                 n_deleted_across_file=si.n_deleted,
 47 |                 insert_timestamp=si.insert_timestamp.isoformat(),
 48 |                 entry=dict(entry),
 49 |             ),
 50 |         )
 51 |         submission_dict.append(sub)
 52 | 
 53 |     return submission_dict
 54 | 
 55 | 
 56 | async def insert_submission_list(submissionlist: SubmissionList):
 57 |     if len(submissionlist.root) < 1:
 58 |         logger.info("Empty submissionlist")
 59 |         return
 60 | 
 61 |     logger.debug(
 62 |         f"Inserting {len(submissionlist.root)} files of data for submission "
 63 |         f"'{submissionlist.root[0].submission_id}'"
 64 |     )
 65 | 
 66 |     def subgenerator():
 67 |         for sub in submissionlist.root:
 68 |             for entry in sub.entries:
 69 |                 yield DBSubmission(
 70 |                     submission_id=sub.submission_id,
 71 |                     filename=sub.filename,
 72 |                     entry=SecureEntry.write_entry_field(entry),
 73 |                     n_deleted=sub.n_deleted,
 74 |                 )
 75 | 
 76 |     await DBSubmission.bulk_create(objects=subgenerator())
 77 | 
 78 | 
 79 | async def count_submissions():
 80 |     return await DBSubmission.all().count()
 81 | 
 82 | 
 83 | async def get_pending_participants():
 84 |     conn = Tortoise.get_connection("default")
 85 |     rs = await conn.execute_query(
 86 |         """
 87 |     WITH completed AS (
 88 |         SELECT DISTINCT log_sid FROM osd2f_logs
 89 |         WHERE
 90 |             log_SID IS NOT NULL
 91 |             AND log_position="Received the donation!"
 92 |         GROUP BY log_sid
 93 |     )
 94 |     SELECT
 95 |         osd2f_logs.log_sid AS submission_id,
 96 |         MIN(insert_timestamp) AS first_seen,
 97 |         MAX(insert_timestamp) AS last_seen
 98 |     FROM osd2f_logs
 99 |     OUTER LEFT JOIN completed ON osd2f_logs.log_sid=completed.log_sid
100 |     WHERE submission_id IS NOT NULL
101 |     GROUP BY submission_id
102 |     ORDER BY last_seen DESC
103 |     """
104 |     )
105 |     data = [
106 |         {
107 |             "submission_id": r["submission_id"],
108 |             "first_seen": r["first_seen"],
109 |             "last_seen": r["last_seen"],
110 |         }
111 |         for r in rs[1]
112 |     ]
113 |     return data
114 | 


--------------------------------------------------------------------------------
/osd2f/definitions/__init__.py:
--------------------------------------------------------------------------------
1 | from .content_settings import *  # noqa
2 | from .security_settings import *  # noqa
3 | from .submissions import *  # noqa
4 | 


--------------------------------------------------------------------------------
/osd2f/definitions/content_settings.py:
--------------------------------------------------------------------------------
  1 | from enum import Enum
  2 | from typing import Dict, List, Optional
  3 | 
  4 | from pydantic import BaseModel, ConfigDict, EmailStr
  5 | 
  6 | 
  7 | class FileSetting(BaseModel):
  8 |     in_key: Optional[str] = None
  9 |     accepted_fields: List[str]
 10 |     anonymizers: Optional[List[Dict[str, str]]] = None
 11 | 
 12 | 
 13 | class UploadSettings(BaseModel):
 14 |     files: Dict[str, FileSetting]
 15 | 
 16 | 
 17 | class BlockTypeEnum(str, Enum):
 18 |     jumbotron = "jumbotron"
 19 |     twoblockrow = "two_block_row"
 20 | 
 21 | 
 22 | class ImagePositionEnum(str, Enum):
 23 |     right = "right"
 24 |     left = "left"
 25 | 
 26 | 
 27 | class ContentButton(BaseModel):
 28 |     name: str
 29 |     link: str
 30 |     label: str
 31 | 
 32 | 
 33 | class PageTypeEnum(str, Enum):
 34 |     home = "home"
 35 |     privacy = "privacy"
 36 |     donate = "donate"
 37 | 
 38 | 
 39 | class CirclesRowCircle(BaseModel):
 40 |     image: str
 41 |     title: Optional[str] = None
 42 |     subtitle: Optional[str] = None
 43 | 
 44 | 
 45 | class ContentBlock(BaseModel):
 46 |     type: BlockTypeEnum
 47 |     id: str
 48 |     title: Optional[str] = None
 49 |     lines: List[str]
 50 |     buttons: List[ContentButton]
 51 |     image: Optional[str] = None
 52 |     image_pos: Optional[ImagePositionEnum] = None
 53 |     circles_row: Optional[List[CirclesRowCircle]] = None
 54 | 
 55 |     model_config = ConfigDict(use_enum_values=True)
 56 | 
 57 | 
 58 | class ContentPage(BaseModel):
 59 |     active: bool
 60 |     name: str
 61 |     blocks: List[ContentBlock]
 62 | 
 63 | 
 64 | class UploadBox(BaseModel):
 65 |     header: Optional[str] = None
 66 |     explanation: List[str]
 67 | 
 68 | 
 69 | class PreviewComponent(BaseModel):
 70 |     entries_in_file_text: str
 71 |     title: str
 72 |     explanation: List[str]
 73 |     previous_file_button: str
 74 |     next_file_button: str
 75 |     remove_rows_button: str
 76 |     search_prompt: str
 77 |     search_box_placeholder: str
 78 | 
 79 | 
 80 | class ConsentPopup(BaseModel):
 81 |     title: str
 82 |     lead: str
 83 |     points: Optional[List[str]] = None
 84 |     end_text: str
 85 |     decline_button: str
 86 |     accept_button: str
 87 | 
 88 | 
 89 | class UploadPage(BaseModel):
 90 |     blocks: List[ContentBlock]
 91 |     upload_box: UploadBox
 92 |     thanks_text: str
 93 |     file_indicator_text: str
 94 |     processing_text: str
 95 |     empty_selection: str
 96 |     donate_button: str
 97 |     inspect_button: str
 98 |     preview_component: PreviewComponent
 99 |     consent_popup: ConsentPopup
100 | 
101 | 
102 | class ContentSettings(BaseModel):
103 |     project_title: str
104 |     contact_us: EmailStr
105 |     static_pages: Dict[PageTypeEnum, ContentPage]
106 |     upload_page: UploadPage
107 | 
108 |     model_config = ConfigDict(use_enum_values=True)
109 | 


--------------------------------------------------------------------------------
/osd2f/definitions/security_settings.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional
 2 | 
 3 | from pydantic import BaseModel, field_validator
 4 | 
 5 | 
 6 | class MSALConfiguration(BaseModel):
 7 |     tenant_id: str
 8 |     client_id: str
 9 |     secret: str
10 |     allowed_users: str
11 |     redirect_url: Optional[str]
12 | 
13 |     authority: Optional[str] = None
14 |     scope: List[str] = ["User.Read"]
15 | 
16 |     @field_validator("authority", mode="before", check_fields=True)  # type: ignore
17 |     def set_authority(cls, v, *, values, **kwargs):
18 |         return f"https://login.microsoftonline.com/{values['tenant_id']}"
19 | 


--------------------------------------------------------------------------------
/osd2f/definitions/submissions.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Dict, List
 2 | 
 3 | from pydantic import BaseModel, RootModel
 4 | 
 5 | 
 6 | class Submission(BaseModel):
 7 |     submission_id: str
 8 |     filename: str
 9 |     n_deleted: int
10 |     entries: List[Dict[str, Any]]
11 | 
12 | 
13 | class OutputSubmission(BaseModel):
14 |     """Submissions as downloaded per-record."""
15 | 
16 |     db_id: int
17 |     submission_id: str
18 |     filename: str
19 |     n_deleted_across_file: int
20 |     insert_timestamp: str
21 |     entry: Dict[str, Any]
22 | 
23 | 
24 | class EncryptedEntry(BaseModel):
25 |     encrypted: str
26 | 
27 | 
28 | class EncryptedSubmission(BaseModel):
29 |     """Matches the downloaded submission format based on the database schema."""
30 | 
31 |     submission_id: str
32 |     filename: str
33 |     n_deleted_across_file: int
34 |     entry: EncryptedEntry
35 | 
36 | 
37 | class SubmissionList(RootModel):
38 |     """Submissions as send from the webbrowser.
39 |     Basically, a list of file submissions as one List."""
40 | 
41 |     root: List[Submission]
42 | 


--------------------------------------------------------------------------------
/osd2f/javascript/file_upload.js:
--------------------------------------------------------------------------------
  1 | // This is the javascript to handle folder loading &
  2 | // client-side filtering
  3 | 'use strict'
  4 | import 'blob-polyfill' // for safari File handling
  5 | 
  6 | import { Archive } from 'libarchive.js'
  7 | import { apply_adv_anonymization } from './server_interaction'
  8 | import { visualize } from './visualize'
  9 | import { getFilesFromDataTransferItems } from 'datatransfer-files-promise'
 10 | import { server } from './server_interaction'
 11 | import { fileReader } from './parsing/fileparser'
 12 | import { ParseJSON } from './parsing/jsonparsing'
 13 | 
 14 | export { visualize as vis } from './visualize'
 15 | export { server } from './server_interaction'
 16 | 
 17 | server.log('INFO', 'loaded js')
 18 | 
 19 | // loads the zipfile reading WASM libraries
 20 | Archive.init({ workerUrl: '/static/js/libarchive/worker-bundle.js' })
 21 | 
 22 | server.log('INFO', 'initialized archive worker')
 23 | 
 24 | // folderScanner handles folder uploads.
 25 | const folderScanner = function (webkitEntry, files) {
 26 |   if (webkitEntry.isDirectory) {
 27 |     let dir = webkitEntry.createReader()
 28 |     dir.readEntries(entries =>
 29 |       entries.forEach(entry => folderScanner(entry, files))
 30 |     )
 31 |   } else {
 32 |     files.push(webkitEntry)
 33 |   }
 34 | }
 35 | 
 36 | // reparseAsUTF8 stringifies an object, parses the string as UTF8
 37 | // and returns the JSON parsed result. This removes issues with
 38 | // UTF-8 donations, that JS assumes are UTF-16. 
 39 | const reparseAsUTF8 = function (object) {
 40 |   // drawn from https://stackoverflow.com/questions/52747566/what-encoding-facebook-uses-in-json-files-from-data-export
 41 |   function decode(s) {
 42 |     let d = new TextDecoder;
 43 |     let a = s.split('').map(r => r.charCodeAt());
 44 |     return d.decode(new Uint8Array(a));
 45 |   }
 46 | 
 47 |   let stringObj = JSON.stringify(object)
 48 | 
 49 | 
 50 |   let decodedString = decode(stringObj)
 51 | 
 52 |   // check whether translation did not introduce bad characters
 53 |   // that signal it was already UTF-16, in which case we return
 54 |   // the original, supposedly UTF16 object
 55 |   if (decodedString.search("�") > 0) {
 56 |     return object
 57 |   }
 58 | 
 59 |   // return the UTF8->UTF16 decoded object instead
 60 |   return JSON.parse(decodedString)
 61 | }
 62 | 
 63 | // countFileTypes takes a list of filenames and
 64 | // counts the lowercased extensions
 65 | function countFileTypes(arr) {
 66 |   let counts = new Object
 67 |   arr.
 68 |     map(e => e.split(".").pop().toLowerCase()).
 69 |     map(ext => counts[ext] = counts[ext] + 1 || 1)
 70 |   return counts
 71 | }
 72 | 
 73 | // fileLoadController checks whether files are in the whitelist, 
 74 | // and parses these files using the fileReader and the appropriate
 75 | // whitelist of fields for that particular file. 
 76 | export const fileLoadController = async function (sid, settings, files) {
 77 |   document.getElementById("empty_selection").classList.add("d-none")
 78 |   document.getElementById('processing').classList.remove('invisible')
 79 |   // we map filenames to the regex format filenames in
 80 |   // provided settings
 81 |   var setmatch
 82 |   setmatch = Object.fromEntries(
 83 |     files.map(file => {
 84 |       let nameRegex
 85 |       for (nameRegex of Object.keys(settings.files)) {
 86 |         if (RegExp(nameRegex).exec(file.name)) {
 87 |           return [file.name, nameRegex]
 88 |         }
 89 |       }
 90 |       return []
 91 |     })
 92 |   )
 93 |   // remove undefined keys, i.e. files that do not match any RegEx
 94 |   Object.keys(setmatch).map(k => {
 95 |     if (k === 'undefined') {
 96 |       delete setmatch[k]
 97 |     }
 98 |   })
 99 | 
100 |   let acceptedFiles
101 |   acceptedFiles = files.filter(f => setmatch[f.name] !== undefined)
102 | 
103 |   // log the count of selected files, the count of files
104 |   // matching the whitelist and a frequency table of the
105 |   // filetypes selected.
106 |   server.log("INFO", "files selected", sid,
107 |     {
108 |       "selected": files.length,
109 |       "matching_whitelist": acceptedFiles.length,
110 |       "types": countFileTypes(files.map(f => f.name))
111 |     })
112 | 
113 |   if (files.length > 0 && acceptedFiles.length == 0) {
114 |     document.getElementById("empty_selection").classList.remove("d-none")
115 |     server.log("ERROR", "empty selection", sid)
116 |   }
117 | 
118 |   let data = []
119 | 
120 |   let bar = document.getElementById('progress-bar')
121 |   bar.value = 0
122 |   let f
123 |   for (f of acceptedFiles) {
124 |     let content
125 |     // normal files
126 |     if (f.text != null) {
127 |       content = await f.text()
128 |     } else {
129 |       let extractedFile = await f.extract()
130 |       content = await extractedFile.text()
131 |     }
132 |     let fileob
133 |     fileob = new Object()
134 |     fileob['filename'] = f.name
135 |     fileob['submission_id'] = sid
136 |     fileob['n_deleted'] = 0
137 |     try {
138 |       server.log('INFO', 'file parsing', window.sid, {
139 |         file_match: setmatch[f.name]
140 |       })
141 |       fileob['entries'] = fileReader(
142 |         settings['files'][setmatch[f.name]].accepted_fields,
143 |         ParseJSON(content), // custom to support malformed
144 |         null,
145 |         settings['files'][setmatch[f.name]].in_key
146 |       )
147 |       server.log('INFO', 'reparsing file to UTF8')
148 |       try {
149 |         fileob = reparseAsUTF8(fileob)
150 |       } catch {
151 |         server.log("INFO", "file could not be reparsed, might be UTF16 already", window.sid)
152 |       }
153 | 
154 |       server.log('INFO', 'file send to anonymization', sid, {
155 |         file_match: setmatch[f.name]
156 |       })
157 |       fileob = await apply_adv_anonymization(fileob)
158 |       server.log('INFO', 'file anonymized', sid, {
159 |         file_match: setmatch[f.name]
160 |       })
161 |       data.push(fileob)
162 |     } catch (e) {
163 |       console.log(e)
164 |       server.log('ERROR', 'file matched, but is not JSON', sid)
165 |       console.log("Unable to parse file because it's not real JSON")
166 |     }
167 | 
168 |     // update the loading
169 |     let pos
170 |     pos = (data.length / acceptedFiles.length) * 100
171 | 
172 |     if (pos !== bar.value) {
173 |       bar.value = pos
174 |     }
175 |   }
176 | 
177 |   // filter failed files
178 |   data = data.filter(x => x)
179 | 
180 |   // show users that processing has completed
181 |   bar.value = 100
182 |   document.getElementById('processing').classList.add('invisible')
183 | 
184 |   server.log('INFO', 'starting visualization', sid)
185 |   visualize(data, content)
186 | }
187 | 
188 | // fileSelectHandler is used to detect files uploaded through
189 | // the file select prompt.
190 | export async function fileSelectHandler(e) {
191 |   server.log('INFO', 'file select detected', sid)
192 |   var filesSelected = e.target.files
193 |   if (filesSelected === undefined) {
194 |     server.log('INFO', 'file select empty', sid)
195 |     return // no files selected yet
196 |   }
197 | 
198 |   // if there is one file, which is an archive
199 |   if (RegExp('.*.zip$').exec(filesSelected[0].name) != null) {
200 |     server.log('INFO', 'file select is archive', sid)
201 | 
202 |     let archiveContent = await Archive.open(filesSelected[0])
203 |     let contentList = await archiveContent.getFilesArray()
204 |     let fl = contentList.map(c => c.file)
205 | 
206 |     fileLoadController(sid, settings, fl)
207 |   } else {
208 |     server.log('INFO', 'file select is single file', sid)
209 | 
210 |     fileLoadController(sid, settings, Array(filesSelected[0]))
211 |   }
212 | }
213 | document.getElementById('fileElem').onchange = fileSelectHandler
214 | 
215 | // fileDropHandler is used to detect files uploaded using 
216 | // the drag-and-drop interface.
217 | async function fileDropHandler(e) {
218 |   server.log('INFO', 'file drop detected', sid)
219 | 
220 |   let filesSelected = await getFilesFromDataTransferItems(e.dataTransfer.items)
221 | 
222 |   // if there is one file, which is an archive
223 |   if (
224 |     filesSelected.length == 1 &&
225 |     RegExp('.*.zip$').exec(filesSelected[0].name) != null
226 |   ) {
227 |     server.log('INFO', 'file drop is archive', sid)
228 | 
229 |     let archiveContent = await Archive.open(filesSelected[0])
230 |     let contentList = await archiveContent.getFilesArray()
231 |     let fl = contentList.map(c => c.file)
232 | 
233 |     fileLoadController(sid, settings, fl)
234 |   } else {
235 |     server.log('INFO', 'file drop is file(s)', sid)
236 | 
237 |     fileLoadController(sid, settings, filesSelected)
238 |   }
239 | }
240 | document
241 |   .getElementById('drop-area')
242 |   .addEventListener('drop', fileDropHandler, false)
243 | 


--------------------------------------------------------------------------------
/osd2f/javascript/parsing/fileparser.js:
--------------------------------------------------------------------------------
 1 | const { objReader } = require("./objparsing")
 2 | var jp = require("jsonpath")
 3 | 
 4 | // fileReader selects the starting point for recursive parsing
 5 | // for each object in the file and returns the resulting objects.
 6 | const fileReader = function (paths, objects, prepath, in_key) {
 7 |     // in case the data is nested in an object
 8 |     // rather than an array
 9 |     if (typeof in_key !== 'undefined' && in_key !== null) {
10 | 
11 |         var jsonPathSpec = '$["' + in_key + '"].*'
12 |         var nested_objects = jp.query(objects, jsonPathSpec)
13 | 
14 |         return fileReader(paths, nested_objects, in_key, undefined)
15 | 
16 |     }
17 | 
18 |     if (Array.isArray(objects)) {
19 |         // in case the contents is just one array of values,
20 |         // instead of an array of objects
21 |         if (paths.length == 0) {
22 |             let entries = []
23 |             let i = 0
24 |             while (i < objects.length) {
25 |                 entries.push({
26 |                     index: i,
27 |                     value: objects[i]
28 |                 })
29 |                 i++
30 |             }
31 |             return entries
32 |         } else {
33 |             // extract the whitelisted paths from all objects
34 |             // in the array contained in the file
35 |             return objects.map(obj => objReader(paths, obj))
36 |         }
37 |     }
38 | 
39 |     // If the objects is actually one object (not an array)
40 |     return [objReader(paths, objects)]
41 | }
42 | 
43 | module.exports.fileReader = fileReader


--------------------------------------------------------------------------------
/osd2f/javascript/parsing/jsonparsing.js:
--------------------------------------------------------------------------------
 1 | 
 2 | // ParseJSON is a helper that is lenient for badly
 3 | // formatted json
 4 | const ParseJSON = function (text_content) {
 5 |     try {
 6 |         return JSON.parse(text_content)
 7 |     }
 8 |     catch {
 9 |         return parseTwitterJSON(text_content)
10 |     }
11 | 
12 | }
13 | 
14 | 
15 | // parseTwitterJSON parses malformed JSON delivered by Twitter
16 | // it's actually javascript, but we deem an `eval` call too
17 | // insecure (it would allow for arbitrary code injection)
18 | const parseTwitterJSON = function (text_content) {
19 | 
20 |     // assume it's the first, global, key that is malformed
21 |     chunks = text_content.split("=")
22 |     main_key = chunks.shift()
23 |     body = chunks.join('=')
24 | 
25 |     // build it as proper JSON
26 |     fixed_content = '{ "' + main_key.trim() + '" :' + body + '}'
27 | 
28 |     fixed_content.replace('\\', '\\\\')
29 | 
30 |     return JSON.parse(fixed_content)
31 | }
32 | 
33 | module.exports.ParseJSON = ParseJSON


--------------------------------------------------------------------------------
/osd2f/javascript/parsing/objparsing.js:
--------------------------------------------------------------------------------
 1 | // objReader recursively parses JSON objects to extract
 2 | // the whitelisted fields and returns a flattened representation.
 3 | const objReader = function (spec, o, prev) {
 4 |     let flat_obj = {}
 5 | 
 6 |     let options = spec.map(p => p.split('.').shift(1))
 7 | 
 8 |     // if the object is the endpoint of a spec, 
 9 |     if (prev !== undefined && (spec.length === 0 || spec[0] === "")) {
10 |         flat_obj[prev] = o
11 |         return flat_obj
12 |     }
13 | 
14 |     let k
15 |     for (k of Object.keys(o)) {
16 |         if (options.filter(o => o == k).length == 0) {
17 |             continue
18 |         }
19 |         let newkey = [prev, k].filter(e => typeof e != 'undefined').join('.')
20 | 
21 |         let val = o[k]
22 |         let sub_spec = spec
23 |             .filter(s => s.startsWith(k + "."))
24 |             .map(s => s.substring(k.length + 1, s.length))
25 | 
26 |         if (Array.isArray(val)) {
27 |             if (sub_spec == "") {
28 |                 flat_obj[newkey] = val
29 |                 continue
30 |             }
31 | 
32 |             var ac
33 |             ac = val.map(c => objReader(sub_spec, c))
34 | 
35 |             // only append array values if they are not empty
36 |             if (ac.length > 0) {
37 |                 flat_obj[newkey] = ac
38 |             }
39 |             continue
40 |         }
41 | 
42 |         if (typeof val == 'object' && val != null) {
43 |             flat_obj = Object.assign(flat_obj, objReader(sub_spec, val, newkey))
44 | 
45 |             continue
46 |         }
47 | 
48 | 
49 |         flat_obj[newkey] = val
50 |     }
51 | 
52 | 
53 |     return flat_obj
54 | }
55 | 
56 | module.exports.objReader = objReader


--------------------------------------------------------------------------------
/osd2f/javascript/server_interaction.js:
--------------------------------------------------------------------------------
 1 | 'use strict'
 2 | 
 3 | // for use áfter local filtering of fields
 4 | // allows for advanced server-side anonymization
 5 | export async function apply_adv_anonymization (fileobj) {
 6 |   fileobj = await fetch('/adv_anonymize_file', {
 7 |     method: 'POST',
 8 |     mode: 'same-origin',
 9 |     credentials: 'same-origin',
10 |     headers: {
11 |       'Content-Type': 'application/json'
12 |     },
13 |     body: JSON.stringify(fileobj)
14 |   })
15 |     .then(response => {
16 |       return response.json()
17 |     })
18 |     .catch(response => console.log(response.error))
19 | 
20 |   return fileobj.data
21 | }
22 | 
23 | export const server = {
24 |   log: function (level, position, sid, entry) {
25 |     let params = {
26 |       level: level,
27 |       position: position,
28 |       cb: Math.random() // to avoid caching
29 |     }
30 |     if (sid != undefined) {
31 |       params['sid'] = sid
32 |     } else {
33 |       if (window.sid != undefined) {
34 |         params['sid'] = window.sid
35 |       }
36 |     }
37 |     if (entry != undefined) {
38 |       params['entry'] = JSON.stringify(entry)
39 |     }
40 | 
41 |     fetch('/log?' + new URLSearchParams(params), {
42 |       method: 'GET',
43 |       mode: 'same-origin',
44 |       credentials: 'same-origin'
45 |     })
46 |       .then(r => {})
47 |       .catch(e => {
48 |         console.log('Unable to log', level, position, 'due to', e)
49 |       })
50 |   }
51 | }
52 | 


--------------------------------------------------------------------------------
/osd2f/javascript/tests/fileparsing.test.js:
--------------------------------------------------------------------------------
  1 | const { fileReader } = require("../parsing/fileparser")
  2 | 
  3 | test("test array of objects file", () => {
  4 |     json_content = [
  5 |         {
  6 |             "key": "value"
  7 |         },
  8 |         {
  9 |             "key": "value2"
 10 |         }
 11 |     ]
 12 | 
 13 |     spec = {
 14 |         fields: ["key"]
 15 |     }
 16 | 
 17 |     parsed = fileReader(spec.fields, json_content)
 18 | 
 19 |     expect(parsed[0].key).toBe("value")
 20 |     expect(parsed[1].key).toBe("value2")
 21 | 
 22 | })
 23 | 
 24 | test("test array of objects nested in key", () => {
 25 |     json_content = {
 26 |         "main_key": [
 27 |             { "name": "obj1" },
 28 |             { "name": "obj2" }
 29 |         ]
 30 |     }
 31 | 
 32 | 
 33 |     spec = {
 34 |         in_key: "main_key",
 35 |         fields: ["name"]
 36 |     }
 37 | 
 38 |     parsed = fileReader(spec.fields, json_content, undefined, spec.in_key)
 39 | 
 40 |     expect(parsed[0].name).toBe("obj1")
 41 |     expect(parsed[1].name).toBe("obj2")
 42 | 
 43 | })
 44 | 
 45 | test("test object with array of values file", () => {
 46 |     json_content = {
 47 |         "main_key": [
 48 |             "value1",
 49 |             "value2",
 50 |             "value3"
 51 |         ]
 52 |     }
 53 | 
 54 |     spec = {
 55 |         in_key: "main_key"
 56 |     }
 57 | 
 58 |     parsed = fileReader([], json_content, undefined, spec.in_key)
 59 | 
 60 |     expect(parsed[0].index).toBe(0)
 61 |     expect(parsed[0].value).toBe("value1")
 62 | 
 63 |     expect(parsed[1].index).toBe(1)
 64 |     expect(parsed[1].value).toBe("value2")
 65 | 
 66 |     expect(parsed[2].index).toBe(2)
 67 |     expect(parsed[2].value).toBe("value3")
 68 | 
 69 | })
 70 | 
 71 | test("test file with '.' in main_key", () => {
 72 |     json_content = {
 73 |         "main.key": [
 74 |             { name: "obj1" },
 75 |             { name: "obj2" }
 76 |         ]
 77 |     }
 78 | 
 79 |     spec = {
 80 |         in_key: "main.key",
 81 |         fields: ["name"]
 82 |     }
 83 | 
 84 |     parsed = fileReader(spec.fields, json_content, undefined, spec.in_key)
 85 | 
 86 |     expect(parsed[0].name).toBe("obj1")
 87 |     expect(parsed[1].name).toBe("obj2")
 88 | 
 89 | })
 90 | 
 91 | test("test file with array of values obj", () => {
 92 |     json_content = [
 93 |         {
 94 |             "keywords": ["keyword A", "keyword B"]
 95 |         }
 96 |     ]
 97 | 
 98 |     spec = {
 99 |         fields: ["keywords"]
100 |     }
101 | 
102 |     parsed = fileReader(spec.fields, json_content)
103 | 
104 |     expect(parsed[0].keywords.length).toBe(2)
105 |     expect(parsed[0].keywords[0]).toBe("keyword A")
106 |     expect(parsed[0].keywords[1]).toBe("keyword B")
107 | })
108 | 
109 | test("test file with heavily nested values", () => {
110 |     json_content = { "window.YTD.stuff": [{ "one": { "two": { "three": [{ "nested": "obj" }] } } }, { "one": { "two": { "three": [{ "nested": "obj_two" }] } } }] }
111 | 
112 |     spec = {
113 |         in_key: "window.YTD.stuff",
114 |         fields: ["one.two.three"]
115 |     }
116 | 
117 |     parsed = fileReader(spec.fields, json_content, undefined, spec.in_key)
118 | 
119 |     expect(parsed[0]["one.two.three"][0].nested).toBe("obj")
120 |     expect(Array.isArray(parsed)).toBe(true)
121 | })
122 | 
123 | test("fields that result in object value", () => {
124 |     json_content = { "window.YTD.stuff": [{ "one": { "two": { "three": [{ "nested": "obj" }] } } }, { "one": { "two": { "three": [{ "nested": "obj_two" }] } } }] }
125 | 
126 |     spec = {
127 |         in_key: "window.YTD.stuff",
128 |         fields: ["one.two"]
129 |     }
130 | 
131 |     parsed = fileReader(spec.fields, json_content, undefined, spec.in_key)
132 | 
133 |     console.log(parsed)
134 |     expect(Array.isArray(parsed[0]["one.two"].three)).toBe(true)
135 | })


--------------------------------------------------------------------------------
/osd2f/javascript/tests/jsonparsing.test.js:
--------------------------------------------------------------------------------
 1 | const { ParseJSON } = require("../parsing/jsonparsing")
 2 | 
 3 | test("test regular JSON obj", () => {
 4 |     text_content = '{"content": [1, 2, 3]}'
 5 |     content = ParseJSON(text_content)
 6 | 
 7 |     expect(content.content.length).toBe(3)
 8 |     expect(content.content[0]).toBe(1)
 9 | 
10 | })
11 | 
12 | test("test regular JSON array", () => {
13 |     text_content = '[1,2,3]'
14 |     content = ParseJSON(text_content)
15 | 
16 |     expect(content.length).toBe(3)
17 |     expect(content[0]).toBe(1)
18 | })
19 | 
20 | test("bad (twitter) JSON", () => {
21 |     text_content = 'content = [ { "key" : "value" } ]'
22 |     content = ParseJSON(text_content)
23 | 
24 |     expect(content.content[0].key).toBe("value")
25 | })
26 | 
27 | test("twitter data with unescaped '\'", () => {
28 |     text_content = String.raw`content = [ { "text_with_slashes" : "new \n line \n!"}]`
29 |     content = ParseJSON(text_content)
30 | 
31 |     expect(content.content[0].text_with_slashes).toBe("new \n line \n!")
32 | 
33 | })


--------------------------------------------------------------------------------
/osd2f/javascript/tests/objectparsing.test.js:
--------------------------------------------------------------------------------
 1 | objparsing = require("../parsing/objparsing")
 2 | 
 3 | 
 4 | test("Parsing simple data", () => {
 5 |     simple_data = {
 6 |         "key": 1,
 7 |         "nested":
 8 |         {
 9 |             "key": 2
10 |         },
11 |         "nested_obj": {
12 |             "sub1": 1,
13 |             "sub2_ignored": 2
14 |         },
15 |         "nested_array": [
16 |             {
17 |                 "array_obj": 3,
18 |                 "array_obj_ignored": 3
19 |             }
20 |         ]
21 |     }
22 | 
23 |     simple_spec = {
24 |         fields: [
25 |             "key",
26 |             "nested.key",
27 |             "nested_obj.sub1",
28 |             "nested_array.array_obj",
29 |             "nonexisting_field"
30 |         ]
31 |     }
32 | 
33 |     // do the parsing
34 |     r = objparsing.objReader(simple_spec.fields, simple_data)
35 | 
36 |     // check whether specified and existing fields are recoverd
37 |     expect(r.key).toBe(1)
38 |     expect(r["nested.key"]).toBe(2)
39 |     expect(r["nested_obj.sub1"]).toBe(1)
40 |     expect(r.nested_array[0].array_obj).toBe(3)
41 | 
42 |     // check whether specified but missing fields are ignored
43 |     expect(r.nonexisting_field).toBe(undefined)
44 | 
45 |     // check whether ignored files are indeed ignored
46 |     expect(r["nested_obj.sub2_ignored"]).toBe(undefined)
47 |     expect(r.nested_array[0].array_obj_ignored).toBe(undefined)
48 | 
49 | })
50 | 
51 | test("Empty nested array which is a parent key should not show up as it's own witelisted field", () => {
52 |     data = {
53 |         key: []
54 |     }
55 | 
56 |     spec = {
57 |         fields: ["key.subfield"]
58 |     }
59 | 
60 |     r = objparsing.objReader(spec.fields, data)
61 | 
62 |     expect(r.key).toBe(undefined)
63 | 
64 | })


--------------------------------------------------------------------------------
/osd2f/javascript/visualization_components/consentConfirmation.vue:
--------------------------------------------------------------------------------
 1 | <template>
 2 |     <div>
 3 |         <b-modal 
 4 |             :title="content.consent_popup.title"
 5 |             id="consent-modal" 
 6 |             :ok-title="content.consent_popup.accept_button"
 7 |             ok-variant="success"
 8 |             :cancel-title="content.consent_popup.decline_button"
 9 |             :ok-disabled="this.processing"
10 |             @ok="sub"
11 |         >
12 |             <b-overlay :show="show" rounded="sm">
13 |                 <div :aria-hidden="show ? 'true' : null">
14 |                     <div class="p-1"> {{ content.consent_popup.lead }} </div>
15 |                     <ul>
16 |                         <li v-for="lit in content.consent_popup.points"> {{lit}} </li>
17 |                     </ul>
18 | 
19 |                     <div class="p-1"> {{ content.consent_popup.end_text }} </div>
20 |                 </div>
21 |             </b-overlay>
22 |         </b-modal>
23 |     </div>
24 | </template>
25 | <script>
26 | import {server} from "../server_interaction"
27 | 
28 | export default {
29 |     props:{
30 |         donations : Array,
31 |         content: Object
32 |     },
33 |     data(){
34 |         return {
35 |             show: false,
36 |             processing : false
37 |             }
38 |     },
39 |     computed: {
40 |         n_entries() {
41 |             let total = 0
42 |             this.donations.forEach(d=>total+=d.entries.length)
43 |             return total
44 |         }
45 |     },
46 |     methods:{
47 |         sub(evt){
48 |             evt.preventDefault()
49 |             this.show = true
50 |             this.processing = true
51 | 
52 |             server.log("INFO", "consent given, uploading file")
53 | 
54 |             fetch('/upload', {
55 |                 method: 'POST',
56 |                 mode: 'same-origin',
57 |                 credentials: 'same-origin',
58 |                 headers: {
59 |                 'Content-Type': 'application/json'
60 |                 },
61 |                 body: JSON.stringify(this.donations)
62 |             })
63 |             .then(() => {
64 |             // remove processing queue
65 |                 this.processing = false
66 |                 noDonationYet = false
67 |                 this.show = false
68 |                 this.$bvModal.hide("consent-modal")
69 |                 this.$parent.$parent.donations = []
70 |                 document.getElementById('thankyou').classList.remove('invisible')
71 |             })
72 |             .catch(error => {
73 |             console.log('Error', error)
74 |             server.log("ERROR", "failed to upload file")
75 |             })
76 | 
77 |         }
78 |     }
79 | 
80 | }
81 | </script>


--------------------------------------------------------------------------------
/osd2f/javascript/visualization_components/donationContainer.vue:
--------------------------------------------------------------------------------
 1 | <template>
 2 | <div>
 3 |     <div v-if="donations.length>0">
 4 |         <div class='row justify-content-center p-4'>
 5 |             <h2> {{content.file_indicator_text}} {{total_entries}}</h2>
 6 |         </div>
 7 | 
 8 |             <div class="row justify-content-center p-4">
 9 |                 <b-row>
10 |                     <b-col ><b-button size="lg" variant="success" @click="showConsentModal">{{ content.donate_button }}</b-button></b-col>
11 |                     <b-col ><b-button size="sm" v-b-toggle.edit-donation variant="outline-primary"> {{ content.inspect_button }} </b-button></b-col>
12 |                 </b-row>
13 |             </div>
14 | 
15 |         <div class="row justify-content-center p-4">
16 |             <b-collapse id="edit-donation">
17 |                 <!-- actual inspection / remove interface -->
18 |                 <div class="row justify-content-center p-2">
19 |                     <h4> {{ content.preview_component.title }} </h4>
20 |                 </div>
21 |                 <b-tabs  card-vertical content-class="mt-3" v-model="tabIndex" justified fill>
22 |                     <b-tab :title=content.preview_component.title class="p-2"> 
23 |                         <h5> {{ content.preview_component.title }} </h5>
24 |                         <p v-for="p in content.preview_component.explanation">
25 |                             {{p}}
26 |                         </p>
27 |                         </b-tab>
28 |                     <b-tab v-for="fileob in donations" :title="fileob.filename" :key="fileob.filename" lazy>
29 |                         <donation-table v-bind:filedata=fileob v-bind:content=content></donation-table>
30 |                     </b-tab>
31 |                 </b-tabs>
32 | 
33 |                 <div class="text-center">
34 |                     <b-button-group class="mt-2">
35 |                         <b-button variant="primary" @click="tabIndex--">{{content.preview_component.previous_file_button}}</b-button>
36 |                         <b-button variant="primary" @click="tabIndex++">{{content.preview_component.next_file_button}}</b-button>
37 |                     </b-button-group>
38 |                 </div>
39 |                 
40 | 
41 |             </b-collapse>
42 |             <consent-confirmation :donations=this.donations :content=this.content></consent-confirmation>
43 |         </div>
44 |     </div>
45 | </div>
46 | </template>
47 | 
48 | <script>
49 | 
50 | import donationTable from './donationTable'
51 | import consentConfirmation from './consentConfirmation'
52 | import {server} from '../server_interaction.js'
53 | 
54 | export default {
55 |   components: { donationTable,  consentConfirmation},
56 |     props : {
57 |         donations: Array,
58 |         content: Object
59 |         },
60 |     updated : function(){
61 |         server.log("INFO", "Tables shown changed", window.sid)
62 |     },
63 |     data() {
64 |         return {
65 |             tabIndex: 0
66 |         }
67 |     },
68 |     computed:{
69 |         total_entries () {
70 |             let total = 0
71 |             this.donations.forEach(d=>total+=d.entries.length)
72 |             return total
73 |         }
74 |     },
75 |     methods: {
76 |         showConsentModal(){
77 |             this.$bvModal.show('consent-modal')
78 |             server.log("INFO", "Consent modal shown")
79 |         }
80 |     }
81 | 
82 | }
83 | </script>


--------------------------------------------------------------------------------
/osd2f/javascript/visualization_components/donationTable.vue:
--------------------------------------------------------------------------------
  1 | <template>
  2 |  <div>
  3 |      <h5> File: {{ filedata.filename }} </h5>
  4 | 
  5 |     <span>{{content.preview_component.entries_in_file_text}} <b> {{ this.rows }} </b></span>
  6 |     
  7 |     <div class="row">
  8 |         <b-input-group :prepend="content.preview_component.search_prompt" class="mt-3">
  9 |             
 10 |             <b-form-input
 11 |                 id="filter-input"
 12 |                 v-model="filter"
 13 |                 type="search"
 14 |                 :placeholder="content.preview_component.search_box_placeholder"
 15 |             ></b-form-input>
 16 | 
 17 |             <b-input-group-append>
 18 |                 <b-button
 19 |                     variant="danger"
 20 |                     @click="removeSelection"
 21 |                     small
 22 |                 >{{content.preview_component.remove_rows_button}}</b-button>
 23 |             </b-input-group-append>
 24 |         </b-input-group>
 25 |     </div>
 26 |     <div class="row">
 27 |         <div class="col-10">
 28 |             <b-table 
 29 |                 id="file-table"
 30 |                 small
 31 |                 responsive
 32 |                 sticky-header="1200px"
 33 |                 thClass="col-2"
 34 |                 hover 
 35 |                 selectable
 36 |                 :items="items" 
 37 |                 :fields="showfields" 
 38 |                 :per-page="perPage" 
 39 |                 :current-page="currentPage"
 40 |                 :filter="filter"
 41 |                 @row-selected="onRowSelected"
 42 |                 @filtered="onFilterApply"
 43 |             >
 44 |             </b-table>
 45 |         </div>
 46 |     </div>
 47 | 
 48 |     <b-pagination 
 49 |         v-model="currentPage" 
 50 |         aria-controls="file-table"
 51 |         :total-rows="rows"
 52 |         :per-page="perPage"
 53 |     ></b-pagination>
 54 | 
 55 | 
 56 | </div>
 57 | </template>
 58 | 
 59 | <script>
 60 | import {server} from '../server_interaction'
 61 | export default {
 62 |     props: {
 63 |         fields: Array,
 64 |         filedata: Object,
 65 |         content: Object
 66 |     },
 67 |     // infer fields from entries
 68 |     created: function(){
 69 |         if (this.filedata.entries == undefined){return}
 70 |       var fields = new Set
 71 |       for (let i=0; i<this.filedata.entries.length; i++){
 72 |           Object.keys(this.filedata.entries[i]).forEach((f)=>fields.add(f))
 73 |       }
 74 |       let showfields = new Array
 75 |       fields.forEach(f => {
 76 |           let o = new Object
 77 |           o[f] = { 
 78 |               "label": f, 
 79 |               "tdClass":"colClass",
 80 |               "sortable" : true
 81 |               } 
 82 |           showfields.push(o)}
 83 |           )
 84 |       
 85 |       this.showfields = showfields
 86 |     },
 87 |     data(){
 88 |         return {
 89 |             perPage: 5,
 90 |             currentPage: 1,
 91 |             filter: null,
 92 |             selected : null
 93 |         }
 94 |     },
 95 |     computed: {
 96 |         rows(){
 97 |          try {
 98 |              return this.filedata.entries.length
 99 |          } catch {
100 |              console.log("no file yet")
101 |              return 0
102 |          }
103 |         },
104 |         items(){
105 |             if (this.filedata.entries == undefined || this.filedata.entries==null){
106 |                 return []
107 |             }
108 |             
109 |             return this.filedata.entries
110 |         }
111 |     },
112 |     methods: {
113 |         onRowSelected(items){
114 |             if (items.length>0){server.log("INFO","select row", this.filedata.submission_id)}
115 |             this.selected = items
116 |         },
117 |         removeSelection(){
118 |             if(this.selected==null){return}
119 |             server.log("INFO",`removed rows`, window.sid, {rows_removed:this.selected.length})
120 |             this.filedata.n_deleted += this.selected.length
121 |             this.filedata.entries = this.filedata.entries.filter(e => !this.selected.includes(e))
122 |             this.selected = null
123 |         },
124 |         onFilterApply(){
125 |             this.currentPage = 1
126 |         }
127 |     }
128 | 
129 | }
130 | </script>


--------------------------------------------------------------------------------
/osd2f/javascript/visualize.js:
--------------------------------------------------------------------------------
 1 | 'use strict'
 2 | 
 3 | import Vue from 'vue'
 4 | import { BootstrapVue, IconsPlugin } from 'bootstrap-vue'
 5 | import donationTable from './visualization_components/donationTable'
 6 | import donationContainer from './visualization_components/donationContainer'
 7 | 
 8 | // Import Bootstrap an BootstrapVue CSS files (order is important)
 9 | import 'bootstrap/dist/css/bootstrap.css'
10 | import 'bootstrap-vue/dist/bootstrap-vue.css'
11 | 
12 | Vue.use(BootstrapVue, IconsPlugin)
13 | 
14 | var app = new Vue({
15 |   el: '#visualization',
16 |   components: {
17 |     'donation-table': donationTable,
18 |     'donation-container': donationContainer
19 |   },
20 |   data: {
21 |     filedata: {},
22 |     fields: [],
23 |     donations: [],
24 |     content : {}
25 |   }
26 | })
27 | 
28 | // Placeholder visualization
29 | export function visualize (d,c) {
30 |   app.content = c.upload_page
31 |   app.donations = d
32 | }
33 | 


--------------------------------------------------------------------------------
/osd2f/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | logger = logging.getLogger("osd2f")
4 | 


--------------------------------------------------------------------------------
/osd2f/security/__init__.py:
--------------------------------------------------------------------------------
 1 | """Security wrappers dynamically set at request time.
 2 | 
 3 | Wrapper functions should have the (Callable, *args, **kwargs) -> Response signature.
 4 | """
 5 | 
 6 | import os
 7 | from functools import wraps
 8 | 
 9 | # Wrapper implementations for Authentication
10 | from .authorization.basic_auth import basic_authentication
11 | from .authorization.microsoft_msal import microsoft_msal_authentication
12 | from .authorization.not_confgured import no_authentication
13 | from .download_encryption.encrypted_zipfile import string_to_zipfile  # noqa
14 | from .secrets import azure_keyvault  # Environment secret resolvers
15 | from ..logger import logger  # Global module logger
16 | 
17 | RESOLVERS = {azure_keyvault.PREFIX: azure_keyvault.azure_keyvault_replace}
18 | 
19 | 
20 | def authorization_required(func):
21 |     """A decorator that implements authorization depending on configuration"""
22 | 
23 |     @wraps(func)
24 |     async def decorated_path(*args, **kwargs):
25 |         if os.environ.get("MSAL_CONFIG"):
26 |             logger.info("Using MSAL authentication")
27 |             return await microsoft_msal_authentication(func, *args, **kwargs)
28 |         if os.environ.get("OSD2F_BASIC_AUTH"):
29 |             logger.info("Using basic auth, NOT RECOMMENDED FOR PRODUCTION")
30 |             return await basic_authentication(func, *args, **kwargs)
31 |         else:
32 |             logger.info("Fall back to no authentication")
33 |             return await no_authentication(func, *args, **kwargs)
34 | 
35 |     return decorated_path
36 | 
37 | 
38 | def translate_environment_vars():
39 |     """Translate environment variable values to their secrets.
40 | 
41 |     Assumes environment variables matching a pattern:
42 |      `SECRETSTORE_PREFIX::DELIMITED::ARGUMENTS`
43 | 
44 |     should be translated by their respective resolver functions.
45 | 
46 |     """
47 |     # iterate through environment variables, re-assign if they match a resolver
48 |     # prefix.
49 |     for var, value in os.environ.items():
50 |         for prefix, func in RESOLVERS.items():
51 |             if value.startswith(prefix):
52 |                 os.environ[var] = func(value)
53 | 
54 | 
55 | def translate_value(value: str) -> str:
56 |     """Translate a given value to the appropriate secret.
57 | 
58 |     Assumes the value matches a pattern of a known resolver, e.g.:
59 |         `SECRETSTORE_PREFIX::DELIMITED::ARGUMENTS`
60 | 
61 |     secrets are resolved by their matching RESOLVERS function.
62 | 
63 |     String not matching any known prefix are ignored.
64 |     """
65 |     for prefix, func in RESOLVERS.items():
66 |         if value.startswith(prefix):
67 |             logger.debug(f"{value} resolved using {func.__name__}")
68 |             return func(value)
69 | 
70 |     logger.debug(f"{value} did not match any registered resolver.")
71 |     return value
72 | 


--------------------------------------------------------------------------------
/osd2f/security/authorization/__init__.py:
--------------------------------------------------------------------------------
1 | USER_FIELD = "user"
2 | 


--------------------------------------------------------------------------------
/osd2f/security/authorization/basic_auth.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from quart import Response, redirect, request, session
 4 | 
 5 | from ..authorization import USER_FIELD
 6 | from ...database import insert_log
 7 | 
 8 | 
 9 | async def basic_authentication(func, *args, **kwargs):
10 | 
11 |     # with active authorized session
12 |     if session.get(USER_FIELD):
13 |         await insert_log(
14 |             "server",
15 |             "INFO",
16 |             "download access by authorized user",
17 |             entry={USER_FIELD: session.get(USER_FIELD), "path": request.url},
18 |             user_agent_string=request.headers.get("User-Agent"),
19 |         )
20 |         return await func(*args, **kwargs)
21 | 
22 |     if not request.path.endswith("/login"):
23 |         session["CALLBACK"] = request.url
24 |         return redirect("/login")
25 | 
26 |     user, passw = os.environ["OSD2F_BASIC_AUTH"].split(";")
27 | 
28 |     ra = request.authorization
29 | 
30 |     if not ra:
31 |         return Response(
32 |             "", status=401, headers={"WWW-Authenticate": "Basic realm='data-donation'"}
33 |         )
34 | 
35 |     authenticated = (
36 |         ra and ra.type == "basic" and ra.username == user and ra.password == passw
37 |     )
38 | 
39 |     redirect_target = session.pop("CALLBACK", "/")
40 | 
41 |     if authenticated:
42 |         session[USER_FIELD] = f"{user}"
43 |         return redirect(redirect_target)
44 | 
45 |     return redirect("/")
46 | 


--------------------------------------------------------------------------------
/osd2f/security/authorization/microsoft_msal.py:
--------------------------------------------------------------------------------
  1 | """Microsoft MSAL authentication wrapper
  2 | 
  3 | Contains the route-wrapper for `auth()` paths configured to use
  4 | Microsoft Authentication Library.
  5 | 
  6 | Expects a JSON configuration set as a single string in the `MSAL_CONFIG` field.
  7 | """
  8 | 
  9 | import os
 10 | 
 11 | 
 12 | import msal
 13 | 
 14 | from quart import redirect, request, session
 15 | 
 16 | from ..authorization import USER_FIELD
 17 | from ...database import insert_log
 18 | from ...definitions import MSALConfiguration
 19 | from ...logger import logger
 20 | 
 21 | CALLBACK_FIELD = "callback_after_login"
 22 | 
 23 | 
 24 | async def microsoft_msal_authentication(func, *args, **kwargs):
 25 | 
 26 |     # with active authorized session
 27 |     if session.get(USER_FIELD):
 28 |         await insert_log(
 29 |             "server",
 30 |             "INFO",
 31 |             "download access by authorized user",
 32 |             entry={USER_FIELD: session.get(USER_FIELD), "path": request.url},
 33 |             user_agent_string=request.headers.get("User-Agent"),
 34 |         )
 35 |         return await func(*args, **kwargs)
 36 | 
 37 |     msal_auth = os.environ.get("MSAL_CONFIG")
 38 |     config = MSALConfiguration.model_validate_json(msal_auth)
 39 | 
 40 |     authorizer = msal.ConfidentialClientApplication(
 41 |         config.client_id,
 42 |         authority=config.authority,
 43 |         client_credential=config.secret,
 44 |     )
 45 |     accepted_users = [u.strip() for u in config.allowed_users.split(";")]
 46 | 
 47 |     # new user
 48 |     if not session.get(USER_FIELD) and not session.get("flow"):
 49 |         if not request.path.endswith("login"):
 50 |             session[CALLBACK_FIELD] = request.url
 51 |             logger.debug(f"Redirecting to `/login' from {request.url}")
 52 |             return redirect("/login")
 53 | 
 54 |         flow = authorizer.initiate_auth_code_flow(
 55 |             config.scope,
 56 |             redirect_uri=config.redirect_url,
 57 |         )
 58 |         session["flow"] = flow
 59 |         return redirect(flow["auth_uri"])
 60 | 
 61 |     # returning from microsoft authentication portal
 62 |     elif session.get("flow"):
 63 |         try:
 64 |             token = authorizer.acquire_token_by_auth_code_flow(
 65 |                 session.get("flow"), request.args
 66 |             )
 67 |         except ValueError:
 68 |             await insert_log(
 69 |                 "server", "WARN", "unable to acquire token for authentication"
 70 |             )
 71 |             session.clear()
 72 |             return 'Something went wrong, please <a href="/login"> try again </a>'
 73 |         session.pop("flow")
 74 |         if "id_token_claims" not in token:
 75 |             await insert_log(
 76 |                 "server",
 77 |                 "WARN",
 78 |                 "MSAL response did not contain `id_token_claims`, this may indicate "
 79 |                 "that the configuration must be checked by an organizational "
 80 |                 "administrator or is otherwise incomplete.",
 81 |             )
 82 |             return (
 83 |                 "This app is unable to verify your identity due to lacking rigths.",
 84 |                 500,
 85 |             )
 86 |         if token["id_token_claims"].get("preferred_username") in accepted_users:
 87 |             session[USER_FIELD] = token["id_token_claims"].get("preferred_username")
 88 | 
 89 |             callback_url = session.pop(CALLBACK_FIELD, request.url)
 90 |             logger.debug(f"Done authentication flow, returning user to {callback_url}")
 91 |             return redirect(callback_url)
 92 |         else:
 93 |             await insert_log(
 94 |                 "server",
 95 |                 "WARN",
 96 |                 "unauthorized access attempt",
 97 |                 user_agent_string=request.headers.get("User-Agent"),
 98 |             )
 99 |             return "Your account is not authorized", 403
100 | 
101 |     return redirect("/")
102 | 


--------------------------------------------------------------------------------
/osd2f/security/authorization/not_confgured.py:
--------------------------------------------------------------------------------
 1 | """Contains the route-override when no authentication is configured
 2 | """
 3 | 
 4 | 
 5 | async def no_authentication(func, *args, **kwargs):
 6 |     return (
 7 |         "Page unavailable: Authorization must be configured "
 8 |         "unless the app is in testing or debug mode.",
 9 |         501,
10 |     )
11 | 


--------------------------------------------------------------------------------
/osd2f/security/download_encryption/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/security/download_encryption/__init__.py


--------------------------------------------------------------------------------
/osd2f/security/download_encryption/encrypted_zipfile.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | 
 3 | import pyzipper
 4 | 
 5 | 
 6 | def string_to_zipfile(file_content: io.StringIO, filename: str, password: str) -> bytes:
 7 |     """Write a string body to a file in an encrypted zip archive."""
 8 |     zipio = io.BytesIO()
 9 |     with pyzipper.AESZipFile(zipio, "w", encryption=pyzipper.WZ_AES) as zipfile:
10 |         zipfile.setpassword(password.encode())
11 |         zipfile.writestr(filename, file_content.getvalue())
12 |     return zipio.getvalue()
13 | 


--------------------------------------------------------------------------------
/osd2f/security/entry_encryption/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/security/entry_encryption/__init__.py


--------------------------------------------------------------------------------
/osd2f/security/entry_encryption/file_decryption.py:
--------------------------------------------------------------------------------
  1 | import abc
  2 | import csv
  3 | import json
  4 | import pathlib
  5 | from typing import Any, Dict, Iterable, List
  6 | 
  7 | from ...definitions.submissions import (
  8 |     EncryptedEntry,
  9 |     EncryptedSubmission,
 10 |     OutputSubmission,
 11 | )
 12 | from ...logger import logger
 13 | from ...security.entry_encryption.secure_entry_singleton import SecureEntry
 14 | 
 15 | 
 16 | class EntryFile(abc.ABC):
 17 |     def __init__(self, filename: pathlib.Path, read_mode: bool):
 18 |         self.read_mode = read_mode
 19 |         if not read_mode and pathlib.Path(filename).exists():
 20 |             raise Exception(f"File {filename} already exists!")
 21 | 
 22 |         parent_dir = pathlib.Path(filename).parent
 23 |         if not pathlib.Path(parent_dir).exists():
 24 |             pathlib.Path(parent_dir).mkdir(parents=True, exist_ok=True)
 25 | 
 26 |         if read_mode:
 27 |             self.file_obj = open(filename)
 28 |         else:
 29 |             self.file_obj = open(filename, "w+")
 30 | 
 31 |     def __del__(self):
 32 |         if hasattr(self, "file_obj") and not self.file_obj.closed:
 33 |             self.file_obj.close()
 34 | 
 35 |     def __exit__(self):
 36 |         if hasattr(self, "file_obj") and not self.file_obj.closed:
 37 |             self.file_obj.close()
 38 |         self.__del__()
 39 | 
 40 |     @abc.abstractmethod
 41 |     def read_entries(self) -> Iterable[OutputSubmission]:
 42 |         return dict()
 43 | 
 44 |     @abc.abstractmethod
 45 |     def append(self, entry: OutputSubmission) -> None:
 46 |         return
 47 | 
 48 | 
 49 | class JSONFile(EntryFile):
 50 |     def __init__(self, filename: pathlib.Path, read_mode: bool):
 51 |         self.entries: List[Dict[str, Any]] = []
 52 |         super().__init__(filename, read_mode)
 53 | 
 54 |     def read_entries(self) -> Iterable[OutputSubmission]:
 55 |         for raw_submission in json.load(self.file_obj):
 56 |             submission = EncryptedSubmission.model_validate(raw_submission)
 57 |             try:
 58 |                 EncryptedEntry.model_validate(submission.entry)
 59 |             except ValueError:
 60 |                 logger.warning("Encountered an unencrypted entry!")
 61 |                 yield OutputSubmission.model_validate(raw_submission)
 62 |             decrypted_sub = OutputSubmission.model_validate(raw_submission)
 63 |             decrypted_sub.entry = SecureEntry.read_entry_field(decrypted_sub.entry)
 64 |             yield decrypted_sub
 65 | 
 66 |     def append(self, entry: OutputSubmission) -> None:
 67 |         self.entries.append(entry.model_dump())
 68 | 
 69 |     def __del__(self):
 70 |         if (
 71 |             hasattr(self, "file_obj")
 72 |             and not self.file_obj.closed
 73 |             and not self.read_mode
 74 |         ):
 75 |             json.dump(self.entries, self.file_obj)
 76 |         self.entries = []
 77 |         return super().__del__()
 78 | 
 79 | 
 80 | class CSVFile(EntryFile):
 81 |     def __init__(self, filename: pathlib.Path, read_mode: bool):
 82 |         super().__init__(filename, read_mode)
 83 |         if not read_mode:
 84 |             headers = OutputSubmission.model_fields.keys()
 85 |             self.writer = csv.DictWriter(self.file_obj, fieldnames=headers)
 86 |             self.writer.writeheader()
 87 | 
 88 |     def read_entries(self) -> Iterable[OutputSubmission]:
 89 |         line = self.file_obj.readline().strip()
 90 |         header = line.split(csv.excel.delimiter)
 91 |         reader = csv.DictReader(self.file_obj, fieldnames=header)
 92 |         for e in reader:
 93 |             re: Dict[str, Any] = {k: v for k, v in e.items() if k != "entry"}
 94 |             re["entry"] = SecureEntry.read_entry_field(eval(e["entry"]))
 95 |             yield OutputSubmission.model_validate(re)
 96 | 
 97 |     def append(self, entry: OutputSubmission) -> None:
 98 | 
 99 |         self.writer.writerow(entry.model_dump())
100 | 
101 | 
102 | def decrypt_file(input_path: pathlib.Path, output_path: pathlib.Path) -> int:
103 | 
104 |     input_file: EntryFile
105 | 
106 |     if input_path.suffix == ".json":
107 |         input_file = JSONFile(input_path, read_mode=True)
108 | 
109 |     elif input_path.suffix == ".csv":
110 |         input_file = CSVFile(input_path, read_mode=True)
111 |     else:
112 |         raise NotImplementedError(
113 |             f"Unknown INPUT file type {input_path.suffix}, "
114 |             "make sure you unzipped the file."
115 |         )
116 | 
117 |     output_file: EntryFile
118 | 
119 |     if output_path.suffix == ".json":
120 |         output_file = JSONFile(output_path, read_mode=False)
121 |     elif output_path.suffix == ".csv":
122 |         output_file = CSVFile(output_path, read_mode=False)
123 |     else:
124 |         raise NotImplementedError(
125 |             f"Unknown OUTPUT file type {output_path.suffix}, "
126 |             "output should end with `.csv` or `.json`."
127 |         )
128 | 
129 |     touched_entries = 0
130 |     for entry in input_file.read_entries():
131 |         output_file.append(entry)
132 |         touched_entries += 1
133 | 
134 |     return touched_entries
135 | 


--------------------------------------------------------------------------------
/osd2f/security/entry_encryption/secure_entry_singleton.py:
--------------------------------------------------------------------------------
 1 | import base64
 2 | import json
 3 | import random
 4 | from typing import Any, Dict
 5 | 
 6 | from cryptography.fernet import Fernet
 7 | from cryptography.hazmat.primitives import hashes
 8 | from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
 9 | 
10 | 
11 | from ...logger import logger
12 | 
13 | 
14 | class SecureEntry:
15 | 
16 |     __encryption_secret: bytes = b""
17 |     __decrypt_on_read: bool = True
18 | 
19 |     @classmethod
20 |     def set_secret(cls, secret: str):
21 |         if not secret:
22 |             cls.__encryption_secret = b""
23 |         else:
24 |             cls.__encryption_secret = cls.__create_key(secret.encode())
25 | 
26 |     @classmethod
27 |     def decrypt_on_read(cls, must_decrypt_on_read: bool):
28 |         cls.__decrypt_on_read = must_decrypt_on_read
29 | 
30 |     @classmethod
31 |     def write_entry_field(cls, entry_field: Dict[str, Any]) -> Dict[str, Any]:
32 |         if not cls.__encryption_secret:
33 |             return entry_field
34 |         f = Fernet(cls.__encryption_secret)
35 |         return {"encrypted": f.encrypt(json.dumps(entry_field).encode()).decode()}
36 | 
37 |     @classmethod
38 |     def read_entry_field(cls, entry_field: Dict[str, Any]) -> Dict[str, Any]:
39 |         if not cls.__encryption_secret or not cls.__decrypt_on_read:
40 |             return entry_field
41 |         encrypted_content = entry_field.get("encrypted")
42 | 
43 |         if not encrypted_content:
44 |             logger.warning(
45 |                 "Entry encryption was set, but an unencrypted " "entry was retrieved!"
46 |             )
47 |             return entry_field
48 |         f = Fernet(cls.__encryption_secret)
49 |         content = f.decrypt(encrypted_content.encode())
50 |         return json.loads(content.decode())
51 | 
52 |     @staticmethod
53 |     def __create_key(password: bytes) -> bytes:
54 |         random.seed(len(password))
55 |         salt = bytes(random.randint(0, 10**6))
56 |         kdf = PBKDF2HMAC(
57 |             algorithm=hashes.SHA256(),
58 |             length=32,
59 |             salt=salt,
60 |             iterations=320_000,
61 |         )
62 |         key = base64.urlsafe_b64encode(kdf.derive(password))
63 |         return key
64 | 


--------------------------------------------------------------------------------
/osd2f/security/secrets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/security/secrets/__init__.py


--------------------------------------------------------------------------------
/osd2f/security/secrets/azure_keyvault.py:
--------------------------------------------------------------------------------
 1 | from azure.identity import DefaultAzureCredential
 2 | from azure.keyvault.secrets import SecretClient
 3 | 
 4 | from ...logger import logger
 5 | 
 6 | PREFIX = "azure-keyvault"
 7 | 
 8 | 
 9 | def azure_keyvault_replace(value: str) -> str:
10 |     """translates environment variables formatted like:
11 | 
12 |     azure-keyvault::example.vault.azure.net::secret_name
13 | 
14 |     to the contents of the referred secrets.
15 | 
16 |     """
17 |     if len(value.split("::")) != 3:
18 |         logger.critical(
19 |             f"azure value {value} is incorrectly formatted, "
20 |             f"should be `{PREFIX}::KEYVAULT_URL::SECRET_NAME`"
21 |         )
22 |         exit()
23 |     _, keyvault_url, secret_name = value.split("::")
24 |     cred = DefaultAzureCredential()
25 |     client = SecretClient(keyvault_url, cred)
26 |     secret = client.get_secret(secret_name).value
27 |     return secret or ""
28 | 


--------------------------------------------------------------------------------
/osd2f/settings/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/settings/.DS_Store


--------------------------------------------------------------------------------
/osd2f/settings/default_content_settings.yaml:
--------------------------------------------------------------------------------
  1 | project_title: OSD2F
  2 | contact_us: email@domain.tld
  3 | static_pages:
  4 |   home:
  5 |     active: True
  6 |     name: The OSD2F project
  7 |     blocks:
  8 |       - type: jumbotron
  9 |         title: Data donation made easy
 10 |         id: top
 11 |         image: "/static/skull_phone_cc.jpg"
 12 |         lines:
 13 |           - A general way of donating data
 14 |           - For JSON based GDPR exports
 15 |           - To use with external survey and analysis tools
 16 |         buttons:
 17 |           - name: About the project
 18 |             label: "btn-primary"
 19 |             link: "#project"
 20 |           - name: How it works
 21 |             label: "btn-success"
 22 |             link: "/donate"
 23 | 
 24 |       - type: two_block_row
 25 |         id: project
 26 |         image: "/static/study_cc.jpg"
 27 |         image_pos: left
 28 |         title: OSD2F provides a whitelist based collection website
 29 |         lines:
 30 |           - Under GDPR, everyone should be able to export <br> their data in machine-readable format
 31 |           - Many platform provide standardized ways to get this data by <br> exporting it as a set of JSON files
 32 |           - This app allows researchers to easily and safely collect exported data donated by participants in their studies
 33 |         buttons:
 34 |           - name: About the developers
 35 |             label: "btn-primary"
 36 |             link: "#team"
 37 |           - name: Donate now
 38 |             label: "btn-success"
 39 |             link: "/donate"
 40 | 
 41 |       - type: two_block_row
 42 |         id: team
 43 |         title: Open Source Data Donation Framework
 44 |         lines:
 45 |           - The digital traces that people leave through their use of various online platforms provide tremendous opportunities for studying human behavior.
 46 |           - However, the collection of these data is hampered by legal, ethical and technical challenges.
 47 |           - We present a framework and tool for collecting these data through a data donation platform where consenting participants can securely submit their digital traces.
 48 |         circles_row:
 49 |           - title: Prof. Dr. Hypothetical collaborator
 50 |             subtitle: this person does not exist
 51 |             image: "https://thispersondoesnotexist.com/image"
 52 |           - title: Sponsored by
 53 |             subtitle: A random image
 54 |             image: "https://picsum.photos/200/300.jpg"
 55 |         buttons:
 56 |           - name: About the project
 57 |             label: "btn-primary"
 58 |             link: "#team"
 59 |           - name: Donate now
 60 |             label: "btn-success"
 61 |             link: "/donate"
 62 | 
 63 |   privacy:
 64 |     active: True
 65 |     name: Privacy protection
 66 |     blocks:
 67 |       - type: jumbotron
 68 |         title: Data donation made easy
 69 |         id: top
 70 |         lines:
 71 |           - A simple application for donation collection
 72 |         buttons: []
 73 | 
 74 |   donate:
 75 |     active: False
 76 |     name: Donation Page
 77 |     blocks: []
 78 | 
 79 | upload_page:
 80 |   blocks: []
 81 |   upload_box:
 82 |     header: "Select file(s):"
 83 |     explanation:
 84 |       - "You can use the file selector to select the zipfile from your platform"
 85 |       - "You can also drag the folder into this box"
 86 |   thanks_text: "Thanks for trying osd2f"
 87 |   processing_text: "processing your donation for preview"
 88 |   empty_selection: "Your selection does not contain any files appropriate to our study. <br> Did you select the correct (zip) file?"
 89 |   file_indicator_text: "Entries in your donation: "
 90 |   donate_button: "Donate"
 91 |   inspect_button: "Inspect & edit"
 92 |   preview_component:
 93 |     entries_in_file_text: "Entries in this file: "
 94 |     title: "Inspect & Edit your donation"
 95 |     explanation:
 96 |       - The top shows the files in your donation
 97 |       - In each file, you can search for content
 98 |       - You can remove content by clicking on the rows and pressing "remove selection"
 99 |     remove_rows_button: "remove selected rows"
100 |     search_prompt: "Search in file"
101 |     search_box_placeholder: "type to search this file"
102 |     previous_file_button: Previous file
103 |     next_file_button: Next file
104 |   consent_popup:
105 |     title: "I want to donate my data..."
106 |     lead: "This box explains the conditions of your donation"
107 |     points:
108 |       - how data is secured
109 |       - what the data will be used for
110 |       - how long the data will be stored
111 |     end_text: "By clicking below, you consent to these terms..."
112 |     decline_button: "I do not consent"
113 |     accept_button: "I consent"
114 | 


--------------------------------------------------------------------------------
/osd2f/settings/default_upload_settings.yaml:
--------------------------------------------------------------------------------
 1 | # Sample Platform upload settings
 2 | files:
 3 |   (^|/|\\)comments.json:
 4 |     # in key is required when the initial level of data
 5 |     # is not a list but an object (e.g. {} instead of [])
 6 |     # if keys are nested more than once, use a '.' to separate
 7 |     # the levels of the keys
 8 |     in_key: 'comment_information'
 9 |     anonymizers:
10 |       - redact_text: ''
11 |     # accepted_fields to include in the upload
12 |     # remove a field to filter it out
13 |     accepted_fields:
14 |       - timestamp
15 |       - title
16 |       - information.comment.comment_text
17 | 
18 |   # posts can be split into multiple files
19 |   # that end in `_<number>`.
20 |   posts(_(\d)+).json:
21 |     # anonymizers are functions applied on the server
22 |     # that can inspect the content of a potential donation
23 |     # and apply anonymization accordingly.
24 |     # the KEY is the function to apply, the VALUE a string-argument
25 |     anonymizers:
26 |       - redact_text: ''
27 |     accepted_fields:
28 |       - timestamp
29 |       - post_title
30 |       - keywords
31 |       - information.post.post_metadata.expanded_url
32 |       - information.post.post_metadata.source
33 |       - information.post.post_text
34 | 
35 |   # these are the likes & reactions of pages
36 |   engagement.json:
37 |     in_key: engagement_info
38 |     accepted_fields:
39 |       - engagement_type
40 |       - object
41 |       - timestamp
42 | 
43 |   ## alternative filename for likes & reactions of pages
44 |   companies_followed.json:
45 |     in_key: companies_followed
46 |     accepted_fields:
47 |       - company_name
48 |       - timestamp
49 | 
50 |   # these are the likes & reactions of posts & comments
51 |   profile_interests.json:
52 |     in_key: profile_interests
53 |     accepted_fields: []
54 | 
55 |   # Advertiser interactions file
56 |   ads_clicked.json:
57 |     # in_key :
58 |     accepted_fields:
59 |       - ad_title
60 |       - activity
61 |       - timestamp
62 | 
63 |   # Advertisers who uploaded user information
64 |   advertisers_who_uploaded_a_contact_list_with_your_information.json:
65 |     in_key: custom_audiences_v2
66 |     accepted_fields: []
67 | 
68 |   # Privacy Checkup
69 |   privacy_checkup_interactions.json:
70 |     in_key: privacy_checkup_interactions_v2
71 |     accepted_fields:
72 |       - name
73 |       - started_timestamp
74 |       - completed_timestamp
75 | 
76 |   # activity of an account
77 |   account_activity.json:
78 |     in_key: account_activity_v2
79 |     accepted_fields:
80 |       - action
81 |       - country
82 |       - site_name
83 |       - timestamp
84 | 
85 |   # short messages with a '.' in the in-key
86 |   messages.json:
87 |     in_key: messages.collection
88 |     accepted_fields:
89 |       - id
90 |       - message
91 | 


--------------------------------------------------------------------------------
/osd2f/static/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/static/.DS_Store


--------------------------------------------------------------------------------
/osd2f/static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/static/favicon.ico


--------------------------------------------------------------------------------
/osd2f/static/js/libarchive/wasm-gen/libarchive.wasm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/static/js/libarchive/wasm-gen/libarchive.wasm


--------------------------------------------------------------------------------
/osd2f/static/js/main.js.LICENSE.txt:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Vue.js v2.6.12
3 |  * (c) 2014-2020 Evan You
4 |  * Released under the MIT License.
5 |  */
6 | 


--------------------------------------------------------------------------------
/osd2f/static/keylock.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/static/keylock.png


--------------------------------------------------------------------------------
/osd2f/static/skull_phone_cc.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/static/skull_phone_cc.jpg


--------------------------------------------------------------------------------
/osd2f/static/study_cc.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/static/study_cc.jpg


--------------------------------------------------------------------------------
/osd2f/templates/blocks/bootstrap_scripts.html.jinja:
--------------------------------------------------------------------------------
1 |     <script src="https://code.jquery.com/jquery-3.5.1.slim.min.js" integrity="sha384-DfXdz2htPH0lsSSs5nCTpuj/zy4C+OGpamoFVy38MVBnE+IbbVYUew+OrCXaRkfj" crossorigin="anonymous"></script>
2 |     <script src="https://cdn.jsdelivr.net/npm/bootstrap@4.5.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-ho+j7jyWK8fNQe+A12Hb8AhRq26LrZ/JpcUGGOn+Y7RsweNrtN/tE3MoK7ZeZDyx" crossorigin="anonymous"></script>


--------------------------------------------------------------------------------
/osd2f/templates/blocks/circles_row.html.jinja:
--------------------------------------------------------------------------------
 1 | {% if contentblock.circles_row %}
 2 | <div class="row justify-content-center">
 3 |     {% for circleItem in contentblock.circles_row %}
 4 |         <div class="col-sm-4 text-center">
 5 |             <img src="{{circleItem.image}}" alt="" class="rounded-circle image-fluid" style="width: 100px; height: 100px;">
 6 |             <p class="m-0" style="font-size: x-small;">{{ circleItem.title | safe }}</p>
 7 |             <p class="text-muted" style="font-size: x-small;">{{ circleItem.subtitle | safe }}</p>
 8 |         </div>
 9 |     {% endfor %}
10 | </div>
11 | {% endif %}


--------------------------------------------------------------------------------
/osd2f/templates/blocks/footer.html.jinja:
--------------------------------------------------------------------------------
1 |     <footer>
2 |         <div class="container-fluid bg-dark m-0 pt-2 pb-2 text-center text-white text-bold">
3 |             <p style="font-size: x-small;" class="m-0">Contact us at: {{ content_settings.contact_us }}</p>
4 |         </div>
5 |     </footer>


--------------------------------------------------------------------------------
/osd2f/templates/blocks/head.html.jinja:
--------------------------------------------------------------------------------
 1 | <head>
 2 |     <meta charset="utf-8">
 3 |     <meta http-equiv="X-UA-Compatible" content="IE=edge">
 4 |     <meta name="viewport" content="width=device-width, initial-scale=1">
 5 |     <meta name="description" content="">
 6 |     <meta name="author" content="">
 7 | 
 8 |     <link rel="shortcut icon" href="{{ url_for('static', filename='favicon.ico') }}">
 9 |     <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.5.3/dist/css/bootstrap.min.css" integrity="sha384-TX8t27EcRE3e/ihU7zmQxVncDAy5uIKz4rEkgIXeMed4M0jlfIDPvg6uqKI2xXr2" crossorigin="anonymous">
10 |     
11 |     {%  if all_links_new_tab   %}
12 |         <base target="_blank">
13 |     {% endif %}
14 | </head>


--------------------------------------------------------------------------------
/osd2f/templates/blocks/jumbotron.html.jinja:
--------------------------------------------------------------------------------
 1 | <section class="jumbotron m-0" style="background-image: url({{ contentblock.image }}); background-size: cover;">
 2 |     <div class="container pt-4 {{ 'text-white' if contentblock.image else '' }} text-center">
 3 |         {% if contentblock.title %} <h1>{{contentblock.title|safe}} </h1> {% endif %} 
 4 |         <br>
 5 |         {% for line in contentblock.lines %} 
 6 |             <p class="lead font-weight-bold">{{ line | safe }} </p> 
 7 |         {% endfor %} 
 8 |         <br>
 9 |         {% include "blocks/circles_row.html.jinja" %}
10 |         {% for button in contentblock.buttons %} 
11 |             <a href="{{button.link}}"><button class="btn {{button.label}}" type="button">{{button.name}}</button></a>
12 |         {% endfor %} 
13 | 
14 |     </div>
15 | </section>
16 | 


--------------------------------------------------------------------------------
/osd2f/templates/blocks/navbar.html.jinja:
--------------------------------------------------------------------------------
 1 | <header>
 2 |     <div class="navbar navbar-dark shadow-sm navbar-expand-lg bg-dark fixed-top">
 3 |         <a class="navbar-brand" href="/#">{{ content_settings.project_title}} </a>
 4 |         <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
 5 |             <span class="navbar-toggler-icon"></span>
 6 |         </button>
 7 |         <div class="collapse navbar-collapse" id="navbarNav">
 8 |             <ul class="navbar-nav ml-auto">
 9 | 
10 |                 {% for pagename, page in content_settings.static_pages.items() %}
11 |                     {% if page.active == True %}
12 |                     <li class="nav-item {{ 'active' if pagename==current_page else '' }}">
13 |                         <a class="nav-link" href="/{{ pagename }}">
14 |                             <span>{{page.name}}</span>
15 |                         </a>
16 |                     </li>
17 |                     {% endif %}
18 |             
19 |             {% endfor %} 
20 |             </ul>
21 |                 
22 | 
23 |         </div>
24 | 
25 |     </div>
26 | </header>


--------------------------------------------------------------------------------
/osd2f/templates/blocks/two_block_row.html.jinja:
--------------------------------------------------------------------------------
 1 | <div class="row bg-secondary justify-content-center">
 2 |     {% if  contentblock.image and contentblock.image_pos=="left" %}
 3 |         <div class="col-xl-5 col-lg-6 p-0 bg-primary"><img src="{{ contentblock.image }}" alt="" class="img-fluid" style="height: 100%; width: 100%;"></div>
 4 |     {% endif %}
 5 |     <div class="col-xl-5 col-lg-6 p-5 bg-white">
 6 |         <h3 id="{{ contentblock.id }}">{{contentblock.title|safe}}</h3>
 7 |         {% for line in contentblock.lines %}
 8 |             <p>{{line|safe}}</p>
 9 |         {% endfor%}
10 |         {% include "blocks/circles_row.html.jinja" %}
11 |         <div class="text-center">
12 |             {% for button in contentblock.buttons %}
13 |             <a href="{{button.link}}"><button type="button" class="btn {{button.label}}">{{button.name}}</button></a>
14 |             {% endfor %}
15 |         </div>
16 |     </div>
17 |     {% if  contentblock.image and contentblock.image_pos=="right"%}
18 |         <div class="col-xl-5 col-lg-6 p-0 bg-primary"><img src="{{ contentblock.image }}" alt="" class="img-fluid" style="height: 100%; width: 100%;"></div>
19 |     {% endif %}
20 | </div>


--------------------------------------------------------------------------------
/osd2f/templates/formats/base.html.jinja:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | 
 4 | {% include "blocks/head.html.jinja" %}
 5 | 
 6 | <body>
 7 |     {% include "blocks/navbar.html.jinja" %}
 8 | 
 9 |     <main>
10 |         <div class="container pt-4"><br></div>
11 |         {% block content %} {% endblock %}    
12 |         <div class="container p-1"><br></div>     
13 |     </main>
14 | 
15 | </body>
16 | 
17 | {% include "blocks/footer.html.jinja" %}
18 | {% include "blocks/bootstrap_scripts.html.jinja" %} 


--------------------------------------------------------------------------------
/osd2f/templates/formats/static_template.html.jinja:
--------------------------------------------------------------------------------
 1 | {% extends "formats/base.html.jinja" %}
 2 | 
 3 | {% block content %} 
 4 |     <div class="container-fluid p-0 m-0 text-center">
 5 |     {% for contentblock in content_settings.static_pages[current_page].blocks %}
 6 | 
 7 |         {% if contentblock.type=="jumbotron" %} 
 8 |             {% include "blocks/jumbotron.html.jinja" %} 
 9 |         {% endif %} 
10 | 
11 | 
12 |         {% if contentblock.type=="two_block_row" %}
13 |             {% include "blocks/two_block_row.html.jinja" %}
14 |         {% endif%}
15 | 
16 |     {% endfor %} 
17 |     </div>
18 | 
19 | {% endblock content %} 


--------------------------------------------------------------------------------
/osd2f/templates/formats/upload_template.html.jinja:
--------------------------------------------------------------------------------
  1 | {% extends "formats/base.html.jinja" %}
  2 | 
  3 | {% block content %} 
  4 |     <div class="container-fluid pt-2 m-0 text-center">
  5 | 
  6 |     <!-- Additional explanations about uploads -->
  7 |     {% for contentblock in content_settings.upload_page.blocks %}
  8 | 
  9 |         {% if contentblock.type=="jumbotron" %} 
 10 |             {% include "blocks/jumbotron.html.jinja" %} 
 11 |         {% endif %} 
 12 | 
 13 | 
 14 |         {% if contentblock.type=="two_block_row" %}
 15 |             {% include "blocks/two_block_row.html.jinja" %}
 16 |         {% endif%}
 17 | 
 18 |     {% endfor %} 
 19 |     </div>
 20 | 
 21 |     <!-- Upload interface -->
 22 |     <div class="container-fluid min-vh-100">
 23 |     
 24 |         <!-- The dropbox for files -->
 25 |         <style>
 26 |             #drop-area {
 27 |                 border: 2px dashed #ccc;
 28 |                 border-radius: 20px;
 29 |                 width: 480px;
 30 |                 margin: 50px auto;
 31 |                 padding: 20px;
 32 |             }
 33 |             
 34 |             #drop-area.highlight {
 35 |                 border-color: cornflowerblue;
 36 |             }
 37 |         </style>
 38 |         <div class="row mb-0 pt-1 justify-content-center">
 39 |             <div id="drop-area" class="col-xl-5 col-lg-6 col-md-8 col-sm-10 p-6 mb-1">
 40 |                 <h3 class="text-center"> {{content_settings.upload_page.upload_box.header|safe}} </h3>
 41 |                 <progress id="progress-bar" max=100 value=0 style="width:100%; height:15px;"></progress>
 42 |                 <form class="my-form">
 43 |                 {% for par in content_settings.upload_page.upload_box.explanation%}
 44 |                     <p>{{par|safe}}</p>
 45 |                 {% endfor %}
 46 |                 <input type="file" id="fileElem">
 47 |                 <label class="button" for="fileElem"></label>
 48 |                 </form>
 49 |                 
 50 |             </div>      
 51 |         </div>
 52 | 
 53 |         <!-- The status messages (generally hidden) -->
 54 |         <div id="processing" class="invisible">
 55 |             <div class="row p-2 justify-content-center">
 56 |                 <h2 class="text-center">{{content_settings.upload_page.processing_text | safe}}</h2>
 57 |             </div>
 58 |             <div class="row p-2 justify-content-center">
 59 |                 <div class="spinner-border text-primary" role="status">
 60 |                     <span class="sr-only">Loading...</span>
 61 |                 </div>
 62 |             </div>
 63 |         </div>
 64 | 
 65 |         <!-- Selection without accepted content warning (generally hidden) -->
 66 |         <div id="empty_selection" class="d-none">
 67 |             <div class="row p-2 justify-content-center">
 68 |                 <h2 class="text-center">{{content_settings.upload_page.empty_selection | safe}}</h2>
 69 |             </div>
 70 |         </div>
 71 | 
 72 |         <!-- The "thank you" message (initially hidden!) -->
 73 |         <div class="row justify-content-center invisible" id="thankyou">
 74 |             <h1>{{content_settings.upload_page.thanks_text}}</h1>
 75 |         </div>
 76 | 
 77 |         <!-- The Veux component to interact with the uploaded data -->
 78 |         <div class="row justify-content-center p-4">
 79 |             <div id="visualization">
 80 |         
 81 |                 <donation-container
 82 |                 v-bind:donations="donations"
 83 |                 v-bind:content="content"
 84 |                 >
 85 |                 </donation-container>
 86 |             </div>
 87 |         </div>
 88 |     
 89 |     </div>
 90 | 
 91 | <!-- necessary scripts, including our own main.js -->
 92 |     <!-- parsing script -->
 93 |     <script src="/static/js/main.js"></script>
 94 |     <!-- page functionality related to presentation -->
 95 |     <script>
 96 |         let settings = {{ upload_settings.model_dump_json()|safe }};
 97 |         let content = {{ content_settings.model_dump_json()|safe }};
 98 |         let sid = {{sid|tojson|safe}};
 99 |         window.sid = sid;
100 | 
101 | 
102 |         let dropArea = document.getElementById("drop-area");
103 | 
104 |         ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
105 |         dropArea.addEventListener(eventName, preventDefaults, false)
106 |         })
107 | 
108 |         function preventDefaults(e) {
109 |         e.preventDefault()
110 |         e.stopPropagation()
111 |         }
112 | 
113 |         ['dragenter', 'dragover'].forEach(eventName => {
114 |         dropArea.addEventListener(eventName, highlight, false)
115 |         });
116 | 
117 |         ['dragleave', 'drop'].forEach(eventName => {
118 |         dropArea.addEventListener(eventName, unhighlight, false)
119 |         })
120 | 
121 |         function highlight(e) {
122 |         dropArea.classList.add('highlight')
123 |         }
124 | 
125 |         function unhighlight(e) {
126 |         dropArea.classList.remove('highlight')
127 |         }
128 |         
129 |         
130 | 
131 |         // track whether the donation has been completed for the prevent close dialogue
132 |         let noDonationYet=true
133 | 
134 |         // prevent accidentally leaving before donating
135 |         window.onbeforeunload = confirmExit;
136 |         function confirmExit() {
137 |             if (noDonationYet) {
138 |             return "You haven't donated yet, are you sure you want to leave?";
139 |             }
140 |         }
141 | 
142 |     </script>
143 | {% endblock content %} 


--------------------------------------------------------------------------------
/osd2f/utils.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import functools
  3 | import os
  4 | import pathlib
  5 | import typing
  6 | from collections.abc import MutableMapping
  7 | 
  8 | import pytz
  9 | 
 10 | import yaml  # type: ignore
 11 | 
 12 | from .database import get_content_config, set_content_config
 13 | from .definitions import ContentSettings, UploadSettings
 14 | from .logger import logger
 15 | 
 16 | DISK_CONTENT_CONFIG_PATH: str = str(
 17 |     pathlib.Path(__file__)
 18 |     .parent.joinpath("settings")
 19 |     .joinpath("default_content_settings.yaml")
 20 | )
 21 | DISK_CONFIG_VERSION = ""
 22 | 
 23 | 
 24 | @functools.lru_cache
 25 | def _cached_load_settings() -> UploadSettings:
 26 |     return _load_settings_from_disk()
 27 | 
 28 | 
 29 | def _load_settings_from_disk() -> UploadSettings:
 30 |     settings_dir = pathlib.Path(__file__).parent.joinpath("settings")
 31 |     try:
 32 |         settings = UploadSettings.model_validate(
 33 |             yaml.safe_load(open(settings_dir.joinpath("upload_settings.yaml")))
 34 |         )
 35 |     except FileNotFoundError:
 36 |         logger.warning("No user provided `upload_settings.yaml` found, using defaults.")
 37 |         settings = UploadSettings.model_validate(
 38 |             yaml.safe_load(open(settings_dir.joinpath("default_upload_settings.yaml")))
 39 |         )
 40 |     return settings
 41 | 
 42 | 
 43 | def load_upload_settings(force_disk: bool = False) -> UploadSettings:
 44 |     if force_disk:
 45 |         logger.warning(
 46 |             "Settings are re-loaded from disk on every request, "
 47 |             "this eases debugging, but will hurt performance!"
 48 |         )
 49 |         return _load_settings_from_disk()
 50 |     else:
 51 |         return _cached_load_settings()
 52 | 
 53 | 
 54 | async def load_content_settings(use_cache: bool) -> ContentSettings:
 55 |     # load db config version
 56 |     db_config = await get_content_config()
 57 | 
 58 |     # load disk version ()
 59 |     global DISK_CONFIG_VERSION
 60 |     if not DISK_CONFIG_VERSION or not use_cache:
 61 |         disk_config = yaml.safe_load(open(DISK_CONTENT_CONFIG_PATH))
 62 |         DISK_CONFIG_VERSION = disk_config
 63 | 
 64 |     else:
 65 |         disk_config = DISK_CONFIG_VERSION
 66 | 
 67 |     disk_timestamp = pytz.UTC.localize(
 68 |         datetime.datetime.fromtimestamp(os.path.getmtime(DISK_CONTENT_CONFIG_PATH))
 69 |     )
 70 | 
 71 |     # if no database config exists, insert disk version in database and
 72 |     # use disk version
 73 |     if not db_config:
 74 |         config = ContentSettings.model_validate(disk_config)
 75 |         await set_content_config(user="default", content=config)
 76 |         return config
 77 | 
 78 |     # pick the most recent version
 79 |     if db_config.insert_timestamp > disk_timestamp:
 80 |         last_config = db_config.config_blob
 81 |     else:
 82 |         last_config = disk_config
 83 | 
 84 |     config = ContentSettings.model_validate(last_config)
 85 | 
 86 |     return config
 87 | 
 88 | 
 89 | def flatten(d: MutableMapping, parent_key: str = "", sep: str = "_"):
 90 |     items = []
 91 |     if type(d) == str:
 92 |         return d
 93 |     for k, v in d.items():
 94 |         new_key = parent_key + sep + k if parent_key else k
 95 |         if isinstance(v, MutableMapping):
 96 |             items.extend(flatten(v, new_key, sep=sep).items())
 97 |         elif type(v) == list:
 98 |             items.append((new_key, [flatten(vi, sep=sep) for vi in v]))
 99 |         else:
100 |             items.append((new_key, v))
101 |     return dict(items)
102 | 
103 | 
104 | def flatmap(
105 |     items: dict,
106 |     in_key: typing.Optional[str] = None,
107 | ):
108 | 
109 |     base = items if in_key is None else items.get(in_key, [])
110 | 
111 |     return [flatten(e, sep=".") for e in base]
112 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "osd2f",
 3 |   "version": "0.0.1",
 4 |   "description": "![Python application](https://github.com/uvacw/osd2f/workflows/Python%20application/badge.svg?branch=main) <a href=\"https://github.com/psf/black\"><img alt=\"Code style: black\" src=\"https://img.shields.io/badge/code%20style-black-000000.svg\"></a> # OSD2F: Open Source Data Donation Framework",
 5 |   "main": "static/js/file_upload.js",
 6 |   "private": true,
 7 |   "directories": {
 8 |     "doc": "docs",
 9 |     "test": "tests"
10 |   },
11 |   "scripts": {
12 |     "test": "jest",
13 |     "develop": "webpack watch --mode development",
14 |     "build": "webpack --mode production"
15 |   },
16 |   "repository": {
17 |     "type": "git",
18 |     "url": "git+https://github.com/uvacw/osd2f.git"
19 |   },
20 |   "author": "",
21 |   "license": "ISC",
22 |   "bugs": {
23 |     "url": "https://github.com/uvacw/osd2f/issues"
24 |   },
25 |   "homepage": "https://github.com/uvacw/osd2f#readme",
26 |   "dependencies": {
27 |     "blob-polyfill": "^5.0.20210201",
28 |     "bootstrap": "^4.6.0",
29 |     "bootstrap-vue": "^2.21.2",
30 |     "datatransfer-files-promise": "^1.3.1",
31 |     "glob-parent": "^5.1.1",
32 |     "jsonpath": "^1.1.1",
33 |     "libarchive.js": "^1.3.0",
34 |     "loader-utils": "^3.2.1",
35 |     "vue": "^2.6.12"
36 |   },
37 |   "devDependencies": {
38 |     "copy-webpack-plugin": "^7.0.0",
39 |     "css-loader": "^5.0.2",
40 |     "jest": "^27.5.1",
41 |     "postcss": "^8.2.15",
42 |     "style-loader": "^2.0.0",
43 |     "vue-loader": "^15.11.1",
44 |     "vue-template-compiler": "^2.6.12",
45 |     "webpack": "^5.76.0",
46 |     "webpack-cli": "^4.9.2"
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | #
  2 | # This file is autogenerated by pip-compile with Python 3.9
  3 | # by the following command:
  4 | #
  5 | #    pip-compile
  6 | #
  7 | aiofiles==23.2.1
  8 |     # via quart
  9 | aiosqlite==0.17.0
 10 |     # via tortoise-orm
 11 | annotated-types==0.6.0
 12 |     # via pydantic
 13 | async-timeout==4.0.3
 14 |     # via asyncpg
 15 | asyncpg==0.29.0
 16 |     # via OSD2F (setup.py)
 17 | azure-core==1.30.1
 18 |     # via
 19 |     #   azure-identity
 20 |     #   azure-keyvault-secrets
 21 | azure-identity==1.16.0
 22 |     # via OSD2F (setup.py)
 23 | azure-keyvault-secrets==4.8.0
 24 |     # via OSD2F (setup.py)
 25 | blinker==1.8.2
 26 |     # via
 27 |     #   flask
 28 |     #   quart
 29 | certifi==2024.2.2
 30 |     # via requests
 31 | cffi==1.16.0
 32 |     # via cryptography
 33 | charset-normalizer==3.3.2
 34 |     # via requests
 35 | click==8.1.7
 36 |     # via
 37 |     #   flask
 38 |     #   quart
 39 | cryptography==42.0.7
 40 |     # via
 41 |     #   OSD2F (setup.py)
 42 |     #   azure-identity
 43 |     #   msal
 44 |     #   pyjwt
 45 | dnspython==2.6.1
 46 |     # via email-validator
 47 | email-validator==2.1.1
 48 |     # via pydantic
 49 | exceptiongroup==1.2.1
 50 |     # via taskgroup
 51 | flask==3.0.3
 52 |     # via quart
 53 | h11==0.14.0
 54 |     # via
 55 |     #   hypercorn
 56 |     #   wsproto
 57 | h2==4.1.0
 58 |     # via hypercorn
 59 | hpack==4.0.0
 60 |     # via h2
 61 | hypercorn==0.16.0
 62 |     # via
 63 |     #   OSD2F (setup.py)
 64 |     #   quart
 65 | hyperframe==6.0.1
 66 |     # via h2
 67 | idna==3.7
 68 |     # via
 69 |     #   email-validator
 70 |     #   requests
 71 | importlib-metadata==7.1.0
 72 |     # via
 73 |     #   flask
 74 |     #   quart
 75 | iso8601==1.1.0
 76 |     # via tortoise-orm
 77 | isodate==0.6.1
 78 |     # via azure-keyvault-secrets
 79 | itsdangerous==2.2.0
 80 |     # via
 81 |     #   flask
 82 |     #   quart
 83 | jinja2==3.1.4
 84 |     # via
 85 |     #   flask
 86 |     #   quart
 87 | markupsafe==2.1.5
 88 |     # via
 89 |     #   jinja2
 90 |     #   quart
 91 |     #   werkzeug
 92 | msal==1.28.0
 93 |     # via
 94 |     #   OSD2F (setup.py)
 95 |     #   azure-identity
 96 |     #   msal-extensions
 97 | msal-extensions==1.1.0
 98 |     # via azure-identity
 99 | packaging==24.0
100 |     # via msal-extensions
101 | portalocker==2.8.2
102 |     # via msal-extensions
103 | priority==2.0.0
104 |     # via hypercorn
105 | pycparser==2.22
106 |     # via cffi
107 | pycryptodomex==3.20.0
108 |     # via pyzipper
109 | pydantic[email]==2.7.1
110 |     # via
111 |     #   OSD2F (setup.py)
112 |     #   tortoise-orm
113 | pydantic-core==2.18.2
114 |     # via pydantic
115 | pyjwt[crypto]==2.8.0
116 |     # via
117 |     #   OSD2F (setup.py)
118 |     #   msal
119 | pypika-tortoise==0.1.6
120 |     # via tortoise-orm
121 | pytz==2024.1
122 |     # via tortoise-orm
123 | pyyaml==6.0.1
124 |     # via OSD2F (setup.py)
125 | pyzipper==0.3.6
126 |     # via OSD2F (setup.py)
127 | quart==0.19.5
128 |     # via OSD2F (setup.py)
129 | requests==2.31.0
130 |     # via
131 |     #   azure-core
132 |     #   msal
133 | six==1.16.0
134 |     # via
135 |     #   azure-core
136 |     #   isodate
137 | taskgroup==0.0.0a4
138 |     # via hypercorn
139 | tomli==2.0.1
140 |     # via hypercorn
141 | tortoise-orm==0.20.1
142 |     # via OSD2F (setup.py)
143 | typing-extensions==4.11.0
144 |     # via
145 |     #   aiosqlite
146 |     #   azure-core
147 |     #   azure-keyvault-secrets
148 |     #   pydantic
149 |     #   pydantic-core
150 |     #   quart
151 | urllib3==2.2.1
152 |     # via requests
153 | werkzeug==3.0.3
154 |     # via
155 |     #   flask
156 |     #   quart
157 | wsproto==1.2.0
158 |     # via hypercorn
159 | zipp==3.18.2
160 |     # via importlib-metadata
161 | 


--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
 1 | black==22.3.0
 2 | Faker==8.1.3
 3 | flake8==4.0.1
 4 | flake8-black==0.3.3
 5 | flake8-import-order==0.18.1
 6 | locust==2.25.0
 7 | mypy==1.10.0
 8 | mypy-extensions==1.0.0
 9 | pip-tools==7.4.1
10 | pytest==6.1.2
11 | aiounittest==1.4.0
12 | types-pytz==2024.1.0.20240417
13 | types-PyYAML==6.0.12.20240311


--------------------------------------------------------------------------------
/scripts/locally_decrypt_entries.py:
--------------------------------------------------------------------------------
 1 | """Decrypts downloaded submissions
 2 | 
 3 | Some OSD2F distributions are configured to keep
 4 | entries encrypted on download. This script locally
 5 | decrypts these entries, provided you have access to
 6 | the entry encryption secret.
 7 | 
 8 | Usage:
 9 | 
10 | python scripts/locally_decrypt_entries.py -h
11 | 
12 | """
13 | import argparse
14 | import logging
15 | import pathlib
16 | 
17 | from osd2f.logger import logger
18 | from osd2f.security import translate_value
19 | from osd2f.security.entry_encryption.file_decryption import decrypt_file
20 | from osd2f.security.entry_encryption.secure_entry_singleton import SecureEntry
21 | 
22 | parser = argparse.ArgumentParser(description=__doc__)
23 | 
24 | parser.add_argument(
25 |     "-v",
26 |     "--verbose",
27 |     action="count",
28 |     default=0,
29 |     help="Verbosity of logging output, defaults to default=CRITICAL, "
30 |     "v=WARNING, vv=INFO, vvv=DEBUG",
31 | )
32 | parser.add_argument("input_file", help="The file of submissions to decrypt.")
33 | parser.add_argument("output_file", help="The file to write decrypted submissions to")
34 | 
35 | parser.add_argument(
36 |     "secret",
37 |     help="The encryption secret, should be the same "
38 |     "as the secret used to configure the server. "
39 |     "May be a reference to a supported secret store such as Azure KeyVault",
40 | )
41 | 
42 | 
43 | def run_script():
44 |     args = parser.parse_args()
45 | 
46 |     if args.verbose == 0:
47 |         level = logging.CRITICAL
48 |     elif args.verbose == 1:
49 |         level = logging.WARNING
50 |     elif args.verbose == 2:
51 |         level = logging.INFO
52 |     elif args.verbose == 3:
53 |         level = logging.DEBUG
54 |     else:
55 |         print("UNKNOWN LOGLEVEL SPECIFIED")
56 |         level = logging.NOTSET
57 | 
58 |     logger.setLevel(level=level)
59 | 
60 |     secret = translate_value(args.secret)
61 |     SecureEntry.set_secret(secret)
62 | 
63 |     input_path = pathlib.Path(args.input_file)
64 |     output_path = pathlib.Path(args.output_file)
65 | 
66 |     touched_entries = decrypt_file(input_path=input_path, output_path=output_path)
67 | 
68 |     logger.info(
69 |         f"Done decrypting {touched_entries} entries from {input_path} to {output_path}"
70 |     )
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     run_script()
75 | 


--------------------------------------------------------------------------------
/scripts/locust_stress_testing.py:
--------------------------------------------------------------------------------
 1 | """Locust based stress testing
 2 | 
 3 | Run headless using:
 4 | 
 5 | locust --host 'http://localhost:5000' -f 'scripts/locust_stress_testing.py' \
 6 |     --headless --users 100 -t 60sec
 7 | 
 8 | Run with web interface:
 9 | 
10 | locust -f scripts/locust_stress_test.py
11 | 
12 | NOTE: it's recommended to use a ASGI tool such as hypercorn in production,
13 |       you should also test with such a framework to get realistic performance.
14 | 
15 | """
16 | 
17 | import faker
18 | 
19 | from locust import between, task
20 | from locust.contrib.fasthttp import FastHttpUser
21 | 
22 | from osd2f.utils import flatmap
23 | 
24 | from scripts import sample_data_generator
25 | 
26 | 
27 | class SampleParticipant(FastHttpUser):
28 |     wait_time = between(0.1, 5)
29 | 
30 |     def on_start(self):
31 |         """Generate a fake user and associated donation"""
32 |         f = faker.Faker()
33 |         self.user = f.user_name()
34 |         self.sid = f.uuid4()
35 |         self.entries = {
36 |             "comments.json": flatmap(
37 |                 sample_data_generator.generate_comments(user=self.user, n=1000),
38 |                 "comment_information",
39 |             ),
40 |             f"your_posts_{self.user}_1.json": flatmap(
41 |                 sample_data_generator.generate_posts(self.user, n=100)
42 |             ),
43 |             "engagement.json": flatmap(
44 |                 sample_data_generator.generate_engagement(self.user, 10),
45 |                 "engagement",
46 |             ),
47 |             "companies_followed.json": flatmap(
48 |                 sample_data_generator.generate_companies_followed(self.user, 100),
49 |                 "companies_followed",
50 |             ),
51 |             "ads_clicked.json": sample_data_generator.generate_ads_clicked(
52 |                 self.user, 50
53 |             ),
54 |             "profile_interests.json": [
55 |                 {"entry": e}
56 |                 for e in flatmap(
57 |                     sample_data_generator.generate_profile_interests(  # noqa
58 |                         self.user, 100
59 |                     ),
60 |                     "profile_interests",
61 |                 )
62 |             ],
63 |         }
64 | 
65 |     @task(20)
66 |     def send_log(self):
67 |         self.client.get("/log?position=locust&level=DEBUG")
68 | 
69 |     @task(1)
70 |     def send_anonymization(self):
71 |         for fn, entries in self.entries.items():
72 |             self.client.post(
73 |                 "/adv_anonymize_file",
74 |                 json={
75 |                     "submission_id": self.sid,
76 |                     "filename": fn,
77 |                     "n_deleted": 0,
78 |                     "entries": entries,
79 |                 },
80 |             )
81 | 
82 |     @task(1)
83 |     def send_submission(self):
84 |         submission = []
85 |         for fn, entries in self.entries.items():
86 |             submission.append(
87 |                 {
88 |                     "submission_id": self.sid,
89 |                     "filename": fn,
90 |                     "n_deleted": 0,
91 |                     "entries": entries,
92 |                 }
93 |             )
94 |         self.client.post("/upload", json=submission)
95 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from setuptools import find_packages, setup
 4 | 
 5 | setup(
 6 |     name="OSD2F",
 7 |     python_requires=">3.8",
 8 |     version="0.1.2",
 9 |     description="Open Source Data Donation Framework",
10 |     author="Bob van de Velde",
11 |     author_email="osd2f@bob-as-a-service.com",
12 |     license=open("LICENSE").read(),
13 |     url="https://github.com/uvacw/osd2f",
14 |     packages=find_packages(),
15 |     package_data={
16 |         "osd2f": [
17 |             "static/*",
18 |             "templates/*",
19 |             "templates/*/*",
20 |             "settings/*",
21 |             "static/js/*",
22 |             "static/js/libarchive/*",
23 |             "static/js/libarchive/wasm-gen/*",
24 |         ]
25 |     },
26 |     scripts=["bin/osd2f", "bin/osd2f-decrypt-submissions"],
27 |     install_requires=[
28 |         "asyncpg",
29 |         "azure-keyvault-secrets",
30 |         "azure-identity",
31 |         "cryptography",
32 |         "hypercorn",
33 |         "msal",
34 |         "pyyaml",
35 |         "pydantic~=2.6",
36 |         "pydantic[email]",
37 |         "pyjwt>=2.4.0",  # dependency of MSAL, insecure < 2.4.0
38 |         "pyzipper",
39 |         "quart",
40 |         "tortoise-orm",
41 |     ],
42 | )
43 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/tests/__init__.py


--------------------------------------------------------------------------------
/tests/anonymizer_module_test.py:
--------------------------------------------------------------------------------
 1 | from aiounittest import AsyncTestCase
 2 | 
 3 | 
 4 | class test_anonymizer_package_interface(AsyncTestCase):
 5 |     async def test_apply(self):
 6 |         from osd2f import anonymizers
 7 | 
 8 |         # register a mock pass-through function as an anonymizer
 9 |         async def testfunc(e, _):
10 |             return e
11 | 
12 |         anonymizers.options["testfunc"] = testfunc
13 | 
14 |         entries = [{"title": f"entry {i}"} for i in range(100)]
15 | 
16 |         redacted_entries = await anonymizers.apply(entries, "testfunc")
17 | 
18 |         self.assertListEqual(entries, redacted_entries)
19 | 
20 |         anonymizers.options.pop("testfunc")
21 | 
22 |     async def test_options_conform_to_spec(self):
23 |         from osd2f import anonymizers
24 | 
25 |         for k, v in anonymizers.options.items():
26 |             self.assertEqual(k, v.__name__)
27 | 
28 |     async def test_submission_list_anonymization(self):
29 |         from osd2f import anonymizers
30 |         from osd2f.definitions import UploadSettings, SubmissionList, Submission
31 | 
32 |         async def testfunc(e, a):
33 |             e[a] = a
34 |             return e
35 | 
36 |         anonymizers.options["testfunc"] = testfunc
37 | 
38 |         settings = UploadSettings(
39 |             files={
40 |                 "file(_\\d)?.json": {
41 |                     "accepted_fields": [],
42 |                     "anonymizers": [{"testfunc": "a"}, {"testfunc": "b"}],
43 |                 }
44 |             }
45 |         )
46 |         submission_list = SubmissionList(
47 |             [
48 |                 Submission(
49 |                     entries=[{}], filename="file_2.json", submission_id="1", n_deleted=2
50 |                 )
51 |             ]
52 |         )
53 |         await anonymizers.anonymize_submission_list(
54 |             submission_list=submission_list, settings=settings
55 |         )
56 |         self.assertEqual(submission_list.root[0].entries[0], {"a": "a", "b": "b"})
57 | 
58 |     async def test_broken_anonymizer(self):
59 |         from osd2f import anonymizers
60 |         from osd2f.definitions import UploadSettings, SubmissionList, Submission
61 | 
62 |         async def brokenanonymizer(s: dict, arg: str = None):
63 |             if s.get("title") == "weird entry":
64 |                 raise ValueError("Help, I'm broken")
65 |             return s
66 | 
67 |         entries = [{"title": t} for t in ["normal entry"] * 10 + ["weird entry"]]
68 | 
69 |         anonymizers.options["brokenanonymizer"] = brokenanonymizer
70 | 
71 |         settings = UploadSettings(
72 |             files={
73 |                 "file(_\\d)?.json": {
74 |                     "accepted_fields": [],
75 |                     "anonymizers": [{"brokenanonymizer": "a"}],
76 |                 }
77 |             }
78 |         )
79 |         submission_list = SubmissionList(
80 |             [
81 |                 Submission(
82 |                     entries=entries,
83 |                     filename="file_2.json",
84 |                     submission_id="1",
85 |                     n_deleted=6,
86 |                 )
87 |             ]
88 |         )
89 |         await anonymizers.anonymize_submission_list(
90 |             submission_list=submission_list, settings=settings
91 |         )
92 | 
93 |         # check that all but one entry remains
94 |         self.assertEqual(len(submission_list.root[0].entries), 10)
95 | 


--------------------------------------------------------------------------------
/tests/create_app_test.py:
--------------------------------------------------------------------------------
  1 | import importlib
  2 | import os
  3 | from unittest import TestCase
  4 | from unittest.mock import Mock, patch
  5 | 
  6 | 
  7 | class CreateAppTest(TestCase):
  8 |     def test_env_var_config(self):
  9 |         test_env_vars = {
 10 |             "OSD2F_DB_URL": "testdb",
 11 |             "OSD2F_SECRET": "testsecret",
 12 |             "OSD2F_DATA_PASSWORD": "datapassword",
 13 |         }
 14 |         old_env = os.environ.copy()
 15 |         os.environ.update(test_env_vars)
 16 |         import osd2f.config
 17 | 
 18 |         # force reload to trigger new processing of
 19 |         # env variables
 20 |         importlib.reload(osd2f.config)
 21 | 
 22 |         from osd2f.server import create_app
 23 | 
 24 |         app = create_app(mode="Production")
 25 | 
 26 |         assert app.config["DB_URL"] == "testdb"
 27 |         assert app.config["SECRET_KEY"] == "testsecret"
 28 |         assert app.config["DATA_PASSWORD"] == "datapassword"
 29 | 
 30 |         # reset to old environment
 31 |         os.environ = old_env
 32 |         importlib.reload(osd2f.config)
 33 | 
 34 |     def test_ovveride_var_config(self):
 35 |         test_env_vars = {
 36 |             "OSD2F_DB_URL": "testdb",
 37 |             "OSD2F_SECRET": "testsecret",
 38 |             "OSD2F_DATA_PASSWORD": "datapassword",
 39 |         }
 40 |         old_env = os.environ.copy()
 41 |         os.environ.update(test_env_vars)
 42 |         import osd2f.config
 43 | 
 44 |         # force reload to trigger new processing of
 45 |         # env variables
 46 |         importlib.reload(osd2f.config)
 47 | 
 48 |         from osd2f.server import create_app
 49 | 
 50 |         app = create_app(
 51 |             mode="Production",
 52 |             database_url_override="override_url",
 53 |             app_secret_override="override_secret",
 54 |             data_password_override="override_datapassword",
 55 |         )
 56 | 
 57 |         assert app.config["DB_URL"] == "override_url"
 58 |         assert app.config["SECRET_KEY"] == "override_secret"
 59 |         assert app.config["DATA_PASSWORD"] == "override_datapassword"
 60 | 
 61 |         # reset to old environment
 62 |         os.environ = old_env
 63 |         importlib.reload(osd2f.config)
 64 | 
 65 |     def test_overide_var_translation(self):
 66 | 
 67 |         mock_translate_value = Mock()
 68 |         with patch("osd2f.server.security.translate_value", mock_translate_value):
 69 |             from osd2f.server import create_app
 70 | 
 71 |             create_app(
 72 |                 mode="Production",
 73 |                 database_url_override="override_url",
 74 |                 app_secret_override="override_secret",
 75 |                 data_password_override="override_datapassword",
 76 |             )
 77 |             mock_translate_value.assert_called_with("override_url")
 78 | 
 79 |         mock_translate_value = Mock()
 80 |         with patch("osd2f.server.security.translate_value", mock_translate_value):
 81 |             from osd2f.server import create_app
 82 | 
 83 |             create_app(
 84 |                 mode="Production",
 85 |                 app_secret_override="override_secret",
 86 |                 data_password_override="override_datapassword",
 87 |             )
 88 |             mock_translate_value.assert_called_with("override_secret")
 89 | 
 90 |         mock_translate_value = Mock()
 91 |         with patch("osd2f.server.security.translate_value", mock_translate_value):
 92 |             from osd2f.server import create_app
 93 | 
 94 |             create_app(
 95 |                 mode="Production",
 96 |                 data_password_override="override_datapassword",
 97 |                 app_secret_override="tempsecret",
 98 |             )
 99 |             mock_translate_value.assert_called_with("tempsecret")
100 | 


--------------------------------------------------------------------------------
/tests/db_interaction_test.py:
--------------------------------------------------------------------------------
  1 | """Database test files.
  2 | 
  3 | We don't want to test our ORM package, so these tests target the convenvience
  4 | functions used.
  5 | """
  6 | 
  7 | import asyncio
  8 | import os
  9 | import sqlite3
 10 | import time
 11 | from unittest.mock import AsyncMock, patch
 12 | 
 13 | from aiounittest.case import AsyncTestCase
 14 | 
 15 | from osd2f.database import stop_database
 16 | 
 17 | 
 18 | class DatabaseStartStopTest(AsyncTestCase):
 19 |     async def test_initialize_database(self):
 20 |         from osd2f.database import initialize_database
 21 | 
 22 |         # we use a file simply because we want to access the same database
 23 |         # in the test as in the app context
 24 |         db_file = "test_temp"
 25 |         db_url = f"sqlite://{db_file}"
 26 | 
 27 |         await initialize_database(db_url=db_url)
 28 | 
 29 |         c = sqlite3.connect(db_file)
 30 | 
 31 |         # check if the submissions table can be queried
 32 |         c.execute("SELECT * FROM submissions").fetchall()
 33 | 
 34 |         os.remove(db_file)
 35 |         os.remove(db_file + "-shm")
 36 |         os.remove(db_file + "-wal")
 37 | 
 38 |         await stop_database()
 39 | 
 40 |     async def test_stop_database(self):
 41 |         close_mock = AsyncMock()
 42 |         with patch("tortoise.Tortoise.close_connections", close_mock):
 43 |             from osd2f.database import stop_database
 44 | 
 45 |             await stop_database()
 46 |             self.assertTrue(await close_mock.is_called())
 47 | 
 48 | 
 49 | class DatabaseInsertTest(AsyncTestCase):
 50 |     async def test_insert_submission(self):
 51 |         from osd2f.config import Testing
 52 |         from osd2f.definitions import Submission
 53 |         from osd2f.database import (
 54 |             DBSubmission,
 55 |             insert_submission,
 56 |             initialize_database,
 57 |             stop_database,
 58 |         )
 59 | 
 60 |         await initialize_database(Testing.DB_URL)
 61 | 
 62 |         nfiles = 10
 63 |         nentries = 10
 64 | 
 65 |         submissions = [
 66 |             Submission(
 67 |                 submission_id=f"testing-{i}",
 68 |                 filename=f"testing_{i}.json",
 69 |                 n_deleted=2,
 70 |                 entries=[{"entry": ii, "text": "here"} for ii in range(nentries)],
 71 |             )
 72 |             for i in range(nfiles)
 73 |         ]
 74 | 
 75 |         for sub in submissions:
 76 |             await insert_submission(sub)
 77 | 
 78 |         self.assertEqual(await DBSubmission.all().count(), nfiles * nentries)
 79 |         self.assertEqual(
 80 |             await DBSubmission.filter(n_deleted=2).count(), nfiles * nentries
 81 |         )
 82 | 
 83 |         await stop_database()
 84 | 
 85 | 
 86 | class UploadSubmissionTest(AsyncTestCase):
 87 |     async def test_upload_submission(self):
 88 |         from osd2f.definitions import Submission, SubmissionList
 89 | 
 90 |         sublist_db_mock = AsyncMock()
 91 | 
 92 |         nfiles = 10
 93 |         nentries = 10
 94 | 
 95 |         submissions = SubmissionList(
 96 |             [
 97 |                 Submission(
 98 |                     submission_id=f"testing-{i}",
 99 |                     filename=f"testing_{i}.json",
100 |                     n_deleted=10,
101 |                     entries=[{"entry": ii, "text": "here"} for ii in range(nentries)],
102 |                 )
103 |                 for i in range(nfiles)
104 |             ]
105 |         )
106 | 
107 |         with patch("osd2f.server.database.insert_submission_list", sublist_db_mock):
108 |             from osd2f import server
109 | 
110 |             testclient = server.app.test_client()
111 |             r = await testclient.post("/upload", data=submissions.model_dump_json())
112 |             assert r.status_code == 200
113 | 
114 |             sublist_db_mock.assert_called_once_with(submissionlist=submissions)
115 | 
116 | 
117 | class LogInsertTest(AsyncTestCase):
118 |     async def test_log_insert(self):
119 |         from osd2f.database import initialize_database, insert_log
120 | 
121 |         # we use a file simply because we want to access the same database
122 |         # in the test as in the app context
123 |         db_file = "test_temp"
124 |         db_url = f"sqlite://{db_file}"
125 | 
126 |         await initialize_database(db_url=db_url)
127 | 
128 |         await insert_log("backend", "INFO", "position")
129 |         await insert_log("backend", "INFO", "position")
130 |         await insert_log("backend", "INFO", "position", "sid_string")
131 |         await insert_log(
132 |             "backend", "INFO", "position", "sid_string2", {"thing": "value"}
133 |         )
134 | 
135 |         c = sqlite3.connect(db_file)
136 | 
137 |         # check if the submissions table received the inserts,
138 |         # because they are non-blocking, we'll have to just
139 |         # wait a bit
140 |         r = []
141 |         for i in range(100):
142 |             r = c.execute("SELECT * FROM osd2f_logs").fetchall()
143 |             if len(r) == 4:
144 |                 break
145 |             await asyncio.sleep(0.01)
146 | 
147 |         assert r, ValueError("No(t all) records returned")
148 | 
149 |         assert (
150 |             len(c.execute("SELECT * FROM osd2f_logs WHERE log_sid IS NULL").fetchall())
151 |             == 2
152 |         )
153 |         assert (
154 |             len(
155 |                 c.execute(
156 |                     "SELECT * FROM osd2f_logs WHERE log_sid IS NOT NULL"
157 |                 ).fetchall()
158 |             )
159 |             == 2
160 |         )
161 |         assert (
162 |             len(
163 |                 c.execute(
164 |                     "SELECT * FROM osd2f_logs WHERE log_entry IS NOT NULL"
165 |                 ).fetchall()
166 |             )
167 |             == 1
168 |         )
169 |         c.close()
170 | 
171 |         os.remove(db_file)
172 |         os.remove(db_file + "-shm")
173 |         os.remove(db_file + "-wal")
174 | 
175 |         await stop_database()
176 | 
177 | 
178 | class LoggerToDBTest(AsyncTestCase):
179 |     async def test_log_to_db(self):
180 |         from osd2f.database import initialize_database, add_database_logging
181 |         from osd2f.logger import logger
182 | 
183 |         # we use a file simply because we want to access the same database
184 |         # in the test as in the app context
185 |         db_file = "test_temp2"
186 |         db_url = f"sqlite://{db_file}"
187 | 
188 |         await initialize_database(db_url=db_url)
189 | 
190 |         logger.setLevel("DEBUG")
191 | 
192 |         q = add_database_logging()
193 | 
194 |         logger.debug("seen debug")
195 |         logger.info("seen info")
196 |         logger.warning("seen warning")
197 |         logger.critical("seen critical")
198 | 
199 |         q.put("stop")
200 | 
201 |         c = sqlite3.connect(db_file)
202 | 
203 |         r = []
204 |         for i in range(100):
205 |             r = c.execute("SELECT * FROM osd2f_logs").fetchall()
206 |             if len(r) == 4:
207 |                 break
208 |             time.sleep(0.01)
209 | 
210 |         os.remove(db_file)
211 |         os.remove(db_file + "-shm")
212 |         os.remove(db_file + "-wal")
213 | 
214 |         await stop_database()
215 | 


--------------------------------------------------------------------------------
/tests/download_data_protection_test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from unittest.async_case import IsolatedAsyncioTestCase
 3 | 
 4 | from osd2f.server import create_app
 5 | 
 6 | 
 7 | class TestPasswordProtectedDownloads(IsolatedAsyncioTestCase):
 8 |     async def test_password_protected_downloads(self):
 9 |         from osd2f.security.authorization import USER_FIELD
10 | 
11 |         testapp = create_app(
12 |             data_password_override="testpassword", app_secret_override="testsecret"
13 |         )
14 |         await testapp.startup()
15 | 
16 |         # set placeholder to trigger authorization
17 |         os.environ["MSAL_CONFIG"] = "placeholder"
18 | 
19 |         tc = testapp.test_client()
20 | 
21 |         # set cookie to avoid real MSAL flow
22 |         async with tc.session_transaction() as session:
23 |             session[USER_FIELD] = "testuser"
24 | 
25 |         r = await tc.get("/researcher/osd2f_completed_submissions.json.zip")
26 |         assert r.status_code == 200
27 | 
28 |         os.environ.pop("MSAL_CONFIG")
29 | 
30 |         await testapp.shutdown()
31 | 


--------------------------------------------------------------------------------
/tests/initialization_test.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase, mock
 2 | 
 3 | 
 4 | class InitializationTests(TestCase):
 5 |     def test_production_init_without_secret(self):
 6 |         from osd2f.server import create_app, config
 7 | 
 8 |         config.Production.SECRET_KEY = None
 9 | 
10 |         self.assertRaises(Exception, create_app, mode="Production")
11 | 
12 |     def test_production_init_without_database(self):
13 |         from osd2f.server import create_app, config
14 | 
15 |         config.Production.SECRET_KEY = "not none"
16 | 
17 |         self.assertRaises(Exception, create_app, mode="Production")
18 | 
19 |     def test_production_init_with_secret_and_db(self):
20 |         # must be set before import
21 |         from osd2f.server import app, config, create_app
22 | 
23 |         config.Production.SECRET_KEY = "not none"
24 |         config.Production.DB_URL = "sqlite:memory"
25 | 
26 |         app.run = mock.Mock()
27 |         create_app(mode="Production")
28 |         config.Production.DB_URL = None
29 | 


--------------------------------------------------------------------------------
/tests/sample_anonymizer_test.py:
--------------------------------------------------------------------------------
 1 | from aiounittest import AsyncTestCase
 2 | 
 3 | 
 4 | class test_redact_text(AsyncTestCase):
 5 |     def test_in_options(self):
 6 |         from osd2f.anonymizers.sample_platform import redact_text
 7 |         from osd2f.anonymizers import options
 8 | 
 9 |         self.assertTrue(redact_text.__name__ in options)
10 | 
11 |     async def test_parses_title(self):
12 |         from osd2f.anonymizers.sample_platform import redact_text
13 | 
14 |         user = "henk"
15 |         correspondent = "arie"
16 |         title = f"{user} wrote on {correspondent}'s timeline."
17 | 
18 |         entry = {"title": title}
19 |         redacted = await redact_text(entry)
20 | 
21 |         self.assertIsNotNone(redacted)
22 |         self.assertFalse(user in redacted["title"])
23 |         self.assertFalse(correspondent in redacted["title"])
24 | 
25 |     async def test_parses_post(self):
26 |         from osd2f.anonymizers.sample_platform import redact_text
27 | 
28 |         user = "henk"
29 |         correspondent = "arie"
30 |         title = f"{user} wrote on {correspondent}'s timeline."
31 |         post = f"Hey {correspondent}, how's life? missing you! -{user}"
32 |         entry = {"title": title, "data": [{"post": post}]}
33 | 
34 |         redacted = await redact_text(entry)
35 | 
36 |         self.assertIsNotNone(redacted)
37 |         self.assertFalse(user in redacted["data"][0]["post"])
38 |         self.assertFalse(correspondent in redacted["data"][0]["post"])
39 | 


--------------------------------------------------------------------------------
/tests/sample_data_generator_test.py:
--------------------------------------------------------------------------------
  1 | import glob
  2 | import os
  3 | import pathlib
  4 | import re
  5 | import shutil
  6 | from unittest import TestCase
  7 | 
  8 | 
  9 | class MinimalSampleGeneratorTest(TestCase):
 10 |     """Very high-level test of the sample data generator script.
 11 | 
 12 |     The Sample script is basically a set of assumptions written as code. As
 13 |     such, a full set of tests is not really worthwhile (basically, they would just
 14 |     be a repetition of the assumptions).
 15 | 
 16 |     Instead, we check whether
 17 |         1. the script is importable and runnable
 18 |         2. the expected files are generated
 19 |         3. generated structure matches that of mock data
 20 | 
 21 | 
 22 |     """
 23 | 
 24 |     def test_sample_generator_import(self):
 25 |         from scripts import sample_data_generator  # noqa
 26 | 
 27 |     def test_sample_generator_output(self):
 28 |         from scripts.sample_data_generator import generate_bundle
 29 | 
 30 |         testdir = "temp_test_data"
 31 |         self.assertFalse(pathlib.posixpath.exists(testdir))
 32 |         generate_bundle(
 33 |             testdir,
 34 |             overwrite=False,
 35 |             include_tar_variant=True,
 36 |             include_targz_variant=True,
 37 |             include_zip_variant=True,
 38 |             indents=2,
 39 |             n_companies_followed=10,
 40 |             n_engagement=10,
 41 |             n_comments=10,
 42 |             n_ads_clicked=10,
 43 |             n_post_files=2,
 44 |             n_profile_interests=10,
 45 |             n_posts=10,
 46 |             n_short_messages=10,
 47 |         )
 48 | 
 49 |         self.assertTrue(glob.glob("temp_test_data/README.md"))
 50 |         self.assertTrue(glob.glob("temp_test_data/sample-*.zip"))
 51 |         self.assertTrue(glob.glob("temp_test_data/sample-*.tar.gz"))
 52 |         self.assertTrue(glob.glob("temp_test_data/sample-*.tar"))
 53 |         self.assertTrue(glob.glob("temp_test_data/sample-*/posts/posts_0.json"))
 54 |         self.assertTrue(glob.glob("temp_test_data/sample-*/posts/posts_1.json"))
 55 |         self.assertTrue(glob.glob("temp_test_data/sample-*/engagement/engagement.json"))
 56 |         self.assertTrue(
 57 |             glob.glob("temp_test_data/sample-*/short_messages/messages.json")
 58 |         )
 59 |         self.assertTrue(
 60 |             glob.glob(
 61 |                 "temp_test_data/sample-*/profile_interests/profile_interests.json"
 62 |             )
 63 |         )
 64 |         self.assertTrue(glob.glob("temp_test_data/sample-*/comments/comments.json"))
 65 |         self.assertTrue(
 66 |             glob.glob("temp_test_data/sample-*/ads_clicked/ads_clicked.json")
 67 |         )
 68 |         self.assertTrue(
 69 |             glob.glob(
 70 |                 "temp_test_data/sample-*/companies_followed/companies_followed.json"
 71 |             )
 72 |         )
 73 | 
 74 |         shutil.rmtree(testdir)
 75 | 
 76 |     def test_sample_mockdata_format_equal_to_script_output(self):
 77 |         from scripts.sample_data_generator import generate_bundle
 78 | 
 79 |         base_testdir = "temp_test_data"
 80 |         testdir = os.path.join(base_testdir, "sample")
 81 |         self.assertFalse(pathlib.posixpath.exists(testdir))
 82 |         generate_bundle(
 83 |             testdir,
 84 |             overwrite=False,
 85 |             include_tar_variant=True,
 86 |             include_targz_variant=True,
 87 |             include_zip_variant=True,
 88 |             indents=2,
 89 |             n_companies_followed=20,
 90 |             n_engagement=20,
 91 |             n_comments=10,
 92 |             n_ads_clicked=10,
 93 |             n_post_files=2,
 94 |             n_profile_interests=10,
 95 |             n_posts=10,
 96 |             n_short_messages=10,
 97 |         )
 98 | 
 99 |         sample_mockdata_paths = glob.glob("mockdata/sample/**", recursive=True)
100 |         testdir_paths = glob.glob(os.path.join(testdir, "**"), recursive=True)
101 | 
102 |         gp = re.compile("(sample-[A-z-0-9]*)")
103 | 
104 |         def generalized(ps):
105 |             return [gp.sub("sample-*/", p).split("/", 1)[1] for p in ps]
106 | 
107 |         self.assertListEqual(
108 |             sorted(generalized(sample_mockdata_paths)),
109 |             sorted(generalized(testdir_paths)),
110 |         )
111 | 
112 |         shutil.rmtree(base_testdir)
113 | 


--------------------------------------------------------------------------------
/tests/security_entry_test.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import importlib
  3 | from unittest.mock import AsyncMock, Mock, patch
  4 | 
  5 | from aiounittest.case import AsyncTestCase
  6 | 
  7 | from osd2f.database.submissions import (
  8 |     get_submissions,
  9 |     insert_submission,
 10 |     insert_submission_list,
 11 | )
 12 | 
 13 | 
 14 | class ConfigTest(AsyncTestCase):
 15 |     def test_cli_override(self):
 16 |         from osd2f.server import create_app
 17 | 
 18 |         m = Mock()
 19 |         with patch("osd2f.server.SecureEntry.set_secret", m):
 20 |             create_app(entry_secret_override="entry_override")
 21 |             m.assert_called_once_with(secret="entry_override")
 22 | 
 23 |     def test_env_var_use(self):
 24 |         import osd2f.config
 25 |         from osd2f.server import create_app
 26 | 
 27 |         m = Mock()
 28 |         with patch(
 29 |             "osd2f.config._os.environ", {"OSD2F_ENTRY_SECRET": "another_secret"}
 30 |         ), patch("osd2f.server.SecureEntry.set_secret", m):
 31 |             # force reload to trigger new processing of
 32 |             # env variables
 33 |             importlib.reload(osd2f.config)
 34 | 
 35 |             create_app()
 36 |             m.assert_called_once_with(secret="another_secret")
 37 | 
 38 |     def test_env_var_override(self):
 39 |         import osd2f.config
 40 |         from osd2f.server import create_app
 41 | 
 42 |         m = Mock()
 43 |         with patch(
 44 |             "osd2f.config._os.environ", {"OSD2F_ENTRY_SECRET": "another_secret"}
 45 |         ), patch("osd2f.server.SecureEntry.set_secret", m):
 46 |             # force reload to trigger new processing of
 47 |             # env variables
 48 |             importlib.reload(osd2f.config)
 49 | 
 50 |             create_app(entry_secret_override="entry_override")
 51 |             m.assert_called_once_with(secret="entry_override")
 52 | 
 53 | 
 54 | class SecureEntryTest(AsyncTestCase):
 55 |     def test_without_secret(self):
 56 |         from osd2f.security.entry_encryption.secure_entry_singleton import SecureEntry
 57 | 
 58 |         SecureEntry.set_secret("")
 59 | 
 60 |         entry = {"stuff": "is unsafe"}
 61 |         unencrypted = SecureEntry.write_entry_field(entry.copy())
 62 | 
 63 |         self.assertEqual(entry, unencrypted)
 64 | 
 65 |         loaded_entry = SecureEntry.read_entry_field(entry.copy())
 66 |         self.assertEqual(entry, loaded_entry)
 67 | 
 68 |     def test_with_secret(self):
 69 |         from osd2f.security.entry_encryption.secure_entry_singleton import SecureEntry
 70 | 
 71 |         SecureEntry.set_secret("secret")
 72 |         entry = {"stuff": "is safe"}
 73 | 
 74 |         encrypted = SecureEntry.write_entry_field(entry.copy())
 75 |         self.assertIsNotNone(encrypted.get("encrypted"))
 76 |         self.assertEqual(entry, SecureEntry.read_entry_field(encrypted))
 77 | 
 78 |     def test_consistent_key(self):
 79 |         from osd2f.security.entry_encryption.secure_entry_singleton import SecureEntry
 80 | 
 81 |         m = {"thing": "to encrypt"}
 82 |         SecureEntry.set_secret("secret")
 83 |         e = SecureEntry.write_entry_field(m.copy())
 84 |         SecureEntry.set_secret("secret")
 85 |         m2 = SecureEntry.read_entry_field(e)
 86 | 
 87 |         self.assertEqual(m, m2)
 88 | 
 89 | 
 90 | class DatabaseOperationsTest(AsyncTestCase):
 91 |     async def test_insert_submission(self):
 92 |         from osd2f.definitions.submissions import Submission
 93 | 
 94 |         class MockSecureEntry:
 95 |             pass
 96 | 
 97 |         MockSecureEntry.read_entry_field = Mock()
 98 |         MockSecureEntry.write_entry_field = Mock()
 99 |         with patch("osd2f.database.submissions.SecureEntry", MockSecureEntry), patch(
100 |             "osd2f.database.DBSubmission.create", AsyncMock()
101 |         ):
102 |             s = Submission(
103 |                 submission_id="id",
104 |                 filename="file",
105 |                 entries=[{"thing": "here"}],
106 |                 n_deleted=0,
107 |             )
108 |             await insert_submission(s)
109 |             MockSecureEntry.write_entry_field.assert_called_once_with(s.entries[0])
110 | 
111 |     async def test_insert_submission_list(self):
112 |         from osd2f.definitions.submissions import Submission, SubmissionList
113 | 
114 |         class MockSecureEntry:
115 |             pass
116 | 
117 |         MockSecureEntry.read_entry_field = Mock()
118 |         MockSecureEntry.write_entry_field = Mock()
119 | 
120 |         async def mock_bulk_create(objects):
121 |             for i in objects:
122 |                 pass
123 | 
124 |         with patch("osd2f.database.submissions.SecureEntry", MockSecureEntry), patch(
125 |             "osd2f.database.DBSubmission.bulk_create", mock_bulk_create
126 |         ):
127 |             s = Submission(
128 |                 submission_id="id",
129 |                 filename="file",
130 |                 entries=[{"thing": "here"}],
131 |                 n_deleted=0,
132 |             )
133 |             await insert_submission_list(SubmissionList([s]))
134 |             MockSecureEntry.write_entry_field.assert_called_once_with(s.entries[0])
135 | 
136 |     async def test_get_submission(self):
137 | 
138 |         from osd2f.database.submissions import DBSubmission
139 | 
140 |         class MockSecureEntry:
141 |             pass
142 | 
143 |         s = DBSubmission(
144 |             id=5,
145 |             submission_id="id",
146 |             filename="file",
147 |             entry={"thing": "here"},
148 |             n_deleted=0,
149 |             insert_timestamp=datetime.datetime.now(),
150 |             update_timestamp=datetime.datetime.now(),
151 |         )
152 | 
153 |         MockSecureEntry.read_entry_field = Mock(return_value=s)
154 |         MockSecureEntry.write_entry_field = Mock()
155 | 
156 |         DBSubmission.all = AsyncMock(return_value=[s])
157 |         with patch("osd2f.database.submissions.SecureEntry", MockSecureEntry), patch(
158 |             "osd2f.database.DBSubmission.all", AsyncMock(return_value=[s])
159 |         ):
160 | 
161 |             await get_submissions()
162 |             MockSecureEntry.read_entry_field.assert_called_once()
163 | 


--------------------------------------------------------------------------------
/tests/security_secrets_test.py:
--------------------------------------------------------------------------------
 1 | from importlib import reload
 2 | from unittest.mock import Mock, patch
 3 | 
 4 | from aiounittest.case import AsyncTestCase
 5 | 
 6 | 
 7 | class SecretResolverTest(AsyncTestCase):
 8 |     def test_load_with_config(self):
 9 |         m = Mock()
10 | 
11 |         with patch("osd2f.security.translate_environment_vars", m):
12 |             import osd2f.config  # imported for side-effect
13 | 
14 |             # reloaded in case the module was already in cache
15 |             # due to another test
16 |             reload(osd2f.config)
17 | 
18 |             m.assert_called()  # might be called more than once, depending on cache
19 | 
20 |     def test_azure_keyvault_env_translation(self):
21 |         def m(s):
22 |             return "resolved" + s
23 | 
24 |         import os
25 |         from osd2f.security.secrets import azure_keyvault
26 | 
27 |         os.environ["azure_secret"] = f"{azure_keyvault.PREFIX}::test-keyvault::value"
28 | 
29 |         other_secret = "another-secret::somehwere::key"
30 |         os.environ["not_azure_secret"] = other_secret
31 | 
32 |         with patch("osd2f.security.RESOLVERS", {azure_keyvault.PREFIX: m}):
33 |             from osd2f.security import translate_environment_vars
34 | 
35 |             translate_environment_vars()
36 | 
37 |             # azure key should be resolved
38 |             assert os.environ["azure_secret"].startswith("resolved")
39 |             # non azure key should not be resolved
40 |             assert os.environ["not_azure_secret"] == other_secret
41 | 
42 |     def test_azure_keyvault_var_translation(self):
43 |         def m(s):
44 |             return "resolved" + s
45 | 
46 |         from osd2f.security.secrets import azure_keyvault
47 | 
48 |         secret = f"{azure_keyvault.PREFIX}::test-keyvault::value"
49 |         other_secret = "another-secret::somehwere::key"
50 | 
51 |         with patch("osd2f.security.RESOLVERS", {azure_keyvault.PREFIX: m}):
52 |             from osd2f.security import translate_value
53 | 
54 |             resolved_secret = translate_value(secret)
55 |             unresolved_secret = translate_value(other_secret)
56 | 
57 |             # azure key should be resolved
58 |             assert resolved_secret.startswith("resolved")
59 |             # non azure key should not be resolved
60 |             assert unresolved_secret == other_secret
61 | 


--------------------------------------------------------------------------------
/tests/utils_settings_test.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | from unittest.mock import Mock, patch
 3 | 
 4 | 
 5 | class test_util_settings_loader(TestCase):
 6 |     def test_settings_caching(self):
 7 |         disk_load = Mock()
 8 |         with patch("osd2f.utils._load_settings_from_disk", disk_load):
 9 |             from osd2f.utils import load_upload_settings
10 | 
11 |             load_upload_settings()
12 |             load_upload_settings()
13 |             self.assertTrue(disk_load.assert_called_once)
14 | 
15 |     def test_settings_without_caching(self):
16 |         disk_load = Mock()
17 |         with patch("osd2f.utils._load_settings_from_disk", disk_load):
18 |             from osd2f.utils import load_upload_settings
19 | 
20 |             load_upload_settings(True)
21 |             load_upload_settings(True)
22 |             self.assertTrue(disk_load.call_count == 2)
23 | 


--------------------------------------------------------------------------------
/webpack.config.js:
--------------------------------------------------------------------------------
 1 | const path = require('path')
 2 | const CopyWebpackPlugin = require('copy-webpack-plugin')
 3 | const {VueLoaderPlugin} = require('vue-loader')
 4 | 
 5 | module.exports = {
 6 |   module: {
 7 |     rules: [
 8 |       {
 9 |         test: /.css$/i,
10 |         use: ['style-loader', 'css-loader']
11 |       },
12 |       {
13 |         test: /\.vue$/,
14 |         loader: 'vue-loader'
15 |       }
16 |     ]
17 |   },
18 | 
19 |   entry: path.resolve(__dirname, 'osd2f', 'javascript', 'file_upload.js'),
20 | 
21 |   output: {
22 |     filename: 'main.js',
23 |     library: 'file_upload',
24 |     libraryTarget: 'window',
25 |     path: path.resolve(__dirname, 'osd2f', 'static', 'js')
26 |   },
27 | 
28 |   resolve: {
29 |     alias: {
30 |       vue$: 'vue/dist/vue.esm.js'
31 |     },
32 |     extensions: ['*', '.js', '.vue', '.json']
33 |   },
34 | 
35 |   plugins: [
36 |     new VueLoaderPlugin(),
37 |     new CopyWebpackPlugin({
38 |       patterns: [
39 |         {
40 |           // libarchive requires the distribution bundles to be available
41 |           // for the web worker.
42 |           from: path.resolve(
43 |             __dirname,
44 |             'node_modules',
45 |             'libarchive.js',
46 |             'dist'
47 |           ),
48 |           to: path.resolve(__dirname, 'osd2f', 'static', 'js', 'libarchive')
49 |         }
50 |       ]
51 |     })
52 |   ]
53 | }
54 | 


--------------------------------------------------------------------------------