├── .DS_Store ├── .flake8 ├── .github ├── pull_request_template.md └── workflows │ └── python-app.yml ├── .gitignore ├── CITATION.cff ├── Dockerfile ├── Dockerfile-test ├── LICENSE ├── README.md ├── bin ├── osd2f └── osd2f-decrypt-submissions ├── docs ├── adding_new_anonymizers.md ├── basic_authentication.md ├── deploying_as_a_container.md ├── deploying_to_azure.md ├── development.md ├── microsoft_authentication.md ├── protecting_downloads.md ├── stresstests.md ├── using_entry_encryption.md └── using_secret_stores.md ├── mockdata └── sample │ ├── README.md │ ├── sample-platform-russellrodney-3.tar │ ├── sample-platform-russellrodney-3.tar.gz │ ├── sample-platform-russellrodney-3.zip │ └── sample-platform-russellrodney-3 │ ├── ads_clicked │ └── ads_clicked.json │ ├── comments │ └── comments.json │ ├── companies_followed │ └── companies_followed.json │ ├── engagement │ └── engagement.json │ ├── posts │ ├── posts_0.json │ └── posts_1.json │ ├── profile_interests │ └── profile_interests.json │ └── short_messages │ └── messages.json ├── osd2f ├── __init__.py ├── __main__.py ├── anonymizers │ ├── __init__.py │ └── sample_platform.py ├── cli.py ├── config.py ├── database │ ├── __init__.py │ ├── configuration.py │ ├── logs.py │ └── submissions.py ├── definitions │ ├── __init__.py │ ├── content_settings.py │ ├── security_settings.py │ └── submissions.py ├── javascript │ ├── file_upload.js │ ├── parsing │ │ ├── fileparser.js │ │ ├── jsonparsing.js │ │ └── objparsing.js │ ├── server_interaction.js │ ├── tests │ │ ├── fileparsing.test.js │ │ ├── jsonparsing.test.js │ │ └── objectparsing.test.js │ ├── visualization_components │ │ ├── consentConfirmation.vue │ │ ├── donationContainer.vue │ │ └── donationTable.vue │ └── visualize.js ├── logger.py ├── security │ ├── __init__.py │ ├── authorization │ │ ├── __init__.py │ │ ├── basic_auth.py │ │ ├── microsoft_msal.py │ │ └── not_confgured.py │ ├── download_encryption │ │ ├── __init__.py │ │ └── encrypted_zipfile.py │ ├── 
entry_encryption │ │ ├── __init__.py │ │ ├── file_decryption.py │ │ └── secure_entry_singleton.py │ └── secrets │ │ ├── __init__.py │ │ └── azure_keyvault.py ├── server.py ├── settings │ ├── .DS_Store │ ├── default_content_settings.yaml │ └── default_upload_settings.yaml ├── static │ ├── .DS_Store │ ├── favicon.ico │ ├── js │ │ ├── libarchive │ │ │ ├── wasm-gen │ │ │ │ ├── libarchive.js │ │ │ │ └── libarchive.wasm │ │ │ └── worker-bundle.js │ │ ├── main.js │ │ └── main.js.LICENSE.txt │ ├── keylock.png │ ├── skull_phone_cc.jpg │ └── study_cc.jpg ├── templates │ ├── blocks │ │ ├── bootstrap_scripts.html.jinja │ │ ├── circles_row.html.jinja │ │ ├── footer.html.jinja │ │ ├── head.html.jinja │ │ ├── jumbotron.html.jinja │ │ ├── navbar.html.jinja │ │ └── two_block_row.html.jinja │ └── formats │ │ ├── base.html.jinja │ │ ├── researcher_template.html.jinja │ │ ├── static_template.html.jinja │ │ └── upload_template.html.jinja └── utils.py ├── package-lock.json ├── package.json ├── requirements.txt ├── requirements_dev.txt ├── scripts ├── locally_decrypt_entries.py ├── locust_stress_testing.py └── sample_data_generator.py ├── setup.py ├── tests ├── __init__.py ├── anonymizer_module_test.py ├── content_configuration_test.py ├── create_app_test.py ├── db_interaction_test.py ├── download_data_protection_test.py ├── initialization_test.py ├── local_decryption_test.py ├── sample_anonymizer_test.py ├── sample_data_generator_test.py ├── security_authorization_test.py ├── security_entry_test.py ├── security_secrets_test.py └── utils_settings_test.py └── webpack.config.js /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/.DS_Store -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # Recommend matching the black line 
length (default 88), 3 | # rather than using the flake8 default of 79: 4 | max-line-length = 88 5 | extend-ignore = 6 | # See https://github.com/PyCQA/pycodestyle/issues/373 7 | E203, 8 | exclude = node_modules, static -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # FEATURE ADDED OR BUG FIXED 2 | 3 | ## Closes #issueNO 4 | ____ 5 | Paragraph description of change 6 | 7 | ## Assumptions 8 | Any underlying assumptions for this change (e.g. filetypes, supported browsers, deploy environments) 9 | 10 | ## Usage / Minimal Example 11 | 12 | Instructions on how to verify changes this PR by running code 13 | 14 | **before** 15 | 16 | ```bash 17 | echo "stuff breaks" 18 | osd2f 19 | 20 | ``` 21 | ```python 22 | ValueError: Stuff broke! 23 | ``` 24 | **after** 25 | ```bash 26 | osd2f 27 | ``` 28 | ```python 29 | Success! 30 | ``` 31 | 32 | ## Checklist 33 | - [ ] Added tests if appropriate (and it should always be) 34 | - [ ] Created new issues when required 35 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [main] 9 | pull_request: 10 | branches: [main] 11 | 12 | jobs: 13 | development_build: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python 3.9 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: 3.9 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install -r 
requirements.txt 26 | pip install -r requirements_dev.txt 27 | - name: Lint with flake8 28 | run: flake8 ./ 29 | - name: mypy 30 | run: mypy ./osd2f/ --ignore-missing-imports 31 | - name: Test with pytest 32 | run: pytest ./ 33 | - name: Install & do a dry run 34 | run: | 35 | pip install -e ./ 36 | osd2f --dry-run 37 | - name: Check config generation functionality 38 | run: | 39 | osd2f --generate-current-config cc.yaml 40 | [ -s cc.yaml ] #check whether the file is not empty 41 | - name: Run Jest 42 | uses: stefanoeb/jest-action@1.0.3 43 | 44 | release_build: 45 | runs-on: ubuntu-latest 46 | 47 | steps: 48 | - uses: actions/checkout@v4 49 | - name: Set up Python 3.9 50 | uses: actions/setup-python@v4 51 | with: 52 | python-version: 3.9 53 | - name: Install with plain pip 54 | run: pip install ./ 55 | - name: Do a dry run 56 | run: osd2f --dry-run 57 | 58 | docker_build: 59 | runs-on: ubuntu-latest 60 | 61 | steps: 62 | - name: checkout files 63 | uses: actions/checkout@v4 64 | - name: build Docker image 65 | uses: docker/build-push-action@v4 66 | with: 67 | file: Dockerfile 68 | push: false 69 | load: ${{ github.event_name == 'pull_request' }} 70 | context: . 71 | 72 | docker_test_build: 73 | runs-on: ubuntu-latest 74 | 75 | steps: 76 | - name: checkout files 77 | uses: actions/checkout@v4 78 | - name: build Docker image 79 | uses: docker/build-push-action@v4 80 | with: 81 | file: Dockerfile-test 82 | push: false 83 | load: ${{ github.event_name == 'pull_request' }} 84 | context: . 
85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Extension of : https://github.com/github/gitignore/blob/master/Python.gitignore 2 | # License : CCO 1.0 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include 
Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # VScode stuff 111 | .vscode/ 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | 140 | # pytype static type analyzer 141 | .pytype/ 142 | 143 | # Cython debug symbols 144 | cython_debug/ 145 | 146 | # frontend dependencies 147 | node_modules/ 148 | 149 | # Azure configs 150 | .azure/ 151 | 152 | # IDE specific things 153 | .vscode 154 | 155 | # database files 156 | *.db 157 | *-shm 158 | *-wal -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Araujo" 5 | given-names: "Theo" 6 | - family-names: "Ausloos" 7 | given-names: "Jef" 8 | - family-names: "van Atteveldt" 9 | given-names: "Wouter" 10 | - family-names: "Loecherbach" 11 | given-names: "Felicia" 12 | - family-names: "Moeller" 13 | given-names: "Judith" 14 | - family-names: "Ohme" 15 | given-names: "Jakob" 16 | - family-names: "Trilling" 17 | given-names: "Damian" 18 | - family-names: "van de Velde" 19 | given-names: "Bob" 20 | - family-names: "de Vreese" 21 | given-names: "Claes" 22 | - family-names: "Welbers" 23 | given-names: "Kasper" 24 | title: "OSD2F: Open Source Data Donation Framework" 25 | doi: 10.31235/osf.io/xjk6t 26 | url: "https://github.com/uvacw/osd2f" 27 | 28 | references: 29 | - authors: 30 | - family-names: "Araujo" 31 | given-names: "Theo" 32 | - family-names: "Ausloos" 33 | given-names: "Jef" 34 | - family-names: "van Atteveldt" 35 | given-names: "Wouter" 36 | - family-names: "Loecherbach" 37 | given-names: "Felicia" 38 | - family-names: "Moeller" 39 | given-names: "Judith" 40 | - family-names: "Ohme" 41 | given-names: "Jakob" 42 | - family-names: "Trilling" 43 | given-names: "Damian" 44 | - family-names: "van de Velde" 45 | given-names: "Bob" 46 | - family-names: "de Vreese" 47 | given-names: "Claes" 48 | - family-names: "Welbers" 49 | given-names: "Kasper" 50 | doi: 10.31235/osf.io/xjk6t 51 | journal: "Computational Communication Research" 52 | title: "OSD2F: Open Source Data Donation Framework" 53 | type: article 54 | year: Forthcoming -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.7-buster 2 | 3 | EXPOSE 8000 4 | 5 | ENV OSD2F_SECRET="" 6 | ENV OSD2F_MODE="Development" 7 | ENV OSD2F_DB_URL="sqlite://:memory:" 8 | 9 | # make code available 10 | COPY ./ ./osd2f 11 | 12 | # add build-secret to hypercorn config 13 | 14 | WORKDIR /osd2f 15 | 16 | # setup 
dependencies 17 | RUN pip install ./ 18 | 19 | # minimal check to make sure the install works 20 | RUN osd2f --dry-run 21 | 22 | # set the default command for the container (i.e. running production) 23 | CMD [ "hypercorn", "osd2f.__main__:app", "-b", "0.0.0.0:8000" ] 24 | -------------------------------------------------------------------------------- /Dockerfile-test: -------------------------------------------------------------------------------- 1 | # Tests for Python 3.9 compatibilty 2 | 3 | FROM python:3.9.9-buster 4 | 5 | EXPOSE 8000 6 | ARG secret 7 | 8 | ENV OSD2F_SECRET=$secret 9 | ENV OSD2F_MODE="Development" 10 | ENV OSD2F_DB_URL="sqlite://:memory:" 11 | 12 | ## make code available 13 | COPY ./ ./osd2f 14 | 15 | ## add build-secret to hypercorn config 16 | 17 | WORKDIR /osd2f 18 | 19 | ## setup dependencies 20 | RUN pip install ./ 21 | RUN pip install -r requirements.txt 22 | RUN pip install -r requirements_dev.txt 23 | 24 | ## run tests 25 | RUN flake8 ./ 26 | RUN mypy ./osd2f/ --ignore-missing-imports 27 | RUN pytest ./ 28 | 29 | RUN osd2f --dry-run 30 | 31 | # Tests for Python 3.8 compatibility 32 | 33 | FROM python:3.8.12-buster 34 | 35 | EXPOSE 8000 36 | ARG secret 37 | 38 | ENV OSD2F_SECRET=$secret 39 | ENV OSD2F_MODE="Development" 40 | ENV OSD2F_DB_URL="sqlite://:memory:" 41 | 42 | ## make code available 43 | COPY ./ ./osd2f 44 | 45 | ## add build-secret to hypercorn config 46 | 47 | WORKDIR /osd2f 48 | 49 | ## setup dependencies 50 | RUN pip install ./ 51 | RUN pip install -r requirements.txt 52 | RUN pip install -r requirements_dev.txt 53 | 54 | ## run tests 55 | RUN flake8 ./ 56 | RUN mypy ./osd2f/ --ignore-missing-imports 57 | RUN pytest ./ 58 | 59 | RUN osd2f --dry-run 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Python 
application](https://github.com/uvacw/osd2f/workflows/Python%20application/badge.svg?branch=main) 2 | Code style: black 3 | # OSD2F: Open Source Data Donation Framework (No longer maintained) 4 | 5 | ## ⚠️ Update: this repository is archived ⚠️ 6 | 7 | This repository is being archived as is. The code can be reused by others as specified in the license, yet security updates and maintenance are not currently being done. Those interested in using the code must therefore consider performing any relevant security updates priot to using the tool. The OSD2F authors are now working on a new data donation infrastructure, which can be found here: [https://datadonation.eu](https://datadonation.eu). This infrastructure contains a stand-alone tool (PORT) which is actively maintained and updated. 8 | 9 | 10 | 11 | ## Goal 12 | 13 | Use OSD2F to run your own Data Donation service. The aim of this project is to facilitate 14 | scientists to collect data donations, by providing an easy-to-use web-based data donation 15 | platform. Here, scientists can instruct participants in their research to upload data 16 | exports from major online platforms (generally based on participants rights to their own 17 | data under GDPR). 18 | 19 | The App aims to be as export agnostic as possible while keeping things feasible to maintain. 20 | You can specify the files and the whitelist of JSON fields through YAML configuration. 21 | As such it supports Data Donation Packages of arbitrary format in JSON files (although it assumes they are UTF-8 encoded). 22 | 23 | ## Using OSD2F locally 24 | 25 | Installing the OSD2F locally is relatively simple by using pip's support for installation straight from 26 | VCS. However, we recommend local installation only in cases in which you want to familiarize yourself 27 | with OSD2F and **never for production (real data collection) purposes**. 
28 | 29 | ***Note:** There is a different set of instructions for development purposes in the [development docs](docs/development.md)* 30 | 31 | ### Installation (not for development) 32 | 33 | OSD2F requires python 3.8 or up, check your version by running: 34 | 35 | ```bash 36 | python --version 37 | ``` 38 | should say something like: 39 | > Python 3.8.0 40 | 41 | *Note: it's recommended to use a virtual environment, please consult de [development docs](docs/development.md) for more information.* 42 | 43 | ```bash 44 | pip install git+https://github.com/uvacw/osd2f 45 | ``` 46 | 47 | ### Running 48 | 49 | ```bash 50 | osd2f -h # see help 51 | ``` 52 | 53 | ```bash 54 | osd2f -m Testing # to run a testing instance 55 | ``` 56 | 57 | You can configure the text content of the webpages. The easiest way to get started 58 | is by generating a YAML file with the default values and editing it to your liking: 59 | 60 | ```bash 61 | osd2f --generate-current-config config.yaml 62 | ``` 63 | 64 | You can start the server with this content configuration by passing a file-path 65 | via the CLI. 66 | 67 | ```bash 68 | osd2f --content-configuration config.yaml # make sure you've edited it first 69 | ``` 70 | 71 | ***Note**: OSD2F will store the configuration in the database. In development mode, the 72 | most recently edited version is used between the database and the file.* 73 | 74 | ## See also: 75 | 76 | 1. [how to develop](docs/development.md) 77 | 2. [Deploying to Azure](docs/deploying_to_azure.md) 78 | 3. [Running stresstests](docs/stresstests.md) 79 | 4. [Testing the researcher login with basic auth](docs/basic_authentication.md) 80 | 5. [Using Microsoft Authentication via SSO](docs/microsoft_authentication.md) 81 | 6. [Setting password on researcher downloads](docs/protecting_downloads.md) 82 | 7. 
[Adding additional (server side) anonymizers](docs/adding_new_anonymizers.md) 83 | 84 | ## Credits: 85 | 86 | If you use this tool, please cite the paper: 87 | 88 | *APA:* 89 | 90 | Araujo, T., Ausloos, J., van Atteveldt, W., Loecherbach, F., Moeller, J., Ohme, J., Trilling, D., van de Velde, B., de Vreese, C., & Welbers, K. (Forthcoming). OSD2F: An Open-Source Data Donation Framework. *Computational Communication Research*, https://osf.io/preprints/socarxiv/xjk6t/ 91 | 92 | *Bibtex:* 93 | 94 | ``` 95 | @article{osd2f, 96 | title={OSD2F: An Open-Source Data Donation Framework}, 97 | DOI={10.31235/osf.io/xjk6t}, 98 | author={Araujo, Theo and Ausloos, Jef and {van Atteveldt}, Wouter and Loecherbach, Felicia and Moeller, Judith and Ohme, Jakob and Trilling, Damian and {van de Velde}, Bob and {de Vreese}, Claes and Welbers, Kasper}, 99 | year={forthcoming}, 100 | journal = {Computational Communication Research} 101 | } 102 | ``` 103 | 104 | 105 | This tool is inspired in earlier approaches that enable researchers to partner with individuals willing to donate their data for academic research, including [Web Historian](https://github.com/erickaakcire/webhistorian) (Menchen-Trevino, 2016), among others. 
106 | -------------------------------------------------------------------------------- /bin/osd2f: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from osd2f import cli 3 | 4 | cli.parse_and_run() -------------------------------------------------------------------------------- /bin/osd2f-decrypt-submissions: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from scripts.locally_decrypt_entries import run_script 3 | 4 | run_script() -------------------------------------------------------------------------------- /docs/adding_new_anonymizers.md: -------------------------------------------------------------------------------- 1 | # Adding new anonymizer functions 2 | 3 | This guide explains how to add anonymization functions to the codebase. You will need 4 | Python skill to create a new anonymizer. 5 | 6 | ## What are anonymizers 7 | 8 | Simply put, anonymizers are functions that run on the server *before* data is collected. They 9 | are meant for more complex processing of submission entries for privacy protecting purposes. For 10 | instance, they could be named entity recognition functions that filter out personal names using 11 | (pretrained) machine learning models. 12 | 13 | Anonymizers are run server-side **before consent is given** and should never store data on disk. 14 | 15 | ## How do anonymizers work 16 | 17 | When a user selects files to upload, the containing entries are parsed. This parsing removes fields 18 | that are not on the whitelist, and 'flattens' the dictionaries using `.` notation. After this client-side 19 | parsing, the entries are send to the server where the configured anonymizers are called. 20 | 21 | For each file that has a configured anonymizer, each entry is provided to that anonymizer function. The 22 | anonymizer function should return the entry after changing it's contents. 
These entries are then returned 23 | to the client for respondents to inspect and provide consent on the real upload. 24 | 25 | ## Creating a new anonymizer 26 | 27 | You can create an anonymizer by adding a file to the `osd2f/anonymizers` directory, importing it in the 28 | `osd2f/anonymizers/__init__.py` file and configuring the use of the anonymizer in the upload settings 29 | YAML file used for your deployment. 30 | 31 | ### Writing a new anonymizer 32 | 33 | Anonymizers should have this signature: 34 | 35 | ```python 36 | async def your_anonymizer( 37 | entry: typing.Dict[str, typing.Any], argument: str = "optional_string_argument" 38 | ) -> typing.Dict[str, typing.Any] 39 | ``` 40 | 41 | For example, we'll implement an anonymizer that goes through a text and removes any word after a 42 | given substring: 43 | 44 | ```python 45 | # in new file: osd2f/anonymizers/nextword_anonymizer.py 46 | from typing import Any, Dict 47 | 48 | async def nextword_anonymizer(entry: Dict[str,Any], substring: str): 49 | 50 | redacted_entry = {} 51 | for field, value in entry.items(): 52 | # keep field values that are not a string as-is 53 | # in the redacted entry 54 | if type(value)!=str: 55 | redacted_entry[field] = value 56 | continue 57 | 58 | # naively split the value on spaces 59 | # and keep the words that do not come after 60 | # the substring 61 | previous_token = "" 62 | new_value = [] 63 | for token in value.split(" "): 64 | if previous_token!=substring: 65 | new_value.append(token) 66 | else: 67 | new_value.append("") 68 | previous_token = token 69 | 70 | redacted_entry[field] = " ".join(new_value) 71 | 72 | # make sure to return the redacted version of the entry 73 | return redacted_entry 74 | 75 | ``` 76 | 77 | You can test the function: 78 | 79 | ```python 80 | from osd2f.anonymizers.nextword_anonymizer import nextword_anonymizer 81 | 82 | fake_entry = { 83 | "text" : "mr Darcy was unamused, but so was mr bennet" 84 | } 85 | 86 | await 
nextword_anonymizer(fake_entry, "mr") 87 | 88 | ``` 89 | outputs: 90 | ``` {'text': 'mr was unamused, but so was mr '} ``` 91 | 92 | 93 | ### Adding the anonymizer to imports 94 | 95 | For OSD2F to recognize the new anonymizer, it needs to be added to the `osd2f/anonymizers/__init__.py` file, like so: 96 | 97 | ```python 98 | # in osd2f/anonymizers/__init__.py 99 | import re 100 | import typing 101 | 102 | from .sample_platform import redact_text 103 | from .nextword_anonymizer import nextword_anonymizer # <- import the new anonymizer function 104 | from ..definitions import Submission, SubmissionList, UploadSettings 105 | from ..logger import logger 106 | 107 | options: typing.Dict[str, typing.Callable[[typing.Dict, str], typing.Awaitable]] = { 108 | redact_text.__name__: redact_text, # noqa 109 | nextword_anonymizer.__name__ : nextword_anonymizer # noqa <- add it to the options 110 | } 111 | 112 | ...rest of the file... 113 | ``` 114 | 115 | 116 | ### Configuring settings to use this anonymizer 117 | 118 | Let's try to use this new anonymizer. 
First, we create an upload settings file: 119 | 120 | ```yaml 121 | # in osd2f/settings/default_upload_settings.yaml 122 | files: 123 | example.json: 124 | anonymizers: 125 | - nextword_anonymizer : "mr" 126 | accepted_fields: 127 | - text 128 | - title 129 | - number 130 | ``` 131 | 132 | Then we create a file to donate called `example.json`: 133 | 134 | ```json 135 | [ 136 | { 137 | "title": "mr Frogs day out", 138 | "text": "mr Frog was driving on the windy road towards mr Toad", 139 | "number": 100, 140 | "other": "is not on the whitelist" 141 | } 142 | ] 143 | ``` 144 | 145 | We start the OSD2F platform: 146 | 147 | ```bash 148 | OSD2F_SECRET=secret \ 149 | OSD2F_ENTRY_SECRET=TESTSECRET \ 150 | OSD2F_MODE=Development \ 151 | osd2f -vvv 152 | ``` 153 | 154 | We upload our `example.json` file on the [upload page](http://localhost:5000/upload) 155 | 156 | Press the `inspect & edit` button and you will see the redacted result in the table! -------------------------------------------------------------------------------- /docs/basic_authentication.md: -------------------------------------------------------------------------------- 1 | # Basic Authentication for easier testing / local installs 2 | 3 | ## Important note 4 | 5 | Basic authentication is not considered a 'safe' authorization mechanism by todays standards. 6 | This implementation serves to make login testing easier without requiring an OAuth platform 7 | to be available. 8 | 9 | Some reasons why you should not use basic auth in production: 10 | 1. Passwords are send unencrypted, so any communication outside HTTPS leaks the password 11 | 2. Browsers tend to automatically store basic auth username-password combinations, and do 12 | so in an insecure fashion 13 | 14 | ## How does it work 15 | 16 | Basic auth will prompt researchers for a username-password combination provided as an environment 17 | configuration. 
18 | 19 | ```bash 20 | 21 | OSD2F_BASIC_AUTH="username21;unguessablepassword" osd2f -m Development 22 | ``` 23 | 24 | Will start a (development) server that allows researchers to login by entering the username `username21` and 25 | password `unguessablepassword`. Needless to say, you must be very carefull about who knows the username and 26 | password. -------------------------------------------------------------------------------- /docs/deploying_as_a_container.md: -------------------------------------------------------------------------------- 1 | # Deploying OSD2F as a container 2 | 3 | ## What is a container and why use it? 4 | 5 | Containers, most populairly [docker containers](https://www.docker.com/resources/what-container) are ways 6 | to package an application, making sure all dependencies and environment characteristics are wrapped 7 | together. This makes containers ideal for deployment across different environments, without worrying 8 | about OS compatibilities, libraries that need to be installed on servers etcetera. 9 | 10 | The popularity of containers as a deployment model is clear in the broad support. PaaS offerings such 11 | as [Google app engine](https://cloud.google.com/appengine/docs/flexible), [Amazon ECS](https://aws.amazon.com/ecs/) 12 | and [Microsoft Azure Container Instances](https://docs.microsoft.com/en-us/azure/container-instances/container-instances-quickstart) 13 | support running arbitrary containers without requiring any advanced cloud management skills. 14 | 15 | For more advanced setups, the common deployment infrastructure is [Kubernetes (k8s)](https://kubernetes.io/), a container orchestration platform that combines containers and allows for their deployment across 16 | servers. 
17 | 18 | ### TL;DR: 19 | * containers make applications easy to move between servers 20 | * containers are widely supported by cloud providers 21 | 22 | ## Creating an OSD2F container 23 | 24 | ### Building a test container 25 | 26 | If you want to test whether the code still works after modifications: 27 | 28 | ```bash 29 | docker build -t osd2f-test -f Dockerfile-test ./ 30 | ``` 31 | If the build is succesfull, that means all tests have passed. 32 | You can access the container by running it: 33 | 34 | ```bash 35 | docker run -it osd2f-test bash 36 | ``` 37 | 38 | This is slower to build and contains more dependencies that are normally only used for development. This 39 | is not the container specification that is meant for production deployments. 40 | 41 | ### Building a container for deployment 42 | 43 | ```bash 44 | docker build -t osd2f -f Dockerfile ./ 45 | ``` 46 | 47 | Running the container (using port 8000), the `-p` flag sets the host port of you machine to refer to the port of the container. The `-e` flags are used to set environment variables. Note that production instances 48 | always require a session secret. The example here is not suited for production, as you should avoid allowing researcher access through basic authentication and the database is an in-memory database that 49 | will be reset to empty every time the container is restarted. 50 | 51 | ```bash 52 | docker run -it \ 53 | -e OSD2F_MODE="Production" \ 54 | -e OSD2F_BASIC_AUTH='user;pass' \ 55 | -e OSD2F_SECRET="a big secret here" \ 56 | -e OSD2F_DB_URL="sqlite://:memory:" \ 57 | -p 8000:8000 \ 58 | osd2f 59 | ``` 60 | You should be able to reach the server now at http://localhost:8000/ 61 | 62 | ## Deploying containers to production 63 | 64 | Container use in production is strongly related to the solution you will be using. Some deployment platforms enable you to upload the docker image through a CLI tool or as part of a CI/CD interface. 
Other systems such as [Kubernetes (k8s)](https://kubernetes.io/) require the docker image to be available in a repository. 65 | 66 | You can push the container image to a repository of your choosing. The syntax ([as specified by Docker](https://docs.docker.com/docker-hub/repos/)) is the following: 67 | 68 | ```bash 69 | docker push /: 70 | ``` 71 | 72 | Whether and which repository to use depends on the the platform you choose to use for the deployment. Note that running the container on a single server will risk limited availability (downtime when this server experiences issues) and comes at considerable operational overhead (configuring security, keeping the systrem up-to-date, backing up data etcetera). 73 | 74 | -------------------------------------------------------------------------------- /docs/deploying_to_azure.md: -------------------------------------------------------------------------------- 1 | # Deploying OSD2F to azure 2 | 3 | ## disclaimer 4 | 5 | This documentation is intended to demonstrate how to set up OSD2F as an Azure webapp service. It is oriented towards putting an interface out there to see, but is not set up for actual data collection. The needs and conditions of your specific project may impact the way the app should be configured. Consult you cloud engineer before applying the below steps to understand how to adapt them to your project. 6 | 7 | # general preparations 8 | 9 | ## Make sure you azure CLI client is logged in and selected the appropriate subscription. 10 | 11 | ```bash 12 | az login 13 | az account set --subscription 14 | export AZURE_RESOURCE_GROUP= 15 | export WEBAPPNAME="osd2f-test" # must be globally unique, e.g. unused on Azure 16 | ``` 17 | 18 | Doublecheck with: 19 | ```bash 20 | az account show 21 | ``` 22 | 23 | ## creating the webapp 24 | 25 | Using webapp up will setup the webapp, the appservice and the plan required. The app won't work before we also apply the other commands. 
Make sure to be inside the OSD2F folder (locally) when running this command. 26 | 27 | ```bash 28 | # python 3.9 is in early access on Azure (2021-11-05), 29 | # you can select it in the Settings > Configuration 30 | # panel of the App Service under `Minor version` 31 | az webapp up \ 32 | --runtime 'python|3.8' \ 33 | --location "West Europe" \ 34 | --sku F1 \ 35 | --verbose \ 36 | --name $WEBAPPNAME 37 | ``` 38 | 39 | Minor addition for security: 40 | ```bash 41 | az webapp identity assign --resource-group $AZURE_RESOURCE_GROUP --name $WEBAPPNAME 42 | ``` 43 | 44 | # setting up config with in-memory db 45 | 46 | ## 1. Setup desired settings: 47 | 48 | ```bash 49 | az webapp config appsettings set --name $WEBAPPNAME\ 50 | --resource-group $AZURE_RESOURCE_GROUP \ 51 | --settings \ 52 | OSD2F_SECRET=$RANDOM$RANDOM$RANDOM$RANDOM \ 53 | OSD2F_DB_URL="sqlite://:memory:" \ 54 | OSD2F_MODE="Production" 55 | ``` 56 | Please note: 57 | 58 | - **OSD2F_SECRET** : This will introduce a random secret that is different every time 59 | this command is run. The secret is used by the server to maintain 60 | sessions, so running this command will 'logout' any ongoing session. 61 | - **OSD2F_DB_URL** : The database to use, the example has an in-memory database, see the next section for a setup with a proper database. 62 | - **OSD2F_MODE** : the mode in which to run the server, should pretty much always be production for internet facing deployments. 63 | 64 | **NOTE**: deploying secrets in this way is not 'safe', anyone with 65 | admin access to this resource group will be able to see 66 | the secret! 67 | 68 | set the custom startup command. We use the hypercorn ASGI server middleware for performance reasons. 69 | 70 | ```bash 71 | az webapp config set \ 72 | --resource-group $AZURE_RESOURCE_GROUP \ 73 | --name $WEBAPPNAME \ 74 | --startup-file "python -m hypercorn osd2f.__main__:app -b 0.0.0.0" 75 | ``` 76 | 77 | # setting up config with real database 78 | 79 | ## 1. 
Create the database 80 | 81 | We'll assume a Postgres database, but anything supported by Tortoise should work. 82 | ## 2. You can now formulate a connection string 83 | 84 | test in locally (dont forget to whitelist your IP address in the database firewall rules): 85 | 86 | ```bash 87 | # you should have the admin user (db_user) password (db_pass) and database name (db_name) 88 | db_user='postgres'; \ 89 | db_pass='YOUR-PASSWORD-HERE'; \ 90 | db_name="YOUR-DATABASE-NAME-HERE"; \ 91 | osd2f -db "postgres://$db_user@$db_name:$db_pass@$db_name.postgres.database.azure.com:5432/postgres?ssl=True" 92 | ``` 93 | 94 | If you see an error related to "hba_config" it probably means access is incorrectly configured. Check: 95 | 96 | - [ ] did you add the the database server name after the username? (e.g. `user@database:password` ) 97 | - [ ] if you are trying to connect from a local machine, did you whitelist your IP in the database security configuration? 98 | ## 3. Set 'Allow access to Azure services' to 'Yes' 99 | 100 | This will allow the webapp to connect. Do this in the security configuration of the database you want to connect to. 101 | 102 | ## 4. Setup desired settings: 103 | 104 | ```bash 105 | az webapp config appsettings set --name $WEBAPPNAME\ 106 | --settings \ 107 | OSD2F_SECRET=$RANDOM$RANDOM$RANDOM$RANDOM \ 108 | OSD2F_MODE="Production" 109 | ``` 110 | Please note: 111 | 112 | - **OSD2F_SECRET** : This will introduce a random secret that is different every time 113 | this command is run. The secret is used by the server to maintain 114 | sessions, so running this command will 'logout' any ongoing session. 115 | - **OSD2F_MODE** : the mode in which to run the server, should pretty much always be production for internet facing deployments. 116 | 117 | **NOTE**: deploying secrets in this way is not 'safe', anyone with 118 | admin access to this resource group will be able to see 119 | the secret! 
Consider using a [secret store](./using_secret_stores.md) 120 | 121 | set the custom startup command. We use the hypercorn ASGI server middleware for performance reasons. 122 | 123 | ## 5. Add the connection string 124 | ```bash 125 | db_user='postgres'; \ 126 | db_pass='YOUR-PASSWORD-HERE'; \ 127 | db_name="YOUR-DATABASE-NAME-HERE"; \ 128 | \ 129 | az webapp config connection-string set \ 130 | --name $WEBAPPNAME \ 131 | -t PostgreSQL \ 132 | --settings custom1="postgres://$db_user@$db_name:$db_pass@$db_name.postgres.database.azure.com:5432/postgres?ssl=True" 133 | ``` 134 | ## 6. We map the Azure protected database connection strings to the startup command of OSD2F. 135 | 136 | ```bash 137 | az webapp config set \ 138 | --resource-group "" 139 | --name $WEBAPPNAME 140 | --startup-file 'OSD2F_DB_URL=$POSTGRESQLCONNSTR_custom1 python -m hypercorn osd2f.__main__:app -b 0.0.0.0' 141 | ``` 142 | 143 | ## HINT: Check the webapp settings 144 | 145 | In the app-service > settings > configurations tab, you can check whether the correct database URL string was received and, under general settings, whether the correct startup command was registered. 146 | 147 | # deploying the app 148 | 149 | Deploying the app uploads the source code and provisions the application. If you want changes to the code to go live, this is the command to run. 150 | 151 | ```bash 152 | az webapp up \ 153 | --runtime 'python|3.8' \ 154 | --location "West Europe" \ 155 | --name $WEBAPPNAME 156 | ``` 157 | 158 | # updating the app 159 | 160 | If at a certain point you need to update the app settings (e.g., change from SQLlite to Postgres), you will also need to include the ```resource-group``` parameter in the Azure commands. You can get the resource-group info from the app overview. 161 | 162 | Afterwards, you can define it as an environment variable: 163 | ```export RESOURCEGROUPNAME="includethenamehere"``` 164 | 165 | And include it along with the commands above, after the webapp name. 
For example: 166 | ``` 167 | az webapp config connection-string set \ 168 | --name $WEBAPPNAME --resource-group $RESOURCEGROUPNAME\ 169 | ... 170 | ``` 171 | 172 | # applying new configurations (temporary method) 173 | 174 | There is currently no configuration interface for the content of the app. You can update remote (e.g. Azure) 175 | webapp content configurations by locally creating a content-file and running the app with the remote 176 | database connection. 177 | 178 | For example: 179 | 180 | ```bash 181 | OSD2F_DB_URL="" \ 182 | OSD2F_SECRET="arbitrary string" \ 183 | osd2f \ 184 | -m Production \ 185 | -cc your_content_settings.yaml 186 | ``` 187 | 188 | 189 | -------------------------------------------------------------------------------- /docs/development.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | ## Core assumptions 4 | 5 | Choices in this codebase are based on some assumptions as to the uses of the 6 | framework. Because of these assumptions some things are simple, whereas others 7 | are harder. When contributing code, please make sure you keep these assumptions 8 | in mind: 9 | 10 | - **Functionality should be generic over many kinds of donation formats**. 11 | This means that the frontend, endpoints, anonymizers and configurations should be able to handle pretty arbitrary JSON*. Assumptions about existing fields, datatypes etcetera should be limited to 12 | - the default configuration file example 13 | - export source specific anonymizers 14 | - **Configuration targets users with low technical expertise**, 15 | which means the selection of fields to include and anonymizers to use should be relatively easy to infer from the example configuration. It also means that we want to avoid making content decisions in code. 16 | - **This framework is for collection, *not* analysis**. 
17 | The intended use of this framework is to provide a participant facing data submission interface with good privacy guarantees. The researchers who administer the deployment can download the data to do analysis in their own environment. The entries submitted can therefore be treated as a 'black box'. This helps maintain flexibility (no database migrations for new donation types) and maintainability (changes in export formats can be upgraded via configuration only). 18 | - **All content data is sensitive** and should never be in any logs or 19 | (disk) storage UNLESS after the explicit consent step. This also means 20 | that AT NO POINT any of the JSON fields or values should be in a 21 | `print()`, `logger.info()`, `logger.warning()`, `logger.critical()` or any other stdout/stderr statement. For local development and testing purposes, 22 | you can use `logger.debug()` to contain content information. 23 | 24 | 25 | ## Installation for development 26 | 27 | You can install this Python Package for local development purposes. To do 28 | so, we *strongly* advice using a virtual environment context. 29 | 30 | In addition, please note that OSD2F was written for Python `3.9`. Using 31 | a virtual environment should make it easy to install this version without impacting your other Python projects. 32 | 33 | 34 | ---- 35 | ##### Example using the popular [anaconda distribution of python](https://www.anaconda.com/) 36 | 37 | ```bash 38 | conda create -n osd2f python=3.9 # only required once 39 | conda activate osd2f # run at the start of each osd2f development session 40 | ``` 41 | ---- 42 | 43 | ### 1. Clone the repository 44 | 45 | You can clone the git repository so you can easily switch between branches. 
46 | 47 | ```bash 48 | # get the repository 49 | git clone git@github.com:uvacw/osd2f.git 50 | # move to project ROOT 51 | cd osd2f 52 | # you know you're in the right place if it contains setup.py 53 | ls setup.py 54 | # shows: `setup.py`, if it says 'cannot access' you in the wrong place 55 | ``` 56 | 57 | ### 2. Install the package in editable mode 58 | 59 | For development purposes, you should install the package using the `-e` pip flag 60 | to ensure it is available in 'editable' mode ([see the docs](https://pip.pypa.io/en/stable/reference/pip_install/)). 61 | 62 | ```bash 63 | # at the repository root (osd2f/) 64 | pip install -e ./ 65 | ``` 66 | 67 | ### 3. Install development requirements 68 | 69 | There are additional requirements for development purposes that 70 | mainly serve to ensure proper formatting and static analysis. Install 71 | them seperately: 72 | 73 | ```bash 74 | # at the repository root (osd2f/) 75 | pip install -r requirements_dev.txt 76 | ``` 77 | 78 | ### 4. Run the code in development mode 79 | 80 | While developing, it's probably nice to use development mode *and* set the 81 | log level to DEBUG. You can do so by: 82 | 83 | ```bash 84 | osd2f -m Development -vvv 85 | ``` 86 | The server will now automatically reload when changes are detected. In addition, the settings `yaml` file will be reloaded for each request so 87 | you can quickly iterate on it. 88 | 89 | ### javascript 90 | 91 | If you are planning to touch the javascript part of the application, you 92 | are recommended to install the npm packages 93 | 94 | ```bash 95 | npm i --also=dev 96 | ``` 97 | 98 | During development, it's probably nice to have human readable javascript in the 99 | browser (so you can use the build-in debuggers). Use `npm run development` to have webpack watch the javascript files and re-generate a human-readable `main.js` while you work. Once your javascript works well, use `npm run build` to generate the proper minified `main.js` to check in. 
100 | 101 | 102 | ## About fake data 103 | 104 | Fake data is part of this repository to demonstrate potential donations. It allows you to play around with data 105 | that on its surface should be similar to real donations when testing your deployment, developing new anonymizers or 106 | visualizations. 107 | 108 | Fake data was generated using the 'faker' package implementation in [scripts/sample_data_generator.py](../scripts/sample_data_generator.py), using the command: 109 | 110 | ```bash 111 | python scripts/sample_data_generator.py -o mockdata/sample --overwrite -i 2 -z -tz -t 112 | ``` 113 | 114 | For more information about how to use this script, consult the help: 115 | 116 | ```bash 117 | python scripts/sample_data_generator.py -h 118 | ``` 119 | 120 | ## Code style & checks 121 | 122 | There are a number of checks to run in order to guarantee all tests pass, formatting is correct and typing is properly applied. You can run these manually: 123 | 124 | ```bash 125 | flake8 ./ # formatting analysis 126 | mypy ./ # static analysis 127 | pytest ./ # unittests 128 | ``` 129 | 130 | You can opt to run `black` separately to apply auto-formatting (`flake8-black` only checks, without corrections). 131 | 132 | ```bash 133 | black ./ 134 | ``` 135 | 136 | Note that most IDEs (e.g. PyCharm, VSCode, ...) allow you to automatically run these commands every time you save, commit or attempt to push the code. We especially advise you to run black on every save. -------------------------------------------------------------------------------- /docs/microsoft_authentication.md: -------------------------------------------------------------------------------- 1 | # Microsoft Authentication using MSAL 2 | 3 | ## How does it work? 4 | 5 | The application is registered in the `App registrations` with access rights to *read* user 6 | information (e.g. email). 7 | 8 | Using environment variables, the application is configured to accept only a specific set of 9 | email addresses. 
10 | 11 | Users trying to access `/researcher*` paths are redirected to Azure and asked to provide the 12 | application with read access to their information. 13 | 14 | The app uses the access information to check whether the user has an email in the authorized emails list. If so, it sets a session-cookie providing access to the `/researcher` page and downloads. 15 | 16 | ## Configuring the app in Azure 17 | 18 | 1. Go to `App registrations` 19 | 2. select `New registration` 20 | 3. Pick a Name 21 | 4. set `accounts in this organizational directory only (Single tenant)` 22 | 5. The `Redirect URI` should match the endpoint that requires authentication. 23 | For local testing, this could be `http://localhost:5000/login`. 24 | 25 | 26 | ## Configuring the server 27 | 28 | The server is configured by passing a serialized JSON object as the `MSAL_CONFIG` environment variable. The contents are something like this: 29 | 30 | ```json 31 | { 32 | "client_id":"a-provided-client-id", // Application (client) ID 33 | "secret":"the-application-secret", // a secret created when generating the app registration 34 | "tenant_id":"azure-tenant-id", // Directory (tenant) ID 35 | "redirect_url": "localhost:5000/login" // location microsoft should send users to after login in, 36 | // must match an App registration entry 37 | // users you want to provide access, note that they 38 | // should be part of the active directory in the same tenant as 39 | // the application 40 | "allowed_users":"allowed-user-one@azure.nl;allowed_user_two@somewhere.com" 41 | } 42 | ``` 43 | 44 | An example of running this locally would be: 45 | 46 | ```bash 47 | export MSAL_CONFIG='{"client_id":"a-provided-client-id", "secret":"the-application-secret", "tenant_id":"azure-tenant-id", "allowed_users":"allowed-user-one@azure.nl;allowed_user_two@somewhere.com"}' 48 | export OSD2F_SECRET="a-safe-production-secret" 49 | 50 | osd2f -m Development -db "sqlite://:memory:" -vv 51 | 52 | ``` 53 | 54 | Note that changing 
the environment variable in a cloud environment might require restarting the service. 55 | 56 | ## See Also 57 | 58 | - Testing local researcher pages with [basic auth](/docs/basic_authentication.md) -------------------------------------------------------------------------------- /docs/protecting_downloads.md: -------------------------------------------------------------------------------- 1 | # Protecting downloads with passwords 2 | 3 | To 'nudge' researchers to be careful with respondent data, it is possible 4 | to set a data-password. This will change researcher downloads from `.json` 5 | or `.csv` files to zipped versions of these files protected with the 6 | specified password. 7 | 8 | Things to note: 9 | 10 | 1. The zipfiles use AES encryption, which is stronger, but not supported by default on 11 | many operating systems. Use OS specific software that supports this encryption, for example: 12 | * Linux: [PeaZip](https://peazip.github.io/) 13 | * Mac OS X: [The Unarchiver](https://theunarchiver.com/) 14 | * Windows: [7zip](https://www.7-zip.org/) 15 | 16 | 2. Long passwords help create better protected files, but never consider password protected 17 | zipfiles to be 'unbreakable'. They protect only against layman users, not motivated attackers. 18 | 19 | 3. You can use a secret-manager to avoid putting the password directly into CLI arguments or 20 | environment variables. 21 | 22 | ## How to enable password protected downloads: 23 | 24 | 1. Using environment variables: 25 | ```bash 26 | # enable access to the researcher interface by 27 | # setting basic authentication 28 | export OSD2F_BASIC_AUTH="admin;testpassword" 29 | 30 | # set the password 31 | export OSD2F_DATA_PASSWORD=<your-password> 32 | 33 | # start the server 34 | osd2f -m Development -vvv 35 | ``` 36 | 37 | 2. 
Using a CLI command 38 | ```bash 39 | # enable access to the researcher interface by 40 | # setting basic authentication 41 | export OSD2F_BASIC_AUTH="admin;testpassword" 42 | 43 | osd2f --download-password <your-password> -m Development 44 | ``` -------------------------------------------------------------------------------- /docs/stresstests.md: -------------------------------------------------------------------------------- 1 | # Running a stress test 2 | 3 | ## Requirements 4 | 5 | - `osd2f` is installed 6 | - `requirements_dev.txt` dependencies are installed 7 | 8 | ## In short 9 | 10 | Stress tests help you pinpoint the amount of traffic your server is able to 11 | handle. OSD2F provides a script for the popular Python load-test library 12 | [locust](https://locust.io/). 13 | 14 | The files submitted are generated using the mock data generating scripts. 15 | 16 | ## How to run a stress test 17 | 18 | To run a stresstest, you require a running instance of OSD2F, either locally 19 | or on a reachable address. You can run the script from CLI (no interface) 20 | using: 21 | 22 | ```bash 23 | locust \ 24 | --host http://localhost:5000 \ 25 | -f scripts/locust_stress_testing.py \ 26 | --headless \ 27 | --users 100 \ 28 | -t 60sec 29 | ``` 30 | 31 | where: 32 | - `host` is the location of the server you want to stresstest 33 | - `-f` points to the stress test file 34 | - `headless` means no locust web interface is started 35 | - `users` is the amount of concurrent users simulated. The script 36 | assumes each user will send 20 logs for each 1 call to anonymization 37 | and 1 call to submissions. For details, review the stresstest script. 38 | 39 | ## important notes 40 | 41 | - The data-sizes and ratio of logs/anonymization/submission calls should 42 | be based on empirical observations in your sample. Current numbers may 43 | not reflect those for your population or use-case. 
-------------------------------------------------------------------------------- /docs/using_entry_encryption.md: -------------------------------------------------------------------------------- 1 | # Entry encryption 2 | 3 | ## What is it? 4 | 5 | The collected data is stored in a database. This database should obviously be encrypted, which 6 | is a standard feature cloud platforms provide (see [aws](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Overview.Encryption.html), [azure](https://docs.microsoft.com/en-us/azure/azure-sql/database/security-overview), [gcp](https://cloud.google.com/sql/faq#encryption) documentation for more details). 7 | 8 | Entry encryption is a feature of OSD2F that works as an additional layer of protection. Donations are 9 | stored in a per-row fashion, having a couple of metadata columns (including the `submission id`, `timestamp` and `filename`) apart from a 'blob' field that contains the actual donation called the `entry`. Entry encryption takes this potentially sensitive field and encrypts it before storing it in the database. By doing so, even if someone would get access to the database, the content of donations would be unusable. 10 | 11 | ## How to enable entry encryption 12 | 13 | You can enable entry encryption by providing a passphrase either through the commandline or as an environment variable. 14 | 15 | ```bash 16 | OSD2F_SECRET=secret \ 17 | OSD2F_BASIC_AUTH="user;pass" \ 18 | OSD2F_ENTRY_SECRET=TESTSECRET \ 19 | osd2f -db sqlite://encryption.db -m Development 20 | ``` 21 | 22 | Entries will now be encrypted before storage and decrypted before download (so researchers do not notice the difference). You can see the encryption by uploading some data and checking your database. 23 | 24 | ### Disabling decryption on download & local decryption 25 | 26 | In some use-cases, you might want to keep entries encrypted for downloads. This means the files downloaded by a researcher only contain readable metadata. 
27 | 28 | You can do so by providing a cli flag or environment variable: 29 | 30 | ```bash 31 | OSD2F_SECRET=secret \ 32 | OSD2F_BASIC_AUTH="user;pass" \ 33 | OSD2F_ENTRY_SECRET=TESTSECRET \ 34 | OSD2F_ENTRY_DECRYPT_DISABLE=True \ 35 | osd2f -db sqlite://encryption.db -m Development 36 | ``` 37 | 38 | If you go to the researcher page and download the `json` file of submissions, they will look something like this: 39 | 40 | >{"db_id": 1, "submission_id": "test", "filename": "ads_clicked.json", "n_deleted_across_file": 0, "insert_timestamp": "2021-10-22T12:46:47.544248+00:00", "entry": {"encrypted": "gAAAAABhcrK3qYMvBJeTyQWm-d_mKABeiNsRP49-UTaRphxjecNtJDuidYeCNZ-pWUPTRRpfdIh_48iVEqC5QawHBjnp1iw11nAOlCUR4M9nkqbkn-BATurrGJ8OV7zxbdcU6sgzeGAW2Ntgky5o0e4ozV-o66t1AmF2Kp5bc4xa--UcejOBMZjyoItNI-fD12WJxRlUpK_kkSMkZsixjLtUS0ADzonjLw=="}}, {"db_id": 2, "submission_id": "test", "filename": "ads_clicked.json", "n_deleted_across_file": 0, "insert_timestamp": "2021-10-22T12:46:47.544498+00:00", "entry": {"encrypted": "gAAAAABhcrK... 41 | 42 | Researchers can locally decrypt the entries if they have OSD2F installed. 
They can do so by running: 43 | 44 | ```bash 45 | osd2f-decrypt-submissions osd2f_completed_submissions.json decrypted_submissions.json TESTSECRET 46 | ``` 47 | 48 | The `decrypted_submissions.json` file should look something like this: 49 | 50 | > [{"db_id": 1, "submission_id": "test", "filename": "ads_clicked.json", "n_deleted_across_file": 0, "insert_timestamp": "2021-10-22T12:46:47.544248+00:00", "entry": {"activity": "click", "ad_title": "Organic global Graphical User Interface", "timestamp": 1628971624}}, {"db_id": 2, "submission_id": "test", "filename": "ads_clicked.json", "n_deleted_across_file": 0, "insert_timestamp": "2021-10-22T12:46:47.544498+00:00", "entry": {"activity": "expand", "ad_title": "Upgradable scalable throughput", "timestamp": 1589681049}}, {"db_id": 3, "submission_id": "test", "filename": "ads_clicked.json", "n_deleted_across_file": 0, "insert_timestamp": "2021-10-22T12:46:47.544694+00:00", "entry": {"activity": "watch", "ad_title": "Organized asynchronous challenge", "timestamp": 1625135602}}, .... 51 | 52 | ## Notes 53 | 54 | The entry encryption secret value supports the same secret store functionality as other setting fields, see [using secret stores](./using_secret_stores.md) 55 | -------------------------------------------------------------------------------- /docs/using_secret_stores.md: -------------------------------------------------------------------------------- 1 | # Keeping configuration information secret 2 | 3 | Parts of the configuration of an OSD2F deployment are sensitive. Knowing`OSD2F_SECRET` means 4 | you can impersonate other users. The `OSD2F_DB_URL` can include the username & password 5 | for the database. 6 | 7 | Secret information should *never* be part of your repository. Generally, OSD2F accepts 8 | sensitive information via environment variables, which allows your deployment environment 9 | to implement secret management. 
However, in some situations it is more convenient to 10 | leverage a secret store, or 'keyvault' directly from the application. This document 11 | lists supported keystore solutions. 12 | 13 | ## General usage 14 | 15 | When the OSD2F application is started, it looks through the environment variables 16 | and changes variables with known prefixes. It will substitute the environment 17 | variables with the corresponding keystore values in-memory. 18 | 19 | By using the appropriate prefix-format, any environment variable value can be 20 | retrieved on runtime from a secret store. 21 | 22 | 23 | ## Azure keyvault 24 | 25 | OSD2F supports the Azure Keyvault solution provided by microsoft. It relies on contextual 26 | authentication through the default credentials in the environment. Azure keyvault references 27 | should follow the format: 28 | 29 | > azure-keyvault::your-keyvault-location::name-of-key 30 | 31 | For example, if the keyvault is called `osd2f-test`, it should have a location such as 32 | `https://osd2f-test.vault.azure.net/`. We store a database URL with the key name `OSD2F-DB-URL` (azure doesn't accept underscores in key names) and the value `sqlite://keyvault-test`. To use this key (locally), make sure the right credentials are set (e.g. `az login` to the appropriate subscription). Then start OSD2F: 33 | 34 | ```bash 35 | # we use the normal env variable, but the value is the azure-keyvault specification 36 | # instead of the 'real' value we want to use. 37 | export OSD2F_DB_URL='azure-keyvault::https://osd2f-test.vault.azure.net/::OSD2F-DB-URL' 38 | osd2f -m Development 39 | ``` 40 | 41 | Observe that the application makes the expected `keyvault-test` sqlite database file. 42 | 43 | ### Requirements when deploying 44 | 45 | If you are deploying a OSD2F app, most likely to Azure, make sure the webapp has the `secret` `Get` and `Key` `Get` permissions. 
You can add these via the KeyVault Access policies or by issuing the command: 46 | 47 | ```bash 48 | export WEBAPP_ID="your webapp PRINCIPLE ID" 49 | export KEYVAULT_NAME="your keyvault name" 50 | 51 | az keyvault set-policy \ 52 | --name $KEYVAULT_NAME \ 53 | --object-id $WEBAPP_ID \ 54 | --secret-permissions get \ 55 | --key-permissions get 56 | ``` 57 | Note that this gives the webapp permission to *all* secrets in this keyvault. We recommend using 58 | separate keyvaults for separate applications or services. -------------------------------------------------------------------------------- /mockdata/sample/README.md: -------------------------------------------------------------------------------- 1 | 2 | # THIS FOLDER CONTAINS MOCK-DATA 3 | 4 | ## Data was generated using [faker](https://faker.readthedocs.io/en/master/) 5 | 6 | ## Any similarity to real-world data is purely due to chance 7 | -------------------------------------------------------------------------------- /mockdata/sample/sample-platform-russellrodney-3.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/mockdata/sample/sample-platform-russellrodney-3.tar -------------------------------------------------------------------------------- /mockdata/sample/sample-platform-russellrodney-3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/mockdata/sample/sample-platform-russellrodney-3.tar.gz -------------------------------------------------------------------------------- /mockdata/sample/sample-platform-russellrodney-3.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/mockdata/sample/sample-platform-russellrodney-3.zip 
-------------------------------------------------------------------------------- /mockdata/sample/sample-platform-russellrodney-3/ads_clicked/ads_clicked.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "ad_title": "Expanded reciprocal matrix", 4 | "activity": "expand", 5 | "timestamp": 1581895542 6 | }, 7 | { 8 | "ad_title": "Operative radical analyzer", 9 | "activity": "click", 10 | "timestamp": 1610175305 11 | }, 12 | { 13 | "ad_title": "Visionary neutral adapter", 14 | "activity": "watch", 15 | "timestamp": 1644500427 16 | }, 17 | { 18 | "ad_title": "Front-line logistical contingency", 19 | "activity": "expand", 20 | "timestamp": 1609186981 21 | }, 22 | { 23 | "ad_title": "Synergized intermediate architecture", 24 | "activity": "expand", 25 | "timestamp": 1599028632 26 | }, 27 | { 28 | "ad_title": "Stand-alone object-oriented policy", 29 | "activity": "watch", 30 | "timestamp": 1638567540 31 | }, 32 | { 33 | "ad_title": "Ergonomic stable architecture", 34 | "activity": "watch", 35 | "timestamp": 1639132978 36 | }, 37 | { 38 | "ad_title": "Exclusive stable initiative", 39 | "activity": "expand", 40 | "timestamp": 1630291094 41 | }, 42 | { 43 | "ad_title": "Implemented local middleware", 44 | "activity": "watch", 45 | "timestamp": 1589384192 46 | }, 47 | { 48 | "ad_title": "Digitized demand-driven support", 49 | "activity": "expand", 50 | "timestamp": 1624937617 51 | } 52 | ] -------------------------------------------------------------------------------- /mockdata/sample/sample-platform-russellrodney-3/companies_followed/companies_followed.json: -------------------------------------------------------------------------------- 1 | { 2 | "companies_followed": [ 3 | { 4 | "company_name": "Kim-Suarez", 5 | "timestamp": 1638967278 6 | }, 7 | { 8 | "company_name": "Mayer-Wallace", 9 | "timestamp": 1593287415 10 | }, 11 | { 12 | "company_name": "Brady, Robinson and Delgado", 13 | "timestamp": 1621384091 14 | }, 15 | 
{ 16 | "company_name": "Rocha, Ortega and Cook", 17 | "timestamp": 1606350772 18 | }, 19 | { 20 | "company_name": "Jensen, Gonzalez and Santos", 21 | "timestamp": 1628666782 22 | }, 23 | { 24 | "company_name": "Bradley-Evans", 25 | "timestamp": 1600858011 26 | }, 27 | { 28 | "company_name": "Mora, Santos and Fischer", 29 | "timestamp": 1616867040 30 | }, 31 | { 32 | "company_name": "Thompson Group", 33 | "timestamp": 1588028299 34 | }, 35 | { 36 | "company_name": "Phillips-Winters", 37 | "timestamp": 1609999846 38 | }, 39 | { 40 | "company_name": "Ellis, Edwards and Rodriguez", 41 | "timestamp": 1584500614 42 | }, 43 | { 44 | "company_name": "Ruiz, Edwards and Chavez", 45 | "timestamp": 1611792874 46 | }, 47 | { 48 | "company_name": "Romero LLC", 49 | "timestamp": 1583736863 50 | }, 51 | { 52 | "company_name": "Hall-Solomon", 53 | "timestamp": 1577894523 54 | }, 55 | { 56 | "company_name": "Gray, Sawyer and Foster", 57 | "timestamp": 1652433971 58 | }, 59 | { 60 | "company_name": "Miller Group", 61 | "timestamp": 1639524701 62 | }, 63 | { 64 | "company_name": "Evans Inc", 65 | "timestamp": 1582404996 66 | }, 67 | { 68 | "company_name": "Anderson LLC", 69 | "timestamp": 1592059402 70 | }, 71 | { 72 | "company_name": "Ramirez, Terry and Hardy", 73 | "timestamp": 1624933766 74 | }, 75 | { 76 | "company_name": "Little-Miller", 77 | "timestamp": 1633597087 78 | }, 79 | { 80 | "company_name": "Sawyer-Rice", 81 | "timestamp": 1637290723 82 | } 83 | ] 84 | } -------------------------------------------------------------------------------- /mockdata/sample/sample-platform-russellrodney-3/engagement/engagement.json: -------------------------------------------------------------------------------- 1 | { 2 | "engagement_info": [ 3 | { 4 | "timestamp": 1603583147, 5 | "engagement_type": "like", 6 | "object": "uuid5840629" 7 | }, 8 | { 9 | "timestamp": 1604131197, 10 | "engagement_type": "click", 11 | "object": "uuid5498207" 12 | }, 13 | { 14 | "timestamp": 1631666723, 15 | 
"engagement_type": "listen", 16 | "object": "uuid4948296" 17 | }, 18 | { 19 | "timestamp": 1634712013, 20 | "engagement_type": "click", 21 | "object": "uuid1253992" 22 | }, 23 | { 24 | "timestamp": 1651598806, 25 | "engagement_type": "like", 26 | "object": "uuid47263" 27 | }, 28 | { 29 | "timestamp": 1614837215, 30 | "engagement_type": "listen", 31 | "object": "uuid7885513" 32 | }, 33 | { 34 | "timestamp": 1613519961, 35 | "engagement_type": "like", 36 | "object": "uuid8145538" 37 | }, 38 | { 39 | "timestamp": 1597582736, 40 | "engagement_type": "listen", 41 | "object": "uuid612408" 42 | }, 43 | { 44 | "timestamp": 1604838142, 45 | "engagement_type": "listen", 46 | "object": "uuid1514152" 47 | }, 48 | { 49 | "timestamp": 1639114959, 50 | "engagement_type": "click", 51 | "object": "uuid6109172" 52 | }, 53 | { 54 | "timestamp": 1628743660, 55 | "engagement_type": "recommend", 56 | "object": "uuid6792778" 57 | }, 58 | { 59 | "timestamp": 1638269231, 60 | "engagement_type": "recommend", 61 | "object": "uuid2059735" 62 | }, 63 | { 64 | "timestamp": 1633797338, 65 | "engagement_type": "share", 66 | "object": "uuid6858714" 67 | }, 68 | { 69 | "timestamp": 1593092623, 70 | "engagement_type": "share", 71 | "object": "uuid671610" 72 | }, 73 | { 74 | "timestamp": 1633545231, 75 | "engagement_type": "click", 76 | "object": "uuid8293052" 77 | }, 78 | { 79 | "timestamp": 1616812065, 80 | "engagement_type": "listen", 81 | "object": "uuid237794" 82 | }, 83 | { 84 | "timestamp": 1599627588, 85 | "engagement_type": "click", 86 | "object": "uuid5748854" 87 | }, 88 | { 89 | "timestamp": 1613363432, 90 | "engagement_type": "like", 91 | "object": "uuid6260369" 92 | }, 93 | { 94 | "timestamp": 1596187832, 95 | "engagement_type": "recommend", 96 | "object": "uuid6008933" 97 | }, 98 | { 99 | "timestamp": 1634398469, 100 | "engagement_type": "like", 101 | "object": "uuid845728" 102 | } 103 | ] 104 | } -------------------------------------------------------------------------------- 
/mockdata/sample/sample-platform-russellrodney-3/profile_interests/profile_interests.json: -------------------------------------------------------------------------------- 1 | { 2 | "profile_interests": [ 3 | "Diverse solution-oriented moderator", 4 | "Stand-alone content-based orchestration", 5 | "Configurable zero tolerance collaboration", 6 | "Adaptive needs-based matrix", 7 | "Future-proofed cohesive migration", 8 | "Phased static projection", 9 | "Optimized disintermediate help-desk", 10 | "Automated web-enabled access", 11 | "Multi-lateral encompassing analyzer", 12 | "Cloned reciprocal instruction set" 13 | ] 14 | } -------------------------------------------------------------------------------- /mockdata/sample/sample-platform-russellrodney-3/short_messages/messages.json: -------------------------------------------------------------------------------- 1 | { 2 | "messages.collection": [ 3 | { 4 | "id": "41235082236765", 5 | "message": "Enter own sure traditional white this. Point dark could gas mention speech. Reveal all laugh son right." 6 | }, 7 | { 8 | "id": "149224703736780", 9 | "message": "Oil there support month away skin hold." 10 | }, 11 | { 12 | "id": "23011685740814", 13 | "message": "Simply behavior watch teacher society staff role run. Avoid major off you ask expert wait." 14 | }, 15 | { 16 | "id": "84976717333574", 17 | "message": "Piece moment young prepare. Possible then ground break her religious guess include. Skill here nothing huge work research note until. Trouble nor thank arm sport study note travel." 18 | }, 19 | { 20 | "id": "14115030275390", 21 | "message": "Fly sell such produce however center. Century relate attorney television former threat movie. Professor book short nice. Father spend call anything receive above." 22 | }, 23 | { 24 | "id": "53559994766831", 25 | "message": "Would against front. Behavior young voice her really community citizen. Guess building yard end color various." 
26 | }, 27 | { 28 | "id": "153916030071148", 29 | "message": "Such none local federal large already. Involve me technology hand environment happy. Enjoy ask point window. Military paper government most." 30 | }, 31 | { 32 | "id": "12523935577497", 33 | "message": "Easy of want usually this give. Language travel much book situation very nor. Anything myself series protect sea upon." 34 | }, 35 | { 36 | "id": "133967398893359", 37 | "message": "Last term natural game prepare give win myself. Develop strategy lay management sister." 38 | }, 39 | { 40 | "id": "146973493929326", 41 | "message": "Speak why clear air happen TV. Peace research property right. Floor blue quality response attack." 42 | }, 43 | { 44 | "id": "48749458458208", 45 | "message": "Cup thus image part by. Pay charge factor glass recent world. Human stop responsibility." 46 | }, 47 | { 48 | "id": "131628757986948", 49 | "message": "Imagine method smile something modern nice price. Top nation teach site wish actually that. Lead four large. Cost human professional next someone try." 50 | }, 51 | { 52 | "id": "74367885815084", 53 | "message": "Human enter indicate whose interest modern. School city this. Idea tough probably behind provide attack wide. Ability front through drop real." 54 | }, 55 | { 56 | "id": "1382079388842", 57 | "message": "Option us meet create staff question. Anything firm great traditional avoid base. Since simple consider true." 58 | }, 59 | { 60 | "id": "33199591687420", 61 | "message": "Either compare certainly return. Summer keep prevent save rather economic. Realize little now control." 62 | }, 63 | { 64 | "id": "7841019827548", 65 | "message": "Laugh water likely able mean. Direction cell ask who. Great for speak industry you choice." 66 | }, 67 | { 68 | "id": "68220694831320", 69 | "message": "Wish environment line author early. Gun edge part where woman church speech. But design test dog personal wall." 
70 | }, 71 | { 72 | "id": "133787513169096", 73 | "message": "Rest require population better near if." 74 | }, 75 | { 76 | "id": "81257448652668", 77 | "message": "Five green example reach sometimes. White scene four thank able. Political quickly chance their own. North never collection professor quite dinner." 78 | }, 79 | { 80 | "id": "795951252521", 81 | "message": "Table lot light red type rate treat training. Write budget government strong between leave." 82 | }, 83 | { 84 | "id": "26294001537257", 85 | "message": "Out leave they heavy top or well style. Heavy current actually school. Approach step somebody capital might recognize husband. Read face ability well." 86 | }, 87 | { 88 | "id": "46799692120291", 89 | "message": "Small attack game reason policy. Beat yes create word." 90 | }, 91 | { 92 | "id": "39593755964345", 93 | "message": "Although someone eat room instead southern available just. National within only exist bit for relationship." 94 | }, 95 | { 96 | "id": "60905984795612", 97 | "message": "Include process street real." 98 | }, 99 | { 100 | "id": "16454225837983", 101 | "message": "Must world add soon along probably pay. Ok every fish item by. Necessary behavior stay trip." 102 | }, 103 | { 104 | "id": "143242729829108", 105 | "message": "Fast see beat until action back. Standard compare beautiful bed part. Receive term card far debate. Figure music majority cut professional." 106 | }, 107 | { 108 | "id": "65886234678241", 109 | "message": "Represent market deal out. Plan win worker generation painting resource. Natural case space show manager." 110 | }, 111 | { 112 | "id": "146373855137944", 113 | "message": "Modern customer major half whom risk. Outside form occur occur kid may factor activity." 114 | }, 115 | { 116 | "id": "16128138399593", 117 | "message": "Off upon modern single guy. Hospital own role." 118 | }, 119 | { 120 | "id": "103223191449550", 121 | "message": "Happen any message perform scene find. 
Economy he technology bill toward remain feel." 122 | }, 123 | { 124 | "id": "2508562226437", 125 | "message": "Whom civil majority stay subject billion far. Government group mind suggest pick. Life next pass herself campaign whether." 126 | }, 127 | { 128 | "id": "109328233623546", 129 | "message": "Fast close head without building note. Purpose one firm among recent somebody." 130 | }, 131 | { 132 | "id": "124318851756247", 133 | "message": "Car cell however what. Follow skill attention key significant support." 134 | }, 135 | { 136 | "id": "68123734318477", 137 | "message": "Green capital line green know American turn behind. Under especially marriage rock owner simple mission. Politics idea middle pull." 138 | }, 139 | { 140 | "id": "21886972296761", 141 | "message": "Ask sign subject production. Result power thing agreement hope table economy international." 142 | }, 143 | { 144 | "id": "149264325012834", 145 | "message": "Until special building. Language throughout fill goal. Learn would wear side chance poor you south. Theory follow style." 146 | }, 147 | { 148 | "id": "67047309889302", 149 | "message": "Official guy serve room to here reason. Heart film avoid old PM concern when." 150 | }, 151 | { 152 | "id": "113644532048929", 153 | "message": "Store must prove. Even relationship affect information attention visit." 154 | }, 155 | { 156 | "id": "136208093342591", 157 | "message": "Available allow very item strategy beyond." 158 | }, 159 | { 160 | "id": "43277280056510", 161 | "message": "Surface her indicate image house. Animal entire many laugh against order store." 162 | }, 163 | { 164 | "id": "45741893036910", 165 | "message": "Oil evening pay president check. Be whatever maybe exactly management believe deep." 166 | }, 167 | { 168 | "id": "53261547666450", 169 | "message": "Development cultural listen huge. Important poor position mission explain. Parent time manager." 
170 | }, 171 | { 172 | "id": "97998713613797", 173 | "message": "Four near middle new son. Degree culture employee hold college PM account him. Indeed add especially talk front front their. Tonight particularly main defense tough skill present." 174 | }, 175 | { 176 | "id": "95880555845646", 177 | "message": "Unit situation themselves smile me purpose. Get his stop return management he democratic service." 178 | }, 179 | { 180 | "id": "37898009817352", 181 | "message": "Physical point week. Court smile thousand later." 182 | }, 183 | { 184 | "id": "64321022696164", 185 | "message": "Goal less industry state. Agree summer change them head bar." 186 | }, 187 | { 188 | "id": "145830631368332", 189 | "message": "Want model away question technology discussion main gas. Somebody staff mention car." 190 | }, 191 | { 192 | "id": "10442859844141", 193 | "message": "Policy bank force law seven sell glass. Ahead memory sure." 194 | }, 195 | { 196 | "id": "161265853275001", 197 | "message": "Forward garden support. Party president then enter professional. Care because job method consumer." 198 | }, 199 | { 200 | "id": "127661691475216", 201 | "message": "Role yourself defense hand reality attorney race statement." 
202 | } 203 | ] 204 | } -------------------------------------------------------------------------------- /osd2f/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/__init__.py -------------------------------------------------------------------------------- /osd2f/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | from .logger import logger 5 | from .server import create_app, start_app 6 | 7 | if mode := os.environ.get("OSD2F_MODE"): 8 | assert mode in ("Development", "Testing", "Production") 9 | else: 10 | logger.critical("`OSD2F_MODE` must be set") 11 | 12 | 13 | if mode and __name__ == "__main__": 14 | app = create_app(mode=mode) 15 | start_app(app) 16 | elif mode: 17 | app = create_app(mode=mode) 18 | -------------------------------------------------------------------------------- /osd2f/anonymizers/__init__.py: -------------------------------------------------------------------------------- 1 | """Anonymizers 2 | 3 | This sub-module contains functions that operate on individual entries 4 | to do some form of anonymization, either by redacting (parts of) strings, 5 | or by omitting entries entirely (e.g. returning None for some entries). 6 | 7 | All anonymization functions should have the (entry, optional_string_param) 8 | signature. 
9 | 10 | Register 11 | 12 | """ 13 | 14 | import re 15 | import typing 16 | 17 | from .sample_platform import redact_text 18 | from ..definitions import Submission, SubmissionList, UploadSettings 19 | from ..logger import logger 20 | 21 | options: typing.Dict[str, typing.Callable[[typing.Dict, str], typing.Awaitable]] = { 22 | redact_text.__name__: redact_text # noqa 23 | } 24 | 25 | 26 | async def apply( 27 | file_entries: typing.List[typing.Dict[str, typing.Any]], 28 | anonymizer: str, 29 | optional_str_param: str = "", 30 | ) -> typing.List[typing.Dict[str, typing.Any]]: 31 | if anonymizer not in options: 32 | logger.warning( 33 | f"Specified anonymizer {anonymizer} not found. " 34 | f"Available anonymizers: {options}." 35 | ) 36 | return [] 37 | 38 | anonymized_entries = [] 39 | for entry in file_entries: 40 | if entry is None: 41 | continue 42 | try: 43 | processed_entry = await options[anonymizer](entry, optional_str_param) 44 | anonymized_entries.append(processed_entry) 45 | except: # noqa 46 | logger.warning( 47 | f"anonymizer `{anonymizer}` threw an error while parsing an entry" 48 | ) 49 | continue 50 | 51 | return anonymized_entries 52 | 53 | 54 | async def anonymize_submission(submission: Submission, settings: UploadSettings): 55 | for filename_pattern, setting in settings.files.items(): 56 | logger.debug(f"matching {filename_pattern} to {submission.filename}") 57 | if not re.search(filename_pattern, submission.filename): 58 | continue 59 | # disregards settings for which no anonymizers are registered 60 | if not setting.anonymizers: 61 | continue 62 | logger.debug(f"Applying {setting.anonymizers} to {submission.filename}") 63 | # apply all anonymizers registered for file pattern 64 | for anonymizer in setting.anonymizers: 65 | function_name, arg = anonymizer.copy().popitem() 66 | logger.debug(f"Applying {function_name} to {submission.filename}") 67 | 68 | submission.entries = await apply( 69 | file_entries=submission.entries, 70 | 
anonymizer=function_name, 71 | optional_str_param=arg, 72 | ) 73 | # only match the first matching setting 74 | break 75 | return submission 76 | 77 | 78 | async def anonymize_submission_list( 79 | submission_list: SubmissionList, settings: UploadSettings 80 | ) -> SubmissionList: 81 | for i, submission in enumerate(submission_list.root): 82 | logger.debug(f"at submission {i}") 83 | await anonymize_submission(submission, settings) 84 | return submission_list 85 | -------------------------------------------------------------------------------- /osd2f/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import json 4 | import logging 5 | 6 | from osd2f import config 7 | 8 | import yaml 9 | 10 | from .config import Testing 11 | from .database import initialize_database, stop_database 12 | from .logger import logger 13 | from .server import create_app, start_app 14 | 15 | LOGFORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 16 | 17 | parser = argparse.ArgumentParser( 18 | prog="OSD2F webserver", usage="Start the webserver and collect data donations." 
19 | ) 20 | 21 | parser.add_argument( 22 | "-m", 23 | "--mode", 24 | action="store", 25 | default="Testing", 26 | help="Specify the mode to run in, defaults to 'Testing'", 27 | choices=[ 28 | d 29 | for d in dir(config) 30 | if not d.startswith("_") and d[0] == d[0].upper() and d != "Config" 31 | ], 32 | ) 33 | parser.add_argument( 34 | "-v", 35 | "--verbose", 36 | action="count", 37 | default=0, 38 | help="Verbosity of logging output, defaults to default=CRITICAL, " 39 | "v=WARNING, vv=INFO, vvv=DEBUG", 40 | ) 41 | 42 | parser.add_argument( 43 | "-db", 44 | "--database-url", 45 | type=str, 46 | help="The database URL to use, overrides the `OSD2F_DB_URL` environment variable.", 47 | ) 48 | 49 | parser.add_argument( 50 | "--secret", 51 | type=str, 52 | help="Overrides `OSD2F_SECRET` environment variable with" 53 | " an application secret for session security.", 54 | ) 55 | 56 | parser.add_argument( 57 | "--download-password", 58 | type=str, 59 | help="Overrides `OSD2F_DATA_PASSWORD` environment variable " 60 | "for researcher download file password protection.", 61 | ) 62 | 63 | parser.add_argument( 64 | "--entry-encryption-secret", 65 | type=str, 66 | help="Overrides `OSD2F_ENTRY_SECRET` environment variable. 
" 67 | "Encryption key for per-entry encryption/decryption for writing/reading " 68 | "from database.", 69 | ) 70 | 71 | parser.add_argument( 72 | "--generate-current-config", 73 | type=str, 74 | help="Path to put an current content configuration YAML file.", 75 | ) 76 | 77 | parser.add_argument( 78 | "-cc", 79 | "--content-configuration", 80 | type=str, 81 | help="A content configuration YAML file", 82 | ) 83 | 84 | parser.add_argument( 85 | "--dry-run", 86 | action="store_true", 87 | help="test whether endpoints provide 200 code responses," 88 | " just to make sure nothing broke.", 89 | ) 90 | 91 | parser.add_argument( 92 | "--entry-decrypt-on-read-disabled", 93 | action="store_true", 94 | default=False, 95 | help="Keep entries downloaded through the researcher interface encrypted. " 96 | "Overrides the `OSD2F_ENTRY_DECRYPT_DISABLE` ENV variable", 97 | ) 98 | 99 | 100 | def parse_and_run(): 101 | args = parser.parse_args() 102 | 103 | if args.verbose == 0: 104 | level = logging.CRITICAL 105 | elif args.verbose == 1: 106 | level = logging.WARNING 107 | elif args.verbose == 2: 108 | level = logging.INFO 109 | elif args.verbose == 3: 110 | level = logging.DEBUG 111 | else: 112 | print("UNKNOWN LOGLEVEL SPECIFIED") 113 | level = logging.NOTSET 114 | 115 | logging.basicConfig(format=LOGFORMAT, level="WARNING") 116 | logger.setLevel(level=level) 117 | 118 | logger.debug( 119 | "If you see this, you are running with debug logging. " 120 | "DO NOT DO THIS IN PRODUCTION." 
121 | ) 122 | 123 | if args.content_configuration: 124 | import osd2f.utils 125 | 126 | osd2f.utils.DISK_CONTENT_CONFIG_PATH = args.content_configuration 127 | 128 | app = create_app( 129 | mode=args.mode, 130 | database_url_override=args.database_url, 131 | entry_secret_override=args.entry_encryption_secret, 132 | entry_decrypt_disable=args.entry_decrypt_on_read_disabled, 133 | ) 134 | if not args.dry_run and not args.generate_current_config: 135 | start_app(app) 136 | 137 | elif args.generate_current_config: 138 | from osd2f.utils import load_content_settings 139 | 140 | asyncio.run(app.startup()) 141 | settings = asyncio.run(load_content_settings(use_cache=False)) 142 | with open(args.generate_current_config, "w") as outputfile: 143 | yaml.dump(settings.model_dump(by_alias=True), outputfile) 144 | asyncio.run(app.shutdown()) 145 | 146 | else: 147 | asyncio.run(initialize_database(Testing.DB_URL)) 148 | tp = app.test_client() 149 | assert asyncio.run(tp.get("/")).status_code == 200 150 | assert asyncio.run(tp.get("/privacy")).status_code == 200 151 | assert asyncio.run(tp.get("/upload")).status_code == 200 152 | assert asyncio.run(tp.get("/static/js/main.js")).status_code == 200 153 | assert asyncio.run(tp.get("/adv_anonymize_file")).status_code == 405 154 | assert ( 155 | asyncio.run( 156 | tp.post( 157 | "/adv_anonymize_file", 158 | data=json.dumps( 159 | { 160 | "filename": "fn", 161 | "submission_id": "sid", 162 | "entries": [{}], 163 | "n_deleted": 0, 164 | } 165 | ), 166 | ) 167 | ).status_code 168 | == 200 169 | ) 170 | asyncio.run(stop_database()) 171 | -------------------------------------------------------------------------------- /osd2f/config.py: -------------------------------------------------------------------------------- 1 | import os as _os 2 | import typing as _typing 3 | 4 | from .security import translate_environment_vars 5 | 6 | translate_environment_vars() # resolve secrets in env variables on import 7 | 8 | 9 | class Config: 10 | DEBUG: 
bool = False 11 | TESTING: bool = False 12 | BIND: str = "127.0.0.1" 13 | PORT: int = 5000 14 | SECRET_KEY: _typing.Optional[str] = None 15 | DATA_PASSWORD: str = _os.environ.get("OSD2F_DATA_PASSWORD", "") 16 | ENTRY_SECRET: str = _os.environ.get("OSD2F_ENTRY_SECRET", "") 17 | ENTRY_DECRYPT_DISABLE: bool = ( 18 | _os.environ.get("OSD2F_ENTRY_DECRYPT_DISABLE", "false").lower() == "true" 19 | ) 20 | DB_URL = "sqlite://:memory:" 21 | 22 | # Allow for BIG submissions 4*16mb for 23 | # in-memory anonymization. 24 | # NOTE: protect POST endpoints with 25 | # xsrf tokens to avoid memory 26 | # based ddos attacks 27 | MAX_CONTENT_LENGTH: int = 16777216 * 4 28 | 29 | SESSION_COOKIE_HTTPONLY = True 30 | SESSION_COOKIE_SAMESITE = "Lax" 31 | 32 | 33 | class Testing(Config): 34 | TESTING = True 35 | 36 | 37 | class Development(Config): 38 | SESSION_COOKIE_SECURE = False 39 | DEBUG = True 40 | DB_URL = _os.environ.get("OSD2F_DB_URL", "sqlite://:memory:") 41 | SECRET_KEY = "do not use in production" 42 | 43 | 44 | class Production(Config): 45 | DEBUG = False 46 | TESTING = False 47 | BIND = "0.0.0.0" 48 | PORT = 8000 49 | SECRET_KEY = _os.environ.get("OSD2F_SECRET") 50 | DB_URL = _os.environ.get("OSD2F_DB_URL", "") 51 | SESSION_COOKIE_SECURE = True # required HTTPS server 52 | 53 | 54 | # hypercorn 55 | bind = "0.0.0.0:8000" 56 | -------------------------------------------------------------------------------- /osd2f/database/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from tortoise import Tortoise 4 | 5 | from .configuration import * # noqa 6 | from .logs import * # noqa 7 | from .submissions import * # noqa 8 | 9 | 10 | async def initialize_database(db_url: str): 11 | await Tortoise.init(db_url=db_url, modules={"models": ["osd2f.database"]}) 12 | await Tortoise.generate_schemas(safe=True) 13 | start_logworker() # noqa 14 | 15 | 16 | async def stop_database(): 17 | await asyncio.sleep(0.1) # to avoid 
start/stop race-conditions during tests 18 | await Tortoise.close_connections() 19 | stop_logworker() # noqa 20 | -------------------------------------------------------------------------------- /osd2f/database/configuration.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | from tortoise import fields 4 | from tortoise.models import Model 5 | 6 | from ..definitions import ContentSettings, UploadSettings 7 | 8 | 9 | class DBConfigurationBlobs(Model): 10 | id = fields.IntField(pk=True) 11 | insert_timestamp = fields.DatetimeField(auto_now_add=True) 12 | insert_user = fields.CharField(index=True, max_length=150, null=False) 13 | config_type = fields.CharField(index=True, max_length=50, null=False) 14 | config_blob = fields.JSONField(null=False) 15 | 16 | class Meta: 17 | table = "osd2f_config" 18 | 19 | 20 | async def get_content_config() -> typing.Optional[DBConfigurationBlobs]: 21 | config_item = ( 22 | await DBConfigurationBlobs.filter(config_type="content") 23 | .order_by("-insert_timestamp") 24 | .first() 25 | ) 26 | return config_item 27 | 28 | 29 | async def set_content_config(user: str, content: ContentSettings): 30 | await DBConfigurationBlobs.create( 31 | insert_user=user, config_type="content", config_blob=content.model_dump_json() 32 | ) 33 | 34 | 35 | async def set_upload_config(user: str, content: UploadSettings): 36 | await DBConfigurationBlobs.create( 37 | insert_user=user, config_type="upload", config_blob=content.model_dump_json() 38 | ) 39 | -------------------------------------------------------------------------------- /osd2f/database/logs.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import queue 3 | import time 4 | import typing 5 | from logging.handlers import QueueHandler 6 | 7 | from tortoise import fields 8 | from tortoise.models import Model 9 | 10 | from ..logger import logger 11 | 12 | clientLogQueue: 
queue.SimpleQueue = queue.SimpleQueue() 13 | 14 | 15 | class DBLog(Model): 16 | id = fields.IntField(pk=True) 17 | insert_timestamp = fields.DatetimeField(auto_now_add=True) 18 | log_level = fields.CharField(index=True, max_length=100, null=False) 19 | log_source = fields.CharField(index=True, max_length=100, null=False) 20 | log_position = fields.CharField(index=True, max_length=5000, null=False) 21 | log_sid = fields.CharField(index=True, max_length=100, null=True) 22 | user_agent_string = fields.CharField(max_length=5000, null=True) 23 | log_entry = fields.JSONField(null=True) 24 | 25 | class Meta: 26 | table = "osd2f_logs" 27 | 28 | 29 | def start_logworker(): 30 | async def logworker(): 31 | stop = False 32 | while 1: 33 | try: 34 | log = clientLogQueue.get_nowait() 35 | if log != "STOP": 36 | try: 37 | if log is not None: 38 | await background_insert_log(**log) 39 | except Exception as e: 40 | print("ERROR INSERTING LOG", e) 41 | else: 42 | stop = True 43 | logger.info("Stopping server logging worker") 44 | 45 | except queue.Empty: 46 | if not stop: 47 | await asyncio.sleep(0.1) 48 | continue 49 | else: 50 | return 51 | 52 | asyncio.get_running_loop().create_task(logworker()) 53 | 54 | 55 | def stop_logworker(): 56 | clientLogQueue.put("STOP", block=True) 57 | time.sleep(0.2) 58 | 59 | 60 | async def background_insert_log( 61 | log_source: str, 62 | log_level: str, 63 | log_position: str, 64 | log_sid: typing.Optional[str] = None, 65 | entry: typing.Optional[typing.Dict] = None, 66 | user_agent_string: typing.Optional[str] = None, 67 | ): 68 | 69 | await DBLog( 70 | log_source=log_source, 71 | log_level=log_level, 72 | log_position=log_position, 73 | log_sid=log_sid, 74 | log_entry=entry, 75 | user_agent_string=user_agent_string, 76 | ).save() 77 | 78 | return 79 | 80 | 81 | async def insert_log( 82 | log_source: str, 83 | log_level: str, 84 | log_position: str, 85 | log_sid: typing.Optional[str] = None, 86 | entry: typing.Optional[typing.Dict] = None, 87 | 
user_agent_string: typing.Optional[str] = None, 88 | ): 89 | clientLogQueue.put( 90 | dict( 91 | log_source=log_source, 92 | log_level=log_level, 93 | log_position=log_position, 94 | log_sid=log_sid, 95 | entry=entry, 96 | user_agent_string=user_agent_string, 97 | ) 98 | ) 99 | 100 | 101 | async def get_activity_logs(): 102 | logs = await DBLog.all() 103 | data = [ 104 | { 105 | "db_id": log.id, 106 | "insert_timestamp": log.insert_timestamp.isoformat(), 107 | "log_level": log.log_level, 108 | "source": log.log_source, 109 | "position": log.log_position, 110 | "submission_id": log.log_sid, 111 | "user-agent-string": log.user_agent_string, 112 | "entry": log.log_entry, 113 | } 114 | for log in logs 115 | ] 116 | return data 117 | 118 | 119 | def add_database_logging() -> queue.SimpleQueue: 120 | """Forward logger statements to the database. 121 | 122 | Uses a QueueHandler and an asyncronous worker to 123 | insert logs from logger.debug/info/warning/critical 124 | to the application database. 
125 | 126 | NOTE: messages over 5000 characters are shortened 127 | """ 128 | 129 | async def async_log_worker(q: queue.SimpleQueue): 130 | while 1: 131 | try: 132 | m = q.get_nowait() 133 | if m == "stop-logging": 134 | break 135 | if m.msg == "stop-logging": 136 | break 137 | if len(m.msg) < 5000: 138 | await insert_log("server", m.levelname, m.msg) 139 | else: 140 | await insert_log("server", m.levelname, m.msg[:4997] + "...") 141 | except queue.Empty: 142 | await asyncio.sleep(0.1) 143 | 144 | logQueue: queue.SimpleQueue = queue.SimpleQueue() 145 | h = QueueHandler(logQueue) 146 | h.setLevel(logger.level) 147 | print(h.level) 148 | logger.addHandler(h) 149 | asyncio.get_running_loop().create_task(async_log_worker(logQueue)) 150 | 151 | return logQueue 152 | -------------------------------------------------------------------------------- /osd2f/database/submissions.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | 3 | from tortoise import Tortoise, fields 4 | from tortoise.models import Model 5 | 6 | from ..definitions import OutputSubmission, Submission, SubmissionList 7 | from ..logger import logger 8 | from ..security.entry_encryption.secure_entry_singleton import SecureEntry 9 | 10 | 11 | class DBSubmission(Model): 12 | id = fields.IntField(pk=True) 13 | submission_id = fields.CharField(index=True, max_length=100) 14 | filename = fields.CharField(index=True, max_length=5000) 15 | n_deleted = fields.IntField() 16 | insert_timestamp = fields.DatetimeField(auto_now_add=True) 17 | update_timestamp = fields.DatetimeField(auto_now=True) 18 | entry: Dict[str, Any] = fields.JSONField() 19 | 20 | class Meta: 21 | table = "submissions" 22 | 23 | 24 | async def insert_submission(submission: Submission): 25 | logger.debug(submission) 26 | for entry in submission.entries: 27 | await DBSubmission.create( 28 | submission_id=submission.submission_id, 29 | filename=submission.filename, 30 | 
entry=SecureEntry.write_entry_field(entry), 31 | n_deleted=submission.n_deleted, 32 | ) 33 | 34 | 35 | async def get_submissions() -> List[OutputSubmission]: 36 | submissions = await DBSubmission.all() 37 | submission_dict: List[OutputSubmission] = [] 38 | 39 | for si in submissions: 40 | entry = SecureEntry.read_entry_field(si.entry) 41 | sub = OutputSubmission.model_validate( 42 | dict( 43 | db_id=si.id, 44 | submission_id=si.submission_id, 45 | filename=si.filename, 46 | n_deleted_across_file=si.n_deleted, 47 | insert_timestamp=si.insert_timestamp.isoformat(), 48 | entry=dict(entry), 49 | ), 50 | ) 51 | submission_dict.append(sub) 52 | 53 | return submission_dict 54 | 55 | 56 | async def insert_submission_list(submissionlist: SubmissionList): 57 | if len(submissionlist.root) < 1: 58 | logger.info("Empty submissionlist") 59 | return 60 | 61 | logger.debug( 62 | f"Inserting {len(submissionlist.root)} files of data for submission " 63 | f"'{submissionlist.root[0].submission_id}'" 64 | ) 65 | 66 | def subgenerator(): 67 | for sub in submissionlist.root: 68 | for entry in sub.entries: 69 | yield DBSubmission( 70 | submission_id=sub.submission_id, 71 | filename=sub.filename, 72 | entry=SecureEntry.write_entry_field(entry), 73 | n_deleted=sub.n_deleted, 74 | ) 75 | 76 | await DBSubmission.bulk_create(objects=subgenerator()) 77 | 78 | 79 | async def count_submissions(): 80 | return await DBSubmission.all().count() 81 | 82 | 83 | async def get_pending_participants(): 84 | conn = Tortoise.get_connection("default") 85 | rs = await conn.execute_query( 86 | """ 87 | WITH completed AS ( 88 | SELECT DISTINCT log_sid FROM osd2f_logs 89 | WHERE 90 | log_SID IS NOT NULL 91 | AND log_position="Received the donation!" 
92 | GROUP BY log_sid 93 | ) 94 | SELECT 95 | osd2f_logs.log_sid AS submission_id, 96 | MIN(insert_timestamp) AS first_seen, 97 | MAX(insert_timestamp) AS last_seen 98 | FROM osd2f_logs 99 | OUTER LEFT JOIN completed ON osd2f_logs.log_sid=completed.log_sid 100 | WHERE submission_id IS NOT NULL 101 | GROUP BY submission_id 102 | ORDER BY last_seen DESC 103 | """ 104 | ) 105 | data = [ 106 | { 107 | "submission_id": r["submission_id"], 108 | "first_seen": r["first_seen"], 109 | "last_seen": r["last_seen"], 110 | } 111 | for r in rs[1] 112 | ] 113 | return data 114 | -------------------------------------------------------------------------------- /osd2f/definitions/__init__.py: -------------------------------------------------------------------------------- 1 | from .content_settings import * # noqa 2 | from .security_settings import * # noqa 3 | from .submissions import * # noqa 4 | -------------------------------------------------------------------------------- /osd2f/definitions/content_settings.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Dict, List, Optional 3 | 4 | from pydantic import BaseModel, ConfigDict, EmailStr 5 | 6 | 7 | class FileSetting(BaseModel): 8 | in_key: Optional[str] = None 9 | accepted_fields: List[str] 10 | anonymizers: Optional[List[Dict[str, str]]] = None 11 | 12 | 13 | class UploadSettings(BaseModel): 14 | files: Dict[str, FileSetting] 15 | 16 | 17 | class BlockTypeEnum(str, Enum): 18 | jumbotron = "jumbotron" 19 | twoblockrow = "two_block_row" 20 | 21 | 22 | class ImagePositionEnum(str, Enum): 23 | right = "right" 24 | left = "left" 25 | 26 | 27 | class ContentButton(BaseModel): 28 | name: str 29 | link: str 30 | label: str 31 | 32 | 33 | class PageTypeEnum(str, Enum): 34 | home = "home" 35 | privacy = "privacy" 36 | donate = "donate" 37 | 38 | 39 | class CirclesRowCircle(BaseModel): 40 | image: str 41 | title: Optional[str] = None 42 | subtitle: 
class MSALConfiguration(BaseModel):
    """Settings for Microsoft Authentication Library (MSAL) based login.

    Parsed from the JSON value of the `MSAL_CONFIG` environment variable
    (see `microsoft_msal_authentication`).
    """

    tenant_id: str
    client_id: str
    secret: str
    # semicolon-separated list of account names allowed to log in
    allowed_users: str
    redirect_url: Optional[str]

    # always derived from `tenant_id` below; any configured value is ignored
    authority: Optional[str] = None
    scope: List[str] = ["User.Read"]

    @field_validator("authority", mode="before", check_fields=True)
    @classmethod
    def set_authority(cls, v, info):
        # pydantic v2 passes previously-validated fields via `info.data`
        # (the v1-style `values=` keyword argument is no longer supported
        # by `field_validator`, so the old signature broke validation).
        return f"https://login.microsoftonline.com/{info.data['tenant_id']}"
// folderScanner handles folder uploads: it walks a (possibly nested)
// directory entry and collects every file entry it encounters into
// the `files` accumulator passed by the caller.
const folderScanner = function (entry, files) {
  if (!entry.isDirectory) {
    files.push(entry)
    return
  }
  const reader = entry.createReader()
  reader.readEntries(children => {
    for (const child of children) {
      folderScanner(child, files)
    }
  })
}
// countFileTypes takes a list of filenames and returns a frequency
// table of their lowercased extensions, e.g. { json: 2, txt: 1 }.
// A name without a dot is counted under the whole lowercased name
// (matching String.split behavior).
function countFileTypes(arr) {
  // null prototype so donated filenames such as "x.constructor" cannot
  // collide with properties inherited from Object.prototype
  const counts = Object.create(null)
  for (const name of arr) {
    const ext = name.split('.').pop().toLowerCase()
    counts[ext] = (counts[ext] || 0) + 1
  }
  return counts
}
files that do not match any RegEx 94 | Object.keys(setmatch).map(k => { 95 | if (k === 'undefined') { 96 | delete setmatch[k] 97 | } 98 | }) 99 | 100 | let acceptedFiles 101 | acceptedFiles = files.filter(f => setmatch[f.name] !== undefined) 102 | 103 | // log the count of selected files, the count of files 104 | // matching the whitelist and a frequency table of the 105 | // filetypes selected. 106 | server.log("INFO", "files selected", sid, 107 | { 108 | "selected": files.length, 109 | "matching_whitelist": acceptedFiles.length, 110 | "types": countFileTypes(files.map(f => f.name)) 111 | }) 112 | 113 | if (files.length > 0 && acceptedFiles.length == 0) { 114 | document.getElementById("empty_selection").classList.remove("d-none") 115 | server.log("ERROR", "empty selection", sid) 116 | } 117 | 118 | let data = [] 119 | 120 | let bar = document.getElementById('progress-bar') 121 | bar.value = 0 122 | let f 123 | for (f of acceptedFiles) { 124 | let content 125 | // normal files 126 | if (f.text != null) { 127 | content = await f.text() 128 | } else { 129 | let extractedFile = await f.extract() 130 | content = await extractedFile.text() 131 | } 132 | let fileob 133 | fileob = new Object() 134 | fileob['filename'] = f.name 135 | fileob['submission_id'] = sid 136 | fileob['n_deleted'] = 0 137 | try { 138 | server.log('INFO', 'file parsing', window.sid, { 139 | file_match: setmatch[f.name] 140 | }) 141 | fileob['entries'] = fileReader( 142 | settings['files'][setmatch[f.name]].accepted_fields, 143 | ParseJSON(content), // custom to support malformed 144 | null, 145 | settings['files'][setmatch[f.name]].in_key 146 | ) 147 | server.log('INFO', 'reparsing file to UTF8') 148 | try { 149 | fileob = reparseAsUTF8(fileob) 150 | } catch { 151 | server.log("INFO", "file could not be reparsed, might be UTF16 already", window.sid) 152 | } 153 | 154 | server.log('INFO', 'file send to anonymization', sid, { 155 | file_match: setmatch[f.name] 156 | }) 157 | fileob = await 
apply_adv_anonymization(fileob) 158 | server.log('INFO', 'file anonymized', sid, { 159 | file_match: setmatch[f.name] 160 | }) 161 | data.push(fileob) 162 | } catch (e) { 163 | console.log(e) 164 | server.log('ERROR', 'file matched, but is not JSON', sid) 165 | console.log("Unable to parse file because it's not real JSON") 166 | } 167 | 168 | // update the loading 169 | let pos 170 | pos = (data.length / acceptedFiles.length) * 100 171 | 172 | if (pos !== bar.value) { 173 | bar.value = pos 174 | } 175 | } 176 | 177 | // filter failed files 178 | data = data.filter(x => x) 179 | 180 | // show users that processing has completed 181 | bar.value = 100 182 | document.getElementById('processing').classList.add('invisible') 183 | 184 | server.log('INFO', 'starting visualization', sid) 185 | visualize(data, content) 186 | } 187 | 188 | // fileSelectHandler is used to detect files uploaded through 189 | // the file select prompt. 190 | export async function fileSelectHandler(e) { 191 | server.log('INFO', 'file select detected', sid) 192 | var filesSelected = e.target.files 193 | if (filesSelected === undefined) { 194 | server.log('INFO', 'file select empty', sid) 195 | return // no files selected yet 196 | } 197 | 198 | // if there is one file, which is an archive 199 | if (RegExp('.*.zip$').exec(filesSelected[0].name) != null) { 200 | server.log('INFO', 'file select is archive', sid) 201 | 202 | let archiveContent = await Archive.open(filesSelected[0]) 203 | let contentList = await archiveContent.getFilesArray() 204 | let fl = contentList.map(c => c.file) 205 | 206 | fileLoadController(sid, settings, fl) 207 | } else { 208 | server.log('INFO', 'file select is single file', sid) 209 | 210 | fileLoadController(sid, settings, Array(filesSelected[0])) 211 | } 212 | } 213 | document.getElementById('fileElem').onchange = fileSelectHandler 214 | 215 | // fileDropHandler is used to detect files uploaded using 216 | // the drag-and-drop interface. 
// fileDropHandler is used to detect files uploaded using
// the drag-and-drop interface. A single dropped .zip archive is
// unpacked; anything else is forwarded as-is.
async function fileDropHandler(e) {
  server.log('INFO', 'file drop detected', sid)

  const filesSelected = await getFilesFromDataTransferItems(e.dataTransfer.items)

  // if there is one file, which is an archive.
  // NOTE: anchors on a literal ".zip" suffix; the previous pattern
  // '.*.zip$' treated the dot as a wildcard and also matched names
  // like "myzip".
  if (
    filesSelected.length === 1 &&
    /\.zip$/.exec(filesSelected[0].name) != null
  ) {
    server.log('INFO', 'file drop is archive', sid)

    const archiveContent = await Archive.open(filesSelected[0])
    const contentList = await archiveContent.getFilesArray()
    const fl = contentList.map(c => c.file)

    fileLoadController(sid, settings, fl)
  } else {
    server.log('INFO', 'file drop is file(s)', sid)

    fileLoadController(sid, settings, filesSelected)
  }
}
// parseTwitterJSON parses malformed JSON delivered by Twitter
// (a javascript assignment: `window.YTD.key = [...]`).
// It's actually javascript, but we deem an `eval` call too
// insecure (it would allow for arbitrary code injection), so the
// assignment is rewritten into a proper JSON object instead.
const parseTwitterJSON = function (text_content) {
  // assume it's the first, global, key that is malformed: everything
  // before the first '=' is the key, the rest is the body (later '='
  // characters, e.g. inside string values, are re-joined untouched).
  // `const` declarations fix the previous implicit globals.
  const chunks = text_content.split("=")
  const main_key = chunks.shift()
  const body = chunks.join('=')

  // build it as proper JSON
  const fixed_content = '{ "' + main_key.trim() + '" :' + body + '}'

  // NOTE(review): the previous version called
  // fixed_content.replace('\\', '\\\\') and discarded the result, so
  // backslashes were never actually re-escaped; that no-op is removed.
  // Applying the escape would change currently-accepted input (e.g.
  // literal "\n" sequences), so behavior is kept exactly as before.
  return JSON.parse(fixed_content)
}
// for use áfter local filtering of fields
// allows for advanced server-side anonymization.
// Posts the file object to the server and returns the anonymized
// version (the server wraps the result in a `data` envelope).
// Throws when the request or JSON decoding fails, so callers' existing
// try/catch handles it — previously a failed fetch was swallowed and
// the function crashed with an opaque TypeError on `undefined.data`.
export async function apply_adv_anonymization (fileobj) {
  let response
  try {
    response = await fetch('/adv_anonymize_file', {
      method: 'POST',
      mode: 'same-origin',
      credentials: 'same-origin',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(fileobj)
    })
  } catch (err) {
    // surface the failure instead of returning undefined
    console.log(err)
    throw err
  }
  const parsed = await response.json()
  return parsed.data
}
+ new URLSearchParams(params), { 42 | method: 'GET', 43 | mode: 'same-origin', 44 | credentials: 'same-origin' 45 | }) 46 | .then(r => {}) 47 | .catch(e => { 48 | console.log('Unable to log', level, position, 'due to', e) 49 | }) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /osd2f/javascript/tests/fileparsing.test.js: -------------------------------------------------------------------------------- 1 | const { fileReader } = require("../parsing/fileparser") 2 | 3 | test("test array of objects file", () => { 4 | json_content = [ 5 | { 6 | "key": "value" 7 | }, 8 | { 9 | "key": "value2" 10 | } 11 | ] 12 | 13 | spec = { 14 | fields: ["key"] 15 | } 16 | 17 | parsed = fileReader(spec.fields, json_content) 18 | 19 | expect(parsed[0].key).toBe("value") 20 | expect(parsed[1].key).toBe("value2") 21 | 22 | }) 23 | 24 | test("test array of objects nested in key", () => { 25 | json_content = { 26 | "main_key": [ 27 | { "name": "obj1" }, 28 | { "name": "obj2" } 29 | ] 30 | } 31 | 32 | 33 | spec = { 34 | in_key: "main_key", 35 | fields: ["name"] 36 | } 37 | 38 | parsed = fileReader(spec.fields, json_content, undefined, spec.in_key) 39 | 40 | expect(parsed[0].name).toBe("obj1") 41 | expect(parsed[1].name).toBe("obj2") 42 | 43 | }) 44 | 45 | test("test object with array of values file", () => { 46 | json_content = { 47 | "main_key": [ 48 | "value1", 49 | "value2", 50 | "value3" 51 | ] 52 | } 53 | 54 | spec = { 55 | in_key: "main_key" 56 | } 57 | 58 | parsed = fileReader([], json_content, undefined, spec.in_key) 59 | 60 | expect(parsed[0].index).toBe(0) 61 | expect(parsed[0].value).toBe("value1") 62 | 63 | expect(parsed[1].index).toBe(1) 64 | expect(parsed[1].value).toBe("value2") 65 | 66 | expect(parsed[2].index).toBe(2) 67 | expect(parsed[2].value).toBe("value3") 68 | 69 | }) 70 | 71 | test("test file with '.' 
in main_key", () => { 72 | json_content = { 73 | "main.key": [ 74 | { name: "obj1" }, 75 | { name: "obj2" } 76 | ] 77 | } 78 | 79 | spec = { 80 | in_key: "main.key", 81 | fields: ["name"] 82 | } 83 | 84 | parsed = fileReader(spec.fields, json_content, undefined, spec.in_key) 85 | 86 | expect(parsed[0].name).toBe("obj1") 87 | expect(parsed[1].name).toBe("obj2") 88 | 89 | }) 90 | 91 | test("test file with array of values obj", () => { 92 | json_content = [ 93 | { 94 | "keywords": ["keyword A", "keyword B"] 95 | } 96 | ] 97 | 98 | spec = { 99 | fields: ["keywords"] 100 | } 101 | 102 | parsed = fileReader(spec.fields, json_content) 103 | 104 | expect(parsed[0].keywords.length).toBe(2) 105 | expect(parsed[0].keywords[0]).toBe("keyword A") 106 | expect(parsed[0].keywords[1]).toBe("keyword B") 107 | }) 108 | 109 | test("test file with heavily nested values", () => { 110 | json_content = { "window.YTD.stuff": [{ "one": { "two": { "three": [{ "nested": "obj" }] } } }, { "one": { "two": { "three": [{ "nested": "obj_two" }] } } }] } 111 | 112 | spec = { 113 | in_key: "window.YTD.stuff", 114 | fields: ["one.two.three"] 115 | } 116 | 117 | parsed = fileReader(spec.fields, json_content, undefined, spec.in_key) 118 | 119 | expect(parsed[0]["one.two.three"][0].nested).toBe("obj") 120 | expect(Array.isArray(parsed)).toBe(true) 121 | }) 122 | 123 | test("fields that result in object value", () => { 124 | json_content = { "window.YTD.stuff": [{ "one": { "two": { "three": [{ "nested": "obj" }] } } }, { "one": { "two": { "three": [{ "nested": "obj_two" }] } } }] } 125 | 126 | spec = { 127 | in_key: "window.YTD.stuff", 128 | fields: ["one.two"] 129 | } 130 | 131 | parsed = fileReader(spec.fields, json_content, undefined, spec.in_key) 132 | 133 | console.log(parsed) 134 | expect(Array.isArray(parsed[0]["one.two"].three)).toBe(true) 135 | }) -------------------------------------------------------------------------------- /osd2f/javascript/tests/jsonparsing.test.js: 
-------------------------------------------------------------------------------- 1 | const { ParseJSON } = require("../parsing/jsonparsing") 2 | 3 | test("test regular JSON obj", () => { 4 | text_content = '{"content": [1, 2, 3]}' 5 | content = ParseJSON(text_content) 6 | 7 | expect(content.content.length).toBe(3) 8 | expect(content.content[0]).toBe(1) 9 | 10 | }) 11 | 12 | test("test regular JSON array", () => { 13 | text_content = '[1,2,3]' 14 | content = ParseJSON(text_content) 15 | 16 | expect(content.length).toBe(3) 17 | expect(content[0]).toBe(1) 18 | }) 19 | 20 | test("bad (twitter) JSON", () => { 21 | text_content = 'content = [ { "key" : "value" } ]' 22 | content = ParseJSON(text_content) 23 | 24 | expect(content.content[0].key).toBe("value") 25 | }) 26 | 27 | test("twitter data with unescaped '\'", () => { 28 | text_content = String.raw`content = [ { "text_with_slashes" : "new \n line \n!"}]` 29 | content = ParseJSON(text_content) 30 | 31 | expect(content.content[0].text_with_slashes).toBe("new \n line \n!") 32 | 33 | }) -------------------------------------------------------------------------------- /osd2f/javascript/tests/objectparsing.test.js: -------------------------------------------------------------------------------- 1 | objparsing = require("../parsing/objparsing") 2 | 3 | 4 | test("Parsing simple data", () => { 5 | simple_data = { 6 | "key": 1, 7 | "nested": 8 | { 9 | "key": 2 10 | }, 11 | "nested_obj": { 12 | "sub1": 1, 13 | "sub2_ignored": 2 14 | }, 15 | "nested_array": [ 16 | { 17 | "array_obj": 3, 18 | "array_obj_ignored": 3 19 | } 20 | ] 21 | } 22 | 23 | simple_spec = { 24 | fields: [ 25 | "key", 26 | "nested.key", 27 | "nested_obj.sub1", 28 | "nested_array.array_obj", 29 | "nonexisting_field" 30 | ] 31 | } 32 | 33 | // do the parsing 34 | r = objparsing.objReader(simple_spec.fields, simple_data) 35 | 36 | // check whether specified and existing fields are recoverd 37 | expect(r.key).toBe(1) 38 | expect(r["nested.key"]).toBe(2) 39 | 
expect(r["nested_obj.sub1"]).toBe(1) 40 | expect(r.nested_array[0].array_obj).toBe(3) 41 | 42 | // check whether specified but missing fields are ignored 43 | expect(r.nonexisting_field).toBe(undefined) 44 | 45 | // check whether ignored files are indeed ignored 46 | expect(r["nested_obj.sub2_ignored"]).toBe(undefined) 47 | expect(r.nested_array[0].array_obj_ignored).toBe(undefined) 48 | 49 | }) 50 | 51 | test("Empty nested array which is a parent key should not show up as it's own witelisted field", () => { 52 | data = { 53 | key: [] 54 | } 55 | 56 | spec = { 57 | fields: ["key.subfield"] 58 | } 59 | 60 | r = objparsing.objReader(spec.fields, data) 61 | 62 | expect(r.key).toBe(undefined) 63 | 64 | }) -------------------------------------------------------------------------------- /osd2f/javascript/visualization_components/consentConfirmation.vue: -------------------------------------------------------------------------------- 1 | 25 | -------------------------------------------------------------------------------- /osd2f/javascript/visualization_components/donationContainer.vue: -------------------------------------------------------------------------------- 1 | 47 | 48 | -------------------------------------------------------------------------------- /osd2f/javascript/visualization_components/donationTable.vue: -------------------------------------------------------------------------------- 1 | 58 | 59 | -------------------------------------------------------------------------------- /osd2f/javascript/visualize.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | import Vue from 'vue' 4 | import { BootstrapVue, IconsPlugin } from 'bootstrap-vue' 5 | import donationTable from './visualization_components/donationTable' 6 | import donationContainer from './visualization_components/donationContainer' 7 | 8 | // Import Bootstrap an BootstrapVue CSS files (order is important) 9 | import 
// Vue.use installs ONE plugin per call; the previous single call
// `Vue.use(BootstrapVue, IconsPlugin)` passed IconsPlugin as the
// *options* argument of BootstrapVue, so the icon components were
// never actually registered.
Vue.use(BootstrapVue)
Vue.use(IconsPlugin)

// Root Vue instance backing the donation preview / visualization.
const app = new Vue({
  el: '#visualization',
  components: {
    'donation-table': donationTable,
    'donation-container': donationContainer
  },
  data: {
    filedata: {},
    fields: [],
    donations: [],
    content: {}
  }
})

// visualize feeds the parsed donations (d) and the upload-page
// content settings (c) into the Vue app.
export function visualize (d, c) {
  app.content = c.upload_page
  app.donations = d
}
4 | """ 5 | 6 | import os 7 | from functools import wraps 8 | 9 | # Wrapper implementations for Authentication 10 | from .authorization.basic_auth import basic_authentication 11 | from .authorization.microsoft_msal import microsoft_msal_authentication 12 | from .authorization.not_confgured import no_authentication 13 | from .download_encryption.encrypted_zipfile import string_to_zipfile # noqa 14 | from .secrets import azure_keyvault # Environment secret resolvers 15 | from ..logger import logger # Global module logger 16 | 17 | RESOLVERS = {azure_keyvault.PREFIX: azure_keyvault.azure_keyvault_replace} 18 | 19 | 20 | def authorization_required(func): 21 | """A decorator that implements authorization depending on configuration""" 22 | 23 | @wraps(func) 24 | async def decorated_path(*args, **kwargs): 25 | if os.environ.get("MSAL_CONFIG"): 26 | logger.info("Using MSAL authentication") 27 | return await microsoft_msal_authentication(func, *args, **kwargs) 28 | if os.environ.get("OSD2F_BASIC_AUTH"): 29 | logger.info("Using basic auth, NOT RECOMMENDED FOR PRODUCTION") 30 | return await basic_authentication(func, *args, **kwargs) 31 | else: 32 | logger.info("Fall back to no authentication") 33 | return await no_authentication(func, *args, **kwargs) 34 | 35 | return decorated_path 36 | 37 | 38 | def translate_environment_vars(): 39 | """Translate environment variable values to their secrets. 40 | 41 | Assumes environment variables matching a pattern: 42 | `SECRETSTORE_PREFIX::DELIMITED::ARGUMENTS` 43 | 44 | should be translated by their respective resolver functions. 45 | 46 | """ 47 | # iterate through environment variables, re-assign if they match a resolver 48 | # prefix. 49 | for var, value in os.environ.items(): 50 | for prefix, func in RESOLVERS.items(): 51 | if value.startswith(prefix): 52 | os.environ[var] = func(value) 53 | 54 | 55 | def translate_value(value: str) -> str: 56 | """Translate a given value to the appropriate secret. 
async def basic_authentication(func, *args, **kwargs):
    """Guard `func` behind HTTP basic auth (NOT recommended for production).

    Credentials come from the `OSD2F_BASIC_AUTH` environment variable as
    `user;password`. Authenticated users are remembered in the session;
    unauthenticated requests are bounced through `/login`, which triggers
    the browser's basic-auth prompt.
    """
    import secrets  # local import: constant-time credential comparison

    # with active authorized session, just serve the wrapped endpoint
    if session.get(USER_FIELD):
        await insert_log(
            "server",
            "INFO",
            "download access by authorized user",
            entry={USER_FIELD: session.get(USER_FIELD), "path": request.url},
            user_agent_string=request.headers.get("User-Agent"),
        )
        return await func(*args, **kwargs)

    # remember where the user wanted to go, then force them through /login
    if not request.path.endswith("/login"):
        session["CALLBACK"] = request.url
        return redirect("/login")

    # split on the FIRST ';' only, so passwords may contain ';'
    # (the previous bare split() raised ValueError in that case)
    user, _, passw = os.environ["OSD2F_BASIC_AUTH"].partition(";")

    ra = request.authorization

    if not ra:
        # no credentials yet: ask the browser to prompt for them
        return Response(
            "", status=401, headers={"WWW-Authenticate": "Basic realm='data-donation'"}
        )

    # constant-time comparison avoids leaking credential prefixes via timing
    authenticated = (
        ra.type == "basic"
        and secrets.compare_digest(ra.username or "", user)
        and secrets.compare_digest(ra.password or "", passw)
    )

    redirect_target = session.pop("CALLBACK", "/")

    if authenticated:
        session[USER_FIELD] = f"{user}"
        return redirect(redirect_target)

    # wrong credentials: silently return to the landing page
    return redirect("/")
52 | return redirect("/login") 53 | 54 | flow = authorizer.initiate_auth_code_flow( 55 | config.scope, 56 | redirect_uri=config.redirect_url, 57 | ) 58 | session["flow"] = flow 59 | return redirect(flow["auth_uri"]) 60 | 61 | # returning from microsoft authentication portal 62 | elif session.get("flow"): 63 | try: 64 | token = authorizer.acquire_token_by_auth_code_flow( 65 | session.get("flow"), request.args 66 | ) 67 | except ValueError: 68 | await insert_log( 69 | "server", "WARN", "unable to acquire token for authentication" 70 | ) 71 | session.clear() 72 | return 'Something went wrong, please try again ' 73 | session.pop("flow") 74 | if "id_token_claims" not in token: 75 | await insert_log( 76 | "server", 77 | "WARN", 78 | "MSAL response did not contain `id_token_claims`, this may indicate " 79 | "that the configuration must be checked by an organizational " 80 | "administrator or is otherwise incomplete.", 81 | ) 82 | return ( 83 | "This app is unable to verify your identity due to lacking rights.", 84 | 500, 85 | ) 86 | if token["id_token_claims"].get("preferred_username") in accepted_users: 87 | session[USER_FIELD] = token["id_token_claims"].get("preferred_username") 88 | 89 | callback_url = session.pop(CALLBACK_FIELD, request.url) 90 | logger.debug(f"Done authentication flow, returning user to {callback_url}") 91 | return redirect(callback_url) 92 | else: 93 | await insert_log( 94 | "server", 95 | "WARN", 96 | "unauthorized access attempt", 97 | user_agent_string=request.headers.get("User-Agent"), 98 | ) 99 | return "Your account is not authorized", 403 100 | 101 | return redirect("/") 102 | -------------------------------------------------------------------------------- /osd2f/security/authorization/not_confgured.py: -------------------------------------------------------------------------------- 1 | """Contains the route-override when no authentication is configured 2 | """ 3 | 4 | 5 | async def no_authentication(func, *args, **kwargs): 6 | return ( 7 | 
"Page unavailable: Authorization must be configured " 8 | "unless the app is in testing or debug mode.", 9 | 501, 10 | ) 11 | -------------------------------------------------------------------------------- /osd2f/security/download_encryption/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/security/download_encryption/__init__.py -------------------------------------------------------------------------------- /osd2f/security/download_encryption/encrypted_zipfile.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | import pyzipper 4 | 5 | 6 | def string_to_zipfile(file_content: io.StringIO, filename: str, password: str) -> bytes: 7 | """Write a string body to a file in an encrypted zip archive.""" 8 | zipio = io.BytesIO() 9 | with pyzipper.AESZipFile(zipio, "w", encryption=pyzipper.WZ_AES) as zipfile: 10 | zipfile.setpassword(password.encode()) 11 | zipfile.writestr(filename, file_content.getvalue()) 12 | return zipio.getvalue() 13 | -------------------------------------------------------------------------------- /osd2f/security/entry_encryption/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/security/entry_encryption/__init__.py -------------------------------------------------------------------------------- /osd2f/security/entry_encryption/file_decryption.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import csv 3 | import json 4 | import pathlib 5 | from typing import Any, Dict, Iterable, List 6 | 7 | from ...definitions.submissions import ( 8 | EncryptedEntry, 9 | EncryptedSubmission, 10 | OutputSubmission, 11 | ) 12 | from ...logger import logger 13 | from 
...security.entry_encryption.secure_entry_singleton import SecureEntry 14 | 15 | 16 | class EntryFile(abc.ABC): 17 | def __init__(self, filename: pathlib.Path, read_mode: bool): 18 | self.read_mode = read_mode 19 | if not read_mode and pathlib.Path(filename).exists(): 20 | raise Exception(f"File {filename} already exists!") 21 | 22 | parent_dir = pathlib.Path(filename).parent 23 | if not pathlib.Path(parent_dir).exists(): 24 | pathlib.Path(parent_dir).mkdir(parents=True, exist_ok=True) 25 | 26 | if read_mode: 27 | self.file_obj = open(filename) 28 | else: 29 | self.file_obj = open(filename, "w+") 30 | 31 | def __del__(self): 32 | if hasattr(self, "file_obj") and not self.file_obj.closed: 33 | self.file_obj.close() 34 | 35 | def __exit__(self): 36 | if hasattr(self, "file_obj") and not self.file_obj.closed: 37 | self.file_obj.close() 38 | self.__del__() 39 | 40 | @abc.abstractmethod 41 | def read_entries(self) -> Iterable[OutputSubmission]: 42 | return dict() 43 | 44 | @abc.abstractmethod 45 | def append(self, entry: OutputSubmission) -> None: 46 | return 47 | 48 | 49 | class JSONFile(EntryFile): 50 | def __init__(self, filename: pathlib.Path, read_mode: bool): 51 | self.entries: List[Dict[str, Any]] = [] 52 | super().__init__(filename, read_mode) 53 | 54 | def read_entries(self) -> Iterable[OutputSubmission]: 55 | for raw_submission in json.load(self.file_obj): 56 | submission = EncryptedSubmission.model_validate(raw_submission) 57 | try: 58 | EncryptedEntry.model_validate(submission.entry) 59 | except ValueError: 60 | logger.warning("Encountered an unencrypted entry!") 61 | yield OutputSubmission.model_validate(raw_submission) 62 | decrypted_sub = OutputSubmission.model_validate(raw_submission) 63 | decrypted_sub.entry = SecureEntry.read_entry_field(decrypted_sub.entry) 64 | yield decrypted_sub 65 | 66 | def append(self, entry: OutputSubmission) -> None: 67 | self.entries.append(entry.model_dump()) 68 | 69 | def __del__(self): 70 | if ( 71 | hasattr(self, 
"file_obj") 72 | and not self.file_obj.closed 73 | and not self.read_mode 74 | ): 75 | json.dump(self.entries, self.file_obj) 76 | self.entries = [] 77 | return super().__del__() 78 | 79 | 80 | class CSVFile(EntryFile): 81 | def __init__(self, filename: pathlib.Path, read_mode: bool): 82 | super().__init__(filename, read_mode) 83 | if not read_mode: 84 | headers = OutputSubmission.model_fields.keys() 85 | self.writer = csv.DictWriter(self.file_obj, fieldnames=headers) 86 | self.writer.writeheader() 87 | 88 | def read_entries(self) -> Iterable[OutputSubmission]: 89 | line = self.file_obj.readline().strip() 90 | header = line.split(csv.excel.delimiter) 91 | reader = csv.DictReader(self.file_obj, fieldnames=header) 92 | for e in reader: 93 | re: Dict[str, Any] = {k: v for k, v in e.items() if k != "entry"} 94 | re["entry"] = SecureEntry.read_entry_field(eval(e["entry"])) 95 | yield OutputSubmission.model_validate(re) 96 | 97 | def append(self, entry: OutputSubmission) -> None: 98 | 99 | self.writer.writerow(entry.model_dump()) 100 | 101 | 102 | def decrypt_file(input_path: pathlib.Path, output_path: pathlib.Path) -> int: 103 | 104 | input_file: EntryFile 105 | 106 | if input_path.suffix == ".json": 107 | input_file = JSONFile(input_path, read_mode=True) 108 | 109 | elif input_path.suffix == ".csv": 110 | input_file = CSVFile(input_path, read_mode=True) 111 | else: 112 | raise NotImplementedError( 113 | f"Unknown INPUT file type {input_path.suffix}, " 114 | "make sure you unzipped the file." 115 | ) 116 | 117 | output_file: EntryFile 118 | 119 | if output_path.suffix == ".json": 120 | output_file = JSONFile(output_path, read_mode=False) 121 | elif output_path.suffix == ".csv": 122 | output_file = CSVFile(output_path, read_mode=False) 123 | else: 124 | raise NotImplementedError( 125 | f"Unknown OUTPUT file type {output_path.suffix}, " 126 | "output should end with `.csv` or `.json`." 
127 | ) 128 | 129 | touched_entries = 0 130 | for entry in input_file.read_entries(): 131 | output_file.append(entry) 132 | touched_entries += 1 133 | 134 | return touched_entries 135 | -------------------------------------------------------------------------------- /osd2f/security/entry_encryption/secure_entry_singleton.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | import random 4 | from typing import Any, Dict 5 | 6 | from cryptography.fernet import Fernet 7 | from cryptography.hazmat.primitives import hashes 8 | from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC 9 | 10 | 11 | from ...logger import logger 12 | 13 | 14 | class SecureEntry: 15 | 16 | __encryption_secret: bytes = b"" 17 | __decrypt_on_read: bool = True 18 | 19 | @classmethod 20 | def set_secret(cls, secret: str): 21 | if not secret: 22 | cls.__encryption_secret = b"" 23 | else: 24 | cls.__encryption_secret = cls.__create_key(secret.encode()) 25 | 26 | @classmethod 27 | def decrypt_on_read(cls, must_decrypt_on_read: bool): 28 | cls.__decrypt_on_read = must_decrypt_on_read 29 | 30 | @classmethod 31 | def write_entry_field(cls, entry_field: Dict[str, Any]) -> Dict[str, Any]: 32 | if not cls.__encryption_secret: 33 | return entry_field 34 | f = Fernet(cls.__encryption_secret) 35 | return {"encrypted": f.encrypt(json.dumps(entry_field).encode()).decode()} 36 | 37 | @classmethod 38 | def read_entry_field(cls, entry_field: Dict[str, Any]) -> Dict[str, Any]: 39 | if not cls.__encryption_secret or not cls.__decrypt_on_read: 40 | return entry_field 41 | encrypted_content = entry_field.get("encrypted") 42 | 43 | if not encrypted_content: 44 | logger.warning( 45 | "Entry encryption was set, but an unencrypted " "entry was retrieved!" 
46 | ) 47 | return entry_field 48 | f = Fernet(cls.__encryption_secret) 49 | content = f.decrypt(encrypted_content.encode()) 50 | return json.loads(content.decode()) 51 | 52 | @staticmethod 53 | def __create_key(password: bytes) -> bytes: 54 | random.seed(len(password)) 55 | salt = bytes(random.randint(0, 10**6)) 56 | kdf = PBKDF2HMAC( 57 | algorithm=hashes.SHA256(), 58 | length=32, 59 | salt=salt, 60 | iterations=320_000, 61 | ) 62 | key = base64.urlsafe_b64encode(kdf.derive(password)) 63 | return key 64 | -------------------------------------------------------------------------------- /osd2f/security/secrets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/security/secrets/__init__.py -------------------------------------------------------------------------------- /osd2f/security/secrets/azure_keyvault.py: -------------------------------------------------------------------------------- 1 | from azure.identity import DefaultAzureCredential 2 | from azure.keyvault.secrets import SecretClient 3 | 4 | from ...logger import logger 5 | 6 | PREFIX = "azure-keyvault" 7 | 8 | 9 | def azure_keyvault_replace(value: str) -> str: 10 | """translates environment variables formatted like: 11 | 12 | azure-keyvault::example.vault.azure.net::secret_name 13 | 14 | to the contents of the referred secrets. 
15 | 16 | """ 17 | if len(value.split("::")) != 3: 18 | logger.critical( 19 | f"azure value {value} is incorrectly formatted, " 20 | f"should be `{PREFIX}::KEYVAULT_URL::SECRET_NAME`" 21 | ) 22 | exit() 23 | _, keyvault_url, secret_name = value.split("::") 24 | cred = DefaultAzureCredential() 25 | client = SecretClient(keyvault_url, cred) 26 | secret = client.get_secret(secret_name).value 27 | return secret or "" 28 | -------------------------------------------------------------------------------- /osd2f/settings/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/settings/.DS_Store -------------------------------------------------------------------------------- /osd2f/settings/default_content_settings.yaml: -------------------------------------------------------------------------------- 1 | project_title: OSD2F 2 | contact_us: email@domain.tld 3 | static_pages: 4 | home: 5 | active: True 6 | name: The OSD2F project 7 | blocks: 8 | - type: jumbotron 9 | title: Data donation made easy 10 | id: top 11 | image: "/static/skull_phone_cc.jpg" 12 | lines: 13 | - A general way of donating data 14 | - For JSON based GDPR exports 15 | - To use with external survey and analysis tools 16 | buttons: 17 | - name: About the project 18 | label: "btn-primary" 19 | link: "#project" 20 | - name: How it works 21 | label: "btn-success" 22 | link: "/donate" 23 | 24 | - type: two_block_row 25 | id: project 26 | image: "/static/study_cc.jpg" 27 | image_pos: left 28 | title: OSD2F provides a whitelist based collection website 29 | lines: 30 | - Under GDPR, everyone should be able to export
their data in machine-readable format 31 | - Many platforms provide standardized ways to get this data by
exporting it as a set of JSON files 32 | - This app allows researchers to easily and safely collect exported data donated by participants in their studies 33 | buttons: 34 | - name: About the developers 35 | label: "btn-primary" 36 | link: "#team" 37 | - name: Donate now 38 | label: "btn-success" 39 | link: "/donate" 40 | 41 | - type: two_block_row 42 | id: team 43 | title: Open Source Data Donation Framework 44 | lines: 45 | - The digital traces that people leave through their use of various online platforms provide tremendous opportunities for studying human behavior. 46 | - However, the collection of these data is hampered by legal, ethical and technical challenges. 47 | - We present a framework and tool for collecting these data through a data donation platform where consenting participants can securely submit their digital traces. 48 | circles_row: 49 | - title: Prof. Dr. Hypothetical collaborator 50 | subtitle: this person does not exist 51 | image: "https://thispersondoesnotexist.com/image" 52 | - title: Sponsored by 53 | subtitle: A random image 54 | image: "https://picsum.photos/200/300.jpg" 55 | buttons: 56 | - name: About the project 57 | label: "btn-primary" 58 | link: "#team" 59 | - name: Donate now 60 | label: "btn-success" 61 | link: "/donate" 62 | 63 | privacy: 64 | active: True 65 | name: Privacy protection 66 | blocks: 67 | - type: jumbotron 68 | title: Data donation made easy 69 | id: top 70 | lines: 71 | - A simple application for donation collection 72 | buttons: [] 73 | 74 | donate: 75 | active: False 76 | name: Donation Page 77 | blocks: [] 78 | 79 | upload_page: 80 | blocks: [] 81 | upload_box: 82 | header: "Select file(s):" 83 | explanation: 84 | - "You can use the file selector to select the zipfile from your platform" 85 | - "You can also drag the folder into this box" 86 | thanks_text: "Thanks for trying osd2f" 87 | processing_text: "processing your donation for preview" 88 | empty_selection: "Your selection does not contain any files 
appropriate to our study.
Did you select the correct (zip) file?" 89 | file_indicator_text: "Entries in your donation: " 90 | donate_button: "Donate" 91 | inspect_button: "Inspect & edit" 92 | preview_component: 93 | entries_in_file_text: "Entries in this file: " 94 | title: "Inspect & Edit your donation" 95 | explanation: 96 | - The top shows the files in your donation 97 | - In each file, you can search for content 98 | - You can remove content by clicking on the rows and pressing "remove selection" 99 | remove_rows_button: "remove selected rows" 100 | search_prompt: "Search in file" 101 | search_box_placeholder: "type to search this file" 102 | previous_file_button: Previous file 103 | next_file_button: Next file 104 | consent_popup: 105 | title: "I want to donate my data..." 106 | lead: "This box explains the conditions of your donation" 107 | points: 108 | - how data is secured 109 | - what the data will be used for 110 | - how long the data will be stored 111 | end_text: "By clicking below, you consent to these terms..." 112 | decline_button: "I do not consent" 113 | accept_button: "I consent" 114 | -------------------------------------------------------------------------------- /osd2f/settings/default_upload_settings.yaml: -------------------------------------------------------------------------------- 1 | # Sample Platform upload settings 2 | files: 3 | (^|/|\\)comments.json: 4 | # in key is required when the initial level of data 5 | # is not a list but an object (e.g. {} instead of []) 6 | # if keys are nested more than once, use a '.' to separate 7 | # the levels of the keys 8 | in_key: 'comment_information' 9 | anonymizers: 10 | - redact_text: '' 11 | # accepted_fields to include in the upload 12 | # remove a field to filter it out 13 | accepted_fields: 14 | - timestamp 15 | - title 16 | - information.comment.comment_text 17 | 18 | # posts can be split into multiple files 19 | # that end in `_`. 
20 | posts(_(\d)+).json: 21 | # anonymizers are functions applied on the server 22 | # that can inspect the content of a potential donation 23 | # and apply anonymization accordingly. 24 | # the KEY is the function to apply, the VALUE a string-argument 25 | anonymizers: 26 | - redact_text: '' 27 | accepted_fields: 28 | - timestamp 29 | - post_title 30 | - keywords 31 | - information.post.post_metadata.expanded_url 32 | - information.post.post_metadata.source 33 | - information.post.post_text 34 | 35 | # these are the likes & reactions of pages 36 | engagement.json: 37 | in_key: engagement_info 38 | accepted_fields: 39 | - engagement_type 40 | - object 41 | - timestamp 42 | 43 | ## alternative filename for likes & reactions of pages 44 | companies_followed.json: 45 | in_key: companies_followed 46 | accepted_fields: 47 | - company_name 48 | - timestamp 49 | 50 | # these are the likes & reactions of posts & comments 51 | profile_interests.json: 52 | in_key: profile_interests 53 | accepted_fields: [] 54 | 55 | # Advertiser interactions file 56 | ads_clicked.json: 57 | # in_key : 58 | accepted_fields: 59 | - ad_title 60 | - activity 61 | - timestamp 62 | 63 | # Advertisers who uploaded user information 64 | advertisers_who_uploaded_a_contact_list_with_your_information.json: 65 | in_key: custom_audiences_v2 66 | accepted_fields: [] 67 | 68 | # Privacy Checkup 69 | privacy_checkup_interactions.json: 70 | in_key: privacy_checkup_interactions_v2 71 | accepted_fields: 72 | - name 73 | - started_timestamp 74 | - completed_timestamp 75 | 76 | # activity of an account 77 | account_activity.json: 78 | in_key: account_activity_v2 79 | accepted_fields: 80 | - action 81 | - country 82 | - site_name 83 | - timestamp 84 | 85 | # short messages with a '.' 
in the in-key 86 | messages.json: 87 | in_key: messages.collection 88 | accepted_fields: 89 | - id 90 | - message 91 | -------------------------------------------------------------------------------- /osd2f/static/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/static/.DS_Store -------------------------------------------------------------------------------- /osd2f/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/static/favicon.ico -------------------------------------------------------------------------------- /osd2f/static/js/libarchive/wasm-gen/libarchive.wasm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/static/js/libarchive/wasm-gen/libarchive.wasm -------------------------------------------------------------------------------- /osd2f/static/js/main.js.LICENSE.txt: -------------------------------------------------------------------------------- 1 | /*! 2 | * Vue.js v2.6.12 3 | * (c) 2014-2020 Evan You 4 | * Released under the MIT License. 
5 | */ 6 | -------------------------------------------------------------------------------- /osd2f/static/keylock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/static/keylock.png -------------------------------------------------------------------------------- /osd2f/static/skull_phone_cc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/static/skull_phone_cc.jpg -------------------------------------------------------------------------------- /osd2f/static/study_cc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/osd2f/static/study_cc.jpg -------------------------------------------------------------------------------- /osd2f/templates/blocks/bootstrap_scripts.html.jinja: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /osd2f/templates/blocks/circles_row.html.jinja: -------------------------------------------------------------------------------- 1 | {% if contentblock.circles_row %} 2 |
3 | {% for circleItem in contentblock.circles_row %} 4 |
5 | 6 |

{{ circleItem.title | safe }}

7 |

{{ circleItem.subtitle | safe }}

8 |
9 | {% endfor %} 10 |
11 | {% endif %} -------------------------------------------------------------------------------- /osd2f/templates/blocks/footer.html.jinja: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Contact us at: {{ content_settings.contact_us }}

4 |
5 |
-------------------------------------------------------------------------------- /osd2f/templates/blocks/head.html.jinja: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | {% if all_links_new_tab %} 12 | 13 | {% endif %} 14 | -------------------------------------------------------------------------------- /osd2f/templates/blocks/jumbotron.html.jinja: -------------------------------------------------------------------------------- 1 |
2 |
3 | {% if contentblock.title %}

{{contentblock.title|safe}}

{% endif %} 4 |
5 | {% for line in contentblock.lines %} 6 |

{{ line | safe }}

7 | {% endfor %} 8 |
9 | {% include "blocks/circles_row.html.jinja" %} 10 | {% for button in contentblock.buttons %} 11 | 12 | {% endfor %} 13 | 14 |
15 |
16 | -------------------------------------------------------------------------------- /osd2f/templates/blocks/navbar.html.jinja: -------------------------------------------------------------------------------- 1 |
2 | 26 |
-------------------------------------------------------------------------------- /osd2f/templates/blocks/two_block_row.html.jinja: -------------------------------------------------------------------------------- 1 |
2 | {% if contentblock.image and contentblock.image_pos=="left" %} 3 |
4 | {% endif %} 5 |
6 |

{{contentblock.title|safe}}

7 | {% for line in contentblock.lines %} 8 |

{{line|safe}}

9 | {% endfor%} 10 | {% include "blocks/circles_row.html.jinja" %} 11 |
12 | {% for button in contentblock.buttons %} 13 | 14 | {% endfor %} 15 |
16 |
17 | {% if contentblock.image and contentblock.image_pos=="right"%} 18 |
19 | {% endif %} 20 |
-------------------------------------------------------------------------------- /osd2f/templates/formats/base.html.jinja: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {% include "blocks/head.html.jinja" %} 5 | 6 | 7 | {% include "blocks/navbar.html.jinja" %} 8 | 9 |
10 |

11 | {% block content %} {% endblock %} 12 |

13 |
14 | 15 | 16 | 17 | {% include "blocks/footer.html.jinja" %} 18 | {% include "blocks/bootstrap_scripts.html.jinja" %} -------------------------------------------------------------------------------- /osd2f/templates/formats/static_template.html.jinja: -------------------------------------------------------------------------------- 1 | {% extends "formats/base.html.jinja" %} 2 | 3 | {% block content %} 4 |
5 | {% for contentblock in content_settings.static_pages[current_page].blocks %} 6 | 7 | {% if contentblock.type=="jumbotron" %} 8 | {% include "blocks/jumbotron.html.jinja" %} 9 | {% endif %} 10 | 11 | 12 | {% if contentblock.type=="two_block_row" %} 13 | {% include "blocks/two_block_row.html.jinja" %} 14 | {% endif%} 15 | 16 | {% endfor %} 17 |
18 | 19 | {% endblock content %} -------------------------------------------------------------------------------- /osd2f/templates/formats/upload_template.html.jinja: -------------------------------------------------------------------------------- 1 | {% extends "formats/base.html.jinja" %} 2 | 3 | {% block content %} 4 |
5 | 6 | 7 | {% for contentblock in content_settings.upload_page.blocks %} 8 | 9 | {% if contentblock.type=="jumbotron" %} 10 | {% include "blocks/jumbotron.html.jinja" %} 11 | {% endif %} 12 | 13 | 14 | {% if contentblock.type=="two_block_row" %} 15 | {% include "blocks/two_block_row.html.jinja" %} 16 | {% endif%} 17 | 18 | {% endfor %} 19 |
20 | 21 | 22 |
23 | 24 | 25 | 38 |
39 |
40 |

{{content_settings.upload_page.upload_box.header|safe}}

41 | 42 |
43 | {% for par in content_settings.upload_page.upload_box.explanation%} 44 |

{{par|safe}}

45 | {% endfor %} 46 | 47 | 48 |
49 | 50 |
51 |
52 | 53 | 54 | 64 | 65 | 66 |
67 |
68 |

{{content_settings.upload_page.empty_selection | safe}}

69 |
70 |
71 | 72 | 73 | 76 | 77 | 78 |
79 |
80 | 81 | 85 | 86 |
87 |
88 | 89 |
90 | 91 | 92 | 93 | 94 | 95 | 143 | {% endblock content %} -------------------------------------------------------------------------------- /osd2f/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import functools 3 | import os 4 | import pathlib 5 | import typing 6 | from collections.abc import MutableMapping 7 | 8 | import pytz 9 | 10 | import yaml # type: ignore 11 | 12 | from .database import get_content_config, set_content_config 13 | from .definitions import ContentSettings, UploadSettings 14 | from .logger import logger 15 | 16 | DISK_CONTENT_CONFIG_PATH: str = str( 17 | pathlib.Path(__file__) 18 | .parent.joinpath("settings") 19 | .joinpath("default_content_settings.yaml") 20 | ) 21 | DISK_CONFIG_VERSION = "" 22 | 23 | 24 | @functools.lru_cache 25 | def _cached_load_settings() -> UploadSettings: 26 | return _load_settings_from_disk() 27 | 28 | 29 | def _load_settings_from_disk() -> UploadSettings: 30 | settings_dir = pathlib.Path(__file__).parent.joinpath("settings") 31 | try: 32 | settings = UploadSettings.model_validate( 33 | yaml.safe_load(open(settings_dir.joinpath("upload_settings.yaml"))) 34 | ) 35 | except FileNotFoundError: 36 | logger.warning("No user provided `upload_settings.yaml` found, using defaults.") 37 | settings = UploadSettings.model_validate( 38 | yaml.safe_load(open(settings_dir.joinpath("default_upload_settings.yaml"))) 39 | ) 40 | return settings 41 | 42 | 43 | def load_upload_settings(force_disk: bool = False) -> UploadSettings: 44 | if force_disk: 45 | logger.warning( 46 | "Settings are re-loaded from disk on every request, " 47 | "this eases debugging, but will hurt performance!" 
48 | ) 49 | return _load_settings_from_disk() 50 | else: 51 | return _cached_load_settings() 52 | 53 | 54 | async def load_content_settings(use_cache: bool) -> ContentSettings: 55 | # load db config version 56 | db_config = await get_content_config() 57 | 58 | # load disk version () 59 | global DISK_CONFIG_VERSION 60 | if not DISK_CONFIG_VERSION or not use_cache: 61 | disk_config = yaml.safe_load(open(DISK_CONTENT_CONFIG_PATH)) 62 | DISK_CONFIG_VERSION = disk_config 63 | 64 | else: 65 | disk_config = DISK_CONFIG_VERSION 66 | 67 | disk_timestamp = pytz.UTC.localize( 68 | datetime.datetime.fromtimestamp(os.path.getmtime(DISK_CONTENT_CONFIG_PATH)) 69 | ) 70 | 71 | # if no database config exists, insert disk version in database and 72 | # use disk version 73 | if not db_config: 74 | config = ContentSettings.model_validate(disk_config) 75 | await set_content_config(user="default", content=config) 76 | return config 77 | 78 | # pick the most recent version 79 | if db_config.insert_timestamp > disk_timestamp: 80 | last_config = db_config.config_blob 81 | else: 82 | last_config = disk_config 83 | 84 | config = ContentSettings.model_validate(last_config) 85 | 86 | return config 87 | 88 | 89 | def flatten(d: MutableMapping, parent_key: str = "", sep: str = "_"): 90 | items = [] 91 | if type(d) == str: 92 | return d 93 | for k, v in d.items(): 94 | new_key = parent_key + sep + k if parent_key else k 95 | if isinstance(v, MutableMapping): 96 | items.extend(flatten(v, new_key, sep=sep).items()) 97 | elif type(v) == list: 98 | items.append((new_key, [flatten(vi, sep=sep) for vi in v])) 99 | else: 100 | items.append((new_key, v)) 101 | return dict(items) 102 | 103 | 104 | def flatmap( 105 | items: dict, 106 | in_key: typing.Optional[str] = None, 107 | ): 108 | 109 | base = items if in_key is None else items.get(in_key, []) 110 | 111 | return [flatten(e, sep=".") for e in base] 112 | -------------------------------------------------------------------------------- /package.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "osd2f", 3 | "version": "0.0.1", 4 | "description": "![Python application](https://github.com/uvacw/osd2f/workflows/Python%20application/badge.svg?branch=main) \"Code # OSD2F: Open Source Data Donation Framework", 5 | "main": "static/js/file_upload.js", 6 | "private": true, 7 | "directories": { 8 | "doc": "docs", 9 | "test": "tests" 10 | }, 11 | "scripts": { 12 | "test": "jest", 13 | "develop": "webpack watch --mode development", 14 | "build": "webpack --mode production" 15 | }, 16 | "repository": { 17 | "type": "git", 18 | "url": "git+https://github.com/uvacw/osd2f.git" 19 | }, 20 | "author": "", 21 | "license": "ISC", 22 | "bugs": { 23 | "url": "https://github.com/uvacw/osd2f/issues" 24 | }, 25 | "homepage": "https://github.com/uvacw/osd2f#readme", 26 | "dependencies": { 27 | "blob-polyfill": "^5.0.20210201", 28 | "bootstrap": "^4.6.0", 29 | "bootstrap-vue": "^2.21.2", 30 | "datatransfer-files-promise": "^1.3.1", 31 | "glob-parent": "^5.1.1", 32 | "jsonpath": "^1.1.1", 33 | "libarchive.js": "^1.3.0", 34 | "loader-utils": "^3.2.1", 35 | "vue": "^2.6.12" 36 | }, 37 | "devDependencies": { 38 | "copy-webpack-plugin": "^7.0.0", 39 | "css-loader": "^5.0.2", 40 | "jest": "^27.5.1", 41 | "postcss": "^8.2.15", 42 | "style-loader": "^2.0.0", 43 | "vue-loader": "^15.11.1", 44 | "vue-template-compiler": "^2.6.12", 45 | "webpack": "^5.76.0", 46 | "webpack-cli": "^4.9.2" 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.9 3 | # by the following command: 4 | # 5 | # pip-compile 6 | # 7 | aiofiles==23.2.1 8 | # via quart 9 | aiosqlite==0.17.0 10 | # via tortoise-orm 11 | annotated-types==0.6.0 12 | # via pydantic 13 | async-timeout==4.0.3 14 | # via asyncpg 15 | 
asyncpg==0.29.0 16 | # via OSD2F (setup.py) 17 | azure-core==1.30.1 18 | # via 19 | # azure-identity 20 | # azure-keyvault-secrets 21 | azure-identity==1.16.0 22 | # via OSD2F (setup.py) 23 | azure-keyvault-secrets==4.8.0 24 | # via OSD2F (setup.py) 25 | blinker==1.8.2 26 | # via 27 | # flask 28 | # quart 29 | certifi==2024.2.2 30 | # via requests 31 | cffi==1.16.0 32 | # via cryptography 33 | charset-normalizer==3.3.2 34 | # via requests 35 | click==8.1.7 36 | # via 37 | # flask 38 | # quart 39 | cryptography==42.0.7 40 | # via 41 | # OSD2F (setup.py) 42 | # azure-identity 43 | # msal 44 | # pyjwt 45 | dnspython==2.6.1 46 | # via email-validator 47 | email-validator==2.1.1 48 | # via pydantic 49 | exceptiongroup==1.2.1 50 | # via taskgroup 51 | flask==3.0.3 52 | # via quart 53 | h11==0.14.0 54 | # via 55 | # hypercorn 56 | # wsproto 57 | h2==4.1.0 58 | # via hypercorn 59 | hpack==4.0.0 60 | # via h2 61 | hypercorn==0.16.0 62 | # via 63 | # OSD2F (setup.py) 64 | # quart 65 | hyperframe==6.0.1 66 | # via h2 67 | idna==3.7 68 | # via 69 | # email-validator 70 | # requests 71 | importlib-metadata==7.1.0 72 | # via 73 | # flask 74 | # quart 75 | iso8601==1.1.0 76 | # via tortoise-orm 77 | isodate==0.6.1 78 | # via azure-keyvault-secrets 79 | itsdangerous==2.2.0 80 | # via 81 | # flask 82 | # quart 83 | jinja2==3.1.4 84 | # via 85 | # flask 86 | # quart 87 | markupsafe==2.1.5 88 | # via 89 | # jinja2 90 | # quart 91 | # werkzeug 92 | msal==1.28.0 93 | # via 94 | # OSD2F (setup.py) 95 | # azure-identity 96 | # msal-extensions 97 | msal-extensions==1.1.0 98 | # via azure-identity 99 | packaging==24.0 100 | # via msal-extensions 101 | portalocker==2.8.2 102 | # via msal-extensions 103 | priority==2.0.0 104 | # via hypercorn 105 | pycparser==2.22 106 | # via cffi 107 | pycryptodomex==3.20.0 108 | # via pyzipper 109 | pydantic[email]==2.7.1 110 | # via 111 | # OSD2F (setup.py) 112 | # tortoise-orm 113 | pydantic-core==2.18.2 114 | # via pydantic 115 | pyjwt[crypto]==2.8.0 
116 | # via 117 | # OSD2F (setup.py) 118 | # msal 119 | pypika-tortoise==0.1.6 120 | # via tortoise-orm 121 | pytz==2024.1 122 | # via tortoise-orm 123 | pyyaml==6.0.1 124 | # via OSD2F (setup.py) 125 | pyzipper==0.3.6 126 | # via OSD2F (setup.py) 127 | quart==0.19.5 128 | # via OSD2F (setup.py) 129 | requests==2.31.0 130 | # via 131 | # azure-core 132 | # msal 133 | six==1.16.0 134 | # via 135 | # azure-core 136 | # isodate 137 | taskgroup==0.0.0a4 138 | # via hypercorn 139 | tomli==2.0.1 140 | # via hypercorn 141 | tortoise-orm==0.20.1 142 | # via OSD2F (setup.py) 143 | typing-extensions==4.11.0 144 | # via 145 | # aiosqlite 146 | # azure-core 147 | # azure-keyvault-secrets 148 | # pydantic 149 | # pydantic-core 150 | # quart 151 | urllib3==2.2.1 152 | # via requests 153 | werkzeug==3.0.3 154 | # via 155 | # flask 156 | # quart 157 | wsproto==1.2.0 158 | # via hypercorn 159 | zipp==3.18.2 160 | # via importlib-metadata 161 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | black==22.3.0 2 | Faker==8.1.3 3 | flake8==4.0.1 4 | flake8-black==0.3.3 5 | flake8-import-order==0.18.1 6 | locust==2.25.0 7 | mypy==1.10.0 8 | mypy-extensions==1.0.0 9 | pip-tools==7.4.1 10 | pytest==6.1.2 11 | aiounittest==1.4.0 12 | types-pytz==2024.1.0.20240417 13 | types-PyYAML==6.0.12.20240311 -------------------------------------------------------------------------------- /scripts/locally_decrypt_entries.py: -------------------------------------------------------------------------------- 1 | """Decrypts downloaded submissions 2 | 3 | Some OSD2F distributions are configured to keep 4 | entries encrypted on download. This script locally 5 | decrypts these entries, provided you have access to 6 | the entry encryption secret. 
7 | 8 | Usage: 9 | 10 | python scripts/locally_decrypt_entries.py -h 11 | 12 | """ 13 | import argparse 14 | import logging 15 | import pathlib 16 | 17 | from osd2f.logger import logger 18 | from osd2f.security import translate_value 19 | from osd2f.security.entry_encryption.file_decryption import decrypt_file 20 | from osd2f.security.entry_encryption.secure_entry_singleton import SecureEntry 21 | 22 | parser = argparse.ArgumentParser(description=__doc__) 23 | 24 | parser.add_argument( 25 | "-v", 26 | "--verbose", 27 | action="count", 28 | default=0, 29 | help="Verbosity of logging output, defaults to default=CRITICAL, " 30 | "v=WARNING, vv=INFO, vvv=DEBUG", 31 | ) 32 | parser.add_argument("input_file", help="The file of submissions to decrypt.") 33 | parser.add_argument("output_file", help="The file to write decrypted submissions to") 34 | 35 | parser.add_argument( 36 | "secret", 37 | help="The encryption secret, should be the same " 38 | "as the secret used to configure the server. " 39 | "May be a reference to a supported secret store such as Azure KeyVault", 40 | ) 41 | 42 | 43 | def run_script(): 44 | args = parser.parse_args() 45 | 46 | if args.verbose == 0: 47 | level = logging.CRITICAL 48 | elif args.verbose == 1: 49 | level = logging.WARNING 50 | elif args.verbose == 2: 51 | level = logging.INFO 52 | elif args.verbose == 3: 53 | level = logging.DEBUG 54 | else: 55 | print("UNKNOWN LOGLEVEL SPECIFIED") 56 | level = logging.NOTSET 57 | 58 | logger.setLevel(level=level) 59 | 60 | secret = translate_value(args.secret) 61 | SecureEntry.set_secret(secret) 62 | 63 | input_path = pathlib.Path(args.input_file) 64 | output_path = pathlib.Path(args.output_file) 65 | 66 | touched_entries = decrypt_file(input_path=input_path, output_path=output_path) 67 | 68 | logger.info( 69 | f"Done decrypting {touched_entries} entries from {input_path} to {output_path}" 70 | ) 71 | 72 | 73 | if __name__ == "__main__": 74 | run_script() 75 | 
-------------------------------------------------------------------------------- /scripts/locust_stress_testing.py: -------------------------------------------------------------------------------- 1 | """Locust based stress testing 2 | 3 | Run headless using: 4 | 5 | locust --host 'http://localhost:5000' -f 'scripts/locust_stress_testing.py' \ 6 | --headless --users 100 -t 60sec 7 | 8 | Run with web interface: 9 | 10 | locust -f scripts/locust_stress_testing.py 11 | 12 | NOTE: it's recommended to use an ASGI tool such as hypercorn in production, 13 | you should also test with such a framework to get realistic performance. 14 | 15 | """ 16 | 17 | import faker 18 | 19 | from locust import between, task 20 | from locust.contrib.fasthttp import FastHttpUser 21 | 22 | from osd2f.utils import flatmap 23 | 24 | from scripts import sample_data_generator 25 | 26 | 27 | class SampleParticipant(FastHttpUser): 28 | wait_time = between(0.1, 5) 29 | 30 | def on_start(self): 31 | """Generate a fake user and associated donation""" 32 | f = faker.Faker() 33 | self.user = f.user_name() 34 | self.sid = f.uuid4() 35 | self.entries = { 36 | "comments.json": flatmap( 37 | sample_data_generator.generate_comments(user=self.user, n=1000), 38 | "comment_information", 39 | ), 40 | f"your_posts_{self.user}_1.json": flatmap( 41 | sample_data_generator.generate_posts(self.user, n=100) 42 | ), 43 | "engagement.json": flatmap( 44 | sample_data_generator.generate_engagement(self.user, 10), 45 | "engagement", 46 | ), 47 | "companies_followed.json": flatmap( 48 | sample_data_generator.generate_companies_followed(self.user, 100), 49 | "companies_followed", 50 | ), 51 | "ads_clicked.json": sample_data_generator.generate_ads_clicked( 52 | self.user, 50 53 | ), 54 | "profile_interests.json": [ 55 | {"entry": e} 56 | for e in flatmap( 57 | sample_data_generator.generate_profile_interests( # noqa 58 | self.user, 100 59 | ), 60 | "profile_interests", 61 | ) 62 | ], 63 | } 64 | 65 | @task(20) 66 | def 
send_log(self): 67 | self.client.get("/log?position=locust&level=DEBUG") 68 | 69 | @task(1) 70 | def send_anonymization(self): 71 | for fn, entries in self.entries.items(): 72 | self.client.post( 73 | "/adv_anonymize_file", 74 | json={ 75 | "submission_id": self.sid, 76 | "filename": fn, 77 | "n_deleted": 0, 78 | "entries": entries, 79 | }, 80 | ) 81 | 82 | @task(1) 83 | def send_submission(self): 84 | submission = [] 85 | for fn, entries in self.entries.items(): 86 | submission.append( 87 | { 88 | "submission_id": self.sid, 89 | "filename": fn, 90 | "n_deleted": 0, 91 | "entries": entries, 92 | } 93 | ) 94 | self.client.post("/upload", json=submission) 95 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import find_packages, setup 4 | 5 | setup( 6 | name="OSD2F", 7 | python_requires=">3.8", 8 | version="0.1.2", 9 | description="Open Source Data Donation Framework", 10 | author="Bob van de Velde", 11 | author_email="osd2f@bob-as-a-service.com", 12 | license=open("LICENSE").read(), 13 | url="https://github.com/uvacw/osd2f", 14 | packages=find_packages(), 15 | package_data={ 16 | "osd2f": [ 17 | "static/*", 18 | "templates/*", 19 | "templates/*/*", 20 | "settings/*", 21 | "static/js/*", 22 | "static/js/libarchive/*", 23 | "static/js/libarchive/wasm-gen/*", 24 | ] 25 | }, 26 | scripts=["bin/osd2f", "bin/osd2f-decrypt-submissions"], 27 | install_requires=[ 28 | "asyncpg", 29 | "azure-keyvault-secrets", 30 | "azure-identity", 31 | "cryptography", 32 | "hypercorn", 33 | "msal", 34 | "pyyaml", 35 | "pydantic~=2.6", 36 | "pydantic[email]", 37 | "pyjwt>=2.4.0", # dependency of MSAL, insecure < 2.4.0 38 | "pyzipper", 39 | "quart", 40 | "tortoise-orm", 41 | ], 42 | ) 43 | -------------------------------------------------------------------------------- /tests/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/uvacw/osd2f/90fc2882bb386a9d00fd4dfc802aeaabb6371148/tests/__init__.py -------------------------------------------------------------------------------- /tests/anonymizer_module_test.py: -------------------------------------------------------------------------------- 1 | from aiounittest import AsyncTestCase 2 | 3 | 4 | class test_anonymizer_package_interface(AsyncTestCase): 5 | async def test_apply(self): 6 | from osd2f import anonymizers 7 | 8 | # register a mock pass-through function as an anonymizer 9 | async def testfunc(e, _): 10 | return e 11 | 12 | anonymizers.options["testfunc"] = testfunc 13 | 14 | entries = [{"title": f"entry {i}"} for i in range(100)] 15 | 16 | redacted_entries = await anonymizers.apply(entries, "testfunc") 17 | 18 | self.assertListEqual(entries, redacted_entries) 19 | 20 | anonymizers.options.pop("testfunc") 21 | 22 | async def test_options_conform_to_spec(self): 23 | from osd2f import anonymizers 24 | 25 | for k, v in anonymizers.options.items(): 26 | self.assertEqual(k, v.__name__) 27 | 28 | async def test_submission_list_anonymization(self): 29 | from osd2f import anonymizers 30 | from osd2f.definitions import UploadSettings, SubmissionList, Submission 31 | 32 | async def testfunc(e, a): 33 | e[a] = a 34 | return e 35 | 36 | anonymizers.options["testfunc"] = testfunc 37 | 38 | settings = UploadSettings( 39 | files={ 40 | "file(_\\d)?.json": { 41 | "accepted_fields": [], 42 | "anonymizers": [{"testfunc": "a"}, {"testfunc": "b"}], 43 | } 44 | } 45 | ) 46 | submission_list = SubmissionList( 47 | [ 48 | Submission( 49 | entries=[{}], filename="file_2.json", submission_id="1", n_deleted=2 50 | ) 51 | ] 52 | ) 53 | await anonymizers.anonymize_submission_list( 54 | submission_list=submission_list, settings=settings 55 | ) 56 | self.assertEqual(submission_list.root[0].entries[0], {"a": "a", "b": "b"}) 57 | 58 | async def 
test_broken_anonymizer(self): 59 | from osd2f import anonymizers 60 | from osd2f.definitions import UploadSettings, SubmissionList, Submission 61 | 62 | async def brokenanonymizer(s: dict, arg: str = None): 63 | if s.get("title") == "weird entry": 64 | raise ValueError("Help, I'm broken") 65 | return s 66 | 67 | entries = [{"title": t} for t in ["normal entry"] * 10 + ["weird entry"]] 68 | 69 | anonymizers.options["brokenanonymizer"] = brokenanonymizer 70 | 71 | settings = UploadSettings( 72 | files={ 73 | "file(_\\d)?.json": { 74 | "accepted_fields": [], 75 | "anonymizers": [{"brokenanonymizer": "a"}], 76 | } 77 | } 78 | ) 79 | submission_list = SubmissionList( 80 | [ 81 | Submission( 82 | entries=entries, 83 | filename="file_2.json", 84 | submission_id="1", 85 | n_deleted=6, 86 | ) 87 | ] 88 | ) 89 | await anonymizers.anonymize_submission_list( 90 | submission_list=submission_list, settings=settings 91 | ) 92 | 93 | # check that all but one entry remains 94 | self.assertEqual(len(submission_list.root[0].entries), 10) 95 | -------------------------------------------------------------------------------- /tests/create_app_test.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | from unittest import TestCase 4 | from unittest.mock import Mock, patch 5 | 6 | 7 | class CreateAppTest(TestCase): 8 | def test_env_var_config(self): 9 | test_env_vars = { 10 | "OSD2F_DB_URL": "testdb", 11 | "OSD2F_SECRET": "testsecret", 12 | "OSD2F_DATA_PASSWORD": "datapassword", 13 | } 14 | old_env = os.environ.copy() 15 | os.environ.update(test_env_vars) 16 | import osd2f.config 17 | 18 | # force reload to trigger new processing of 19 | # env variables 20 | importlib.reload(osd2f.config) 21 | 22 | from osd2f.server import create_app 23 | 24 | app = create_app(mode="Production") 25 | 26 | assert app.config["DB_URL"] == "testdb" 27 | assert app.config["SECRET_KEY"] == "testsecret" 28 | assert app.config["DATA_PASSWORD"] 
== "datapassword" 29 | 30 | # reset to old environment 31 | os.environ = old_env 32 | importlib.reload(osd2f.config) 33 | 34 | def test_ovveride_var_config(self): 35 | test_env_vars = { 36 | "OSD2F_DB_URL": "testdb", 37 | "OSD2F_SECRET": "testsecret", 38 | "OSD2F_DATA_PASSWORD": "datapassword", 39 | } 40 | old_env = os.environ.copy() 41 | os.environ.update(test_env_vars) 42 | import osd2f.config 43 | 44 | # force reload to trigger new processing of 45 | # env variables 46 | importlib.reload(osd2f.config) 47 | 48 | from osd2f.server import create_app 49 | 50 | app = create_app( 51 | mode="Production", 52 | database_url_override="override_url", 53 | app_secret_override="override_secret", 54 | data_password_override="override_datapassword", 55 | ) 56 | 57 | assert app.config["DB_URL"] == "override_url" 58 | assert app.config["SECRET_KEY"] == "override_secret" 59 | assert app.config["DATA_PASSWORD"] == "override_datapassword" 60 | 61 | # reset to old environment 62 | os.environ = old_env 63 | importlib.reload(osd2f.config) 64 | 65 | def test_overide_var_translation(self): 66 | 67 | mock_translate_value = Mock() 68 | with patch("osd2f.server.security.translate_value", mock_translate_value): 69 | from osd2f.server import create_app 70 | 71 | create_app( 72 | mode="Production", 73 | database_url_override="override_url", 74 | app_secret_override="override_secret", 75 | data_password_override="override_datapassword", 76 | ) 77 | mock_translate_value.assert_called_with("override_url") 78 | 79 | mock_translate_value = Mock() 80 | with patch("osd2f.server.security.translate_value", mock_translate_value): 81 | from osd2f.server import create_app 82 | 83 | create_app( 84 | mode="Production", 85 | app_secret_override="override_secret", 86 | data_password_override="override_datapassword", 87 | ) 88 | mock_translate_value.assert_called_with("override_secret") 89 | 90 | mock_translate_value = Mock() 91 | with patch("osd2f.server.security.translate_value", mock_translate_value): 92 | 
from osd2f.server import create_app 93 | 94 | create_app( 95 | mode="Production", 96 | data_password_override="override_datapassword", 97 | app_secret_override="tempsecret", 98 | ) 99 | mock_translate_value.assert_called_with("tempsecret") 100 | -------------------------------------------------------------------------------- /tests/db_interaction_test.py: -------------------------------------------------------------------------------- 1 | """Database test files. 2 | 3 | We don't want to test our ORM package, so these tests target the convenvience 4 | functions used. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sqlite3 10 | import time 11 | from unittest.mock import AsyncMock, patch 12 | 13 | from aiounittest.case import AsyncTestCase 14 | 15 | from osd2f.database import stop_database 16 | 17 | 18 | class DatabaseStartStopTest(AsyncTestCase): 19 | async def test_initialize_database(self): 20 | from osd2f.database import initialize_database 21 | 22 | # we use a file simply because we want to access the same database 23 | # in the test as in the app context 24 | db_file = "test_temp" 25 | db_url = f"sqlite://{db_file}" 26 | 27 | await initialize_database(db_url=db_url) 28 | 29 | c = sqlite3.connect(db_file) 30 | 31 | # check if the submissions table can be queried 32 | c.execute("SELECT * FROM submissions").fetchall() 33 | 34 | os.remove(db_file) 35 | os.remove(db_file + "-shm") 36 | os.remove(db_file + "-wal") 37 | 38 | await stop_database() 39 | 40 | async def test_stop_database(self): 41 | close_mock = AsyncMock() 42 | with patch("tortoise.Tortoise.close_connections", close_mock): 43 | from osd2f.database import stop_database 44 | 45 | await stop_database() 46 | self.assertTrue(await close_mock.is_called()) 47 | 48 | 49 | class DatabaseInsertTest(AsyncTestCase): 50 | async def test_insert_submission(self): 51 | from osd2f.config import Testing 52 | from osd2f.definitions import Submission 53 | from osd2f.database import ( 54 | DBSubmission, 55 | 
insert_submission, 56 | initialize_database, 57 | stop_database, 58 | ) 59 | 60 | await initialize_database(Testing.DB_URL) 61 | 62 | nfiles = 10 63 | nentries = 10 64 | 65 | submissions = [ 66 | Submission( 67 | submission_id=f"testing-{i}", 68 | filename=f"testing_{i}.json", 69 | n_deleted=2, 70 | entries=[{"entry": ii, "text": "here"} for ii in range(nentries)], 71 | ) 72 | for i in range(nfiles) 73 | ] 74 | 75 | for sub in submissions: 76 | await insert_submission(sub) 77 | 78 | self.assertEqual(await DBSubmission.all().count(), nfiles * nentries) 79 | self.assertEqual( 80 | await DBSubmission.filter(n_deleted=2).count(), nfiles * nentries 81 | ) 82 | 83 | await stop_database() 84 | 85 | 86 | class UploadSubmissionTest(AsyncTestCase): 87 | async def test_upload_submission(self): 88 | from osd2f.definitions import Submission, SubmissionList 89 | 90 | sublist_db_mock = AsyncMock() 91 | 92 | nfiles = 10 93 | nentries = 10 94 | 95 | submissions = SubmissionList( 96 | [ 97 | Submission( 98 | submission_id=f"testing-{i}", 99 | filename=f"testing_{i}.json", 100 | n_deleted=10, 101 | entries=[{"entry": ii, "text": "here"} for ii in range(nentries)], 102 | ) 103 | for i in range(nfiles) 104 | ] 105 | ) 106 | 107 | with patch("osd2f.server.database.insert_submission_list", sublist_db_mock): 108 | from osd2f import server 109 | 110 | testclient = server.app.test_client() 111 | r = await testclient.post("/upload", data=submissions.model_dump_json()) 112 | assert r.status_code == 200 113 | 114 | sublist_db_mock.assert_called_once_with(submissionlist=submissions) 115 | 116 | 117 | class LogInsertTest(AsyncTestCase): 118 | async def test_log_insert(self): 119 | from osd2f.database import initialize_database, insert_log 120 | 121 | # we use a file simply because we want to access the same database 122 | # in the test as in the app context 123 | db_file = "test_temp" 124 | db_url = f"sqlite://{db_file}" 125 | 126 | await initialize_database(db_url=db_url) 127 | 128 | await 
insert_log("backend", "INFO", "position") 129 | await insert_log("backend", "INFO", "position") 130 | await insert_log("backend", "INFO", "position", "sid_string") 131 | await insert_log( 132 | "backend", "INFO", "position", "sid_string2", {"thing": "value"} 133 | ) 134 | 135 | c = sqlite3.connect(db_file) 136 | 137 | # check if the submissions table received the inserts, 138 | # because they are non-blocking, we'll have to just 139 | # wait a bit 140 | r = [] 141 | for i in range(100): 142 | r = c.execute("SELECT * FROM osd2f_logs").fetchall() 143 | if len(r) == 4: 144 | break 145 | await asyncio.sleep(0.01) 146 | 147 | assert r, ValueError("No(t all) records returned") 148 | 149 | assert ( 150 | len(c.execute("SELECT * FROM osd2f_logs WHERE log_sid IS NULL").fetchall()) 151 | == 2 152 | ) 153 | assert ( 154 | len( 155 | c.execute( 156 | "SELECT * FROM osd2f_logs WHERE log_sid IS NOT NULL" 157 | ).fetchall() 158 | ) 159 | == 2 160 | ) 161 | assert ( 162 | len( 163 | c.execute( 164 | "SELECT * FROM osd2f_logs WHERE log_entry IS NOT NULL" 165 | ).fetchall() 166 | ) 167 | == 1 168 | ) 169 | c.close() 170 | 171 | os.remove(db_file) 172 | os.remove(db_file + "-shm") 173 | os.remove(db_file + "-wal") 174 | 175 | await stop_database() 176 | 177 | 178 | class LoggerToDBTest(AsyncTestCase): 179 | async def test_log_to_db(self): 180 | from osd2f.database import initialize_database, add_database_logging 181 | from osd2f.logger import logger 182 | 183 | # we use a file simply because we want to access the same database 184 | # in the test as in the app context 185 | db_file = "test_temp2" 186 | db_url = f"sqlite://{db_file}" 187 | 188 | await initialize_database(db_url=db_url) 189 | 190 | logger.setLevel("DEBUG") 191 | 192 | q = add_database_logging() 193 | 194 | logger.debug("seen debug") 195 | logger.info("seen info") 196 | logger.warning("seen warning") 197 | logger.critical("seen critical") 198 | 199 | q.put("stop") 200 | 201 | c = sqlite3.connect(db_file) 202 | 203 | r = 
[] 204 | for i in range(100): 205 | r = c.execute("SELECT * FROM osd2f_logs").fetchall() 206 | if len(r) == 4: 207 | break 208 | time.sleep(0.01) 209 | 210 | os.remove(db_file) 211 | os.remove(db_file + "-shm") 212 | os.remove(db_file + "-wal") 213 | 214 | await stop_database() 215 | -------------------------------------------------------------------------------- /tests/download_data_protection_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest.async_case import IsolatedAsyncioTestCase 3 | 4 | from osd2f.server import create_app 5 | 6 | 7 | class TestPasswordProtectedDownloads(IsolatedAsyncioTestCase): 8 | async def test_password_protected_downloads(self): 9 | from osd2f.security.authorization import USER_FIELD 10 | 11 | testapp = create_app( 12 | data_password_override="testpassword", app_secret_override="testsecret" 13 | ) 14 | await testapp.startup() 15 | 16 | # set placeholder to trigger authorization 17 | os.environ["MSAL_CONFIG"] = "placeholder" 18 | 19 | tc = testapp.test_client() 20 | 21 | # set cookie to avoid real MSAL flow 22 | async with tc.session_transaction() as session: 23 | session[USER_FIELD] = "testuser" 24 | 25 | r = await tc.get("/researcher/osd2f_completed_submissions.json.zip") 26 | assert r.status_code == 200 27 | 28 | os.environ.pop("MSAL_CONFIG") 29 | 30 | await testapp.shutdown() 31 | -------------------------------------------------------------------------------- /tests/initialization_test.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, mock 2 | 3 | 4 | class InitializationTests(TestCase): 5 | def test_production_init_without_secret(self): 6 | from osd2f.server import create_app, config 7 | 8 | config.Production.SECRET_KEY = None 9 | 10 | self.assertRaises(Exception, create_app, mode="Production") 11 | 12 | def test_production_init_without_database(self): 13 | from osd2f.server import create_app, 
config 14 | 15 | config.Production.SECRET_KEY = "not none" 16 | 17 | self.assertRaises(Exception, create_app, mode="Production") 18 | 19 | def test_production_init_with_secret_and_db(self): 20 | # must be set before import 21 | from osd2f.server import app, config, create_app 22 | 23 | config.Production.SECRET_KEY = "not none" 24 | config.Production.DB_URL = "sqlite:memory" 25 | 26 | app.run = mock.Mock() 27 | create_app(mode="Production") 28 | config.Production.DB_URL = None 29 | -------------------------------------------------------------------------------- /tests/sample_anonymizer_test.py: -------------------------------------------------------------------------------- 1 | from aiounittest import AsyncTestCase 2 | 3 | 4 | class test_redact_text(AsyncTestCase): 5 | def test_in_options(self): 6 | from osd2f.anonymizers.sample_platform import redact_text 7 | from osd2f.anonymizers import options 8 | 9 | self.assertTrue(redact_text.__name__ in options) 10 | 11 | async def test_parses_title(self): 12 | from osd2f.anonymizers.sample_platform import redact_text 13 | 14 | user = "henk" 15 | correspondent = "arie" 16 | title = f"{user} wrote on {correspondent}'s timeline." 17 | 18 | entry = {"title": title} 19 | redacted = await redact_text(entry) 20 | 21 | self.assertIsNotNone(redacted) 22 | self.assertFalse(user in redacted["title"]) 23 | self.assertFalse(correspondent in redacted["title"]) 24 | 25 | async def test_parses_post(self): 26 | from osd2f.anonymizers.sample_platform import redact_text 27 | 28 | user = "henk" 29 | correspondent = "arie" 30 | title = f"{user} wrote on {correspondent}'s timeline." 31 | post = f"Hey {correspondent}, how's life? missing you! 
-{user}" 32 | entry = {"title": title, "data": [{"post": post}]} 33 | 34 | redacted = await redact_text(entry) 35 | 36 | self.assertIsNotNone(redacted) 37 | self.assertFalse(user in redacted["data"][0]["post"]) 38 | self.assertFalse(correspondent in redacted["data"][0]["post"]) 39 | -------------------------------------------------------------------------------- /tests/sample_data_generator_test.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import pathlib 4 | import re 5 | import shutil 6 | from unittest import TestCase 7 | 8 | 9 | class MinimalSampleGeneratorTest(TestCase): 10 | """Very high-level test of the sample data generator script. 11 | 12 | The Sample script is basically a set of assumptions written as code. As 13 | such, a full set of tests is not really worthwhile (basically, they would just 14 | be a repetition of the assumptions). 15 | 16 | Instead, we check whether 17 | 1. the script is importable and runnable 18 | 2. the expected files are generated 19 | 3. 
generated structure matches that of mock data 20 | 21 | 22 | """ 23 | 24 | def test_sample_generator_import(self): 25 | from scripts import sample_data_generator # noqa 26 | 27 | def test_sample_generator_output(self): 28 | from scripts.sample_data_generator import generate_bundle 29 | 30 | testdir = "temp_test_data" 31 | self.assertFalse(pathlib.posixpath.exists(testdir)) 32 | generate_bundle( 33 | testdir, 34 | overwrite=False, 35 | include_tar_variant=True, 36 | include_targz_variant=True, 37 | include_zip_variant=True, 38 | indents=2, 39 | n_companies_followed=10, 40 | n_engagement=10, 41 | n_comments=10, 42 | n_ads_clicked=10, 43 | n_post_files=2, 44 | n_profile_interests=10, 45 | n_posts=10, 46 | n_short_messages=10, 47 | ) 48 | 49 | self.assertTrue(glob.glob("temp_test_data/README.md")) 50 | self.assertTrue(glob.glob("temp_test_data/sample-*.zip")) 51 | self.assertTrue(glob.glob("temp_test_data/sample-*.tar.gz")) 52 | self.assertTrue(glob.glob("temp_test_data/sample-*.tar")) 53 | self.assertTrue(glob.glob("temp_test_data/sample-*/posts/posts_0.json")) 54 | self.assertTrue(glob.glob("temp_test_data/sample-*/posts/posts_1.json")) 55 | self.assertTrue(glob.glob("temp_test_data/sample-*/engagement/engagement.json")) 56 | self.assertTrue( 57 | glob.glob("temp_test_data/sample-*/short_messages/messages.json") 58 | ) 59 | self.assertTrue( 60 | glob.glob( 61 | "temp_test_data/sample-*/profile_interests/profile_interests.json" 62 | ) 63 | ) 64 | self.assertTrue(glob.glob("temp_test_data/sample-*/comments/comments.json")) 65 | self.assertTrue( 66 | glob.glob("temp_test_data/sample-*/ads_clicked/ads_clicked.json") 67 | ) 68 | self.assertTrue( 69 | glob.glob( 70 | "temp_test_data/sample-*/companies_followed/companies_followed.json" 71 | ) 72 | ) 73 | 74 | shutil.rmtree(testdir) 75 | 76 | def test_sample_mockdata_format_equal_to_script_output(self): 77 | from scripts.sample_data_generator import generate_bundle 78 | 79 | base_testdir = "temp_test_data" 80 | testdir = 
os.path.join(base_testdir, "sample") 81 | self.assertFalse(pathlib.posixpath.exists(testdir)) 82 | generate_bundle( 83 | testdir, 84 | overwrite=False, 85 | include_tar_variant=True, 86 | include_targz_variant=True, 87 | include_zip_variant=True, 88 | indents=2, 89 | n_companies_followed=20, 90 | n_engagement=20, 91 | n_comments=10, 92 | n_ads_clicked=10, 93 | n_post_files=2, 94 | n_profile_interests=10, 95 | n_posts=10, 96 | n_short_messages=10, 97 | ) 98 | 99 | sample_mockdata_paths = glob.glob("mockdata/sample/**", recursive=True) 100 | testdir_paths = glob.glob(os.path.join(testdir, "**"), recursive=True) 101 | 102 | gp = re.compile("(sample-[A-z-0-9]*)") 103 | 104 | def generalized(ps): 105 | return [gp.sub("sample-*/", p).split("/", 1)[1] for p in ps] 106 | 107 | self.assertListEqual( 108 | sorted(generalized(sample_mockdata_paths)), 109 | sorted(generalized(testdir_paths)), 110 | ) 111 | 112 | shutil.rmtree(base_testdir) 113 | -------------------------------------------------------------------------------- /tests/security_entry_test.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import importlib 3 | from unittest.mock import AsyncMock, Mock, patch 4 | 5 | from aiounittest.case import AsyncTestCase 6 | 7 | from osd2f.database.submissions import ( 8 | get_submissions, 9 | insert_submission, 10 | insert_submission_list, 11 | ) 12 | 13 | 14 | class ConfigTest(AsyncTestCase): 15 | def test_cli_override(self): 16 | from osd2f.server import create_app 17 | 18 | m = Mock() 19 | with patch("osd2f.server.SecureEntry.set_secret", m): 20 | create_app(entry_secret_override="entry_override") 21 | m.assert_called_once_with(secret="entry_override") 22 | 23 | def test_env_var_use(self): 24 | import osd2f.config 25 | from osd2f.server import create_app 26 | 27 | m = Mock() 28 | with patch( 29 | "osd2f.config._os.environ", {"OSD2F_ENTRY_SECRET": "another_secret"} 30 | ), patch("osd2f.server.SecureEntry.set_secret", 
m): 31 | # force reload to trigger new processing of 32 | # env variables 33 | importlib.reload(osd2f.config) 34 | 35 | create_app() 36 | m.assert_called_once_with(secret="another_secret") 37 | 38 | def test_env_var_override(self): 39 | import osd2f.config 40 | from osd2f.server import create_app 41 | 42 | m = Mock() 43 | with patch( 44 | "osd2f.config._os.environ", {"OSD2F_ENTRY_SECRET": "another_secret"} 45 | ), patch("osd2f.server.SecureEntry.set_secret", m): 46 | # force reload to trigger new processing of 47 | # env variables 48 | importlib.reload(osd2f.config) 49 | 50 | create_app(entry_secret_override="entry_override") 51 | m.assert_called_once_with(secret="entry_override") 52 | 53 | 54 | class SecureEntryTest(AsyncTestCase): 55 | def test_without_secret(self): 56 | from osd2f.security.entry_encryption.secure_entry_singleton import SecureEntry 57 | 58 | SecureEntry.set_secret("") 59 | 60 | entry = {"stuff": "is unsafe"} 61 | unencrypted = SecureEntry.write_entry_field(entry.copy()) 62 | 63 | self.assertEqual(entry, unencrypted) 64 | 65 | loaded_entry = SecureEntry.read_entry_field(entry.copy()) 66 | self.assertEqual(entry, loaded_entry) 67 | 68 | def test_with_secret(self): 69 | from osd2f.security.entry_encryption.secure_entry_singleton import SecureEntry 70 | 71 | SecureEntry.set_secret("secret") 72 | entry = {"stuff": "is safe"} 73 | 74 | encrypted = SecureEntry.write_entry_field(entry.copy()) 75 | self.assertIsNotNone(encrypted.get("encrypted")) 76 | self.assertEqual(entry, SecureEntry.read_entry_field(encrypted)) 77 | 78 | def test_consistent_key(self): 79 | from osd2f.security.entry_encryption.secure_entry_singleton import SecureEntry 80 | 81 | m = {"thing": "to encrypt"} 82 | SecureEntry.set_secret("secret") 83 | e = SecureEntry.write_entry_field(m.copy()) 84 | SecureEntry.set_secret("secret") 85 | m2 = SecureEntry.read_entry_field(e) 86 | 87 | self.assertEqual(m, m2) 88 | 89 | 90 | class DatabaseOperationsTest(AsyncTestCase): 91 | async def 
# ------------------------------------------------------------------------------
# tests/db_interaction_test.py (tail — the file and its enclosing test class
# begin before this chunk; the `async def` header of the first method below is
# reconstructed from the `await` calls in its body — TODO confirm against the
# full file)
# ------------------------------------------------------------------------------

    async def test_insert_submission(self):
        """A single submission insert must route each entry through SecureEntry."""
        from osd2f.definitions.submissions import Submission

        class MockSecureEntry:
            pass

        MockSecureEntry.read_entry_field = Mock()
        MockSecureEntry.write_entry_field = Mock()

        with patch("osd2f.database.submissions.SecureEntry", MockSecureEntry), patch(
            "osd2f.database.DBSubmission.create", AsyncMock()
        ):
            s = Submission(
                submission_id="id",
                filename="file",
                entries=[{"thing": "here"}],
                n_deleted=0,
            )
            await insert_submission(s)
            MockSecureEntry.write_entry_field.assert_called_once_with(s.entries[0])

    async def test_insert_submission_list(self):
        """Bulk submission inserts must also route entries through SecureEntry."""
        from osd2f.definitions.submissions import Submission, SubmissionList

        class MockSecureEntry:
            pass

        MockSecureEntry.read_entry_field = Mock()
        MockSecureEntry.write_entry_field = Mock()

        async def mock_bulk_create(objects):
            # consume the (possibly lazy) iterable, as the real bulk_create would
            for i in objects:
                pass

        with patch("osd2f.database.submissions.SecureEntry", MockSecureEntry), patch(
            "osd2f.database.DBSubmission.bulk_create", mock_bulk_create
        ):
            s = Submission(
                submission_id="id",
                filename="file",
                entries=[{"thing": "here"}],
                n_deleted=0,
            )
            await insert_submission_list(SubmissionList([s]))
            MockSecureEntry.write_entry_field.assert_called_once_with(s.entries[0])

    async def test_get_submission(self):
        """Reading submissions back must decode each entry via SecureEntry."""
        from osd2f.database.submissions import DBSubmission

        class MockSecureEntry:
            pass

        s = DBSubmission(
            id=5,
            submission_id="id",
            filename="file",
            entry={"thing": "here"},
            n_deleted=0,
            insert_timestamp=datetime.datetime.now(),
            update_timestamp=datetime.datetime.now(),
        )

        MockSecureEntry.read_entry_field = Mock(return_value=s)
        MockSecureEntry.write_entry_field = Mock()

        # FIX: the original additionally did `DBSubmission.all = AsyncMock(...)`
        # as a bare class-attribute assignment before entering the patch
        # context.  That assignment was immediately shadowed by the patch below
        # AND, unlike the patch, was never undone — leaking the mock into every
        # later test in the session.  The patch alone is correct.
        with patch("osd2f.database.submissions.SecureEntry", MockSecureEntry), patch(
            "osd2f.database.DBSubmission.all", AsyncMock(return_value=[s])
        ):
            await get_submissions()
            MockSecureEntry.read_entry_field.assert_called_once()

# ------------------------------------------------------------------------------
# tests/security_secrets_test.py
# ------------------------------------------------------------------------------

from importlib import reload
from unittest.mock import Mock, patch

from aiounittest.case import AsyncTestCase


class SecretResolverTest(AsyncTestCase):
    def test_load_with_config(self):
        """Importing the config module must trigger env-var secret translation."""
        m = Mock()

        with patch("osd2f.security.translate_environment_vars", m):
            import osd2f.config  # imported for side-effect

            # reloaded in case the module was already in cache
            # due to another test
            reload(osd2f.config)

        m.assert_called()  # might be called more than once, depending on cache

    def test_azure_keyvault_env_translation(self):
        """Only env vars carrying the keyvault prefix are resolved in-place."""

        def m(s):
            return "resolved" + s

        import os

        from osd2f.security.secrets import azure_keyvault

        os.environ["azure_secret"] = f"{azure_keyvault.PREFIX}::test-keyvault::value"

        # deliberately malformed prefix; must be left untouched
        other_secret = "another-secret::somehwere::key"
        os.environ["not_azure_secret"] = other_secret

        with patch("osd2f.security.RESOLVERS", {azure_keyvault.PREFIX: m}):
            from osd2f.security import translate_environment_vars

            translate_environment_vars()

        # azure key should be resolved
        assert os.environ["azure_secret"].startswith("resolved")
        # non azure key should not be resolved
        assert os.environ["not_azure_secret"] == other_secret

    def test_azure_keyvault_var_translation(self):
        """translate_value resolves single values with a known prefix only."""

        def m(s):
            return "resolved" + s

        from osd2f.security.secrets import azure_keyvault

        secret = f"{azure_keyvault.PREFIX}::test-keyvault::value"
        other_secret = "another-secret::somehwere::key"

        with patch("osd2f.security.RESOLVERS", {azure_keyvault.PREFIX: m}):
            from osd2f.security import translate_value

            resolved_secret = translate_value(secret)
            unresolved_secret = translate_value(other_secret)

        # azure key should be resolved
        assert resolved_secret.startswith("resolved")
        # non azure key should not be resolved
        assert unresolved_secret == other_secret

# ------------------------------------------------------------------------------
# tests/utils_settings_test.py
# ------------------------------------------------------------------------------

from unittest import TestCase
from unittest.mock import Mock, patch


class test_util_settings_loader(TestCase):
    def test_settings_caching(self):
        """Repeated loads with caching enabled hit the disk exactly once."""
        disk_load = Mock()
        with patch("osd2f.utils._load_settings_from_disk", disk_load):
            from osd2f.utils import load_upload_settings

            load_upload_settings()
            load_upload_settings()
            # FIX: the original wrote
            #   self.assertTrue(disk_load.assert_called_once)
            # which asserts the truthiness of the *bound method object*
            # (always True) — the test could never fail.  The assertion
            # method must actually be invoked.
            disk_load.assert_called_once()

    def test_settings_without_caching(self):
        """With caching disabled (force_disk=True), every load hits the disk."""
        disk_load = Mock()
        with patch("osd2f.utils._load_settings_from_disk", disk_load):
            from osd2f.utils import load_upload_settings

            load_upload_settings(True)
            load_upload_settings(True)
            # assertEqual reports the actual count on failure, unlike
            # assertTrue on a boolean expression
            self.assertEqual(disk_load.call_count, 2)

// -----------------------------------------------------------------------------
// webpack.config.js
// -----------------------------------------------------------------------------

const path = require('path')
const CopyWebpackPlugin = require('copy-webpack-plugin')
const {VueLoaderPlugin} = require('vue-loader')

module.exports = {
  module: {
    rules: [
      {
        // FIX: the dot was unescaped (/.css$/i), so the rule matched any
        // filename ending in "css" preceded by one arbitrary character
        // (e.g. "style.scss") — escape it to match the ".css" extension only.
        test: /\.css$/i,
        use: ['style-loader', 'css-loader']
      },
      {
        test: /\.vue$/,
        loader: 'vue-loader'
      }
    ]
  },

  // single entry point: the donation upload bundle
  entry: path.resolve(__dirname, 'osd2f', 'javascript', 'file_upload.js'),

  output: {
    filename: 'main.js',
    // expose the bundle on the page as `window.file_upload`
    library: 'file_upload',
    libraryTarget: 'window',
    path: path.resolve(__dirname, 'osd2f', 'static', 'js')
  },

  resolve: {
    alias: {
      vue$: 'vue/dist/vue.esm.js'
    },
    extensions: ['*', '.js', '.vue', '.json']
  },

  plugins: [
    new VueLoaderPlugin(),
    new CopyWebpackPlugin({
      patterns: [
        {
          // libarchive requires the distribution bundles to be available
          // for the web worker.
          from: path.resolve(
            __dirname,
            'node_modules',
            'libarchive.js',
            'dist'
          ),
          to: path.resolve(__dirname, 'osd2f', 'static', 'js', 'libarchive')
        }
      ]
    })
  ]
}