├── .dockerignore ├── .github ├── CODEOWNERS └── workflows │ ├── ci.yml │ └── docker.yaml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── LICENSE-COMMERCIAL ├── README.md ├── build_rust.py ├── config.sample.yaml ├── docker └── Dockerfile ├── docs └── api.md ├── mypy.ini ├── perf ├── .gitignore ├── config.yaml ├── dummy_scan.sh └── scanner_perf.py ├── poetry.lock ├── pyproject.toml ├── rust ├── Cargo.toml ├── build.rs └── src │ ├── crypto │ └── mod.rs │ └── lib.rs ├── scripts-dev └── lint.sh ├── src └── matrix_content_scanner │ ├── __init__.py │ ├── config.py │ ├── httpserver.py │ ├── logutils.py │ ├── mcs.py │ ├── mcs_rust │ ├── __init__.pyi │ └── crypto.pyi │ ├── py.typed │ ├── scanner │ ├── __init__.py │ ├── file_downloader.py │ └── scanner.py │ ├── servlets │ ├── __init__.py │ ├── download.py │ ├── public_key.py │ ├── scan.py │ └── thumbnail.py │ └── utils │ ├── __init__.py │ ├── constants.py │ ├── encrypted_file_metadata.py │ ├── errors.py │ ├── rust.py │ └── types.py ├── tests ├── __init__.py ├── scanner │ ├── __init__.py │ ├── test_file_downloader.py │ └── test_scanner.py ├── servlets │ ├── __init__.py │ ├── test_scan.py │ └── test_servlets.py ├── test_crypto.py ├── testutils.py └── utils │ ├── __init__.py │ └── test_encrypted_file_metadata.py └── tox.ini /.dockerignore: -------------------------------------------------------------------------------- 1 | # ignore everything by default 2 | * 3 | 4 | # things to include 5 | !src 6 | !README.md 7 | !pyproject.toml 8 | !setup.cfg 9 | !poetry.lock 10 | !Cargo.toml 11 | !Cargo.lock 12 | !build_rust.py 13 | !rust 14 | 15 | **/__pycache__ 16 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | 2 | # Automatically request reviews from the synapse-core team when a pull request comes in. 
3 | * @element-hq/synapse-core 4 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Linting and Tests 2 | on: 3 | push: 4 | branches: ["main"] 5 | pull_request: 6 | 7 | jobs: 8 | check-code-style: 9 | name: Check code style 10 | runs-on: ubuntu-latest 11 | steps: 12 | - run: sudo apt-get install -y libmagic1 13 | - uses: actions/checkout@v4 14 | - name: Install Rust 15 | uses: dtolnay/rust-toolchain@stable 16 | 17 | - name: Setup Poetry 18 | uses: matrix-org/setup-python-poetry@v1 19 | with: 20 | install-project: "false" 21 | 22 | - name: Run ruff check 23 | run: poetry run ruff check --output-format=github . 24 | 25 | - name: Run ruff format 26 | run: poetry run ruff format --check . 27 | 28 | check-types: 29 | name: Check types with Mypy 30 | runs-on: ubuntu-latest 31 | steps: 32 | - run: sudo apt-get install -y libmagic1 33 | - uses: actions/checkout@v4 34 | - name: Install Rust 35 | uses: dtolnay/rust-toolchain@1.82.0 36 | - uses: Swatinem/rust-cache@v2 37 | 38 | - name: Setup Poetry 39 | uses: matrix-org/setup-python-poetry@v1 40 | with: 41 | # We have seen odd mypy failures that were resolved when we started 42 | # installing the project again: 43 | # https://github.com/matrix-org/synapse/pull/15376#issuecomment-1498983775 44 | # To make CI green, err towards caution and install the project. 
45 | install-project: "true" 46 | 47 | # Cribbed from 48 | # https://github.com/AustinScola/mypy-cache-github-action/blob/85ea4f2972abed39b33bd02c36e341b28ca59213/src/restore.ts#L10-L17 49 | - name: Restore/persist mypy's cache 50 | uses: actions/cache@v4 51 | with: 52 | path: | 53 | .mypy_cache 54 | key: mypy-cache-${{ github.sha }} 55 | restore-keys: mypy-cache- 56 | 57 | - name: Run mypy 58 | run: poetry run mypy 59 | 60 | 61 | unit-tests: 62 | name: Unit tests 63 | runs-on: ubuntu-latest 64 | strategy: 65 | matrix: 66 | # Run the unit tests both against our oldest supported Python version 67 | # and the newest stable. 68 | python_version: [ "3.10", "3.12" ] 69 | steps: 70 | - run: sudo apt-get install -y libmagic1 71 | - uses: actions/checkout@v4 72 | - name: Install Rust 73 | uses: dtolnay/rust-toolchain@stable 74 | - uses: actions/setup-python@v2 75 | with: 76 | python-version: ${{ matrix.python_version }} 77 | - run: python -m pip install tox "poetry==1.8.3" 78 | - run: tox -e py 79 | -------------------------------------------------------------------------------- /.github/workflows/docker.yaml: -------------------------------------------------------------------------------- 1 | # GitHub actions workflow which builds and publishes the docker images. 
2 | 3 | name: Build docker images 4 | 5 | on: 6 | push: 7 | tags: ["v*"] 8 | workflow_dispatch: # A build was manually requested 9 | 10 | permissions: 11 | contents: read 12 | id-token: write # needed for signing the images with GitHub OIDC Token 13 | 14 | jobs: 15 | build: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Install Cosign 19 | uses: sigstore/cosign-installer@v3 20 | 21 | - name: Log in to DockerHub 22 | uses: docker/login-action@v2 23 | with: 24 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 25 | password: ${{ secrets.DOCKER_HUB_TOKEN }} 26 | 27 | - name: Calculate docker image tag 28 | id: set-tag 29 | uses: docker/metadata-action@master 30 | with: 31 | images: vectorim/matrix-content-scanner 32 | tags: | 33 | type=raw,value=latest 34 | type=pep440,pattern={{raw}} 35 | 36 | - name: Build and push all platforms 37 | id: build-and-push 38 | uses: docker/build-push-action@v3 39 | with: 40 | push: true 41 | labels: "gitsha1=${{ github.sha }}" 42 | tags: "${{ steps.set-tag.outputs.tags }}" 43 | file: "docker/Dockerfile" 44 | platforms: linux/amd64 45 | 46 | - name: Sign the images with GitHub OIDC Token 47 | env: 48 | DIGEST: ${{ steps.build-and-push.outputs.digest }} 49 | TAGS: ${{ steps.set-tag.outputs.tags }} 50 | run: | 51 | images="" 52 | for tag in ${TAGS}; do 53 | images+="${tag}@${DIGEST} " 54 | done 55 | cosign sign --yes ${images} 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | /.venv 3 | *.egg-info 4 | /.envrc 5 | /.tox 6 | _trial_temp 7 | __pycache__ 8 | /dist 9 | config.yaml 10 | /build 11 | /.vscode 12 | mcs_pickle.txt 13 | pickle 14 | 15 | # Poetry will create a setup.py, which we don't want to include. 
16 | /setup.py 17 | 18 | # rust 19 | /target/ 20 | /src/matrix_content_scanner/*.so 21 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | # We make the whole root folder a workspace so that we can run `cargo` 2 | # commands from the root (rather than having to cd into rust/). 3 | 4 | [workspace] 5 | members = ["rust"] 6 | resolver = "2" 7 | -------------------------------------------------------------------------------- /LICENSE-COMMERCIAL: -------------------------------------------------------------------------------- 1 | Licensees holding a valid commercial license with Element may use this 2 | software in accordance with the terms contained in a written agreement 3 | between you and Element. 4 | 5 | To purchase a commercial license please contact our sales team at 6 | licensing@element.io 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Matrix Content Scanner 2 | 3 | A web service for scanning media hosted on a Matrix media repository. 4 | 5 | ## Installation 6 | 7 | This project requires libmagic to be installed on the system. On Debian/Ubuntu: 8 | 9 | ```commandline 10 | sudo apt install libmagic1 11 | ``` 12 | 13 | Then, preferably in a virtual environment, install the Matrix Content Scanner: 14 | 15 | ```commandline 16 | pip install matrix-content-scanner 17 | ``` 18 | 19 | ## Usage 20 | 21 | Copy and edit the [sample configuration file](https://github.com/matrix-org/matrix-content-scanner-python/blob/main/config.sample.yaml). 22 | Each key is documented in this file. 
23 | 24 | Then run the content scanner (from within your virtual environment if one was created): 25 | 26 | ```commandline 27 | python -m matrix_content_scanner.mcs -c CONFIG_FILE 28 | ``` 29 | 30 | Where `CONFIG_FILE` is the path to your configuration file. 31 | 32 | ## Docker 33 | 34 | This project provides a Docker image to run it, published as 35 | `vectorim/matrix-content-scanner`. 36 | 37 | To use it, copy the [sample configuration file](/config.sample.yaml) into a dedicated 38 | directory, edit it accordingly with your requirements, and then mount this directory as 39 | `/data` in the image. Do not forget to also publish the port that the content scanner's 40 | Web server is configured to listen on. 41 | 42 | For example, assuming the port for the Web server is `8080`: 43 | 44 | ```shell 45 | docker run -p 8080:8080 -v /path/to/your/config/directory:/data vectorim/matrix-content-scanner 46 | ``` 47 | 48 | ## API 49 | 50 | See [the API documentation](/docs/api.md) for information about how clients are expected 51 | to interact with the Matrix Content Scanner. 52 | 53 | ## Migrating from the [legacy Matrix Content Scanner](https://github.com/matrix-org/matrix-content-scanner) 54 | 55 | Because it uses the same APIs and Olm pickle format as the legacy Matrix Content Scanner, 56 | this project can be used as a drop-in replacement. The only change (apart from the 57 | deployment instructions) is the configuration format: 58 | 59 | * the `server` section is renamed `web` 60 | * `scan.tempDirectory` is renamed `scan.temp_directory` 61 | * `scan.baseUrl` is renamed `download.base_homeserver_url` (and becomes optional) 62 | * `scan.doNotCacheExitCodes` is renamed `result_cache.exit_codes_to_ignore` 63 | * `scan.directDownload` is removed. Direct download always happens when `download.base_homeserver_url` 64 | is absent from the configuration file, and setting a value for it will always cause files to be 65 | downloaded from the server configured. 
66 | * `proxy` is renamed `download.proxy` 67 | * `middleware.encryptedBody.pickleKey` is renamed `crypto.pickle_key` 68 | * `middleware.encryptedBody.picklePath` is renamed `crypto.pickle_path` 69 | * `acceptedMimeType` is renamed `scan.allowed_mimetypes` 70 | * `requestHeader` is renamed `download.additional_headers` and turned into a dictionary. 71 | 72 | Note that the format of the cryptographic pickle file and key are compatible between 73 | this project and the legacy Matrix Content Scanner. If no file exists at that path one will 74 | be created automatically. 75 | 76 | ## Development 77 | 78 | In a virtual environment with poetry (>=1.8.3) installed, run 79 | ```shell 80 | poetry install 81 | ``` 82 | 83 | To run the unit tests, you can use: 84 | ```shell 85 | tox -e py 86 | ``` 87 | 88 | To run the linters and `mypy` type checker, use `./scripts-dev/lint.sh`. 89 | 90 | 91 | ## Releasing 92 | 93 | The exact steps for releasing will vary; but this is an approach taken by the 94 | Synapse developers (assuming a Unix-like shell): 95 | 96 | 1. Set a shell variable to the version you are releasing (this just makes 97 | subsequent steps easier): 98 | ```shell 99 | version=X.Y.Z 100 | ``` 101 | 102 | 2. Update `setup.cfg` so that the `version` is correct. 103 | 104 | 3. Stage the changed files and commit. 105 | ```shell 106 | git add -u 107 | git commit -m v$version -n 108 | ``` 109 | 110 | 4. Push your changes. 111 | ```shell 112 | git push 113 | ``` 114 | 115 | 5. When ready, create a signed tag for the release: 116 | ```shell 117 | git tag -s v$version 118 | ``` 119 | Base the tag message on the changelog. 120 | 121 | 6. Push the tag. 122 | ```shell 123 | git push origin tag v$version 124 | ``` 125 | 126 | 7. Create a *release*, based on the tag you just pushed, on GitHub or GitLab. 127 | 128 | 8. 
Create a source distribution and upload it to PyPI: 129 | ```shell 130 | python -m build 131 | twine upload dist/matrix_content_scanner-$version* 132 | ``` 133 | -------------------------------------------------------------------------------- /build_rust.py: -------------------------------------------------------------------------------- 1 | # A build script for poetry that adds the rust extension. 2 | 3 | import os 4 | from typing import Any, Dict 5 | 6 | from setuptools_rust import Binding, RustExtension 7 | 8 | 9 | def build(setup_kwargs: Dict[str, Any]) -> None: 10 | original_project_dir = os.path.dirname(os.path.realpath(__file__)) 11 | cargo_toml_path = os.path.join(original_project_dir, "rust", "Cargo.toml") 12 | 13 | extension = RustExtension( 14 | target="matrix_content_scanner.mcs_rust", 15 | path=cargo_toml_path, 16 | binding=Binding.PyO3, 17 | py_limited_api=True, 18 | debug=False, 19 | ) 20 | setup_kwargs.setdefault("rust_extensions", []).append(extension) 21 | setup_kwargs["zip_safe"] = False 22 | -------------------------------------------------------------------------------- /config.sample.yaml: -------------------------------------------------------------------------------- 1 | # Configuration file template for the Matrix Content Scanner. 2 | # 3 | # Supported time units: 4 | # * ms, millisecond, milliseconds 5 | # * s, sec, secs, second, seconds 6 | # * m, min, mins, minute, minutes 7 | # * h, hour, hours 8 | # * d, day, days 9 | # * w, week, weeks 10 | # * y, year, years 11 | # If no unit is given, "seconds" are implied. 12 | # 13 | # Examples of supported size units can be found here: https://humanfriendly.readthedocs.io/en/latest/api.html#humanfriendly.parse_size 14 | # Size units use a decimal base, so 1KB means 1000 bytes, while 1KiB means 1024 bytes. 15 | 16 | # Configuration for hosting the HTTP(S) API. 17 | web: 18 | host: 127.0.0.1 19 | port: 8080 20 | 21 | # Configuration for scanning files. 
22 | scan: 23 | # The script to run to scan a file. This script will be called with a path to the 24 | # downloaded file as its only argument, e.g. "./example.sh /temp/foo.bar/my_file". 25 | # Required. 26 | script: "./example.sh" 27 | 28 | # Directory in which to download files for scanning. Each file downloaded is removed 29 | # after the scan has completed. 30 | # Required. 31 | temp_directory: "/tmp" 32 | 33 | # Command to run to remove files from disk once they have been scanned. 34 | # Optional, defaults to "rm". 35 | removal_command: "srm" 36 | 37 | # List of allowed MIME types. If a file has a MIME type that's not in this list, its 38 | # scan is considered failed. 39 | # Unrecognised binary files are considered to be `application/octet-stream`. 40 | # Unrecognised text files are considered to be `text/plain`. 41 | # Optional, defaults to allowing all MIME types. 42 | allowed_mimetypes: ["image/jpeg"] 43 | 44 | # List of blocked MIME types. 45 | # If specified, `allowed_mimetypes` must not be specified as well. 46 | # If specified, a file whose MIME type is on this list will produce a scan that is 47 | # considered failed. 48 | # Unrecognised binary files are considered to be `application/octet-stream`. 49 | # Unrecognised text files are considered to be `text/plain`. 50 | # Optional. 51 | # blocked_mimetypes: ["image/jpeg"] 52 | 53 | # Configuration of scan result caching. 54 | # 55 | # Results are stored in a cache to avoid having to download and scan a file twice. There 56 | # is a unique entry in this cache for each set of media path (i.e. the 57 | # "server_name/media_id" identifier for the media), thumbnailing parameters and 58 | # encryption metadata. This means that, for example, the result for the scan of the media 59 | # "example.com/abc" and the result for the scan of the *thumbnail* of "example.com/abc" 60 | # will be stored in two separate entries. 
61 | # 62 | # Each entry in the cache includes the result of the scan as well as a copy of the media 63 | # that was scanned. If the media fails the scan, however, or is larger than the configured 64 | # maximum size (if set), no copy of the media is stored in the result cache. 65 | result_cache: 66 | # List of exit codes from the scanning script that shouldn't cause the result of the 67 | # scan to be cached for future requests. 68 | # Optional, defaults to an empty list (i.e. results are cached regardless of the 69 | # script's exit code). 70 | exit_codes_to_ignore: [1, 2] 71 | 72 | # Maximum number of results that can be stored in the cache. If more files are 73 | # scanned before existing items reach their TTL, the least-recently accessed will be 74 | # evicted. 75 | # Optional, defaults to 1024. 76 | max_size: 2048 77 | 78 | # The maximum amount of time an entry will stay in the cache before being evicted. 79 | # Optional, defaults to 1 week. 80 | ttl: "1d" 81 | 82 | # The maximum cachable file size. If a file is bigger than this size, a copy of it 83 | # will be not be cached even if the scan succeeds. If the file is requested again, it 84 | # is downloaded again from the homeserver, but is not written to disk or scanned. 85 | # Optional, defaults to no maximum size. 86 | max_file_size: "100MB" 87 | 88 | 89 | # Configuration for downloading files. 90 | # When downloading files directly from their respective homeservers (which is the default 91 | # behaviour), the homeservers' default URLs are determined using .well-known discovery 92 | # (defaults to using the homeserver's domain if not available). 93 | # See https://spec.matrix.org/latest/client-server-api/#server-discovery for more info. 94 | # Settings in this section (apart from `base_homeserver_url`) apply to .well-known 95 | # discovery requests as well as file download ones. 96 | download: 97 | # If provided, all files are downloaded using the homeserver at this URL. 
If this 98 | # setting is provided, .well-known discovery is not used to determine the base URL 99 | # to use. 100 | # Optional, defaults to downloading files directly from their respective homeservers. 101 | base_homeserver_url: "https://matrix.org" 102 | 103 | # HTTP(S) proxy to use when sending requests. 104 | # Optional, defaults to no proxy. 105 | proxy: "http://10.0.0.1:3128" 106 | 107 | # Headers to send in outgoing requests. 108 | # Optional, defaults to no additional headers. 109 | additional_headers: 110 | user-agent: "matrix-content-scanner" 111 | 112 | # Configuration for decrypting Olm-encrypted request bodies. 113 | crypto: 114 | # The path to the Olm pickle file. This file contains the key pair to use when 115 | # encrypting and decrypting encrypted POST request bodies. 116 | # A new keypair will be created at startup if the pickle file doesn't already exist. 117 | # Required. 118 | pickle_path: "./pickle" 119 | 120 | # The key to use to decode the pickle file. 121 | # Required. 122 | pickle_key: "this_is_a_secret" 123 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTHON_VERSION=3.11 2 | 3 | FROM docker.io/python:${PYTHON_VERSION}-slim AS requirements 4 | 5 | 6 | # We install poetry in its own build stage to avoid its dependencies conflicting. 7 | RUN --mount=type=cache,target=/root/.cache/pip \ 8 | pip install --user "poetry==1.8.3" 9 | 10 | WORKDIR /mcs 11 | 12 | # Copy just what we need to run `poetry export`... 13 | COPY pyproject.toml poetry.lock /mcs/ 14 | 15 | # If specified, we won't verify the hashes of dependencies. 16 | # This is only needed if the hashes of dependencies cannot be checked for some 17 | # reason, such as when a git repository is used directly as a dependency. 18 | ARG TEST_ONLY_SKIP_DEP_HASH_VERIFICATION 19 | 20 | # If specified, we won't use the Poetry lockfile. 
21 | # Instead, we'll just install what a regular `pip install` would from PyPI. 22 | ARG TEST_ONLY_IGNORE_POETRY_LOCKFILE 23 | 24 | # Export the dependencies, but only if we're actually going to use the Poetry lockfile. 25 | # Otherwise, just create an empty requirements file so that the Dockerfile can 26 | # proceed. 27 | RUN if [ -z "$TEST_ONLY_IGNORE_POETRY_LOCKFILE" ]; then \ 28 | /root/.local/bin/poetry export -o /mcs/requirements.txt ${TEST_ONLY_SKIP_DEP_HASH_VERIFICATION:+--without-hashes}; \ 29 | else \ 30 | touch /mcs/requirements.txt; \ 31 | fi 32 | 33 | ### 34 | ### Stage 1: builder 35 | ### 36 | FROM docker.io/library/python:${PYTHON_VERSION}-slim AS builder 37 | 38 | RUN \ 39 | --mount=type=cache,target=/var/cache/apt,sharing=locked \ 40 | --mount=type=cache,target=/var/lib/apt,sharing=locked \ 41 | apt-get update -qq && apt-get install -yqq \ 42 | build-essential \ 43 | curl 44 | 45 | # Install rust and ensure its in the PATH 46 | ENV RUSTUP_HOME=/rust 47 | ENV CARGO_HOME=/cargo 48 | ENV PATH=/cargo/bin:/rust/bin:$PATH 49 | RUN mkdir /rust /cargo 50 | 51 | RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain stable --profile minimal 52 | 53 | # To speed up rebuilds, install all of the dependencies before we copy over 54 | # the whole project, so that this layer in the Docker cache can be 55 | # used while you develop on the source 56 | # 57 | # This is aiming at installing the `[tool.poetry.depdendencies]` from pyproject.toml. 58 | COPY --from=requirements /mcs/requirements.txt /mcs/ 59 | RUN --mount=type=cache,target=/root/.cache/pip \ 60 | pip install --prefix="/install" --no-deps --no-warn-script-location -r /mcs/requirements.txt 61 | 62 | COPY src /mcs/src/ 63 | COPY rust /mcs/rust/ 64 | # ... and what we need to `pip install`. 65 | COPY pyproject.toml README.md build_rust.py Cargo.toml Cargo.lock /mcs/ 66 | 67 | # Repeat of earlier build argument declaration, as this is a new build stage. 
68 | ARG TEST_ONLY_IGNORE_POETRY_LOCKFILE 69 | 70 | # Install the matrix content scanner package itself. 71 | # If we have populated requirements.txt, we don't install any dependencies 72 | # as we should already have those from the previous `pip install` step. 73 | RUN --mount=type=cache,target=/mcs/target,sharing=locked \ 74 | --mount=type=cache,target=${CARGO_HOME}/registry,sharing=locked \ 75 | if [ -z "$TEST_ONLY_IGNORE_POETRY_LOCKFILE" ]; then \ 76 | pip install --prefix="/install" --no-deps --no-warn-script-location /mcs; \ 77 | else \ 78 | pip install --prefix="/install" --no-warn-script-location /mcs; \ 79 | fi 80 | 81 | ### 82 | ### Stage 2: runtime 83 | ### 84 | 85 | FROM docker.io/library/python:${PYTHON_VERSION}-slim 86 | 87 | # Install libmagic & other useful tools. 88 | # We don't need to install libolm, because we're installing it with a 89 | # wheel from gitlab.matrix.org later, which comes with libolm already compiled. 90 | RUN apt-get update -qq && apt-get install -qq libmagic1 c-icap && rm -rf /var/lib/apt/lists/* 91 | 92 | # Copy the necessary project files into the image. 93 | COPY --from=builder /install /usr/local 94 | 95 | # Create the directory in which long-lived configuration and secrets will live. We switch 96 | # to it to ensure any automatically-generated secret is persisted when the container is 97 | # destroyed. 98 | RUN mkdir /data 99 | WORKDIR /data 100 | 101 | # Start the service using user-provided configuration. 102 | ENTRYPOINT ["python", "-m", "matrix_content_scanner.mcs", "-c", "/data/config.yaml"] 103 | -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | # Matrix Content Scanner API 2 | 3 | This document describes the custom API implemented by the Matrix Content Scanner. 
4 | 5 | ## Error codes 6 | 7 | An error is returned as JSON responses to the request that caused it, in the following format: 8 | 9 | | Parameter | Type | Description | 10 | |-----------|------|--------------------------------------------------------| 11 | | `reason` | str | The machine-readable code for the error. | 12 | | `info` | str | Additional human-readable information about the error. | 13 | 14 | Example: 15 | 16 | ```json 17 | { 18 | "info": "***VIRUS DETECTED***", 19 | "reason": "MCS_MEDIA_NOT_CLEAN" 20 | } 21 | ``` 22 | 23 | The error codes used by the Matrix Content Scanner are described below, alongside the HTTP 24 | status code of the response for each scenario: 25 | 26 | | Status Code | Reason | Description | 27 | |-------------|-------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 28 | | 400 | `MCS_MALFORMED_JSON` | The request body contains malformed JSON. | 29 | | 400 | `MCS_MEDIA_FAILED_TO_DECRYPT` | The server failed to decrypt the encrypted media downloaded from the media repo. | 30 | | 401 | `M_MISSING_TOKEN` | The request is missing a required access token for authentication. | 31 | | 401 | `M_UNKNOWN_TOKEN` | The access token provided for authentication is not valid. | 32 | | 404 | `M_NOT_FOUND` | The `Authorization` header was missing when requesting authenticated media. | 33 | | 404 | `M_NOT_FOUND` | No route could be found at the given path. | 34 | | 404 | `M_NOT_FOUND` | The requested media was not present in the media repo. | 35 | | 403 | `MCS_MEDIA_NOT_CLEAN` | The server scanned the downloaded media but the antivirus script returned a non-zero exit code. | 36 | | 403 | `MCS_MIME_TYPE_FORBIDDEN` | The Mime type is not in the allowed list of Mime types. 
| 37 | | 403 | `MCS_BAD_DECRYPTION` | The provided `encrypted_body` could not be decrypted, or the encrypted file could not be decrypted. The client should request the public key of the server and then retry (once). | 38 | | 500 | `M_UNKNOWN` | The server experienced an unexpected error. | 39 | | 502 | `MCS_MEDIA_REQUEST_FAILED` | The server failed to request media from the media repo. | 40 | 41 | 42 | ## Routes 43 | 44 | 45 | ### `GET /_matrix/media_proxy/unstable/download/{serverName}/{mediaId}` 46 | 47 | Downloads the media at `mxc://{serverName}/{mediaId}` and scans it. If the scan is 48 | successful, the media is sent in the response (identical to the 49 | `GET /_matrix/media/v3/download/...` route in the Matrix specification). If the scan is 50 | unsuccessful, an error is sent with the reason `MCS_MEDIA_NOT_CLEAN`. 51 | 52 | 53 | ### `GET /_matrix/media_proxy/unstable/thumbnail/{serverName}/{mediaId}` 54 | 55 | Takes the query parameters described [in the Matrix specification](https://spec.matrix.org/latest/client-server-api/#get_matrixmediav3thumbnailservernamemediaid). 56 | 57 | Downloads a thumbnail of the media at `mxc://{serverName}/{mediaId}` and scans it. If the 58 | scan is successful, the media is sent in the response (identical to the 59 | `GET /_matrix/media/v3/thumbnail/...` route in the Matrix specification). If the scan is 60 | unsuccessful, an error is sent with the reason `MCS_MEDIA_NOT_CLEAN`. 61 | 62 | 63 | ### `GET /_matrix/media_proxy/unstable/scan/{serverName}/{mediaId}` 64 | 65 | Downloads the media at `mxc://{serverName}/{mediaId}`, scans it and 66 | responds with the result of the scan. 67 | 68 | Response format: 69 | 70 | | Parameter | Type | Description | 71 | |-----------|------|--------------------------------------------------------------------| 72 | | `clean` | bool | The scan's result: `true` if the file is clean, `false` otherwise. | 73 | | `info` | str | Human-readable information about the result. 
| 74 | 75 | Example: 76 | 77 | ```json 78 | { 79 | "clean": false, 80 | "info": "***VIRUS DETECTED***" 81 | } 82 | ``` 83 | 84 | 85 | ### `POST /_matrix/media_proxy/unstable/download_encrypted` 86 | 87 | Downloads a specified encrypted file, decrypts it and then behaves identically to the 88 | `GET /_matrix/media_proxy/unstable/download/{serverName}/{mediaId}` route. 89 | 90 | Request body: 91 | 92 | | Parameter | Type | Description | 93 | |------------------|---------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 94 | | `encrypted_body` | EncryptedBody | An Olm-encrypted version of the request body. See [this section](#encrypted-post-body) for more information. | 95 | | `file` | EncryptedFile | The metadata (download MXC URL and decryption key) of an encrypted file. Follows the format of the `EncryptedFile` structure from the [Matrix specification](https://spec.matrix.org/v1.2/client-server-api/#extensions-to-mroommessage-msgtypes). Ignored if `encrypted_body` is present. | 96 | 97 | Example: 98 | 99 | ```json 100 | { 101 | "file": { 102 | "v": "v2", 103 | "key": { 104 | "alg": "A256CTR", 105 | "ext": true, 106 | "k": "qcHVMSgYg-71CauWBezXI5qkaRb0LuIy-Wx5kIaHMIA", 107 | "key_ops": [ 108 | "encrypt", 109 | "decrypt" 110 | ], 111 | "kty": "oct" 112 | }, 113 | "iv": "X85+XgHN+HEAAAAAAAAAAA", 114 | "hashes": { 115 | "sha256": "5qG4fFnbbVdlAB1Q72JDKwCagV6Dbkx9uds4rSak37c" 116 | }, 117 | "url": "mxc://matrix.org/oSTbuSlyZKXvgtbtUsPxRbto" 118 | } 119 | } 120 | ``` 121 | 122 | 123 | ### `POST /_matrix/media_proxy/unstable/scan_encrypted` 124 | 125 | Downloads a specified encrypted file, decrypts it and then behaves identically to the 126 | `GET /_matrix/media_proxy/unstable/scan/{serverName}/{mediaId}` route. 
127 | 128 | The request body for this route is the same as for 129 | `POST /_matrix/media_proxy/unstable/download_encrypted`. 130 | 131 | 132 | ### `GET /_matrix/media_proxy/unstable/public_key` 133 | 134 | Responds with a base64 representation of the public key to use to generate the 135 | `encrypted_body` parameter of POST requests. See [this section](#encrypted-post-body) for 136 | more information. 137 | 138 | Response format: 139 | 140 | | Parameter | Type | Description | 141 | |:-------------|------|--------------------------------------------| 142 | | `public_key` | str | A base64 representation of the public key. | 143 | 144 | Example: 145 | 146 | ```json 147 | { 148 | "public_key": "GdwYYj5Ey9O96FMi4DjIhPhY604RuZg2Om98Kqh+3GE" 149 | } 150 | ``` 151 | 152 | 153 | ## Encrypted POST body 154 | 155 | When processing encrypted attachments, there are two ways to communicate the metadata 156 | (i.e. URL and decryption key for the file) to the Matrix Content Scanner. 157 | 158 | The first one is by sending it in the request body as shown above. However, this might not 159 | provide enough security depending on the infrastructure the Matrix Content Scanner is 160 | deployed in. For example if translation from HTTPS to HTTP is done on a separate machine 161 | than the one hosting the Matrix Content Scanner, it might be a concern that other pieces 162 | of the infrastructure might be able to intercept this traffic and decrypt the attachment. 163 | 164 | The second way of communicating encrypted file metadata is to first encrypt it using 165 | vodozemac's [`PkEncryption`](https://github.com/matrix-org/vodozemac/blob/poljar/pk-dekurcina/src/pk_encryption.rs#L97) 166 | class. This is done using the public key retrieved from 167 | `GET /_matrix/media_proxy/unstable/public_key` and sending the resulting encrypted message 168 | in an `encrypted_body` parameter of the request's body. 
This parameter follows this format: 169 | 170 | | Parameter | Type | Description | 171 | |--------------|------|------------------------| 172 | | `ciphertext` | str | The encrypted content. | 173 | | `mac` | str | The MAC. | 174 | | `ephemeral` | str | The ephemeral key. | 175 | 176 | Example (generated using the body and public key from the previous examples): 177 | 178 | ```json 179 | { 180 | "encrypted_body": { 181 | "ciphertext": "tED6iNpKcZti+HMZ6t1M+ZlE27IbvF9nojz59dg3jtJHv/9wtH6KiYyaZsVvCNzuwWCjdcxA4PMevZuWnVIEWHArCKdcFJeAvzxzlVtFvlgM5PIiTNtkh8sXIaC7RP5+3s0/aQs9PhuhlJ5nGlS86BZJ56dDwQWS5DO/WPqsTko9lz6//XtZ8ko417vybz81NTNpoADRc8XRntsI1+rmdKkXJtuXTA3d46CCAhLvoJLZlk7xb7IGHADk3eYQ9WTaKQ76/PW1dDo5xQGyXOr+lJByisjkoz4C8i4wRYXnks+d3q6kIndGZgO8s/H7/kfYC052IAlAk3LmYavXaNwXJtnWUCCakTHME154yup8DtmsyuZkC3p3KhSsKAeoxmYvsSf0+p0MinOWB4BgeWwaBaKDKTHbaUKwQzdbZrBXKP+QBdmM9PUrmsTPR2RmWRsPCC3dcmz4rakCZB/Xvwg++xDzpxi3+iJxJ011g1Dfp4sd44U6LJDVZafIoPu7esChYD4o+x4tP4airHueLGpP0rQxPuDZRvklwCRZ5xtzr47fINel2IGrTQEPyNES+lASGr2xeWwBJXBe47OkM0rXZn1HVM6iK3g3HfUT6pFhdI/52ztUf+gOhOhRvTpP079Je9INLApXSu793EQGJpH+ms3ymJ3mfBhEYVVnj8zbczo", 182 | "mac": "nipjbUCnIEw", 183 | "ephemeral": "fk2xOTmttnFDTAORxVQTtIlbsu7O01Oe52+umaOjIiE" 184 | } 185 | } 186 | ``` 187 | 188 | ## Authenticated Media 189 | 190 | When accessing media from a Synapse homeserver with authenticated media enabled, an 191 | `Authorization` HTTP header must be passed along with any request to the Matrix Content Scanner. 192 | If the `Authorization` header is not present, the content scanner assumes the request is not an 193 | authenticated media request and will use the old Matrix endpoints. 194 | 195 | This header follows the `Authentication Bearer scheme` as [outlined in the Matrix specification](https://spec.matrix.org/v1.12/client-server-api/#using-access-tokens). 196 | The `access_token` must be the Matrix access token of the client's user. 
197 |
The `Authorization` header method must be used; sending the access token as a query string
198 | parameter is not supported.
199 |
200 | Example authorization header:
201 |
202 | ```
203 | Authorization: Bearer <access_token>
204 | ```
205 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | strict = true
3 | files =
4 |     tests/,
5 |     src/
6 |
--------------------------------------------------------------------------------
/perf/.gitignore:
--------------------------------------------------------------------------------
1 | pickle_path
2 |
--------------------------------------------------------------------------------
/perf/config.yaml:
--------------------------------------------------------------------------------
1 | web:
2 |   host: localhost
3 |   port: 8080
4 | scan:
5 |   script: ./dummy_scan.sh
6 |   temp_directory: temp
7 | crypto:
8 |   pickle_path: pickle_path
9 |   pickle_key: pickle_key
10 | download:
11 |   base_homeserver_url: https://matrix-client.matrix.org
--------------------------------------------------------------------------------
/perf/dummy_scan.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | sleep 1
3 | # Roughly a 5% chance to fail, assuming a uniform distribution of $RANDOM.
4 | if [ $((RANDOM % 20)) = 0 ]; then
5 |     echo "I don't like the look of $1" > /dev/stderr
6 |     exit 1
7 | fi
--------------------------------------------------------------------------------
/perf/scanner_perf.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | """
4 | This script is a rudimentary end-to-end test of the content scanner. It starts the
5 | content scanner as a subprocess, using the hard-coded config.yaml.
The scanner is
6 | configured with matrix.org as its upstream homeserver, and to use a dummy scanning script
7 | which just calls `sleep 1`.
8 |
9 | Next, we concurrently request Matrix Avatar URLs taken from the public
10 | #synapse-dev:matrix.org room. (The URLs are hard-coded in this file. It's ugly, but good
11 | enough for now.)
12 |
13 | We wait for the content scanner to finish responding to each request, reading the
14 | response bodies from the scanner. We print how long (wall clock) it took to do so,
15 | and close the content scanner subprocess.
16 |
17 | Invoke this script with `-v` to print out content scanner logs.
18 | """
19 |
20 | import asyncio
21 | import collections
22 | import os.path
23 | import subprocess
24 | import sys
25 | import time
26 | import timeit
27 | import traceback
28 |
29 | import aiohttp
30 |
31 | timer = timeit.default_timer
32 |
33 | AVATAR_URLS_TAKEN_FROM_SYNAPSE_DEV = [
34 |     "http://127.0.0.1:8080/ipfs/QmfS3zCyhM4KgvYWH1HrD1Rnumns7fyTzcSHjk5fsWe5ZH?filename=IMG_20230222_191003_e_1677506180005.jpg",
35 |     "mxc://1312.media/SQdCZTnJfLkBAxgQMPkVgsPY",
36 |     "mxc://abolivier.bzh/zPatuAFfwaXVxsJudPWkFcWF",
37 |     "mxc://aguiarvieira.pt/74665ee95b29e2a217b88911cfc664a1ccbb7e141703097801866477568",
38 |     "mxc://amorgan.xyz/JHlaCvKzIPrlcnYWTFoOqsmH",
39 |     "mxc://asra.gr/4f06832b1418d4c5ba91cae68135592754841080",
40 |     "mxc://automattic.com/cf00594221369ad4498eb3b73032969c7be0fa3b",
41 |     "mxc://b3.hk/kKAHEhEOFMyXHQCcSFuQOQza",
42 |     "mxc://beeper.com/18850ea089e0ecc16d7db55527925b43ad63295c",
43 |     "mxc://beeper.com/c2ef30e46e6f99cd913f2b632573033c60a74524",
44 |     "mxc://bolha.chat/BevcFWoBVCMGMGqYQNhVddfu",
45 |     "mxc://bolha.chat/ClRsLphUvHmWHWOFjKLwiknN",
46 |     "mxc://bonifacelabs.ca/WjbmLXYLDRPxUzorCdExENVZ",
47 |     "mxc://bramen.com.co/oTFgSIkJdDTBIcuvtWTukatz",
48 |     "mxc://brodi.me/PPjyGXrcCqcwRrKpYoIgLvgw",
49 |     "mxc://cadair.com/LdiPRXiYOVpdWvURyocZmvUo",
50 |     "mxc://chat.decatrion.com/MXOQjcRSnVSqOALFTDlgIKnq",
51 | 
"mxc://chat.interru.io/UJdEhRreNufARVwpCAGWnHTx", 52 | "mxc://chat.mistli.net/MIlfZzUpEUelhCLXVFPMacZO", 53 | "mxc://chat.pyro.monster/bgZxviIdWbBYWInhwZozaryA", 54 | "mxc://chat.upi.li/rYupYBDqEXxkiQGEhPOiNUGs", 55 | "mxc://cody.to/hXfwsZbCLswNYgvRDqIQZOnS", 56 | "mxc://connecteu.rs/8c81538fc306d556bbbce15230b12c68ee7395f8", 57 | "mxc://cyberia.club/ObtWErjecvRjoCxbEWzHSiXM", 58 | "mxc://element.io/050bd1fa6777a004eb8ffd6c31028998331a91aa", 59 | "mxc://element.io/0750b4015ab58d23d704d3a828a1173a175cf95f", 60 | "mxc://element.io/1fec45ef987253db2728112927562567f8dd9d5e", 61 | "mxc://element.io/42eff27432ec038e933337dabcdfe3d230b3c68d", 62 | "mxc://element.io/47465a9ec77dd489e49b6748bc53c4f0122f06d7", 63 | "mxc://element.io/6130836e26b462a6fe63d4e080dd9d2037490f2b", 64 | "mxc://element.io/658198ce7f58872cc8fb68862f1eabdc5d847fbc", 65 | "mxc://element.io/a3f0d8b0868a7bf4e7449141167747a4699109ff", 66 | "mxc://element.io/bd48d4466c7e21b2ce00836631c06360206c29a0", 67 | "mxc://element.io/f03df00167d5f7ad5b5eac5375f32146cc2c3f51", 68 | "mxc://envs.net/89be88bd94378aef18b7f01e6a14d2228cfbb9fa", 69 | "mxc://envs.net/de405527b5c8dca188d6d8c7f3731e861a9b17ec", 70 | "mxc://ergaster.org/nmVViTqFqKGGxSHHcwevqnig", 71 | "mxc://ether.ai/JKGvwPJrfnWiWEIeVGLtJaSl", 72 | "mxc://fabcity.hamburg/QdttdrpZgTNKcJJWauixXEvQ", 73 | "mxc://fachschaften.org/c8faf7765794be1b24b3117925ac2464a204fc961726279478688088064", 74 | "mxc://gatto.club/qEJyuPBKpZITccTfIriEebdK", 75 | "mxc://gruenhage.xyz/3ecdecdab75225c0a14c7c804061d86962ee1550", 76 | "mxc://hackerlab.in/vjENMlrncPUGDmbyMZhWJzkG", 77 | "mxc://hackliberty.org/LeTsthiOdqoNnjOjqWjxWMAI", 78 | "mxc://half-shot.uk/81696e31e533651fb9e44ce351b4201151042acd", 79 | "mxc://jacksonchen666.com/pQoQssnTIGKOYHpcWUmYpdsQ", 80 | "mxc://jameskitt616.one/pBZDFcMKCjVjkrTMgMykKpTi", 81 | "mxc://jboi.nl/dvVWQixQMJyIQoaLFqFTTpsE", 82 | "mxc://jki.re/NBtxUkzjXpmdsGychrevxsaB", 83 | "mxc://lant.uk/MVZeSTcVlpNiDToBuKgyQfIK", 84 | 
"mxc://librepush.net/WbEnGmxZGKJyHqbojduVeatQ", 85 | "mxc://littlevortex.net/jSNRNEyKLRnzYEpsODAUznIZ", 86 | "mxc://luebke.io/imaijIHMncPjQqYRLtByZRzX", 87 | "mxc://matrix.0x45.moe/PwcDRntlwelLMuofemYarmqx", 88 | "mxc://matrix.atommac.com/cAycTPLQEkgtZSlZlRlZXoTx", 89 | "mxc://matrix.clandestine.network/JpKsGDMkNnSkfQqUdFuoBkFy", 90 | "mxc://matrix.eclabs.de/KyXZzZTeJyhQDBkqGBcKWyBp", 91 | "mxc://matrix.f5.htw-berlin.de/LosKszHTJgwslbvrTvNWanwE", 92 | "mxc://matrix.kevwe.se/PXHQcmOahOjAJoTouFBmevfj", 93 | "mxc://matrix.m0dex.eu/c2qHa8jqd86MdKplo1VQamYOhkMxkGEl", 94 | "mxc://matrix.org/AokEDpMKDROUmGwuoErhRIxv", 95 | "mxc://matrix.org/BORiLtSOEUnZiwCcaJftvxxm", 96 | "mxc://matrix.org/BugjUgdADNUndQASgkYDHogL", 97 | "mxc://matrix.org/CLtgiPGknzEpKDiyOrUedmEc", 98 | "mxc://matrix.org/DIGiJjzKkVsWwpppAcrGRwzB", 99 | "mxc://matrix.org/DrLDzhkVYvGjfCiUBLkrYLhs", 100 | "mxc://matrix.org/EbNOzLZJdNszNDDfDrPFvTTx", 101 | "mxc://matrix.org/FEzUmMhxMsqtfXKyYQFDROgO", 102 | "mxc://matrix.org/FVaBPAAuzqpBstuOfxDhDuiw", 103 | "mxc://matrix.org/FwXVuHOTPCJOZwjuunyMoDvw", 104 | "mxc://matrix.org/GBWoKBFhozIJcuuXzgAmESMh", 105 | "mxc://matrix.org/GadiqrOaESCBOpqEspzaFHZZ", 106 | "mxc://matrix.org/GbfNYPPXYfpYDGCPnxEOZACq", 107 | "mxc://matrix.org/HcOKfHoyUseJyNvJCZbySygK", 108 | "mxc://matrix.org/HjVgrKzDUXKrzYMDvtglFdvy", 109 | "mxc://matrix.org/IssHdyiXMcSnRCxCzqoaocGL", 110 | "mxc://matrix.org/JEPcTsDZpImzoyVdKHfeiUlK", 111 | "mxc://matrix.org/JQXLHcWNbcbQBMEWebxQPiPT", 112 | "mxc://matrix.org/JUFinhjLVhQhAmzsSpSaPFiT", 113 | "mxc://matrix.org/JUssqTzHorMXUbeaulQUNjTm", 114 | "mxc://matrix.org/KfkLMomWWjVZMbgVCKisfFPy", 115 | "mxc://matrix.org/LWCDUbJGEqfXWbuACLYPzpMM", 116 | "mxc://matrix.org/LfpqILSYnaIQDnCqGgrryaVA", 117 | "mxc://matrix.org/LlsgPelTpiYvvEgjbqKzefbr", 118 | "mxc://matrix.org/MKYSaqghosWAaMkfOTGqAXWu", 119 | "mxc://matrix.org/MSSWISKFrXqYAWwVZpgQzKNc", 120 | "mxc://matrix.org/MhFPyrortOJyjvIArZYRJNpd", 121 | "mxc://matrix.org/MohmbgPyrsnuKIYJivBLhnaJ", 122 | 
"mxc://matrix.org/MygYRbllJEcOXaGOySOEYMJc", 123 | "mxc://matrix.org/NZGChxcCXbBvgkCNZTLXlpux", 124 | "mxc://matrix.org/OVXDqAESXvavwJINbuwBeIHy", 125 | "mxc://matrix.org/PQWXmVjsGPqEgItiYEISwDzI", 126 | "mxc://matrix.org/QqFWSwNSKvlljlNZKBGrqCKR", 127 | "mxc://matrix.org/QsaeAloXAKVPsiczXtIBJzrZ", 128 | "mxc://matrix.org/RMMTwRenYWLPdRwIHlwuGCLG", 129 | "mxc://matrix.org/RnAJViaJiNHcGtTZgbRWXqlB", 130 | "mxc://matrix.org/SUpOMAcbPcYBaUnDikHYJOjh", 131 | "mxc://matrix.org/TGopDZiMVyhwhQBuEbUeFOKt", 132 | "mxc://matrix.org/TLEyVAuatPchpWniJrgmjUcU", 133 | "mxc://matrix.org/TlumUuzCcCGHSUMXNJmAFLML", 134 | "mxc://matrix.org/TpxNfvaFAAoZWdhwoYBHQezB", 135 | "mxc://matrix.org/VpjGllthGpjTPkvbJgOdyxkF", 136 | "mxc://matrix.org/WWvqnsZlhzWvPylUjdfhmrOV", 137 | "mxc://matrix.org/XBkKJIaWeXdfoYwMZsQWKjzj", 138 | "mxc://matrix.org/XmiRUvkkKjmTseRYrmBlvGNw", 139 | "mxc://matrix.org/XnDebYmBmnBBNeyBiUKltVlh", 140 | "mxc://matrix.org/XxylKIkLFThmHZjBMvCmipRT", 141 | "mxc://matrix.org/YtCeQeNxqnKsLvIcnwKIMlkV", 142 | "mxc://matrix.org/ZJIdWuBIRhObjOHVnoWfBUkq", 143 | "mxc://matrix.org/ZafPzsxMJtLaSaJXloBEKiws", 144 | "mxc://matrix.org/bCawIGTEGxaXxDIxIqteAhVU", 145 | "mxc://matrix.org/bDayqThxTIcGNcskzIADknRv", 146 | "mxc://matrix.org/bEVwopEQDMNjfzbiPKYgZXWU", 147 | "mxc://matrix.org/bHNoSLOERjdQrUodZUIFYAQl", 148 | "mxc://matrix.org/bSYOldVxWNFeulNUshiOSvlM", 149 | "mxc://matrix.org/bcBGBuKkVBITyyfjLHLVrPKj", 150 | "mxc://matrix.org/bipAEyCRqzXokNjHcDwbWXkO#auto", 151 | "mxc://matrix.org/cKhTXJzIZZjHfNRbNJHjxSxw", 152 | "mxc://matrix.org/cZEhMcslgpUJdTNMIuQSEukn", 153 | "mxc://matrix.org/djdngehyFuFlApXWpYotALoK", 154 | "mxc://matrix.org/eeSkBZDfQavoKeXjWhUGOCrI", 155 | "mxc://matrix.org/fJYvrULeLqUSuOFFhvAuPbVB", 156 | "mxc://matrix.org/gJNPpakWLvKGUYteErJnbqRw", 157 | "mxc://matrix.org/iNUefSlAXjkdNzXyVaYjiiTK", 158 | "mxc://matrix.org/jRqrnjimPBqTSSdJlOupMqSx", 159 | "mxc://matrix.org/jVqDFNtFnwfXedjMKZLgtnsY", 160 | "mxc://matrix.org/kOewGAJWihuVeafiSwgLeiJa", 161 | 
"mxc://matrix.org/lyWZOWsBRhCcxKRgVUbDdtux", 162 | "mxc://matrix.org/mhuskbkCQPvAXCCoZMMcUltg", 163 | "mxc://matrix.org/nKpRPUortweIAocZOKakSmle", 164 | "mxc://matrix.org/nwWAiyZHhWuATgUqhXSUgyOq", 165 | "mxc://matrix.org/oUxxDyzQOHdVDMxgwFzyCWEe", 166 | "mxc://matrix.org/oqUhSAlhShWRUoOypviZYzCl", 167 | "mxc://matrix.org/owHbMxnvtZQhORPMIjEMhHJC", 168 | "mxc://matrix.org/paFLquBfsoSUMExpgOePaYGn", 169 | "mxc://matrix.org/pcyhRmMTlUPZNUWLBrrBYOUF", 170 | "mxc://matrix.org/qCJQIqJLUntAlQjvjVqqkISE", 171 | "mxc://matrix.org/qyoRKkkSwwqoaseeRDCWGmgL", 172 | "mxc://matrix.org/rAtNyCxKhZKYjIpCMTMVIyZb", 173 | "mxc://matrix.org/stXVscjfSSwEGcpNUOaTOmuw", 174 | "mxc://matrix.org/tmemWZxwaiSRLneppvjscbSv", 175 | "mxc://matrix.org/uFsobEhOojpEXTORyXJznvMf", 176 | "mxc://matrix.org/wEydarIdYNQoHHnOpfYGQAkZ", 177 | "mxc://matrix.org/xppypIFIDuFCqmdJHGjTuRsk", 178 | "mxc://matrix.org/yAEcXFYGUHsLALuVuHtqgsPk", 179 | "mxc://matrix.org/yCdHqfZAMYzGsSeCYODLGNJQ", 180 | "mxc://matrix.org/zRHixRxWSlriuAyCEqxKcsUN", 181 | "mxc://matrix.tarina.org/yQAGQhgyZtbJDzoCxcUoNlte", 182 | "mxc://maunium.net/jdlSfvudiMSmcRrleeiYjjFO", 183 | "mxc://mccarty.io/uCPFlUrLVWMrjuZVDnlIzIoI", 184 | "mxc://medienhaus.dev/RSWiRFctJPQRAfLGfUTIWqCo", 185 | "mxc://moritzdietz.com/oPOkWTlBWdTFbwXuGZNxbpAU", 186 | "mxc://mozilla.org/66d994693725ea09256c22ac43b0e74e79f1abb4", 187 | "mxc://mpl.mpg.de/lxwOKWWbfwlGxAMKhNIfiJRR", 188 | "mxc://msg-net.de/uqthdSIKEsmLlAnrguhOBSRg", 189 | "mxc://mx.anismk.de/hjKAFiGKMasHOCdEVPsmoozA", 190 | "mxc://mx.grupotd.nat.cu/ZfxNoISumlPZZEHqRNbhewQW", 191 | "mxc://neko.dev/wLFwLqbnyvrstuomVXdKMqyJ", 192 | "mxc://nevarro.space/WmGsIGgESPTtJFskYIXdRlVM", 193 | "mxc://obermui.de/pCkwyNUtzdnaImzuqbsaJCgV", 194 | "mxc://perthchat.org/sNAywRrlPKygmkoxpfxSTrFz", 195 | "mxc://pixelplanet.fun/xfxdQZvpLePdlNcRIjoFovPE", 196 | "mxc://pixie.town/fq3MchyYAMzpCkfxbqr9WffR", 197 | "mxc://pixie.town/qBpNzYpOknBxnSdcbFWrbqWT", 198 | "mxc://raim.ist/oInPkqchozNTmIOeUXlCsFbp", 199 | 
"mxc://riot.ovh/PJxWnOsjdnIpkByXMFJVGZgE", 200 | "mxc://rs485.network/XpMPNjUVJmwwVQyaVtkAjpfl", 201 | "mxc://scamdemic.wtf/WFPdCxatgVIQcYOqkWDKVsXP", 202 | "mxc://seymour.family/ZlzrDJSjRnQYuWJGvhdCkyiS", 203 | "mxc://shiina.family/zxIxLfIyoXTeclPZznmIdRli", 204 | "mxc://simonatherley.com/nYEzJcoThHfARGPSkHXRGapn", 205 | "mxc://skyforge.at/RExFPAnBOsbCqFZIFHAESyKQ", 206 | "mxc://stratum0.org/FKcEkoEcEutsdRUaPjQitDwo", 207 | "mxc://sw1v.org/rARZrbDMGnNQOKKWZtCVxusq", 208 | "mxc://t2l.io/fYhaPLjAZLwEYqaSGKwRpQgk", 209 | "mxc://that.host/QbAhNvUApAEpvCKNWtIZwjCO", 210 | "mxc://the-apothecary.club/HScGQAQKwuQbbdNkLYoPpsNb", 211 | "mxc://tout.im/VQpPnZfufsMWerGlxkupbtYo", 212 | "mxc://uhoreg.ca/JbcxMQHvPoPUoRkwQRdmwXKm", 213 | "mxc://veganism.social/dDVjvEJugTUfWfiavHKhvCxi", 214 | "mxc://wi11.co.uk/DztCMbxBfOUrmklICETzYOEJ", 215 | "mxc://yaal.coop/BviDGOwocxQQNndowuZmhxGr", 216 | ] 217 | 218 | 219 | async def request_media(session: aiohttp.ClientSession, media_url: str) -> int: 220 | media_id = media_url.removeprefix("mxc://") 221 | url = f"http://localhost:8080/_matrix/media_proxy/unstable/download/{media_id}" 222 | 223 | # timeout = aiohttp.ClientTimeout(total=10) 224 | async with session.get(url) as response: 225 | await response.read() 226 | if "-v" not in sys.argv: 227 | # Simple progress meter 228 | print(".", end="", flush=True) 229 | 230 | return response.status 231 | 232 | 233 | async def main() -> None: 234 | perfdir = os.path.dirname(__file__) 235 | os.makedirs(os.path.join(perfdir, "temp"), exist_ok=True) 236 | 237 | print(f"number of URLs: {len(AVATAR_URLS_TAKEN_FROM_SYNAPSE_DEV)}") 238 | 239 | server = None 240 | try: 241 | server = subprocess.Popen( 242 | args=[ 243 | sys.executable, 244 | "-m", 245 | "matrix_content_scanner.mcs", 246 | "-c", 247 | "config.yaml", 248 | ], 249 | cwd=perfdir, 250 | stdin=subprocess.DEVNULL, 251 | stdout=None if "-v" in sys.argv else subprocess.DEVNULL, 252 | stderr=None if "-v" in sys.argv else subprocess.DEVNULL, 253 | ) 
254 | 255 | # Give server time to startup 256 | time.sleep(0.5) 257 | 258 | await run_test() 259 | # Run test a second time, now that caches have warmed up 260 | await run_test() 261 | finally: 262 | if server is not None: 263 | server.terminate() 264 | print("Server return code:", server.returncode) 265 | 266 | 267 | async def run_test() -> None: 268 | failed = False 269 | start = timer() 270 | try: 271 | async with aiohttp.ClientSession() as session: 272 | requests = [] 273 | for url in AVATAR_URLS_TAKEN_FROM_SYNAPSE_DEV: 274 | requests.append(asyncio.ensure_future(request_media(session, url))) 275 | 276 | statuses = await asyncio.gather(*requests) 277 | print() 278 | print("Status codes from scanner server:", collections.Counter(statuses)) 279 | except Exception: 280 | traceback.print_exc() 281 | failed = True 282 | finally: 283 | end = timer() 284 | duration = end - start 285 | print(f"{'Failed' if failed else 'Succeeded'} in {duration:.2f}s") 286 | 287 | 288 | if __name__ == "__main__": 289 | asyncio.run(main()) 290 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core", "wheel", "setuptools-rust"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.ruff] 6 | line-length = 88 7 | target-version = "py38" 8 | 9 | [tool.ruff.lint] 10 | # See https://beta.ruff.rs/docs/rules/#error-e 11 | # for error codes. The ones we ignore are: 12 | # E501: line too long (we don't normally run this check in other projects such as Synapse) 13 | # E731: do not assign a lambda expression, use a def 14 | # 15 | # flake8-bugbear compatible checks. 
Its error codes are described at 16 | # https://beta.ruff.rs/docs/rules/#flake8-bugbear-b 17 | # B023: Functions defined inside a loop must not use variables redefined in the loop 18 | ignore = [ 19 | "B023", 20 | "E501", 21 | "E731", 22 | ] 23 | select = [ 24 | # pycodestyle 25 | "E", 26 | "W", 27 | # pyflakes 28 | "F", 29 | # isort 30 | "I001", 31 | # flake8-bugbear 32 | "B0", 33 | # flake8-comprehensions 34 | "C4", 35 | # flake8-2020 36 | "YTT", 37 | # flake8-slots 38 | "SLOT", 39 | # flake8-debugger 40 | "T10", 41 | # flake8-pie 42 | "PIE", 43 | # flake8-executable 44 | "EXE", 45 | ] 46 | 47 | [tool.ruff.lint.isort] 48 | combine-as-imports = true 49 | section-order = ["future", "standard-library", "third-party", "twisted", "first-party", "testing", "local-folder"] 50 | known-first-party = ["matrix_content_scanner"] 51 | 52 | [tool.ruff.lint.isort.sections] 53 | twisted = ["twisted", "OpenSSL"] 54 | testing = ["tests"] 55 | 56 | [tool.ruff.format] 57 | quote-style = "double" 58 | indent-style = "space" 59 | skip-magic-trailing-comma = false 60 | line-ending = "auto" 61 | 62 | [tool.maturin] 63 | manifest-path = "rust/Cargo.toml" 64 | module-name = "matrix_content_scanner.mcs_rust" 65 | 66 | [tool.poetry] 67 | name = "matrix_content_scanner" 68 | version = "1.2.1" 69 | description = "A web service for scanning media hosted by a Matrix media repository" 70 | authors = ["Element Backend Team "] 71 | readme = "README.md" 72 | license = "AGPL-3.0-only OR LicenseRef-Element-Commercial" 73 | # Python version and licence classifiers are set automatically by Poetry 74 | classifiers = [] 75 | include = [ 76 | { path = "mypy.ini", format = "sdist" }, 77 | { path = "scripts-dev", format = "sdist" }, 78 | { path = "tests", format = "sdist" }, 79 | { path = "Cargo.toml", format = "sdist" }, 80 | { path = "Cargo.lock", format = "sdist" }, 81 | { path = "rust/Cargo.toml", format = "sdist" }, 82 | { path = "rust/build.rs", format = "sdist" }, 83 | { path = "rust/src/**", format 
= "sdist" }, 84 | ] 85 | exclude = [ 86 | { path = "src/*.so", format = "sdist"} 87 | ] 88 | 89 | [tool.poetry.dependencies] 90 | python = "^3.10.0" 91 | attrs = ">=19.2.0" 92 | aiohttp = ">=3.8.0" 93 | jsonschema = ">=4.23.0" 94 | pyyaml = ">=5.1.1" 95 | # Required for decrypting files" 96 | python-magic = ">=0.4.15,<0.5" 97 | # Required for maintaining the result cache. 98 | cachetools = ">=5.4.0" 99 | # Required for processing user-defined values such as durations or sizes. 100 | humanfriendly = ">=10.0" 101 | # Required for calculating cache keys deterministically. Type annotations aren't 102 | # discoverable in versions older than 1.6.3. 103 | canonicaljson = ">=1.6.3" 104 | setuptools_rust = ">=1.3" 105 | 106 | [tool.poetry.dev-dependencies] 107 | # for linting and formatting 108 | ruff = "^0.7.2" 109 | # for type checking 110 | mypy = "*" 111 | types-jsonschema = ">=3.2.0" 112 | types-PyYAML = ">=5.4.10" 113 | types-cachetools = "*" 114 | types-humanfriendly = "*" 115 | 116 | [tool.poetry.build] 117 | script = "build_rust.py" 118 | generate-setup-file = true 119 | 120 | [tool.poetry.urls] 121 | homepage = "https://github.com/element-hq/matrix-content-scanner" 122 | documentation = "https://github.com/element-hq/matrix-content-scanner/blob/main/README.md" 123 | repository = "https://github.com/element-hq/matrix-content-scanner.git" 124 | 125 | [tool.poetry.scripts] 126 | matrix-content-scanner = "matrix_content_scanner.mcs:main" 127 | -------------------------------------------------------------------------------- /rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "matrix_content_scanner" 3 | # dummy version. See pyproject.toml for the actual version number. 
4 | version = "0.1.0" 5 | edition = "2021" 6 | publish = false 7 | 8 | [lib] 9 | name = "matrix_content_scanner" 10 | crate-type = ["lib", "cdylib"] 11 | 12 | [dependencies] 13 | anyhow = "1.0.63" 14 | lazy_static = "1.4.0" 15 | log = "0.4.17" 16 | matrix-sdk-crypto = "0.7.2" 17 | pyo3 = { version = "0.21.0", features = [ 18 | "macros", 19 | "anyhow", 20 | "abi3", 21 | "abi3-py38", 22 | ] } 23 | pyo3-log = "0.10.0" 24 | pythonize = "0.21.0" 25 | serde_json = "1.0.85" 26 | vodozemac = { git = "https://github.com/matrix-org/vodozemac.git", features = ["insecure-pk-encryption"] } 27 | 28 | [features] 29 | extension-module = ["pyo3/extension-module"] 30 | default = ["extension-module"] 31 | 32 | [build-dependencies] 33 | blake2 = "0.10.4" 34 | hex = "0.4.3" 35 | -------------------------------------------------------------------------------- /rust/build.rs: -------------------------------------------------------------------------------- 1 | //! This build script calculates the hash of all files in the `src/` 2 | //! directory and adds it as an environment variable during build time. 3 | //! 4 | //! This is used so that the python code can detect when the built native module 5 | //! does not match the source in-tree, helping to detect the case where the 6 | //! source has been updated but the library hasn't been rebuilt. 7 | 8 | use std::path::PathBuf; 9 | 10 | use blake2::{Blake2b512, Digest}; 11 | 12 | fn main() -> Result<(), std::io::Error> { 13 | let mut dirs = vec![PathBuf::from("src")]; 14 | 15 | let mut paths = Vec::new(); 16 | while let Some(path) = dirs.pop() { 17 | let mut entries = std::fs::read_dir(path)? 
18 | .map(|res| res.map(|e| e.path())) 19 | .collect::, std::io::Error>>()?; 20 | 21 | entries.sort(); 22 | 23 | for entry in entries { 24 | if entry.is_dir() { 25 | dirs.push(entry); 26 | } else { 27 | paths.push(entry.to_str().expect("valid rust paths").to_string()); 28 | } 29 | } 30 | } 31 | 32 | paths.sort(); 33 | 34 | let mut hasher = Blake2b512::new(); 35 | 36 | for path in paths { 37 | let bytes = std::fs::read(path)?; 38 | hasher.update(bytes); 39 | } 40 | 41 | let hex_digest = hex::encode(hasher.finalize()); 42 | println!("cargo:rustc-env=MCS_RUST_DIGEST={hex_digest}"); 43 | 44 | Ok(()) 45 | } 46 | -------------------------------------------------------------------------------- /rust/src/crypto/mod.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | borrow::Cow, 3 | fs, 4 | io::{Cursor, ErrorKind, Read}, 5 | }; 6 | 7 | use anyhow::{Context, Error}; 8 | use matrix_sdk_crypto::AttachmentDecryptor; 9 | use pyo3::{ 10 | prelude::*, 11 | types::{PyBytes, PyDict}, 12 | }; 13 | use pythonize::depythonize_bound; 14 | use vodozemac::{ 15 | base64_encode, 16 | pk_encryption::{self, PkDecryption}, 17 | Curve25519PublicKey, 18 | }; 19 | 20 | /// Called when registering modules with python. 
21 | pub fn register_module(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { 22 | let child_module = PyModule::new_bound(py, "crypto")?; 23 | child_module.add_class::()?; 24 | child_module.add_class::()?; 25 | child_module.add_function(wrap_pyfunction!(decrypt_attachment, &child_module)?)?; 26 | 27 | m.add_submodule(&child_module)?; 28 | 29 | Ok(()) 30 | } 31 | 32 | #[pyclass(frozen)] 33 | pub struct CryptoHandler { 34 | decryptor: PkDecryption, 35 | } 36 | 37 | #[pymethods] 38 | impl CryptoHandler { 39 | #[new] 40 | pub fn py_new(pickle_key: &str, pickle_path: &str) -> Result { 41 | match fs::read_to_string(pickle_path) { 42 | Ok(pickle) => { 43 | let decryptor = PkDecryption::from_libolm_pickle(&pickle, pickle_key.as_bytes())?; 44 | 45 | log::info!("Loaded Olm key pair from pickle file {}", pickle_path); 46 | 47 | Ok(Self { decryptor }) 48 | } 49 | Err(e) if e.kind() == ErrorKind::NotFound => { 50 | log::info!( 51 | "Pickle file not found, generating a new Olm key pair and storing it in pickle file {}", 52 | pickle_path, 53 | ); 54 | 55 | let decryptor = PkDecryption::new(); 56 | let pickle = decryptor.to_libolm_pickle(pickle_key.as_bytes())?; 57 | fs::write(pickle_path, pickle)?; 58 | Ok(Self { decryptor }) 59 | } 60 | Err(e) => { 61 | Err(e).with_context(|| format!("Failed to read the pickle file at the location configured for crypto.pickle_path ({pickle_path})")) 62 | } 63 | } 64 | } 65 | 66 | #[getter] 67 | pub fn public_key(&self) -> String { 68 | self.decryptor.public_key().to_base64() 69 | } 70 | 71 | pub fn decrypt_body( 72 | &self, 73 | ciphertext: &str, 74 | mac: &str, 75 | ephemeral: &str, 76 | ) -> Result { 77 | let message = pk_encryption::Message::from_base64(ciphertext, mac, ephemeral)?; 78 | let decrypted = self.decryptor.decrypt(&message)?; 79 | let decrypted = 80 | String::from_utf8(decrypted).context("Decrypted message isn't valid UTF-8")?; 81 | Ok(decrypted) 82 | } 83 | 84 | pub fn encrypt(&self, public_key: &str, payload: &str) -> 
Result { 85 | let encryptor = 86 | pk_encryption::PkEncryption::from_key(Curve25519PublicKey::from_base64(public_key)?); 87 | Ok(PkMessage(encryptor.encrypt(payload.as_bytes()))) 88 | } 89 | } 90 | 91 | #[pyclass(frozen)] 92 | pub struct PkMessage(pk_encryption::Message); 93 | 94 | #[pymethods] 95 | impl PkMessage { 96 | #[getter] 97 | pub fn ephemeral_key(&self) -> String { 98 | self.0.ephemeral_key.to_base64() 99 | } 100 | 101 | #[getter] 102 | pub fn mac(&self) -> String { 103 | base64_encode(&self.0.mac) 104 | } 105 | 106 | #[getter] 107 | pub fn ciphertext(&self) -> String { 108 | base64_encode(&self.0.ciphertext) 109 | } 110 | } 111 | 112 | #[pyfunction] 113 | pub fn decrypt_attachment( 114 | body: Bound<'_, PyBytes>, 115 | key_info: Bound<'_, PyDict>, 116 | ) -> Result, Error> { 117 | let mut cursor = Cursor::new(body.as_bytes()); 118 | let info = 119 | depythonize_bound(key_info.into_any()).context("Failed parsing supplied key info")?; 120 | 121 | let mut decryptor = AttachmentDecryptor::new(&mut cursor, info)?; 122 | let mut decrypted_data = Vec::new(); 123 | 124 | decryptor.read_to_end(&mut decrypted_data)?; 125 | 126 | Ok(Cow::Owned(decrypted_data)) 127 | } 128 | -------------------------------------------------------------------------------- /rust/src/lib.rs: -------------------------------------------------------------------------------- 1 | use lazy_static::lazy_static; 2 | use pyo3::prelude::*; 3 | use pyo3_log::ResetHandle; 4 | 5 | pub mod crypto; 6 | 7 | lazy_static! { 8 | static ref LOGGING_HANDLE: ResetHandle = pyo3_log::init(); 9 | } 10 | 11 | /// Returns the hash of all the rust source files at the time it was compiled. 12 | /// 13 | /// Used by python to detect if the rust library is outdated. 14 | #[pyfunction] 15 | fn get_rust_file_digest() -> &'static str { 16 | env!("MCS_RUST_DIGEST") 17 | } 18 | 19 | /// Reset the cached logging configuration of pyo3-log to pick up any changes 20 | /// in the Python logging configuration. 
21 | /// 22 | #[pyfunction] 23 | fn reset_logging_config() { 24 | LOGGING_HANDLE.reset(); 25 | } 26 | 27 | /// The entry point for defining the Python module. 28 | #[pymodule] 29 | fn mcs_rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { 30 | m.add_function(wrap_pyfunction!(get_rust_file_digest, m)?)?; 31 | m.add_function(wrap_pyfunction!(reset_logging_config, m)?)?; 32 | 33 | crypto::register_module(py, m)?; 34 | 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /scripts-dev/lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Runs linting scripts and type checking 3 | # ruff - sorts import statements, lints and finds mistakes, formats the code 4 | # mypy - checks type annotations 5 | 6 | set -e 7 | 8 | files=( 9 | "perf" 10 | "src" 11 | "tests" 12 | ) 13 | 14 | # Print out the commands being run 15 | set -x 16 | 17 | # Catch any common programming mistakes in Python code. 18 | # --quiet suppresses the update check. 19 | ruff check --quiet --fix "${files[@]}" 20 | 21 | # Reformat Python code. 22 | ruff format --quiet "${files[@]}" 23 | 24 | # Type-check the code. 25 | mypy "${files[@]}" 26 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
"""Parsing and validation of the Matrix Content Scanner's configuration."""
from typing import Any, Dict, List, Optional, Union

import attr
import humanfriendly
from jsonschema import ValidationError, validate

from matrix_content_scanner.utils.errors import ConfigError

_ONE_WEEK_SECONDS = 604800.0


def _parse_duration(duration: Optional[Union[str, int, float]]) -> Optional[float]:
    """Parse a time duration into a float representing an amount of seconds.

    Args:
        duration: The duration to parse: None, a bare number of seconds, or a
            humanfriendly duration string such as "2d" or "1w".

    Returns:
        The number of seconds in the given duration, or None if given None.

    Raises:
        ConfigError: The duration string could not be parsed.
    """
    if duration is None:
        return None
    # The schema allows JSON "number" here, which YAML may load as an int as
    # well as a float; accept both rather than handing an int to humanfriendly.
    if isinstance(duration, (int, float)):
        return float(duration)

    try:
        return humanfriendly.parse_timespan(duration)
    except humanfriendly.InvalidTimespan as e:
        raise ConfigError(str(e)) from e


def _parse_size(size: Optional[Union[str, int, float]]) -> Optional[float]:
    """Parse a file size into a float representing the number of bytes for that
    size.

    Args:
        size: The size to parse: None, a bare number of bytes, or a
            humanfriendly size string such as "512K" or "10MB".

    Returns:
        The number of bytes represented by the given size, or None if given None.

    Raises:
        ConfigError: The size string could not be parsed.
    """
    if size is None:
        return None
    # Same as in _parse_duration: the schema allows ints as well as floats.
    if isinstance(size, (int, float)):
        return float(size)

    try:
        return humanfriendly.parse_size(size)
    except humanfriendly.InvalidSize as e:
        raise ConfigError(str(e)) from e


# Schema to validate the raw configuration dictionary against.
_config_schema = {
    "type": "object",
    "required": ["web", "scan", "crypto"],
    "additionalProperties": False,
    "properties": {
        "web": {
            "type": "object",
            "required": ["host", "port"],
            "additionalProperties": False,
            "properties": {
                "host": {"type": "string"},
                "port": {"type": "integer"},
            },
        },
        "scan": {
            "type": "object",
            "required": ["script", "temp_directory"],
            "additionalProperties": False,
            "properties": {
                "script": {"type": "string"},
                "temp_directory": {"type": "string"},
                "removal_command": {"type": "string"},
                "allowed_mimetypes": {"type": "array", "items": {"type": "string"}},
                "blocked_mimetypes": {"type": "array", "items": {"type": "string"}},
            },
        },
        "download": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "base_homeserver_url": {"type": "string"},
                "proxy": {"type": "string"},
                "additional_headers": {
                    "type": "object",
                    "additionalProperties": {"type": "string"},
                },
            },
        },
        "crypto": {
            "type": "object",
            "required": ["pickle_path", "pickle_key"],
            "additionalProperties": False,
            "properties": {
                "pickle_path": {"type": "string"},
                "pickle_key": {"type": "string"},
            },
        },
        "result_cache": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "max_size": {"type": "integer"},
                "ttl": {"type": ["string", "number"]},
                "exit_codes_to_ignore": {
                    "type": "array",
                    "items": {"type": "integer"},
                },
                "max_file_size": {"type": ["string", "number"]},
            },
        },
    },
}


@attr.s(auto_attribs=True, frozen=True, slots=True)
class WebConfig:
    """Configuration for serving the HTTP API."""

    host: str
    port: int


@attr.s(auto_attribs=True, frozen=True, slots=True)
class ScanConfig:
    """Configuration for scanning files."""

    script: str
    temp_directory: str
    removal_command: str = "rm"
    allowed_mimetypes: Optional[List[str]] = None
    blocked_mimetypes: Optional[List[str]] = None


@attr.s(auto_attribs=True, frozen=True, slots=True)
class ResultCacheConfig:
    """Configuration for caching scan results."""

    max_size: int = 1024
    ttl: float = attr.ib(default=_ONE_WEEK_SECONDS, converter=_parse_duration)
    exit_codes_to_ignore: Optional[List[int]] = None
    max_file_size: Optional[float] = attr.ib(default=None, converter=_parse_size)


@attr.s(auto_attribs=True, frozen=True, slots=True)
class DownloadConfig:
    """Configuration for downloading files."""

    base_homeserver_url: Optional[str] = None
    proxy: Optional[str] = None
    additional_headers: Optional[Dict[str, str]] = None


@attr.s(auto_attribs=True, frozen=True, slots=True)
class CryptoConfig:
    """Configuration for decrypting encrypted bodies."""

    pickle_path: str
    pickle_key: str


class MatrixContentScannerConfig:
    """The parsed configuration for the Matrix Content Scanner.

    Validates the raw dictionary against the JSON schema, then exposes each
    section as a typed, frozen attrs object.

    Args:
        config_dict: The raw configuration dictionary (e.g. loaded from YAML).

    Raises:
        ConfigError: The configuration is malformed or fails validation.
    """

    def __init__(self, config_dict: Dict[str, Any]):
        if not isinstance(config_dict, dict):
            raise ConfigError("Bad configuration format")

        try:
            validate(config_dict, _config_schema)
        except ValidationError as e:
            raise ConfigError(e.message) from e

        self.web = WebConfig(**(config_dict.get("web") or {}))
        self.scan = ScanConfig(**(config_dict.get("scan") or {}))
        self.crypto = CryptoConfig(**(config_dict.get("crypto") or {}))
        self.download = DownloadConfig(**(config_dict.get("download") or {}))
        self.result_cache = ResultCacheConfig(**(config_dict.get("result_cache") or {}))

        # Don't allow both allowlist and blocklist for MIME types, since we do not document
        # the semantics for that and it is in any case pointless.
        # This could have been expressed in JSONSchema but I suspect the error message would be poor
        # in that case.
        if (
            self.scan.allowed_mimetypes is not None
            and self.scan.blocked_mimetypes is not None
        ):
            raise ConfigError(
                "Both `scan.allowed_mimetypes` and `scan.blocked_mimetypes` are specified, which is not allowed!"
            )
import logging
from typing import TYPE_CHECKING, Awaitable, Callable

from aiohttp import web

from matrix_content_scanner.servlets.download import DownloadHandler
from matrix_content_scanner.servlets.public_key import PublicKeyHandler
from matrix_content_scanner.servlets.scan import ScanHandler
from matrix_content_scanner.servlets.thumbnail import ThumbnailHandler

if TYPE_CHECKING:
    from matrix_content_scanner.mcs import MatrixContentScanner

logger = logging.getLogger(__name__)

# aiohttp route pattern capturing the rest of the path (server_name/media_id) as
# the "media_path" match.
_MEDIA_PATH_REGEXP = r"/{media_path:.+}"

# Headers appended to every response (and to OPTIONS preflight responses).
_CORS_HEADERS = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET, POST, OPTIONS",
    "Access-Control-Allow-Headers": "Origin, X-Requested-With, Content-Type, Accept, Authorization",
}


@web.middleware
async def simple_cors_middleware(
    request: web.Request,
    handler: Callable[[web.Request], Awaitable[web.StreamResponse]],
) -> web.StreamResponse:
    """A simple aiohttp middleware that adds CORS headers to responses, and handles
    OPTIONS requests.

    Args:
        request: The request to handle.
        handler: The handler for this request.

    Returns:
        A response with CORS headers.
    """
    if request.method == "OPTIONS":
        # We don't register routes for OPTIONS requests, therefore the handler we're given
        # in this case just raises a 405 Method Not Allowed status using an exception.
        # Because we actually want to return a 200 OK with additional headers, we ignore
        # the handler and just return a new response.
        response = web.StreamResponse(
            status=200,
            headers=_CORS_HEADERS,
        )
        return response

    # Run the request's handler and append CORS headers to it.
    response = await handler(request)
    response.headers.update(_CORS_HEADERS)
    return response


@web.middleware
async def json_errors_middleware(
    request: web.Request,
    handler: Callable[[web.Request], Awaitable[web.StreamResponse]],
) -> web.StreamResponse:
    """A simple aiohttp middleware that converts 404/405 errors into Matrix JSON error.

    Args:
        request: The request to handle.
        handler: The handler for this request.

    Returns:
        The original response OR a JSON error response.
    """
    # Run the request's handler, turning "not found"/"method not allowed" errors
    # into Matrix-style JSON error responses.
    try:
        return await handler(request)
    except (web.HTTPNotFound, web.HTTPMethodNotAllowed) as ex:
        # Return the proper JSON response.
        return web.json_response(
            {"errcode": "M_UNRECOGNIZED", "error": "Unrecognized request"},
            status=ex.status,
        )


class HTTPServer:
    """Serves the content scanner's HTTP API with aiohttp."""

    def __init__(self, mcs: "MatrixContentScanner"):
        self._mcs = mcs
        self._bind_address = mcs.config.web.host
        self._bind_port = mcs.config.web.port

        self._app = self._build_app()

    def _build_app(self) -> web.Application:
        """Build the aiohttp app and attach all the handlers to it.

        Returns:
            The built aiohttp application.
        """
        # First we build an application with all routes defined on the root path.
        app = web.Application()

        scan_handler = ScanHandler(self._mcs)
        download_handler = DownloadHandler(self._mcs)
        thumbnail_handler = ThumbnailHandler(self._mcs)
        public_key_handler = PublicKeyHandler(self._mcs)

        app.add_routes(
            [
                web.get("/scan" + _MEDIA_PATH_REGEXP, scan_handler.handle_plain),
                web.post("/scan_encrypted", scan_handler.handle_encrypted),
                web.get(
                    "/download" + _MEDIA_PATH_REGEXP, download_handler.handle_plain
                ),
                web.post("/download_encrypted", download_handler.handle_encrypted),
                web.get(
                    "/thumbnail" + _MEDIA_PATH_REGEXP,
                    thumbnail_handler.handle_thumbnail,
                ),
                web.get(
                    "/public_key",
                    public_key_handler.handle_public_key,
                ),
            ]
        )

        # Then we create a root application, and define the app we previously created as
        # a subapp on the base path for the content scanner API.
        root = web.Application(
            # Apply middlewares. This will also apply to subapps.
            middlewares=[
                # Handle trailing slashes.
                web.normalize_path_middleware(),
                # Handle CORS.
                simple_cors_middleware,
                # Convert unknown routes/methods into JSON errors.
                json_errors_middleware,
            ],
        )
        root.add_subapp("/_matrix/media_proxy/unstable", app)

        return root

    def start(self) -> None:
        """Start an aiohttp server serving the content scanner API."""
        logger.info("Starting listener on %s:%d", self._bind_address, self._bind_port)
        web.run_app(
            app=self._app,
            host=self._bind_address,
            port=self._bind_port,
            # Suppress aiohttp's own startup banner; we log the address ourselves.
            print=None,
        )
--------------------------------------------------------------------------------
/src/matrix_content_scanner/logutils.py:
--------------------------------------------------------------------------------

# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
import logging
from contextvars import ContextVar
from typing import Any

# The request's ID.
request_id: ContextVar[str] = ContextVar("request_id")


def setup_custom_factory() -> None:
    """Generates a new record factory, chained to the current factory, and sets it as the
    new default record factory.

    The new factory adds attributes for the media path and request type to log records,
    and populates them using the matching ContextVars;
    """
    old_factory = logging.getLogRecordFactory()

    def _factory(*args: Any, **kwargs: Any) -> logging.LogRecord:
        record = old_factory(*args, **kwargs)
        # Default to None so records emitted outside of a request still format.
        record.request_id = request_id.get(None)
        return record

    logging.setLogRecordFactory(_factory)


def set_request_id_in_context(v: str) -> None:
    """Sets the request_id ContextVar to the given value.

    Args:
        v: The value to set the ContextVar.
35 | """ 36 | request_id.set(v) 37 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/mcs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | import argparse 6 | import logging 7 | import sys 8 | from functools import cached_property 9 | 10 | import yaml 11 | from yaml.scanner import ScannerError 12 | 13 | from matrix_content_scanner import logutils 14 | from matrix_content_scanner.config import MatrixContentScannerConfig 15 | from matrix_content_scanner.httpserver import HTTPServer 16 | from matrix_content_scanner.mcs_rust import crypto, reset_logging_config 17 | from matrix_content_scanner.scanner.file_downloader import FileDownloader 18 | from matrix_content_scanner.scanner.scanner import Scanner 19 | from matrix_content_scanner.utils.errors import ConfigError 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class MatrixContentScanner: 25 | def __init__( 26 | self, 27 | config: MatrixContentScannerConfig, 28 | ) -> None: 29 | self.config = config 30 | 31 | @cached_property 32 | def file_downloader(self) -> FileDownloader: 33 | return FileDownloader(self) 34 | 35 | @cached_property 36 | def scanner(self) -> Scanner: 37 | return Scanner(self) 38 | 39 | @cached_property 40 | def crypto_handler(self) -> crypto.CryptoHandler: 41 | return crypto.CryptoHandler( 42 | self.config.crypto.pickle_key, self.config.crypto.pickle_path 43 | ) 44 | 45 | def start(self) -> None: 46 | http_server = HTTPServer(self) 47 | http_server.start() 48 | 49 | 50 | def setup_logging() -> None: 51 | """Basic logging setup.""" 52 | # Set the format, this assumes every logger is created by 53 | # matrix_content_scanner.logging.getLogger and has custom request_type and 54 | # media_path fields set. 
55 | log_format = "%(asctime)s - %(name)s - %(lineno)d - %(levelname)s - %(request_id)s - %(message)s" 56 | formatter = logging.Formatter(log_format) 57 | 58 | logutils.setup_custom_factory() 59 | 60 | # Create the handler and set the default logging level to INFO. 61 | handler = logging.StreamHandler() 62 | handler.setFormatter(formatter) 63 | rootLogger = logging.getLogger("") 64 | rootLogger.setLevel(logging.INFO) 65 | rootLogger.addHandler(handler) 66 | 67 | reset_logging_config() 68 | 69 | 70 | def main() -> None: 71 | parser = argparse.ArgumentParser( 72 | description="A web service for scanning media hosted by a Matrix media repository." 73 | ) 74 | parser.add_argument( 75 | "-c", 76 | type=argparse.FileType("r"), 77 | required=True, 78 | help="The YAML configuration file.", 79 | ) 80 | 81 | args = parser.parse_args() 82 | 83 | # Load the configuration file. 84 | try: 85 | cfg = MatrixContentScannerConfig(yaml.safe_load(args.c)) 86 | except (ConfigError, ScannerError) as e: 87 | # If there's an error reading the file, print it and exit without raising so we 88 | # don't confuse/annoy the user with an unnecessary stack trace. 89 | print("Failed to read configuration file: %s" % e, file=sys.stderr) 90 | sys.exit(1) 91 | 92 | # Create the content scanner. 93 | mcs = MatrixContentScanner(cfg) 94 | 95 | setup_logging() 96 | 97 | # Construct the crypto handler early on, so we can make sure we can load the Olm key 98 | # pair from the pickle file (or write it if it doesn't already exist). 99 | try: 100 | _ = mcs.crypto_handler 101 | except ConfigError as e: 102 | print(e, file=sys.stderr) 103 | sys.exit(1) 104 | 105 | # Start the content scanner. 
106 | mcs.start() 107 | 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/mcs_rust/__init__.pyi: -------------------------------------------------------------------------------- 1 | def get_rust_file_digest() -> str: ... 2 | def reset_logging_config() -> None: ... 3 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/mcs_rust/crypto.pyi: -------------------------------------------------------------------------------- 1 | from matrix_content_scanner.utils.types import JsonDict 2 | 3 | class CryptoHandler: 4 | def __init__(self, pickle_key: str, pickle_path: str) -> None: ... 5 | @property 6 | def public_key(self) -> str: ... 7 | def decrypt_body(self, ciphertext: str, mac: str, ephemeral: str) -> str: ... 8 | def encrypt(self, public_key: str, payload: str) -> PkMessage: ... 9 | 10 | class PkMessage: 11 | @property 12 | def ephemeral_key(self) -> str: ... 13 | @property 14 | def mac(self) -> str: ... 15 | @property 16 | def ciphertext(self) -> str: ... 17 | 18 | def decrypt_attachment(body: bytes, key_info: JsonDict) -> bytes: ... 19 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/element-hq/matrix-content-scanner-python/5e26332a9ef35e5f596b97758f7b6aea16b14cb8/src/matrix_content_scanner/py.typed -------------------------------------------------------------------------------- /src/matrix_content_scanner/scanner/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
5 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/scanner/file_downloader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | import json 6 | import logging 7 | import urllib.parse 8 | from http import HTTPStatus 9 | from typing import TYPE_CHECKING, Dict, Optional, Tuple 10 | 11 | import aiohttp 12 | from multidict import CIMultiDictProxy, MultiMapping 13 | 14 | from matrix_content_scanner.utils.constants import ErrCode 15 | from matrix_content_scanner.utils.errors import ( 16 | ContentScannerRestError, 17 | WellKnownDiscoveryError, 18 | ) 19 | from matrix_content_scanner.utils.types import MediaDescription 20 | 21 | if TYPE_CHECKING: 22 | from matrix_content_scanner.mcs import MatrixContentScanner 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class _PathNotFoundException(Exception): 28 | """An exception raised to signal that a URL could not be found on the remote 29 | homeserver. 
30 | """ 31 | 32 | 33 | class FileDownloader: 34 | MEDIA_DOWNLOAD_PREFIX = "_matrix/media/%s/download" 35 | MEDIA_THUMBNAIL_PREFIX = "_matrix/media/%s/thumbnail" 36 | MEDIA_DOWNLOAD_AUTHENTICATED_PREFIX = "_matrix/client/%s/media/download" 37 | MEDIA_THUMBNAIL_AUTHENTICATED_PREFIX = "_matrix/client/%s/media/thumbnail" 38 | 39 | def __init__(self, mcs: "MatrixContentScanner"): 40 | self._base_url = mcs.config.download.base_homeserver_url 41 | self._well_known_cache: Dict[str, Optional[str]] = {} 42 | self._proxy_url = mcs.config.download.proxy 43 | self._headers = ( 44 | mcs.config.download.additional_headers 45 | if mcs.config.download.additional_headers is not None 46 | else {} 47 | ) 48 | 49 | async def download_file( 50 | self, 51 | media_path: str, 52 | thumbnail_params: Optional[MultiMapping[str]] = None, 53 | auth_header: Optional[str] = None, 54 | ) -> MediaDescription: 55 | """Retrieve the file with the given `server_name/media_id` path, and stores it on 56 | disk. 57 | 58 | Args: 59 | media_path: The path identifying the media to retrieve. 60 | thumbnail_params: If present, then we want to request and scan a thumbnail 61 | generated with the provided parameters instead of the full media. 62 | auth_header: If present, we forward the given Authorization header, this is 63 | required for authenticated media endpoints. 64 | 65 | Returns: 66 | A description of the file (including its full content). 67 | 68 | Raises: 69 | ContentScannerRestError: The file was not found or could not be downloaded due 70 | to an error on the remote homeserver's side. 
71 | """ 72 | 73 | auth_media = True if auth_header is not None else False 74 | 75 | prefix = ( 76 | self.MEDIA_DOWNLOAD_AUTHENTICATED_PREFIX 77 | if auth_media 78 | else self.MEDIA_DOWNLOAD_PREFIX 79 | ) 80 | if thumbnail_params is not None: 81 | prefix = ( 82 | self.MEDIA_THUMBNAIL_AUTHENTICATED_PREFIX 83 | if auth_media 84 | else self.MEDIA_THUMBNAIL_PREFIX 85 | ) 86 | 87 | url = await self._build_https_url( 88 | media_path, prefix, "v1" if auth_media else "v3" 89 | ) 90 | 91 | # Attempt to retrieve the file at the generated URL. 92 | try: 93 | file = await self._get_file_content(url, thumbnail_params, auth_header) 94 | except _PathNotFoundException: 95 | if auth_media: 96 | raise ContentScannerRestError( 97 | http_status=HTTPStatus.NOT_FOUND, 98 | reason=ErrCode.NOT_FOUND, 99 | info="File not found", 100 | ) 101 | 102 | # If the file could not be found, it might be because the homeserver hasn't 103 | # been upgraded to a version that supports Matrix v1.1 endpoints yet, so try 104 | # again with an r0 endpoint. 105 | logger.info("File not found, trying legacy r0 path") 106 | 107 | url = await self._build_https_url(media_path, prefix, "r0") 108 | 109 | try: 110 | file = await self._get_file_content(url, thumbnail_params, auth_header) 111 | except _PathNotFoundException: 112 | # If that still failed, raise an error. 113 | raise ContentScannerRestError( 114 | http_status=HTTPStatus.NOT_FOUND, 115 | reason=ErrCode.NOT_FOUND, 116 | info="File not found", 117 | ) 118 | 119 | return file 120 | 121 | async def _build_https_url( 122 | self, 123 | media_path: str, 124 | prefix: str, 125 | endpoint_version: str, 126 | ) -> str: 127 | """Turn a `server_name/media_id` path into an https:// one we can use to fetch 128 | the media. 129 | 130 | Note that if `base_homeserver_url` is set to an http URL, it will not be turned 131 | into an https one. 132 | 133 | Args: 134 | media_path: The media path to translate. 
135 | endpoint_version: The version of the download endpoint to use. As of Matrix 136 | v1.11, this is "v1" for authenticated media. For unauthenticated media 137 | this is either "v3" or "r0". 138 | 139 | Returns: 140 | An https URL to use. If `base_homeserver_url` is set in the config, this 141 | will be used as the base of the URL. 142 | """ 143 | server_name, media_id = media_path.split("/") 144 | 145 | # Figure out what base URL to use. If one is specified in the configuration file, 146 | # use it, otherwise try to discover one using .well-known. If that fails, use the 147 | # server name with an HTTPS scheme. 148 | if self._base_url is not None: 149 | base_url = self._base_url 150 | else: 151 | base_url = None 152 | 153 | try: 154 | base_url = await self._discover_via_well_known(server_name) 155 | except WellKnownDiscoveryError as e: 156 | # We don't catch ContentScannerRestErrors here because if one makes its 157 | # way up here then it likely means that trying to reach https://server_name 158 | # failed, in which case we're unlikely to be able to reach it again when 159 | # downloading the file, so we let the error escalate. 160 | logger.info("Failed to discover server via well-known: %s", e) 161 | 162 | if base_url is None: 163 | # base_url might be None if either .well-known discovery failed, or we 164 | # didn't find a .well-known file. 165 | base_url = "https://" + server_name 166 | 167 | # Build the full URL. 168 | path_prefix = prefix % endpoint_version 169 | url = "%s/%s/%s/%s" % ( 170 | base_url, 171 | path_prefix, 172 | urllib.parse.quote(server_name), 173 | urllib.parse.quote(media_id), 174 | ) 175 | 176 | return url 177 | 178 | async def _get_file_content( 179 | self, 180 | url: str, 181 | thumbnail_params: Optional[MultiMapping[str]], 182 | auth_header: Optional[str] = None, 183 | ) -> MediaDescription: 184 | """Retrieve the content of the file at a given URL. 185 | 186 | Args: 187 | url: The URL to query. 
188 | thumbnail_params: Query parameters used if the request is for a thumbnail. 189 | auth_header: If present, we forward the given Authorization header, this is 190 | required for authenticated media endpoints. 191 | 192 | Returns: 193 | A description of the file (including its full content). 194 | 195 | Raises: 196 | _PathNotFoundException: the server returned an error that can mean the path 197 | of the request wasn't understood, e.g. because we requested a v3 URL but 198 | the server only supports r0, or the media couldn't be found. 199 | We raise a separate error class in this case because if the error is due 200 | to a v3 vs r0 path we want to retry the request on the r0 path. 201 | ContentScannerRestError: the server returned a non-200 status which cannot 202 | meant that the path wasn't understood. 203 | """ 204 | code, body, headers = await self._get( 205 | url, query=thumbnail_params, auth_header=auth_header 206 | ) 207 | 208 | logger.info("Remote server responded with %d", code) 209 | 210 | # If the response isn't a 200 OK, raise. 211 | if 200 < code: 212 | logger.info("Response body: %s", body) 213 | # If the response is a 404 or an "unrecognised request" à la Synapse, 214 | # consider that we could not find the media, and that we should retry if this 215 | # request was directed at a v3 endpoint. 
216 | if code == 400: 217 | try: 218 | err = json.loads(body) 219 | if err["errcode"] == "M_UNRECOGNIZED": 220 | raise _PathNotFoundException 221 | except (json.decoder.JSONDecodeError, KeyError): 222 | pass 223 | 224 | if code == 401: 225 | try: 226 | err = json.loads(body) 227 | if err["errcode"] == ErrCode.MISSING_TOKEN: 228 | raise ContentScannerRestError( 229 | HTTPStatus.UNAUTHORIZED, 230 | ErrCode.MISSING_TOKEN, 231 | "Access token missing from request", 232 | ) 233 | if err["errcode"] == ErrCode.UNKNOWN_TOKEN: 234 | raise ContentScannerRestError( 235 | HTTPStatus.UNAUTHORIZED, 236 | ErrCode.UNKNOWN_TOKEN, 237 | "Invalid access token passed", 238 | ) 239 | except (json.decoder.JSONDecodeError, KeyError): 240 | pass 241 | 242 | if code == 404: 243 | raise _PathNotFoundException 244 | 245 | raise ContentScannerRestError( 246 | HTTPStatus.BAD_GATEWAY, 247 | ErrCode.REQUEST_FAILED, 248 | "The remote server responded with an error", 249 | ) 250 | 251 | # Check that we have the right amount of Content-Type headers (so we don't get 252 | # confused later when we try comparing it with the file's MIME type). 253 | content_type_headers = headers.getall("content-type", None) 254 | if content_type_headers is None or len(content_type_headers) != 1: 255 | raise ContentScannerRestError( 256 | HTTPStatus.BAD_GATEWAY, 257 | ErrCode.REQUEST_FAILED, 258 | "The remote server responded with an invalid amount of Content-Type headers", 259 | ) 260 | 261 | return MediaDescription( 262 | content_type=content_type_headers[0], 263 | content=body, 264 | response_headers=headers, 265 | ) 266 | 267 | async def _discover_via_well_known(self, domain: str) -> Optional[str]: 268 | """Try to discover the base URL for the given domain via .well-known client 269 | discovery. 270 | 271 | Args: 272 | domain: The domain to discover the base URL for. 273 | 274 | Returns: 275 | The base URL to use, or None if no .well-known client file exist for this 276 | domain. 
277 | 278 | Raises: 279 | WellKnownDiscoveryError if an error happened during the discovery attempt. 280 | """ 281 | # Check if we already have a result cached, and if so return with it straight 282 | # away. 283 | if domain in self._well_known_cache: 284 | logger.info("Fetching .well-known discovery result from cache") 285 | return self._well_known_cache[domain] 286 | 287 | # Attempt to download the .well-known file. 288 | try: 289 | url = f"https://{domain}/.well-known/matrix/client" 290 | code, body, _ = await self._get(url) 291 | except ContentScannerRestError: 292 | raise WellKnownDiscoveryError(f"Failed to reach web server at {domain}") 293 | 294 | if code != 200: 295 | if code == 404: 296 | # If the response status is 404, then the homeserver hasn't set up 297 | # .well-known discovery, in which case we tell the caller that there's 298 | # no base URL to use rather than raising an error. 299 | # The difference is that we want to cache this result here, but we don't 300 | # want to do that when the discovery fails due to an incorrectly set up 301 | # file or an unavailable homeserver, which might be fixed later on. 302 | logger.info( 303 | ".well-known discover has not been set up for this homeserver" 304 | ) 305 | self._well_known_cache[domain] = None 306 | return None 307 | 308 | raise WellKnownDiscoveryError( 309 | f"Server responded with non-200 status {code}" 310 | ) 311 | 312 | # Try to parse the JSON content. 313 | try: 314 | parsed_body = json.loads(body) 315 | except json.decoder.JSONDecodeError as e: 316 | raise WellKnownDiscoveryError(e) 317 | 318 | # Check if the parsed content has a base URL in the right place. 319 | try: 320 | base_url: str = parsed_body["m.homeserver"]["base_url"] 321 | except (KeyError, TypeError): 322 | # We might get a KeyError if we're trying to reach a key that doesn't exist, 323 | # and we might get a TypeError if parsed_body or parsed_body["m.homeserver"] 324 | # isn't a dictionary. 
325 | raise WellKnownDiscoveryError("Response did not include a usable URL") 326 | 327 | # Remove the trailing slash if there is one. 328 | if base_url.endswith("/"): 329 | base_url = base_url[:-1] 330 | 331 | # Check if the base URL is one for a working homeserver. 332 | url = base_url + "/_matrix/client/versions" 333 | try: 334 | code, _, _ = await self._get(url) 335 | except ContentScannerRestError: 336 | raise WellKnownDiscoveryError( 337 | "Base URL does not seem to point to a working homeserver" 338 | ) 339 | 340 | if code != 200: 341 | raise WellKnownDiscoveryError( 342 | "Base URL does not seem to point to a working homeserver" 343 | ) 344 | 345 | # Cache and return the result. 346 | self._well_known_cache[domain] = base_url 347 | return base_url 348 | 349 | async def _get( 350 | self, 351 | url: str, 352 | query: Optional[MultiMapping[str]] = None, 353 | auth_header: Optional[str] = None, 354 | ) -> Tuple[int, bytes, CIMultiDictProxy[str]]: 355 | """Sends a GET request to the provided URL. 356 | 357 | Args: 358 | url: The URL to send requests to. 359 | query: Optional parameters to use in the request's query string. 360 | auth_header: If present, we forward the given Authorization header, this is 361 | required for authenticated media endpoints. 362 | 363 | Returns: 364 | The HTTP status code, body and headers the remote server responded with. 365 | 366 | Raises: 367 | ContentScannerRestError(502) if the request failed (if the remote server 368 | timed out or refused the connection, etc.). 
369 | """ 370 | try: 371 | logger.info("Sending GET request to %s", url) 372 | async with aiohttp.ClientSession() as session: 373 | if auth_header is not None: 374 | request_headers = {"Authorization": auth_header, **self._headers} 375 | else: 376 | request_headers = self._headers 377 | 378 | async with session.get( 379 | url, 380 | proxy=self._proxy_url, 381 | headers=request_headers, 382 | params=query, 383 | ) as resp: 384 | return resp.status, await resp.read(), resp.headers 385 | 386 | except Exception as e: 387 | logger.error(e) 388 | raise ContentScannerRestError( 389 | HTTPStatus.BAD_GATEWAY, 390 | ErrCode.REQUEST_FAILED, 391 | "Failed to reach the remote server", 392 | ) 393 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/scanner/scanner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
import asyncio
import hashlib
import logging
import os
import subprocess
from asyncio import Future
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

import attr
import magic
from cachetools import TTLCache
from canonicaljson import encode_canonical_json
from humanfriendly import format_size
from multidict import MultiMapping

from matrix_content_scanner.mcs_rust import crypto
from matrix_content_scanner.utils.constants import ErrCode
from matrix_content_scanner.utils.errors import (
    ContentScannerRestError,
    FileDirtyError,
    FileMimeTypeForbiddenError,
)
from matrix_content_scanner.utils.types import JsonDict, MediaDescription

if TYPE_CHECKING:
    from matrix_content_scanner.mcs import MatrixContentScanner

logger = logging.getLogger(__name__)


@attr.s(auto_attribs=True, frozen=True)
class CacheEntry:
    """An entry in the scanner's result cache."""

    # The result of the scan: True if the scan passed, False otherwise.
    result: bool

    # The media that was scanned, so we can return it in future requests. We only cache
    # it if the scan succeeded and the file's size does not exceed the configured limit,
    # otherwise it's None.
    media: Optional[MediaDescription] = None

    # Hash of the media content, so we can make sure no malicious servers changed the file
    # since we've scanned it (e.g. if we need to re-download it because the file was too
    # big). None if the scan failed.
    media_hash: Optional[str] = None

    # Info to include in the FileDirtyError if the scan failed.
    info: Optional[str] = None


class Scanner:
    # Downloads media, runs the configured scanning script on it, and caches the
    # results (and, size permitting, the media itself).

    def __init__(self, mcs: "MatrixContentScanner"):
        self._file_downloader = mcs.file_downloader
        self._script = mcs.config.scan.script
        self._removal_command = mcs.config.scan.removal_command
        # resolve(strict=True) fails fast at startup if the directory doesn't exist.
        self._store_directory = Path(mcs.config.scan.temp_directory).resolve(
            strict=True
        )

        # Result cache settings.
        self._result_cache: TTLCache[str, CacheEntry] = TTLCache(
            maxsize=mcs.config.result_cache.max_size,
            ttl=mcs.config.result_cache.ttl,
        )

        if mcs.config.result_cache.exit_codes_to_ignore is None:
            self._exit_codes_to_ignore = []
        else:
            self._exit_codes_to_ignore = mcs.config.result_cache.exit_codes_to_ignore

        self._max_size_to_cache = mcs.config.result_cache.max_file_size

        # List of MIME types we should allow.
        # If None, we fall back to `_blocked_mimetypes`.
        # If that's also None, we don't fail files based on their
        # MIME types (besides comparing it with the Content-Type header from the server
        # for unencrypted files).
        self._allowed_mimetypes = mcs.config.scan.allowed_mimetypes

        # List of MIME types we should block.
        # Must not be specified at the same time as `_allowed_mimetypes`.
        # See the comment for `_allowed_mimetypes` for the semantics.
        self._blocked_mimetypes = mcs.config.scan.blocked_mimetypes

        # Cache of futures for files that are currently scanning and downloading, so that
        # concurrent requests don't cause a file to be downloaded and scanned twice.
        self._current_scans: Dict[str, Future[MediaDescription]] = {}

        # Limit the number of concurrent scans.
        self._current_scan_semaphore = asyncio.Semaphore(100)

    async def scan_file(
        self,
        media_path: str,
        metadata: Optional[JsonDict] = None,
        thumbnail_params: Optional["MultiMapping[str]"] = None,
        auth_header: Optional[str] = None,
    ) -> MediaDescription:
        """Download and scan the given media.

        Unless the scan fails with one of the codes listed in `do_not_cache_exit_codes`,
        also cache the result.

        If the file already has an entry in the result cache, return this value without
        downloading the file again (unless we purposefully did not cache the file's
        content to save up on memory).

        If a file is currently already being downloaded or scanned as a result of another
        request, don't download it again and use the result from the first request.

        Args:
            media_path: The `server_name/media_id` path for the media.
            metadata: The metadata attached to the file (e.g. decryption key), or None if
                the file isn't encrypted.
            thumbnail_params: If present, then we want to request and scan a thumbnail
                generated with the provided parameters instead of the full media.
            auth_header: If present, we forward the given Authorization header, this is
                required for authenticated media endpoints.

        Returns:
            A description of the media.

        Raises:
            ContentScannerRestError if the file could not be downloaded.
            FileDirtyError if the result of the scan said that the file is dirty, or if
                the media path is malformed.
        """
        # Compute the key to use when caching, both in the current scans cache and in the
        # results cache.
        cache_key = self._get_cache_key_for_file(media_path, metadata, thumbnail_params)
        if cache_key not in self._current_scans:
            # Create a future in the context of the current event loop.
            loop = asyncio.get_event_loop()
            f = loop.create_future()
            # Register the future in the current scans cache so that subsequent queries
            # can use it.
            self._current_scans[cache_key] = f
            # Try to download and scan the file.
            try:
                res = await self._scan_file(
                    cache_key, media_path, metadata, thumbnail_params, auth_header
                )
                # Set the future's result, and mark it as done.
                f.set_result(res)
                # Return the result.
                return res
            except Exception as e:
                # If there's an exception, catch it, pass it on to the future, and raise
                # it.
                f.set_exception(e)
                # We retrieve the exception from the future, because if we don't and no
                # other request is awaiting on the future, asyncio complains about "Future
                # exception was never retrieved".
                f.exception()
                raise
            finally:
                # Remove the future from the cache.
                del self._current_scans[cache_key]

        # Another request is already downloading/scanning this file: wait on its
        # future instead of starting a duplicate scan.
        return await self._current_scans[cache_key]

    async def _scan_file(
        self,
        cache_key: str,
        media_path: str,
        metadata: Optional[JsonDict] = None,
        thumbnail_params: Optional[MultiMapping[str]] = None,
        auth_header: Optional[str] = None,
    ) -> MediaDescription:
        """Download and scan the given media.

        Unless the scan fails with one of the codes listed in `do_not_cache_exit_codes`,
        also cache the result.

        If the file already has an entry in the result cache, return this value without
        downloading the file again (unless we purposefully did not cache the file's
        content to save up on memory).

        Args:
            cache_key: The key to use to cache the result of the scan in the result cache.
            media_path: The `server_name/media_id` path for the media.
            metadata: The metadata attached to the file (e.g. decryption key), or None if
                the file isn't encrypted.
            thumbnail_params: If present, then we want to request and scan a thumbnail
                generated with the provided parameters instead of the full media.
            auth_header: If present, we forward the given Authorization header, this is
                required for authenticated media endpoints.

        Returns:
            A description of the media.

        Raises:
            ContentScannerRestError if the file could not be downloaded.
            FileDirtyError if the result of the scan said that the file is dirty, or if
                the media path is malformed.
        """
        # The media to scan.
        media: Optional[MediaDescription] = None

        # Return the cached result if there's one.
        cache_entry = self._result_cache.get(cache_key)
        if cache_entry is not None:
            logger.info("Found a cached result %s", cache_entry.result)

            if cache_entry.result is False:
                # Feed the additional info we might have added when caching the error,
                # into the new error.
                raise FileDirtyError(info=cache_entry.info)

            if cache_entry.media is not None:
                return cache_entry.media

            # If we don't have the media cached
            logger.info(
                "Got a positive result from cache without a media, downloading file",
            )

            media = await self._file_downloader.download_file(
                media_path=media_path,
                thumbnail_params=thumbnail_params,
                auth_header=auth_header,
            )

            # Compare the media's hash to ensure the server hasn't changed the file since
            # the last scan. If it has changed, shout about it in the logs, discard the
            # cache entry and scan it again.
233 | media_hash = hashlib.sha256(media.content).hexdigest() 234 | if media_hash == cache_entry.media_hash: 235 | return media 236 | 237 | logger.warning( 238 | "Media has changed since last scan (cached hash: %s, new hash: %s)," 239 | " discarding cached result and scanning again", 240 | cache_entry.media_hash, 241 | media_hash, 242 | ) 243 | 244 | del self._result_cache[cache_key] 245 | 246 | # Check if the media path is valid and only contains one slash (otherwise we'll 247 | # have issues parsing it further down the line). 248 | if media_path.count("/") != 1: 249 | info = "Malformed media ID" 250 | self._result_cache[cache_key] = CacheEntry( 251 | result=False, 252 | info=info, 253 | ) 254 | raise FileDirtyError(info) 255 | 256 | # Download the file if we don't already have it. 257 | if media is None: 258 | media = await self._file_downloader.download_file( 259 | media_path=media_path, 260 | thumbnail_params=thumbnail_params, 261 | auth_header=auth_header, 262 | ) 263 | 264 | # Download and scan the file. 265 | try: 266 | media, cacheable = await self._scan_media(media, media_path, metadata) 267 | except FileDirtyError as e: 268 | if e.cacheable: 269 | logger.info("Caching scan failure") 270 | 271 | # If the test fails, don't store the media to save memory. 272 | self._result_cache[cache_key] = CacheEntry( 273 | result=False, 274 | media=None, 275 | info=e.info, 276 | ) 277 | 278 | raise 279 | 280 | # Update the cache if the result should be cached. 281 | if cacheable: 282 | logger.info("Caching scan success") 283 | 284 | cached_media: Optional[MediaDescription] = media 285 | 286 | if ( 287 | self._max_size_to_cache is not None 288 | and len(media.content) > self._max_size_to_cache 289 | ): 290 | # Don't cache the file's content if it exceeds the maximum allowed file 291 | # size, to minimise memory usage. 
292 | logger.info( 293 | "File content has size %s, which is more than %s, not caching content", 294 | format_size(len(media.content)), 295 | format_size(self._max_size_to_cache), 296 | ) 297 | 298 | cached_media = None 299 | 300 | # Hash the media, that way if we need to re-download the file we can make sure 301 | # it's the right one. We get a hex digest in case we want to print it later. 302 | media_hash = hashlib.sha256(media.content).hexdigest() 303 | 304 | self._result_cache[cache_key] = CacheEntry( 305 | result=True, 306 | media=cached_media, 307 | media_hash=media_hash, 308 | ) 309 | 310 | return media 311 | 312 | async def _scan_media( 313 | self, 314 | media: MediaDescription, 315 | media_path: str, 316 | metadata: Optional[JsonDict] = None, 317 | ) -> Tuple[MediaDescription, bool]: 318 | """Scans the given media. 319 | 320 | Args: 321 | media: The already downloaded media. If provided, the download step is 322 | skipped. Usually provided if we've re-downloaded a file with a cached 323 | result, but the file changed since the initial scan. 324 | media_path: The `server_name/media_id` path for the media. 325 | metadata: The metadata attached to the file (e.g. decryption key), or None if 326 | the file isn't encrypted. 327 | 328 | Returns: 329 | A description of the media, as well as a boolean indicating whether the 330 | successful scan result should be cached or not. 331 | 332 | Raises: 333 | FileDirtyError if the result of the scan said that the file is dirty, or if 334 | the media path is malformed. 335 | """ 336 | 337 | # Decrypt the content if necessary. 338 | media_content = media.content 339 | if metadata is not None: 340 | # If the file is encrypted, we need to decrypt it before we can scan it. 341 | media_content = self._decrypt_file(media_content, metadata) 342 | 343 | # Check the file's MIME type to see if it's allowed. 344 | self._check_mimetype(media_content) 345 | 346 | # Write the file to disk. 
347 | file_path = self._write_file_to_disk(media_path, media_content) 348 | 349 | # Scan the file and see if the result is positive or negative. 350 | exit_code = await self._run_scan(file_path) 351 | result = exit_code == 0 352 | 353 | # If the exit code isn't part of the ones we should ignore, cache the result. 354 | cacheable = True 355 | if exit_code in self._exit_codes_to_ignore: 356 | logger.info( 357 | "Scan returned exit code %d which must not be cached", exit_code 358 | ) 359 | cacheable = False 360 | 361 | # Delete the file now that we've scanned it. 362 | logger.info("Scan has finished, removing file") 363 | removal_command_parts = self._removal_command.split() 364 | removal_command_parts.append(file_path) 365 | subprocess.run(removal_command_parts) 366 | 367 | # Raise an error if the result isn't clean. 368 | if result is False: 369 | raise FileDirtyError(cacheable=cacheable) 370 | 371 | return media, cacheable 372 | 373 | def _get_cache_key_for_file( 374 | self, 375 | media_path: str, 376 | metadata: Optional[JsonDict], 377 | thumbnail_params: Optional[MultiMapping[str]], 378 | ) -> str: 379 | """Generates the key to use to store the result for the given media in the result 380 | cache. 381 | 382 | The key is computed using the media's `server_name/media_id` path, but also the 383 | metadata dict (stringified), in case e.g. the decryption key changes, as well as 384 | the parameters used to generate the thumbnail if any (stringified), to 385 | differentiate thumbnails from full-sized media. 386 | The resulting key is a sha256 hash of the concatenation of these two values. 387 | 388 | Args: 389 | media_path: The `server_name/media_id` path of the file to scan. 390 | metadata: The file's metadata (or None if the file isn't encrypted). 391 | thumbnail_params: The parameters to generate thumbnail with. If no parameter 392 | is passed, this will be an empty dict. If the media being requested is not 393 | a thumbnail, this will be None. 
394 | """ 395 | # If we're provided with thumbnailing parameters, turn them into a structure that 396 | # can be serialised as JSON. 397 | thumbnail_params_json: Optional[Dict[str, List[str]]] = None 398 | if thumbnail_params is not None: 399 | thumbnail_params_json = {} 400 | for k in thumbnail_params.keys(): 401 | thumbnail_params_json[k] = thumbnail_params.getall(k) 402 | 403 | hash = hashlib.sha256() 404 | hash.update(media_path.encode("utf8")) 405 | hash.update(b"\0") 406 | hash.update(encode_canonical_json(metadata)) 407 | hash.update(b"\0") 408 | hash.update(encode_canonical_json(thumbnail_params_json)) 409 | 410 | return hash.hexdigest() 411 | 412 | def _decrypt_file(self, body: bytes, metadata: JsonDict) -> bytes: 413 | """Extract decryption information from the file's metadata and decrypt it. 414 | 415 | Args: 416 | body: The encrypted body of the file. 417 | metadata: The part of the request that includes decryption information. 418 | 419 | Returns: 420 | The decrypted content of the file. 421 | 422 | Raises: 423 | ContentScannerRestError(400) if the decryption failed. 424 | """ 425 | logger.info("Decrypting encrypted file") 426 | 427 | # Decrypt the file. 428 | try: 429 | return crypto.decrypt_attachment(body, metadata["file"]) 430 | except Exception as e: 431 | raise ContentScannerRestError( 432 | http_status=400, 433 | reason=ErrCode.FAILED_TO_DECRYPT, 434 | info=str(e), 435 | ) 436 | 437 | def _write_file_to_disk(self, media_path: str, body: bytes) -> str: 438 | """Writes the given content to disk. The final file name will be a concatenation 439 | of `temp_directory` and the media's `server_name/media_id` path. 440 | 441 | Args: 442 | media_path: The `server_name/media_id` path of the media we're processing. 443 | body: The bytes to write to disk. 444 | 445 | Returns: 446 | The full path to the newly written file. 
447 | 448 | Raises: 449 | FileDirtyError if the media path is malformed in a way that would cause the 450 | file to be written outside the configured directory. 451 | """ 452 | # Figure out the full absolute path for this file. 453 | full_path = self._store_directory.joinpath(media_path).resolve() 454 | try: 455 | # Check if the full path is a sub-path to the store's path, to make sure 456 | # there isn't any '..' etc. in the full path, which would cause us to try 457 | # writing outside the store's directory. 458 | full_path.relative_to(self._store_directory) 459 | except ValueError: 460 | raise FileDirtyError("Malformed media ID") 461 | 462 | logger.info("Writing file to %s", full_path) 463 | 464 | # Create any directory we need. 465 | os.makedirs(full_path.parent, exist_ok=True) 466 | 467 | with open(full_path, "wb") as fp: 468 | fp.write(body) 469 | 470 | return str(full_path) 471 | 472 | async def _run_scan(self, file_name: str) -> int: 473 | """Runs the scan script, passing it the given file name. 474 | 475 | Args: 476 | file_name: Name of the file to scan. 477 | 478 | Returns: 479 | The exit code the script returned. 480 | """ 481 | async with self._current_scan_semaphore: 482 | process = await asyncio.create_subprocess_exec( 483 | self._script, file_name, stderr=asyncio.subprocess.PIPE 484 | ) 485 | _, stderr = await process.communicate() 486 | retcode = await process.wait() 487 | if retcode == 0: 488 | logger.info("Scan succeeded") 489 | else: 490 | logger.info( 491 | "Scanning failed with exit code %d. Stderr: %s", 492 | retcode, 493 | stderr.decode(), 494 | ) 495 | 496 | return retcode 497 | 498 | def _check_mimetype(self, media_content: bytes) -> None: 499 | """Detects the MIME type of the provided bytes, and checks that this type is allowed 500 | (if an allow list is provided in the configuration) 501 | Args: 502 | media_content: The file's content. If the file is encrypted, this is its 503 | decrypted content. 
504 | Raises: 505 | FileMimeTypeForbiddenError if one of the checks fail. 506 | """ 507 | detected_mimetype = magic.from_buffer(media_content, mime=True) 508 | logger.debug("Detected MIME type for file is %s", detected_mimetype) 509 | 510 | # If there's an allow list for MIME types, check that the MIME type that's been 511 | # detected for this file is in it. 512 | if ( 513 | self._allowed_mimetypes is not None 514 | and detected_mimetype not in self._allowed_mimetypes 515 | ): 516 | logger.error( 517 | "MIME type for file is forbidden: %s", 518 | detected_mimetype, 519 | ) 520 | raise FileMimeTypeForbiddenError( 521 | f"File type: {detected_mimetype} not allowed" 522 | ) 523 | 524 | # If there's a block list for MIME types, check that the MIME type detected for 525 | # this file is NOT in it. 526 | if ( 527 | self._blocked_mimetypes is not None 528 | and detected_mimetype in self._blocked_mimetypes 529 | ): 530 | logger.error( 531 | "MIME type for file is forbidden: %s", 532 | detected_mimetype, 533 | ) 534 | raise FileMimeTypeForbiddenError( 535 | f"File type: {detected_mimetype} not allowed" 536 | ) 537 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/servlets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
"""Shared helpers for the content scanner's aiohttp servlets.

Provides the `web_handler` decorator that turns handler return values and errors
into aiohttp responses, plus helpers to extract (and, when needed, decrypt)
encrypted file metadata from request bodies.
"""
import functools
import json
import logging
from typing import Awaitable, Callable, Dict, Optional, Tuple, TypeVar, Union

import attr
from aiohttp import web
from multidict import CIMultiDictProxy

from matrix_content_scanner import logutils
from matrix_content_scanner.mcs_rust import crypto
from matrix_content_scanner.utils.constants import ErrCode
from matrix_content_scanner.utils.encrypted_file_metadata import (
    validate_encrypted_file_metadata,
)
from matrix_content_scanner.utils.errors import ContentScannerRestError
from matrix_content_scanner.utils.types import JsonDict

logger = logging.getLogger(__name__)

# Monotonically increasing sequence number used by `web_handler` to build a unique
# request ID for logging (see `set_request_id_in_context` below).
_next_request_seq = 0

# Type variable for the object the decorated handler method is bound to.
_Handler = TypeVar("_Handler")


@attr.s(auto_attribs=True, frozen=True, slots=True)
class _BytesResponse:
    """A binary response, and the headers to send back to the client alongside it."""

    headers: CIMultiDictProxy[str]
    content: bytes


def web_handler(
    func: Callable[
        [_Handler, web.Request], Awaitable[Tuple[int, Union[JsonDict, _BytesResponse]]]
    ],
) -> Callable[[_Handler, web.Request], Awaitable[web.Response]]:
    """Decorator that adds a wrapper to the given web handler method, which turns its
    return value into an aiohttp Response, and handles errors.

    Args:
        func: The function to wrap.

    Returns:
        The wrapper to run for this function.
    """

    def handle_error(status: int, reason: ErrCode, info: Optional[str]) -> web.Response:
        """Turns an error with the given parameters into an aiohttp Response.

        Args:
            status: The HTTP status code.
            reason: The error code to include in the response's JSON body.
            info: Optional extra info to include in the response's JSON body.
        """
        # Write the reason for the error into the response body, and add some extra info
        # if we have any.
        res_body: JsonDict = {"reason": reason}
        if info is not None:
            res_body["info"] = info

        res = _to_json_bytes(res_body)

        return web.Response(
            status=status,
            content_type="application/json",
            body=res,
        )

    @functools.wraps(func)
    async def wrapper(self: _Handler, request: web.Request) -> web.Response:
        """Run the wrapped method, and turn the return value into an aiohttp Response.

        If the wrapped method raises an exception, turn that into an aiohttp Response
        as well.

        Args:
            self: The object the wrapped method belongs to.
            request: The aiohttp Request to process.
        """
        # Set the request ID in the logging context, and increment the sequence for the
        # next request.
        global _next_request_seq
        request_id = f"{request.method}-{_next_request_seq}"
        logutils.set_request_id_in_context(request_id)
        _next_request_seq += 1

        # Check that the path is correct.
        if not request.path.startswith("/_matrix/media_proxy/unstable"):
            return handle_error(
                status=400,
                reason=ErrCode.UNKNOWN,
                info="Invalid path",
            )

        try:
            status, res = await func(self, request)

            # Set the response and headers according to the return value. If the handler
            # didn't return with a bytes response (in which it is responsible for
            # providing the headers, including the content-type one), default to json.
            headers: Union[Dict[str, str], CIMultiDictProxy[str]]
            if isinstance(res, _BytesResponse):
                raw_res = res.content
                headers = res.headers
            else:
                raw_res = _to_json_bytes(res)
                headers = {"content-type": "application/json"}

            return web.Response(
                status=status,
                body=raw_res,
                headers=headers,
            )
        except ContentScannerRestError as e:
            # If we get a REST error, use it to generate an error response.
            return handle_error(
                status=e.http_status,
                reason=e.reason,
                info=e.info,
            )
        except Exception as e:
            # Otherwise, just treat it as an unknown server error.
            logger.exception(e)
            return handle_error(
                status=500,
                reason=ErrCode.UNKNOWN,
                info="Internal Server Error",
            )

    return wrapper


def _to_json_bytes(content: JsonDict) -> bytes:
    """Converts a dict into JSON and encodes it to bytes."""
    return json.dumps(content).encode("UTF-8")


async def get_media_metadata_from_request(
    request: web.Request,
    crypto_handler: crypto.CryptoHandler,
) -> Tuple[str, JsonDict]:
    """Extracts, optionally decrypts, and validates encrypted file metadata from a
    request body.

    Args:
        request: The request to extract the data from.
        crypto_handler: The crypto handler to use if we need to decrypt an Olm-encrypted
            body.

    Returns:
        A tuple of the media's `server_name/media_id` path (taken from the `url` in
        the validated metadata, with the `mxc://` prefix stripped) and the metadata
        itself.

    Raises:
        ContentScannerRestError(400) if the request's body is None or if the metadata
            didn't pass schema validation.
    """
    # NOTE(review): defensive check — aiohttp normally exposes a stream object here
    # even for empty bodies; confirm whether this branch is reachable.
    if request.content is None:
        raise ContentScannerRestError(
            400,
            ErrCode.MALFORMED_JSON,
            "No content in request body",
        )

    try:
        body = await request.json()
    except json.decoder.JSONDecodeError as e:
        raise ContentScannerRestError(400, ErrCode.MALFORMED_JSON, str(e))

    metadata = _metadata_from_body(body, crypto_handler)

    validate_encrypted_file_metadata(metadata)

    # Get the media path.
    url = metadata["file"]["url"]
    media_path = url[len("mxc://") :]

    return media_path, metadata


def _metadata_from_body(
    body: JsonDict, crypto_handler: crypto.CryptoHandler
) -> JsonDict:
    """Parse the given body as JSON, and decrypts it if needed.

    Args:
        body: The body, parsed as JSON.
        crypto_handler: The crypto handler to use if we need to decrypt an Olm-encrypted
            body.

    Returns:
        The parsed and decrypted file metadata.

    Raises:
        ContentScannerRestError(400) if the body isn't valid JSON or isn't a dictionary.
    """
    # Every POST request body in the API implemented by the content scanner is a dict.
    if not isinstance(body, dict):
        raise ContentScannerRestError(
            400,
            ErrCode.MALFORMED_JSON,
            "Body must be a dictionary",
        )

    # Check if the metadata is encrypted, if not then the metadata is in clear text in
    # the body so just return it.
    encrypted_body: Optional[JsonDict] = body.get("encrypted_body")
    if encrypted_body is None:
        return body

    # If it is encrypted, decrypt it and return the decrypted version.
    try:
        decrypted: JsonDict = json.loads(
            crypto_handler.decrypt_body(
                ciphertext=encrypted_body["ciphertext"],
                mac=encrypted_body["mac"],
                ephemeral=encrypted_body["ephemeral"],
            )
        )
        return decrypted
    except Exception as e:
        logger.exception("Failed to decrypt encrypted body")
        raise ContentScannerRestError(
            http_status=400,
            reason=ErrCode.FAILED_TO_DECRYPT,
            info=str(e),
        )
--------------------------------------------------------------------------------
/src/matrix_content_scanner/servlets/download.py:
--------------------------------------------------------------------------------
# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
5 | from typing import TYPE_CHECKING, Optional, Tuple 6 | 7 | from aiohttp import web 8 | 9 | from matrix_content_scanner.servlets import ( 10 | _BytesResponse, 11 | get_media_metadata_from_request, 12 | web_handler, 13 | ) 14 | from matrix_content_scanner.utils.types import JsonDict 15 | 16 | if TYPE_CHECKING: 17 | from matrix_content_scanner.mcs import MatrixContentScanner 18 | 19 | 20 | class DownloadHandler: 21 | def __init__(self, content_scanner: "MatrixContentScanner"): 22 | self._scanner = content_scanner.scanner 23 | self._crypto_handler = content_scanner.crypto_handler 24 | 25 | async def _scan( 26 | self, 27 | media_path: str, 28 | metadata: Optional[JsonDict] = None, 29 | auth_header: Optional[str] = None, 30 | ) -> Tuple[int, _BytesResponse]: 31 | media = await self._scanner.scan_file( 32 | media_path, metadata, auth_header=auth_header 33 | ) 34 | 35 | return 200, _BytesResponse( 36 | headers=media.response_headers, 37 | content=media.content, 38 | ) 39 | 40 | @web_handler 41 | async def handle_plain(self, request: web.Request) -> Tuple[int, _BytesResponse]: 42 | """Handles GET requests to ../download/serverName/mediaId""" 43 | media_path = request.match_info["media_path"] 44 | return await self._scan( 45 | media_path, auth_header=request.headers.get("Authorization") 46 | ) 47 | 48 | @web_handler 49 | async def handle_encrypted( 50 | self, request: web.Request 51 | ) -> Tuple[int, _BytesResponse]: 52 | """Handles POST requests to ../download_encrypted""" 53 | media_path, metadata = await get_media_metadata_from_request( 54 | request, self._crypto_handler 55 | ) 56 | 57 | return await self._scan( 58 | media_path, metadata, auth_header=request.headers.get("Authorization") 59 | ) 60 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/servlets/public_key.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # 
SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | from typing import TYPE_CHECKING, Tuple 6 | 7 | from aiohttp import web 8 | 9 | from matrix_content_scanner.servlets import web_handler 10 | from matrix_content_scanner.utils.types import JsonDict 11 | 12 | if TYPE_CHECKING: 13 | from matrix_content_scanner.mcs import MatrixContentScanner 14 | 15 | 16 | class PublicKeyHandler: 17 | def __init__(self, content_scanner: "MatrixContentScanner") -> None: 18 | self._crypto_handler = content_scanner.crypto_handler 19 | 20 | @web_handler 21 | async def handle_public_key(self, request: web.Request) -> Tuple[int, JsonDict]: 22 | """Handles GET requests to .../public_key""" 23 | return 200, {"public_key": self._crypto_handler.public_key} 24 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/servlets/scan.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
5 | from typing import TYPE_CHECKING, Optional, Tuple 6 | 7 | from aiohttp import web 8 | 9 | from matrix_content_scanner.servlets import get_media_metadata_from_request, web_handler 10 | from matrix_content_scanner.utils.errors import FileDirtyError 11 | from matrix_content_scanner.utils.types import JsonDict 12 | 13 | if TYPE_CHECKING: 14 | from matrix_content_scanner.mcs import MatrixContentScanner 15 | 16 | 17 | class ScanHandler: 18 | def __init__(self, content_scanner: "MatrixContentScanner"): 19 | self._scanner = content_scanner.scanner 20 | self._crypto_handler = content_scanner.crypto_handler 21 | 22 | async def _scan_and_format( 23 | self, 24 | media_path: str, 25 | metadata: Optional[JsonDict] = None, 26 | auth_header: Optional[str] = None, 27 | ) -> Tuple[int, JsonDict]: 28 | try: 29 | await self._scanner.scan_file(media_path, metadata, auth_header=auth_header) 30 | except FileDirtyError as e: 31 | res = {"clean": False, "info": e.info} 32 | else: 33 | res = {"clean": True, "info": "File is clean"} 34 | 35 | return 200, res 36 | 37 | @web_handler 38 | async def handle_plain(self, request: web.Request) -> Tuple[int, JsonDict]: 39 | """Handles GET requests to ../scan/serverName/mediaId""" 40 | media_path = request.match_info["media_path"] 41 | return await self._scan_and_format( 42 | media_path, auth_header=request.headers.get("Authorization") 43 | ) 44 | 45 | @web_handler 46 | async def handle_encrypted(self, request: web.Request) -> Tuple[int, JsonDict]: 47 | """Handles GET requests to ../scan_encrypted""" 48 | media_path, metadata = await get_media_metadata_from_request( 49 | request, self._crypto_handler 50 | ) 51 | return await self._scan_and_format( 52 | media_path, metadata, auth_header=request.headers.get("Authorization") 53 | ) 54 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/servlets/thumbnail.py: -------------------------------------------------------------------------------- 1 | 
# Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | from typing import TYPE_CHECKING, Tuple 6 | 7 | from aiohttp import web 8 | 9 | from matrix_content_scanner.servlets import _BytesResponse, web_handler 10 | 11 | if TYPE_CHECKING: 12 | from matrix_content_scanner.mcs import MatrixContentScanner 13 | 14 | 15 | class ThumbnailHandler: 16 | def __init__(self, content_scanner: "MatrixContentScanner"): 17 | self._scanner = content_scanner.scanner 18 | 19 | @web_handler 20 | async def handle_thumbnail( 21 | self, request: web.Request 22 | ) -> Tuple[int, _BytesResponse]: 23 | """Handles GET requests to .../thumbnail/serverName/mediaId""" 24 | media_path = request.match_info["media_path"] 25 | 26 | media = await self._scanner.scan_file( 27 | media_path=media_path, 28 | thumbnail_params=request.query, 29 | auth_header=request.headers.get("Authorization"), 30 | ) 31 | 32 | return 200, _BytesResponse( 33 | headers=media.response_headers, 34 | content=media.content, 35 | ) 36 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/utils/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
from enum import Enum


class ErrCode(str, Enum):
    """Machine-readable error codes used in the `reason` field of error responses.

    Codes prefixed with `M_` mirror standard Matrix error codes; codes prefixed
    with `MCS_` are specific to the content scanner.
    """

    # An unknown error happened.
    UNKNOWN = "M_UNKNOWN"
    # One of the following:
    #   - No route was found with the path and method provided in the request.
    #   - The homeserver does not have the requested piece of media.
    NOT_FOUND = "M_NOT_FOUND"
    # The access token is missing from the request.
    MISSING_TOKEN = "M_MISSING_TOKEN"
    # The provided access token is invalid.
    # One of the following:
    #   - the access token was never valid.
    #   - the access token has been logged out.
    #   - the access token has been soft logged out.
    #   - [Added in v1.3] the access token needs to be refreshed.
    UNKNOWN_TOKEN = "M_UNKNOWN_TOKEN"
    # The file failed the scan.
    NOT_CLEAN = "MCS_MEDIA_NOT_CLEAN"
    # The file could not be retrieved from the homeserver.
    # Does NOT cover homeserver responses with M_NOT_FOUND.
    REQUEST_FAILED = "MCS_MEDIA_REQUEST_FAILED"
    # The encrypted file could not be decrypted with the provided metadata.
    FAILED_TO_DECRYPT = "MCS_MEDIA_FAILED_TO_DECRYPT"
    # The request body isn't valid JSON, or is missing a required parameter.
    MALFORMED_JSON = "MCS_MALFORMED_JSON"
    # The Mime type is not in the allowed list of Mime types.
    MIME_TYPE_FORBIDDEN = "MCS_MIME_TYPE_FORBIDDEN"
--------------------------------------------------------------------------------
/src/matrix_content_scanner/utils/encrypted_file_metadata.py:
--------------------------------------------------------------------------------
# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
from jsonschema import ValidationError, validate

from matrix_content_scanner.utils.constants import ErrCode
from matrix_content_scanner.utils.errors import ContentScannerRestError
from matrix_content_scanner.utils.types import JsonDict

# This is a subset of the content of an m.room.message event that includes a file, with
# only the info that we need to locate and decrypt the file.
_encrypted_file_metadata_schema = {
    "type": "object",
    "required": ["file"],
    "properties": {
        "file": {
            "type": "object",
            "required": ["v", "iv", "url", "hashes", "key"],
            "properties": {
                "v": {"const": "v2"},
                "iv": {"type": "string"},
                "url": {"type": "string"},
                "hashes": {
                    "type": "object",
                    "required": ["sha256"],
                    "properties": {
                        "sha256": {"type": "string"},
                    },
                },
                "key": {
                    "type": "object",
                    "required": ["alg", "kty", "k", "key_ops", "ext"],
                    "properties": {
                        "alg": {"const": "A256CTR"},
                        "kty": {"const": "oct"},
                        "k": {"type": "string"},
                        "key_ops": {"type": "array", "items": {"type": "string"}},
                        "ext": {"const": True},
                    },
                },
            },
        },
    },
}


def _validate(body: JsonDict) -> None:
    """Validates the schema using jsonschema, and by checking whether the `key_ops` list
    includes at least `encrypt` and `decrypt`.

    Args:
        body: The body to validate.

    Raises:
        ValidationError if the jsonschema validation failed.
        ValueError if the `key_ops` list doesn't include at least `encrypt` and `decrypt`.
    """
    validate(body, _encrypted_file_metadata_schema)

    # We don't need to worry about triggering a KeyError/TypeError here because all of
    # these keys are marked as required in the schema, so at this point we know they're
    # here.
    key_ops = body["file"]["key"]["key_ops"]
    # We need the key_ops list to at least include "encrypt" and "decrypt", but we can't
    # check this with jsonschema, so we need to do it manually.
    if not set(key_ops).issuperset({"encrypt", "decrypt"}):
        raise ValueError('key_ops must contain at least "encrypt" and "decrypt"')


def validate_encrypted_file_metadata(body: JsonDict) -> None:
    """Validates the schema of the given dictionary, and turns any validation error
    raised into a client error.

    Args:
        body: The body to validate.

    Raises:
        ContentScannerRestError(400) if the validation failed.
    """
    # Run the validation and turn any error coming out of it into a REST error.
    # Chain the original exception (`from e`) so the root cause is preserved in
    # tracebacks instead of being reported as "during handling of the above
    # exception, another exception occurred".
    try:
        _validate(body)
    except ValidationError as e:
        raise ContentScannerRestError(400, ErrCode.MALFORMED_JSON, e.message) from e
    except ValueError as e:
        raise ContentScannerRestError(400, ErrCode.MALFORMED_JSON, str(e)) from e


# ---------------------------------------------------------------------------
# File: src/matrix_content_scanner/utils/errors.py
# ---------------------------------------------------------------------------
# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
from typing import Optional

from matrix_content_scanner.utils.constants import ErrCode


class ContentScannerRestError(Exception):
    """An error that is converted into an error response by the REST resource.

    Args:
        http_status: The HTTP status code of the response.
        reason: The error code to include in the response's JSON body.
        info: A human-readable message describing the error, if any.
    """

    def __init__(self, http_status: int, reason: ErrCode, info: Optional[str]) -> None:
        # Use zero-argument super() so the normal MRO is followed: the previous
        # super(Exception, self) form skipped Exception's slot in the MRO for
        # no benefit and would misbehave under multiple inheritance.
        super().__init__(info)
        self.http_status = http_status
        self.reason = reason
        self.info = info


class FileDirtyError(ContentScannerRestError):
    """An error indicating that the file being scanned is dirty."""

    def __init__(
        self,
        info: Optional[str] = "***VIRUS DETECTED***",
        cacheable: bool = True,
    ) -> None:
        """
        Args:
            info: The info string to serve to the client.
            cacheable: Whether raising this error should be recorded as a scan failure in
                the scanner's result cache.
        """
        super().__init__(
            http_status=403,
            reason=ErrCode.NOT_CLEAN,
            info=info,
        )

        self.cacheable = cacheable


class FileMimeTypeForbiddenError(ContentScannerRestError):
    """An error indicating that the file's MIME type is forbidden."""

    def __init__(self, info: Optional[str]) -> None:
        super().__init__(
            http_status=403,
            reason=ErrCode.MIME_TYPE_FORBIDDEN,
            info=info,
        )


class ConfigError(Exception):
    """An error indicating an issue with the configuration file."""


class WellKnownDiscoveryError(Exception):
    """An error indicating a failure when attempting a .well-known discovery."""


# ---------------------------------------------------------------------------
# File: src/matrix_content_scanner/utils/rust.py
# ---------------------------------------------------------------------------
# Copyright 2024 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.

import os
import sys
from hashlib import blake2b

import matrix_content_scanner
from matrix_content_scanner.mcs_rust import get_rust_file_digest


def check_rust_lib_up_to_date() -> None:
    """For editable installs check if the rust library is outdated and needs to
    be rebuilt.

    Raises:
        Exception: if the digest embedded in the compiled Rust module no longer
            matches the Rust sources on disk.
    """

    if not _dist_is_editable():
        return

    mcs_dir = os.path.dirname(matrix_content_scanner.__file__)
    mcs_root = os.path.abspath(os.path.join(mcs_dir, "../.."))

    # Double check we've not gone into site-packages...
    if os.path.basename(mcs_root) == "site-packages":
        return

    # ... and it looks like the root of a python project. This must be checked
    # relative to mcs_root: the previous check of a bare "pyproject.toml"
    # depended on the current working directory, so the whole verification was
    # silently skipped whenever the process wasn't started from the repo root.
    if not os.path.exists(os.path.join(mcs_root, "pyproject.toml")):
        return

    # Get the hash of all Rust source files
    digest = _hash_rust_files_in_directory(os.path.join(mcs_root, "rust", "src"))

    if digest != get_rust_file_digest():
        raise Exception("Rust module outdated. Please rebuild using `poetry install`")


def _hash_rust_files_in_directory(directory: str) -> str:
    """Get the hash of all files in a directory (recursively)"""

    directory = os.path.abspath(directory)

    # Collect the absolute path of every file under `directory`, walking the
    # tree iteratively.
    paths = []

    pending = [directory]
    while pending:
        current = pending.pop()
        with os.scandir(current) as entries:
            for entry in entries:
                if entry.is_dir():
                    pending.append(entry.path)
                else:
                    paths.append(entry.path)

    # We sort to make sure that we get a consistent and well-defined ordering.
    paths.sort()

    hasher = blake2b()

    # Entries in `paths` are already absolute (built from the absolute
    # `directory`), so they can be opened directly.
    for path in paths:
        with open(path, "rb") as f:
            hasher.update(f.read())

    return hasher.hexdigest()


def _dist_is_editable() -> bool:
    """Is distribution an editable install?"""
    for path_item in sys.path:
        egg_link = os.path.join(path_item, "matrix_content_scanner.pth")
        if os.path.isfile(egg_link):
            return True
    return False


# ---------------------------------------------------------------------------
# File: src/matrix_content_scanner/utils/types.py
# ---------------------------------------------------------------------------
# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
from typing import Any, Dict

import attr
from multidict import CIMultiDictProxy


@attr.s(auto_attribs=True)
class MediaDescription:
    """A description of a media."""

    content_type: str
    content: bytes
    response_headers: CIMultiDictProxy[str]
    cacheable: bool = True


# A JSON object/dictionary.
JsonDict = Dict[str, Any]


# ---------------------------------------------------------------------------
# File: tests/__init__.py
# ---------------------------------------------------------------------------
# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
5 | -------------------------------------------------------------------------------- /tests/scanner/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | -------------------------------------------------------------------------------- /tests/scanner/test_file_downloader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | import json 6 | from typing import Dict, List, Optional, Tuple, Union 7 | from unittest import IsolatedAsyncioTestCase 8 | from unittest.mock import Mock, call 9 | 10 | from multidict import CIMultiDict, CIMultiDictProxy, MultiDictProxy 11 | 12 | from matrix_content_scanner.utils.errors import ( 13 | ContentScannerRestError, 14 | WellKnownDiscoveryError, 15 | ) 16 | from matrix_content_scanner.utils.types import JsonDict 17 | 18 | from tests.testutils import ( 19 | MEDIA_PATH, 20 | SMALL_PNG, 21 | get_base_media_headers, 22 | get_content_scanner, 23 | to_thumbnail_params, 24 | ) 25 | 26 | 27 | class FileDownloaderTestCase(IsolatedAsyncioTestCase): 28 | def setUp(self) -> None: 29 | # Set a fixed base URL so that .well-known discovery doesn't get in the way. 
30 | content_scanner = get_content_scanner( 31 | {"download": {"base_homeserver_url": "http://my-site.com"}} 32 | ) 33 | self.downloader = content_scanner.file_downloader 34 | 35 | self.media_status = 200 36 | self.media_body = SMALL_PNG 37 | self.media_headers = get_base_media_headers() 38 | 39 | async def _get( 40 | url: str, 41 | query: Optional[MultiDictProxy[str]] = None, 42 | auth_header: Optional[str] = None, 43 | ) -> Tuple[int, bytes, CIMultiDictProxy[str]]: 44 | """Mock for the _get method on the file downloader that doesn't serve a 45 | .well-known client file. 46 | """ 47 | if ( 48 | url.endswith( 49 | ( 50 | "/_matrix/media/v3/download/" + MEDIA_PATH, 51 | "/_matrix/media/r0/download/" + MEDIA_PATH, 52 | ) 53 | ) 54 | or "/_matrix/media/v3/thumbnail/" + MEDIA_PATH in url 55 | or "/_matrix/media/r0/thumbnail/" + MEDIA_PATH in url 56 | ): 57 | return self.media_status, self.media_body, self.media_headers 58 | if ( 59 | url.endswith(("/_matrix/client/v1/media/download/" + MEDIA_PATH,)) 60 | or "/_matrix/client/v1/media/thumbnail/" + MEDIA_PATH in url 61 | ): 62 | if auth_header is not None: 63 | return self.media_status, self.media_body, self.media_headers 64 | else: 65 | return 404, b"Not found", CIMultiDictProxy(CIMultiDict()) 66 | elif url.endswith("/.well-known/matrix/client"): 67 | return 404, b"Not found", CIMultiDictProxy(CIMultiDict()) 68 | 69 | raise RuntimeError("Unexpected request on %s" % url) 70 | 71 | # Mock _get so we don't actually try to download files. 72 | self.get_mock = Mock(side_effect=_get) 73 | self.downloader._get = self.get_mock # type: ignore[method-assign] 74 | 75 | async def test_download(self) -> None: 76 | """Tests that downloading a file works.""" 77 | media = await self.downloader.download_file(MEDIA_PATH) 78 | self.assertEqual(media.content, SMALL_PNG) 79 | self.assertEqual(media.content_type, "image/png") 80 | 81 | # Check that we tried downloading from the set base URL. 
82 | args = self.get_mock.call_args.args 83 | self.assertTrue(args[0].startswith("http://my-site.com/")) 84 | 85 | async def test_download_auth_media(self) -> None: 86 | """Tests that downloading a file works using authenticated media.""" 87 | media = await self.downloader.download_file( 88 | MEDIA_PATH, auth_header="Bearer access_token" 89 | ) 90 | self.assertEqual(media.content, SMALL_PNG) 91 | self.assertEqual(media.content_type, "image/png") 92 | 93 | # Check that we tried downloading from the set base URL. 94 | args = self.get_mock.call_args.args 95 | self.assertTrue(args[0].startswith("http://my-site.com/")) 96 | self.assertIn("/_matrix/client/v1/media/download/" + MEDIA_PATH, args[0]) 97 | 98 | async def test_download_auth_media_invalid_token(self) -> None: 99 | """Tests that downloading an authenticated media file with an invalid access 100 | token returns the correct error code. 101 | """ 102 | self.media_status = 401 103 | self.media_body = ( 104 | b'{"errcode":"M_UNKNOWN_TOKEN","error":"Invalid access token"}' 105 | ) 106 | self._set_headers({"content-type": ["application/json"]}) 107 | 108 | # Check that we fail at downloading the file. 109 | with self.assertRaises(ContentScannerRestError) as cm: 110 | await self.downloader.download_file( 111 | MEDIA_PATH, auth_header="Bearer access_token" 112 | ) 113 | 114 | self.assertEqual(cm.exception.http_status, 401) 115 | self.assertEqual(cm.exception.reason, "M_UNKNOWN_TOKEN") 116 | 117 | # Check that we tried downloading from the set base URL. 118 | args = self.get_mock.call_args.args 119 | self.assertTrue(args[0].startswith("http://my-site.com/")) 120 | self.assertIn("/_matrix/client/v1/media/download/" + MEDIA_PATH, args[0]) 121 | 122 | async def test_download_auth_media_missing_token(self) -> None: 123 | """Tests that downloading an authenticated media file with a missing access 124 | token returns the correct error code. 
125 | """ 126 | self.media_status = 401 127 | self.media_body = ( 128 | b'{"errcode":"M_MISSING_TOKEN","error":"Missing access token"}' 129 | ) 130 | self._set_headers({"content-type": ["application/json"]}) 131 | 132 | # Check that we fail at downloading the file. 133 | with self.assertRaises(ContentScannerRestError) as cm: 134 | await self.downloader.download_file( 135 | MEDIA_PATH, auth_header="Bearer access_token" 136 | ) 137 | 138 | self.assertEqual(cm.exception.http_status, 401) 139 | self.assertEqual(cm.exception.reason, "M_MISSING_TOKEN") 140 | 141 | # Check that we tried downloading from the set base URL. 142 | args = self.get_mock.call_args.args 143 | self.assertTrue(args[0].startswith("http://my-site.com/")) 144 | self.assertIn("/_matrix/client/v1/media/download/" + MEDIA_PATH, args[0]) 145 | 146 | async def test_no_base_url(self) -> None: 147 | """Tests that configuring a base homeserver URL means files are downloaded from 148 | that homeserver (rather than the one the files were uploaded to) and .well-known 149 | discovery is bypassed. 150 | """ 151 | self.downloader._base_url = None 152 | await self.downloader.download_file(MEDIA_PATH) 153 | 154 | # Check that we've tried making a .well-known discovery request before 155 | # downloading the file. 156 | self.assertEqual(self.get_mock.call_count, 2) 157 | self.assertEqual( 158 | self.get_mock.mock_calls[0], call("https://foo/.well-known/matrix/client") 159 | ) 160 | self.assertEqual( 161 | self.get_mock.mock_calls[1], 162 | call( 163 | "https://foo/_matrix/media/v3/download/" + MEDIA_PATH, 164 | query=None, 165 | auth_header=None, 166 | ), 167 | ) 168 | 169 | async def test_retry_on_404(self) -> None: 170 | """Tests that if we get a 404 when trying to download a file on a v3 path, we 171 | retry with an r0 path for backwards compatibility. 
172 | """ 173 | self.media_status = 404 174 | self.media_body = b"Not found" 175 | self._set_headers({"content-type": ["text/plain"]}) 176 | 177 | await self._test_retry() 178 | 179 | async def test_retry_on_unrecognised(self) -> None: 180 | """Tests that if we get a Synapse-style M_UNRECOGNIZED response when trying to 181 | download a file on a v3 path, we retry with an r0 path for backwards 182 | compatibility. 183 | """ 184 | self.media_status = 400 185 | self.media_body = b'{"errcode":"M_UNRECOGNIZED","error":"Unrecognized request"}' 186 | self._set_headers({"content-type": ["application/json"]}) 187 | 188 | await self._test_retry() 189 | 190 | async def _test_retry(self) -> None: 191 | """Tests that in a set specific case a failure to download a file from a v3 192 | download path means we retry the request on an r0 one for backwards compatibility. 193 | """ 194 | # Check that we eventually fail at downloading the file. 195 | with self.assertRaises(ContentScannerRestError) as cm: 196 | await self.downloader.download_file(MEDIA_PATH) 197 | 198 | self.assertEqual(cm.exception.http_status, 404) 199 | self.assertEqual(cm.exception.info, "File not found") 200 | 201 | # Check that we sent out two requests: one to the v3 path and one to the r0 path. 202 | self.assertEqual(self.get_mock.call_count, 2) 203 | self.assertEqual( 204 | self.get_mock.mock_calls[0], 205 | call( 206 | "http://my-site.com/_matrix/media/v3/download/" + MEDIA_PATH, 207 | query=None, 208 | auth_header=None, 209 | ), 210 | ) 211 | self.assertEqual( 212 | self.get_mock.mock_calls[1], 213 | call( 214 | "http://my-site.com/_matrix/media/r0/download/" + MEDIA_PATH, 215 | query=None, 216 | auth_header=None, 217 | ), 218 | ) 219 | 220 | async def test_no_retry(self) -> None: 221 | """Tests that in a set specific case a failure to download a file from a v1 222 | authenticated media download path means we don't retry the request. 
223 | """ 224 | self.media_status = 400 225 | self.media_body = b'{"errcode":"M_UNRECOGNIZED","error":"Unrecognized request"}' 226 | self._set_headers({"content-type": ["application/json"]}) 227 | 228 | # Check that we eventually fail at downloading the file. 229 | with self.assertRaises(ContentScannerRestError) as cm: 230 | await self.downloader.download_file( 231 | MEDIA_PATH, auth_header="Bearer access_token" 232 | ) 233 | 234 | self.assertEqual(cm.exception.http_status, 404) 235 | self.assertEqual(cm.exception.info, "File not found") 236 | 237 | # Check that we sent out only one request. 238 | self.assertEqual(self.get_mock.call_count, 1) 239 | self.assertEqual( 240 | self.get_mock.mock_calls[0], 241 | call( 242 | "http://my-site.com/_matrix/client/v1/media/download/" + MEDIA_PATH, 243 | query=None, 244 | auth_header="Bearer access_token", 245 | ), 246 | ) 247 | 248 | async def test_thumbnail(self) -> None: 249 | """Tests that we can download a thumbnail and that the parameters to generate the 250 | thumbnail are correctly passed on to the homeserver. 251 | """ 252 | await self.downloader.download_file( 253 | MEDIA_PATH, to_thumbnail_params({"height": "50"}) 254 | ) 255 | 256 | url: str = self.get_mock.call_args.args[0] 257 | query: CIMultiDictProxy[str] = self.get_mock.call_args.kwargs["query"] 258 | self.assertIn("/thumbnail/", url) 259 | self.assertIn("height", query) 260 | self.assertEqual(query.get("height"), "50", query.getall("height")) 261 | 262 | async def test_thumbnail_auth_media(self) -> None: 263 | """Tests that we can download a thumbnail and that the parameters to generate the 264 | thumbnail are correctly passed on to the homeserver using authenticated media. 
265 | """ 266 | await self.downloader.download_file( 267 | MEDIA_PATH, to_thumbnail_params({"height": "50"}), "Bearer access_token" 268 | ) 269 | 270 | url: str = self.get_mock.call_args.args[0] 271 | query: CIMultiDictProxy[str] = self.get_mock.call_args.kwargs["query"] 272 | self.assertIn("/thumbnail/", url) 273 | self.assertIn("/_matrix/client/v1/media/thumbnail/" + MEDIA_PATH, url) 274 | self.assertIn("height", query) 275 | self.assertEqual(query.get("height"), "50", query.getall("height")) 276 | 277 | async def test_multiple_content_type(self) -> None: 278 | """Tests that we raise an error if the homeserver responds with too many 279 | Content-Type headers. 280 | """ 281 | self._set_headers({"content-type": ["image/jpeg", "image/png"]}) 282 | 283 | with self.assertRaises(ContentScannerRestError) as cm: 284 | await self.downloader.download_file(MEDIA_PATH) 285 | 286 | self.assertEqual(cm.exception.http_status, 502) 287 | assert cm.exception.info is not None 288 | self.assertTrue("Content-Type" in cm.exception.info) 289 | 290 | async def test_no_content_type(self) -> None: 291 | """Tests that we raise an error if the homeserver responds with no Content-Type 292 | headers. 293 | """ 294 | self._set_headers({}) 295 | 296 | with self.assertRaises(ContentScannerRestError) as cm: 297 | await self.downloader.download_file(MEDIA_PATH) 298 | 299 | self.assertEqual(cm.exception.http_status, 502) 300 | assert cm.exception.info is not None 301 | self.assertTrue("Content-Type" in cm.exception.info) 302 | 303 | def _set_headers(self, headers: Dict[str, List[str]]) -> None: 304 | """Replace the headers set in setUp with ones constructed from the provided 305 | dictionary. 306 | 307 | Args: 308 | headers: The raw headers to set. 
309 | """ 310 | md: CIMultiDict[str] = CIMultiDict() 311 | for k, v in headers.items(): 312 | for el in v: 313 | md.add(k, el) 314 | 315 | self.media_headers = CIMultiDictProxy(md) 316 | 317 | 318 | class WellKnownDiscoveryTestCase(IsolatedAsyncioTestCase): 319 | def setUp(self) -> None: 320 | self.downloader = get_content_scanner().file_downloader 321 | 322 | self.well_known_status = 200 323 | self.well_known_body: Union[bytes, JsonDict] = b"" 324 | 325 | self.versions_status = 200 326 | 327 | async def _get( 328 | url: str, 329 | query: Optional[MultiDictProxy[str]] = None, 330 | auth_header: Optional[str] = None, 331 | ) -> Tuple[int, bytes, CIMultiDictProxy[str]]: 332 | """Mock for the _get method on the file downloader that serves a .well-known 333 | client file. 334 | """ 335 | if url.endswith("/.well-known/matrix/client"): 336 | if isinstance(self.well_known_body, bytes): 337 | body_bytes = self.well_known_body 338 | else: 339 | body_bytes = json.dumps(self.well_known_body).encode("utf-8") 340 | 341 | return ( 342 | self.well_known_status, 343 | body_bytes, 344 | CIMultiDictProxy(CIMultiDict()), 345 | ) 346 | elif url.endswith("/_matrix/client/versions"): 347 | return self.versions_status, b"{}", CIMultiDictProxy(CIMultiDict()) 348 | elif url.endswith("/_matrix/media/v3/download/" + MEDIA_PATH): 349 | return 200, SMALL_PNG, get_base_media_headers() 350 | 351 | raise RuntimeError("Unexpected request on %s" % url) 352 | 353 | # Mock _get so we don't actually try to download files. 354 | self.get_mock = Mock(side_effect=_get) 355 | self.downloader._get = self.get_mock # type: ignore[method-assign] 356 | 357 | async def test_discover(self) -> None: 358 | """Checks that the base URL to use to download files can be discovered via 359 | .well-known discovery. 
360 | """ 361 | self.well_known_body = {"m.homeserver": {"base_url": "https://foo.bar"}} 362 | 363 | await self.downloader.download_file(MEDIA_PATH) 364 | 365 | # Check that we got 3 calls: 366 | # * one to retrieve the .well-known file 367 | # * one to check that the base URL can be used to interact with a homeserver 368 | # (by hitting the /_matrix/client/versions endpoint) 369 | # * one to download the file 370 | self.assertEqual(self.get_mock.call_count, 3, self.get_mock.mock_calls) 371 | 372 | calls = self.get_mock.mock_calls 373 | 374 | self.assertEqual(calls[0], call("https://foo/.well-known/matrix/client")) 375 | self.assertTrue(calls[1], call("https://foo.bar/_matrix/client/versions")) 376 | self.assertTrue( 377 | calls[2], call("https://foo.bar/_matrix/media/v3/download/" + MEDIA_PATH) 378 | ) 379 | 380 | async def test_error_status(self) -> None: 381 | """Tests that we raise a WellKnownDiscoveryError if the server responded with an 382 | error.""" 383 | self.well_known_status = 401 384 | await self._assert_discovery_fail() 385 | 386 | async def test_malformed_content(self) -> None: 387 | """Tests that we raise a WellKnownDiscoveryError if the server responded with a 388 | body that isn't compliant with the Matrix specification.""" 389 | self.well_known_body = {"m.homeserver": "https://foo.bar"} 390 | await self._assert_discovery_fail() 391 | 392 | async def test_not_valid_homeserver(self) -> None: 393 | """Tests that we raise a WellKnownDiscoveryError if the server at the provided 394 | base URL isn't a Matrix homeserver.""" 395 | self.versions_status = 404 396 | await self._assert_discovery_fail() 397 | 398 | async def test_404_no_fail(self) -> None: 399 | """Tests that we don't raise a WellKnownDiscoveryError if the .well-known file 400 | couldn't be found, and that we return None instead of the discovered base URL in 401 | this case. 
402 | """ 403 | self.well_known_status = 404 404 | res = await self.downloader._discover_via_well_known("foo") 405 | self.assertIsNone(res) 406 | 407 | async def _assert_discovery_fail(self) -> None: 408 | """Checks that .well-known discovery fails and raises a WellKnownDiscoveryError.""" 409 | with self.assertRaises(WellKnownDiscoveryError): 410 | await self.downloader._discover_via_well_known("foo") 411 | -------------------------------------------------------------------------------- /tests/scanner/test_scanner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | import asyncio 6 | import copy 7 | from typing import Any, Dict, List, Optional 8 | from unittest import IsolatedAsyncioTestCase 9 | from unittest.mock import AsyncMock, Mock 10 | 11 | from multidict import CIMultiDict, CIMultiDictProxy 12 | 13 | from matrix_content_scanner.scanner.scanner import CacheEntry 14 | from matrix_content_scanner.utils.constants import ErrCode 15 | from matrix_content_scanner.utils.errors import ( 16 | ContentScannerRestError, 17 | FileDirtyError, 18 | FileMimeTypeForbiddenError, 19 | ) 20 | from matrix_content_scanner.utils.types import MediaDescription 21 | 22 | from tests.testutils import ( 23 | ENCRYPTED_FILE_METADATA, 24 | MEDIA_PATH, 25 | SMALL_BINARY_FILE, 26 | SMALL_PNG, 27 | SMALL_PNG_ENCRYPTED, 28 | SMALL_TEXT_FILE, 29 | get_content_scanner, 30 | to_thumbnail_params, 31 | ) 32 | 33 | 34 | class ScannerTestCase(IsolatedAsyncioTestCase): 35 | def setUp(self) -> None: 36 | self.downloader_res = MediaDescription( 37 | content_type="image/png", 38 | content=SMALL_PNG, 39 | response_headers=CIMultiDictProxy(CIMultiDict()), 40 | ) 41 | 42 | async def download_file( 43 | media_path: str, 44 | thumbnail_params: Optional[Dict[str, List[str]]] = None, 45 
| auth_header: Optional[str] = None, 46 | ) -> MediaDescription: 47 | """Mock for the file downloader's `download_file` method.""" 48 | return self.downloader_res 49 | 50 | self.downloader_mock = Mock(side_effect=download_file) 51 | 52 | # Mock download_file so we don't actually try to download files. 53 | mcs = get_content_scanner() 54 | mcs.file_downloader.download_file = self.downloader_mock # type: ignore[method-assign] 55 | self.scanner = mcs.scanner 56 | 57 | async def test_scan(self) -> None: 58 | """Tests that we can scan files and that the scanner returns the media scanned if 59 | the scan was successful. 60 | """ 61 | media = await self.scanner.scan_file(MEDIA_PATH) 62 | self.assertEqual(media.content, SMALL_PNG) 63 | 64 | async def test_scan_dirty(self) -> None: 65 | """Tests that the scanner raises a FileDirtyError if the scan fails.""" 66 | self.scanner._script = "false" 67 | with self.assertRaises(FileDirtyError): 68 | await self.scanner.scan_file(MEDIA_PATH) 69 | 70 | async def test_encrypted_file(self) -> None: 71 | """Tests that the scanner can decrypt and scan encrypted files, and that if the 72 | scan is successful it returns the encrypted file and not the decrypted version. 73 | """ 74 | self._setup_encrypted() 75 | 76 | media = await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA) 77 | self.assertEqual(media.content, SMALL_PNG_ENCRYPTED) 78 | 79 | async def test_cache(self) -> None: 80 | """Tests that scan results are cached.""" 81 | # Scan the file a first time, and check that the downloader has been called. 82 | await self.scanner.scan_file(MEDIA_PATH) 83 | self.assertEqual(self.downloader_mock.call_count, 1) 84 | 85 | # Scan the file a second time, and check that the downloader has not been called 86 | # this time. 
87 | media = await self.scanner.scan_file(MEDIA_PATH) 88 | self.assertEqual(self.downloader_mock.call_count, 1) 89 | self.assertEqual(media.content, SMALL_PNG) 90 | 91 | async def test_cache_encrypted(self) -> None: 92 | """Tests that scan results for encrypted files are cached, and that the cached 93 | file is the encrypted version, not the decrypted one.""" 94 | self._setup_encrypted() 95 | 96 | # Scan the file a first time, and check that the downloader has been called. 97 | await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA) 98 | self.assertEqual(self.downloader_mock.call_count, 1) 99 | 100 | # Scan the file a second time, and check that the downloader has not been called 101 | # this time, and that the media returned is the encrypted copy. 102 | media = await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA) 103 | self.assertEqual(self.downloader_mock.call_count, 1) 104 | self.assertEqual(media.content, SMALL_PNG_ENCRYPTED) 105 | 106 | async def test_cache_download_thumbnail(self) -> None: 107 | """Tests that cached results for full file downloads are not used for thumbnails.""" 108 | await self.scanner.scan_file(MEDIA_PATH) 109 | self.assertEqual(self.downloader_mock.call_count, 1) 110 | 111 | await self.scanner.scan_file( 112 | MEDIA_PATH, thumbnail_params=to_thumbnail_params({"width": "50"}) 113 | ) 114 | self.assertEqual(self.downloader_mock.call_count, 2) 115 | 116 | async def test_cache_thumbnail_params(self) -> None: 117 | """Tests that cached results for thumbnails are only used if the generation 118 | parameters are the same. 119 | """ 120 | # Scan a thumbnail and check that the downloader was called. 
121 | await self.scanner.scan_file( 122 | MEDIA_PATH, thumbnail_params=to_thumbnail_params({"width": "50"}) 123 | ) 124 | self.assertEqual(self.downloader_mock.call_count, 1) 125 | 126 | # Scan the thumbnail again and check that the cache result was used (since the 127 | # downloader was not called) 128 | await self.scanner.scan_file( 129 | MEDIA_PATH, thumbnail_params=to_thumbnail_params({"width": "50"}) 130 | ) 131 | self.assertEqual(self.downloader_mock.call_count, 1) 132 | 133 | # Scan a different thumbnail of the same media (with different parameters) and 134 | # check that the downloader was called. 135 | await self.scanner.scan_file( 136 | MEDIA_PATH, thumbnail_params=to_thumbnail_params({"height": "50"}) 137 | ) 138 | self.assertEqual(self.downloader_mock.call_count, 2) 139 | 140 | async def test_cache_max_size(self) -> None: 141 | """Tests that we don't cache files if they exceed the configured maximum file 142 | size. 143 | """ 144 | # Set the maximum file size to be just under the size of the file. 145 | self.scanner._max_size_to_cache = len(SMALL_PNG) - 1 146 | 147 | # Scan the file a first time, and check that the downloader has been called. 148 | await self.scanner.scan_file(MEDIA_PATH) 149 | self.assertEqual(self.downloader_mock.call_count, 1) 150 | 151 | # Scan the file a second time, and check that the downloader has been called 152 | # again. 153 | media = await self.scanner.scan_file(MEDIA_PATH) 154 | self.assertEqual(self.downloader_mock.call_count, 2) 155 | self.assertEqual(media.content, SMALL_PNG) 156 | 157 | async def test_cache_max_size_mismatching_hash(self) -> None: 158 | """Tests that we re-scan big files if the hash we have cached for them does not 159 | match the hash of the newly downloaded content. 160 | """ 161 | # Mock the _run_scan command so we can keep track of its call count. 
162 | mock_runner = AsyncMock(return_value=0) 163 | self.scanner._run_scan = mock_runner # type: ignore[method-assign] 164 | 165 | # Calculate the cache key for this file so we can look it up later. 166 | cache_key = self.scanner._get_cache_key_for_file(MEDIA_PATH, None, None) 167 | 168 | # Set the maximum file size to be just under the size of the file. 169 | self.scanner._max_size_to_cache = len(SMALL_PNG) - 1 170 | 171 | # Make sure the cache is empty. 172 | self.assertEqual(len(self.scanner._result_cache), 0) 173 | 174 | # Scan the file a first time, and check that the file has been scanned. 175 | await self.scanner.scan_file(MEDIA_PATH) 176 | self.assertEqual(self.downloader_mock.call_count, 1) 177 | mock_runner.assert_called_once() 178 | 179 | # Test that the file has been cached. 180 | self.assertIn(cache_key, self.scanner._result_cache) 181 | 182 | # Change the hash of the cache entry to force it to be scanned again. 183 | entry: CacheEntry = self.scanner._result_cache[cache_key] 184 | self.scanner._result_cache[cache_key] = CacheEntry( 185 | result=entry.result, 186 | media=entry.media, 187 | media_hash="BAD_HASH", 188 | info=entry.info, 189 | ) 190 | 191 | # Run the scanner again and check that the cache entry for the file has been 192 | # discarded (i.e. the scan is run again). 193 | await self.scanner.scan_file(MEDIA_PATH) 194 | self.assertEqual(mock_runner.call_count, 2) 195 | 196 | # Also check that the file has only been re-downloaded once. 197 | self.assertEqual(self.downloader_mock.call_count, 2) 198 | 199 | async def test_different_encryption_key(self) -> None: 200 | """Tests that if some of the file's metadata changed, we don't match against the 201 | cache and we download the file again. 202 | 203 | Also tests that the scanner fails in the correct way if it can't decrypt a file. 204 | """ 205 | self._setup_encrypted() 206 | 207 | # Scan the file and check that the downloader was called. 
208 | await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA) 209 | self.assertEqual(self.downloader_mock.call_count, 1) 210 | 211 | # Copy the file metadata and change the key. 212 | modified_metadata = copy.deepcopy(ENCRYPTED_FILE_METADATA) 213 | modified_metadata["file"]["key"]["k"] = "somethingelse" 214 | 215 | # This causes the scanner to not be able to decrypt the file. 216 | with self.assertRaises(ContentScannerRestError) as cm: 217 | await self.scanner.scan_file(MEDIA_PATH, modified_metadata) 218 | 219 | self.assertEqual(cm.exception.http_status, 400) 220 | self.assertEqual(cm.exception.reason, ErrCode.FAILED_TO_DECRYPT) 221 | 222 | # But it also causes it to be downloaded again because its metadata have changed. 223 | self.assertEqual(self.downloader_mock.call_count, 2) 224 | 225 | async def test_allowlist_mimetype(self) -> None: 226 | """Tests that, if there's an allow list for MIME types and the file's MIME type 227 | isn't in it, the file's scan fails. 228 | """ 229 | # Set an allow list that only allows JPEG files. 230 | self.scanner._allowed_mimetypes = ["image/jpeg"] 231 | 232 | # Check that the scan fails since the file is a PNG. 233 | with self.assertRaises(FileMimeTypeForbiddenError): 234 | await self.scanner.scan_file(MEDIA_PATH) 235 | 236 | async def test_allowlist_mimetype_encrypted(self) -> None: 237 | """Tests that the file's MIME type is correctly detected and compared with the 238 | allow list (if set), even if it's encrypted. 239 | """ 240 | self._setup_encrypted() 241 | 242 | # Set an allow list that only allows JPEG files. 243 | self.scanner._allowed_mimetypes = ["image/jpeg"] 244 | 245 | # Check that the scan fails since the file is a PNG. 
246 |         with self.assertRaises(FileMimeTypeForbiddenError):
247 |             await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA)
248 | 
249 |     async def test_blocklist_mimetype(self) -> None:
250 |         """Tests that, if there's a block list for MIME types and the file's MIME type
251 |         is in it, the file's scan fails.
252 |         """
253 |         # Set a block list that blocks PNG images.
254 |         self.scanner._blocked_mimetypes = ["image/png"]
255 | 
256 |         # Check that the scan fails since the file is a PNG.
257 |         with self.assertRaises(FileMimeTypeForbiddenError):
258 |             await self.scanner.scan_file(MEDIA_PATH)
259 | 
260 |     async def test_blocklist_mimetype_encrypted(self) -> None:
261 |         """Tests that the file's MIME type is correctly detected and compared with the
262 |         block list (if set), even if it's encrypted.
263 |         """
264 |         self._setup_encrypted()
265 | 
266 |         # Set a block list that blocks PNG images.
267 |         self.scanner._blocked_mimetypes = ["image/png"]
268 | 
269 |         # Check that the scan fails since the file is a PNG.
270 |         with self.assertRaises(FileMimeTypeForbiddenError):
271 |             await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA)
272 | 
273 |     async def test_blocklist_mimetype_fallback_binary_file(self) -> None:
274 |         """Tests that unrecognised binary files' MIME type is assumed to be
275 |         `application/octet-stream` and that they can be blocked in this way.
276 |         """
277 | 
278 |         self.downloader_res = MediaDescription(
279 |             # This is the *claimed* content-type by the uploader
280 |             content_type="application/vnd.io.element.generic_binary_file",
281 |             content=SMALL_BINARY_FILE,
282 |             response_headers=CIMultiDictProxy(CIMultiDict()),
283 |         )
284 | 
285 |         # Set a block list that blocks uncategorised binary files.
286 | self.scanner._blocked_mimetypes = ["application/octet-stream"] 287 | 288 | with self.assertRaises(FileMimeTypeForbiddenError): 289 | await self.scanner.scan_file(MEDIA_PATH) 290 | 291 | async def test_blocklist_mimetype_fallback_text_file(self) -> None: 292 | """Tests that unrecognised text files' MIME type is assumed to be 293 | `text/plain` and that they can be blocked in this way. 294 | """ 295 | 296 | self.downloader_res = MediaDescription( 297 | # This is the *claimed* content-type by the uploader 298 | content_type="application/vnd.io.element.generic_file", 299 | content=SMALL_TEXT_FILE, 300 | response_headers=CIMultiDictProxy(CIMultiDict()), 301 | ) 302 | 303 | # Set a block list that blocks uncategorised text files. 304 | self.scanner._blocked_mimetypes = ["text/plain"] 305 | 306 | with self.assertRaises(FileMimeTypeForbiddenError): 307 | await self.scanner.scan_file(MEDIA_PATH) 308 | 309 | async def test_dont_cache_exit_codes(self) -> None: 310 | """Tests that if the configuration specifies exit codes to ignore when running 311 | the scanning script, we don't cache them. 312 | """ 313 | self.scanner._exit_codes_to_ignore = [5] 314 | 315 | # It's tricky to give a value to `scanner._script` that makes `_run_scan` return 5 316 | # directly, so we just mock it here. 317 | run_scan_mock = AsyncMock(return_value=5) 318 | self.scanner._run_scan = run_scan_mock # type: ignore[method-assign] 319 | 320 | # Scan the file, we'll check later that it wasn't cached. 321 | with self.assertRaises(FileDirtyError): 322 | await self.scanner.scan_file(MEDIA_PATH) 323 | 324 | self.assertEqual(self.downloader_mock.call_count, 1) 325 | 326 | # Update the mock so that the file is cached at the next scan. 327 | run_scan_mock.return_value = 1 328 | 329 | # Scan the file again to check that the file wasn't cached. 
330 | with self.assertRaises(FileDirtyError): 331 | await self.scanner.scan_file(MEDIA_PATH) 332 | 333 | self.assertEqual(self.downloader_mock.call_count, 2) 334 | 335 | # The file should be cached now. 336 | with self.assertRaises(FileDirtyError): 337 | await self.scanner.scan_file(MEDIA_PATH) 338 | 339 | self.assertEqual(self.downloader_mock.call_count, 2) 340 | 341 | async def test_outside_temp_dir(self) -> None: 342 | """Tests that a scan is failed if the media path is formed in a way that would 343 | cause the scanner to write outside of the configured directory. 344 | """ 345 | with self.assertRaises(FileDirtyError): 346 | await self.scanner.scan_file("../bar") 347 | 348 | async def test_invalid_media_path(self) -> None: 349 | """Tests that a scan fails if the media path is invalid.""" 350 | with self.assertRaises(FileDirtyError): 351 | await self.scanner.scan_file(MEDIA_PATH + "/baz") 352 | 353 | async def test_deduplicate_scans(self) -> None: 354 | """Tests that if two scan requests come in for the same file and with the same 355 | parameter, only one download/scan happens. 356 | """ 357 | 358 | # Change the Mock's side effect to introduce some delay, to simulate a long 359 | # download time. We sleep asynchronously to allow additional scans requests to be 360 | # processed. 361 | async def _scan_file(*args: Any) -> MediaDescription: 362 | await asyncio.sleep(0.2) 363 | 364 | return self.downloader_res 365 | 366 | scan_mock = Mock(side_effect=_scan_file) 367 | self.scanner._scan_file = scan_mock # type: ignore[method-assign] 368 | 369 | # Request two scans of the same file at the same time. 370 | results = await asyncio.gather( 371 | asyncio.create_task(self.scanner.scan_file(MEDIA_PATH)), 372 | asyncio.create_task(self.scanner.scan_file(MEDIA_PATH)), 373 | ) 374 | 375 | # Check that the scanner has only been called once, meaning that the second 376 | # call did not trigger a scan. 
377 | scan_mock.assert_called_once() 378 | 379 | # Check that we got two results, and that we actually got the correct media 380 | # description in the second scan. 381 | self.assertEqual(len(results), 2, results) 382 | self.assertEqual(results[0].content, results[1].content, results) 383 | 384 | def _setup_encrypted(self) -> None: 385 | """Sets up class properties to make the downloader return an encrypted file 386 | instead of a plain text one. 387 | """ 388 | self.downloader_res.content_type = "application/octet-stream" 389 | self.downloader_res.content = SMALL_PNG_ENCRYPTED 390 | -------------------------------------------------------------------------------- /tests/servlets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | -------------------------------------------------------------------------------- /tests/servlets/test_scan.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | from http import HTTPStatus 6 | from unittest.mock import patch 7 | 8 | from aiohttp.test_utils import AioHTTPTestCase 9 | from aiohttp.web_app import Application 10 | from multidict import CIMultiDict 11 | 12 | from matrix_content_scanner.httpserver import HTTPServer 13 | from matrix_content_scanner.utils.constants import ErrCode 14 | from matrix_content_scanner.utils.errors import ContentScannerRestError 15 | 16 | from tests.testutils import get_content_scanner 17 | 18 | SERVER_NAME = "test" 19 | 20 | 21 | class TestScanHandler(AioHTTPTestCase): 22 | def setUp(self) -> None: 23 | # Bypass well-known lookups. 
24 | self.scanner = get_content_scanner( 25 | {"download": {"base_homeserver_url": "http://my-site.com"}} 26 | ) 27 | 28 | async def get_application(self) -> Application: 29 | return HTTPServer(self.scanner)._app 30 | 31 | async def test_media_not_found_on_remote_homeserver(self) -> None: 32 | """Missing media on the remote HS should be presented as a 404 to the client.""" 33 | patch_downloader = patch.object( 34 | self.scanner.file_downloader, 35 | "_get", 36 | return_value=(HTTPStatus.NOT_FOUND, b"", CIMultiDict()), 37 | ) 38 | 39 | with patch_downloader: 40 | async with self.client.get( 41 | f"/_matrix/media_proxy/unstable/download/{SERVER_NAME}/media-does-not-exist" 42 | ) as resp: 43 | self.assertEqual(resp.status, 404) 44 | body = await resp.json() 45 | self.assertEqual(body["reason"], "M_NOT_FOUND", body) 46 | 47 | async def test_remote_homeserver_unreachable(self) -> None: 48 | """An unreachable HS should be presented as a 502 to the client.""" 49 | patch_downloader = patch.object( 50 | self.scanner.file_downloader, 51 | "_get", 52 | side_effect=ContentScannerRestError( 53 | HTTPStatus.BAD_GATEWAY, 54 | ErrCode.REQUEST_FAILED, 55 | "dodgy network timeout :(((", 56 | ), 57 | ) 58 | 59 | with patch_downloader: 60 | async with self.client.get( 61 | f"/_matrix/media_proxy/unstable/download/{SERVER_NAME}/media-does-not-exist" 62 | ) as resp: 63 | self.assertEqual(resp.status, 502) 64 | body = await resp.json() 65 | self.assertEqual(body["reason"], "MCS_MEDIA_REQUEST_FAILED", body) 66 | -------------------------------------------------------------------------------- /tests/servlets/test_servlets.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
5 | import json 6 | import unittest 7 | 8 | from matrix_content_scanner.servlets import _metadata_from_body 9 | from matrix_content_scanner.utils.constants import ErrCode 10 | from matrix_content_scanner.utils.errors import ContentScannerRestError 11 | from matrix_content_scanner.utils.types import JsonDict 12 | 13 | from tests.testutils import ENCRYPTED_FILE_METADATA, get_content_scanner 14 | 15 | 16 | class EncryptedFileMetadataTestCase(unittest.TestCase): 17 | def setUp(self) -> None: 18 | self.crypto_handler = get_content_scanner().crypto_handler 19 | 20 | def test_unencrypted(self) -> None: 21 | """Tests that the _metadata_from_body function correctly returns non-encrypted 22 | metadata. 23 | """ 24 | metadata = _metadata_from_body(ENCRYPTED_FILE_METADATA, self.crypto_handler) 25 | self.assertEqual(metadata, ENCRYPTED_FILE_METADATA) 26 | 27 | def test_encrypted(self) -> None: 28 | """Tests that the _metadata_from_body function correctly decrypts Olm-encrypted 29 | metadata and returns a decrypted version. 30 | """ 31 | encrypted_body = self._encrypt_body(ENCRYPTED_FILE_METADATA) 32 | metadata = _metadata_from_body(encrypted_body, self.crypto_handler) 33 | self.assertEqual(metadata, ENCRYPTED_FILE_METADATA) 34 | 35 | def test_bad_json(self) -> None: 36 | """Tests that the _metadata_from_body function raises a REST error if the request 37 | body is not a valid JSON object. 38 | """ 39 | with self.assertRaises(ContentScannerRestError) as cm: 40 | _metadata_from_body("foo", self.crypto_handler) # type: ignore[arg-type] 41 | 42 | self.assertEqual(cm.exception.reason, ErrCode.MALFORMED_JSON) 43 | 44 | def _encrypt_body(self, content: JsonDict) -> JsonDict: 45 | """Encrypts the provided dictionary with Olm's PkEncryption class. 46 | 47 | Args: 48 | content: The dictionary to encrypt. 49 | 50 | Returns: 51 | An encrypted version of the dictionary in the format that's expected in POST 52 | requests. 
53 | """ 54 | msg = self.crypto_handler.encrypt( 55 | self.crypto_handler.public_key, json.dumps(content) 56 | ) 57 | 58 | return { 59 | "encrypted_body": { 60 | "ciphertext": msg.ciphertext, 61 | "mac": msg.mac, 62 | "ephemeral": msg.ephemeral_key, 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tests/test_crypto.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | import json 6 | import unittest 7 | 8 | from tests.testutils import get_content_scanner 9 | 10 | 11 | class CryptoHandlerTestCase(unittest.TestCase): 12 | def setUp(self) -> None: 13 | self.crypto_handler = get_content_scanner().crypto_handler 14 | 15 | def test_decrypt(self) -> None: 16 | """Tests that an Olm-encrypted payload is successfully decrypted.""" 17 | payload = {"foo": "bar"} 18 | 19 | # Encrypt the payload with PkEncryption. 20 | encrypted = self.crypto_handler.encrypt( 21 | self.crypto_handler.public_key, json.dumps(payload) 22 | ) 23 | 24 | # Decrypt the payload with the crypto handler. 25 | decrypted = json.loads( 26 | get_content_scanner().crypto_handler.decrypt_body( 27 | encrypted.ciphertext, 28 | encrypted.mac, 29 | encrypted.ephemeral_key, 30 | ) 31 | ) 32 | 33 | # Check that the decrypted payload is the same as the original one before 34 | # encryption. 35 | self.assertEqual(decrypted, payload) 36 | -------------------------------------------------------------------------------- /tests/testutils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
5 | import os 6 | from binascii import unhexlify 7 | from typing import Dict, Optional 8 | 9 | from multidict import CIMultiDict, CIMultiDictProxy, MultiDict, MultiDictProxy 10 | 11 | from matrix_content_scanner.config import MatrixContentScannerConfig 12 | from matrix_content_scanner.mcs import MatrixContentScanner 13 | from matrix_content_scanner.utils.types import JsonDict 14 | 15 | # The media path to use in tests. 16 | MEDIA_PATH = "foo/bar" 17 | 18 | # A small, unencrypted PNG. 19 | SMALL_PNG = unhexlify( 20 | b"89504e470d0a1a0a0000000d4948445200000001000000010806" 21 | b"0000001f15c4890000000a49444154789c63000100000500010d" 22 | b"0a2db40000000049454e44ae426082" 23 | ) 24 | 25 | # A small binary file without any specific format. 26 | SMALL_BINARY_FILE = unhexlify(b"010203") 27 | 28 | # A small text file without any specific format. 29 | SMALL_TEXT_FILE = b"Hello world\nThis is a tiny text file" 30 | 31 | # A small, encrypted PNG. 32 | SMALL_PNG_ENCRYPTED = unhexlify( 33 | b"9fd28dd7a1d845a04948f13af104e39402c888f7b601bce313ad" 34 | b"bf3e2423f67d93d5e304efc147d46df511abacbb8ae7e2e8156c" 35 | b"2e08de86c31fdc6aa5bd4d11537e5657102a83214d13d7ff57e6" 36 | b"d35940f149fbd1e661a260b0b6fe465e4e0a7c8039c08d78f679" 37 | b"cde511be94c685eee50571858d99d0c84918381aea3e52319509" 38 | b"36cac1a7b2ec46c980f3c3995eaf21fc2711b0de8ff014ff5fe7" 39 | b"4a7fcb3515df4f1f2ceeae72d7b58bc69d56dedf31fd430ac2ce" 40 | b"8aee9fcb150a1af9fdee30ac26d68d3db77c1adec5f68cad78f9" 41 | b"ed6ef9156ba23b76e38dfd59cb077c964248f331d43147dc7fa7" 42 | b"b61baf7546e5edfd78347828386b64b3a1ebdff0dcd55ea57f4b" 43 | b"b73b06fbedff62ef8a7fd89146fd11723e739d541d07bf399837" 44 | b"3ed56cb9ef475bd409e590258cdb6a0cdf4871882c334c2897c4" 45 | b"ea0dc76748e727a71d8c2e85253b2c80667f5d98ddbcf8fb90ba" 46 | b"adceb6e75a2741b740dc0d084d55cc20dd7369e7041529b62ce1" 47 | b"59bcde9d9a0f4978093cd52dfe77107613d2bc265519177ed623" 48 | b"49d70517ecf4a243fb7c20db411459766785ee6f039f68383a62" 49 | 
b"375b14cdf405401dc4aabf6812d9803218544d1ccdc9339e81cb" 50 | b"b36acb3414e8dfb49521b89f1b6d54a712da35e45462844a622c" 51 | b"aa92313335d317201e1eab5f34daba5358fde87648b24868b098" 52 | b"505916b8bc997b19976487718835f0d54a8794e24ca19240cad1" 53 | b"61e0624d8df2214edd3c33ae2b5156e2ef7191d75528f9c26a89" 54 | b"4a" 55 | ) 56 | 57 | # The metadata necessary to download and decrypt SMALL_PNG_ENCRYPTED 58 | ENCRYPTED_FILE_METADATA: JsonDict = { 59 | "file": { 60 | "v": "v2", 61 | "key": { 62 | "alg": "A256CTR", 63 | "ext": True, 64 | "k": "F3miZm2vZhucJ062AuKMUwmd-O6AK0AXP29p4MKtq3Q", 65 | "key_ops": ["encrypt", "decrypt"], 66 | "kty": "oct", 67 | }, 68 | "iv": "rJqtSdi3F/EAAAAAAAAAAA", 69 | "hashes": {"sha256": "NYvGRRQGfyWpXSUpba+ozSbehFP6kw5ZDg0xMppyX8c"}, 70 | "url": "mxc://" + MEDIA_PATH, 71 | } 72 | } 73 | 74 | 75 | def to_thumbnail_params(params: Dict[str, str]) -> MultiDictProxy[str]: 76 | """Turn the given dictionary into query parameters as they'd appear when processing a 77 | thumbnailing request. 78 | 79 | Args: 80 | params: The raw parameters. 81 | 82 | Returns: 83 | A multidict that can be passed onto the scanner or the file downloader. 84 | """ 85 | return MultiDictProxy(MultiDict(params)) 86 | 87 | 88 | def get_base_media_headers() -> CIMultiDictProxy[str]: 89 | """Get the base headers necessary to react to a download request for SMALL_PNG. 90 | 91 | Returns: 92 | The headers to pass onto the file downloader. 93 | """ 94 | return CIMultiDictProxy(CIMultiDict({"content-type": "image/png"})) 95 | 96 | 97 | def get_content_scanner(config: Optional[JsonDict] = None) -> MatrixContentScanner: 98 | """Instantiates an instance of the content scanner. 99 | 100 | Args: 101 | config: The optional provided config. 102 | """ 103 | # Create the temporary directory that we'll use so the scanner doesn't complain about 104 | # it not existing. 
105 | os.makedirs(os.path.abspath("temp"), exist_ok=True) 106 | 107 | # We define the default configuration here rather than as a constant outside of a 108 | # function because otherwise a test that sets its own config would have side effects 109 | # on the config used for other tests. 110 | default_config = { 111 | "scan": { 112 | "script": "true", 113 | "temp_directory": "temp", 114 | }, 115 | "web": { 116 | "host": "127.0.0.1", 117 | "port": 8080, 118 | }, 119 | "crypto": { 120 | "pickle_path": "mcs_pickle.txt", 121 | "pickle_key": "foo", 122 | }, 123 | } 124 | 125 | if config is None: 126 | config = {} 127 | 128 | # Update the configuration provided with some default settings. 129 | # Note that `update` does not update nested dictionaries (only the top level), so 130 | # e.g. if a configuration with a `scan` section is provided it will need to include 131 | # all required settings in that section. 132 | default_config.update(config) 133 | 134 | parsed_config = MatrixContentScannerConfig(default_config) 135 | 136 | return MatrixContentScanner(parsed_config) 137 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | -------------------------------------------------------------------------------- /tests/utils/test_encrypted_file_metadata.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
5 | import copy
6 | import unittest
7 | 
8 | from matrix_content_scanner.utils.constants import ErrCode
9 | from matrix_content_scanner.utils.encrypted_file_metadata import (
10 |     validate_encrypted_file_metadata,
11 | )
12 | from matrix_content_scanner.utils.errors import ContentScannerRestError
13 | 
14 | from tests.testutils import ENCRYPTED_FILE_METADATA
15 | 
16 | 
17 | class EncryptedMetadataValidationTestCase(unittest.TestCase):
18 |     def setUp(self) -> None:
19 |         self.metadata = copy.deepcopy(ENCRYPTED_FILE_METADATA)
20 | 
21 |     def test_validate(self) -> None:
22 |         """Tests that valid file metadata is considered as such."""
23 |         validate_encrypted_file_metadata(ENCRYPTED_FILE_METADATA)
24 | 
25 |     def test_key_ops_no_decrypt(self) -> None:
26 |         """Tests that the metadata validation fails if key_ops doesn't include `decrypt`."""
27 |         self.metadata["file"]["key"]["key_ops"] = ["encrypt"]
28 |         self._test_fails_validation()
29 | 
30 |     def test_key_ops_no_encrypt(self) -> None:
31 |         """Tests that the metadata validation fails if key_ops doesn't include `encrypt`."""
32 |         self.metadata["file"]["key"]["key_ops"] = ["decrypt"]
33 |         self._test_fails_validation()
34 | 
35 |     def test_ops_extra_values(self) -> None:
36 |         """Tests that the metadata validation does not fail if there are extra values in
37 |         key_ops.
38 |         """
39 |         self.metadata["file"]["key"]["key_ops"].append("foo")
40 |         validate_encrypted_file_metadata(self.metadata)
41 | 
42 |     def test_no_file(self) -> None:
43 |         """Tests that the metadata validation fails if there isn't a `file` property."""
44 |         self.metadata = {"foo": "bar"}
45 |         self._test_fails_validation()
46 | 
47 |     def test_no_key(self) -> None:
48 |         """Tests that the metadata validation fails if there isn't a `file.key` property."""
49 |         del self.metadata["file"]["key"]
50 |         self._test_fails_validation()
51 | 
52 |     def test_no_k(self) -> None:
53 |         """Tests that the metadata validation fails if there isn't a `file.key.k`
54 |         property.
55 | """ 56 | del self.metadata["file"]["key"]["k"] 57 | self._test_fails_validation() 58 | 59 | def test_no_ext(self) -> None: 60 | """Tests that the metadata validation fails if there isn't a `file.key.ext` 61 | property. 62 | """ 63 | del self.metadata["file"]["key"]["ext"] 64 | self._test_fails_validation() 65 | 66 | def test_bad_ext(self) -> None: 67 | """Tests that the metadata validation fails if the `file.key.ext` property has an 68 | invalid value. 69 | """ 70 | self.metadata["file"]["key"]["ext"] = False 71 | self._test_fails_validation() 72 | 73 | def test_bad_alg(self) -> None: 74 | """Tests that the metadata validation fails if the `file.key.alg` property has an 75 | invalid value. 76 | """ 77 | self.metadata["file"]["key"]["alg"] = "bad" 78 | self._test_fails_validation() 79 | 80 | def test_bad_kty(self) -> None: 81 | """Tests that the metadata validation fails if the `file.key.kty` property has an 82 | invalid value. 83 | """ 84 | self.metadata["file"]["key"]["kty"] = "bad" 85 | self._test_fails_validation() 86 | 87 | def test_no_iv(self) -> None: 88 | """Tests that the metadata validation fails if there isn't a `file.iv` property.""" 89 | del self.metadata["file"]["iv"] 90 | self._test_fails_validation() 91 | 92 | def test_no_url(self) -> None: 93 | """Tests that the metadata validation fails if there isn't a `file.url` property.""" 94 | del self.metadata["file"]["url"] 95 | self._test_fails_validation() 96 | 97 | def test_no_hashes(self) -> None: 98 | """Tests that the metadata validation fails if there isn't a `file.hashes` 99 | property. 100 | """ 101 | del self.metadata["file"]["hashes"] 102 | self._test_fails_validation() 103 | 104 | def test_no_sha256(self) -> None: 105 | """Tests that the metadata validation fails if there isn't a `file.hashes.sha256` 106 | property. 
107 | """ 108 | del self.metadata["file"]["hashes"]["sha256"] 109 | self._test_fails_validation() 110 | 111 | def _test_fails_validation(self) -> None: 112 | """Tests that the validation fails with a REST error complaining about malformed 113 | JSON. 114 | """ 115 | with self.assertRaises(ContentScannerRestError) as cm: 116 | validate_encrypted_file_metadata(self.metadata) 117 | 118 | self.assertEqual(cm.exception.http_status, 400) 119 | self.assertEqual(cm.exception.reason, ErrCode.MALFORMED_JSON) 120 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py 3 | 4 | # required for PEP 517 (pyproject.toml-style) builds 5 | isolated_build = true 6 | 7 | [testenv] 8 | allowlist_externals = poetry 9 | commands = poetry install 10 | 11 | [testenv:py] 12 | 13 | # As of twisted 16.4, trial tries to import the tests as a package (previously 14 | # it loaded the files explicitly), which means they need to be on the 15 | # pythonpath. Our sdist doesn't include the 'tests' package, so normally it 16 | # doesn't work within the tox virtualenv. 17 | # 18 | # As a workaround, we tell tox to do install with 'pip -e', which just 19 | # creates a symlink to the project directory instead of unpacking the sdist. 20 | usedevelop=true 21 | 22 | commands = poetry run python -m unittest discover tests 23 | --------------------------------------------------------------------------------