├── .dockerignore ├── .github ├── CODEOWNERS └── workflows │ ├── ci.yml │ └── docker.yaml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── LICENSE-COMMERCIAL ├── README.md ├── build_rust.py ├── config.sample.yaml ├── docker └── Dockerfile ├── docs └── api.md ├── mypy.ini ├── perf ├── .gitignore ├── config.yaml ├── dummy_scan.sh └── scanner_perf.py ├── poetry.lock ├── pyproject.toml ├── rust ├── Cargo.toml ├── build.rs └── src │ ├── crypto │ └── mod.rs │ └── lib.rs ├── scripts-dev └── lint.sh ├── src └── matrix_content_scanner │ ├── __init__.py │ ├── config.py │ ├── httpserver.py │ ├── logutils.py │ ├── mcs.py │ ├── mcs_rust │ ├── __init__.pyi │ └── crypto.pyi │ ├── py.typed │ ├── scanner │ ├── __init__.py │ ├── file_downloader.py │ └── scanner.py │ ├── servlets │ ├── __init__.py │ ├── download.py │ ├── public_key.py │ ├── scan.py │ └── thumbnail.py │ └── utils │ ├── __init__.py │ ├── constants.py │ ├── encrypted_file_metadata.py │ ├── errors.py │ ├── rust.py │ └── types.py ├── tests ├── __init__.py ├── scanner │ ├── __init__.py │ ├── test_file_downloader.py │ └── test_scanner.py ├── servlets │ ├── __init__.py │ ├── test_scan.py │ └── test_servlets.py ├── test_crypto.py ├── testutils.py └── utils │ ├── __init__.py │ └── test_encrypted_file_metadata.py └── tox.ini /.dockerignore: -------------------------------------------------------------------------------- 1 | # ignore everything by default 2 | * 3 | 4 | # things to include 5 | !src 6 | !README.md 7 | !pyproject.toml 8 | !setup.cfg 9 | !poetry.lock 10 | !Cargo.toml 11 | !Cargo.lock 12 | !build_rust.py 13 | !rust 14 | 15 | **/__pycache__ 16 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | 2 | # Automatically request reviews from the synapse-core team when a pull request comes in. 
3 | * @element-hq/synapse-core 4 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Linting and Tests 2 | on: 3 | push: 4 | branches: ["main"] 5 | pull_request: 6 | 7 | jobs: 8 | check-code-style: 9 | name: Check code style 10 | runs-on: ubuntu-latest 11 | steps: 12 | - run: sudo apt-get install -y libmagic1 13 | - uses: actions/checkout@v4 14 | - name: Install Rust 15 | uses: dtolnay/rust-toolchain@stable 16 | 17 | - name: Setup Poetry 18 | uses: matrix-org/setup-python-poetry@v1 19 | with: 20 | install-project: "false" 21 | 22 | - name: Run ruff check 23 | run: poetry run ruff check --output-format=github . 24 | 25 | - name: Run ruff format 26 | run: poetry run ruff format --check . 27 | 28 | check-types: 29 | name: Check types with Mypy 30 | runs-on: ubuntu-latest 31 | steps: 32 | - run: sudo apt-get install -y libmagic1 33 | - uses: actions/checkout@v4 34 | - name: Install Rust 35 | uses: dtolnay/rust-toolchain@1.82.0 36 | - uses: Swatinem/rust-cache@v2 37 | 38 | - name: Setup Poetry 39 | uses: matrix-org/setup-python-poetry@v1 40 | with: 41 | # We have seen odd mypy failures that were resolved when we started 42 | # installing the project again: 43 | # https://github.com/matrix-org/synapse/pull/15376#issuecomment-1498983775 44 | # To make CI green, err towards caution and install the project. 
45 | install-project: "true" 46 | 47 | # Cribbed from 48 | # https://github.com/AustinScola/mypy-cache-github-action/blob/85ea4f2972abed39b33bd02c36e341b28ca59213/src/restore.ts#L10-L17 49 | - name: Restore/persist mypy's cache 50 | uses: actions/cache@v4 51 | with: 52 | path: | 53 | .mypy_cache 54 | key: mypy-cache-${{ github.sha }} 55 | restore-keys: mypy-cache- 56 | 57 | - name: Run mypy 58 | run: poetry run mypy 59 | 60 | 61 | unit-tests: 62 | name: Unit tests 63 | runs-on: ubuntu-latest 64 | strategy: 65 | matrix: 66 | # Run the unit tests both against our oldest supported Python version 67 | # and the newest stable. 68 | python_version: [ "3.10", "3.12" ] 69 | steps: 70 | - run: sudo apt-get install -y libmagic1 71 | - uses: actions/checkout@v4 72 | - name: Install Rust 73 | uses: dtolnay/rust-toolchain@stable 74 | - uses: actions/setup-python@v2 75 | with: 76 | python-version: ${{ matrix.python_version }} 77 | - run: python -m pip install tox "poetry==1.8.3" 78 | - run: tox -e py 79 | -------------------------------------------------------------------------------- /.github/workflows/docker.yaml: -------------------------------------------------------------------------------- 1 | # GitHub actions workflow which builds and publishes the docker images. 
2 | 3 | name: Build docker images 4 | 5 | on: 6 | push: 7 | tags: ["v*"] 8 | workflow_dispatch: # A build was manually requested 9 | 10 | permissions: 11 | contents: read 12 | id-token: write # needed for signing the images with GitHub OIDC Token 13 | 14 | jobs: 15 | build: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Install Cosign 19 | uses: sigstore/cosign-installer@v3 20 | 21 | - name: Log in to DockerHub 22 | uses: docker/login-action@v2 23 | with: 24 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 25 | password: ${{ secrets.DOCKER_HUB_TOKEN }} 26 | 27 | - name: Calculate docker image tag 28 | id: set-tag 29 | uses: docker/metadata-action@master 30 | with: 31 | images: vectorim/matrix-content-scanner 32 | tags: | 33 | type=raw,value=latest 34 | type=pep440,pattern={{raw}} 35 | 36 | - name: Build and push all platforms 37 | id: build-and-push 38 | uses: docker/build-push-action@v3 39 | with: 40 | push: true 41 | labels: "gitsha1=${{ github.sha }}" 42 | tags: "${{ steps.set-tag.outputs.tags }}" 43 | file: "docker/Dockerfile" 44 | platforms: linux/amd64 45 | 46 | - name: Sign the images with GitHub OIDC Token 47 | env: 48 | DIGEST: ${{ steps.build-and-push.outputs.digest }} 49 | TAGS: ${{ steps.set-tag.outputs.tags }} 50 | run: | 51 | images="" 52 | for tag in ${TAGS}; do 53 | images+="${tag}@${DIGEST} " 54 | done 55 | cosign sign --yes ${images} 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | /.venv 3 | *.egg-info 4 | /.envrc 5 | /.tox 6 | _trial_temp 7 | __pycache__ 8 | /dist 9 | config.yaml 10 | /build 11 | /.vscode 12 | mcs_pickle.txt 13 | pickle 14 | 15 | # Poetry will create a setup.py, which we don't want to include. 
16 | /setup.py 17 | 18 | # rust 19 | /target/ 20 | /src/matrix_content_scanner/*.so 21 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | # We make the whole root folder a workspace so that we can run `cargo` 2 | # commands from the root (rather than having to cd into rust/). 3 | 4 | [workspace] 5 | members = ["rust"] 6 | resolver = "2" 7 | -------------------------------------------------------------------------------- /LICENSE-COMMERCIAL: -------------------------------------------------------------------------------- 1 | Licensees holding a valid commercial license with Element may use this 2 | software in accordance with the terms contained in a written agreement 3 | between you and Element. 4 | 5 | To purchase a commercial license please contact our sales team at 6 | licensing@element.io 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Matrix Content Scanner 2 | 3 | A web service for scanning media hosted on a Matrix media repository. 4 | 5 | ## Installation 6 | 7 | This project requires libmagic to be installed on the system. On Debian/Ubuntu: 8 | 9 | ```commandline 10 | sudo apt install libmagic1 11 | ``` 12 | 13 | Then, preferably in a virtual environment, install the Matrix Content Scanner: 14 | 15 | ```commandline 16 | pip install matrix-content-scanner 17 | ``` 18 | 19 | ## Usage 20 | 21 | Copy and edit the [sample configuration file](https://github.com/matrix-org/matrix-content-scanner-python/blob/main/config.sample.yaml). 22 | Each key is documented in this file. 
23 | 24 | Then run the content scanner (from within your virtual environment if one was created): 25 | 26 | ```commandline 27 | python -m matrix_content_scanner.mcs -c CONFIG_FILE 28 | ``` 29 | 30 | Where `CONFIG_FILE` is the path to your configuration file. 31 | 32 | ## Docker 33 | 34 | This project provides a Docker image to run it, published as 35 | `vectorim/matrix-content-scanner`. 36 | 37 | To use it, copy the [sample configuration file](/config.sample.yaml) into a dedicated 38 | directory, edit it accordingly with your requirements, and then mount this directory as 39 | `/data` in the image. Do not forget to also publish the port that the content scanner's 40 | Web server is configured to listen on. 41 | 42 | For example, assuming the port for the Web server is `8080`: 43 | 44 | ```shell 45 | docker run -p 8080:8080 -v /path/to/your/config/directory:/data vectorim/matrix-content-scanner 46 | ``` 47 | 48 | ## API 49 | 50 | See [the API documentation](/docs/api.md) for information about how clients are expected 51 | to interact with the Matrix Content Scanner. 52 | 53 | ## Migrating from the [legacy Matrix Content Scanner](https://github.com/matrix-org/matrix-content-scanner) 54 | 55 | Because it uses the same APIs and Olm pickle format as the legacy Matrix Content Scanner, 56 | this project can be used as a drop-in replacement. The only change (apart from the 57 | deployment instructions) is the configuration format: 58 | 59 | * the `server` section is renamed `web` 60 | * `scan.tempDirectory` is renamed `scan.temp_directory` 61 | * `scan.baseUrl` is renamed `download.base_homeserver_url` (and becomes optional) 62 | * `scan.doNotCacheExitCodes` is renamed `result_cache.exit_codes_to_ignore` 63 | * `scan.directDownload` is removed. Direct download always happens when `download.base_homeserver_url` 64 | is absent from the configuration file, and setting a value for it will always cause files to be 65 | downloaded from the server configured. 
66 | * `proxy` is renamed `download.proxy` 67 | * `middleware.encryptedBody.pickleKey` is renamed `crypto.pickle_key` 68 | * `middleware.encryptedBody.picklePath` is renamed `crypto.pickle_path` 69 | * `acceptedMimeType` is renamed `scan.allowed_mimetypes` 70 | * `requestHeader` is renamed `download.additional_headers` and turned into a dictionary. 71 | 72 | Note that the format of the cryptographic pickle file and key are compatible between 73 | this project and the legacy Matrix Content Scanner. If no file exists at that path one will 74 | be created automatically. 75 | 76 | ## Development 77 | 78 | In a virtual environment with poetry (>=1.8.3) installed, run 79 | ```shell 80 | poetry install 81 | ``` 82 | 83 | To run the unit tests, you can use: 84 | ```shell 85 | tox -e py 86 | ``` 87 | 88 | To run the linters and `mypy` type checker, use `./scripts-dev/lint.sh`. 89 | 90 | 91 | ## Releasing 92 | 93 | The exact steps for releasing will vary; but this is an approach taken by the 94 | Synapse developers (assuming a Unix-like shell): 95 | 96 | 1. Set a shell variable to the version you are releasing (this just makes 97 | subsequent steps easier): 98 | ```shell 99 | version=X.Y.Z 100 | ``` 101 | 102 | 2. Update `setup.cfg` so that the `version` is correct. 103 | 104 | 3. Stage the changed files and commit. 105 | ```shell 106 | git add -u 107 | git commit -m v$version -n 108 | ``` 109 | 110 | 4. Push your changes. 111 | ```shell 112 | git push 113 | ``` 114 | 115 | 5. When ready, create a signed tag for the release: 116 | ```shell 117 | git tag -s v$version 118 | ``` 119 | Base the tag message on the changelog. 120 | 121 | 6. Push the tag. 122 | ```shell 123 | git push origin tag v$version 124 | ``` 125 | 126 | 7. Create a *release*, based on the tag you just pushed, on GitHub or GitLab. 127 | 128 | 8. 
Create a source distribution and upload it to PyPI: 129 | ```shell 130 | python -m build 131 | twine upload dist/matrix_content_scanner-$version* 132 | ``` 133 | -------------------------------------------------------------------------------- /build_rust.py: -------------------------------------------------------------------------------- 1 | # A build script for poetry that adds the rust extension. 2 | 3 | import os 4 | from typing import Any, Dict 5 | 6 | from setuptools_rust import Binding, RustExtension 7 | 8 | 9 | def build(setup_kwargs: Dict[str, Any]) -> None: 10 | original_project_dir = os.path.dirname(os.path.realpath(__file__)) 11 | cargo_toml_path = os.path.join(original_project_dir, "rust", "Cargo.toml") 12 | 13 | extension = RustExtension( 14 | target="matrix_content_scanner.mcs_rust", 15 | path=cargo_toml_path, 16 | binding=Binding.PyO3, 17 | py_limited_api=True, 18 | debug=False, 19 | ) 20 | setup_kwargs.setdefault("rust_extensions", []).append(extension) 21 | setup_kwargs["zip_safe"] = False 22 | -------------------------------------------------------------------------------- /config.sample.yaml: -------------------------------------------------------------------------------- 1 | # Configuration file template for the Matrix Content Scanner. 2 | # 3 | # Supported time units: 4 | # * ms, millisecond, milliseconds 5 | # * s, sec, secs, second, seconds 6 | # * m, min, mins, minute, minutes 7 | # * h, hour, hours 8 | # * d, day, days 9 | # * w, week, weeks 10 | # * y, year, years 11 | # If no unit is given, "seconds" are implied. 12 | # 13 | # Examples of supported size units can be found here: https://humanfriendly.readthedocs.io/en/latest/api.html#humanfriendly.parse_size 14 | # Size units use a decimal base, so 1KB means 1000 bytes, while 1KiB means 1024 bytes. 15 | 16 | # Configuration for hosting the HTTP(S) API. 17 | web: 18 | host: 127.0.0.1 19 | port: 8080 20 | 21 | # Configuration for scanning files. 
22 | scan: 23 | # The script to run to scan a file. This script will be called with a path to the 24 | # downloaded file as its only argument, e.g. "./example.sh /temp/foo.bar/my_file". 25 | # Required. 26 | script: "./example.sh" 27 | 28 | # Directory in which to download files for scanning. Each file downloaded is removed 29 | # after the scan has completed. 30 | # Required. 31 | temp_directory: "/tmp" 32 | 33 | # Command to run to remove files from disk once they have been scanned. 34 | # Optional, defaults to "rm". 35 | removal_command: "srm" 36 | 37 | # List of allowed MIME types. If a file has a MIME type that's not in this list, its 38 | # scan is considered failed. 39 | # Unrecognised binary files are considered to be `application/octet-stream`. 40 | # Unrecognised text files are considered to be `text/plain`. 41 | # Optional, defaults to allowing all MIME types. 42 | allowed_mimetypes: ["image/jpeg"] 43 | 44 | # List of blocked MIME types. 45 | # If specified, `allowed_mimetypes` must not be specified as well. 46 | # If specified, a file whose MIME type is on this list will produce a scan that is 47 | # considered failed. 48 | # Unrecognised binary files are considered to be `application/octet-stream`. 49 | # Unrecognised text files are considered to be `text/plain`. 50 | # Optional. 51 | # blocked_mimetypes: ["image/jpeg"] 52 | 53 | # Configuration of scan result caching. 54 | # 55 | # Results are stored in a cache to avoid having to download and scan a file twice. There 56 | # is a unique entry in this cache for each set of media path (i.e. the 57 | # "server_name/media_id" identifier for the media), thumbnailing parameters and 58 | # encryption metadata. This means that, for example, the result for the scan of the media 59 | # "example.com/abc" and the result for the scan of the *thumbnail* of "example.com/abc" 60 | # will be stored in two separate entries. 
61 | # 62 | # Each entry in the cache includes the result of the scan as well as a copy of the media 63 | # that was scanned. If the media fails the scan, however, or is larger than the configured 64 | # maximum size (if set), no copy of the media is stored in the result cache. 65 | result_cache: 66 | # List of exit codes from the scanning script that shouldn't cause the result of the 67 | # scan to be cached for future requests. 68 | # Optional, defaults to an empty list (i.e. results are cached regardless of the 69 | # script's exit code). 70 | exit_codes_to_ignore: [1, 2] 71 | 72 | # Maximum number of results that can be stored in the cache. If more files are 73 | # scanned before existing items reach their TTL, the least-recently accessed will be 74 | # evicted. 75 | # Optional, defaults to 1024. 76 | max_size: 2048 77 | 78 | # The maximum amount of time an entry will stay in the cache before being evicted. 79 | # Optional, defaults to 1 week. 80 | ttl: "1d" 81 | 82 | # The maximum cachable file size. If a file is bigger than this size, a copy of it 83 | # will be not be cached even if the scan succeeds. If the file is requested again, it 84 | # is downloaded again from the homeserver, but is not written to disk or scanned. 85 | # Optional, defaults to no maximum size. 86 | max_file_size: "100MB" 87 | 88 | 89 | # Configuration for downloading files. 90 | # When downloading files directly from their respective homeservers (which is the default 91 | # behaviour), the homeservers' default URLs are determined using .well-known discovery 92 | # (defaults to using the homeserver's domain if not available). 93 | # See https://spec.matrix.org/latest/client-server-api/#server-discovery for more info. 94 | # Settings in this section (apart from `base_homeserver_url`) apply to .well-known 95 | # discovery requests as well as file download ones. 96 | download: 97 | # If provided, all files are downloaded using the homeserver at this URL. 
If this 98 | # setting is provided, .well-known discovery is not used to determine the base URL 99 | # to use. 100 | # Optional, defaults to downloading files directly from their respective homeservers. 101 | base_homeserver_url: "https://matrix.org" 102 | 103 | # HTTP(S) proxy to use when sending requests. 104 | # Optional, defaults to no proxy. 105 | proxy: "http://10.0.0.1:3128" 106 | 107 | # Headers to send in outgoing requests. 108 | # Optional, defaults to no additional headers. 109 | additional_headers: 110 | user-agent: "matrix-content-scanner" 111 | 112 | # Configuration for decrypting Olm-encrypted request bodies. 113 | crypto: 114 | # The path to the Olm pickle file. This file contains the key pair to use when 115 | # encrypting and decrypting encrypted POST request bodies. 116 | # A new keypair will be created at startup if the pickle file doesn't already exist. 117 | # Required. 118 | pickle_path: "./pickle" 119 | 120 | # The key to use to decode the pickle file. 121 | # Required. 122 | pickle_key: "this_is_a_secret" 123 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTHON_VERSION=3.11 2 | 3 | FROM docker.io/python:${PYTHON_VERSION}-slim AS requirements 4 | 5 | 6 | # We install poetry in its own build stage to avoid its dependencies conflicting. 7 | RUN --mount=type=cache,target=/root/.cache/pip \ 8 | pip install --user "poetry==1.8.3" 9 | 10 | WORKDIR /mcs 11 | 12 | # Copy just what we need to run `poetry export`... 13 | COPY pyproject.toml poetry.lock /mcs/ 14 | 15 | # If specified, we won't verify the hashes of dependencies. 16 | # This is only needed if the hashes of dependencies cannot be checked for some 17 | # reason, such as when a git repository is used directly as a dependency. 18 | ARG TEST_ONLY_SKIP_DEP_HASH_VERIFICATION 19 | 20 | # If specified, we won't use the Poetry lockfile. 
21 | # Instead, we'll just install what a regular `pip install` would from PyPI. 22 | ARG TEST_ONLY_IGNORE_POETRY_LOCKFILE 23 | 24 | # Export the dependencies, but only if we're actually going to use the Poetry lockfile. 25 | # Otherwise, just create an empty requirements file so that the Dockerfile can 26 | # proceed. 27 | RUN if [ -z "$TEST_ONLY_IGNORE_POETRY_LOCKFILE" ]; then \ 28 | /root/.local/bin/poetry export -o /mcs/requirements.txt ${TEST_ONLY_SKIP_DEP_HASH_VERIFICATION:+--without-hashes}; \ 29 | else \ 30 | touch /mcs/requirements.txt; \ 31 | fi 32 | 33 | ### 34 | ### Stage 1: builder 35 | ### 36 | FROM docker.io/library/python:${PYTHON_VERSION}-slim AS builder 37 | 38 | RUN \ 39 | --mount=type=cache,target=/var/cache/apt,sharing=locked \ 40 | --mount=type=cache,target=/var/lib/apt,sharing=locked \ 41 | apt-get update -qq && apt-get install -yqq \ 42 | build-essential \ 43 | curl 44 | 45 | # Install rust and ensure its in the PATH 46 | ENV RUSTUP_HOME=/rust 47 | ENV CARGO_HOME=/cargo 48 | ENV PATH=/cargo/bin:/rust/bin:$PATH 49 | RUN mkdir /rust /cargo 50 | 51 | RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain stable --profile minimal 52 | 53 | # To speed up rebuilds, install all of the dependencies before we copy over 54 | # the whole project, so that this layer in the Docker cache can be 55 | # used while you develop on the source 56 | # 57 | # This is aiming at installing the `[tool.poetry.depdendencies]` from pyproject.toml. 58 | COPY --from=requirements /mcs/requirements.txt /mcs/ 59 | RUN --mount=type=cache,target=/root/.cache/pip \ 60 | pip install --prefix="/install" --no-deps --no-warn-script-location -r /mcs/requirements.txt 61 | 62 | COPY src /mcs/src/ 63 | COPY rust /mcs/rust/ 64 | # ... and what we need to `pip install`. 65 | COPY pyproject.toml README.md build_rust.py Cargo.toml Cargo.lock /mcs/ 66 | 67 | # Repeat of earlier build argument declaration, as this is a new build stage. 
68 | ARG TEST_ONLY_IGNORE_POETRY_LOCKFILE 69 | 70 | # Install the matrix content scanner package itself. 71 | # If we have populated requirements.txt, we don't install any dependencies 72 | # as we should already have those from the previous `pip install` step. 73 | RUN --mount=type=cache,target=/mcs/target,sharing=locked \ 74 | --mount=type=cache,target=${CARGO_HOME}/registry,sharing=locked \ 75 | if [ -z "$TEST_ONLY_IGNORE_POETRY_LOCKFILE" ]; then \ 76 | pip install --prefix="/install" --no-deps --no-warn-script-location /mcs; \ 77 | else \ 78 | pip install --prefix="/install" --no-warn-script-location /mcs; \ 79 | fi 80 | 81 | ### 82 | ### Stage 2: runtime 83 | ### 84 | 85 | FROM docker.io/library/python:${PYTHON_VERSION}-slim 86 | 87 | # Install libmagic & other useful tools. 88 | # We don't need to install libolm, because we're installing it with a 89 | # wheel from gitlab.matrix.org later, which comes with libolm already compiled. 90 | RUN apt-get update -qq && apt-get install -qq libmagic1 c-icap && rm -rf /var/lib/apt/lists/* 91 | 92 | # Copy the necessary project files into the image. 93 | COPY --from=builder /install /usr/local 94 | 95 | # Create the directory in which long-lived configuration and secrets will live. We switch 96 | # to it to ensure any automatically-generated secret is persisted when the container is 97 | # destroyed. 98 | RUN mkdir /data 99 | WORKDIR /data 100 | 101 | # Start the service using user-provided configuration. 102 | ENTRYPOINT ["python", "-m", "matrix_content_scanner.mcs", "-c", "/data/config.yaml"] 103 | -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | # Matrix Content Scanner API 2 | 3 | This document describes the custom API implemented by the Matrix Content Scanner. 
4 | 5 | ## Error codes 6 | 7 | An error is returned as JSON responses to the request that caused it, in the following format: 8 | 9 | | Parameter | Type | Description | 10 | |-----------|------|--------------------------------------------------------| 11 | | `reason` | str | The machine-readable code for the error. | 12 | | `info` | str | Additional human-readable information about the error. | 13 | 14 | Example: 15 | 16 | ```json 17 | { 18 | "info": "***VIRUS DETECTED***", 19 | "reason": "MCS_MEDIA_NOT_CLEAN" 20 | } 21 | ``` 22 | 23 | The error codes used by the Matrix Content Scanner are described below, alongside the HTTP 24 | status code of the response for each scenario: 25 | 26 | | Status Code | Reason | Description | 27 | |-------------|-------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 28 | | 400 | `MCS_MALFORMED_JSON` | The request body contains malformed JSON. | 29 | | 400 | `MCS_MEDIA_FAILED_TO_DECRYPT` | The server failed to decrypt the encrypted media downloaded from the media repo. | 30 | | 401 | `M_MISSING_TOKEN` | The request is missing a required access token for authentication. | 31 | | 401 | `M_UNKNOWN_TOKEN` | The access token provided for authentication is not valid. | 32 | | 404 | `M_NOT_FOUND` | The `Authorization` header was missing when requesting authenticated media. | 33 | | 404 | `M_NOT_FOUND` | No route could be found at the given path. | 34 | | 404 | `M_NOT_FOUND` | The requested media was not present in the media repo. | 35 | | 403 | `MCS_MEDIA_NOT_CLEAN` | The server scanned the downloaded media but the antivirus script returned a non-zero exit code. | 36 | | 403 | `MCS_MIME_TYPE_FORBIDDEN` | The Mime type is not in the allowed list of Mime types. 
| 37 | | 403 | `MCS_BAD_DECRYPTION` | The provided `encrypted_body` could not be decrypted, or the encrypted file could not be decrypted. The client should request the public key of the server and then retry (once). | 38 | | 500 | `M_UNKNOWN` | The server experienced an unexpected error. | 39 | | 502 | `MCS_MEDIA_REQUEST_FAILED` | The server failed to request media from the media repo. | 40 | 41 | 42 | ## Routes 43 | 44 | 45 | ### `GET /_matrix/media_proxy/unstable/download/{serverName}/{mediaId}` 46 | 47 | Downloads the media at `mxc://{serverName}/{mediaId}` and scans it. If the scan is 48 | successful, the media is sent in the response (identical to the 49 | `GET /_matrix/media/v3/download/...` route in the Matrix specification). If the scan is 50 | unsuccessful, an error is sent with the reason `MCS_MEDIA_NOT_CLEAN`. 51 | 52 | 53 | ### `GET /_matrix/media_proxy/unstable/thumbnail/{serverName}/{mediaId}` 54 | 55 | Takes the query parameters described [in the Matrix specification](https://spec.matrix.org/latest/client-server-api/#get_matrixmediav3thumbnailservernamemediaid). 56 | 57 | Downloads a thumbnail of the media at `mxc://{serverName}/{mediaId}` and scans it. If the 58 | scan is successful, the media is sent in the response (identical to the 59 | `GET /_matrix/media/v3/thumbnail/...` route in the Matrix specification). If the scan is 60 | unsuccessful, an error is sent with the reason `MCS_MEDIA_NOT_CLEAN`. 61 | 62 | 63 | ### `GET /_matrix/media_proxy/unstable/scan/{serverName}/{mediaId}` 64 | 65 | Downloads the media at `mxc://{serverName}/{mediaId}`, scans it and 66 | responds with the result of the scan. 67 | 68 | Response format: 69 | 70 | | Parameter | Type | Description | 71 | |-----------|------|--------------------------------------------------------------------| 72 | | `clean` | bool | The scan's result: `true` if the file is clean, `false` otherwise. | 73 | | `info` | str | Human-readable information about the result. 
| 74 | 75 | Example: 76 | 77 | ```json 78 | { 79 | "clean": false, 80 | "info": "***VIRUS DETECTED***" 81 | } 82 | ``` 83 | 84 | 85 | ### `POST /_matrix/media_proxy/unstable/download_encrypted` 86 | 87 | Downloads a specified encrypted file, decrypts it and then behaves identically to the 88 | `GET /_matrix/media_proxy/unstable/download/{serverName}/{mediaId}` route. 89 | 90 | Request body: 91 | 92 | | Parameter | Type | Description | 93 | |------------------|---------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 94 | | `encrypted_body` | EncryptedBody | An Olm-encrypted version of the request body. See [this section](#encrypted-post-body) for more information. | 95 | | `file` | EncryptedFile | The metadata (download MXC URL and decryption key) of an encrypted file. Follows the format of the `EncryptedFile` structure from the [Matrix specification](https://spec.matrix.org/v1.2/client-server-api/#extensions-to-mroommessage-msgtypes). Ignored if `encrypted_body` is present. | 96 | 97 | Example: 98 | 99 | ```json 100 | { 101 | "file": { 102 | "v": "v2", 103 | "key": { 104 | "alg": "A256CTR", 105 | "ext": true, 106 | "k": "qcHVMSgYg-71CauWBezXI5qkaRb0LuIy-Wx5kIaHMIA", 107 | "key_ops": [ 108 | "encrypt", 109 | "decrypt" 110 | ], 111 | "kty": "oct" 112 | }, 113 | "iv": "X85+XgHN+HEAAAAAAAAAAA", 114 | "hashes": { 115 | "sha256": "5qG4fFnbbVdlAB1Q72JDKwCagV6Dbkx9uds4rSak37c" 116 | }, 117 | "url": "mxc://matrix.org/oSTbuSlyZKXvgtbtUsPxRbto" 118 | } 119 | } 120 | ``` 121 | 122 | 123 | ### `POST /_matrix/media_proxy/unstable/scan_encrypted` 124 | 125 | Downloads a specified encrypted file, decrypts it and then behaves identically to the 126 | `GET /_matrix/media_proxy/unstable/scan/{serverName}/{mediaId}` route. 
127 | 128 | The request body for this route is the same as for 129 | `POST /_matrix/media_proxy/unstable/download_encrypted`. 130 | 131 | 132 | ### `GET /_matrix/media_proxy/unstable/public_key` 133 | 134 | Responds with a base64 representation of the public key to use to generate the 135 | `encrypted_body` parameter of POST requests. See [this section](#encrypted-post-body) for 136 | more information. 137 | 138 | Response format: 139 | 140 | | Parameter | Type | Description | 141 | |:-------------|------|--------------------------------------------| 142 | | `public_key` | str | A base64 representation of the public key. | 143 | 144 | Example: 145 | 146 | ```json 147 | { 148 | "public_key": "GdwYYj5Ey9O96FMi4DjIhPhY604RuZg2Om98Kqh+3GE" 149 | } 150 | ``` 151 | 152 | 153 | ## Encrypted POST body 154 | 155 | When processing encrypted attachments, there are two ways to communicate the metadata 156 | (i.e. URL and decryption key for the file) to the Matrix Content Scanner. 157 | 158 | The first one is by sending it in the request body as shown above. However, this might not 159 | provide enough security depending on the infrastructure the Matrix Content Scanner is 160 | deployed in. For example if translation from HTTPS to HTTP is done on a separate machine 161 | than the one hosting the Matrix Content Scanner, it might be a concern that other pieces 162 | of the infrastructure might be able to intercept this traffic and decrypt the attachment. 163 | 164 | The second way of communicating encrypted file metadata is to first encrypt it using 165 | vodozemac's [`PkEncryption`](https://github.com/matrix-org/vodozemac/blob/poljar/pk-dekurcina/src/pk_encryption.rs#L97) 166 | class. This is done using the public key retrieved from 167 | `GET /_matrix/media_proxy/unstable/public_key` and sending the resulting encrypted message 168 | in an `encrypted_body` parameter of the request's body. 
This parameter follows this format: 169 | 170 | | Parameter | Type | Description | 171 | |--------------|------|------------------------| 172 | | `ciphertext` | str | The encrypted content. | 173 | | `mac` | str | The MAC. | 174 | | `ephemeral` | str | The ephemeral key. | 175 | 176 | Example (generated using the body and public key from the previous examples): 177 | 178 | ```json 179 | { 180 | "encrypted_body": { 181 | "ciphertext": "tED6iNpKcZti+HMZ6t1M+ZlE27IbvF9nojz59dg3jtJHv/9wtH6KiYyaZsVvCNzuwWCjdcxA4PMevZuWnVIEWHArCKdcFJeAvzxzlVtFvlgM5PIiTNtkh8sXIaC7RP5+3s0/aQs9PhuhlJ5nGlS86BZJ56dDwQWS5DO/WPqsTko9lz6//XtZ8ko417vybz81NTNpoADRc8XRntsI1+rmdKkXJtuXTA3d46CCAhLvoJLZlk7xb7IGHADk3eYQ9WTaKQ76/PW1dDo5xQGyXOr+lJByisjkoz4C8i4wRYXnks+d3q6kIndGZgO8s/H7/kfYC052IAlAk3LmYavXaNwXJtnWUCCakTHME154yup8DtmsyuZkC3p3KhSsKAeoxmYvsSf0+p0MinOWB4BgeWwaBaKDKTHbaUKwQzdbZrBXKP+QBdmM9PUrmsTPR2RmWRsPCC3dcmz4rakCZB/Xvwg++xDzpxi3+iJxJ011g1Dfp4sd44U6LJDVZafIoPu7esChYD4o+x4tP4airHueLGpP0rQxPuDZRvklwCRZ5xtzr47fINel2IGrTQEPyNES+lASGr2xeWwBJXBe47OkM0rXZn1HVM6iK3g3HfUT6pFhdI/52ztUf+gOhOhRvTpP079Je9INLApXSu793EQGJpH+ms3ymJ3mfBhEYVVnj8zbczo", 182 | "mac": "nipjbUCnIEw", 183 | "ephemeral": "fk2xOTmttnFDTAORxVQTtIlbsu7O01Oe52+umaOjIiE" 184 | } 185 | } 186 | ``` 187 | 188 | ## Authenticated Media 189 | 190 | When accessing media from a Synapse homeserver with authenticated media enabled, an 191 | `Authorization` HTTP header must be passed along with any request to the Matrix Content Scanner. 192 | If the `Authorization` header is not present, the content scanner assumes the request is not an 193 | authenticated media request and will use the old Matrix endpoints. 194 | 195 | This header follows the `Authentication Bearer scheme` as [outlined in the Matrix specification](https://spec.matrix.org/v1.12/client-server-api/#using-access-tokens). 196 | The `access_token` must be the Matrix access token of the client's user. 
197 |
The `Authorization` header method must be used; sending the access token as a query string
198 | parameter is not supported.
199 |
200 | Example authorization header:
201 |
202 | ```
203 | Authorization: Bearer <access_token>
204 | ```
205 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | strict = true
3 | files =
4 |     tests/,
5 |     src/
6 |
--------------------------------------------------------------------------------
/perf/.gitignore:
--------------------------------------------------------------------------------
1 | pickle_path
2 |
--------------------------------------------------------------------------------
/perf/config.yaml:
--------------------------------------------------------------------------------
1 | web:
2 |   host: localhost
3 |   port: 8080
4 | scan:
5 |   script: ./dummy_scan.sh
6 |   temp_directory: temp
7 | crypto:
8 |   pickle_path: pickle_path
9 |   pickle_key: pickle_key
10 | download:
11 |   base_homeserver_url: https://matrix-client.matrix.org
--------------------------------------------------------------------------------
/perf/dummy_scan.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | sleep 1
3 | # Roughly a 5% chance to fail, assuming a uniform distribution of $RANDOM.
4 | if [ $((RANDOM % 20)) = 0 ]; then
5 |     echo "I don't like the look of $1" > /dev/stderr
6 |     exit 1
7 | fi
--------------------------------------------------------------------------------
/perf/scanner_perf.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | """
4 | This script is a rudimentary end-to-end test of the content scanner. It starts the
5 | content scanner as a subprocess, using the hard-coded config.yaml.
The scanner is
6 | configured with matrix.org as its upstream homeserver, and to use a dummy scanning script
7 | which just calls `sleep 1`.
8 |
9 | Next, we concurrently request Matrix Avatar URLs taken from the public
10 | #synapse-dev:matrix.org room. (The URLs are hard-coded in this file. It's ugly, but good
11 | enough for now.)
12 |
13 | We wait for the content scanner to finish responding to each request, reading the
14 | response bodies from the scanner. We print how long (wall clock) it took to do so,
15 | and close the content scanner subprocess.
16 |
17 | Invoke this script with `-v` to print out content scanner logs.
18 | """
19 |
20 | import asyncio
21 | import collections
22 | import os.path
23 | import subprocess
24 | import sys
25 | import time
26 | import timeit
27 | import traceback
28 |
29 | import aiohttp
30 |
31 | timer = timeit.default_timer
32 |
33 | AVATAR_URLS_TAKEN_FROM_SYNAPSE_DEV = [
34 |     "http://127.0.0.1:8080/ipfs/QmfS3zCyhM4KgvYWH1HrD1Rnumns7fyTzcSHjk5fsWe5ZH?filename=IMG_20230222_191003_e_1677506180005.jpg",
35 |     "mxc://1312.media/SQdCZTnJfLkBAxgQMPkVgsPY",
36 |     "mxc://abolivier.bzh/zPatuAFfwaXVxsJudPWkFcWF",
37 |     "mxc://aguiarvieira.pt/74665ee95b29e2a217b88911cfc664a1ccbb7e141703097801866477568",
38 |     "mxc://amorgan.xyz/JHlaCvKzIPrlcnYWTFoOqsmH",
39 |     "mxc://asra.gr/4f06832b1418d4c5ba91cae68135592754841080",
40 |     "mxc://automattic.com/cf00594221369ad4498eb3b73032969c7be0fa3b",
41 |     "mxc://b3.hk/kKAHEhEOFMyXHQCcSFuQOQza",
42 |     "mxc://beeper.com/18850ea089e0ecc16d7db55527925b43ad63295c",
43 |     "mxc://beeper.com/c2ef30e46e6f99cd913f2b632573033c60a74524",
44 |     "mxc://bolha.chat/BevcFWoBVCMGMGqYQNhVddfu",
45 |     "mxc://bolha.chat/ClRsLphUvHmWHWOFjKLwiknN",
46 |     "mxc://bonifacelabs.ca/WjbmLXYLDRPxUzorCdExENVZ",
47 |     "mxc://bramen.com.co/oTFgSIkJdDTBIcuvtWTukatz",
48 |     "mxc://brodi.me/PPjyGXrcCqcwRrKpYoIgLvgw",
49 |     "mxc://cadair.com/LdiPRXiYOVpdWvURyocZmvUo",
50 |     "mxc://chat.decatrion.com/MXOQjcRSnVSqOALFTDlgIKnq",
51 | 
"mxc://chat.interru.io/UJdEhRreNufARVwpCAGWnHTx", 52 | "mxc://chat.mistli.net/MIlfZzUpEUelhCLXVFPMacZO", 53 | "mxc://chat.pyro.monster/bgZxviIdWbBYWInhwZozaryA", 54 | "mxc://chat.upi.li/rYupYBDqEXxkiQGEhPOiNUGs", 55 | "mxc://cody.to/hXfwsZbCLswNYgvRDqIQZOnS", 56 | "mxc://connecteu.rs/8c81538fc306d556bbbce15230b12c68ee7395f8", 57 | "mxc://cyberia.club/ObtWErjecvRjoCxbEWzHSiXM", 58 | "mxc://element.io/050bd1fa6777a004eb8ffd6c31028998331a91aa", 59 | "mxc://element.io/0750b4015ab58d23d704d3a828a1173a175cf95f", 60 | "mxc://element.io/1fec45ef987253db2728112927562567f8dd9d5e", 61 | "mxc://element.io/42eff27432ec038e933337dabcdfe3d230b3c68d", 62 | "mxc://element.io/47465a9ec77dd489e49b6748bc53c4f0122f06d7", 63 | "mxc://element.io/6130836e26b462a6fe63d4e080dd9d2037490f2b", 64 | "mxc://element.io/658198ce7f58872cc8fb68862f1eabdc5d847fbc", 65 | "mxc://element.io/a3f0d8b0868a7bf4e7449141167747a4699109ff", 66 | "mxc://element.io/bd48d4466c7e21b2ce00836631c06360206c29a0", 67 | "mxc://element.io/f03df00167d5f7ad5b5eac5375f32146cc2c3f51", 68 | "mxc://envs.net/89be88bd94378aef18b7f01e6a14d2228cfbb9fa", 69 | "mxc://envs.net/de405527b5c8dca188d6d8c7f3731e861a9b17ec", 70 | "mxc://ergaster.org/nmVViTqFqKGGxSHHcwevqnig", 71 | "mxc://ether.ai/JKGvwPJrfnWiWEIeVGLtJaSl", 72 | "mxc://fabcity.hamburg/QdttdrpZgTNKcJJWauixXEvQ", 73 | "mxc://fachschaften.org/c8faf7765794be1b24b3117925ac2464a204fc961726279478688088064", 74 | "mxc://gatto.club/qEJyuPBKpZITccTfIriEebdK", 75 | "mxc://gruenhage.xyz/3ecdecdab75225c0a14c7c804061d86962ee1550", 76 | "mxc://hackerlab.in/vjENMlrncPUGDmbyMZhWJzkG", 77 | "mxc://hackliberty.org/LeTsthiOdqoNnjOjqWjxWMAI", 78 | "mxc://half-shot.uk/81696e31e533651fb9e44ce351b4201151042acd", 79 | "mxc://jacksonchen666.com/pQoQssnTIGKOYHpcWUmYpdsQ", 80 | "mxc://jameskitt616.one/pBZDFcMKCjVjkrTMgMykKpTi", 81 | "mxc://jboi.nl/dvVWQixQMJyIQoaLFqFTTpsE", 82 | "mxc://jki.re/NBtxUkzjXpmdsGychrevxsaB", 83 | "mxc://lant.uk/MVZeSTcVlpNiDToBuKgyQfIK", 84 | 
"mxc://librepush.net/WbEnGmxZGKJyHqbojduVeatQ", 85 | "mxc://littlevortex.net/jSNRNEyKLRnzYEpsODAUznIZ", 86 | "mxc://luebke.io/imaijIHMncPjQqYRLtByZRzX", 87 | "mxc://matrix.0x45.moe/PwcDRntlwelLMuofemYarmqx", 88 | "mxc://matrix.atommac.com/cAycTPLQEkgtZSlZlRlZXoTx", 89 | "mxc://matrix.clandestine.network/JpKsGDMkNnSkfQqUdFuoBkFy", 90 | "mxc://matrix.eclabs.de/KyXZzZTeJyhQDBkqGBcKWyBp", 91 | "mxc://matrix.f5.htw-berlin.de/LosKszHTJgwslbvrTvNWanwE", 92 | "mxc://matrix.kevwe.se/PXHQcmOahOjAJoTouFBmevfj", 93 | "mxc://matrix.m0dex.eu/c2qHa8jqd86MdKplo1VQamYOhkMxkGEl", 94 | "mxc://matrix.org/AokEDpMKDROUmGwuoErhRIxv", 95 | "mxc://matrix.org/BORiLtSOEUnZiwCcaJftvxxm", 96 | "mxc://matrix.org/BugjUgdADNUndQASgkYDHogL", 97 | "mxc://matrix.org/CLtgiPGknzEpKDiyOrUedmEc", 98 | "mxc://matrix.org/DIGiJjzKkVsWwpppAcrGRwzB", 99 | "mxc://matrix.org/DrLDzhkVYvGjfCiUBLkrYLhs", 100 | "mxc://matrix.org/EbNOzLZJdNszNDDfDrPFvTTx", 101 | "mxc://matrix.org/FEzUmMhxMsqtfXKyYQFDROgO", 102 | "mxc://matrix.org/FVaBPAAuzqpBstuOfxDhDuiw", 103 | "mxc://matrix.org/FwXVuHOTPCJOZwjuunyMoDvw", 104 | "mxc://matrix.org/GBWoKBFhozIJcuuXzgAmESMh", 105 | "mxc://matrix.org/GadiqrOaESCBOpqEspzaFHZZ", 106 | "mxc://matrix.org/GbfNYPPXYfpYDGCPnxEOZACq", 107 | "mxc://matrix.org/HcOKfHoyUseJyNvJCZbySygK", 108 | "mxc://matrix.org/HjVgrKzDUXKrzYMDvtglFdvy", 109 | "mxc://matrix.org/IssHdyiXMcSnRCxCzqoaocGL", 110 | "mxc://matrix.org/JEPcTsDZpImzoyVdKHfeiUlK", 111 | "mxc://matrix.org/JQXLHcWNbcbQBMEWebxQPiPT", 112 | "mxc://matrix.org/JUFinhjLVhQhAmzsSpSaPFiT", 113 | "mxc://matrix.org/JUssqTzHorMXUbeaulQUNjTm", 114 | "mxc://matrix.org/KfkLMomWWjVZMbgVCKisfFPy", 115 | "mxc://matrix.org/LWCDUbJGEqfXWbuACLYPzpMM", 116 | "mxc://matrix.org/LfpqILSYnaIQDnCqGgrryaVA", 117 | "mxc://matrix.org/LlsgPelTpiYvvEgjbqKzefbr", 118 | "mxc://matrix.org/MKYSaqghosWAaMkfOTGqAXWu", 119 | "mxc://matrix.org/MSSWISKFrXqYAWwVZpgQzKNc", 120 | "mxc://matrix.org/MhFPyrortOJyjvIArZYRJNpd", 121 | "mxc://matrix.org/MohmbgPyrsnuKIYJivBLhnaJ", 122 | 
"mxc://matrix.org/MygYRbllJEcOXaGOySOEYMJc", 123 | "mxc://matrix.org/NZGChxcCXbBvgkCNZTLXlpux", 124 | "mxc://matrix.org/OVXDqAESXvavwJINbuwBeIHy", 125 | "mxc://matrix.org/PQWXmVjsGPqEgItiYEISwDzI", 126 | "mxc://matrix.org/QqFWSwNSKvlljlNZKBGrqCKR", 127 | "mxc://matrix.org/QsaeAloXAKVPsiczXtIBJzrZ", 128 | "mxc://matrix.org/RMMTwRenYWLPdRwIHlwuGCLG", 129 | "mxc://matrix.org/RnAJViaJiNHcGtTZgbRWXqlB", 130 | "mxc://matrix.org/SUpOMAcbPcYBaUnDikHYJOjh", 131 | "mxc://matrix.org/TGopDZiMVyhwhQBuEbUeFOKt", 132 | "mxc://matrix.org/TLEyVAuatPchpWniJrgmjUcU", 133 | "mxc://matrix.org/TlumUuzCcCGHSUMXNJmAFLML", 134 | "mxc://matrix.org/TpxNfvaFAAoZWdhwoYBHQezB", 135 | "mxc://matrix.org/VpjGllthGpjTPkvbJgOdyxkF", 136 | "mxc://matrix.org/WWvqnsZlhzWvPylUjdfhmrOV", 137 | "mxc://matrix.org/XBkKJIaWeXdfoYwMZsQWKjzj", 138 | "mxc://matrix.org/XmiRUvkkKjmTseRYrmBlvGNw", 139 | "mxc://matrix.org/XnDebYmBmnBBNeyBiUKltVlh", 140 | "mxc://matrix.org/XxylKIkLFThmHZjBMvCmipRT", 141 | "mxc://matrix.org/YtCeQeNxqnKsLvIcnwKIMlkV", 142 | "mxc://matrix.org/ZJIdWuBIRhObjOHVnoWfBUkq", 143 | "mxc://matrix.org/ZafPzsxMJtLaSaJXloBEKiws", 144 | "mxc://matrix.org/bCawIGTEGxaXxDIxIqteAhVU", 145 | "mxc://matrix.org/bDayqThxTIcGNcskzIADknRv", 146 | "mxc://matrix.org/bEVwopEQDMNjfzbiPKYgZXWU", 147 | "mxc://matrix.org/bHNoSLOERjdQrUodZUIFYAQl", 148 | "mxc://matrix.org/bSYOldVxWNFeulNUshiOSvlM", 149 | "mxc://matrix.org/bcBGBuKkVBITyyfjLHLVrPKj", 150 | "mxc://matrix.org/bipAEyCRqzXokNjHcDwbWXkO#auto", 151 | "mxc://matrix.org/cKhTXJzIZZjHfNRbNJHjxSxw", 152 | "mxc://matrix.org/cZEhMcslgpUJdTNMIuQSEukn", 153 | "mxc://matrix.org/djdngehyFuFlApXWpYotALoK", 154 | "mxc://matrix.org/eeSkBZDfQavoKeXjWhUGOCrI", 155 | "mxc://matrix.org/fJYvrULeLqUSuOFFhvAuPbVB", 156 | "mxc://matrix.org/gJNPpakWLvKGUYteErJnbqRw", 157 | "mxc://matrix.org/iNUefSlAXjkdNzXyVaYjiiTK", 158 | "mxc://matrix.org/jRqrnjimPBqTSSdJlOupMqSx", 159 | "mxc://matrix.org/jVqDFNtFnwfXedjMKZLgtnsY", 160 | "mxc://matrix.org/kOewGAJWihuVeafiSwgLeiJa", 161 | 
"mxc://matrix.org/lyWZOWsBRhCcxKRgVUbDdtux", 162 | "mxc://matrix.org/mhuskbkCQPvAXCCoZMMcUltg", 163 | "mxc://matrix.org/nKpRPUortweIAocZOKakSmle", 164 | "mxc://matrix.org/nwWAiyZHhWuATgUqhXSUgyOq", 165 | "mxc://matrix.org/oUxxDyzQOHdVDMxgwFzyCWEe", 166 | "mxc://matrix.org/oqUhSAlhShWRUoOypviZYzCl", 167 | "mxc://matrix.org/owHbMxnvtZQhORPMIjEMhHJC", 168 | "mxc://matrix.org/paFLquBfsoSUMExpgOePaYGn", 169 | "mxc://matrix.org/pcyhRmMTlUPZNUWLBrrBYOUF", 170 | "mxc://matrix.org/qCJQIqJLUntAlQjvjVqqkISE", 171 | "mxc://matrix.org/qyoRKkkSwwqoaseeRDCWGmgL", 172 | "mxc://matrix.org/rAtNyCxKhZKYjIpCMTMVIyZb", 173 | "mxc://matrix.org/stXVscjfSSwEGcpNUOaTOmuw", 174 | "mxc://matrix.org/tmemWZxwaiSRLneppvjscbSv", 175 | "mxc://matrix.org/uFsobEhOojpEXTORyXJznvMf", 176 | "mxc://matrix.org/wEydarIdYNQoHHnOpfYGQAkZ", 177 | "mxc://matrix.org/xppypIFIDuFCqmdJHGjTuRsk", 178 | "mxc://matrix.org/yAEcXFYGUHsLALuVuHtqgsPk", 179 | "mxc://matrix.org/yCdHqfZAMYzGsSeCYODLGNJQ", 180 | "mxc://matrix.org/zRHixRxWSlriuAyCEqxKcsUN", 181 | "mxc://matrix.tarina.org/yQAGQhgyZtbJDzoCxcUoNlte", 182 | "mxc://maunium.net/jdlSfvudiMSmcRrleeiYjjFO", 183 | "mxc://mccarty.io/uCPFlUrLVWMrjuZVDnlIzIoI", 184 | "mxc://medienhaus.dev/RSWiRFctJPQRAfLGfUTIWqCo", 185 | "mxc://moritzdietz.com/oPOkWTlBWdTFbwXuGZNxbpAU", 186 | "mxc://mozilla.org/66d994693725ea09256c22ac43b0e74e79f1abb4", 187 | "mxc://mpl.mpg.de/lxwOKWWbfwlGxAMKhNIfiJRR", 188 | "mxc://msg-net.de/uqthdSIKEsmLlAnrguhOBSRg", 189 | "mxc://mx.anismk.de/hjKAFiGKMasHOCdEVPsmoozA", 190 | "mxc://mx.grupotd.nat.cu/ZfxNoISumlPZZEHqRNbhewQW", 191 | "mxc://neko.dev/wLFwLqbnyvrstuomVXdKMqyJ", 192 | "mxc://nevarro.space/WmGsIGgESPTtJFskYIXdRlVM", 193 | "mxc://obermui.de/pCkwyNUtzdnaImzuqbsaJCgV", 194 | "mxc://perthchat.org/sNAywRrlPKygmkoxpfxSTrFz", 195 | "mxc://pixelplanet.fun/xfxdQZvpLePdlNcRIjoFovPE", 196 | "mxc://pixie.town/fq3MchyYAMzpCkfxbqr9WffR", 197 | "mxc://pixie.town/qBpNzYpOknBxnSdcbFWrbqWT", 198 | "mxc://raim.ist/oInPkqchozNTmIOeUXlCsFbp", 199 | 
"mxc://riot.ovh/PJxWnOsjdnIpkByXMFJVGZgE", 200 | "mxc://rs485.network/XpMPNjUVJmwwVQyaVtkAjpfl", 201 | "mxc://scamdemic.wtf/WFPdCxatgVIQcYOqkWDKVsXP", 202 | "mxc://seymour.family/ZlzrDJSjRnQYuWJGvhdCkyiS", 203 | "mxc://shiina.family/zxIxLfIyoXTeclPZznmIdRli", 204 | "mxc://simonatherley.com/nYEzJcoThHfARGPSkHXRGapn", 205 | "mxc://skyforge.at/RExFPAnBOsbCqFZIFHAESyKQ", 206 | "mxc://stratum0.org/FKcEkoEcEutsdRUaPjQitDwo", 207 | "mxc://sw1v.org/rARZrbDMGnNQOKKWZtCVxusq", 208 | "mxc://t2l.io/fYhaPLjAZLwEYqaSGKwRpQgk", 209 | "mxc://that.host/QbAhNvUApAEpvCKNWtIZwjCO", 210 | "mxc://the-apothecary.club/HScGQAQKwuQbbdNkLYoPpsNb", 211 | "mxc://tout.im/VQpPnZfufsMWerGlxkupbtYo", 212 | "mxc://uhoreg.ca/JbcxMQHvPoPUoRkwQRdmwXKm", 213 | "mxc://veganism.social/dDVjvEJugTUfWfiavHKhvCxi", 214 | "mxc://wi11.co.uk/DztCMbxBfOUrmklICETzYOEJ", 215 | "mxc://yaal.coop/BviDGOwocxQQNndowuZmhxGr", 216 | ] 217 | 218 | 219 | async def request_media(session: aiohttp.ClientSession, media_url: str) -> int: 220 | media_id = media_url.removeprefix("mxc://") 221 | url = f"http://localhost:8080/_matrix/media_proxy/unstable/download/{media_id}" 222 | 223 | # timeout = aiohttp.ClientTimeout(total=10) 224 | async with session.get(url) as response: 225 | await response.read() 226 | if "-v" not in sys.argv: 227 | # Simple progress meter 228 | print(".", end="", flush=True) 229 | 230 | return response.status 231 | 232 | 233 | async def main() -> None: 234 | perfdir = os.path.dirname(__file__) 235 | os.makedirs(os.path.join(perfdir, "temp"), exist_ok=True) 236 | 237 | print(f"number of URLs: {len(AVATAR_URLS_TAKEN_FROM_SYNAPSE_DEV)}") 238 | 239 | server = None 240 | try: 241 | server = subprocess.Popen( 242 | args=[ 243 | sys.executable, 244 | "-m", 245 | "matrix_content_scanner.mcs", 246 | "-c", 247 | "config.yaml", 248 | ], 249 | cwd=perfdir, 250 | stdin=subprocess.DEVNULL, 251 | stdout=None if "-v" in sys.argv else subprocess.DEVNULL, 252 | stderr=None if "-v" in sys.argv else subprocess.DEVNULL, 253 | ) 
254 | 255 | # Give server time to startup 256 | time.sleep(0.5) 257 | 258 | await run_test() 259 | # Run test a second time, now that caches have warmed up 260 | await run_test() 261 | finally: 262 | if server is not None: 263 | server.terminate() 264 | print("Server return code:", server.returncode) 265 | 266 | 267 | async def run_test() -> None: 268 | failed = False 269 | start = timer() 270 | try: 271 | async with aiohttp.ClientSession() as session: 272 | requests = [] 273 | for url in AVATAR_URLS_TAKEN_FROM_SYNAPSE_DEV: 274 | requests.append(asyncio.ensure_future(request_media(session, url))) 275 | 276 | statuses = await asyncio.gather(*requests) 277 | print() 278 | print("Status codes from scanner server:", collections.Counter(statuses)) 279 | except Exception: 280 | traceback.print_exc() 281 | failed = True 282 | finally: 283 | end = timer() 284 | duration = end - start 285 | print(f"{'Failed' if failed else 'Succeeded'} in {duration:.2f}s") 286 | 287 | 288 | if __name__ == "__main__": 289 | asyncio.run(main()) 290 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core", "wheel", "setuptools-rust"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.ruff] 6 | line-length = 88 7 | target-version = "py38" 8 | 9 | [tool.ruff.lint] 10 | # See https://beta.ruff.rs/docs/rules/#error-e 11 | # for error codes. The ones we ignore are: 12 | # E501: line too long (we don't normally run this check in other projects such as Synapse) 13 | # E731: do not assign a lambda expression, use a def 14 | # 15 | # flake8-bugbear compatible checks. 
Its error codes are described at 16 | # https://beta.ruff.rs/docs/rules/#flake8-bugbear-b 17 | # B023: Functions defined inside a loop must not use variables redefined in the loop 18 | ignore = [ 19 | "B023", 20 | "E501", 21 | "E731", 22 | ] 23 | select = [ 24 | # pycodestyle 25 | "E", 26 | "W", 27 | # pyflakes 28 | "F", 29 | # isort 30 | "I001", 31 | # flake8-bugbear 32 | "B0", 33 | # flake8-comprehensions 34 | "C4", 35 | # flake8-2020 36 | "YTT", 37 | # flake8-slots 38 | "SLOT", 39 | # flake8-debugger 40 | "T10", 41 | # flake8-pie 42 | "PIE", 43 | # flake8-executable 44 | "EXE", 45 | ] 46 | 47 | [tool.ruff.lint.isort] 48 | combine-as-imports = true 49 | section-order = ["future", "standard-library", "third-party", "twisted", "first-party", "testing", "local-folder"] 50 | known-first-party = ["matrix_content_scanner"] 51 | 52 | [tool.ruff.lint.isort.sections] 53 | twisted = ["twisted", "OpenSSL"] 54 | testing = ["tests"] 55 | 56 | [tool.ruff.format] 57 | quote-style = "double" 58 | indent-style = "space" 59 | skip-magic-trailing-comma = false 60 | line-ending = "auto" 61 | 62 | [tool.maturin] 63 | manifest-path = "rust/Cargo.toml" 64 | module-name = "matrix_content_scanner.mcs_rust" 65 | 66 | [tool.poetry] 67 | name = "matrix_content_scanner" 68 | version = "1.2.1" 69 | description = "A web service for scanning media hosted by a Matrix media repository" 70 | authors = ["Element Backend Team "] 71 | readme = "README.md" 72 | license = "AGPL-3.0-only OR LicenseRef-Element-Commercial" 73 | # Python version and licence classifiers are set automatically by Poetry 74 | classifiers = [] 75 | include = [ 76 | { path = "mypy.ini", format = "sdist" }, 77 | { path = "scripts-dev", format = "sdist" }, 78 | { path = "tests", format = "sdist" }, 79 | { path = "Cargo.toml", format = "sdist" }, 80 | { path = "Cargo.lock", format = "sdist" }, 81 | { path = "rust/Cargo.toml", format = "sdist" }, 82 | { path = "rust/build.rs", format = "sdist" }, 83 | { path = "rust/src/**", format 
= "sdist" }, 84 | ] 85 | exclude = [ 86 | { path = "src/*.so", format = "sdist"} 87 | ] 88 | 89 | [tool.poetry.dependencies] 90 | python = "^3.10.0" 91 | attrs = ">=19.2.0" 92 | aiohttp = ">=3.8.0" 93 | jsonschema = ">=4.23.0" 94 | pyyaml = ">=5.1.1" 95 | # Required for decrypting files" 96 | python-magic = ">=0.4.15,<0.5" 97 | # Required for maintaining the result cache. 98 | cachetools = ">=5.4.0" 99 | # Required for processing user-defined values such as durations or sizes. 100 | humanfriendly = ">=10.0" 101 | # Required for calculating cache keys deterministically. Type annotations aren't 102 | # discoverable in versions older than 1.6.3. 103 | canonicaljson = ">=1.6.3" 104 | setuptools_rust = ">=1.3" 105 | 106 | [tool.poetry.dev-dependencies] 107 | # for linting and formatting 108 | ruff = "^0.7.2" 109 | # for type checking 110 | mypy = "*" 111 | types-jsonschema = ">=3.2.0" 112 | types-PyYAML = ">=5.4.10" 113 | types-cachetools = "*" 114 | types-humanfriendly = "*" 115 | 116 | [tool.poetry.build] 117 | script = "build_rust.py" 118 | generate-setup-file = true 119 | 120 | [tool.poetry.urls] 121 | homepage = "https://github.com/element-hq/matrix-content-scanner" 122 | documentation = "https://github.com/element-hq/matrix-content-scanner/blob/main/README.md" 123 | repository = "https://github.com/element-hq/matrix-content-scanner.git" 124 | 125 | [tool.poetry.scripts] 126 | matrix-content-scanner = "matrix_content_scanner.mcs:main" 127 | -------------------------------------------------------------------------------- /rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "matrix_content_scanner" 3 | # dummy version. See pyproject.toml for the actual version number. 
4 | version = "0.1.0" 5 | edition = "2021" 6 | publish = false 7 | 8 | [lib] 9 | name = "matrix_content_scanner" 10 | crate-type = ["lib", "cdylib"] 11 | 12 | [dependencies] 13 | anyhow = "1.0.63" 14 | lazy_static = "1.4.0" 15 | log = "0.4.17" 16 | matrix-sdk-crypto = "0.7.2" 17 | pyo3 = { version = "0.21.0", features = [ 18 | "macros", 19 | "anyhow", 20 | "abi3", 21 | "abi3-py38", 22 | ] } 23 | pyo3-log = "0.10.0" 24 | pythonize = "0.21.0" 25 | serde_json = "1.0.85" 26 | vodozemac = { git = "https://github.com/matrix-org/vodozemac.git", features = ["insecure-pk-encryption"] } 27 | 28 | [features] 29 | extension-module = ["pyo3/extension-module"] 30 | default = ["extension-module"] 31 | 32 | [build-dependencies] 33 | blake2 = "0.10.4" 34 | hex = "0.4.3" 35 | -------------------------------------------------------------------------------- /rust/build.rs: -------------------------------------------------------------------------------- 1 | //! This build script calculates the hash of all files in the `src/` 2 | //! directory and adds it as an environment variable during build time. 3 | //! 4 | //! This is used so that the python code can detect when the built native module 5 | //! does not match the source in-tree, helping to detect the case where the 6 | //! source has been updated but the library hasn't been rebuilt. 7 | 8 | use std::path::PathBuf; 9 | 10 | use blake2::{Blake2b512, Digest}; 11 | 12 | fn main() -> Result<(), std::io::Error> { 13 | let mut dirs = vec![PathBuf::from("src")]; 14 | 15 | let mut paths = Vec::new(); 16 | while let Some(path) = dirs.pop() { 17 | let mut entries = std::fs::read_dir(path)? 
18 | .map(|res| res.map(|e| e.path())) 19 | .collect::, std::io::Error>>()?; 20 | 21 | entries.sort(); 22 | 23 | for entry in entries { 24 | if entry.is_dir() { 25 | dirs.push(entry); 26 | } else { 27 | paths.push(entry.to_str().expect("valid rust paths").to_string()); 28 | } 29 | } 30 | } 31 | 32 | paths.sort(); 33 | 34 | let mut hasher = Blake2b512::new(); 35 | 36 | for path in paths { 37 | let bytes = std::fs::read(path)?; 38 | hasher.update(bytes); 39 | } 40 | 41 | let hex_digest = hex::encode(hasher.finalize()); 42 | println!("cargo:rustc-env=MCS_RUST_DIGEST={hex_digest}"); 43 | 44 | Ok(()) 45 | } 46 | -------------------------------------------------------------------------------- /rust/src/crypto/mod.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | borrow::Cow, 3 | fs, 4 | io::{Cursor, ErrorKind, Read}, 5 | }; 6 | 7 | use anyhow::{Context, Error}; 8 | use matrix_sdk_crypto::AttachmentDecryptor; 9 | use pyo3::{ 10 | prelude::*, 11 | types::{PyBytes, PyDict}, 12 | }; 13 | use pythonize::depythonize_bound; 14 | use vodozemac::{ 15 | base64_encode, 16 | pk_encryption::{self, PkDecryption}, 17 | Curve25519PublicKey, 18 | }; 19 | 20 | /// Called when registering modules with python. 
21 | pub fn register_module(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { 22 | let child_module = PyModule::new_bound(py, "crypto")?; 23 | child_module.add_class::()?; 24 | child_module.add_class::()?; 25 | child_module.add_function(wrap_pyfunction!(decrypt_attachment, &child_module)?)?; 26 | 27 | m.add_submodule(&child_module)?; 28 | 29 | Ok(()) 30 | } 31 | 32 | #[pyclass(frozen)] 33 | pub struct CryptoHandler { 34 | decryptor: PkDecryption, 35 | } 36 | 37 | #[pymethods] 38 | impl CryptoHandler { 39 | #[new] 40 | pub fn py_new(pickle_key: &str, pickle_path: &str) -> Result { 41 | match fs::read_to_string(pickle_path) { 42 | Ok(pickle) => { 43 | let decryptor = PkDecryption::from_libolm_pickle(&pickle, pickle_key.as_bytes())?; 44 | 45 | log::info!("Loaded Olm key pair from pickle file {}", pickle_path); 46 | 47 | Ok(Self { decryptor }) 48 | } 49 | Err(e) if e.kind() == ErrorKind::NotFound => { 50 | log::info!( 51 | "Pickle file not found, generating a new Olm key pair and storing it in pickle file {}", 52 | pickle_path, 53 | ); 54 | 55 | let decryptor = PkDecryption::new(); 56 | let pickle = decryptor.to_libolm_pickle(pickle_key.as_bytes())?; 57 | fs::write(pickle_path, pickle)?; 58 | Ok(Self { decryptor }) 59 | } 60 | Err(e) => { 61 | Err(e).with_context(|| format!("Failed to read the pickle file at the location configured for crypto.pickle_path ({pickle_path})")) 62 | } 63 | } 64 | } 65 | 66 | #[getter] 67 | pub fn public_key(&self) -> String { 68 | self.decryptor.public_key().to_base64() 69 | } 70 | 71 | pub fn decrypt_body( 72 | &self, 73 | ciphertext: &str, 74 | mac: &str, 75 | ephemeral: &str, 76 | ) -> Result { 77 | let message = pk_encryption::Message::from_base64(ciphertext, mac, ephemeral)?; 78 | let decrypted = self.decryptor.decrypt(&message)?; 79 | let decrypted = 80 | String::from_utf8(decrypted).context("Decrypted message isn't valid UTF-8")?; 81 | Ok(decrypted) 82 | } 83 | 84 | pub fn encrypt(&self, public_key: &str, payload: &str) -> 
Result { 85 | let encryptor = 86 | pk_encryption::PkEncryption::from_key(Curve25519PublicKey::from_base64(public_key)?); 87 | Ok(PkMessage(encryptor.encrypt(payload.as_bytes()))) 88 | } 89 | } 90 | 91 | #[pyclass(frozen)] 92 | pub struct PkMessage(pk_encryption::Message); 93 | 94 | #[pymethods] 95 | impl PkMessage { 96 | #[getter] 97 | pub fn ephemeral_key(&self) -> String { 98 | self.0.ephemeral_key.to_base64() 99 | } 100 | 101 | #[getter] 102 | pub fn mac(&self) -> String { 103 | base64_encode(&self.0.mac) 104 | } 105 | 106 | #[getter] 107 | pub fn ciphertext(&self) -> String { 108 | base64_encode(&self.0.ciphertext) 109 | } 110 | } 111 | 112 | #[pyfunction] 113 | pub fn decrypt_attachment( 114 | body: Bound<'_, PyBytes>, 115 | key_info: Bound<'_, PyDict>, 116 | ) -> Result, Error> { 117 | let mut cursor = Cursor::new(body.as_bytes()); 118 | let info = 119 | depythonize_bound(key_info.into_any()).context("Failed parsing supplied key info")?; 120 | 121 | let mut decryptor = AttachmentDecryptor::new(&mut cursor, info)?; 122 | let mut decrypted_data = Vec::new(); 123 | 124 | decryptor.read_to_end(&mut decrypted_data)?; 125 | 126 | Ok(Cow::Owned(decrypted_data)) 127 | } 128 | -------------------------------------------------------------------------------- /rust/src/lib.rs: -------------------------------------------------------------------------------- 1 | use lazy_static::lazy_static; 2 | use pyo3::prelude::*; 3 | use pyo3_log::ResetHandle; 4 | 5 | pub mod crypto; 6 | 7 | lazy_static! { 8 | static ref LOGGING_HANDLE: ResetHandle = pyo3_log::init(); 9 | } 10 | 11 | /// Returns the hash of all the rust source files at the time it was compiled. 12 | /// 13 | /// Used by python to detect if the rust library is outdated. 14 | #[pyfunction] 15 | fn get_rust_file_digest() -> &'static str { 16 | env!("MCS_RUST_DIGEST") 17 | } 18 | 19 | /// Reset the cached logging configuration of pyo3-log to pick up any changes 20 | /// in the Python logging configuration. 
21 | /// 22 | #[pyfunction] 23 | fn reset_logging_config() { 24 | LOGGING_HANDLE.reset(); 25 | } 26 | 27 | /// The entry point for defining the Python module. 28 | #[pymodule] 29 | fn mcs_rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { 30 | m.add_function(wrap_pyfunction!(get_rust_file_digest, m)?)?; 31 | m.add_function(wrap_pyfunction!(reset_logging_config, m)?)?; 32 | 33 | crypto::register_module(py, m)?; 34 | 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /scripts-dev/lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Runs linting scripts and type checking 3 | # ruff - sorts import statements, lints and finds mistakes, formats the code 4 | # mypy - checks type annotations 5 | 6 | set -e 7 | 8 | files=( 9 | "perf" 10 | "src" 11 | "tests" 12 | ) 13 | 14 | # Print out the commands being run 15 | set -x 16 | 17 | # Catch any common programming mistakes in Python code. 18 | # --quiet suppresses the update check. 19 | ruff check --quiet --fix "${files[@]}" 20 | 21 | # Reformat Python code. 22 | ruff format --quiet "${files[@]}" 23 | 24 | # Type-check the code. 25 | mypy "${files[@]}" 26 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
"""Parsing and validation of the Matrix Content Scanner's configuration."""
from typing import Any, Dict, List, Optional, Union

import attr
import humanfriendly
from jsonschema import ValidationError, validate

from matrix_content_scanner.utils.errors import ConfigError

_ONE_WEEK_SECONDS = 604800.0


def _parse_duration(duration: Optional[Union[str, int, float]]) -> Optional[float]:
    """Parse a time duration into a float representing an amount of seconds.

    Args:
        duration: The duration to parse: None, a bare number of seconds, or a
            humanfriendly duration string such as "2d" or "1w".

    Returns:
        The number of seconds in the given duration, or None if given None.

    Raises:
        ConfigError: The duration string could not be parsed.
    """
    if duration is None:
        return None
    # The schema allows JSON "number" here, which YAML may load as an int as
    # well as a float; accept both rather than handing an int to humanfriendly.
    if isinstance(duration, (int, float)):
        return float(duration)

    try:
        return humanfriendly.parse_timespan(duration)
    except humanfriendly.InvalidTimespan as e:
        raise ConfigError(str(e)) from e


def _parse_size(size: Optional[Union[str, int, float]]) -> Optional[float]:
    """Parse a file size into a float representing the number of bytes for that
    size.

    Args:
        size: The size to parse: None, a bare number of bytes, or a
            humanfriendly size string such as "512K" or "10MB".

    Returns:
        The number of bytes represented by the given size, or None if given None.

    Raises:
        ConfigError: The size string could not be parsed.
    """
    if size is None:
        return None
    # Same as in _parse_duration: the schema allows ints as well as floats.
    if isinstance(size, (int, float)):
        return float(size)

    try:
        return humanfriendly.parse_size(size)
    except humanfriendly.InvalidSize as e:
        raise ConfigError(str(e)) from e


# Schema to validate the raw configuration dictionary against.
_config_schema = {
    "type": "object",
    "required": ["web", "scan", "crypto"],
    "additionalProperties": False,
    "properties": {
        "web": {
            "type": "object",
            "required": ["host", "port"],
            "additionalProperties": False,
            "properties": {
                "host": {"type": "string"},
                "port": {"type": "integer"},
            },
        },
        "scan": {
            "type": "object",
            "required": ["script", "temp_directory"],
            "additionalProperties": False,
            "properties": {
                "script": {"type": "string"},
                "temp_directory": {"type": "string"},
                "removal_command": {"type": "string"},
                "allowed_mimetypes": {"type": "array", "items": {"type": "string"}},
                "blocked_mimetypes": {"type": "array", "items": {"type": "string"}},
            },
        },
        "download": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "base_homeserver_url": {"type": "string"},
                "proxy": {"type": "string"},
                "additional_headers": {
                    "type": "object",
                    "additionalProperties": {"type": "string"},
                },
            },
        },
        "crypto": {
            "type": "object",
            "required": ["pickle_path", "pickle_key"],
            "additionalProperties": False,
            "properties": {
                "pickle_path": {"type": "string"},
                "pickle_key": {"type": "string"},
            },
        },
        "result_cache": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "max_size": {"type": "integer"},
                "ttl": {"type": ["string", "number"]},
                "exit_codes_to_ignore": {
                    "type": "array",
                    "items": {"type": "integer"},
                },
                "max_file_size": {"type": ["string", "number"]},
            },
        },
    },
}


@attr.s(auto_attribs=True, frozen=True, slots=True)
class WebConfig:
    """Configuration for serving the HTTP API."""

    host: str
    port: int


@attr.s(auto_attribs=True, frozen=True, slots=True)
class ScanConfig:
    """Configuration for scanning files."""

    script: str
    temp_directory: str
    removal_command: str = "rm"
    allowed_mimetypes: Optional[List[str]] = None
    blocked_mimetypes: Optional[List[str]] = None


@attr.s(auto_attribs=True, frozen=True, slots=True)
class ResultCacheConfig:
    """Configuration for caching scan results."""

    max_size: int = 1024
    ttl: float = attr.ib(default=_ONE_WEEK_SECONDS, converter=_parse_duration)
    exit_codes_to_ignore: Optional[List[int]] = None
    max_file_size: Optional[float] = attr.ib(default=None, converter=_parse_size)


@attr.s(auto_attribs=True, frozen=True, slots=True)
class DownloadConfig:
    """Configuration for downloading files."""

    base_homeserver_url: Optional[str] = None
    proxy: Optional[str] = None
    additional_headers: Optional[Dict[str, str]] = None


@attr.s(auto_attribs=True, frozen=True, slots=True)
class CryptoConfig:
    """Configuration for decrypting encrypted bodies."""

    pickle_path: str
    pickle_key: str


class MatrixContentScannerConfig:
    """The parsed configuration for the Matrix Content Scanner.

    Validates the raw dictionary against the JSON schema, then exposes each
    section as a typed, frozen attrs object.

    Args:
        config_dict: The raw configuration dictionary (e.g. loaded from YAML).

    Raises:
        ConfigError: The configuration is malformed or fails validation.
    """

    def __init__(self, config_dict: Dict[str, Any]):
        if not isinstance(config_dict, dict):
            raise ConfigError("Bad configuration format")

        try:
            validate(config_dict, _config_schema)
        except ValidationError as e:
            raise ConfigError(e.message) from e

        self.web = WebConfig(**(config_dict.get("web") or {}))
        self.scan = ScanConfig(**(config_dict.get("scan") or {}))
        self.crypto = CryptoConfig(**(config_dict.get("crypto") or {}))
        self.download = DownloadConfig(**(config_dict.get("download") or {}))
        self.result_cache = ResultCacheConfig(**(config_dict.get("result_cache") or {}))

        # Don't allow both allowlist and blocklist for MIME types, since we do not document
        # the semantics for that and it is in any case pointless.
        # This could have been expressed in JSONSchema but I suspect the error message would be poor
        # in that case.
        if (
            self.scan.allowed_mimetypes is not None
            and self.scan.blocked_mimetypes is not None
        ):
            raise ConfigError(
                "Both `scan.allowed_mimetypes` and `scan.blocked_mimetypes` are specified, which is not allowed!"
            )
import logging
from typing import TYPE_CHECKING, Awaitable, Callable

from aiohttp import web

from matrix_content_scanner.servlets.download import DownloadHandler
from matrix_content_scanner.servlets.public_key import PublicKeyHandler
from matrix_content_scanner.servlets.scan import ScanHandler
from matrix_content_scanner.servlets.thumbnail import ThumbnailHandler

if TYPE_CHECKING:
    from matrix_content_scanner.mcs import MatrixContentScanner

logger = logging.getLogger(__name__)

# aiohttp route pattern capturing the rest of the path (server_name/media_id) as
# the "media_path" match.
_MEDIA_PATH_REGEXP = r"/{media_path:.+}"

# Headers appended to every response (and to OPTIONS preflight responses).
_CORS_HEADERS = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET, POST, OPTIONS",
    "Access-Control-Allow-Headers": "Origin, X-Requested-With, Content-Type, Accept, Authorization",
}


@web.middleware
async def simple_cors_middleware(
    request: web.Request,
    handler: Callable[[web.Request], Awaitable[web.StreamResponse]],
) -> web.StreamResponse:
    """A simple aiohttp middleware that adds CORS headers to responses, and handles
    OPTIONS requests.

    Args:
        request: The request to handle.
        handler: The handler for this request.

    Returns:
        A response with CORS headers.
    """
    if request.method == "OPTIONS":
        # We don't register routes for OPTIONS requests, therefore the handler we're given
        # in this case just raises a 405 Method Not Allowed status using an exception.
        # Because we actually want to return a 200 OK with additional headers, we ignore
        # the handler and just return a new response.
        response = web.StreamResponse(
            status=200,
            headers=_CORS_HEADERS,
        )
        return response

    # Run the request's handler and append CORS headers to it.
    response = await handler(request)
    response.headers.update(_CORS_HEADERS)
    return response


@web.middleware
async def json_errors_middleware(
    request: web.Request,
    handler: Callable[[web.Request], Awaitable[web.StreamResponse]],
) -> web.StreamResponse:
    """A simple aiohttp middleware that converts 404/405 errors into Matrix JSON error.

    Args:
        request: The request to handle.
        handler: The handler for this request.

    Returns:
        The original response OR a JSON error response.
    """
    # Run the request's handler, turning "not found"/"method not allowed" errors
    # into Matrix-style JSON error responses.
    try:
        return await handler(request)
    except (web.HTTPNotFound, web.HTTPMethodNotAllowed) as ex:
        # Return the proper JSON response.
        return web.json_response(
            {"errcode": "M_UNRECOGNIZED", "error": "Unrecognized request"},
            status=ex.status,
        )


class HTTPServer:
    """Serves the content scanner's HTTP API with aiohttp."""

    def __init__(self, mcs: "MatrixContentScanner"):
        self._mcs = mcs
        self._bind_address = mcs.config.web.host
        self._bind_port = mcs.config.web.port

        self._app = self._build_app()

    def _build_app(self) -> web.Application:
        """Build the aiohttp app and attach all the handlers to it.

        Returns:
            The built aiohttp application.
        """
        # First we build an application with all routes defined on the root path.
        app = web.Application()

        scan_handler = ScanHandler(self._mcs)
        download_handler = DownloadHandler(self._mcs)
        thumbnail_handler = ThumbnailHandler(self._mcs)
        public_key_handler = PublicKeyHandler(self._mcs)

        app.add_routes(
            [
                web.get("/scan" + _MEDIA_PATH_REGEXP, scan_handler.handle_plain),
                web.post("/scan_encrypted", scan_handler.handle_encrypted),
                web.get(
                    "/download" + _MEDIA_PATH_REGEXP, download_handler.handle_plain
                ),
                web.post("/download_encrypted", download_handler.handle_encrypted),
                web.get(
                    "/thumbnail" + _MEDIA_PATH_REGEXP,
                    thumbnail_handler.handle_thumbnail,
                ),
                web.get(
                    "/public_key",
                    public_key_handler.handle_public_key,
                ),
            ]
        )

        # Then we create a root application, and define the app we previously created as
        # a subapp on the base path for the content scanner API.
        root = web.Application(
            # Apply middlewares. This will also apply to subapps.
            middlewares=[
                # Handle trailing slashes.
                web.normalize_path_middleware(),
                # Handle CORS.
                simple_cors_middleware,
                # Convert unknown routes/methods into JSON errors.
                json_errors_middleware,
            ],
        )
        root.add_subapp("/_matrix/media_proxy/unstable", app)

        return root

    def start(self) -> None:
        """Start an aiohttp server serving the content scanner API."""
        logger.info("Starting listener on %s:%d", self._bind_address, self._bind_port)
        web.run_app(
            app=self._app,
            host=self._bind_address,
            port=self._bind_port,
            # Suppress aiohttp's own startup banner; we log the address ourselves.
            print=None,
        )
--------------------------------------------------------------------------------
/src/matrix_content_scanner/logutils.py:
--------------------------------------------------------------------------------

# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
import logging
from contextvars import ContextVar
from typing import Any

# The request's ID.
request_id: ContextVar[str] = ContextVar("request_id")


def setup_custom_factory() -> None:
    """Generates a new record factory, chained to the current factory, and sets it as the
    new default record factory.

    The new factory adds attributes for the media path and request type to log records,
    and populates them using the matching ContextVars;
    """
    old_factory = logging.getLogRecordFactory()

    def _factory(*args: Any, **kwargs: Any) -> logging.LogRecord:
        record = old_factory(*args, **kwargs)
        # Default to None so records emitted outside of a request still format.
        record.request_id = request_id.get(None)
        return record

    logging.setLogRecordFactory(_factory)


def set_request_id_in_context(v: str) -> None:
    """Sets the request_id ContextVar to the given value.

    Args:
        v: The value to set the ContextVar.
35 | """ 36 | request_id.set(v) 37 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/mcs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | import argparse 6 | import logging 7 | import sys 8 | from functools import cached_property 9 | 10 | import yaml 11 | from yaml.scanner import ScannerError 12 | 13 | from matrix_content_scanner import logutils 14 | from matrix_content_scanner.config import MatrixContentScannerConfig 15 | from matrix_content_scanner.httpserver import HTTPServer 16 | from matrix_content_scanner.mcs_rust import crypto, reset_logging_config 17 | from matrix_content_scanner.scanner.file_downloader import FileDownloader 18 | from matrix_content_scanner.scanner.scanner import Scanner 19 | from matrix_content_scanner.utils.errors import ConfigError 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class MatrixContentScanner: 25 | def __init__( 26 | self, 27 | config: MatrixContentScannerConfig, 28 | ) -> None: 29 | self.config = config 30 | 31 | @cached_property 32 | def file_downloader(self) -> FileDownloader: 33 | return FileDownloader(self) 34 | 35 | @cached_property 36 | def scanner(self) -> Scanner: 37 | return Scanner(self) 38 | 39 | @cached_property 40 | def crypto_handler(self) -> crypto.CryptoHandler: 41 | return crypto.CryptoHandler( 42 | self.config.crypto.pickle_key, self.config.crypto.pickle_path 43 | ) 44 | 45 | def start(self) -> None: 46 | http_server = HTTPServer(self) 47 | http_server.start() 48 | 49 | 50 | def setup_logging() -> None: 51 | """Basic logging setup.""" 52 | # Set the format, this assumes every logger is created by 53 | # matrix_content_scanner.logging.getLogger and has custom request_type and 54 | # media_path fields set. 
55 | log_format = "%(asctime)s - %(name)s - %(lineno)d - %(levelname)s - %(request_id)s - %(message)s" 56 | formatter = logging.Formatter(log_format) 57 | 58 | logutils.setup_custom_factory() 59 | 60 | # Create the handler and set the default logging level to INFO. 61 | handler = logging.StreamHandler() 62 | handler.setFormatter(formatter) 63 | rootLogger = logging.getLogger("") 64 | rootLogger.setLevel(logging.INFO) 65 | rootLogger.addHandler(handler) 66 | 67 | reset_logging_config() 68 | 69 | 70 | def main() -> None: 71 | parser = argparse.ArgumentParser( 72 | description="A web service for scanning media hosted by a Matrix media repository." 73 | ) 74 | parser.add_argument( 75 | "-c", 76 | type=argparse.FileType("r"), 77 | required=True, 78 | help="The YAML configuration file.", 79 | ) 80 | 81 | args = parser.parse_args() 82 | 83 | # Load the configuration file. 84 | try: 85 | cfg = MatrixContentScannerConfig(yaml.safe_load(args.c)) 86 | except (ConfigError, ScannerError) as e: 87 | # If there's an error reading the file, print it and exit without raising so we 88 | # don't confuse/annoy the user with an unnecessary stack trace. 89 | print("Failed to read configuration file: %s" % e, file=sys.stderr) 90 | sys.exit(1) 91 | 92 | # Create the content scanner. 93 | mcs = MatrixContentScanner(cfg) 94 | 95 | setup_logging() 96 | 97 | # Construct the crypto handler early on, so we can make sure we can load the Olm key 98 | # pair from the pickle file (or write it if it doesn't already exist). 99 | try: 100 | _ = mcs.crypto_handler 101 | except ConfigError as e: 102 | print(e, file=sys.stderr) 103 | sys.exit(1) 104 | 105 | # Start the content scanner. 
106 | mcs.start() 107 | 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/mcs_rust/__init__.pyi: -------------------------------------------------------------------------------- 1 | def get_rust_file_digest() -> str: ... 2 | def reset_logging_config() -> None: ... 3 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/mcs_rust/crypto.pyi: -------------------------------------------------------------------------------- 1 | from matrix_content_scanner.utils.types import JsonDict 2 | 3 | class CryptoHandler: 4 | def __init__(self, pickle_key: str, pickle_path: str) -> None: ... 5 | @property 6 | def public_key(self) -> str: ... 7 | def decrypt_body(self, ciphertext: str, mac: str, ephemeral: str) -> str: ... 8 | def encrypt(self, public_key: str, payload: str) -> PkMessage: ... 9 | 10 | class PkMessage: 11 | @property 12 | def ephemeral_key(self) -> str: ... 13 | @property 14 | def mac(self) -> str: ... 15 | @property 16 | def ciphertext(self) -> str: ... 17 | 18 | def decrypt_attachment(body: bytes, key_info: JsonDict) -> bytes: ... 19 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/element-hq/matrix-content-scanner-python/5e26332a9ef35e5f596b97758f7b6aea16b14cb8/src/matrix_content_scanner/py.typed -------------------------------------------------------------------------------- /src/matrix_content_scanner/scanner/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
5 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/scanner/file_downloader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | import json 6 | import logging 7 | import urllib.parse 8 | from http import HTTPStatus 9 | from typing import TYPE_CHECKING, Dict, Optional, Tuple 10 | 11 | import aiohttp 12 | from multidict import CIMultiDictProxy, MultiMapping 13 | 14 | from matrix_content_scanner.utils.constants import ErrCode 15 | from matrix_content_scanner.utils.errors import ( 16 | ContentScannerRestError, 17 | WellKnownDiscoveryError, 18 | ) 19 | from matrix_content_scanner.utils.types import MediaDescription 20 | 21 | if TYPE_CHECKING: 22 | from matrix_content_scanner.mcs import MatrixContentScanner 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class _PathNotFoundException(Exception): 28 | """An exception raised to signal that a URL could not be found on the remote 29 | homeserver. 
30 | """ 31 | 32 | 33 | class FileDownloader: 34 | MEDIA_DOWNLOAD_PREFIX = "_matrix/media/%s/download" 35 | MEDIA_THUMBNAIL_PREFIX = "_matrix/media/%s/thumbnail" 36 | MEDIA_DOWNLOAD_AUTHENTICATED_PREFIX = "_matrix/client/%s/media/download" 37 | MEDIA_THUMBNAIL_AUTHENTICATED_PREFIX = "_matrix/client/%s/media/thumbnail" 38 | 39 | def __init__(self, mcs: "MatrixContentScanner"): 40 | self._base_url = mcs.config.download.base_homeserver_url 41 | self._well_known_cache: Dict[str, Optional[str]] = {} 42 | self._proxy_url = mcs.config.download.proxy 43 | self._headers = ( 44 | mcs.config.download.additional_headers 45 | if mcs.config.download.additional_headers is not None 46 | else {} 47 | ) 48 | 49 | async def download_file( 50 | self, 51 | media_path: str, 52 | thumbnail_params: Optional[MultiMapping[str]] = None, 53 | auth_header: Optional[str] = None, 54 | ) -> MediaDescription: 55 | """Retrieve the file with the given `server_name/media_id` path, and stores it on 56 | disk. 57 | 58 | Args: 59 | media_path: The path identifying the media to retrieve. 60 | thumbnail_params: If present, then we want to request and scan a thumbnail 61 | generated with the provided parameters instead of the full media. 62 | auth_header: If present, we forward the given Authorization header, this is 63 | required for authenticated media endpoints. 64 | 65 | Returns: 66 | A description of the file (including its full content). 67 | 68 | Raises: 69 | ContentScannerRestError: The file was not found or could not be downloaded due 70 | to an error on the remote homeserver's side. 
71 | """ 72 | 73 | auth_media = True if auth_header is not None else False 74 | 75 | prefix = ( 76 | self.MEDIA_DOWNLOAD_AUTHENTICATED_PREFIX 77 | if auth_media 78 | else self.MEDIA_DOWNLOAD_PREFIX 79 | ) 80 | if thumbnail_params is not None: 81 | prefix = ( 82 | self.MEDIA_THUMBNAIL_AUTHENTICATED_PREFIX 83 | if auth_media 84 | else self.MEDIA_THUMBNAIL_PREFIX 85 | ) 86 | 87 | url = await self._build_https_url( 88 | media_path, prefix, "v1" if auth_media else "v3" 89 | ) 90 | 91 | # Attempt to retrieve the file at the generated URL. 92 | try: 93 | file = await self._get_file_content(url, thumbnail_params, auth_header) 94 | except _PathNotFoundException: 95 | if auth_media: 96 | raise ContentScannerRestError( 97 | http_status=HTTPStatus.NOT_FOUND, 98 | reason=ErrCode.NOT_FOUND, 99 | info="File not found", 100 | ) 101 | 102 | # If the file could not be found, it might be because the homeserver hasn't 103 | # been upgraded to a version that supports Matrix v1.1 endpoints yet, so try 104 | # again with an r0 endpoint. 105 | logger.info("File not found, trying legacy r0 path") 106 | 107 | url = await self._build_https_url(media_path, prefix, "r0") 108 | 109 | try: 110 | file = await self._get_file_content(url, thumbnail_params, auth_header) 111 | except _PathNotFoundException: 112 | # If that still failed, raise an error. 113 | raise ContentScannerRestError( 114 | http_status=HTTPStatus.NOT_FOUND, 115 | reason=ErrCode.NOT_FOUND, 116 | info="File not found", 117 | ) 118 | 119 | return file 120 | 121 | async def _build_https_url( 122 | self, 123 | media_path: str, 124 | prefix: str, 125 | endpoint_version: str, 126 | ) -> str: 127 | """Turn a `server_name/media_id` path into an https:// one we can use to fetch 128 | the media. 129 | 130 | Note that if `base_homeserver_url` is set to an http URL, it will not be turned 131 | into an https one. 132 | 133 | Args: 134 | media_path: The media path to translate. 
135 | endpoint_version: The version of the download endpoint to use. As of Matrix 136 | v1.11, this is "v1" for authenticated media. For unauthenticated media 137 | this is either "v3" or "r0". 138 | 139 | Returns: 140 | An https URL to use. If `base_homeserver_url` is set in the config, this 141 | will be used as the base of the URL. 142 | """ 143 | server_name, media_id = media_path.split("/") 144 | 145 | # Figure out what base URL to use. If one is specified in the configuration file, 146 | # use it, otherwise try to discover one using .well-known. If that fails, use the 147 | # server name with an HTTPS scheme. 148 | if self._base_url is not None: 149 | base_url = self._base_url 150 | else: 151 | base_url = None 152 | 153 | try: 154 | base_url = await self._discover_via_well_known(server_name) 155 | except WellKnownDiscoveryError as e: 156 | # We don't catch ContentScannerRestErrors here because if one makes its 157 | # way up here then it likely means that trying to reach https://server_name 158 | # failed, in which case we're unlikely to be able to reach it again when 159 | # downloading the file, so we let the error escalate. 160 | logger.info("Failed to discover server via well-known: %s", e) 161 | 162 | if base_url is None: 163 | # base_url might be None if either .well-known discovery failed, or we 164 | # didn't find a .well-known file. 165 | base_url = "https://" + server_name 166 | 167 | # Build the full URL. 168 | path_prefix = prefix % endpoint_version 169 | url = "%s/%s/%s/%s" % ( 170 | base_url, 171 | path_prefix, 172 | urllib.parse.quote(server_name), 173 | urllib.parse.quote(media_id), 174 | ) 175 | 176 | return url 177 | 178 | async def _get_file_content( 179 | self, 180 | url: str, 181 | thumbnail_params: Optional[MultiMapping[str]], 182 | auth_header: Optional[str] = None, 183 | ) -> MediaDescription: 184 | """Retrieve the content of the file at a given URL. 185 | 186 | Args: 187 | url: The URL to query. 
188 | thumbnail_params: Query parameters used if the request is for a thumbnail. 189 | auth_header: If present, we forward the given Authorization header, this is 190 | required for authenticated media endpoints. 191 | 192 | Returns: 193 | A description of the file (including its full content). 194 | 195 | Raises: 196 | _PathNotFoundException: the server returned an error that can mean the path 197 | of the request wasn't understood, e.g. because we requested a v3 URL but 198 | the server only supports r0, or the media couldn't be found. 199 | We raise a separate error class in this case because if the error is due 200 | to a v3 vs r0 path we want to retry the request on the r0 path. 201 | ContentScannerRestError: the server returned a non-200 status which cannot 202 | meant that the path wasn't understood. 203 | """ 204 | code, body, headers = await self._get( 205 | url, query=thumbnail_params, auth_header=auth_header 206 | ) 207 | 208 | logger.info("Remote server responded with %d", code) 209 | 210 | # If the response isn't a 200 OK, raise. 211 | if 200 < code: 212 | logger.info("Response body: %s", body) 213 | # If the response is a 404 or an "unrecognised request" à la Synapse, 214 | # consider that we could not find the media, and that we should retry if this 215 | # request was directed at a v3 endpoint. 
216 | if code == 400: 217 | try: 218 | err = json.loads(body) 219 | if err["errcode"] == "M_UNRECOGNIZED": 220 | raise _PathNotFoundException 221 | except (json.decoder.JSONDecodeError, KeyError): 222 | pass 223 | 224 | if code == 401: 225 | try: 226 | err = json.loads(body) 227 | if err["errcode"] == ErrCode.MISSING_TOKEN: 228 | raise ContentScannerRestError( 229 | HTTPStatus.UNAUTHORIZED, 230 | ErrCode.MISSING_TOKEN, 231 | "Access token missing from request", 232 | ) 233 | if err["errcode"] == ErrCode.UNKNOWN_TOKEN: 234 | raise ContentScannerRestError( 235 | HTTPStatus.UNAUTHORIZED, 236 | ErrCode.UNKNOWN_TOKEN, 237 | "Invalid access token passed", 238 | ) 239 | except (json.decoder.JSONDecodeError, KeyError): 240 | pass 241 | 242 | if code == 404: 243 | raise _PathNotFoundException 244 | 245 | raise ContentScannerRestError( 246 | HTTPStatus.BAD_GATEWAY, 247 | ErrCode.REQUEST_FAILED, 248 | "The remote server responded with an error", 249 | ) 250 | 251 | # Check that we have the right amount of Content-Type headers (so we don't get 252 | # confused later when we try comparing it with the file's MIME type). 253 | content_type_headers = headers.getall("content-type", None) 254 | if content_type_headers is None or len(content_type_headers) != 1: 255 | raise ContentScannerRestError( 256 | HTTPStatus.BAD_GATEWAY, 257 | ErrCode.REQUEST_FAILED, 258 | "The remote server responded with an invalid amount of Content-Type headers", 259 | ) 260 | 261 | return MediaDescription( 262 | content_type=content_type_headers[0], 263 | content=body, 264 | response_headers=headers, 265 | ) 266 | 267 | async def _discover_via_well_known(self, domain: str) -> Optional[str]: 268 | """Try to discover the base URL for the given domain via .well-known client 269 | discovery. 270 | 271 | Args: 272 | domain: The domain to discover the base URL for. 273 | 274 | Returns: 275 | The base URL to use, or None if no .well-known client file exist for this 276 | domain. 
277 | 278 | Raises: 279 | WellKnownDiscoveryError if an error happened during the discovery attempt. 280 | """ 281 | # Check if we already have a result cached, and if so return with it straight 282 | # away. 283 | if domain in self._well_known_cache: 284 | logger.info("Fetching .well-known discovery result from cache") 285 | return self._well_known_cache[domain] 286 | 287 | # Attempt to download the .well-known file. 288 | try: 289 | url = f"https://{domain}/.well-known/matrix/client" 290 | code, body, _ = await self._get(url) 291 | except ContentScannerRestError: 292 | raise WellKnownDiscoveryError(f"Failed to reach web server at {domain}") 293 | 294 | if code != 200: 295 | if code == 404: 296 | # If the response status is 404, then the homeserver hasn't set up 297 | # .well-known discovery, in which case we tell the caller that there's 298 | # no base URL to use rather than raising an error. 299 | # The difference is that we want to cache this result here, but we don't 300 | # want to do that when the discovery fails due to an incorrectly set up 301 | # file or an unavailable homeserver, which might be fixed later on. 302 | logger.info( 303 | ".well-known discover has not been set up for this homeserver" 304 | ) 305 | self._well_known_cache[domain] = None 306 | return None 307 | 308 | raise WellKnownDiscoveryError( 309 | f"Server responded with non-200 status {code}" 310 | ) 311 | 312 | # Try to parse the JSON content. 313 | try: 314 | parsed_body = json.loads(body) 315 | except json.decoder.JSONDecodeError as e: 316 | raise WellKnownDiscoveryError(e) 317 | 318 | # Check if the parsed content has a base URL in the right place. 319 | try: 320 | base_url: str = parsed_body["m.homeserver"]["base_url"] 321 | except (KeyError, TypeError): 322 | # We might get a KeyError if we're trying to reach a key that doesn't exist, 323 | # and we might get a TypeError if parsed_body or parsed_body["m.homeserver"] 324 | # isn't a dictionary. 
325 | raise WellKnownDiscoveryError("Response did not include a usable URL") 326 | 327 | # Remove the trailing slash if there is one. 328 | if base_url.endswith("/"): 329 | base_url = base_url[:-1] 330 | 331 | # Check if the base URL is one for a working homeserver. 332 | url = base_url + "/_matrix/client/versions" 333 | try: 334 | code, _, _ = await self._get(url) 335 | except ContentScannerRestError: 336 | raise WellKnownDiscoveryError( 337 | "Base URL does not seem to point to a working homeserver" 338 | ) 339 | 340 | if code != 200: 341 | raise WellKnownDiscoveryError( 342 | "Base URL does not seem to point to a working homeserver" 343 | ) 344 | 345 | # Cache and return the result. 346 | self._well_known_cache[domain] = base_url 347 | return base_url 348 | 349 | async def _get( 350 | self, 351 | url: str, 352 | query: Optional[MultiMapping[str]] = None, 353 | auth_header: Optional[str] = None, 354 | ) -> Tuple[int, bytes, CIMultiDictProxy[str]]: 355 | """Sends a GET request to the provided URL. 356 | 357 | Args: 358 | url: The URL to send requests to. 359 | query: Optional parameters to use in the request's query string. 360 | auth_header: If present, we forward the given Authorization header, this is 361 | required for authenticated media endpoints. 362 | 363 | Returns: 364 | The HTTP status code, body and headers the remote server responded with. 365 | 366 | Raises: 367 | ContentScannerRestError(502) if the request failed (if the remote server 368 | timed out or refused the connection, etc.). 
369 | """ 370 | try: 371 | logger.info("Sending GET request to %s", url) 372 | async with aiohttp.ClientSession() as session: 373 | if auth_header is not None: 374 | request_headers = {"Authorization": auth_header, **self._headers} 375 | else: 376 | request_headers = self._headers 377 | 378 | async with session.get( 379 | url, 380 | proxy=self._proxy_url, 381 | headers=request_headers, 382 | params=query, 383 | ) as resp: 384 | return resp.status, await resp.read(), resp.headers 385 | 386 | except Exception as e: 387 | logger.error(e) 388 | raise ContentScannerRestError( 389 | HTTPStatus.BAD_GATEWAY, 390 | ErrCode.REQUEST_FAILED, 391 | "Failed to reach the remote server", 392 | ) 393 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/scanner/scanner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
import asyncio
import hashlib
import logging
import os
import subprocess
from asyncio import Future
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

import attr
import magic
from cachetools import TTLCache
from canonicaljson import encode_canonical_json
from humanfriendly import format_size
from multidict import MultiMapping

from matrix_content_scanner.mcs_rust import crypto
from matrix_content_scanner.utils.constants import ErrCode
from matrix_content_scanner.utils.errors import (
    ContentScannerRestError,
    FileDirtyError,
    FileMimeTypeForbiddenError,
)
from matrix_content_scanner.utils.types import JsonDict, MediaDescription

if TYPE_CHECKING:
    from matrix_content_scanner.mcs import MatrixContentScanner

logger = logging.getLogger(__name__)


@attr.s(auto_attribs=True, frozen=True)
class CacheEntry:
    """An entry in the scanner's result cache."""

    # The result of the scan: True if the scan passed, False otherwise.
    result: bool

    # The media that was scanned, so we can return it in future requests. We only cache
    # it if the scan succeeded and the file's size does not exceed the configured limit,
    # otherwise it's None.
    media: Optional[MediaDescription] = None

    # Hash of the media content, so we can make sure no malicious servers changed the file
    # since we've scanned it (e.g. if we need to re-download it because the file was too
    # big). None if the scan failed.
    media_hash: Optional[str] = None

    # Info to include in the FileDirtyError if the scan failed.
    info: Optional[str] = None


class Scanner:
    # Downloads media, runs the configured scanning script on it, and caches the
    # results (and, size permitting, the media itself).

    def __init__(self, mcs: "MatrixContentScanner"):
        self._file_downloader = mcs.file_downloader
        self._script = mcs.config.scan.script
        self._removal_command = mcs.config.scan.removal_command
        # resolve(strict=True) fails fast at startup if the directory doesn't exist.
        self._store_directory = Path(mcs.config.scan.temp_directory).resolve(
            strict=True
        )

        # Result cache settings.
        self._result_cache: TTLCache[str, CacheEntry] = TTLCache(
            maxsize=mcs.config.result_cache.max_size,
            ttl=mcs.config.result_cache.ttl,
        )

        if mcs.config.result_cache.exit_codes_to_ignore is None:
            self._exit_codes_to_ignore = []
        else:
            self._exit_codes_to_ignore = mcs.config.result_cache.exit_codes_to_ignore

        self._max_size_to_cache = mcs.config.result_cache.max_file_size

        # List of MIME types we should allow.
        # If None, we fall back to `_blocked_mimetypes`.
        # If that's also None, we don't fail files based on their
        # MIME types (besides comparing it with the Content-Type header from the server
        # for unencrypted files).
        self._allowed_mimetypes = mcs.config.scan.allowed_mimetypes

        # List of MIME types we should block.
        # Must not be specified at the same time as `_allowed_mimetypes`.
        # See the comment for `_allowed_mimetypes` for the semantics.
        self._blocked_mimetypes = mcs.config.scan.blocked_mimetypes

        # Cache of futures for files that are currently scanning and downloading, so that
        # concurrent requests don't cause a file to be downloaded and scanned twice.
        self._current_scans: Dict[str, Future[MediaDescription]] = {}

        # Limit the number of concurrent scans.
        self._current_scan_semaphore = asyncio.Semaphore(100)

    async def scan_file(
        self,
        media_path: str,
        metadata: Optional[JsonDict] = None,
        thumbnail_params: Optional["MultiMapping[str]"] = None,
        auth_header: Optional[str] = None,
    ) -> MediaDescription:
        """Download and scan the given media.

        Unless the scan fails with one of the codes listed in `do_not_cache_exit_codes`,
        also cache the result.

        If the file already has an entry in the result cache, return this value without
        downloading the file again (unless we purposefully did not cache the file's
        content to save up on memory).

        If a file is currently already being downloaded or scanned as a result of another
        request, don't download it again and use the result from the first request.

        Args:
            media_path: The `server_name/media_id` path for the media.
            metadata: The metadata attached to the file (e.g. decryption key), or None if
                the file isn't encrypted.
            thumbnail_params: If present, then we want to request and scan a thumbnail
                generated with the provided parameters instead of the full media.
            auth_header: If present, we forward the given Authorization header, this is
                required for authenticated media endpoints.

        Returns:
            A description of the media.

        Raises:
            ContentScannerRestError if the file could not be downloaded.
            FileDirtyError if the result of the scan said that the file is dirty, or if
                the media path is malformed.
        """
        # Compute the key to use when caching, both in the current scans cache and in the
        # results cache.
        cache_key = self._get_cache_key_for_file(media_path, metadata, thumbnail_params)
        if cache_key not in self._current_scans:
            # Create a future in the context of the current event loop.
            loop = asyncio.get_event_loop()
            f = loop.create_future()
            # Register the future in the current scans cache so that subsequent queries
            # can use it.
            self._current_scans[cache_key] = f
            # Try to download and scan the file.
            try:
                res = await self._scan_file(
                    cache_key, media_path, metadata, thumbnail_params, auth_header
                )
                # Set the future's result, and mark it as done.
                f.set_result(res)
                # Return the result.
                return res
            except Exception as e:
                # If there's an exception, catch it, pass it on to the future, and raise
                # it.
                f.set_exception(e)
                # We retrieve the exception from the future, because if we don't and no
                # other request is awaiting on the future, asyncio complains about "Future
                # exception was never retrieved".
                f.exception()
                raise
            finally:
                # Remove the future from the cache.
                del self._current_scans[cache_key]

        # Another request is already downloading/scanning this file: wait on its
        # future instead of starting a duplicate scan.
        return await self._current_scans[cache_key]

    async def _scan_file(
        self,
        cache_key: str,
        media_path: str,
        metadata: Optional[JsonDict] = None,
        thumbnail_params: Optional[MultiMapping[str]] = None,
        auth_header: Optional[str] = None,
    ) -> MediaDescription:
        """Download and scan the given media.

        Unless the scan fails with one of the codes listed in `do_not_cache_exit_codes`,
        also cache the result.

        If the file already has an entry in the result cache, return this value without
        downloading the file again (unless we purposefully did not cache the file's
        content to save up on memory).

        Args:
            cache_key: The key to use to cache the result of the scan in the result cache.
            media_path: The `server_name/media_id` path for the media.
            metadata: The metadata attached to the file (e.g. decryption key), or None if
                the file isn't encrypted.
            thumbnail_params: If present, then we want to request and scan a thumbnail
                generated with the provided parameters instead of the full media.
            auth_header: If present, we forward the given Authorization header, this is
                required for authenticated media endpoints.

        Returns:
            A description of the media.

        Raises:
            ContentScannerRestError if the file could not be downloaded.
            FileDirtyError if the result of the scan said that the file is dirty, or if
                the media path is malformed.
        """
        # The media to scan.
        media: Optional[MediaDescription] = None

        # Return the cached result if there's one.
        cache_entry = self._result_cache.get(cache_key)
        if cache_entry is not None:
            logger.info("Found a cached result %s", cache_entry.result)

            if cache_entry.result is False:
                # Feed the additional info we might have added when caching the error,
                # into the new error.
                raise FileDirtyError(info=cache_entry.info)

            if cache_entry.media is not None:
                return cache_entry.media

            # If we don't have the media cached
            logger.info(
                "Got a positive result from cache without a media, downloading file",
            )

            media = await self._file_downloader.download_file(
                media_path=media_path,
                thumbnail_params=thumbnail_params,
                auth_header=auth_header,
            )

            # Compare the media's hash to ensure the server hasn't changed the file since
            # the last scan. If it has changed, shout about it in the logs, discard the
            # cache entry and scan it again.
233 | media_hash = hashlib.sha256(media.content).hexdigest() 234 | if media_hash == cache_entry.media_hash: 235 | return media 236 | 237 | logger.warning( 238 | "Media has changed since last scan (cached hash: %s, new hash: %s)," 239 | " discarding cached result and scanning again", 240 | cache_entry.media_hash, 241 | media_hash, 242 | ) 243 | 244 | del self._result_cache[cache_key] 245 | 246 | # Check if the media path is valid and only contains one slash (otherwise we'll 247 | # have issues parsing it further down the line). 248 | if media_path.count("/") != 1: 249 | info = "Malformed media ID" 250 | self._result_cache[cache_key] = CacheEntry( 251 | result=False, 252 | info=info, 253 | ) 254 | raise FileDirtyError(info) 255 | 256 | # Download the file if we don't already have it. 257 | if media is None: 258 | media = await self._file_downloader.download_file( 259 | media_path=media_path, 260 | thumbnail_params=thumbnail_params, 261 | auth_header=auth_header, 262 | ) 263 | 264 | # Download and scan the file. 265 | try: 266 | media, cacheable = await self._scan_media(media, media_path, metadata) 267 | except FileDirtyError as e: 268 | if e.cacheable: 269 | logger.info("Caching scan failure") 270 | 271 | # If the test fails, don't store the media to save memory. 272 | self._result_cache[cache_key] = CacheEntry( 273 | result=False, 274 | media=None, 275 | info=e.info, 276 | ) 277 | 278 | raise 279 | 280 | # Update the cache if the result should be cached. 281 | if cacheable: 282 | logger.info("Caching scan success") 283 | 284 | cached_media: Optional[MediaDescription] = media 285 | 286 | if ( 287 | self._max_size_to_cache is not None 288 | and len(media.content) > self._max_size_to_cache 289 | ): 290 | # Don't cache the file's content if it exceeds the maximum allowed file 291 | # size, to minimise memory usage. 
292 | logger.info( 293 | "File content has size %s, which is more than %s, not caching content", 294 | format_size(len(media.content)), 295 | format_size(self._max_size_to_cache), 296 | ) 297 | 298 | cached_media = None 299 | 300 | # Hash the media, that way if we need to re-download the file we can make sure 301 | # it's the right one. We get a hex digest in case we want to print it later. 302 | media_hash = hashlib.sha256(media.content).hexdigest() 303 | 304 | self._result_cache[cache_key] = CacheEntry( 305 | result=True, 306 | media=cached_media, 307 | media_hash=media_hash, 308 | ) 309 | 310 | return media 311 | 312 | async def _scan_media( 313 | self, 314 | media: MediaDescription, 315 | media_path: str, 316 | metadata: Optional[JsonDict] = None, 317 | ) -> Tuple[MediaDescription, bool]: 318 | """Scans the given media. 319 | 320 | Args: 321 | media: The already downloaded media. If provided, the download step is 322 | skipped. Usually provided if we've re-downloaded a file with a cached 323 | result, but the file changed since the initial scan. 324 | media_path: The `server_name/media_id` path for the media. 325 | metadata: The metadata attached to the file (e.g. decryption key), or None if 326 | the file isn't encrypted. 327 | 328 | Returns: 329 | A description of the media, as well as a boolean indicating whether the 330 | successful scan result should be cached or not. 331 | 332 | Raises: 333 | FileDirtyError if the result of the scan said that the file is dirty, or if 334 | the media path is malformed. 335 | """ 336 | 337 | # Decrypt the content if necessary. 338 | media_content = media.content 339 | if metadata is not None: 340 | # If the file is encrypted, we need to decrypt it before we can scan it. 341 | media_content = self._decrypt_file(media_content, metadata) 342 | 343 | # Check the file's MIME type to see if it's allowed. 344 | self._check_mimetype(media_content) 345 | 346 | # Write the file to disk. 
347 | file_path = self._write_file_to_disk(media_path, media_content) 348 | 349 | # Scan the file and see if the result is positive or negative. 350 | exit_code = await self._run_scan(file_path) 351 | result = exit_code == 0 352 | 353 | # If the exit code isn't part of the ones we should ignore, cache the result. 354 | cacheable = True 355 | if exit_code in self._exit_codes_to_ignore: 356 | logger.info( 357 | "Scan returned exit code %d which must not be cached", exit_code 358 | ) 359 | cacheable = False 360 | 361 | # Delete the file now that we've scanned it. 362 | logger.info("Scan has finished, removing file") 363 | removal_command_parts = self._removal_command.split() 364 | removal_command_parts.append(file_path) 365 | subprocess.run(removal_command_parts) 366 | 367 | # Raise an error if the result isn't clean. 368 | if result is False: 369 | raise FileDirtyError(cacheable=cacheable) 370 | 371 | return media, cacheable 372 | 373 | def _get_cache_key_for_file( 374 | self, 375 | media_path: str, 376 | metadata: Optional[JsonDict], 377 | thumbnail_params: Optional[MultiMapping[str]], 378 | ) -> str: 379 | """Generates the key to use to store the result for the given media in the result 380 | cache. 381 | 382 | The key is computed using the media's `server_name/media_id` path, but also the 383 | metadata dict (stringified), in case e.g. the decryption key changes, as well as 384 | the parameters used to generate the thumbnail if any (stringified), to 385 | differentiate thumbnails from full-sized media. 386 | The resulting key is a sha256 hash of the concatenation of these two values. 387 | 388 | Args: 389 | media_path: The `server_name/media_id` path of the file to scan. 390 | metadata: The file's metadata (or None if the file isn't encrypted). 391 | thumbnail_params: The parameters to generate thumbnail with. If no parameter 392 | is passed, this will be an empty dict. If the media being requested is not 393 | a thumbnail, this will be None. 
394 | """ 395 | # If we're provided with thumbnailing parameters, turn them into a structure that 396 | # can be serialised as JSON. 397 | thumbnail_params_json: Optional[Dict[str, List[str]]] = None 398 | if thumbnail_params is not None: 399 | thumbnail_params_json = {} 400 | for k in thumbnail_params.keys(): 401 | thumbnail_params_json[k] = thumbnail_params.getall(k) 402 | 403 | hash = hashlib.sha256() 404 | hash.update(media_path.encode("utf8")) 405 | hash.update(b"\0") 406 | hash.update(encode_canonical_json(metadata)) 407 | hash.update(b"\0") 408 | hash.update(encode_canonical_json(thumbnail_params_json)) 409 | 410 | return hash.hexdigest() 411 | 412 | def _decrypt_file(self, body: bytes, metadata: JsonDict) -> bytes: 413 | """Extract decryption information from the file's metadata and decrypt it. 414 | 415 | Args: 416 | body: The encrypted body of the file. 417 | metadata: The part of the request that includes decryption information. 418 | 419 | Returns: 420 | The decrypted content of the file. 421 | 422 | Raises: 423 | ContentScannerRestError(400) if the decryption failed. 424 | """ 425 | logger.info("Decrypting encrypted file") 426 | 427 | # Decrypt the file. 428 | try: 429 | return crypto.decrypt_attachment(body, metadata["file"]) 430 | except Exception as e: 431 | raise ContentScannerRestError( 432 | http_status=400, 433 | reason=ErrCode.FAILED_TO_DECRYPT, 434 | info=str(e), 435 | ) 436 | 437 | def _write_file_to_disk(self, media_path: str, body: bytes) -> str: 438 | """Writes the given content to disk. The final file name will be a concatenation 439 | of `temp_directory` and the media's `server_name/media_id` path. 440 | 441 | Args: 442 | media_path: The `server_name/media_id` path of the media we're processing. 443 | body: The bytes to write to disk. 444 | 445 | Returns: 446 | The full path to the newly written file. 
447 | 448 | Raises: 449 | FileDirtyError if the media path is malformed in a way that would cause the 450 | file to be written outside the configured directory. 451 | """ 452 | # Figure out the full absolute path for this file. 453 | full_path = self._store_directory.joinpath(media_path).resolve() 454 | try: 455 | # Check if the full path is a sub-path to the store's path, to make sure 456 | # there isn't any '..' etc. in the full path, which would cause us to try 457 | # writing outside the store's directory. 458 | full_path.relative_to(self._store_directory) 459 | except ValueError: 460 | raise FileDirtyError("Malformed media ID") 461 | 462 | logger.info("Writing file to %s", full_path) 463 | 464 | # Create any directory we need. 465 | os.makedirs(full_path.parent, exist_ok=True) 466 | 467 | with open(full_path, "wb") as fp: 468 | fp.write(body) 469 | 470 | return str(full_path) 471 | 472 | async def _run_scan(self, file_name: str) -> int: 473 | """Runs the scan script, passing it the given file name. 474 | 475 | Args: 476 | file_name: Name of the file to scan. 477 | 478 | Returns: 479 | The exit code the script returned. 480 | """ 481 | async with self._current_scan_semaphore: 482 | process = await asyncio.create_subprocess_exec( 483 | self._script, file_name, stderr=asyncio.subprocess.PIPE 484 | ) 485 | _, stderr = await process.communicate() 486 | retcode = await process.wait() 487 | if retcode == 0: 488 | logger.info("Scan succeeded") 489 | else: 490 | logger.info( 491 | "Scanning failed with exit code %d. Stderr: %s", 492 | retcode, 493 | stderr.decode(), 494 | ) 495 | 496 | return retcode 497 | 498 | def _check_mimetype(self, media_content: bytes) -> None: 499 | """Detects the MIME type of the provided bytes, and checks that this type is allowed 500 | (if an allow list is provided in the configuration) 501 | Args: 502 | media_content: The file's content. If the file is encrypted, this is its 503 | decrypted content. 
504 | Raises: 505 | FileMimeTypeForbiddenError if one of the checks fail. 506 | """ 507 | detected_mimetype = magic.from_buffer(media_content, mime=True) 508 | logger.debug("Detected MIME type for file is %s", detected_mimetype) 509 | 510 | # If there's an allow list for MIME types, check that the MIME type that's been 511 | # detected for this file is in it. 512 | if ( 513 | self._allowed_mimetypes is not None 514 | and detected_mimetype not in self._allowed_mimetypes 515 | ): 516 | logger.error( 517 | "MIME type for file is forbidden: %s", 518 | detected_mimetype, 519 | ) 520 | raise FileMimeTypeForbiddenError( 521 | f"File type: {detected_mimetype} not allowed" 522 | ) 523 | 524 | # If there's a block list for MIME types, check that the MIME type detected for 525 | # this file is NOT in it. 526 | if ( 527 | self._blocked_mimetypes is not None 528 | and detected_mimetype in self._blocked_mimetypes 529 | ): 530 | logger.error( 531 | "MIME type for file is forbidden: %s", 532 | detected_mimetype, 533 | ) 534 | raise FileMimeTypeForbiddenError( 535 | f"File type: {detected_mimetype} not allowed" 536 | ) 537 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/servlets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
"""Shared helpers for the content scanner's aiohttp servlets.

Provides the `web_handler` decorator that turns handler return values and errors
into aiohttp responses, plus helpers to extract (and, when needed, decrypt)
encrypted file metadata from request bodies.
"""
import functools
import json
import logging
from typing import Awaitable, Callable, Dict, Optional, Tuple, TypeVar, Union

import attr
from aiohttp import web
from multidict import CIMultiDictProxy

from matrix_content_scanner import logutils
from matrix_content_scanner.mcs_rust import crypto
from matrix_content_scanner.utils.constants import ErrCode
from matrix_content_scanner.utils.encrypted_file_metadata import (
    validate_encrypted_file_metadata,
)
from matrix_content_scanner.utils.errors import ContentScannerRestError
from matrix_content_scanner.utils.types import JsonDict

logger = logging.getLogger(__name__)

# Monotonically increasing sequence number used by `web_handler` to build a unique
# request ID for logging (see `set_request_id_in_context` below).
_next_request_seq = 0

# Type variable for the object the decorated handler method is bound to.
_Handler = TypeVar("_Handler")


@attr.s(auto_attribs=True, frozen=True, slots=True)
class _BytesResponse:
    """A binary response, and the headers to send back to the client alongside it."""

    headers: CIMultiDictProxy[str]
    content: bytes


def web_handler(
    func: Callable[
        [_Handler, web.Request], Awaitable[Tuple[int, Union[JsonDict, _BytesResponse]]]
    ],
) -> Callable[[_Handler, web.Request], Awaitable[web.Response]]:
    """Decorator that adds a wrapper to the given web handler method, which turns its
    return value into an aiohttp Response, and handles errors.

    Args:
        func: The function to wrap.

    Returns:
        The wrapper to run for this function.
    """

    def handle_error(status: int, reason: ErrCode, info: Optional[str]) -> web.Response:
        """Turns an error with the given parameters into an aiohttp Response.

        Args:
            status: The HTTP status code.
            reason: The error code to include in the response's JSON body.
            info: Optional extra info to include in the response's JSON body.
        """
        # Write the reason for the error into the response body, and add some extra info
        # if we have any.
        res_body: JsonDict = {"reason": reason}
        if info is not None:
            res_body["info"] = info

        res = _to_json_bytes(res_body)

        return web.Response(
            status=status,
            content_type="application/json",
            body=res,
        )

    @functools.wraps(func)
    async def wrapper(self: _Handler, request: web.Request) -> web.Response:
        """Run the wrapped method, and turn the return value into an aiohttp Response.

        If the wrapped method raises an exception, turn that into an aiohttp Response
        as well.

        Args:
            self: The object the wrapped method belongs to.
            request: The aiohttp Request to process.
        """
        # Set the request ID in the logging context, and increment the sequence for the
        # next request.
        global _next_request_seq
        request_id = f"{request.method}-{_next_request_seq}"
        logutils.set_request_id_in_context(request_id)
        _next_request_seq += 1

        # Check that the path is correct.
        if not request.path.startswith("/_matrix/media_proxy/unstable"):
            return handle_error(
                status=400,
                reason=ErrCode.UNKNOWN,
                info="Invalid path",
            )

        try:
            status, res = await func(self, request)

            # Set the response and headers according to the return value. If the handler
            # didn't return with a bytes response (in which it is responsible for
            # providing the headers, including the content-type one), default to json.
            headers: Union[Dict[str, str], CIMultiDictProxy[str]]
            if isinstance(res, _BytesResponse):
                raw_res = res.content
                headers = res.headers
            else:
                raw_res = _to_json_bytes(res)
                headers = {"content-type": "application/json"}

            return web.Response(
                status=status,
                body=raw_res,
                headers=headers,
            )
        except ContentScannerRestError as e:
            # If we get a REST error, use it to generate an error response.
            return handle_error(
                status=e.http_status,
                reason=e.reason,
                info=e.info,
            )
        except Exception as e:
            # Otherwise, just treat it as an unknown server error.
            logger.exception(e)
            return handle_error(
                status=500,
                reason=ErrCode.UNKNOWN,
                info="Internal Server Error",
            )

    return wrapper


def _to_json_bytes(content: JsonDict) -> bytes:
    """Converts a dict into JSON and encodes it to bytes."""
    return json.dumps(content).encode("UTF-8")


async def get_media_metadata_from_request(
    request: web.Request,
    crypto_handler: crypto.CryptoHandler,
) -> Tuple[str, JsonDict]:
    """Extracts, optionally decrypts, and validates encrypted file metadata from a
    request body.

    Args:
        request: The request to extract the data from.
        crypto_handler: The crypto handler to use if we need to decrypt an Olm-encrypted
            body.

    Returns:
        A tuple of the media's `server_name/media_id` path (taken from the `url` in
        the validated metadata, with the `mxc://` prefix stripped) and the metadata
        itself.

    Raises:
        ContentScannerRestError(400) if the request's body is None or if the metadata
            didn't pass schema validation.
    """
    # NOTE(review): defensive check — aiohttp normally exposes a stream object here
    # even for empty bodies; confirm whether this branch is reachable.
    if request.content is None:
        raise ContentScannerRestError(
            400,
            ErrCode.MALFORMED_JSON,
            "No content in request body",
        )

    try:
        body = await request.json()
    except json.decoder.JSONDecodeError as e:
        raise ContentScannerRestError(400, ErrCode.MALFORMED_JSON, str(e))

    metadata = _metadata_from_body(body, crypto_handler)

    validate_encrypted_file_metadata(metadata)

    # Get the media path.
    url = metadata["file"]["url"]
    media_path = url[len("mxc://") :]

    return media_path, metadata


def _metadata_from_body(
    body: JsonDict, crypto_handler: crypto.CryptoHandler
) -> JsonDict:
    """Parse the given body as JSON, and decrypts it if needed.

    Args:
        body: The body, parsed as JSON.
        crypto_handler: The crypto handler to use if we need to decrypt an Olm-encrypted
            body.

    Returns:
        The parsed and decrypted file metadata.

    Raises:
        ContentScannerRestError(400) if the body isn't valid JSON or isn't a dictionary.
    """
    # Every POST request body in the API implemented by the content scanner is a dict.
    if not isinstance(body, dict):
        raise ContentScannerRestError(
            400,
            ErrCode.MALFORMED_JSON,
            "Body must be a dictionary",
        )

    # Check if the metadata is encrypted, if not then the metadata is in clear text in
    # the body so just return it.
    encrypted_body: Optional[JsonDict] = body.get("encrypted_body")
    if encrypted_body is None:
        return body

    # If it is encrypted, decrypt it and return the decrypted version.
    try:
        decrypted: JsonDict = json.loads(
            crypto_handler.decrypt_body(
                ciphertext=encrypted_body["ciphertext"],
                mac=encrypted_body["mac"],
                ephemeral=encrypted_body["ephemeral"],
            )
        )
        return decrypted
    except Exception as e:
        logger.exception("Failed to decrypt encrypted body")
        raise ContentScannerRestError(
            http_status=400,
            reason=ErrCode.FAILED_TO_DECRYPT,
            info=str(e),
        )
--------------------------------------------------------------------------------
/src/matrix_content_scanner/servlets/download.py:
--------------------------------------------------------------------------------
# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
5 | from typing import TYPE_CHECKING, Optional, Tuple 6 | 7 | from aiohttp import web 8 | 9 | from matrix_content_scanner.servlets import ( 10 | _BytesResponse, 11 | get_media_metadata_from_request, 12 | web_handler, 13 | ) 14 | from matrix_content_scanner.utils.types import JsonDict 15 | 16 | if TYPE_CHECKING: 17 | from matrix_content_scanner.mcs import MatrixContentScanner 18 | 19 | 20 | class DownloadHandler: 21 | def __init__(self, content_scanner: "MatrixContentScanner"): 22 | self._scanner = content_scanner.scanner 23 | self._crypto_handler = content_scanner.crypto_handler 24 | 25 | async def _scan( 26 | self, 27 | media_path: str, 28 | metadata: Optional[JsonDict] = None, 29 | auth_header: Optional[str] = None, 30 | ) -> Tuple[int, _BytesResponse]: 31 | media = await self._scanner.scan_file( 32 | media_path, metadata, auth_header=auth_header 33 | ) 34 | 35 | return 200, _BytesResponse( 36 | headers=media.response_headers, 37 | content=media.content, 38 | ) 39 | 40 | @web_handler 41 | async def handle_plain(self, request: web.Request) -> Tuple[int, _BytesResponse]: 42 | """Handles GET requests to ../download/serverName/mediaId""" 43 | media_path = request.match_info["media_path"] 44 | return await self._scan( 45 | media_path, auth_header=request.headers.get("Authorization") 46 | ) 47 | 48 | @web_handler 49 | async def handle_encrypted( 50 | self, request: web.Request 51 | ) -> Tuple[int, _BytesResponse]: 52 | """Handles POST requests to ../download_encrypted""" 53 | media_path, metadata = await get_media_metadata_from_request( 54 | request, self._crypto_handler 55 | ) 56 | 57 | return await self._scan( 58 | media_path, metadata, auth_header=request.headers.get("Authorization") 59 | ) 60 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/servlets/public_key.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # 
SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | from typing import TYPE_CHECKING, Tuple 6 | 7 | from aiohttp import web 8 | 9 | from matrix_content_scanner.servlets import web_handler 10 | from matrix_content_scanner.utils.types import JsonDict 11 | 12 | if TYPE_CHECKING: 13 | from matrix_content_scanner.mcs import MatrixContentScanner 14 | 15 | 16 | class PublicKeyHandler: 17 | def __init__(self, content_scanner: "MatrixContentScanner") -> None: 18 | self._crypto_handler = content_scanner.crypto_handler 19 | 20 | @web_handler 21 | async def handle_public_key(self, request: web.Request) -> Tuple[int, JsonDict]: 22 | """Handles GET requests to .../public_key""" 23 | return 200, {"public_key": self._crypto_handler.public_key} 24 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/servlets/scan.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
5 | from typing import TYPE_CHECKING, Optional, Tuple 6 | 7 | from aiohttp import web 8 | 9 | from matrix_content_scanner.servlets import get_media_metadata_from_request, web_handler 10 | from matrix_content_scanner.utils.errors import FileDirtyError 11 | from matrix_content_scanner.utils.types import JsonDict 12 | 13 | if TYPE_CHECKING: 14 | from matrix_content_scanner.mcs import MatrixContentScanner 15 | 16 | 17 | class ScanHandler: 18 | def __init__(self, content_scanner: "MatrixContentScanner"): 19 | self._scanner = content_scanner.scanner 20 | self._crypto_handler = content_scanner.crypto_handler 21 | 22 | async def _scan_and_format( 23 | self, 24 | media_path: str, 25 | metadata: Optional[JsonDict] = None, 26 | auth_header: Optional[str] = None, 27 | ) -> Tuple[int, JsonDict]: 28 | try: 29 | await self._scanner.scan_file(media_path, metadata, auth_header=auth_header) 30 | except FileDirtyError as e: 31 | res = {"clean": False, "info": e.info} 32 | else: 33 | res = {"clean": True, "info": "File is clean"} 34 | 35 | return 200, res 36 | 37 | @web_handler 38 | async def handle_plain(self, request: web.Request) -> Tuple[int, JsonDict]: 39 | """Handles GET requests to ../scan/serverName/mediaId""" 40 | media_path = request.match_info["media_path"] 41 | return await self._scan_and_format( 42 | media_path, auth_header=request.headers.get("Authorization") 43 | ) 44 | 45 | @web_handler 46 | async def handle_encrypted(self, request: web.Request) -> Tuple[int, JsonDict]: 47 | """Handles GET requests to ../scan_encrypted""" 48 | media_path, metadata = await get_media_metadata_from_request( 49 | request, self._crypto_handler 50 | ) 51 | return await self._scan_and_format( 52 | media_path, metadata, auth_header=request.headers.get("Authorization") 53 | ) 54 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/servlets/thumbnail.py: -------------------------------------------------------------------------------- 1 | 
# Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | from typing import TYPE_CHECKING, Tuple 6 | 7 | from aiohttp import web 8 | 9 | from matrix_content_scanner.servlets import _BytesResponse, web_handler 10 | 11 | if TYPE_CHECKING: 12 | from matrix_content_scanner.mcs import MatrixContentScanner 13 | 14 | 15 | class ThumbnailHandler: 16 | def __init__(self, content_scanner: "MatrixContentScanner"): 17 | self._scanner = content_scanner.scanner 18 | 19 | @web_handler 20 | async def handle_thumbnail( 21 | self, request: web.Request 22 | ) -> Tuple[int, _BytesResponse]: 23 | """Handles GET requests to .../thumbnail/serverName/mediaId""" 24 | media_path = request.match_info["media_path"] 25 | 26 | media = await self._scanner.scan_file( 27 | media_path=media_path, 28 | thumbnail_params=request.query, 29 | auth_header=request.headers.get("Authorization"), 30 | ) 31 | 32 | return 200, _BytesResponse( 33 | headers=media.response_headers, 34 | content=media.content, 35 | ) 36 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | -------------------------------------------------------------------------------- /src/matrix_content_scanner/utils/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
from enum import Enum


class ErrCode(str, Enum):
    """Machine-readable error codes used in the `reason` field of error responses.

    Codes prefixed with `M_` mirror standard Matrix error codes; codes prefixed
    with `MCS_` are specific to the content scanner.
    """

    # An unknown error happened.
    UNKNOWN = "M_UNKNOWN"
    # One of the following:
    #   - No route was found with the path and method provided in the request.
    #   - The homeserver does not have the requested piece of media.
    NOT_FOUND = "M_NOT_FOUND"
    # The access token is missing from the request.
    MISSING_TOKEN = "M_MISSING_TOKEN"
    # The provided access token is invalid.
    # One of the following:
    #   - the access token was never valid.
    #   - the access token has been logged out.
    #   - the access token has been soft logged out.
    #   - [Added in v1.3] the access token needs to be refreshed.
    UNKNOWN_TOKEN = "M_UNKNOWN_TOKEN"
    # The file failed the scan.
    NOT_CLEAN = "MCS_MEDIA_NOT_CLEAN"
    # The file could not be retrieved from the homeserver.
    # Does NOT cover homeserver responses with M_NOT_FOUND.
    REQUEST_FAILED = "MCS_MEDIA_REQUEST_FAILED"
    # The encrypted file could not be decrypted with the provided metadata.
    FAILED_TO_DECRYPT = "MCS_MEDIA_FAILED_TO_DECRYPT"
    # The request body isn't valid JSON, or is missing a required parameter.
    MALFORMED_JSON = "MCS_MALFORMED_JSON"
    # The Mime type is not in the allowed list of Mime types.
    MIME_TYPE_FORBIDDEN = "MCS_MIME_TYPE_FORBIDDEN"
--------------------------------------------------------------------------------
/src/matrix_content_scanner/utils/encrypted_file_metadata.py:
--------------------------------------------------------------------------------
# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
from jsonschema import ValidationError, validate

from matrix_content_scanner.utils.constants import ErrCode
from matrix_content_scanner.utils.errors import ContentScannerRestError
from matrix_content_scanner.utils.types import JsonDict

# This is a subset of the content of an m.room.message event that includes a file, with
# only the info that we need to locate and decrypt the file.
_encrypted_file_metadata_schema = {
    "type": "object",
    "required": ["file"],
    "properties": {
        "file": {
            "type": "object",
            "required": ["v", "iv", "url", "hashes", "key"],
            "properties": {
                "v": {"const": "v2"},
                "iv": {"type": "string"},
                "url": {"type": "string"},
                "hashes": {
                    "type": "object",
                    "required": ["sha256"],
                    "properties": {
                        "sha256": {"type": "string"},
                    },
                },
                "key": {
                    "type": "object",
                    "required": ["alg", "kty", "k", "key_ops", "ext"],
                    "properties": {
                        "alg": {"const": "A256CTR"},
                        "kty": {"const": "oct"},
                        "k": {"type": "string"},
                        "key_ops": {"type": "array", "items": {"type": "string"}},
                        "ext": {"const": True},
                    },
                },
            },
        },
    },
}


def _validate(body: JsonDict) -> None:
    """Validates the schema using jsonschema, and by checking whether the `key_ops` list
    includes at least `encrypt` and `decrypt`.

    Args:
        body: The body to validate.

    Raises:
        ValidationError if the jsonschema validation failed.
        ValueError if the `key_ops` list doesn't include at least `encrypt` and `decrypt`.
    """
    validate(body, _encrypted_file_metadata_schema)

    # We don't need to worry about triggering a KeyError/TypeError here because all of
    # these keys are marked as required in the schema, so at this point we know they're
    # here.
    key_ops = body["file"]["key"]["key_ops"]
    # We need the key_ops list to at least include "encrypt" and "decrypt", but we can't
    # check this with jsonschema, so we need to do it manually.
    if not set(key_ops).issuperset({"encrypt", "decrypt"}):
        raise ValueError('key_ops must contain at least "encrypt" and "decrypt"')


def validate_encrypted_file_metadata(body: JsonDict) -> None:
    """Validates the schema of the given dictionary, and turns any validation error
    raised into a client error.

    Args:
        body: The body to validate.

    Raises:
        ContentScannerRestError(400) if the validation failed.
    """
    # Run the validation and turn any error coming out of it into a REST error.
    # Chain the original exception (`from e`) so the root cause is preserved in
    # tracebacks instead of being reported as "during handling of the above
    # exception, another exception occurred".
    try:
        _validate(body)
    except ValidationError as e:
        raise ContentScannerRestError(400, ErrCode.MALFORMED_JSON, e.message) from e
    except ValueError as e:
        raise ContentScannerRestError(400, ErrCode.MALFORMED_JSON, str(e)) from e


# ---------------------------------------------------------------------------
# File: src/matrix_content_scanner/utils/errors.py
# ---------------------------------------------------------------------------
# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
from typing import Optional

from matrix_content_scanner.utils.constants import ErrCode


class ContentScannerRestError(Exception):
    """An error that is converted into an error response by the REST resource.

    Args:
        http_status: The HTTP status code of the response.
        reason: The error code to include in the response's JSON body.
        info: A human-readable message describing the error, if any.
    """

    def __init__(self, http_status: int, reason: ErrCode, info: Optional[str]) -> None:
        # Use zero-argument super() so the normal MRO is followed: the previous
        # super(Exception, self) form skipped Exception's slot in the MRO for
        # no benefit and would misbehave under multiple inheritance.
        super().__init__(info)
        self.http_status = http_status
        self.reason = reason
        self.info = info


class FileDirtyError(ContentScannerRestError):
    """An error indicating that the file being scanned is dirty."""

    def __init__(
        self,
        info: Optional[str] = "***VIRUS DETECTED***",
        cacheable: bool = True,
    ) -> None:
        """
        Args:
            info: The info string to serve to the client.
            cacheable: Whether raising this error should be recorded as a scan failure in
                the scanner's result cache.
        """
        super().__init__(
            http_status=403,
            reason=ErrCode.NOT_CLEAN,
            info=info,
        )

        self.cacheable = cacheable


class FileMimeTypeForbiddenError(ContentScannerRestError):
    """An error indicating that the file's MIME type is forbidden."""

    def __init__(self, info: Optional[str]) -> None:
        super().__init__(
            http_status=403,
            reason=ErrCode.MIME_TYPE_FORBIDDEN,
            info=info,
        )


class ConfigError(Exception):
    """An error indicating an issue with the configuration file."""


class WellKnownDiscoveryError(Exception):
    """An error indicating a failure when attempting a .well-known discovery."""


# ---------------------------------------------------------------------------
# File: src/matrix_content_scanner/utils/rust.py
# ---------------------------------------------------------------------------
# Copyright 2024 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.

import os
import sys
from hashlib import blake2b

import matrix_content_scanner
from matrix_content_scanner.mcs_rust import get_rust_file_digest


def check_rust_lib_up_to_date() -> None:
    """For editable installs check if the rust library is outdated and needs to
    be rebuilt.

    Raises:
        Exception: if the digest embedded in the compiled Rust module no longer
            matches the Rust sources on disk.
    """

    if not _dist_is_editable():
        return

    mcs_dir = os.path.dirname(matrix_content_scanner.__file__)
    mcs_root = os.path.abspath(os.path.join(mcs_dir, "../.."))

    # Double check we've not gone into site-packages...
    if os.path.basename(mcs_root) == "site-packages":
        return

    # ... and it looks like the root of a python project. This must be checked
    # relative to mcs_root: the previous check of a bare "pyproject.toml"
    # depended on the current working directory, so the whole verification was
    # silently skipped whenever the process wasn't started from the repo root.
    if not os.path.exists(os.path.join(mcs_root, "pyproject.toml")):
        return

    # Get the hash of all Rust source files
    digest = _hash_rust_files_in_directory(os.path.join(mcs_root, "rust", "src"))

    if digest != get_rust_file_digest():
        raise Exception("Rust module outdated. Please rebuild using `poetry install`")


def _hash_rust_files_in_directory(directory: str) -> str:
    """Get the hash of all files in a directory (recursively)"""

    directory = os.path.abspath(directory)

    # Collect the absolute path of every file under `directory`, walking the
    # tree iteratively.
    paths = []

    pending = [directory]
    while pending:
        current = pending.pop()
        with os.scandir(current) as entries:
            for entry in entries:
                if entry.is_dir():
                    pending.append(entry.path)
                else:
                    paths.append(entry.path)

    # We sort to make sure that we get a consistent and well-defined ordering.
    paths.sort()

    hasher = blake2b()

    # Entries in `paths` are already absolute (built from the absolute
    # `directory`), so they can be opened directly.
    for path in paths:
        with open(path, "rb") as f:
            hasher.update(f.read())

    return hasher.hexdigest()


def _dist_is_editable() -> bool:
    """Is distribution an editable install?"""
    for path_item in sys.path:
        egg_link = os.path.join(path_item, "matrix_content_scanner.pth")
        if os.path.isfile(egg_link):
            return True
    return False


# ---------------------------------------------------------------------------
# File: src/matrix_content_scanner/utils/types.py
# ---------------------------------------------------------------------------
# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
from typing import Any, Dict

import attr
from multidict import CIMultiDictProxy


@attr.s(auto_attribs=True)
class MediaDescription:
    """A description of a media."""

    content_type: str
    content: bytes
    response_headers: CIMultiDictProxy[str]
    cacheable: bool = True


# A JSON object/dictionary.
JsonDict = Dict[str, Any]


# ---------------------------------------------------------------------------
# File: tests/__init__.py
# ---------------------------------------------------------------------------
# Copyright 2022 New Vector Ltd
#
# SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
# Please see LICENSE files in the repository root for full details.
5 | -------------------------------------------------------------------------------- /tests/scanner/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | -------------------------------------------------------------------------------- /tests/scanner/test_file_downloader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | import json 6 | from typing import Dict, List, Optional, Tuple, Union 7 | from unittest import IsolatedAsyncioTestCase 8 | from unittest.mock import Mock, call 9 | 10 | from multidict import CIMultiDict, CIMultiDictProxy, MultiDictProxy 11 | 12 | from matrix_content_scanner.utils.errors import ( 13 | ContentScannerRestError, 14 | WellKnownDiscoveryError, 15 | ) 16 | from matrix_content_scanner.utils.types import JsonDict 17 | 18 | from tests.testutils import ( 19 | MEDIA_PATH, 20 | SMALL_PNG, 21 | get_base_media_headers, 22 | get_content_scanner, 23 | to_thumbnail_params, 24 | ) 25 | 26 | 27 | class FileDownloaderTestCase(IsolatedAsyncioTestCase): 28 | def setUp(self) -> None: 29 | # Set a fixed base URL so that .well-known discovery doesn't get in the way. 
30 | content_scanner = get_content_scanner( 31 | {"download": {"base_homeserver_url": "http://my-site.com"}} 32 | ) 33 | self.downloader = content_scanner.file_downloader 34 | 35 | self.media_status = 200 36 | self.media_body = SMALL_PNG 37 | self.media_headers = get_base_media_headers() 38 | 39 | async def _get( 40 | url: str, 41 | query: Optional[MultiDictProxy[str]] = None, 42 | auth_header: Optional[str] = None, 43 | ) -> Tuple[int, bytes, CIMultiDictProxy[str]]: 44 | """Mock for the _get method on the file downloader that doesn't serve a 45 | .well-known client file. 46 | """ 47 | if ( 48 | url.endswith( 49 | ( 50 | "/_matrix/media/v3/download/" + MEDIA_PATH, 51 | "/_matrix/media/r0/download/" + MEDIA_PATH, 52 | ) 53 | ) 54 | or "/_matrix/media/v3/thumbnail/" + MEDIA_PATH in url 55 | or "/_matrix/media/r0/thumbnail/" + MEDIA_PATH in url 56 | ): 57 | return self.media_status, self.media_body, self.media_headers 58 | if ( 59 | url.endswith(("/_matrix/client/v1/media/download/" + MEDIA_PATH,)) 60 | or "/_matrix/client/v1/media/thumbnail/" + MEDIA_PATH in url 61 | ): 62 | if auth_header is not None: 63 | return self.media_status, self.media_body, self.media_headers 64 | else: 65 | return 404, b"Not found", CIMultiDictProxy(CIMultiDict()) 66 | elif url.endswith("/.well-known/matrix/client"): 67 | return 404, b"Not found", CIMultiDictProxy(CIMultiDict()) 68 | 69 | raise RuntimeError("Unexpected request on %s" % url) 70 | 71 | # Mock _get so we don't actually try to download files. 72 | self.get_mock = Mock(side_effect=_get) 73 | self.downloader._get = self.get_mock # type: ignore[method-assign] 74 | 75 | async def test_download(self) -> None: 76 | """Tests that downloading a file works.""" 77 | media = await self.downloader.download_file(MEDIA_PATH) 78 | self.assertEqual(media.content, SMALL_PNG) 79 | self.assertEqual(media.content_type, "image/png") 80 | 81 | # Check that we tried downloading from the set base URL. 
82 | args = self.get_mock.call_args.args 83 | self.assertTrue(args[0].startswith("http://my-site.com/")) 84 | 85 | async def test_download_auth_media(self) -> None: 86 | """Tests that downloading a file works using authenticated media.""" 87 | media = await self.downloader.download_file( 88 | MEDIA_PATH, auth_header="Bearer access_token" 89 | ) 90 | self.assertEqual(media.content, SMALL_PNG) 91 | self.assertEqual(media.content_type, "image/png") 92 | 93 | # Check that we tried downloading from the set base URL. 94 | args = self.get_mock.call_args.args 95 | self.assertTrue(args[0].startswith("http://my-site.com/")) 96 | self.assertIn("/_matrix/client/v1/media/download/" + MEDIA_PATH, args[0]) 97 | 98 | async def test_download_auth_media_invalid_token(self) -> None: 99 | """Tests that downloading an authenticated media file with an invalid access 100 | token returns the correct error code. 101 | """ 102 | self.media_status = 401 103 | self.media_body = ( 104 | b'{"errcode":"M_UNKNOWN_TOKEN","error":"Invalid access token"}' 105 | ) 106 | self._set_headers({"content-type": ["application/json"]}) 107 | 108 | # Check that we fail at downloading the file. 109 | with self.assertRaises(ContentScannerRestError) as cm: 110 | await self.downloader.download_file( 111 | MEDIA_PATH, auth_header="Bearer access_token" 112 | ) 113 | 114 | self.assertEqual(cm.exception.http_status, 401) 115 | self.assertEqual(cm.exception.reason, "M_UNKNOWN_TOKEN") 116 | 117 | # Check that we tried downloading from the set base URL. 118 | args = self.get_mock.call_args.args 119 | self.assertTrue(args[0].startswith("http://my-site.com/")) 120 | self.assertIn("/_matrix/client/v1/media/download/" + MEDIA_PATH, args[0]) 121 | 122 | async def test_download_auth_media_missing_token(self) -> None: 123 | """Tests that downloading an authenticated media file with a missing access 124 | token returns the correct error code. 
125 | """ 126 | self.media_status = 401 127 | self.media_body = ( 128 | b'{"errcode":"M_MISSING_TOKEN","error":"Missing access token"}' 129 | ) 130 | self._set_headers({"content-type": ["application/json"]}) 131 | 132 | # Check that we fail at downloading the file. 133 | with self.assertRaises(ContentScannerRestError) as cm: 134 | await self.downloader.download_file( 135 | MEDIA_PATH, auth_header="Bearer access_token" 136 | ) 137 | 138 | self.assertEqual(cm.exception.http_status, 401) 139 | self.assertEqual(cm.exception.reason, "M_MISSING_TOKEN") 140 | 141 | # Check that we tried downloading from the set base URL. 142 | args = self.get_mock.call_args.args 143 | self.assertTrue(args[0].startswith("http://my-site.com/")) 144 | self.assertIn("/_matrix/client/v1/media/download/" + MEDIA_PATH, args[0]) 145 | 146 | async def test_no_base_url(self) -> None: 147 | """Tests that configuring a base homeserver URL means files are downloaded from 148 | that homeserver (rather than the one the files were uploaded to) and .well-known 149 | discovery is bypassed. 150 | """ 151 | self.downloader._base_url = None 152 | await self.downloader.download_file(MEDIA_PATH) 153 | 154 | # Check that we've tried making a .well-known discovery request before 155 | # downloading the file. 156 | self.assertEqual(self.get_mock.call_count, 2) 157 | self.assertEqual( 158 | self.get_mock.mock_calls[0], call("https://foo/.well-known/matrix/client") 159 | ) 160 | self.assertEqual( 161 | self.get_mock.mock_calls[1], 162 | call( 163 | "https://foo/_matrix/media/v3/download/" + MEDIA_PATH, 164 | query=None, 165 | auth_header=None, 166 | ), 167 | ) 168 | 169 | async def test_retry_on_404(self) -> None: 170 | """Tests that if we get a 404 when trying to download a file on a v3 path, we 171 | retry with an r0 path for backwards compatibility. 
172 | """ 173 | self.media_status = 404 174 | self.media_body = b"Not found" 175 | self._set_headers({"content-type": ["text/plain"]}) 176 | 177 | await self._test_retry() 178 | 179 | async def test_retry_on_unrecognised(self) -> None: 180 | """Tests that if we get a Synapse-style M_UNRECOGNIZED response when trying to 181 | download a file on a v3 path, we retry with an r0 path for backwards 182 | compatibility. 183 | """ 184 | self.media_status = 400 185 | self.media_body = b'{"errcode":"M_UNRECOGNIZED","error":"Unrecognized request"}' 186 | self._set_headers({"content-type": ["application/json"]}) 187 | 188 | await self._test_retry() 189 | 190 | async def _test_retry(self) -> None: 191 | """Tests that in a set specific case a failure to download a file from a v3 192 | download path means we retry the request on an r0 one for backwards compatibility. 193 | """ 194 | # Check that we eventually fail at downloading the file. 195 | with self.assertRaises(ContentScannerRestError) as cm: 196 | await self.downloader.download_file(MEDIA_PATH) 197 | 198 | self.assertEqual(cm.exception.http_status, 404) 199 | self.assertEqual(cm.exception.info, "File not found") 200 | 201 | # Check that we sent out two requests: one to the v3 path and one to the r0 path. 202 | self.assertEqual(self.get_mock.call_count, 2) 203 | self.assertEqual( 204 | self.get_mock.mock_calls[0], 205 | call( 206 | "http://my-site.com/_matrix/media/v3/download/" + MEDIA_PATH, 207 | query=None, 208 | auth_header=None, 209 | ), 210 | ) 211 | self.assertEqual( 212 | self.get_mock.mock_calls[1], 213 | call( 214 | "http://my-site.com/_matrix/media/r0/download/" + MEDIA_PATH, 215 | query=None, 216 | auth_header=None, 217 | ), 218 | ) 219 | 220 | async def test_no_retry(self) -> None: 221 | """Tests that in a set specific case a failure to download a file from a v1 222 | authenticated media download path means we don't retry the request. 
223 | """ 224 | self.media_status = 400 225 | self.media_body = b'{"errcode":"M_UNRECOGNIZED","error":"Unrecognized request"}' 226 | self._set_headers({"content-type": ["application/json"]}) 227 | 228 | # Check that we eventually fail at downloading the file. 229 | with self.assertRaises(ContentScannerRestError) as cm: 230 | await self.downloader.download_file( 231 | MEDIA_PATH, auth_header="Bearer access_token" 232 | ) 233 | 234 | self.assertEqual(cm.exception.http_status, 404) 235 | self.assertEqual(cm.exception.info, "File not found") 236 | 237 | # Check that we sent out only one request. 238 | self.assertEqual(self.get_mock.call_count, 1) 239 | self.assertEqual( 240 | self.get_mock.mock_calls[0], 241 | call( 242 | "http://my-site.com/_matrix/client/v1/media/download/" + MEDIA_PATH, 243 | query=None, 244 | auth_header="Bearer access_token", 245 | ), 246 | ) 247 | 248 | async def test_thumbnail(self) -> None: 249 | """Tests that we can download a thumbnail and that the parameters to generate the 250 | thumbnail are correctly passed on to the homeserver. 251 | """ 252 | await self.downloader.download_file( 253 | MEDIA_PATH, to_thumbnail_params({"height": "50"}) 254 | ) 255 | 256 | url: str = self.get_mock.call_args.args[0] 257 | query: CIMultiDictProxy[str] = self.get_mock.call_args.kwargs["query"] 258 | self.assertIn("/thumbnail/", url) 259 | self.assertIn("height", query) 260 | self.assertEqual(query.get("height"), "50", query.getall("height")) 261 | 262 | async def test_thumbnail_auth_media(self) -> None: 263 | """Tests that we can download a thumbnail and that the parameters to generate the 264 | thumbnail are correctly passed on to the homeserver using authenticated media. 
265 | """ 266 | await self.downloader.download_file( 267 | MEDIA_PATH, to_thumbnail_params({"height": "50"}), "Bearer access_token" 268 | ) 269 | 270 | url: str = self.get_mock.call_args.args[0] 271 | query: CIMultiDictProxy[str] = self.get_mock.call_args.kwargs["query"] 272 | self.assertIn("/thumbnail/", url) 273 | self.assertIn("/_matrix/client/v1/media/thumbnail/" + MEDIA_PATH, url) 274 | self.assertIn("height", query) 275 | self.assertEqual(query.get("height"), "50", query.getall("height")) 276 | 277 | async def test_multiple_content_type(self) -> None: 278 | """Tests that we raise an error if the homeserver responds with too many 279 | Content-Type headers. 280 | """ 281 | self._set_headers({"content-type": ["image/jpeg", "image/png"]}) 282 | 283 | with self.assertRaises(ContentScannerRestError) as cm: 284 | await self.downloader.download_file(MEDIA_PATH) 285 | 286 | self.assertEqual(cm.exception.http_status, 502) 287 | assert cm.exception.info is not None 288 | self.assertTrue("Content-Type" in cm.exception.info) 289 | 290 | async def test_no_content_type(self) -> None: 291 | """Tests that we raise an error if the homeserver responds with no Content-Type 292 | headers. 293 | """ 294 | self._set_headers({}) 295 | 296 | with self.assertRaises(ContentScannerRestError) as cm: 297 | await self.downloader.download_file(MEDIA_PATH) 298 | 299 | self.assertEqual(cm.exception.http_status, 502) 300 | assert cm.exception.info is not None 301 | self.assertTrue("Content-Type" in cm.exception.info) 302 | 303 | def _set_headers(self, headers: Dict[str, List[str]]) -> None: 304 | """Replace the headers set in setUp with ones constructed from the provided 305 | dictionary. 306 | 307 | Args: 308 | headers: The raw headers to set. 
309 | """ 310 | md: CIMultiDict[str] = CIMultiDict() 311 | for k, v in headers.items(): 312 | for el in v: 313 | md.add(k, el) 314 | 315 | self.media_headers = CIMultiDictProxy(md) 316 | 317 | 318 | class WellKnownDiscoveryTestCase(IsolatedAsyncioTestCase): 319 | def setUp(self) -> None: 320 | self.downloader = get_content_scanner().file_downloader 321 | 322 | self.well_known_status = 200 323 | self.well_known_body: Union[bytes, JsonDict] = b"" 324 | 325 | self.versions_status = 200 326 | 327 | async def _get( 328 | url: str, 329 | query: Optional[MultiDictProxy[str]] = None, 330 | auth_header: Optional[str] = None, 331 | ) -> Tuple[int, bytes, CIMultiDictProxy[str]]: 332 | """Mock for the _get method on the file downloader that serves a .well-known 333 | client file. 334 | """ 335 | if url.endswith("/.well-known/matrix/client"): 336 | if isinstance(self.well_known_body, bytes): 337 | body_bytes = self.well_known_body 338 | else: 339 | body_bytes = json.dumps(self.well_known_body).encode("utf-8") 340 | 341 | return ( 342 | self.well_known_status, 343 | body_bytes, 344 | CIMultiDictProxy(CIMultiDict()), 345 | ) 346 | elif url.endswith("/_matrix/client/versions"): 347 | return self.versions_status, b"{}", CIMultiDictProxy(CIMultiDict()) 348 | elif url.endswith("/_matrix/media/v3/download/" + MEDIA_PATH): 349 | return 200, SMALL_PNG, get_base_media_headers() 350 | 351 | raise RuntimeError("Unexpected request on %s" % url) 352 | 353 | # Mock _get so we don't actually try to download files. 354 | self.get_mock = Mock(side_effect=_get) 355 | self.downloader._get = self.get_mock # type: ignore[method-assign] 356 | 357 | async def test_discover(self) -> None: 358 | """Checks that the base URL to use to download files can be discovered via 359 | .well-known discovery. 
360 | """ 361 | self.well_known_body = {"m.homeserver": {"base_url": "https://foo.bar"}} 362 | 363 | await self.downloader.download_file(MEDIA_PATH) 364 | 365 | # Check that we got 3 calls: 366 | # * one to retrieve the .well-known file 367 | # * one to check that the base URL can be used to interact with a homeserver 368 | # (by hitting the /_matrix/client/versions endpoint) 369 | # * one to download the file 370 | self.assertEqual(self.get_mock.call_count, 3, self.get_mock.mock_calls) 371 | 372 | calls = self.get_mock.mock_calls 373 | 374 | self.assertEqual(calls[0], call("https://foo/.well-known/matrix/client")) 375 | self.assertTrue(calls[1], call("https://foo.bar/_matrix/client/versions")) 376 | self.assertTrue( 377 | calls[2], call("https://foo.bar/_matrix/media/v3/download/" + MEDIA_PATH) 378 | ) 379 | 380 | async def test_error_status(self) -> None: 381 | """Tests that we raise a WellKnownDiscoveryError if the server responded with an 382 | error.""" 383 | self.well_known_status = 401 384 | await self._assert_discovery_fail() 385 | 386 | async def test_malformed_content(self) -> None: 387 | """Tests that we raise a WellKnownDiscoveryError if the server responded with a 388 | body that isn't compliant with the Matrix specification.""" 389 | self.well_known_body = {"m.homeserver": "https://foo.bar"} 390 | await self._assert_discovery_fail() 391 | 392 | async def test_not_valid_homeserver(self) -> None: 393 | """Tests that we raise a WellKnownDiscoveryError if the server at the provided 394 | base URL isn't a Matrix homeserver.""" 395 | self.versions_status = 404 396 | await self._assert_discovery_fail() 397 | 398 | async def test_404_no_fail(self) -> None: 399 | """Tests that we don't raise a WellKnownDiscoveryError if the .well-known file 400 | couldn't be found, and that we return None instead of the discovered base URL in 401 | this case. 
402 | """ 403 | self.well_known_status = 404 404 | res = await self.downloader._discover_via_well_known("foo") 405 | self.assertIsNone(res) 406 | 407 | async def _assert_discovery_fail(self) -> None: 408 | """Checks that .well-known discovery fails and raises a WellKnownDiscoveryError.""" 409 | with self.assertRaises(WellKnownDiscoveryError): 410 | await self.downloader._discover_via_well_known("foo") 411 | -------------------------------------------------------------------------------- /tests/scanner/test_scanner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | import asyncio 6 | import copy 7 | from typing import Any, Dict, List, Optional 8 | from unittest import IsolatedAsyncioTestCase 9 | from unittest.mock import AsyncMock, Mock 10 | 11 | from multidict import CIMultiDict, CIMultiDictProxy 12 | 13 | from matrix_content_scanner.scanner.scanner import CacheEntry 14 | from matrix_content_scanner.utils.constants import ErrCode 15 | from matrix_content_scanner.utils.errors import ( 16 | ContentScannerRestError, 17 | FileDirtyError, 18 | FileMimeTypeForbiddenError, 19 | ) 20 | from matrix_content_scanner.utils.types import MediaDescription 21 | 22 | from tests.testutils import ( 23 | ENCRYPTED_FILE_METADATA, 24 | MEDIA_PATH, 25 | SMALL_BINARY_FILE, 26 | SMALL_PNG, 27 | SMALL_PNG_ENCRYPTED, 28 | SMALL_TEXT_FILE, 29 | get_content_scanner, 30 | to_thumbnail_params, 31 | ) 32 | 33 | 34 | class ScannerTestCase(IsolatedAsyncioTestCase): 35 | def setUp(self) -> None: 36 | self.downloader_res = MediaDescription( 37 | content_type="image/png", 38 | content=SMALL_PNG, 39 | response_headers=CIMultiDictProxy(CIMultiDict()), 40 | ) 41 | 42 | async def download_file( 43 | media_path: str, 44 | thumbnail_params: Optional[Dict[str, List[str]]] = None, 45 
| auth_header: Optional[str] = None, 46 | ) -> MediaDescription: 47 | """Mock for the file downloader's `download_file` method.""" 48 | return self.downloader_res 49 | 50 | self.downloader_mock = Mock(side_effect=download_file) 51 | 52 | # Mock download_file so we don't actually try to download files. 53 | mcs = get_content_scanner() 54 | mcs.file_downloader.download_file = self.downloader_mock # type: ignore[method-assign] 55 | self.scanner = mcs.scanner 56 | 57 | async def test_scan(self) -> None: 58 | """Tests that we can scan files and that the scanner returns the media scanned if 59 | the scan was successful. 60 | """ 61 | media = await self.scanner.scan_file(MEDIA_PATH) 62 | self.assertEqual(media.content, SMALL_PNG) 63 | 64 | async def test_scan_dirty(self) -> None: 65 | """Tests that the scanner raises a FileDirtyError if the scan fails.""" 66 | self.scanner._script = "false" 67 | with self.assertRaises(FileDirtyError): 68 | await self.scanner.scan_file(MEDIA_PATH) 69 | 70 | async def test_encrypted_file(self) -> None: 71 | """Tests that the scanner can decrypt and scan encrypted files, and that if the 72 | scan is successful it returns the encrypted file and not the decrypted version. 73 | """ 74 | self._setup_encrypted() 75 | 76 | media = await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA) 77 | self.assertEqual(media.content, SMALL_PNG_ENCRYPTED) 78 | 79 | async def test_cache(self) -> None: 80 | """Tests that scan results are cached.""" 81 | # Scan the file a first time, and check that the downloader has been called. 82 | await self.scanner.scan_file(MEDIA_PATH) 83 | self.assertEqual(self.downloader_mock.call_count, 1) 84 | 85 | # Scan the file a second time, and check that the downloader has not been called 86 | # this time. 
87 | media = await self.scanner.scan_file(MEDIA_PATH) 88 | self.assertEqual(self.downloader_mock.call_count, 1) 89 | self.assertEqual(media.content, SMALL_PNG) 90 | 91 | async def test_cache_encrypted(self) -> None: 92 | """Tests that scan results for encrypted files are cached, and that the cached 93 | file is the encrypted version, not the decrypted one.""" 94 | self._setup_encrypted() 95 | 96 | # Scan the file a first time, and check that the downloader has been called. 97 | await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA) 98 | self.assertEqual(self.downloader_mock.call_count, 1) 99 | 100 | # Scan the file a second time, and check that the downloader has not been called 101 | # this time, and that the media returned is the encrypted copy. 102 | media = await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA) 103 | self.assertEqual(self.downloader_mock.call_count, 1) 104 | self.assertEqual(media.content, SMALL_PNG_ENCRYPTED) 105 | 106 | async def test_cache_download_thumbnail(self) -> None: 107 | """Tests that cached results for full file downloads are not used for thumbnails.""" 108 | await self.scanner.scan_file(MEDIA_PATH) 109 | self.assertEqual(self.downloader_mock.call_count, 1) 110 | 111 | await self.scanner.scan_file( 112 | MEDIA_PATH, thumbnail_params=to_thumbnail_params({"width": "50"}) 113 | ) 114 | self.assertEqual(self.downloader_mock.call_count, 2) 115 | 116 | async def test_cache_thumbnail_params(self) -> None: 117 | """Tests that cached results for thumbnails are only used if the generation 118 | parameters are the same. 119 | """ 120 | # Scan a thumbnail and check that the downloader was called. 
121 | await self.scanner.scan_file( 122 | MEDIA_PATH, thumbnail_params=to_thumbnail_params({"width": "50"}) 123 | ) 124 | self.assertEqual(self.downloader_mock.call_count, 1) 125 | 126 | # Scan the thumbnail again and check that the cache result was used (since the 127 | # downloader was not called) 128 | await self.scanner.scan_file( 129 | MEDIA_PATH, thumbnail_params=to_thumbnail_params({"width": "50"}) 130 | ) 131 | self.assertEqual(self.downloader_mock.call_count, 1) 132 | 133 | # Scan a different thumbnail of the same media (with different parameters) and 134 | # check that the downloader was called. 135 | await self.scanner.scan_file( 136 | MEDIA_PATH, thumbnail_params=to_thumbnail_params({"height": "50"}) 137 | ) 138 | self.assertEqual(self.downloader_mock.call_count, 2) 139 | 140 | async def test_cache_max_size(self) -> None: 141 | """Tests that we don't cache files if they exceed the configured maximum file 142 | size. 143 | """ 144 | # Set the maximum file size to be just under the size of the file. 145 | self.scanner._max_size_to_cache = len(SMALL_PNG) - 1 146 | 147 | # Scan the file a first time, and check that the downloader has been called. 148 | await self.scanner.scan_file(MEDIA_PATH) 149 | self.assertEqual(self.downloader_mock.call_count, 1) 150 | 151 | # Scan the file a second time, and check that the downloader has been called 152 | # again. 153 | media = await self.scanner.scan_file(MEDIA_PATH) 154 | self.assertEqual(self.downloader_mock.call_count, 2) 155 | self.assertEqual(media.content, SMALL_PNG) 156 | 157 | async def test_cache_max_size_mismatching_hash(self) -> None: 158 | """Tests that we re-scan big files if the hash we have cached for them does not 159 | match the hash of the newly downloaded content. 160 | """ 161 | # Mock the _run_scan command so we can keep track of its call count. 
162 | mock_runner = AsyncMock(return_value=0) 163 | self.scanner._run_scan = mock_runner # type: ignore[method-assign] 164 | 165 | # Calculate the cache key for this file so we can look it up later. 166 | cache_key = self.scanner._get_cache_key_for_file(MEDIA_PATH, None, None) 167 | 168 | # Set the maximum file size to be just under the size of the file. 169 | self.scanner._max_size_to_cache = len(SMALL_PNG) - 1 170 | 171 | # Make sure the cache is empty. 172 | self.assertEqual(len(self.scanner._result_cache), 0) 173 | 174 | # Scan the file a first time, and check that the file has been scanned. 175 | await self.scanner.scan_file(MEDIA_PATH) 176 | self.assertEqual(self.downloader_mock.call_count, 1) 177 | mock_runner.assert_called_once() 178 | 179 | # Test that the file has been cached. 180 | self.assertIn(cache_key, self.scanner._result_cache) 181 | 182 | # Change the hash of the cache entry to force it to be scanned again. 183 | entry: CacheEntry = self.scanner._result_cache[cache_key] 184 | self.scanner._result_cache[cache_key] = CacheEntry( 185 | result=entry.result, 186 | media=entry.media, 187 | media_hash="BAD_HASH", 188 | info=entry.info, 189 | ) 190 | 191 | # Run the scanner again and check that the cache entry for the file has been 192 | # discarded (i.e. the scan is run again). 193 | await self.scanner.scan_file(MEDIA_PATH) 194 | self.assertEqual(mock_runner.call_count, 2) 195 | 196 | # Also check that the file has only been re-downloaded once. 197 | self.assertEqual(self.downloader_mock.call_count, 2) 198 | 199 | async def test_different_encryption_key(self) -> None: 200 | """Tests that if some of the file's metadata changed, we don't match against the 201 | cache and we download the file again. 202 | 203 | Also tests that the scanner fails in the correct way if it can't decrypt a file. 204 | """ 205 | self._setup_encrypted() 206 | 207 | # Scan the file and check that the downloader was called. 
208 | await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA) 209 | self.assertEqual(self.downloader_mock.call_count, 1) 210 | 211 | # Copy the file metadata and change the key. 212 | modified_metadata = copy.deepcopy(ENCRYPTED_FILE_METADATA) 213 | modified_metadata["file"]["key"]["k"] = "somethingelse" 214 | 215 | # This causes the scanner to not be able to decrypt the file. 216 | with self.assertRaises(ContentScannerRestError) as cm: 217 | await self.scanner.scan_file(MEDIA_PATH, modified_metadata) 218 | 219 | self.assertEqual(cm.exception.http_status, 400) 220 | self.assertEqual(cm.exception.reason, ErrCode.FAILED_TO_DECRYPT) 221 | 222 | # But it also causes it to be downloaded again because its metadata have changed. 223 | self.assertEqual(self.downloader_mock.call_count, 2) 224 | 225 | async def test_allowlist_mimetype(self) -> None: 226 | """Tests that, if there's an allow list for MIME types and the file's MIME type 227 | isn't in it, the file's scan fails. 228 | """ 229 | # Set an allow list that only allows JPEG files. 230 | self.scanner._allowed_mimetypes = ["image/jpeg"] 231 | 232 | # Check that the scan fails since the file is a PNG. 233 | with self.assertRaises(FileMimeTypeForbiddenError): 234 | await self.scanner.scan_file(MEDIA_PATH) 235 | 236 | async def test_allowlist_mimetype_encrypted(self) -> None: 237 | """Tests that the file's MIME type is correctly detected and compared with the 238 | allow list (if set), even if it's encrypted. 239 | """ 240 | self._setup_encrypted() 241 | 242 | # Set an allow list that only allows JPEG files. 243 | self.scanner._allowed_mimetypes = ["image/jpeg"] 244 | 245 | # Check that the scan fails since the file is a PNG. 
246 |         with self.assertRaises(FileMimeTypeForbiddenError):
247 |             await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA)
248 | 
249 |     async def test_blocklist_mimetype(self) -> None:
250 |         """Tests that, if there's a block list for MIME types and the file's MIME type
251 |         is in it, the file's scan fails.
252 |         """
253 |         # Set a block list that blocks PNG images.
254 |         self.scanner._blocked_mimetypes = ["image/png"]
255 | 
256 |         # Check that the scan fails since the file is a PNG.
257 |         with self.assertRaises(FileMimeTypeForbiddenError):
258 |             await self.scanner.scan_file(MEDIA_PATH)
259 | 
260 |     async def test_blocklist_mimetype_encrypted(self) -> None:
261 |         """Tests that the file's MIME type is correctly detected and compared with the
262 |         block list (if set), even if it's encrypted.
263 |         """
264 |         self._setup_encrypted()
265 | 
266 |         # Set a block list that blocks PNG images.
267 |         self.scanner._blocked_mimetypes = ["image/png"]
268 | 
269 |         # Check that the scan fails since the file is a PNG.
270 |         with self.assertRaises(FileMimeTypeForbiddenError):
271 |             await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA)
272 | 
273 |     async def test_blocklist_mimetype_fallback_binary_file(self) -> None:
274 |         """Tests that unrecognised binary files' MIME type is assumed to be
275 |         `application/octet-stream` and that they can be blocked in this way.
276 |         """
277 | 
278 |         self.downloader_res = MediaDescription(
279 |             # This is the *claimed* content-type by the uploader
280 |             content_type="application/vnd.io.element.generic_binary_file",
281 |             content=SMALL_BINARY_FILE,
282 |             response_headers=CIMultiDictProxy(CIMultiDict()),
283 |         )
284 | 
285 |         # Set a block list that blocks uncategorised binary files.
286 | self.scanner._blocked_mimetypes = ["application/octet-stream"] 287 | 288 | with self.assertRaises(FileMimeTypeForbiddenError): 289 | await self.scanner.scan_file(MEDIA_PATH) 290 | 291 | async def test_blocklist_mimetype_fallback_text_file(self) -> None: 292 | """Tests that unrecognised text files' MIME type is assumed to be 293 | `text/plain` and that they can be blocked in this way. 294 | """ 295 | 296 | self.downloader_res = MediaDescription( 297 | # This is the *claimed* content-type by the uploader 298 | content_type="application/vnd.io.element.generic_file", 299 | content=SMALL_TEXT_FILE, 300 | response_headers=CIMultiDictProxy(CIMultiDict()), 301 | ) 302 | 303 | # Set a block list that blocks uncategorised text files. 304 | self.scanner._blocked_mimetypes = ["text/plain"] 305 | 306 | with self.assertRaises(FileMimeTypeForbiddenError): 307 | await self.scanner.scan_file(MEDIA_PATH) 308 | 309 | async def test_dont_cache_exit_codes(self) -> None: 310 | """Tests that if the configuration specifies exit codes to ignore when running 311 | the scanning script, we don't cache them. 312 | """ 313 | self.scanner._exit_codes_to_ignore = [5] 314 | 315 | # It's tricky to give a value to `scanner._script` that makes `_run_scan` return 5 316 | # directly, so we just mock it here. 317 | run_scan_mock = AsyncMock(return_value=5) 318 | self.scanner._run_scan = run_scan_mock # type: ignore[method-assign] 319 | 320 | # Scan the file, we'll check later that it wasn't cached. 321 | with self.assertRaises(FileDirtyError): 322 | await self.scanner.scan_file(MEDIA_PATH) 323 | 324 | self.assertEqual(self.downloader_mock.call_count, 1) 325 | 326 | # Update the mock so that the file is cached at the next scan. 327 | run_scan_mock.return_value = 1 328 | 329 | # Scan the file again to check that the file wasn't cached. 
330 | with self.assertRaises(FileDirtyError): 331 | await self.scanner.scan_file(MEDIA_PATH) 332 | 333 | self.assertEqual(self.downloader_mock.call_count, 2) 334 | 335 | # The file should be cached now. 336 | with self.assertRaises(FileDirtyError): 337 | await self.scanner.scan_file(MEDIA_PATH) 338 | 339 | self.assertEqual(self.downloader_mock.call_count, 2) 340 | 341 | async def test_outside_temp_dir(self) -> None: 342 | """Tests that a scan is failed if the media path is formed in a way that would 343 | cause the scanner to write outside of the configured directory. 344 | """ 345 | with self.assertRaises(FileDirtyError): 346 | await self.scanner.scan_file("../bar") 347 | 348 | async def test_invalid_media_path(self) -> None: 349 | """Tests that a scan fails if the media path is invalid.""" 350 | with self.assertRaises(FileDirtyError): 351 | await self.scanner.scan_file(MEDIA_PATH + "/baz") 352 | 353 | async def test_deduplicate_scans(self) -> None: 354 | """Tests that if two scan requests come in for the same file and with the same 355 | parameter, only one download/scan happens. 356 | """ 357 | 358 | # Change the Mock's side effect to introduce some delay, to simulate a long 359 | # download time. We sleep asynchronously to allow additional scans requests to be 360 | # processed. 361 | async def _scan_file(*args: Any) -> MediaDescription: 362 | await asyncio.sleep(0.2) 363 | 364 | return self.downloader_res 365 | 366 | scan_mock = Mock(side_effect=_scan_file) 367 | self.scanner._scan_file = scan_mock # type: ignore[method-assign] 368 | 369 | # Request two scans of the same file at the same time. 370 | results = await asyncio.gather( 371 | asyncio.create_task(self.scanner.scan_file(MEDIA_PATH)), 372 | asyncio.create_task(self.scanner.scan_file(MEDIA_PATH)), 373 | ) 374 | 375 | # Check that the scanner has only been called once, meaning that the second 376 | # call did not trigger a scan. 
377 | scan_mock.assert_called_once() 378 | 379 | # Check that we got two results, and that we actually got the correct media 380 | # description in the second scan. 381 | self.assertEqual(len(results), 2, results) 382 | self.assertEqual(results[0].content, results[1].content, results) 383 | 384 | def _setup_encrypted(self) -> None: 385 | """Sets up class properties to make the downloader return an encrypted file 386 | instead of a plain text one. 387 | """ 388 | self.downloader_res.content_type = "application/octet-stream" 389 | self.downloader_res.content = SMALL_PNG_ENCRYPTED 390 | -------------------------------------------------------------------------------- /tests/servlets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | -------------------------------------------------------------------------------- /tests/servlets/test_scan.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | from http import HTTPStatus 6 | from unittest.mock import patch 7 | 8 | from aiohttp.test_utils import AioHTTPTestCase 9 | from aiohttp.web_app import Application 10 | from multidict import CIMultiDict 11 | 12 | from matrix_content_scanner.httpserver import HTTPServer 13 | from matrix_content_scanner.utils.constants import ErrCode 14 | from matrix_content_scanner.utils.errors import ContentScannerRestError 15 | 16 | from tests.testutils import get_content_scanner 17 | 18 | SERVER_NAME = "test" 19 | 20 | 21 | class TestScanHandler(AioHTTPTestCase): 22 | def setUp(self) -> None: 23 | # Bypass well-known lookups. 
24 | self.scanner = get_content_scanner( 25 | {"download": {"base_homeserver_url": "http://my-site.com"}} 26 | ) 27 | 28 | async def get_application(self) -> Application: 29 | return HTTPServer(self.scanner)._app 30 | 31 | async def test_media_not_found_on_remote_homeserver(self) -> None: 32 | """Missing media on the remote HS should be presented as a 404 to the client.""" 33 | patch_downloader = patch.object( 34 | self.scanner.file_downloader, 35 | "_get", 36 | return_value=(HTTPStatus.NOT_FOUND, b"", CIMultiDict()), 37 | ) 38 | 39 | with patch_downloader: 40 | async with self.client.get( 41 | f"/_matrix/media_proxy/unstable/download/{SERVER_NAME}/media-does-not-exist" 42 | ) as resp: 43 | self.assertEqual(resp.status, 404) 44 | body = await resp.json() 45 | self.assertEqual(body["reason"], "M_NOT_FOUND", body) 46 | 47 | async def test_remote_homeserver_unreachable(self) -> None: 48 | """An unreachable HS should be presented as a 502 to the client.""" 49 | patch_downloader = patch.object( 50 | self.scanner.file_downloader, 51 | "_get", 52 | side_effect=ContentScannerRestError( 53 | HTTPStatus.BAD_GATEWAY, 54 | ErrCode.REQUEST_FAILED, 55 | "dodgy network timeout :(((", 56 | ), 57 | ) 58 | 59 | with patch_downloader: 60 | async with self.client.get( 61 | f"/_matrix/media_proxy/unstable/download/{SERVER_NAME}/media-does-not-exist" 62 | ) as resp: 63 | self.assertEqual(resp.status, 502) 64 | body = await resp.json() 65 | self.assertEqual(body["reason"], "MCS_MEDIA_REQUEST_FAILED", body) 66 | -------------------------------------------------------------------------------- /tests/servlets/test_servlets.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
5 | import json 6 | import unittest 7 | 8 | from matrix_content_scanner.servlets import _metadata_from_body 9 | from matrix_content_scanner.utils.constants import ErrCode 10 | from matrix_content_scanner.utils.errors import ContentScannerRestError 11 | from matrix_content_scanner.utils.types import JsonDict 12 | 13 | from tests.testutils import ENCRYPTED_FILE_METADATA, get_content_scanner 14 | 15 | 16 | class EncryptedFileMetadataTestCase(unittest.TestCase): 17 | def setUp(self) -> None: 18 | self.crypto_handler = get_content_scanner().crypto_handler 19 | 20 | def test_unencrypted(self) -> None: 21 | """Tests that the _metadata_from_body function correctly returns non-encrypted 22 | metadata. 23 | """ 24 | metadata = _metadata_from_body(ENCRYPTED_FILE_METADATA, self.crypto_handler) 25 | self.assertEqual(metadata, ENCRYPTED_FILE_METADATA) 26 | 27 | def test_encrypted(self) -> None: 28 | """Tests that the _metadata_from_body function correctly decrypts Olm-encrypted 29 | metadata and returns a decrypted version. 30 | """ 31 | encrypted_body = self._encrypt_body(ENCRYPTED_FILE_METADATA) 32 | metadata = _metadata_from_body(encrypted_body, self.crypto_handler) 33 | self.assertEqual(metadata, ENCRYPTED_FILE_METADATA) 34 | 35 | def test_bad_json(self) -> None: 36 | """Tests that the _metadata_from_body function raises a REST error if the request 37 | body is not a valid JSON object. 38 | """ 39 | with self.assertRaises(ContentScannerRestError) as cm: 40 | _metadata_from_body("foo", self.crypto_handler) # type: ignore[arg-type] 41 | 42 | self.assertEqual(cm.exception.reason, ErrCode.MALFORMED_JSON) 43 | 44 | def _encrypt_body(self, content: JsonDict) -> JsonDict: 45 | """Encrypts the provided dictionary with Olm's PkEncryption class. 46 | 47 | Args: 48 | content: The dictionary to encrypt. 49 | 50 | Returns: 51 | An encrypted version of the dictionary in the format that's expected in POST 52 | requests. 
53 | """ 54 | msg = self.crypto_handler.encrypt( 55 | self.crypto_handler.public_key, json.dumps(content) 56 | ) 57 | 58 | return { 59 | "encrypted_body": { 60 | "ciphertext": msg.ciphertext, 61 | "mac": msg.mac, 62 | "ephemeral": msg.ephemeral_key, 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tests/test_crypto.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | import json 6 | import unittest 7 | 8 | from tests.testutils import get_content_scanner 9 | 10 | 11 | class CryptoHandlerTestCase(unittest.TestCase): 12 | def setUp(self) -> None: 13 | self.crypto_handler = get_content_scanner().crypto_handler 14 | 15 | def test_decrypt(self) -> None: 16 | """Tests that an Olm-encrypted payload is successfully decrypted.""" 17 | payload = {"foo": "bar"} 18 | 19 | # Encrypt the payload with PkEncryption. 20 | encrypted = self.crypto_handler.encrypt( 21 | self.crypto_handler.public_key, json.dumps(payload) 22 | ) 23 | 24 | # Decrypt the payload with the crypto handler. 25 | decrypted = json.loads( 26 | get_content_scanner().crypto_handler.decrypt_body( 27 | encrypted.ciphertext, 28 | encrypted.mac, 29 | encrypted.ephemeral_key, 30 | ) 31 | ) 32 | 33 | # Check that the decrypted payload is the same as the original one before 34 | # encryption. 35 | self.assertEqual(decrypted, payload) 36 | -------------------------------------------------------------------------------- /tests/testutils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
5 | import os 6 | from binascii import unhexlify 7 | from typing import Dict, Optional 8 | 9 | from multidict import CIMultiDict, CIMultiDictProxy, MultiDict, MultiDictProxy 10 | 11 | from matrix_content_scanner.config import MatrixContentScannerConfig 12 | from matrix_content_scanner.mcs import MatrixContentScanner 13 | from matrix_content_scanner.utils.types import JsonDict 14 | 15 | # The media path to use in tests. 16 | MEDIA_PATH = "foo/bar" 17 | 18 | # A small, unencrypted PNG. 19 | SMALL_PNG = unhexlify( 20 | b"89504e470d0a1a0a0000000d4948445200000001000000010806" 21 | b"0000001f15c4890000000a49444154789c63000100000500010d" 22 | b"0a2db40000000049454e44ae426082" 23 | ) 24 | 25 | # A small binary file without any specific format. 26 | SMALL_BINARY_FILE = unhexlify(b"010203") 27 | 28 | # A small text file without any specific format. 29 | SMALL_TEXT_FILE = b"Hello world\nThis is a tiny text file" 30 | 31 | # A small, encrypted PNG. 32 | SMALL_PNG_ENCRYPTED = unhexlify( 33 | b"9fd28dd7a1d845a04948f13af104e39402c888f7b601bce313ad" 34 | b"bf3e2423f67d93d5e304efc147d46df511abacbb8ae7e2e8156c" 35 | b"2e08de86c31fdc6aa5bd4d11537e5657102a83214d13d7ff57e6" 36 | b"d35940f149fbd1e661a260b0b6fe465e4e0a7c8039c08d78f679" 37 | b"cde511be94c685eee50571858d99d0c84918381aea3e52319509" 38 | b"36cac1a7b2ec46c980f3c3995eaf21fc2711b0de8ff014ff5fe7" 39 | b"4a7fcb3515df4f1f2ceeae72d7b58bc69d56dedf31fd430ac2ce" 40 | b"8aee9fcb150a1af9fdee30ac26d68d3db77c1adec5f68cad78f9" 41 | b"ed6ef9156ba23b76e38dfd59cb077c964248f331d43147dc7fa7" 42 | b"b61baf7546e5edfd78347828386b64b3a1ebdff0dcd55ea57f4b" 43 | b"b73b06fbedff62ef8a7fd89146fd11723e739d541d07bf399837" 44 | b"3ed56cb9ef475bd409e590258cdb6a0cdf4871882c334c2897c4" 45 | b"ea0dc76748e727a71d8c2e85253b2c80667f5d98ddbcf8fb90ba" 46 | b"adceb6e75a2741b740dc0d084d55cc20dd7369e7041529b62ce1" 47 | b"59bcde9d9a0f4978093cd52dfe77107613d2bc265519177ed623" 48 | b"49d70517ecf4a243fb7c20db411459766785ee6f039f68383a62" 49 | 
b"375b14cdf405401dc4aabf6812d9803218544d1ccdc9339e81cb" 50 | b"b36acb3414e8dfb49521b89f1b6d54a712da35e45462844a622c" 51 | b"aa92313335d317201e1eab5f34daba5358fde87648b24868b098" 52 | b"505916b8bc997b19976487718835f0d54a8794e24ca19240cad1" 53 | b"61e0624d8df2214edd3c33ae2b5156e2ef7191d75528f9c26a89" 54 | b"4a" 55 | ) 56 | 57 | # The metadata necessary to download and decrypt SMALL_PNG_ENCRYPTED 58 | ENCRYPTED_FILE_METADATA: JsonDict = { 59 | "file": { 60 | "v": "v2", 61 | "key": { 62 | "alg": "A256CTR", 63 | "ext": True, 64 | "k": "F3miZm2vZhucJ062AuKMUwmd-O6AK0AXP29p4MKtq3Q", 65 | "key_ops": ["encrypt", "decrypt"], 66 | "kty": "oct", 67 | }, 68 | "iv": "rJqtSdi3F/EAAAAAAAAAAA", 69 | "hashes": {"sha256": "NYvGRRQGfyWpXSUpba+ozSbehFP6kw5ZDg0xMppyX8c"}, 70 | "url": "mxc://" + MEDIA_PATH, 71 | } 72 | } 73 | 74 | 75 | def to_thumbnail_params(params: Dict[str, str]) -> MultiDictProxy[str]: 76 | """Turn the given dictionary into query parameters as they'd appear when processing a 77 | thumbnailing request. 78 | 79 | Args: 80 | params: The raw parameters. 81 | 82 | Returns: 83 | A multidict that can be passed onto the scanner or the file downloader. 84 | """ 85 | return MultiDictProxy(MultiDict(params)) 86 | 87 | 88 | def get_base_media_headers() -> CIMultiDictProxy[str]: 89 | """Get the base headers necessary to react to a download request for SMALL_PNG. 90 | 91 | Returns: 92 | The headers to pass onto the file downloader. 93 | """ 94 | return CIMultiDictProxy(CIMultiDict({"content-type": "image/png"})) 95 | 96 | 97 | def get_content_scanner(config: Optional[JsonDict] = None) -> MatrixContentScanner: 98 | """Instantiates an instance of the content scanner. 99 | 100 | Args: 101 | config: The optional provided config. 102 | """ 103 | # Create the temporary directory that we'll use so the scanner doesn't complain about 104 | # it not existing. 
105 | os.makedirs(os.path.abspath("temp"), exist_ok=True) 106 | 107 | # We define the default configuration here rather than as a constant outside of a 108 | # function because otherwise a test that sets its own config would have side effects 109 | # on the config used for other tests. 110 | default_config = { 111 | "scan": { 112 | "script": "true", 113 | "temp_directory": "temp", 114 | }, 115 | "web": { 116 | "host": "127.0.0.1", 117 | "port": 8080, 118 | }, 119 | "crypto": { 120 | "pickle_path": "mcs_pickle.txt", 121 | "pickle_key": "foo", 122 | }, 123 | } 124 | 125 | if config is None: 126 | config = {} 127 | 128 | # Update the configuration provided with some default settings. 129 | # Note that `update` does not update nested dictionaries (only the top level), so 130 | # e.g. if a configuration with a `scan` section is provided it will need to include 131 | # all required settings in that section. 132 | default_config.update(config) 133 | 134 | parsed_config = MatrixContentScannerConfig(default_config) 135 | 136 | return MatrixContentScanner(parsed_config) 137 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 5 | -------------------------------------------------------------------------------- /tests/utils/test_encrypted_file_metadata.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 New Vector Ltd 2 | # 3 | # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial 4 | # Please see LICENSE files in the repository root for full details. 
5 | import copy
6 | import unittest
7 | 
8 | from matrix_content_scanner.utils.constants import ErrCode
9 | from matrix_content_scanner.utils.encrypted_file_metadata import (
10 |     validate_encrypted_file_metadata,
11 | )
12 | from matrix_content_scanner.utils.errors import ContentScannerRestError
13 | 
14 | from tests.testutils import ENCRYPTED_FILE_METADATA
15 | 
16 | 
17 | class EncryptedMetadataValidationTestCase(unittest.TestCase):
18 |     def setUp(self) -> None:
19 |         self.metadata = copy.deepcopy(ENCRYPTED_FILE_METADATA)
20 | 
21 |     def test_validate(self) -> None:
22 |         """Tests that valid file metadata is considered as such."""
23 |         validate_encrypted_file_metadata(ENCRYPTED_FILE_METADATA)
24 | 
25 |     def test_key_ops_no_decrypt(self) -> None:
26 |         """Tests that the metadata validation fails if key_ops doesn't include `decrypt`."""
27 |         self.metadata["file"]["key"]["key_ops"] = ["encrypt"]
28 |         self._test_fails_validation()
29 | 
30 |     def test_key_ops_no_encrypt(self) -> None:
31 |         """Tests that the metadata validation fails if key_ops doesn't include `encrypt`."""
32 |         self.metadata["file"]["key"]["key_ops"] = ["decrypt"]
33 |         self._test_fails_validation()
34 | 
35 |     def test_ops_extra_values(self) -> None:
36 |         """Tests that the metadata validation does not fail if there are extra values in
37 |         key_ops.
38 |         """
39 |         self.metadata["file"]["key"]["key_ops"].append("foo")
40 |         validate_encrypted_file_metadata(self.metadata)
41 | 
42 |     def test_no_file(self) -> None:
43 |         """Tests that the metadata validation fails if there isn't a `file` property."""
44 |         self.metadata = {"foo": "bar"}
45 |         self._test_fails_validation()
46 | 
47 |     def test_no_key(self) -> None:
48 |         """Tests that the metadata validation fails if there isn't a `file.key` property."""
49 |         del self.metadata["file"]["key"]
50 |         self._test_fails_validation()
51 | 
52 |     def test_no_k(self) -> None:
53 |         """Tests that the metadata validation fails if there isn't a `file.key.k`
54 |         property.
55 | """ 56 | del self.metadata["file"]["key"]["k"] 57 | self._test_fails_validation() 58 | 59 | def test_no_ext(self) -> None: 60 | """Tests that the metadata validation fails if there isn't a `file.key.ext` 61 | property. 62 | """ 63 | del self.metadata["file"]["key"]["ext"] 64 | self._test_fails_validation() 65 | 66 | def test_bad_ext(self) -> None: 67 | """Tests that the metadata validation fails if the `file.key.ext` property has an 68 | invalid value. 69 | """ 70 | self.metadata["file"]["key"]["ext"] = False 71 | self._test_fails_validation() 72 | 73 | def test_bad_alg(self) -> None: 74 | """Tests that the metadata validation fails if the `file.key.alg` property has an 75 | invalid value. 76 | """ 77 | self.metadata["file"]["key"]["alg"] = "bad" 78 | self._test_fails_validation() 79 | 80 | def test_bad_kty(self) -> None: 81 | """Tests that the metadata validation fails if the `file.key.kty` property has an 82 | invalid value. 83 | """ 84 | self.metadata["file"]["key"]["kty"] = "bad" 85 | self._test_fails_validation() 86 | 87 | def test_no_iv(self) -> None: 88 | """Tests that the metadata validation fails if there isn't a `file.iv` property.""" 89 | del self.metadata["file"]["iv"] 90 | self._test_fails_validation() 91 | 92 | def test_no_url(self) -> None: 93 | """Tests that the metadata validation fails if there isn't a `file.url` property.""" 94 | del self.metadata["file"]["url"] 95 | self._test_fails_validation() 96 | 97 | def test_no_hashes(self) -> None: 98 | """Tests that the metadata validation fails if there isn't a `file.hashes` 99 | property. 100 | """ 101 | del self.metadata["file"]["hashes"] 102 | self._test_fails_validation() 103 | 104 | def test_no_sha256(self) -> None: 105 | """Tests that the metadata validation fails if there isn't a `file.hashes.sha256` 106 | property. 
107 | """ 108 | del self.metadata["file"]["hashes"]["sha256"] 109 | self._test_fails_validation() 110 | 111 | def _test_fails_validation(self) -> None: 112 | """Tests that the validation fails with a REST error complaining about malformed 113 | JSON. 114 | """ 115 | with self.assertRaises(ContentScannerRestError) as cm: 116 | validate_encrypted_file_metadata(self.metadata) 117 | 118 | self.assertEqual(cm.exception.http_status, 400) 119 | self.assertEqual(cm.exception.reason, ErrCode.MALFORMED_JSON) 120 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py 3 | 4 | # required for PEP 517 (pyproject.toml-style) builds 5 | isolated_build = true 6 | 7 | [testenv] 8 | allowlist_externals = poetry 9 | commands = poetry install 10 | 11 | [testenv:py] 12 | 13 | # As of twisted 16.4, trial tries to import the tests as a package (previously 14 | # it loaded the files explicitly), which means they need to be on the 15 | # pythonpath. Our sdist doesn't include the 'tests' package, so normally it 16 | # doesn't work within the tox virtualenv. 17 | # 18 | # As a workaround, we tell tox to do install with 'pip -e', which just 19 | # creates a symlink to the project directory instead of unpacking the sdist. 20 | usedevelop=true 21 | 22 | commands = poetry run python -m unittest discover tests 23 | --------------------------------------------------------------------------------