├── .dockerignore ├── .env.example ├── .gitattributes ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── commands.py ├── data ├── .gitignore └── demo │ ├── 1local │ ├── 1.txt │ ├── 2.md │ └── 3.jpg │ └── 2audio │ └── coffee.m4a ├── docs ├── commands.md ├── components │ └── youtube.md ├── custom-builds.md ├── custom-components.md ├── install.md ├── overview.md ├── testing.md ├── training-classifiers.md ├── tutorial │ ├── 1 │ │ ├── 1a.yaml │ │ ├── 1b.yaml │ │ ├── 1c.yaml │ │ └── README.md │ ├── 2 │ │ ├── 2a.yaml │ │ ├── 2b.yaml │ │ └── README.md │ └── 3 │ │ ├── 3a.yaml │ │ ├── 3b.yaml │ │ ├── 3c.yaml │ │ └── README.md └── updates │ ├── 2020.01.30.md │ ├── 2020.02.16.md │ ├── 2020.03.16.md │ └── 2020.11.22.md ├── example.blacklist.txt ├── examples ├── 4chan.yaml ├── classify.yaml ├── meta-test.yaml ├── pytorchfasterrcnn-test.yaml ├── ranking-test.yaml └── yolov5-test.yaml ├── media └── .gitignore ├── mtriage ├── requirements.txt ├── scripts ├── lint └── scaffold ├── src ├── build │ ├── core.end.Dockerfile │ ├── core.requirements.txt │ ├── core.start.Dockerfile │ ├── cpu-header.Dockerfile │ └── gpu-header.Dockerfile ├── conftest.py ├── lib │ ├── analysers │ │ ├── AnalysedFramesMeta │ │ │ ├── core.py │ │ │ └── info.yaml │ │ ├── ConvertAudio │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── partial.Dockerfile │ │ ├── ExtractAudio │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── partial.Dockerfile │ │ ├── ExtractTypes │ │ │ ├── core.py │ │ │ └── info.yaml │ │ ├── Flatten │ │ │ ├── core.py │ │ │ └── info.yaml │ │ ├── Frames │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── partial.Dockerfile │ │ ├── ImageDedup │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── requirements.txt │ │ ├── KerasPretrained │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── requirements.txt │ │ ├── ProtestsPretrained │ │ │ ├── core.py │ │ │ ├── image.jpg │ │ │ ├── info.yaml │ │ │ ├── partial.Dockerfile │ │ │ ├── requirements.txt │ │ │ ├── test.py │ │ │ └── utils.py │ │ ├── PytorchFasterRcnn │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── requirements.txt │ │ ├── Rank │ │ │ ├── core.py │ │ │ └── info.yaml │ │ ├── TorchHub │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ ├── partial.Dockerfile │ │ │ └── requirements.txt │ │ └── TwintToGephi │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── requirements.txt │ ├── common │ │ ├── __init__.py │ │ ├── analyser.py │ │ ├── etypes.py │ │ ├── exceptions.py │ │ ├── get.py │ │ ├── mtmodule.py │ │ ├── selector.py │ │ ├── storage.py │ │ └── util.py │ ├── etypes │ │ └── cvjson.py │ ├── selectors │ │ ├── FourChan │ │ │ ├── boards.py │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── requirements.txt │ │ ├── Local │ │ │ ├── core.py │ │ │ └── info.yaml │ │ ├── Twitter │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── partial.Dockerfile │ │ └── Youtube │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ ├── partial.Dockerfile │ │ │ └── requirements.txt │ └── util │ │ ├── cvjson.py │ │ └── twint.py ├── run.py ├── test │ ├── README.md │ ├── __init__.py │ ├── etype_stubs │ │ └── image.jpeg │ ├── test_analyser.py │ ├── test_analyser_errors.py │ ├── test_etypes.py │ ├── test_get.py │ ├── test_infoyamls.py │ ├── test_integration.py │ ├── test_localstorage.py │ ├── test_mtmodule.py │ ├── test_run.py │ ├── test_selector.py │ ├── test_selector_errors.py │ └── utils.py └── validate.py ├── test ├── test_build.py └── test_util.py └── util.py /.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | requirements.txt 3 | media/**/* 4 | **/*/__pycache__ 5 | 
**/*.pyc 6 | 7 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | GOOGLE_API_KEY= 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.mkv filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # generic 2 | *.swp 3 | .DS_Store 4 | .ipynb_checkpoints/ 5 | __pycache__/ 6 | .pytest_cache/ 7 | .mypy* 8 | .vscode 9 | *.pyc 10 | 11 | # vision artefacts 12 | *.weights 13 | *.conv* 14 | 15 | # build artifacts 16 | build.Dockerfile 17 | build.requirements.txt 18 | 19 | # authentication files 20 | credentials/** 21 | .env 22 | 23 | # other data 24 | tags* 25 | logfile.log 26 | 27 | blacklists/** 28 | whitelists/** 29 | config/** 30 | 31 | data/demo/3video/dancingonmyown.mov 32 | 33 | data/demo/3video/info.json 34 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to mtriage 2 | 3 | Hi there! Thank you for taking the time to contribute to improving 4 | mtriage. This document is the right place to start. Read it thoroughly! 5 | 6 | ## What do I need to know to help? 7 | ### Python 8 | The majority of mtriage is written in Python. You'll be best placed to 9 | contribute if you're comfortable working with classes, decorators, etc., but 10 | don't worry if these terms are not familiar just yet! 11 | 12 | ### Docker 13 | Mtriage uses Docker containers to abstract dependencies from needing to be 14 | installed on the local host. It's not essential, but a good operational 15 | knowledge of Docker will be helpful. 16 | 17 | ## Do I need to be an experienced Python developer? 18 | Contributing can of course be about contributing code, but it can also take 19 | many other forms. A great amount of work that remains to be done to make 20 | mtriage a usable community tool doesn't involve writing any code. The following 21 | are just a few examples of other welcome contributions: 22 | 23 | - Writing, updating or correcting documentation. 24 | - Requesting a feature 25 | - Reporting a bug 26 | 27 | If you're new to this project and looking for a good problem to get started, 28 | you might want to check out the open issues that are tagged ["good first issue"](https://github.com/forensic-architecture/mtriage/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22). 29 | 30 | These are a range of issues that have come up in conversation and for which we 31 | would welcome community contributions. They are, however, by no means 32 | exhaustive! If you see a gap or have an idea, please open up an issue to 33 | discuss it with mtriage's maintainers. 34 | 35 | ## What parts of mtriage are being actively developed? 36 | You can learn about what we are currently working on by looking at the latest 37 | update. [Updates can be found here](docs/updates). 38 | 39 | ## How do I make a contribution? 40 | 1. Make sure you have a [GitHub account](https://github.com/signup/free) 41 | 2. Fork the repository on GitHub. This is necessary so that you can push your 42 | changes, as you can't do this directly on our repo. 43 | 3. 
Get set up with a local instance of mtriage. The easiest way to do this is 44 | by [following through the tutorial](https://github.com/forensic-architecture/mtriage/blob/main/docs/tutorial/1/README.md). 45 | 4. [Join our Discord server](https://discord.gg/PjHKHJD5KX). Here you'll be able 46 | to track commits that are actively being made across our projects; but more 47 | importantly it's where you can ask questions if something's not clear or 48 | not working as you expect. The #mtriage and #support channels are the two 49 | best places to ask questions about setting mtriage up, or how it works. 50 | 51 | Once you're set up with a local copy of mtriage, you can start modifying code 52 | and making changes. 53 | 54 | When you're ready to submit a contribution, you can do it by making a pull 55 | request from a branch on your forked copy of mtriage to this repository. You 56 | can do this with the following steps: 57 | 1. Push the changes to a remote repository. If the changes you have made 58 | address a bug, you should name your branch `bug/{briefdesc}`, where `{briefdesc}` is 59 | a hyphen-separated description of your change. If instead you are 60 | contributing changes as a feature request, name it `feature/{briefdesc}`. If 61 | in doubt, prefix your branch with `feature/`. 62 | 2. Submit a pull request to the `develop` branch of 63 | `forensic-architecture/mtriage` (not `main`!). 64 | 3. Wait for the pull request to be reviewed by a maintainer. 65 | 4. Make changes to the pull request if the reviewing maintainer recommends 66 | them. 67 | 5. Celebrate your success once your pull request is merged! 68 | 69 | ### How do I validate my changes? 70 | We are still working on a full set of tests, but there are some basic ones in 71 | place that need to pass before we can merge any contributions. 72 | 73 | Tests can be run with the following command: 74 | ``` 75 | ./mtriage dev test 76 | ``` 77 | 78 | All code must be formatted according to the 79 | [black](https://github.com/ambv/black) formatter. (CI builds will fail if code 80 | is not Black-formatted.) 81 | 82 | ## New components 83 | If you are contributing a new component (i.e. an analyser or a selector), 84 | ensure that your component lists the correct dependencies. You can do so by 85 | ensuring that it works in a [standalone custom build](./docs/custom-builds.md). 86 | 87 | 88 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Do No Harm License 2 | 3 | **Preamble** 4 | 5 | Most software today is developed with little to no thought of how it will be used, or the consequences for our society and planet. 6 | 7 | As software developers, we engineer the infrastructure of the 21st century. We recognise that our infrastructure has great power to shape the world and the lives of those we share it with, and we choose to consciously take responsibility for the social and environmental impacts of what we build. 8 | 9 | We envisage a world free from injustice, inequality, and the reckless destruction of lives and our planet. We reject slavery in all its forms, whether by force, indebtedness, or by algorithms that hack human vulnerabilities. We seek a world where humankind is at peace with our neighbours, nature, and ourselves. We want our work to enrich the physical, mental and spiritual wellbeing of all society. 
10 | 11 | We build software to further this vision of a just world, or at the very least, to not put that vision further from reach. 12 | 13 | **Terms** 14 | 15 | *Copyright* (c) 2019 Forensic Architecture. All rights reserved. 16 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 17 | 18 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 19 | 20 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 21 | 22 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 23 | 24 | 4. This software must not be used by any organisation, website, product or service that: 25 | 26 | a) lobbies for, promotes, or derives a majority of income from actions that support or contribute to: 27 | * sex trafficking 28 | * human trafficking 29 | * slavery 30 | * indentured servitude 31 | * gambling 32 | * tobacco 33 | * adversely addictive behaviours 34 | * nuclear energy 35 | * warfare 36 | * weapons manufacturing 37 | * war crimes 38 | * violence (except when required to protect public safety) 39 | * burning of forests 40 | * deforestation 41 | * hate speech or discrimination based on age, gender, gender identity, race, sexuality, religion, nationality 42 | 43 | b) lobbies against, or derives a majority of income from actions that discourage or frustrate: 44 | * peace 45 | * access to the rights set out in the Universal Declaration of Human Rights and the Convention on the Rights of the Child 46 | * peaceful assembly and association (including worker associations) 47 | * a safe environment or action to curtail the use of fossil fuels or prevent climate change 48 | * democratic processes 49 | 50 | 5. All redistribution of source code or binary form, including any modifications must be under these terms. You must inform recipients that the code is governed by these conditions, and how they can obtain a copy of this license. You may not attempt to alter the conditions of who may/may not use this software. 51 | 52 | We define: 53 | 54 | **Forests** to be 0.5 or more hectares of trees that were either planted more than 50 years ago or were not planted by humans or human made equipment. 55 | 56 | **Deforestation** to be the clearing, burning or destruction of 0.5 or more hectares of forests within a 1 year period. 57 | 58 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
59 | 60 | **Attribution** 61 | 62 | Do No Harm License [Contributor Covenant][homepage], (pre 1.0), 63 | available at https://github.com/raisely/NoHarm 64 | 65 | [homepage]: https://github.com/raisely/NoHarm 66 | 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mtriage 2 | 3 | [![Build Status](https://travis-ci.com/forensic-architecture/mtriage.svg?branch=master)](https://travis-ci.com/forensic-architecture/mtriage) 4 | 5 | ##### select, download, and analyse media 6 | 7 | mtriage is a command-line application to orchestrate complex scraping and 8 | analysis workflows. mtriage is developed at [Forensic Architecture](https://forensic-architecture.org), 9 | and is intended for use by open source research agencies, journalists, and 10 | activists. To learn more about why we developed mtriage, you can read [an 11 | overview of our reasons here](docs/overview.md). 12 | 13 | ## getting started 14 | 15 | First things first: follow the instructions to install mtriage: 16 | * [Install](docs/install.md) 17 | 18 | Once installed, the best way to get started with mtriage is to work through the 19 | three tutorials: 20 | * [1. Getting started](docs/tutorial/1/README.md) 21 | * [2. Chaining analysers](docs/tutorial/2/README.md) 22 | * [3. An end-to-end workflow](docs/tutorial/3/README.md) 23 | 24 | ## latest update 25 | Updates are posted irregularly, but you can get a sense of what's going on here 26 | by reading [the latest update](docs/updates/2020.11.22.md). 27 | 28 | ## supported components 29 | 30 | Below is a list of currently supported components. If you are interested in 31 | helping us to develop additional selectors and analysers, please consider 32 | joining [the conversation on Discord](https://discord.gg/FJ4XsCg). We're 33 | accepting PRs for new components, but the internal documentation leaves 34 | a little to be desired at the moment, so it's best to communicate with us directly on 35 | the #mtriage channel. 36 | 37 | ### selectors 38 | * Youtube - search by query with optional date range (time uploaded), download video and metadata. 39 | * Twitter - search by query, download tweets and images. 40 | * Local - use media that already exists on your filesystem. 41 | 42 | ### analysers 43 | * ConvertAudio - convert audio files between formats. 44 | * ExtractAudio - extract audio from a video. 45 | * ExtractTypes - extract elements that contain media with specified extensions. 46 | * Frames - extract frames from videos as images using ffmpeg. 47 | * ImageDedup - deduplicate images that are too similar using the 48 | [imagededup](https://github.com/idealo/imagededup) module. (Good to use 49 | after using 'Frames'.) 50 | * KerasPretrained - classify objects in images using [Resnet50 trained on 51 | ImageNet](https://resources.wolframcloud.com/NeuralNetRepository/resources/ResNet-50-Trained-on-ImageNet-Competition-Data). 52 | * Rank - generate a JSON file containing the rankings for videos classified 53 | with KerasPretrained. 
54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | **/* 2 | !.gitignore 3 | !demo/ 4 | -------------------------------------------------------------------------------- /data/demo/1local/1.txt: -------------------------------------------------------------------------------- 1 | This is a simple text file. 2 | -------------------------------------------------------------------------------- /data/demo/1local/2.md: -------------------------------------------------------------------------------- 1 | # Markdown example 2 | 3 | The __tiniest__ bit less simple than a txt file. 4 | -------------------------------------------------------------------------------- /data/demo/1local/3.jpg: -------------------------------------------------------------------------------- 1 | this is not a JPG but it's fine for testing 2 | -------------------------------------------------------------------------------- /data/demo/2audio/coffee.m4a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/forensic-architecture/mtriage/7a841241518f831766767d6ddaa6320b8de4be98/data/demo/2audio/coffee.m4a -------------------------------------------------------------------------------- /docs/commands.md: -------------------------------------------------------------------------------- 1 | ## commands 2 | 3 | ### `./mtriage run path/to/file.yaml` 4 | 5 | The primary command to trigger new mtriage workflows. Each run takes a YAML 6 | file that specifies which selectors and analysers to run (i.e. `./mtriage run 7 | examples/youtube.yaml`). See [examples folder](./examples) for examples of how 8 | to specify different analyser options. 9 | 10 | You can also pass the following flags to the run command: 11 | 12 | | flag | description | 13 | |-------|-------------| 14 | | `--gpu` | Run using the mtriage GPU image. This will speed up certain analysers that depend on it | 15 | | `--tag` | Allows you to run mtriage with a custom build by passing the name of the Docker image tag you used during the custom build (see below) | 16 | | `--dry` | Primarily for testing. Will not run any command, but instead return the command that will be run. | 17 | | `--dev` | Run using local code, to see changes in development. This will also bypass internal mtriage error handling, allowing you to see the origin of errors | 18 | 19 | ### `./mtriage dev build` 20 | 21 | The command to build an mtriage Docker image from source code. You won't need 22 | this unless you are developing mtriage, as the latest images are also on [Docker 23 | Hub](https://hub.docker.com/repository/docker/forensicarchitecture/mtriage). 24 | 25 | | flag | description | 26 | |-------|-------------| 27 | | `--gpu` | Build the GPU image. Will build the CPU image otherwise | 28 | | `--tag` | Give your build a custom tag. Will default to 'dev' or 'dev-gpu' | 29 | | `--blacklist` | Give build a path to a blacklist that lists which components to exclude. See [example.blacklist.txt](./example.blacklist.txt) for format. | 30 | | `--whitelist` | Give build a path to a whitelist that lists which components to include. | 31 | | `--dry` | Primarily for testing. Will not run any command, but instead return the command that will be run. | 32 | 33 | ### `./mtriage dev test` 34 | 35 | Run all mtriage tests. 
These run in two parts for the time being: one inside 36 | Docker, and one on your local Python installation. 37 | 38 | | flag | description | 39 | |-------|-------------| 40 | | `--verbose` | Run verbose tests, showing all print statements in the console. | 41 | | `--gpu` | Test the GPU image. Will build the CPU image otherwise | 42 | | `--tag` | Test with a custom tag. Will default to 'dev' or 'dev-gpu' | 43 | | `--dry` | Primarily for testing. Will not run any command, but instead return the command that will be run. | 44 | 45 | ### `./mtriage dev clean ` 46 | 47 | Remove all mtriage Docker containers, stopped or running. 48 | 49 | ### `./mtriage dev` 50 | 51 | Open a bash shell inside mtriage's Docker container. For debugging. 52 | 53 | | flag | description | 54 | |-------|-------------| 55 | | `--gpu` | Run the GPU image. Will run the CPU image otherwise | 56 | | `--tag` | Run with a custom tag. Will default to 'dev' or 'dev-gpu' | 57 | | `--dry` | Primarily for testing. Will not run any command, but instead return the command that will be run. | 58 | | `--yaml` | Pass a path to an mtriage YAML config to saturate the shell environment with its runtime parameters. (I.e. if you run `python run.py` from inside the src folder, it will use this YAML). | 59 | 60 | 61 | -------------------------------------------------------------------------------- /docs/components/youtube.md: -------------------------------------------------------------------------------- 1 | # Configuring the Youtube selector 2 | 3 | In order to run the Youtube selector, mtriage requires a Google Cloud Platform 4 | API key. 5 | 6 | 1. Create a new project in GCP, and in the [credentials 7 | page](https://console.cloud.google.com/apis/credentials), enable the 8 | 'Youtube Data V3' API. 9 | 2. Create a new API key, ensuring that it has access to the Youtube V3 API. 10 | 3. In the '.env' file in mtriage's root folder, add the line 11 | `GOOGLE_API_KEY=xxxxx`, replacing 'xxxxx' with your downloaded API key. 12 | -------------------------------------------------------------------------------- /docs/custom-builds.md: -------------------------------------------------------------------------------- 1 | # Custom Builds 2 | 3 | The default 'dev' and 'dev-gpu' mtriage images (available 4 | [here](https://cloud.docker.com/u/forensicarchitecture/repository/docker/forensicarchitecture/mtriage)) 5 | include dependencies for all selectors and all analysers. While this is useful 6 | for playing around with mtriage locally, as everything is already installed, it 7 | is unnecessarily weighty if you are trying to deploy mtriage, or use only some 8 | components. 9 | 10 | For this reason, it is possible to create custom mtriage builds through the 11 | `mtriage dev build` command. Without any additional flags, this command will 12 | build a Docker image with all dependencies for all components installed. (This 13 | is the command that is run on successful merges to master to create the Docker 14 | Hub image). 15 | 16 | To exclude the dependencies for certain modules, you can pass a blacklist.txt 17 | file via flag to the build command: 18 | ``` 19 | ./mtriage dev build --blacklist example.blacklist.txt 20 | ``` 21 | 22 | Modules specified in the blacklist will *not* be installed in the build. For 23 | example, if you wanted a build of mtriage with only dependencies for selectors 24 | installed, you could pass a blacklist that specified all analysers. 25 | 26 | You can also pass a whitelist with the `--whitelist` flag. 
27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/custom-components.md: -------------------------------------------------------------------------------- 1 | # Custom Components 2 | 3 | Components are the main way in which mtriage is intended to be extended. 4 | A custom component can either be a selector (to index and retrieve media to 5 | kick off an mtriage workflow) or an analyser (to process media in an mtriage 6 | workflow). 7 | 8 | Components currently sit within [src/lib/selectors](/src/lib/selectors) and 9 | [src/lib/analysers](/src/lib/analysers). Each component is self-contained 10 | (along with a listing of the dependencies it requires) inside a folder there. 11 | 12 | ### Testing Components in a Standalone Build 13 | 14 | If you are contributing a new analyser or selector, you should confirm that it 15 | runs without issues in a standalone build. Mtriage uses whitelists to allow the 16 | creation of standalone builds. Work through the following steps to create 17 | a custom build with your component: 18 | 19 | 1. Create a 'whitelist.txt' in the core mtriage directory, which contains 20 | a single line with the name of your new component. For example, if your 21 | component is called 'MyCustomComponent', your whitelist would look like 22 | this: 23 | ``` 24 | MyCustomComponent 25 | ``` 26 | 2. Create the custom mtriage image with solely your component with the 27 | following command: 28 | ``` 29 | ./mtriage dev build --tag mycustomcomponent --whitelist whitelist.txt 30 | ``` 31 | 3. Test the running of your component with the following command: 32 | ``` 33 | ./mtriage run path/to/config.yml --tag mycustomcomponent --dev 34 | ``` 35 | 36 | Please note that mtriage is still in a very early stage of development, but we 37 | will keep updating this document as the code changes. 38 | 39 | Thanks again for your interest and for your future contributions! 40 | 41 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | mtriage is currently in active development, and is not yet packaged in any way. 4 | It uses [Docker](https://www.docker.com/products/docker-desktop) to manage 5 | dependencies, which you will need to download to ensure mtriage works as 6 | expected. 7 | 8 | Follow the instructions relevant to your operating system to install Docker CE 9 | or Docker Desktop (Mac installation [here](https://docs.docker.com/v17.12/docker-for-mac/install/), 10 | Ubuntu installation [here](https://docs.docker.com/v17.12/install/linux/docker-ce/ubuntu/)). 11 | If you have a CUDA GPU, you can use [Nvidia Docker](https://github.com/NVIDIA/nvidia-docker) 12 | instead of Docker to make certain analysers more performant. 13 | 14 | NOTE (05/2023): if you are on Apple Silicon, your machine will not be able to natively run FA's docker images. In order to fix this, you'll need to [enable virtualization](https://collabnix.com/warning-the-requested-images-platform-linux-amd64-does-not-match-the-detected-host-platform-linux-arm64-v8/) by changing some settings in Docker Desktop. Navigate to Settings > General and make sure the "Use Virtualization framework" box is checked. Afterwards, navigate to Settings > Features in development and check the "Use Rosetta for x86/amd64 emulation on Apple Silicon" box. If you're well-versed in Docker, you can set the 'platform' flag to 'linux/amd64' in the Dockerfile. 
If not, the easiest solution is to modify your personal ~/.bashrc or ~/.zshrc file and add ``export DOCKER_DEFAULT_PLATFORM=linux/amd64`` to it. 15 | 16 | You also need to ensure that [Python 3](https://www.python.org/downloads/) is installed on your computer. Most modern operating systems have a version installed by default. Mtriage will _probably_ work with Python 2.x as well, but it's untested. 17 | 18 | Once you have Docker and Python installed, you can clone the source code and 19 | install the requirements (the only runtime dependency is [pyyaml](https://pyyaml.org/)). 20 | 21 | ```bash 22 | git clone https://github.com/forensic-architecture/mtriage.git 23 | pip3 install -r requirements.txt 24 | ``` 25 | 26 | ### additional setup 27 | Run the test suite to ensure that everything is working. This command may take 28 | a while, as the first time you run mtriage it will download the [latest Docker 29 | image](https://hub.docker.com/r/forensicarchitecture/mtriage). Mtriage commands will run much faster after this first one: 30 | 31 | ```bash 32 | ./mtriage dev test 33 | ``` 34 | 35 | Depending on what components you intend to use, there may be additional setup 36 | required. Check the [component docs folder](/docs/components) before using an 37 | analyser or if you run into an authentication or setup issue. 38 | 39 | Assuming this command completed and all the tests passed, you are now ready to 40 | run mtriage workflows! 41 | -------------------------------------------------------------------------------- /docs/overview.md: -------------------------------------------------------------------------------- 1 | # Why Mtriage? 2 | 3 | Recent advances in deep learning make it a very powerful technique when 4 | analysing visual and audio media. The state of the art in object detection in 5 | images performs comparably to humans, and the recognition of speech and other 6 | audio signatures is also impressively effective. Due to these capabilities, 7 | deep learning has the potential to dramatically affect the scale on which human 8 | rights organisations can track and monitor weapons, trade, and other objects 9 | that signify possible human rights abuses. 10 | 11 | In practice, however, using machine learning in human rights research is 12 | difficult. The state of tooling is such that it is difficult to use for anyone 13 | who does not have a background in software development. Even if the simple aim 14 | is to run a pretrained classifier for object detection on an image, there is 15 | often a lot of installation pain and indirection in online resources. On top of 16 | this, to deploy classifiers at scale, analysing thousands of videos rather than 17 | just one image, a lot of custom plumbing is required. Human rights researchers 18 | often do not have the resources to employ an in-house software developer for 19 | this plumbing, which effectively means that human rights research rarely uses 20 | machine learning. At best, it is limited to a few organisations who have the 21 | technical resources to deploy custom software infrastructure, or who can partner 22 | with data science firms to do so. 23 | 24 | We developed mtriage to address the insufficiency in machine learning tooling 25 | for human rights research, with the hope that it can democratise the use of 26 | machine learning-- and also other more advanced computational analytic 27 | techniques. 
In the first instance, it provides both pretrained object detection 28 | classifiers and the means to use them to analyse public domain media. 29 | Mtriage is structured modularly: we intend to add new classifiers, and to 30 | support new sources and kinds of public domain media, as we develop these 31 | capabilities for ongoing and future Forensic Architecture investigations. 32 | 33 | Mtriage is open source and in active development. This means that everyone can 34 | not only use mtriage in their own research, but also that community 35 | contributions (of a new classifier, or a new media source) can potentially be 36 | made available to all other users as upstream contributions. 37 | 38 | To get started with mtriage, check out [the first tutorial](/docs/tutorial/1/README.md). 39 | -------------------------------------------------------------------------------- /docs/testing.md: -------------------------------------------------------------------------------- 1 | # Testing 2 | 3 | Mtriage has three kinds of tests: 4 | 1. Tests for the core code that runs inside Docker (in src/test). 5 | 2. Tests for the outer orchestration logic (in test/). 6 | 3. Tests for analysers and selectors (in each component folder, in test/). 7 | 8 | Each kind of test is run with appropriate containerisation given its context, 9 | i.e. tests of type 1 are run inside Docker, whereas tests of type 2 are run 10 | using the locally installed Python environment. 11 | 12 | To run all tests, use the following command: 13 | ``` 14 | ./mtriage dev test 15 | ``` 16 | See [docs/custom-components.md](./custom-components.md) for more information on 17 | how to test a new component. 18 | -------------------------------------------------------------------------------- /docs/training-classifiers.md: -------------------------------------------------------------------------------- 1 | ## Training Classifiers 2 | 3 | All the classifiers supported by our code are trained on the 1000-class ImageNet dataset by default. If you want to see the labels available, please refer to [Classify-read-me](docs/Classify-Read-Me.md). If you would like to train the classifiers on other data sets, here you can find a list of some existing options: 4 | 5 | 6 | #### Supervisely 7 | The most straightforward method for creating models is to use the platform [supervisely](https://supervise.ly/). This platform provides a way to annotate data, prepare a synthetic data set, train models, and download them without ever having to bring up an ipython notebook. 8 | 9 | (nb: supervisely is a web platform, but needs cloud configuration or CUDA hardware. TODO) 10 | (Q: should I be explaining how to use supervisely in more detail? I have been reading about it but not sure) 11 | 12 | #### Tensorflow 13 | Another way to train models is to use the process documented in [tensorflow for poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/index.html?index=..%2F..%2Findex#0). This is a great way to train a basic image classifier with various categories, which doesn't require any bounding box annotation (you label the training set by putting images in appropriate folders). 14 | 15 | #### Keras 16 | The third and perhaps most flexible way to train models is using [keras](https://keras.io/), and generalising the methodology from [this excellent 11-part series on keras and python](https://pythonprogramming.net/loading-custom-data-deep-learning-python-tensorflow-keras/). 
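To make that approach concrete, here is a minimal, hypothetical transfer-learning sketch. It is not part of mtriage; it assumes TensorFlow 2.x with Keras, a folder-per-class dataset under a placeholder path such as `data/train/<label>/*.jpg`, and an arbitrary class count.

```python
# Hypothetical fine-tuning sketch -- paths, class count and hyperparameters
# are placeholders, and none of this is part of mtriage itself.
import tensorflow as tf
from tensorflow.keras import layers, models

NUM_CLASSES = 2  # e.g. "tanks" vs "other"

# expects a folder-per-class layout, e.g. data/train/tanks/*.jpg
train_ds = tf.keras.utils.image_dataset_from_directory(
    "data/train", image_size=(224, 224), batch_size=32
)

# pretrained ImageNet backbone, used as a frozen feature extractor
base = tf.keras.applications.ResNet50(
    include_top=False, weights="imagenet", pooling="avg"
)
base.trainable = False  # start by training only the new classification head

model = models.Sequential([
    layers.Lambda(tf.keras.applications.resnet50.preprocess_input),
    base,
    layers.Dense(NUM_CLASSES, activation="softmax"),
])
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.fit(train_ds, epochs=5)
model.save("my_classifier.h5")
```

Unfreezing some of the backbone's layers and training for a few more epochs at a lower learning rate is the usual next step once the head has converged.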
17 | 18 | ### Installation 19 | In the [tensorflow](/tensorflow) directory there are scripts to set up a ready-made environment in [Docker](https://www.docker.com/) for training models using tensorflow and keras. You can also refer to the [README](docs/README.md) for a brief explanation of how to install and use Docker. 20 | 21 | ```bash 22 | cd tensorflow [Q: is this directory supposed to be one of the folders that are included in the github download? I don't see any directory called tensorflow at the moment; alternatively, are they supposed to download tensorflow manually?] 23 | sh setup.sh # downloads tensorflow models and builds Docker image locally 24 | sh run.sh # starts Docker container with appropriate volume/port mapping 25 | ``` 26 | Visit [http://localhost:8080](http://localhost:8080) and use the token displayed in the console after running the last command. Tensorboard is also available at [http://localhost:6006](http://localhost:6006) 27 | 28 | 29 | ### training data 30 | #### google images 31 | the excellent CLI `google_images_download` is part of the Pipfile. to retrieve more than 100 images at a time, you need to download [chromedriver](https://sites.google.com/a/chromium.org/chromedriver/downloads), unzip, and pass the appropriate path to the binary. the suggestion is to put the binary in `/usr/local/bin`, and then you can copy and paste the following command to download images for a search: 32 | 33 | ```bash 34 | googleimagesdownload --keywords "tanks" --limit 1000 --chromedriver /usr/local/bin/chromedriver 35 | ``` 36 | 37 | another handy tool is [findimagedupes](https://gitlab.com/opennota/findimagedupes), especially if you are creating datasets by interweaving google searches (which will inevitably have overlapping images returned). 38 | -------------------------------------------------------------------------------- /docs/tutorial/1/1a.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/1 2 | select: 3 | name: Local 4 | config: 5 | source: data/demo/1local 6 | # aggregate: true 7 | -------------------------------------------------------------------------------- /docs/tutorial/1/1b.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/1 2 | elements_in: 3 | - Local 4 | analyse: 5 | name: ExtractTypes 6 | config: 7 | exts: 8 | - txt 9 | - md 10 | -------------------------------------------------------------------------------- /docs/tutorial/1/1c.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/1 2 | select: 3 | name: Local 4 | config: 5 | source: data/demo/1local 6 | analyse: 7 | name: ExtractTypes 8 | config: 9 | exts: 10 | - txt 11 | - md 12 | -------------------------------------------------------------------------------- /docs/tutorial/1/README.md: -------------------------------------------------------------------------------- 1 | # 1a. Working with selectors 2 | 3 | Mtriage workflows are orchestrated using YAML files. These config files 4 | indicate the components used to select and/or process media. Most mtriage YAML 5 | files are very simple, and mostly consist of configuration specific to the 6 | components being run. 
For example, here is the config for the run we'll 7 | do in a second: 8 | 9 | ```yaml 10 | folder: media/demo_official/1 11 | select: 12 | name: Local 13 | config: 14 | source: data/demo/1local 15 | # aggregate: true 16 | ``` 17 | 18 | 19 | In order to analyse media with mtriage, we first need to 'select' that media 20 | from somewhere. Selectors designate and index a 'media space', and then 21 | download the relevant media in that space as local mtriage elements (elements 22 | are essentially folders). In this example we'll use the 23 | [Local](../src/lib/selectors/Local) selector, which simply selects from media 24 | already on your computer's file system. 25 | 26 | Let's try running the config: 27 | 28 | ``` 29 | ./mtriage run docs/tutorial/1/1a.yaml 30 | ``` 31 | 32 | You should see the following logs: 33 | 34 | ``` 35 | Local: index: Indexing local folder... 36 | Local: index: indexed file: 1.txt 37 | Local: index: indexed file: 3.jpg 38 | Local: index: indexed file: 2.md 39 | ``` 40 | 41 | If you look in media/demo_official/1/Local/data, you'll see the three folders, 42 | each containing one of the indexed media, as well as an 'element_map.csv'. You 43 | won't normally need to look carefully at the structure of the folders 44 | mtriage produces, but it's helpful to have a look to get an idea of how things 45 | are working under the hood. 46 | 47 | As a quick primer, mtriage works by formatting media as 'elements', which in 48 | this case are represented simply as folders on disk. (Later we'll see that we 49 | can store elements remotely, as well.) Selectors work by indexing media, and 50 | then retrieving that media and storing them as elements. This prepares media to 51 | be processed using an Analyser, which takes elements as input and produces 52 | elements as output. The 'element_map.csv' is a listing that mtriage uses 53 | internally. 54 | 55 | # 1b. Working with analysers 56 | 57 | Now that we've selected some elements, let's get to analysing them. We're going 58 | to use the very straightforward 'ExtractTypes' analyser, which simply extracts 59 | elements that have media with particular types. Here's the config: 60 | 61 | ```yaml 62 | folder: media/demo_official/1 63 | elements_in: 64 | - Local 65 | analyse: 66 | name: ExtractTypes 67 | config: 68 | exts: 69 | - txt 70 | - md 71 | ``` 72 | 73 | The first line here indicates that we are working with the elements in the 74 | folder 'media/demo_official/1'. The `elements_in` attribute indicates which 75 | elements we want to process, which we specify via __the name of the selector we 76 | used to produce them__. All workflows in mtriage are contained by a base 77 | selector in this way. If we had used multiple selectors to index and retrieve 78 | media, we could add extra line items in the `elements_in` array to indicate we 79 | want to use them as well. 80 | 81 | The `analyse` attribute indicates which analyser we want to use, and the 82 | configuration we want to use for the analyser. The 'ExtractTypes' analyser 83 | receives an array of extensions (`exts`) that represents a whitelist of the 84 | media types we want to extract. 85 | 86 | 87 | Let's run this config and take a look at the result: 88 | 89 | ``` 90 | ./mtriage run docs/tutorial/1/1b.yaml 91 | ``` 92 | 93 | We should see the following logs: 94 | 95 | ``` 96 | ExtractTypes: None: Running in parallel 97 | ExtractTypes: analyse: Extracting element 1 with paths: ['1.txt'] 98 | ExtractTypes: analyse: No extracted media in element 3. 
99 | ExtractTypes: analyse: Extracting element 2 with paths: ['2.md'] 100 | ``` 101 | 102 | As we can see, the analyser has extracted the two elements with media that have 103 | matching extensions, and skipped over element 3 (which contains '3.jpg'). The 104 | first logged line is an important aspect of mtriage's value add: it runs these 105 | operations in parallel, across as many CPUs as are available on your computer. 106 | 107 | # 1c. Putting it all together 108 | 109 | We can put both selection and analysis together in a single config, as follows: 110 | 111 | ```yaml 112 | folder: media/demo_official/1 113 | select: 114 | name: Local 115 | config: 116 | source: data/demo/1local 117 | analyse: 118 | name: ExtractTypes 119 | config: 120 | exts: 121 | - txt 122 | - md 123 | ``` 124 | 125 | And run it with: 126 | 127 | ``` 128 | ./mtriage run docs/tutorial/1/1c.yaml 129 | ``` 130 | 131 | Of course, this particular workflow isn't very useful at all, but hopefully you 132 | are already beginning to see how we can use mtriage to orchestrate much more 133 | meaningful and powerful media workflows. In the next tutorial, we'll use 134 | mtriage to reformat audio files. 135 | 136 | [Go to tutorial 2](/docs/tutorial/2/README.md) 137 | -------------------------------------------------------------------------------- /docs/tutorial/2/2a.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/2 2 | select: 3 | name: Local 4 | config: 5 | source: data/demo/2audio 6 | analyse: 7 | name: ConvertAudio 8 | config: 9 | output_ext: mp3 10 | -------------------------------------------------------------------------------- /docs/tutorial/2/2b.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/2 2 | select: 3 | name: Local 4 | config: 5 | source: data/demo/2audio 6 | analyse: 7 | - name: ConvertAudio 8 | config: 9 | # in_parallel: no 10 | output_ext: mp3 11 | - name: ConvertAudio 12 | config: 13 | # in_parallel: no 14 | output_ext: aac 15 | -------------------------------------------------------------------------------- /docs/tutorial/2/README.md: -------------------------------------------------------------------------------- 1 | # 2a. An audio workflow 2 | 3 | Now that we're familiar with selectors and analysers in principle, let's run 4 | a couple of workflows to get a sense for mtriage's flexibility. Here's a config 5 | that selects a generic audio file using Local, and then converts it to a 6 | specific extension, mp3: 7 | 8 | ```yaml 9 | folder: media/demo_official/2 10 | select: 11 | name: Local 12 | config: 13 | source: data/demo/2audio 14 | analyse: 15 | name: ConvertAudio 16 | config: 17 | output_ext: mp3 18 | ``` 19 | 20 | Let's run it: 21 | 22 | ``` 23 | ./mtriage run docs/tutorial/2/2a.yaml 24 | ``` 25 | 26 | You should see the following output: 27 | 28 | ``` 29 | Local: index: Indexing local folder... 30 | Local: index: indexed file: coffee.m4a 31 | ConvertAudio: None: Running in parallel 32 | ConvertAudio: analyse: Converted 'coffee' from .m4a to .mp3 33 | ``` 34 | 35 | Try creating a different folder in the 'data' folder with several different 36 | audio files, modifying the `source` attribute in the config to point to it, and 37 | running this updated config. We're now starting to get a sense of how mtriage 38 | is useful to scale up simple media analysis in parallel for bulk processing. 39 | 40 | # 2b. 
Chaining analysers 41 | 42 | What makes mtriage really useful for constructing workflows is the ability to 43 | chain different analysers together. The Etype system tells us something about 44 | the inputs and outputs of each analyser, and with this information we can 45 | reliably string analysers together to do successive analysis. 46 | 47 | ```yaml 48 | folder: media/demo_official/2 49 | select: 50 | name: Local 51 | config: 52 | source: data/demo/2audio 53 | analyse: 54 | - name: ConvertAudio 55 | config: 56 | output_ext: mp3 57 | - name: ConvertAudio 58 | config: 59 | output_ext: aac 60 | ``` 61 | 62 | Say we wanted to convert an audio file to two different output formats. We can 63 | do it by specifying an analysis chain with two ConvertAudio parts. Let's run 64 | this config: 65 | 66 | ``` 67 | ./mtriage run docs/tutorial/2/2b.yaml 68 | ``` 69 | 70 | We'll get the following: 71 | 72 | ``` 73 | Local: index: Indexing local folder... 74 | Local: index: indexed file: coffee.m4a 75 | ConvertAudio: None: Running in parallel 76 | ConvertAudio: analyse: Converted 'coffee' from .m4a to .mp3 77 | ConvertAudio: None: Running in parallel 78 | ConvertAudio: analyse: Converted 'coffee' from .mp3 to .aac 79 | ``` 80 | 81 | Mtriage runs this config in the order that it's specified: selecting media with 82 | the Local selector, using ConvertAudio to convert this selected media to mp3, 83 | and then converting that media (the mp3 file) to aac, using ConvertAudio with 84 | a different configuration. 85 | 86 | When mtriage runs analysers in a chain, it keeps the intermediary results by 87 | default. Therefore this config works to produce the two audio versions of the 88 | source file in which we are interested. In tutorial 3, we'll see how to 89 | conveniently visualise the results of mtriage workflows with 90 | [mtriage-viewer](https://github.com/forensic-architecture/mtriage-viewer). 91 | 92 | As we're only converting audio from one file here, it doesn't make sense to run 93 | analysis in parallel. (As soon as there are as many elements being analysed as 94 | there are CPUs available, however, it does make sense, which is why mtriage 95 | runs in parallel by default.) We can easily run analysis serially by setting 96 | `in_parallel` to false in an analyser's config: 97 | 98 | ```yaml 99 | folder: media/demo_official/2 100 | select: 101 | name: Local 102 | config: 103 | source: data/demo/2audio 104 | analyse: 105 | - name: ConvertAudio 106 | config: 107 | in_parallel: no 108 | output_ext: mp3 109 | - name: ConvertAudio 110 | config: 111 | in_parallel: no 112 | output_ext: aac 113 | ``` 114 | 115 | Try uncommenting the relevant lines with `in_parallel` in 116 | docs/tutorial/2/2b.yaml, and running the config again. You should see 117 | a different log line indicating that mtriage is running analysis serially. 118 | 119 | In the next tutorial, we'll work with the Youtube selector to analyse videos 120 | that are selected using Youtube's search API, showing the full power and 121 | extensibility of mtriage. 
122 | 123 | [Go to tutorial 3](/docs/tutorial/3/README.md) 124 | -------------------------------------------------------------------------------- /docs/tutorial/3/3a.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/3 2 | select: 3 | name: Youtube 4 | config: 5 | search_term: Tear gas 6 | uploaded_before: "2015-10-02T00:00:00Z" 7 | uploaded_after: "2015-10-01T00:00:00Z" 8 | -------------------------------------------------------------------------------- /docs/tutorial/3/3b.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/3 2 | elements_in: 3 | - Youtube 4 | analyse: 5 | - name: Frames 6 | - name: KerasPretrained 7 | config: 8 | in_parallel: false 9 | model: ResNet50 10 | labels: 11 | - tank 12 | - rifle 13 | - military uniform 14 | -------------------------------------------------------------------------------- /docs/tutorial/3/3c.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/3c 2 | select: 3 | name: Youtube 4 | config: 5 | search_term: tear gas + mexico 6 | uploaded_before: "2018-11-30T00:00:00Z" 7 | uploaded_after: "2018-11-15T00:00:00Z" 8 | analyse: 9 | - name: Frames 10 | - name: ImageDedup 11 | config: 12 | threshold: 3 13 | method: dhash 14 | - name: KerasPretrained 15 | config: 16 | model: ResNet50 17 | labels: 18 | - tank 19 | - rifle 20 | - military uniform 21 | -------------------------------------------------------------------------------- /docs/tutorial/3/README.md: -------------------------------------------------------------------------------- 1 | # 3a. Selecting media with Youtube 2 | 3 | The Youtube selector uses [Youtube's Data API](https://developers.google.com/youtube/v3) 4 | to find videos uploaded between certain dates using a search term. This API 5 | requires an API key, which is free to get. Follow the instructions in [the 6 | documentation](/docs/components/youtube.md), adding the API key in a line in 7 | the .env file at the root of your mtriage folder. 8 | 9 | With the API key in our mtriage environment, we can run the following config to 10 | select some videos from youtube: 11 | 12 | ```yaml 13 | folder: media/demo_official/3 14 | select: 15 | name: Youtube 16 | config: 17 | search_term: Tear gas 18 | uploaded_before: "2015-10-02T00:00:00Z" 19 | uploaded_after: "2015-10-01T00:00:00Z" 20 | ``` 21 | 22 | Let's run it: 23 | 24 | ``` 25 | ./mtriage run docs/tutorial/3/3a.yaml 26 | ``` 27 | 28 | The Youtube selector indexes videos by making an API call, and then downloads 29 | the videos in parallel according to however many CPU cores your computer has 30 | available. By default, it downloads the videos at very low quality, and it also 31 | retrieves a 'meta.json' file regarding the video's provenance and other meta 32 | information. 33 | 34 | # 3b. Image classification with KerasPretrained 35 | 36 | Let's now classify the frames in the videos that we've downloaded using image 37 | classifiers that have been pretrained on the labels in the 38 | [ImageNet](http://www.image-net.org/) database. We'll do so using a neural net 39 | architecture called [ResNet](https://arxiv.org/abs/1512.03385), which is 40 | a state-of-the-art architecture for image classification. 
We'll give the 41 | KerasPretrained analyser the three labels we're interested in--tank, rifle, and 42 | military uniform--to indicate that we want to predict the appearance of these 43 | objects in the videos' frames. 44 | 45 | ```yaml 46 | folder: media/demo_official/3 47 | elements_in: 48 | - Youtube 49 | analyse: 50 | - name: Frames 51 | - name: KerasPretrained 52 | config: 53 | model: ResNet50 54 | labels: 55 | - tank 56 | - rifle 57 | - military uniform 58 | ``` 59 | 60 | Note that the first time you run this config, it will download the pretrained 61 | weights for ResNet, which is a file ~100MB in size (this download only happens 62 | once): 63 | 64 | ``` 65 | ./mtriage run docs/tutorial/3/3b.yaml 66 | ``` 67 | 68 | # 3c. A complete mtriage workflow 69 | 70 | Now that we've tested the parts, let's put this all together in a single 71 | workflow, and broaden the media space slightly: 72 | 73 | ```yaml 74 | folder: media/demo_official/3c 75 | select: 76 | name: Youtube 77 | config: 78 | search_term: tear gas + mexico 79 | uploaded_before: "2018-11-30T00:00:00Z" 80 | uploaded_after: "2018-11-15T00:00:00Z" 81 | analyse: 82 | - name: Frames 83 | - name: ImageDedup 84 | config: 85 | threshold: 3 86 | method: dhash 87 | - name: KerasPretrained 88 | config: 89 | model: ResNet50 90 | labels: 91 | - tank 92 | - rifle 93 | - military uniform 94 | - name: Rank 95 | ``` 96 | 97 | 98 | In this config, we select videos uploaded between the 15th and 30th of November 99 | in 2018 that match both "tear gas" and "mexico" in Youtube's search API. Once 100 | downloaded, we split each video into frames, deduplicate similar images using 101 | [dhash](https://github.com/maccman/dhash), classify deduplicated frames using 102 | ResNet, and then create an additional JSON that ranks the classified videos 103 | according to the number of positive frames they contain (using the `Rank` analyser). 104 | 105 | That's a fair bit of computational work. Go and grab a beverage while this 106 | command runs to completion, if you like: 107 | 108 | ``` 109 | ./mtriage run docs/tutorial/3/3c.yaml 110 | ``` 111 | Once it's finished, take a look at the files that the workflow has produced in 112 | the media/demo_official/3c folder. You should see everything in a 'Youtube' 113 | folder (as you may recall, mtriage runs are organised internally by selector), 114 | and then most of the created media in a 'derived' folder inside that. 115 | 116 | You're officially finished with the mtriage tutorial. If you want to work 117 | through the media mtriage has just analysed using a frontend interface, 118 | however, as we do here at [Forensic Architecture](https://forensic-architecture.org), 119 | head over to our [mtriage-viewer](https://github.com/forensic-architecture/mtriage-viewer) 120 | and follow the instructions there! 121 | -------------------------------------------------------------------------------- /docs/updates/2020.01.30.md: -------------------------------------------------------------------------------- 1 | # Introducing Development Cycles 2 | 3 | As of 2020, I'm aiming to keep mtriage development to a regular, two-week 4 | release cycle. 5 | 6 | At the start of each cycle, I'll put some issues on the [release cycle project 7 | board](https://github.com/forensic-architecture/mtriage/projects/1), and then 8 | merge them into the [release](https://github.com/forensic-architecture/mtriage/tree/release) 9 | branch as the code is written and reviewed. 
At the end of each cycle, I'll 10 | merge the release branch into the [master](https://github.com/forensic-architecture/mtriage/tree/master) 11 | branch, and then this goes on wash-rinse-repeat every two weeks. 12 | 13 | 14 | 15 | ## Status 16 | Mtriage is currently a tool that we are using internally at [Forensic 17 | Architecture](https://forensic-architecture.org) primarily to orchestrate one 18 | particular workflow, the deployment of computer vision classifiers on public 19 | domain images and video. [This post](https://forensic-architecture.org/investigation/cv-in-triple-chaser) 20 | goes into greater detail about how that workflow works. 21 | 22 | Mtriage's main development over at least the next few months will be aimed at 23 | consolidating its use in this particular capacity. I know that there is some 24 | use of mtriage as an orchestration tool for other workflows, e.g. to create 25 | labelled datasets for machine learning, and these are definitely uses that we 26 | are interested in discussing and supporting in the future. Mtriage is intended 27 | as a tool to orchestrate computational workflows beyond just the initial 28 | application in computer vision, but we need to refine that one first before 29 | moving on to others. 30 | 31 | ## Cycle 1 32 | 33 | The first cycle will begin February 1st, and end on February 15th. It will 34 | focus on parallelising both selectors and analysers, and making the Etype 35 | system more flexible. 36 | -------------------------------------------------------------------------------- /docs/updates/2020.02.16.md: -------------------------------------------------------------------------------- 1 | # Parallelisation and carrying 2 | 3 | The past two weeks have seen two additions to mtriage: 4 | 1. Parallelisation by default of `Analyser.analyse` and `Selector.retrieve`. 5 | 2. A generic and optional 'carry' flag that can be passed via analyser config 6 | to copy files from an element's base folder to its destination. 7 | 8 | ### [#122: Parallelisation](https://github.com/forensic-architecture/mtriage/pull/122) 9 | Huge thanks to @ivansafrin for the major part of this PR. In my mind, 10 | parallelising the two major computationally intensive operations, 11 | `retrieve` for the selecting phase and `analyse` for the analysis phase, adds 12 | a real reason to adopt mtriage as a framework, rather than writing your own 13 | custom scripts. 14 | 15 | 16 | Applying `retrieve_element` from a selector, or `analyse_element` from an analyser 17 | is, because of the way mtriage is conceived, always self-contained, and 18 | therefore easy to parallelise. The idea of an element as a folder that contains 19 | a set of similarly typed media is the geist of mtriage as a framework. 20 | Selectors are functions that create elements, and analysers are functions 21 | that process them (to create new elements). 22 | 23 | When looking to apply computational logic to media at scale, packing media into 24 | elements through mtriage allows developers to focus on the important and 25 | innovative logic that is being applied, and forget about the redundant code 26 | that reads and writes files in for loops. 27 | 28 | Parallelising these operations means that now, not only does mtriage take the 29 | burden of necessary redundancy from the developer, it also does so in a way 30 | that enables code to run a lot more efficiently across multiple CPUs. This is 31 | a huge boon for us at FA. 
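To illustrate why per-element functions parallelise so cleanly, here is a simplified sketch (not mtriage's actual implementation, which lives in mtmodule.py): a self-contained element-wise function can simply be mapped over a pool of worker processes.

```python
# Simplified sketch, not mtriage's actual implementation: each element is
# processed independently, so element-wise work maps cleanly onto a pool
# of worker processes.
from multiprocessing import Pool, cpu_count


def analyse_element(element_id: str) -> str:
    # placeholder for per-element work: decode media, run a classifier, etc.
    return f"analysed {element_id}"


if __name__ == "__main__":
    element_ids = ["el-1", "el-2", "el-3", "el-4"]
    with Pool(processes=cpu_count()) as pool:
        results = pool.map(analyse_element, element_ids)
    print(results)
```

Because no element's result depends on any other element, there is no coordination to worry about beyond collecting the outputs.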
32 | 
33 | 
34 | ### [#140: Adding the 'carry' option](https://github.com/forensic-architecture/mtriage/pull/140)
35 | This is a continuation of work I had been doing, before introducing these
36 | updates, to make mtriage's type system more flexible, and so less coupled to its
37 | inaugural (computer vision) workflow.
38 | 
39 | Prior to this PR, if an analyser further down the chain in a workflow needed a
40 | file in a selector's original element, the first analysers had to encode that
41 | logic in their `analyse_element` function, and copy the files over.
42 | 
43 | Not only did this weaken analyser encapsulation, it also meant that analysers
44 | tended towards convoluted out types such as `JsonAnnotatedImageArray`.
45 | 
46 | The carry flag solves both of these problems by offloading the work and
47 | specification of copying files during analysis to mtriage config, which makes
48 | a lot more sense than baking it into analysers themselves.
49 | 
50 | ## Cycle 2
51 | 
52 | This cycle begins February 16th, and will end on February 29th. There have been
53 | a [couple](https://github.com/forensic-architecture/mtriage/pull/139) of [community contributions](https://github.com/forensic-architecture/mtriage/pull/135) that I am
54 | looking to merge. Otherwise, this cycle will focus on improving developer
55 | experience in general, and on writing templates and documentation for creating
56 | new components (analysers and selectors) in particular, as well as fixing some
57 | critical bugs in the Youtube selector.
58 | 
--------------------------------------------------------------------------------
/docs/updates/2020.03.16.md:
--------------------------------------------------------------------------------
1 | # Abstract storage, revamped component API, more robust etypes
2 | 
3 | No update was posted at the end of the last cycle, and so this update covers the last four
4 | weeks of development. Essentially what happened is that I started implementing
5 | a significant feature, abstract storage, and it ended up in a PR that touches
6 | most internals, cleaning up the component API (how analysers and selectors are
7 | written), and changing a significant number of tests. I'll cover all the
8 | changes as comprehensively as I can in the rest of this update.
9 | 
10 | ## Abstract storage
11 | Before this cycle, storage in mtriage was hardcoded to use the filesystem of
12 | the computer on which mtriage was running (written to a Linux filesystem inside
13 | a Docker container, and to the disk on the host system via volumes configured
14 | in the outer layer of mtriage). As we are looking to move to cloud deployments
15 | of mtriage, abstract storage--the option to store media produced by
16 | mtriage either locally, or elsewhere--is crucial.
17 | 
18 | Calls to read and write files were scattered throughout the implementations for
19 | [mtmodule](src/lib/common/mtmodule.py), [analyser](src/lib/common/analyser.py),
20 | and [selector](src/lib/common/selector.py). Moreover, the intermediate data
21 | structures that each of these modules used to represent available media on disk
22 | at various stages of the mtriage lifecycle were bloated and unsystematic.
23 | 
24 | In order to decouple storage from the local disk, I introduced a [Storage
25 | API](src/lib/common/storage.py), which analysers and selectors interface with
26 | to read and write elements persistently.
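
The interface itself isn't reproduced in this post; the following is only a rough sketch of the abstraction, with a hypothetical method name (`write_element`) -- the one detail taken from the actual codebase is the `delete_local_on_write` flag, which analysers toggle via `self.disk`.

```python
# Rough sketch of the abstraction -- not the real interface in
# src/lib/common/storage.py; method names here are hypothetical.
from abc import ABC, abstractmethod
from pathlib import Path
from shutil import copy


class Storage(ABC):
    # Analysers set this (as `self.disk.delete_local_on_write = True`) so the
    # runtime cleans up transitory local files once they have been persisted.
    delete_local_on_write = False

    @abstractmethod
    def write_element(self, element) -> None:
        """Persist an element's media, wherever this backend keeps things."""


class LocalStorage(Storage):
    def __init__(self, base_dir: str):
        self.base_dir = Path(base_dir)

    def write_element(self, element) -> None:
        # The local backend simply copies the element's media into place.
        dest = self.base_dir / element.id
        dest.mkdir(parents=True, exist_ok=True)
        for p in element.paths:
            copy(p, dest / Path(p).name)
```

A cloud backend would implement the same `write_element` against remote storage, which is what makes the destination a configuration detail rather than analyser code.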
While thinking through how components
27 | ought to return elements to the mtriage runtime in order for persistent storage
28 | to occur remotely rather than locally, I realised that, regardless of where
29 | elements are ultimately stored, it makes sense for them to be readable and
30 | writable locally, to provide full flexibility from the component perspective.
31 | In other words, so that analysers and selectors can still deal with
32 | elements _as if_ they were simply written on the local filesystem, analysers and
33 | selectors still write media to disk on the computer where mtriage is running,
34 | and return elements back to the mtriage runtime **by reference**, indicating
35 | the paths where the media that comprise elements reside. The mtriage runtime then,
36 | by way of the Storage API, persists those elements in the designated storage,
37 | deleting the transitory local representations that were returned by
38 | a component.
39 | 
40 | Modifying internals to make storage work in this way made it clear that the
41 | component API (e.g. `Analyser.analyse_element` and
42 | `Selector.retrieve_element`) could be a lot more particular regarding inputs
43 | and outputs. I implemented this by solidifying the way that Etypes work, so
44 | that those functions can uniformly receive and return etypes to/from the
45 | mtriage runtime.
46 | 
47 | ## Revamped component API
48 | Here's what the ConvertAudio analyser looks like after the rewrite:
49 | 
50 | ```python
51 | from lib.common.analyser import Analyser
52 | from lib.common.exceptions import ElementShouldSkipError
53 | from lib.common.etypes import Etype
54 | from subprocess import call, STDOUT
55 | from pathlib import Path
56 | import os
57 | 
58 | 
59 | class ConvertAudio(Analyser):
60 |     def analyse_element(self, element: Etype.Audio, config) -> Etype.Audio:
61 |         output_ext = config["output_ext"]
62 | 
63 |         FNULL = open(os.devnull, "w")
64 |         output = f"/tmp/{element.id}.{output_ext}"
65 |         # TODO: error handling
66 |         out = call(
67 |             ["ffmpeg", "-y", "-i", element.paths[0], output],
68 |             stdout=FNULL,
69 |             stderr=STDOUT,
70 |         )
71 |         self.logger(
72 |             f"Converted '{element.id}' from {element.paths[0].suffix} to .{output_ext}"
73 |         )
74 |         return Etype.Audio(element.id, paths=[output])
75 | 
76 | 
77 | module = ConvertAudio
78 | ```
79 | 
80 | Notably, the confusing functions `get_in_etype` and `get_out_etype` no longer
81 | exist: those specifications are now covered using Python 3 type annotations.
82 | It's clear from the signature that the ConvertAudio analyser takes an element
83 | of `Etype.Audio` as input, and produces an element of `Etype.Audio` as output.
84 | 
85 | Most importantly, the `analyse_element` function _returns a value that
86 | represents the element it has produced_. Previously, element creation was done
87 | implicitly through the creation of files, and `analyse_element` didn't return
88 | anything explicitly. Now, it returns **an instance of an Etype**, which has
89 | a standardised constructor that takes an element id (`str`) as its first
90 | argument, and a path or list of paths as its second (optionally named).
91 | 
92 | Note that the file structure for analysers has changed: there is no scaffolding
93 | required via an '__init__.py'. Instead the module is registered simply through
94 | the Javascript/Node-style export of assigning the `module` variable in the last
95 | line.
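
For illustration, and in the spirit of what the `scripts/scaffold` helper generates, the bare minimum is something like the following sketch (the analyser name `PassThrough` is a placeholder, not a component that exists in the repo):

```python
from lib.common.analyser import Analyser
from lib.common.etypes import Etype


class PassThrough(Analyser):
    def analyse_element(self, element: Etype.Any, config) -> Etype.Any:
        # The simplest possible analyser: hand the element straight back.
        return element


module = PassThrough
```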
All a valid analyser (or selector) needs is a single 'core.py' file that
96 | defines a `module` variable that contains a class which inherits from
97 | `Analyser`.
98 | 
99 | Note also how the analyser still creates files locally. Instead of representing
100 | groups of media as folders, however, analysers do so simply by passing
101 | references/paths to the relevant files in an Etype's constructor.
102 | 
103 | ## More robust etypes
104 | Etypes are the operational heart of mtriage, and there's a fair bit of
105 | implementation magic that goes on inside them. The basic idea is that each
106 | Etype class offers a constructor that takes a string id and a path or set of
107 | paths:
108 | 
109 | ```python
110 | a_json_element = Etype.Json("a_json_element", "path/to/myfile.json")
111 | ```
112 | 
113 | The Etype constructor checks all of the paths to ensure that they are valid,
114 | optionally filters out certain paths, and throws an `EtypeCastError` if
115 | something is wrong.
116 | 
117 | There are two higher order etypes, `Union` and `Array`, which allow expressive
118 | composition of etypes. For example, you can create an element that contains one
119 | image and one json file using Union:
120 | 
121 | ```python
122 | a_composite_element = Etype.Union(Etype.Json, Etype.Image)("a_composite_element", ["path/to/file.json", "path/to/file.png"])
123 | ```
124 | 
125 | I've also added a more modular way to add new etypes, similar to the way that
126 | analysers and selectors work. Any .py file in [lib/etypes](src/lib/etypes) will
127 | be treated as a custom etype, and will be made available through the `Etype`
128 | namespace in the way detailed above. Custom etypes need only inherit from the
129 | `Et` class (from lib.common.etypes), and define a `filter` function that is run
130 | on construction to filter out certain paths.
131 | 
132 | This means that custom code which deals with specific etypes (i.e., media of
133 | particular structures produced during mtriage workflows) can be better
134 | encapsulated as static methods on the custom etypes, as is done in
135 | [cvjson.py](src/lib/etypes/cvjson.py).
136 | 
137 | ## Cycle 3
138 | We're fast-tracking towards cloud deployments, but will also try to merge some
139 | of the outstanding PRs that have been on hold due to this rewrite.
140 | 
--------------------------------------------------------------------------------
/docs/updates/2020.11.22.md:
--------------------------------------------------------------------------------
1 | # Back into it
2 | Mtriage development has been stalled for some time, as I've had other
3 | priorities. I'm hoping to pick up the pace in the last couple of months of
4 | 2020, however. Here are a couple of things on the near horizon:
5 | 
6 | ### Component-wise testing
7 | This is pretty key to ensuring that community component contributions are
8 | meaningful. Currently there are only tests for the core part of mtriage, and it
9 | is just an article of faith that each component (selector/analyser) works.
10 | 
11 | ### Custom classifier documentation and compatibility
12 | None of our code for bootstrapping custom classifiers with mtriage is public,
13 | which means that it's pretty much useless to everyone in its current state,
14 | except as a reference.
We're looking to drive some more research around 15 | training vision classifiers using synthetic data with mtriage, and we'll fold 16 | out all of these fixes into upstream mtriage as ways to apply custom 17 | classifiers in the abstract via mtriage as a deployment framework. 18 | 19 | ### Spec-ing a rewrite in Rust/Firecracker 20 | This is somewhat irresponsible on my part, as this here Python/Docker version 21 | of the framework barely works. But I'm looking for a way to get my fingers 22 | properly sticky with Rust in practice, and mtriage seems a good candidate 23 | (systems software, containers, etc). 24 | -------------------------------------------------------------------------------- /example.blacklist.txt: -------------------------------------------------------------------------------- 1 | # lines that begin with a # will be ignored. 2 | 3 | ### ANALYSERS 4 | ConvertAudio 5 | ExtractAudio 6 | Frames 7 | ImageDedup 8 | KerasPretrained 9 | Rank 10 | 11 | ### SELECTORS 12 | Local 13 | Twitter 14 | Youtube 15 | -------------------------------------------------------------------------------- /examples/4chan.yaml: -------------------------------------------------------------------------------- 1 | folder: media/fcs 2 | select: 3 | name: FourChan 4 | config: 5 | board: "g" -------------------------------------------------------------------------------- /examples/classify.yaml: -------------------------------------------------------------------------------- 1 | folder: media/example 2 | elements_in: 3 | - Youtube/Frames 4 | analyse: 5 | - name: KerasPretrained 6 | config: 7 | model: ResNet50 8 | labels: 9 | - tank 10 | - rifle 11 | - military uniform 12 | -------------------------------------------------------------------------------- /examples/meta-test.yaml: -------------------------------------------------------------------------------- 1 | folder: media/example 2 | elements_in: 3 | - Youtube/KerasPretrained 4 | analyse: 5 | name: AnalysedFramesMeta 6 | config: 7 | dev: true 8 | -------------------------------------------------------------------------------- /examples/pytorchfasterrcnn-test.yaml: -------------------------------------------------------------------------------- 1 | folder: media/pytorchfasterrcnn 2 | select: 3 | name: Local 4 | config: 5 | source: data/images 6 | aggregate: true 7 | analyse: 8 | name: PytorchFasterRcnn 9 | config: 10 | dev: true 11 | model: data/sean1.pth 12 | class_map: 13 | - background 14 | - canister 15 | - cylinder 16 | - can 17 | - bottle 18 | - bin 19 | 20 | 21 | -------------------------------------------------------------------------------- /examples/ranking-test.yaml: -------------------------------------------------------------------------------- 1 | folder: media/example 2 | elements_in: 3 | - Youtube 4 | analyse: 5 | name: Flatten 6 | config: 7 | dev: true 8 | -------------------------------------------------------------------------------- /examples/yolov5-test.yaml: -------------------------------------------------------------------------------- 1 | folder: media/yolov5 2 | select: 3 | name: Local 4 | config: 5 | source: data/images 6 | aggregate: true 7 | analyse: 8 | name: TorchHub 9 | config: 10 | dev: true 11 | repo: ultralytics/yolov5 12 | args: 13 | - yolov5s 14 | kwargs: 15 | pretrained: true 16 | -------------------------------------------------------------------------------- /media/.gitignore: -------------------------------------------------------------------------------- 1 | **/* 2 | !.gitignore 3 | 
-------------------------------------------------------------------------------- /mtriage: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | from commands import parse_args, build, develop, clean, run_tests, run, export 5 | 6 | DEV_COMMANDS = {"develop": develop, "build": build, "test": run_tests, "clean": clean} 7 | 8 | if __name__ == "__main__": 9 | ARGS = parse_args(sys.argv[1:]) 10 | 11 | if ARGS.base == "dev": 12 | DEV_COMMANDS[ARGS.command](ARGS) 13 | elif ARGS.base == "export": 14 | export(ARGS) 15 | else: 16 | run(ARGS) 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | pytest==4.5.0 3 | black 4 | -------------------------------------------------------------------------------- /scripts/lint: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## run from top level, i.e. `bash scripts/lint.sh` 4 | python3 -m black src/ 5 | python3 -m black test/ 6 | python3 -m black commands.py 7 | python3 -m black util.py 8 | python3 -m black mtriage 9 | -------------------------------------------------------------------------------- /scripts/scaffold: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | from pathlib import Path 5 | 6 | BASE_PATH = Path(os.path.dirname(os.path.abspath(__file__))) / "../src/lib/analysers" 7 | 8 | def is_camel_case(s): 9 | return s != s.lower() and s != s.upper() and "_" not in s 10 | 11 | def core_base(name): return f'''from lib.common.analyser import Analyser 12 | from lib.common.etypes import Etype 13 | 14 | class {name}(Analyser): 15 | in_etype = Etype.Any 16 | out_etype = Etype.Any 17 | 18 | def analyse_element(self, element, config): 19 | return element 20 | 21 | module = {name}''' 22 | 23 | def infoyaml_base(desc): return f'''desc: {desc} 24 | args: 25 | - name: myarg 26 | desc: Optional description 27 | required: false 28 | input: string 29 | ''' 30 | 31 | name = input("New analyser name: ") 32 | desc = input("Basic description for new analyser: ") 33 | 34 | if not is_camel_case(name): 35 | print("An analyser must be CamelCase") 36 | sys.exit() 37 | 38 | base = BASE_PATH / name 39 | if os.path.exists(base) and os.path.isdir(base): 40 | print(f'An analyser named "{name}" already exists.') 41 | sys.exit() 42 | 43 | os.mkdir(BASE_PATH / name) 44 | 45 | with open(base/"core.py", "w+") as f: 46 | f.write(core_base(name)) 47 | 48 | with open(base/"info.yaml", "w+") as f: 49 | f.write(infoyaml_base(desc)) 50 | 51 | 52 | -------------------------------------------------------------------------------- /src/build/core.end.Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | # ********************* 3 | # ... continuing after partials 4 | # ********************* 5 | 6 | # install pip packages 7 | # NOTE: build.requirements.txt is hardcoded here. 
8 | ARG requirements_file=build.requirements.txt 9 | COPY $requirements_file /requirements.txt 10 | RUN pip3 install --upgrade pip && \ 11 | pip3 install -r /requirements.txt 12 | 13 | CMD ["python3", "/mtriage/src/run.py"] 14 | -------------------------------------------------------------------------------- /src/build/core.requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pyyaml 3 | -------------------------------------------------------------------------------- /src/build/core.start.Dockerfile: -------------------------------------------------------------------------------- 1 | MAINTAINER Lachlan Kermode 2 | ENV LANG C.UTF-8 3 | 4 | RUN apt-get update && \ 5 | # ================================================================== 6 | # tools 7 | # ------------------------------------------------------------------ 8 | DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \ 9 | # core 10 | build-essential \ 11 | apt-utils \ 12 | ca-certificates \ 13 | wget \ 14 | # python 15 | software-properties-common \ 16 | python3.7 \ 17 | python3.7-dev \ 18 | python3-distutils-extra \ 19 | git \ 20 | # dev 21 | # git vim curl unzip unrar \ 22 | && \ 23 | wget -O ~/get-pip.py \ 24 | https://bootstrap.pypa.io/get-pip.py && \ 25 | python3.7 ~/get-pip.py && \ 26 | ln -s /usr/bin/python3.7 /usr/local/bin/python3 && \ 27 | ln -s /usr/bin/python3.7 /usr/local/bin/python && \ 28 | python -m pip --no-cache-dir install --upgrade setuptools && \ 29 | ldconfig && \ 30 | apt-get clean && \ 31 | apt-get autoremove && \ 32 | rm -rf /var/lib/apt/lists/* /tmp/* ~/* 33 | 34 | RUN apt-get update --fix-missing 35 | 36 | # Copy necessary folders 37 | RUN mkdir -p /mtriage 38 | COPY ./scripts /mtriage/scripts 39 | COPY ./src /mtriage/src 40 | WORKDIR /mtriage 41 | 42 | # ********************* 43 | # starting partials... 44 | # ********************* 45 | 46 | -------------------------------------------------------------------------------- /src/build/cpu-header.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | -------------------------------------------------------------------------------- /src/build/gpu-header.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 2 | -------------------------------------------------------------------------------- /src/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup global fixtures for modular-wise tests. 
3 | """ 4 | import pytest 5 | 6 | # import requests 7 | import os 8 | import test.utils as test_utils 9 | 10 | 11 | @pytest.fixture(scope="session", autouse=True) 12 | def test_element_dir(): 13 | return "../media/test" 14 | 15 | 16 | # TODO(lachlan): create a special fixture to allow component-wise tests to analyse sub elements 17 | # EG_VIDEO = "https://datasheet-sources.ams3.digitaloceanspaces.com/ilovaisk_videos/platform_background.mp4" 18 | # EG_IMAGE = "https://datasheet-sources.ams3.digitaloceanspaces.com/ilovaisk_videos/Platform_Tutorial_thumb.png" 19 | # 20 | # @pytest.fixture(scope="session", autouse=True) 21 | # def analyse_stub_element() 22 | # if not os.path.exists("/test"): 23 | # os.makedirs("/test") 24 | # if not os.path.exists("/test/video.mp4"): 25 | # r = requests.get(EG_VIDEO) 26 | # open("/test/video.mp4", "wb").write(r.content) 27 | # if not os.path.exists("/test/image.png"): 28 | # r = requests.get(EG_IMAGE) 29 | # open("/test/image.png", "wb").write(r.content) 30 | # 31 | # return "some val" 32 | 33 | 34 | @pytest.fixture(scope="session", autouse=True) 35 | def utils(): 36 | return test_utils 37 | -------------------------------------------------------------------------------- /src/lib/analysers/AnalysedFramesMeta/core.py: -------------------------------------------------------------------------------- 1 | from lib.common.analyser import Analyser 2 | from lib.common.etypes import Etype 3 | from lib.util.cvjson import generate_meta 4 | from lib.etypes.cvjson import CvJson 5 | 6 | 7 | class AnalysedFramesMeta(Analyser): 8 | out_etype = Etype.CvJson 9 | 10 | def analyse_element(self, element, _): 11 | return element 12 | 13 | def post_analyse(self, elements) -> Etype.Json.as_array(): 14 | return generate_meta(elements, logger=self.logger) 15 | 16 | 17 | module = AnalysedFramesMeta 18 | -------------------------------------------------------------------------------- /src/lib/analysers/AnalysedFramesMeta/info.yaml: -------------------------------------------------------------------------------- 1 | desc: TODO 2 | args: [] 3 | 4 | -------------------------------------------------------------------------------- /src/lib/analysers/ConvertAudio/core.py: -------------------------------------------------------------------------------- 1 | from lib.common.analyser import Analyser 2 | from lib.common.exceptions import ElementShouldSkipError 3 | from lib.common.etypes import Etype 4 | from subprocess import call, STDOUT 5 | from pathlib import Path 6 | import os 7 | 8 | 9 | class ConvertAudio(Analyser): 10 | in_etype = Etype.Audio 11 | out_etype = Etype.Audio 12 | 13 | def analyse_element(self, element, config): 14 | output_ext = config["output_ext"] 15 | 16 | FNULL = open(os.devnull, "w") 17 | output = f"/tmp/{element.id}.{output_ext}" 18 | # TODO: error handling 19 | out = call( 20 | ["ffmpeg", "-y", "-i", element.paths[0], output], 21 | stdout=FNULL, 22 | stderr=STDOUT, 23 | ) 24 | self.logger( 25 | f"Converted '{element.id}' from {element.paths[0].suffix} to .{output_ext}" 26 | ) 27 | return Etype.Audio(element.id, paths=[output]) 28 | 29 | 30 | module = ConvertAudio 31 | -------------------------------------------------------------------------------- /src/lib/analysers/ConvertAudio/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Convert the media in an Audio element from one file extension to another. 
2 | args: 3 | - name: output_ext 4 | desc: The file extension of the output media, to which the input files will be converted. 5 | required: true 6 | input: string 7 | -------------------------------------------------------------------------------- /src/lib/analysers/ConvertAudio/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN apt-get install -y \ 2 | ffmpeg 3 | -------------------------------------------------------------------------------- /src/lib/analysers/ExtractAudio/core.py: -------------------------------------------------------------------------------- 1 | from lib.common.analyser import Analyser 2 | from lib.common.exceptions import ElementShouldSkipError 3 | from lib.common.etypes import Etype 4 | from subprocess import call, STDOUT 5 | import os 6 | 7 | 8 | class ExtractAudio(Analyser): 9 | in_etype = Etype.Video 10 | out_etype = Etype.Audio 11 | 12 | def analyse_element(self, element, config): 13 | output_ext = config["output_ext"] 14 | output = f"/tmp/{element.id}.{output_ext}" 15 | FNULL = open(os.devnull, "w") 16 | # TODO: add error handling 17 | out = call( 18 | ["ffmpeg", "-y", "-i", element.paths[0], output], 19 | stdout=FNULL, 20 | stderr=STDOUT, 21 | ) 22 | 23 | element.paths[0] = output 24 | 25 | return element 26 | 27 | 28 | module = ExtractAudio 29 | -------------------------------------------------------------------------------- /src/lib/analysers/ExtractAudio/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Extract the audio from a video file. 2 | args: 3 | - name: output_ext 4 | desc: The file extension of the output audio, e.g. 'mp4' or 'aac'. 5 | required: true 6 | input: string 7 | -------------------------------------------------------------------------------- /src/lib/analysers/ExtractAudio/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN apt-get install -y \ 2 | ffmpeg 3 | -------------------------------------------------------------------------------- /src/lib/analysers/ExtractTypes/core.py: -------------------------------------------------------------------------------- 1 | from shutil import copyfile 2 | from pathlib import Path 3 | from lib.common.analyser import Analyser 4 | from lib.common.etypes import Etype 5 | 6 | 7 | class ExtractTypes(Analyser): 8 | in_etype = Etype.Any 9 | out_etype = Etype.Any 10 | 11 | def analyse_element(self, element, config): 12 | exts = config["exts"] if "exts" in config else [] 13 | element.paths = [ 14 | x for x in element.paths if x.suffix in exts or x.suffix[1:] in exts 15 | ] 16 | if len(element.paths) == 0: 17 | self.logger(f"No extracted media in element {element.id}.") 18 | return None 19 | self.logger( 20 | f"Extracting element {element.id} with paths: {[x.name for x in element.paths]}" 21 | ) 22 | return element 23 | 24 | 25 | module = ExtractTypes 26 | -------------------------------------------------------------------------------- /src/lib/analysers/ExtractTypes/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Extract a set of file types from the element folder. 2 | args: 3 | - name: exts 4 | desc: A list of the extensions to extract, in rglob format (e.g. '*.jpg'). 
5 | required: true 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/lib/analysers/Flatten/core.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from lib.common.analyser import Analyser 3 | from lib.common.etypes import Etype 4 | from lib.util.cvjson import flatten 5 | 6 | 7 | class Flatten(Analyser): 8 | """NOTE: This class is kept for backwards compatibility, but should not be 9 | used in new implementations. Instaed, simply use the imported `rank` 10 | function directly in the relevant analyser's `post_analyse` method. 11 | """ 12 | 13 | out_etype = Etype.Json 14 | 15 | def analyse_element(self, element: Etype.CvJson, _) -> Etype.Json: 16 | return element 17 | 18 | def post_analyse(self, elements) -> Etype.Json: 19 | return flatten(elements, logger=self.logger) 20 | 21 | 22 | module = Flatten 23 | -------------------------------------------------------------------------------- /src/lib/analysers/Flatten/info.yaml: -------------------------------------------------------------------------------- 1 | desc: TODO 2 | args: [] 3 | 4 | -------------------------------------------------------------------------------- /src/lib/analysers/Frames/core.py: -------------------------------------------------------------------------------- 1 | import os 2 | from shutil import copyfile, rmtree 3 | from subprocess import call, STDOUT 4 | from pathlib import Path 5 | from lib.common.analyser import Analyser 6 | from lib.common.etypes import Etype, Union 7 | from lib.common.util import files 8 | 9 | VID_SUFFIXES = [".mp4", ".mov"] 10 | # GLOSSED_FRAMES = Union(Etype.Image.array(), Etype.Json) 11 | GLOSSED_FRAMES = Etype.Any # hack for the time being 12 | 13 | 14 | def ffmpeg_frames(out_folder, fp, rate): 15 | # TODO: better logs for FFMPEG process 16 | FNULL = open(os.devnull, "w") 17 | out = call( 18 | ["ffmpeg", "-i", fp, "-r", str(rate), f"{out_folder}/%04d.bmp"], 19 | stdout=FNULL, 20 | stderr=STDOUT, 21 | ) 22 | 23 | 24 | class Frames(Analyser): 25 | in_etype = Union(Etype.Json, Etype.Video) 26 | out_etype = GLOSSED_FRAMES 27 | 28 | def analyse_element(self, element, config): 29 | fps = int(config["fps"]) if "fps" in config else 1 30 | jsons = [x for x in element.paths if x.suffix in ".json"] 31 | dest = Path("/tmp") / element.id 32 | if dest.exists(): 33 | rmtree(dest) 34 | dest.mkdir() 35 | 36 | if len(jsons) is 1: 37 | json = jsons[0] 38 | copyfile(json, dest / "meta.json") 39 | 40 | video = [x for x in element.paths if x.suffix in VID_SUFFIXES][0] 41 | ffmpeg_frames(dest, video, fps) 42 | 43 | self.logger(f"Frames successfully created for element {element.id}.") 44 | self.disk.delete_local_on_write = True 45 | return GLOSSED_FRAMES(element.id, paths=files(dest)) 46 | 47 | 48 | module = Frames 49 | -------------------------------------------------------------------------------- /src/lib/analysers/Frames/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Extract a subset of representative frames from a video. A number of frames per second are extracted. 2 | args: 3 | - name: fps 4 | desc: Frames per second. Defaults to 1. 
5 | required: false 6 | input: int 7 | -------------------------------------------------------------------------------- /src/lib/analysers/Frames/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN apt-get install -y \ 2 | ffmpeg 3 | -------------------------------------------------------------------------------- /src/lib/analysers/ImageDedup/core.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from shutil import copyfile 4 | from imagededup import methods 5 | from lib.common.exceptions import InvalidAnalyserConfigError 6 | from lib.common.analyser import Analyser 7 | from lib.common.etypes import Etype 8 | 9 | 10 | class ImageDedup(Analyser): 11 | in_etype = Etype.Image.array() 12 | out_etype = Etype.Image.array() 13 | 14 | def __create_hasher(self, config): 15 | hasher_key = config["method"] if "method" in config else "phash" 16 | self.logger(f"Compare method is {hasher_key}") 17 | hasher = { 18 | "phash": methods.PHash, 19 | "ahash": methods.AHash, 20 | "dhash": methods.DHash, 21 | "whash": methods.WHash, 22 | }.get(hasher_key) 23 | if hasher is None: 24 | raise InvalidAnalyserConfigError( 25 | f"'{hasher_key}' is not a valid method for imagededup." 26 | ) 27 | 28 | self.hasher = hasher() 29 | 30 | # super low threshold by default to only remove essentially identical images. 31 | if "threshold" in config: 32 | self.threshold = int(config["threshold"]) 33 | else: 34 | self.threshold = 3 35 | 36 | self.logger(f"Hamming threshold is {self.threshold}") 37 | 38 | def pre_analyse(self, config): 39 | self.__create_hasher(config) 40 | 41 | def is_dry(self): 42 | return "dry" in self.config and self.config["dry"] 43 | 44 | def analyse_element(self, element, config): 45 | # NOTE: only works if all images are in same file, should probably copy for robustness. 46 | basedir = element.paths[0].parent 47 | encodings = self.hasher.encode_images(image_dir=basedir) 48 | 49 | args = {"image_dir": basedir, "max_distance_threshold": self.threshold} 50 | 51 | duplicates = self.hasher.find_duplicates_to_remove(**args) 52 | 53 | self.logger(f"{len(duplicates)} duplicates found.") 54 | 55 | self.logger("IMAGES TO REMOVE") 56 | self.logger("------------------") 57 | for dup in duplicates: 58 | self.logger(dup) 59 | self.logger("------------------") 60 | if self.is_dry(): 61 | return None 62 | 63 | self.logger(f"{element.id} images deduplicated.") 64 | 65 | deduplicated_paths = [p for p in element.paths if p.name not in duplicates] 66 | 67 | return Etype.Image.array()(element.id, paths=deduplicated_paths) 68 | 69 | 70 | module = ImageDedup 71 | -------------------------------------------------------------------------------- /src/lib/analysers/ImageDedup/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Deduplicate images in a collection using https://github.com/idealo/imagededup. 2 | args: 3 | - name: threshold 4 | desc: The max Hamming distance threshold between two images below which retrieved duplicates are valid. See https://idealo.github.io/imagededup/methods/hashing/ for more information. 5 | required: false 6 | input: int 7 | - name: method 8 | desc: The method to use for hashing/comparison. Should be one of- phash, ahash, dhash, whash. See https://idealo.github.io/imagededup/methods/hashing/ for more information. 
9 | required: false 10 | input: string 11 | - name: dry 12 | desc: If set to true, the analyser will return a txt file that names all the images that are duplicates, rather than actually removing them 13 | required: false 14 | input: bool 15 | 16 | 17 | -------------------------------------------------------------------------------- /src/lib/analysers/ImageDedup/requirements.txt: -------------------------------------------------------------------------------- 1 | imagededup 2 | -------------------------------------------------------------------------------- /src/lib/analysers/KerasPretrained/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import os 4 | from importlib import import_module 5 | from lib.common.exceptions import InvalidAnalyserConfigError 6 | from lib.common.analyser import Analyser 7 | from lib.common.etypes import Etype, Union, Array 8 | from lib.util.cvjson import generate_meta 9 | from lib.etypes.cvjson import CvJson 10 | 11 | KERAS_HOME = "/mtriage/data/.keras" 12 | os.environ["KERAS_HOME"] = KERAS_HOME 13 | 14 | import tensorflow as tf 15 | from tensorflow.keras.preprocessing import image 16 | 17 | SUPPORTED_MODELS = { 18 | "ResNet50": {"module": "resnet50"}, 19 | "VGG16": {"module": "vgg16"}, 20 | "VGG19": {"module": "vgg19"}, 21 | } 22 | 23 | 24 | class KerasPretrained(Analyser): 25 | in_etype = Union(Array(Etype.Image), Etype.Json) 26 | out_etype = CvJson 27 | """ Override to always run serially. Otherwise it hangs, presumably due to 28 | the parallelisation that tensorflow does under the hood. """ 29 | 30 | @property 31 | def in_parallel(self): 32 | return False 33 | 34 | def pre_analyse(self, config): 35 | self.logger(config["model"]) 36 | self.logger(f"Storing models in {KERAS_HOME}") 37 | MOD = SUPPORTED_MODELS.get(config["model"]) 38 | if MOD is None: 39 | raise InvalidAnalyserConfigError( 40 | f"The module '{config['model']}' either does not exist, or is not yet supported." 41 | ) 42 | 43 | rLabels = config["labels"] 44 | 45 | # TODO: make it so that this doesn't redownload every run. 46 | # i.e. 
refactor it into partial.Dockerfile 47 | self.model_module = import_module( 48 | f"tensorflow.keras.applications.{MOD['module']}" 49 | ) 50 | impmodel = getattr(self.model_module, config["model"]) 51 | # NB: this downloads the weights if they don't exist 52 | self.model = impmodel(weights="imagenet") 53 | self.THRESH = 0.1 54 | 55 | def get_preds(img_path): 56 | img = image.load_img(img_path, target_size=(224, 224)) 57 | x = image.img_to_array(img) 58 | x = np.expand_dims(x, axis=0) 59 | x = self.model_module.preprocess_input(x) 60 | preds = self.model.predict(x) 61 | 62 | # top field must be included or defaults to 5, huge number ensures 63 | # it gets all labels 64 | decoded = self.model_module.decode_predictions(preds, top=10) 65 | 66 | # filter by labels provided in whitelist 67 | filteredPreds = [p for p in decoded[0] if p[1] in rLabels] 68 | 69 | return [ 70 | (x[1], float(x[2])) for x in filteredPreds if float(x[2]) >= self.THRESH 71 | ] 72 | 73 | self.get_preds = get_preds 74 | 75 | def analyse_element(self, element, _): 76 | self.logger(f"Running inference on frames in {element.id}...") 77 | val = Etype.CvJson.from_preds(element, self.get_preds) 78 | self.logger(f"Wrote predictions JSON for {element.id}.") 79 | self.disk.delete_local_on_write = True 80 | return val 81 | 82 | def post_analyse(self, elements) -> Etype.Json.as_array(): 83 | return generate_meta(elements, logger=self.logger) 84 | 85 | 86 | module = KerasPretrained 87 | -------------------------------------------------------------------------------- /src/lib/analysers/KerasPretrained/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Classify objects in images using a neural net trained on ImageNet, as made available through the pretrained Keras modules. Inference is run on each image using the specified model, and the labels for the top 10 predictions will be retained in an output JSON of ImageFrameJson format. 2 | args: 3 | - name: model 4 | desc: The model you want to use to classify, 'Resnet50', 'VGG16', or 'VGG19'. All models are trained on ImageNet. 5 | required: true 6 | input: string 7 | - name: labels 8 | desc: Filter results to a limited array of ImageNet labels, if you are only interested in some of them. If not provided, the analyser will return predictions for all labels. 9 | required: true 10 | input: whitelist 11 | -------------------------------------------------------------------------------- /src/lib/analysers/KerasPretrained/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.3.1 2 | pillow==6.2.0 3 | -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import json 4 | import os 5 | import torch 6 | from torch.autograd import Variable 7 | from PIL import Image 8 | 9 | from lib.common.analyser import Analyser 10 | from lib.common.etypes import Etype, Union, Array 11 | from lib.analysers.ProtestsPretrained.utils import transform, modified_resnet50, decode 12 | 13 | PTH_TAR = "/mtriage/model.pth.tar" 14 | 15 | # TODO cuda ? 
16 | 17 | 18 | class ProtestsPretrained(Analyser): 19 | def pre_analyse(self, config): 20 | """ 21 | Init the logging, etc 22 | Init the model 23 | """ 24 | rLabels = config["labels"] 25 | self.THRESH = 0.0 26 | 27 | t = transform() 28 | model = modified_resnet50() 29 | model.load_state_dict( 30 | torch.load( 31 | PTH_TAR, 32 | map_location=torch.device("cpu"), 33 | )["state_dict"] 34 | ) 35 | model.eval() 36 | 37 | def get_preds(img_path): 38 | """ 39 | Gives labelds and probabilities for a single image 40 | This is were we preprocess the image, using a function defined in the model class 41 | """ 42 | # load image 43 | img = Image.open(img_path).convert("RGB") 44 | # process it 45 | x = t(img) 46 | # get in in the right format 47 | x = Variable(x).unsqueeze(0) 48 | # predictions 49 | output = model(x) 50 | # decode 51 | output = decode(output.cpu().data.numpy()[0]) 52 | # filter 53 | output = [(x[0], x[1]) for x in output if x[0] in rLabels] 54 | output = [(x[0], float(x[1])) for x in output if x[1] >= self.THRESH] 55 | 56 | return output 57 | 58 | self.get_preds = get_preds 59 | 60 | def analyse_element( 61 | self, element: Union(Array(Etype.Image), Etype.Json), _ 62 | ) -> Etype.Json: 63 | self.logger(f"Running inference on frames in {element.id}...") 64 | val = Etype.CvJson.from_preds(element, self.get_preds) 65 | self.logger(f"Wrote predictions JSON for {element.id}.") 66 | self.disk.delete_local_on_write = True 67 | return val 68 | 69 | 70 | module = ProtestsPretrained 71 | -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/forensic-architecture/mtriage/7a841241518f831766767d6ddaa6320b8de4be98/src/lib/analysers/ProtestsPretrained/image.jpg -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Classify the presence of protests and violence in images. 2 | args: 3 | - name: labels 4 | desc: Filter results to a limited array of labels. 
5 | required: true 6 | input: whitelist 7 | -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN wget -O /mtriage/model.pth.tar https://www.dropbox.com/s/vgh2nwxrzembxpw/model.pth.tar?dl=0 2 | -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | pillow==6.2.0 4 | numpy<1.17 5 | -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import torch 4 | from torch.autograd import Variable 5 | from PIL import Image 6 | from utils import transform, modified_resnet50, decode 7 | 8 | 9 | def pre_analyse(): 10 | """ 11 | Init the logging, etc 12 | Init the model 13 | Same as KerasPretrained 14 | """ 15 | t = transform() 16 | model = modified_resnet50() 17 | model.load_state_dict( 18 | torch.load( 19 | "model.pth.tar", 20 | map_location=torch.device("cpu"), 21 | )["state_dict"] 22 | ) 23 | model.eval() 24 | 25 | def get_preds(img_path): 26 | """ 27 | Gives labelds and probabilities for a single image 28 | This is were we preprocess the image, using a function defined in the model class 29 | """ 30 | # load image 31 | img = Image.open(img_path).convert("RGB") 32 | # process it 33 | x = t(img) 34 | # get in in the right format 35 | x = Variable(x).unsqueeze(0) 36 | # predictions 37 | output = model(x) 38 | # decode 39 | output = decode(output.cpu().data.numpy()[0]) 40 | 41 | # filter 42 | # return pred, proba 43 | return output 44 | 45 | return get_preds("image.jpg") 46 | 47 | 48 | print(pre_analyse()) 49 | -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | created by: Donghyeon Won 3 | """ 4 | import torch 5 | import torch.nn as nn 6 | import torchvision.transforms as transforms 7 | import torchvision.models as models 8 | 9 | 10 | def transform(): 11 | return transforms.Compose( 12 | [ 13 | transforms.Resize(256), 14 | transforms.CenterCrop(224), 15 | transforms.ToTensor(), 16 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 17 | ] 18 | ) 19 | 20 | 21 | def decode(preds): 22 | classes = [ 23 | "protest", 24 | "violence", 25 | "sign", 26 | "photo", 27 | "fire", 28 | "police", 29 | "children", 30 | "group_20", 31 | "group_100", 32 | "flag", 33 | "night", 34 | "shouting", 35 | ] 36 | return [(x, preds[c]) for c, x in enumerate(classes)] 37 | 38 | 39 | class FinalLayer(nn.Module): 40 | """modified last layer for resnet50 for our dataset""" 41 | 42 | def __init__(self): 43 | super(FinalLayer, self).__init__() 44 | self.fc = nn.Linear(2048, 12) 45 | self.sigmoid = nn.Sigmoid() 46 | 47 | def forward(self, x): 48 | out = self.fc(x) 49 | out = self.sigmoid(out) 50 | return out 51 | 52 | 53 | def modified_resnet50(): 54 | model = models.resnet50(pretrained=True) 55 | model.fc = FinalLayer() 56 | return model 57 | -------------------------------------------------------------------------------- /src/lib/analysers/PytorchFasterRcnn/core.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | from torch.autograd import Variable 4 | from torchvision import transforms 5 | from PIL import Image 6 | from lib.common.analyser import Analyser 7 | from lib.common.etypes import Etype 8 | 9 | 10 | class PytorchFasterRcnn(Analyser): 11 | in_etype = Etype.Any 12 | out_etype = Etype.Any 13 | 14 | def pre_analyse(self, config): 15 | # NB: in future this could be configurable. 16 | model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False, num_classes=6) 17 | if torch.cuda.is_available(): 18 | model.cuda() 19 | self.device = torch.device("cuda:0") 20 | else: 21 | self.device = torch.device("cpu") 22 | state_dict = torch.load(self.base_path/config["model"], map_location=torch.device(self.device)) 23 | model.load_state_dict(state_dict) 24 | model.eval() 25 | self.model = model 26 | self.transforms = transforms.Compose([transforms.Resize(224), transforms.ToTensor()]) 27 | self.threshold = config.get('threshold') if config.get('threshold') else 0.5 28 | 29 | def analyse_element(self, element, config): 30 | def get_preds(img): 31 | img = Image.open(img).convert('RGB') 32 | image_tensor = self.transforms(img).float().unsqueeze_(0) 33 | inp = Variable(image_tensor).to(self.device) 34 | output = self.model(inp)[0] 35 | labels = [config['class_map'][i.item()] for i in output.get('labels')] 36 | scores = output.get('scores') 37 | preds = [(x, y.item()) for x,y in zip(labels, scores) if y.item() > self.threshold] 38 | return preds 39 | 40 | self.logger(f"Running inference for {element.id}...") 41 | return Etype.CvJson.from_preds(element, get_preds) 42 | 43 | module = PytorchFasterRcnn 44 | -------------------------------------------------------------------------------- /src/lib/analysers/PytorchFasterRcnn/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Run inference with weights produced by transfer learning from a FasterRCNN backbone. 2 | args: 3 | - name: model 4 | desc: Relative path to the 'model.pth' file that holds the weights of the transfer-learned model. Must use `fasterrcnn_resnet50_fpn` as a backbone for now. 5 | required: true 6 | input: path 7 | - name: class_map 8 | desc: A list of the class names that correspond to the indices returned (in `output['labels']`). 9 | required: true 10 | input: list 11 | - name: threshold 12 | desc: The cutoff for predictions, between 0 and 1. Defaults to 0.5. 13 | required: false 14 | input: number 15 | -------------------------------------------------------------------------------- /src/lib/analysers/PytorchFasterRcnn/requirements.txt: -------------------------------------------------------------------------------- 1 | # icevision 2 | Pillow 3 | torch 4 | torchvision 5 | -------------------------------------------------------------------------------- /src/lib/analysers/Rank/core.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from lib.common.analyser import Analyser 3 | from lib.common.etypes import Etype 4 | from lib.util.rank_cvjson import rank 5 | 6 | 7 | class Rank(Analyser): 8 | """NOTE: This class is kept for backwards compatibility, but should not be 9 | used in new implementations. Instaed, simply use the imported `rank` 10 | function directly in the relevant analyser's `post_analyse` method. 
11 | """ 12 | 13 | def analyse_element(self, element: Etype.CvJson, _) -> Etype.Any: 14 | return element 15 | 16 | def post_analyse(self, elements) -> Etype.Json: 17 | return rank(elements, logger=self.logger) 18 | 19 | 20 | module = Rank 21 | -------------------------------------------------------------------------------- /src/lib/analysers/Rank/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Rewrite an ImageFrameJson as a ranked ImageFrameJson. This step is necessary to display the results via the 'framemap' viewer. 2 | args: 3 | - name: threshold 4 | desc: The minimum score for which a prediction should be counted towards an element's rank. 5 | required: false 6 | input: float 7 | 8 | -------------------------------------------------------------------------------- /src/lib/analysers/TorchHub/core.py: -------------------------------------------------------------------------------- 1 | from lib.common.analyser import Analyser 2 | from lib.common.etypes import Etype 3 | 4 | from PIL import Image 5 | import torch 6 | 7 | def cls_and_conf(pred, names): 8 | # `pred` is an array with 6 values: x1, y1, x2, y2, confidence, class 9 | _,_,_,_,conf,cl = pred 10 | cl = names[int(cl)] 11 | conf = float(conf) 12 | return (cl, conf) 13 | 14 | 15 | class TorchHub(Analyser): 16 | in_etype = Etype.Any 17 | out_etype = Etype.Any 18 | 19 | def pre_analyse(self, config): 20 | if config.get('args') is None: config['args'] = [] 21 | if config.get('kwargs') is None: config['kwargs'] = {} 22 | 23 | self.model = torch.hub.load(config['repo'], *config['args'], **config['kwargs']) 24 | self.model.conf = 0.5 # confidence threshold 25 | self.model.iou = 0.45 # NMS IoU threshold 26 | self.logger("Model loaded from remote.") 27 | 28 | def analyse_element(self, element, config): 29 | imgs = [Image.open(x) for x in element.paths] 30 | results = self.model(imgs).tolist() 31 | self.logger(f"Batched inference successfully run for element {element.id}.") 32 | 33 | def get_preds(img_path): 34 | idx = element.paths.index(img_path) 35 | result = results[idx] 36 | return [cls_and_conf(p, result.names) for p in result.pred] 37 | 38 | return Etype.CvJson.from_preds(element, get_preds) 39 | 40 | module = TorchHub 41 | -------------------------------------------------------------------------------- /src/lib/analysers/TorchHub/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Run inference using weights trained with https://github.com/ultralytics/yolov5 2 | args: 3 | - name: repo 4 | desc: Github repository from which to load the Torch hub model, i.e. 'ultralytics/yolov5' 5 | required: true 6 | input: string 7 | - name: args 8 | desc: Arguments for `torch.hub.load()` function. 
9 | required: false 10 | input: list 11 | - name: kwargs 12 | desc: Keyword arguments for `torch.hub.load()` function 13 | required: false 14 | input: dict 15 | -------------------------------------------------------------------------------- /src/lib/analysers/TorchHub/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN apt-get install -y libgl1-mesa-dev 2 | -------------------------------------------------------------------------------- /src/lib/analysers/TorchHub/requirements.txt: -------------------------------------------------------------------------------- 1 | # numpy>=1.18.5 2 | # PyYAML>=5.3.1 3 | # tensorboard>=2.2 4 | # wandb 5 | # thop # FLOPS computation 6 | # pycocotools>=2.0 # COCO mAP 7 | 8 | Cython 9 | matplotlib>=3.2.2 10 | opencv-python>=4.1.2 11 | Pillow 12 | scipy>=1.4.1 13 | requests 14 | torch>=1.7.0 15 | torchvision>=0.8.1 16 | tqdm>=4.41.0 17 | 18 | seaborn>=0.11.0 19 | pandas 20 | -------------------------------------------------------------------------------- /src/lib/analysers/TwintToGephi/core.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import twint 4 | import pandas as pd 5 | from pathlib import Path 6 | from lib.common.analyser import Analyser 7 | from lib.common.etypes import Etype 8 | from lib.util.twint import to_serializable, pythonize 9 | 10 | 11 | from collections import namedtuple 12 | from datetime import datetime 13 | 14 | 15 | def fmt_timestmap(dstamp, tstamp, tzone): 16 | ds = datetime.strptime(dstamp, "%Y-%m-%d") 17 | fmtted_ds = ds.strftime("%m/%d/%y") 18 | return f"{fmtted_ds} {tstamp}" 19 | 20 | 21 | TMP = Path("/tmp") 22 | TweetEdge = namedtuple( 23 | "TweetEdge", "date tweet urls domains hashtags tweet_id inreplyto_id" 24 | ) 25 | 26 | 27 | class CsvGraph: 28 | node_labels = [ 29 | "Vertex", 30 | "Followed", 31 | "Followers", 32 | "Tweets", 33 | "Favorites", 34 | "Description", 35 | "Location", 36 | "Web", 37 | "Time Zone", 38 | "Joined Twitter Date (UTC)", 39 | ] 40 | edge_labels = [ 41 | "Vertex 1", 42 | "Vertex 2", 43 | "Width", 44 | "Relationship", 45 | "Relationship Date (UTC)", 46 | "Tweet", 47 | "URLs in Tweet", 48 | "Domains in Tweet", 49 | "Hashtags in Tweet", 50 | "Tweet Date (UTC)", 51 | "Twitter Page for Tweet", 52 | "Imported ID", 53 | "In-Reply-To Tweet ID", 54 | ] 55 | 56 | def __init__(self): 57 | self.nodes = [] 58 | self.edges = [] 59 | 60 | def has_node(self, name: str): 61 | return name in self.nodes 62 | 63 | def add_node(self, name: str): 64 | if name not in self.nodes: 65 | self.nodes.append(name) 66 | 67 | def add_edge(self, _from: dict, _to: dict): 68 | is_reply = _to is not None 69 | 70 | self.add_node(_from["username"]) 71 | if is_reply: 72 | self.add_node(_to["username"]) 73 | 74 | edge = TweetEdge( 75 | date=fmt_timestmap( 76 | _from["datestamp"], _from["timestamp"], _from["timezone"] 77 | ), 78 | tweet=_from["tweet"], 79 | urls=_from["urls"], 80 | domains=[], # NB: no domains provided in obj 81 | hashtags=_from["hashtags"], 82 | tweet_id=_from["id"], 83 | inreplyto_id=_to["id"] if _to is not None else None, 84 | ) 85 | 86 | self.edges.append( 87 | [ 88 | _from["username"], 89 | _to["username"] if is_reply else _from["username"], 90 | 1, # width defaults to 1 91 | "Tweet" if not is_reply else "Replies To", # relationship 92 | edge.date, # relationship date 93 | edge.tweet, 94 | "- ".join(edge.urls) if isinstance(edge.urls, list) else edge.urls, 95 | "- ".join(edge.domains) 96 | if 
isinstance(edge.domains, list) 97 | else edge.domains, 98 | "- ".join(edge.hashtags) 99 | if isinstance(edge.hashtags, list) 100 | else edge.hashtags, 101 | edge.date, # tweet date 102 | f"https://twitter.com/${_from['username']}/status/${_from['id']}", 103 | edge.tweet_id, # the tweet's id 104 | "" 105 | if not is_reply 106 | else edge.inreplyto_id, # the id of the tweet to which this replies. 107 | ] 108 | ) 109 | 110 | def to_xlsx(self, path): 111 | """ Save graph as XLSX file. The default tab will be edges, with an extra tab for nodes. """ 112 | edge_df = pd.DataFrame.from_records(self.edges) 113 | edge_df.columns = CsvGraph.edge_labels 114 | node_df = pd.DataFrame.from_records([[x] for x in self.nodes]) 115 | node_df.columns = ["Vertex"] 116 | 117 | writer = pd.ExcelWriter(path, engine="xlsxwriter") 118 | edge_df.to_excel(writer, sheet_name="Edges") 119 | node_df.to_excel(writer, sheet_name="Vertices") 120 | writer.save() 121 | 122 | 123 | class TwintToGephi(Analyser): 124 | in_etype = Etype.Json 125 | out_etype = Etype.Any 126 | 127 | def pre_analyse(self, _): 128 | # keeps a record of which user ids have been indexed so that there's no 129 | # repeated work. 130 | self.indexed_ids = [] 131 | # usernames (to easily check whether a user exists in the graph or not) 132 | self.graph = CsvGraph() 133 | 134 | def analyse_element(self, element, _): 135 | with open(element.paths[0], "r") as f: 136 | orig_tweet = json.load(f) 137 | orig_tweet = pythonize(orig_tweet) 138 | 139 | tweet_with_replies = [orig_tweet] 140 | reply_count = orig_tweet["replies_count"] 141 | # retweet_count = orig_tweet["retweets_count"] 142 | usr = orig_tweet["username"] 143 | 144 | # TODO: get retweets, as they are mentions 145 | # if retweet_count > 0: 146 | # retweets = self.get_all_retweets(usr) 147 | 148 | if reply_count > 0 and usr not in self.indexed_ids: 149 | # TODO: keep a record so that we don't need to rescrape 150 | # self.indexed_ids.append(usr) 151 | 152 | all_tweets = self.get_all_tweets_sent_to(usr) 153 | conv_tweets = [ 154 | tweet 155 | for tweet in all_tweets 156 | if tweet["conversation_id"] == orig_tweet["conversation_id"] 157 | ] 158 | if len(conv_tweets) > 0: 159 | tweet_with_replies = tweet_with_replies + conv_tweets 160 | self.logger(f"{len(conv_tweets)} replies added to tweet {element.id}.") 161 | 162 | output = TMP / f"{element.id}.json" 163 | with open(output, "w+") as f: 164 | json.dump(tweet_with_replies, f) 165 | 166 | element.paths = [output] 167 | 168 | return element 169 | 170 | def get_all_retweets(self, username): 171 | c = twint.Config() 172 | c.Username = username 173 | c.Retweets = True 174 | twint.run.Profile(c) 175 | 176 | def get_all_tweets_sent_to(self, username): 177 | """ See https://github.com/twintproject/twint/issues/513 """ 178 | c = twint.Config() 179 | c.To = f"@{username}" 180 | c.Retweets = True 181 | c.Since = self.config["uploaded_after"] 182 | c.Until = self.config["uploaded_before"] 183 | c.Store_object = True 184 | self.logger(f"Scraping tweets sent to {username}...") 185 | twint.run.Search(c) 186 | results = twint.output.tweets_list 187 | twint.output.tweets_list = [] 188 | 189 | return to_serializable(results) 190 | 191 | def add_to_graph(self, t, inreplyto=None): 192 | """Add the relevant rows (for `nodes` and `edges`) to a graph from 193 | a Twint-formatted tweet (Python dictionary)""" 194 | self.graph.add_node(t["username"]) 195 | 196 | self.graph.add_edge(t, inreplyto) 197 | 198 | def post_analyse(self, elements): 199 | for el in elements: 200 | el_json 
= el.paths[0] 201 | with open(el_json) as f: 202 | tweets = json.load(f) 203 | 204 | initial_tweet = tweets[0] 205 | self.logger(f"Adding tweet {initial_tweet['id']} to graph...") 206 | self.add_to_graph(initial_tweet) 207 | for tweet in tweets[1:]: 208 | self.logger(f"Adding reply {tweet['id']} to graph...") 209 | self.add_to_graph(tweet, inreplyto=initial_tweet) 210 | 211 | xlsx_path = TMP / "final.xlsx" 212 | self.graph.to_xlsx(xlsx_path) 213 | return Etype.Any("FINAL", xlsx_path) 214 | 215 | 216 | module = TwintToGephi 217 | -------------------------------------------------------------------------------- /src/lib/analysers/TwintToGephi/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Create a single element from Twitter elements, which contains two CSV files that specify a relational graph. As replies are determined by scraping all tweets in a user's timeline and then filtering by conversation ID, a requirement of twint, `uploaded_before` and `uploaded_after` should be provided so that only relevant tweets need to be scraped. 2 | args: 3 | - name: uploaded_before 4 | desc: Only return tweets before this date. 5 | required: true 6 | input: date 7 | - name: uploaded_after 8 | desc: Only return tweets after this date. 9 | required: true 10 | input: date 11 | -------------------------------------------------------------------------------- /src/lib/analysers/TwintToGephi/requirements.txt: -------------------------------------------------------------------------------- 1 | xlsxwriter 2 | pandas 3 | -------------------------------------------------------------------------------- /src/lib/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/forensic-architecture/mtriage/7a841241518f831766767d6ddaa6320b8de4be98/src/lib/common/__init__.py -------------------------------------------------------------------------------- /src/lib/common/etypes.py: -------------------------------------------------------------------------------- 1 | import os 2 | from enum import Enum 3 | from pathlib import Path 4 | from copy import deepcopy 5 | from functools import reduce 6 | from types import SimpleNamespace as Ns 7 | from typing import Union as _Union, List, TypeVar 8 | from abc import abstractmethod 9 | from lib.common.exceptions import EtypeCastError 10 | from lib.common.get import get_custom_etypes 11 | 12 | 13 | class LocalElement: 14 | """Local as in not from storage, but on the same comp where mtriage is running. 15 | Returned from Selector.retrieve_element, and also Analyser.analyse_element.""" 16 | 17 | def __init__(self, id=None, query=None, paths=None, et=None): 18 | self.id = id # the element id 19 | self.query = query # the query string used to retrieve the element 20 | self.paths = ( 21 | paths # the path/s where the element's media are accessible locally 22 | ) 23 | self.et = et 24 | 25 | 26 | class LocalElementsIndex: 27 | """Similar to LocalElement, on the same comp as mtriage is running. 
28 | Initialised with an array of arrays, where each inner array represents one element to be retrieved.""" 29 | 30 | def __init__(self, rows=[]): 31 | self.rows = rows 32 | 33 | 34 | Pth = TypeVar("Pth", str, Path) 35 | Function = type(lambda _: None) 36 | 37 | 38 | class Et: 39 | def __init__(self, name, filter_func, is_array=False): 40 | self.id = name 41 | self.filter_func = filter_func 42 | self.is_array = is_array 43 | 44 | def __repr__(self): 45 | ia = self.is_array 46 | return f"{'Array(' if ia else ''}{self.id.capitalize()}{')' if ia else ''}" 47 | 48 | def __str__(self): 49 | return self.__repr__() 50 | 51 | def __get_etype(self): 52 | for etype in Etype: 53 | if self.name == etype.name: 54 | return etype 55 | return None 56 | 57 | def __call__( 58 | self, el_id: str, paths: _Union[Pth, List[Pth]], is_array=False 59 | ) -> LocalElement: 60 | if isinstance(paths, (str, Path)): 61 | paths = [paths] 62 | else: 63 | paths = [Path(x) if isinstance(x, str) else x for x in paths] 64 | paths = self.filter(paths) 65 | 66 | # NOTE: a bit convoluted. Only do an array check if etype is not custom, 67 | # as custom etypes could have more sophisticated expressions than core 68 | # types. TODO: make more elegant. 69 | is_custom = self.id in [x.__name__ for x in get_custom_etypes()] 70 | if not is_custom: 71 | if len(paths) == 0 or ( 72 | self.id != "Any" 73 | and not (is_array or self.is_array) 74 | and (len(paths) != 1 or not paths[0].is_file()) 75 | ): 76 | raise EtypeCastError(self) 77 | 78 | # TODO: confirm all source files exist 79 | this_cls = deepcopy(self) 80 | if this_cls.is_array: 81 | this_cls.is_array = True 82 | return LocalElement(paths=paths, id=el_id, et=this_cls) 83 | 84 | def filter(self, ls): 85 | """ Exists to be overwritten, `filter_func` is just the fallback. """ 86 | return self.filter_func(ls) 87 | 88 | def __eq__(self, other): 89 | return all( 90 | [ 91 | isinstance(other, Et), 92 | self.id == other.id, 93 | self.is_array == other.is_array, 94 | ] 95 | ) 96 | 97 | def __lt__(self, other): 98 | return self.id < other.id 99 | 100 | def as_array(self): 101 | return Et(self.id, self.filter, is_array=True) 102 | 103 | def array(self): 104 | return self.as_array() 105 | 106 | @property 107 | def is_union(self): 108 | return False 109 | 110 | 111 | class UnionEt(Et): 112 | """ A higher order Etype that allows the additive composition of Ets. """ 113 | 114 | def __init__(self, *ets): 115 | self.ets = ets 116 | super().__init__(self, str(self), is_array=False) 117 | 118 | def __repr__(self): 119 | inner = "" 120 | for et in self.ets: 121 | inner += f"{et}, " 122 | inner = inner[:-2] 123 | 124 | return f"Union({inner})" 125 | 126 | def __eq__(self, other): 127 | return all([x == y for x, y in zip(sorted(self.ets), sorted(other.ets))]) 128 | 129 | def __call__(self, el_id: str, paths: _Union[Pth, List[Pth]]) -> LocalElement: 130 | 131 | self.ets[1](el_id, paths) 132 | ets = [T(el_id, paths) for T in self.ets] 133 | 134 | all_paths = [] 135 | 136 | for et in ets: 137 | all_paths += et.paths 138 | return LocalElement(paths=all_paths, id=el_id, et=self) 139 | 140 | @property 141 | def is_union(self): 142 | return True 143 | 144 | 145 | def class_as_et(class_obj): 146 | return class_obj(class_obj.__name__, class_obj.filter) 147 | # TODO: get across all custom methods somehow... 
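# Illustrative usage of the Et machinery in this module (a sketch; paths and
# element ids are hypothetical, see src/test/test_etypes.py for the tested
# behaviour):
#
#   img  = Etype.Image("el1", "/tmp/1.png")                         # single file
#   imgs = Array(Etype.Image)("el1", ["/tmp/1.png", "/tmp/2.jpg"])  # many files
#   mix  = Union(Etype.Image, Etype.Audio)("el1", ["/tmp/1.png", "/tmp/1.mp3"])
#   auto = Etype.cast("el1", ["/tmp/1.png", "/tmp/1.mp3"])          # implicit Union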
148 | 149 | 150 | def fglob(ps, exts): 151 | return [p for p in ps if p.suffix.lower() in exts] 152 | 153 | 154 | def all_etypes(): 155 | base = [x for x in dir(Etype) if not x.startswith("_") and x != "cast"] 156 | custom = get_custom_etypes() 157 | 158 | for t in base: 159 | yield getattr(Etype, t) 160 | for t in custom: 161 | yield t(t.__name__, t.filter) 162 | 163 | 164 | def cast(el_id, paths: _Union[List[Pth], Pth], to: Et = None) -> LocalElement: 165 | if isinstance(paths, (Path, str)): 166 | paths = [paths] 167 | # NB: cast even at the expense of losing some paths if explicit ET is provided 168 | if to is not None: 169 | return to(el_id, paths=paths) 170 | # implicit cast to the most inclusive type 171 | valid = [] 172 | if len(paths) == 0: 173 | raise EtypeCastError("Paths cannot be empty.") 174 | 175 | for et in all_etypes(): 176 | if et.id == "Any": 177 | continue 178 | try: 179 | # if both array and singular casts are valid, precedence given to singular 180 | et(el_id, paths=paths, is_array=True) 181 | v = Array(et) 182 | try: 183 | et(el_id, paths=paths) 184 | v = et 185 | except: 186 | pass 187 | valid.append(v) 188 | except EtypeCastError: 189 | pass 190 | 191 | if len(valid) == 0: 192 | return Etype.Any(el_id, paths) 193 | elif len(valid) == 1: 194 | return valid[0](el_id, paths) 195 | else: 196 | # multiple valid types, return a union 197 | etyped_paths = reduce(lambda a, b: a + b(el_id, paths).paths, valid, []) 198 | if len(etyped_paths) != len(paths): 199 | return Etype.Any(el_id, paths) 200 | return Union(*valid)(el_id, paths) 201 | 202 | 203 | class Etype: 204 | Any = Et("Any", lambda ps: ps) 205 | Image = Et("Image", lambda ps: fglob(ps, [".bmp", ".jpg", ".jpeg", ".png"])) 206 | Video = Et("Video", lambda ps: fglob(ps, [".mp4", ".mov"])) 207 | Audio = Et("Audio", lambda ps: fglob(ps, [".mp3", ".wav", ".m4a", ".aac"])) 208 | Json = Et("Json", lambda ps: fglob(ps, [".json"])) 209 | 210 | 211 | Etype.cast = cast 212 | # make custom etypes available on Etype 213 | for t in get_custom_etypes(): 214 | setattr(Etype, t.__name__, t(t.__name__, t.filter)) 215 | Union = UnionEt 216 | Array = lambda x: x.as_array() 217 | Index = LocalElementsIndex 218 | -------------------------------------------------------------------------------- /src/lib/common/exceptions.py: -------------------------------------------------------------------------------- 1 | class SelectorNotFoundError(Exception): 2 | def __init__(self, selector): 3 | super().__init__( 4 | f"""Could not find a valid selector named '{selector}'. Ensure that you have a folder named '{selector}' 5 | in the selectors directory, and that it exports a valid Selector.""" 6 | ) 7 | 8 | 9 | class AnalyserNotFoundError(Exception): 10 | def __init__(self, analyser): 11 | super().__init__( 12 | f"""Could not find a valid analyser named '{analyser}'. 
Ensure that you have a folder named '{analyser}' 13 | in the analysers directory, and that it exports a valid Analyser.""" 14 | ) 15 | 16 | 17 | class WorkingDirectorNotFoundError(Exception): 18 | def __init__(self, workdir): 19 | super().__init__( 20 | f"""The working directory path that you specified, '{workdir}', does not exist or is otherwise corrupted.""" 21 | ) 22 | 23 | 24 | class InvalidPhaseError(Exception): 25 | def __init__(self): 26 | super().__init__("The 'phase' argument must be either 'select' or 'analyse'.") 27 | 28 | 29 | class InvalidAnalyserConfigError(Exception): 30 | def __init__(self, msg): 31 | super().__init__(f"Invalid analyser config - {msg}") 32 | 33 | 34 | class InvalidSelectorConfigError(Exception): 35 | def __init__(self, msg): 36 | super().__init__(f"Invalid selector config - {msg}") 37 | 38 | 39 | class InvalidYamlError(Exception): 40 | def __init__(self, msg): 41 | super().__init__(f"Invalid YAML - {msg}") 42 | 43 | 44 | class ElementShouldSkipError(Exception): 45 | def __init__(self, msg): 46 | super().__init__(f"{msg} - skipping element") 47 | 48 | 49 | class ElementShouldRetryError(Exception): 50 | def __init__(self, msg): 51 | super().__init__(f"{msg} - attempt retry") 52 | 53 | 54 | class SelectorIndexError(Exception): 55 | def __init__(self, msg): 56 | super().__init__(f"Selector index failed - {msg}") 57 | 58 | 59 | class ImproperLoggedPhaseError(Exception): 60 | def __init__(self, fname): 61 | super().__init__( 62 | f"""The method '{fname}' does not belong to a class that inherits from MTModule. The 63 | phase decorator can only be applied to methods on such a class.""" 64 | ) 65 | 66 | 67 | class BatchedPhaseArgNotGenerator(Exception): 68 | def __init__(self, fname): 69 | super().__init__( 70 | f"""The method '{fname}' cannot be batched. The 'batched_phase' decorator can only be applied to a function that takes a generator as its first and only argument. """ 71 | ) 72 | 73 | 74 | class MTriageStorageCorruptedError(Exception): 75 | def __init__(self, fname): 76 | super().__init__( 77 | "MTriage encountered an unexpected file structure in selectors or analysers. Ensure you specified the correct working directory." 78 | ) 79 | 80 | 81 | class EtypeCastError(Exception): 82 | def __init__(self, msg): 83 | super().__init__(f"Could not cast element as {msg}") 84 | 85 | 86 | class InvalidElementsIn(Exception): 87 | def __init__(self, comp, msg): 88 | super().__init__(f"The elements_in '{comp}' is not valid. {msg}") 89 | 90 | 91 | class InvalidAnalyserElements(Exception): 92 | pass 93 | 94 | 95 | class InvalidCarry(Exception): 96 | def __init__(self, msg): 97 | super().__init__(f"The 'carry' attribute you provided is invalid: {msg}") 98 | 99 | 100 | class InvalidElementIndex(Exception): 101 | def __init__(self): 102 | super().__init__( 103 | f"""The element index read from disk is an invalid generator. 
Check that your index method is 104 | correct, and that your disk has not been corrupted.""" 105 | ) 106 | 107 | 108 | class InvalidStorageQuery(Exception): 109 | def __init__(self, query, msg): 110 | super().__init__(f"The query '{query}' is invalid: {msg}") 111 | -------------------------------------------------------------------------------- /src/lib/common/get.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from importlib import import_module 3 | from lib.common.util import files 4 | 5 | 6 | def get_module(_from, key): 7 | """Dynamically loads in all analysers from the analysers folder, generating a dictionary in which the folder name 8 | is the key, and the export from 'main' is the value. 9 | """ 10 | if _from == "select": 11 | module_folder = f"lib.selectors" 12 | elif _from == "analyse": 13 | module_folder = f"lib.analysers" 14 | else: 15 | raise ImportError("The phase argument must be either 'select' or 'analyse'") 16 | 17 | pth = f"{module_folder}.{key}.core" 18 | mod = import_module(pth) 19 | return mod.module 20 | 21 | 22 | def get_custom_etypes(): 23 | base_import = "lib.etypes" 24 | module_folder = Path("/mtriage/src/lib/etypes") 25 | all_etypes = [t.stem for t in files(module_folder)] 26 | imports = [f"{base_import}.{p}" for p in all_etypes] 27 | return [import_module(mod).etype for mod in imports] 28 | -------------------------------------------------------------------------------- /src/lib/common/selector.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from abc import abstractmethod 5 | from typing import Dict, Generator, Union, List 6 | from types import SimpleNamespace 7 | from lib.common.mtmodule import MTModule 8 | from lib.common.exceptions import ( 9 | InvalidElementIndex, 10 | ElementShouldRetryError, 11 | ElementShouldSkipError, 12 | EtypeCastError, 13 | ) 14 | from lib.common.etypes import LocalElement, LocalElementsIndex 15 | from lib.common.storage import Storage, LocalStorage 16 | from lib.common.util import MAX_CPUS 17 | 18 | 19 | class Selector(MTModule): 20 | """A Selector implements the indexing and retrieving of media for a platform or otherwise distinct space. 21 | 22 | 'index' and 'retrieve_element' are abstract methods that need to be defined on selectors. Other attributes and 23 | methods in the class should not have to be explicitly referenced by selectors, as all data necessary is passed in 24 | the arguments of exposed methods. 25 | """ 26 | 27 | def __init__(self, config, module, storage): 28 | super().__init__(config, module, storage=storage) 29 | 30 | @abstractmethod 31 | def index(self, config) -> LocalElementsIndex: 32 | """TODO: indicate the exact format this should output. 33 | Should populate a dataframe with the results, keep logs, and then call: 34 | self.index_complete(df, logs) 35 | 36 | REQUIRED: each result in the dataframe must contain an 'id' field containing 37 | a unique identifier for the element. 38 | 39 | NOTE: should be a relatively light pass that designates the space to be retrieved. 40 | No options for parallelisation, run on a single CPU. 41 | """ 42 | raise NotImplementedError 43 | 44 | @abstractmethod 45 | def retrieve_element(self, row: SimpleNamespace, config) -> LocalElement: 46 | """Retrieve takes a single row from LocalElementsIndex as an argument, which was produced by the 'index' 47 | method. Data that has already been retrieved will not be retrieved again. 
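        For instance, the Local selector's retrieve_element simply reads the 'path'
        field from each indexed row and returns Etype.Any(element.id, paths=[element.path]).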
The method should return 48 | a LocalElement, which mtriage will then persist to an instance of `Storage`.""" 49 | raise NotImplementedError 50 | 51 | # optionally implemented by child 52 | # both ELEMENT_DIR and config are implicitly available on self, but passed explicitily for convenience 53 | def pre_retrieve(self, config: Dict): 54 | pass 55 | 56 | def post_retrieve(self, config: Dict): 57 | pass 58 | 59 | @MTModule.phase("index") 60 | def start_indexing(self): 61 | element_map = self.index(self.config) 62 | if element_map is not None: 63 | self.disk.write_elements_index(self.name, element_map) 64 | 65 | def start_retrieving(self): 66 | self.logger( 67 | f"Running selection {'in parallel' if self.in_parallel else 'serially'}" 68 | ) 69 | 70 | self.__pre_retrieve() 71 | elements = self.disk.read_elements_index(self.name).rows 72 | if not self.in_parallel: 73 | try: 74 | elements = [e for e in elements] 75 | except: 76 | raise InvalidElementIndex() 77 | self.__retrieve(elements) 78 | self.__post_retrieve() 79 | self.disk.write_meta( 80 | self.name, 81 | { 82 | "etype": self.out_etype.__repr__(), 83 | "config": self.get_full_config(), 84 | "stage": {"name": self.name, "module": "selector"}, 85 | }, 86 | ) 87 | 88 | @MTModule.phase("pre-retrieve") 89 | def __pre_retrieve(self): 90 | self.pre_retrieve(self.config) 91 | 92 | @MTModule.phase("retrieve") 93 | def __retrieve(self, element_indices: Union[List, Generator]): 94 | for element_index in element_indices: 95 | self.__attempt_retrieve(5, element_index) 96 | self.disk.delete_local_on_write = False 97 | 98 | @MTModule.phase("post-retrieve") 99 | def __post_retrieve(self): 100 | self.post_retrieve(self.config) 101 | 102 | def __attempt_retrieve(self, attempts, element_index): 103 | try: 104 | new_element = self.retrieve_element(element_index, self.config) 105 | if new_element is None: 106 | return 107 | success = self.disk.write_element(self.name, new_element) 108 | if not success: 109 | raise ElementShouldRetryError("Unsuccessful storage") 110 | 111 | except ElementShouldSkipError as e: 112 | self.error_logger(str(e), element_index) 113 | except ElementShouldRetryError as e: 114 | self.error_logger(str(e), element_index) 115 | if attempts > 1: 116 | return self.__attempt_retrieve(attempts - 1, element_index) 117 | else: 118 | self.error_logger( 119 | "failed after maximum retries - skipping element", element_index 120 | ) 121 | # TODO: flag to turn this off during development should be passed during run 122 | except Exception as e: 123 | if self.is_dev(): 124 | raise e 125 | else: 126 | self.error_logger( 127 | "unknown exception raised - skipping element", element_index 128 | ) 129 | -------------------------------------------------------------------------------- /src/lib/common/util.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import multiprocessing 3 | from pathlib import Path 4 | from typing import List 5 | 6 | MAX_CPUS = multiprocessing.cpu_count() - 1 7 | 8 | 9 | def get_batch_size(ls_len): 10 | """ Determine the batch size for multiprocessing. 
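    For example, with MAX_CPUS == 7 (an 8-core machine) a list of 100 items
    gives a batch size of 100 // 8 == 12; lists shorter than MAX_CPUS are
    processed as a single batch.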
""" 11 | if ls_len >= MAX_CPUS: 12 | return ls_len // (MAX_CPUS + 1) 13 | # TODO: improve this heuristic for splitting up jobs 14 | return ls_len 15 | 16 | 17 | def batch(iterable, n=1): 18 | l = len(iterable) 19 | for ndx in range(0, l, n): 20 | yield iterable[ndx : min(ndx + n, l)] 21 | 22 | 23 | def serialize_dict(_dict): 24 | ret = "" 25 | for key in _dict: 26 | val = _dict[key] 27 | if isinstance(val, dict): 28 | ret += serialize_dict(val) 29 | else: 30 | ret += f"{key}{val}" 31 | return ret 32 | 33 | 34 | def hashdict(_dict): 35 | m = hashlib.md5() 36 | m.update(serialize_dict(_dict).encode("utf-8")) 37 | return m.hexdigest() 38 | 39 | 40 | def subdirs(path: Path) -> List[Path]: 41 | """ Return a list of Paths for subdirectories in a directory """ 42 | if path.is_dir(): 43 | return [f for f in path.iterdir() if f.is_dir()] 44 | else: 45 | return [] 46 | 47 | 48 | def files(path: Path) -> List[Path]: 49 | """ Return a list of Paths for files in a directory """ 50 | return [x for x in path.iterdir() if x.is_file()] 51 | -------------------------------------------------------------------------------- /src/lib/etypes/cvjson.py: -------------------------------------------------------------------------------- 1 | import json 2 | import ntpath 3 | from typing import List, Union 4 | from pathlib import Path 5 | from lib.common.etypes import Etype, Et, Pth 6 | from lib.common.exceptions import EtypeCastError 7 | 8 | TMP = Path("/tmp") 9 | IMG_SFXS = [".bmp", ".jpg", ".png", ".jpeg"] 10 | 11 | 12 | def deduce_frame_no(path): 13 | # TODO: error handling 14 | head, tail = ntpath.split(path) 15 | f = tail or ntpath.basename(head) 16 | num = f.split(".")[0] 17 | return int(num) 18 | 19 | 20 | def prepare_json(path): 21 | out = {} 22 | if path is not None: 23 | with open(path, "r") as f: 24 | f = json.load(f) 25 | out["title"] = f["title"] 26 | out["description"] = f["description"] 27 | out["webpage_url"] = f["webpage_url"] 28 | out["duration"] = f["duration"] 29 | out["upload_date"] = f["upload_date"] 30 | return out 31 | 32 | 33 | class CvJson(Et): 34 | """A custom Etype for computer vision (CV) json files, representing 35 | predictions on a set of frames.""" 36 | 37 | def __repr__(self): 38 | return "CvJson" 39 | 40 | def filter(self, paths: Union[Pth, List[Pth]]) -> List[Pth]: 41 | if isinstance(paths, (str, Path)): 42 | paths = [paths] 43 | 44 | pths = [] 45 | json_count = 0 46 | for p in paths: 47 | if p.suffix in ".json" and p.name == "scores.json": 48 | pths.append(p) 49 | json_count += 1 50 | pths.append(p) if p.suffix in IMG_SFXS else None 51 | if json_count != 1: 52 | raise EtypeCastError(self) 53 | return pths 54 | 55 | @staticmethod 56 | def from_preds(element, get_preds): 57 | """ Generate an element containing classifier predictions in a format 58 | appropriate for CvJson, i.e. a single JSON file 'preds.json' that 59 | contains an object representing which classes are predicted for each 60 | frame. 61 | 62 | This function assumes that `element.paths` represents an array of images 63 | to be interpreted. The `get_preds` function operates on a single image, 64 | accepting one argument that is a path to an image. It returns a list of 65 | tuples `('classname', 0.8)`, where `'classname'` is a string 66 | representing the class predicted, and `0.8` is the normalized prediction 67 | probability between 0 and 1. See KerasPretrained/core.py in analysers 68 | for an example. 
""" 69 | imgs = [p for p in element.paths if p.suffix in IMG_SFXS] 70 | labels = {} 71 | for imp in imgs: 72 | frame_no, preds = deduce_frame_no(imp), get_preds(imp) 73 | for pred_label, pred_conf in preds: 74 | if pred_label in labels.keys(): 75 | labels[pred_label]["frames"].append(frame_no) 76 | labels[pred_label]["scores"].append(pred_conf) 77 | else: 78 | labels[pred_label] = {"frames": [frame_no], "scores": [pred_conf]} 79 | 80 | meta = [p for p in element.paths if p.suffix in ".json"] 81 | meta = meta[0] if len(meta) > 0 else None 82 | out = {**prepare_json(meta), "labels": labels} 83 | base = TMP / element.id 84 | base.mkdir(parents=True, exist_ok=True) 85 | outp = base / "preds.json" 86 | 87 | with open(outp, "w") as fp: 88 | json.dump(out, fp) 89 | 90 | return Etype.Json(element.id, outp) 91 | 92 | 93 | etype = CvJson 94 | -------------------------------------------------------------------------------- /src/lib/selectors/FourChan/boards.py: -------------------------------------------------------------------------------- 1 | viable_boards = [ 2 | "a", 3 | "aco", 4 | "adv", 5 | "an", 6 | "asp", 7 | "b", 8 | "bant", 9 | "biz", 10 | "c", 11 | "cgl", 12 | "ck", 13 | "cm", 14 | "co", 15 | "d", 16 | "diy", 17 | "e", 18 | "f", 19 | "fa", 20 | "fit", 21 | "g", 22 | "gd", 23 | "gif", 24 | "h", 25 | "hc", 26 | "his", 27 | "hm", 28 | "hr", 29 | "i", 30 | "ic", 31 | "int", 32 | "jp", 33 | "k", 34 | "lgbt", 35 | "lit", 36 | "m", 37 | "mlp", 38 | "mu", 39 | "n", 40 | "news", 41 | "o", 42 | "out", 43 | "p", 44 | "po", 45 | "pol", 46 | "qa", 47 | "qst", 48 | "r", 49 | "r9k", 50 | "s", 51 | "s4s", 52 | "sci", 53 | "soc", 54 | "sp", 55 | "t", 56 | "tg", 57 | "toy", 58 | "trash", 59 | "trv", 60 | "tv", 61 | "u", 62 | "v", 63 | "vg", 64 | "vip", 65 | "vp", 66 | "vr", 67 | "w", 68 | "wg", 69 | "wsg", 70 | "wsr", 71 | "x", 72 | "y", 73 | ] 74 | -------------------------------------------------------------------------------- /src/lib/selectors/FourChan/core.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import os 4 | import html2text 5 | from pathlib import Path 6 | from urllib.request import urlretrieve 7 | from lib.common.selector import Selector 8 | from lib.common.etypes import Etype, LocalElementsIndex 9 | from lib.common.util import files 10 | from lib.selectors.FourChan.boards import viable_boards 11 | 12 | TMP = Path("/tmp") 13 | 14 | 15 | class FourChan(Selector): 16 | """A selector that leverages the native 4chan API. 
17 | 18 | https://github.com/4chan/4chan-API 19 | """ 20 | 21 | def index(self, config): 22 | results = [] 23 | board = config["board"] 24 | if board not in viable_boards: 25 | self.error_logger("Your chosen board does not exist on 4chan!") 26 | quit() 27 | # Create a HTML parser for parsing comments 28 | h = html2text.HTML2Text() 29 | h.ignore_links = False 30 | 31 | req = f"https://a.4cdn.org/{board}/threads.json" 32 | 33 | content = json.loads(requests.get(req).content) 34 | for page_index, page in enumerate(content): 35 | self.logger(f"Scraping page number: {page_index+1}") 36 | for thread_index, threads in enumerate(page["threads"]): 37 | self.logger(f"Extracting posts from thread number: {thread_index+1}") 38 | thread_id = threads["no"] 39 | req = f"https://a.4cdn.org/{board}/thread/{thread_id}.json" 40 | thread_content = json.loads(requests.get(req).content)[ 41 | "posts" 42 | ] # thread content is a list of posts 43 | for post_index, post in enumerate(thread_content): 44 | self.logger( 45 | f"Extracting media and comments from post number: {post_index+1}" 46 | ) 47 | post_row = [] 48 | post_row.append(post["no"]) 49 | post_row.append(thread_id) 50 | post_row.append(post["time"]) 51 | 52 | try: 53 | comment = post["com"] 54 | except KeyError: 55 | comment = "..." 56 | else: 57 | comment = h.handle(comment) 58 | post_row.append(comment) 59 | 60 | # Filename 61 | try: 62 | filename = post["filename"] 63 | except KeyError: 64 | filename = "" 65 | 66 | if filename != "": 67 | time_id = post["tim"] 68 | extension = post["ext"] 69 | full_file = f"{filename}{extension}" 70 | file_url = f"https://i.4cdn.org/{board}/{time_id}{extension}" 71 | post_row.append(full_file) 72 | post_row.append(extension) 73 | post_row.append(file_url) 74 | elif filename == "": 75 | post_row.append("") 76 | post_row.append("") 77 | post_row.append("") 78 | results.append(post_row) 79 | self.logger("Scraping metadata complete") 80 | results.insert( 81 | 0, ["id", "thread_id", "datetime", "comment", "filename", "ext", "url"] 82 | ) 83 | return LocalElementsIndex(results) 84 | 85 | def retrieve_element(self, element, _): 86 | base = TMP / element.id 87 | base.mkdir(parents=True, exist_ok=True) 88 | 89 | fn = element.filename 90 | identifier = element.id 91 | comment = element.comment 92 | url = element.url 93 | 94 | with open(base / f"{identifier}_comment.txt", "w+") as f: 95 | f.write(comment) 96 | 97 | if url != "": 98 | urlretrieve(url, base / fn) 99 | 100 | return Etype.cast(element.id, files(base)) 101 | 102 | 103 | module = FourChan 104 | -------------------------------------------------------------------------------- /src/lib/selectors/FourChan/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Allows you to scrape text and media from 4chan 2 | args: 3 | - name: board 4 | desc: Numeric identifier for a specific board to scrape. If not specified all boards are scraped. 
5 | required: true 6 | input: string 7 | -------------------------------------------------------------------------------- /src/lib/selectors/FourChan/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | html2text -------------------------------------------------------------------------------- /src/lib/selectors/Local/core.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from shutil import copyfile 4 | from lib.common.selector import Selector 5 | from lib.common.etypes import Etype, Index 6 | from lib.common.exceptions import SelectorIndexError 7 | 8 | 9 | BASE = Path("/mtriage") 10 | 11 | 12 | class Local(Selector): 13 | """A simple selector for importing local files into mtriage. 14 | 15 | It recursively finds every file in a source_folder specified in the config 16 | (see example script 4.select_local.sh) and imports each file into its own 17 | element. The element ID is the file's name concatenated with its extension. 18 | 19 | n.b. the directory being imported must be located within the mtriage 20 | directory on the mtriage host to be accessible inside the docker container 21 | (the media folder is recommended). 22 | """ 23 | 24 | out_etype = Etype.Any 25 | 26 | def __init__(self, *args): 27 | super().__init__(*args) 28 | 29 | def is_aggregate(self): 30 | return "aggregate" in self.config and self.config["aggregate"] 31 | 32 | def index(self, config): 33 | src = Path(config["source"]) 34 | abs_src = BASE / src 35 | if not os.path.exists(abs_src): 36 | raise SelectorIndexError( 37 | f"The 'source' folder {src} could not be found. Ensure it is in the same directory asmtriage." 38 | ) 39 | return self._index(abs_src) 40 | 41 | def _index(self, abs_src): 42 | self.logger("Indexing local folder...") 43 | results = [["id", "path"]] 44 | excluded = self.config.get("exclude", []) 45 | for root, _, files in os.walk(abs_src): 46 | main = Path(abs_src) 47 | root = Path(root) 48 | for file in files: 49 | if file == ".mtbatch" or file in excluded: 50 | continue 51 | fp = root / file 52 | elid = root.name if (root.name != main.name) else fp.stem 53 | results.append([elid, fp]) 54 | self.logger(f"indexed file {fp} as: {elid}") 55 | if self.is_aggregate(): 56 | # `self.results` used in `retrieve_element` for paths. 57 | self.results = results[1:] 58 | # NB: hacky way to just make `retrieve_element` run just once.: 59 | return Index([["id"], ["IS_AGGREGATE"]]) 60 | return Index(results) 61 | 62 | def retrieve_element(self, element, config): 63 | if self.is_aggregate(): 64 | og_folder = Path(config["source"]) 65 | return Etype.Any(og_folder.name, paths=[x[1] for x in self.results]) 66 | else: 67 | return Etype.Any(element.id, paths=[element.path]) 68 | 69 | 70 | module = Local 71 | -------------------------------------------------------------------------------- /src/lib/selectors/Local/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Selects media from a path that already exists on the local filesystem. 2 | args: 3 | - name: source 4 | desc: The path to the source folder that represents the media space. Ensure that the path exists not only on the local filesystem, but also in the subsection that is mounted to Docker. The easiest way to ensure this is the case is to ensure that the 'source' is a subdirectory of one of the gitignored directories in mtriage, i.e. 'data'. 
5 | required: true 6 | input: folder 7 | - name: aggregate 8 | desc: Put all inside one element. Otherwise will create one element per separate file. 9 | required: false 10 | input: bool 11 | - name: exclude 12 | desc: files to exclude 13 | required: false 14 | input: list 15 | -------------------------------------------------------------------------------- /src/lib/selectors/Twitter/core.py: -------------------------------------------------------------------------------- 1 | import twint 2 | import json 3 | from urllib.request import urlretrieve 4 | from pathlib import Path 5 | from lib.common.selector import Selector 6 | from lib.common.etypes import Etype, LocalElementsIndex 7 | from lib.common.util import files 8 | from lib.util.twint import to_serializable 9 | 10 | TMP = Path("/tmp") 11 | 12 | 13 | class Twitter(Selector): 14 | """A selector for scraping tweets. 15 | 16 | It leverages 'twint' - https://github.com/twintproject/twint - under 17 | the hood. 18 | """ 19 | 20 | out_etype = Etype.Json 21 | 22 | def index(self, config): 23 | c = twint.Config() 24 | c.Search = config["search_term"] 25 | c.Since = config["uploaded_after"] 26 | c.Until = config["uploaded_before"] 27 | c.Show_hashtags = True 28 | c.Store_object = True 29 | 30 | twint.run.Search(c) 31 | 32 | tweets = to_serializable(twint.output.tweets_list, as_list=True) 33 | return LocalElementsIndex(tweets) 34 | 35 | def retrieve_element(self, element, _): 36 | base = TMP / element.id 37 | base.mkdir(parents=True, exist_ok=True) 38 | with open(base / "tweet.json", "w+") as fp: 39 | json.dump(element.__dict__, fp) 40 | 41 | # retrieve photos 42 | if "download_photos" in self.config and self.config.download_photos: 43 | photos = element.photos.split(",") 44 | if len(photos) < 1 or photos[0] == "": 45 | self.logger(f"{element.id} downloaded.") 46 | return Etype.cast(element.id, files(base)) 47 | 48 | for url in photos: 49 | fname = url.rsplit("/", 1)[-1] 50 | urlretrieve(url, base / fname) 51 | 52 | self.logger(f"{element.id} downloaded (with images).") 53 | 54 | if "download_videos" in self.config and self.config.download_videos: 55 | if hasattr(element, "video") and element.video != "": 56 | fname = element.video.rsplit("/", 1)[-1] 57 | urlretrieve(element.video, base / fname) 58 | 59 | self.disk.delete_local_on_write = True 60 | return Etype.cast(element.id, files(base)) 61 | 62 | 63 | module = Twitter 64 | -------------------------------------------------------------------------------- /src/lib/selectors/Twitter/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Search and download for Twitter using https://github.com/twintproject/twint. Only a proxy to scraping via time-constrained keyword search is implemented at present. 2 | args: 3 | - name: search_term 4 | desc: Searches for the term in the entire tweet. 5 | required: true 6 | input: string 7 | - name: uploaded_before 8 | desc: Only return tweets before this date. 9 | required: true 10 | input: date 11 | - name: uploaded_after 12 | desc: Only return tweets after this date. 13 | required: true 14 | input: date 15 | - name: download_photos 16 | required: false 17 | desc: set to True if the selector should download photos in tweets. False by default. 18 | input: boolean 19 | - name: download_videos 20 | required: false 21 | desc: set to True if the selector should download videos in tweets. False by default. 
22 | input: boolean 23 | 24 | -------------------------------------------------------------------------------- /src/lib/selectors/Twitter/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN pip install -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint 2 | RUN cd /mtriage/src/twint && python setup.py install 3 | -------------------------------------------------------------------------------- /src/lib/selectors/Youtube/core.py: -------------------------------------------------------------------------------- 1 | import yt_dlp 2 | import json 3 | import re 4 | import argparse, os, sys 5 | import math 6 | from subprocess import call, STDOUT 7 | from pathlib import Path 8 | from lib.common.selector import Selector 9 | from lib.common.etypes import Etype, Union, LocalElementsIndex 10 | from lib.common.util import files 11 | from lib.common.exceptions import ElementShouldSkipError 12 | 13 | from datetime import datetime, timedelta 14 | 15 | import googleapiclient.discovery 16 | from googleapiclient.errors import HttpError 17 | 18 | YOUTUBE_API_SERVICE_NAME = "youtube" 19 | YOUTUBE_API_VERSION = "v3" 20 | API_KEY = os.environ.get("GOOGLE_API_KEY") 21 | TMP = Path("/tmp") 22 | 23 | 24 | class Youtube(Selector): 25 | out_etype = Union(Etype.Json, Etype.Video) 26 | 27 | def index(self, _) -> LocalElementsIndex: 28 | results = self._run() 29 | if len(results) > 0: 30 | out = [] 31 | out.append(list(results[0].keys())) 32 | out.extend([x.values() for x in results]) 33 | return LocalElementsIndex(out) 34 | return None 35 | 36 | def pre_retrieve(self, _): 37 | self.ydl = yt_dlp.YoutubeDL( 38 | { 39 | "outtmpl": f"{TMP}/%(id)s/%(id)s.mp4", 40 | "format": "worstvideo[ext=mp4]", 41 | } 42 | ) 43 | 44 | def retrieve_element(self, element, _): 45 | with self.ydl: 46 | try: 47 | result = self.ydl.extract_info(element.url) 48 | meta = TMP / element.id / "meta.json" 49 | with open(meta, "w+") as fp: 50 | json.dump(result, fp) 51 | self.logger(f"{element.id}: video and meta downloaded successfully.") 52 | self.disk.delete_local_on_write = True 53 | return Etype.cast(element.id, files(TMP / element.id)) 54 | except yt_dlp.utils.DownloadError: 55 | raise ElementShouldSkipError( 56 | f"Something went wrong downloading {element.id}. It may have been deleted." 
57 | ) 58 | 59 | def _run(self): 60 | self.logger(f"Query: {self.config['search_term']}") 61 | if "uploaded_after" in self.config: 62 | self.logger(f"Start: {self.config['uploaded_after']}") 63 | 64 | if "uploaded_before" in self.config: 65 | self.logger(f"End: {self.config['uploaded_before']}") 66 | 67 | if self.config.get("daily"): 68 | results = [] 69 | self.logger( 70 | f"Scraping daily, from {self.config['uploaded_after']} -- {self.config['uploaded_before']}" 71 | ) 72 | self.logger("-----------------") 73 | for after, before in self._days_between( 74 | self.config["uploaded_after"], self.config["uploaded_before"] 75 | ): 76 | results = results + self.get_results(before, after) 77 | 78 | else: 79 | results = self.get_results( 80 | self.config.get("uploaded_before"), self.config.get("uploaded_after") 81 | ) 82 | 83 | self.logger("\n\n----------------") 84 | self.logger(f"Scrape successful, {len(results) - 1} results.") 85 | 86 | return results 87 | 88 | def get_results(self, before, after): 89 | args_obj = {"q": self.config["search_term"]} 90 | 91 | if before is not None: 92 | args_obj["before"] = self.config["uploaded_before"] 93 | if "uploaded_after" in self.config.keys(): 94 | args_obj["after"] = self.config["uploaded_after"] 95 | 96 | new_results = self._youtube_search_all_pages(args_obj) 97 | if new_results is None: 98 | raise Exception("Something went wrong") 99 | return new_results 100 | 101 | def _add_to_csv_obj(self, csv_obj, s_res): 102 | for search_result in s_res: 103 | videoId = search_result["id"]["videoId"] 104 | title = search_result["snippet"]["title"] 105 | channelId = search_result["snippet"]["channelId"] 106 | desc = search_result["snippet"]["description"] 107 | publishedAt = search_result["snippet"]["publishedAt"] 108 | url = f"https://www.youtube.com/watch?v={videoId}" 109 | id = self._id_from_url(url) 110 | csv_obj.append( 111 | { 112 | "url": url, 113 | "title": title.replace(",", ";"), 114 | "desc": desc.replace(",", ";"), 115 | "published": publishedAt[0:10], 116 | "id": id, 117 | } 118 | ) 119 | return csv_obj 120 | 121 | def _youtube_search_all_pages(self, args): 122 | csv_obj = [] 123 | self.logger( 124 | f"Search terms: {args['q']}\n Start: {args['after'] if 'after' in args else ''}\n End: {args['before'] if 'before' in args else ''}" 125 | ) 126 | try: 127 | s_res = self._youtube_search(args) 128 | count = 1 129 | while True: 130 | self.logger(f"\tScraping page {count}...") 131 | count += 1 132 | csv_obj = self._add_to_csv_obj(csv_obj, s_res.get("items", [])) 133 | 134 | if (not "nextPageToken" in s_res) or (len(s_res.get("items", [])) == 0): 135 | break 136 | 137 | s_res = self._youtube_search(args, pageToken=s_res["nextPageToken"]) 138 | self.logger("\tAll pages scraped.") 139 | return csv_obj 140 | except HttpError as e: 141 | self.logger(f"An HTTP error {e.resp.status} occured.") 142 | print(e.content) 143 | return None 144 | 145 | def _youtube_search(self, options, pageToken=None): 146 | # modified from https://github.com/youtube/api-samples/blob/master/python/search.py 147 | if API_KEY is None: 148 | raise ElementShouldSkipError("No GOOGLE_API_KEY specified in .env") 149 | youtube = googleapiclient.discovery.build( 150 | YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY 151 | ) 152 | 153 | theargs = { 154 | "pageToken": pageToken, 155 | "q": options["q"], 156 | "part": "id,snippet", 157 | "maxResults": 50, 158 | "safeSearch": "none", 159 | "type": "video", 160 | } 161 | 162 | if "before" in options: 163 | theargs["publishedBefore"] 
= options["before"] 164 | if "after" in options: 165 | theargs["publishedAfter"] = options["after"] 166 | 167 | request = youtube.search().list(**theargs) 168 | 169 | s = request.execute() 170 | 171 | return s 172 | 173 | def _days_between(self, start, end): 174 | bef = datetime.strptime(end[:-1], "%Y-%m-%dT%H:%M:%S") 175 | aft = datetime.strptime(start[:-1], "%Y-%m-%dT%H:%M:%S") 176 | between = (bef - aft).days 177 | return [ 178 | ( 179 | ((aft + timedelta(days=dt)).strftime("%Y-%m-%dT") + "00:00:00Z"), 180 | ((aft + timedelta(days=dt)).strftime("%Y-%m-%dT") + "23:59:59Z"), 181 | ) 182 | for dt in range(between) 183 | ] 184 | 185 | def _id_from_url(self, url): 186 | id_search = re.search( 187 | "https:\/\/www\.youtube\.com\/watch\?v\=(.*)", url, re.IGNORECASE 188 | ) 189 | if id: 190 | return id_search.group(1) 191 | return None 192 | 193 | 194 | module = Youtube 195 | -------------------------------------------------------------------------------- /src/lib/selectors/Youtube/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Allows you to select a media space via Youtube 2 | args: 3 | - name: search_term 4 | desc: Plain string search query that is submitted to Youtube. 5 | required: true 6 | input: string 7 | - name: uploaded_before 8 | desc: Only return videos uploaded before this date. 9 | required: false 10 | input: date 11 | - name: uploaded_after 12 | desc: Only return videos uploaded after this date. 13 | required: false 14 | input: date 15 | - name: daily 16 | desc: Query the Youtube API N times with the given search terms, where N is the number of days between the 'uploaded_after' and 'uploaded_before' dates. This heuristic returns more results for a given search term, but can fail due to exhausting the API's daily quota. 
17 | required: false 18 | input: bool 19 | -------------------------------------------------------------------------------- /src/lib/selectors/Youtube/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN apt-get install -y --no-install-recommends libsm6 libxrender1 libfontconfig1 2 | 3 | RUN curl -sSL https://sdk.cloud.google.com | bash 4 | ENV PATH="$PATH:/root/google-cloud-sdk/bin" 5 | 6 | -------------------------------------------------------------------------------- /src/lib/selectors/Youtube/requirements.txt: -------------------------------------------------------------------------------- 1 | yt-dlp==2023.3.4 2 | 3 | google-api-core==1.11.0 4 | google-api-python-client==1.7.8 5 | google-auth==1.6.3 6 | google-auth-httplib2==0.0.3 7 | grpcio 8 | -------------------------------------------------------------------------------- /src/lib/util/cvjson.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import operator 4 | import re 5 | from typing import List 6 | from shutil import copyfile, rmtree 7 | from pathlib import Path 8 | from lib.common.etypes import Etype 9 | from functools import reduce 10 | 11 | WK_DIR = Path("/tmp/ranking") 12 | 13 | 14 | def open_json(fp): 15 | try: 16 | with open(fp, "r") as f: 17 | return json.load(f) 18 | except: 19 | return {} 20 | 21 | 22 | def render_frame(element, label, frame, score): 23 | return {"element": element, "frame": frame, "score": score, "label": label} 24 | 25 | 26 | def rank(elements: List, threshold=0.5, logger=print, element_id="__RANKING") -> Etype: 27 | ranking_data = {} 28 | 29 | for element in elements: 30 | jsons = [f for f in element.paths if f.suffix in ".json"] 31 | if len(jsons) != 1: 32 | continue 33 | 34 | jsonp = jsons[0] 35 | with open(jsonp, "r") as jsonf: 36 | data = json.load(jsonf) 37 | 38 | try: 39 | # TODO: this logic should be a custom etype built from a core etype class... 40 | # the core class can then include associated methods. 
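            # Expected shape of `data` here (as written by CvJson.from_preds and
            # the classifier analysers; the label name is illustrative):
            #   {"labels": {"tank": {"frames": [4, 12], "scores": [0.91, 0.66]}}}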
41 | labels = data["labels"] 42 | for label, preds in labels.items(): 43 | frames, scores = preds["frames"], preds["scores"] 44 | valid_frames = [ 45 | idx for idx, _ in enumerate(frames) if scores[idx] > threshold 46 | ] 47 | rank = len(valid_frames) 48 | if rank > 4: 49 | logger(f"label '{label}': rank {rank}") 50 | # gather all ranks in `ranking_data` 51 | if label not in ranking_data: 52 | ranking_data[label] = {} 53 | ranking_data[label][element.id] = rank 54 | 55 | # dpath = WK_DIR / f"{element.id}.json" 56 | logger(f"Rankings indexed for {element.id}.") 57 | 58 | except Exception as e: 59 | logger(f"Could not analyse {element.id}: {e}") 60 | 61 | ranking = {} 62 | for label, values in ranking_data.items(): 63 | s_vals = sorted(values.items(), key=operator.itemgetter(1)) 64 | s_vals.reverse() 65 | s_els = [t[0] for t in s_vals] 66 | ranking[label] = s_els 67 | 68 | file = WK_DIR / "rankings.json" 69 | logger("All rankings aggregated, printed to rankings.json") 70 | 71 | if not os.path.exists(WK_DIR): 72 | os.makedirs(WK_DIR) 73 | 74 | with open(file, "w") as f: 75 | json.dump(ranking, f) 76 | 77 | return Etype.Json(element_id, file) 78 | 79 | 80 | def flatten(elements: List, logger=print) -> Etype: 81 | """ 82 | 'Flatten' all predictions into a list, where each item is a positive frame: 83 | [ 84 | { "element": "xxxx", "frame": 1, "score": 0.2, "label": "tank" }, 85 | ] 86 | """ 87 | is_json = re.compile(r".*\.json") 88 | # NOTE: assumes there is always one .json in each element's `paths` 89 | all_preds = [ 90 | next(filter(is_json.match, [str(x) for x in x.paths])) for x in elements 91 | ] 92 | all_preds = [open_json(x) for x in all_preds] 93 | preds = [ 94 | x.get("labels") 95 | for x in all_preds 96 | if isinstance(x, dict) and x.get("labels") is not None 97 | ] 98 | 99 | vls = [ 100 | [(label, el_preds[label]) for label in el_preds.keys()] for el_preds in preds 101 | ] 102 | vls = [(x[0].id, x[1]) for x in zip(elements, vls)] 103 | label_in_els = [ 104 | (x[0], y[0], y[1]["frames"], y[1]["scores"]) for x in vls for y in x[1] 105 | ] 106 | frames = [ 107 | render_frame(x[0], x[1], y[0], y[1]) 108 | for x in label_in_els 109 | for y in zip(x[2], x[3]) 110 | ] 111 | 112 | output = WK_DIR / "flattened.json" 113 | 114 | if not os.path.exists(WK_DIR): 115 | os.makedirs(WK_DIR) 116 | 117 | with open(output, "w") as f: 118 | json.dump(frames, f) 119 | 120 | logger("All frames aggregated, printed to flattened.json") 121 | return Etype.Json("__FLATTENED", output) 122 | 123 | 124 | def generate_meta(elements: List, logger=print) -> Etype: 125 | """ Combine various metrics inside a single element """ 126 | a = flatten(elements, logger=logger) 127 | b = rank(elements, logger=logger) 128 | 129 | return Etype.Any("__META", a.paths + b.paths) 130 | -------------------------------------------------------------------------------- /src/lib/util/twint.py: -------------------------------------------------------------------------------- 1 | LABELS = [ 2 | "id", 3 | "conversation_id", 4 | "datestamp", 5 | "timestamp", 6 | "timezone", 7 | "user_id", 8 | "username", 9 | "name", 10 | "place", 11 | "tweet", 12 | "mentions", 13 | "urls", 14 | "photos", 15 | "replies_count", 16 | "retweets_count", 17 | "likes_count", 18 | "hashtags", 19 | "cashtags", 20 | "link", 21 | "retweet", 22 | "quote_url", 23 | "video", 24 | "user_rt_id", 25 | "near", 26 | "geo", 27 | "source", 28 | "retweet_date", 29 | ] 30 | 31 | 32 | def pythonize(t): 33 | """ Make valid fields ints, essentially deserialize """ 34 | t["retweet"] = 
True if t["retweet"] == "True" else False 35 | t["likes_count"] = int(t["likes_count"]) 36 | t["replies_count"] = int(t["replies_count"]) 37 | t["retweets_count"] = int(t["retweets_count"]) 38 | t["photos"] = t["photos"].split(",") 39 | t["hashtags"] = t["hashtags"].split(",") 40 | t["urls"] = t["urls"].split(",") 41 | return t 42 | 43 | 44 | def attr_is_list(attr): 45 | return attr.strip() in [ 46 | "photos", 47 | "mentions", 48 | "urls", 49 | "mentions", 50 | "hashtags", 51 | "cashtags", 52 | ] 53 | 54 | 55 | def jsont(t, as_list): 56 | """ return all fields in a JSON-serializable way """ 57 | if not as_list: 58 | return { 59 | l: ",".join(getattr(t, l)) if attr_is_list(l) else getattr(t, l) 60 | for l in LABELS 61 | } 62 | else: 63 | td = t.__dict__ 64 | out = [] 65 | for l in LABELS: 66 | if attr_is_list(l): 67 | out.append(",".join(td[l])) 68 | else: 69 | out.append(td[l]) 70 | return out 71 | 72 | 73 | def to_serializable(tweets, as_list=False): 74 | vls = [jsont(t, as_list) for t in tweets] 75 | if as_list: 76 | vls.insert(0, LABELS) 77 | return vls 78 | -------------------------------------------------------------------------------- /src/run.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """The entry point for mtriage. 3 | 4 | Orchestrates selectors and analysers via CLI parameters. 5 | 6 | Modules: 7 | Each module corresponds to a web platform API, or some equivalent method 8 | of programmatic retrieval. 9 | 10 | TODO: document where to find selector and analyser design docs. 11 | Attributes: 12 | module (str): Indicates the platform or source from which media should be 13 | analysed. The code that implements is module is self-contained to a 14 | folder here in the 'select' folder. 15 | config (dict of str: str): Hyperparameters that refine the analyse space. 16 | These parameters are module-specific (although the aim is to create as 17 | consistent as possible a parameter language across modules). 18 | folder (str): The path to the directory where the data that is indexed 19 | during the SELECT pass will be saved. This directory serves as a kind of 20 | "working directory" during the SAMPLE and ANALYSE passes, in the sense 21 | that all generated data is saved in this directory. The directory also 22 | contains logs, and represents the 'saved state' of a media triage 23 | analysis. 
24 | 25 | """ 26 | import os 27 | import yaml 28 | from validate import validate_yaml 29 | from lib.common.get import get_module 30 | from lib.common.storage import LocalStorage 31 | 32 | CONFIG_PATH = "/run_args.yaml" 33 | 34 | 35 | def make_storage(cfg: dict) -> LocalStorage: 36 | # TODO: generalise `folder` here to a `storage` var that is passed from YAML 37 | return LocalStorage(folder=cfg["folder"]) 38 | 39 | 40 | def _run_analyser(ana: dict, base_cfg: dict, cfg: dict): 41 | # run a single analyser 42 | Analyser = get_module("analyse", ana["name"]) 43 | analyser = Analyser( 44 | {**ana["config"], **base_cfg} if "config" in ana.keys() else base_cfg, 45 | ana["name"], 46 | make_storage(cfg), 47 | ) 48 | analyser.start_analysing() 49 | 50 | 51 | def _run_yaml(): 52 | with open(CONFIG_PATH, "r") as c: 53 | cfg = yaml.safe_load(c) 54 | 55 | validate_yaml(cfg) 56 | 57 | base_cfg = {} 58 | if "select" not in cfg and "elements_in" in cfg: 59 | base_cfg["elements_in"] = cfg["elements_in"] 60 | sel = None 61 | else: 62 | # run select 63 | sel = cfg["select"] 64 | Selector = get_module("select", sel["name"]) 65 | selector = Selector( 66 | sel["config"] if "config" in sel.keys() else {}, 67 | sel["name"], 68 | make_storage(cfg), 69 | ) 70 | selector.start_indexing() 71 | selector.start_retrieving() 72 | base_cfg["elements_in"] = [sel["name"]] 73 | 74 | if "analyse" not in cfg: 75 | return 76 | 77 | analyse_phase = cfg["analyse"] 78 | 79 | if isinstance(analyse_phase, dict): 80 | _run_analyser(analyse_phase, base_cfg, cfg) 81 | 82 | else: 83 | for ana in analyse_phase: 84 | _run_analyser(ana, base_cfg, cfg) 85 | if sel is None: 86 | # take the selector from elements in 87 | fst = cfg["elements_in"][0] 88 | sel = {"name": fst.split("/")[0]} 89 | base_cfg["elements_in"] = [f"{sel['name']}/{ana['name']}"] 90 | 91 | 92 | if __name__ == "__main__": 93 | _run_yaml() 94 | -------------------------------------------------------------------------------- /src/test/README.md: -------------------------------------------------------------------------------- 1 | # src tests 2 | 3 | In pytest. 4 | 5 | Note that all tests are run from the 'src' directory. 
Relative import paths should be specified accordingly: 6 | 7 | ```python 8 | from lib.common.analyser import Analyser 9 | ``` 10 | 11 | -------------------------------------------------------------------------------- /src/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/forensic-architecture/mtriage/7a841241518f831766767d6ddaa6320b8de4be98/src/test/__init__.py -------------------------------------------------------------------------------- /src/test/etype_stubs/image.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/forensic-architecture/mtriage/7a841241518f831766767d6ddaa6320b8de4be98/src/test/etype_stubs/image.jpeg -------------------------------------------------------------------------------- /src/test/test_analyser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import json 4 | from pathlib import Path 5 | from lib.common.analyser import Analyser 6 | from lib.common.exceptions import InvalidAnalyserElements, InvalidCarry 7 | from lib.common.etypes import Etype 8 | from lib.common.mtmodule import MTModule 9 | from lib.common.storage import LocalStorage 10 | 11 | 12 | class EmptyAnalyser(Analyser): 13 | out_etype = Etype.Any 14 | 15 | def analyse_element(self, element, config): 16 | raise Exception("is the user-defined func!") 17 | 18 | 19 | class TxtCopyAnalyser(Analyser): 20 | out_etype = Etype.Any 21 | 22 | def analyse_element(self, element, config): 23 | """ just copy over all media in 'any' """ 24 | for f in element.paths: 25 | # only copy over txt files 26 | if f.suffix != ".txt": 27 | return 28 | with open(f, "r") as reader: 29 | contents = reader.readlines() 30 | txt = Path("/tmp/copy.txt") 31 | with open(txt, "w+") as writer: 32 | writer.writelines(contents) 33 | 34 | element.paths = [txt] 35 | return element 36 | 37 | 38 | # TODO: test casting errors via an analyser with explicit etype 39 | @pytest.fixture 40 | def additionals(utils): 41 | obj = lambda: None 42 | obj.maxDiff = None 43 | obj.emptyAnalyserName = "empty" 44 | obj.WHITELIST = ["sel1/an1", "sel1/an2", "sel2"] 45 | obj.sel1 = "sel1" 46 | obj.sel2 = "sel2" 47 | obj.sel1_elements = ["el1", "el2"] 48 | obj.sel2_elements = ["el4", "el5", "el6"] 49 | 50 | utils.scaffold_empty(obj.sel1, elements=obj.sel1_elements, analysers=["an1", "an2"]) 51 | utils.scaffold_empty(obj.sel2, elements=obj.sel2_elements) 52 | os.rmdir(utils.get_element_path(obj.sel1, "el1", analyser="an2")) 53 | 54 | obj.config = {"elements_in": obj.WHITELIST, "dev": True} 55 | obj.emptyAnalyser = EmptyAnalyser( 56 | obj.config, 57 | obj.emptyAnalyserName, 58 | storage=LocalStorage(folder=utils.TEMP_ELEMENT_DIR), 59 | ) 60 | utils.setup() 61 | yield obj 62 | utils.cleanup() 63 | 64 | 65 | def test_selector_imports(): 66 | assert type(Analyser) == type(MTModule) 67 | 68 | 69 | def test_cannot_instantiate(utils): 70 | with pytest.raises(TypeError): 71 | Analyser({}, "empty", utils.TEMP_ELEMENT_DIR) 72 | 73 | 74 | def test_init(additionals): 75 | assert additionals.config == additionals.emptyAnalyser.config 76 | 77 | 78 | def test_analyse(utils, additionals): 79 | config = {"elements_in": ["sel1"]} 80 | dummyName = "dummyAnalyser" 81 | checkUserExceptionAnalyser = EmptyAnalyser( 82 | {**config, "dev": True}, "empty", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 83 | ) 84 | dummyAnalyser = TxtCopyAnalyser( 85 | config, dummyName, 
LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 86 | ) 87 | # TODO: work out whether this test is needed with the new format 88 | # test it calls the user-defined `analyse_element` 89 | # with pytest.raises(Exception, match="is the user-defined func!"): 90 | # checkUserExceptionAnalyser.start_analysing(in_parallel=False) 91 | # try again with a text el mocking selection completed 92 | # TODO: fix these tests- adding casting throws errors in some cases, as well as extra log. 93 | for el in additionals.sel1_elements: 94 | with open( 95 | f"{dummyAnalyser.disk.base_dir}/sel1/{dummyAnalyser.disk.RETRIEVED_EXT}/{el}/anitem.txt", 96 | "w+", 97 | ) as f: 98 | f.write("Hello") 99 | dummyAnalyser.start_analysing() 100 | # confirm txt has carried 101 | for el in additionals.sel1_elements: 102 | with open( 103 | f"{dummyAnalyser.disk.base_dir}/sel1/{dummyAnalyser.disk.ANALYSED_EXT}/{dummyName}/{el}/copy.txt", 104 | "r", 105 | ) as f: 106 | lines = f.readlines() 107 | assert len(lines) == 1 108 | assert lines[0] == "Hello" 109 | -------------------------------------------------------------------------------- /src/test/test_analyser_errors.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from lib.common.analyser import Analyser 4 | from test.test_analyser import EmptyAnalyser 5 | from lib.common.storage import LocalStorage 6 | from lib.common.etypes import Etype, LocalElement 7 | from lib.common.exceptions import ( 8 | ElementShouldRetryError, 9 | ElementShouldSkipError, 10 | InvalidAnalyserConfigError, 11 | MTriageStorageCorruptedError, 12 | InvalidAnalyserElements, 13 | ) 14 | 15 | 16 | class ErrorThrowingAnalyser(Analyser): 17 | out_etype = Etype.Any 18 | 19 | def __init__(self, *args): 20 | super().__init__(*args) 21 | self.retryCount = 0 22 | 23 | def analyse_element(self, element, config): 24 | if element.id == "skip": 25 | raise ElementShouldSkipError("test") 26 | elif element.id == "retry3" and self.retryCount < 3: 27 | self.retryCount += 1 28 | raise ElementShouldRetryError("test") 29 | elif element.id == "retryN": 30 | raise ElementShouldRetryError("test") 31 | else: 32 | pass 33 | 34 | 35 | @pytest.fixture 36 | def additionals(utils): 37 | obj = lambda: None 38 | obj.selname = "stub_sel" 39 | elements = ["skip", "retry3", "retryN", "pass"] 40 | utils.scaffold_empty(obj.selname, elements=elements) 41 | for element in elements: 42 | with open(f"{utils.get_element_path(obj.selname, element)}/out.txt", "w") as f: 43 | f.write("something") 44 | 45 | goodConfig = {"elements_in": [obj.selname], "dev": True} 46 | 47 | obj.an = ErrorThrowingAnalyser( 48 | goodConfig, "analyserErrorSelector", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 49 | ) 50 | yield obj 51 | utils.cleanup() 52 | 53 | 54 | def test_analyse_skip_error(additionals): 55 | with pytest.raises(ElementShouldSkipError, match="test - skipping element"): 56 | additionals.an.analyse_element(LocalElement(id="skip"), {}) 57 | 58 | 59 | def test_analyse_retry_error(additionals): 60 | with pytest.raises(ElementShouldRetryError, match="test - attempt retry"): 61 | additionals.an.analyse_element(LocalElement(id="retryN"), {}) 62 | 63 | 64 | def test_bad_init_error(utils): 65 | bad0 = {} 66 | bad1 = {"elements_in": []} 67 | bad2 = {"elements_in": None} 68 | good = {"elements_in": ["selname"]} 69 | 70 | with pytest.raises( 71 | InvalidAnalyserConfigError, 72 | match="must contain an 'elements_in' indicating the analyser's input", 73 | ): 74 | no_elements_in = ErrorThrowingAnalyser( 
75 | bad0, "stub", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 76 | ) 77 | 78 | with pytest.raises( 79 | InvalidAnalyserConfigError, 80 | match="The 'elements_in' must be a list containing at least one string", 81 | ): 82 | empty_elements_in = ErrorThrowingAnalyser( 83 | bad1, "stub", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 84 | ) 85 | 86 | with pytest.raises( 87 | InvalidAnalyserConfigError, 88 | match="The 'elements_in' must be a list containing at least one string", 89 | ): 90 | empty_elements_in = ErrorThrowingAnalyser( 91 | bad2, "stub", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 92 | ) 93 | 94 | with pytest.raises( 95 | InvalidAnalyserConfigError, match="You must provide a name for your analyser" 96 | ): 97 | badan2 = ErrorThrowingAnalyser( 98 | good, "", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 99 | ) 100 | 101 | 102 | def test_integration(utils, additionals): 103 | assert additionals.an.retryCount == 0 104 | 105 | additionals.an.start_analysing() 106 | 107 | skip_path = utils.get_element_path( 108 | additionals.selname, "skip", analyser=additionals.an.name 109 | ) 110 | assert not os.path.exists(skip_path) 111 | 112 | retryn_path = utils.get_element_path( 113 | additionals.selname, "retryN", analyser=additionals.an.name 114 | ) 115 | assert not os.path.exists(retryn_path) 116 | 117 | retry3_path = utils.get_element_path( 118 | additionals.selname, "retry3", analyser=additionals.an.name 119 | ) 120 | assert additionals.an.retryCount == 3 121 | 122 | 123 | def test_bad_whitelist(utils): 124 | badConfig = {"elements_in": ["sel1/an1/el1"]} 125 | badAn = EmptyAnalyser( 126 | badConfig, "whitelistErrorAnalyser", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 127 | ) 128 | with pytest.raises( 129 | InvalidAnalyserElements, match="'elements_in' you specified does not exist" 130 | ): 131 | badAn.start_analysing() 132 | -------------------------------------------------------------------------------- /src/test/test_etypes.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from types import SimpleNamespace as Ns 3 | from pathlib import Path 4 | from lib.common.etypes import Etype, Union, Array, all_etypes, cast 5 | from lib.etypes.cvjson import etype as CvJson 6 | from lib.common.exceptions import EtypeCastError 7 | from test import utils 8 | 9 | 10 | def write_stub(f): 11 | with open(f, "w+") as f: 12 | f.write("stub") 13 | 14 | 15 | @pytest.fixture 16 | def base(): 17 | obj = Ns() 18 | obj.id = "xasd123" 19 | obj.txt1 = Path("/tmp/1.txt") 20 | obj.scoresjson1 = Path("/tmp/scores.json") 21 | obj.json2 = Path("/tmp/not_scores.json") 22 | obj.md1 = Path("/tmp/1.md") 23 | obj.im1 = Path("/tmp/1.png") 24 | obj.im2 = Path("/tmp/2.jpg") 25 | obj.im3 = Path("/tmp/3.bmp") 26 | obj.aud1 = Path("/tmp/1.mp3") 27 | write_stub(obj.txt1) 28 | write_stub(obj.md1) 29 | write_stub(obj.im1) 30 | write_stub(obj.im2) 31 | write_stub(obj.im3) 32 | write_stub(obj.aud1) 33 | write_stub(obj.scoresjson1) 34 | write_stub(obj.json2) 35 | yield obj 36 | utils.cleanup() 37 | 38 | 39 | def test_etype_construction(base): 40 | # shouldn't be okay with empty 41 | for t in all_etypes(): 42 | with pytest.raises(EtypeCastError): 43 | assert t(base.id, []) 44 | 45 | 46 | def test_Any(base): 47 | e = Etype.Any(base.id, [base.txt1]) 48 | assert len(e.paths) == 1 49 | e = Etype.Any(base.id, [base.txt1, base.md1, base.im3]) 50 | assert len(e.paths) == 3 51 | 52 | 53 | def test_Image(base): 54 | # shouldn't accept one txt 55 | with pytest.raises(EtypeCastError): 56 | 
Etype.Image(base.id, ["/tmp/notafile.txt"]) 57 | 58 | # shouldn't accept an image that doesn't exist 59 | with pytest.raises(EtypeCastError): 60 | Etype.Image(base.id, ["/tmp/nonexistent_image.png"]) 61 | 62 | # shouldn't be okay with 2 valid images 63 | with pytest.raises(EtypeCastError): 64 | Etype.Image(base.id, [base.im1, base.im2]) 65 | 66 | # works with either single path or list 67 | im1 = Etype.Image(base.id, base.im1) 68 | assert len(im1.paths) == 1 69 | im1 = Etype.Image(base.id, [base.im1]) 70 | assert len(im1.paths) == 1 71 | im2 = Etype.Image(base.id, base.im2) 72 | assert len(im1.paths) == 1 73 | 74 | # filters out invalid files 75 | im1_filtered = Etype.Image(base.id, [base.im1, base.txt1]) 76 | assert len(im1.paths) == 1 77 | assert im1.paths[0] == base.im1 78 | 79 | 80 | def test_Array(base): 81 | ImArr = Array(Etype.Image) 82 | with pytest.raises(EtypeCastError): 83 | ImArr(base.id, []) 84 | 85 | with pytest.raises(EtypeCastError): 86 | ImArr(base.id, base.txt1) 87 | 88 | has1 = ImArr(base.id, base.im1) 89 | assert len(has1.paths) == 1 90 | has3 = ImArr(base.id, [base.im1, base.im2, base.im3]) 91 | assert len(has3.paths) == 3 92 | has2 = ImArr(base.id, [base.im1, base.md1, base.txt1, base.im3]) 93 | assert len(has2.paths) == 2 94 | 95 | 96 | def test_Union(base): 97 | ImAud = Union(Etype.Image, Etype.Audio) 98 | with pytest.raises(EtypeCastError): 99 | ImAud(base.id, []) 100 | with pytest.raises(EtypeCastError): 101 | ImAud(base.id, base.txt1) 102 | with pytest.raises(EtypeCastError): 103 | ImAud(base.id, base.im1) 104 | with pytest.raises(EtypeCastError): 105 | ImAud(base.id, base.aud1) 106 | 107 | has2 = ImAud(base.id, [base.aud1, base.im1]) 108 | assert len(has2.paths) == 2 109 | f2 = ImAud(base.id, [base.im3, base.md1, base.aud1]) 110 | assert len(f2.paths) == 2 111 | assert base.im3 in f2.paths 112 | assert base.aud1 in f2.paths 113 | 114 | 115 | def test_cast(base): 116 | # explicit cast 117 | with pytest.raises(EtypeCastError): 118 | cast(base.id, [], Etype.Image) 119 | with pytest.raises(EtypeCastError): 120 | cast(base.id, [base.txt1], Etype.Image) 121 | 122 | t1 = cast(base.id, [base.im1], to=Etype.Image) 123 | assert len(t1.paths) == 1 124 | assert t1.et == Etype.Image 125 | 126 | # implicit cast 127 | with pytest.raises(EtypeCastError): 128 | cast(base.id, []) 129 | 130 | i1 = cast(base.id, [base.im1]) 131 | assert len(i1.paths) == 1 132 | assert i1.et == Etype.Image 133 | i2 = cast(base.id, [base.im2]) 134 | assert len(i2.paths) == 1 135 | assert i2.et == Etype.Image 136 | 137 | ia1 = cast(base.id, [base.im1, base.im2]) 138 | assert len(ia1.paths) == 2 139 | assert ia1.et == Array(Etype.Image) 140 | 141 | a1 = cast(base.id, base.aud1) 142 | assert len(a1.paths) == 1 143 | assert a1.et == Etype.Audio 144 | 145 | # unions 146 | 147 | ai1 = cast(base.id, [base.im3, base.aud1]) 148 | assert len(ai1.paths) == 2 149 | assert ai1.et == Union(Etype.Image, Etype.Audio) 150 | 151 | ai2 = cast(base.id, [base.aud1, base.im2]) 152 | assert len(ai1.paths) == 2 153 | assert ai1.et == Union(Etype.Image, Etype.Audio) 154 | 155 | iaa1 = cast(base.id, [base.im1, base.im2, base.aud1]) 156 | assert len(iaa1.paths) == 3 157 | assert iaa1.et == Union(Array(Etype.Image), Etype.Audio) 158 | 159 | any1 = cast(base.id, [base.im1, base.im2, base.aud1, base.txt1]) 160 | assert len(any1.paths) == 4 161 | assert any1.et == Etype.Any 162 | 163 | 164 | def test_custom_etypes(base): 165 | all_ets = all_etypes() 166 | cvjson_et = CvJson(CvJson.__name__, CvJson.filter) 167 | assert cvjson_et in 
all_ets 168 | 169 | cvj1 = cvjson_et(base.id, [base.im1, base.im2, base.scoresjson1]) 170 | 171 | assert len(cvj1.paths) == 3 172 | assert cvj1.et == cvjson_et 173 | 174 | with pytest.raises(EtypeCastError): 175 | cvjson_et(base.id, [base.im1, base.im2]) 176 | # throws error when json is not named 'scores.json' (specified in 177 | # CvJson.filter). 178 | with pytest.raises(EtypeCastError): 179 | cvjson_et(base.id, [base.im1, base.json2]) 180 | -------------------------------------------------------------------------------- /src/test/test_get.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import shutil 3 | from os import listdir, makedirs 4 | from os.path import isdir 5 | from lib.common.get import get_module 6 | 7 | 8 | def make_empty_main_export(pth): 9 | INIT = "module = 1" 10 | with open(f"{pth}/core.py", "w") as f: 11 | f.write(INIT) 12 | 13 | 14 | @pytest.fixture 15 | def additionals(): 16 | obj = lambda: None 17 | """ Make imaginary selector and analysers """ 18 | # tests always run from src 19 | obj.EMPTY_SELECTOR = "./lib/selectors/empty" 20 | obj.EMPTY_ANALYSER = "./lib/analysers/empty" 21 | 22 | if isdir(obj.EMPTY_SELECTOR): 23 | shutil.rmtree(obj.EMPTY_SELECTOR) 24 | if isdir(obj.EMPTY_ANALYSER): 25 | shutil.rmtree(obj.EMPTY_ANALYSER) 26 | 27 | makedirs(obj.EMPTY_SELECTOR) 28 | make_empty_main_export(obj.EMPTY_SELECTOR) 29 | makedirs(obj.EMPTY_ANALYSER) 30 | make_empty_main_export(obj.EMPTY_ANALYSER) 31 | yield obj 32 | if isdir(obj.EMPTY_SELECTOR): 33 | shutil.rmtree(obj.EMPTY_SELECTOR) 34 | if isdir(obj.EMPTY_ANALYSER): 35 | shutil.rmtree(obj.EMPTY_ANALYSER) 36 | 37 | 38 | # NOTE: additionals added as arg to ensure fixture setup is run 39 | def test_raises_when_faulty(additionals): 40 | with pytest.raises(ModuleNotFoundError): 41 | get_module("select", "smth") 42 | 43 | with pytest.raises(ModuleNotFoundError): 44 | get_module("analyse", "smth") 45 | 46 | with pytest.raises(ImportError, match="must be either 'select' or 'analyse'"): 47 | get_module("neitherthing", "smth") 48 | 49 | 50 | def test_imports_main(additionals): 51 | # main just exported as 'True', to check import logic is correct 52 | assert get_module("select", "empty") 53 | assert get_module("analyse", "empty") 54 | -------------------------------------------------------------------------------- /src/test/test_infoyamls.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import yaml 3 | from os import listdir 4 | 5 | 6 | def is_valid_arg(arg): 7 | if "name" not in arg or not isinstance(arg["name"], str): 8 | return False 9 | if "required" not in arg or not isinstance(arg["required"], bool): 10 | return False 11 | # NOTE: not checking for 'input' or 'desc' attrs, considering them optional at this time. 
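    # As an illustrative sketch only (not taken from any real component), an
    # 'args' entry that satisfies this check could look like the following in
    # a component's info.yaml:
    #
    #   args:
    #     - name: search_term
    #       required: true
    #       desc: what the argument is for      # currently optional
    #       input: string                       # currently optional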
12 | return True 13 | 14 | 15 | @pytest.fixture 16 | def additionals(): 17 | obj = lambda: None 18 | obj.ALL_ANALYSERS = [x for x in listdir("lib/analysers") if x != "__deprecated"] 19 | obj.ALL_SELECTORS = [x for x in listdir("lib/selectors") if x != "__deprecated"] 20 | return obj 21 | 22 | 23 | def test_selectors(additionals, utils): 24 | # selector infos 25 | for sel in additionals.ALL_SELECTORS: 26 | with open(utils.get_info_path("selector", sel)) as f: 27 | info = yaml.safe_load(f) 28 | assert "desc" in info 29 | assert "args" in info 30 | assert isinstance(info["args"], list) 31 | for arg in info["args"]: 32 | assert is_valid_arg(arg) 33 | 34 | # analyser infos 35 | for ana in additionals.ALL_ANALYSERS: 36 | with open(utils.get_info_path("analyser", ana)) as f: 37 | info = yaml.safe_load(f) 38 | assert "desc" in info 39 | assert "args" in info 40 | assert isinstance(info["args"], list) 41 | for arg in info["args"]: 42 | assert is_valid_arg(arg) 43 | -------------------------------------------------------------------------------- /src/test/test_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_demo(): 5 | # TODO: test using the `local` selector, followed by simple analysers. 6 | pass 7 | -------------------------------------------------------------------------------- /src/test/test_localstorage.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | from pathlib import Path 4 | from lib.common.storage import LocalStorage 5 | 6 | 7 | @pytest.fixture 8 | def basic(utils): 9 | global base 10 | base = utils.TEMP_ELEMENT_DIR 11 | 12 | utils.scaffold_empty("Youtube", elements=["el1"], analysers=["Me"]) 13 | utils.setup() 14 | yield LocalStorage(folder=base) 15 | utils.cleanup() 16 | 17 | 18 | def test_core(basic): 19 | assert basic.base_dir == Path(base) 20 | 21 | 22 | def test_read_query(utils, basic): 23 | assert isinstance(basic.read_query("Youtube"), Path) 24 | assert basic.read_query("Youtube") == Path(f"{base}/Youtube/{basic.RETRIEVED_EXT}") 25 | assert basic.read_query("Youtube/Me") == Path( 26 | f"{base}/Youtube/{basic.ANALYSED_EXT}/Me" 27 | ) 28 | 29 | 30 | def test_read_all_media(utils, basic): 31 | cmpDict = { 32 | "Youtube": { 33 | f"{basic.RETRIEVED_EXT}": { 34 | "el1": f"{base}/Youtube/{basic.RETRIEVED_EXT}/el1", 35 | }, 36 | f"{basic.ANALYSED_EXT}": { 37 | "Me": { 38 | "el1": f"{base}/Youtube/{basic.ANALYSED_EXT}/Me/el1", 39 | }, 40 | }, 41 | }, 42 | } 43 | mediaDict = basic.read_all_media() 44 | assert utils.dictsEqual(cmpDict, mediaDict) 45 | 46 | 47 | def test_write_meta(basic): 48 | q = "Youtube/Me" 49 | og_data = {"some": "data"} 50 | basic.write_meta(q, og_data) 51 | with open(f"{basic.read_query(q)}/{basic._LocalStorage__META_FILE}", "r") as f: 52 | data = json.load(f) 53 | assert data.get("some") == "data" 54 | assert data.get("timestamp") is not None 55 | -------------------------------------------------------------------------------- /src/test/test_mtmodule.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from pathlib import Path 4 | from lib.common.exceptions import ImproperLoggedPhaseError 5 | from lib.common.mtmodule import MTModule 6 | from lib.common.storage import LocalStorage 7 | from test.utils import scaffold_empty 8 | 9 | 10 | class EmptyMTModule(MTModule): 11 | pass 12 | 13 | 14 | @pytest.fixture 15 | def additionals(utils): 16 | obj 
= lambda: None 17 | obj.BASE_DIR = utils.TEMP_ELEMENT_DIR 18 | obj.mod = EmptyMTModule({}, "empty", LocalStorage(folder=utils.TEMP_ELEMENT_DIR)) 19 | yield obj 20 | utils.cleanup() 21 | 22 | 23 | def test_class_variables(additionals): 24 | assert additionals.mod.name == "empty" 25 | assert additionals.mod.disk.base_dir == Path(additionals.BASE_DIR) 26 | assert additionals.mod._MTModule__LOGS == [] 27 | assert ( 28 | additionals.mod.disk._LocalStorage__LOGS_DIR == f"{additionals.BASE_DIR}/logs" 29 | ) 30 | assert ( 31 | additionals.mod.disk._LocalStorage__LOGS_FILE 32 | == f"{additionals.BASE_DIR}/logs/logs.txt" 33 | ) 34 | assert os.path.exists(f"{additionals.BASE_DIR}/logs") 35 | 36 | 37 | def test_phase_decorator(additionals): 38 | class BadClass: 39 | @MTModule.phase("somekey") 40 | def improper_func(self): 41 | pass 42 | 43 | class GoodClass(MTModule): 44 | @MTModule.phase("somekey") 45 | def proper_func(self): 46 | self.logger("we did something.") 47 | return "no error" 48 | 49 | # test that a decorated method carries through its return value 50 | gc = GoodClass({}, "my_good_mod", storage=LocalStorage(folder=additionals.BASE_DIR)) 51 | 52 | # test that a decorated method carries through its return value 53 | gc = GoodClass({}, "my_good_mod", storage=LocalStorage(folder=additionals.BASE_DIR)) 54 | assert gc.proper_func() == "no error" 55 | 56 | with open(f"{additionals.BASE_DIR}/logs/logs.txt", "r") as f: 57 | lines = f.readlines() 58 | assert len(lines) == 1 59 | assert lines[0] == "my_good_mod: somekey: we did something.\n" 60 | 61 | # check that logs were cleared after phase 62 | assert gc._MTModule__LOGS == [] 63 | 64 | 65 | def test_parallel_phase_decorator(additionals): 66 | class GoodClass(MTModule): 67 | @MTModule.phase("somekey") 68 | def func(self, gen): 69 | self.logger("This function only takes a generator of elements.") 70 | return "no error" 71 | 72 | @MTModule.phase("somekey", remove_db=False) 73 | def func_no_remove(self, gen): 74 | return "no error" 75 | 76 | @MTModule.phase("secondkey") 77 | def func_w_arg(self, gen, extra): 78 | self.logger(f"Running func with {list(gen)}, with extra arg {extra}.") 79 | return "no error" 80 | 81 | # test that a decorated method carries through its return value 82 | gc = GoodClass({}, "my_good_mod", storage=LocalStorage(folder=additionals.BASE_DIR)) 83 | 84 | # test parallel logs 85 | eg_gen = (a for a in range(0, 100)) 86 | assert gc.func(eg_gen) == "no error" 87 | 88 | with open(f"{additionals.BASE_DIR}/logs/logs.txt", "r") as f: 89 | lines = f.readlines() 90 | assert len(lines) == 100 91 | 92 | # test db file generation 93 | eg_gen = (a for a in range(0, 100)) 94 | assert gc.func_no_remove(eg_gen) == "no error" 95 | 96 | dbfile = f"{gc.disk.base_dir}/{gc.UNIQUE_ID}.db" 97 | with open(dbfile, "rb") as f: 98 | _bytes = f.read() 99 | assert len(_bytes) == 800 # 2 4-byte entries per item for 100 items 100 | 101 | os.remove(dbfile) 102 | 103 | # test that a function is resumed properly 104 | eg_gen = (a for a in range(0, 50)) 105 | assert gc.func_no_remove(eg_gen) == "no error" 106 | 107 | eg_gen = (a for a in range(0, 100)) 108 | assert gc.func(eg_gen) == "no error" 109 | 110 | with open(f"{additionals.BASE_DIR}/logs/logs.txt", "r") as f: 111 | lines = f.readlines() 112 | assert len(lines) == 150 113 | 114 | # test function with argument 115 | eg_gen = (a for a in range(0, 100)) 116 | assert gc.func_w_arg(eg_gen, 10) == "no error" 117 | -------------------------------------------------------------------------------- 
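The tests above exercise the `MTModule.phase` decorator and its per-phase logging. A minimal sketch of the pattern they rely on (illustrative only — the class and module names below are made up; the constructor call, log path and log format mirror what the assertions above expect):

```python
from lib.common.mtmodule import MTModule
from lib.common.storage import LocalStorage


class DemoModule(MTModule):
    @MTModule.phase("demo_phase")
    def do_work(self):
        # messages logged inside a phase are flushed to <folder>/logs/logs.txt
        # as "<module name>: <phase key>: <message>"
        self.logger("we did something.")
        return "no error"


mod = DemoModule({}, "demo_mod", storage=LocalStorage(folder="media/test_official"))
assert mod.do_work() == "no error"
```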
/src/test/test_run.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import yaml 4 | from run import validate_yaml 5 | from lib.common.exceptions import InvalidYamlError 6 | from test.utils import scaffold_empty, cleanup 7 | 8 | ARGS = "/run_args.yaml" 9 | BASELINE = {"folder": "media/test_official"} 10 | WITH_ELS = {**BASELINE, "elements_in": "sel1"} 11 | WITH_SELECT = { 12 | **BASELINE, 13 | "select": {"name": "Local", "config": {"source": "/a-folder"}}, 14 | } 15 | GOOD_ANALYSE_DICT = {**WITH_ELS, "analyse": {"name": "Frames"}} 16 | GOOD_SELECT_ANALYSE = { 17 | **WITH_SELECT, 18 | "analyse": [{"name": "Frames"}, {"name": "ImageDedup"}], 19 | } 20 | 21 | 22 | @pytest.fixture(autouse=True) 23 | def teardown(): 24 | yield None 25 | try: 26 | cleanup() 27 | os.remove(ARGS) 28 | except: 29 | pass 30 | 31 | 32 | def write(vl): 33 | with open(ARGS, "w") as c: 34 | yaml.dump(vl, c, default_flow_style=False) 35 | 36 | 37 | def validate(): 38 | with open(ARGS, "r") as c: 39 | cfg = yaml.safe_load(c) 40 | validate_yaml(cfg) 41 | 42 | 43 | def write_and_validate(config, regex): 44 | write(config) 45 | with pytest.raises(InvalidYamlError, match=regex): 46 | validate() 47 | 48 | 49 | def test_bad_yaml(): 50 | with open(ARGS, "w") as c: 51 | c.write('foo: "an escaped \\\' single quote"') 52 | 53 | with pytest.raises(yaml.YAMLError): 54 | validate() 55 | 56 | 57 | def test_validate_phase(): 58 | empty = {} 59 | bad_folder = {"folder": 1, "config": {}} 60 | good_folder = {"folder": "legit", "config": {}} 61 | 62 | write(empty) 63 | with pytest.raises( 64 | InvalidYamlError, match="The folder attribute must exist and be a string" 65 | ): 66 | validate() 67 | 68 | write(bad_folder) 69 | with pytest.raises( 70 | InvalidYamlError, match="The folder attribute must exist and be a string" 71 | ): 72 | validate() 73 | 74 | bad_phase = {**good_folder, "phase": "not a phase"} 75 | good_phase_select = {**good_folder, "phase": "select"} 76 | good_phase_analyse = {**good_folder, "phase": "analyse"} 77 | write(bad_phase) 78 | with pytest.raises( 79 | InvalidYamlError, match="specified a phase, you must specify a module" 80 | ): 81 | validate() 82 | 83 | bad_select_module = {**good_phase_select, "module": "not a selector"} 84 | bad_analyse_module = {**good_phase_analyse, "module": "not an analyser"} 85 | good_select_module = {**good_phase_select, "module": "Local"} 86 | write(bad_select_module) 87 | with pytest.raises( 88 | InvalidYamlError, match="No select module named 'not a selector'" 89 | ): 90 | validate() 91 | 92 | write(bad_analyse_module) 93 | with pytest.raises( 94 | InvalidYamlError, match="No analyse module named 'not an analyser'" 95 | ): 96 | validate() 97 | 98 | # the select module requires a 'source_folder' arg 99 | bad_local_config = {**good_select_module, "config": {}} 100 | bad_youtube_config = { 101 | **good_select_module, 102 | "module": "youtube", 103 | "config": {"uploaded_before": "212321"}, 104 | } 105 | good_youtube_config = { 106 | **good_select_module, 107 | "module": "youtube", 108 | "config": { 109 | "search_term": "a search term", 110 | "uploaded_before": "212321", 111 | "uploaded_after": "212321", 112 | }, 113 | } 114 | 115 | if os.path.exists("/mtriage/credentials/google.json"): 116 | write(good_select_module) 117 | with pytest.raises( 118 | InvalidYamlError, 119 | match="config you specified does not contain all the required arguments", 120 | ): 121 | validate() 122 | 123 | write(bad_local_config) 124 | with 
pytest.raises( 125 | InvalidYamlError, 126 | match="The config you specified does not contain all the required arguments for the 'Local' selecter.", 127 | ): 128 | validate() 129 | 130 | write(bad_youtube_config) 131 | with pytest.raises( 132 | InvalidYamlError, 133 | match="The config you specified does not contain all the required arguments for the 'youtube' selecter.", 134 | ): 135 | validate() 136 | 137 | write(good_youtube_config) 138 | validate() 139 | 140 | # should return True to indicate this is a single phase config, see 'validate_yaml' docstring for more info 141 | res = validate_yaml(good_youtube_config) 142 | assert res == True 143 | 144 | 145 | def test_validate(): 146 | write_and_validate(BASELINE, "specify either 'elements_in' or 'select'") 147 | 148 | write_and_validate(WITH_ELS, "at least one 'analyse' module must be specified") 149 | 150 | bad_analyse = {**WITH_ELS, "analyse": None} 151 | write_and_validate(bad_analyse, "must be a dict or list") 152 | 153 | bad_analyse_dict = {**WITH_ELS, "analyse": {}} 154 | write_and_validate(bad_analyse_dict, "containing at least a 'name' attribute") 155 | 156 | write(GOOD_ANALYSE_DICT) 157 | validate() 158 | 159 | write(GOOD_SELECT_ANALYSE) 160 | validate() 161 | 162 | 163 | def test_config_types(): 164 | validate_yaml(GOOD_ANALYSE_DICT) 165 | validate_yaml(GOOD_SELECT_ANALYSE) 166 | -------------------------------------------------------------------------------- /src/test/test_selector.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import csv 4 | from abc import ABC 5 | from pathlib import Path 6 | from lib.common.selector import Selector 7 | from lib.common.exceptions import ( 8 | ElementShouldRetryError, 9 | ElementShouldSkipError, 10 | SelectorIndexError, 11 | EtypeCastError, 12 | ) 13 | from lib.common.etypes import Etype, LocalElementsIndex 14 | from lib.common.storage import LocalStorage 15 | from test.utils import scaffold_elementmap, STUB_PATHS, list_files 16 | 17 | 18 | class EmptySelector(Selector): 19 | out_etype = Etype.Any 20 | 21 | def __init__(self, config, name, dr): 22 | super().__init__(config, name, dr) 23 | self.disk.delete_local_on_write = False 24 | 25 | def index(self, config): 26 | if not os.path.exists(self.disk.read_query(self.name)): 27 | df = scaffold_elementmap(["el1", "el2", "el3"]) 28 | 29 | df = [ 30 | x + [STUB_PATHS.imagejpg] if idx > 0 else (x + ["path"]) 31 | for idx, x in enumerate(df) 32 | ] 33 | return LocalElementsIndex(rows=df) 34 | else: 35 | return None 36 | 37 | def retrieve_element(self, row, config): 38 | return Etype.cast(row.id, row.path) 39 | 40 | 41 | @pytest.fixture 42 | def additionals(utils): 43 | obj = lambda: None 44 | obj.emptySelector = EmptySelector( 45 | {"dev": True}, "empty", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 46 | ) 47 | utils.setup() 48 | yield obj 49 | utils.cleanup() 50 | 51 | 52 | def test_selector_imports(): 53 | assert type(Selector) == type(ABC) 54 | 55 | 56 | def test_cannot_instantiate(utils): 57 | with pytest.raises(TypeError): 58 | Selector({}, "empty", utils.TEMP_ELEMENT_DIR) 59 | 60 | 61 | def test_init(utils, additionals): 62 | assert Path(utils.TEMP_ELEMENT_DIR) == additionals.emptySelector.disk.base_dir 63 | assert "empty" == additionals.emptySelector.name 64 | 65 | 66 | def test_index(additionals): 67 | additionals.emptySelector.start_indexing() 68 | # test element_map.csv is what it should be 69 | eidx = additionals.emptySelector.disk.read_elements_index("empty") 70 | emap 
= scaffold_elementmap(["el1", "el2", "el3"]) 71 | for idx, row in enumerate(eidx.rows): 72 | assert row.id == emap[idx + 1][0] 73 | 74 | 75 | def test_retrieve(additionals, utils): 76 | additionals.emptySelector.start_indexing() 77 | additionals.emptySelector.start_retrieving() 78 | pth = additionals.emptySelector.disk.read_query("empty") 79 | images = [pth / f"{x}/image.jpeg" for x in ["el1", "el2", "el3"]] 80 | for img in images: 81 | assert os.path.isfile(img) 82 | 83 | 84 | # the values that are returned from retrieve need to be managed in Python differently according to what kind of data 85 | # they represent. 86 | # 87 | # Video -> cv2.VideoCapture 88 | # Image -> cv2.Image 89 | # Audio -> simpleaudio.WaveObject 90 | # Json -> dict 91 | 92 | # the relationship between files on disk and how they are loaded through Python should be managed in the etypes library. 93 | -------------------------------------------------------------------------------- /src/test/test_selector_errors.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from lib.common.selector import Selector 4 | from lib.common.storage import LocalStorage 5 | from lib.common.etypes import Etype, LocalElement, LocalElementsIndex 6 | from lib.common.exceptions import ( 7 | ElementShouldRetryError, 8 | ElementShouldSkipError, 9 | SelectorIndexError, 10 | EtypeCastError, 11 | ) 12 | from test.utils import scaffold_elementmap 13 | import pdb 14 | 15 | 16 | class BasicErrorSelector(Selector): 17 | out_etype = Etype.Any 18 | 19 | def __init__(self, *args): 20 | super().__init__(*args) 21 | self.retryCount = 0 22 | 23 | def index(self, config) -> LocalElementsIndex: 24 | error = config["error"] if "error" in config else "" 25 | if error == "index": 26 | raise SelectorIndexError("test") 27 | else: 28 | elements = ["skip", "retry3", "retryN", "pass"] 29 | return LocalElementsIndex(rows=scaffold_elementmap(elements)) 30 | 31 | def retrieve_element(self, element, config) -> LocalElement: 32 | if element.id == "skip": 33 | raise ElementShouldSkipError("test") 34 | elif element.id == "retry3" and self.retryCount < 3: 35 | self.retryCount += 1 36 | raise ElementShouldRetryError("test") 37 | elif element.id == "retryN": 38 | raise ElementShouldRetryError("test") 39 | else: 40 | return None 41 | 42 | 43 | class RetrieveErrorSelector(BasicErrorSelector): 44 | out_etype = Etype.Any 45 | 46 | def retrieve_element(self, element, config): 47 | super().retrieve_element(element, config) 48 | with open(f"{element['base']}/out.txt", "w") as f: 49 | f.write("something") 50 | 51 | 52 | class BadIndexSelector(Selector): 53 | out_etype = Etype.Any 54 | 55 | def index(self, config): 56 | # fails to return a dataframe 57 | pass 58 | 59 | def retrieve_element(self, element, config): 60 | pass 61 | 62 | 63 | @pytest.fixture 64 | def additionals(utils): 65 | obj = lambda: None 66 | indexModule = "indexErrorSelector" 67 | indexConfig = {"error": "index", "dev": True} 68 | obj.indexErrorSelector = BasicErrorSelector( 69 | indexConfig, indexModule, LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 70 | ) 71 | 72 | castModule = "castErrorSelector" 73 | castConfig = {"dev": True} 74 | obj.castErrorSelector = BasicErrorSelector( 75 | castConfig, castModule, LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 76 | ) 77 | 78 | retrieveModule = "retrieveErrorSelector" 79 | retrieveConfig = {"dev": True} 80 | obj.retrieveErrorSelector = RetrieveErrorSelector( 81 | retrieveConfig, retrieveModule, 
LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 82 | ) 83 | yield obj 84 | utils.cleanup() 85 | 86 | 87 | def test_index_error(additionals): 88 | with pytest.raises(SelectorIndexError, match="Selector index failed - test"): 89 | additionals.indexErrorSelector.start_indexing() 90 | 91 | 92 | def test_retrieve_skip_error(additionals): 93 | with pytest.raises(ElementShouldSkipError, match="test - skipping element"): 94 | additionals.castErrorSelector.retrieve_element(LocalElement(id="skip"), {}) 95 | 96 | 97 | def test_retrieve_retry_error(additionals): 98 | with pytest.raises(ElementShouldRetryError, match="test - attempt retry"): 99 | additionals.castErrorSelector.retrieve_element(LocalElement(id="retryN"), {}) 100 | 101 | 102 | def test_integration_1(utils, additionals): 103 | assert additionals.castErrorSelector.retryCount == 0 104 | additionals.castErrorSelector.start_indexing() 105 | additionals.castErrorSelector.start_retrieving() 106 | 107 | skip_path = utils.get_element_path("castErrorSelector", "skip") 108 | assert not os.path.exists(skip_path) 109 | 110 | retryn_path = utils.get_element_path("castErrorSelector", "retryN") 111 | assert not os.path.exists(retryn_path) 112 | 113 | retry3_path = utils.get_element_path("castErrorSelector", "retry3") 114 | assert additionals.castErrorSelector.retryCount == 3 115 | assert not os.path.exists(retry3_path) 116 | 117 | pass_path = utils.get_element_path("castErrorSelector", "pass") 118 | assert not os.path.exists(pass_path) 119 | 120 | 121 | def integration_2(utils, additionals): 122 | additionals.retrieveErrorSelector.start_indexing() 123 | additionals.retrieveErrorSelector.start_retrieving(in_parallel=False) 124 | 125 | skip_path = utils.get_element_path("retrieveErrorSelector", "skip") 126 | assert not os.path.exists(skip_path) 127 | 128 | retryn_path = utils.get_element_path("retrieveErrorSelector", "retryN") 129 | assert not os.path.exists(retryn_path) 130 | 131 | retry3_path = utils.get_element_path("retrieveErrorSelector", "retry3") 132 | assert additionals.retrieveErrorSelector.retryCount == 3 133 | assert os.path.exists(retry3_path) 134 | 135 | pass_path = utils.get_element_path("retrieveErrorSelector", "pass") 136 | assert os.path.exists(pass_path) 137 | -------------------------------------------------------------------------------- /src/test/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | from types import SimpleNamespace as Ns 5 | from pathlib import Path 6 | from lib.common.storage import LocalStorage 7 | from lib.common.get import get_module 8 | 9 | TEMP_ELEMENT_DIR = "/mtriage/media/test_official" 10 | TMP_DIR = Path("/tmp") 11 | STUB_PATHS = Ns( 12 | imagejpg="/mtriage/src/test/etype_stubs/image.jpeg", 13 | ) 14 | 15 | 16 | def scaffold_empty( 17 | selector: str, elements: list = [], analysers: list = [], selector_txt=None 18 | ): 19 | """ 20 | Scaffold an mtriage folder. One folder per element in the elements list will be created in the TEMP_ELEMENT_DIR. 21 | If an analysers list is passed, mocks of derived elements will be created in the appropriate folders. 22 | Only a single selector should be passed, as derived elements are nested within a selector pass. To create multiple 23 | selector passes, call this function multiple times. 
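    For example, scaffold_empty("Youtube", elements=["el1"], analysers=["Me"]) creates
    {TEMP_ELEMENT_DIR}/Youtube/<RETRIEVED_EXT>/el1 and
    {TEMP_ELEMENT_DIR}/Youtube/<ANALYSED_EXT>/Me/el1, where the extensions are the
    LocalStorage constants used below.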
24 | """ 25 | derived_dir = f"{TEMP_ELEMENT_DIR}/{selector}/{LocalStorage.ANALYSED_EXT}" 26 | if not os.path.exists(derived_dir): 27 | os.makedirs(derived_dir) 28 | 29 | for element in elements: 30 | element_dir = ( 31 | f"{TEMP_ELEMENT_DIR}/{selector}/{LocalStorage.RETRIEVED_EXT}/{element}" 32 | ) 33 | if not os.path.exists(element_dir): 34 | os.makedirs(element_dir) 35 | if selector_txt is not None: 36 | with open(f"{element_dir}/item.txt", "a") as ftxt: 37 | ftxt.write(selector_txt) 38 | if len(analysers) > 0: 39 | for analyser in analysers: 40 | analyser_dir = f"{TEMP_ELEMENT_DIR}/{selector}/{LocalStorage.ANALYSED_EXT}/{analyser}/{element}" 41 | if not os.path.exists(analyser_dir): 42 | os.makedirs(analyser_dir) 43 | 44 | 45 | def get_element_path(selname, elementId, analyser=None): 46 | middle_insert = ( 47 | LocalStorage.RETRIEVED_EXT 48 | if analyser is None 49 | else f"{LocalStorage.ANALYSED_EXT}/{analyser}" 50 | ) 51 | return f"{TEMP_ELEMENT_DIR}/{selname}/{middle_insert}/{elementId}" 52 | 53 | 54 | def scaffold_elementmap(elements=[]): 55 | out = [[x] for x in elements] 56 | out.insert(0, ["id"]) 57 | return out 58 | 59 | 60 | def setup(): 61 | # to ensure that there isn't a read error 62 | with open("/run_args.yaml", "w") as f: 63 | json.dump({}, f) 64 | 65 | 66 | def cleanup(): 67 | if Path(TEMP_ELEMENT_DIR).exists(): 68 | shutil.rmtree(TEMP_ELEMENT_DIR) 69 | if TMP_DIR.exists(): 70 | shutil.rmtree(TMP_DIR) 71 | TMP_DIR.mkdir() 72 | 73 | 74 | def listOfDictsEqual(l1, l2): 75 | if len(l1) != len(l2): 76 | return False 77 | 78 | for d1, d2 in zip(l1, l2): 79 | if not dictsEqual(d1, d2): 80 | return False 81 | 82 | return True 83 | 84 | 85 | def dictsEqual(d1, d2): 86 | if len(d1.keys()) != len(d2.keys()): 87 | return False 88 | 89 | d1json = json.dumps(d1, sort_keys=True, default=str) 90 | d2json = json.dumps(d2, sort_keys=True, default=str) 91 | 92 | return d1json == d2json 93 | 94 | 95 | def get_info_path(kind, mod_name): 96 | return f"lib/{kind}s/{mod_name}/info.yaml" 97 | 98 | 99 | # https://stackoverflow.com/questions/9727673/list-directory-tree-structure-in-python 100 | def list_files(startpath): 101 | for root, dirs, files in os.walk(startpath): 102 | level = root.replace(startpath, "").count(os.sep) 103 | indent = " " * 4 * (level) 104 | print("{}{}/".format(indent, os.path.basename(root))) 105 | subindent = " " * 4 * (level + 1) 106 | for f in files: 107 | print("{}{}".format(subindent, f)) 108 | 109 | 110 | def ltemp(): 111 | """ Primarily for pdb debugging """ 112 | list_files(TEMP_ELEMENT_DIR) 113 | -------------------------------------------------------------------------------- /src/validate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import inspect 4 | from pathlib import Path 5 | from lib.common.exceptions import InvalidYamlError 6 | from lib.common.get import get_module 7 | 8 | 9 | def validate_module(phase: str, module: str, cfg: dict): 10 | try: 11 | mod = get_module(phase, module) 12 | except ModuleNotFoundError as e: 13 | raise InvalidYamlError(f"No {phase} module named '{module}'") 14 | 15 | # dynamically check all required args for module config exist 16 | sfolder = os.path.dirname(inspect.getfile(mod)) 17 | info = Path(sfolder) / "info.yaml" 18 | with open(info, "r") as f: 19 | options = yaml.safe_load(f) 20 | for option in options["args"]: 21 | if "config" not in cfg: 22 | cfg["config"] = {} 23 | if option["required"] is True and option["name"] not in cfg["config"].keys(): 24 | 
raise InvalidYamlError( 25 | f"The config you specified does not contain all the required arguments for the '{module}' {phase}er." 26 | ) 27 | 28 | 29 | def validate_name(cfg: dict): 30 | if "name" not in cfg.keys(): 31 | raise InvalidYamlError( 32 | "Each analyse component must be a dict containing at least a 'name' attribute." 33 | ) 34 | 35 | 36 | def validate_analyse(cfg: dict): 37 | if not isinstance(cfg, dict) and not isinstance(cfg, list): 38 | raise InvalidYamlError("The 'analyse' attribute must be a dict or list.") 39 | if isinstance(cfg, dict): 40 | validate_name(cfg) 41 | validate_module("analyse", cfg["name"], cfg) 42 | else: 43 | for _cfg in cfg: 44 | validate_name(_cfg) 45 | validate_module("analyse", _cfg["name"], _cfg) 46 | 47 | 48 | def validate_yaml(cfg: dict) -> bool: 49 | """ 50 | Confirms all values on YAML. Throws an appropriate exception if something's up. 51 | """ 52 | keys = cfg.keys() 53 | 54 | if "folder" not in keys or not isinstance(cfg["folder"], str): 55 | raise InvalidYamlError("The folder attribute must exist and be a string") 56 | 57 | if "phase" in keys or "module" in keys: 58 | # confirm good phase yaml 59 | if "module" not in keys: 60 | raise InvalidYamlError( 61 | "If you specified a phase, you must specify a module" 62 | ) 63 | if "phase" not in keys: 64 | raise InvalidYamlError( 65 | "If you specified a module, you must specify a phase" 66 | ) 67 | 68 | if "config" not in keys or not isinstance(cfg["config"], dict): 69 | raise InvalidYamlError("The 'config' attribute must exist.") 70 | 71 | if cfg["phase"] not in ["select", "analyse"]: 72 | raise InvalidYamlError( 73 | "The phase attribute must be either select or analyse" 74 | ) 75 | validate_module(cfg["phase"], cfg["module"], cfg) 76 | else: 77 | if "elements_in" not in keys and "select" not in keys: 78 | raise InvalidYamlError("You must specify either 'elements_in' or 'select'.") 79 | if "elements_in" in keys: 80 | # bypassing selector... 81 | if "analyse" not in keys: 82 | raise InvalidYamlError( 83 | "You have specified 'elements_in', and so at least one 'analyse' module must be specified." 84 | ) 85 | 86 | elif "select" in keys: 87 | # run select then analyse 88 | validate_name(cfg["select"]) 89 | validate_module("select", cfg["select"]["name"], cfg["select"]) 90 | 91 | if "analyse" in cfg: 92 | validate_analyse(cfg["analyse"]) 93 | -------------------------------------------------------------------------------- /test/test_build.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import csv 4 | import re 5 | from commands import parse_args, build, develop, clean, run_tests, run, DIR_PATH 6 | 7 | 8 | def get_tag_str(cmd, tag): 9 | """ 10 | Returns the string for a tag in a command, or 'None' if the tag doesn't exist. 
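    For example (illustrative), get_tag_str(["docker", "build", "-t", "img:dev"], "-t")
    returns "img:dev", and get_tag_str(["docker", "build"], "-t") returns None.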
11 | """ 12 | idx = 0 13 | while len(cmd) > idx and cmd[idx] != tag: 14 | idx += 1 15 | if idx <= len(cmd) - 1: 16 | return cmd[idx + 1] 17 | return None 18 | 19 | 20 | def get_volumes(cmd): 21 | idx = 0 22 | volumes = [] 23 | while len(cmd) - 1 > idx: 24 | if cmd[idx] == "-v": 25 | volumes.append(cmd[idx + 1]) 26 | idx += 1 27 | return volumes 28 | 29 | 30 | def dockerimage_tag_matches(cmd, expected): 31 | build_tag = get_tag_str(cmd, "-t") 32 | if build_tag: 33 | return build_tag == expected 34 | return False 35 | 36 | 37 | def builds_from_cpu_dockerfile(dfile): 38 | return "FROM ubuntu:18.04\n" in dfile 39 | 40 | 41 | def builds_from_gpu_dockerfile(dfile): 42 | return "FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04\n" in dfile 43 | 44 | 45 | def read_deps(component): 46 | pth = "src/lib/selectors/{}/requirements.txt".format(component) 47 | if not os.path.exists(pth): 48 | return [] 49 | with open(pth, "r") as f: 50 | return f.readlines() 51 | 52 | 53 | class TestBuild(unittest.TestCase): 54 | def setUp(self): 55 | # make test whitelist 56 | self.SELECTOR_WL = "selector_whitelist.txt" 57 | with open(self.SELECTOR_WL, "w") as f: 58 | writer = csv.writer(f) 59 | writer.writerows([["Youtube"], ["Local"]]) 60 | 61 | self.BLANK_WL = "blank_whitelist.txt" 62 | with open(self.BLANK_WL, "w") as f: 63 | writer = csv.writer(f) 64 | writer.writerows([[""]]) 65 | 66 | def tearDown(self): 67 | os.remove(self.SELECTOR_WL) 68 | os.remove(self.BLANK_WL) 69 | 70 | def test_default_build(self): 71 | args = parse_args(["dev", "build", "--dry"]) 72 | cmd, dfile, pipfile = build(args) 73 | self.assertTrue( 74 | dockerimage_tag_matches(cmd, "forensicarchitecture/mtriage:dev") 75 | ) 76 | self.assertTrue(builds_from_cpu_dockerfile(dfile)) 77 | 78 | def test_gpu_build(self): 79 | args = parse_args(["dev", "build", "--gpu", "--dry"]) 80 | cmd, dfile, pipfile = build(args) 81 | self.assertTrue(builds_from_gpu_dockerfile(dfile)) 82 | 83 | def test_whitelist(self): 84 | args = parse_args(["dev", "build", "--whitelist", self.BLANK_WL, "--dry"]) 85 | cmd, dfile, pipfile = build(args) 86 | with open("src/build/core.requirements.txt", "r") as f: 87 | core_deps = f.readlines() 88 | self.assertListEqual(core_deps, pipfile) 89 | 90 | args = parse_args(["dev", "build", "--whitelist", self.SELECTOR_WL, "--dry"]) 91 | cmd, dfile, pipfile = build(args) 92 | expected_pipfile = core_deps + read_deps("Youtube") + read_deps("Twitter") 93 | expected_pipfile = [x for x in expected_pipfile if x != "\n"] 94 | pipfile = [x for x in pipfile if x != "\n"] 95 | self.assertListEqual(pipfile, expected_pipfile) 96 | 97 | def test_custom_tags(self): 98 | args = parse_args(["dev", "build", "--tag", "CUSTOM_TAG", "--dry"]) 99 | cmd, dfile, pipfile = build(args) 100 | self.assertTrue( 101 | dockerimage_tag_matches(cmd, "forensicarchitecture/mtriage:CUSTOM_TAG") 102 | ) 103 | 104 | args = parse_args( 105 | ["run", "docs/tutorial/1/1a.yaml", "--tag", "CUSTOM_TAG", "--dry"] 106 | ) 107 | cmd = run(args) 108 | self.assertTrue(cmd[-1] == "forensicarchitecture/mtriage:CUSTOM_TAG") 109 | 110 | def test_dev_tag(self): 111 | dev_args = parse_args(["run", "docs/tutorial/1/1a.yaml", "--dev", "--dry"]) 112 | cmd = run(dev_args) 113 | vs = get_volumes(cmd) 114 | media_re = r".*/mtriage/src:/mtriage/src$" 115 | has_src = False 116 | for v in vs: 117 | if re.match(media_re, v): 118 | has_src = True 119 | break 120 | self.assertTrue(has_src) 121 | 122 | no_dev_args = parse_args(["run", "docs/tutorial/1/1a.yaml", "--dry"]) 123 | cmd = run(no_dev_args) 124 | vs 
= get_volumes(cmd) 125 | matched = False 126 | for v in vs: 127 | if re.match(media_re, v) is not None: 128 | matched = True 129 | self.assertFalse(matched) 130 | -------------------------------------------------------------------------------- /test/test_util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from util import ( 3 | name_and_ver, 4 | InvalidPipDep, 5 | should_add_pipdep, 6 | should_add_dockerline, 7 | InvalidArgumentsError, 8 | ) 9 | 10 | 11 | class TestUtil(unittest.TestCase): 12 | """Test the util functions at mtriage's outer layer.""" 13 | 14 | def test_name_and_ver(self): 15 | name, ver = name_and_ver("numpy") 16 | self.assertEqual(name, "numpy") 17 | self.assertEqual(ver, None) 18 | 19 | name, ver = name_and_ver("numpy==4.0") 20 | self.assertEqual(name, "numpy") 21 | self.assertEqual(ver, "4.0") 22 | 23 | n1, v1 = name_and_ver("google-api-core==1.11.0") 24 | self.assertEqual(n1, "google-api-core") 25 | self.assertEqual(v1, "1.11.0") 26 | 27 | # self.assertRaises(InvalidPipDep, name_and_ver, "numpy==") 28 | # self.assertRaises(InvalidPipDep, name_and_ver, "invalid==2.h") 29 | self.assertRaises(InvalidPipDep, name_and_ver, "invalid==2==") 30 | 31 | def test_should_add_pipdeps(self): 32 | p1 = [] 33 | # empty check --> false 34 | self.assertTrue(should_add_pipdep("numpy", p1)) 35 | 36 | p2 = ["numpy"] 37 | self.assertFalse(should_add_pipdep("numpy", p2)) 38 | self.assertTrue(should_add_pipdep("pandas", p2)) 39 | # should add specific versions over undefined 40 | self.assertTrue(should_add_pipdep("numpy==2.0", p2)) 41 | # should add higher versions 42 | p3 = ["numpy==1.0"] 43 | self.assertTrue(should_add_pipdep("numpy==3.0", p3)) 44 | # check with multiple 45 | p4 = ["pack1==2.0", "pandas=3.4", "numpy==1.0", "blueray"] 46 | self.assertTrue(should_add_pipdep("numpy==1.1", p4)) 47 | self.assertFalse(should_add_pipdep("numpy", p4)) 48 | self.assertTrue(should_add_pipdep("blueray==0.1", p4)) 49 | self.assertTrue(should_add_pipdep("newdep", p4)) 50 | # check error 51 | with self.assertRaises(InvalidPipDep): 52 | should_add_pipdep("invalid==1==", p4) 53 | 54 | def test_should_add_dockerline(self): 55 | p1 = [] 56 | self.assertTrue(should_add_dockerline("any line here", p1)) 57 | p2 = ["RUN apt-get install -y vim"] 58 | self.assertFalse(should_add_dockerline("RUN apt-get install -y vim", p2)) 59 | p3 = ["RUN apt-get install -y vim", "RUN curl -o https://smthn", "RUN it"] 60 | self.assertTrue(should_add_dockerline("RUN apt get install -y curl", p3)) 61 | self.assertFalse(should_add_dockerline("RUN curl -o https://smthn", p3)) 62 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | from argparse import ArgumentTypeError 5 | 6 | DIR_PATH = os.path.dirname(os.path.realpath(__file__)) 7 | 8 | 9 | class InvalidPipDep(Exception): 10 | pass 11 | 12 | 13 | class InvalidArgumentsError(Exception): 14 | pass 15 | 16 | 17 | # parseargs type functions 18 | def str2yamlfile(fname): 19 | ext = os.path.splitext(fname)[1][1:] 20 | if ext not in "yaml": 21 | ArgumentTypeError("The file you specify to run mtriage must be a YAML file") 22 | if not os.path.exists(fname): 23 | ArgumentTypeError("Cannot find a file at {}.".format(fname)) 24 | return fname 25 | 26 | 27 | def get_subdirs(d): 28 | whitelist = ["__pycache__"] 29 | return [ 30 | o 31 | for o in os.listdir(d) 32 | 
if os.path.isdir(os.path.join(d, o)) 33 | and o not in whitelist 34 | and o != "__deprecated" 35 | ] 36 | 37 | 38 | def name_and_ver(pipdep): 39 | """Return the name and version from a string that expresses a pip dependency. 40 | Raises an InvalidPipDep exception if the string is an invalid dependency. 41 | """ 42 | pipdep = pipdep.split("==") 43 | dep_name = pipdep[0] 44 | try: 45 | if len(pipdep) == 1: 46 | dep_version = None 47 | elif len(pipdep) > 2: 48 | raise InvalidPipDep 49 | else: 50 | dep_version = pipdep[1] 51 | # if re.search(r"\d+(\.\d+)*", dep_version) is None: 52 | # raise InvalidPipDep 53 | return dep_name, dep_version 54 | except: 55 | raise InvalidPipDep 56 | 57 | 58 | def should_add_pipdep(dep, pipdeps): 59 | """Check whether pipdep should be added.""" 60 | dep_name, dep_ver = name_and_ver(dep) 61 | for _dep in pipdeps: 62 | _dep_name, _dep_ver = name_and_ver(_dep) 63 | if _dep_name == dep_name: 64 | # new version unspecified, cannot be more specific 65 | if dep_ver is None: 66 | return False 67 | # new version more specific 68 | elif _dep_ver is None and dep_ver is not None: 69 | return True 70 | elif str(dep_ver) < str(_dep_ver): 71 | return False 72 | return True 73 | 74 | 75 | def should_add_dockerline(line, dockerfile): 76 | """Check whether line should be added to array representing Dockerfile.""" 77 | return line not in dockerfile 78 | 79 | 80 | def lines_from_files(files): 81 | """ 'readlines' for a list of files, concatening them all together """ 82 | lines = [] 83 | for f in files: 84 | with open(f, "r") as fp: 85 | lines.extend(fp.readlines()) 86 | return lines 87 | 88 | 89 | def add_deps(dep_path, deps, should_add): 90 | """Add dependences at {folder_path} to {deps}, excluding if {should_add} is True for any given dependency.""" 91 | if not os.path.isfile(dep_path): 92 | return 93 | 94 | with open(dep_path) as f: 95 | for line in f.readlines(): 96 | if should_add(line, deps): 97 | deps.append(line) 98 | deps.append("\n") # for good measure 99 | 100 | 101 | def extract_dep(csv_row): 102 | if len(csv_row) == 1: 103 | return csv_row[0] 104 | return "" 105 | 106 | 107 | def get_env_config(): 108 | ENV_FILE = "{}/.env".format(DIR_PATH) 109 | if os.path.exists(ENV_FILE): 110 | return "--env-file={}".format(ENV_FILE) 111 | else: 112 | return "--env-file={}".format("{}/.env.example".format(DIR_PATH)) 113 | --------------------------------------------------------------------------------
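The dependency-merging helpers in `util.py` above are easiest to see in use. The sketch below is illustrative only and simply re-exercises the behaviour already covered by `test/test_util.py`:

```python
from util import name_and_ver, should_add_pipdep, should_add_dockerline

# name_and_ver splits a pip requirement into its name and optional version
assert name_and_ver("numpy") == ("numpy", None)
assert name_and_ver("numpy==4.0") == ("numpy", "4.0")

# a pinned version is preferred over an unpinned one, and a higher pin wins
existing = ["pack1==2.0", "numpy==1.0", "blueray"]
assert should_add_pipdep("numpy==1.1", existing)      # newer pin is added
assert not should_add_pipdep("numpy", existing)       # unpinned duplicate is not
assert should_add_pipdep("blueray==0.1", existing)    # pin beats the unpinned entry

# Dockerfile lines are only appended once
assert should_add_dockerline("RUN it", ["RUN it"]) is False
```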