├── .dockerignore ├── .env.example ├── .gitattributes ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── commands.py ├── data ├── .gitignore └── demo │ ├── 1local │ ├── 1.txt │ ├── 2.md │ └── 3.jpg │ └── 2audio │ └── coffee.m4a ├── docs ├── commands.md ├── components │ └── youtube.md ├── custom-builds.md ├── custom-components.md ├── install.md ├── overview.md ├── testing.md ├── training-classifiers.md ├── tutorial │ ├── 1 │ │ ├── 1a.yaml │ │ ├── 1b.yaml │ │ ├── 1c.yaml │ │ └── README.md │ ├── 2 │ │ ├── 2a.yaml │ │ ├── 2b.yaml │ │ └── README.md │ └── 3 │ │ ├── 3a.yaml │ │ ├── 3b.yaml │ │ ├── 3c.yaml │ │ └── README.md └── updates │ ├── 2020.01.30.md │ ├── 2020.02.16.md │ ├── 2020.03.16.md │ └── 2020.11.22.md ├── example.blacklist.txt ├── examples ├── 4chan.yaml ├── classify.yaml ├── meta-test.yaml ├── pytorchfasterrcnn-test.yaml ├── ranking-test.yaml └── yolov5-test.yaml ├── media └── .gitignore ├── mtriage ├── requirements.txt ├── scripts ├── lint └── scaffold ├── src ├── build │ ├── core.end.Dockerfile │ ├── core.requirements.txt │ ├── core.start.Dockerfile │ ├── cpu-header.Dockerfile │ └── gpu-header.Dockerfile ├── conftest.py ├── lib │ ├── analysers │ │ ├── AnalysedFramesMeta │ │ │ ├── core.py │ │ │ └── info.yaml │ │ ├── ConvertAudio │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── partial.Dockerfile │ │ ├── ExtractAudio │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── partial.Dockerfile │ │ ├── ExtractTypes │ │ │ ├── core.py │ │ │ └── info.yaml │ │ ├── Flatten │ │ │ ├── core.py │ │ │ └── info.yaml │ │ ├── Frames │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── partial.Dockerfile │ │ ├── ImageDedup │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── requirements.txt │ │ ├── KerasPretrained │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── requirements.txt │ │ ├── ProtestsPretrained │ │ │ ├── core.py │ │ │ ├── image.jpg │ │ │ ├── info.yaml │ │ │ ├── partial.Dockerfile │ │ │ ├── requirements.txt │ │ │ ├── test.py │ │ │ └── utils.py │ │ ├── PytorchFasterRcnn │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── requirements.txt │ │ ├── Rank │ │ │ ├── core.py │ │ │ └── info.yaml │ │ ├── TorchHub │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ ├── partial.Dockerfile │ │ │ └── requirements.txt │ │ └── TwintToGephi │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── requirements.txt │ ├── common │ │ ├── __init__.py │ │ ├── analyser.py │ │ ├── etypes.py │ │ ├── exceptions.py │ │ ├── get.py │ │ ├── mtmodule.py │ │ ├── selector.py │ │ ├── storage.py │ │ └── util.py │ ├── etypes │ │ └── cvjson.py │ ├── selectors │ │ ├── FourChan │ │ │ ├── boards.py │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── requirements.txt │ │ ├── Local │ │ │ ├── core.py │ │ │ └── info.yaml │ │ ├── Twitter │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ └── partial.Dockerfile │ │ └── Youtube │ │ │ ├── core.py │ │ │ ├── info.yaml │ │ │ ├── partial.Dockerfile │ │ │ └── requirements.txt │ └── util │ │ ├── cvjson.py │ │ └── twint.py ├── run.py ├── test │ ├── README.md │ ├── __init__.py │ ├── etype_stubs │ │ └── image.jpeg │ ├── test_analyser.py │ ├── test_analyser_errors.py │ ├── test_etypes.py │ ├── test_get.py │ ├── test_infoyamls.py │ ├── test_integration.py │ ├── test_localstorage.py │ ├── test_mtmodule.py │ ├── test_run.py │ ├── test_selector.py │ ├── test_selector_errors.py │ └── utils.py └── validate.py ├── test ├── test_build.py └── test_util.py └── util.py /.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | requirements.txt 3 | media/**/* 4 | **/*/__pycache__ 5 | 
**/*.pyc 6 | 7 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | GOOGLE_API_KEY= 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.mkv filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # generic 2 | *.swp 3 | .DS_Store 4 | .ipynb_checkpoints/ 5 | __pycache__/ 6 | .pytest_cache/ 7 | .mypy* 8 | .vscode 9 | *.pyc 10 | 11 | # vision artefacts 12 | *.weights 13 | *.conv* 14 | 15 | # build artifacts 16 | build.Dockerfile 17 | build.requirements.txt 18 | 19 | # authentication files 20 | credentials/** 21 | .env 22 | 23 | # other data 24 | tags* 25 | logfile.log 26 | 27 | blacklists/** 28 | whitelists/** 29 | config/** 30 | 31 | data/demo/3video/dancingonmyown.mov 32 | 33 | data/demo/3video/info.json 34 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to mtriage 2 | 3 | Hi there! Thank you for taking the time to contribute to improving 4 | mtriage. This document is the right place to start. Read it thoroughly! 5 | 6 | ## What do I need to know to help? 7 | ### Python 8 | The majority of mtriage is written in Python. You'll be best placed to 9 | contribute if you're comfortable working with classes, decorators, etc., but 10 | don't worry if these terms are not familiar just yet! 11 | 12 | ### Docker 13 | Mtriage uses Docker containers to abstract dependencies from needing to be 14 | installed on the local host. It's not essential, but a good operational 15 | knowledge of Docker will be helpful. 16 | 17 | ## Do I need to be an experienced Python developer? 18 | Contributing can of course be about contributing code, but it can also take 19 | many other forms. A great amount of work that remains to be done to make 20 | mtriage a usable community tool doesn't involve writing any code. The following 21 | are just a few examples of other welcome contributions: 22 | 23 | - Writing, updating or correcting documentation. 24 | - Requesting a feature 25 | - Reporting a bug 26 | 27 | If you're new to this project and looking for a good problem to get started, 28 | you might want to check out the open issues that are tagged ["good first issue"](https://github.com/forensic-architecture/mtriage/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22). 29 | 30 | These are a range of issues that have come up in conversation and for which we 31 | would welcome community contributions. They are, however, by no means 32 | exhaustive! If you see a gap or have an idea, please open up an issue to 33 | discuss it with mtriage's maintainers. 34 | 35 | ## What parts of mtriage are being actively developed? 36 | You can learn about what we are currently working on by looking at the latest 37 | update. [Updates can be found here](docs/updates). 38 | 39 | ## How do I make a contribution? 40 | 1. Make sure you have a [GitHub account](https://github.com/signup/free) 41 | 2. Fork the repository on GitHub. This is necessary so that you can push your 42 | changes, as you can't do this directly on our repo. 43 | 3. 
Get set up with a local instance of mtriage. The easiest way to do this is 44 | by [following through the tutorial](https://github.com/forensic-architecture/mtriage/blob/main/docs/tutorial/1/README.md). 45 | 4. [Join our Discord server](https://discord.gg/PjHKHJD5KX). Here you'll be able 46 | to track commits that are actively being made across our projects; but more 47 | importantly it's where you can ask questions if something's not clear or 48 | not working as you expect. The #mtriage and #support channels are the two 49 | best places to ask questions about setting mtriage up, or how it works. 50 | 51 | Once you're set up with a local copy of mtriage, you can start modifying code 52 | and making changes. 53 | 54 | When you're ready to submit a contribution, you can do it by making a pull 55 | request from a branch on your forked copy of mtriage to this repository. You 56 | can do this with the following steps: 57 | 1. Push the changes to a remote repository. If the changes you have made 58 | address a bug, you should name your branch `bug/{briefdesc}`, where `{briefdesc}` is 59 | a hyphen-separated description of your change. If instead you are 60 | contributing changes as a feature request, name it `feature/{briefdesc}`. If 61 | in doubt, prefix your branch with `feature/`. 62 | 2. Submit a pull request to the `develop` branch of 63 | `forensic-architecture/mtriage` (not `main`!). 64 | 3. Wait for the pull request to be reviewed by a maintainer. 65 | 4. Make changes to the pull request if the reviewing maintainer recommends 66 | them. 67 | 5. Celebrate your success once your pull request is merged! 68 | 69 | ### How do I validate my changes? 70 | We are still working on a full set of tests, but there are some basic ones in 71 | place that need to pass before we can merge any contributions. 72 | 73 | Tests can be run with the following command: 74 | ``` 75 | ./mtriage dev test 76 | ``` 77 | 78 | All code must be formatted according to the 79 | [black](https://github.com/ambv/black) formatter. (CI builds will fail if code 80 | is not Black-formatted.) 81 | 82 | ## New components 83 | If you are contributing a new component (i.e. an analyser or a selector), 84 | ensure that your component lists the correct dependencies. You can do so by 85 | ensuring that it works in a [standalone custom build](./docs/custom-builds.md). 86 | 87 | 88 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Do No Harm License 2 | 3 | **Preamble** 4 | 5 | Most software today is developed with little to no thought of how it will be used, or the consequences for our society and planet. 6 | 7 | As software developers, we engineer the infrastructure of the 21st century. We recognise that our infrastructure has great power to shape the world and the lives of those we share it with, and we choose to consciously take responsibility for the social and environmental impacts of what we build. 8 | 9 | We envisage a world free from injustice, inequality, and the reckless destruction of lives and our planet. We reject slavery in all its forms, whether by force, indebtedness, or by algorithms that hack human vulnerabilities. We seek a world where humankind is at peace with our neighbours, nature, and ourselves. We want our work to enrich the physical, mental and spiritual wellbeing of all society. 
10 | 11 | We build software to further this vision of a just world, or at the very least, to not put that vision further from reach. 12 | 13 | **Terms** 14 | 15 | *Copyright* (c) 2019 Forensic Architecture. All rights reserved. 16 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 17 | 18 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 19 | 20 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 21 | 22 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 23 | 24 | 4. This software must not be used by any organisation, website, product or service that: 25 | 26 | a) lobbies for, promotes, or derives a majority of income from actions that support or contribute to: 27 | * sex trafficking 28 | * human trafficking 29 | * slavery 30 | * indentured servitude 31 | * gambling 32 | * tobacco 33 | * adversely addictive behaviours 34 | * nuclear energy 35 | * warfare 36 | * weapons manufacturing 37 | * war crimes 38 | * violence (except when required to protect public safety) 39 | * burning of forests 40 | * deforestation 41 | * hate speech or discrimination based on age, gender, gender identity, race, sexuality, religion, nationality 42 | 43 | b) lobbies against, or derives a majority of income from actions that discourage or frustrate: 44 | * peace 45 | * access to the rights set out in the Universal Declaration of Human Rights and the Convention on the Rights of the Child 46 | * peaceful assembly and association (including worker associations) 47 | * a safe environment or action to curtail the use of fossil fuels or prevent climate change 48 | * democratic processes 49 | 50 | 5. All redistribution of source code or binary form, including any modifications must be under these terms. You must inform recipients that the code is governed by these conditions, and how they can obtain a copy of this license. You may not attempt to alter the conditions of who may/may not use this software. 51 | 52 | We define: 53 | 54 | **Forests** to be 0.5 or more hectares of trees that were either planted more than 50 years ago or were not planted by humans or human made equipment. 55 | 56 | **Deforestation** to be the clearing, burning or destruction of 0.5 or more hectares of forests within a 1 year period. 57 | 58 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
59 | 60 | **Attribution** 61 | 62 | Do No Harm License [Contributor Covenant][homepage], (pre 1.0), 63 | available at https://github.com/raisely/NoHarm 64 | 65 | [homepage]: https://github.com/raisely/NoHarm 66 | 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mtriage 2 | 3 | [![Build Status](https://travis-ci.com/forensic-architecture/mtriage.svg?branch=master)](https://travis-ci.com/forensic-architecture/mtriage) 4 | 5 | ##### select, download, and analyse media 6 | 7 | mtriage is a command-line application to orchestrate complex scraping and 8 | analysis workflows. mtriage is developed at [Forensic Architecture](https://forensic-architecture.org), 9 | and is intended for use by open source research agencies, journalists, and 10 | activists. To learn more about why we developed mtriage, you can read [an 11 | overview of our reasons here](docs/overview.md). 12 | 13 | ## getting started 14 | 15 | First things first: follow the instructions to install mtriage: 16 | * [Install](docs/install.md) 17 | 18 | Once installed, the best way to get started with mtriage is to work through the 19 | three tutorials: 20 | * [1. Getting started](docs/tutorial/1/README.md) 21 | * [2. Chaining analysers](docs/tutorial/2/README.md) 22 | * [3. An end-to-end workflow](docs/tutorial/3/README.md) 23 | 24 | ## latest update 25 | Updates are posted irregularly, but you can get a sense of what's going on here 26 | by reading [the latest update](docs/updates/2020.11.22.md). 27 | 28 | ## supported components 29 | 30 | Below is a list of currently supported components. If you are interested in 31 | helping us to develop additional selectors and analysers, please consider 32 | joining [the conversation on Discord](https://discord.gg/FJ4XsCg). We're 33 | accepting PRs for new components, but the internal documentation leaves 34 | a little to be desired at the moment, so it's best to communicate with us directly on 35 | the #mtriage channel. 36 | 37 | ### selectors 38 | * Youtube - search by query with optional date range (time uploaded), download video and metadata. 39 | * Twitter - search by query, download tweets and images. 40 | * Local - use media that already exists on your filesystem. 41 | 42 | ### analysers 43 | * ConvertAudio - convert audio files between formats. 44 | * ExtractAudio - extract audio from a video. 45 | * ExtractTypes - extract elements that contain media with specified extensions. 46 | * Frames - extract frames from videos as images using ffmpeg. 47 | * ImageDedup - deduplicate images that are too similar using the 48 | [imagededup](https://github.com/idealo/imagededup) module. (Good to use 49 | after using 'Frames'.) 50 | * KerasPretrained - classify objects in images using [Resnet50 trained on 51 | ImageNet](https://resources.wolframcloud.com/NeuralNetRepository/resources/ResNet-50-Trained-on-ImageNet-Competition-Data). 52 | * Rank - generate a JSON file containing the rankings for videos classified 53 | with KerasPretrained. 
54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | **/* 2 | !.gitignore 3 | !demo/ 4 | -------------------------------------------------------------------------------- /data/demo/1local/1.txt: -------------------------------------------------------------------------------- 1 | This is a simple text file. 2 | -------------------------------------------------------------------------------- /data/demo/1local/2.md: -------------------------------------------------------------------------------- 1 | # Markdown example 2 | 3 | The __tiniest__ bit less simple than a txt file. 4 | -------------------------------------------------------------------------------- /data/demo/1local/3.jpg: -------------------------------------------------------------------------------- 1 | this is not a JPG but it's fine for testing 2 | -------------------------------------------------------------------------------- /data/demo/2audio/coffee.m4a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/forensic-architecture/mtriage/7a841241518f831766767d6ddaa6320b8de4be98/data/demo/2audio/coffee.m4a -------------------------------------------------------------------------------- /docs/commands.md: -------------------------------------------------------------------------------- 1 | ## commands 2 | 3 | ### `./mtriage run path/to/file.yaml` 4 | 5 | The primary command to trigger new mtriage workflows. Each run takes a YAML 6 | file that specifies which selectors and analysers to run (i.e. `./mtriage run 7 | examples/youtube.yaml`). See [examples folder](./examples) for examples of how 8 | to specify different analyser options. 9 | 10 | You can also pass the following flags to the run command: 11 | 12 | | flag | description | 13 | |-------|-------------| 14 | | `--gpu` | Run using the mtriage GPU image. This will speed up certain analysers that depend on it | 15 | | `--tag` | Allows you to run mtriage with a custom build by passing the name of the Docker image tag you used during the custom build (see below) | 16 | | `--dry` | Primarily for testing. Will not run any command, but instead return the command that will be run. | 17 | | `--dev` | Run using local code, to see changes in development. This will also bypass internal mtriage error handling, allowing you to see the origin of errors | 18 | 19 | ### `./mtriage dev build` 20 | 21 | The command to build an mtriage Docker image from source code. You won't need 22 | this unless you are developing mtriage, as the latest images are also on [Docker 23 | Hub](https://hub.docker.com/repository/docker/forensicarchitecture/mtriage). 24 | 25 | | flag | description | 26 | |-------|-------------| 27 | | `--gpu` | Build the GPU image. Will build the CPU image otherwise | 28 | | `--tag` | Give your build a custom tag. Will default to 'dev' or 'dev-gpu' | 29 | | `--blacklist` | Give build a path to a blacklist that lists which components to exclude. See [example.blacklist.txt](./example.blacklist.txt) for format. | 30 | | `--whitelist` | Give build a path to a whitelist that lists which components to include. | 31 | | `--dry` | Primarily for testing. Will not run any command, but instead return the command that will be run. | 32 | 33 | ### `./mtriage dev test` 34 | 35 | Run all mtriage tests. 
These run in two parts for the time being: one inside 36 | Docker, and one on your local Python installation. 37 | 38 | | flag | description | 39 | |-------|-------------| 40 | | `--verbose` | Run verbose tests, showing all print statements in the console. | 41 | | `--gpu` | Test the GPU image. Will build the CPU image otherwise | 42 | | `--tag` | Test with a custom tag. Will default to 'dev' or 'dev-gpu' | 43 | | `--dry` | Primarily for testing. Will not run any command, but instead return the command that will be run. | 44 | 45 | ### `./mtriage dev clean ` 46 | 47 | Remove all mtriage Docker containers, stopped or running. 48 | 49 | ### `./mtriage dev` 50 | 51 | Open a bash shell inside mtriage's Docker container. For debugging. 52 | 53 | | flag | description | 54 | |-------|-------------| 55 | | `--gpu` | Run the GPU image. Will run the CPU image otherwise | 56 | | `--tag` | Run with a custom tag. Will default to 'dev' or 'dev-gpu' | 57 | | `--dry` | Primarily for testing. Will not run any command, but instead return the command that will be run. | 58 | | `--yaml` | Pass a path to an mtriage YAML config to saturate the shell environment with its runtime parameters. (I.e. if you run `python run.py` from inside the src folder, it will use this YAML). | 59 | 60 | 61 | -------------------------------------------------------------------------------- /docs/components/youtube.md: -------------------------------------------------------------------------------- 1 | # Configuring the Youtube selector 2 | 3 | In order to run the Youtube selector, mtriage requires a Google Cloud Platform 4 | API key. 5 | 6 | 1. Create a new project in GCP, and in the [credentials 7 | page](https://console.cloud.google.com/apis/credentials), enable the 8 | 'Youtube Data V3' API. 9 | 2. Create a new API key, ensuring that it has access to the Youtube V3 API. 10 | 3. In the '.env' file in mtriage's root folder, add the line 11 | `GOOGLE_API_KEY=xxxxx`, replacing 'xxxxx' with your downloaded API key. 12 | -------------------------------------------------------------------------------- /docs/custom-builds.md: -------------------------------------------------------------------------------- 1 | # Custom Builds 2 | 3 | The default 'dev' and 'dev-gpu' mtriage images (available 4 | [here](https://cloud.docker.com/u/forensicarchitecture/repository/docker/forensicarchitecture/mtriage)) 5 | include dependencies for all selectors and all analysers. While this is useful 6 | for playing around with mtriage locally, as everything is already installed, it 7 | is unnecessarily weighty if you are trying to deploy mtriage, or use only some 8 | components. 9 | 10 | For this reason, it is possible to create custom mtriage builds through the 11 | `mtriage dev build` command. Without any additional flags, this command will 12 | build a Docker image with all dependencies for all components installed. (This 13 | is the command that is run on successful merges to master to create the Docker 14 | Hub image). 15 | 16 | To exclude the dependencies for certain modules, you can pass a blacklist.txt 17 | file via flag to the build command: 18 | ``` 19 | ./mtriage dev build --blacklist example.blacklist.txt 20 | ``` 21 | 22 | Modules specified in the blacklist will *not* be installed in the build. For 23 | example, if you wanted a build of mtriage with only dependencies for selectors 24 | installed, you could pass a blacklist that specified all analysers. 25 | 26 | You can also pass a whitelist with the `--whitelist` flag. 
27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/custom-components.md: -------------------------------------------------------------------------------- 1 | # Custom Components 2 | 3 | Components are the main way in which mtriage is intended to be extended. 4 | A custom component can either be a selector (to index and retrieve media to 5 | kick off an mtriage workflow) or an analyser (to process media in an mtriage 6 | workflow). 7 | 8 | Components currently sit within [src/lib/selectors](/src/lib/selectors) and 9 | [src/lib/analysers](/src/lib/analysers). Each component is self-contained 10 | (along with a listing of the dependencies it requires) inside a folder there. 11 | 12 | ### Testing Components in a Standalone Build 13 | 14 | If you are contributing a new analyser or selector, you should confirm that it 15 | runs without issues in a standalone build. Mtriage uses whitelists to allow the 16 | creation of standalone builds. Work through the following steps to create 17 | a custom build with your component: 18 | 19 | 1. Create a 'whitelist.txt' in the core mtriage directory, which contains 20 | a single line with the name of your new component. For example, if your 21 | component is called 'MyCustomComponent', your whitelist would look like 22 | this: 23 | ``` 24 | MyCustomComponent 25 | ``` 26 | 2. Create the custom mtriage image with solely your component with the 27 | following command: 28 | ``` 29 | ./mtriage dev build --tag mycustomcomponent --whitelist whitelist.txt 30 | ``` 31 | 3. Test the running of your component with the following command: 32 | ``` 33 | ./mtriage run path/to/config.yml --tag mycustomcomponent --dev 34 | ``` 35 | 36 | Please note that mtriage is still in a very early stage of development, but we 37 | will keep updating this document as the code changes. 38 | 39 | Thanks again for your interest and for your future contributions! 40 | 41 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | mtriage is currently in active development, and is not yet packaged in any way. 4 | It uses [Docker](https://www.docker.com/products/docker-desktop) to manage 5 | dependencies, which you will need to download to ensure mtriage works as 6 | expected. 7 | 8 | Follow the instructions relevant to your operating system to install Docker CE 9 | or Docker Desktop (Mac installation [here](https://docs.docker.com/v17.12/docker-for-mac/install/), 10 | Ubuntu installation [here](https://docs.docker.com/v17.12/install/linux/docker-ce/ubuntu/)). 11 | If you have a CUDA GPU, you can use [Nvidia Docker](https://github.com/NVIDIA/nvidia-docker) 12 | instead of Docker to make certain analysers more performant. 13 | 14 | NOTE (05/2023): if you are on Apple Silicon, your machine will not be able to natively run FA's docker images. In order to fix this, you'll need to [enable virtualization](https://collabnix.com/warning-the-requested-images-platform-linux-amd64-does-not-match-the-detected-host-platform-linux-arm64-v8/) by changing some settings in Docker Desktop. Navigate to Settings > General and make sure the "Use Virtualization framework" box is checked. Afterwards, navigate to Settings > Features in development and check the "Use Rosetta for x86/amd64 emulation on Apple Silicon" box. If you're well-versed in Docker, you can set the 'platform' flag to 'linux/amd64' in the Dockerfile. 
If not, the easiest solution is to modify your personal ~/.bashrc or ~/.zshrc file and add ``export DOCKER_DEFAULT_PLATFORM=linux/amd64`` to it. 15 | 16 | You also need to ensure that [Python 3](https://www.python.org/downloads/) is installed on your computer. Most modern operating systems have a version installed by default. Mtriage will _probably_ work with Python 2.x as well, but it's untested. 17 | 18 | Once you have Docker and Python installed, you can clone the source code and 19 | install the requirements (the only runtime dependency is [pyyaml](https://pyyaml.org/)). 20 | 21 | ```bash 22 | git clone https://github.com/forensic-architecture/mtriage.git 23 | pip3 install -r requirements.txt 24 | ``` 25 | 26 | ### additional setup 27 | Run the test suite to ensure that everything is working. This command may take 28 | a while, as the first time you run mtriage it will download the [latest Docker 29 | image](https://hub.docker.com/r/forensicarchitecture/mtriage). Mtriage commands will run much faster after this first one: 30 | 31 | ```bash 32 | ./mtriage dev test 33 | ``` 34 | 35 | Depending on what components you intend to use, there may be additional setup 36 | required. Check the [component docs folder](/docs/components) before using an 37 | analyser or if you run into an authentication or setup issue. 38 | 39 | Assuming this command completed and all the tests passed, you are now ready to 40 | run mtriage workflows! 41 | -------------------------------------------------------------------------------- /docs/overview.md: -------------------------------------------------------------------------------- 1 | # Why Mtriage? 2 | 3 | Recent advances in deep learning make it a very powerful technique when 4 | analysing visual and audio media. The state of the art in object detection in 5 | images performs comparably to humans, and the recognition of speech and other 6 | audio signatures is also impressively effective. Due to these capabilities, 7 | deep learning has the potential to dramatically affect the scale on which human 8 | rights organisations can track and monitor weapons, trade, and other objects 9 | that signify possible human rights abuses. 10 | 11 | In practice, however, using machine learning in human rights research is 12 | difficult. The state of tooling is such that it is difficult to use for anyone 13 | who does not have a background in software development. Even if the simple aim 14 | is to run a pretrained classifier for object detection on an image, there is 15 | often a lot of installation pain and indirection in online resources. On top of 16 | this, to deploy classifiers at scale, analysing thousands of videos rather than 17 | just one image, a lot of custom plumbing is required. Human rights researchers 18 | often do not have the resources to employ an in-house software developer for 19 | this plumbing, which effectively means that human rights research rarely uses 20 | machine learning. At best, it is limited to a few organisations who have the 21 | technical resources to deploy custom software infrastructure, or who can partner 22 | with data science firms to do so. 23 | 24 | We developed mtriage to address the insufficiency in machine learning tooling 25 | for human rights research, with the hope that it can democratise the use of 26 | machine learning-- and also other more advanced computational analytic 27 | techniques. 
In the first instance, it provides both pretrained object detection 28 | classifiers and the means to use them to analyse public domain media. 29 | Mtriage is structured modularly: we intend to add new classifiers, and to 30 | support new sources and kinds of public domain media, as we develop these 31 | capabilities for ongoing and future Forensic Architecture investigations. 32 | 33 | Mtriage is open source and in active development. This means that everyone can 34 | not only use mtriage in their own research, but also that community 35 | contributions (of a new classifier, or a new media source) can potentially be 36 | made available to all other users as upstream contributions. 37 | 38 | To get started with mtriage, check out [the first tutorial](/docs/tutorial/1/README.md). 39 | -------------------------------------------------------------------------------- /docs/testing.md: -------------------------------------------------------------------------------- 1 | # Testing 2 | 3 | Mtriage has three kinds of tests: 4 | 1. Tests for the core code that runs inside Docker (in src/test). 5 | 2. Tests for the outer orchestration logic (in test/). 6 | 3. Tests for analysers and selectors (in each component folder, in test/). 7 | 8 | Each kind of test is run with appropriate containerisation given its context, 9 | i.e. tests of type 1 are run inside Docker, whereas tests of type 2 are run 10 | using the locally installed Python environment. 11 | 12 | To run all tests, use the following command: 13 | ``` 14 | ./mtriage dev test 15 | ``` 16 | See [docs/custom-components.md](./custom-components.md) for more information on 17 | how to test a new component. 18 | -------------------------------------------------------------------------------- /docs/training-classifiers.md: -------------------------------------------------------------------------------- 1 | ## Training Classifiers 2 | 3 | All the classifiers supported by our code are trained on the 1000-class ImageNet dataset by default. If you want to see the labels available, please refer to [Classify-read-me](docs/Classify-Read-Me.md). If you would like to train the classifiers on other data sets, here you can find a list of some existing options: 4 | 5 | 6 | #### Supervisely 7 | The most straightforward method for creating models is to use the platform [supervisely](https://supervise.ly/). This platform provides a way to annotate data, prepare a synthetic data set, train models, and download them without ever having to bring up an ipython notebook. 8 | 9 | (nb: supervisely is a web platform, but needs cloud configuration or CUDA hardware. TODO) 10 | (Q: should I be explaining how to use supervisely in more detail? I have been reading about it but not sure) 11 | 12 | #### Tensorflow 13 | Another way to train models is to use the process documented in [tensorflow for poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/index.html?index=..%2F..%2Findex#0). This is a great way to train a basic image classifier with various categories, which doesn't require any bounding box annotation (you label the training set by putting images in appropriate folders). 14 | 15 | #### Keras 16 | The third and perhaps most flexible way to train models is using [keras](https://keras.io/), and generalising the methodology from [this excellent 11-part series on keras and python](https://pythonprogramming.net/loading-custom-data-deep-learning-python-tensorflow-keras/). 
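To make that approach concrete, here is a minimal, hypothetical transfer-learning sketch. It is not part of mtriage; it assumes TensorFlow 2.x with Keras, a folder-per-class dataset under a placeholder path such as `data/train/<label>/*.jpg`, and an arbitrary class count.

```python
# Hypothetical fine-tuning sketch -- paths, class count and hyperparameters
# are placeholders, and none of this is part of mtriage itself.
import tensorflow as tf
from tensorflow.keras import layers, models

NUM_CLASSES = 2  # e.g. "tanks" vs "other"

# expects a folder-per-class layout, e.g. data/train/tanks/*.jpg
train_ds = tf.keras.utils.image_dataset_from_directory(
    "data/train", image_size=(224, 224), batch_size=32
)

# pretrained ImageNet backbone, used as a frozen feature extractor
base = tf.keras.applications.ResNet50(
    include_top=False, weights="imagenet", pooling="avg"
)
base.trainable = False  # start by training only the new classification head

model = models.Sequential([
    layers.Lambda(tf.keras.applications.resnet50.preprocess_input),
    base,
    layers.Dense(NUM_CLASSES, activation="softmax"),
])
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.fit(train_ds, epochs=5)
model.save("my_classifier.h5")
```

Unfreezing some of the backbone's layers and training for a few more epochs at a lower learning rate is the usual next step once the head has converged.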
17 | 18 | ### Installation 19 | In the [tensorflow](/tensorflow) directory there are scripts to set up a ready-made environment in [Docker](https://www.docker.com/) for training models using tensorflow and keras. You can also refer to the [README](docs/README.md) for a brief explanation of how to install and use Docker. 20 | 21 | ```bash 22 | cd tensorflow [Q: is this directory supposed to be one of the folders that are included in the github download? I don't see any directory called tensorflow at the moment; alternatively, are they supposed to download tensorflow manually?] 23 | sh setup.sh # downloads tensorflow models and builds Docker image locally 24 | sh run.sh # starts Docker container with appropriate volume/port mapping 25 | ``` 26 | Visit [http://localhost:8080](http://localhost:8080) and use the token displayed in the console after running the last command. Tensorboard is also available at [http://localhost:6006](http://localhost:6006) 27 | 28 | 29 | ### training data 30 | #### google images 31 | the excellent CLI `google_images_download` is part of the Pipfile. to retrieve more than 100 images at a time, you need to download [chromedriver](https://sites.google.com/a/chromium.org/chromedriver/downloads), unzip, and pass the appropriate path to the binary. the suggestion is to put the binary in `/usr/local/bin`, and then you can copy and paste the following command to download images for a search: 32 | 33 | ```bash 34 | googleimagesdownload --keywords "tanks" --limit 1000 --chromedriver /usr/local/bin/chromedriver 35 | ``` 36 | 37 | another handy tool is [findimagedupes](https://gitlab.com/opennota/findimagedupes), especially if you are creating datasets by interweaving google searches (which will inevitably have overlapping images returned). 38 | -------------------------------------------------------------------------------- /docs/tutorial/1/1a.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/1 2 | select: 3 | name: Local 4 | config: 5 | source: data/demo/1local 6 | # aggregate: true 7 | -------------------------------------------------------------------------------- /docs/tutorial/1/1b.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/1 2 | elements_in: 3 | - Local 4 | analyse: 5 | name: ExtractTypes 6 | config: 7 | exts: 8 | - txt 9 | - md 10 | -------------------------------------------------------------------------------- /docs/tutorial/1/1c.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/1 2 | select: 3 | name: Local 4 | config: 5 | source: data/demo/1local 6 | analyse: 7 | name: ExtractTypes 8 | config: 9 | exts: 10 | - txt 11 | - md 12 | -------------------------------------------------------------------------------- /docs/tutorial/1/README.md: -------------------------------------------------------------------------------- 1 | # 1a. Working with selectors 2 | 3 | Mtriage workflows are orchestrated using YAML files. These config files 4 | indicate the components used to select and/or process media. Most mtriage YAML 5 | files are very simple, and mostly consist of configuration specific to the 6 | components being run. 
For example, here is the config for the run we'll 7 | do in a second: 8 | 9 | ```yaml 10 | folder: media/demo_official/1 11 | select: 12 | name: Local 13 | config: 14 | source: data/demo/1local 15 | # aggregate: true 16 | ``` 17 | 18 | 19 | In order to analyse media with mtriage, we first need to 'select' that media 20 | from somewhere. Selectors designate and index a 'media space', and then 21 | download the relevant media in that space as local mtriage elements (elements 22 | are essentially folders). In this example we'll use the 23 | [Local](../src/lib/selectors/Local) selector, which simply selects from media 24 | already on your computer's file system. 25 | 26 | Let's try running the config: 27 | 28 | ``` 29 | ./mtriage run docs/tutorial/1/1a.yaml 30 | ``` 31 | 32 | You should see the following logs: 33 | 34 | ``` 35 | Local: index: Indexing local folder... 36 | Local: index: indexed file: 1.txt 37 | Local: index: indexed file: 3.jpg 38 | Local: index: indexed file: 2.md 39 | ``` 40 | 41 | If you look in media/demo_official/1/Local/data, you'll see the three folders, 42 | each containing one of the indexed media, as well as an 'element_map.csv'. You 43 | won't normally need to look carefully at the structure of the folders 44 | mtriage produces, but it's helpful to have a look to get an idea of how things 45 | are working under the hood. 46 | 47 | As a quick primer, mtriage works by formatting media as 'elements', which in 48 | this case are represented simply as folders on disk. (Later we'll see that we 49 | can store elements remotely, as well.) Selectors work by indexing media, and 50 | then retrieving that media and storing them as elements. This prepares media to 51 | be processed using an Analyser, which takes elements as input and produces 52 | elements as output. The 'element_map.csv' is a listing that mtriage uses 53 | internally. 54 | 55 | # 1b. Working with analysers 56 | 57 | Now that we've selected some elements, let's get to analysing them. We're going 58 | to use the very straightforward 'ExtractTypes' analyser, which simply extracts 59 | elements that have media with particular types. Here's the config: 60 | 61 | ```yaml 62 | folder: media/demo_official/1 63 | elements_in: 64 | - Local 65 | analyse: 66 | name: ExtractTypes 67 | config: 68 | exts: 69 | - txt 70 | - md 71 | ``` 72 | 73 | The first line here indicates that we are working with the elements in the 74 | folder 'media/demo_official/1'. The `elements_in` attribute indicates which 75 | elements we want to process, which we specify via __the name of the selector we 76 | used to produce them__. All workflows in mtriage are contained by a base 77 | selector in this way. If we had used multiple selectors to index and retrieve 78 | media, we could add extra line items in the `elements_in` array to indicate we 79 | want to use them as well. 80 | 81 | The `analyse` attribute indicates which analyser we want to use, and the 82 | configuration we want to use for the analyser. The 'ExtractTypes' analyser 83 | receives an array of extensions (`exts`) that represents a whitelist of the 84 | media types we want to extract. 85 | 86 | 87 | Let's run this config and take a look at the result: 88 | 89 | ``` 90 | ./mtriage run docs/tutorial/1/1b.yaml 91 | ``` 92 | 93 | We should see the following logs: 94 | 95 | ``` 96 | ExtractTypes: None: Running in parallel 97 | ExtractTypes: analyse: Extracting element 1 with paths: ['1.txt'] 98 | ExtractTypes: analyse: No extracted media in element 3. 
99 | ExtractTypes: analyse: Extracting element 2 with paths: ['2.md'] 100 | ``` 101 | 102 | As we can see, the analyser has extracted the two elements with media that have 103 | matching extensions, and skipped over element 3 (which contains '3.jpg'). The 104 | first logged line is an important aspect of mtriage's value add: it runs these 105 | operations in parallel, across as many CPUs as are available on your computer. 106 | 107 | # 1c. Putting it all together 108 | 109 | We can put both selection and analysis together in a single config, as follows: 110 | 111 | ```yaml 112 | folder: media/demo_official/1 113 | select: 114 | name: Local 115 | config: 116 | source: data/demo/1local 117 | analyse: 118 | name: ExtractTypes 119 | config: 120 | exts: 121 | - txt 122 | - md 123 | ``` 124 | 125 | And run it with: 126 | 127 | ``` 128 | ./mtriage run docs/tutorial/1/1c.yaml 129 | ``` 130 | 131 | Of course, this particular workflow isn't very useful at all, but hopefully you 132 | are already beginning to see how we can use mtriage to orchestrate much more 133 | meaningful and powerful media workflows. In the next tutorial, we'll use 134 | mtriage to reformat audio files. 135 | 136 | [Go to tutorial 2](/docs/tutorial/2/README.md) 137 | -------------------------------------------------------------------------------- /docs/tutorial/2/2a.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/2 2 | select: 3 | name: Local 4 | config: 5 | source: data/demo/2audio 6 | analyse: 7 | name: ConvertAudio 8 | config: 9 | output_ext: mp3 10 | -------------------------------------------------------------------------------- /docs/tutorial/2/2b.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/2 2 | select: 3 | name: Local 4 | config: 5 | source: data/demo/2audio 6 | analyse: 7 | - name: ConvertAudio 8 | config: 9 | # in_parallel: no 10 | output_ext: mp3 11 | - name: ConvertAudio 12 | config: 13 | # in_parallel: no 14 | output_ext: aac 15 | -------------------------------------------------------------------------------- /docs/tutorial/2/README.md: -------------------------------------------------------------------------------- 1 | # 2a. An audio workflow 2 | 3 | Now that we're familiar with selectors and analysers in principle, let's run 4 | a couple of workflows to get a sense for mtriage's flexibility. Here's a config 5 | that selects a generic audio file using Local, and then converts it to a 6 | specific extension, mp3: 7 | 8 | ```yaml 9 | folder: media/demo_official/2 10 | select: 11 | name: Local 12 | config: 13 | source: data/demo/2audio 14 | analyse: 15 | name: ConvertAudio 16 | config: 17 | output_ext: mp3 18 | ``` 19 | 20 | Let's run it: 21 | 22 | ``` 23 | ./mtriage run docs/tutorial/2/2a.yaml 24 | ``` 25 | 26 | You should see the following output: 27 | 28 | ``` 29 | Local: index: Indexing local folder... 30 | Local: index: indexed file: coffee.m4a 31 | ConvertAudio: None: Running in parallel 32 | ConvertAudio: analyse: Converted 'coffee' from .m4a to .mp3 33 | ``` 34 | 35 | Try creating a different folder in the 'data' folder with several different 36 | audio files, modifying the `source` attribute in the config to point to it, and 37 | running this updated config. We're now starting to get a sense of how mtriage 38 | is useful to scale up simple media analysis in parallel for bulk processing. 39 | 40 | # 2b. 
Chaining analysers 41 | 42 | What makes mtriage really useful for constructing workflows is the ability to 43 | chain different analysers together. The Etype system tells us something about 44 | the inputs and outputs of each analyser, and with this information we can 45 | reliably string analysers together to do successive analysis. 46 | 47 | ```yaml 48 | folder: media/demo_official/2 49 | select: 50 | name: Local 51 | config: 52 | source: data/demo/2audio 53 | analyse: 54 | - name: ConvertAudio 55 | config: 56 | output_ext: mp3 57 | - name: ConvertAudio 58 | config: 59 | output_ext: aac 60 | ``` 61 | 62 | Say we wanted to convert an audio file to two different output formats. We can 63 | do it by specifying an analysis chain with two ConvertAudio parts. Let's run 64 | this config: 65 | 66 | ``` 67 | ./mtriage run docs/tutorial/2/2b.yaml 68 | ``` 69 | 70 | We'll get the following: 71 | 72 | ``` 73 | Local: index: Indexing local folder... 74 | Local: index: indexed file: coffee.m4a 75 | ConvertAudio: None: Running in parallel 76 | ConvertAudio: analyse: Converted 'coffee' from .m4a to .mp3 77 | ConvertAudio: None: Running in parallel 78 | ConvertAudio: analyse: Converted 'coffee' from .mp3 to .aac 79 | ``` 80 | 81 | Mtriage runs this config in the order that it's specified: selecting media with 82 | the Local selector, using ConvertAudio to convert this selected media to mp3, 83 | and then converting that media (the mp3 file) to aac, using ConvertAudio with 84 | a different configuration. 85 | 86 | When mtriage runs analysers in a chain, it keeps the intermediary results by 87 | default. Therefore this config works to produce the two audio versions of the 88 | source file in which we are interested. In tutorial 3, we'll see how to 89 | conveniently visualise the results of mtriage workflows with 90 | [mtriage-viewer](https://github.com/forensic-architecture/mtriage-viewer). 91 | 92 | As we're only converting audio from one file here, it doesn't make sense to run 93 | analysis in parallel. (As soon as there are as many elements being analysed as 94 | there are CPUs available, however, it does make sense, which is why mtriage 95 | runs in parallel by default.) We can easily run analysis serially by setting 96 | `in_parallel` to false in an analyser's config: 97 | 98 | ```yaml 99 | folder: media/demo_official/2 100 | select: 101 | name: Local 102 | config: 103 | source: data/demo/2audio 104 | analyse: 105 | - name: ConvertAudio 106 | config: 107 | in_parallel: no 108 | output_ext: mp3 109 | - name: ConvertAudio 110 | config: 111 | in_parallel: no 112 | output_ext: aac 113 | ``` 114 | 115 | Try uncommenting the relevant lines with `in_parallel` in 116 | docs/tutorial/2/2b.yaml, and running the config again. You should see 117 | a different log line indicating that mtriage is running analysis serially. 118 | 119 | In the next tutorial, we'll work with the Youtube selector to analyse videos 120 | that are selected using Youtube's search API, showing the full power and 121 | extensibility of mtriage. 
122 | 123 | [Go to tutorial 3](/docs/tutorial/3/README.md) 124 | -------------------------------------------------------------------------------- /docs/tutorial/3/3a.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/3 2 | select: 3 | name: Youtube 4 | config: 5 | search_term: Tear gas 6 | uploaded_before: "2015-10-02T00:00:00Z" 7 | uploaded_after: "2015-10-01T00:00:00Z" 8 | -------------------------------------------------------------------------------- /docs/tutorial/3/3b.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/3 2 | elements_in: 3 | - Youtube 4 | analyse: 5 | - name: Frames 6 | - name: KerasPretrained 7 | config: 8 | in_parallel: false 9 | model: ResNet50 10 | labels: 11 | - tank 12 | - rifle 13 | - military uniform 14 | -------------------------------------------------------------------------------- /docs/tutorial/3/3c.yaml: -------------------------------------------------------------------------------- 1 | folder: media/demo_official/3c 2 | select: 3 | name: Youtube 4 | config: 5 | search_term: tear gas + mexico 6 | uploaded_before: "2018-11-30T00:00:00Z" 7 | uploaded_after: "2018-11-15T00:00:00Z" 8 | analyse: 9 | - name: Frames 10 | - name: ImageDedup 11 | config: 12 | threshold: 3 13 | method: dhash 14 | - name: KerasPretrained 15 | config: 16 | model: ResNet50 17 | labels: 18 | - tank 19 | - rifle 20 | - military uniform 21 | -------------------------------------------------------------------------------- /docs/tutorial/3/README.md: -------------------------------------------------------------------------------- 1 | # 3a. Selecting media with Youtube 2 | 3 | The Youtube selector uses [Youtube's Data API](https://developers.google.com/youtube/v3) 4 | to find videos uploaded between certain dates using a search term. This API 5 | requires an API key, which is free to get. Follow the instructions in [the 6 | documentation](/docs/components/youtube.md), adding the API key in a line in 7 | the .env file at the root of your mtriage folder. 8 | 9 | With the API key in our mtriage environment, we can run the following config to 10 | select some videos from youtube: 11 | 12 | ```yaml 13 | folder: media/demo_official/3 14 | select: 15 | name: Youtube 16 | config: 17 | search_term: Tear gas 18 | uploaded_before: "2015-10-02T00:00:00Z" 19 | uploaded_after: "2015-10-01T00:00:00Z" 20 | ``` 21 | 22 | Let's run it: 23 | 24 | ``` 25 | ./mtriage run docs/tutorial/3/3a.yaml 26 | ``` 27 | 28 | The Youtube selector indexes videos by making an API call, and then downloads 29 | the videos in parallel according to however many CPU cores your computer has 30 | available. By default, it downloads the videos at very low quality, and it also 31 | retrieves a 'meta.json' file regarding the video's provenance and other meta 32 | information. 33 | 34 | # 3b. Image classification with KerasPretrained 35 | 36 | Let's now classify the frames in the videos that we've downloaded using image 37 | classifiers that have been pretrained on the labels in the 38 | [ImageNet](http://www.image-net.org/) database. We'll do so using a neural net 39 | architecture called [ResNet](https://arxiv.org/abs/1512.03385), which is 40 | a state-of-the-art architecture for image classification. 
We'll give the 41 | KerasPretrained analyser the three labels we're interested in--tank, rifle, and 42 | military uniform--to indicate that we want to predict the appearance of these 43 | objects in the videos' frames. 44 | 45 | ```yaml 46 | folder: media/demo_official/3 47 | elements_in: 48 | - Youtube 49 | analyse: 50 | - name: Frames 51 | - name: KerasPretrained 52 | config: 53 | model: ResNet50 54 | labels: 55 | - tank 56 | - rifle 57 | - military uniform 58 | ``` 59 | 60 | Note that the first time you run this config, it will download the pretrained 61 | weights for ResNet, which is a file ~100MB in size (this download only happens 62 | once): 63 | 64 | ``` 65 | ./mtriage run docs/tutorial/3/3b.yaml 66 | ``` 67 | 68 | # 3c. A complete mtriage workflow 69 | 70 | Now that we've tested the parts, let's put this all together in a single 71 | workflow, and broaden the media space slightly: 72 | 73 | ```yaml 74 | folder: media/demo_official/3c 75 | select: 76 | name: Youtube 77 | config: 78 | search_term: tear gas + mexico 79 | uploaded_before: "2018-11-30T00:00:00Z" 80 | uploaded_after: "2018-11-15T00:00:00Z" 81 | analyse: 82 | - name: Frames 83 | - name: ImageDedup 84 | config: 85 | threshold: 3 86 | method: dhash 87 | - name: KerasPretrained 88 | config: 89 | model: ResNet50 90 | labels: 91 | - tank 92 | - rifle 93 | - military uniform 94 | - name: Rank 95 | ``` 96 | 97 | 98 | In this config, we select videos uploaded between the 15th and 30th of November 99 | in 2018 that match both "tear gas" and "mexico" in Youtube's search API. Once 100 | downloaded, we split each video into frames, deduplicate similar images using 101 | [dhash](https://github.com/maccman/dhash), classify deduplicated frames using 102 | ResNet, and then create an additional JSON that ranks the classified videos 103 | according to the number of positive frames they contain (using the `Rank` analyser). 104 | 105 | That's a fair bit of computational work. Go and grab a beverage while this 106 | command runs to completion, if you like: 107 | 108 | ``` 109 | ./mtriage run docs/tutorial/3/3c.yaml 110 | ``` 111 | Once it's finished, take a look at the files that the workflow has produced in 112 | the media/demo_official/3c folder. You should see everything in a 'Youtube' 113 | folder (as you may recall, mtriage runs are organised internally by selector), 114 | and then most of the created media in a 'derived' folder inside that. 115 | 116 | You're officially finished with the mtriage tutorial. If you want to work 117 | through the media mtriage has just analysed using a frontend interface, 118 | however, as we do here at [Forensic Architecture](https://forensic-architecture.org), 119 | head over to our [mtriage-viewer](https://github.com/forensic-architecture/mtriage-viewer) 120 | and follow the instructions there! 121 | -------------------------------------------------------------------------------- /docs/updates/2020.01.30.md: -------------------------------------------------------------------------------- 1 | # Introducing Development Cycles 2 | 3 | As of 2020, I'm aiming to keep mtriage development to a regular, two-week 4 | release cycle. 5 | 6 | At the start of each cycle, I'll put some issues on the [release cycle project 7 | board](https://github.com/forensic-architecture/mtriage/projects/1), and then 8 | merge them into the [release](https://github.com/forensic-architecture/mtriage/tree/release) 9 | branch as the code is written and reviewed. 
At the end of each cycle, I'll 10 | merge the release branch into the [master](https://github.com/forensic-architecture/mtriage/tree/master) 11 | branch, and then this goes on wash-rinse-repeat every two weeks. 12 | 13 | 14 | 15 | ## Status 16 | Mtriage is currently a tool that we are using internally at [Forensic 17 | Architecture](https://forensic-architecture.org) primarily to orchestrate one 18 | particular workflow, the deployment of computer vision classifiers on public 19 | domain images and video. [This post](https://forensic-architecture.org/investigation/cv-in-triple-chaser) 20 | goes into greater detail about how that workflow works. 21 | 22 | Mtriage's main development over at least the next few months will be aimed at 23 | consolidating its use in this particular capacity. I know that there is some 24 | use of mtriage as an orchestration tool for other workflows, e.g. to create 25 | labelled datasets for machine learning, and these are definitely uses that we 26 | are interested in discussing and supporting in the future. Mtriage is intended 27 | as a tool to orchestrate computational workflows beyond just the initial 28 | application in computer vision, but we need to refine that one first before 29 | moving on to others. 30 | 31 | ## Cycle 1 32 | 33 | The first cycle will begin February 1st, and end on February 15th. It will 34 | focus on parallelising both selectors and analysers, and making the Etype 35 | system more flexible. 36 | -------------------------------------------------------------------------------- /docs/updates/2020.02.16.md: -------------------------------------------------------------------------------- 1 | # Parallelisation and carrying 2 | 3 | The past two weeks have seen two additions to mtriage: 4 | 1. Parallelisation by default of `Analyser.analyse` and `Selector.retrieve`. 5 | 2. A generic and optional 'carry' flag that can be passed via analyser config 6 | to copy files from an element's base folder to its destination. 7 | 8 | ### [#122: Parallelisation](https://github.com/forensic-architecture/mtriage/pull/122) 9 | Huge thanks to @ivansafrin for the major part of this PR. In my mind, 10 | parallelising the two major computationally intensive operations, 11 | `retrieve` for the selecting phase and `analyse` for the analysis phase, adds 12 | a real reason to adopt mtriage as a framework, rather than writing your own 13 | custom scripts. 14 | 15 | 16 | Applying `retrieve_element` from a selector, or `analyse_element` from an analyser 17 | is, because of the way mtriage is conceived, always self-contained, and 18 | therefore easy to parallelise. The idea of an element as a folder that contains 19 | a set of similarly typed media is the geist of mtriage as a framework. 20 | Selectors are functions that create elements, and analysers are functions 21 | that process them (to create new elements). 22 | 23 | When looking to apply computational logic to media at scale, packing media into 24 | elements through mtriage allows developers to focus on the important and 25 | innovative logic that is being applied, and forget about the redundant code 26 | that reads and writes files in for loops. 27 | 28 | Parallelising these operations means that now, not only does mtriage take the 29 | burden of necessary redundancy from the developer, it also does so in a way 30 | that enables code to run a lot more efficiently across multiple CPUs. This is 31 | a huge boon for us at FA. 
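To illustrate why per-element functions parallelise so cleanly, here is a simplified sketch (not mtriage's actual implementation, which lives in mtmodule.py): a self-contained element-wise function can simply be mapped over a pool of worker processes.

```python
# Simplified sketch, not mtriage's actual implementation: each element is
# processed independently, so element-wise work maps cleanly onto a pool
# of worker processes.
from multiprocessing import Pool, cpu_count


def analyse_element(element_id: str) -> str:
    # placeholder for per-element work: decode media, run a classifier, etc.
    return f"analysed {element_id}"


if __name__ == "__main__":
    element_ids = ["el-1", "el-2", "el-3", "el-4"]
    with Pool(processes=cpu_count()) as pool:
        results = pool.map(analyse_element, element_ids)
    print(results)
```

Because no element's result depends on any other element, there is no coordination to worry about beyond collecting the outputs.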
32 | 
33 | 
34 | ### [#140: Adding the 'carry' option](https://github.com/forensic-architecture/mtriage/pull/140)
35 | This is a continuation of work I had been doing, before introducing these
36 | updates, to make mtriage's type system more flexible, and so less coupled to its
37 | inaugural (computer vision) workflow.
38 | 
39 | Prior to this PR, if an analyser further down the chain in a workflow needed a
40 | file in a selector's original element, the first analysers had to encode that
41 | logic in their `analyse_element` function, and copy the files over.
42 | 
43 | Not only did this weaken analyser encapsulation, it also meant that analysers
44 | tended towards convoluted out types such as `JsonAnnotatedImageArray`.
45 | 
46 | The carry flag solves both of these problems by offloading the work and
47 | specification of copying files during analysis to mtriage config, which makes
48 | a lot more sense than baking it into analysers themselves.
49 | 
50 | ## Cycle 2
51 | 
52 | This cycle begins February 16th, and will end on February 29th. There have been
53 | a [couple](https://github.com/forensic-architecture/mtriage/pull/139) of [community contributions](https://github.com/forensic-architecture/mtriage/pull/135) that I am
54 | looking to merge. Otherwise, this cycle will focus on improving developer
55 | experience in general, and on writing templates and documentation for creating
56 | new components (analysers and selectors) in particular, as well as fixing some
57 | critical bugs in the Youtube selector.
58 | 
--------------------------------------------------------------------------------
/docs/updates/2020.03.16.md:
--------------------------------------------------------------------------------
1 | # Abstract storage, revamped component API, more robust etypes
2 | 
3 | No update was posted at the end of the last cycle, and so this update covers the last four
4 | weeks of development. Essentially what happened is that I started implementing
5 | a significant feature, abstract storage, and it ended up in a PR that touches
6 | most internals, cleaning up the component API (how analysers and selectors are
7 | written), and changing a significant number of tests. I'll cover all the
8 | changes as comprehensively as I can in the rest of this update.
9 | 
10 | ## Abstract storage
11 | Before this cycle, storage in mtriage was hardcoded to use the filesystem of
12 | the computer on which mtriage was running (written to a Linux filesystem inside
13 | a Docker container, and to the disk on the host system via volumes configured
14 | in the outer layer of mtriage). As we are looking to move to cloud deployments
15 | of mtriage, abstract storage--the option to store media produced by
16 | mtriage either locally, or elsewhere--is crucial.
17 | 
18 | Calls to read and write files were scattered throughout the implementations for
19 | [mtmodule](src/lib/common/mtmodule.py), [analyser](src/lib/common/analyser.py),
20 | and [selector](src/lib/common/selector.py). Moreover, the intermediate data
21 | structures that each of these modules used to represent available media on disk
22 | at various stages of the mtriage lifecycle were bloated and unsystematic.
23 | 
24 | In order to decouple storage from the local disk, I introduced a [Storage
25 | API](src/lib/common/storage.py), which analysers and selectors interface with
26 | to read and write elements persistently.
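
The interface itself isn't reproduced in this post; the following is only a rough sketch of the abstraction, with a hypothetical method name (`write_element`) -- the one detail taken from the actual codebase is the `delete_local_on_write` flag, which analysers toggle via `self.disk`.

```python
# Rough sketch of the abstraction -- not the real interface in
# src/lib/common/storage.py; method names here are hypothetical.
from abc import ABC, abstractmethod
from pathlib import Path
from shutil import copy


class Storage(ABC):
    # Analysers set this (as `self.disk.delete_local_on_write = True`) so the
    # runtime cleans up transitory local files once they have been persisted.
    delete_local_on_write = False

    @abstractmethod
    def write_element(self, element) -> None:
        """Persist an element's media, wherever this backend keeps things."""


class LocalStorage(Storage):
    def __init__(self, base_dir: str):
        self.base_dir = Path(base_dir)

    def write_element(self, element) -> None:
        # The local backend simply copies the element's media into place.
        dest = self.base_dir / element.id
        dest.mkdir(parents=True, exist_ok=True)
        for p in element.paths:
            copy(p, dest / Path(p).name)
```

A cloud backend would implement the same `write_element` against remote storage, which is what makes the destination a configuration detail rather than analyser code.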
While thinking through how components
27 | ought to return elements to the mtriage runtime in order for persistent storage
28 | to occur remotely rather than locally, I realised that, regardless of where
29 | elements are ultimately stored, it makes sense for them to be readable and
30 | writable locally, to provide full flexibility from the component perspective.
31 | In other words, so that analysers and selectors can still deal with
32 | elements _as if_ they were simply written on the local filesystem, analysers and
33 | selectors still write media to disk on the computer where mtriage is running,
34 | and return elements back to the mtriage runtime **by reference**, indicating
35 | the paths where the media that comprise elements reside. The mtriage runtime then,
36 | by way of the Storage API, persists those elements in the designated storage,
37 | deleting the transitory local representations that were returned by
38 | a component.
39 | 
40 | Modifying internals to make storage work in this way made it clear that the
41 | component API (e.g. `Analyser.analyse_element` and
42 | `Selector.retrieve_element`) could be a lot more particular regarding inputs
43 | and outputs. I implemented this by solidifying the way that Etypes work, so
44 | that those functions can uniformly receive and return etypes to/from the
45 | mtriage runtime.
46 | 
47 | ## Revamped component API
48 | Here's what the ConvertAudio analyser looks like after the rewrite:
49 | 
50 | ```python
51 | from lib.common.analyser import Analyser
52 | from lib.common.exceptions import ElementShouldSkipError
53 | from lib.common.etypes import Etype
54 | from subprocess import call, STDOUT
55 | from pathlib import Path
56 | import os
57 | 
58 | 
59 | class ConvertAudio(Analyser):
60 |     def analyse_element(self, element: Etype.Audio, config) -> Etype.Audio:
61 |         output_ext = config["output_ext"]
62 | 
63 |         FNULL = open(os.devnull, "w")
64 |         output = f"/tmp/{element.id}.{output_ext}"
65 |         # TODO: error handling
66 |         out = call(
67 |             ["ffmpeg", "-y", "-i", element.paths[0], output],
68 |             stdout=FNULL,
69 |             stderr=STDOUT,
70 |         )
71 |         self.logger(
72 |             f"Converted '{element.id}' from {element.paths[0].suffix} to .{output_ext}"
73 |         )
74 |         return Etype.Audio(element.id, paths=[output])
75 | 
76 | 
77 | module = ConvertAudio
78 | ```
79 | 
80 | Notably, the confusing functions `get_in_etype` and `get_out_etype` no longer
81 | exist: those specifications are now covered using Python 3 type annotations.
82 | It's clear from the signature that the ConvertAudio analyser takes an element
83 | of `Etype.Audio` as input, and produces an element of `Etype.Audio` as output.
84 | 
85 | Most importantly, the `analyse_element` function _returns a value that
86 | represents the element it has produced_. Previously, element creation was done
87 | implicitly through the creation of files, and `analyse_element` didn't return
88 | anything explicitly. Now, it returns **an instance of an Etype**, which has
89 | a standardised constructor that takes an element id (`str`) as its first
90 | argument, and a path or list of paths as its second (optionally named).
91 | 
92 | Note that the file structure for analysers has changed: there is no scaffolding
93 | required via an '__init__.py'. Instead the module is registered simply through
94 | the Javascript/Node-style export of assigning the `module` variable in the last
95 | line.
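
For illustration, and in the spirit of what the `scripts/scaffold` helper generates, the bare minimum is something like the following sketch (the analyser name `PassThrough` is a placeholder, not a component that exists in the repo):

```python
from lib.common.analyser import Analyser
from lib.common.etypes import Etype


class PassThrough(Analyser):
    def analyse_element(self, element: Etype.Any, config) -> Etype.Any:
        # The simplest possible analyser: hand the element straight back.
        return element


module = PassThrough
```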
All a valid analyser (or selector) needs is a single 'core.py' file that
96 | defines a `module` variable that contains a class which inherits from
97 | `Analyser`.
98 | 
99 | Note also how the analyser still creates files locally. Instead of representing
100 | groups of media as folders, however, analysers do so simply by passing
101 | references/paths to the relevant files in an Etype's constructor.
102 | 
103 | ## More robust etypes
104 | Etypes are the operational heart of mtriage, and there's a fair bit of
105 | implementation magic that goes on inside them. The basic idea is that each
106 | Etype class offers a constructor that takes a string id and a path or set of
107 | paths:
108 | 
109 | ```python
110 | a_json_element = Etype.Json("a_json_element", "path/to/myfile.json")
111 | ```
112 | 
113 | The Etype constructor checks all of the paths to ensure that they are valid,
114 | optionally filters out certain paths, and throws an `EtypeCastError` if
115 | something is wrong.
116 | 
117 | There are two higher order etypes, `Union` and `Array`, which allow expressive
118 | composition of etypes. For example, you can create an element that contains one
119 | image and one json file using Union:
120 | 
121 | ```python
122 | a_composite_element = Etype.Union(Etype.Json, Etype.Image)("a_composite_element", ["path/to/file.json", "path/to/file.png"])
123 | ```
124 | 
125 | I've also added a more modular way to add new etypes, similar to the way that
126 | analysers and selectors work. Any .py file in [lib/etypes](src/lib/etypes) will
127 | be treated as a custom etype, and will be made available through the `Etype`
128 | namespace in the way detailed above. Custom etypes need only inherit from the
129 | `Et` class (from lib.common.etypes), and define a `filter` function that is run
130 | on construction to filter out certain paths.
131 | 
132 | This means that custom code which deals with specific etypes (i.e., media of
133 | particular structures produced during mtriage workflows) can be better
134 | encapsulated as static methods on the custom etypes, as is done in
135 | [cvjson.py](src/lib/etypes/cvjson.py).
136 | 
137 | ## Cycle 3
138 | We're fast-tracking towards cloud deployments, but will also try to merge some
139 | of the outstanding PRs that have been on hold due to this rewrite.
140 | 
--------------------------------------------------------------------------------
/docs/updates/2020.11.22.md:
--------------------------------------------------------------------------------
1 | # Back into it
2 | Mtriage development has been stalled for some time, as I've had other
3 | priorities. I'm hoping to pick up the pace in the last couple of months of
4 | 2020, however. Here are a couple of things on the near horizon:
5 | 
6 | ### Component-wise testing
7 | This is pretty key to ensuring that community component contributions are
8 | meaningful. Currently there are only tests for the core part of mtriage, and it
9 | is just an article of faith that each component (selector/analyser) works.
10 | 
11 | ### Custom classifier documentation and compatibility
12 | None of our code for bootstrapping custom classifiers with mtriage is public,
13 | which means that it's pretty much useless to everyone in its current state,
14 | except as a reference.
We're looking to drive some more research around 15 | training vision classifiers using synthetic data with mtriage, and we'll fold 16 | out all of these fixes into upstream mtriage as ways to apply custom 17 | classifiers in the abstract via mtriage as a deployment framework. 18 | 19 | ### Spec-ing a rewrite in Rust/Firecracker 20 | This is somewhat irresponsible on my part, as this here Python/Docker version 21 | of the framework barely works. But I'm looking for a way to get my fingers 22 | properly sticky with Rust in practice, and mtriage seems a good candidate 23 | (systems software, containers, etc). 24 | -------------------------------------------------------------------------------- /example.blacklist.txt: -------------------------------------------------------------------------------- 1 | # lines that begin with a # will be ignored. 2 | 3 | ### ANALYSERS 4 | ConvertAudio 5 | ExtractAudio 6 | Frames 7 | ImageDedup 8 | KerasPretrained 9 | Rank 10 | 11 | ### SELECTORS 12 | Local 13 | Twitter 14 | Youtube 15 | -------------------------------------------------------------------------------- /examples/4chan.yaml: -------------------------------------------------------------------------------- 1 | folder: media/fcs 2 | select: 3 | name: FourChan 4 | config: 5 | board: "g" -------------------------------------------------------------------------------- /examples/classify.yaml: -------------------------------------------------------------------------------- 1 | folder: media/example 2 | elements_in: 3 | - Youtube/Frames 4 | analyse: 5 | - name: KerasPretrained 6 | config: 7 | model: ResNet50 8 | labels: 9 | - tank 10 | - rifle 11 | - military uniform 12 | -------------------------------------------------------------------------------- /examples/meta-test.yaml: -------------------------------------------------------------------------------- 1 | folder: media/example 2 | elements_in: 3 | - Youtube/KerasPretrained 4 | analyse: 5 | name: AnalysedFramesMeta 6 | config: 7 | dev: true 8 | -------------------------------------------------------------------------------- /examples/pytorchfasterrcnn-test.yaml: -------------------------------------------------------------------------------- 1 | folder: media/pytorchfasterrcnn 2 | select: 3 | name: Local 4 | config: 5 | source: data/images 6 | aggregate: true 7 | analyse: 8 | name: PytorchFasterRcnn 9 | config: 10 | dev: true 11 | model: data/sean1.pth 12 | class_map: 13 | - background 14 | - canister 15 | - cylinder 16 | - can 17 | - bottle 18 | - bin 19 | 20 | 21 | -------------------------------------------------------------------------------- /examples/ranking-test.yaml: -------------------------------------------------------------------------------- 1 | folder: media/example 2 | elements_in: 3 | - Youtube 4 | analyse: 5 | name: Flatten 6 | config: 7 | dev: true 8 | -------------------------------------------------------------------------------- /examples/yolov5-test.yaml: -------------------------------------------------------------------------------- 1 | folder: media/yolov5 2 | select: 3 | name: Local 4 | config: 5 | source: data/images 6 | aggregate: true 7 | analyse: 8 | name: TorchHub 9 | config: 10 | dev: true 11 | repo: ultralytics/yolov5 12 | args: 13 | - yolov5s 14 | kwargs: 15 | pretrained: true 16 | -------------------------------------------------------------------------------- /media/.gitignore: -------------------------------------------------------------------------------- 1 | **/* 2 | !.gitignore 3 | 
-------------------------------------------------------------------------------- /mtriage: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | from commands import parse_args, build, develop, clean, run_tests, run, export 5 | 6 | DEV_COMMANDS = {"develop": develop, "build": build, "test": run_tests, "clean": clean} 7 | 8 | if __name__ == "__main__": 9 | ARGS = parse_args(sys.argv[1:]) 10 | 11 | if ARGS.base == "dev": 12 | DEV_COMMANDS[ARGS.command](ARGS) 13 | elif ARGS.base == "export": 14 | export(ARGS) 15 | else: 16 | run(ARGS) 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | pytest==4.5.0 3 | black 4 | -------------------------------------------------------------------------------- /scripts/lint: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## run from top level, i.e. `bash scripts/lint.sh` 4 | python3 -m black src/ 5 | python3 -m black test/ 6 | python3 -m black commands.py 7 | python3 -m black util.py 8 | python3 -m black mtriage 9 | -------------------------------------------------------------------------------- /scripts/scaffold: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | from pathlib import Path 5 | 6 | BASE_PATH = Path(os.path.dirname(os.path.abspath(__file__))) / "../src/lib/analysers" 7 | 8 | def is_camel_case(s): 9 | return s != s.lower() and s != s.upper() and "_" not in s 10 | 11 | def core_base(name): return f'''from lib.common.analyser import Analyser 12 | from lib.common.etypes import Etype 13 | 14 | class {name}(Analyser): 15 | in_etype = Etype.Any 16 | out_etype = Etype.Any 17 | 18 | def analyse_element(self, element, config): 19 | return element 20 | 21 | module = {name}''' 22 | 23 | def infoyaml_base(desc): return f'''desc: {desc} 24 | args: 25 | - name: myarg 26 | desc: Optional description 27 | required: false 28 | input: string 29 | ''' 30 | 31 | name = input("New analyser name: ") 32 | desc = input("Basic description for new analyser: ") 33 | 34 | if not is_camel_case(name): 35 | print("An analyser must be CamelCase") 36 | sys.exit() 37 | 38 | base = BASE_PATH / name 39 | if os.path.exists(base) and os.path.isdir(base): 40 | print(f'An analyser named "{name}" already exists.') 41 | sys.exit() 42 | 43 | os.mkdir(BASE_PATH / name) 44 | 45 | with open(base/"core.py", "w+") as f: 46 | f.write(core_base(name)) 47 | 48 | with open(base/"info.yaml", "w+") as f: 49 | f.write(infoyaml_base(desc)) 50 | 51 | 52 | -------------------------------------------------------------------------------- /src/build/core.end.Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | # ********************* 3 | # ... continuing after partials 4 | # ********************* 5 | 6 | # install pip packages 7 | # NOTE: build.requirements.txt is hardcoded here. 
8 | ARG requirements_file=build.requirements.txt 9 | COPY $requirements_file /requirements.txt 10 | RUN pip3 install --upgrade pip && \ 11 | pip3 install -r /requirements.txt 12 | 13 | CMD ["python3", "/mtriage/src/run.py"] 14 | -------------------------------------------------------------------------------- /src/build/core.requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pyyaml 3 | -------------------------------------------------------------------------------- /src/build/core.start.Dockerfile: -------------------------------------------------------------------------------- 1 | MAINTAINER Lachlan Kermode 2 | ENV LANG C.UTF-8 3 | 4 | RUN apt-get update && \ 5 | # ================================================================== 6 | # tools 7 | # ------------------------------------------------------------------ 8 | DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \ 9 | # core 10 | build-essential \ 11 | apt-utils \ 12 | ca-certificates \ 13 | wget \ 14 | # python 15 | software-properties-common \ 16 | python3.7 \ 17 | python3.7-dev \ 18 | python3-distutils-extra \ 19 | git \ 20 | # dev 21 | # git vim curl unzip unrar \ 22 | && \ 23 | wget -O ~/get-pip.py \ 24 | https://bootstrap.pypa.io/get-pip.py && \ 25 | python3.7 ~/get-pip.py && \ 26 | ln -s /usr/bin/python3.7 /usr/local/bin/python3 && \ 27 | ln -s /usr/bin/python3.7 /usr/local/bin/python && \ 28 | python -m pip --no-cache-dir install --upgrade setuptools && \ 29 | ldconfig && \ 30 | apt-get clean && \ 31 | apt-get autoremove && \ 32 | rm -rf /var/lib/apt/lists/* /tmp/* ~/* 33 | 34 | RUN apt-get update --fix-missing 35 | 36 | # Copy necessary folders 37 | RUN mkdir -p /mtriage 38 | COPY ./scripts /mtriage/scripts 39 | COPY ./src /mtriage/src 40 | WORKDIR /mtriage 41 | 42 | # ********************* 43 | # starting partials... 44 | # ********************* 45 | 46 | -------------------------------------------------------------------------------- /src/build/cpu-header.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | -------------------------------------------------------------------------------- /src/build/gpu-header.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 2 | -------------------------------------------------------------------------------- /src/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup global fixtures for modular-wise tests. 
3 | """ 4 | import pytest 5 | 6 | # import requests 7 | import os 8 | import test.utils as test_utils 9 | 10 | 11 | @pytest.fixture(scope="session", autouse=True) 12 | def test_element_dir(): 13 | return "../media/test" 14 | 15 | 16 | # TODO(lachlan): create a special fixture to allow component-wise tests to analyse sub elements 17 | # EG_VIDEO = "https://datasheet-sources.ams3.digitaloceanspaces.com/ilovaisk_videos/platform_background.mp4" 18 | # EG_IMAGE = "https://datasheet-sources.ams3.digitaloceanspaces.com/ilovaisk_videos/Platform_Tutorial_thumb.png" 19 | # 20 | # @pytest.fixture(scope="session", autouse=True) 21 | # def analyse_stub_element() 22 | # if not os.path.exists("/test"): 23 | # os.makedirs("/test") 24 | # if not os.path.exists("/test/video.mp4"): 25 | # r = requests.get(EG_VIDEO) 26 | # open("/test/video.mp4", "wb").write(r.content) 27 | # if not os.path.exists("/test/image.png"): 28 | # r = requests.get(EG_IMAGE) 29 | # open("/test/image.png", "wb").write(r.content) 30 | # 31 | # return "some val" 32 | 33 | 34 | @pytest.fixture(scope="session", autouse=True) 35 | def utils(): 36 | return test_utils 37 | -------------------------------------------------------------------------------- /src/lib/analysers/AnalysedFramesMeta/core.py: -------------------------------------------------------------------------------- 1 | from lib.common.analyser import Analyser 2 | from lib.common.etypes import Etype 3 | from lib.util.cvjson import generate_meta 4 | from lib.etypes.cvjson import CvJson 5 | 6 | 7 | class AnalysedFramesMeta(Analyser): 8 | out_etype = Etype.CvJson 9 | 10 | def analyse_element(self, element, _): 11 | return element 12 | 13 | def post_analyse(self, elements) -> Etype.Json.as_array(): 14 | return generate_meta(elements, logger=self.logger) 15 | 16 | 17 | module = AnalysedFramesMeta 18 | -------------------------------------------------------------------------------- /src/lib/analysers/AnalysedFramesMeta/info.yaml: -------------------------------------------------------------------------------- 1 | desc: TODO 2 | args: [] 3 | 4 | -------------------------------------------------------------------------------- /src/lib/analysers/ConvertAudio/core.py: -------------------------------------------------------------------------------- 1 | from lib.common.analyser import Analyser 2 | from lib.common.exceptions import ElementShouldSkipError 3 | from lib.common.etypes import Etype 4 | from subprocess import call, STDOUT 5 | from pathlib import Path 6 | import os 7 | 8 | 9 | class ConvertAudio(Analyser): 10 | in_etype = Etype.Audio 11 | out_etype = Etype.Audio 12 | 13 | def analyse_element(self, element, config): 14 | output_ext = config["output_ext"] 15 | 16 | FNULL = open(os.devnull, "w") 17 | output = f"/tmp/{element.id}.{output_ext}" 18 | # TODO: error handling 19 | out = call( 20 | ["ffmpeg", "-y", "-i", element.paths[0], output], 21 | stdout=FNULL, 22 | stderr=STDOUT, 23 | ) 24 | self.logger( 25 | f"Converted '{element.id}' from {element.paths[0].suffix} to .{output_ext}" 26 | ) 27 | return Etype.Audio(element.id, paths=[output]) 28 | 29 | 30 | module = ConvertAudio 31 | -------------------------------------------------------------------------------- /src/lib/analysers/ConvertAudio/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Convert the media in an Audio element from one file extension to another. 
2 | args: 3 | - name: output_ext 4 | desc: The file extension of the output media, to which the input files will be converted. 5 | required: true 6 | input: string 7 | -------------------------------------------------------------------------------- /src/lib/analysers/ConvertAudio/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN apt-get install -y \ 2 | ffmpeg 3 | -------------------------------------------------------------------------------- /src/lib/analysers/ExtractAudio/core.py: -------------------------------------------------------------------------------- 1 | from lib.common.analyser import Analyser 2 | from lib.common.exceptions import ElementShouldSkipError 3 | from lib.common.etypes import Etype 4 | from subprocess import call, STDOUT 5 | import os 6 | 7 | 8 | class ExtractAudio(Analyser): 9 | in_etype = Etype.Video 10 | out_etype = Etype.Audio 11 | 12 | def analyse_element(self, element, config): 13 | output_ext = config["output_ext"] 14 | output = f"/tmp/{element.id}.{output_ext}" 15 | FNULL = open(os.devnull, "w") 16 | # TODO: add error handling 17 | out = call( 18 | ["ffmpeg", "-y", "-i", element.paths[0], output], 19 | stdout=FNULL, 20 | stderr=STDOUT, 21 | ) 22 | 23 | element.paths[0] = output 24 | 25 | return element 26 | 27 | 28 | module = ExtractAudio 29 | -------------------------------------------------------------------------------- /src/lib/analysers/ExtractAudio/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Extract the audio from a video file. 2 | args: 3 | - name: output_ext 4 | desc: The file extension of the output audio, e.g. 'mp4' or 'aac'. 5 | required: true 6 | input: string 7 | -------------------------------------------------------------------------------- /src/lib/analysers/ExtractAudio/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN apt-get install -y \ 2 | ffmpeg 3 | -------------------------------------------------------------------------------- /src/lib/analysers/ExtractTypes/core.py: -------------------------------------------------------------------------------- 1 | from shutil import copyfile 2 | from pathlib import Path 3 | from lib.common.analyser import Analyser 4 | from lib.common.etypes import Etype 5 | 6 | 7 | class ExtractTypes(Analyser): 8 | in_etype = Etype.Any 9 | out_etype = Etype.Any 10 | 11 | def analyse_element(self, element, config): 12 | exts = config["exts"] if "exts" in config else [] 13 | element.paths = [ 14 | x for x in element.paths if x.suffix in exts or x.suffix[1:] in exts 15 | ] 16 | if len(element.paths) == 0: 17 | self.logger(f"No extracted media in element {element.id}.") 18 | return None 19 | self.logger( 20 | f"Extracting element {element.id} with paths: {[x.name for x in element.paths]}" 21 | ) 22 | return element 23 | 24 | 25 | module = ExtractTypes 26 | -------------------------------------------------------------------------------- /src/lib/analysers/ExtractTypes/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Extract a set of file types from the element folder. 2 | args: 3 | - name: exts 4 | desc: A list of the extensions to extract, in rglob format (e.g. '*.jpg'). 
5 | required: true 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/lib/analysers/Flatten/core.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from lib.common.analyser import Analyser 3 | from lib.common.etypes import Etype 4 | from lib.util.cvjson import flatten 5 | 6 | 7 | class Flatten(Analyser): 8 | """NOTE: This class is kept for backwards compatibility, but should not be 9 | used in new implementations. Instaed, simply use the imported `rank` 10 | function directly in the relevant analyser's `post_analyse` method. 11 | """ 12 | 13 | out_etype = Etype.Json 14 | 15 | def analyse_element(self, element: Etype.CvJson, _) -> Etype.Json: 16 | return element 17 | 18 | def post_analyse(self, elements) -> Etype.Json: 19 | return flatten(elements, logger=self.logger) 20 | 21 | 22 | module = Flatten 23 | -------------------------------------------------------------------------------- /src/lib/analysers/Flatten/info.yaml: -------------------------------------------------------------------------------- 1 | desc: TODO 2 | args: [] 3 | 4 | -------------------------------------------------------------------------------- /src/lib/analysers/Frames/core.py: -------------------------------------------------------------------------------- 1 | import os 2 | from shutil import copyfile, rmtree 3 | from subprocess import call, STDOUT 4 | from pathlib import Path 5 | from lib.common.analyser import Analyser 6 | from lib.common.etypes import Etype, Union 7 | from lib.common.util import files 8 | 9 | VID_SUFFIXES = [".mp4", ".mov"] 10 | # GLOSSED_FRAMES = Union(Etype.Image.array(), Etype.Json) 11 | GLOSSED_FRAMES = Etype.Any # hack for the time being 12 | 13 | 14 | def ffmpeg_frames(out_folder, fp, rate): 15 | # TODO: better logs for FFMPEG process 16 | FNULL = open(os.devnull, "w") 17 | out = call( 18 | ["ffmpeg", "-i", fp, "-r", str(rate), f"{out_folder}/%04d.bmp"], 19 | stdout=FNULL, 20 | stderr=STDOUT, 21 | ) 22 | 23 | 24 | class Frames(Analyser): 25 | in_etype = Union(Etype.Json, Etype.Video) 26 | out_etype = GLOSSED_FRAMES 27 | 28 | def analyse_element(self, element, config): 29 | fps = int(config["fps"]) if "fps" in config else 1 30 | jsons = [x for x in element.paths if x.suffix in ".json"] 31 | dest = Path("/tmp") / element.id 32 | if dest.exists(): 33 | rmtree(dest) 34 | dest.mkdir() 35 | 36 | if len(jsons) is 1: 37 | json = jsons[0] 38 | copyfile(json, dest / "meta.json") 39 | 40 | video = [x for x in element.paths if x.suffix in VID_SUFFIXES][0] 41 | ffmpeg_frames(dest, video, fps) 42 | 43 | self.logger(f"Frames successfully created for element {element.id}.") 44 | self.disk.delete_local_on_write = True 45 | return GLOSSED_FRAMES(element.id, paths=files(dest)) 46 | 47 | 48 | module = Frames 49 | -------------------------------------------------------------------------------- /src/lib/analysers/Frames/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Extract a subset of representative frames from a video. A number of frames per second are extracted. 2 | args: 3 | - name: fps 4 | desc: Frames per second. Defaults to 1. 
5 | required: false 6 | input: int 7 | -------------------------------------------------------------------------------- /src/lib/analysers/Frames/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN apt-get install -y \ 2 | ffmpeg 3 | -------------------------------------------------------------------------------- /src/lib/analysers/ImageDedup/core.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from shutil import copyfile 4 | from imagededup import methods 5 | from lib.common.exceptions import InvalidAnalyserConfigError 6 | from lib.common.analyser import Analyser 7 | from lib.common.etypes import Etype 8 | 9 | 10 | class ImageDedup(Analyser): 11 | in_etype = Etype.Image.array() 12 | out_etype = Etype.Image.array() 13 | 14 | def __create_hasher(self, config): 15 | hasher_key = config["method"] if "method" in config else "phash" 16 | self.logger(f"Compare method is {hasher_key}") 17 | hasher = { 18 | "phash": methods.PHash, 19 | "ahash": methods.AHash, 20 | "dhash": methods.DHash, 21 | "whash": methods.WHash, 22 | }.get(hasher_key) 23 | if hasher is None: 24 | raise InvalidAnalyserConfigError( 25 | f"'{hasher_key}' is not a valid method for imagededup." 26 | ) 27 | 28 | self.hasher = hasher() 29 | 30 | # super low threshold by default to only remove essentially identical images. 31 | if "threshold" in config: 32 | self.threshold = int(config["threshold"]) 33 | else: 34 | self.threshold = 3 35 | 36 | self.logger(f"Hamming threshold is {self.threshold}") 37 | 38 | def pre_analyse(self, config): 39 | self.__create_hasher(config) 40 | 41 | def is_dry(self): 42 | return "dry" in self.config and self.config["dry"] 43 | 44 | def analyse_element(self, element, config): 45 | # NOTE: only works if all images are in same file, should probably copy for robustness. 46 | basedir = element.paths[0].parent 47 | encodings = self.hasher.encode_images(image_dir=basedir) 48 | 49 | args = {"image_dir": basedir, "max_distance_threshold": self.threshold} 50 | 51 | duplicates = self.hasher.find_duplicates_to_remove(**args) 52 | 53 | self.logger(f"{len(duplicates)} duplicates found.") 54 | 55 | self.logger("IMAGES TO REMOVE") 56 | self.logger("------------------") 57 | for dup in duplicates: 58 | self.logger(dup) 59 | self.logger("------------------") 60 | if self.is_dry(): 61 | return None 62 | 63 | self.logger(f"{element.id} images deduplicated.") 64 | 65 | deduplicated_paths = [p for p in element.paths if p.name not in duplicates] 66 | 67 | return Etype.Image.array()(element.id, paths=deduplicated_paths) 68 | 69 | 70 | module = ImageDedup 71 | -------------------------------------------------------------------------------- /src/lib/analysers/ImageDedup/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Deduplicate images in a collection using https://github.com/idealo/imagededup. 2 | args: 3 | - name: threshold 4 | desc: The max Hamming distance threshold between two images below which retrieved duplicates are valid. See https://idealo.github.io/imagededup/methods/hashing/ for more information. 5 | required: false 6 | input: int 7 | - name: method 8 | desc: The method to use for hashing/comparison. Should be one of- phash, ahash, dhash, whash. See https://idealo.github.io/imagededup/methods/hashing/ for more information. 
9 | required: false 10 | input: string 11 | - name: dry 12 | desc: If set to true, the analyser will return a txt file that names all the images that are duplicates, rather than actually removing them 13 | required: false 14 | input: bool 15 | 16 | 17 | -------------------------------------------------------------------------------- /src/lib/analysers/ImageDedup/requirements.txt: -------------------------------------------------------------------------------- 1 | imagededup 2 | -------------------------------------------------------------------------------- /src/lib/analysers/KerasPretrained/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import os 4 | from importlib import import_module 5 | from lib.common.exceptions import InvalidAnalyserConfigError 6 | from lib.common.analyser import Analyser 7 | from lib.common.etypes import Etype, Union, Array 8 | from lib.util.cvjson import generate_meta 9 | from lib.etypes.cvjson import CvJson 10 | 11 | KERAS_HOME = "/mtriage/data/.keras" 12 | os.environ["KERAS_HOME"] = KERAS_HOME 13 | 14 | import tensorflow as tf 15 | from tensorflow.keras.preprocessing import image 16 | 17 | SUPPORTED_MODELS = { 18 | "ResNet50": {"module": "resnet50"}, 19 | "VGG16": {"module": "vgg16"}, 20 | "VGG19": {"module": "vgg19"}, 21 | } 22 | 23 | 24 | class KerasPretrained(Analyser): 25 | in_etype = Union(Array(Etype.Image), Etype.Json) 26 | out_etype = CvJson 27 | """ Override to always run serially. Otherwise it hangs, presumably due to 28 | the parallelisation that tensorflow does under the hood. """ 29 | 30 | @property 31 | def in_parallel(self): 32 | return False 33 | 34 | def pre_analyse(self, config): 35 | self.logger(config["model"]) 36 | self.logger(f"Storing models in {KERAS_HOME}") 37 | MOD = SUPPORTED_MODELS.get(config["model"]) 38 | if MOD is None: 39 | raise InvalidAnalyserConfigError( 40 | f"The module '{config['model']}' either does not exist, or is not yet supported." 41 | ) 42 | 43 | rLabels = config["labels"] 44 | 45 | # TODO: make it so that this doesn't redownload every run. 46 | # i.e. 
refactor it into partial.Dockerfile 47 | self.model_module = import_module( 48 | f"tensorflow.keras.applications.{MOD['module']}" 49 | ) 50 | impmodel = getattr(self.model_module, config["model"]) 51 | # NB: this downloads the weights if they don't exist 52 | self.model = impmodel(weights="imagenet") 53 | self.THRESH = 0.1 54 | 55 | def get_preds(img_path): 56 | img = image.load_img(img_path, target_size=(224, 224)) 57 | x = image.img_to_array(img) 58 | x = np.expand_dims(x, axis=0) 59 | x = self.model_module.preprocess_input(x) 60 | preds = self.model.predict(x) 61 | 62 | # top field must be included or defaults to 5, huge number ensures 63 | # it gets all labels 64 | decoded = self.model_module.decode_predictions(preds, top=10) 65 | 66 | # filter by labels provided in whitelist 67 | filteredPreds = [p for p in decoded[0] if p[1] in rLabels] 68 | 69 | return [ 70 | (x[1], float(x[2])) for x in filteredPreds if float(x[2]) >= self.THRESH 71 | ] 72 | 73 | self.get_preds = get_preds 74 | 75 | def analyse_element(self, element, _): 76 | self.logger(f"Running inference on frames in {element.id}...") 77 | val = Etype.CvJson.from_preds(element, self.get_preds) 78 | self.logger(f"Wrote predictions JSON for {element.id}.") 79 | self.disk.delete_local_on_write = True 80 | return val 81 | 82 | def post_analyse(self, elements) -> Etype.Json.as_array(): 83 | return generate_meta(elements, logger=self.logger) 84 | 85 | 86 | module = KerasPretrained 87 | -------------------------------------------------------------------------------- /src/lib/analysers/KerasPretrained/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Classify objects in images using a neural net trained on ImageNet, as made available through the pretrained Keras modules. Inference is run on each image using the specified model, and the labels for the top 10 predictions will be retained in an output JSON of ImageFrameJson format. 2 | args: 3 | - name: model 4 | desc: The model you want to use to classify, 'Resnet50', 'VGG16', or 'VGG19'. All models are trained on ImageNet. 5 | required: true 6 | input: string 7 | - name: labels 8 | desc: Filter results to a limited array of ImageNet labels, if you are only interested in some of them. If not provided, the analyser will return predictions for all labels. 9 | required: true 10 | input: whitelist 11 | -------------------------------------------------------------------------------- /src/lib/analysers/KerasPretrained/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.3.1 2 | pillow==6.2.0 3 | -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import json 4 | import os 5 | import torch 6 | from torch.autograd import Variable 7 | from PIL import Image 8 | 9 | from lib.common.analyser import Analyser 10 | from lib.common.etypes import Etype, Union, Array 11 | from lib.analysers.ProtestsPretrained.utils import transform, modified_resnet50, decode 12 | 13 | PTH_TAR = "/mtriage/model.pth.tar" 14 | 15 | # TODO cuda ? 
16 | 17 | 18 | class ProtestsPretrained(Analyser): 19 | def pre_analyse(self, config): 20 | """ 21 | Init the logging, etc 22 | Init the model 23 | """ 24 | rLabels = config["labels"] 25 | self.THRESH = 0.0 26 | 27 | t = transform() 28 | model = modified_resnet50() 29 | model.load_state_dict( 30 | torch.load( 31 | PTH_TAR, 32 | map_location=torch.device("cpu"), 33 | )["state_dict"] 34 | ) 35 | model.eval() 36 | 37 | def get_preds(img_path): 38 | """ 39 | Gives labelds and probabilities for a single image 40 | This is were we preprocess the image, using a function defined in the model class 41 | """ 42 | # load image 43 | img = Image.open(img_path).convert("RGB") 44 | # process it 45 | x = t(img) 46 | # get in in the right format 47 | x = Variable(x).unsqueeze(0) 48 | # predictions 49 | output = model(x) 50 | # decode 51 | output = decode(output.cpu().data.numpy()[0]) 52 | # filter 53 | output = [(x[0], x[1]) for x in output if x[0] in rLabels] 54 | output = [(x[0], float(x[1])) for x in output if x[1] >= self.THRESH] 55 | 56 | return output 57 | 58 | self.get_preds = get_preds 59 | 60 | def analyse_element( 61 | self, element: Union(Array(Etype.Image), Etype.Json), _ 62 | ) -> Etype.Json: 63 | self.logger(f"Running inference on frames in {element.id}...") 64 | val = Etype.CvJson.from_preds(element, self.get_preds) 65 | self.logger(f"Wrote predictions JSON for {element.id}.") 66 | self.disk.delete_local_on_write = True 67 | return val 68 | 69 | 70 | module = ProtestsPretrained 71 | -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/forensic-architecture/mtriage/7a841241518f831766767d6ddaa6320b8de4be98/src/lib/analysers/ProtestsPretrained/image.jpg -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Classify the presence of protests and violence in images. 2 | args: 3 | - name: labels 4 | desc: Filter results to a limited array of labels. 
5 | required: true 6 | input: whitelist 7 | -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN wget -O /mtriage/model.pth.tar https://www.dropbox.com/s/vgh2nwxrzembxpw/model.pth.tar?dl=0 2 | -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | pillow==6.2.0 4 | numpy<1.17 5 | -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import torch 4 | from torch.autograd import Variable 5 | from PIL import Image 6 | from utils import transform, modified_resnet50, decode 7 | 8 | 9 | def pre_analyse(): 10 | """ 11 | Init the logging, etc 12 | Init the model 13 | Same as KerasPretrained 14 | """ 15 | t = transform() 16 | model = modified_resnet50() 17 | model.load_state_dict( 18 | torch.load( 19 | "model.pth.tar", 20 | map_location=torch.device("cpu"), 21 | )["state_dict"] 22 | ) 23 | model.eval() 24 | 25 | def get_preds(img_path): 26 | """ 27 | Gives labelds and probabilities for a single image 28 | This is were we preprocess the image, using a function defined in the model class 29 | """ 30 | # load image 31 | img = Image.open(img_path).convert("RGB") 32 | # process it 33 | x = t(img) 34 | # get in in the right format 35 | x = Variable(x).unsqueeze(0) 36 | # predictions 37 | output = model(x) 38 | # decode 39 | output = decode(output.cpu().data.numpy()[0]) 40 | 41 | # filter 42 | # return pred, proba 43 | return output 44 | 45 | return get_preds("image.jpg") 46 | 47 | 48 | print(pre_analyse()) 49 | -------------------------------------------------------------------------------- /src/lib/analysers/ProtestsPretrained/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | created by: Donghyeon Won 3 | """ 4 | import torch 5 | import torch.nn as nn 6 | import torchvision.transforms as transforms 7 | import torchvision.models as models 8 | 9 | 10 | def transform(): 11 | return transforms.Compose( 12 | [ 13 | transforms.Resize(256), 14 | transforms.CenterCrop(224), 15 | transforms.ToTensor(), 16 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 17 | ] 18 | ) 19 | 20 | 21 | def decode(preds): 22 | classes = [ 23 | "protest", 24 | "violence", 25 | "sign", 26 | "photo", 27 | "fire", 28 | "police", 29 | "children", 30 | "group_20", 31 | "group_100", 32 | "flag", 33 | "night", 34 | "shouting", 35 | ] 36 | return [(x, preds[c]) for c, x in enumerate(classes)] 37 | 38 | 39 | class FinalLayer(nn.Module): 40 | """modified last layer for resnet50 for our dataset""" 41 | 42 | def __init__(self): 43 | super(FinalLayer, self).__init__() 44 | self.fc = nn.Linear(2048, 12) 45 | self.sigmoid = nn.Sigmoid() 46 | 47 | def forward(self, x): 48 | out = self.fc(x) 49 | out = self.sigmoid(out) 50 | return out 51 | 52 | 53 | def modified_resnet50(): 54 | model = models.resnet50(pretrained=True) 55 | model.fc = FinalLayer() 56 | return model 57 | -------------------------------------------------------------------------------- /src/lib/analysers/PytorchFasterRcnn/core.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | from torch.autograd import Variable 4 | from torchvision import transforms 5 | from PIL import Image 6 | from lib.common.analyser import Analyser 7 | from lib.common.etypes import Etype 8 | 9 | 10 | class PytorchFasterRcnn(Analyser): 11 | in_etype = Etype.Any 12 | out_etype = Etype.Any 13 | 14 | def pre_analyse(self, config): 15 | # NB: in future this could be configurable. 16 | model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False, num_classes=6) 17 | if torch.cuda.is_available(): 18 | model.cuda() 19 | self.device = torch.device("cuda:0") 20 | else: 21 | self.device = torch.device("cpu") 22 | state_dict = torch.load(self.base_path/config["model"], map_location=torch.device(self.device)) 23 | model.load_state_dict(state_dict) 24 | model.eval() 25 | self.model = model 26 | self.transforms = transforms.Compose([transforms.Resize(224), transforms.ToTensor()]) 27 | self.threshold = config.get('threshold') if config.get('threshold') else 0.5 28 | 29 | def analyse_element(self, element, config): 30 | def get_preds(img): 31 | img = Image.open(img).convert('RGB') 32 | image_tensor = self.transforms(img).float().unsqueeze_(0) 33 | inp = Variable(image_tensor).to(self.device) 34 | output = self.model(inp)[0] 35 | labels = [config['class_map'][i.item()] for i in output.get('labels')] 36 | scores = output.get('scores') 37 | preds = [(x, y.item()) for x,y in zip(labels, scores) if y.item() > self.threshold] 38 | return preds 39 | 40 | self.logger(f"Running inference for {element.id}...") 41 | return Etype.CvJson.from_preds(element, get_preds) 42 | 43 | module = PytorchFasterRcnn 44 | -------------------------------------------------------------------------------- /src/lib/analysers/PytorchFasterRcnn/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Run inference with weights produced by transfer learning from a FasterRCNN backbone. 2 | args: 3 | - name: model 4 | desc: Relative path to the 'model.pth' file that holds the weights of the transfer-learned model. Must use `fasterrcnn_resnet50_fpn` as a backbone for now. 5 | required: true 6 | input: path 7 | - name: class_map 8 | desc: A list of the class names that correspond to the indices returned (in `output['labels']`). 9 | required: true 10 | input: list 11 | - name: threshold 12 | desc: The cutoff for predictions, between 0 and 1. Defaults to 0.5. 13 | required: false 14 | input: number 15 | -------------------------------------------------------------------------------- /src/lib/analysers/PytorchFasterRcnn/requirements.txt: -------------------------------------------------------------------------------- 1 | # icevision 2 | Pillow 3 | torch 4 | torchvision 5 | -------------------------------------------------------------------------------- /src/lib/analysers/Rank/core.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from lib.common.analyser import Analyser 3 | from lib.common.etypes import Etype 4 | from lib.util.rank_cvjson import rank 5 | 6 | 7 | class Rank(Analyser): 8 | """NOTE: This class is kept for backwards compatibility, but should not be 9 | used in new implementations. Instaed, simply use the imported `rank` 10 | function directly in the relevant analyser's `post_analyse` method. 
11 | """ 12 | 13 | def analyse_element(self, element: Etype.CvJson, _) -> Etype.Any: 14 | return element 15 | 16 | def post_analyse(self, elements) -> Etype.Json: 17 | return rank(elements, logger=self.logger) 18 | 19 | 20 | module = Rank 21 | -------------------------------------------------------------------------------- /src/lib/analysers/Rank/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Rewrite an ImageFrameJson as a ranked ImageFrameJson. This step is necessary to display the results via the 'framemap' viewer. 2 | args: 3 | - name: threshold 4 | desc: The minimum score for which a prediction should be counted towards an element's rank. 5 | required: false 6 | input: float 7 | 8 | -------------------------------------------------------------------------------- /src/lib/analysers/TorchHub/core.py: -------------------------------------------------------------------------------- 1 | from lib.common.analyser import Analyser 2 | from lib.common.etypes import Etype 3 | 4 | from PIL import Image 5 | import torch 6 | 7 | def cls_and_conf(pred, names): 8 | # `pred` is an array with 6 values: x1, y1, x2, y2, confidence, class 9 | _,_,_,_,conf,cl = pred 10 | cl = names[int(cl)] 11 | conf = float(conf) 12 | return (cl, conf) 13 | 14 | 15 | class TorchHub(Analyser): 16 | in_etype = Etype.Any 17 | out_etype = Etype.Any 18 | 19 | def pre_analyse(self, config): 20 | if config.get('args') is None: config['args'] = [] 21 | if config.get('kwargs') is None: config['kwargs'] = {} 22 | 23 | self.model = torch.hub.load(config['repo'], *config['args'], **config['kwargs']) 24 | self.model.conf = 0.5 # confidence threshold 25 | self.model.iou = 0.45 # NMS IoU threshold 26 | self.logger("Model loaded from remote.") 27 | 28 | def analyse_element(self, element, config): 29 | imgs = [Image.open(x) for x in element.paths] 30 | results = self.model(imgs).tolist() 31 | self.logger(f"Batched inference successfully run for element {element.id}.") 32 | 33 | def get_preds(img_path): 34 | idx = element.paths.index(img_path) 35 | result = results[idx] 36 | return [cls_and_conf(p, result.names) for p in result.pred] 37 | 38 | return Etype.CvJson.from_preds(element, get_preds) 39 | 40 | module = TorchHub 41 | -------------------------------------------------------------------------------- /src/lib/analysers/TorchHub/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Run inference using weights trained with https://github.com/ultralytics/yolov5 2 | args: 3 | - name: repo 4 | desc: Github repository from which to load the Torch hub model, i.e. 'ultralytics/yolov5' 5 | required: true 6 | input: string 7 | - name: args 8 | desc: Arguments for `torch.hub.load()` function. 
9 | required: false 10 | input: list 11 | - name: kwargs 12 | desc: Keyword arguments for `torch.hub.load()` function 13 | required: false 14 | input: dict 15 | -------------------------------------------------------------------------------- /src/lib/analysers/TorchHub/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN apt-get install -y libgl1-mesa-dev 2 | -------------------------------------------------------------------------------- /src/lib/analysers/TorchHub/requirements.txt: -------------------------------------------------------------------------------- 1 | # numpy>=1.18.5 2 | # PyYAML>=5.3.1 3 | # tensorboard>=2.2 4 | # wandb 5 | # thop # FLOPS computation 6 | # pycocotools>=2.0 # COCO mAP 7 | 8 | Cython 9 | matplotlib>=3.2.2 10 | opencv-python>=4.1.2 11 | Pillow 12 | scipy>=1.4.1 13 | requests 14 | torch>=1.7.0 15 | torchvision>=0.8.1 16 | tqdm>=4.41.0 17 | 18 | seaborn>=0.11.0 19 | pandas 20 | -------------------------------------------------------------------------------- /src/lib/analysers/TwintToGephi/core.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import twint 4 | import pandas as pd 5 | from pathlib import Path 6 | from lib.common.analyser import Analyser 7 | from lib.common.etypes import Etype 8 | from lib.util.twint import to_serializable, pythonize 9 | 10 | 11 | from collections import namedtuple 12 | from datetime import datetime 13 | 14 | 15 | def fmt_timestmap(dstamp, tstamp, tzone): 16 | ds = datetime.strptime(dstamp, "%Y-%m-%d") 17 | fmtted_ds = ds.strftime("%m/%d/%y") 18 | return f"{fmtted_ds} {tstamp}" 19 | 20 | 21 | TMP = Path("/tmp") 22 | TweetEdge = namedtuple( 23 | "TweetEdge", "date tweet urls domains hashtags tweet_id inreplyto_id" 24 | ) 25 | 26 | 27 | class CsvGraph: 28 | node_labels = [ 29 | "Vertex", 30 | "Followed", 31 | "Followers", 32 | "Tweets", 33 | "Favorites", 34 | "Description", 35 | "Location", 36 | "Web", 37 | "Time Zone", 38 | "Joined Twitter Date (UTC)", 39 | ] 40 | edge_labels = [ 41 | "Vertex 1", 42 | "Vertex 2", 43 | "Width", 44 | "Relationship", 45 | "Relationship Date (UTC)", 46 | "Tweet", 47 | "URLs in Tweet", 48 | "Domains in Tweet", 49 | "Hashtags in Tweet", 50 | "Tweet Date (UTC)", 51 | "Twitter Page for Tweet", 52 | "Imported ID", 53 | "In-Reply-To Tweet ID", 54 | ] 55 | 56 | def __init__(self): 57 | self.nodes = [] 58 | self.edges = [] 59 | 60 | def has_node(self, name: str): 61 | return name in self.nodes 62 | 63 | def add_node(self, name: str): 64 | if name not in self.nodes: 65 | self.nodes.append(name) 66 | 67 | def add_edge(self, _from: dict, _to: dict): 68 | is_reply = _to is not None 69 | 70 | self.add_node(_from["username"]) 71 | if is_reply: 72 | self.add_node(_to["username"]) 73 | 74 | edge = TweetEdge( 75 | date=fmt_timestmap( 76 | _from["datestamp"], _from["timestamp"], _from["timezone"] 77 | ), 78 | tweet=_from["tweet"], 79 | urls=_from["urls"], 80 | domains=[], # NB: no domains provided in obj 81 | hashtags=_from["hashtags"], 82 | tweet_id=_from["id"], 83 | inreplyto_id=_to["id"] if _to is not None else None, 84 | ) 85 | 86 | self.edges.append( 87 | [ 88 | _from["username"], 89 | _to["username"] if is_reply else _from["username"], 90 | 1, # width defaults to 1 91 | "Tweet" if not is_reply else "Replies To", # relationship 92 | edge.date, # relationship date 93 | edge.tweet, 94 | "- ".join(edge.urls) if isinstance(edge.urls, list) else edge.urls, 95 | "- ".join(edge.domains) 96 | if 
isinstance(edge.domains, list) 97 | else edge.domains, 98 | "- ".join(edge.hashtags) 99 | if isinstance(edge.hashtags, list) 100 | else edge.hashtags, 101 | edge.date, # tweet date 102 | f"https://twitter.com/${_from['username']}/status/${_from['id']}", 103 | edge.tweet_id, # the tweet's id 104 | "" 105 | if not is_reply 106 | else edge.inreplyto_id, # the id of the tweet to which this replies. 107 | ] 108 | ) 109 | 110 | def to_xlsx(self, path): 111 | """ Save graph as XLSX file. The default tab will be edges, with an extra tab for nodes. """ 112 | edge_df = pd.DataFrame.from_records(self.edges) 113 | edge_df.columns = CsvGraph.edge_labels 114 | node_df = pd.DataFrame.from_records([[x] for x in self.nodes]) 115 | node_df.columns = ["Vertex"] 116 | 117 | writer = pd.ExcelWriter(path, engine="xlsxwriter") 118 | edge_df.to_excel(writer, sheet_name="Edges") 119 | node_df.to_excel(writer, sheet_name="Vertices") 120 | writer.save() 121 | 122 | 123 | class TwintToGephi(Analyser): 124 | in_etype = Etype.Json 125 | out_etype = Etype.Any 126 | 127 | def pre_analyse(self, _): 128 | # keeps a record of which user ids have been indexed so that there's no 129 | # repeated work. 130 | self.indexed_ids = [] 131 | # usernames (to easily check whether a user exists in the graph or not) 132 | self.graph = CsvGraph() 133 | 134 | def analyse_element(self, element, _): 135 | with open(element.paths[0], "r") as f: 136 | orig_tweet = json.load(f) 137 | orig_tweet = pythonize(orig_tweet) 138 | 139 | tweet_with_replies = [orig_tweet] 140 | reply_count = orig_tweet["replies_count"] 141 | # retweet_count = orig_tweet["retweets_count"] 142 | usr = orig_tweet["username"] 143 | 144 | # TODO: get retweets, as they are mentions 145 | # if retweet_count > 0: 146 | # retweets = self.get_all_retweets(usr) 147 | 148 | if reply_count > 0 and usr not in self.indexed_ids: 149 | # TODO: keep a record so that we don't need to rescrape 150 | # self.indexed_ids.append(usr) 151 | 152 | all_tweets = self.get_all_tweets_sent_to(usr) 153 | conv_tweets = [ 154 | tweet 155 | for tweet in all_tweets 156 | if tweet["conversation_id"] == orig_tweet["conversation_id"] 157 | ] 158 | if len(conv_tweets) > 0: 159 | tweet_with_replies = tweet_with_replies + conv_tweets 160 | self.logger(f"{len(conv_tweets)} replies added to tweet {element.id}.") 161 | 162 | output = TMP / f"{element.id}.json" 163 | with open(output, "w+") as f: 164 | json.dump(tweet_with_replies, f) 165 | 166 | element.paths = [output] 167 | 168 | return element 169 | 170 | def get_all_retweets(self, username): 171 | c = twint.Config() 172 | c.Username = username 173 | c.Retweets = True 174 | twint.run.Profile(c) 175 | 176 | def get_all_tweets_sent_to(self, username): 177 | """ See https://github.com/twintproject/twint/issues/513 """ 178 | c = twint.Config() 179 | c.To = f"@{username}" 180 | c.Retweets = True 181 | c.Since = self.config["uploaded_after"] 182 | c.Until = self.config["uploaded_before"] 183 | c.Store_object = True 184 | self.logger(f"Scraping tweets sent to {username}...") 185 | twint.run.Search(c) 186 | results = twint.output.tweets_list 187 | twint.output.tweets_list = [] 188 | 189 | return to_serializable(results) 190 | 191 | def add_to_graph(self, t, inreplyto=None): 192 | """Add the relevant rows (for `nodes` and `edges`) to a graph from 193 | a Twint-formatted tweet (Python dictionary)""" 194 | self.graph.add_node(t["username"]) 195 | 196 | self.graph.add_edge(t, inreplyto) 197 | 198 | def post_analyse(self, elements): 199 | for el in elements: 200 | el_json 
= el.paths[0] 201 | with open(el_json) as f: 202 | tweets = json.load(f) 203 | 204 | initial_tweet = tweets[0] 205 | self.logger(f"Adding tweet {initial_tweet['id']} to graph...") 206 | self.add_to_graph(initial_tweet) 207 | for tweet in tweets[1:]: 208 | self.logger(f"Adding reply {tweet['id']} to graph...") 209 | self.add_to_graph(tweet, inreplyto=initial_tweet) 210 | 211 | xlsx_path = TMP / "final.xlsx" 212 | self.graph.to_xlsx(xlsx_path) 213 | return Etype.Any("FINAL", xlsx_path) 214 | 215 | 216 | module = TwintToGephi 217 | -------------------------------------------------------------------------------- /src/lib/analysers/TwintToGephi/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Create a single element from Twitter elements, which contains two CSV files that specify a relational graph. As replies are determined by scraping all tweets in a user's timeline and then filtering by conversation ID, a requirement of twint, `uploaded_before` and `uploaded_after` should be provided so that only relevant tweets need to be scraped. 2 | args: 3 | - name: uploaded_before 4 | desc: Only return tweets before this date. 5 | required: true 6 | input: date 7 | - name: uploaded_after 8 | desc: Only return tweets after this date. 9 | required: true 10 | input: date 11 | -------------------------------------------------------------------------------- /src/lib/analysers/TwintToGephi/requirements.txt: -------------------------------------------------------------------------------- 1 | xlsxwriter 2 | pandas 3 | -------------------------------------------------------------------------------- /src/lib/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/forensic-architecture/mtriage/7a841241518f831766767d6ddaa6320b8de4be98/src/lib/common/__init__.py -------------------------------------------------------------------------------- /src/lib/common/etypes.py: -------------------------------------------------------------------------------- 1 | import os 2 | from enum import Enum 3 | from pathlib import Path 4 | from copy import deepcopy 5 | from functools import reduce 6 | from types import SimpleNamespace as Ns 7 | from typing import Union as _Union, List, TypeVar 8 | from abc import abstractmethod 9 | from lib.common.exceptions import EtypeCastError 10 | from lib.common.get import get_custom_etypes 11 | 12 | 13 | class LocalElement: 14 | """Local as in not from storage, but on the same comp where mtriage is running. 15 | Returned from Selector.retrieve_element, and also Analyser.analyse_element.""" 16 | 17 | def __init__(self, id=None, query=None, paths=None, et=None): 18 | self.id = id # the element id 19 | self.query = query # the query string used to retrieve the element 20 | self.paths = ( 21 | paths # the path/s where the element's media are accessible locally 22 | ) 23 | self.et = et 24 | 25 | 26 | class LocalElementsIndex: 27 | """Similar to LocalElement, on the same comp as mtriage is running. 
28 | Initialised with an array of arrays, where each inner array represents one element to be retrieved.""" 29 | 30 | def __init__(self, rows=[]): 31 | self.rows = rows 32 | 33 | 34 | Pth = TypeVar("Pth", str, Path) 35 | Function = type(lambda _: None) 36 | 37 | 38 | class Et: 39 | def __init__(self, name, filter_func, is_array=False): 40 | self.id = name 41 | self.filter_func = filter_func 42 | self.is_array = is_array 43 | 44 | def __repr__(self): 45 | ia = self.is_array 46 | return f"{'Array(' if ia else ''}{self.id.capitalize()}{')' if ia else ''}" 47 | 48 | def __str__(self): 49 | return self.__repr__() 50 | 51 | def __get_etype(self): 52 | for etype in Etype: 53 | if self.name == etype.name: 54 | return etype 55 | return None 56 | 57 | def __call__( 58 | self, el_id: str, paths: _Union[Pth, List[Pth]], is_array=False 59 | ) -> LocalElement: 60 | if isinstance(paths, (str, Path)): 61 | paths = [paths] 62 | else: 63 | paths = [Path(x) if isinstance(x, str) else x for x in paths] 64 | paths = self.filter(paths) 65 | 66 | # NOTE: a bit convoluted. Only do an array check if etype is not custom, 67 | # as custom etypes could have more sophisticated expressions than core 68 | # types. TODO: make more elegant. 69 | is_custom = self.id in [x.__name__ for x in get_custom_etypes()] 70 | if not is_custom: 71 | if len(paths) == 0 or ( 72 | self.id != "Any" 73 | and not (is_array or self.is_array) 74 | and (len(paths) != 1 or not paths[0].is_file()) 75 | ): 76 | raise EtypeCastError(self) 77 | 78 | # TODO: confirm all source files exist 79 | this_cls = deepcopy(self) 80 | if this_cls.is_array: 81 | this_cls.is_array = True 82 | return LocalElement(paths=paths, id=el_id, et=this_cls) 83 | 84 | def filter(self, ls): 85 | """ Exists to be overwritten, `filter_func` is just the fallback. """ 86 | return self.filter_func(ls) 87 | 88 | def __eq__(self, other): 89 | return all( 90 | [ 91 | isinstance(other, Et), 92 | self.id == other.id, 93 | self.is_array == other.is_array, 94 | ] 95 | ) 96 | 97 | def __lt__(self, other): 98 | return self.id < other.id 99 | 100 | def as_array(self): 101 | return Et(self.id, self.filter, is_array=True) 102 | 103 | def array(self): 104 | return self.as_array() 105 | 106 | @property 107 | def is_union(self): 108 | return False 109 | 110 | 111 | class UnionEt(Et): 112 | """ A higher order Etype that allows the additive composition of Ets. """ 113 | 114 | def __init__(self, *ets): 115 | self.ets = ets 116 | super().__init__(self, str(self), is_array=False) 117 | 118 | def __repr__(self): 119 | inner = "" 120 | for et in self.ets: 121 | inner += f"{et}, " 122 | inner = inner[:-2] 123 | 124 | return f"Union({inner})" 125 | 126 | def __eq__(self, other): 127 | return all([x == y for x, y in zip(sorted(self.ets), sorted(other.ets))]) 128 | 129 | def __call__(self, el_id: str, paths: _Union[Pth, List[Pth]]) -> LocalElement: 130 | 131 | self.ets[1](el_id, paths) 132 | ets = [T(el_id, paths) for T in self.ets] 133 | 134 | all_paths = [] 135 | 136 | for et in ets: 137 | all_paths += et.paths 138 | return LocalElement(paths=all_paths, id=el_id, et=self) 139 | 140 | @property 141 | def is_union(self): 142 | return True 143 | 144 | 145 | def class_as_et(class_obj): 146 | return class_obj(class_obj.__name__, class_obj.filter) 147 | # TODO: get across all custom methods somehow... 
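# Illustrative usage of the Et machinery in this module (a sketch; paths and
# element ids are hypothetical, see src/test/test_etypes.py for the tested
# behaviour):
#
#   img  = Etype.Image("el1", "/tmp/1.png")                         # single file
#   imgs = Array(Etype.Image)("el1", ["/tmp/1.png", "/tmp/2.jpg"])  # many files
#   mix  = Union(Etype.Image, Etype.Audio)("el1", ["/tmp/1.png", "/tmp/1.mp3"])
#   auto = Etype.cast("el1", ["/tmp/1.png", "/tmp/1.mp3"])          # implicit Union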
148 | 149 | 150 | def fglob(ps, exts): 151 | return [p for p in ps if p.suffix.lower() in exts] 152 | 153 | 154 | def all_etypes(): 155 | base = [x for x in dir(Etype) if not x.startswith("_") and x != "cast"] 156 | custom = get_custom_etypes() 157 | 158 | for t in base: 159 | yield getattr(Etype, t) 160 | for t in custom: 161 | yield t(t.__name__, t.filter) 162 | 163 | 164 | def cast(el_id, paths: _Union[List[Pth], Pth], to: Et = None) -> LocalElement: 165 | if isinstance(paths, (Path, str)): 166 | paths = [paths] 167 | # NB: cast even at the expense of losing some paths if explicit ET is provided 168 | if to is not None: 169 | return to(el_id, paths=paths) 170 | # implicit cast to the most inclusive type 171 | valid = [] 172 | if len(paths) == 0: 173 | raise EtypeCastError("Paths cannot be empty.") 174 | 175 | for et in all_etypes(): 176 | if et.id == "Any": 177 | continue 178 | try: 179 | # if both array and singular casts are valid, precedence given to singular 180 | et(el_id, paths=paths, is_array=True) 181 | v = Array(et) 182 | try: 183 | et(el_id, paths=paths) 184 | v = et 185 | except: 186 | pass 187 | valid.append(v) 188 | except EtypeCastError: 189 | pass 190 | 191 | if len(valid) == 0: 192 | return Etype.Any(el_id, paths) 193 | elif len(valid) == 1: 194 | return valid[0](el_id, paths) 195 | else: 196 | # multiple valid types, return a union 197 | etyped_paths = reduce(lambda a, b: a + b(el_id, paths).paths, valid, []) 198 | if len(etyped_paths) != len(paths): 199 | return Etype.Any(el_id, paths) 200 | return Union(*valid)(el_id, paths) 201 | 202 | 203 | class Etype: 204 | Any = Et("Any", lambda ps: ps) 205 | Image = Et("Image", lambda ps: fglob(ps, [".bmp", ".jpg", ".jpeg", ".png"])) 206 | Video = Et("Video", lambda ps: fglob(ps, [".mp4", ".mov"])) 207 | Audio = Et("Audio", lambda ps: fglob(ps, [".mp3", ".wav", ".m4a", ".aac"])) 208 | Json = Et("Json", lambda ps: fglob(ps, [".json"])) 209 | 210 | 211 | Etype.cast = cast 212 | # make custom etypes available on Etype 213 | for t in get_custom_etypes(): 214 | setattr(Etype, t.__name__, t(t.__name__, t.filter)) 215 | Union = UnionEt 216 | Array = lambda x: x.as_array() 217 | Index = LocalElementsIndex 218 | -------------------------------------------------------------------------------- /src/lib/common/exceptions.py: -------------------------------------------------------------------------------- 1 | class SelectorNotFoundError(Exception): 2 | def __init__(self, selector): 3 | super().__init__( 4 | f"""Could not find a valid selector named '{selector}'. Ensure that you have a folder named '{selector}' 5 | in the selectors directory, and that it exports a valid Selector.""" 6 | ) 7 | 8 | 9 | class AnalyserNotFoundError(Exception): 10 | def __init__(self, analyser): 11 | super().__init__( 12 | f"""Could not find a valid analyser named '{analyser}'. 
Ensure that you have a folder named '{analyser}' 13 | in the analysers directory, and that it exports a valid Analyser.""" 14 | ) 15 | 16 | 17 | class WorkingDirectorNotFoundError(Exception): 18 | def __init__(self, workdir): 19 | super().__init__( 20 | f"""The working directory path that you specified, '{workdir}', does not exist or is otherwise corrupted.""" 21 | ) 22 | 23 | 24 | class InvalidPhaseError(Exception): 25 | def __init__(self): 26 | super().__init__("The 'phase' argument must be either 'select' or 'analyse'.") 27 | 28 | 29 | class InvalidAnalyserConfigError(Exception): 30 | def __init__(self, msg): 31 | super().__init__(f"Invalid analyser config - {msg}") 32 | 33 | 34 | class InvalidSelectorConfigError(Exception): 35 | def __init__(self, msg): 36 | super().__init__(f"Invalid selector config - {msg}") 37 | 38 | 39 | class InvalidYamlError(Exception): 40 | def __init__(self, msg): 41 | super().__init__(f"Invalid YAML - {msg}") 42 | 43 | 44 | class ElementShouldSkipError(Exception): 45 | def __init__(self, msg): 46 | super().__init__(f"{msg} - skipping element") 47 | 48 | 49 | class ElementShouldRetryError(Exception): 50 | def __init__(self, msg): 51 | super().__init__(f"{msg} - attempt retry") 52 | 53 | 54 | class SelectorIndexError(Exception): 55 | def __init__(self, msg): 56 | super().__init__(f"Selector index failed - {msg}") 57 | 58 | 59 | class ImproperLoggedPhaseError(Exception): 60 | def __init__(self, fname): 61 | super().__init__( 62 | f"""The method '{fname}' does not belong to a class that inherits from MTModule. The 63 | phase decorator can only be applied to methods on such a class.""" 64 | ) 65 | 66 | 67 | class BatchedPhaseArgNotGenerator(Exception): 68 | def __init__(self, fname): 69 | super().__init__( 70 | f"""The method '{fname}' cannot be batched. The 'batched_phase' decorator can only be applied to a function that takes a generator as its first and only argument. """ 71 | ) 72 | 73 | 74 | class MTriageStorageCorruptedError(Exception): 75 | def __init__(self, fname): 76 | super().__init__( 77 | "MTriage encountered an unexpected file structure in selectors or analysers. Ensure you specified the correct working directory." 78 | ) 79 | 80 | 81 | class EtypeCastError(Exception): 82 | def __init__(self, msg): 83 | super().__init__(f"Could not cast element as {msg}") 84 | 85 | 86 | class InvalidElementsIn(Exception): 87 | def __init__(self, comp, msg): 88 | super().__init__(f"The elements_in '{comp}' is not valid. {msg}") 89 | 90 | 91 | class InvalidAnalyserElements(Exception): 92 | pass 93 | 94 | 95 | class InvalidCarry(Exception): 96 | def __init__(self, msg): 97 | super().__init__(f"The 'carry' attribute you provided is invalid: {msg}") 98 | 99 | 100 | class InvalidElementIndex(Exception): 101 | def __init__(self): 102 | super().__init__( 103 | f"""The element index read from disk is an invalid generator. 
Check that your index method is 104 | correct, and that your disk has not been corrupted.""" 105 | ) 106 | 107 | 108 | class InvalidStorageQuery(Exception): 109 | def __init__(self, query, msg): 110 | super().__init__(f"The query '{query}' is invalid: {msg}") 111 | -------------------------------------------------------------------------------- /src/lib/common/get.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from importlib import import_module 3 | from lib.common.util import files 4 | 5 | 6 | def get_module(_from, key): 7 | """Dynamically loads in all analysers from the analysers folder, generating a dictionary in which the folder name 8 | is the key, and the export from 'main' is the value. 9 | """ 10 | if _from == "select": 11 | module_folder = f"lib.selectors" 12 | elif _from == "analyse": 13 | module_folder = f"lib.analysers" 14 | else: 15 | raise ImportError("The phase argument must be either 'select' or 'analyse'") 16 | 17 | pth = f"{module_folder}.{key}.core" 18 | mod = import_module(pth) 19 | return mod.module 20 | 21 | 22 | def get_custom_etypes(): 23 | base_import = "lib.etypes" 24 | module_folder = Path("/mtriage/src/lib/etypes") 25 | all_etypes = [t.stem for t in files(module_folder)] 26 | imports = [f"{base_import}.{p}" for p in all_etypes] 27 | return [import_module(mod).etype for mod in imports] 28 | -------------------------------------------------------------------------------- /src/lib/common/selector.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from abc import abstractmethod 5 | from typing import Dict, Generator, Union, List 6 | from types import SimpleNamespace 7 | from lib.common.mtmodule import MTModule 8 | from lib.common.exceptions import ( 9 | InvalidElementIndex, 10 | ElementShouldRetryError, 11 | ElementShouldSkipError, 12 | EtypeCastError, 13 | ) 14 | from lib.common.etypes import LocalElement, LocalElementsIndex 15 | from lib.common.storage import Storage, LocalStorage 16 | from lib.common.util import MAX_CPUS 17 | 18 | 19 | class Selector(MTModule): 20 | """A Selector implements the indexing and retrieving of media for a platform or otherwise distinct space. 21 | 22 | 'index' and 'retrieve_element' are abstract methods that need to be defined on selectors. Other attributes and 23 | methods in the class should not have to be explicitly referenced by selectors, as all data necessary is passed in 24 | the arguments of exposed methods. 25 | """ 26 | 27 | def __init__(self, config, module, storage): 28 | super().__init__(config, module, storage=storage) 29 | 30 | @abstractmethod 31 | def index(self, config) -> LocalElementsIndex: 32 | """TODO: indicate the exact format this should output. 33 | Should populate a dataframe with the results, keep logs, and then call: 34 | self.index_complete(df, logs) 35 | 36 | REQUIRED: each result in the dataframe must contain an 'id' field containing 37 | a unique identifier for the element. 38 | 39 | NOTE: should be a relatively light pass that designates the space to be retrieved. 40 | No options for parallelisation, run on a single CPU. 41 | """ 42 | raise NotImplementedError 43 | 44 | @abstractmethod 45 | def retrieve_element(self, row: SimpleNamespace, config) -> LocalElement: 46 | """Retrieve takes a single row from LocalElementsIndex as an argument, which was produced by the 'index' 47 | method. Data that has already been retrieved will not be retrieved again. 
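        For instance, the Local selector's retrieve_element simply reads the 'path'
        field from each indexed row and returns Etype.Any(element.id, paths=[element.path]).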
The method should return 48 | a LocalElement, which mtriage will then persist to an instance of `Storage`.""" 49 | raise NotImplementedError 50 | 51 | # optionally implemented by child 52 | # both ELEMENT_DIR and config are implicitly available on self, but passed explicitily for convenience 53 | def pre_retrieve(self, config: Dict): 54 | pass 55 | 56 | def post_retrieve(self, config: Dict): 57 | pass 58 | 59 | @MTModule.phase("index") 60 | def start_indexing(self): 61 | element_map = self.index(self.config) 62 | if element_map is not None: 63 | self.disk.write_elements_index(self.name, element_map) 64 | 65 | def start_retrieving(self): 66 | self.logger( 67 | f"Running selection {'in parallel' if self.in_parallel else 'serially'}" 68 | ) 69 | 70 | self.__pre_retrieve() 71 | elements = self.disk.read_elements_index(self.name).rows 72 | if not self.in_parallel: 73 | try: 74 | elements = [e for e in elements] 75 | except: 76 | raise InvalidElementIndex() 77 | self.__retrieve(elements) 78 | self.__post_retrieve() 79 | self.disk.write_meta( 80 | self.name, 81 | { 82 | "etype": self.out_etype.__repr__(), 83 | "config": self.get_full_config(), 84 | "stage": {"name": self.name, "module": "selector"}, 85 | }, 86 | ) 87 | 88 | @MTModule.phase("pre-retrieve") 89 | def __pre_retrieve(self): 90 | self.pre_retrieve(self.config) 91 | 92 | @MTModule.phase("retrieve") 93 | def __retrieve(self, element_indices: Union[List, Generator]): 94 | for element_index in element_indices: 95 | self.__attempt_retrieve(5, element_index) 96 | self.disk.delete_local_on_write = False 97 | 98 | @MTModule.phase("post-retrieve") 99 | def __post_retrieve(self): 100 | self.post_retrieve(self.config) 101 | 102 | def __attempt_retrieve(self, attempts, element_index): 103 | try: 104 | new_element = self.retrieve_element(element_index, self.config) 105 | if new_element is None: 106 | return 107 | success = self.disk.write_element(self.name, new_element) 108 | if not success: 109 | raise ElementShouldRetryError("Unsuccessful storage") 110 | 111 | except ElementShouldSkipError as e: 112 | self.error_logger(str(e), element_index) 113 | except ElementShouldRetryError as e: 114 | self.error_logger(str(e), element_index) 115 | if attempts > 1: 116 | return self.__attempt_retrieve(attempts - 1, element_index) 117 | else: 118 | self.error_logger( 119 | "failed after maximum retries - skipping element", element_index 120 | ) 121 | # TODO: flag to turn this off during development should be passed during run 122 | except Exception as e: 123 | if self.is_dev(): 124 | raise e 125 | else: 126 | self.error_logger( 127 | "unknown exception raised - skipping element", element_index 128 | ) 129 | -------------------------------------------------------------------------------- /src/lib/common/util.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import multiprocessing 3 | from pathlib import Path 4 | from typing import List 5 | 6 | MAX_CPUS = multiprocessing.cpu_count() - 1 7 | 8 | 9 | def get_batch_size(ls_len): 10 | """ Determine the batch size for multiprocessing. 
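    For example, with MAX_CPUS == 7 (an 8-core machine) a list of 100 items
    gives a batch size of 100 // 8 == 12; lists shorter than MAX_CPUS are
    processed as a single batch.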
""" 11 | if ls_len >= MAX_CPUS: 12 | return ls_len // (MAX_CPUS + 1) 13 | # TODO: improve this heuristic for splitting up jobs 14 | return ls_len 15 | 16 | 17 | def batch(iterable, n=1): 18 | l = len(iterable) 19 | for ndx in range(0, l, n): 20 | yield iterable[ndx : min(ndx + n, l)] 21 | 22 | 23 | def serialize_dict(_dict): 24 | ret = "" 25 | for key in _dict: 26 | val = _dict[key] 27 | if isinstance(val, dict): 28 | ret += serialize_dict(val) 29 | else: 30 | ret += f"{key}{val}" 31 | return ret 32 | 33 | 34 | def hashdict(_dict): 35 | m = hashlib.md5() 36 | m.update(serialize_dict(_dict).encode("utf-8")) 37 | return m.hexdigest() 38 | 39 | 40 | def subdirs(path: Path) -> List[Path]: 41 | """ Return a list of Paths for subdirectories in a directory """ 42 | if path.is_dir(): 43 | return [f for f in path.iterdir() if f.is_dir()] 44 | else: 45 | return [] 46 | 47 | 48 | def files(path: Path) -> List[Path]: 49 | """ Return a list of Paths for files in a directory """ 50 | return [x for x in path.iterdir() if x.is_file()] 51 | -------------------------------------------------------------------------------- /src/lib/etypes/cvjson.py: -------------------------------------------------------------------------------- 1 | import json 2 | import ntpath 3 | from typing import List, Union 4 | from pathlib import Path 5 | from lib.common.etypes import Etype, Et, Pth 6 | from lib.common.exceptions import EtypeCastError 7 | 8 | TMP = Path("/tmp") 9 | IMG_SFXS = [".bmp", ".jpg", ".png", ".jpeg"] 10 | 11 | 12 | def deduce_frame_no(path): 13 | # TODO: error handling 14 | head, tail = ntpath.split(path) 15 | f = tail or ntpath.basename(head) 16 | num = f.split(".")[0] 17 | return int(num) 18 | 19 | 20 | def prepare_json(path): 21 | out = {} 22 | if path is not None: 23 | with open(path, "r") as f: 24 | f = json.load(f) 25 | out["title"] = f["title"] 26 | out["description"] = f["description"] 27 | out["webpage_url"] = f["webpage_url"] 28 | out["duration"] = f["duration"] 29 | out["upload_date"] = f["upload_date"] 30 | return out 31 | 32 | 33 | class CvJson(Et): 34 | """A custom Etype for computer vision (CV) json files, representing 35 | predictions on a set of frames.""" 36 | 37 | def __repr__(self): 38 | return "CvJson" 39 | 40 | def filter(self, paths: Union[Pth, List[Pth]]) -> List[Pth]: 41 | if isinstance(paths, (str, Path)): 42 | paths = [paths] 43 | 44 | pths = [] 45 | json_count = 0 46 | for p in paths: 47 | if p.suffix in ".json" and p.name == "scores.json": 48 | pths.append(p) 49 | json_count += 1 50 | pths.append(p) if p.suffix in IMG_SFXS else None 51 | if json_count != 1: 52 | raise EtypeCastError(self) 53 | return pths 54 | 55 | @staticmethod 56 | def from_preds(element, get_preds): 57 | """ Generate an element containing classifier predictions in a format 58 | appropriate for CvJson, i.e. a single JSON file 'preds.json' that 59 | contains an object representing which classes are predicted for each 60 | frame. 61 | 62 | This function assumes that `element.paths` represents an array of images 63 | to be interpreted. The `get_preds` function operates on a single image, 64 | accepting one argument that is a path to an image. It returns a list of 65 | tuples `('classname', 0.8)`, where `'classname'` is a string 66 | representing the class predicted, and `0.8` is the normalized prediction 67 | probability between 0 and 1. See KerasPretrained/core.py in analysers 68 | for an example. 
""" 69 | imgs = [p for p in element.paths if p.suffix in IMG_SFXS] 70 | labels = {} 71 | for imp in imgs: 72 | frame_no, preds = deduce_frame_no(imp), get_preds(imp) 73 | for pred_label, pred_conf in preds: 74 | if pred_label in labels.keys(): 75 | labels[pred_label]["frames"].append(frame_no) 76 | labels[pred_label]["scores"].append(pred_conf) 77 | else: 78 | labels[pred_label] = {"frames": [frame_no], "scores": [pred_conf]} 79 | 80 | meta = [p for p in element.paths if p.suffix in ".json"] 81 | meta = meta[0] if len(meta) > 0 else None 82 | out = {**prepare_json(meta), "labels": labels} 83 | base = TMP / element.id 84 | base.mkdir(parents=True, exist_ok=True) 85 | outp = base / "preds.json" 86 | 87 | with open(outp, "w") as fp: 88 | json.dump(out, fp) 89 | 90 | return Etype.Json(element.id, outp) 91 | 92 | 93 | etype = CvJson 94 | -------------------------------------------------------------------------------- /src/lib/selectors/FourChan/boards.py: -------------------------------------------------------------------------------- 1 | viable_boards = [ 2 | "a", 3 | "aco", 4 | "adv", 5 | "an", 6 | "asp", 7 | "b", 8 | "bant", 9 | "biz", 10 | "c", 11 | "cgl", 12 | "ck", 13 | "cm", 14 | "co", 15 | "d", 16 | "diy", 17 | "e", 18 | "f", 19 | "fa", 20 | "fit", 21 | "g", 22 | "gd", 23 | "gif", 24 | "h", 25 | "hc", 26 | "his", 27 | "hm", 28 | "hr", 29 | "i", 30 | "ic", 31 | "int", 32 | "jp", 33 | "k", 34 | "lgbt", 35 | "lit", 36 | "m", 37 | "mlp", 38 | "mu", 39 | "n", 40 | "news", 41 | "o", 42 | "out", 43 | "p", 44 | "po", 45 | "pol", 46 | "qa", 47 | "qst", 48 | "r", 49 | "r9k", 50 | "s", 51 | "s4s", 52 | "sci", 53 | "soc", 54 | "sp", 55 | "t", 56 | "tg", 57 | "toy", 58 | "trash", 59 | "trv", 60 | "tv", 61 | "u", 62 | "v", 63 | "vg", 64 | "vip", 65 | "vp", 66 | "vr", 67 | "w", 68 | "wg", 69 | "wsg", 70 | "wsr", 71 | "x", 72 | "y", 73 | ] 74 | -------------------------------------------------------------------------------- /src/lib/selectors/FourChan/core.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import os 4 | import html2text 5 | from pathlib import Path 6 | from urllib.request import urlretrieve 7 | from lib.common.selector import Selector 8 | from lib.common.etypes import Etype, LocalElementsIndex 9 | from lib.common.util import files 10 | from lib.selectors.FourChan.boards import viable_boards 11 | 12 | TMP = Path("/tmp") 13 | 14 | 15 | class FourChan(Selector): 16 | """A selector that leverages the native 4chan API. 
17 | 18 | https://github.com/4chan/4chan-API 19 | """ 20 | 21 | def index(self, config): 22 | results = [] 23 | board = config["board"] 24 | if board not in viable_boards: 25 | self.error_logger("Your chosen board does not exist on 4chan!") 26 | quit() 27 | # Create a HTML parser for parsing comments 28 | h = html2text.HTML2Text() 29 | h.ignore_links = False 30 | 31 | req = f"https://a.4cdn.org/{board}/threads.json" 32 | 33 | content = json.loads(requests.get(req).content) 34 | for page_index, page in enumerate(content): 35 | self.logger(f"Scraping page number: {page_index+1}") 36 | for thread_index, threads in enumerate(page["threads"]): 37 | self.logger(f"Extracting posts from thread number: {thread_index+1}") 38 | thread_id = threads["no"] 39 | req = f"https://a.4cdn.org/{board}/thread/{thread_id}.json" 40 | thread_content = json.loads(requests.get(req).content)[ 41 | "posts" 42 | ] # thread content is a list of posts 43 | for post_index, post in enumerate(thread_content): 44 | self.logger( 45 | f"Extracting media and comments from post number: {post_index+1}" 46 | ) 47 | post_row = [] 48 | post_row.append(post["no"]) 49 | post_row.append(thread_id) 50 | post_row.append(post["time"]) 51 | 52 | try: 53 | comment = post["com"] 54 | except KeyError: 55 | comment = "..." 56 | else: 57 | comment = h.handle(comment) 58 | post_row.append(comment) 59 | 60 | # Filename 61 | try: 62 | filename = post["filename"] 63 | except KeyError: 64 | filename = "" 65 | 66 | if filename != "": 67 | time_id = post["tim"] 68 | extension = post["ext"] 69 | full_file = f"{filename}{extension}" 70 | file_url = f"https://i.4cdn.org/{board}/{time_id}{extension}" 71 | post_row.append(full_file) 72 | post_row.append(extension) 73 | post_row.append(file_url) 74 | elif filename == "": 75 | post_row.append("") 76 | post_row.append("") 77 | post_row.append("") 78 | results.append(post_row) 79 | self.logger("Scraping metadata complete") 80 | results.insert( 81 | 0, ["id", "thread_id", "datetime", "comment", "filename", "ext", "url"] 82 | ) 83 | return LocalElementsIndex(results) 84 | 85 | def retrieve_element(self, element, _): 86 | base = TMP / element.id 87 | base.mkdir(parents=True, exist_ok=True) 88 | 89 | fn = element.filename 90 | identifier = element.id 91 | comment = element.comment 92 | url = element.url 93 | 94 | with open(base / f"{identifier}_comment.txt", "w+") as f: 95 | f.write(comment) 96 | 97 | if url != "": 98 | urlretrieve(url, base / fn) 99 | 100 | return Etype.cast(element.id, files(base)) 101 | 102 | 103 | module = FourChan 104 | -------------------------------------------------------------------------------- /src/lib/selectors/FourChan/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Allows you to scrape text and media from 4chan 2 | args: 3 | - name: board 4 | desc: Numeric identifier for a specific board to scrape. If not specified all boards are scraped. 
5 | required: true 6 | input: string 7 | -------------------------------------------------------------------------------- /src/lib/selectors/FourChan/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | html2text -------------------------------------------------------------------------------- /src/lib/selectors/Local/core.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from shutil import copyfile 4 | from lib.common.selector import Selector 5 | from lib.common.etypes import Etype, Index 6 | from lib.common.exceptions import SelectorIndexError 7 | 8 | 9 | BASE = Path("/mtriage") 10 | 11 | 12 | class Local(Selector): 13 | """A simple selector for importing local files into mtriage. 14 | 15 | It recursively finds every file in a source_folder specified in the config 16 | (see example script 4.select_local.sh) and imports each file into its own 17 | element. The element ID is the file's name concatenated with its extension. 18 | 19 | n.b. the directory being imported must be located within the mtriage 20 | directory on the mtriage host to be accessible inside the docker container 21 | (the media folder is recommended). 22 | """ 23 | 24 | out_etype = Etype.Any 25 | 26 | def __init__(self, *args): 27 | super().__init__(*args) 28 | 29 | def is_aggregate(self): 30 | return "aggregate" in self.config and self.config["aggregate"] 31 | 32 | def index(self, config): 33 | src = Path(config["source"]) 34 | abs_src = BASE / src 35 | if not os.path.exists(abs_src): 36 | raise SelectorIndexError( 37 | f"The 'source' folder {src} could not be found. Ensure it is in the same directory asmtriage." 38 | ) 39 | return self._index(abs_src) 40 | 41 | def _index(self, abs_src): 42 | self.logger("Indexing local folder...") 43 | results = [["id", "path"]] 44 | excluded = self.config.get("exclude", []) 45 | for root, _, files in os.walk(abs_src): 46 | main = Path(abs_src) 47 | root = Path(root) 48 | for file in files: 49 | if file == ".mtbatch" or file in excluded: 50 | continue 51 | fp = root / file 52 | elid = root.name if (root.name != main.name) else fp.stem 53 | results.append([elid, fp]) 54 | self.logger(f"indexed file {fp} as: {elid}") 55 | if self.is_aggregate(): 56 | # `self.results` used in `retrieve_element` for paths. 57 | self.results = results[1:] 58 | # NB: hacky way to just make `retrieve_element` run just once.: 59 | return Index([["id"], ["IS_AGGREGATE"]]) 60 | return Index(results) 61 | 62 | def retrieve_element(self, element, config): 63 | if self.is_aggregate(): 64 | og_folder = Path(config["source"]) 65 | return Etype.Any(og_folder.name, paths=[x[1] for x in self.results]) 66 | else: 67 | return Etype.Any(element.id, paths=[element.path]) 68 | 69 | 70 | module = Local 71 | -------------------------------------------------------------------------------- /src/lib/selectors/Local/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Selects media from a path that already exists on the local filesystem. 2 | args: 3 | - name: source 4 | desc: The path to the source folder that represents the media space. Ensure that the path exists not only on the local filesystem, but also in the subsection that is mounted to Docker. The easiest way to ensure this is the case is to ensure that the 'source' is a subdirectory of one of the gitignored directories in mtriage, i.e. 'data'. 
5 | required: true 6 | input: folder 7 | - name: aggregate 8 | desc: Put all inside one element. Otherwise will create one element per separate file. 9 | required: false 10 | input: bool 11 | - name: exclude 12 | desc: files to exclude 13 | required: false 14 | input: list 15 | -------------------------------------------------------------------------------- /src/lib/selectors/Twitter/core.py: -------------------------------------------------------------------------------- 1 | import twint 2 | import json 3 | from urllib.request import urlretrieve 4 | from pathlib import Path 5 | from lib.common.selector import Selector 6 | from lib.common.etypes import Etype, LocalElementsIndex 7 | from lib.common.util import files 8 | from lib.util.twint import to_serializable 9 | 10 | TMP = Path("/tmp") 11 | 12 | 13 | class Twitter(Selector): 14 | """A selector for scraping tweets. 15 | 16 | It leverages 'twint' - https://github.com/twintproject/twint - under 17 | the hood. 18 | """ 19 | 20 | out_etype = Etype.Json 21 | 22 | def index(self, config): 23 | c = twint.Config() 24 | c.Search = config["search_term"] 25 | c.Since = config["uploaded_after"] 26 | c.Until = config["uploaded_before"] 27 | c.Show_hashtags = True 28 | c.Store_object = True 29 | 30 | twint.run.Search(c) 31 | 32 | tweets = to_serializable(twint.output.tweets_list, as_list=True) 33 | return LocalElementsIndex(tweets) 34 | 35 | def retrieve_element(self, element, _): 36 | base = TMP / element.id 37 | base.mkdir(parents=True, exist_ok=True) 38 | with open(base / "tweet.json", "w+") as fp: 39 | json.dump(element.__dict__, fp) 40 | 41 | # retrieve photos 42 | if "download_photos" in self.config and self.config.download_photos: 43 | photos = element.photos.split(",") 44 | if len(photos) < 1 or photos[0] == "": 45 | self.logger(f"{element.id} downloaded.") 46 | return Etype.cast(element.id, files(base)) 47 | 48 | for url in photos: 49 | fname = url.rsplit("/", 1)[-1] 50 | urlretrieve(url, base / fname) 51 | 52 | self.logger(f"{element.id} downloaded (with images).") 53 | 54 | if "download_videos" in self.config and self.config.download_videos: 55 | if hasattr(element, "video") and element.video != "": 56 | fname = element.video.rsplit("/", 1)[-1] 57 | urlretrieve(element.video, base / fname) 58 | 59 | self.disk.delete_local_on_write = True 60 | return Etype.cast(element.id, files(base)) 61 | 62 | 63 | module = Twitter 64 | -------------------------------------------------------------------------------- /src/lib/selectors/Twitter/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Search and download for Twitter using https://github.com/twintproject/twint. Only a proxy to scraping via time-constrained keyword search is implemented at present. 2 | args: 3 | - name: search_term 4 | desc: Searches for the term in the entire tweet. 5 | required: true 6 | input: string 7 | - name: uploaded_before 8 | desc: Only return tweets before this date. 9 | required: true 10 | input: date 11 | - name: uploaded_after 12 | desc: Only return tweets after this date. 13 | required: true 14 | input: date 15 | - name: download_photos 16 | required: false 17 | desc: set to True if the selector should download photos in tweets. False by default. 18 | input: boolean 19 | - name: download_videos 20 | required: false 21 | desc: set to True if the selector should download videos in tweets. False by default. 
22 | input: boolean 23 | 24 | -------------------------------------------------------------------------------- /src/lib/selectors/Twitter/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN pip install -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint 2 | RUN cd /mtriage/src/twint && python setup.py install 3 | -------------------------------------------------------------------------------- /src/lib/selectors/Youtube/core.py: -------------------------------------------------------------------------------- 1 | import yt_dlp 2 | import json 3 | import re 4 | import argparse, os, sys 5 | import math 6 | from subprocess import call, STDOUT 7 | from pathlib import Path 8 | from lib.common.selector import Selector 9 | from lib.common.etypes import Etype, Union, LocalElementsIndex 10 | from lib.common.util import files 11 | from lib.common.exceptions import ElementShouldSkipError 12 | 13 | from datetime import datetime, timedelta 14 | 15 | import googleapiclient.discovery 16 | from googleapiclient.errors import HttpError 17 | 18 | YOUTUBE_API_SERVICE_NAME = "youtube" 19 | YOUTUBE_API_VERSION = "v3" 20 | API_KEY = os.environ.get("GOOGLE_API_KEY") 21 | TMP = Path("/tmp") 22 | 23 | 24 | class Youtube(Selector): 25 | out_etype = Union(Etype.Json, Etype.Video) 26 | 27 | def index(self, _) -> LocalElementsIndex: 28 | results = self._run() 29 | if len(results) > 0: 30 | out = [] 31 | out.append(list(results[0].keys())) 32 | out.extend([x.values() for x in results]) 33 | return LocalElementsIndex(out) 34 | return None 35 | 36 | def pre_retrieve(self, _): 37 | self.ydl = yt_dlp.YoutubeDL( 38 | { 39 | "outtmpl": f"{TMP}/%(id)s/%(id)s.mp4", 40 | "format": "worstvideo[ext=mp4]", 41 | } 42 | ) 43 | 44 | def retrieve_element(self, element, _): 45 | with self.ydl: 46 | try: 47 | result = self.ydl.extract_info(element.url) 48 | meta = TMP / element.id / "meta.json" 49 | with open(meta, "w+") as fp: 50 | json.dump(result, fp) 51 | self.logger(f"{element.id}: video and meta downloaded successfully.") 52 | self.disk.delete_local_on_write = True 53 | return Etype.cast(element.id, files(TMP / element.id)) 54 | except yt_dlp.utils.DownloadError: 55 | raise ElementShouldSkipError( 56 | f"Something went wrong downloading {element.id}. It may have been deleted." 
57 | ) 58 | 59 | def _run(self): 60 | self.logger(f"Query: {self.config['search_term']}") 61 | if "uploaded_after" in self.config: 62 | self.logger(f"Start: {self.config['uploaded_after']}") 63 | 64 | if "uploaded_before" in self.config: 65 | self.logger(f"End: {self.config['uploaded_before']}") 66 | 67 | if self.config.get("daily"): 68 | results = [] 69 | self.logger( 70 | f"Scraping daily, from {self.config['uploaded_after']} -- {self.config['uploaded_before']}" 71 | ) 72 | self.logger("-----------------") 73 | for after, before in self._days_between( 74 | self.config["uploaded_after"], self.config["uploaded_before"] 75 | ): 76 | results = results + self.get_results(before, after) 77 | 78 | else: 79 | results = self.get_results( 80 | self.config.get("uploaded_before"), self.config.get("uploaded_after") 81 | ) 82 | 83 | self.logger("\n\n----------------") 84 | self.logger(f"Scrape successful, {len(results) - 1} results.") 85 | 86 | return results 87 | 88 | def get_results(self, before, after): 89 | args_obj = {"q": self.config["search_term"]} 90 | 91 | if before is not None: 92 | args_obj["before"] = self.config["uploaded_before"] 93 | if "uploaded_after" in self.config.keys(): 94 | args_obj["after"] = self.config["uploaded_after"] 95 | 96 | new_results = self._youtube_search_all_pages(args_obj) 97 | if new_results is None: 98 | raise Exception("Something went wrong") 99 | return new_results 100 | 101 | def _add_to_csv_obj(self, csv_obj, s_res): 102 | for search_result in s_res: 103 | videoId = search_result["id"]["videoId"] 104 | title = search_result["snippet"]["title"] 105 | channelId = search_result["snippet"]["channelId"] 106 | desc = search_result["snippet"]["description"] 107 | publishedAt = search_result["snippet"]["publishedAt"] 108 | url = f"https://www.youtube.com/watch?v={videoId}" 109 | id = self._id_from_url(url) 110 | csv_obj.append( 111 | { 112 | "url": url, 113 | "title": title.replace(",", ";"), 114 | "desc": desc.replace(",", ";"), 115 | "published": publishedAt[0:10], 116 | "id": id, 117 | } 118 | ) 119 | return csv_obj 120 | 121 | def _youtube_search_all_pages(self, args): 122 | csv_obj = [] 123 | self.logger( 124 | f"Search terms: {args['q']}\n Start: {args['after'] if 'after' in args else ''}\n End: {args['before'] if 'before' in args else ''}" 125 | ) 126 | try: 127 | s_res = self._youtube_search(args) 128 | count = 1 129 | while True: 130 | self.logger(f"\tScraping page {count}...") 131 | count += 1 132 | csv_obj = self._add_to_csv_obj(csv_obj, s_res.get("items", [])) 133 | 134 | if (not "nextPageToken" in s_res) or (len(s_res.get("items", [])) == 0): 135 | break 136 | 137 | s_res = self._youtube_search(args, pageToken=s_res["nextPageToken"]) 138 | self.logger("\tAll pages scraped.") 139 | return csv_obj 140 | except HttpError as e: 141 | self.logger(f"An HTTP error {e.resp.status} occured.") 142 | print(e.content) 143 | return None 144 | 145 | def _youtube_search(self, options, pageToken=None): 146 | # modified from https://github.com/youtube/api-samples/blob/master/python/search.py 147 | if API_KEY is None: 148 | raise ElementShouldSkipError("No GOOGLE_API_KEY specified in .env") 149 | youtube = googleapiclient.discovery.build( 150 | YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY 151 | ) 152 | 153 | theargs = { 154 | "pageToken": pageToken, 155 | "q": options["q"], 156 | "part": "id,snippet", 157 | "maxResults": 50, 158 | "safeSearch": "none", 159 | "type": "video", 160 | } 161 | 162 | if "before" in options: 163 | theargs["publishedBefore"] 
= options["before"] 164 | if "after" in options: 165 | theargs["publishedAfter"] = options["after"] 166 | 167 | request = youtube.search().list(**theargs) 168 | 169 | s = request.execute() 170 | 171 | return s 172 | 173 | def _days_between(self, start, end): 174 | bef = datetime.strptime(end[:-1], "%Y-%m-%dT%H:%M:%S") 175 | aft = datetime.strptime(start[:-1], "%Y-%m-%dT%H:%M:%S") 176 | between = (bef - aft).days 177 | return [ 178 | ( 179 | ((aft + timedelta(days=dt)).strftime("%Y-%m-%dT") + "00:00:00Z"), 180 | ((aft + timedelta(days=dt)).strftime("%Y-%m-%dT") + "23:59:59Z"), 181 | ) 182 | for dt in range(between) 183 | ] 184 | 185 | def _id_from_url(self, url): 186 | id_search = re.search( 187 | "https:\/\/www\.youtube\.com\/watch\?v\=(.*)", url, re.IGNORECASE 188 | ) 189 | if id: 190 | return id_search.group(1) 191 | return None 192 | 193 | 194 | module = Youtube 195 | -------------------------------------------------------------------------------- /src/lib/selectors/Youtube/info.yaml: -------------------------------------------------------------------------------- 1 | desc: Allows you to select a media space via Youtube 2 | args: 3 | - name: search_term 4 | desc: Plain string search query that is submitted to Youtube. 5 | required: true 6 | input: string 7 | - name: uploaded_before 8 | desc: Only return videos uploaded before this date. 9 | required: false 10 | input: date 11 | - name: uploaded_after 12 | desc: Only return videos uploaded after this date. 13 | required: false 14 | input: date 15 | - name: daily 16 | desc: Query the Youtube API N times with the given search terms, where N is the number of days between the 'uploaded_after' and 'uploaded_before' dates. This heuristic returns more results for a given search term, but can fail due to exhausting the API's daily quota. 
17 | required: false 18 | input: bool 19 | -------------------------------------------------------------------------------- /src/lib/selectors/Youtube/partial.Dockerfile: -------------------------------------------------------------------------------- 1 | RUN apt-get install -y --no-install-recommends libsm6 libxrender1 libfontconfig1 2 | 3 | RUN curl -sSL https://sdk.cloud.google.com | bash 4 | ENV PATH="$PATH:/root/google-cloud-sdk/bin" 5 | 6 | -------------------------------------------------------------------------------- /src/lib/selectors/Youtube/requirements.txt: -------------------------------------------------------------------------------- 1 | yt-dlp==2023.3.4 2 | 3 | google-api-core==1.11.0 4 | google-api-python-client==1.7.8 5 | google-auth==1.6.3 6 | google-auth-httplib2==0.0.3 7 | grpcio 8 | -------------------------------------------------------------------------------- /src/lib/util/cvjson.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import operator 4 | import re 5 | from typing import List 6 | from shutil import copyfile, rmtree 7 | from pathlib import Path 8 | from lib.common.etypes import Etype 9 | from functools import reduce 10 | 11 | WK_DIR = Path("/tmp/ranking") 12 | 13 | 14 | def open_json(fp): 15 | try: 16 | with open(fp, "r") as f: 17 | return json.load(f) 18 | except: 19 | return {} 20 | 21 | 22 | def render_frame(element, label, frame, score): 23 | return {"element": element, "frame": frame, "score": score, "label": label} 24 | 25 | 26 | def rank(elements: List, threshold=0.5, logger=print, element_id="__RANKING") -> Etype: 27 | ranking_data = {} 28 | 29 | for element in elements: 30 | jsons = [f for f in element.paths if f.suffix in ".json"] 31 | if len(jsons) != 1: 32 | continue 33 | 34 | jsonp = jsons[0] 35 | with open(jsonp, "r") as jsonf: 36 | data = json.load(jsonf) 37 | 38 | try: 39 | # TODO: this logic should be a custom etype built from a core etype class... 40 | # the core class can then include associated methods. 
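            # Expected shape of `data` here (as written by CvJson.from_preds and
            # the classifier analysers; the label name is illustrative):
            #   {"labels": {"tank": {"frames": [4, 12], "scores": [0.91, 0.66]}}}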
41 | labels = data["labels"] 42 | for label, preds in labels.items(): 43 | frames, scores = preds["frames"], preds["scores"] 44 | valid_frames = [ 45 | idx for idx, _ in enumerate(frames) if scores[idx] > threshold 46 | ] 47 | rank = len(valid_frames) 48 | if rank > 4: 49 | logger(f"label '{label}': rank {rank}") 50 | # gather all ranks in `ranking_data` 51 | if label not in ranking_data: 52 | ranking_data[label] = {} 53 | ranking_data[label][element.id] = rank 54 | 55 | # dpath = WK_DIR / f"{element.id}.json" 56 | logger(f"Rankings indexed for {element.id}.") 57 | 58 | except Exception as e: 59 | logger(f"Could not analyse {element.id}: {e}") 60 | 61 | ranking = {} 62 | for label, values in ranking_data.items(): 63 | s_vals = sorted(values.items(), key=operator.itemgetter(1)) 64 | s_vals.reverse() 65 | s_els = [t[0] for t in s_vals] 66 | ranking[label] = s_els 67 | 68 | file = WK_DIR / "rankings.json" 69 | logger("All rankings aggregated, printed to rankings.json") 70 | 71 | if not os.path.exists(WK_DIR): 72 | os.makedirs(WK_DIR) 73 | 74 | with open(file, "w") as f: 75 | json.dump(ranking, f) 76 | 77 | return Etype.Json(element_id, file) 78 | 79 | 80 | def flatten(elements: List, logger=print) -> Etype: 81 | """ 82 | 'Flatten' all predictions into a list, where each item is a positive frame: 83 | [ 84 | { "element": "xxxx", "frame": 1, "score": 0.2, "label": "tank" }, 85 | ] 86 | """ 87 | is_json = re.compile(r".*\.json") 88 | # NOTE: assumes there is always one .json in each element's `paths` 89 | all_preds = [ 90 | next(filter(is_json.match, [str(x) for x in x.paths])) for x in elements 91 | ] 92 | all_preds = [open_json(x) for x in all_preds] 93 | preds = [ 94 | x.get("labels") 95 | for x in all_preds 96 | if isinstance(x, dict) and x.get("labels") is not None 97 | ] 98 | 99 | vls = [ 100 | [(label, el_preds[label]) for label in el_preds.keys()] for el_preds in preds 101 | ] 102 | vls = [(x[0].id, x[1]) for x in zip(elements, vls)] 103 | label_in_els = [ 104 | (x[0], y[0], y[1]["frames"], y[1]["scores"]) for x in vls for y in x[1] 105 | ] 106 | frames = [ 107 | render_frame(x[0], x[1], y[0], y[1]) 108 | for x in label_in_els 109 | for y in zip(x[2], x[3]) 110 | ] 111 | 112 | output = WK_DIR / "flattened.json" 113 | 114 | if not os.path.exists(WK_DIR): 115 | os.makedirs(WK_DIR) 116 | 117 | with open(output, "w") as f: 118 | json.dump(frames, f) 119 | 120 | logger("All frames aggregated, printed to flattened.json") 121 | return Etype.Json("__FLATTENED", output) 122 | 123 | 124 | def generate_meta(elements: List, logger=print) -> Etype: 125 | """ Combine various metrics inside a single element """ 126 | a = flatten(elements, logger=logger) 127 | b = rank(elements, logger=logger) 128 | 129 | return Etype.Any("__META", a.paths + b.paths) 130 | -------------------------------------------------------------------------------- /src/lib/util/twint.py: -------------------------------------------------------------------------------- 1 | LABELS = [ 2 | "id", 3 | "conversation_id", 4 | "datestamp", 5 | "timestamp", 6 | "timezone", 7 | "user_id", 8 | "username", 9 | "name", 10 | "place", 11 | "tweet", 12 | "mentions", 13 | "urls", 14 | "photos", 15 | "replies_count", 16 | "retweets_count", 17 | "likes_count", 18 | "hashtags", 19 | "cashtags", 20 | "link", 21 | "retweet", 22 | "quote_url", 23 | "video", 24 | "user_rt_id", 25 | "near", 26 | "geo", 27 | "source", 28 | "retweet_date", 29 | ] 30 | 31 | 32 | def pythonize(t): 33 | """ Make valid fields ints, essentially deserialize """ 34 | t["retweet"] = 
True if t["retweet"] == "True" else False 35 | t["likes_count"] = int(t["likes_count"]) 36 | t["replies_count"] = int(t["replies_count"]) 37 | t["retweets_count"] = int(t["retweets_count"]) 38 | t["photos"] = t["photos"].split(",") 39 | t["hashtags"] = t["hashtags"].split(",") 40 | t["urls"] = t["urls"].split(",") 41 | return t 42 | 43 | 44 | def attr_is_list(attr): 45 | return attr.strip() in [ 46 | "photos", 47 | "mentions", 48 | "urls", 49 | "mentions", 50 | "hashtags", 51 | "cashtags", 52 | ] 53 | 54 | 55 | def jsont(t, as_list): 56 | """ return all fields in a JSON-serializable way """ 57 | if not as_list: 58 | return { 59 | l: ",".join(getattr(t, l)) if attr_is_list(l) else getattr(t, l) 60 | for l in LABELS 61 | } 62 | else: 63 | td = t.__dict__ 64 | out = [] 65 | for l in LABELS: 66 | if attr_is_list(l): 67 | out.append(",".join(td[l])) 68 | else: 69 | out.append(td[l]) 70 | return out 71 | 72 | 73 | def to_serializable(tweets, as_list=False): 74 | vls = [jsont(t, as_list) for t in tweets] 75 | if as_list: 76 | vls.insert(0, LABELS) 77 | return vls 78 | -------------------------------------------------------------------------------- /src/run.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """The entry point for mtriage. 3 | 4 | Orchestrates selectors and analysers via CLI parameters. 5 | 6 | Modules: 7 | Each module corresponds to a web platform API, or some equivalent method 8 | of programmatic retrieval. 9 | 10 | TODO: document where to find selector and analyser design docs. 11 | Attributes: 12 | module (str): Indicates the platform or source from which media should be 13 | analysed. The code that implements is module is self-contained to a 14 | folder here in the 'select' folder. 15 | config (dict of str: str): Hyperparameters that refine the analyse space. 16 | These parameters are module-specific (although the aim is to create as 17 | consistent as possible a parameter language across modules). 18 | folder (str): The path to the directory where the data that is indexed 19 | during the SELECT pass will be saved. This directory serves as a kind of 20 | "working directory" during the SAMPLE and ANALYSE passes, in the sense 21 | that all generated data is saved in this directory. The directory also 22 | contains logs, and represents the 'saved state' of a media triage 23 | analysis. 
24 | 25 | """ 26 | import os 27 | import yaml 28 | from validate import validate_yaml 29 | from lib.common.get import get_module 30 | from lib.common.storage import LocalStorage 31 | 32 | CONFIG_PATH = "/run_args.yaml" 33 | 34 | 35 | def make_storage(cfg: dict) -> LocalStorage: 36 | # TODO: generalise `folder` here to a `storage` var that is passed from YAML 37 | return LocalStorage(folder=cfg["folder"]) 38 | 39 | 40 | def _run_analyser(ana: dict, base_cfg: dict, cfg: dict): 41 | # run a single analyser 42 | Analyser = get_module("analyse", ana["name"]) 43 | analyser = Analyser( 44 | {**ana["config"], **base_cfg} if "config" in ana.keys() else base_cfg, 45 | ana["name"], 46 | make_storage(cfg), 47 | ) 48 | analyser.start_analysing() 49 | 50 | 51 | def _run_yaml(): 52 | with open(CONFIG_PATH, "r") as c: 53 | cfg = yaml.safe_load(c) 54 | 55 | validate_yaml(cfg) 56 | 57 | base_cfg = {} 58 | if "select" not in cfg and "elements_in" in cfg: 59 | base_cfg["elements_in"] = cfg["elements_in"] 60 | sel = None 61 | else: 62 | # run select 63 | sel = cfg["select"] 64 | Selector = get_module("select", sel["name"]) 65 | selector = Selector( 66 | sel["config"] if "config" in sel.keys() else {}, 67 | sel["name"], 68 | make_storage(cfg), 69 | ) 70 | selector.start_indexing() 71 | selector.start_retrieving() 72 | base_cfg["elements_in"] = [sel["name"]] 73 | 74 | if "analyse" not in cfg: 75 | return 76 | 77 | analyse_phase = cfg["analyse"] 78 | 79 | if isinstance(analyse_phase, dict): 80 | _run_analyser(analyse_phase, base_cfg, cfg) 81 | 82 | else: 83 | for ana in analyse_phase: 84 | _run_analyser(ana, base_cfg, cfg) 85 | if sel is None: 86 | # take the selector from elements in 87 | fst = cfg["elements_in"][0] 88 | sel = {"name": fst.split("/")[0]} 89 | base_cfg["elements_in"] = [f"{sel['name']}/{ana['name']}"] 90 | 91 | 92 | if __name__ == "__main__": 93 | _run_yaml() 94 | -------------------------------------------------------------------------------- /src/test/README.md: -------------------------------------------------------------------------------- 1 | # src tests 2 | 3 | In pytest. 4 | 5 | Note that all tests are run from the 'src' directory. 
Relative import paths should be specified accordingly: 6 | 7 | ```python 8 | from lib.common.analyser import Analyser 9 | ``` 10 | 11 | -------------------------------------------------------------------------------- /src/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/forensic-architecture/mtriage/7a841241518f831766767d6ddaa6320b8de4be98/src/test/__init__.py -------------------------------------------------------------------------------- /src/test/etype_stubs/image.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/forensic-architecture/mtriage/7a841241518f831766767d6ddaa6320b8de4be98/src/test/etype_stubs/image.jpeg -------------------------------------------------------------------------------- /src/test/test_analyser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import json 4 | from pathlib import Path 5 | from lib.common.analyser import Analyser 6 | from lib.common.exceptions import InvalidAnalyserElements, InvalidCarry 7 | from lib.common.etypes import Etype 8 | from lib.common.mtmodule import MTModule 9 | from lib.common.storage import LocalStorage 10 | 11 | 12 | class EmptyAnalyser(Analyser): 13 | out_etype = Etype.Any 14 | 15 | def analyse_element(self, element, config): 16 | raise Exception("is the user-defined func!") 17 | 18 | 19 | class TxtCopyAnalyser(Analyser): 20 | out_etype = Etype.Any 21 | 22 | def analyse_element(self, element, config): 23 | """ just copy over all media in 'any' """ 24 | for f in element.paths: 25 | # only copy over txt files 26 | if f.suffix != ".txt": 27 | return 28 | with open(f, "r") as reader: 29 | contents = reader.readlines() 30 | txt = Path("/tmp/copy.txt") 31 | with open(txt, "w+") as writer: 32 | writer.writelines(contents) 33 | 34 | element.paths = [txt] 35 | return element 36 | 37 | 38 | # TODO: test casting errors via an analyser with explicit etype 39 | @pytest.fixture 40 | def additionals(utils): 41 | obj = lambda: None 42 | obj.maxDiff = None 43 | obj.emptyAnalyserName = "empty" 44 | obj.WHITELIST = ["sel1/an1", "sel1/an2", "sel2"] 45 | obj.sel1 = "sel1" 46 | obj.sel2 = "sel2" 47 | obj.sel1_elements = ["el1", "el2"] 48 | obj.sel2_elements = ["el4", "el5", "el6"] 49 | 50 | utils.scaffold_empty(obj.sel1, elements=obj.sel1_elements, analysers=["an1", "an2"]) 51 | utils.scaffold_empty(obj.sel2, elements=obj.sel2_elements) 52 | os.rmdir(utils.get_element_path(obj.sel1, "el1", analyser="an2")) 53 | 54 | obj.config = {"elements_in": obj.WHITELIST, "dev": True} 55 | obj.emptyAnalyser = EmptyAnalyser( 56 | obj.config, 57 | obj.emptyAnalyserName, 58 | storage=LocalStorage(folder=utils.TEMP_ELEMENT_DIR), 59 | ) 60 | utils.setup() 61 | yield obj 62 | utils.cleanup() 63 | 64 | 65 | def test_selector_imports(): 66 | assert type(Analyser) == type(MTModule) 67 | 68 | 69 | def test_cannot_instantiate(utils): 70 | with pytest.raises(TypeError): 71 | Analyser({}, "empty", utils.TEMP_ELEMENT_DIR) 72 | 73 | 74 | def test_init(additionals): 75 | assert additionals.config == additionals.emptyAnalyser.config 76 | 77 | 78 | def test_analyse(utils, additionals): 79 | config = {"elements_in": ["sel1"]} 80 | dummyName = "dummyAnalyser" 81 | checkUserExceptionAnalyser = EmptyAnalyser( 82 | {**config, "dev": True}, "empty", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 83 | ) 84 | dummyAnalyser = TxtCopyAnalyser( 85 | config, dummyName, 
LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 86 | ) 87 | # TODO: work out whether this test is needed with the new format 88 | # test it calls the user-defined `analyse_element` 89 | # with pytest.raises(Exception, match="is the user-defined func!"): 90 | # checkUserExceptionAnalyser.start_analysing(in_parallel=False) 91 | # try again with a text el mocking selection completed 92 | # TODO: fix these tests- adding casting throws errors in some cases, as well as extra log. 93 | for el in additionals.sel1_elements: 94 | with open( 95 | f"{dummyAnalyser.disk.base_dir}/sel1/{dummyAnalyser.disk.RETRIEVED_EXT}/{el}/anitem.txt", 96 | "w+", 97 | ) as f: 98 | f.write("Hello") 99 | dummyAnalyser.start_analysing() 100 | # confirm txt has carried 101 | for el in additionals.sel1_elements: 102 | with open( 103 | f"{dummyAnalyser.disk.base_dir}/sel1/{dummyAnalyser.disk.ANALYSED_EXT}/{dummyName}/{el}/copy.txt", 104 | "r", 105 | ) as f: 106 | lines = f.readlines() 107 | assert len(lines) == 1 108 | assert lines[0] == "Hello" 109 | -------------------------------------------------------------------------------- /src/test/test_analyser_errors.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from lib.common.analyser import Analyser 4 | from test.test_analyser import EmptyAnalyser 5 | from lib.common.storage import LocalStorage 6 | from lib.common.etypes import Etype, LocalElement 7 | from lib.common.exceptions import ( 8 | ElementShouldRetryError, 9 | ElementShouldSkipError, 10 | InvalidAnalyserConfigError, 11 | MTriageStorageCorruptedError, 12 | InvalidAnalyserElements, 13 | ) 14 | 15 | 16 | class ErrorThrowingAnalyser(Analyser): 17 | out_etype = Etype.Any 18 | 19 | def __init__(self, *args): 20 | super().__init__(*args) 21 | self.retryCount = 0 22 | 23 | def analyse_element(self, element, config): 24 | if element.id == "skip": 25 | raise ElementShouldSkipError("test") 26 | elif element.id == "retry3" and self.retryCount < 3: 27 | self.retryCount += 1 28 | raise ElementShouldRetryError("test") 29 | elif element.id == "retryN": 30 | raise ElementShouldRetryError("test") 31 | else: 32 | pass 33 | 34 | 35 | @pytest.fixture 36 | def additionals(utils): 37 | obj = lambda: None 38 | obj.selname = "stub_sel" 39 | elements = ["skip", "retry3", "retryN", "pass"] 40 | utils.scaffold_empty(obj.selname, elements=elements) 41 | for element in elements: 42 | with open(f"{utils.get_element_path(obj.selname, element)}/out.txt", "w") as f: 43 | f.write("something") 44 | 45 | goodConfig = {"elements_in": [obj.selname], "dev": True} 46 | 47 | obj.an = ErrorThrowingAnalyser( 48 | goodConfig, "analyserErrorSelector", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 49 | ) 50 | yield obj 51 | utils.cleanup() 52 | 53 | 54 | def test_analyse_skip_error(additionals): 55 | with pytest.raises(ElementShouldSkipError, match="test - skipping element"): 56 | additionals.an.analyse_element(LocalElement(id="skip"), {}) 57 | 58 | 59 | def test_analyse_retry_error(additionals): 60 | with pytest.raises(ElementShouldRetryError, match="test - attempt retry"): 61 | additionals.an.analyse_element(LocalElement(id="retryN"), {}) 62 | 63 | 64 | def test_bad_init_error(utils): 65 | bad0 = {} 66 | bad1 = {"elements_in": []} 67 | bad2 = {"elements_in": None} 68 | good = {"elements_in": ["selname"]} 69 | 70 | with pytest.raises( 71 | InvalidAnalyserConfigError, 72 | match="must contain an 'elements_in' indicating the analyser's input", 73 | ): 74 | no_elements_in = ErrorThrowingAnalyser( 
75 | bad0, "stub", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 76 | ) 77 | 78 | with pytest.raises( 79 | InvalidAnalyserConfigError, 80 | match="The 'elements_in' must be a list containing at least one string", 81 | ): 82 | empty_elements_in = ErrorThrowingAnalyser( 83 | bad1, "stub", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 84 | ) 85 | 86 | with pytest.raises( 87 | InvalidAnalyserConfigError, 88 | match="The 'elements_in' must be a list containing at least one string", 89 | ): 90 | empty_elements_in = ErrorThrowingAnalyser( 91 | bad2, "stub", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 92 | ) 93 | 94 | with pytest.raises( 95 | InvalidAnalyserConfigError, match="You must provide a name for your analyser" 96 | ): 97 | badan2 = ErrorThrowingAnalyser( 98 | good, "", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 99 | ) 100 | 101 | 102 | def test_integration(utils, additionals): 103 | assert additionals.an.retryCount == 0 104 | 105 | additionals.an.start_analysing() 106 | 107 | skip_path = utils.get_element_path( 108 | additionals.selname, "skip", analyser=additionals.an.name 109 | ) 110 | assert not os.path.exists(skip_path) 111 | 112 | retryn_path = utils.get_element_path( 113 | additionals.selname, "retryN", analyser=additionals.an.name 114 | ) 115 | assert not os.path.exists(retryn_path) 116 | 117 | retry3_path = utils.get_element_path( 118 | additionals.selname, "retry3", analyser=additionals.an.name 119 | ) 120 | assert additionals.an.retryCount == 3 121 | 122 | 123 | def test_bad_whitelist(utils): 124 | badConfig = {"elements_in": ["sel1/an1/el1"]} 125 | badAn = EmptyAnalyser( 126 | badConfig, "whitelistErrorAnalyser", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 127 | ) 128 | with pytest.raises( 129 | InvalidAnalyserElements, match="'elements_in' you specified does not exist" 130 | ): 131 | badAn.start_analysing() 132 | -------------------------------------------------------------------------------- /src/test/test_etypes.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from types import SimpleNamespace as Ns 3 | from pathlib import Path 4 | from lib.common.etypes import Etype, Union, Array, all_etypes, cast 5 | from lib.etypes.cvjson import etype as CvJson 6 | from lib.common.exceptions import EtypeCastError 7 | from test import utils 8 | 9 | 10 | def write_stub(f): 11 | with open(f, "w+") as f: 12 | f.write("stub") 13 | 14 | 15 | @pytest.fixture 16 | def base(): 17 | obj = Ns() 18 | obj.id = "xasd123" 19 | obj.txt1 = Path("/tmp/1.txt") 20 | obj.scoresjson1 = Path("/tmp/scores.json") 21 | obj.json2 = Path("/tmp/not_scores.json") 22 | obj.md1 = Path("/tmp/1.md") 23 | obj.im1 = Path("/tmp/1.png") 24 | obj.im2 = Path("/tmp/2.jpg") 25 | obj.im3 = Path("/tmp/3.bmp") 26 | obj.aud1 = Path("/tmp/1.mp3") 27 | write_stub(obj.txt1) 28 | write_stub(obj.md1) 29 | write_stub(obj.im1) 30 | write_stub(obj.im2) 31 | write_stub(obj.im3) 32 | write_stub(obj.aud1) 33 | write_stub(obj.scoresjson1) 34 | write_stub(obj.json2) 35 | yield obj 36 | utils.cleanup() 37 | 38 | 39 | def test_etype_construction(base): 40 | # shouldn't be okay with empty 41 | for t in all_etypes(): 42 | with pytest.raises(EtypeCastError): 43 | assert t(base.id, []) 44 | 45 | 46 | def test_Any(base): 47 | e = Etype.Any(base.id, [base.txt1]) 48 | assert len(e.paths) == 1 49 | e = Etype.Any(base.id, [base.txt1, base.md1, base.im3]) 50 | assert len(e.paths) == 3 51 | 52 | 53 | def test_Image(base): 54 | # shouldn't accept one txt 55 | with pytest.raises(EtypeCastError): 56 | 
Etype.Image(base.id, ["/tmp/notafile.txt"]) 57 | 58 | # shouldn't accept an image that doesn't exist 59 | with pytest.raises(EtypeCastError): 60 | Etype.Image(base.id, ["/tmp/nonexistent_image.png"]) 61 | 62 | # shouldn't be okay with 2 valid images 63 | with pytest.raises(EtypeCastError): 64 | Etype.Image(base.id, [base.im1, base.im2]) 65 | 66 | # works with either single path or list 67 | im1 = Etype.Image(base.id, base.im1) 68 | assert len(im1.paths) == 1 69 | im1 = Etype.Image(base.id, [base.im1]) 70 | assert len(im1.paths) == 1 71 | im2 = Etype.Image(base.id, base.im2) 72 | assert len(im1.paths) == 1 73 | 74 | # filters out invalid files 75 | im1_filtered = Etype.Image(base.id, [base.im1, base.txt1]) 76 | assert len(im1.paths) == 1 77 | assert im1.paths[0] == base.im1 78 | 79 | 80 | def test_Array(base): 81 | ImArr = Array(Etype.Image) 82 | with pytest.raises(EtypeCastError): 83 | ImArr(base.id, []) 84 | 85 | with pytest.raises(EtypeCastError): 86 | ImArr(base.id, base.txt1) 87 | 88 | has1 = ImArr(base.id, base.im1) 89 | assert len(has1.paths) == 1 90 | has3 = ImArr(base.id, [base.im1, base.im2, base.im3]) 91 | assert len(has3.paths) == 3 92 | has2 = ImArr(base.id, [base.im1, base.md1, base.txt1, base.im3]) 93 | assert len(has2.paths) == 2 94 | 95 | 96 | def test_Union(base): 97 | ImAud = Union(Etype.Image, Etype.Audio) 98 | with pytest.raises(EtypeCastError): 99 | ImAud(base.id, []) 100 | with pytest.raises(EtypeCastError): 101 | ImAud(base.id, base.txt1) 102 | with pytest.raises(EtypeCastError): 103 | ImAud(base.id, base.im1) 104 | with pytest.raises(EtypeCastError): 105 | ImAud(base.id, base.aud1) 106 | 107 | has2 = ImAud(base.id, [base.aud1, base.im1]) 108 | assert len(has2.paths) == 2 109 | f2 = ImAud(base.id, [base.im3, base.md1, base.aud1]) 110 | assert len(f2.paths) == 2 111 | assert base.im3 in f2.paths 112 | assert base.aud1 in f2.paths 113 | 114 | 115 | def test_cast(base): 116 | # explicit cast 117 | with pytest.raises(EtypeCastError): 118 | cast(base.id, [], Etype.Image) 119 | with pytest.raises(EtypeCastError): 120 | cast(base.id, [base.txt1], Etype.Image) 121 | 122 | t1 = cast(base.id, [base.im1], to=Etype.Image) 123 | assert len(t1.paths) == 1 124 | assert t1.et == Etype.Image 125 | 126 | # implicit cast 127 | with pytest.raises(EtypeCastError): 128 | cast(base.id, []) 129 | 130 | i1 = cast(base.id, [base.im1]) 131 | assert len(i1.paths) == 1 132 | assert i1.et == Etype.Image 133 | i2 = cast(base.id, [base.im2]) 134 | assert len(i2.paths) == 1 135 | assert i2.et == Etype.Image 136 | 137 | ia1 = cast(base.id, [base.im1, base.im2]) 138 | assert len(ia1.paths) == 2 139 | assert ia1.et == Array(Etype.Image) 140 | 141 | a1 = cast(base.id, base.aud1) 142 | assert len(a1.paths) == 1 143 | assert a1.et == Etype.Audio 144 | 145 | # unions 146 | 147 | ai1 = cast(base.id, [base.im3, base.aud1]) 148 | assert len(ai1.paths) == 2 149 | assert ai1.et == Union(Etype.Image, Etype.Audio) 150 | 151 | ai2 = cast(base.id, [base.aud1, base.im2]) 152 | assert len(ai1.paths) == 2 153 | assert ai1.et == Union(Etype.Image, Etype.Audio) 154 | 155 | iaa1 = cast(base.id, [base.im1, base.im2, base.aud1]) 156 | assert len(iaa1.paths) == 3 157 | assert iaa1.et == Union(Array(Etype.Image), Etype.Audio) 158 | 159 | any1 = cast(base.id, [base.im1, base.im2, base.aud1, base.txt1]) 160 | assert len(any1.paths) == 4 161 | assert any1.et == Etype.Any 162 | 163 | 164 | def test_custom_etypes(base): 165 | all_ets = all_etypes() 166 | cvjson_et = CvJson(CvJson.__name__, CvJson.filter) 167 | assert cvjson_et in 
all_ets 168 | 169 | cvj1 = cvjson_et(base.id, [base.im1, base.im2, base.scoresjson1]) 170 | 171 | assert len(cvj1.paths) == 3 172 | assert cvj1.et == cvjson_et 173 | 174 | with pytest.raises(EtypeCastError): 175 | cvjson_et(base.id, [base.im1, base.im2]) 176 | # throws error when json is not named 'scores.json' (specified in 177 | # CvJson.filter). 178 | with pytest.raises(EtypeCastError): 179 | cvjson_et(base.id, [base.im1, base.json2]) 180 | -------------------------------------------------------------------------------- /src/test/test_get.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import shutil 3 | from os import listdir, makedirs 4 | from os.path import isdir 5 | from lib.common.get import get_module 6 | 7 | 8 | def make_empty_main_export(pth): 9 | INIT = "module = 1" 10 | with open(f"{pth}/core.py", "w") as f: 11 | f.write(INIT) 12 | 13 | 14 | @pytest.fixture 15 | def additionals(): 16 | obj = lambda: None 17 | """ Make imaginary selector and analysers """ 18 | # tests always run from src 19 | obj.EMPTY_SELECTOR = "./lib/selectors/empty" 20 | obj.EMPTY_ANALYSER = "./lib/analysers/empty" 21 | 22 | if isdir(obj.EMPTY_SELECTOR): 23 | shutil.rmtree(obj.EMPTY_SELECTOR) 24 | if isdir(obj.EMPTY_ANALYSER): 25 | shutil.rmtree(obj.EMPTY_ANALYSER) 26 | 27 | makedirs(obj.EMPTY_SELECTOR) 28 | make_empty_main_export(obj.EMPTY_SELECTOR) 29 | makedirs(obj.EMPTY_ANALYSER) 30 | make_empty_main_export(obj.EMPTY_ANALYSER) 31 | yield obj 32 | if isdir(obj.EMPTY_SELECTOR): 33 | shutil.rmtree(obj.EMPTY_SELECTOR) 34 | if isdir(obj.EMPTY_ANALYSER): 35 | shutil.rmtree(obj.EMPTY_ANALYSER) 36 | 37 | 38 | # NOTE: additionals added as arg to ensure fixture setup is run 39 | def test_raises_when_faulty(additionals): 40 | with pytest.raises(ModuleNotFoundError): 41 | get_module("select", "smth") 42 | 43 | with pytest.raises(ModuleNotFoundError): 44 | get_module("analyse", "smth") 45 | 46 | with pytest.raises(ImportError, match="must be either 'select' or 'analyse'"): 47 | get_module("neitherthing", "smth") 48 | 49 | 50 | def test_imports_main(additionals): 51 | # main just exported as 'True', to check import logic is correct 52 | assert get_module("select", "empty") 53 | assert get_module("analyse", "empty") 54 | -------------------------------------------------------------------------------- /src/test/test_infoyamls.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import yaml 3 | from os import listdir 4 | 5 | 6 | def is_valid_arg(arg): 7 | if "name" not in arg or not isinstance(arg["name"], str): 8 | return False 9 | if "required" not in arg or not isinstance(arg["required"], bool): 10 | return False 11 | # NOTE: not checking for 'input' or 'desc' attrs, considering them optional at this time. 
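    # As an illustrative sketch only (not taken from any real component), an
    # 'args' entry that satisfies this check could look like the following in
    # a component's info.yaml:
    #
    #   args:
    #     - name: search_term
    #       required: true
    #       desc: what the argument is for      # currently optional
    #       input: string                       # currently optional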
12 | return True 13 | 14 | 15 | @pytest.fixture 16 | def additionals(): 17 | obj = lambda: None 18 | obj.ALL_ANALYSERS = [x for x in listdir("lib/analysers") if x != "__deprecated"] 19 | obj.ALL_SELECTORS = [x for x in listdir("lib/selectors") if x != "__deprecated"] 20 | return obj 21 | 22 | 23 | def test_selectors(additionals, utils): 24 | # selector infos 25 | for sel in additionals.ALL_SELECTORS: 26 | with open(utils.get_info_path("selector", sel)) as f: 27 | info = yaml.safe_load(f) 28 | assert "desc" in info 29 | assert "args" in info 30 | assert isinstance(info["args"], list) 31 | for arg in info["args"]: 32 | assert is_valid_arg(arg) 33 | 34 | # analyser infos 35 | for ana in additionals.ALL_ANALYSERS: 36 | with open(utils.get_info_path("analyser", ana)) as f: 37 | info = yaml.safe_load(f) 38 | assert "desc" in info 39 | assert "args" in info 40 | assert isinstance(info["args"], list) 41 | for arg in info["args"]: 42 | assert is_valid_arg(arg) 43 | -------------------------------------------------------------------------------- /src/test/test_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_demo(): 5 | # TODO: test using the `local` selector, followed by simple analysers. 6 | pass 7 | -------------------------------------------------------------------------------- /src/test/test_localstorage.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | from pathlib import Path 4 | from lib.common.storage import LocalStorage 5 | 6 | 7 | @pytest.fixture 8 | def basic(utils): 9 | global base 10 | base = utils.TEMP_ELEMENT_DIR 11 | 12 | utils.scaffold_empty("Youtube", elements=["el1"], analysers=["Me"]) 13 | utils.setup() 14 | yield LocalStorage(folder=base) 15 | utils.cleanup() 16 | 17 | 18 | def test_core(basic): 19 | assert basic.base_dir == Path(base) 20 | 21 | 22 | def test_read_query(utils, basic): 23 | assert isinstance(basic.read_query("Youtube"), Path) 24 | assert basic.read_query("Youtube") == Path(f"{base}/Youtube/{basic.RETRIEVED_EXT}") 25 | assert basic.read_query("Youtube/Me") == Path( 26 | f"{base}/Youtube/{basic.ANALYSED_EXT}/Me" 27 | ) 28 | 29 | 30 | def test_read_all_media(utils, basic): 31 | cmpDict = { 32 | "Youtube": { 33 | f"{basic.RETRIEVED_EXT}": { 34 | "el1": f"{base}/Youtube/{basic.RETRIEVED_EXT}/el1", 35 | }, 36 | f"{basic.ANALYSED_EXT}": { 37 | "Me": { 38 | "el1": f"{base}/Youtube/{basic.ANALYSED_EXT}/Me/el1", 39 | }, 40 | }, 41 | }, 42 | } 43 | mediaDict = basic.read_all_media() 44 | assert utils.dictsEqual(cmpDict, mediaDict) 45 | 46 | 47 | def test_write_meta(basic): 48 | q = "Youtube/Me" 49 | og_data = {"some": "data"} 50 | basic.write_meta(q, og_data) 51 | with open(f"{basic.read_query(q)}/{basic._LocalStorage__META_FILE}", "r") as f: 52 | data = json.load(f) 53 | assert data.get("some") == "data" 54 | assert data.get("timestamp") is not None 55 | -------------------------------------------------------------------------------- /src/test/test_mtmodule.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from pathlib import Path 4 | from lib.common.exceptions import ImproperLoggedPhaseError 5 | from lib.common.mtmodule import MTModule 6 | from lib.common.storage import LocalStorage 7 | from test.utils import scaffold_empty 8 | 9 | 10 | class EmptyMTModule(MTModule): 11 | pass 12 | 13 | 14 | @pytest.fixture 15 | def additionals(utils): 16 | obj 
= lambda: None 17 | obj.BASE_DIR = utils.TEMP_ELEMENT_DIR 18 | obj.mod = EmptyMTModule({}, "empty", LocalStorage(folder=utils.TEMP_ELEMENT_DIR)) 19 | yield obj 20 | utils.cleanup() 21 | 22 | 23 | def test_class_variables(additionals): 24 | assert additionals.mod.name == "empty" 25 | assert additionals.mod.disk.base_dir == Path(additionals.BASE_DIR) 26 | assert additionals.mod._MTModule__LOGS == [] 27 | assert ( 28 | additionals.mod.disk._LocalStorage__LOGS_DIR == f"{additionals.BASE_DIR}/logs" 29 | ) 30 | assert ( 31 | additionals.mod.disk._LocalStorage__LOGS_FILE 32 | == f"{additionals.BASE_DIR}/logs/logs.txt" 33 | ) 34 | assert os.path.exists(f"{additionals.BASE_DIR}/logs") 35 | 36 | 37 | def test_phase_decorator(additionals): 38 | class BadClass: 39 | @MTModule.phase("somekey") 40 | def improper_func(self): 41 | pass 42 | 43 | class GoodClass(MTModule): 44 | @MTModule.phase("somekey") 45 | def proper_func(self): 46 | self.logger("we did something.") 47 | return "no error" 48 | 49 | # test that a decorated method carries through its return value 50 | gc = GoodClass({}, "my_good_mod", storage=LocalStorage(folder=additionals.BASE_DIR)) 51 | 52 | # test that a decorated method carries through its return value 53 | gc = GoodClass({}, "my_good_mod", storage=LocalStorage(folder=additionals.BASE_DIR)) 54 | assert gc.proper_func() == "no error" 55 | 56 | with open(f"{additionals.BASE_DIR}/logs/logs.txt", "r") as f: 57 | lines = f.readlines() 58 | assert len(lines) == 1 59 | assert lines[0] == "my_good_mod: somekey: we did something.\n" 60 | 61 | # check that logs were cleared after phase 62 | assert gc._MTModule__LOGS == [] 63 | 64 | 65 | def test_parallel_phase_decorator(additionals): 66 | class GoodClass(MTModule): 67 | @MTModule.phase("somekey") 68 | def func(self, gen): 69 | self.logger("This function only takes a generator of elements.") 70 | return "no error" 71 | 72 | @MTModule.phase("somekey", remove_db=False) 73 | def func_no_remove(self, gen): 74 | return "no error" 75 | 76 | @MTModule.phase("secondkey") 77 | def func_w_arg(self, gen, extra): 78 | self.logger(f"Running func with {list(gen)}, with extra arg {extra}.") 79 | return "no error" 80 | 81 | # test that a decorated method carries through its return value 82 | gc = GoodClass({}, "my_good_mod", storage=LocalStorage(folder=additionals.BASE_DIR)) 83 | 84 | # test parallel logs 85 | eg_gen = (a for a in range(0, 100)) 86 | assert gc.func(eg_gen) == "no error" 87 | 88 | with open(f"{additionals.BASE_DIR}/logs/logs.txt", "r") as f: 89 | lines = f.readlines() 90 | assert len(lines) == 100 91 | 92 | # test db file generation 93 | eg_gen = (a for a in range(0, 100)) 94 | assert gc.func_no_remove(eg_gen) == "no error" 95 | 96 | dbfile = f"{gc.disk.base_dir}/{gc.UNIQUE_ID}.db" 97 | with open(dbfile, "rb") as f: 98 | _bytes = f.read() 99 | assert len(_bytes) == 800 # 2 4-byte entries per item for 100 items 100 | 101 | os.remove(dbfile) 102 | 103 | # test that a function is resumed properly 104 | eg_gen = (a for a in range(0, 50)) 105 | assert gc.func_no_remove(eg_gen) == "no error" 106 | 107 | eg_gen = (a for a in range(0, 100)) 108 | assert gc.func(eg_gen) == "no error" 109 | 110 | with open(f"{additionals.BASE_DIR}/logs/logs.txt", "r") as f: 111 | lines = f.readlines() 112 | assert len(lines) == 150 113 | 114 | # test function with argument 115 | eg_gen = (a for a in range(0, 100)) 116 | assert gc.func_w_arg(eg_gen, 10) == "no error" 117 | -------------------------------------------------------------------------------- 
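The tests above exercise the `MTModule.phase` decorator and its per-phase logging. A minimal sketch of the pattern they rely on (illustrative only — the class and module names below are made up; the constructor call, log path and log format mirror what the assertions above expect):

```python
from lib.common.mtmodule import MTModule
from lib.common.storage import LocalStorage


class DemoModule(MTModule):
    @MTModule.phase("demo_phase")
    def do_work(self):
        # messages logged inside a phase are flushed to <folder>/logs/logs.txt
        # as "<module name>: <phase key>: <message>"
        self.logger("we did something.")
        return "no error"


mod = DemoModule({}, "demo_mod", storage=LocalStorage(folder="media/test_official"))
assert mod.do_work() == "no error"
```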
/src/test/test_run.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import yaml 4 | from run import validate_yaml 5 | from lib.common.exceptions import InvalidYamlError 6 | from test.utils import scaffold_empty, cleanup 7 | 8 | ARGS = "/run_args.yaml" 9 | BASELINE = {"folder": "media/test_official"} 10 | WITH_ELS = {**BASELINE, "elements_in": "sel1"} 11 | WITH_SELECT = { 12 | **BASELINE, 13 | "select": {"name": "Local", "config": {"source": "/a-folder"}}, 14 | } 15 | GOOD_ANALYSE_DICT = {**WITH_ELS, "analyse": {"name": "Frames"}} 16 | GOOD_SELECT_ANALYSE = { 17 | **WITH_SELECT, 18 | "analyse": [{"name": "Frames"}, {"name": "ImageDedup"}], 19 | } 20 | 21 | 22 | @pytest.fixture(autouse=True) 23 | def teardown(): 24 | yield None 25 | try: 26 | cleanup() 27 | os.remove(ARGS) 28 | except: 29 | pass 30 | 31 | 32 | def write(vl): 33 | with open(ARGS, "w") as c: 34 | yaml.dump(vl, c, default_flow_style=False) 35 | 36 | 37 | def validate(): 38 | with open(ARGS, "r") as c: 39 | cfg = yaml.safe_load(c) 40 | validate_yaml(cfg) 41 | 42 | 43 | def write_and_validate(config, regex): 44 | write(config) 45 | with pytest.raises(InvalidYamlError, match=regex): 46 | validate() 47 | 48 | 49 | def test_bad_yaml(): 50 | with open(ARGS, "w") as c: 51 | c.write('foo: "an escaped \\\' single quote"') 52 | 53 | with pytest.raises(yaml.YAMLError): 54 | validate() 55 | 56 | 57 | def test_validate_phase(): 58 | empty = {} 59 | bad_folder = {"folder": 1, "config": {}} 60 | good_folder = {"folder": "legit", "config": {}} 61 | 62 | write(empty) 63 | with pytest.raises( 64 | InvalidYamlError, match="The folder attribute must exist and be a string" 65 | ): 66 | validate() 67 | 68 | write(bad_folder) 69 | with pytest.raises( 70 | InvalidYamlError, match="The folder attribute must exist and be a string" 71 | ): 72 | validate() 73 | 74 | bad_phase = {**good_folder, "phase": "not a phase"} 75 | good_phase_select = {**good_folder, "phase": "select"} 76 | good_phase_analyse = {**good_folder, "phase": "analyse"} 77 | write(bad_phase) 78 | with pytest.raises( 79 | InvalidYamlError, match="specified a phase, you must specify a module" 80 | ): 81 | validate() 82 | 83 | bad_select_module = {**good_phase_select, "module": "not a selector"} 84 | bad_analyse_module = {**good_phase_analyse, "module": "not an analyser"} 85 | good_select_module = {**good_phase_select, "module": "Local"} 86 | write(bad_select_module) 87 | with pytest.raises( 88 | InvalidYamlError, match="No select module named 'not a selector'" 89 | ): 90 | validate() 91 | 92 | write(bad_analyse_module) 93 | with pytest.raises( 94 | InvalidYamlError, match="No analyse module named 'not an analyser'" 95 | ): 96 | validate() 97 | 98 | # the select module requires a 'source_folder' arg 99 | bad_local_config = {**good_select_module, "config": {}} 100 | bad_youtube_config = { 101 | **good_select_module, 102 | "module": "youtube", 103 | "config": {"uploaded_before": "212321"}, 104 | } 105 | good_youtube_config = { 106 | **good_select_module, 107 | "module": "youtube", 108 | "config": { 109 | "search_term": "a search term", 110 | "uploaded_before": "212321", 111 | "uploaded_after": "212321", 112 | }, 113 | } 114 | 115 | if os.path.exists("/mtriage/credentials/google.json"): 116 | write(good_select_module) 117 | with pytest.raises( 118 | InvalidYamlError, 119 | match="config you specified does not contain all the required arguments", 120 | ): 121 | validate() 122 | 123 | write(bad_local_config) 124 | with 
pytest.raises( 125 | InvalidYamlError, 126 | match="The config you specified does not contain all the required arguments for the 'Local' selecter.", 127 | ): 128 | validate() 129 | 130 | write(bad_youtube_config) 131 | with pytest.raises( 132 | InvalidYamlError, 133 | match="The config you specified does not contain all the required arguments for the 'youtube' selecter.", 134 | ): 135 | validate() 136 | 137 | write(good_youtube_config) 138 | validate() 139 | 140 | # should return True to indicate this is a single phase config, see 'validate_yaml' docstring for more info 141 | res = validate_yaml(good_youtube_config) 142 | assert res == True 143 | 144 | 145 | def test_validate(): 146 | write_and_validate(BASELINE, "specify either 'elements_in' or 'select'") 147 | 148 | write_and_validate(WITH_ELS, "at least one 'analyse' module must be specified") 149 | 150 | bad_analyse = {**WITH_ELS, "analyse": None} 151 | write_and_validate(bad_analyse, "must be a dict or list") 152 | 153 | bad_analyse_dict = {**WITH_ELS, "analyse": {}} 154 | write_and_validate(bad_analyse_dict, "containing at least a 'name' attribute") 155 | 156 | write(GOOD_ANALYSE_DICT) 157 | validate() 158 | 159 | write(GOOD_SELECT_ANALYSE) 160 | validate() 161 | 162 | 163 | def test_config_types(): 164 | validate_yaml(GOOD_ANALYSE_DICT) 165 | validate_yaml(GOOD_SELECT_ANALYSE) 166 | -------------------------------------------------------------------------------- /src/test/test_selector.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import csv 4 | from abc import ABC 5 | from pathlib import Path 6 | from lib.common.selector import Selector 7 | from lib.common.exceptions import ( 8 | ElementShouldRetryError, 9 | ElementShouldSkipError, 10 | SelectorIndexError, 11 | EtypeCastError, 12 | ) 13 | from lib.common.etypes import Etype, LocalElementsIndex 14 | from lib.common.storage import LocalStorage 15 | from test.utils import scaffold_elementmap, STUB_PATHS, list_files 16 | 17 | 18 | class EmptySelector(Selector): 19 | out_etype = Etype.Any 20 | 21 | def __init__(self, config, name, dr): 22 | super().__init__(config, name, dr) 23 | self.disk.delete_local_on_write = False 24 | 25 | def index(self, config): 26 | if not os.path.exists(self.disk.read_query(self.name)): 27 | df = scaffold_elementmap(["el1", "el2", "el3"]) 28 | 29 | df = [ 30 | x + [STUB_PATHS.imagejpg] if idx > 0 else (x + ["path"]) 31 | for idx, x in enumerate(df) 32 | ] 33 | return LocalElementsIndex(rows=df) 34 | else: 35 | return None 36 | 37 | def retrieve_element(self, row, config): 38 | return Etype.cast(row.id, row.path) 39 | 40 | 41 | @pytest.fixture 42 | def additionals(utils): 43 | obj = lambda: None 44 | obj.emptySelector = EmptySelector( 45 | {"dev": True}, "empty", LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 46 | ) 47 | utils.setup() 48 | yield obj 49 | utils.cleanup() 50 | 51 | 52 | def test_selector_imports(): 53 | assert type(Selector) == type(ABC) 54 | 55 | 56 | def test_cannot_instantiate(utils): 57 | with pytest.raises(TypeError): 58 | Selector({}, "empty", utils.TEMP_ELEMENT_DIR) 59 | 60 | 61 | def test_init(utils, additionals): 62 | assert Path(utils.TEMP_ELEMENT_DIR) == additionals.emptySelector.disk.base_dir 63 | assert "empty" == additionals.emptySelector.name 64 | 65 | 66 | def test_index(additionals): 67 | additionals.emptySelector.start_indexing() 68 | # test element_map.csv is what it should be 69 | eidx = additionals.emptySelector.disk.read_elements_index("empty") 70 | emap 
= scaffold_elementmap(["el1", "el2", "el3"]) 71 | for idx, row in enumerate(eidx.rows): 72 | assert row.id == emap[idx + 1][0] 73 | 74 | 75 | def test_retrieve(additionals, utils): 76 | additionals.emptySelector.start_indexing() 77 | additionals.emptySelector.start_retrieving() 78 | pth = additionals.emptySelector.disk.read_query("empty") 79 | images = [pth / f"{x}/image.jpeg" for x in ["el1", "el2", "el3"]] 80 | for img in images: 81 | assert os.path.isfile(img) 82 | 83 | 84 | # the values that are returned from retrieve need to be managed in Python differently according to what kind of data 85 | # they represent. 86 | # 87 | # Video -> cv2.VideoCapture 88 | # Image -> cv2.Image 89 | # Audio -> simpleaudio.WaveObject 90 | # Json -> dict 91 | 92 | # the relationship between files on disk and how they are loaded through Python should be managed in the etypes library. 93 | -------------------------------------------------------------------------------- /src/test/test_selector_errors.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from lib.common.selector import Selector 4 | from lib.common.storage import LocalStorage 5 | from lib.common.etypes import Etype, LocalElement, LocalElementsIndex 6 | from lib.common.exceptions import ( 7 | ElementShouldRetryError, 8 | ElementShouldSkipError, 9 | SelectorIndexError, 10 | EtypeCastError, 11 | ) 12 | from test.utils import scaffold_elementmap 13 | import pdb 14 | 15 | 16 | class BasicErrorSelector(Selector): 17 | out_etype = Etype.Any 18 | 19 | def __init__(self, *args): 20 | super().__init__(*args) 21 | self.retryCount = 0 22 | 23 | def index(self, config) -> LocalElementsIndex: 24 | error = config["error"] if "error" in config else "" 25 | if error == "index": 26 | raise SelectorIndexError("test") 27 | else: 28 | elements = ["skip", "retry3", "retryN", "pass"] 29 | return LocalElementsIndex(rows=scaffold_elementmap(elements)) 30 | 31 | def retrieve_element(self, element, config) -> LocalElement: 32 | if element.id == "skip": 33 | raise ElementShouldSkipError("test") 34 | elif element.id == "retry3" and self.retryCount < 3: 35 | self.retryCount += 1 36 | raise ElementShouldRetryError("test") 37 | elif element.id == "retryN": 38 | raise ElementShouldRetryError("test") 39 | else: 40 | return None 41 | 42 | 43 | class RetrieveErrorSelector(BasicErrorSelector): 44 | out_etype = Etype.Any 45 | 46 | def retrieve_element(self, element, config): 47 | super().retrieve_element(element, config) 48 | with open(f"{element['base']}/out.txt", "w") as f: 49 | f.write("something") 50 | 51 | 52 | class BadIndexSelector(Selector): 53 | out_etype = Etype.Any 54 | 55 | def index(self, config): 56 | # fails to return a dataframe 57 | pass 58 | 59 | def retrieve_element(self, element, config): 60 | pass 61 | 62 | 63 | @pytest.fixture 64 | def additionals(utils): 65 | obj = lambda: None 66 | indexModule = "indexErrorSelector" 67 | indexConfig = {"error": "index", "dev": True} 68 | obj.indexErrorSelector = BasicErrorSelector( 69 | indexConfig, indexModule, LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 70 | ) 71 | 72 | castModule = "castErrorSelector" 73 | castConfig = {"dev": True} 74 | obj.castErrorSelector = BasicErrorSelector( 75 | castConfig, castModule, LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 76 | ) 77 | 78 | retrieveModule = "retrieveErrorSelector" 79 | retrieveConfig = {"dev": True} 80 | obj.retrieveErrorSelector = RetrieveErrorSelector( 81 | retrieveConfig, retrieveModule, 
LocalStorage(folder=utils.TEMP_ELEMENT_DIR) 82 | ) 83 | yield obj 84 | utils.cleanup() 85 | 86 | 87 | def test_index_error(additionals): 88 | with pytest.raises(SelectorIndexError, match="Selector index failed - test"): 89 | additionals.indexErrorSelector.start_indexing() 90 | 91 | 92 | def test_retrieve_skip_error(additionals): 93 | with pytest.raises(ElementShouldSkipError, match="test - skipping element"): 94 | additionals.castErrorSelector.retrieve_element(LocalElement(id="skip"), {}) 95 | 96 | 97 | def test_retrieve_retry_error(additionals): 98 | with pytest.raises(ElementShouldRetryError, match="test - attempt retry"): 99 | additionals.castErrorSelector.retrieve_element(LocalElement(id="retryN"), {}) 100 | 101 | 102 | def test_integration_1(utils, additionals): 103 | assert additionals.castErrorSelector.retryCount == 0 104 | additionals.castErrorSelector.start_indexing() 105 | additionals.castErrorSelector.start_retrieving() 106 | 107 | skip_path = utils.get_element_path("castErrorSelector", "skip") 108 | assert not os.path.exists(skip_path) 109 | 110 | retryn_path = utils.get_element_path("castErrorSelector", "retryN") 111 | assert not os.path.exists(retryn_path) 112 | 113 | retry3_path = utils.get_element_path("castErrorSelector", "retry3") 114 | assert additionals.castErrorSelector.retryCount == 3 115 | assert not os.path.exists(retry3_path) 116 | 117 | pass_path = utils.get_element_path("castErrorSelector", "pass") 118 | assert not os.path.exists(pass_path) 119 | 120 | 121 | def integration_2(utils, additionals): 122 | additionals.retrieveErrorSelector.start_indexing() 123 | additionals.retrieveErrorSelector.start_retrieving(in_parallel=False) 124 | 125 | skip_path = utils.get_element_path("retrieveErrorSelector", "skip") 126 | assert not os.path.exists(skip_path) 127 | 128 | retryn_path = utils.get_element_path("retrieveErrorSelector", "retryN") 129 | assert not os.path.exists(retryn_path) 130 | 131 | retry3_path = utils.get_element_path("retrieveErrorSelector", "retry3") 132 | assert additionals.retrieveErrorSelector.retryCount == 3 133 | assert os.path.exists(retry3_path) 134 | 135 | pass_path = utils.get_element_path("retrieveErrorSelector", "pass") 136 | assert os.path.exists(pass_path) 137 | -------------------------------------------------------------------------------- /src/test/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | from types import SimpleNamespace as Ns 5 | from pathlib import Path 6 | from lib.common.storage import LocalStorage 7 | from lib.common.get import get_module 8 | 9 | TEMP_ELEMENT_DIR = "/mtriage/media/test_official" 10 | TMP_DIR = Path("/tmp") 11 | STUB_PATHS = Ns( 12 | imagejpg="/mtriage/src/test/etype_stubs/image.jpeg", 13 | ) 14 | 15 | 16 | def scaffold_empty( 17 | selector: str, elements: list = [], analysers: list = [], selector_txt=None 18 | ): 19 | """ 20 | Scaffold an mtriage folder. One folder per element in the elements list will be created in the TEMP_ELEMENT_DIR. 21 | If an analysers list is passed, mocks of derived elements will be created in the appropriate folders. 22 | Only a single selector should be passed, as derived elements are nested within a selector pass. To create multiple 23 | selector passes, call this function multiple times. 
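    For example, scaffold_empty("Youtube", elements=["el1"], analysers=["Me"]) creates
    {TEMP_ELEMENT_DIR}/Youtube/<RETRIEVED_EXT>/el1 and
    {TEMP_ELEMENT_DIR}/Youtube/<ANALYSED_EXT>/Me/el1, where the extensions are the
    LocalStorage constants used below.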
24 | """ 25 | derived_dir = f"{TEMP_ELEMENT_DIR}/{selector}/{LocalStorage.ANALYSED_EXT}" 26 | if not os.path.exists(derived_dir): 27 | os.makedirs(derived_dir) 28 | 29 | for element in elements: 30 | element_dir = ( 31 | f"{TEMP_ELEMENT_DIR}/{selector}/{LocalStorage.RETRIEVED_EXT}/{element}" 32 | ) 33 | if not os.path.exists(element_dir): 34 | os.makedirs(element_dir) 35 | if selector_txt is not None: 36 | with open(f"{element_dir}/item.txt", "a") as ftxt: 37 | ftxt.write(selector_txt) 38 | if len(analysers) > 0: 39 | for analyser in analysers: 40 | analyser_dir = f"{TEMP_ELEMENT_DIR}/{selector}/{LocalStorage.ANALYSED_EXT}/{analyser}/{element}" 41 | if not os.path.exists(analyser_dir): 42 | os.makedirs(analyser_dir) 43 | 44 | 45 | def get_element_path(selname, elementId, analyser=None): 46 | middle_insert = ( 47 | LocalStorage.RETRIEVED_EXT 48 | if analyser is None 49 | else f"{LocalStorage.ANALYSED_EXT}/{analyser}" 50 | ) 51 | return f"{TEMP_ELEMENT_DIR}/{selname}/{middle_insert}/{elementId}" 52 | 53 | 54 | def scaffold_elementmap(elements=[]): 55 | out = [[x] for x in elements] 56 | out.insert(0, ["id"]) 57 | return out 58 | 59 | 60 | def setup(): 61 | # to ensure that there isn't a read error 62 | with open("/run_args.yaml", "w") as f: 63 | json.dump({}, f) 64 | 65 | 66 | def cleanup(): 67 | if Path(TEMP_ELEMENT_DIR).exists(): 68 | shutil.rmtree(TEMP_ELEMENT_DIR) 69 | if TMP_DIR.exists(): 70 | shutil.rmtree(TMP_DIR) 71 | TMP_DIR.mkdir() 72 | 73 | 74 | def listOfDictsEqual(l1, l2): 75 | if len(l1) != len(l2): 76 | return False 77 | 78 | for d1, d2 in zip(l1, l2): 79 | if not dictsEqual(d1, d2): 80 | return False 81 | 82 | return True 83 | 84 | 85 | def dictsEqual(d1, d2): 86 | if len(d1.keys()) != len(d2.keys()): 87 | return False 88 | 89 | d1json = json.dumps(d1, sort_keys=True, default=str) 90 | d2json = json.dumps(d2, sort_keys=True, default=str) 91 | 92 | return d1json == d2json 93 | 94 | 95 | def get_info_path(kind, mod_name): 96 | return f"lib/{kind}s/{mod_name}/info.yaml" 97 | 98 | 99 | # https://stackoverflow.com/questions/9727673/list-directory-tree-structure-in-python 100 | def list_files(startpath): 101 | for root, dirs, files in os.walk(startpath): 102 | level = root.replace(startpath, "").count(os.sep) 103 | indent = " " * 4 * (level) 104 | print("{}{}/".format(indent, os.path.basename(root))) 105 | subindent = " " * 4 * (level + 1) 106 | for f in files: 107 | print("{}{}".format(subindent, f)) 108 | 109 | 110 | def ltemp(): 111 | """ Primarily for pdb debugging """ 112 | list_files(TEMP_ELEMENT_DIR) 113 | -------------------------------------------------------------------------------- /src/validate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import inspect 4 | from pathlib import Path 5 | from lib.common.exceptions import InvalidYamlError 6 | from lib.common.get import get_module 7 | 8 | 9 | def validate_module(phase: str, module: str, cfg: dict): 10 | try: 11 | mod = get_module(phase, module) 12 | except ModuleNotFoundError as e: 13 | raise InvalidYamlError(f"No {phase} module named '{module}'") 14 | 15 | # dynamically check all required args for module config exist 16 | sfolder = os.path.dirname(inspect.getfile(mod)) 17 | info = Path(sfolder) / "info.yaml" 18 | with open(info, "r") as f: 19 | options = yaml.safe_load(f) 20 | for option in options["args"]: 21 | if "config" not in cfg: 22 | cfg["config"] = {} 23 | if option["required"] is True and option["name"] not in cfg["config"].keys(): 24 | 
raise InvalidYamlError( 25 | f"The config you specified does not contain all the required arguments for the '{module}' {phase}er." 26 | ) 27 | 28 | 29 | def validate_name(cfg: dict): 30 | if "name" not in cfg.keys(): 31 | raise InvalidYamlError( 32 | "Each analyse component must be a dict containing at least a 'name' attribute." 33 | ) 34 | 35 | 36 | def validate_analyse(cfg: dict): 37 | if not isinstance(cfg, dict) and not isinstance(cfg, list): 38 | raise InvalidYamlError("The 'analyse' attribute must be a dict or list.") 39 | if isinstance(cfg, dict): 40 | validate_name(cfg) 41 | validate_module("analyse", cfg["name"], cfg) 42 | else: 43 | for _cfg in cfg: 44 | validate_name(_cfg) 45 | validate_module("analyse", _cfg["name"], _cfg) 46 | 47 | 48 | def validate_yaml(cfg: dict) -> bool: 49 | """ 50 | Confirms all values on YAML. Throws an appropriate exception if something's up. 51 | """ 52 | keys = cfg.keys() 53 | 54 | if "folder" not in keys or not isinstance(cfg["folder"], str): 55 | raise InvalidYamlError("The folder attribute must exist and be a string") 56 | 57 | if "phase" in keys or "module" in keys: 58 | # confirm good phase yaml 59 | if "module" not in keys: 60 | raise InvalidYamlError( 61 | "If you specified a phase, you must specify a module" 62 | ) 63 | if "phase" not in keys: 64 | raise InvalidYamlError( 65 | "If you specified a module, you must specify a phase" 66 | ) 67 | 68 | if "config" not in keys or not isinstance(cfg["config"], dict): 69 | raise InvalidYamlError("The 'config' attribute must exist.") 70 | 71 | if cfg["phase"] not in ["select", "analyse"]: 72 | raise InvalidYamlError( 73 | "The phase attribute must be either select or analyse" 74 | ) 75 | validate_module(cfg["phase"], cfg["module"], cfg) 76 | else: 77 | if "elements_in" not in keys and "select" not in keys: 78 | raise InvalidYamlError("You must specify either 'elements_in' or 'select'.") 79 | if "elements_in" in keys: 80 | # bypassing selector... 81 | if "analyse" not in keys: 82 | raise InvalidYamlError( 83 | "You have specified 'elements_in', and so at least one 'analyse' module must be specified." 84 | ) 85 | 86 | elif "select" in keys: 87 | # run select then analyse 88 | validate_name(cfg["select"]) 89 | validate_module("select", cfg["select"]["name"], cfg["select"]) 90 | 91 | if "analyse" in cfg: 92 | validate_analyse(cfg["analyse"]) 93 | -------------------------------------------------------------------------------- /test/test_build.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import csv 4 | import re 5 | from commands import parse_args, build, develop, clean, run_tests, run, DIR_PATH 6 | 7 | 8 | def get_tag_str(cmd, tag): 9 | """ 10 | Returns the string for a tag in a command, or 'None' if the tag doesn't exist. 
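    For example (illustrative), get_tag_str(["docker", "build", "-t", "img:dev"], "-t")
    returns "img:dev", and get_tag_str(["docker", "build"], "-t") returns None.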
11 | """ 12 | idx = 0 13 | while len(cmd) > idx and cmd[idx] != tag: 14 | idx += 1 15 | if idx <= len(cmd) - 1: 16 | return cmd[idx + 1] 17 | return None 18 | 19 | 20 | def get_volumes(cmd): 21 | idx = 0 22 | volumes = [] 23 | while len(cmd) - 1 > idx: 24 | if cmd[idx] == "-v": 25 | volumes.append(cmd[idx + 1]) 26 | idx += 1 27 | return volumes 28 | 29 | 30 | def dockerimage_tag_matches(cmd, expected): 31 | build_tag = get_tag_str(cmd, "-t") 32 | if build_tag: 33 | return build_tag == expected 34 | return False 35 | 36 | 37 | def builds_from_cpu_dockerfile(dfile): 38 | return "FROM ubuntu:18.04\n" in dfile 39 | 40 | 41 | def builds_from_gpu_dockerfile(dfile): 42 | return "FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04\n" in dfile 43 | 44 | 45 | def read_deps(component): 46 | pth = "src/lib/selectors/{}/requirements.txt".format(component) 47 | if not os.path.exists(pth): 48 | return [] 49 | with open(pth, "r") as f: 50 | return f.readlines() 51 | 52 | 53 | class TestBuild(unittest.TestCase): 54 | def setUp(self): 55 | # make test whitelist 56 | self.SELECTOR_WL = "selector_whitelist.txt" 57 | with open(self.SELECTOR_WL, "w") as f: 58 | writer = csv.writer(f) 59 | writer.writerows([["Youtube"], ["Local"]]) 60 | 61 | self.BLANK_WL = "blank_whitelist.txt" 62 | with open(self.BLANK_WL, "w") as f: 63 | writer = csv.writer(f) 64 | writer.writerows([[""]]) 65 | 66 | def tearDown(self): 67 | os.remove(self.SELECTOR_WL) 68 | os.remove(self.BLANK_WL) 69 | 70 | def test_default_build(self): 71 | args = parse_args(["dev", "build", "--dry"]) 72 | cmd, dfile, pipfile = build(args) 73 | self.assertTrue( 74 | dockerimage_tag_matches(cmd, "forensicarchitecture/mtriage:dev") 75 | ) 76 | self.assertTrue(builds_from_cpu_dockerfile(dfile)) 77 | 78 | def test_gpu_build(self): 79 | args = parse_args(["dev", "build", "--gpu", "--dry"]) 80 | cmd, dfile, pipfile = build(args) 81 | self.assertTrue(builds_from_gpu_dockerfile(dfile)) 82 | 83 | def test_whitelist(self): 84 | args = parse_args(["dev", "build", "--whitelist", self.BLANK_WL, "--dry"]) 85 | cmd, dfile, pipfile = build(args) 86 | with open("src/build/core.requirements.txt", "r") as f: 87 | core_deps = f.readlines() 88 | self.assertListEqual(core_deps, pipfile) 89 | 90 | args = parse_args(["dev", "build", "--whitelist", self.SELECTOR_WL, "--dry"]) 91 | cmd, dfile, pipfile = build(args) 92 | expected_pipfile = core_deps + read_deps("Youtube") + read_deps("Twitter") 93 | expected_pipfile = [x for x in expected_pipfile if x != "\n"] 94 | pipfile = [x for x in pipfile if x != "\n"] 95 | self.assertListEqual(pipfile, expected_pipfile) 96 | 97 | def test_custom_tags(self): 98 | args = parse_args(["dev", "build", "--tag", "CUSTOM_TAG", "--dry"]) 99 | cmd, dfile, pipfile = build(args) 100 | self.assertTrue( 101 | dockerimage_tag_matches(cmd, "forensicarchitecture/mtriage:CUSTOM_TAG") 102 | ) 103 | 104 | args = parse_args( 105 | ["run", "docs/tutorial/1/1a.yaml", "--tag", "CUSTOM_TAG", "--dry"] 106 | ) 107 | cmd = run(args) 108 | self.assertTrue(cmd[-1] == "forensicarchitecture/mtriage:CUSTOM_TAG") 109 | 110 | def test_dev_tag(self): 111 | dev_args = parse_args(["run", "docs/tutorial/1/1a.yaml", "--dev", "--dry"]) 112 | cmd = run(dev_args) 113 | vs = get_volumes(cmd) 114 | media_re = r".*/mtriage/src:/mtriage/src$" 115 | has_src = False 116 | for v in vs: 117 | if re.match(media_re, v): 118 | has_src = True 119 | break 120 | self.assertTrue(has_src) 121 | 122 | no_dev_args = parse_args(["run", "docs/tutorial/1/1a.yaml", "--dry"]) 123 | cmd = run(no_dev_args) 124 | vs 
= get_volumes(cmd) 125 | matched = False 126 | for v in vs: 127 | if re.match(media_re, v) is not None: 128 | matched = True 129 | self.assertFalse(matched) 130 | -------------------------------------------------------------------------------- /test/test_util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from util import ( 3 | name_and_ver, 4 | InvalidPipDep, 5 | should_add_pipdep, 6 | should_add_dockerline, 7 | InvalidArgumentsError, 8 | ) 9 | 10 | 11 | class TestUtil(unittest.TestCase): 12 | """Test the util functions at mtriage's outer layer.""" 13 | 14 | def test_name_and_ver(self): 15 | name, ver = name_and_ver("numpy") 16 | self.assertEqual(name, "numpy") 17 | self.assertEqual(ver, None) 18 | 19 | name, ver = name_and_ver("numpy==4.0") 20 | self.assertEqual(name, "numpy") 21 | self.assertEqual(ver, "4.0") 22 | 23 | n1, v1 = name_and_ver("google-api-core==1.11.0") 24 | self.assertEqual(n1, "google-api-core") 25 | self.assertEqual(v1, "1.11.0") 26 | 27 | # self.assertRaises(InvalidPipDep, name_and_ver, "numpy==") 28 | # self.assertRaises(InvalidPipDep, name_and_ver, "invalid==2.h") 29 | self.assertRaises(InvalidPipDep, name_and_ver, "invalid==2==") 30 | 31 | def test_should_add_pipdeps(self): 32 | p1 = [] 33 | # empty check --> false 34 | self.assertTrue(should_add_pipdep("numpy", p1)) 35 | 36 | p2 = ["numpy"] 37 | self.assertFalse(should_add_pipdep("numpy", p2)) 38 | self.assertTrue(should_add_pipdep("pandas", p2)) 39 | # should add specific versions over undefined 40 | self.assertTrue(should_add_pipdep("numpy==2.0", p2)) 41 | # should add higher versions 42 | p3 = ["numpy==1.0"] 43 | self.assertTrue(should_add_pipdep("numpy==3.0", p3)) 44 | # check with multiple 45 | p4 = ["pack1==2.0", "pandas=3.4", "numpy==1.0", "blueray"] 46 | self.assertTrue(should_add_pipdep("numpy==1.1", p4)) 47 | self.assertFalse(should_add_pipdep("numpy", p4)) 48 | self.assertTrue(should_add_pipdep("blueray==0.1", p4)) 49 | self.assertTrue(should_add_pipdep("newdep", p4)) 50 | # check error 51 | with self.assertRaises(InvalidPipDep): 52 | should_add_pipdep("invalid==1==", p4) 53 | 54 | def test_should_add_dockerline(self): 55 | p1 = [] 56 | self.assertTrue(should_add_dockerline("any line here", p1)) 57 | p2 = ["RUN apt-get install -y vim"] 58 | self.assertFalse(should_add_dockerline("RUN apt-get install -y vim", p2)) 59 | p3 = ["RUN apt-get install -y vim", "RUN curl -o https://smthn", "RUN it"] 60 | self.assertTrue(should_add_dockerline("RUN apt get install -y curl", p3)) 61 | self.assertFalse(should_add_dockerline("RUN curl -o https://smthn", p3)) 62 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | from argparse import ArgumentTypeError 5 | 6 | DIR_PATH = os.path.dirname(os.path.realpath(__file__)) 7 | 8 | 9 | class InvalidPipDep(Exception): 10 | pass 11 | 12 | 13 | class InvalidArgumentsError(Exception): 14 | pass 15 | 16 | 17 | # parseargs type functions 18 | def str2yamlfile(fname): 19 | ext = os.path.splitext(fname)[1][1:] 20 | if ext not in "yaml": 21 | ArgumentTypeError("The file you specify to run mtriage must be a YAML file") 22 | if not os.path.exists(fname): 23 | ArgumentTypeError("Cannot find a file at {}.".format(fname)) 24 | return fname 25 | 26 | 27 | def get_subdirs(d): 28 | whitelist = ["__pycache__"] 29 | return [ 30 | o 31 | for o in os.listdir(d) 32 | 
if os.path.isdir(os.path.join(d, o)) 33 | and o not in whitelist 34 | and o != "__deprecated" 35 | ] 36 | 37 | 38 | def name_and_ver(pipdep): 39 | """Return the name and version from a string that expresses a pip dependency. 40 | Raises an InvalidPipDep exception if the string is an invalid dependency. 41 | """ 42 | pipdep = pipdep.split("==") 43 | dep_name = pipdep[0] 44 | try: 45 | if len(pipdep) == 1: 46 | dep_version = None 47 | elif len(pipdep) > 2: 48 | raise InvalidPipDep 49 | else: 50 | dep_version = pipdep[1] 51 | # if re.search(r"\d+(\.\d+)*", dep_version) is None: 52 | # raise InvalidPipDep 53 | return dep_name, dep_version 54 | except: 55 | raise InvalidPipDep 56 | 57 | 58 | def should_add_pipdep(dep, pipdeps): 59 | """Check whether pipdep should be added.""" 60 | dep_name, dep_ver = name_and_ver(dep) 61 | for _dep in pipdeps: 62 | _dep_name, _dep_ver = name_and_ver(_dep) 63 | if _dep_name == dep_name: 64 | # new version unspecified, cannot be more specific 65 | if dep_ver is None: 66 | return False 67 | # new version more specific 68 | elif _dep_ver is None and dep_ver is not None: 69 | return True 70 | elif str(dep_ver) < str(_dep_ver): 71 | return False 72 | return True 73 | 74 | 75 | def should_add_dockerline(line, dockerfile): 76 | """Check whether line should be added to array representing Dockerfile.""" 77 | return line not in dockerfile 78 | 79 | 80 | def lines_from_files(files): 81 | """ 'readlines' for a list of files, concatening them all together """ 82 | lines = [] 83 | for f in files: 84 | with open(f, "r") as fp: 85 | lines.extend(fp.readlines()) 86 | return lines 87 | 88 | 89 | def add_deps(dep_path, deps, should_add): 90 | """Add dependences at {folder_path} to {deps}, excluding if {should_add} is True for any given dependency.""" 91 | if not os.path.isfile(dep_path): 92 | return 93 | 94 | with open(dep_path) as f: 95 | for line in f.readlines(): 96 | if should_add(line, deps): 97 | deps.append(line) 98 | deps.append("\n") # for good measure 99 | 100 | 101 | def extract_dep(csv_row): 102 | if len(csv_row) == 1: 103 | return csv_row[0] 104 | return "" 105 | 106 | 107 | def get_env_config(): 108 | ENV_FILE = "{}/.env".format(DIR_PATH) 109 | if os.path.exists(ENV_FILE): 110 | return "--env-file={}".format(ENV_FILE) 111 | else: 112 | return "--env-file={}".format("{}/.env.example".format(DIR_PATH)) 113 | --------------------------------------------------------------------------------
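The dependency-merging helpers in `util.py` above are easiest to see in use. The sketch below is illustrative only and simply re-exercises the behaviour already covered by `test/test_util.py`:

```python
from util import name_and_ver, should_add_pipdep, should_add_dockerline

# name_and_ver splits a pip requirement into its name and optional version
assert name_and_ver("numpy") == ("numpy", None)
assert name_and_ver("numpy==4.0") == ("numpy", "4.0")

# a pinned version is preferred over an unpinned one, and a higher pin wins
existing = ["pack1==2.0", "numpy==1.0", "blueray"]
assert should_add_pipdep("numpy==1.1", existing)      # newer pin is added
assert not should_add_pipdep("numpy", existing)       # unpinned duplicate is not
assert should_add_pipdep("blueray==0.1", existing)    # pin beats the unpinned entry

# Dockerfile lines are only appended once
assert should_add_dockerline("RUN it", ["RUN it"]) is False
```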