├── license ├── ru.pdf └── en_us.pdf ├── images └── demo.gif ├── requirements.txt ├── examples ├── f17a6060-6ced-4bd1-9886-8578cfbb864f.mp4 ├── demo_video_torch.ipynb └── demo_video_onnx.ipynb ├── config_example.yaml ├── pyproject.toml ├── .pre-commit-config.yaml ├── .gitignore ├── README.md ├── demo.py └── constants.py /license/ru.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hukenovs/slovo/HEAD/license/ru.pdf -------------------------------------------------------------------------------- /images/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hukenovs/slovo/HEAD/images/demo.gif -------------------------------------------------------------------------------- /license/en_us.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hukenovs/slovo/HEAD/license/en_us.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | loguru==0.7.0 2 | omegaconf==2.3.0 3 | onnxruntime==1.14.1 4 | opencv-contrib-python==4.7.0.72 5 | torch>=1.8 6 | -------------------------------------------------------------------------------- /examples/f17a6060-6ced-4bd1-9886-8578cfbb864f.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hukenovs/slovo/HEAD/examples/f17a6060-6ced-4bd1-9886-8578cfbb864f.mp4 -------------------------------------------------------------------------------- /config_example.yaml: -------------------------------------------------------------------------------- 1 | model_path: mvit32-2.onnx 2 | frame_interval: 2 # set frame interval 3 | mean: [123.675, 116.28, 103.53] 4 | std: [58.395, 57.12, 57.375] 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # [project] 2 | # dynamic = ["dependencies"] 3 | # 4 | # [tool.setuptools.dynamic] 5 | # dependencies = requirements.txt 6 | 7 | # TODO 2020/06/02: Check Flake8 8 | # [tool.flake8] 9 | # ignore = ['E203', 'E266', 'E501', 'W503', 'F403', 'F401'] 10 | # max-line-length = 120 11 | # max-complexity = 18 12 | # select = ['B','C','E','F','W','T4','B9'] 13 | 14 | [tool.isort] 15 | # known_third_party = ["cv2", "numpy", "pandas", "qimage2ndarray", "scipy", "sklearn", "tensorflow", "torch"] 16 | multi_line_output = 3 17 | include_trailing_comma = true 18 | force_grid_wrap = 0 19 | use_parentheses = true 20 | line_length = 120 21 | 22 | [tool.black] 23 | line-length = 120 24 | target-version = ['py39'] 25 | exclude = ''' 26 | /( 27 | \.eggs 28 | | \.git 29 | | \.hg 30 | | \.mypy_cache 31 | | \.tox 32 | | \.venv 33 | | _build 34 | | buck-out 35 | | build 36 | | dist 37 | # The following are specific to Black, you probably don't want those. 38 | | blib2to3 39 | | tests/data 40 | | profiling 41 | )/ 42 | ''' 43 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: '^$' 2 | fail_fast: false 3 | default_language_version: 4 | python: python3.9 5 | repos: 6 | - repo: https://github.com/psf/black 7 | rev: 22.6.0 8 | hooks: 9 | - id: black 10 | language_version: python3 11 | args: 12 | - "--line-length=120" 13 | - repo: https://github.com/PyCQA/flake8 14 | rev: 4.0.1 15 | hooks: 16 | - id: flake8 17 | language_version: python3 18 | args: 19 | - "--max-line-length=120" 20 | - "--ignore=E203,E265,E309,E501,E265,W503,E402" 21 | - repo: https://github.com/pre-commit/pre-commit-hooks 22 | rev: v3.2.0 23 | hooks: 24 | - id: check-docstring-first 25 | - id: check-merge-conflict 26 | - id: check-yaml 27 | - id: trailing-whitespace 28 | - id: end-of-file-fixer 29 | - id: requirements-txt-fixer 30 | 31 | - repo: https://github.com/PyCQA/autoflake 32 | rev: v1.4 33 | hooks: 34 | - id: autoflake 35 | args: ['-r', '--in-place', 36 | '--remove-all-unused-imports', 37 | '--ignore-init-module-imports', 38 | '--remove-unused-variables', 39 | '--remove-duplicate-keys' 40 | ] 41 | 42 | - repo: https://github.com/PyCQA/isort 43 | rev: 5.10.1 44 | hooks: 45 | - id: isort 46 | args: 47 | - "--profile=black" 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *.pyc 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib64/ 20 | /parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | .DS_Store 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # IntelliJ IDEA 75 | .idea/ 76 | 77 | #VS Code 78 | .vscode/ 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # dotenv 90 | .env 91 | 92 | # virtualenv 93 | .venv 94 | venv/ 95 | ENV/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # User added: 105 | /templates 106 | *.pth 107 | *.pt 108 | *.onnx 109 | -------------------------------------------------------------------------------- /examples/demo_video_torch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "775d1720-0fa3-40bf-a9e5-a8cc44744dca", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from IPython import display\n", 11 | "import sys\n", 12 | "sys.path.append(\"../\")\n", 13 | "\n", 14 | "import torch\n", 15 | "import numpy as np\n", 16 | "import cv2\n", 17 | "from PIL import Image\n", 18 | "\n", 19 | "from constants import classes" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "id": "76ed828a-a15a-489c-bfd8-dcf24d3db703", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "path_to_model = \"../mvit16-1.pt\"\n", 30 | "path_to_input_video = \"f17a6060-6ced-4bd1-9886-8578cfbb864f.mp4\"\n", 31 | "path_to_output_video = \"output_torch.mp4\"" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "id": "4a36baad-bd49-4126-b98a-4f20b7919caf", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "model = torch.jit.load(path_to_model)\n", 42 | "model.eval()\n", 43 | "window_size = 16 # from model name\n", 44 | "threshold = 0.5\n", 45 | "frame_interval = 1\n", 46 | "mean = [123.675, 116.28, 103.53]\n", 47 | "std = [58.395, 57.12, 57.375]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "id": "d72fb23e-3946-4b76-ac62-cfcc325ff657", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "def resize(im, new_shape=(224, 224)):\n", 58 | " \"\"\"\n", 59 | " Resize and pad image while preserving aspect ratio.\n", 60 | "\n", 61 | " Parameters\n", 62 | " ----------\n", 63 | " im : np.ndarray\n", 64 | " Image to be resized.\n", 65 | " new_shape : Tuple[int]\n", 66 | " Size of the new image.\n", 67 | "\n", 68 | " Returns\n", 69 | " -------\n", 70 | " np.ndarray\n", 71 | " Resized image.\n", 72 | " \"\"\"\n", 73 | " shape = im.shape[:2] # current shape [height, width]\n", 74 | " if isinstance(new_shape, int):\n", 75 | " new_shape = (new_shape, new_shape)\n", 76 | "\n", 77 | " # Scale ratio (new / old)\n", 78 | " r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])\n", 79 | "\n", 80 | " # Compute padding\n", 81 | " new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))\n", 82 | " dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding\n", 83 | "\n", 84 | " dw /= 2\n", 85 | " dh /= 2\n", 86 | "\n", 87 | " if shape[::-1] != new_unpad: # resize\n", 88 | " im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)\n", 89 | " top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))\n", 90 | " left, right = int(round(dw - 0.1)), int(round(dw + 0.1))\n", 91 | " im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)) # add border\n", 92 | " return im" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "id": "184ed911-6b9b-4250-a30b-c347e3be2ed1", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "cap = cv2.VideoCapture(path_to_input_video)\n", 103 | "_,frame = cap.read()\n", 104 | "shape = frame.shape\n", 105 | "fourcc = cv2.VideoWriter_fourcc(*'H264')\n", 106 | "writer = cv2.VideoWriter(path_to_output_video, fourcc, 30, (frame.shape[1], frame.shape[0]+50))\n", 107 | "\n", 108 | "tensors_list = []\n", 109 | "prediction_list = []\n", 110 | "prediction_list.append(\"---\")\n", 111 | "\n", 112 | "frame_counter = 0\n", 113 | "while True:\n", 114 | " _, frame = cap.read()\n", 115 | " if frame is None:\n", 116 | " break\n", 117 | " frame_counter += 1\n", 118 | " if frame_counter == frame_interval:\n", 119 | " image = cv2.cvtColor(frame.copy(), cv2.COLOR_BGR2RGB)\n", 120 | " image = resize(image, (224, 224))\n", 121 | " image = (image - mean) / std\n", 122 | " image = np.transpose(image, [2, 0, 1])\n", 123 | " tensors_list.append(image)\n", 124 | " if len(tensors_list) == window_size:\n", 125 | " input_tensor = np.stack(tensors_list[: window_size], axis=1)[None][None]\n", 126 | " input_tensor = input_tensor.astype(np.float32)\n", 127 | " input_tensor = torch.from_numpy(input_tensor)\n", 128 | " with torch.no_grad():\n", 129 | " outputs = model(input_tensor)[0]\n", 130 | " gloss = str(classes[outputs.argmax().item()])\n", 131 | " if outputs.max() > threshold:\n", 132 | " if gloss != prediction_list[-1] and len(prediction_list):\n", 133 | " if gloss != \"---\":\n", 134 | " prediction_list.append(gloss)\n", 135 | " tensors_list.clear()\n", 136 | " frame_counter = 0\n", 137 | "\n", 138 | " text = \" \".join(prediction_list)\n", 139 | " text_div = np.zeros((50, frame.shape[1], 3), dtype=np.uint8)\n", 140 | " cv2.putText(text_div, text, (10, 30), cv2.FONT_HERSHEY_COMPLEX, 0.7, (255, 255, 255), 2)\n", 141 | "\n", 142 | " frame = np.concatenate((frame, text_div), axis=0)\n", 143 | " writer.write(frame)\n", 144 | "writer.release()\n", 145 | "cap.release()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 6, 151 | "id": "3c512a02-1d2b-4603-b3cd-9801216c3bdf", 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "from IPython.display import display, HTML" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 7, 161 | "id": "53a41c5c-dcff-439b-a17a-07b9530525f8", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/html": [ 167 | "