├── sign_vq ├── __init__.py ├── data │ ├── __init__.py │ ├── header.poseheader │ ├── mean_pose_full.png │ ├── mean_pose_reduced.png │ ├── draw_mean_pose.py │ ├── zip_dataset.py │ ├── huggingface_dataset.py │ ├── README.md │ ├── normalize.py │ └── pose_normalization.json ├── tests │ ├── __init__.py │ ├── model_test.py │ └── model_overfit_test.py ├── pose_reconstruction.py ├── codes_to_poses.py ├── utils.py ├── benchmark_pose_reconstuction.py ├── poses_to_codes.py ├── train.py ├── dataset.py └── model.py ├── .gitignore ├── assets └── validation │ ├── validation_0.gif │ ├── validation_1.gif │ ├── validation_2.gif │ ├── validation_3.gif │ └── validation_4.gif ├── .pre-commit-config.yaml ├── scripts ├── extract_mean_std.sh ├── zip_dataset.sh ├── train_model.sh ├── sync_bucket.sh └── huggingface_dataset.sh ├── .github └── workflows │ ├── lint.yaml │ └── test.yaml ├── pyproject.toml └── README.md /sign_vq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sign_vq/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sign_vq/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | build/ 3 | sign_vq.egg-info/ 4 | example_data/ 5 | sign-language-vq/ 6 | wandb/ -------------------------------------------------------------------------------- /sign_vq/data/header.poseheader: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sign-language-processing/sign-vq/main/sign_vq/data/header.poseheader -------------------------------------------------------------------------------- /sign_vq/data/mean_pose_full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sign-language-processing/sign-vq/main/sign_vq/data/mean_pose_full.png -------------------------------------------------------------------------------- /assets/validation/validation_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sign-language-processing/sign-vq/main/assets/validation/validation_0.gif -------------------------------------------------------------------------------- /assets/validation/validation_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sign-language-processing/sign-vq/main/assets/validation/validation_1.gif -------------------------------------------------------------------------------- /assets/validation/validation_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sign-language-processing/sign-vq/main/assets/validation/validation_2.gif -------------------------------------------------------------------------------- /assets/validation/validation_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sign-language-processing/sign-vq/main/assets/validation/validation_3.gif 
-------------------------------------------------------------------------------- /assets/validation/validation_4.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sign-language-processing/sign-vq/main/assets/validation/validation_4.gif -------------------------------------------------------------------------------- /sign_vq/data/mean_pose_reduced.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sign-language-processing/sign-vq/main/sign_vq/data/mean_pose_reduced.png -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | files: ^sign_vq/ 4 | hooks: 5 | - id: pylint 6 | name: pylint 7 | entry: pylint 8 | language: system 9 | types: [python] 10 | - id: pytest 11 | name: pytest 12 | entry: pytest sign_vq 13 | language: system 14 | types: [python] 15 | 16 | -------------------------------------------------------------------------------- /scripts/extract_mean_std.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=preprocess-pose-data 4 | #SBATCH --time=24:00:00 5 | #SBATCH --mem=16G 6 | #SBATCH --output=extract_mean_std.out 7 | 8 | #SBATCH --ntasks=1 9 | 10 | set -e # exit on error 11 | set -x # echo commands 12 | 13 | module load anaconda3 14 | source activate vq 15 | 16 | python -m sign_vq.data.normalize --dir="$1" 17 | -------------------------------------------------------------------------------- /scripts/zip_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=preprocess-pose-data 4 | #SBATCH --time=24:00:00 5 | #SBATCH --mem=16G 6 | #SBATCH --output=zip_dataset.out 7 | 8 | #SBATCH --ntasks=1 9 | 10 | set -e # exit on error 11 | set -x # echo commands 12 | 13 | module load anaconda3 14 | source activate vq 15 | 16 | python -m sign_vq.data.zip_dataset --dir="$1" --out="$2" 17 | -------------------------------------------------------------------------------- /scripts/train_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=train-vq-vae 4 | #SBATCH --time=2-00:00:00 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --mem=128G 7 | #SBATCH --output=train.out 8 | 9 | #SBATCH --ntasks=1 10 | #SBATCH --gres gpu:1 11 | #SBATCH --constraint=GPUMEM80GB 12 | 13 | set -e # exit on error 14 | set -x # echo commands 15 | 16 | module load anaconda3 17 | source activate vq 18 | 19 | srun python -m sign_vq.train --data-path="$1" \ 20 | --wandb-dir="/scratch/$(whoami)/wandb/sign-vq/" 21 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | 4 | on: 5 | push: 6 | branches: [ master, main ] 7 | pull_request: 8 | branches: [ master, main ] 9 | 10 | 11 | jobs: 12 | test: 13 | name: Lint 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/setup-python@v4 19 | with: 20 | python-version: '3.11' 21 | 22 | - name: Install Requirements 23 | run: pip install .[dev] 24 | 25 | - name: Lint Code 26 | run: pylint sign_vq 27 | 
-------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | 4 | on: 5 | push: 6 | branches: [ master, main ] 7 | pull_request: 8 | branches: [ master, main ] 9 | 10 | 11 | jobs: 12 | test: 13 | name: Test 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/setup-python@v4 19 | with: 20 | python-version: '3.11' 21 | 22 | - name: Install Requirements 23 | run: pip install .[dev] 24 | 25 | - name: Test Code 26 | run: pytest sign_vq 27 | -------------------------------------------------------------------------------- /scripts/sync_bucket.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=sync-data 4 | #SBATCH --time=48:00:00 5 | #SBATCH --mem=32G 6 | #SBATCH --output=sync_bucket.out 7 | 8 | #SBATCH --ntasks=1 9 | 10 | set -e # exit on error 11 | set -x # echo commands 12 | 13 | module load anaconda3 14 | source activate vq 15 | 16 | poses_dir=$1 17 | # -i: Skip copying any files that already exist at the destination, regardless of their modification time. 18 | # -d: Delete extra files under dst_url not found under src_url. By default extra files are not deleted. 19 | gsutil -m rsync -i -d gs://sign-mt-poses "$poses_dir" 20 | -------------------------------------------------------------------------------- /scripts/huggingface_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=preprocess-pose-data 4 | #SBATCH --time=24:00:00 5 | #SBATCH --mem=16G 6 | #SBATCH --output=huggingface_dataset.out 7 | 8 | #SBATCH --ntasks=1 9 | 10 | set -e # exit on error 11 | set -x # echo commands 12 | 13 | module load anaconda3 14 | source activate vq 15 | 16 | # Convert to huggingface dataset 17 | HF_DATASET_DIR="/scratch/$(whoami)/poses/huggingface" 18 | mkdir -p $HF_DATASET_DIR 19 | 20 | cd .. 21 | 22 | [ ! 
-f "$HF_DATASET_DIR/dataset_dict.json" ] && \ 23 | python -m sign_vq.data.huggingface_dataset \ 24 | --directory="/scratch/amoryo/poses/sign-mt-poses" \ 25 | --output-path="$HF_DATASET_DIR" 26 | -------------------------------------------------------------------------------- /sign_vq/data/draw_mean_pose.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import cv2 4 | import numpy as np 5 | from pose_format import Pose 6 | from pose_format.numpy import NumPyPoseBody 7 | from pose_format.pose_visualizer import PoseVisualizer 8 | from pose_format.utils.generic import reduce_holistic 9 | 10 | from sign_vq.data.normalize import load_mean_and_std, unshift_hand, load_pose_header 11 | 12 | if __name__ == "__main__": 13 | mean, _ = load_mean_and_std() 14 | 15 | data = mean.reshape((1, 1, -1, 3)) * 1000 16 | confidence = np.ones((1, 1, len(mean))) 17 | body = NumPyPoseBody(fps=1, data=data, confidence=confidence) 18 | pose = Pose(header=load_pose_header(), body=body) 19 | 20 | unshift_hand(pose, "RIGHT_HAND_LANDMARKS") 21 | unshift_hand(pose, "LEFT_HAND_LANDMARKS") 22 | 23 | poses = { 24 | "full": pose, 25 | "reduced": reduce_holistic(pose) 26 | } 27 | for name, pose in poses.items(): 28 | pose.focus() 29 | 30 | v = PoseVisualizer(pose) 31 | image_path = Path(__file__).parent / f"mean_pose_{name}.png" 32 | cv2.imwrite(str(image_path), next(v.draw())) 33 | -------------------------------------------------------------------------------- /sign_vq/data/zip_dataset.py: -------------------------------------------------------------------------------- 1 | import io 2 | import zipfile 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | from pose_format import Pose 7 | from tqdm import tqdm 8 | 9 | from sign_vq.data.normalize import pre_process_mediapipe, normalize_mean_std 10 | 11 | 12 | def save_poses_to_zip(directory: str, zip_filename: str): 13 | pose_files = list(Path(directory).glob("*.pose")) 14 | with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zip_file: 15 | for file in tqdm(pose_files): 16 | with open(file, 'rb') as pose_file: 17 | pose = Pose.read(pose_file.read()) 18 | pose = pre_process_mediapipe(pose) 19 | pose = normalize_mean_std(pose) 20 | 21 | # Using the file name as the zip entry name 22 | npz_filename = file.stem + '.npz' 23 | 24 | # Saving the masked array to a temporary buffer 25 | with io.BytesIO() as buf: 26 | data = pose.body.data[:, 0, :, :] # only first person 27 | 28 | float16_data = data.filled(0).astype(np.float16) 29 | np.savez_compressed(buf, data=float16_data, mask=data.mask) 30 | zip_file.writestr(npz_filename, buf.getvalue()) 31 | 32 | 33 | if __name__ == "__main__": 34 | import argparse 35 | 36 | parser = argparse.ArgumentParser(description='Preprocess data') 37 | parser.add_argument('--dir', type=str, help='Directory containing the pose files') 38 | parser.add_argument('--out', type=str, help='Output zip file') 39 | 40 | args = parser.parse_args() 41 | 42 | save_poses_to_zip(args.dir, args.out) 43 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "sign-vq" 3 | description = "Quantize Sign Language Poses" 4 | version = "0.0.1" 5 | authors = [ 6 | { name = "Amit Moryossef", email = "amitmoryossef@gmail.com" } 7 | ] 8 | readme = "README.md" 9 | dependencies = [ 10 | "pose-format>=0.3.2", 11 | "tqdm", 12 | "torch", 13 | 
"vector-quantize-pytorch", 14 | "pytorch-lightning", 15 | # for inference using a huggingface hosted model 16 | "huggingface-hub" 17 | ] 18 | 19 | [project.optional-dependencies] 20 | dev = [ 21 | "pytest", 22 | "pylint", 23 | "opencv-python", 24 | "wandb", 25 | # to support wandb video logging 26 | "moviepy", 27 | "imageio", 28 | # to support a huggingface dataset, not recommended 29 | "datasets", 30 | "psutil", # To log used memory 31 | ] 32 | 33 | [tool.yapf] 34 | based_on_style = "google" 35 | column_limit = 120 36 | 37 | [tool.pylint] 38 | max-line-length = 120 39 | disable = [ 40 | "C0114", # Missing module docstring 41 | "C0115", # Missing class docstring 42 | "C0116", # Missing function or method docstring 43 | "C0415", # Import outside toplevel 44 | ] 45 | good-names = ["i", "f", "x", "y"] 46 | 47 | [tool.pylint.typecheck] 48 | generated-members = ["torch.*", "numpy.*", "cv2.*"] 49 | 50 | [tool.setuptools] 51 | packages = [ 52 | "sign_vq", 53 | "sign_vq.data", 54 | ] 55 | 56 | [tool.setuptools.package-data] 57 | sign_vq = ["**/*.json", "**/*.poseheader"] 58 | 59 | [tool.pytest.ini_options] 60 | addopts = "-v" 61 | testpaths = ["sign_vq"] 62 | 63 | [project.scripts] 64 | poses_to_codes = "sign_vq.poses_to_codes:main" 65 | codes_to_poses = "sign_vq.codes_to_poses:main" 66 | reconstruct_poses = "sign_vq.pose_reconstruction:main" 67 | -------------------------------------------------------------------------------- /sign_vq/pose_reconstruction.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from pose_format import Pose 6 | 7 | from sign_vq.model import PoseFSQAutoEncoder 8 | from sign_vq.poses_to_codes import load_model, process_file, pose_to_tensor 9 | from sign_vq.utils import pose_from_data 10 | 11 | 12 | def run_inference(model: PoseFSQAutoEncoder, pose: Pose, only_masked: bool): 13 | tensor = pose_to_tensor(pose, model.device) 14 | new_tensor, _ = model(tensor) 15 | new_pose = pose_from_data(new_tensor[0]) 16 | 17 | if only_masked: 18 | original_pose = pose_from_data(tensor[0]) 19 | mask = pose.body.data.mask 20 | original_pose.body.data[mask] = new_pose.body.data[mask] 21 | new_pose = original_pose 22 | 23 | return new_pose 24 | 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser(description='Reconstruct poses missing keypoints') 28 | parser.add_argument('--model', type=str, help='Path to trained model', default="sign/mediapipe-vq") 29 | parser.add_argument('--input', type=str, help='Path to pose file') 30 | parser.add_argument('--output', type=str, help='Path to output pose file') 31 | parser.add_argument('--only-masked', action='store_true', help='Only modify masked points') 32 | args = parser.parse_args() 33 | 34 | if not os.path.exists(args.input): 35 | raise FileNotFoundError(f"File {args.input} does not exist") 36 | 37 | with open(args.input, 'rb') as f: 38 | pose = process_file(f) 39 | 40 | model = load_model(args.model) 41 | 42 | with torch.no_grad(): 43 | new_pose = run_inference(model, pose, args.only_masked) 44 | 45 | with open(args.output, "wb") as f: 46 | new_pose.write(f) 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /sign_vq/codes_to_poses.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pathlib import Path 4 | from typing import Iterable 5 | 6 | import torch 7 | from tqdm import tqdm 8 | 9 | from 
sign_vq.model import PoseFSQAutoEncoder 10 | from sign_vq.poses_to_codes import load_model 11 | from sign_vq.utils import pose_from_data 12 | 13 | 14 | def run_inference(model: PoseFSQAutoEncoder, codes: Iterable[str], output_path: Path): 15 | for i, line in enumerate(tqdm(codes)): 16 | int_codes = torch.tensor([[int(c) for c in line.split(" ")]], dtype=torch.long, device=model.device) 17 | poses_data = model.unquantize(int_codes) 18 | pose = pose_from_data(poses_data[0]) 19 | 20 | pose_output_path = output_path / f"{i}.pose" if output_path.is_dir() else output_path 21 | 22 | with open(pose_output_path, "wb") as f: 23 | pose.write(f) 24 | 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser(description='Run inference on a trained model') 28 | parser.add_argument('--model', type=str, help='Path to trained model', default="sign/mediapipe-vq") 29 | parser.add_argument('--codes', type=str, 30 | help='Codes or path to text file with codes, new line separated') 31 | parser.add_argument('--output', type=str, help='Path to output pose file / directory') 32 | args = parser.parse_args() 33 | 34 | if os.path.exists(args.codes): 35 | with open(args.codes, 'r', encoding='utf-8') as f: 36 | codes = f.read().splitlines() 37 | assert os.path.isdir(args.output), "When codes is a file, output must be a directory" 38 | else: 39 | codes = [args.codes] 40 | assert not os.path.isdir(args.output), "When codes is given directly, output must be a file" 41 | 42 | assert len(codes) > 0, "No codes found" 43 | 44 | model = load_model(args.model) 45 | 46 | with torch.no_grad(): 47 | run_inference(model, codes, Path(args.output)) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /sign_vq/data/huggingface_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | 6 | import datasets 7 | from pose_format import Pose 8 | 9 | from sign_vq.data.normalize import pre_process_mediapipe, normalize_mean_std 10 | 11 | 12 | class SignLanguagePoseDataset(datasets.GeneratorBasedBuilder): 13 | def __init__(self, pose_directory: Path): 14 | super().__init__() 15 | 16 | self.data = list(pose_directory.glob("*.pose")) 17 | 18 | def _info(self): 19 | return datasets.DatasetInfo( 20 | features=datasets.Features( 21 | { 22 | "data": datasets.Array3D(dtype="float16", shape=(None, 178, 3)), 23 | "mask": datasets.Array3D(dtype="bool", shape=(None, 178, 3)), 24 | } 25 | ) 26 | ) 27 | 28 | def _split_generators(self, dl_manager): 29 | return [ 30 | datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={}) 31 | ] 32 | 33 | def _generate_examples(self, **unused_kwargs): 34 | for i, file in enumerate(self.data): 35 | with open(file, 'rb') as pose_file: 36 | pose = Pose.read(pose_file.read()) 37 | pose = pre_process_mediapipe(pose) 38 | pose = normalize_mean_std(pose) 39 | 40 | data = pose.body.data[:, 0, :, :] # only first person 41 | 42 | float16_data = data.filled(0).astype(np.float16) 43 | if i == 0: 44 | print(float16_data.shape, float16_data.dtype, data.mask.shape, data.mask.dtype) 45 | 46 | yield i, { 47 | "data": float16_data, 48 | "mask": data.mask, 49 | } 50 | 51 | 52 | def main(): 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--directory", type=str, required=True) 55 | parser.add_argument("--output-path", type=str, required=True) 56 | args = parser.parse_args() 57 | 58 | pose_directory = 
Path(args.directory) 59 | output_path = Path(args.output_path) 60 | 61 | output_path.mkdir(parents=True, exist_ok=True) 62 | 63 | dataset = SignLanguagePoseDataset(pose_directory) 64 | dataset.download_and_prepare(output_path) 65 | dataset.as_dataset().save_to_disk(output_path) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /sign_vq/data/README.md: -------------------------------------------------------------------------------- 1 | # Data 2 | 3 | To train a good compression model, we need a large dataset of sign language poses. 4 | Therefore, this project requires a directory of `.pose` files. 5 | 6 | ## Downloading Poses 7 | 8 | For the purpose of experimentation, I download all poses from SignTube by running: 9 | 10 | ```bash 11 | gsutil -m rsync gs://sign-mt-poses /scratch/amoryo/poses/sign-mt-poses 12 | ``` 13 | 14 | ## Pose Pre-Processing Guidelines 15 | 16 | The objective is to standardize the poses, ensuring each component is centered and scaled appropriately. 17 | 18 | 1. **Normalize by Shoulders:** Adjust the pose based on shoulder positions to maintain a consistent reference. 19 | 2. **Remove Legs:** Exclude leg keypoints from the pose. 20 | 3. **Hand Position Correction:** Align the hands in the pose to accurately reflect the wrist's position. 21 | 4. **Reposition Hands:** Adjust hands so that the wrist coordinates are set to the origin `(0,0,0)`. 22 | 5. **Calculate Mean and Standard Deviation:** Compute the mean and standard deviation for all points in the pose. Store 23 | this in a file. 24 | 25 | After calculating the mean, we reposition the hands to the wrists and visualize the mean pose: 26 | 27 | Mean pose 28 | 29 | We then preprocess the entire dataset and store it as a huggingface dataset (`huggingface_dataset.py`). 30 | This is because loading and pre-processing 563k pose files takes about 8-9 hours, 31 | and we don't want to repeat this process in every training run. 32 | Furthermore, the original poses contain a header and 543 (X,Y,Z,C) points stored in `float32`, totaling 508GB. 33 | By instead storing `float16` data for 178 (X,Y,Z) points plus a 178 (X,Y,Z) `bool` mask, 34 | we reduce the size to 45GB as a `.zip` (280GB as a huggingface dataset). 35 | 36 | Iterating over the zip dataset takes about 10 minutes, compared to 8-9 hours for the original dataset.
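To inspect the resulting archive: each zip entry is an `.npz` with a `float16` `data` array of shape `(frames, 178, 3)` and a `bool` `mask` in which `True` marks a missing point (the numpy masked-array convention). A minimal sketch for reading one entry back, assuming the archive name `normalized.zip` as a placeholder:

```python
import io
import zipfile

import numpy as np

with zipfile.ZipFile("normalized.zip", "r") as zip_file:
    name = zip_file.namelist()[0]
    with zip_file.open(name) as entry:
        npz = np.load(io.BytesIO(entry.read()))
        # Rebuild the numpy masked array; True in the mask means the point is missing
        data = np.ma.array(npz["data"].astype(np.float32), mask=npz["mask"])
        print(name, data.shape)  # (frames, 178, 3)
```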
37 | 38 | ## Benchmarking different dataset storing/loading methods 39 | 40 | | Dataset | Size | cold start | it/s | it/s (shuffled) | num_workers=4 | num_workers=8 | 41 | |-------------|-------|------------|------|-----------------|---------------|---------------| 42 | | Original | 508GB | 1s | 17 | / | / | / | 43 | | Huggingface | 280G | 100s | 21 | 2 | 4 | / | 44 | | Zip | 45GB | 0s | 850 | 15 | ERROR | ERROR | 45 | | Directory | 45GB | 1s | 22 | 18 | 36 | 96 | 46 | -------------------------------------------------------------------------------- /sign_vq/tests/model_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | import torch 5 | 6 | from pose_format.torch.masked import MaskedTensor 7 | 8 | from sign_vq.model import PoseFSQAutoEncoder, AutoEncoderLightningWrapper 9 | 10 | 11 | class ModelTestCase(unittest.TestCase): 12 | 13 | def __init__(self, *args, **kwargs): 14 | super().__init__(*args, **kwargs) 15 | self.pose_dim = (2, 3) 16 | self.seq_length = 5 17 | 18 | def model_setup(self): 19 | model = PoseFSQAutoEncoder(codebook_size=2 ** 4, num_codebooks=2, pose_dims=self.pose_dim, 20 | hidden_dim=16, nhead=2, num_layers=2, dim_feedforward=32) 21 | loss_weights = torch.ones((self.pose_dim[0], 1), dtype=torch.float) 22 | model = AutoEncoderLightningWrapper(model, loss_weights=loss_weights) 23 | model.log = MagicMock(return_value=True) 24 | return model 25 | 26 | def test_forward_yields_same_shape(self): 27 | model = self.model_setup() 28 | pose = MaskedTensor(torch.full((4, 3, *self.pose_dim), fill_value=2, dtype=torch.float)) 29 | out_pose, _ = model(pose) 30 | 31 | self.assertEqual(pose.shape, out_pose.shape) 32 | 33 | def test_training_step_expected_loss_zero(self): 34 | model = self.model_setup() 35 | tensor = torch.full((4, 3, *self.pose_dim), fill_value=2, dtype=torch.float) 36 | mask = torch.zeros_like(tensor, dtype=torch.bool) 37 | batch = MaskedTensor(tensor, mask) 38 | 39 | loss = float(model.training_step(batch)) 40 | self.assertEqual(0, loss) 41 | 42 | def test_training_step_expected_loss_finite(self): 43 | model = self.model_setup() 44 | batch = MaskedTensor(torch.full((4, 3, *self.pose_dim), fill_value=2, dtype=torch.float)) 45 | 46 | loss = model.training_step(batch) 47 | self.assertNotEqual(0, float(loss)) 48 | self.assertTrue(torch.isfinite(loss)) 49 | 50 | def test_indices_with_multiple_codebooks(self): 51 | model = self.model_setup() 52 | pose = MaskedTensor(torch.full((4, 3, *self.pose_dim), fill_value=2, dtype=torch.float)) 53 | _, indices = model(pose) 54 | # 4 items in the batch 55 | # 3 frames 56 | # 2 codebooks 57 | self.assertEqual((4, 3, 2), indices.shape) 58 | 59 | def test_training_step_bfloat16_expected_loss_finite(self): 60 | batch = MaskedTensor(torch.full((4, 3, *self.pose_dim), fill_value=2, dtype=torch.float)) 61 | model = self.model_setup() 62 | 63 | with torch.autocast(device_type="cpu", dtype=torch.bfloat16): 64 | loss = model.training_step(batch) 65 | self.assertNotEqual(0, float(loss)) 66 | self.assertTrue(torch.isfinite(loss)) 67 | 68 | if __name__ == "__main__": 69 | unittest.main() 70 | -------------------------------------------------------------------------------- /sign_vq/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import cv2 4 | import numpy as np 5 | import torch 6 | from pose_format import Pose 7 | from pose_format.pose_visualizer import PoseVisualizer 
8 | from pose_format.torch.masked import MaskedTensor 9 | from torch import Tensor 10 | 11 | from sign_vq.data.normalize import load_pose_header, unnormalize_mean_std, unshift_hand 12 | 13 | 14 | def pose_from_data(pose_data: Union[MaskedTensor, Tensor]): 15 | from pose_format.numpy import NumPyPoseBody 16 | 17 | if isinstance(pose_data, Tensor): 18 | pose_data = MaskedTensor(pose_data) 19 | 20 | if pose_data.device != torch.device("cpu"): 21 | pose_data = pose_data.to(torch.device("cpu")) 22 | 23 | # Add person dimension 24 | pose_data.tensor = pose_data.tensor.unsqueeze(1) 25 | pose_data.mask = pose_data.mask.unsqueeze(1) 26 | 27 | if pose_data.dtype != torch.float32: 28 | pose_data.tensor = pose_data.tensor.to(torch.float32) 29 | 30 | np_data = pose_data.tensor.numpy() 31 | np_confidence = pose_data.mask.numpy().astype(np.float32).max(-1) 32 | np_body = NumPyPoseBody(fps=25, data=np_data, confidence=np_confidence) 33 | 34 | pose = Pose(header=load_pose_header(), body=np_body) 35 | pose = unnormalize_mean_std(pose) 36 | unshift_hand(pose, "RIGHT_HAND_LANDMARKS") 37 | unshift_hand(pose, "LEFT_HAND_LANDMARKS") 38 | 39 | # Resize pose 40 | new_width = 200 41 | shift = 1.25 42 | shift_vec = np.full(shape=(pose.body.data.shape[-1]), fill_value=shift, dtype=np.float32) 43 | pose.body.data = (pose.body.data + shift_vec) * new_width 44 | pose.header.dimensions.height = pose.header.dimensions.width = int(new_width * shift * 2) 45 | 46 | return pose 47 | 48 | 49 | def draw_pose(pose_data: MaskedTensor): 50 | pose = pose_from_data(pose_data) 51 | 52 | # Draw pose 53 | visualizer = PoseVisualizer(pose) 54 | frames = [cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in visualizer.draw()] 55 | return np.stack(frames) 56 | 57 | 58 | def draw_original_and_predicted_pose(original: MaskedTensor, predicted: Tensor): 59 | original = MaskedTensor(original.tensor.cpu(), original.mask.cpu()) 60 | predicted = predicted.cpu() 61 | 62 | # to find the pose length, find the last frame where the confidence is not zero 63 | frame_confidence = original.mask.numpy().max(-1).max(-1) # (frames) 64 | pose_length = frame_confidence.nonzero()[0].max() + 1 65 | 66 | original = original[:pose_length] 67 | predicted = MaskedTensor(predicted[:pose_length]) 68 | 69 | original_video = draw_pose(original) 70 | predicted_video = draw_pose(predicted) 71 | return np.concatenate([original_video, predicted_video], axis=2) 72 | 73 | 74 | if __name__ == "__main__": 75 | fake_pose = MaskedTensor(torch.zeros(size=(100, 178, 3), dtype=torch.float32)) 76 | draw_pose(fake_pose) 77 | # video = draw_original_and_predicted_pose(fake_pose, fake_pose.tensor) 78 | # print(video.shape) 79 | # print(np.moveaxis(video, 3, 1).shape) 80 | -------------------------------------------------------------------------------- /sign_vq/tests/model_overfit_test.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | from unittest.mock import MagicMock 4 | 5 | import torch 6 | from pose_format.torch.masked import MaskedTensor 7 | from pose_format.torch.masked.collator import zero_pad_collator 8 | from tqdm import tqdm 9 | 10 | from sign_vq.model import PoseFSQAutoEncoder, AutoEncoderLightningWrapper 11 | 12 | 13 | def get_batch(bsz=4): 14 | data_tensor = torch.tensor([[[1, 1]], [[2, 2]], [[3, 3]]], dtype=torch.float32) 15 | return { 16 | "text": ["text1"] * bsz, 17 | "pose": { 18 | "length": torch.tensor([3], dtype=torch.float32).expand(bsz, 1), 19 | "data": data_tensor.expand(bsz, 
*data_tensor.shape), 20 | "confidence": torch.ones([bsz, 3, 1]), 21 | "inverse_mask": torch.ones([bsz, 3]), 22 | }, 23 | } 24 | 25 | 26 | pose_dim = (2, 3) 27 | 28 | 29 | class ModelOverfitTestCase(unittest.TestCase): 30 | def model_setup(self): 31 | model = PoseFSQAutoEncoder(codebook_size=2 ** 4, num_codebooks=1, pose_dims=pose_dim, 32 | hidden_dim=16, nhead=2, num_layers=2, dim_feedforward=32) 33 | model = AutoEncoderLightningWrapper(model, learning_rate=5e-2, warmup_steps=1) 34 | model.log = MagicMock(return_value=True) 35 | return model 36 | 37 | def test_model_should_overfit(self): 38 | torch.manual_seed(42) 39 | random.seed(42) 40 | 41 | poses = [ 42 | MaskedTensor(torch.tensor([[[1, 2, 3], [4, 5, 6]]], dtype=torch.float32)), 43 | MaskedTensor(torch.tensor([[[3, 2, 1], [0, -1, -2]]], dtype=torch.float32)), 44 | ] 45 | batch = zero_pad_collator(poses) 46 | print("batch", batch.shape) 47 | 48 | model = self.model_setup() 49 | 50 | optimizers = model.configure_optimizers() 51 | optimizer = optimizers["optimizer"] 52 | scheduler = optimizers["lr_scheduler"]["scheduler"] 53 | 54 | model.train() 55 | torch.set_grad_enabled(True) 56 | 57 | # Simple training loop 58 | losses = [] 59 | for _ in tqdm(range(70)): 60 | optimizer.zero_grad() # clear gradients 61 | 62 | loss = model.training_step(batch) 63 | loss_float = float(loss.detach()) 64 | losses.append(loss_float) 65 | 66 | loss.backward() # backward 67 | optimizer.step() # update parameters 68 | scheduler.step() # update learning rate 69 | 70 | print("losses", losses) 71 | print("last loss", losses[-1]) 72 | 73 | model.eval() 74 | with torch.no_grad(): 75 | prediction, _ = model(batch) 76 | 77 | print("batch", batch.tensor) 78 | print("prediction", prediction) 79 | print("torch.round(prediction)", torch.round(prediction)) 80 | 81 | self.assertEqual(batch.shape, prediction.shape) 82 | self.assertTrue(torch.all(torch.eq(torch.round(prediction), batch.tensor))) 83 | 84 | 85 | if __name__ == '__main__': 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /sign_vq/benchmark_pose_reconstuction.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import os 4 | from pathlib import Path 5 | from typing import List 6 | from collections import defaultdict 7 | 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import torch 11 | from pose_format import Pose 12 | from pose_format.numpy import NumPyPoseBody 13 | from tqdm import tqdm 14 | 15 | from sign_vq.data.normalize import pre_process_mediapipe, normalize_mean_std 16 | from sign_vq.model import PoseFSQAutoEncoder 17 | from sign_vq.pose_reconstruction import run_inference 18 | from sign_vq.poses_to_codes import load_model, process_file 19 | 20 | 21 | def dropout_joints(pose: Pose, dropout_rate: float): 22 | confidence = pose.body.confidence 23 | mask = np.random.rand(*confidence.shape) > dropout_rate 24 | new_confidence = confidence * mask 25 | body = NumPyPoseBody(fps=pose.body.fps, data=pose.body.data.data, confidence=new_confidence) 26 | return Pose(header=pose.header, body=body) 27 | 28 | 29 | def benchmark_single_pose(model: PoseFSQAutoEncoder, original_pose: Pose, reduced_pose: Pose, only_masked=True): 30 | new_pose = run_inference(model, reduced_pose, only_masked=only_masked) 31 | new_pose = pre_process_mediapipe(new_pose) 32 | new_pose = normalize_mean_std(new_pose) 33 | return ((original_pose.body.data - new_pose.body.data) ** 2).sum() 34 | 35 | 
36 | def benchmark_pose_reconstructions(model: PoseFSQAutoEncoder, poses: List[Pose], steps=100): 37 | distances = defaultdict(lambda: defaultdict(list)) 38 | 39 | for dropout_rate in tqdm(torch.linspace(0, 1, steps)): 40 | dropout_rate = float(dropout_rate.item()) 41 | for pose in poses: 42 | reduced_pose = dropout_joints(pose, dropout_rate) 43 | 44 | masked_distance = benchmark_single_pose(model, pose, reduced_pose, only_masked=True) 45 | distances["masked"][dropout_rate].append(masked_distance) 46 | unmasked_distance = benchmark_single_pose(model, pose, reduced_pose, only_masked=False) 47 | distances["unmasked"][dropout_rate].append(unmasked_distance) 48 | 49 | # create a single chart of the sum of the distances for each dropout rate 50 | for key, values in distances.items(): 51 | plt.plot(values.keys(), [sum(v) for v in values.values()], label=key) 52 | 53 | plt.xlabel("Dropout rate") 54 | plt.ylabel("Sum of squared distances") 55 | plt.yscale("log") 56 | plt.legend() 57 | plt.show() 58 | 59 | 60 | def main(): 61 | parser = argparse.ArgumentParser(description='Benchmark pose reconstruction') 62 | parser.add_argument('--model', type=str, help='Path to trained model', default="sign/mediapipe-vq") 63 | parser.add_argument('--directory', type=str, help='Path to pose files', default="../examples") 64 | parser.add_argument('--num-files', type=int, default=10, help='Number of files to benchmark') 65 | args = parser.parse_args() 66 | 67 | if not os.path.exists(args.directory): 68 | raise FileNotFoundError(f"Directory {args.directory} does not exist") 69 | 70 | model = load_model(args.model) 71 | 72 | pose_files = itertools.islice(Path(args.directory).rglob("*.pose"), args.num_files) 73 | poses = [process_file(f.open('rb')) for f in pose_files] 74 | 75 | with torch.no_grad(): 76 | benchmark_pose_reconstructions(model, poses) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /sign_vq/poses_to_codes.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import zipfile 4 | import csv 5 | from pathlib import Path 6 | from typing import Iterable 7 | 8 | import torch 9 | from pose_format import Pose 10 | from tqdm import tqdm 11 | 12 | from sign_vq.data.normalize import pre_process_mediapipe, normalize_mean_std 13 | from sign_vq.model import PoseFSQAutoEncoder 14 | 15 | 16 | def process_file(file): 17 | pose = Pose.read(file.read()) 18 | pose = pre_process_mediapipe(pose) 19 | pose = normalize_mean_std(pose) 20 | return pose 21 | 22 | 23 | def load_zip_poses(zip_path: Path) -> Iterable[tuple[str, torch.Tensor]]: 24 | with zipfile.ZipFile(zip_path, 'r') as zip_ref: 25 | for file in zip_ref.namelist(): 26 | with zip_ref.open(file) as pose_file: 27 | yield file, process_file(pose_file) 28 | 29 | 30 | def load_directory_poses(directory_path: Path) -> Iterable[tuple[str, Pose]]: 31 | for file in directory_path.glob("*.pose"): 32 | with open(file, 'rb') as pose_file: 33 | yield file.name, process_file(pose_file) 34 | 35 | 36 | def load_poses(data_path: Path) -> Iterable[tuple[str, Pose]]: 37 | if data_path.is_dir(): 38 | yield from load_directory_poses(data_path) 39 | elif data_path.suffix == ".zip": 40 | yield from load_zip_poses(data_path) 41 | else: 42 | raise ValueError(f"Unknown data type {data_path}") 43 | 44 | 45 | def load_model(model_name: str): 46 | print("Loading model...") 47 | if Path(model_name).is_dir(): 48 | model_paths = 
list(Path(model_name).glob("*.ckpt")) 49 | if len(model_paths) == 0: 50 | raise ValueError(f"No checkpoint found in {model_name}") 51 | model_path = sorted(model_paths)[-1] 52 | args_path = Path(model_name) / "args.json" 53 | else: 54 | from huggingface_hub import hf_hub_download 55 | model_path = hf_hub_download(repo_id=model_name, filename="model.ckpt") 56 | args_path = hf_hub_download(repo_id=model_name, filename="args.json") 57 | 58 | with open(args_path, 'r', encoding="utf-8") as f: 59 | model_args = json.load(f) 60 | 61 | map_location = None if torch.cuda.is_available() else torch.device('cpu') 62 | model_state = torch.load(model_path, map_location=map_location)["state_dict"] 63 | model_state = {k.replace("model.", ""): v 64 | for k, v in model_state.items() 65 | if k.startswith("model.")} 66 | model = PoseFSQAutoEncoder(**model_args) 67 | model.load_state_dict(model_state) 68 | 69 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 70 | model = model.to(device) 71 | 72 | model.eval() 73 | 74 | return model 75 | 76 | def pose_to_tensor(pose: Pose, device: torch.device): 77 | tensor = torch.tensor(pose.body.data.filled(0), dtype=torch.float32, device=device) 78 | # remove person dimension 79 | tensor = tensor.squeeze(1) 80 | # add batch dimension 81 | tensor = tensor.unsqueeze(0) 82 | return tensor 83 | 84 | def run_inference(model: PoseFSQAutoEncoder, poses: Iterable[tuple[str, Pose]], output_path: Path): 85 | print("Running inference...") 86 | with open(output_path, 'w', encoding="utf-8") as f: 87 | writer = csv.writer(f) 88 | writer.writerow(["file", "fps", "length", "codes"]) 89 | 90 | for file, pose in tqdm(poses): 91 | tensor = pose_to_tensor(pose, model.device) 92 | codes = model.quantize(tensor) 93 | codes_list = torch.flatten(codes[0]).tolist() 94 | writer.writerow([file, pose.body.fps, len(pose.body.data), " ".join(map(str, codes_list))]) 95 | 96 | 97 | def main(): 98 | parser = argparse.ArgumentParser(description='Run inference on a trained model') 99 | parser.add_argument('--model', type=str, help='Path to trained model', 100 | default="sign/mediapipe-vq") 101 | parser.add_argument('--data', type=str, help='Path to data to run inference on') 102 | parser.add_argument('--output', type=str, help='Path to output csv file') 103 | args = parser.parse_args() 104 | 105 | model = load_model(args.model) 106 | poses = load_poses(Path(args.data)) 107 | 108 | with torch.no_grad(): 109 | run_inference(model, poses, Path(args.output)) 110 | 111 | 112 | if __name__ == "__main__": 113 | main() 114 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sign MediaPipe VQ 2 | 3 | We try to compress mediapipe poses using VQ-VAE and then generate videos from the compressed poses. 4 | Given a good quantizer, we can use it for downstream tasks like SignWriting transcription or animation. 5 | 6 | ## Training a model 7 | 8 | ```bash 9 | # 0. Setup the environment. 10 | conda create --name vq python=3.11 11 | conda activate vq 12 | pip install ".[dev]" 13 | 14 | DATA_DIR=/scratch/amoryo/poses 15 | POSES_DIR=/shares/volk.cl.uzh/amoryo/datasets/sign-mt-poses 16 | 17 | # 1. Downloads lots of poses from the bucket. (about 508GB) 18 | sbatch scripts/sync_bucket.sh "$POSES_DIR" 19 | # Check the number of files (should be above 500k) 20 | find "$POSES_DIR" -type f -name "*.pose" | wc -l 21 | 22 | # 2. 
Collect normalization data 23 | sbatch scripts/extract_mean_std.sh "$POSES_DIR" 24 | 25 | # 3. Creates a ZIP file of the poses after normalizing them. (about 45GB) 26 | sbatch scripts/zip_dataset.sh "$POSES_DIR" "$DATA_DIR/normalized.zip" 27 | 28 | # 4. Trains the model and reports to `wandb`. 29 | sbatch scripts/train_model.sh "$DATA_DIR/normalized.zip" 30 | ``` 31 | 32 | To set up example data: 33 | ```bash 34 | DATA_DIR=example_data 35 | POSES_DIR=example_data/poses 36 | 37 | python -m sign_vq.data.zip_dataset --dir="$POSES_DIR" --out="$DATA_DIR/normalized.zip" 38 | 39 | python -m sign_vq.train --data-path="$DATA_DIR/normalized.zip" 40 | ``` 41 | 42 | ### Mixed Precision Training 43 | 44 | - `--dtype=float32` - Default 45 | - `--dtype=bfloat16` - [Now supported](https://github.com/lucidrains/vector-quantize-pytorch/issues/114) by `vector_quantize_pytorch` 46 | 47 | ## Training Output 48 | 49 | In Weights & Biases, we can see the training progress. 50 | In validation, we generate a video from the compressed poses (right) and compare it to the original video (left). 51 | (This is the output using 4 codebooks of size 1024.) 52 | 53 | | 0 | 1 | 2 | 3 | 4 | 54 | |-----------------------------------------|-----------------------------------------|-----------------------------------------|-----------------------------------------|-----------------------------------------| 55 | | ![](assets/validation/validation_0.gif) | ![](assets/validation/validation_1.gif) | ![](assets/validation/validation_2.gif) | ![](assets/validation/validation_3.gif) | ![](assets/validation/validation_4.gif) | 56 | 57 | ## Inference 58 | 59 | To quantize a pose file, a directory of poses, or a zip file of poses, use the `poses_to_codes` command. 60 | ```bash 61 | poses_to_codes --data="DIRECTORY" --output="output.csv" 62 | ``` 63 | 64 | To convert codes back to poses, use the `codes_to_poses` command. 65 | ```bash 66 | codes_to_poses --output="DIRECTORY" --codes="codes_file.txt" 67 | # Or directly from codes (5-frame example) 68 | codes_to_poses --output="test.pose" --codes="731 63 540 261 787 63 250 100 492 351 530 307 939 63 532 61 788 55 530 60" 69 | ``` 70 | 71 | ## Background 72 | 73 | Vector Quantization has been used successfully to highly compress images and audio, 74 | for example by DeepMind and OpenAI for high-quality generation of images (VQ-VAE-2) and music (Jukebox). 75 | 76 | We use Finite Scalar Quantization (FSQ). 77 | This work out of Google DeepMind aims to vastly simplify the way vector quantization is done for generative modeling, 78 | removing the need for commitment losses and EMA updating of the codebook, and tackling the issues of codebook 79 | collapse and insufficient utilization. They simply round each scalar into discrete levels with straight-through 80 | gradients; the codes become uniform points in a hypercube. 81 | 82 | ## Data 83 | 84 | Data is expected as a zip file of numpy masked arrays. 85 | See [sign_vq/data/README.md](sign_vq/data/README.md) for more details.
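As a quick sanity check, the normalized zip can be iterated with the dataset class the training script uses. This is a minimal sketch: `normalized.zip` and the batch size are placeholders, and `sign_vq.train` additionally packs sequences with `PackedDataset` before batching.

```python
from pathlib import Path

from pose_format.torch.masked.collator import zero_pad_collator
from torch.utils.data import DataLoader

from sign_vq.dataset import ZipPoseDataset

# Each item is a MaskedTensor of shape (frames, 178, 3), cropped to at most max_length frames
dataset = ZipPoseDataset(Path("normalized.zip"), max_length=512)
loader = DataLoader(dataset, batch_size=4, collate_fn=zero_pad_collator)
batch = next(iter(loader))
print(batch.shape)  # (4, frames, 178, 3)
```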
86 | 87 | ## Other Resources 88 | 89 | - [MotionGPT](https://github.com/OpenMotionLab/MotionGPT): Human Motion as a Foreign Language 90 | - [T2M-GPT](https://github.com/Mael-zys/T2M-GPT): Generating Human Motion from Textual Descriptions with Discrete 91 | Representations 92 | 93 | ## Downstream Tasks 94 | 95 | - [SignWriting Transcription](https://github.com/sign-language-processing/signwriting-transcription/tree/main/signwriting_transcription/pose_to_vq_to_signwriting) 96 | - Pose Error Correction - Given a pose sequence with missing keypoints, we can use the VQ model to fill in the missing keypoints. 97 | - [Fluent Pose Synthesis](https://github.com/sign-language-processing/fluent-pose-synthesis) - For pose in-betweening. 98 | - [SignWriting Animation](https://github.com/sign-language-processing/signwriting-animation) 99 | -------------------------------------------------------------------------------- /sign_vq/data/normalize.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | from pose_format import Pose, PoseHeader 6 | from pose_format.utils.generic import pose_normalization_info, correct_wrists, hands_components, reduce_holistic 7 | from pose_format.utils.reader import BufferReader 8 | 9 | CURRENT_DIR = Path(__file__).parent 10 | 11 | 12 | def shift_hand(pose: Pose, hand_component: str, wrist_name: str): 13 | # pylint: disable=protected-access 14 | wrist_index = pose.header._get_point_index(hand_component, wrist_name) 15 | hand = pose.body.data[:, :, wrist_index: wrist_index + 21] 16 | wrist = hand[:, :, 0:1] 17 | pose.body.data[:, :, wrist_index: wrist_index + 21] = hand - wrist 18 | 19 | 20 | def unshift_hand(pose: Pose, hand_component: str): 21 | # pylint: disable=protected-access 22 | wrist_index = pose.header._get_point_index(hand_component, "WRIST") 23 | hand = pose.body.data[:, :, wrist_index: wrist_index + 21] 24 | body_wrist_name = "LEFT_WRIST" if hand_component == "LEFT_HAND_LANDMARKS" else "RIGHT_WRIST" 25 | # pylint: disable=protected-access 26 | body_wrist_index = pose.header._get_point_index("POSE_LANDMARKS", body_wrist_name) 27 | body_wrist = pose.body.data[:, :, body_wrist_index: body_wrist_index + 1] 28 | pose.body.data[:, :, wrist_index: wrist_index + 21] = hand + body_wrist 29 | 30 | 31 | def pre_process_mediapipe(pose: Pose): 32 | # Remove legs, simplify face 33 | pose = reduce_holistic(pose) 34 | pose = pose.get_components(["POSE_LANDMARKS", "FACE_LANDMARKS", "LEFT_HAND_LANDMARKS", "RIGHT_HAND_LANDMARKS"]) 35 | 36 | # Align hand wrists with body wrists 37 | correct_wrists(pose) 38 | # Adjust pose based on shoulder positions 39 | pose = pose.normalize(pose_normalization_info(pose.header)) 40 | 41 | # Shift hands to origin 42 | (left_hand_component, right_hand_component), _, (wrist, _) = hands_components(pose.header) 43 | shift_hand(pose, left_hand_component, wrist) 44 | shift_hand(pose, right_hand_component, wrist) 45 | 46 | return pose 47 | 48 | 49 | @functools.lru_cache(maxsize=1) 50 | def load_mean_and_std(): 51 | import json 52 | with open(CURRENT_DIR / "pose_normalization.json", "r", encoding="utf-8") as f: 53 | data = json.load(f) 54 | 55 | mean, std = [], [] 56 | for component in data.values(): 57 | for point in component.values(): 58 | mean.append(point["mean"]) 59 | std.append(point["std"]) 60 | 61 | # when std is 0, set std to 1 62 | std = np.array(std) 63 | std[std == 0] = 1 64 | 65 | return np.array(mean), std 66 | 67 | 68 | 
@functools.lru_cache(maxsize=1) 69 | def load_pose_header(): 70 | with open(CURRENT_DIR / "header.poseheader", "rb") as f: 71 | return PoseHeader.read(BufferReader(f.read())) 72 | 73 | 74 | def normalize_mean_std(pose: Pose): 75 | mean, std = load_mean_and_std() 76 | pose.body.data = (pose.body.data - mean) / std 77 | return pose 78 | 79 | 80 | def unnormalize_mean_std(pose: Pose): 81 | mean, std = load_mean_and_std() 82 | pose.body.data = (pose.body.data * std) + mean 83 | return pose 84 | 85 | 86 | def get_mean_and_std(directory: str): 87 | cumulative_sum, squared_sum, frames_count = None, None, None 88 | 89 | from tqdm import tqdm 90 | 91 | for file in tqdm(list(Path(directory).glob("*.pose"))): 92 | # Get the pose 93 | with open(file, 'rb') as pose_file: 94 | pose = Pose.read(pose_file.read()) 95 | pose = pre_process_mediapipe(pose) 96 | tensor = pose.body.data.filled(0) 97 | 98 | # Get relevant values 99 | frames_sum = np.sum(tensor, axis=(0, 1)) 100 | frames_squared_sum = np.sum(np.square(tensor), axis=(0, 1)) 101 | # pylint: disable=singleton-comparison 102 | unmasked_frames = pose.body.data[:, :, :, 0:1].mask == False 103 | num_unmasked_frames = np.sum(unmasked_frames, axis=(0, 1)) 104 | 105 | # Update cumulative values 106 | cumulative_sum = frames_sum if cumulative_sum is None else cumulative_sum + frames_sum 107 | squared_sum = frames_squared_sum if squared_sum is None else squared_sum + frames_squared_sum 108 | frames_count = num_unmasked_frames if frames_count is None else frames_count + num_unmasked_frames 109 | 110 | mean = cumulative_sum / frames_count 111 | std = np.sqrt((squared_sum / frames_count) - np.square(mean)) 112 | 113 | return mean, std 114 | 115 | 116 | def main(poses_location: str): 117 | mean, std = get_mean_and_std(poses_location) 118 | 119 | # get a single random pose 120 | random_pose_path = Path(poses_location).glob("*.pose").__next__() 121 | with open(random_pose_path, 'rb') as pose_file: 122 | pose = Pose.read(pose_file.read()) 123 | pose = pre_process_mediapipe(pose) 124 | 125 | # store header 126 | with open(CURRENT_DIR / "header.poseheader", "wb") as f: 127 | pose.header.write(f) 128 | 129 | i = 0 130 | mean_std_info = {} 131 | for component in pose.header.components: 132 | component_info = {} 133 | for point in component.points: 134 | component_info[point] = { 135 | "mean": mean[i].tolist(), 136 | "std": std[i].tolist() 137 | } 138 | i += 1 139 | mean_std_info[component.name] = component_info 140 | 141 | import json 142 | 143 | with open(CURRENT_DIR / "pose_normalization.json", "w", encoding="utf-8") as f: 144 | json.dump(mean_std_info, f, indent=2) 145 | 146 | 147 | if __name__ == "__main__": 148 | import argparse 149 | 150 | parser = argparse.ArgumentParser(description='Collect normalization info') 151 | parser.add_argument('--dir', type=str, help='Directory containing the pose files') 152 | 153 | args = parser.parse_args() 154 | 155 | if not Path(args.dir).exists(): 156 | raise FileNotFoundError(f"Directory {args.dir} does not exist") 157 | 158 | main(args.dir) 159 | -------------------------------------------------------------------------------- /sign_vq/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import random 5 | 6 | import numpy as np 7 | import pytorch_lightning as pl 8 | import torch 9 | from pose_format.torch.masked.collator import zero_pad_collator 10 | from pytorch_lightning.callbacks import LearningRateMonitor 11 | from 
pytorch_lightning.loggers import WandbLogger 12 | from torch.utils.data import DataLoader 13 | 14 | from sign_vq.data.normalize import load_pose_header 15 | from sign_vq.dataset import ZipPoseDataset, DirectoryPoseDataset, PackedDataset 16 | from sign_vq.model import PoseFSQAutoEncoder, AutoEncoderLightningWrapper 17 | 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser() 21 | 22 | # Define your arguments here 23 | parser.add_argument('--data-path', type=str, help='Path to training dataset') 24 | parser.add_argument('--wandb-dir', default=None, type=str, help='Path to wandb directory') 25 | parser.add_argument('--lr', type=float, default=3e-4, help='Learning rate') 26 | parser.add_argument('--steps', type=int, default=int(1e7), help='Number of training iterations') 27 | parser.add_argument('--batch-size', type=int, default=240, help='Batch size') 28 | parser.add_argument('--loss-hand-weight', type=int, default=10, help='Weight for hand reconstruction loss') 29 | parser.add_argument('--num-layers', type=int, default=8, help='Number of transformer layers') 30 | parser.add_argument('--codebook-size', type=int, default=2 ** 10, 31 | choices=[2 ** 8, 2 ** 9, 2 ** 10, 2 ** 11, 2 ** 12, 2 ** 14, 2 ** 16], 32 | help='Estimated number of codes in the VQ model') 33 | parser.add_argument('--num-codebooks', type=int, default=1, help='Number of codebooks') 34 | parser.add_argument('--max-pose-length', type=int, default=512, help='Maximum pose length in the dataset') 35 | parser.add_argument('--seed', type=int, default=42, help='Random seed') 36 | parser.add_argument('--device', type=str, 37 | default='gpu' if torch.cuda.is_available() else 'cpu', 38 | help='Device to use for training') 39 | parser.add_argument('--dtype', type=str, default='float32', help='Data type to use for training', 40 | choices=['bfloat16', 'float16', 'float32']) 41 | 42 | args = parser.parse_args() 43 | 44 | # Set random seed 45 | if args.seed is not None: 46 | torch.random.manual_seed(args.seed) 47 | torch.manual_seed(args.seed) 48 | np.random.seed(args.seed) 49 | random.seed(args.seed) 50 | 51 | return args 52 | 53 | 54 | def create_loss_weights(hand_weight=1): 55 | header = load_pose_header() 56 | 57 | total_points = header.total_points() 58 | hand_points = 21 59 | affected_points = 2 * (hand_points + 1) # wrist + hand_points 60 | # We want the loss to be the same scale across different runs, so we change the default weight accordingly 61 | default_weight = total_points / ((total_points - affected_points) + (affected_points * hand_weight)) 62 | 63 | weights = torch.full((total_points, 1), fill_value=default_weight, dtype=torch.float32) 64 | for hand in ["RIGHT", "LEFT"]: 65 | # pylint: disable=protected-access 66 | wrist_index = header._get_point_index(f"{hand}_HAND_LANDMARKS", "WRIST") 67 | weights[wrist_index: wrist_index + hand_points, :] = hand_weight 68 | # pylint: disable=protected-access 69 | body_wrist_index = header._get_point_index("POSE_LANDMARKS", f"{hand}_WRIST") 70 | weights[body_wrist_index, :] = hand_weight 71 | return weights 72 | 73 | 74 | # pylint: disable=too-many-locals 75 | def main(): 76 | args = parse_args() 77 | torch_dtype = getattr(torch, args.dtype) 78 | 79 | # internal multiplications use the bfloat16 datatype, if a fast matrix multiplication algorithm is available. 
80 | torch.set_float32_matmul_precision("medium") 81 | 82 | auto_encoder = PoseFSQAutoEncoder(codebook_size=args.codebook_size, 83 | num_codebooks=args.num_codebooks, 84 | num_layers=args.num_layers) 85 | loss_weights = create_loss_weights(hand_weight=args.loss_hand_weight) 86 | model = AutoEncoderLightningWrapper(auto_encoder, learning_rate=args.lr, loss_weights=loss_weights) 87 | 88 | if args.data_path.endswith(".zip"): 89 | dataset = ZipPoseDataset(args.data_path, in_memory=True, 90 | dtype=torch_dtype, max_length=args.max_pose_length) 91 | training_dataset = dataset.slice(10, None) 92 | validation_dataset = dataset.slice(0, 10) 93 | shuffle = True # Shuffle is only slow without in_memory since the zip file is read sequentially 94 | num_workers = 0 # Reading from multiple workers errors out since the zip file is read sequentially 95 | else: 96 | dataset = DirectoryPoseDataset(args.data_path) 97 | training_dataset = dataset.slice(10, None) 98 | validation_dataset = dataset.slice(0, 10) 99 | shuffle = True 100 | num_workers = os.cpu_count() 101 | 102 | training_iter_dataset = PackedDataset(training_dataset, max_length=args.max_pose_length, shuffle=shuffle) 103 | 104 | train_dataset = DataLoader(training_iter_dataset, 105 | batch_size=args.batch_size, 106 | num_workers=num_workers, 107 | collate_fn=zero_pad_collator) 108 | validation_dataset = DataLoader(validation_dataset, 109 | batch_size=args.batch_size, 110 | shuffle=False, 111 | num_workers=num_workers, 112 | collate_fn=zero_pad_collator) 113 | 114 | logger = WandbLogger(project="sign-language-vq", 115 | save_dir=args.wandb_dir, 116 | log_model=False, offline=args.wandb_dir is None) 117 | logger.log_hyperparams(auto_encoder.args_dict) 118 | 119 | # Save model arguments to file 120 | with open(os.path.join(logger.experiment.dir, "args.json"), "w", encoding="utf-8") as f: 121 | json.dump(auto_encoder.args_dict, f, indent=2) 122 | 123 | lr_monitor = LearningRateMonitor(logging_interval='step') 124 | callbacks = [lr_monitor] 125 | 126 | precision = "bf16-mixed" if args.dtype == "bfloat16" else ("16-mixed" if args.dtype == "float16" else None) 127 | trainer = pl.Trainer(max_steps=args.steps, 128 | logger=logger, 129 | callbacks=callbacks, 130 | val_check_interval=100_000 // args.batch_size, 131 | accelerator=args.device, 132 | profiler="simple", 133 | precision=precision, 134 | gradient_clip_val=1, # Taken from the Llamma 2 paper 135 | ) 136 | 137 | # TODO: https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html requiring LightningDataModule 138 | # tuner = Tuner(trainer) 139 | # # Auto-scale batch size with binary search 140 | # tuner.scale_batch_size(model, mode="binsearch") 141 | 142 | trainer.fit(model, train_dataloaders=train_dataset, val_dataloaders=validation_dataset) 143 | 144 | 145 | if __name__ == '__main__': 146 | main() 147 | -------------------------------------------------------------------------------- /sign_vq/dataset.py: -------------------------------------------------------------------------------- 1 | import io 2 | import random 3 | import time 4 | import zipfile 5 | from itertools import islice 6 | from pathlib import Path 7 | 8 | import numpy as np 9 | import psutil 10 | import torch 11 | from pose_format.torch.masked import MaskedTorch 12 | from pose_format.torch.masked.tensor import MaskedTensor 13 | from torch.utils.data import Dataset, IterableDataset 14 | from tqdm import tqdm 15 | 16 | 17 | def print_memory(): 18 | # Get current process 19 | process = psutil.Process() 20 | 21 | # Get the memory 
info of the current process 22 | memory_info = process.memory_info() 23 | 24 | # Convert bytes to GB 25 | rss_in_gb = memory_info.rss / (1024 ** 3) 26 | vms_in_gb = memory_info.vms / (1024 ** 3) 27 | 28 | # Print the RSS and VMS in GB 29 | print(f"Memory used in GB: RSS={rss_in_gb:.2f}, VMS={vms_in_gb:.2f}") 30 | 31 | 32 | def preprocess_pose(pose, dtype=torch.float16): 33 | tensor_data = torch.tensor(pose['data'], dtype=dtype) 34 | tensor_mask = torch.tensor(pose['mask'], dtype=torch.bool) 35 | tensor_mask = torch.logical_not(tensor_mask) # numpy and torch have different mask conventions 36 | tensor = MaskedTensor(tensor=tensor_data, mask=tensor_mask) 37 | 38 | return tensor 39 | 40 | 41 | def crop_pose(tensor, max_length: int): 42 | if max_length is not None: 43 | offset = random.randint(0, len(tensor) - max_length) \ 44 | if len(tensor) > max_length else 0 45 | return tensor[offset:offset + max_length] 46 | return tensor 47 | 48 | 49 | class _ZipPoseDataset(Dataset): 50 | def __init__(self, zip_obj: zipfile.ZipFile, 51 | files: list, 52 | max_length: int = 512, 53 | in_memory: bool = False, 54 | dtype=torch.float16): 55 | self.max_length = max_length 56 | self.zip = zip_obj 57 | self.files = files 58 | self.in_memory = in_memory 59 | self.dtype = dtype 60 | self.memory_files = [] 61 | 62 | def __len__(self): 63 | return len(self.files) 64 | 65 | def __getitem__(self, idx): 66 | if len(self.memory_files) == len(self.files): 67 | tensor = self.memory_files[idx] 68 | else: 69 | # If we want to store in memory, we first load sequentially all the files 70 | idx = idx if not self.in_memory else len(self.memory_files) 71 | 72 | with self.zip.open(self.files[idx]) as file: 73 | file_content = file.read() # Read the entire file content 74 | 75 | # Convert the bytes content to a BytesIO object and load with numpy 76 | pose_file = io.BytesIO(file_content) 77 | pose = np.load(pose_file) 78 | tensor = preprocess_pose(pose, dtype=self.dtype) 79 | if self.in_memory: 80 | self.memory_files.append(tensor) 81 | if len(self.memory_files) % 10000 == 0: 82 | print_memory() 83 | 84 | cropped_pose = crop_pose(tensor, self.max_length) 85 | if cropped_pose.dtype != self.dtype: 86 | cropped_pose = MaskedTensor(tensor=cropped_pose.tensor.type(self.dtype), 87 | mask=cropped_pose.mask) 88 | return cropped_pose 89 | 90 | def slice(self, start, end): 91 | return _ZipPoseDataset(zip_obj=self.zip, files=self.files[start:end], 92 | max_length=self.max_length, in_memory=self.in_memory, dtype=self.dtype) 93 | 94 | 95 | class ZipPoseDataset(_ZipPoseDataset): 96 | def __init__(self, zip_path: Path, max_length: int = 512, in_memory: bool = False, dtype=torch.float32): 97 | print(f"ZipPoseDataset @ {zip_path} with max_length={max_length}, in_memory={in_memory}") 98 | 99 | # pylint: disable=consider-using-with 100 | self.zip_obj = zipfile.ZipFile(zip_path, 'r') 101 | files = self.zip_obj.namelist() 102 | print("Total files", len(files)) 103 | 104 | super().__init__(zip_obj=self.zip_obj, files=files, 105 | max_length=max_length, in_memory=in_memory, dtype=dtype) 106 | 107 | def __del__(self): 108 | self.zip_obj.close() 109 | 110 | 111 | class PackedDataset(IterableDataset): 112 | def __init__(self, dataset: Dataset, max_length: int, shuffle=True): 113 | self.dataset = dataset 114 | self.max_length = max_length 115 | self.shuffle = shuffle 116 | 117 | def __iter__(self): 118 | dataset_len = len(self.dataset) 119 | datum_idx = 0 120 | 121 | datum_shape = self.dataset[0].shape 122 | padding_shape = tuple([10] + 
list(datum_shape)[1:]) 123 | padding = MaskedTensor(tensor=torch.zeros(padding_shape), mask=torch.zeros(padding_shape)) 124 | 125 | while True: 126 | poses = [] 127 | total_length = 0 128 | while total_length < self.max_length: 129 | if self.shuffle: 130 | datum_idx = random.randint(0, dataset_len - 1) 131 | else: 132 | datum_idx = (datum_idx + 1) % dataset_len 133 | 134 | # Append pose 135 | pose = self.dataset[datum_idx] 136 | poses.append(pose) 137 | total_length += len(pose) 138 | 139 | # Append padding 140 | poses.append(padding) 141 | total_length += len(padding) 142 | 143 | concatenated_pose = MaskedTorch.cat(poses, dim=0)[:self.max_length] 144 | yield concatenated_pose 145 | 146 | 147 | class HuggingfacePoseDataset(Dataset): 148 | def __init__(self, dataset_path: Path, max_length: int = 512): 149 | now = time.time() 150 | from datasets import load_from_disk 151 | 152 | self.dataset = load_from_disk(str(dataset_path))["train"] 153 | self.max_length = max_length 154 | print("Loaded huggingface dataset in", time.time() - now) 155 | 156 | def __len__(self): 157 | return len(self.dataset) 158 | 159 | def __getitem__(self, idx): 160 | pose = self.dataset[idx] 161 | return crop_pose(preprocess_pose(pose), self.max_length) 162 | 163 | 164 | class DirectoryPoseDataset(Dataset): 165 | def __init__(self, directory_path: Path, max_length: int = 512): 166 | self.directory_path = directory_path 167 | self.files = list(directory_path.glob('*.npz')) 168 | self.max_length = max_length 169 | 170 | def __len__(self): 171 | return len(self.files) 172 | 173 | def __getitem__(self, idx): 174 | pose = np.load(self.files[idx]) 175 | return crop_pose(preprocess_pose(pose), self.max_length) 176 | 177 | 178 | def benchmark_dataloader(dataset, num_workers: int): 179 | print(f"{num_workers} workers") 180 | from torch.utils.data import DataLoader 181 | from pose_format.torch.masked.collator import zero_pad_collator 182 | 183 | data_loader = DataLoader(dataset, batch_size=1, shuffle=True, 184 | collate_fn=zero_pad_collator, 185 | num_workers=num_workers) 186 | for _ in tqdm(islice(data_loader, 200)): 187 | pass 188 | 189 | 190 | def benchmark(): 191 | # Benchmark 192 | datasets = [ 193 | # HuggingfacePoseDataset(Path("/scratch/amoryo/poses/huggingface"), max_length=512), 194 | ZipPoseDataset(Path('/scratch/amoryo/poses/normalized.zip'), max_length=512), 195 | DirectoryPoseDataset(Path('/scratch/amoryo/poses/normalized'), max_length=512), 196 | ] 197 | 198 | for dataset in datasets: 199 | print("Benchmarking", dataset.__class__.__name__) 200 | 201 | print("Benchmarking dataset") 202 | print(next(iter(dataset)).shape) 203 | for _ in tqdm(islice(iter(dataset), 500)): 204 | pass 205 | 206 | print("Benchmarking data loader") 207 | benchmark_dataloader(dataset, 0) 208 | benchmark_dataloader(dataset, 1) 209 | benchmark_dataloader(dataset, 4) 210 | benchmark_dataloader(dataset, 8) 211 | 212 | 213 | if __name__ == "__main__": 214 | benchmark() 215 | -------------------------------------------------------------------------------- /sign_vq/model.py: -------------------------------------------------------------------------------- 1 | """forked from https://github.com/lucidrains/vector-quantize-pytorch/blob/master/examples/autoencoder_fsq.py""" 2 | import inspect 3 | import math 4 | import sys 5 | from itertools import islice 6 | from typing import Union 7 | 8 | import numpy as np 9 | import pytorch_lightning as pl 10 | import torch 11 | import wandb 12 | from pose_format.torch.masked import MaskedTensor 13 | from torch 
import Tensor, nn 14 | from vector_quantize_pytorch import FSQ 15 | 16 | from sign_vq.utils import draw_original_and_predicted_pose 17 | 18 | user_command = " ".join(sys.argv) 19 | IS_TESTING = "pytest" in user_command or "unittest" in user_command 20 | 21 | def estimate_levels(codebook_size: int): 22 | # Codebook levels based on https://arxiv.org/pdf/2309.15505.pdf Section 4.1 23 | levels = { 24 | 2 ** 4: [4, 4], # Not mentioned in the paper, used for tests 25 | 2 ** 8: [8, 6, 5], 26 | 2 ** 9: [4, 5, 5, 5], # Not mentioned in the paper 27 | 2 ** 10: [8, 5, 5, 5], 28 | 2 ** 11: [4, 4, 5, 5, 5], # Not mentioned in the paper 29 | 2 ** 12: [7, 5, 5, 5, 5], 30 | 2 ** 14: [8, 8, 8, 6, 5], 31 | 2 ** 16: [8, 8, 8, 5, 5, 5] 32 | } 33 | if codebook_size in levels: 34 | return levels[codebook_size] 35 | 36 | raise ValueError("Codebook size not supported. Supported sizes are 2^4, 2^8, 2^9, 2^10, 2^11, 2^12, 2^14, 2^16") 37 | 38 | 39 | class PositionalEncoding(nn.Module): 40 | # From https://pytorch.org/tutorials/beginner/transformer_tutorial.html 41 | def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 100000): 42 | super().__init__() 43 | self.dropout = nn.Dropout(p=dropout) 44 | 45 | position = torch.arange(max_len).unsqueeze(1) 46 | div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)) 47 | embedding = torch.zeros(max_len, 1, d_model) 48 | embedding[:, 0, 0::2] = torch.sin(position * div_term) 49 | embedding[:, 0, 1::2] = torch.cos(position * div_term) 50 | self.register_buffer('pe', embedding) 51 | 52 | def forward(self, x: Tensor) -> Tensor: 53 | """ 54 | Arguments: 55 | x: Tensor, shape ``[seq_len, batch_size, embedding_dim]`` 56 | """ 57 | x = x + self.pe[:x.size(0)] 58 | return self.dropout(x) 59 | 60 | 61 | class PoseFSQAutoEncoder(nn.Module): 62 | # pylint: disable=too-many-arguments 63 | def __init__(self, codebook_size: int, 64 | pose_dims: tuple = (178, 3), 65 | num_codebooks: int = 4, 66 | hidden_dim=512, 67 | nhead=16, 68 | dim_feedforward=2048, 69 | num_layers=6): 70 | super().__init__() 71 | 72 | # Store a dictionary of all arguments passed to the constructor 73 | self.args_dict = locals() 74 | del self.args_dict['self'] 75 | del self.args_dict['__class__'] 76 | 77 | levels = estimate_levels(codebook_size) 78 | 79 | # Calculate the exact number of codes based on the levels 80 | self.num_codes = math.prod(levels) 81 | self.num_codebooks = num_codebooks 82 | 83 | self.encoder = nn.Sequential( 84 | nn.Flatten(start_dim=2), 85 | nn.Dropout(0.15), 86 | nn.Linear(math.prod(pose_dims), hidden_dim, bias=False), 87 | PositionalEncoding(d_model=hidden_dim), 88 | nn.TransformerEncoder( 89 | nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nhead, 90 | dim_feedforward=dim_feedforward, 91 | batch_first=True), 92 | num_layers=num_layers 93 | ) 94 | ) 95 | 96 | self.fsq = FSQ(levels, dim=hidden_dim, num_codebooks=num_codebooks) 97 | 98 | self.decoder = nn.Sequential( 99 | PositionalEncoding(d_model=hidden_dim), 100 | nn.TransformerEncoder( 101 | nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nhead, 102 | dim_feedforward=dim_feedforward, 103 | batch_first=True), 104 | num_layers=num_layers 105 | ), 106 | nn.Linear(hidden_dim, math.prod(pose_dims)), 107 | nn.Unflatten(dim=2, unflattened_size=pose_dims) 108 | ) 109 | 110 | def __getattr__(self, item): 111 | if item == "device": 112 | return next(self.parameters()).device 113 | return super().__getattr__(item) 114 | 115 | def quantize(self, x: Tensor): 116 | x = self.encoder(x) 117 | _, indices =
self.fsq(x) 118 | return indices 119 | 120 | def unquantize(self, indices: Tensor): 121 | # (batch, codes) or (batch, codes, codebooks) 122 | indices = indices.view(len(indices), -1, self.num_codebooks) 123 | x = self.fsq.indices_to_codes(indices) 124 | x = self.decoder(x) 125 | return x 126 | 127 | @torch.compile(disable=IS_TESTING or True) 128 | def forward(self, batch: Union[MaskedTensor, Tensor]): 129 | tensor = batch.tensor if isinstance(batch, MaskedTensor) else batch 130 | x = self.encoder(tensor) 131 | x, indices = self.fsq(x) 132 | x = self.decoder(x) 133 | return x, indices 134 | 135 | 136 | def masked_loss(loss_type: str, 137 | tensor1: torch.Tensor, 138 | tensor2: torch.Tensor, 139 | confidence: torch.Tensor, 140 | loss_weights: torch.Tensor = None): 141 | assert tensor1.dtype == tensor2.dtype, "Tensors must have the same dtype" 142 | assert tensor1.dtype == torch.float32, "Tensors must be float32, or casted" 143 | difference = tensor1 - tensor2 144 | 145 | if loss_type == 'l1': 146 | error = torch.abs(difference) 147 | elif loss_type == 'l2': 148 | error = torch.pow(difference, 2) 149 | else: 150 | raise NotImplementedError() 151 | 152 | masked_error = error * confidence # confidence is 0 for masked values 153 | 154 | if loss_weights is not None: 155 | masked_error = masked_error * loss_weights 156 | 157 | return masked_error.mean() 158 | 159 | 160 | # pylint: disable=abstract-method,too-many-ancestors,arguments-differ 161 | class AutoEncoderLightningWrapper(pl.LightningModule): 162 | def __init__(self, model: PoseFSQAutoEncoder, 163 | learning_rate: float = 3e-4, 164 | warmup_steps: int = 10000, # For some reason, this is only 400 steps 165 | loss_weights: torch.Tensor = None): 166 | super().__init__() 167 | self.model = model 168 | self.learning_rate = learning_rate 169 | self.loss_weights = loss_weights 170 | self.warmup_steps = warmup_steps 171 | 172 | def forward(self, batch): 173 | return self.model(batch) 174 | 175 | def configure_optimizers(self): 176 | # Optimizer taken from https://arxiv.org/pdf/2307.09288.pdf 177 | fused = 'fused' in inspect.signature(torch.optim.AdamW).parameters and 'cuda' in str(self.device) 178 | optimizer = torch.optim.AdamW(self.parameters(), 179 | lr=self.learning_rate, 180 | betas=(0.9, 0.95), 181 | eps=1e-5, 182 | weight_decay=0.1, 183 | fused=fused) 184 | 185 | def warm_decay(step): 186 | if step < self.warmup_steps: 187 | return min(step / self.warmup_steps, 1) 188 | 189 | # Don't go below a tenth of the learning rate 190 | return max(0.1, self.warmup_steps ** 0.5 * step ** -0.5) 191 | 192 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, warm_decay) 193 | 194 | return { 195 | "optimizer": optimizer, 196 | "lr_scheduler": { 197 | "scheduler": scheduler, 198 | "interval": "step", # runs per batch rather than per epoch 199 | "frequency": 1, 200 | "monitor": "train_loss", 201 | "name": "learning_rate" # used in LearningRateMonitor 202 | } 203 | } 204 | 205 | def get_codebook_util(self, indices: Tensor): 206 | if self.model.num_codebooks == 1: 207 | codebooks = [indices] 208 | else: 209 | codebooks = [indices[:, :, i] for i in range(self.model.num_codebooks)] 210 | uniques = [codebook.unique().numel() for codebook in codebooks] 211 | mean_unique = torch.tensor(uniques, dtype=torch.float).mean() 212 | return mean_unique / self.model.num_codes * 100 213 | 214 | def step(self, x: MaskedTensor): 215 | batch_size = x.shape[0] 216 | 217 | x_hat, indices = self(x) 218 | 219 | if self.loss_weights is not None and self.loss_weights.device != 
self.device: 220 | self.loss_weights = self.loss_weights.to(self.device) 221 | 222 | loss = masked_loss('l2', tensor1=x_hat, tensor2=x.tensor, 223 | confidence=x.mask, loss_weights=self.loss_weights) 224 | 225 | phase = "train" if self.training else "validation" 226 | self.log(f"{phase}_code_utilization", self.get_codebook_util(indices), batch_size=1) 227 | self.log(f"{phase}_loss", loss, batch_size=batch_size) 228 | 229 | return loss, x_hat 230 | 231 | def training_step(self, batch, *args, **kwargs): 232 | loss, _ = self.step(batch) 233 | return loss 234 | 235 | def validation_step(self, batch, batch_idx, *args, **kwargs): 236 | loss, prediction = self.step(batch) 237 | 238 | if batch_idx == 0: 239 | for i, (pose, pred) in enumerate(islice(zip(batch, prediction), 5)): 240 | video = draw_original_and_predicted_pose(pose, pred) 241 | # axes are (time, channel, height, width) 242 | video = np.moveaxis(video, 3, 1) 243 | wandb.log({f"validation_{i}": wandb.Video(video, fps=25, format="mp4")}) 244 | 245 | return loss 246 | -------------------------------------------------------------------------------- /sign_vq/data/pose_normalization.json: -------------------------------------------------------------------------------- 1 | { 2 | "POSE_LANDMARKS": { 3 | "LEFT_SHOULDER": { 4 | "mean": [ 5 | 0.49811875442668313, 6 | 0.0032490109996838904, 7 | 0.000147583102824792 8 | ], 9 | "std": [ 10 | 0.06389874322593754, 11 | 0.042241562285976864, 12 | 0.0010676480005221826 13 | ] 14 | }, 15 | "RIGHT_SHOULDER": { 16 | "mean": [ 17 | -0.49811872016194336, 18 | -0.003249005645818303, 19 | -0.00014758320320977178 20 | ], 21 | "std": [ 22 | 0.06909697373063844, 23 | 0.044427591383087044, 24 | 0.0011663787247761764 25 | ] 26 | }, 27 | "LEFT_ELBOW": { 28 | "mean": [ 29 | 0.7170717092987702, 30 | 0.7945065946860426, 31 | -0.0007108388241079718 32 | ], 33 | "std": [ 34 | 0.12527391106921382, 35 | 0.13082757513575483, 36 | 0.0018681364771636498 37 | ] 38 | }, 39 | "RIGHT_ELBOW": { 40 | "mean": [ 41 | -0.76120606470132, 42 | 0.7592317303962558, 43 | -0.0018246002030853994 44 | ], 45 | "std": [ 46 | 0.1447455519564686, 47 | 0.17235698691671855, 48 | 0.001956124592199861 49 | ] 50 | }, 51 | "LEFT_WRIST": { 52 | "mean": [ 53 | 0.49924383715675574, 54 | 0.962204330203061, 55 | -0.0005712816175316294 56 | ], 57 | "std": [ 58 | 0.24803886871088393, 59 | 0.5535785735795531, 60 | 0.0028114675470507027 61 | ] 62 | }, 63 | "RIGHT_WRIST": { 64 | "mean": [ 65 | -0.5148610545100442, 66 | 0.7364611661171545, 67 | -0.000614577391613304 68 | ], 69 | "std": [ 70 | 0.2453588074790006, 71 | 0.6231745305001263, 72 | 0.003588491703721872 73 | ] 74 | }, 75 | "LEFT_HIP": { 76 | "mean": [ 77 | 0.329768001499288, 78 | 1.406531383649082, 79 | 0.0020794641482117207 80 | ], 81 | "std": [ 82 | 0.06274473806447825, 83 | 0.12962222291676376, 84 | 0.0007489519373830634 85 | ] 86 | }, 87 | "RIGHT_HIP": { 88 | "mean": [ 89 | -0.32991636782245576, 90 | 1.393718700980542, 91 | 0.002406542504698681 92 | ], 93 | "std": [ 94 | 0.062238323395291464, 95 | 0.13129726332962338, 96 | 0.0008129326188274317 97 | ] 98 | } 99 | }, 100 | "FACE_LANDMARKS": { 101 | "0": { 102 | "mean": [ 103 | -0.02785132824093435, 104 | -0.5050171753910451, 105 | 0.0020419819898012145 106 | ], 107 | "std": [ 108 | 0.054176928489284217, 109 | 0.05873066218870624, 110 | 0.0007363608412831227 111 | ] 112 | }, 113 | "7": { 114 | "mean": [ 115 | -0.1680000848163223, 116 | -0.6869448242889381, 117 | 0.002354293323234747 118 | ], 119 | "std": [ 120 | 0.053502259300988675, 121 | 
0.057240081010188966, 122 | 0.0007645422437288949 123 | ] 124 | }, 125 | "10": { 126 | "mean": [ 127 | -0.031463413774283966, 128 | -0.8762583553898946, 129 | 0.0022675009127104346 130 | ], 131 | "std": [ 132 | 0.06044251800704197, 133 | 0.06225174214400763, 134 | 0.0007545558048326379 135 | ] 136 | }, 137 | "13": { 138 | "mean": [ 139 | -0.026807027272479144, 140 | -0.48220679993248317, 141 | 0.0020870761780783877 142 | ], 143 | "std": [ 144 | 0.05311451862499276, 145 | 0.05747113071839264, 146 | 0.0007422723662957471 147 | ] 148 | }, 149 | "14": { 150 | "mean": [ 151 | -0.02592243669128027, 152 | -0.47329803242772983, 153 | 0.0020895003172205938 154 | ], 155 | "std": [ 156 | 0.05308800466366484, 157 | 0.058568077102100165, 158 | 0.0007425147160334977 159 | ] 160 | }, 161 | "17": { 162 | "mean": [ 163 | -0.025207519498790434, 164 | -0.44206922525007303, 165 | 0.002069346603119158 166 | ], 167 | "std": [ 168 | 0.05303372369935548, 169 | 0.059326060197893785, 170 | 0.0007422589746339403 171 | ] 172 | }, 173 | "21": { 174 | "mean": [ 175 | -0.24194029208826862, 176 | -0.7612858504432622, 177 | 0.002659218584683548 178 | ], 179 | "std": [ 180 | 0.05384655171262683, 181 | 0.05873420036558397, 182 | 0.0008008683225428737 183 | ] 184 | }, 185 | "33": { 186 | "mean": [ 187 | -0.17467202768498086, 188 | -0.6913884859131909, 189 | 0.0023741180020293816 190 | ], 191 | "std": [ 192 | 0.053508484065091816, 193 | 0.05720449331088391, 194 | 0.0007669763933133368 195 | ] 196 | }, 197 | "37": { 198 | "mean": [ 199 | -0.05104042213138511, 200 | -0.5071368286110022, 201 | 0.002052079822015075 202 | ], 203 | "std": [ 204 | 0.05392825651271605, 205 | 0.05866903991718037, 206 | 0.0007374505328901478 207 | ] 208 | }, 209 | "39": { 210 | "mean": [ 211 | -0.07422246515038163, 212 | -0.4979359731807495, 213 | 0.002083884200535911 214 | ], 215 | "std": [ 216 | 0.05317772120605334, 217 | 0.057752436179152274, 218 | 0.000741149317436575 219 | ] 220 | }, 221 | "40": { 222 | "mean": [ 223 | -0.08887981158287313, 224 | -0.4865661255857649, 225 | 0.002123320991935124 226 | ], 227 | "std": [ 228 | 0.05243372439397268, 229 | 0.056846857692070825, 230 | 0.0007452993867357136 231 | ] 232 | }, 233 | "46": { 234 | "mean": [ 235 | -0.2007209661882922, 236 | -0.7301774742086928, 237 | 0.002345946682596718 238 | ], 239 | "std": [ 240 | 0.05458310530243733, 241 | 0.05997790650607773, 242 | 0.0007637983192901707 243 | ] 244 | }, 245 | "52": { 246 | "mean": [ 247 | -0.1531603335401875, 248 | -0.7548100967861631, 249 | 0.002247320661341892 250 | ], 251 | "std": [ 252 | 0.05637522526148448, 253 | 0.061208593738296485, 254 | 0.0007522827577299653 255 | ] 256 | }, 257 | "53": { 258 | "mean": [ 259 | -0.18107596881675758, 260 | -0.7465079787129559, 261 | 0.0022925933730523276 262 | ], 263 | "std": [ 264 | 0.05565351173840385, 265 | 0.06073334059122825, 266 | 0.0007576344579103584 267 | ] 268 | }, 269 | "54": { 270 | "mean": [ 271 | -0.22255682886593148, 272 | -0.803044164857472, 273 | 0.0025558312684087147 274 | ], 275 | "std": [ 276 | 0.055535620483107326, 277 | 0.06047848670247419, 278 | 0.0007882524575056893 279 | ] 280 | }, 281 | "55": { 282 | "mean": [ 283 | -0.0690302788952475, 284 | -0.7398689056002739, 285 | 0.0021905605296127424 286 | ], 287 | "std": [ 288 | 0.05659324938822962, 289 | 0.060439979271152945, 290 | 0.0007455072909711018 291 | ] 292 | }, 293 | "58": { 294 | "mean": [ 295 | -0.22676885821189866, 296 | -0.45777674837723165, 297 | 0.0027249361162739825 298 | ], 299 | "std": [ 300 | 0.04767191402614754, 301 | 
0.05333516861834808, 302 | 0.000817281356110267 303 | ] 304 | }, 305 | "61": { 306 | "mean": [ 307 | -0.10353388133300996, 308 | -0.469348652332285, 309 | 0.002209297314482608 310 | ], 311 | "std": [ 312 | 0.05141915559171282, 313 | 0.05577020341554144, 314 | 0.000755623037379187 315 | ] 316 | }, 317 | "63": { 318 | "mean": [ 319 | -0.19259175744206186, 320 | -0.7602732011970287, 321 | 0.002327041962395112 322 | ], 323 | "std": [ 324 | 0.05574449646551834, 325 | 0.06090890942237185, 326 | 0.0007616197500561888 327 | ] 328 | }, 329 | "65": { 330 | "mean": [ 331 | -0.11613003158807567, 332 | -0.7539834807517299, 333 | 0.002213030491442138 334 | ], 335 | "std": [ 336 | 0.05679429230310796, 337 | 0.061068815155652774, 338 | 0.0007483731691122167 339 | ] 340 | }, 341 | "66": { 342 | "mean": [ 343 | -0.12017839837036004, 344 | -0.7723506063157589, 345 | 0.0022074689235842848 346 | ], 347 | "std": [ 348 | 0.057409026296671904, 349 | 0.06161972480419522, 350 | 0.0007477015785452159 351 | ] 352 | }, 353 | "67": { 354 | "mean": [ 355 | -0.14374608202716224, 356 | -0.8609059834570086, 357 | 0.002343503913756897 358 | ], 359 | "std": [ 360 | 0.058966695423883185, 361 | 0.06194458872601895, 362 | 0.0007632011668635141 363 | ] 364 | }, 365 | "70": { 366 | "mean": [ 367 | -0.21322504338473258, 368 | -0.7386715269530197, 369 | 0.0024012648221198086 370 | ], 371 | "std": [ 372 | 0.05442935505759949, 373 | 0.059989290391956426, 374 | 0.0007702226363310446 375 | ] 376 | }, 377 | "78": { 378 | "mean": [ 379 | -0.09268960672555007, 380 | -0.4722965341947451, 381 | 0.002190784890143734 382 | ], 383 | "std": [ 384 | 0.0511778134263874, 385 | 0.055752565570952546, 386 | 0.0007536989064519186 387 | ] 388 | }, 389 | "80": { 390 | "mean": [ 391 | -0.07281711152833437, 392 | -0.47863826982799834, 393 | 0.00214073171760927 394 | ], 395 | "std": [ 396 | 0.05186038708217797, 397 | 0.05640229156692073, 398 | 0.000747654528801387 399 | ] 400 | }, 401 | "81": { 402 | "mean": [ 403 | -0.05959184255002606, 404 | -0.480915032246929, 405 | 0.002116221736877706 406 | ], 407 | "std": [ 408 | 0.05232827222725727, 409 | 0.057020434424887204, 410 | 0.0007451031892748856 411 | ] 412 | }, 413 | "82": { 414 | "mean": [ 415 | -0.04411728982274298, 416 | -0.4818157596309543, 417 | 0.002095081318935671 418 | ], 419 | "std": [ 420 | 0.05279163040270501, 421 | 0.057376565753177074, 422 | 0.0007430472075392176 423 | ] 424 | }, 425 | "84": { 426 | "mean": [ 427 | -0.0478137863819606, 428 | -0.44283933138044207, 429 | 0.002074903346019355 430 | ], 431 | "std": [ 432 | 0.05279462544981615, 433 | 0.058947840725848515, 434 | 0.0007428896028960655 435 | ] 436 | }, 437 | "87": { 438 | "mean": [ 439 | -0.04438255098367634, 440 | -0.47314634433673575, 441 | 0.0020979847372050143 442 | ], 443 | "std": [ 444 | 0.052832793380968635, 445 | 0.05836289193968788, 446 | 0.0007434565993805872 447 | ] 448 | }, 449 | "88": { 450 | "mean": [ 451 | -0.07419401291119211, 452 | -0.4721949055755358, 453 | 0.0021426583500065285 454 | ], 455 | "std": [ 456 | 0.05185051919566291, 457 | 0.056957622659553016, 458 | 0.0007482058121728885 459 | ] 460 | }, 461 | "91": { 462 | "mean": [ 463 | -0.08463959590649392, 464 | -0.4564948622049564, 465 | 0.0021288973027191473 466 | ], 467 | "std": [ 468 | 0.051904884283799586, 469 | 0.057003431204818086, 470 | 0.0007480434197789585 471 | ] 472 | }, 473 | "93": { 474 | "mean": [ 475 | -0.24902332992689893, 476 | -0.5647245038386677, 477 | 0.0028507862909683294 478 | ], 479 | "std": [ 480 | 0.04894556147555433, 481 | 
0.05450347114642596, 482 | 0.0008288277985390021 483 | ] 484 | }, 485 | "95": { 486 | "mean": [ 487 | -0.08282791651706439, 488 | -0.47129887877248, 489 | 0.0021706250108186275 490 | ], 491 | "std": [ 492 | 0.05144415348843597, 493 | 0.056293785720203264, 494 | 0.0007512783098051726 495 | ] 496 | }, 497 | "103": { 498 | "mean": [ 499 | -0.19127228377743558, 500 | -0.8369369319448686, 501 | 0.0024425916566540576 502 | ], 503 | "std": [ 504 | 0.05729341128588803, 505 | 0.06113540522474803, 506 | 0.0007746428205036559 507 | ] 508 | }, 509 | "105": { 510 | "mean": [ 511 | -0.16052370785189335, 512 | -0.7723860836932952, 513 | 0.002265671985705705 514 | ], 515 | "std": [ 516 | 0.05679694582112656, 517 | 0.06143423274621134, 518 | 0.0007543580612199246 519 | ] 520 | }, 521 | "107": { 522 | "mean": [ 523 | -0.07558214250662117, 524 | -0.768716130834461, 525 | 0.002175764796961407 526 | ], 527 | "std": [ 528 | 0.05761955591354789, 529 | 0.061448577472485694, 530 | 0.0007439033993170853 531 | ] 532 | }, 533 | "109": { 534 | "mean": [ 535 | -0.09239015913130656, 536 | -0.8726368493947754, 537 | 0.0022841188710364217 538 | ], 539 | "std": [ 540 | 0.05999554122189567, 541 | 0.06232442250572296, 542 | 0.0007565109546173195 543 | ] 544 | }, 545 | "127": { 546 | "mean": [ 547 | -0.25483830884968256, 548 | -0.6638620896206249, 549 | 0.0028432842859732587 550 | ], 551 | "std": [ 552 | 0.05093101600774182, 553 | 0.05645386428560972, 554 | 0.0008247804078749868 555 | ] 556 | }, 557 | "132": { 558 | "mean": [ 559 | -0.24163940344000523, 560 | -0.5118864740388028, 561 | 0.002803900837161462 562 | ], 563 | "std": [ 564 | 0.04814457529908911, 565 | 0.053831419750124246, 566 | 0.0008246496353129405 567 | ] 568 | }, 569 | "133": { 570 | "mean": [ 571 | -0.08643600698547504, 572 | -0.6905130799069429, 573 | 0.0023212780143689214 574 | ], 575 | "std": [ 576 | 0.053703897910464385, 577 | 0.05701128481306954, 578 | 0.0007600483862788405 579 | ] 580 | }, 581 | "136": { 582 | "mean": [ 583 | -0.1822457101065596, 584 | -0.38766192193340315, 585 | 0.002496167986463425 586 | ], 587 | "std": [ 588 | 0.04801352168887893, 589 | 0.05306973370362864, 590 | 0.0007929150534695146 591 | ] 592 | }, 593 | "144": { 594 | "mean": [ 595 | -0.14891843663411847, 596 | -0.6817341788180259, 597 | 0.002317724969161873 598 | ], 599 | "std": [ 600 | 0.05355808579514437, 601 | 0.057043074152407856, 602 | 0.0007604453469948871 603 | ] 604 | }, 605 | "145": { 606 | "mean": [ 607 | -0.13222368857093367, 608 | -0.68036346195867, 609 | 0.0023025711174312004 610 | ], 611 | "std": [ 612 | 0.05353169692871112, 613 | 0.05703308877805007, 614 | 0.000758577994019109 615 | ] 616 | }, 617 | "146": { 618 | "mean": [ 619 | -0.09566788796566696, 620 | -0.4628020468525433, 621 | 0.002171218078530479 622 | ], 623 | "std": [ 624 | 0.051541609390959, 625 | 0.05616566278079744, 626 | 0.0007521295947217158 627 | ] 628 | }, 629 | "148": { 630 | "mean": [ 631 | -0.06462791890852784, 632 | -0.32251567819596233, 633 | 0.002203115069432985 634 | ], 635 | "std": [ 636 | 0.05113732861886254, 637 | 0.05506703252179477, 638 | 0.0007644230965723983 639 | ] 640 | }, 641 | "149": { 642 | "mean": [ 643 | -0.12474286835661016, 644 | -0.34646301096537463, 645 | 0.002329364375346743 646 | ], 647 | "std": [ 648 | 0.04950147050356128, 649 | 0.05389209736630041, 650 | 0.0007760651443022351 651 | ] 652 | }, 653 | "150": { 654 | "mean": [ 655 | -0.1513681426908359, 656 | -0.3628038773342566, 657 | 0.0024021373352958886 658 | ], 659 | "std": [ 660 | 0.04870940130629598, 661 | 
0.05340164538912903, 662 | 0.000783158934621887 663 | ] 664 | }, 665 | "152": { 666 | "mean": [ 667 | -0.023050162987498495, 668 | -0.3193749695813225, 669 | 0.0021833342173370567 670 | ], 671 | "std": [ 672 | 0.05144671170432995, 673 | 0.05526442489773067, 674 | 0.000762384307656345 675 | ] 676 | }, 677 | "153": { 678 | "mean": [ 679 | -0.116160165056955, 680 | -0.6819834811634596, 681 | 0.0022997806835869626 682 | ], 683 | "std": [ 684 | 0.05352198625999653, 685 | 0.056668689756010755, 686 | 0.0007579822081799788 687 | ] 688 | }, 689 | "154": { 690 | "mean": [ 691 | -0.10113828342365683, 692 | -0.6856582375356933, 693 | 0.0023075429682422404 694 | ], 695 | "std": [ 696 | 0.05359118372535092, 697 | 0.05663623872674381, 698 | 0.0007587448916653029 699 | ] 700 | }, 701 | "155": { 702 | "mean": [ 703 | -0.09143509200194948, 704 | -0.6881734943949088, 705 | 0.002320186769779114 706 | ], 707 | "std": [ 708 | 0.05364498590164681, 709 | 0.05678118821512366, 710 | 0.0007599917097336975 711 | ] 712 | }, 713 | "157": { 714 | "mean": [ 715 | -0.10257228657205106, 716 | -0.7030269871504472, 717 | 0.002303744118247952 718 | ], 719 | "std": [ 720 | 0.05393483298017217, 721 | 0.05704836913637518, 722 | 0.0007581687887784183 723 | ] 724 | }, 725 | "158": { 726 | "mean": [ 727 | -0.1188433561756979, 728 | -0.7072312965641653, 729 | 0.002295199654346038 730 | ], 731 | "std": [ 732 | 0.054013956361378956, 733 | 0.0575279784434127, 734 | 0.0007574081695868018 735 | ] 736 | }, 737 | "159": { 738 | "mean": [ 739 | -0.13492207080080246, 740 | -0.7080092598700235, 741 | 0.002298591867631228 742 | ], 743 | "std": [ 744 | 0.054126593853900645, 745 | 0.05785651310732329, 746 | 0.0007579634265010493 747 | ] 748 | }, 749 | "160": { 750 | "mean": [ 751 | -0.15152190658583475, 752 | -0.7045783847352441, 753 | 0.00231259791554624 754 | ], 755 | "std": [ 756 | 0.054087950647906204, 757 | 0.057754507217024395, 758 | 0.0007598498871561585 759 | ] 760 | }, 761 | "161": { 762 | "mean": [ 763 | -0.16240675372611538, 764 | -0.6990899589466012, 765 | 0.0023309217644567354 766 | ], 767 | "std": [ 768 | 0.05390329252200322, 769 | 0.05780931350713141, 770 | 0.0007617655636579999 771 | ] 772 | }, 773 | "162": { 774 | "mean": [ 775 | -0.25190811420977566, 776 | -0.7176837417214663, 777 | 0.0027666317920156925 778 | ], 779 | "std": [ 780 | 0.05240024966916248, 781 | 0.057790662962217305, 782 | 0.0008142385780685047 783 | ] 784 | }, 785 | "163": { 786 | "mean": [ 787 | -0.16017648245472985, 788 | -0.6842722180550791, 789 | 0.0023372003746328722 790 | ], 791 | "std": [ 792 | 0.053547892336079754, 793 | 0.057061649577827236, 794 | 0.0007624972241930814 795 | ] 796 | }, 797 | "172": { 798 | "mean": [ 799 | -0.20613804264754768, 800 | -0.4163420535229079, 801 | 0.0026143853461995897 802 | ], 803 | "std": [ 804 | 0.04764562095362656, 805 | 0.052976879295725525, 806 | 0.0008055726290187386 807 | ] 808 | }, 809 | "173": { 810 | "mean": [ 811 | -0.09134457150747649, 812 | -0.69586721318526, 813 | 0.0023141931001578694 814 | ], 815 | "std": [ 816 | 0.0538406135502201, 817 | 0.05697460021631561, 818 | 0.0007592742023224441 819 | ] 820 | }, 821 | "176": { 822 | "mean": [ 823 | -0.09672020908595882, 824 | -0.33256449098446583, 825 | 0.002254576459469918 826 | ], 827 | "std": [ 828 | 0.05041997918514168, 829 | 0.054591754139219316, 830 | 0.0007688804011549972 831 | ] 832 | }, 833 | "178": { 834 | "mean": [ 835 | -0.060734190946897214, 836 | -0.47313985959364646, 837 | 0.0021186309990671405 838 | ], 839 | "std": [ 840 | 0.05238873131965711, 841 | 
0.057804326843174575, 842 | 0.0007456309743288752 843 | ] 844 | }, 845 | "181": { 846 | "mean": [ 847 | -0.06821103110726114, 848 | -0.44830099473728785, 849 | 0.0020989400788208514 850 | ], 851 | "std": [ 852 | 0.052413764661781694, 853 | 0.058034655510089315, 854 | 0.0007450374601393689 855 | ] 856 | }, 857 | "185": { 858 | "mean": [ 859 | -0.09937906526271179, 860 | -0.47771095156622584, 861 | 0.002166970507474207 862 | ], 863 | "std": [ 864 | 0.051828454861457256, 865 | 0.05610533269411619, 866 | 0.0007504991366910427 867 | ] 868 | }, 869 | "191": { 870 | "mean": [ 871 | -0.08363595324788443, 872 | -0.4765214987160809, 873 | 0.0021700126432761526 874 | ], 875 | "std": [ 876 | 0.051451084787897894, 877 | 0.056021297708789886, 878 | 0.0007510678568767678 879 | ] 880 | }, 881 | "234": { 882 | "mean": [ 883 | -0.2521158146980888, 884 | -0.6141702238541892, 885 | 0.0028677706780229502 886 | ], 887 | "std": [ 888 | 0.049823041393306136, 889 | 0.05543031524071138, 890 | 0.0008291695617970166 891 | ] 892 | }, 893 | "246": { 894 | "mean": [ 895 | -0.1690979449591819, 896 | -0.6949211955320395, 897 | 0.0023496281252880498 898 | ], 899 | "std": [ 900 | 0.053718027156369884, 901 | 0.057547977512259575, 902 | 0.0007639818659763227 903 | ] 904 | }, 905 | "249": { 906 | "mean": [ 907 | 0.11274317722252175, 908 | -0.6908338173587892, 909 | 0.0023403684954402737 910 | ], 911 | "std": [ 912 | 0.05351179834213431, 913 | 0.05695804937149908, 914 | 0.0007624822889950964 915 | ] 916 | }, 917 | "251": { 918 | "mean": [ 919 | 0.18823114513340533, 920 | -0.768279148358347, 921 | 0.0026387680016596825 922 | ], 923 | "std": [ 924 | 0.05457323466149632, 925 | 0.05819634588546615, 926 | 0.0007974121919557324 927 | ] 928 | }, 929 | "263": { 930 | "mean": [ 931 | 0.11949447241699945, 932 | -0.6953382091167356, 933 | 0.002359548774334663 934 | ], 935 | "std": [ 936 | 0.05347572384692857, 937 | 0.05702173551468582, 938 | 0.000764799735251073 939 | ] 940 | }, 941 | "267": { 942 | "mean": [ 943 | -0.0043060098711857255, 944 | -0.5083575386090539, 945 | 0.0020495571733207306 946 | ], 947 | "std": [ 948 | 0.05392839415180394, 949 | 0.058483429054732285, 950 | 0.0007369475358249311 951 | ] 952 | }, 953 | "269": { 954 | "mean": [ 955 | 0.02051429176475198, 956 | -0.5001383840744179, 957 | 0.0020787086292905176 958 | ], 959 | "std": [ 960 | 0.05317116264396396, 961 | 0.057520898341158484, 962 | 0.0007405769644601608 963 | ] 964 | }, 965 | "270": { 966 | "mean": [ 967 | 0.03674399968694817, 968 | -0.48964555509375113, 969 | 0.0021155056327456334 970 | ], 971 | "std": [ 972 | 0.05247019013663573, 973 | 0.05657855083080846, 974 | 0.0007442961852285536 975 | ] 976 | }, 977 | "276": { 978 | "mean": [ 979 | 0.1449652137279541, 980 | -0.7350394532284217, 981 | 0.0023294908964586044 982 | ], 983 | "std": [ 984 | 0.05475638796162476, 985 | 0.059618694394514664, 986 | 0.0007614521603970382 987 | ] 988 | }, 989 | "282": { 990 | "mean": [ 991 | 0.09530821632217727, 992 | -0.7577070270957784, 993 | 0.0022354136040337296 994 | ], 995 | "std": [ 996 | 0.056359916116682916, 997 | 0.06089560735954094, 998 | 0.0007508537386812282 999 | ] 1000 | }, 1001 | "283": { 1002 | "mean": [ 1003 | 0.12408091847705226, 1004 | -0.7504372526737427, 1005 | 0.002277535484368364 1006 | ], 1007 | "std": [ 1008 | 0.05568954202204518, 1009 | 0.06038968360667555, 1010 | 0.0007555963893253614 1011 | ] 1012 | }, 1013 | "284": { 1014 | "mean": [ 1015 | 0.1665677457422129, 1016 | -0.8095265748117104, 1017 | 0.002537230788592623 1018 | ], 1019 | "std": [ 1020 | 
0.05617383213457304, 1021 | 0.05933119645339793, 1022 | 0.0007853530565406619 1023 | ] 1024 | }, 1025 | "285": { 1026 | "mean": [ 1027 | 0.009679503024043284, 1028 | -0.740657024905891, 1029 | 0.002186814620126066 1030 | ], 1031 | "std": [ 1032 | 0.056495910348876494, 1033 | 0.06030566340923309, 1034 | 0.000745117038087955 1035 | ] 1036 | }, 1037 | "288": { 1038 | "mean": [ 1039 | 0.18277709877637874, 1040 | -0.4634603683639441, 1041 | 0.002705997621368786 1042 | ], 1043 | "std": [ 1044 | 0.04842857482780465, 1045 | 0.05317495409544257, 1046 | 0.0008134991000674945 1047 | ] 1048 | }, 1049 | "291": { 1050 | "mean": [ 1051 | 0.05403009884927205, 1052 | -0.4731317279316773, 1053 | 0.0021995487555923833 1054 | ], 1055 | "std": [ 1056 | 0.05149851455590998, 1057 | 0.055254902282552816, 1058 | 0.000754198055065401 1059 | ] 1060 | }, 1061 | "293": { 1062 | "mean": [ 1063 | 0.13560205101101347, 1064 | -0.7645100372586232, 1065 | 0.0023119564642312294 1066 | ], 1067 | "std": [ 1068 | 0.055926016849925986, 1069 | 0.06066271326674103, 1070 | 0.0007594053749056067 1071 | ] 1072 | }, 1073 | "295": { 1074 | "mean": [ 1075 | 0.057643095763512196, 1076 | -0.7559102385978874, 1077 | 0.0022046988617834824 1078 | ], 1079 | "std": [ 1080 | 0.05671952706071889, 1081 | 0.06097137313871081, 1082 | 0.0007471824846427082 1083 | ] 1084 | }, 1085 | "296": { 1086 | "mean": [ 1087 | 0.06135789285101164, 1088 | -0.7745040899943725, 1089 | 0.00219932948807225 1090 | ], 1091 | "std": [ 1092 | 0.05737846299108924, 1093 | 0.06137513481049947, 1094 | 0.0007465751450224486 1095 | ] 1096 | }, 1097 | "297": { 1098 | "mean": [ 1099 | 0.08337477078448939, 1100 | -0.8648888509030099, 1101 | 0.0023328268185711445 1102 | ], 1103 | "std": [ 1104 | 0.0592265608178447, 1105 | 0.06133571362372108, 1106 | 0.00076171491893465 1107 | ] 1108 | }, 1109 | "300": { 1110 | "mean": [ 1111 | 0.15782423625380715, 1112 | -0.7441390917691849, 1113 | 0.0023835682177045596 1114 | ], 1115 | "std": [ 1116 | 0.05475947581339534, 1117 | 0.059523400984926644, 1118 | 0.0007677286296403985 1119 | ] 1120 | }, 1121 | "308": { 1122 | "mean": [ 1123 | 0.04321130001823448, 1124 | -0.47544015922823457, 1125 | 0.0021823878555035235 1126 | ], 1127 | "std": [ 1128 | 0.05135269922484394, 1129 | 0.05529077669398822, 1130 | 0.0007522902475886432 1131 | ] 1132 | }, 1133 | "310": { 1134 | "mean": [ 1135 | 0.02139668859853671, 1136 | -0.4809502008271753, 1137 | 0.00213443032291087 1138 | ], 1139 | "std": [ 1140 | 0.051988454941560444, 1141 | 0.056202868260023166, 1142 | 0.0007467381379093714 1143 | ] 1144 | }, 1145 | "311": { 1146 | "mean": [ 1147 | 0.007196265656044046, 1148 | -0.4824293398461198, 1149 | 0.002111258329742564 1150 | ], 1151 | "std": [ 1152 | 0.052403507797455076, 1153 | 0.05671099727275145, 1154 | 0.0007443497174607335 1155 | ] 1156 | }, 1157 | "312": { 1158 | "mean": [ 1159 | -0.009047274883644752, 1160 | -0.48275330251410575, 1161 | 0.0020925326690806267 1162 | ], 1163 | "std": [ 1164 | 0.052824266180452925, 1165 | 0.057265725369349256, 1166 | 0.0007427303248518915 1167 | ] 1168 | }, 1169 | "314": { 1170 | "mean": [ 1171 | -0.0022367987689607327, 1172 | -0.4437854862769026, 1173 | 0.0020716774597466865 1174 | ], 1175 | "std": [ 1176 | 0.05278186368123777, 1177 | 0.05876477238974999, 1178 | 0.0007425272227567395 1179 | ] 1180 | }, 1181 | "317": { 1182 | "mean": [ 1183 | -0.007043097911368899, 1184 | -0.4739088677780956, 1185 | 0.0020952414807245217 1186 | ], 1187 | "std": [ 1188 | 0.05284383034840837, 1189 | 0.058274390392815366, 1190 | 
0.0007430923337161511 1191 | ] 1192 | }, 1193 | "318": { 1194 | "mean": [ 1195 | 0.02414398088805537, 1196 | -0.4744900242778145, 1197 | 0.0021364850847392087 1198 | ], 1199 | "std": [ 1200 | 0.05196953441352482, 1201 | 0.05668418872429074, 1202 | 0.0007472083554035675 1203 | ] 1204 | }, 1205 | "321": { 1206 | "mean": [ 1207 | 0.03516897482061751, 1208 | -0.4593152450974699, 1209 | 0.002122244490271487 1210 | ], 1211 | "std": [ 1212 | 0.05194134849823071, 1213 | 0.05660841282136303, 1214 | 0.0007469776701885909 1215 | ] 1216 | }, 1217 | "323": { 1218 | "mean": [ 1219 | 0.2030294661064695, 1220 | -0.5716047476348143, 1221 | 0.002828976678204763 1222 | ], 1223 | "std": [ 1224 | 0.049882190621181, 1225 | 0.054350063841165454, 1226 | 0.0008239349288526684 1227 | ] 1228 | }, 1229 | "324": { 1230 | "mean": [ 1231 | 0.03316645227271117, 1232 | -0.47397186242524897, 1233 | 0.0021629653905401356 1234 | ], 1235 | "std": [ 1236 | 0.05162190188776753, 1237 | 0.05590516565482825, 1238 | 0.0007501353306761255 1239 | ] 1240 | }, 1241 | "332": { 1242 | "mean": [ 1243 | 0.13308209357082842, 1244 | -0.842488901353625, 1245 | 0.002426864975228367 1246 | ], 1247 | "std": [ 1248 | 0.057761561156457836, 1249 | 0.06075522354094601, 1250 | 0.0007723865258957775 1251 | ] 1252 | }, 1253 | "334": { 1254 | "mean": [ 1255 | 0.10237792441112399, 1256 | -0.7756728534260896, 1257 | 0.00225330508312763 1258 | ], 1259 | "std": [ 1260 | 0.05686819106464896, 1261 | 0.06115301217747623, 1262 | 0.0007526747796694809 1263 | ] 1264 | }, 1265 | "336": { 1266 | "mean": [ 1267 | 0.015976123158789726, 1268 | -0.769686371919539, 1269 | 0.0021716402623253566 1270 | ], 1271 | "std": [ 1272 | 0.057559295073927465, 1273 | 0.061290091351198776, 1274 | 0.0007434090023300741 1275 | ] 1276 | }, 1277 | "338": { 1278 | "mean": [ 1279 | 0.03052014527471439, 1280 | -0.8749117933407762, 1281 | 0.0022785227243153692 1282 | ], 1283 | "std": [ 1284 | 0.06009682146824771, 1285 | 0.0617176965925391, 1286 | 0.0007558259277945143 1287 | ] 1288 | }, 1289 | "356": { 1290 | "mean": [ 1291 | 0.20582715239817037, 1292 | -0.6710265984775916, 1293 | 0.00282103908675466 1294 | ], 1295 | "std": [ 1296 | 0.05191900394341819, 1297 | 0.05600299166886955, 1298 | 0.0008205669593344126 1299 | ] 1300 | }, 1301 | "361": { 1302 | "mean": [ 1303 | 0.1968451313140747, 1304 | -0.5183487778583834, 1305 | 0.0027836587995296058 1306 | ], 1307 | "std": [ 1308 | 0.04905375797040617, 1309 | 0.05368168687857234, 1310 | 0.0008205380343796965 1311 | ] 1312 | }, 1313 | "362": { 1314 | "mean": [ 1315 | 0.029271263957230065, 1316 | -0.691986591953048, 1317 | 0.002314601613239389 1318 | ], 1319 | "std": [ 1320 | 0.05359240945837717, 1321 | 0.05698695659640869, 1322 | 0.0007590925611394391 1323 | ] 1324 | }, 1325 | "365": { 1326 | "mean": [ 1327 | 0.1385894246157794, 1328 | -0.3912522737130457, 1329 | 0.0024813550984070917 1330 | ], 1331 | "std": [ 1332 | 0.048277156560488914, 1333 | 0.0531015204302723, 1334 | 0.0007900877385214925 1335 | ] 1336 | }, 1337 | "373": { 1338 | "mean": [ 1339 | 0.09314748448495194, 1340 | -0.6851758961687641, 1341 | 0.002305174450139677 1342 | ], 1343 | "std": [ 1344 | 0.05361258498857508, 1345 | 0.056833144069848344, 1346 | 0.0007584571310271522 1347 | ] 1348 | }, 1349 | "374": { 1350 | "mean": [ 1351 | 0.07595173855190232, 1352 | -0.6834276917778278, 1353 | 0.0022913311105189444 1354 | ], 1355 | "std": [ 1356 | 0.05359631568757324, 1357 | 0.056591777909055854, 1358 | 0.0007570184657319486 1359 | ] 1360 | }, 1361 | "375": { 1362 | "mean": [ 1363 | 
0.04631456267261447, 1364 | -0.4662471266626611, 1365 | 0.0021624736469390547 1366 | ], 1367 | "std": [ 1368 | 0.05159541616175203, 1369 | 0.05573779677809228, 1370 | 0.0007507676425629 1371 | ] 1372 | }, 1373 | "377": { 1374 | "mean": [ 1375 | 0.018899700101218605, 1376 | -0.32328351981287157, 1377 | 0.002199005143696503 1378 | ], 1379 | "std": [ 1380 | 0.05112532674288087, 1381 | 0.05503217914195181, 1382 | 0.0007637741329550626 1383 | ] 1384 | }, 1385 | "378": { 1386 | "mean": [ 1387 | 0.08038171016296657, 1388 | -0.34842977521598784, 1389 | 0.002319350979891858 1390 | ], 1391 | "std": [ 1392 | 0.0494857845547706, 1393 | 0.05379718546702203, 1394 | 0.0007744581087684633 1395 | ] 1396 | }, 1397 | "379": { 1398 | "mean": [ 1399 | 0.10738659929899241, 1400 | -0.36551312751319176, 1401 | 0.0023901427586169154 1402 | ], 1403 | "std": [ 1404 | 0.04877857475159839, 1405 | 0.053401366754413475, 1406 | 0.0007810370369561127 1407 | ] 1408 | }, 1409 | "380": { 1410 | "mean": [ 1411 | 0.05947839814562424, 1412 | -0.684567222399428, 1413 | 0.002289839930550117 1414 | ], 1415 | "std": [ 1416 | 0.05355539198470208, 1417 | 0.05662788676769823, 1418 | 0.0007566183993187504 1419 | ] 1420 | }, 1421 | "381": { 1422 | "mean": [ 1423 | 0.04413067961634407, 1424 | -0.6878712847804607, 1425 | 0.002299131994835468 1426 | ], 1427 | "std": [ 1428 | 0.05357091128819202, 1429 | 0.05669991774468466, 1430 | 0.0007574803272703834 1431 | ] 1432 | }, 1433 | "382": { 1434 | "mean": [ 1435 | 0.03435435684755503, 1436 | -0.6898890348947285, 1437 | 0.0023125759351835863 1438 | ], 1439 | "std": [ 1440 | 0.053572577427670014, 1441 | 0.05670042336485121, 1442 | 0.0007588229814051361 1443 | ] 1444 | }, 1445 | "384": { 1446 | "mean": [ 1447 | 0.0452803833931082, 1448 | -0.7048420976208765, 1449 | 0.0022948774543959157 1450 | ], 1451 | "std": [ 1452 | 0.05386875485630404, 1453 | 0.056789136509392574, 1454 | 0.0007569009302615088 1455 | ] 1456 | }, 1457 | "385": { 1458 | "mean": [ 1459 | 0.06163808350314468, 1460 | -0.709458960213991, 1461 | 0.002285069655747806 1462 | ], 1463 | "std": [ 1464 | 0.0539722241501944, 1465 | 0.05725116076648814, 1466 | 0.0007558722925169772 1467 | ] 1468 | }, 1469 | "386": { 1470 | "mean": [ 1471 | 0.07792679744083217, 1472 | -0.7106282039970481, 1473 | 0.0022872277520859384 1474 | ], 1475 | "std": [ 1476 | 0.054086506445558596, 1477 | 0.057497027277416214, 1478 | 0.0007563166509294591 1479 | ] 1480 | }, 1481 | "387": { 1482 | "mean": [ 1483 | 0.09496782735390556, 1484 | -0.7076736315266388, 1485 | 0.002299599479621669 1486 | ], 1487 | "std": [ 1488 | 0.05402532479029198, 1489 | 0.05734519731365047, 1490 | 0.0007577897883620716 1491 | ] 1492 | }, 1493 | "388": { 1494 | "mean": [ 1495 | 0.1063641972050174, 1496 | -0.702639000511008, 1497 | 0.0023175579720163438 1498 | ], 1499 | "std": [ 1500 | 0.05384788102755951, 1501 | 0.057170711195705305, 1502 | 0.0007597436628103121 1503 | ] 1504 | }, 1505 | "389": { 1506 | "mean": [ 1507 | 0.20048384418040713, 1508 | -0.7248601221186919, 1509 | 0.0027448696246690743 1510 | ], 1511 | "std": [ 1512 | 0.053298207323153005, 1513 | 0.056980030663500825, 1514 | 0.0008106950593464991 1515 | ] 1516 | }, 1517 | "390": { 1518 | "mean": [ 1519 | 0.10466623241761841, 1520 | -0.6879799128049092, 1521 | 0.0023238170143086542 1522 | ], 1523 | "std": [ 1524 | 0.05358147278273691, 1525 | 0.056805733395214365, 1526 | 0.0007607409783911977 1527 | ] 1528 | }, 1529 | "397": { 1530 | "mean": [ 1531 | 0.16246190835318458, 1532 | -0.42090590591457805, 1533 | 0.0025967897978419084 1534 | 
], 1535 | "std": [ 1536 | 0.048167494618337886, 1537 | 0.05296034939431187, 1538 | 0.0008024326420329117 1539 | ] 1540 | }, 1541 | "398": { 1542 | "mean": [ 1543 | 0.034099519739583695, 1544 | -0.6974523726070934, 1545 | 0.00230650426208024 1546 | ], 1547 | "std": [ 1548 | 0.0537318734312636, 1549 | 0.057028964637738024, 1550 | 0.0007580643298328032 1551 | ] 1552 | }, 1553 | "400": { 1554 | "mean": [ 1555 | 0.05174376800320367, 1556 | -0.334018137071583, 1557 | 0.002247091207800041 1558 | ], 1559 | "std": [ 1560 | 0.05039139723718805, 1561 | 0.054502951754649594, 1562 | 0.0007674890192954974 1563 | ] 1564 | }, 1565 | "402": { 1566 | "mean": [ 1567 | 0.009948743166723414, 1568 | -0.4747934347706126, 1569 | 0.0021136106306263404 1570 | ], 1571 | "std": [ 1572 | 0.05244649394708229, 1573 | 0.057590633953012504, 1574 | 0.0007447921459178335 1575 | ] 1576 | }, 1577 | "405": { 1578 | "mean": [ 1579 | 0.018519815101789794, 1580 | -0.45026254374478164, 1581 | 0.0020934274306725637 1582 | ], 1583 | "std": [ 1584 | 0.05241998509168015, 1585 | 0.057787630405501236, 1586 | 0.000744432074525668 1587 | ] 1588 | }, 1589 | "409": { 1590 | "mean": [ 1591 | 0.04862511266168905, 1592 | -0.48123316207721556, 1593 | 0.00215763555467636 1594 | ], 1595 | "std": [ 1596 | 0.05188015923404109, 1597 | 0.05571738421004954, 1598 | 0.0007490717135479398 1599 | ] 1600 | }, 1601 | "415": { 1602 | "mean": [ 1603 | 0.03320017336566075, 1604 | -0.4792664664482223, 1605 | 0.002162224491486776 1606 | ], 1607 | "std": [ 1608 | 0.05163117269188556, 1609 | 0.055672825122091776, 1610 | 0.0007498448906127604 1611 | ] 1612 | }, 1613 | "454": { 1614 | "mean": [ 1615 | 0.20474221027527442, 1616 | -0.6212881386311808, 1617 | 0.002845226331429695 1618 | ], 1619 | "std": [ 1620 | 0.05081406853185841, 1621 | 0.055091378829736665, 1622 | 0.0008246102134705406 1623 | ] 1624 | }, 1625 | "466": { 1626 | "mean": [ 1627 | 0.11353484769697351, 1628 | -0.6986777488753045, 1629 | 0.0023358609127769976 1630 | ], 1631 | "std": [ 1632 | 0.053676960117098, 1633 | 0.057283074662156555, 1634 | 0.0007620297915824722 1635 | ] 1636 | } 1637 | }, 1638 | "LEFT_HAND_LANDMARKS": { 1639 | "WRIST": { 1640 | "mean": [ 1641 | 0.0, 1642 | 0.0, 1643 | 0.0 1644 | ], 1645 | "std": [ 1646 | 0.0, 1647 | 0.0, 1648 | 0.0 1649 | ] 1650 | }, 1651 | "THUMB_CMC": { 1652 | "mean": [ 1653 | -0.05717984484525883, 1654 | -0.0631502218697391, 1655 | -4.499623561794899e-05 1656 | ], 1657 | "std": [ 1658 | 0.061809701169202313, 1659 | 0.05175281281801094, 1660 | 7.443176301934644e-05 1661 | ] 1662 | }, 1663 | "THUMB_MCP": { 1664 | "mean": [ 1665 | -0.11994439470756335, 1666 | -0.11121105115654467, 1667 | -9.588935479625043e-05 1668 | ], 1669 | "std": [ 1670 | 0.11173225120613456, 1671 | 0.1037297365280941, 1672 | 0.00011409122071653663 1673 | ] 1674 | }, 1675 | "THUMB_IP": { 1676 | "mean": [ 1677 | -0.1666885008871546, 1678 | -0.13345885475965732, 1679 | -0.00014799290588031846 1680 | ], 1681 | "std": [ 1682 | 0.14901110826492855, 1683 | 0.14928262918619864, 1684 | 0.00014412564967858458 1685 | ] 1686 | }, 1687 | "THUMB_TIP": { 1688 | "mean": [ 1689 | -0.1957159515908774, 1690 | -0.14442986126653887, 1691 | -0.0002022937613935301 1692 | ], 1693 | "std": [ 1694 | 0.1803620942228068, 1695 | 0.18549745256347486, 1696 | 0.00017653189173069437 1697 | ] 1698 | }, 1699 | "INDEX_FINGER_MCP": { 1700 | "mean": [ 1701 | -0.1252903829685782, 1702 | -0.13057647410384413, 1703 | -0.00012134237844299505 1704 | ], 1705 | "std": [ 1706 | 0.1202199617906834, 1707 | 0.1311866680901748, 1708 | 
0.0001331814597649172 1709 | ] 1710 | }, 1711 | "INDEX_FINGER_PIP": { 1712 | "mean": [ 1713 | -0.19341751976230095, 1714 | -0.14437765781458306, 1715 | -0.00021942583038713268 1716 | ], 1717 | "std": [ 1718 | 0.16256347174712185, 1719 | 0.19163993477336164, 1720 | 0.00016402483666586707 1721 | ] 1722 | }, 1723 | "INDEX_FINGER_DIP": { 1724 | "mean": [ 1725 | -0.224891781737266, 1726 | -0.1437207255205343, 1727 | -0.0002863469812796394 1728 | ], 1729 | "std": [ 1730 | 0.18228621714941282, 1731 | 0.219521058475297, 1732 | 0.0001821794547111309 1733 | ] 1734 | }, 1735 | "INDEX_FINGER_TIP": { 1736 | "mean": [ 1737 | -0.24447405681033058, 1738 | -0.14302645001930006, 1739 | -0.0003265706635998913 1740 | ], 1741 | "std": [ 1742 | 0.20014361798870864, 1743 | 0.2412466934523928, 1744 | 0.00019579856683788721 1745 | ] 1746 | }, 1747 | "MIDDLE_FINGER_MCP": { 1748 | "mean": [ 1749 | -0.11398773756118448, 1750 | -0.09549967725089122, 1751 | -0.00015074653502121792 1752 | ], 1753 | "std": [ 1754 | 0.11351111892977685, 1755 | 0.13800886819017316, 1756 | 0.00011969759564772559 1757 | ] 1758 | }, 1759 | "MIDDLE_FINGER_PIP": { 1760 | "mean": [ 1761 | -0.18857853345399933, 1762 | -0.09984230502872304, 1763 | -0.00023453784673608814 1764 | ], 1765 | "std": [ 1766 | 0.15811926791871336, 1767 | 0.19728676877423745, 1768 | 0.00015207496309008206 1769 | ] 1770 | }, 1771 | "MIDDLE_FINGER_DIP": { 1772 | "mean": [ 1773 | -0.21598337923182467, 1774 | -0.09497350700154725, 1775 | -0.00027393250624566155 1776 | ], 1777 | "std": [ 1778 | 0.1770347920584684, 1779 | 0.21954324904720093, 1780 | 0.00016213275961877906 1781 | ] 1782 | }, 1783 | "MIDDLE_FINGER_TIP": { 1784 | "mean": [ 1785 | -0.23025663419713321, 1786 | -0.09337450006325233, 1787 | -0.0003011485248837637 1788 | ], 1789 | "std": [ 1790 | 0.19651854402782576, 1791 | 0.2378150194230093, 1792 | 0.00017415412748885643 1793 | ] 1794 | }, 1795 | "RING_FINGER_MCP": { 1796 | "mean": [ 1797 | -0.10084351655427212, 1798 | -0.05543433373393147, 1799 | -0.00018293718547420196 1800 | ], 1801 | "std": [ 1802 | 0.11217842696997496, 1803 | 0.139364777085065, 1804 | 0.00011542453042653498 1805 | ] 1806 | }, 1807 | "RING_FINGER_PIP": { 1808 | "mean": [ 1809 | -0.1705305330388045, 1810 | -0.05562060375978384, 1811 | -0.0002567950125911685 1812 | ], 1813 | "std": [ 1814 | 0.15371287298294678, 1815 | 0.19219715524101208, 1816 | 0.00014827437454452218 1817 | ] 1818 | }, 1819 | "RING_FINGER_DIP": { 1820 | "mean": [ 1821 | -0.19287875986493194, 1822 | -0.051670221569695966, 1823 | -0.00026179110547322275 1824 | ], 1825 | "std": [ 1826 | 0.1704734487605362, 1827 | 0.20955627435481267, 1828 | 0.00015639264869047605 1829 | ] 1830 | }, 1831 | "RING_FINGER_TIP": { 1832 | "mean": [ 1833 | -0.20351962768700796, 1834 | -0.05197493261193231, 1835 | -0.0002619672881639671 1836 | ], 1837 | "std": [ 1838 | 0.18816882144776315, 1839 | 0.22415125078177559, 1840 | 0.00016734497163790882 1841 | ] 1842 | }, 1843 | "PINKY_MCP": { 1844 | "mean": [ 1845 | -0.08770425234114743, 1846 | -0.013953512623301512, 1847 | -0.00021850417258263553 1848 | ], 1849 | "std": [ 1850 | 0.11705540396288934, 1851 | 0.136824991830283, 1852 | 0.00012417352642377942 1853 | ] 1854 | }, 1855 | "PINKY_PIP": { 1856 | "mean": [ 1857 | -0.14234076021200887, 1858 | -0.013275593781322072, 1859 | -0.00027141506763185276 1860 | ], 1861 | "std": [ 1862 | 0.15074750047434984, 1863 | 0.1779164723461388, 1864 | 0.0001529812291423194 1865 | ] 1866 | }, 1867 | "PINKY_DIP": { 1868 | "mean": [ 1869 | -0.16095667336492706, 1870 | 
-0.012025589726685026, 1871 | -0.0002630189596475426 1872 | ], 1873 | "std": [ 1874 | 0.1650907568549052, 1875 | 0.19251888153237762, 1876 | 0.0001614811347317605 1877 | ] 1878 | }, 1879 | "PINKY_TIP": { 1880 | "mean": [ 1881 | -0.170275131451333, 1882 | -0.013258694814362497, 1883 | -0.0002517298443739073 1884 | ], 1885 | "std": [ 1886 | 0.17988725400954816, 1887 | 0.20459309070235937, 1888 | 0.00017157925872138865 1889 | ] 1890 | } 1891 | }, 1892 | "RIGHT_HAND_LANDMARKS": { 1893 | "WRIST": { 1894 | "mean": [ 1895 | 0.0, 1896 | 0.0, 1897 | 0.0 1898 | ], 1899 | "std": [ 1900 | 0.0, 1901 | 0.0, 1902 | 0.0 1903 | ] 1904 | }, 1905 | "THUMB_CMC": { 1906 | "mean": [ 1907 | 0.06113069748675152, 1908 | -0.0653079341525351, 1909 | -3.7051428783021304e-05 1910 | ], 1911 | "std": [ 1912 | 0.0591857886303557, 1913 | 0.051762021199929, 1914 | 6.449479971855766e-05 1915 | ] 1916 | }, 1917 | "THUMB_MCP": { 1918 | "mean": [ 1919 | 0.1251577214932426, 1920 | -0.12098150663612256, 1921 | -8.539061932383315e-05 1922 | ], 1923 | "std": [ 1924 | 0.10198466458804939, 1925 | 0.10287862241833064, 1926 | 9.673356757327872e-05 1927 | ] 1928 | }, 1929 | "THUMB_IP": { 1930 | "mean": [ 1931 | 0.17303729256021663, 1932 | -0.150535708638432, 1933 | -0.0001401995798013077 1934 | ], 1935 | "std": [ 1936 | 0.13253839594079742, 1937 | 0.1471597304675383, 1938 | 0.00012136742016309589 1939 | ] 1940 | }, 1941 | "THUMB_TIP": { 1942 | "mean": [ 1943 | 0.20285199659361342, 1944 | -0.16761208033805314, 1945 | -0.00019129475955425817 1946 | ], 1947 | "std": [ 1948 | 0.15922547005427276, 1949 | 0.1815998970711668, 1950 | 0.000150467411431489 1951 | ] 1952 | }, 1953 | "INDEX_FINGER_MCP": { 1954 | "mean": [ 1955 | 0.11851765372910136, 1956 | -0.16166741057493536, 1957 | -9.689838802660863e-05 1958 | ], 1959 | "std": [ 1960 | 0.11863306157089328, 1961 | 0.12855092250359704, 1962 | 0.00012520096483120102 1963 | ] 1964 | }, 1965 | "INDEX_FINGER_PIP": { 1966 | "mean": [ 1967 | 0.18564985168861495, 1968 | -0.1881099408679721, 1969 | -0.00019611697397996195 1970 | ], 1971 | "std": [ 1972 | 0.15648733770405557, 1973 | 0.18971949006813416, 1974 | 0.00015497010500691152 1975 | ] 1976 | }, 1977 | "INDEX_FINGER_DIP": { 1978 | "mean": [ 1979 | 0.21686999633316661, 1980 | -0.1887148989392569, 1981 | -0.00026282603552259823 1982 | ], 1983 | "std": [ 1984 | 0.1728756686852748, 1985 | 0.2186232270552628, 1986 | 0.00017120192103547212 1987 | ] 1988 | }, 1989 | "INDEX_FINGER_TIP": { 1990 | "mean": [ 1991 | 0.23573193724759747, 1992 | -0.18789263885567026, 1993 | -0.00030138621324936923 1994 | ], 1995 | "std": [ 1996 | 0.18836767191085596, 1997 | 0.24128772298069817, 1998 | 0.00018321621042526512 1999 | ] 2000 | }, 2001 | "MIDDLE_FINGER_MCP": { 2002 | "mean": [ 2003 | 0.10439901964441753, 2004 | -0.13129374169573338, 2005 | -0.0001364668281781232 2006 | ], 2007 | "std": [ 2008 | 0.1151210439555401, 2009 | 0.1361740251128679, 2010 | 0.00011557244038509688 2011 | ] 2012 | }, 2013 | "MIDDLE_FINGER_PIP": { 2014 | "mean": [ 2015 | 0.17907125025257217, 2016 | -0.14451770536397313, 2017 | -0.00022359051846106053 2018 | ], 2019 | "std": [ 2020 | 0.15285921819906687, 2021 | 0.19672252541880933, 2022 | 0.00014905004609420337 2023 | ] 2024 | }, 2025 | "MIDDLE_FINGER_DIP": { 2026 | "mean": [ 2027 | 0.2044530721391551, 2028 | -0.13646567950639082, 2029 | -0.00025999396237326493 2030 | ], 2031 | "std": [ 2032 | 0.16556054785428065, 2033 | 0.21836638101087538, 2034 | 0.0001559751263391497 2035 | ] 2036 | }, 2037 | "MIDDLE_FINGER_TIP": { 2038 | "mean": [ 2039 | 
0.2157012681537131, 2040 | -0.13192492750445067, 2041 | -0.0002808265365007887 2042 | ], 2043 | "std": [ 2044 | 0.18105223685982397, 2045 | 0.23600460833312348, 2046 | 0.00016386802829093815 2047 | ] 2048 | }, 2049 | "RING_FINGER_MCP": { 2050 | "mean": [ 2051 | 0.08936545255913597, 2052 | -0.09255108112248657, 2053 | -0.00018079590053678173 2054 | ], 2055 | "std": [ 2056 | 0.11678397422096998, 2057 | 0.13947998405240314, 2058 | 0.00011321394776500012 2059 | ] 2060 | }, 2061 | "RING_FINGER_PIP": { 2062 | "mean": [ 2063 | 0.15949956656375672, 2064 | -0.09902482684170004, 2065 | -0.0002594920580454232 2066 | ], 2067 | "std": [ 2068 | 0.15147105095742847, 2069 | 0.1929971788249909, 2070 | 0.00014455370469292086 2071 | ] 2072 | }, 2073 | "RING_FINGER_DIP": { 2074 | "mean": [ 2075 | 0.17864987056248618, 2076 | -0.09047214630065496, 2077 | -0.00025721924883791554 2078 | ], 2079 | "std": [ 2080 | 0.1607408556349383, 2081 | 0.20798118609753458, 2082 | 0.00014998310578443383 2083 | ] 2084 | }, 2085 | "RING_FINGER_TIP": { 2086 | "mean": [ 2087 | 0.18508851182807834, 2088 | -0.08667564185522705, 2089 | -0.00024763968474074183 2090 | ], 2091 | "std": [ 2092 | 0.1731917503835963, 2093 | 0.2204543615118416, 2094 | 0.00015823951707144365 2095 | ] 2096 | }, 2097 | "PINKY_MCP": { 2098 | "mean": [ 2099 | 0.07481753258914732, 2100 | -0.0504051072884894, 2101 | -0.00022953525221050747 2102 | ], 2103 | "std": [ 2104 | 0.12424070712171752, 2105 | 0.13876626433319503, 2106 | 0.00012234685512977936 2107 | ] 2108 | }, 2109 | "PINKY_PIP": { 2110 | "mean": [ 2111 | 0.12908633514680307, 2112 | -0.05487163810746959, 2113 | -0.00027574363018090134 2114 | ], 2115 | "std": [ 2116 | 0.15305430875610007, 2117 | 0.18090056655723508, 2118 | 0.000144185956439435 2119 | ] 2120 | }, 2121 | "PINKY_DIP": { 2122 | "mean": [ 2123 | 0.14475841034535586, 2124 | -0.051090184627490695, 2125 | -0.0002559125376532547 2126 | ], 2127 | "std": [ 2128 | 0.1610515973162314, 2129 | 0.1943861739640513, 2130 | 0.0001512465810354175 2131 | ] 2132 | }, 2133 | "PINKY_TIP": { 2134 | "mean": [ 2135 | 0.15051838128843145, 2136 | -0.04984799669006585, 2137 | -0.00023496026655587263 2138 | ], 2139 | "std": [ 2140 | 0.17115034527993028, 2141 | 0.20525389695542712, 2142 | 0.0001606822838125767 2143 | ] 2144 | } 2145 | } 2146 | } --------------------------------------------------------------------------------
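A minimal usage sketch of the autoencoder defined in sign_vq/model.py above (assuming the sign_vq package and its dependencies such as torch, pose-format and vector-quantize-pytorch are installed; the codebook size, number of layers, batch size and sequence length below are illustrative choices only, not values prescribed by the repository):

    import torch
    from sign_vq.model import PoseFSQAutoEncoder

    # 2 ** 4 is the small codebook listed in estimate_levels (levels [4, 4], used for tests);
    # num_layers=1 keeps the sketch light, the repository default is 6
    model = PoseFSQAutoEncoder(codebook_size=2 ** 4, num_codebooks=4, num_layers=1)
    model.eval()

    # A batch of 2 clips, 32 frames each, with the default pose_dims of (178, 3)
    poses = torch.randn(2, 32, 178, 3)

    with torch.no_grad():
        indices = model.quantize(poses)             # discrete FSQ codes, 4 codebook indices per frame
        reconstruction = model.unquantize(indices)  # decoded back to shape (2, 32, 178, 3)

    print(indices.shape, reconstruction.shape)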