├── .gitignore
├── README.md
├── assets
│   └── model_card.png
├── models
│   ├── mediapipe-holistic
│   │   ├── README.md
│   │   ├── js
│   │   │   ├── group1-shard1of1.bin
│   │   │   └── model.json
│   │   └── py
│   │       └── model.h5
│   └── openpose-body
│       ├── README.md
│       ├── js
│       │   ├── group1-shard1of1.bin
│       │   └── model.json
│       └── py
│           └── model.h5
├── requirements.txt
└── sign_language_detection
    ├── args.py
    ├── dataset.py
    ├── examples
    │   ├── create_tfrecord.py
    │   └── create_tfrecord_dgs_corpus.py
    ├── holistic.poseheader
    ├── model.py
    ├── openpose.poseheader
    ├── tools
    │   ├── h5_to_tfjs.py
    │   └── model_card.py
    └── train.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sign Language Detection Training
2 | 
3 | This is a fork of the [TensorFlow implementation](https://github.com/google-research/google-research/tree/master/sign_language_detection) of the model proposed in
4 | [Real-Time Sign Language Detection using Human Pose Estimation](https://arxiv.org/abs/2008.04637),
5 | published in SLRTP 2020.
6 | 
7 | In this fork, we add data loading and training for Holistic pose estimation.
8 | 
9 | This model is used in the
10 | [Real-Time Sign Language Detection for Videoconferencing](https://youtu.be/nozz2pvbG_Q)
11 | demo presented at ECCV 2020.
12 | 
13 | ## Models
14 | 
15 | This repository includes pre-trained models for both Python and JavaScript.
16 | 
17 | ![Model Card](assets/model_card.png)
18 | 
19 | ## Usage
20 | 
21 | You can use the included models to perform inference or fine-tuning.
22 | 
23 | To load a model in Python, use
24 | `tf.keras.models.load_model('models/mediapipe-holistic/py/model.h5')` (or its
25 | `openpose-body` counterpart).
26 | 
27 | To load a model in the browser, use `tf.loadLayersModel('models/mediapipe-holistic/js/model.json')`
28 | from [tfjs](https://github.com/tensorflow/tfjs).
29 | 
30 | You can use the [train.py](sign_language_detection/train.py) script to train the model from scratch
31 | using a `tfrecord` dataset file.
32 | 
33 | ```bash
34 | python -m sign_language_detection.train --dataset_path="data.tfrecord" --device="/GPU:0"
35 | ```
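36 | As a rough sketch (not one of the repository's scripts), inference in Python could look
37 | like the following; the holistic model takes optical-flow features of shape
38 | `(batch, frames, 75)` and returns per-frame signing probabilities:
39 | 
40 | ```python
41 | import numpy as np
42 | import tensorflow as tf
43 | 
44 | model = tf.keras.models.load_model('models/mediapipe-holistic/py/model.h5')
45 | flow = np.random.randn(1, 100, 75).astype('float32')  # stand-in for real optical-flow features
46 | probs = model.predict(flow)  # (1, 100, 2), softmax over {not signing, signing}
47 | print(probs.argmax(axis=-1))  # 0/1 prediction per frame
48 | ```
49 | 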
50 | ## Dataset
51 | 
52 | The dataset is represented as a `tfrecord` file where each video has 4
53 | properties:
54 | 1. `fps`:`Int64List` - the framerate of the video
55 | 1. `pose_data`:`BytesList` - human pose estimation, as a tensor of the shape
56 |    `(frames, 1, points, dimensions)`
57 | 1. `pose_confidence`:`BytesList` - human pose
58 |    estimation confidence, as a tensor of the shape `(frames, 1, points)`
59 | 1. `is_signing`:`BytesList` - a bytes object indicating whether the user is
60 |    signing in every frame
61 | 
62 | Please see `sign_language_detection/examples/create_tfrecord.py` for an example of creating this record.
63 | 
64 | The provided models were trained on the
65 | [Public DGS Corpus](https://www.sign-lang.uni-hamburg.de/meinedgs/ling/start-name_en.html).
66 | 
67 | To create the data files using the Public DGS Corpus, see
68 | `sign_language_detection/examples/create_tfrecord_dgs_corpus.py`.
69 | 
70 | 
71 | ### Citations
72 | 
73 | ```bibtex
74 | @inproceedings{moryossef2020sign,
75 |   title={Real-Time Sign Language Detection using Human Pose Estimation},
76 |   author={Amit Moryossef and Ioannis Tsochantaridis and Roee Aharoni and Sarah Ebling and S. Narayanan},
77 |   booktitle={SLRTP},
78 |   year={2020},
79 | }
80 | 
81 | 
82 | % If you are using the Public DGS Corpus
83 | @inproceedings{hanke2020extending,
84 |   title={{E}xtending the {P}ublic {DGS} {C}orpus in Size and Depth},
85 |   author={Hanke, Thomas and Schulder, Marc and Konrad, Reiner and Jahn, Elena},
86 |   booktitle={Proceedings of the LREC2020 9th Workshop on the Representation and Processing of Sign Languages: Sign Language Resources in the Service of the Language Community, Technological Challenges and Application Perspectives},
87 |   pages={75--82},
88 |   year={2020}
89 | }
90 | ```
91 | 
--------------------------------------------------------------------------------
/assets/model_card.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sign-language-processing/detection-train/39df41d58862791badb393b7191c779b2c4e4a66/assets/model_card.png
--------------------------------------------------------------------------------
/models/mediapipe-holistic/README.md:
--------------------------------------------------------------------------------
1 | # MediaPipe Holistic
2 | 
3 | Unlike the OpenPose-Body model, this one includes the hands.
4 | 
5 | For simplicity, out of the 500+ keypoints, the following three components were
6 | selected (33 + 21 + 21 = 75 points, matching the model's `input_size`):
7 | - POSE_LANDMARKS
8 | - LEFT_HAND_LANDMARKS
9 | - RIGHT_HAND_LANDMARKS
--------------------------------------------------------------------------------
/models/mediapipe-holistic/js/group1-shard1of1.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sign-language-processing/detection-train/39df41d58862791badb393b7191c779b2c4e4a66/models/mediapipe-holistic/js/group1-shard1of1.bin
--------------------------------------------------------------------------------
/models/mediapipe-holistic/js/model.json:
--------------------------------------------------------------------------------
1 | {
2 |   "format": "layers-model",
3 |   "generatedBy": "keras v2.4.0",
4 |   "convertedBy": "TensorFlow.js Converter v3.8.0",
5 |   "modelTopology": {
6 |     "keras_version": "2.4.0",
7 |     "backend": "tensorflow",
8 |     "model_config": {
9 |       "class_name": "Sequential",
10 |       "config": {
11 |         "name": "tgt",
12 |         "layers": [
13 |           {
14 |             "class_name": "InputLayer",
15 |             "config": {
16 |               "batch_input_shape": [1, 1, 75],
17 |               "dtype": "float32",
18 |               "sparse": false,
19 |               "ragged": false,
20 |               "name": "dropout_input"
21 |             }
22 |           },
23 |           {
24 |             "class_name": "Dropout",
25 |             "config": {
26 |               "name": "dropout",
27 |               "trainable": true,
28 |               "dtype": "float32",
29 |               "rate": 0.5,
30 |               "noise_shape": null,
31 |               "seed": null
32 |             }
33 |           },
34 |           {
35 |             "class_name": "LSTM",
36 |             "config": {
37 |               "name": "lstm",
38 |               "trainable": true,
39 |               "dtype": "float32",
40 |               "return_sequences": true,
41 |               "return_state": false,
42 |               "go_backwards": false,
43 |               "stateful": true,
44 |               "unroll": false,
45 |               "time_major": false,
46 |               "units": 64,
47 |               "activation": "tanh",
48 |               "recurrent_activation": "sigmoid",
49 |               "use_bias": true,
50 |               "kernel_initializer": {
51 |                 "class_name": "GlorotUniform",
52 |                 "config": {
53 |                   "seed": null
54 |                 }
55 |               },
56 |               "recurrent_initializer": {
57 |                 "class_name": "Orthogonal",
58 |                 "config": {
59 |                   "gain": 1.0,
60 |                   "seed": null
61 |                 }
62 |               },
63 |               "bias_initializer": {
64 |                 "class_name": "Zeros",
65 |                 "config": {}
66 |               },
67 |               "unit_forget_bias": true,
68 |               "kernel_regularizer": null,
69 |               "recurrent_regularizer": null,
70 |               "bias_regularizer": null,
71 | 
"activity_regularizer": null, 72 | "kernel_constraint": null, 73 | "recurrent_constraint": null, 74 | "bias_constraint": null, 75 | "dropout": 0.0, 76 | "recurrent_dropout": 0.0, 77 | "implementation": 2 78 | } 79 | }, 80 | { 81 | "class_name": "Dense", 82 | "config": { 83 | "name": "dense", 84 | "trainable": true, 85 | "dtype": "float32", 86 | "units": 2, 87 | "activation": "softmax", 88 | "use_bias": true, 89 | "kernel_initializer": { 90 | "class_name": "GlorotUniform", 91 | "config": { 92 | "seed": null 93 | } 94 | }, 95 | "bias_initializer": { 96 | "class_name": "Zeros", 97 | "config": {} 98 | }, 99 | "kernel_regularizer": null, 100 | "bias_regularizer": null, 101 | "activity_regularizer": null, 102 | "kernel_constraint": null, 103 | "bias_constraint": null 104 | } 105 | } 106 | ] 107 | } 108 | }, 109 | "training_config": { 110 | "loss": "sparse_categorical_crossentropy", 111 | "metrics": [ 112 | "accuracy" 113 | ], 114 | "weighted_metrics": null, 115 | "loss_weights": null, 116 | "optimizer_config": { 117 | "class_name": "Adam", 118 | "config": { 119 | "name": "Adam", 120 | "learning_rate": 0.0010000000474974513, 121 | "decay": 0.0, 122 | "beta_1": 0.8999999761581421, 123 | "beta_2": 0.9990000128746033, 124 | "epsilon": 1e-07, 125 | "amsgrad": false 126 | } 127 | } 128 | } 129 | }, 130 | "weightsManifest": [ 131 | { 132 | "paths": [ 133 | "group1-shard1of1.bin" 134 | ], 135 | "weights": [ 136 | { 137 | "name": "dense/kernel", 138 | "shape": [ 139 | 64, 140 | 2 141 | ], 142 | "dtype": "float32" 143 | }, 144 | { 145 | "name": "dense/bias", 146 | "shape": [ 147 | 2 148 | ], 149 | "dtype": "float32" 150 | }, 151 | { 152 | "name": "lstm/lstm_cell/kernel", 153 | "shape": [ 154 | 75, 155 | 256 156 | ], 157 | "dtype": "float32" 158 | }, 159 | { 160 | "name": "lstm/lstm_cell/recurrent_kernel", 161 | "shape": [ 162 | 64, 163 | 256 164 | ], 165 | "dtype": "float32" 166 | }, 167 | { 168 | "name": "lstm/lstm_cell/bias", 169 | "shape": [ 170 | 256 171 | ], 172 | "dtype": "float32" 173 | } 174 | ] 175 | } 176 | ] 177 | } -------------------------------------------------------------------------------- /models/mediapipe-holistic/py/model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sign-language-processing/detection-train/39df41d58862791badb393b7191c779b2c4e4a66/models/mediapipe-holistic/py/model.h5 -------------------------------------------------------------------------------- /models/openpose-body/README.md: -------------------------------------------------------------------------------- 1 | # OpenPose-Body 2 | 3 | Original model from the paper, trained on OpenPose body 25 points. 
-------------------------------------------------------------------------------- /models/openpose-body/js/group1-shard1of1.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sign-language-processing/detection-train/39df41d58862791badb393b7191c779b2c4e4a66/models/openpose-body/js/group1-shard1of1.bin -------------------------------------------------------------------------------- /models/openpose-body/js/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "layers-model", 3 | "generatedBy": "keras v2.3.0-tf", 4 | "convertedBy": "TensorFlow.js Converter v2.0.1.post1", 5 | "modelTopology": { 6 | "keras_version": "2.3.0-tf", 7 | "backend": "tensorflow", 8 | "model_config": { 9 | "class_name": "Sequential", 10 | "config": { 11 | "name": "sequential", 12 | "layers": [ 13 | { 14 | "class_name": "Dropout", 15 | "config": { 16 | "batch_input_shape": [1, 1, 25], 17 | "name": "dropout", 18 | "trainable": true, 19 | "dtype": "float32", 20 | "rate": 0.5, 21 | "noise_shape": null, 22 | "seed": null 23 | } 24 | }, 25 | { 26 | "class_name": "LSTM", 27 | "config": { 28 | "name": "lstm", 29 | "trainable": true, 30 | "dtype": "float32", 31 | "return_sequences": true, 32 | "return_state": false, 33 | "go_backwards": false, 34 | "stateful": true, 35 | "unroll": false, 36 | "time_major": false, 37 | "units": 64, 38 | "activation": "tanh", 39 | "recurrent_activation": "sigmoid", 40 | "use_bias": true, 41 | "kernel_initializer": { 42 | "class_name": "GlorotUniform", 43 | "config": { 44 | "seed": null 45 | } 46 | }, 47 | "recurrent_initializer": { 48 | "class_name": "Orthogonal", 49 | "config": { 50 | "gain": 1.0, 51 | "seed": null 52 | } 53 | }, 54 | "bias_initializer": { 55 | "class_name": "Zeros", 56 | "config": {} 57 | }, 58 | "unit_forget_bias": true, 59 | "kernel_regularizer": null, 60 | "recurrent_regularizer": null, 61 | "bias_regularizer": null, 62 | "activity_regularizer": null, 63 | "kernel_constraint": null, 64 | "recurrent_constraint": null, 65 | "bias_constraint": null, 66 | "dropout": 0.0, 67 | "recurrent_dropout": 0.0, 68 | "implementation": 2 69 | } 70 | }, 71 | { 72 | "class_name": "Dense", 73 | "config": { 74 | "name": "dense", 75 | "trainable": true, 76 | "dtype": "float32", 77 | "units": 2, 78 | "activation": "linear", 79 | "use_bias": true, 80 | "kernel_initializer": { 81 | "class_name": "GlorotUniform", 82 | "config": { 83 | "seed": null 84 | } 85 | }, 86 | "bias_initializer": { 87 | "class_name": "Zeros", 88 | "config": {} 89 | }, 90 | "kernel_regularizer": null, 91 | "bias_regularizer": null, 92 | "activity_regularizer": null, 93 | "kernel_constraint": null, 94 | "bias_constraint": null 95 | } 96 | } 97 | ], 98 | "build_input_shape": [ 99 | 1, 100 | 1, 101 | 25 102 | ] 103 | } 104 | } 105 | }, 106 | "weightsManifest": [ 107 | { 108 | "paths": [ 109 | "group1-shard1of1.bin" 110 | ], 111 | "weights": [ 112 | { 113 | "name": "dense/kernel", 114 | "shape": [ 115 | 64, 116 | 2 117 | ], 118 | "dtype": "float32" 119 | }, 120 | { 121 | "name": "dense/bias", 122 | "shape": [ 123 | 2 124 | ], 125 | "dtype": "float32" 126 | }, 127 | { 128 | "name": "lstm/lstm_cell/kernel", 129 | "shape": [ 130 | 25, 131 | 256 132 | ], 133 | "dtype": "float32" 134 | }, 135 | { 136 | "name": "lstm/lstm_cell/recurrent_kernel", 137 | "shape": [ 138 | 64, 139 | 256 140 | ], 141 | "dtype": "float32" 142 | }, 143 | { 144 | "name": "lstm/lstm_cell/bias", 145 | "shape": [ 146 | 256 147 | ], 148 | 
"dtype": "float32" 149 | } 150 | ] 151 | } 152 | ] 153 | } 154 | -------------------------------------------------------------------------------- /models/openpose-body/py/model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sign-language-processing/detection-train/39df41d58862791badb393b7191c779b2c4e4a66/models/openpose-body/py/model.h5 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py>=0.9.0 2 | tensorflow==2.3.0 3 | pose_format>=0.0.1 4 | numpy>=1.13.0 5 | model_card_toolkit>=0.1.0 6 | -------------------------------------------------------------------------------- /sign_language_detection/args.py: -------------------------------------------------------------------------------- 1 | """Training command line arguments.""" 2 | from absl import flags 3 | 4 | FLAGS = flags.FLAGS 5 | 6 | # Training flags 7 | flags.DEFINE_integer('seed', 42, 'Random seed') 8 | flags.DEFINE_string('device', '/GPU:0', 'Tensorflow device') 9 | flags.DEFINE_string('model_path', 'model.h5', 'Path to save trained model') 10 | flags.DEFINE_integer('epochs', 100, 'Maximum number of epochs') 11 | flags.DEFINE_integer('steps_per_epoch', 128, 'Number of batches per epoch') 12 | flags.DEFINE_float('learning_rate', 0.001, 'Learning rate') 13 | flags.DEFINE_integer('stop_patience', 3, 'Patience for early stopping') 14 | flags.DEFINE_integer('batch_size', 8, 'Batch size for training') 15 | flags.DEFINE_integer('test_batch_size', 1, 'Batch size for evaluation') 16 | 17 | # Model flags 18 | flags.DEFINE_float('input_dropout', 0.5, 'Input dropout rate') 19 | flags.DEFINE_integer('encoder_layers', 1, 'Number of RNN layers') 20 | flags.DEFINE_bool('encoder_bidirectional', False, 'Use a bidirectional encoder?') 21 | flags.DEFINE_integer('hidden_size', 2 ** 6, 'RNN hidden state size') 22 | 23 | # Augmentation flags 24 | flags.DEFINE_float('frame_dropout_std', 0.3, 'Augmentation drop frames std') 25 | 26 | # Dataset flags 27 | flags.DEFINE_string('dataset_path', "examples/data.tfrecord", 'Location of tfrecord file') 28 | flags.DEFINE_integer('input_size', 75, 'Number of pose points') 29 | 30 | flags.mark_flag_as_required('dataset_path') 31 | flags.mark_flag_as_required('input_size') 32 | -------------------------------------------------------------------------------- /sign_language_detection/dataset.py: -------------------------------------------------------------------------------- 1 | """Utilities to load and process a sign language detection dataset.""" 2 | import functools 3 | import os 4 | from typing import Dict 5 | 6 | import tensorflow as tf 7 | from pose_format.pose import Pose, PoseHeader 8 | from pose_format.tensorflow.masked.tensor import MaskedTensor 9 | from pose_format.tensorflow.pose_body import TensorflowPoseBody 10 | from pose_format.tensorflow.pose_body import TF_POSE_RECORD_DESCRIPTION 11 | from pose_format.utils.reader import BufferReader 12 | 13 | from sign_language_detection.args import FLAGS 14 | 15 | 16 | @functools.lru_cache(maxsize=1) 17 | def get_pose_header(): 18 | """Get pose header with components description.""" 19 | dir_path = os.path.dirname(os.path.realpath(__file__)) 20 | header_path = os.path.join(dir_path, "holistic.poseheader") 21 | f = open(header_path, "rb") 22 | reader = BufferReader(f.read()) 23 | header = PoseHeader.read(reader) 24 | header.components.pop(1) # Remove face from 
25 |     return header
26 | 
27 | 
28 | def differentiate_frames(src):
29 |     """Compute the difference between every two consecutive frames."""
30 |     # Shift data to pre/post frames
31 |     pre_src = src[:-1]
32 |     post_src = src[1:]
33 | 
34 |     # Differentiate src points
35 |     src = pre_src - post_src
36 | 
37 |     return src
38 | 
39 | 
40 | def distance(src):
41 |     """Calculate the Euclidean distance from the x/y coordinates."""
42 |     square = src.square()
43 |     sum_squares = square.sum(axis=-1).fix_nan()
44 |     sqrt = sum_squares.sqrt().zero_filled()
45 |     return sqrt
46 | 
47 | 
48 | def optical_flow(src, fps):
49 |     """Calculate the optical flow norm between frames, normalized by fps."""
50 | 
51 |     # Remove the "people" dimension
52 |     src = src.squeeze(1)
53 | 
54 |     # Differentiate frames
55 |     src = differentiate_frames(src)
56 | 
57 |     # Calculate distance
58 |     src = distance(src)
59 | 
60 |     # Normalize the distance by fps
61 |     src = src * fps
62 | 
63 |     return src
64 | 
65 | 
66 | minimum_fps = tf.constant(1, dtype=tf.float32)
67 | 
68 | 
69 | def load_datum(tfrecord_dict):
70 |     """Convert a tfrecord dictionary to tensors."""
71 |     pose_body = TensorflowPoseBody.from_tfrecord(tfrecord_dict)
72 |     tgt = tf.io.decode_raw(tfrecord_dict["is_signing"], out_type=tf.int8)
73 | 
74 |     fps = pose_body.fps
75 |     frames = tf.cast(tf.size(tgt), dtype=fps.dtype)
76 | 
77 |     return {
78 |         "fps": fps,
79 |         "frames": frames,
80 |         "tgt": tgt,
81 |         "pose_data_tensor": pose_body.data.tensor,
82 |         "pose_data_mask": pose_body.data.mask,
83 |         "pose_confidence": pose_body.confidence,
84 |     }
85 | 
86 | 
87 | def process_datum(datum, augment=False):
88 |     """Prepare every datum to be an input-output pair for training / eval.
89 | 
90 |     Supports data augmentation; currently only frame dropout.
91 |     Frame dropout changes the effective FPS, which in turn changes the optical flow.
92 | 
93 |     Args:
94 |         datum (Dict[str, tf.Tensor]): a dictionary of tensors loaded from the
95 |             tfrecord.
96 |         augment (bool): whether to apply data augmentation to the datum.
97 | 
98 |     Returns:
99 |         Dict[str, tf.Tensor]: a dictionary including the "src" and "tgt" tensors.
100 |     """
101 |     masked_tensor = MaskedTensor(tensor=datum["pose_data_tensor"], mask=datum["pose_data_mask"])
102 |     pose_body = TensorflowPoseBody(fps=datum["fps"], data=masked_tensor, confidence=datum["pose_confidence"])
103 |     pose = Pose(header=get_pose_header(), body=pose_body)
104 |     tgt = datum["tgt"]
105 | 
106 |     fps = pose.body.fps
107 |     frames = datum["frames"]
108 | 
109 |     if augment:
110 |         pose, selected_indexes = pose.frame_dropout_normal(dropout_mean=0, dropout_std=FLAGS.frame_dropout_std)
111 |         tgt = tf.gather(tgt, selected_indexes)
112 | 
113 |         new_frames = tf.cast(tf.size(tgt), dtype=fps.dtype)
114 | 
115 |         fps = tf.math.maximum(minimum_fps, (new_frames / frames) * fps)
116 |         frames = new_frames
117 | 
118 |     flow = optical_flow(pose.body.data, fps)
119 |     tgt = tgt[1:]  # The first frame has no optical flow, so its tag is not used
120 | 
121 |     return {"src": flow, "tgt": tgt}
122 | 
123 | 
124 | def prepare_io(datum):
125 |     """Convert a dictionary into an input-output tuple for Keras."""
126 |     src = datum["src"]
127 |     tgt = datum["tgt"]
128 | 
129 |     return src, tf.cast(tgt, dtype=tf.int32)
130 | 
131 | 
132 | def batch_dataset(dataset, batch_size):
133 |     """Batch and pad a dataset."""
134 |     dataset = dataset.padded_batch(
135 |         batch_size, padded_shapes={
136 |             "src": [None, None],
137 |             "tgt": [None]
138 |         })
139 | 
140 |     return dataset.map(prepare_io)
141 | 
142 | 
143 | def train_pipeline(dataset):
144 |     """Prepare the training dataset."""
145 |     dataset = dataset.map(load_datum).cache()
146 |     dataset = dataset.repeat()
147 |     dataset = dataset.map(lambda d: process_datum(d, True))
148 |     dataset = dataset.shuffle(FLAGS.batch_size)
149 |     dataset = batch_dataset(dataset, FLAGS.batch_size)
150 |     dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
151 |     return dataset
152 | 
153 | 
154 | def test_pipeline(dataset):
155 |     """Prepare the test dataset."""
156 |     dataset = dataset.map(load_datum)
157 |     dataset = dataset.map(process_datum)
158 |     dataset = batch_dataset(dataset, FLAGS.test_batch_size)
159 |     return dataset
160 | 
161 | 
162 | def split_dataset(dataset):
163 |     """Split the dataset into train, dev, and test."""
164 | 
165 |     def is_dev(x, _):
166 |         # Items at index 6 of every 8 (12.5%) go to dev
167 |         return x % 8 == 6
168 | 
169 |     def is_test(x, _):
170 |         # Items at index 7 of every 8 (12.5%) go to test
171 |         return x % 8 == 7
172 | 
173 |     def is_train(x, y):
174 |         return not is_test(x, y) and not is_dev(x, y)
175 | 
176 |     def recover(_, y):
177 |         return y
178 | 
179 |     train = train_pipeline(dataset.enumerate().filter(is_train).map(recover))
180 |     dev = test_pipeline(dataset.enumerate().filter(is_dev).map(recover))
181 |     test = test_pipeline(dataset.enumerate().filter(is_test).map(recover))
182 | 
183 |     return train, dev, test
184 | 
185 | 
186 | def get_datasets():
187 |     """Get the train, dev, and test datasets."""
188 |     # Set features
189 |     features = {"is_signing": tf.io.FixedLenFeature([], tf.string)}
190 |     features.update(TF_POSE_RECORD_DESCRIPTION)
191 | 
192 |     # Dataset iterator
193 |     dataset = tf.data.TFRecordDataset(filenames=[FLAGS.dataset_path])
194 |     dataset = dataset.map(
195 |         lambda serialized: tf.io.parse_single_example(serialized, features))
196 | 
197 |     return split_dataset(dataset)
--------------------------------------------------------------------------------
/sign_language_detection/examples/create_tfrecord.py:
--------------------------------------------------------------------------------
1 | """Example code to create a tfrecord for training."""
2 | 
3 | import numpy as np
4 | import tensorflow as tf
5 | 
6 | with tf.io.TFRecordWriter('example.tfrecord') as writer:
7 |     for _ in range(5):  # Iterate over 5 examples
8 |         frames = 100  # Number of frames in the example video
9 |         fps = 25  # FPS of the example video
10 | 
11 |         # Random 0/1 per-frame labels ("high" is exclusive in np.random.randint)
12 |         is_signing = np.random.randint(low=0, high=2, size=(frames,), dtype='byte').tobytes()
13 |         data = tf.io.serialize_tensor(tf.random.normal(shape=(frames, 1, 75, 3), dtype=tf.float32)).numpy()
14 |         confidence = tf.io.serialize_tensor(tf.random.normal(shape=(frames, 1, 75), dtype=tf.float32)).numpy()
15 | 
16 |         features = {
17 |             'fps': tf.train.Feature(int64_list=tf.train.Int64List(value=[fps])),
18 |             'pose_data': tf.train.Feature(bytes_list=tf.train.BytesList(value=[data])),
19 |             'pose_confidence': tf.train.Feature(bytes_list=tf.train.BytesList(value=[confidence])),
20 |             'is_signing': tf.train.Feature(bytes_list=tf.train.BytesList(value=[is_signing]))
21 |         }
22 | 
23 |         example = tf.train.Example(features=tf.train.Features(feature=features))
24 |         writer.write(example.SerializeToString())
--------------------------------------------------------------------------------
/sign_language_detection/examples/create_tfrecord_dgs_corpus.py:
--------------------------------------------------------------------------------
1 | """Code to create a tfrecord for training from The Public DGS Corpus."""
2 | 
3 | import numpy as np
4 | # noinspection PyUnresolvedReferences
5 | import sign_language_datasets.datasets
6 | import tensorflow as tf
7 | import tensorflow_datasets as tfds
8 | from pose_format import Pose
9 | from pose_format.numpy import NumPyPoseBody
10 | from sign_language_datasets.datasets.config import SignDatasetConfig
11 | from sign_language_datasets.datasets.dgs_corpus.dgs_utils import get_elan_sentences
12 | from tqdm import tqdm
13 | 
14 | from sign_language_detection.dataset import get_pose_header
15 | 
16 | config = SignDatasetConfig(name="dgs-holistic", version="3.0.0", include_video=False, include_pose="holistic")
17 | dgs_corpus = tfds.load('dgs_corpus', builder_kwargs=dict(config=config))
18 | 
19 | pose_header = get_pose_header()
20 | 
21 | 
22 | def time_frame(ms, fps):
23 |     return int(fps * (ms / 1000))  # Convert a millisecond timestamp to a frame index
24 | 
25 | 
26 | def hide_legs(pose: Pose):
27 |     point_names = ["KNEE", "ANKLE", "HEEL", "FOOT_INDEX"]
28 |     # pylint: disable=protected-access
29 |     points = [pose.header._get_point_index("POSE_LANDMARKS", side + "_" + n)
30 |               for n in point_names for side in ["LEFT", "RIGHT"]]
31 |     pose.body.confidence[:, :, points] = 0
32 |     pose.body.data[:, :, points, :] = 0
33 | 
34 | 
35 | def load_pose(tf_pose):
36 |     fps = int(tf_pose["fps"].numpy())
37 | 
38 |     pose_body = NumPyPoseBody(fps, tf_pose["data"].numpy(), tf_pose["conf"].numpy())
39 |     pose = Pose(pose_header, pose_body)
40 | 
41 |     # Get a subset of the components
42 |     pose = pose.get_components(["POSE_LANDMARKS", "LEFT_HAND_LANDMARKS", "RIGHT_HAND_LANDMARKS"])
43 | 
44 |     # Normalize by shoulder width
45 |     pose = pose.normalize(pose.header.normalization_info(
46 |         p1=("POSE_LANDMARKS", "RIGHT_SHOULDER"),
47 |         p2=("POSE_LANDMARKS", "LEFT_SHOULDER")
48 |     ))
49 | 
50 |     # Remove the legs
51 |     hide_legs(pose)
52 | 
53 |     # Data without the Z axis
54 |     data = pose.body.data.data[:, :, :, :2]
55 |     conf = pose.body.confidence
56 | 
57 |     return data, conf, fps
58 | 
59 | 
60 | with tf.io.TFRecordWriter('data.tfrecord') as writer:
61 |     for datum in tqdm(dgs_corpus["train"]):
62 |         elan_path = datum["paths"]["eaf"].numpy().decode('utf-8')
63 |         sentences = list(get_elan_sentences(elan_path))
64 | 
65 |         for person in ["a", "b"]:
66 |             pose_data, pose_conf, fps = load_pose(datum["poses"][person])
67 |             frames = len(pose_data)
68 | 
69 |             if frames > 0:  # Skip empty (zero-frame) examples
70 |                 is_signing = np.zeros(frames, dtype='byte')
71 | 
72 |                 for sentence in sentences:
73 |                     if sentence["participant"].lower() == person:
74 |                         for gloss in sentence["glosses"]:
75 |                             start_frame = time_frame(gloss["start"], fps)
76 |                             end_frame = time_frame(gloss["end"], fps)
77 | 
78 |                             is_signing[start_frame:end_frame + 1] = 1  # Sign detected
79 | 
80 |                 is_signing = is_signing.tobytes()
81 |                 pose_data = tf.io.serialize_tensor(pose_data).numpy()
82 |                 pose_conf = tf.io.serialize_tensor(pose_conf).numpy()
83 | 
84 |                 features = {
85 |                     'fps': tf.train.Feature(int64_list=tf.train.Int64List(value=[fps])),
86 |                     'pose_data': tf.train.Feature(bytes_list=tf.train.BytesList(value=[pose_data])),
87 |                     'pose_confidence': tf.train.Feature(bytes_list=tf.train.BytesList(value=[pose_conf])),
88 |                     'is_signing': tf.train.Feature(bytes_list=tf.train.BytesList(value=[is_signing]))
89 |                 }
90 | 
91 |                 example = tf.train.Example(features=tf.train.Features(feature=features))
92 |                 writer.write(example.SerializeToString())
93 | 
--------------------------------------------------------------------------------
/sign_language_detection/holistic.poseheader:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sign-language-processing/detection-train/39df41d58862791badb393b7191c779b2c4e4a66/sign_language_detection/holistic.poseheader
--------------------------------------------------------------------------------
/sign_language_detection/model.py:
--------------------------------------------------------------------------------
1 | """Sign language sequence tagging Keras model."""
2 | 
3 | import tensorflow as tf
4 | 
5 | from sign_language_detection.args import FLAGS
6 | 
7 | 
8 | def get_model():
9 |     """Create a Keras sequential model following the hyperparameters."""
10 | 
11 |     model = tf.keras.Sequential(name='tgt')
12 | 
13 |     # model.add(SequenceMasking())  # Mask padded sequences
14 |     model.add(tf.keras.layers.Dropout(FLAGS.input_dropout))  # Random feature dropout
15 | 
16 |     # Add the LSTM encoder
17 |     for _ in range(FLAGS.encoder_layers):
18 |         rnn = tf.keras.layers.LSTM(FLAGS.hidden_size, return_sequences=True)
19 |         if FLAGS.encoder_bidirectional:
20 |             rnn = tf.keras.layers.Bidirectional(rnn)
21 |         model.add(rnn)
22 | 
23 |     # Project and normalize to the label space
24 |     model.add(tf.keras.layers.Dense(2, activation='softmax'))
25 | 
26 |     return model
27 | 
28 | 
29 | def build_model():
30 |     """Apply input shape, loss, optimizer, and metrics to the model."""
31 |     model = get_model()
32 |     model.build(input_shape=(None, None, FLAGS.input_size))
33 |     model.compile(
34 |         loss='sparse_categorical_crossentropy',
35 |         optimizer=tf.keras.optimizers.Adam(FLAGS.learning_rate),
36 |         metrics=['accuracy'],
37 |     )
38 |     model.summary()
39 | 
40 |     return model
41 | 
--------------------------------------------------------------------------------
/sign_language_detection/openpose.poseheader:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sign-language-processing/detection-train/39df41d58862791badb393b7191c779b2c4e4a66/sign_language_detection/openpose.poseheader
--------------------------------------------------------------------------------
/sign_language_detection/tools/h5_to_tfjs.py:
--------------------------------------------------------------------------------
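1 | """Prepare a stateful copy of the trained model, as a step toward TensorFlow.js conversion."""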
2 | import os
3 | import numpy as np
4 | 
5 | # Hide GPUs; this must be set before TensorFlow is imported
6 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
7 | 
8 | from tensorflow.keras.models import load_model
9 | 
10 | model = load_model('../model.h5')
11 | model.build(input_shape=(1, 1, 75))
12 | 
13 | # Make the LSTM stateful, so hidden state is carried across single-frame calls
14 | model.layers[1].stateful = True
15 | 
16 | model.predict(np.random.randn(1, 1, 75))  # Set input shapes
17 | 
18 | model.save("stateful_model.h5")
19 | 
20 | # The saved model can then be converted, e.g. with the TensorFlow.js converter CLI:
21 | # tensorflowjs_converter --input_format keras stateful_model.h5 <output_dir>
--------------------------------------------------------------------------------
/sign_language_detection/tools/model_card.py:
--------------------------------------------------------------------------------
1 | """Generate a model card file for the detector model."""
2 | 
3 | import shutil
4 | 
5 | import model_card_toolkit
6 | 
7 | # Initialize the Model Card Toolkit with a path to store generated assets
8 | model_card_output_path = 'model_card'
9 | shutil.rmtree(model_card_output_path, ignore_errors=True)
10 | mct = model_card_toolkit.ModelCardToolkit(model_card_output_path)
11 | 
12 | # Initialize the model_card_toolkit.ModelCard, which can be freely populated
13 | model_card = mct.scaffold_assets()
14 | model_card.model_details.name = 'Sign Language Detector'
15 | model_card.model_details.overview = ("""
16 | This is a lightweight Keras model which aims to classify whether or not a person is signing in a given video frame.
17 | The model is trained on the DGS Corpus, which includes a diverse group of deaf German Sign Language signers.
18 | This model does not stand alone, and requires human pose estimation and shoulder-width normalization as pre-processing.
19 | """)
20 | model_card.model_details.owners = [{
21 |     'name': 'Amit Moryossef',
22 |     'contact': 'amitmoryossef@google.com'
23 | }, {
24 |     'name': 'Ioannis Tsochantaridis',
25 |     'contact': 'ioannis@google.com'
26 | }]
27 | 
28 | model_card.considerations.use_cases = [
29 |     """
30 | Performing real-time sign language detection for video-conferencing applications.
31 | """, """
32 | Performing offline sign language detection on videos, using bidirectional context to extract
33 | the sequences that contain signing.
34 | """
35 | ]
36 | model_card.considerations.limitations = [
37 |     """
38 | While the models are trained to detect sign language, they are not specifically trained to distinguish between
39 | gesturing and signing, and therefore should not be used outside the setting of signing.
40 | """
41 | ]
42 | model_card.considerations.ethical_considerations = [{
43 |     'name':
44 |         """
45 | Bias against minorities
46 | """,
47 |     'mitigation_strategy':
48 |         """
49 | As the model uses optical flow based on pose estimation, it cannot reconstruct the shape or color of a person.
50 | Make sure that the pose estimation you use works well for both majority and minority groups.
51 | """ 52 | }] 53 | 54 | # Write the model card data to a JSON file 55 | mct.update_model_card_json(model_card) 56 | 57 | # Return the model card document as an HTML page 58 | html = mct.export_format() 59 | -------------------------------------------------------------------------------- /sign_language_detection/train.py: -------------------------------------------------------------------------------- 1 | """Training script for sign language detection.""" 2 | 3 | import random 4 | 5 | import tensorflow as tf 6 | from absl import app 7 | from tensorflow.keras.callbacks import EarlyStopping 8 | from tensorflow.keras.callbacks import ModelCheckpoint 9 | from tensorflow.keras.models import load_model 10 | 11 | from sign_language_detection.args import FLAGS 12 | from sign_language_detection.dataset import get_datasets 13 | from sign_language_detection.model import build_model 14 | 15 | 16 | def set_seed(): 17 | """Set seed for deterministic random number generation.""" 18 | seed = FLAGS.seed if FLAGS.seed is not None else random.randint(0, 1000) 19 | tf.random.set_seed(seed) 20 | random.seed(seed) 21 | 22 | 23 | def main(unused_argv): 24 | """Keras training loop with early-stopping and model checkpoint.""" 25 | 26 | set_seed() 27 | 28 | # Initialize Dataset 29 | train, dev, test = get_datasets() 30 | 31 | # Initialize Model 32 | model = build_model() 33 | 34 | # Train 35 | es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=FLAGS.stop_patience) 36 | mc = ModelCheckpoint(FLAGS.model_path, monitor='val_accuracy', mode='max', verbose=1, save_best_only=True) 37 | 38 | with tf.device(FLAGS.device): 39 | model.fit(train, 40 | epochs=FLAGS.epochs, 41 | steps_per_epoch=FLAGS.steps_per_epoch, 42 | validation_data=dev, 43 | callbacks=[es, mc] 44 | ) 45 | 46 | best_model = load_model(FLAGS.model_path) 47 | print('Testing') 48 | best_model.evaluate(test) 49 | 50 | 51 | if __name__ == '__main__': 52 | app.run(main) 53 | --------------------------------------------------------------------------------