├── .dockerignore ├── .gitignore ├── LICENSE ├── README.md ├── README_en.md ├── inference ├── .dockerignore ├── .gitignore ├── Dockerfile ├── Dockerfile.cpu ├── Dockerfile.gpu ├── Makefile ├── README.md ├── README_en.md ├── python │ ├── __init__.py │ ├── build_youtube_playlists_corpus.sh │ ├── decode.py │ ├── models.py │ ├── requirements.txt │ ├── speech_to_text.py │ ├── split_audio.py │ ├── text_preprocess.py │ ├── transcriber.py │ ├── utils_srt.py │ ├── vadSplit.py │ ├── yt.sh │ ├── ytpl.sh │ └── ytpl_build.sh ├── server │ ├── .dockerignore │ ├── .gitignore │ ├── LICENSE │ ├── Makefile │ ├── README.md │ ├── README_en.md │ ├── app │ │ ├── Dockerfile │ │ ├── cherrypy.conf │ │ ├── gunicorn.conf │ │ ├── requirements.txt │ │ ├── start.sh │ │ ├── static_html │ │ │ ├── index.html │ │ │ └── js │ │ │ │ ├── LICENSE.txt │ │ │ │ ├── WavAudioEncoder.js │ │ │ │ ├── audioRecorder.js │ │ │ │ ├── audioRecorderWorker.js │ │ │ │ ├── ready.js │ │ │ │ ├── resampler.js │ │ │ │ └── script.js │ │ └── wsgi.py │ ├── docker-compose.cpu.yml │ ├── speech.wav │ └── worker │ │ ├── .gitignore │ │ ├── Dockerfile │ │ ├── audio.py │ │ ├── external_api_urls.template.py │ │ ├── persist.py │ │ ├── punctuation_client.py │ │ ├── requirements.txt │ │ ├── speech_to_text_task.py │ │ └── worker.py └── speech.wav └── train ├── fine-tune ├── .dockerignore ├── .gitignore ├── Dockerfile ├── Makefile ├── README.md ├── README_en.md └── python │ ├── .gitignore │ ├── custom_common_voice.py │ ├── cv_version.template.py │ ├── decode.py │ ├── evaluate.py │ ├── models.py │ ├── publish.py │ ├── requirements.txt │ ├── run.py │ ├── run_base-cy.sh │ ├── run_en_cy.sh │ ├── run_xls-r-1b.sh │ ├── run_xlsr-large-53.sh │ ├── speech.wav │ ├── test.py │ ├── text_preprocess.py │ ├── train_kenlm.py │ └── train_wav2vec2.py └── pre-train ├── .dockerignore ├── .gitignore ├── Dockerfile ├── Makefile ├── README.md ├── README_en.md └── python ├── .gitignore ├── requirements.txt ├── run.sh ├── run_wav2vec2_pretraining_no_trainer.py └── youtube_dataset.py /.dockerignore: -------------------------------------------------------------------------------- 1 | models 2 | homedir 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *pycache* 2 | *.json 3 | *.pid 4 | *.log 5 | *.lock 6 | *.rdb 7 | gh 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Prifysgol Bangor University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5270295.svg)](https://doi.org/10.5281/zenodo.5270295) 3 | 4 | 5 | # docker-wav2vec2-cy 6 | 7 | [(click here to read the README in English)](README_en.md) 8 | 9 | Mae'r project yn datblygu ac yn darparu adnabod lleferydd Cymraeg a ddwyieithog gan ddefnyddio'r dull wav2vec2 [1], [2] a [3]. Defnyddir data o Mozilla Common Voice Cymraeg yn bennaf, gyda sgriptiau'r project hwn, i greu modelau sydd yn trawsgrifio unrhyw leferydd Cymraeg (a Saesneg) yn lledgywir. Mae modd i chi llwytho i lawr y modelau er mwyn defnyddio adnabod lleferydd ar eich cyfrifiadur neu o fewn projectau meddalwedd eich hunain. 10 | 11 | 12 | ## Defnyddio adnabod lleferydd Cymraeg 13 | 14 | Mae'r adnoddau yn y ffolder 'inference' yn ei gwneud hi'n hawdd defnyddio modelau sydd wedi eu hyfforddi'n barod i drawsgrifio lleferydd Cymraeg o fewn ffeiliau sain fach neu fawr neu hyd yn oed o fewn fideos megis ar YouTube. Ewch i [inference/README.md](inference/README.md) am ragor o wybodaeth. 15 | 16 | 17 | ## Hyfforddi Modelau 18 | 19 | Mae'r adnoddau yn y ffolder 'train' yn hwyluso hyfforddi neu fireinio modelau acwsteg. Mae'r sgriptiau yn cynnwys modd hyfforddi modelau iaith yn ogystal er mwyn gwella cywirdeb canlyniadau trawsgrifio. Gweler [train/README.md](train/README.md) am ragor o wybodaeth. 20 | 21 | 22 | ## Diolchiadau 23 | 24 | Diolch i'r cwmnïau, sefydliadau ac unigolion canlynol sydd wedi ein helpu i wireddu datrysiad adnabod lleferydd Cymraeg mor effeithiol. 25 | 26 | - Mozilla a phawb sydd wedi cyfrannu yn hael ac yn wirfoddol drwy gwefan [Common Voice](https://commonvoice.mozilla.org/), yn enwedig i Rhoslyn Prys (meddal.com) a ymgymerodd â nifer o ymgyrchoedd torfoli ar sail wirfoddol, i'r Mentrau Iaith, Cyngor Gwynedd, Llyfrgell Genedlaethol Cymru a weithiodd gyda Rhoslyn ar rai o'r ymgyrchoedd hyn, ac hefyd i Lywodraeth Cymru. 27 | - Facebook AI am rhannu'r ddull wav2vec2 yn ogystal a modelau amlieithog enfawr wedi'i rhag-hyfforddi. [wav2vec 2.0 - Learning the Structure of Speech from Raw Audio](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) 28 | - HuggingFace : [Fine-Tune XLSR-Wav2Vec2 for low-resource ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) 29 | - Parlance Speech Recognition : (https://github.com/parlance/ctcdecode) 30 | - KenLM : (https://github.com/kpu/kenlm) 31 | 32 | 33 | ## Cyfeiriadau 34 | 35 | [1] Alexei Baevski, H. Zhou, Abdel-rahman Mohamed, and Michael Auli. 2020. *wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations*. ArXiv, abs/2006.11477. 36 | 37 | [2] Alexis Conneau, Alexi Baevski, Ronan Collobert, Abdelrahman Mohamed and Michael Auli. 2020. *Unsupervised Cross-lingual Representation Learning for Speech Recognition*. ArXiv, abs/2006.13979. 38 | 39 | [3] Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau and Michael Auli. 2021. 
*XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale*. ArXiv, abs/2111.09296 40 | 41 | [4] Rosana Ardila, Megan Branson, Kelly Davis, Michael Henretty, Michael Kohler, Josh Meyer, Reuben Morais, Lindsay Saunders, Francis M. Tyers, and Gregor Weber. 2020. *Common Voice: A Massively-Multilingual Speech Corpus*. In LREC. 42 | 43 | [5] Pedro Javier Ortiz Suárez, Benoît Sagot, and Laurent Romary. 2019. *Asynchronous pipelines for processing huge corpora on medium to low resource infrastructures.* In CMLC-7 (pp. 9 – 16). Leibniz-Institut für Deutsche Sprache. 44 | 45 | 46 | 47 | ## Cydnabyddiaeth 48 | 49 | Os defnyddiwch chi'r adnodd hwn, gofynnwn yn garedig i chi gydnabod a chyfeirio at ein gwaith. Mae cydnabyddiaeth o'r fath yn gymorth i ni sicrhau cyllid yn y dyfodol i greu rhagor o adnoddau defnyddiol i'w rhannu. 50 | 51 | ``` 52 | @software{dewi_bryn_jones_2021_5270295, 53 | author = {Dewi Bryn Jones}, 54 | title = {{GitHub Repository: techiaith/docker-wav2vec2-cy Speech recognition for Welsh with wav2vec2.}}, 55 | month = aug, 56 | year = 2022, 57 | publisher = {Zenodo}, 58 | version = {22.10}, 59 | doi = {10.5281/zenodo.5270295}, 60 | url = {https://doi.org/10.5281/zenodo.5270295} 61 | } 62 | ``` 63 | -------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | 2 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5270295.svg)](https://doi.org/10.5281/zenodo.5270295) 3 | 4 | # docker-wav2vec2-cy 5 | 6 | [(cliciwch yma os hoffwch ddarllen y README Cymraeg)](README.md) 7 | 8 | The project develops and provides Welsh and bilingual speech recognition using the wav2vec2 method [1], [2] and [3]. Data from Mozilla Common Voice Cymraeg has been mainly used, with the scripts of this project, to create models that transcribe any Welsh (and English) speech fairly accurately. You can download the models in order to use speech recognition on your computer or within your own software projects. 9 | 10 | ## How to Use the Welsh speech recognition models 11 | 12 | The resources in the 'inference' folder make it easy to use models that have already been trained to transcribe Welsh speech with small or large audio files or even with videos such as on YouTube. Visit [inference/README_en.md](inference/README.md) for more information. 13 | 14 | ## Training your own models 15 | 16 | The resources in the 'train' folder facilitate the training or refinement of acoustic models. The scripts also include a way to train language models in order to improve the accuracy of transcription results. See [train/README_en.md](train/README.md) for more information. 17 | 18 | ## Acknowledgements 19 | 20 | Such effective wav2vec2+KenLM speech recognition models would not have been possible without the work and contributions of the following organisations and individuals.. 21 | 22 | - Mozilla and everyone who has contributed their voices to [Common Voice](https://commonvoice.mozilla.org/) but in particular to Rhoslyn Prys (meddal.com) who undertook on a voluntary basis a number of crowdsourcing campaigns, to the Mentrau Iaith, Gwynedd Council, the National Library of Wales who worked with Rhoslyn on some of these campaigns, and to the Welsh Government. 
23 | - Facebook AI for wav2vec2 and subsequently HuggingFace: [Fine-Tune XLSR-Wav2Vec2 for low-resource ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) 24 | - Parlance Speech Recognition for their PyTorch CTC Decoder bindings and KenLM integration (https://github.com/parlance/ctcdecode) 25 | - KenLM : (https://github.com/kpu/kenlm) 26 | 27 | 28 | ## References 29 | 30 | Alexei Baevski, H. Zhou, Abdel-rahman Mohamed, and Michael Auli 2020. *wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations*. ArXiv, abs/2006.11477. 31 | 32 | Rosana Ardila, Megan Branson, Kelly Davis, Michael Henretty, Michael Kohler, Josh Meyer, Reuben Morais, Lindsay Saunders, Francis M. Tyers, and Gregor Weber 2020. *Common Voice: A Massively-Multilingual Speech Corpus*. In LREC. 33 | 34 | Pedro Javier Ortiz Suárez, Benoît Sagot, and Laurent Romary 2019. *Asynchronous pipelines for processing huge corpora on medium to low resource infrastructures.* In CMLC-7 (pp. 9 – 16). Leibniz-Institut für Deutsche Sprache. 35 | 36 | 37 | ## Acknowledging our work 38 | 39 | If you use this resource, we kindly ask you to acknowledge and reference our work. Doing so helps us secure future funding to create more useful resources to share. 40 | 41 | ``` 42 | @software{dewi_bryn_jones_2021_5270295, 43 | author = {Dewi Bryn Jones}, 44 | title = {{GitHub Repository: techiaith/docker-wav2vec2-cy Speech recognition for Welsh with wav2vec2.}}, 45 | month = aug, 46 | year = 2022, 47 | publisher = {Zenodo}, 48 | version = {22.10}, 49 | doi = {10.5281/zenodo.5270295}, 50 | url = {https://doi.org/10.5281/zenodo.5270295} 51 | } 52 | ``` 53 | -------------------------------------------------------------------------------- /inference/.dockerignore: -------------------------------------------------------------------------------- 1 | server 2 | recordings 3 | data 4 | models 5 | tmp 6 | -------------------------------------------------------------------------------- /inference/.gitignore: -------------------------------------------------------------------------------- 1 | tmp 2 | data 3 | models 4 | recordings 5 | -------------------------------------------------------------------------------- /inference/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM techiaith/wav2vec2-inference-device 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | ENV TZ=Europe/London 5 | 6 | RUN apt update -q \ 7 | && apt install -y -qq tzdata bash build-essential git curl wget software-properties-common \ 8 | vim ca-certificates libffi-dev libssl-dev libsndfile1 libbz2-dev liblzma-dev locales \ 9 | libboost-all-dev libboost-tools-dev libboost-thread-dev cmake \ 10 | python3 python3-pip python3-setuptools python3-dev curl zip zlib1g-dev vim \ 11 | ffmpeg sox alsa-utils \ 12 | && python3 -m pip install --upgrade pip \ 13 | && apt clean -q 14 | 15 | 16 | # gosod YouTube downloader 17 | RUN wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl \ 18 | && chmod a+rx /usr/local/bin/youtube-dl 19 | 20 | # 21 | ARG MODEL_VERSION 22 | ENV MODEL_VERSION=${MODEL_VERSION} 23 | 24 | # 25 | ARG WAV2VEC2_MODEL_NAME 26 | ENV WAV2VEC2_MODEL_NAME=${WAV2VEC2_MODEL_NAME} 27 | 28 | # Set the locale 29 | RUN locale-gen cy_GB.UTF-8 30 | ENV LANG cy_GB.UTF-8 31 | ENV LANGUAGE cy_GB:en 32 | ENV LC_ALL cy_GB.UTF-8 33 | 34 | # Install local Python files and dependencies.. 
35 | RUN mkdir -p /wav2vec2 36 | 37 | WORKDIR /wav2vec2 38 | 39 | COPY python/requirements.txt /wav2vec2/ 40 | RUN pip3 install -r requirements.txt 41 | 42 | ENV PATH="${PATH}:/wav2vec2" 43 | ENV PYTHONPATH "${PYTHONPATH}:/wav2vec2" 44 | 45 | # install ctc-decode 46 | RUN git clone --recursive https://github.com/parlance/ctcdecode.git /tmp/ctcdecode \ 47 | && cd /tmp/ctcdecode && pip3 install . 48 | 49 | COPY python /wav2vec2/ 50 | COPY speech.wav /wav2vec2/ 51 | 52 | RUN mkdir -p /models 53 | RUN python3 transcriber.py -w speech.wav 54 | -------------------------------------------------------------------------------- /inference/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | -------------------------------------------------------------------------------- /inference/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.4.0-cudnn8-devel-ubuntu20.04 2 | -------------------------------------------------------------------------------- /inference/Makefile: -------------------------------------------------------------------------------- 1 | default: build 2 | 3 | $(eval DEVICE = cpu) 4 | #$(eval DEVICE = gpu) 5 | 6 | config: 7 | # to use a local model, provide the full /models/.... path for WAV2VEC2_MODEL_NAME and 8 | # leave the MODEL_VERSION blank empty string. 9 | $(eval WAV2VEC2_MODEL_NAME = techiaith/wav2vec2-xlsr-ft-cy) 10 | $(eval WAV2VEC2_MODEL_VERSION = 22.10) 11 | mkdir -p ${PWD}/data/ 12 | mkdir -p ${PWD}/tmp/ 13 | 14 | 15 | 16 | build: config 17 | docker build --rm -f Dockerfile.${DEVICE} -t techiaith/wav2vec2-inference-device . 18 | docker build --rm -t techiaith/wav2vec2-inference-${DEVICE} \ 19 | --build-arg WAV2VEC2_MODEL_NAME=${WAV2VEC2_MODEL_NAME} \ 20 | --build-arg MODEL_VERSION=${WAV2VEC2_MODEL_VERSION} \ 21 | . 22 | 23 | run: config run-${DEVICE} 24 | 25 | run-gpu: 26 | docker run --gpus all --name techiaith-wav2vec2-inference-${DEVICE} \ 27 | --restart=always \ 28 | -it \ 29 | -v ${PWD}/data/:/data \ 30 | -v ${PWD}/tmp/:/tmp \ 31 | techiaith/wav2vec2-inference-${DEVICE} 32 | 33 | 34 | run-cpu: 35 | docker run --name techiaith-wav2vec2-inference-${DEVICE} \ 36 | --restart=always \ 37 | -it \ 38 | -v ${PWD}/data/:/data \ 39 | -v ${PWD}/tmp/:/tmp \ 40 | techiaith/wav2vec2-inference-${DEVICE} 41 | 42 | 43 | stop: config 44 | -docker stop techiaith-wav2vec2-inference-${DEVICE} 45 | -docker rm techiaith-wav2vec2-inference-${DEVICE} 46 | 47 | 48 | clean: config stop 49 | -docker rmi techiaith/wav2vec2-inference-${DEVICE} 50 | 51 | -------------------------------------------------------------------------------- /inference/README.md: -------------------------------------------------------------------------------- 1 | # Defnyddio Modelau Adnabod Lleferydd wav2vec2. 2 | 3 | [**(click here to read the README in English)**](README_en.md) 4 | 5 | ## Cefndir 6 | 7 | Cynnigir sawl ddull i chi ddefnyddio adnabod lleferydd gan gynnwys 8 | 9 | - o linell gorchymun cyfrifiadur eich hunain 10 | - o fewn cod Python eich hunain 11 | - o API ar weinydd lleol - gweler [server/README.md](server/README.md) 12 | 13 | 14 | ## Llinell gorchymyn 15 | 16 | ### Gosod 17 | 18 | Byddwch angen cyfrifiadur gyda system weithredu sy'n darparu llinell gorchymyn tebyg i Linux, fel Ubuntu, Mac OS X neu Windows Sub-system for Linux. 
Byddwch angen yn ogystal [git](https://git-scm.com/downloads) a docker ([Windows](https://learn.microsoft.com/en-us/windows/wsl/tutorials/wsl-containers),[Linux](https://docs.docker.com/desktop/install/linux-install/),[Mac OS X](https://docs.docker.com/desktop/install/mac-install/)) 19 | 20 | 21 | ``` 22 | $ git clone https://github.com/techiaith/docker-wav2vec2-cy 23 | $ cd docker-wav2vec2-cy/inference 24 | $ make 25 | ``` 26 | 27 | Mae'r proses yn estyn ac yn gosod modelau sydd wedi'i hyfforddi eisoes gan Uned Technolegau Iaith, Prifysgol Bangor. 28 | 29 | 30 | ### Defnyddio 31 | 32 | Er mwyn drawsgrifio un ffeil yn sydyn, mae modd defnyddio sgript `transcriber.py` mewn modd debyg i'r canlynol 33 | 34 | `$ docker run --rm -it -v ${PWD}/:${PWD} techiaith/wav2vec2-inference python3 transcriber.py -w ${PWD}/.wav` 35 | 36 | Bydd hyn yn dangos trawsgrifiad o'r sain lleferydd ar y sgrin. E.e. 37 | 38 | ``` 39 | /home/$ docker run --rm -it -v ${PWD}/:${PWD} techiaith/wav2vec2-inference python3 transcriber.py -w ${PWD}/speech.wav 40 | split_only: False 41 | Initialising wav2vec2 model "techiaith/wav2vec2-xls-r-1b-ft-cy" from HuggingFace model repository 42 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 43 | Initializing KenLM language model... 44 | /usr/local/lib/python3.8/dist-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:100.) 45 | return torch._C._cuda_getDeviceCount() > 0 46 | wav2vec loaded to device cpu 47 | Processing: /home//speech.wav 48 | 1 0.619581589958159 5.170041841004185 mae ganddynt ddau o blant mab a merch 49 | 1 50 | 00:00:00,619 --> 00:00:05,170 51 | mae ganddynt ddau o blant mab a merch 52 | 53 | 54 | ``` 55 | 56 | I gadw'r trawsgrifiad i ffeil `.srt` a `.TextGrid`, ychwanegwch enw ffeil allbwn: 57 | 58 | ```shell 59 | /home/$ docker run --rm -it -v ${PWD}/:${PWD} techiaith/wav2vec2-inference python3 transcriber.py -w ${PWD}/speech.wav -s ${PWD}/speech.srt 60 | split_only: False 61 | Initialising wav2vec2 model "techiaith/wav2vec2-xls-r-1b-ft-cy" from HuggingFace model repository 62 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 63 | Initializing KenLM language model... 64 | /usr/local/lib/python3.8/dist-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:100.) 65 | return torch._C._cuda_getDeviceCount() > 0 66 | wav2vec loaded to device cpu 67 | Processing: /home//speech.wav 68 | 1 0.619581589958159 5.170041841004185 mae ganddynt ddau o blant mab a merch 69 | srt file of transcription saved to /home//speech.srt 70 | Textgrid of transcription saved to /home//speech.TextGrid 71 | ``` 72 | 73 | #### Trawsgrifio fideo YouTube 74 | 75 | Er mwyn is-deitlau fideo YouTube yn lleol, defnyddiwch y sgript `yt.sh`. 76 | 77 | Er enghraifft ar gyfer fideo https://www.youtube.com/watch?v=OpiwHxPPqRI mae'r enghraifft isod yn creu ffeiliau `OpiwHxPPqRI.TextGrid` ac `OpiwHxPPqRI.srt` o fewn y ffolder `recordings`. 
78 | 79 | 80 | ``` 81 | /home/$ docker run --rm -it -v ${PWD}/recordings/:/recordings techiaith/wav2vec2-inference yt.sh OpiwHxPPqRI 82 | 83 | + youtube-dl --extract-audio --audio-format mp3 'https://www.youtube.com/watch?v=OpiwHxPPqRI' 84 | [youtube] OpiwHxPPqRI: Downloading webpage 85 | [youtube] OpiwHxPPqRI: Downloading MPD manifest 86 | [download] Destination: Cynnyrch Uned Technolegau Iaith Prifysgol Bangor 2020-OpiwHxPPqRI.webm 87 | [download] 100% of 3.51MiB in 00:54 88 | [ffmpeg] Destination: Cynnyrch Uned Technolegau Iaith Prifysgol Bangor 2020-OpiwHxPPqRI.mp3 89 | Deleting original file Cynnyrch Uned Technolegau Iaith Prifysgol Bangor 2020-OpiwHxPPqRI.webm (pass -k to keep) 90 | + ffmpeg -i 'Cynnyrch Uned Technolegau Iaith Prifysgol Bangor 2020-OpiwHxPPqRI.mp3' -vn -acodec pcm_s16le -ar 16000 -ac 1 /recordings/OpiwHxPPqRI.wav 91 | ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers 92 | built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1) 93 | configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-nvenc --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared 94 | libavutil 56. 31.100 / 56. 31.100 95 | libavcodec 58. 54.100 / 58. 54.100 96 | libavformat 58. 29.100 / 58. 29.100 97 | libavdevice 58. 8.100 / 58. 8.100 98 | libavfilter 7. 57.100 / 7. 57.100 99 | libavresample 4. 0. 0 / 4. 0. 0 100 | libswscale 5. 5.100 / 5. 5.100 101 | libswresample 3. 5.100 / 3. 5.100 102 | libpostproc 55. 5.100 / 55. 5.100 103 | Input #0, mp3, from 'Cynnyrch Uned Technolegau Iaith Prifysgol Bangor 2020-OpiwHxPPqRI.mp3': 104 | Metadata: 105 | encoder : Lavf58.29.100 106 | Duration: 00:03:33.67, start: 0.023021, bitrate: 101 kb/s 107 | Stream #0:0: Audio: mp3, 48000 Hz, stereo, fltp, 100 kb/s 108 | Metadata: 109 | encoder : Lavc58.54 110 | Stream mapping: 111 | Stream #0:0 -> #0:0 (mp3 (mp3float) -> pcm_s16le (native)) 112 | Press [q] to stop, [?] 
for help 113 | Output #0, wav, to '/recordings/OpiwHxPPqRI.wav': 114 | Metadata: 115 | ISFT : Lavf58.29.100 116 | Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 16000 Hz, mono, s16, 256 kb/s 117 | Metadata: 118 | encoder : Lavc58.54.100 pcm_s16le 119 | size= 6677kB time=00:03:33.64 bitrate= 256.0kbits/s speed= 767x 120 | video:0kB audio:6676kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.001141% 121 | + rm 'Cynnyrch Uned Technolegau Iaith Prifysgol Bangor 2020-OpiwHxPPqRI.mp3' 122 | + python3 transcriber.py -w /recordings/OpiwHxPPqRI.wav -s /recordings/OpiwHxPPqRI.srt 123 | split_only: False 124 | Initialising wav2vec2 model "techiaith/wav2vec2-xls-r-1b-ft-cy" from HuggingFace model repository 125 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 126 | Initializing KenLM language model... 127 | /usr/local/lib/python3.8/dist-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:100.) 128 | return torch._C._cuda_getDeviceCount() > 0 129 | wav2vec loaded to device cpu 130 | Processing: /recordings/OpiwHxPPqRI.wav 131 | 1 0.0 1.630135135135135 132 | 2 0.0 2.290714285714286 133 | 3 0.0 2.8303846153846157 134 | 4 3.169469387755102 8.279020408163266 fel arfer byddwn ni yn unedd technolegau iaith canolfan bedwr ym mhrifysgol bangor yn rhoi 135 | 5 8.298979591836734 12.67004081632653 ryw sesiwn fach yn yr eisteddfod genedlaethol i ddangos peth o'r gwaith 136 | 6 12.739945945945946 18.225081081081083 diweddarah a rhoed rhyw rhagflas o bethau newydd sydd ar y ffordd eleni wrth gwrs 137 | 7 18.34475675675676 20.080054054054056 chawson ni ddim steddfod 138 | 8 20.206799999999998 20.590799999999998 ond 139 | 9 20.679771428571428 24.100114285714287 roeddwn i'n dal isio roi driw flas bach iddoch chi o'n gwaith 140 | 10 24.189963099630994 28.226236162361623 dyma cysgliad os ydach chi ddim yn gwybod yn barod mwy o ymau helpu chi 141 | 11 28.266199261992618 32.7420664206642 ysgrifennu yn y gymraeg ar fformai o'n gwneud hyn ydy troi cysyll sydd ar gyfer 142 | 12 32.8419741697417 34.9600184501845 gwirio sillafu a gramadeg cymraeg 143 | 13 35.04949367088607 36.55025316455696 gan cynnwys tru glo 144 | 14 37.109240506329115 40.00006329113924 sori gan gynnwys treiglo 145 | 15 40.438983050847455 41.470169491525425 dyna wy llant 146 | 16 42.02961102106969 47.80492706645057 a cysgeir sef casgliad o eiriaduron cynhwysfawr sydd yn hawdd yw chwilio elle bwch chi'n 147 | 17 47.844894651539704 53.86001620745543 gofyn pwy sy'n gallu gael copi o'r meddalwedd anhygol yma am ddim wel y cyhoedd 148 | 18 53.080648298217184 53.86001620745543 149 | 19 54.12941489361702 60.15335106382978 y byd addysg a chwmnïau sy'n cyflogi hydd at deg person alle gellych chi ddarganfod 150 | 20 60.17329787234042 61.39005319148936 y cysgliad 151 | 21 61.53677419354838 62.020645161290325 wel 152 | 22 62.22986631016043 68.04208556149733 cysgliad dodcom digon syml a os oes angen cymorth gyda unrhyw beth ar y wefan 153 | 23 68.44155080213903 73.29505347593582 yn ogystal â'r tudalen cymorth mae sgwrsffot ar gael i chi ofyn cwestiynau iddo unrhyw 154 | 24 73.335 77.05002673796791 bryd gan fod llawer iawn ohonom yn gweithio adraf ar hyn o bryd 155 | 25 77.26026058631922 81.68745928338762 mae 'r porth termau yn berffaith i helpu chi 
gyda hyn mae hwn yn gadael 156 | 26 81.72752442996743 85.35342019543974 i chi fynd ar dros ugian o eiriaduron termau gwahanol ar y we ac os 157 | 27 85.41351791530946 89.3799674267101 ydach chi eisiau fersiynhwylus i gario gwmpas gyda chi i bob man 158 | 28 89.44995735607677 94.51912579957357 yna mae'r ap guriaduron yn cynnwys nifer o'r gydiaduronmae mwy o datblygiadau newydd ar y 159 | 29 94.55904051172709 98.77004264392325 ffordd hefyd os ydych chi wedi blino gorfod siarad saesneg gyda olecsa 160 | 30 98.82 102.25005780346821 mae gyda ni brwt y teip cynorthwyydd personol cymraeg 161 | 31 102.59955719557196 107.00324723247232 macsen ma'n gweithio fo ap ar eich ffôn symudol yn daethech chi beth bynnag gyda 162 | 32 107.0231734317343 107.86007380073801 chi isio gwybod 163 | 33 107.98960975609755 111.9700975609756 rydyn ni werthu'n gwella'r ap ag yn ychwanegu pedwar sgil newydd iddo 164 | 34 112.1797159090909 115.57005681818183 yn cynnwys rhai i gael podleyddiadau rhaglenni esperd yr ek 165 | 35 115.67974137931034 117.91008620689655 dyma i chi flas ar sgil sbotsoffai 166 | 36 118.0295744680851 119.83021276595744 chware fiwsig cyrff 167 | 37 123.92523985239852 125.32007380073802 clasur 168 | 38 125.94965034965034 128.68006993006992 rydyn ni hefyd yn gwella eu lleisiau synthetig 169 | 39 128.94845070422534 131.53014084507043 a erbyn mis mawrth igian inarigian 170 | 40 131.61957446808512 133.42021276595744 bydd gyda ni pedwar llais newydd 171 | 41 133.63957894736842 135.43010526315788 fenywaidd a gwrywoedd 172 | 42 135.5798663101604 140.852807486631 de a gogledd ac bydd y datblygwyr yn medru eu rhoi nhw yn eu meddalwedd 173 | 43 141.05254010695185 145.526550802139 mae creu lleisiau synddetig yn bwysig ar gyfer pobl sy'n colli'r gallu i siarad ei 174 | 44 145.5664973262032 150.4000267379679 hun dwy flynydd ynol mi wnaethyn i ddatblygu'r rhaglen lleisiwr gyda'r gwasanaeth iechyd 175 | 45 0.0 150.9109090909091 176 | 46 151.05992163009404 155.4764576802508 a chyn bo hir mi fygyn yn nu lleisiwr dau sydd yn cynnig gwell gwasanaeth 177 | 47 155.57637931034483 160.4925235109718 i gleifion os hoffech chi i helpun ni i ddatblygu'r dechnoleg yma cofiwch gech hi 178 | 48 160.5324921630094 163.6900156739812 ddal i gyfrannu eich llais i comin foes 179 | 49 163.86974226804125 167.62005154639175 rydyn ni'n defnyddio'r recordiau hyn i wella ei'n rhaglenni adnabod lleferydd 180 | 50 167.72993927125503 171.7458704453441 ac os ydach chi yn mwynhau y broses o recordio a darllen y brawddegau hyn 181 | 51 171.76585020242914 175.9815789473684 yn uchel beth am gynnig eich hyn fel talant iais yn ni ar gyfer creu 182 | 52 176.02153846153846 177.52002024291497 lleisiau synthetig newydd 183 | 53 177.99870967741936 180.28016129032258 rydyn ni'i chwilio am bedwar talant llais 184 | 54 180.42956896551726 184.93008620689653 au gydag acen gogledd dau gydag acen deheuol dynion a merched 185 | 55 185.01988826815642 188.5300558659218 rydyn ni'n cynnig tawl bychan am gael ddefnyddio eich llais 186 | 56 188.97750000000002 190.15012499999997 diolch fawr am wylio 187 | 57 190.40071942446042 192.96992805755397 gobeithio bod y fideo yma wedi bod o gymorth i chi 188 | 58 193.29933774834436 196.18013245033112 ac eich bod chi'n mwynhau defnyddio ein cyflysterau ni 189 | 59 0.0 196.9902857142857 190 | 60 0.0 197.74079999999998 191 | 61 0.0 199.1203448275862 192 | 62 0.0 202.03006993006994 193 | 63 0.0 203.44026315789475 194 | 64 203.50909090909093 203.89090909090908 engo 195 | 65 0.0 207.8204347826087 196 | 66 0.0 208.6308 197 | srt file of 
transcription saved to /recordings/OpiwHxPPqRI.srt 198 | Textgrid of transcription saved to /recordings/OpiwHxPPqRI.TextGrid 199 | ``` 200 | 201 | Ewch i'r ffolder `recordings` i ganfod ffeil `.srt` (`.wav` a `.TextGrid`) ar gyfer y fideo: 202 | 203 | ```shell 204 | /home/$ cd recordings/ 205 | /home//recordings$ ls -l 206 | total 6708 207 | -rw-r--r-- 1 root root 4737 Oct 5 17:01 OpiwHxPPqRI.srt 208 | -rw-r--r-- 1 root root 17766 Oct 5 17:01 OpiwHxPPqRI.TextGrid 209 | -rw-r--r-- 1 root root 6836776 Oct 5 17:00 OpiwHxPPqRI.wav 210 | ``` 211 | 212 | ## Scriptiau Python 213 | 214 | Mae modd defnyddio'r modelau o fewn sgript Python syml fel y canlynol. 215 | 216 | **D.S.** nid yw'r enghraifft yma yn defnyddio model iaith i wella cywirdeb canlyniadau adnabod lleferydd o'r model acwstig. 217 | 218 | ### Dibyniaethau 219 | 220 | ```shell 221 | $ python3 -m pip install -r python/requirments.txt 222 | ``` 223 | 224 | ### Cod Python Enghreifftiol 225 | 226 | ```python 227 | import torch 228 | import librosa 229 | import torchaudio 230 | 231 | from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor 232 | 233 | processor = Wav2Vec2Processor.from_pretrained("techiaith/wav2vec2-xlsr-ft-cy") 234 | model = Wav2Vec2ForCTC.from_pretrained("techiaith/wav2vec2-xlsr-ft-cy") 235 | 236 | audio, rate = librosa.load(, sr=16000) 237 | 238 | inputs = processor(audio, sampling_rate=16_000, return_tensors="pt", padding=True) 239 | 240 | with torch.no_grad(): 241 | logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits 242 | 243 | # greedy decoding 244 | predicted_ids = torch.argmax(logits, dim=-1) 245 | 246 | print("Prediction:", processor.batch_decode(predicted_ids)) 247 | 248 | ``` 249 | 250 | ## Rhybudd 251 | 252 | Sylwch na fydd canlyniadau trawsgrifio bob amser yn hollol gywir. Mae'r gwaith o fesur a gwella cywirdeb y modelau yn waith parhaus. Gweler ein [canlyniadau gwerthuso ar gyfer modelau amrywiol](../train/fine-tune/README.md#evaluation) 253 | 254 | Yn y cyfamser, os hoffech weld y modelau yn gwella, yna recordiwch ychydig o frawddegau Cymraeg ar wefan Mozilla Common Voice ( https://voice.mozilla.org/cy ), fel bod bydd gennym fwy o ddata hyfforddi. Neu defnyddiwch ein ap cynorthwyydd - Macsen ( http://techiaith.cymru/macsen ) -------------------------------------------------------------------------------- /inference/README_en.md: -------------------------------------------------------------------------------- 1 | # How to Use the wav2vec2 Welsh Language Speech Recognition Models. 2 | 3 | [**(cliciwch yma os hoffwch ddarllen y README Cymraeg)**](README.md) 4 | 5 | ## Background 6 | 7 | There are several methods for using the speech recognition models from this project on your own system, including 8 | 9 | - from your own computer's command line 10 | - within your own Python code 11 | - from an API on your own local server - see [server/README.md](server/README.md) 12 | 13 | ## Install 14 | 15 | ``` 16 | $ git clone https://github.com/techiaith/docker-wav2vec2-cy 17 | $ cd docker-wav2vec2-xlsr-cy/inference 18 | $ make 19 | ``` 20 | 21 | The build process fetches speech recognition models that have been pretrained by Bangor University's Language Technologies Unit. 22 | 23 | ## How to Use 24 | 25 | Get started by using.. 26 | 27 | ``` 28 | $ make run 29 | ``` 30 | 31 | A new command line prompt will appear where you can use the scripts `decode.py` or `transcribe.py` scripts to convert speech audio files into text. For example... 
32 | 33 | ```shell 34 | root@a20d8f23cb0f:/wav2vec2# python3 decode.py --wav speech.wav 35 | Downloading kenlm.tar.gz version 21.08 36 | kenlm.tar.gz: 455MB [00:40, 11.2MB/s] 37 | Extracting... 38 | Initialising processor... 39 | Downloading: 100%|████████████████████████████████████████████████████████████████████████████████| 214/214 [00:00<00:00, 129kB/s] 40 | Downloading: 100%|████████████████████████████████████████████████████████████████████████████████| 437/437 [00:00<00:00, 284kB/s] 41 | Downloading: 100%|████████████████████████████████████████████████████████████████████████████████| 181/181 [00:00<00:00, 123kB/s] 42 | Downloading: 100%|█████████████████████████████████████████████████████████████████████████████| 85.0/85.0 [00:00<00:00, 58.5kB/s] 43 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 44 | Initialising wav2vec ctc model... 45 | Downloading: 100%|███████████████████████████████████████████████████████████████████████████| 1.85k/1.85k [00:00<00:00, 1.29MB/s] 46 | Downloading: 100%|███████████████████████████████████████████████████████████████████████████| 1.26G/1.26G [01:50<00:00, 11.5MB/s] 47 | Initialising vocab... 48 | Initialising ctc with lm decoder... 49 | mae ganddynt ddau o blant mab a merch 0.6109205020920503 5.169916317991632 [{'word': 'mae', 'start': 0.6109205020920503, 'end': 0.7715899581589959}, {'word': 'ganddynt', 'start': 0.8117573221757324, 'end': 1.4544351464435148}, {'word': 'ddau', 'start': 1.6151046025104603, 'end': 2.077029288702929}, {'word': 'o', 'start': 2.137280334728034, 'end': 2.17744769874477}, {'word': 'blant', 'start': 2.2376987447698746, 'end': 2.820125523012553}, {'word': 'mab', 'start': 3.1816317991631804, 'end': 3.623472803347281}, {'word': 'a', 'start': 3.9648953974895402, 'end': 4.18581589958159}, {'word': 'merch', 'start': 4.246066945606695, 'end': 5.169916317991632}] 50 | ``` 51 | 52 | Use the `yt.sh` script to create locally subtitles for any YouTube video. For example for the video https://www.youtube.com/watch?v=OpiwHxPPqRI, the script creates a `.TextGrid` and `.srt` files in the `/recordings` folder. 
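The full console output of such a run is reproduced below. As an optional follow-on step (not shown in that log), the bundled `python/split_audio.py` script can cut the downloaded `.wav` into one clip per `.srt` subtitle and build a tab-separated index of the clips, which is handy if you want to turn transcribed videos into a training corpus. A minimal sketch, assuming it is run inside the inference container (where `/wav2vec2` is on `PYTHONPATH` and `pydub`, `pandas` and `srt` are installed); the output paths are purely illustrative:

```python
from split_audio import split_from_srt

# Illustrative paths: point these at the .wav/.srt pair produced by yt.sh
# and at wherever you want the clips and their index file to be written.
split_from_srt(
    wav_file_path="/recordings/OpiwHxPPqRI.wav",
    srt_file_path="/recordings/OpiwHxPPqRI.srt",
    destination_dir="/recordings/clips",
    csv_file_path="/recordings/clips.csv",
)
```

Because `split_from_srt` appends to the CSV file (writing the header only when the file is still empty), clips from several videos can be accumulated into a single index.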
53 | 54 | 55 | ```shell 56 | root@413c6994d668:/wav2vec2# ./yt.sh OpiwHxPPqRI 57 | 58 | + youtube-dl --extract-audio --audio-format mp3 'https://www.youtube.com/watch?v=OpiwHxPPqRI' 59 | [youtube] OpiwHxPPqRI: Downloading webpage 60 | [youtube] OpiwHxPPqRI: Downloading MPD manifest 61 | [download] Destination: Cynnyrch Uned Technolegau Iaith Prifysgol Bangor 2020-OpiwHxPPqRI.webm 62 | [download] 100% of 3.51MiB in 00:00 63 | [ffmpeg] Destination: Cynnyrch Uned Technolegau Iaith Prifysgol Bangor 2020-OpiwHxPPqRI.mp3 64 | Deleting original file Cynnyrch Uned Technolegau Iaith Prifysgol Bangor 2020-OpiwHxPPqRI.webm (pass -k to keep) 65 | + ffmpeg -i 'Cynnyrch Uned Technolegau Iaith Prifysgol Bangor 2020-OpiwHxPPqRI.mp3' -vn -acodec pcm_s16le -ar 16000 -ac 1 /recordings/OpiwHxPPqRI.wav 66 | ffmpeg version 4.2.4-1ubuntu0.1 Copyright (c) 2000-2020 the FFmpeg developers 67 | built with gcc 9 (Ubuntu 9.3.0-10ubuntu2) 68 | configuration: --prefix=/usr --extra-version=1ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-nvenc --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared 69 | libavutil 56. 31.100 / 56. 31.100 70 | libavcodec 58. 54.100 / 58. 54.100 71 | libavformat 58. 29.100 / 58. 29.100 72 | libavdevice 58. 8.100 / 58. 8.100 73 | libavfilter 7. 57.100 / 7. 57.100 74 | libavresample 4. 0. 0 / 4. 0. 0 75 | libswscale 5. 5.100 / 5. 5.100 76 | libswresample 3. 5.100 / 3. 5.100 77 | libpostproc 55. 5.100 / 55. 5.100 78 | Input #0, mp3, from 'Cynnyrch Uned Technolegau Iaith Prifysgol Bangor 2020-OpiwHxPPqRI.mp3': 79 | Metadata: 80 | encoder : Lavf58.29.100 81 | Duration: 00:03:33.67, start: 0.023021, bitrate: 101 kb/s 82 | Stream #0:0: Audio: mp3, 48000 Hz, stereo, fltp, 100 kb/s 83 | Metadata: 84 | encoder : Lavc58.54 85 | File '/recordings/OpiwHxPPqRI.wav' already exists. Overwrite ? [y/N] y 86 | Stream mapping: 87 | Stream #0:0 -> #0:0 (mp3 (mp3float) -> pcm_s16le (native)) 88 | Press [q] to stop, [?] 
for help 89 | Output #0, wav, to '/recordings/OpiwHxPPqRI.wav': 90 | Metadata: 91 | ISFT : Lavf58.29.100 92 | Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 16000 Hz, mono, s16, 256 kb/s 93 | Metadata: 94 | encoder : Lavc58.54.100 pcm_s16le 95 | size= 6677kB time=00:03:33.64 bitrate= 256.0kbits/s speed= 288x 96 | video:0kB audio:6676kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.001141% 97 | + rm 'Cynnyrch Uned Technolegau Iaith Prifysgol Bangor 2020-OpiwHxPPqRI.mp3' 98 | + python3 transcriber.py -w /recordings/OpiwHxPPqRI.wav 99 | Model file /models/techiaith/wav2vec2-xlsr-ft-cy/21.08/kenlm.tar.gz already downloaded. 100 | Initialising processor... 101 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 102 | Initialising wav2vec ctc model... 103 | Initialising vocab... 104 | Initialising ctc with lm decoder... 105 | srt file of transcription saved to /recordings/OpiwHxPPqRI.srt 106 | Textgrid of transcription saved to /recordings/OpiwHxPPqRI.TextGrid 107 | root@413c6994d668:/wav2vec2# 108 | ``` 109 | 110 | ## Python Scripts 111 | 112 | The models can be used within a simple Python script like the following. 113 | 114 | **D.S.** this example does not use a language model to improve the accuracy of speech recognition results. 115 | 116 | ### Dependencies 117 | 118 | ```shell 119 | $ python3 -m pip install -r python/requirments.txt 120 | ``` 121 | 122 | ### Example Python code 123 | 124 | ```python 125 | import torch 126 | import librosa 127 | import torchaudio 128 | 129 | from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor 130 | 131 | processor = Wav2Vec2Processor.from_pretrained("techiaith/wav2vec2-xlsr-ft-cy") 132 | model = Wav2Vec2ForCTC.from_pretrained("techiaith/wav2vec2-xlsr-ft-cy") 133 | 134 | audio, rate = librosa.load(, sr=16000) 135 | 136 | inputs = processor(audio, sampling_rate=16_000, return_tensors="pt", padding=True) 137 | 138 | with torch.no_grad(): 139 | logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits 140 | 141 | # greedy decoding 142 | predicted_ids = torch.argmax(logits, dim=-1) 143 | 144 | print("Prediction:", processor.batch_decode(predicted_ids)) 145 | 146 | ``` 147 | 148 | 149 | ## Warning 150 | 151 | Please note that transcription results will not always be totally correct. The work on measuring and improving the models' capabilities is ongoing work. See our [evaluation results for various models](../train/fine-tune/README_en.md#evaluation) 152 | 153 | In the meantime, if you would like to see the models improve, then record some Welsh 154 | sentences on the Mozilla Common Voice (https://voice.mozilla.org/cy) website, so that 155 | we will have more training data. 
Or use our voice assistant app - Macsen (http://techiaith.cymru/macsen) 156 | -------------------------------------------------------------------------------- /inference/python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/techiaith/docker-huggingface-stt-cy/36224bbb731037b701dbf516165f7dec15bce262/inference/python/__init__.py -------------------------------------------------------------------------------- /inference/python/build_youtube_playlists_corpus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf /data/welsh-youtube-corpus/dataset 4 | rm -rf /data/welsh-youtube-corpus/transcription_files 5 | 6 | # techiaith pretrained 7 | ./ytpl_build.sh PLkNuNDk4pYpWpE1n-iIAPRtQNolth4pVq 8 | 9 | # Y Babell Lên 10 | ./ytpl_build.sh PLNbPx7YxCU13Z6E_ZoFNZOFpYAPP7xMcw 11 | 12 | # Hansh - Straeon Stiwdio 13 | ./ytpl_build.sh PLMUgzTukecfPY5rtNt0t8JuYZEJPPrhTi 14 | 15 | # Hansh - Her Ffilm Fer 16 | ./ytpl_build.sh PLMUgzTukecfNycx2qxPJ0nsf5elNR--zk 17 | 18 | # Hansh - Mae Bywydau Duon o Bwys 19 | ./ytpl_build.sh PLMUgzTukecfPQannXfBqovMTFqJGtQRST 20 | -------------------------------------------------------------------------------- /inference/python/decode.py: -------------------------------------------------------------------------------- 1 | import os 2 | from speech_to_text import SpeechToText 3 | 4 | from argparse import ArgumentParser, RawTextHelpFormatter 5 | 6 | DESCRIPTION = """ 7 | 8 | Prifysgol Bangor University 9 | 10 | """ 11 | 12 | # 13 | def main(audio_file, **args): 14 | stt=SpeechToText() 15 | for transcript, time_start, time_end, alignments in stt.transcribe(audio_file): 16 | print (transcript, time_start, time_end, alignments) 17 | 18 | 19 | if __name__ == "__main__": 20 | 21 | parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) 22 | 23 | parser.add_argument("--wav", dest="audio_file", required=True) 24 | parser.set_defaults(func=main) 25 | args = parser.parse_args() 26 | args.func(**vars(args)) 27 | -------------------------------------------------------------------------------- /inference/python/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import tarfile 4 | import urllib.request 5 | from urllib.parse import urlparse 6 | 7 | from pathlib import Path 8 | from tqdm import tqdm 9 | 10 | from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor 11 | from ctcdecode import CTCBeamDecoder 12 | 13 | 14 | class DownloadProgressBar(tqdm): 15 | def update_to(self, b=1, bsize=1, tsize=None): 16 | if tsize is not None: 17 | self.total = tsize 18 | self.update(b * bsize - self.n) 19 | 20 | 21 | def create(model_path, revision): 22 | 23 | cache_dir=model_path 24 | 25 | # initialize acoustic model... 26 | # 27 | if Path(model_path).is_dir(): 28 | # from a local directory containing our own trained model 29 | print("Initiaising wav2vec2 model from local directory: %s" % model_path) 30 | processor = Wav2Vec2Processor.from_pretrained(model_path) 31 | model = Wav2Vec2ForCTC.from_pretrained(model_path) 32 | else: 33 | # from the HuggingFace models repository. 
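        # `model_path` here is a Hugging Face Hub identifier (e.g. the Makefile default
        # "techiaith/wav2vec2-xlsr-ft-cy"); the processor and weights are cached locally
        # under /models/<model name> and pinned to `revision` (the MODEL_VERSION argument).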
34 | print("Initialising wav2vec2 model \"%s\" from HuggingFace model repository" % model_path) 35 | cache_dir = os.path.join('/models', model_path) 36 | processor = Wav2Vec2Processor.from_pretrained(model_path, cache_dir=cache_dir, revision=revision) 37 | model = Wav2Vec2ForCTC.from_pretrained(model_path, cache_dir=cache_dir, revision=revision) 38 | 39 | # initialize language model... 40 | # 41 | targz_file_path=os.path.join(cache_dir, "kenlm.tar.gz") 42 | if not Path(targz_file_path).is_file(): 43 | print ("Downloading kenlm language model version {}".format(revision)) 44 | try: 45 | file_url = os.path.join("https://huggingface.co", model_path, "resolve", revision, 'kenlm.tar.gz') 46 | download(file_url, os.path.join(cache_dir, targz_file_path)) 47 | except Exception as e: 48 | print (e) 49 | 50 | if not Path(os.path.join(cache_dir, "config_ctc.yaml")).is_file(): 51 | if Path(targz_file_path).is_file(): 52 | extract(targz_file_path) 53 | 54 | if Path(os.path.join(cache_dir, "config_ctc.yaml")).is_file(): 55 | with open(os.path.join(cache_dir, "config_ctc.yaml"), 'r') as config_file: 56 | ctc_lm_params=yaml.load(config_file, Loader=yaml.FullLoader) 57 | 58 | # 59 | vocab=processor.tokenizer.convert_ids_to_tokens(range(0, processor.tokenizer.vocab_size)) 60 | space_ix = vocab.index('|') 61 | vocab[space_ix]=' ' 62 | 63 | ctcdecoder = CTCBeamDecoder(vocab, 64 | model_path='', 65 | alpha=0, 66 | beta=0, 67 | cutoff_top_n=40, 68 | cutoff_prob=1.0, 69 | beam_width=100, 70 | num_processes=4, 71 | blank_id=processor.tokenizer.pad_token_id, 72 | log_probs_input=True 73 | ) 74 | 75 | kenlm_ctcdecoder=None 76 | if Path(os.path.join(cache_dir, "lm.binary")).is_file(): 77 | if ctc_lm_params: 78 | print ("Initializing KenLM language model...") 79 | kenlm_ctcdecoder = CTCBeamDecoder(vocab, 80 | model_path=os.path.join(cache_dir, "lm.binary"), 81 | alpha=ctc_lm_params['alpha'], 82 | beta=ctc_lm_params['beta'], 83 | cutoff_top_n=40, 84 | cutoff_prob=1.0, 85 | beam_width=100, 86 | num_processes=4, 87 | blank_id=processor.tokenizer.pad_token_id, 88 | log_probs_input=True 89 | ) 90 | 91 | return processor, model, vocab, ctcdecoder, kenlm_ctcdecoder 92 | 93 | 94 | def download(file_url, output_file_path): 95 | with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=file_url.split('/')[-1]) as t: 96 | urllib.request.urlretrieve(file_url, filename=output_file_path, reporthook=t.update_to) 97 | 98 | def extract(targz_file_path): 99 | # extract. 
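    # kenlm.tar.gz is unpacked into the directory it was downloaded to (the model's cache
    # directory), so create() can then pick up the extracted lm.binary and config_ctc.yaml.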
100 | if targz_file_path.endswith(".tar.gz"): 101 | print ("Extracting...") 102 | model_dir = Path(targz_file_path).parent.absolute() 103 | tar = tarfile.open(targz_file_path, "r:gz") 104 | tar.extractall(model_dir) 105 | tar.close() 106 | 107 | #Path(output_file_path).unlink() 108 | 109 | 110 | -------------------------------------------------------------------------------- /inference/python/requirements.txt: -------------------------------------------------------------------------------- 1 | packaging 2 | numpy 3 | wave 4 | jiwer 5 | webrtcvad 6 | transformers==4.24.0 7 | tqdm 8 | pyyaml==5.4.1 9 | torchaudio==0.7.2 10 | torch >= 1.5 11 | datasets >= 1.18.0 12 | librosa 13 | srt 14 | praatio<5 15 | pydub 16 | pandas 17 | python_speech_features 18 | scipy 19 | ffmpeg-normalize 20 | -------------------------------------------------------------------------------- /inference/python/speech_to_text.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import yaml 5 | import torch 6 | import librosa 7 | 8 | from datetime import timedelta 9 | 10 | import numpy as np 11 | import json 12 | 13 | import models 14 | 15 | from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor 16 | from vadSplit import VadSplit 17 | 18 | from ctcdecode import CTCBeamDecoder 19 | 20 | 21 | DESCRIPTION = """ 22 | 23 | (c) Prifysgol Bangor University 24 | 25 | """ 26 | 27 | class SpeechToText: 28 | 29 | def __init__(self, models_root_dir='', wav2vec2_model_path='', version='', language_model_path='', split_only=False): 30 | 31 | self.split_only = split_only 32 | 33 | print ("split_only: ", split_only) 34 | 35 | if self.split_only==False: 36 | if len(wav2vec2_model_path)==0: 37 | self.wav2vec2_model_path = os.environ["WAV2VEC2_MODEL_NAME"] 38 | 39 | # @todo - improve. 
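            # When no explicit model path, language model path or version is passed in,
            # fall back to the WAV2VEC2_MODEL_NAME and MODEL_VERSION environment variables
            # baked into the Docker image at build time (see the Makefile/Dockerfile).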
40 | if len(language_model_path)==0: 41 | self.language_model_path = os.path.join(os.environ["WAV2VEC2_MODEL_NAME"], "kenlm") 42 | 43 | # 44 | if len(version)==0: 45 | self.version=os.environ["MODEL_VERSION"] 46 | 47 | # 48 | self.processor, self.model, self.vocab, self.ctcdecoder, self.kenlm_ctcdecoder = models.create(self.wav2vec2_model_path, self.version) 49 | 50 | self.device = "cpu" 51 | if torch.cuda.is_available(): 52 | self.device="cuda" 53 | self.model.cuda() 54 | 55 | print ("wav2vec loaded to device %s" % self.device) 56 | 57 | 58 | 59 | def get_model_name(self): 60 | return self.wav2vec2_model_path 61 | 62 | def get_language_model(self): 63 | return self.language_model_path 64 | 65 | def get_model_version(self): 66 | return self.version 67 | 68 | def get_device(self): 69 | return self.device 70 | 71 | 72 | def split_frames(self, frames, aggressiveness): 73 | 74 | for audio_segment in split(frames, aggressiveness=aggressiveness): 75 | 76 | audio_segment_buffer, audio_segment_time_start, audio_segment_time_end = audio_segment 77 | 78 | audio_segment_time_start = audio_segment_time_start / 1000 79 | audio_segment_time_end = audio_segment_time_end / 1000 80 | audio_segment_duration = audio_segment_time_end - audio_segment_time_start 81 | 82 | #print (audio_segment_duration, len(audio_segment_buffer), aggressiveness) 83 | 84 | if audio_segment_duration>100.0 and aggressiveness<4: 85 | self.split_frames(audio_segment_buffer, aggressiveness+1) 86 | else: 87 | yield audio_segment_buffer, audio_segment_time_start, audio_segment_time_end 88 | 89 | 90 | def transcribe(self, wav_file_path, max_segment_length=15, max_segment_words=14, withlm=False): 91 | 92 | print ("Processing: %s" % wav_file_path) 93 | 94 | wav_audio, rate = librosa.load(wav_file_path, sr=16000) 95 | 96 | time_start = 0.0 97 | time_end = librosa.get_duration(y=wav_audio,sr=rate) 98 | 99 | vadSplitter = VadSplit() 100 | for audio_segment in vadSplitter.split_audio_file(wav_file_path): 101 | audio_segment_buffer, audio_segment_time_start, audio_segment_time_end = audio_segment 102 | if self.split_only==True: 103 | yield "", audio_segment_time_start, audio_segment_time_end, None 104 | else: 105 | # Run stt on the chunk that just completed VAD 106 | audio = np.frombuffer(audio_segment_buffer, dtype=np.int16) 107 | 108 | # timings into seconds. 109 | audio_segment_time_start = audio_segment_time_start / 1000 110 | audio_segment_time_end = audio_segment_time_end / 1000 111 | 112 | # Run stt on the chunk that just completed VAD 113 | audio = np.frombuffer(audio_segment_buffer, dtype=np.int16) 114 | 115 | features = self.processor(audio, sampling_rate=16_000, return_tensors="pt", padding=True) 116 | with torch.no_grad(): 117 | logits = self.model(features.input_values.to(self.device, dtype=torch.float32), attention_mask=features.attention_mask.to(self.device)).logits 118 | 119 | transcription, alignment, timesteps = self.ctc_decode(logits, withlm) 120 | 121 | # 122 | # for when voice activated splitting fails to ensure no split/segment is more than 123 | # a set number of seconds, we can use the alignments from the CTC results to 124 | # produce segments with a given time and/or word count. 
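                # The defaults in the method signature (max_segment_length=15 seconds,
                # max_segment_words=14 words) control when a new segment is started.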
125 | # 126 | timestep_length = (audio_segment_time_end - audio_segment_time_start) / timesteps 127 | for a in alignment: 128 | a[1] = ((a[1] * timestep_length) + audio_segment_time_start) 129 | aligned_words = self.aligned_words(alignment, timestep_length) 130 | 131 | if len(aligned_words) > 0: 132 | for transcription, seg_start, seg_end, seg_alignment in self.segment(aligned_words, max_segment_length, max_segment_words): 133 | yield transcription, seg_start, seg_end, seg_alignment 134 | 135 | 136 | def ctc_decode(self, logits, withlm): 137 | 138 | if withlm: 139 | beam_results, beam_scores, timesteps, out_lens = self.kenlm_ctcdecoder.decode(logits) 140 | else: 141 | beam_results, beam_scores, timesteps, out_lens = self.ctcdecoder.decode(logits) 142 | 143 | # beam_results - Shape: BATCHSIZE x N_BEAMS X N_TIMESTEPS A batch containing the series 144 | # of characters (these are ints, you still need to decode them back to your text) representing 145 | # results from a given beam search. Note that the beams are almost always shorter than the 146 | # total number of timesteps, and the additional data is non-sensical, so to see the top beam 147 | # (as int labels) from the first item in the batch, you need to run beam_results[0][0][:out_len[0][0]]. 148 | beam_string = "".join(self.vocab[n] for n in beam_results[0][0][:out_lens[0][0]]) 149 | 150 | # beam_scores - Shape: BATCHSIZE x N_BEAMS A batch with the approximate CTC score of each beam 151 | # If this is true, you can get the model's confidence that the beam is correct with 152 | # p=1/np.exp(beam_score). 153 | score = 0.0 #float(beam_scores[0][0].item()) / 100 154 | 155 | # timesteps : BATCHSIZE x N_BEAMS : the timestep at which the nth output character has peak probability. 156 | # Can be used as alignment between the audio and the transcript. 
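        # Each entry below pairs a decoded character with its integer frame index; the
        # caller (transcribe) converts those frame indices to seconds by scaling with
        # timestep_length and offsetting by the segment's start time.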
157 | alignment = list() 158 | for i in range(0, out_lens[0][0]): 159 | alignment.append([beam_string[i], int(timesteps[0][0][i])] ) 160 | 161 | return beam_string, alignment, int(beam_results.shape[2]) 162 | 163 | 164 | def greedy_decode(self, logits): 165 | predicted_ids = torch.argmax(logits, dim=-1) 166 | return self.processor.batch_decode(predicted_ids)[0] 167 | 168 | 169 | def aligned_words(self, char_alignments, timestep_length): 170 | word_alignments = list() 171 | 172 | word = '' 173 | w_start = 0.0 174 | w_end = 0.0 175 | 176 | for c, ts in char_alignments: 177 | if c != " ": 178 | if len(word)==0: 179 | word = c 180 | w_start=ts 181 | w_end=ts+timestep_length 182 | else: 183 | word = word + c 184 | w_end = ts 185 | else: 186 | word_alignments.append({'word':word, 'start':w_start, 'end':ts}) 187 | word='' 188 | 189 | if (len(word)>0): 190 | word_alignments.append({'word':word, 'start':w_start, 'end':w_end}) 191 | 192 | return word_alignments 193 | 194 | 195 | def segment(self, word_alignments, segment_max_length, segment_max_words): 196 | 197 | segment_alignments = list() 198 | 199 | segment_text = '' 200 | segment_start = word_alignments[0]['start'] 201 | segment_end = word_alignments[0]['end'] 202 | 203 | for a in word_alignments: 204 | 205 | # if the segment has reached a maximum number of words 206 | if len(segment_alignments)>segment_max_words: 207 | yield segment_text, segment_start, segment_end, segment_alignments 208 | segment_text = a['word'] 209 | segment_start = a['start'] 210 | segment_end = a['end'] 211 | segment_alignments = list() 212 | segment_alignments.append(a) 213 | 214 | elif a['start'] > segment_start + segment_max_length: 215 | yield segment_text, segment_start, segment_end, segment_alignments 216 | segment_text = a['word'] 217 | segment_start = a['start'] 218 | segment_end = a['end'] 219 | segment_alignments = list() 220 | segment_alignments.append(a) 221 | 222 | else: 223 | segment_text = segment_text + ' ' + a['word'] 224 | segment_text = segment_text.strip() 225 | segment_end = a['end'] 226 | segment_alignments.append(a) 227 | 228 | yield segment_text, segment_start, segment_end, segment_alignments 229 | 230 | -------------------------------------------------------------------------------- /inference/python/split_audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import srt 3 | import pandas 4 | import time 5 | #import hashlib 6 | import uuid 7 | 8 | from pathlib import Path 9 | from pydub import AudioSegment 10 | 11 | from argparse import ArgumentParser, RawTextHelpFormatter 12 | 13 | DESCRIPTION = """ 14 | 15 | © Prifysgol Bangor University 16 | 17 | """ 18 | 19 | 20 | def split_from_srt(wav_file_path, srt_file_path, destination_dir, csv_file_path, **kwargs): 21 | # produce audio clips of each 'segment' in srt file and an accompanying txt file 22 | # that contains the transcription 23 | # build also a csv file in a format compatible with Common Voice (/DeepSpeech/coqui) 24 | # 25 | Path(destination_dir).mkdir(parents=True, exist_ok=True) 26 | Path(csv_file_path).parent.mkdir(parents=True, exist_ok=True) 27 | 28 | wav_filename = Path(wav_file_path).name 29 | 30 | df = pandas.DataFrame(columns=['wav_filename', 'wav_filesize', 'transcript', 'duration', 'parent_wavfile_name']) 31 | 32 | wav_audio_file = AudioSegment.from_wav(wav_file_path) 33 | srt_segments = list(srt.parse(open(srt_file_path, 'r', encoding='utf-8').read())) 34 | 35 | i=0 36 | for s in srt_segments: 37 | 38 | transcript = 
s.content.lower() 39 | 40 | # pydub does things in milliseconds 41 | start = float(s.start.total_seconds()) * 1000 42 | end = float(s.end.total_seconds()) * 1000 43 | 44 | wav_segment = wav_audio_file[start:end] 45 | 46 | wav_segment_file_name = uuid.uuid4().hex + ".wav" 47 | wav_segment_file_path = os.path.join(destination_dir, wav_segment_file_name) 48 | wav_segment.export(wav_segment_file_path, format="wav") 49 | 50 | #txt_segment_file_path = wav_segment_file_path.replace(".wav", ".txt") 51 | #with open(txt_segment_file_path, 'w', encoding='utf-8') as txt_segment_file: 52 | # txt_segment_file.write(transcript) 53 | 54 | duration = end - start; 55 | df.loc[i] = [wav_segment_file_name, os.path.getsize(wav_segment_file_path), transcript, duration, wav_filename] 56 | i+=1 57 | 58 | print ("Adding segments to csv file {}".format(csv_file_path)) 59 | with open(csv_file_path, 'a') as csvfile: 60 | df.to_csv(csvfile, encoding='utf-8', mode='a', index=False, header=csvfile.tell()==0, sep="\t") 61 | 62 | 63 | if __name__ == "__main__": 64 | 65 | parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) 66 | 67 | parser.add_argument("--wavfile", dest="wav_file_path", required=True) 68 | parser.add_argument("--srtfile", dest="srt_file_path", required=True) 69 | parser.add_argument("--destdir", dest="destination_dir", required=True) 70 | parser.add_argument("--csvfile", dest="csv_file_path", required=True) 71 | 72 | parser.set_defaults(func=split_from_srt) 73 | args = parser.parse_args() 74 | args.func(**vars(args)) 75 | -------------------------------------------------------------------------------- /inference/python/text_preprocess.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | chars_to_ignore_regex = '[\,\?\.\!\u00AC\;\:\"\\%\\\]' 4 | 5 | # Preprocessing the datasets. 
6 | # We need to read the aduio files as arrays 7 | def cleanup(sentence): 8 | sentence = re.sub(chars_to_ignore_regex, '', sentence).lower() 9 | sentence = sentence.replace('\u2013',"-") 10 | sentence = sentence.replace('\u2014',"-") 11 | sentence = sentence.replace('\u2018',"'") 12 | sentence = sentence.replace('\u201C',"") 13 | sentence = sentence.replace('\u201D',"") 14 | sentence = sentence.replace('ñ',"n") 15 | sentence = sentence.replace(" - "," ") 16 | return sentence 17 | -------------------------------------------------------------------------------- /inference/python/transcriber.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | import glob 3 | import srt 4 | from praatio import tgio 5 | from datetime import timedelta 6 | 7 | import models 8 | 9 | from speech_to_text import SpeechToText 10 | 11 | from argparse import ArgumentParser, RawTextHelpFormatter 12 | 13 | DESCRIPTION = """ 14 | 15 | (c) Prifysgol Bangor University 16 | 17 | """ 18 | 19 | WORDS_PER_SEGMENT=14 20 | SECONDS_PER_SEGMENT=8 21 | 22 | 23 | class Transcriber: 24 | 25 | def __init__(self): 26 | pass 27 | 28 | 29 | def to_srt_file(self, transcriptions, srt_file_path, split_only): 30 | i = 0 31 | srt_segments = [] 32 | for transcript, time_start, time_end, alignments in transcriptions: 33 | i = i+1 34 | start_delta = timedelta(seconds=time_start) 35 | end_delta = timedelta(seconds=time_end) 36 | if split_only==True: 37 | srt_segments.append(srt.Subtitle(i, start=start_delta, end=end_delta, content="...")) 38 | else: 39 | srt_segments.append(srt.Subtitle(i, start=start_delta, end=end_delta, content=transcript)) 40 | 41 | str_string = srt.compose(srt_segments) 42 | if len(srt_file_path) > 0: 43 | with open(srt_file_path, 'w', encoding='utf-8') as srt_file: 44 | srt_file.write(str_string) 45 | print("srt file of transcription saved to %s" % srt_file_path) 46 | else: 47 | print (str_string) 48 | 49 | 50 | 51 | 52 | def to_textgrid_file(self, transcriptions, wav_file_path, textgrid_file_path): 53 | textgrid_entries_list = [] 54 | for transcript, time_start, time_end, alignments in transcriptions: 55 | textgrid_entry = (time_start, time_end, transcript) 56 | textgrid_entries_list.append(textgrid_entry) 57 | 58 | utterance_tier = tgio.IntervalTier('utterance', textgrid_entries_list, 0, pairedWav=wav_file_path) 59 | tg = tgio.Textgrid() 60 | tg.addTier(utterance_tier) 61 | if len(textgrid_file_path) > 0: 62 | tg.save(textgrid_file_path, useShortForm=False, outputFormat='textgrid') 63 | print("Textgrid of transcription saved to %s" % textgrid_file_path) 64 | else: 65 | print (utterance_tier) 66 | 67 | 68 | 69 | def main(wav_file, output_srt_file, with_lm, split_only, **args): 70 | 71 | output_textgrid_file=output_srt_file.replace(".srt", ".TextGrid") 72 | 73 | stt=SpeechToText(split_only=split_only) 74 | t=Transcriber() 75 | 76 | # 77 | i = 0 78 | transcriptions = list() 79 | for transcript, time_start, time_end, alignments in stt.transcribe(wav_file, withlm=with_lm): 80 | i += 1 81 | print (i, time_start, time_end, transcript) 82 | transcriptions.append((transcript, time_start, time_end, alignments)) 83 | 84 | t.to_srt_file(transcriptions, output_srt_file, split_only) 85 | t.to_textgrid_file(transcriptions, wav_file, output_textgrid_file) 86 | 87 | 88 | # 89 | if __name__ == "__main__": 90 | parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) 91 | 92 | parser.add_argument("-w", dest="wav_file", required=True) 93 | 
parser.add_argument("-s", dest="output_srt_file", default='') 94 | parser.add_argument("-l", dest="with_lm", action='store_true') 95 | parser.add_argument("--split_only", dest="split_only", action='store_true') 96 | 97 | parser.set_defaults(func=main) 98 | args = parser.parse_args() 99 | args.func(**vars(args)) 100 | -------------------------------------------------------------------------------- /inference/python/utils_srt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import srt 3 | import json 4 | 5 | from datetime import timedelta 6 | 7 | def to_srt_from_jsonstring(transcriptions, srt_file_path=''): 8 | i = 0 9 | 10 | srt_segments = [] 11 | json_segments = json.loads(transcriptions) 12 | 13 | for transcript in json_segments["transcript"]: 14 | i = i+1 15 | time_start = transcript["start"] 16 | time_end = transcript["end"] 17 | text = transcript["text"] 18 | start_delta = timedelta(seconds=time_start) 19 | end_delta = timedelta(seconds=time_end) 20 | srt_segments.append(srt.Subtitle(i, start=start_delta, end=end_delta, content=text)) 21 | 22 | srt_string = srt.compose(srt_segments) 23 | if len(srt_file_path) > 0: 24 | with open(srt_file_path, 'w', encoding='utf-8') as srt_file: 25 | srt_file.write(srt_string) 26 | print("srt file of transcription saved to %s" % srt_file_path) 27 | 28 | return srt_string 29 | 30 | -------------------------------------------------------------------------------- /inference/python/vadSplit.py: -------------------------------------------------------------------------------- 1 | import os 2 | import wave 3 | import collections 4 | import contextlib 5 | 6 | from pydub import AudioSegment 7 | 8 | AudioFormat = collections.namedtuple('AudioFormat', 'rate channels width') 9 | 10 | DEFAULT_RATE = 16000 11 | DEFAULT_CHANNELS = 1 12 | DEFAULT_WIDTH = 2 13 | DEFAULT_FORMAT = AudioFormat(DEFAULT_RATE, DEFAULT_CHANNELS, DEFAULT_WIDTH) 14 | 15 | 16 | class AudioFile: 17 | 18 | def __init__(self, audio_path, as_path=False, audio_format=DEFAULT_FORMAT): 19 | self.audio_path = audio_path 20 | self.audio_format = audio_format 21 | self.as_path = as_path 22 | self.open_file = None 23 | self.tmp_file_path = None 24 | 25 | def __enter__(self): 26 | if self.audio_path.endswith('.wav'): 27 | self.open_file = wave.open(self.audio_path, 'r') 28 | if read_audio_format_from_wav_file(self.open_file) == self.audio_format: 29 | if self.as_path: 30 | self.open_file.close() 31 | return self.audio_path 32 | return self.open_file 33 | self.open_file.close() 34 | 35 | def __exit__(self, *args): 36 | if not self.as_path: 37 | self.open_file.close() 38 | if self.tmp_file_path is not None: 39 | os.remove(self.tmp_file_path) 40 | 41 | 42 | def read_audio_format_from_wav_file(wav_file): 43 | return AudioFormat(wav_file.getframerate(), wav_file.getnchannels(), wav_file.getsampwidth()) 44 | 45 | 46 | def get_num_samples(pcm_buffer_size, audio_format=DEFAULT_FORMAT): 47 | return pcm_buffer_size // (audio_format.channels * audio_format.width) 48 | 49 | 50 | def get_pcm_duration(pcm_buffer_size, audio_format=DEFAULT_FORMAT): 51 | """Calculates duration in seconds of a binary PCM buffer (typically read from a WAV file)""" 52 | return get_num_samples(pcm_buffer_size, audio_format) / audio_format.rate 53 | 54 | 55 | def read_frames(wav_file, frame_duration_ms=30, yield_remainder=False): 56 | audio_format = read_audio_format_from_wav_file(wav_file) 57 | frame_size = int(audio_format.rate * (frame_duration_ms / 1000.0)) 58 | while True: 59 | try: 60 
| data = wav_file.readframes(frame_size) 61 | if not yield_remainder and get_pcm_duration(len(data), audio_format) * 1000 < frame_duration_ms: 62 | break 63 | yield data 64 | except EOFError: 65 | break 66 | 67 | 68 | def read_frames_from_file(audio_path, audio_format=DEFAULT_FORMAT, frame_duration_ms=30, yield_remainder=False): 69 | with AudioFile(audio_path, audio_format=audio_format) as wav_file: 70 | for frame in read_frames(wav_file, frame_duration_ms=frame_duration_ms, yield_remainder=yield_remainder): 71 | yield frame 72 | 73 | 74 | def split(audio_frames, 75 | audio_format=DEFAULT_FORMAT, 76 | num_padding_frames=10, 77 | threshold=0.5, 78 | aggressiveness=3): 79 | 80 | from webrtcvad import Vad # pylint: disable=import-outside-toplevel 81 | 82 | if audio_format.channels != 1: 83 | raise ValueError('VAD-splitting requires mono samples') 84 | 85 | if audio_format.width != 2: 86 | raise ValueError('VAD-splitting requires 16 bit samples') 87 | 88 | if audio_format.rate not in [8000, 16000, 32000, 48000]: 89 | raise ValueError( 90 | 'VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000') 91 | 92 | if aggressiveness not in [0, 1, 2, 3]: 93 | raise ValueError( 94 | 'VAD-splitting aggressiveness mode %s has to be one of 0, 1, 2, or 3' % aggressiveness) 95 | 96 | ring_buffer = collections.deque(maxlen=num_padding_frames) 97 | triggered = False 98 | vad = Vad(int(aggressiveness)) 99 | voiced_frames = [] 100 | frame_duration_ms = 0 101 | frame_index = 0 102 | for frame_index, frame in enumerate(audio_frames): 103 | frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000 104 | if int(frame_duration_ms) not in [10, 20, 30]: 105 | raise ValueError( 106 | 'VAD-splitting only supported for frame durations 10, 20, or 30 ms') 107 | is_speech = vad.is_speech(frame, audio_format.rate) 108 | if not triggered: 109 | ring_buffer.append((frame, is_speech)) 110 | num_voiced = len([f for f, speech in ring_buffer if speech]) 111 | if num_voiced > threshold * ring_buffer.maxlen: 112 | triggered = True 113 | for f, s in ring_buffer: 114 | voiced_frames.append(f) 115 | ring_buffer.clear() 116 | else: 117 | voiced_frames.append(frame) 118 | ring_buffer.append((frame, is_speech)) 119 | num_unvoiced = len([f for f, speech in ring_buffer if not speech]) 120 | if num_unvoiced > threshold * ring_buffer.maxlen: 121 | triggered = False 122 | yield b''.join(voiced_frames), \ 123 | frame_duration_ms * max(0, frame_index - (len(voiced_frames)-1)), \ 124 | frame_duration_ms * frame_index 125 | ring_buffer.clear() 126 | voiced_frames = [] 127 | if len(voiced_frames) > 0: 128 | yield b''.join(voiced_frames), \ 129 | frame_duration_ms * (frame_index - (len(voiced_frames)-1)), \ 130 | frame_duration_ms * (frame_index + 1) 131 | 132 | 133 | 134 | class VadSplit: 135 | 136 | 137 | def split_audio_file(self, audio_file_path, aggressiveness=0, offset=0.0): 138 | 139 | #print ("\nVadSpliting {} with aggressiveness {} ".format(audio_file_path, aggressiveness)) 140 | 141 | frames = read_frames_from_file(audio_file_path) 142 | for audio_segment in split(frames, aggressiveness=aggressiveness): 143 | audio_segment_buffer, audio_segment_time_start, audio_segment_time_end = audio_segment 144 | 145 | root_audio_segment_time_start = audio_segment_time_start + offset 146 | root_audio_segment_time_end = audio_segment_time_end + offset 147 | 148 | audio_segment_duration = root_audio_segment_time_end - root_audio_segment_time_start 149 | 150 | # split (if possible) with a higher aggressiveness if the segment 
is longer than 15 seconds... 151 | if audio_segment_duration / 1000 > 15.0 and aggressiveness < 3: 152 | 153 | #print ("audio_segment_duration too long (s) {} {} {}, {}".format(audio_segment_time_start, audio_segment_time_end, audio_segment_duration, aggressiveness)) 154 | 155 | tmp_chunk_file_path = os.path.join("/tmp", "chunk_{}_{}.wav".format(round(root_audio_segment_time_start), round(root_audio_segment_time_end))) 156 | 157 | wav_audio_file_segment = AudioSegment.from_wav(audio_file_path) 158 | wav_segment = wav_audio_file_segment[audio_segment_time_start:audio_segment_time_end] 159 | wav_segment.export(tmp_chunk_file_path, format='wav') 160 | 161 | for smaller_audio_segment in self.split_audio_file(tmp_chunk_file_path, aggressiveness+1, audio_segment_time_start): 162 | smaller_audio_segment_buffer, smaller_audio_segment_time_start, smaller_audio_segment_time_end = smaller_audio_segment 163 | 164 | smaller_audio_segment_time_start += offset; 165 | smaller_audio_segment_time_end += offset; 166 | smaller_audio_segment_duration = smaller_audio_segment_time_end - smaller_audio_segment_time_start 167 | 168 | #print ("yielding smaller...", smaller_audio_segment_time_start, smaller_audio_segment_time_end, smaller_audio_segment_duration, aggressiveness) 169 | yield smaller_audio_segment_buffer, smaller_audio_segment_time_start, smaller_audio_segment_time_end 170 | else: 171 | #print ("yielding...", root_audio_segment_time_start, root_audio_segment_time_end, audio_segment_duration, aggressiveness) 172 | yield audio_segment_buffer, root_audio_segment_time_start, root_audio_segment_time_end 173 | 174 | -------------------------------------------------------------------------------- /inference/python/yt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | help() 5 | { 6 | echo 7 | echo "Trawsgrifio fideo YouTube a chreu ffeil srt er mwyn ei gywiro" 8 | echo 9 | echo "Usage: $ ./`basename $0` [OPTIONS] " 10 | echo 11 | echo "Options:" 12 | echo 13 | echo " " 14 | echo 15 | echo "Example: " 16 | echo 17 | echo "$ yt.sh 6z8klxzufx8" 18 | echo 19 | } 20 | 21 | if [ -z "$1" ]; then 22 | help 23 | exit 1 24 | fi 25 | 26 | set -x 27 | 28 | youtube-dl --extract-audio --audio-format mp3 https://www.youtube.com/watch?v=$1 29 | ffmpeg -i *.mp3 -vn -acodec pcm_s16le -ar 16000 -ac 1 /recordings/$1.wav 30 | rm *.mp3 31 | python3 transcriber.py -w /recordings/$1.wav -s /recordings/$1.srt 32 | -------------------------------------------------------------------------------- /inference/python/ytpl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## Script written by Leena Sarah Farhat and Dewi Bryn Jones 4 | 5 | help() 6 | { 7 | echo 8 | echo "Trawsgrifio fideo YouTube playlist a chreu ffeil srt er mwyn ei gywiro" 9 | echo 10 | echo "Transcribe videos from a YouTube playlist and create SRT files for post-editing" 11 | echo 12 | echo "Usage: $ ./`basename $0` [OPTIONS] " 13 | echo 14 | echo "Options:" 15 | echo 16 | echo " " 17 | echo 18 | echo "Example: " 19 | echo 20 | echo "$ ytpl.sh PLZ8Xx5GjMhRqs1O-PuINm4gbDr1SKFSjn" 21 | echo 22 | } 23 | 24 | if [ -z "$1" ]; then 25 | help 26 | exit 1 27 | fi 28 | 29 | set -x 30 | 31 | DATA_DIR='/data/recordings/'$1 32 | 33 | # download videos... 
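# youtube-dl flags: --download-archive records completed video ids so re-runs skip them; -c resumes partial downloads, -w avoids overwriting, -i ignores per-video errors; --extract-audio/--audio-format mp3 keep only the audio track (--cookies=cookies.txt assumes a cookies.txt file sits next to the script for playlists that need a signed-in session)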
34 | youtube-dl --download-archive ${DATA_DIR}/downloaded.txt --rm-cache-dir -cwi --no-post-overwrites -o ${DATA_DIR}'/%(playlist_index)s - %(title)s.%(ext)s' --cookies=cookies.txt --extract-audio --audio-format mp3 https://www.youtube.com/playlist?list=$1 35 | 36 | set -e 37 | 38 | # convert, transcribe and save as clips... 39 | TRANSCRIPTIONS_DIR=${DATA_DIR}/transcriptions 40 | TRANSCRIPTIONS_WITH_LM_DIR=${DATA_DIR}/transcriptions_withlm 41 | 42 | mkdir -p ${TRANSCRIPTIONS_DIR} 43 | mkdir -p ${TRANSCRIPTIONS_WITH_LM_DIR} 44 | 45 | for filepath in ${DATA_DIR}/*.mp3 46 | do 47 | if [ -f "$filepath" ]; then 48 | 49 | filename_ext=${filepath##*/} 50 | filename=${filename_ext%.*} 51 | wavfile_path=$DATA_DIR/${filename}.wav 52 | 53 | ffmpeg -i "${filepath}" -vn -acodec pcm_s16le -ar 16000 -ac 1 "${wavfile_path}" 54 | 55 | echo "Transcribe with only acoustic model. Output srt file" 56 | cp -v "${wavfile_path}" ${TRANSCRIPTIONS_DIR} 57 | python3 transcriber.py -w "${wavfile_path}" -s "${TRANSCRIPTIONS_DIR}/${filename}.srt" 58 | 59 | echo "Transcribe with the help of a language model. Output srt file" 60 | cp -v "${wavfile_path}" ${TRANSCRIPTIONS_WITH_LM_DIR} 61 | python3 transcriber.py -w "${wavfile_path}" -l -s "${TRANSCRIPTIONS_WITH_LM_DIR}/${filename}.srt" 62 | 63 | rm "${wavfile_path}" 64 | fi 65 | done 66 | -------------------------------------------------------------------------------- /inference/python/ytpl_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## Script written by Leena Sarah Farhat and Dewi Bryn Jones 4 | 5 | help() 6 | { 7 | echo 8 | echo "Llwytho i lawr playlist fideo YouTube a chreu ffeil srt gyda segmentau" 9 | echo 10 | echo "Download videos from a YouTube playlist and create SRT files with segments" 11 | echo 12 | echo "Usage: $ ./`basename $0` [OPTIONS] " 13 | echo 14 | echo "Options:" 15 | echo 16 | echo " " 17 | echo 18 | echo "Example: " 19 | echo 20 | echo "$ ytpl_download.sh PLZ8Xx5GjMhRqs1O-PuINm4gbDr1SKFSjn" 21 | echo 22 | } 23 | 24 | if [ -z "$1" ]; then 25 | help 26 | exit 1 27 | fi 28 | 29 | echo 30 | echo "#### Downloading and processing audio from playlist $1 ####" 31 | echo 32 | 33 | set -x 34 | 35 | CORPUS_ROOT_DIR='/data/welsh-youtube-corpus' 36 | 37 | DOWNLOADS_DIR=${CORPUS_ROOT_DIR}/downloads/$1 38 | mkdir -p ${DOWNLOADS_DIR} 39 | 40 | # download videos... 41 | youtube-dl --download-archive ${DOWNLOADS_DIR}/downloaded.txt --rm-cache-dir -cwi --no-post-overwrites -o ${DOWNLOADS_DIR}'/%(playlist_index)s - %(title)s.%(ext)s' --cookies=cookies.txt --extract-audio --audio-format mp3 https://www.youtube.com/playlist?list=$1 42 | 43 | # convert, transcribe and save as clips... 
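# for each downloaded mp3: convert to 16kHz mono wav, run transcriber.py with --split_only to produce an srt of voice-activity segments, then cut the wav into per-segment clips with split_audio.py and append them to a Common Voice style tsv (clips.tsv)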
44 | TRANSCRIPTION_FILES_DIR=${CORPUS_ROOT_DIR}/transcription_files 45 | 46 | DATA_ROOT_DIR=${CORPUS_ROOT_DIR}/dataset 47 | 48 | CLIPS_DIR=${DATA_ROOT_DIR}/clips 49 | CSV_FILE=${DATA_ROOT_DIR}/clips.tsv 50 | 51 | mkdir -p ${TRANSCRIPTION_FILES_DIR} 52 | mkdir -p ${CLIPS_DIR} 53 | 54 | for filepath in ${DOWNLOADS_DIR}/*.mp3 55 | do 56 | if [ -f "$filepath" ]; then 57 | 58 | filename_ext=${filepath##*/} 59 | filename=${filename_ext%.*} 60 | wavfile_path=$TRANSCRIPTION_FILES_DIR/${filename}.wav 61 | 62 | ffmpeg -i "${filepath}" -vn -acodec pcm_s16le -ar 16000 -ac 1 "${wavfile_path}" 63 | 64 | # 65 | python3 transcriber.py --split_only \ 66 | -w "${wavfile_path}" \ 67 | -s "${TRANSCRIPTION_FILES_DIR}/${filename}.srt" 68 | 69 | # 70 | python3 split_audio.py --wavfile "${wavfile_path}" \ 71 | --srt "${TRANSCRIPTION_FILES_DIR}/${filename}.srt" \ 72 | --destdir ${CLIPS_DIR} \ 73 | --csvfile ${CSV_FILE} 74 | 75 | fi 76 | done 77 | -------------------------------------------------------------------------------- /inference/server/.dockerignore: -------------------------------------------------------------------------------- 1 | models 2 | log 3 | recordings 4 | -------------------------------------------------------------------------------- /inference/server/.gitignore: -------------------------------------------------------------------------------- 1 | log 2 | recordings 3 | models 4 | -------------------------------------------------------------------------------- /inference/server/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Uned Technolegau Iaith / Language Technologies Unit 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /inference/server/Makefile: -------------------------------------------------------------------------------- 1 | default: build 2 | 3 | $(eval DEVICE = cpu) 4 | #$(eval DEVICE = gpu) 5 | 6 | 7 | config: 8 | $(eval DOCKER_COMPOSE = docker compose -f docker-compose.${DEVICE}.yml) 9 | 10 | up: config 11 | sudo rm -rf ${PWD}/logs 12 | mkdir -p ${PWD}/logs/ 13 | mkdir -p ${PWD}/recordings/ 14 | mkdir -p ${PWD}/redis_data/ 15 | ${DOCKER_COMPOSE} up -d --build && ${DOCKER_COMPOSE} logs -f 16 | 17 | 18 | down: config 19 | ${DOCKER_COMPOSE} down 20 | 21 | 22 | -------------------------------------------------------------------------------- /inference/server/README.md: -------------------------------------------------------------------------------- 1 | # Gweinydd Adnabod Lleferydd wav2vec2 Cymraeg 2 | 3 | [**(click here to read the README in English)**](README_en.md) 4 | 5 | ## Cefndir 6 | 7 | Os ydych yn pryderu am breifatrwydd unrhyw API adnabod lleferydd Cymraeg ar-lein, fel yr un gan https://api.techiaith.org/cy, yna mae cynnwys y ffolder hwn yn eich helpu i osod a defnyddio gosodiad lleol eich hunain. 8 | 9 | ## Gosod 10 | 11 | ``` 12 | $ git clone https://github.com/techiaith/docker-wav2vec2-cy 13 | $ cd docker-wav2vec2-cy/inference/server 14 | $ make 15 | ``` 16 | 17 | Mae'r proses gosod yn estyn modelau sydd wedi'i hyfforddi eisoes gan Uned Technolegau Iaith, Prifysgol Bangor. 18 | 19 | 20 | ## Defnyddio 21 | 22 | I redeg, does ond angen un gorchymyn ychwanegol.. 23 | 24 | ``` 25 | $ make up 26 | ``` 27 | 28 | I'w brofi'n syml, mae'n bosib gyrru'r ffeil wav enghreifftiol sydd wedi'i gynnwys o fewn y project. 29 | 30 | ``` 31 | $ curl -F 'soundfile=@speech.wav' localhost:5511/speech_to_text/ 32 | {"version": 1, "success": true, "id": "e1684eab-e472-4aaa-8c4f-66c007477a7f"} 33 | ``` 34 | (gallwch drawsgrifio eich recordiadau eich hun cyn belled â bod y ffeiliau mewn fformat wav a sianel mono 16kHz.) 35 | 36 | Byddwch yn derbyn ymateb sydd i bob pwrpas ond yn gydnabyddiaeth o'ch cais sy'n cynnwys rhif adnabod. Gan y gall gymryd peth amser i berfformio lleferydd i destun ar ffeil, gallwch wirio'r statws gyda cheisiadau ping dilynol. Os yw hyd y sain yn hirach na 5-10 eiliad, bydd yr API yn segmentu gan ddefnyddio canfod llais. 
37 | 38 | ``` 39 | $ curl localhost:5511/get_status/?stt_id=e1684eab-e472-4aaa-8c4f-66c007477a7f 40 | {"version": 1, "status": "PENDING"} 41 | ``` 42 | 43 | pan fydd y trawsgrifiad wedi'i gwblhau, bydd yr ymateb 44 | 45 | ``` 46 | $ curl localhost:5511/get_status/?stt_id=e1684eab-e472-4aaa-8c4f-66c007477a7f 47 | {"version": 1, "status": "SUCCESS"} 48 | ```` 49 | 50 | Gellir cael y canlyniadau mewn fformat srt: 51 | 52 | ``` 53 | $ curl localhost:5511/get_srt/?stt_id=e1684eab-e472-4aaa-8c4f-66c007477a7f 54 | 1 55 | 00:00:00,619 --> 00:00:05,170 56 | mae ganddynt ddau o blant mab a merch 57 | ``` 58 | 59 | json: ( sy'n cynnwys aliniadau ar lefel geiriau ) 60 | 61 | ``` 62 | $ curl localhost:5511/get_json/?stt_id=e1684eab-e472-4aaa-8c4f-66c007477a7f 63 | [{"text": "mae ganddynt ddau o blant mab a merch", "start": 0.619581589958159, "end": 5.170041841004185, "alignment": [{"word": "mae", "start": 0.619581589958159, "end": 0.7992050209205022}, {"word": "ganddynt", "start": 0.8391213389121339, "end": 1.457824267782427}, {"word": "ddau", "start": 1.6973221757322177, "end": 2.096485355648536}, {"word": "o", "start": 2.1563598326359834, "end": 2.1962761506276154}, {"word": "blant", "start": 2.256150627615063, "end": 2.7950209205020924}, {"word": "mab", "start": 3.1742259414225944, "end": 3.6133054393305444}, {"word": "a", "start": 3.9725523012552304, "end": 4.17213389121339}, {"word": "merch", "start": 4.251966527196653, "end": 5.170041841004185}]}] 64 | ``` 65 | 66 | neu csv: 67 | 68 | ``` 69 | $ curl localhost:5511/get_csv/?stt_id=e1684eab-e472-4aaa-8c4f-66c007477a7f 70 | ID Start End Transcript 71 | 1 0.619581589958159 5.170041841004185 mae ganddynt ddau o blant mab a merch 72 | ``` 73 | 74 | Mae'r gweinydd yn darparu GUI HTML syml iawn hefyd er mwyn defnyddio/cefnogi'r API uchod. Ewch i http://localhost:5511/static_html/index.hml 75 | 76 | 77 | ## Atgyweirio priflythrennau ac atalnodi 78 | 79 | Mae canlyniadau o'r model adnabod lleferydd cyson mewn llythrennau bach sydd heb unrhyw fath o atalnodi fel marc cwestiwn, atalnod llawn, cyplysnodau ayb fel yn yr enghraifft uchod - "mae ganddynt ddau o blant mab a merch". Mae modd cysylltu'r gweinydd trawsgrifio gyda weinydd atalnodi (o'n project arall at GitHub - https://github.com/techiaith/docker-atalnodi-server). 80 | 81 | Gosodwch y weinydd atalnodi a nodwch ei cyfeiriad we (fel http://localhost:5555) a nodwch yr URL o fewn ffeil newydd o'r enw `external_api_urls.py` yn y ffolder `worker`. E.e. 82 | 83 | ```python 84 | $ cat worker/external_api_urls.py 85 | PUNCTUATION_API_URL = "http://localhost:5555/restore" 86 | ```` 87 | 88 | Ac ail-gychwynwch y gweinydd drwy 89 | 90 | ```shell 91 | $ make down 92 | $ make up 93 | ``` 94 | 95 | Bydd y canlyniad i'r profi'r API gyda'r ffeil `speech.wav` yn rhoi testun sydd wedi ei briflythrennu a'i hatalnodi: 96 | 97 | ``` 98 | $ curl localhost:5511/get_srt/?stt_id=..... 99 | 1 100 | 00:00:00,619 --> 00:00:05,170 101 | Mae ganddynt ddau o blant, mab a merch. 
102 | ``` 103 | -------------------------------------------------------------------------------- /inference/server/README_en.md: -------------------------------------------------------------------------------- 1 | # Welsh language wav2vec2 Speech Recognition Server 2 | 3 | [(cliciwch yma os hoffwch ddarllen y README Cymraeg)](README.md) 4 | 5 | ## Background 6 | 7 | If you are concerned about the privacy of any online Welsh speech recognition API, such as the one provided by techiaith at https://api.techiaith.org/en, then the contents of this folder can help you install and use your own local installation. 8 | 9 | 10 | ## Install 11 | 12 | ``` 13 | $ git clone https://github.com/techiaith/docker-wav2vec2-cy 14 | $ cd docker-wav2vec2-cy/inference/server 15 | $ make 16 | ``` 17 | 18 | The build process fetches models that have been pretrained by Bangor University's Language Technologies Unit. 19 | 20 | ## Use 21 | 22 | Running the models within an API requires only one additional command: 23 | 24 | ``` 25 | $ make up 26 | ``` 27 | 28 | To verify that your API is up and running, you can make a simple test with the sample wav file provided within the folder: 29 | 30 | ``` 31 | $ curl -F 'soundfile=@speech.wav' localhost:5511/speech_to_text/ 32 | {"version": 1, "success": true, "id": "e1684eab-e472-4aaa-8c4f-66c007477a7f"} 33 | ``` 34 | 35 | (you can transcribe your own recordings as long as the files are in wav format and 16kHz mono channel.) 36 | 37 | You will receive a response that is effectively only an acknowledgement of your request containing an id. Since it can take some time to perform speech to text on a file, you can check on the status with subsequent ping requests. If the audio's duration is longer than 5-10 seconds, the API will segment using voice detection. 38 | 39 | ``` 40 | $ curl localhost:5511/get_status/?stt_id=e1684eab-e472-4aaa-8c4f-66c007477a7f 41 | {"version": 1, "status": "PENDING"} 42 | ``` 43 | 44 | When transcription is complete, the response will be: 45 | 46 | ``` 47 | $ curl localhost:5511/get_status/?stt_id=e1684eab-e472-4aaa-8c4f-66c007477a7f 48 | {"version": 1, "status": "SUCCESS"} 49 | ``` 50 | 51 | The results can be obtained in srt format: 52 | 53 | ``` 54 | $ curl localhost:5511/get_srt/?stt_id=e1684eab-e472-4aaa-8c4f-66c007477a7f 55 | 1 56 | 00:00:00,619 --> 00:00:05,170 57 | mae ganddynt ddau o blant mab a merch 58 | ``` 59 | 60 | json: (which contains alignments at word level) 61 | 62 | ``` 63 | $ curl localhost:5511/get_json/?stt_id=e1684eab-e472-4aaa-8c4f-66c007477a7f 64 | [{"text": "mae ganddynt ddau o blant mab a merch", "start": 0.619581589958159, "end": 5.170041841004185, "alignment": [{"word": "mae", "start": 0.619581589958159, "end": 0.7992050209205022}, {"word": "ganddynt", "start": 0.8391213389121339, "end": 1.457824267782427}, {"word": "ddau", "start": 1.6973221757322177, "end": 2.096485355648536}, {"word": "o", "start": 2.1563598326359834, "end": 2.1962761506276154}, {"word": "blant", "start": 2.256150627615063, "end": 2.7950209205020924}, {"word": "mab", "start": 3.1742259414225944, "end": 3.6133054393305444}, {"word": "a", "start": 3.9725523012552304, "end": 4.17213389121339}, {"word": "merch", "start": 4.251966527196653, "end": 5.170041841004185}]}] 65 | ``` 66 | 67 | or csv: 68 | 69 | ``` 70 | $ curl localhost:5511/get_csv/?stt_id=e1684eab-e472-4aaa-8c4f-66c007477a7f 71 | ID Start End Transcript 72 | 1 0.619581589958159 5.170041841004185 mae ganddynt ddau o blant mab a merch 73 | ``` 74 | 75 | 76 | The server provides a very simple HTML GUI in order to use/support
the above API. Go to http://localhost:5511/static_html/index.html 77 | 78 | ## Restore capitalization and punctuation 79 | 80 | Results from the speech recognition model are always in lowercase letters and do not contain any type of punctuation marks such as question marks, full stops, colons etc., as in the example transcription result above - "mae ganddynt ddau o blant mab a merch". Therefore, it's now possible to connect the transcription server with a punctuation server that you may have installed from our other project on GitHub - see https://github.com/techiaith/docker-atalnodi-server. 81 | 82 | Simply install the punctuation server and enter its web address (such as http://localhost:5555/restore) into a new file named `external_api_urls.py` in the `worker` folder. E.g. 83 | 84 | ```python 85 | $ cat worker/external_api_urls.py 86 | PUNCTUATION_API_URL = "http://localhost:5555/restore" 87 | ``` 88 | 89 | Restart your speech recognition server: 90 | 91 | ```shell 92 | $ make down 93 | $ make up 94 | ``` 95 | 96 | The result of testing the API with the `speech.wav` file will this time give a transcription that is capitalized and punctuated: 97 | 98 | ``` 99 | $ curl localhost:5511/get_srt/?stt_id=..... 100 | 1 101 | 00:00:00,619 --> 00:00:05,170 102 | Mae ganddynt ddau o blant, mab a merch. 103 | ``` 104 | -------------------------------------------------------------------------------- /inference/server/app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM techiaith/wav2vec2-inference-cpu 2 | 3 | RUN mkdir -p /var/log/wav2vec2 4 | 5 | WORKDIR /wav2vec2-server 6 | 7 | COPY ./requirements.txt . 8 | RUN pip install -r requirements.txt 9 | 10 | COPY . . 11 | 12 | EXPOSE 8008 13 | 14 | CMD ["/bin/bash", "-c", "/wav2vec2-server/start.sh"] 15 | 16 | -------------------------------------------------------------------------------- /inference/server/app/cherrypy.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | 3 | [program:speech-to-text-server] 4 | command=gunicorn wsgi --config /wav2vec2-server/gunicorn.conf --chdir=/wav2vec2-server 5 | -------------------------------------------------------------------------------- /inference/server/app/gunicorn.conf: -------------------------------------------------------------------------------- 1 | import os 2 | import multiprocessing 3 | 4 | bind = "0.0.0.0:8008" 5 | 6 | pythonpath = "/wav2vec2-server" 7 | pidfile = "/tmp/gunicorn_wav2vec2.pid" 8 | errorlog = "/var/log/wav2vec2/gunicorn-error.log" 9 | chdir = "/wav2vec2-server" 10 | 11 | max_requests = 1000 12 | workers = 1 #multiprocessing.cpu_count() * 2 + 1 13 | timeout = 360 14 | daemon = True 15 | 16 | -------------------------------------------------------------------------------- /inference/server/app/requirements.txt: -------------------------------------------------------------------------------- 1 | CherryPy==18.6.1 2 | gunicorn==20.1.0 3 | supervisor==4.2.2 4 | celery==5.2.7 5 | redis==4.3.4 -------------------------------------------------------------------------------- /inference/server/app/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Starting CherryPy..."
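# supervisord (configured in cherrypy.conf) launches gunicorn with the wsgi app; the sleep below keeps the container's foreground process alive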
3 | supervisord -c /wav2vec2-server/cherrypy.conf 4 | sleep infinity 5 | -------------------------------------------------------------------------------- /inference/server/app/static_html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | wav2vec XLSR Finetuned for Welsh 8 | 13 | 14 | 15 | 16 | 17 | 18 | 20 | 21 | 23 | 24 | 27 | 28 | 29 | 30 |

Speech Recognition Server GUI

31 | 32 |
33 | 34 |
35 | 36 | 37 |
38 | 39 | 46 | 47 |
48 | 49 |
50 | 51 | 53 | 54 |
55 | 56 |
57 | 58 | 59 | 60 | 61 | 62 |
63 | 64 | 65 |
66 |
67 | 68 |
69 | 70 |
71 |
72 |
73 | 74 | 75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 | 83 | 84 |
85 | 86 |
87 |
88 | 89 | 90 |
91 |
92 |
93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /inference/server/app/static_html/js/LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Yuji Miyane 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /inference/server/app/static_html/js/WavAudioEncoder.js: -------------------------------------------------------------------------------- 1 | (function(self) { 2 | var min = Math.min, 3 | max = Math.max; 4 | 5 | var setString = function(view, offset, str) { 6 | var len = str.length; 7 | for (var i = 0; i < len; ++i) 8 | view.setUint8(offset + i, str.charCodeAt(i)); 9 | }; 10 | 11 | var Encoder = function(sampleRate, numChannels) { 12 | this.sampleRate = sampleRate; 13 | this.numChannels = numChannels; 14 | this.numSamples = 0; 15 | this.dataViews = []; 16 | }; 17 | 18 | Encoder.prototype.encode = function(buffer) { 19 | var len = buffer[0].length, 20 | nCh = this.numChannels, 21 | view = new DataView(new ArrayBuffer(len * nCh * 2)), 22 | offset = 0; 23 | for (var i = 0; i < len; ++i) 24 | for (var ch = 0; ch < nCh; ++ch) { 25 | var x = buffer[ch][i] * 0x7fff; 26 | view.setInt16(offset, x < 0 ? 
max(x, -0x8000) : min(x, 0x7fff), true); 27 | offset += 2; 28 | } 29 | this.dataViews.push(view); 30 | this.numSamples += len; 31 | }; 32 | 33 | Encoder.prototype.finish = function(mimeType, doCleanup) { 34 | var dataSize = this.numChannels * this.numSamples * 2, 35 | view = new DataView(new ArrayBuffer(44)); 36 | setString(view, 0, 'RIFF'); 37 | view.setUint32(4, 36 + dataSize, true); 38 | setString(view, 8, 'WAVE'); 39 | setString(view, 12, 'fmt '); 40 | view.setUint32(16, 16, true); 41 | view.setUint16(20, 1, true); 42 | view.setUint16(22, this.numChannels, true); 43 | view.setUint32(24, this.sampleRate, true); 44 | view.setUint32(28, this.sampleRate * 4, true); 45 | view.setUint16(32, this.numChannels * 2, true); 46 | view.setUint16(34, 16, true); 47 | setString(view, 36, 'data'); 48 | view.setUint32(40, dataSize, true); 49 | this.dataViews.unshift(view); 50 | var blob = new Blob(this.dataViews, { type: 'audio/wav' }); 51 | if(doCleanup){ 52 | this.cleanup(); 53 | } 54 | return blob; 55 | }; 56 | 57 | Encoder.prototype.cancel = Encoder.prototype.cleanup = function() { 58 | delete this.dataViews; 59 | }; 60 | 61 | self.WavAudioEncoder = Encoder; 62 | })(self); 63 | 64 | -------------------------------------------------------------------------------- /inference/server/app/static_html/js/audioRecorder.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Record audio 3 | */ 4 | (function(window){ 5 | 6 | function AudioRecorderObject(source) { 7 | var callback; 8 | var recording = false; 9 | 10 | this.context = source.context; 11 | this.node = (this.context.createScriptProcessor || 12 | this.context.createJavaScriptNode).call(this.context, 4096, 2, 2); 13 | var worker = new Worker('js/audioRecorderWorker.js'); 14 | 15 | worker.onmessage = function(e){ 16 | var blob = e.data; 17 | callback(blob); 18 | }; 19 | 20 | worker.postMessage({ 21 | command: 'init', 22 | config: { 23 | contextSampleRate: this.context.sampleRate, 24 | desiredSampleRate: 16000, 25 | } 26 | }); 27 | 28 | this.record = function(){ 29 | recording = true; 30 | }; 31 | 32 | this.stop = function(){ 33 | recording = false; 34 | }; 35 | 36 | this.clear = function(){ 37 | worker.postMessage({ command: 'clear' }); 38 | }; 39 | 40 | this.exportWAV = function(cb, doCleanup){ 41 | callback = cb; 42 | if (!callback) throw new Error('Unable to set callback function. Please check if provided.'); 43 | 44 | worker.postMessage({ 45 | command: 'exportWAV', 46 | type: 'audio/wav', 47 | doCleanup: doCleanup, 48 | }); 49 | }; 50 | 51 | this.node.onaudioprocess = function(e){ 52 | if (!recording) return; 53 | 54 | 55 | worker.postMessage({ 56 | command: 'record', 57 | buffer: [ 58 | e.inputBuffer.getChannelData(0), 59 | ] 60 | }); 61 | }; 62 | 63 | source.connect(this.node); 64 | this.node.connect(this.context.destination); //need to check if this is required. 
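// Note (assumption, not from the original authors): some browsers only fire onaudioprocess while a ScriptProcessorNode is connected to a destination, so this connection is probably required even though the node produces no audible output.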
65 | 66 | } 67 | 68 | var audioRecorder = { 69 | 70 | fromSource: function(src){ 71 | return new AudioRecorderObject(src); 72 | } 73 | }; 74 | 75 | window.audioRecorder = audioRecorder; 76 | 77 | })(window); 78 | -------------------------------------------------------------------------------- /inference/server/app/static_html/js/audioRecorderWorker.js: -------------------------------------------------------------------------------- 1 | importScripts('resampler.js'); 2 | importScripts('WavAudioEncoder.js'); 3 | 4 | var recLength = 0; 5 | var recBuffersL = []; 6 | var bits = 16; 7 | var sampleRate; 8 | var encoder; 9 | var resampler; 10 | 11 | this.onmessage = function(e){ 12 | switch(e.data.command){ 13 | case 'init': 14 | init(e.data.config); 15 | break; 16 | case 'record': 17 | record(e.data.buffer); 18 | break; 19 | case 'exportWAV': 20 | exportWAV(e.data.type, e.data.doCleanup); 21 | break; 22 | case 'clear': 23 | clear(); 24 | break; 25 | } 26 | }; 27 | 28 | function init(config){ 29 | var contextSampleRate = config.contextSampleRate; 30 | sampleRate = config.desiredSampleRate; 31 | encoder = new WavAudioEncoder(sampleRate, 1); 32 | resampler = new Resampler(contextSampleRate, sampleRate, 1, 4096); 33 | } 34 | 35 | function record(inputBuffer) { 36 | if(typeof resampler !== 'undefined'){ 37 | inputBuffer[0] = resampler.resampler(inputBuffer[0]); 38 | } 39 | encoder.encode(inputBuffer); 40 | } 41 | 42 | function exportWAV(type, doCleanup) { 43 | var audioBlob = encoder.finish(type, doCleanup); 44 | this.postMessage(audioBlob); 45 | } 46 | 47 | function clear() { 48 | encoder.cancel(); 49 | } 50 | 51 | 52 | -------------------------------------------------------------------------------- /inference/server/app/static_html/js/ready.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | var stt_id = ''; 4 | var params = (new URL(window.location)).searchParams; 5 | stt_id = params.get("stt_id"); 6 | 7 | if (stt_id == null) { 8 | $('#btnJson').hide(); 9 | $('#btnSRT').hide(); 10 | $('#btnCSV').hide(); 11 | $('#btnEditTranscription').hide(); 12 | $('#transcriptions-panel').hide(); 13 | } else{ 14 | $('#btnJson').show(); 15 | $('#btnSRT').show(); 16 | $('#btnCSV').show(); 17 | $('btnEditTranscription').show(); 18 | $('#transcriptions-panel').show(); 19 | } 20 | 21 | $('#btnStatus').hide(); 22 | $('#error-panel').hide(); 23 | $('#progress-panel').hide(); 24 | $('#status-panel').hide(); 25 | 26 | }); 27 | 28 | -------------------------------------------------------------------------------- /inference/server/app/static_html/js/resampler.js: -------------------------------------------------------------------------------- 1 | /*jslint nomen: true, indent: 2, maxerr: 3 */ 2 | /*global self, buffer */ 3 | (function (worker_instance) { 4 | "use strict"; 5 | 6 | //JavaScript Audio Resampler (c) 2011 - Grant Galitz 7 | 8 | var INCORRECT_BUFFER_LENGTH = "Buffer was of incorrect sample length."; 9 | var INCORRECT_SETTINGS = "Invalid settings specified for the resampler."; 10 | 11 | function Resampler(fromSampleRate, toSampleRate, channels, outputBufferSize, noReturn) { 12 | 13 | if (!fromSampleRate || !toSampleRate || !channels) { 14 | throw(new Error(INCORRECT_SETTINGS)); 15 | } 16 | 17 | this.fromSampleRate = fromSampleRate; 18 | this.toSampleRate = toSampleRate; 19 | this.channels = channels || 0; 20 | this.outputBufferSize = outputBufferSize; 21 | this.noReturn = !!noReturn; 22 | 23 | this.initialize(); 24 | } 25 | 26 | 
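// Note: in this app the recorder worker constructs the resampler as new Resampler(contextSampleRate, 16000, 1, 4096) (see audioRecorderWorker.js), downsampling microphone audio to the 16kHz mono format the speech-to-text models expect.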
Resampler.prototype.bypassResampler = function (buffer) { 27 | 28 | // set the buffer passed as our own, as we don't need to resample it 29 | if (this.noReturn) { 30 | this.outputBuffer = buffer; 31 | return buffer.length; 32 | } 33 | // just return the buffer passsed 34 | return buffer; 35 | }; 36 | 37 | Resampler.prototype.initialize = function () { 38 | if (this.fromSampleRate == this.toSampleRate) { 39 | 40 | // Setup resampler bypass - Resampler just returns what was passed through 41 | this.resampler = this.bypassResampler; 42 | this.ratioWeight = 1; 43 | 44 | } else { 45 | 46 | if (this.fromSampleRate < this.toSampleRate) { 47 | 48 | // Use generic linear interpolation if upsampling, 49 | // as linear interpolation produces a gradient that we want 50 | // and works fine with two input sample points per output in this case. 51 | this.linearInterpolation(); 52 | this.lastWeight = 1; 53 | 54 | } else { 55 | 56 | // Custom resampler I wrote that doesn't skip samples 57 | // like standard linear interpolation in high downsampling. 58 | // This is more accurate than linear interpolation on downsampling. 59 | this.multiTap(); 60 | this.tailExists = false; 61 | this.lastWeight = 0; 62 | } 63 | 64 | // Initialize the internal buffer: 65 | this.initializeBuffers(); 66 | this.ratioWeight = this.fromSampleRate / this.toSampleRate; 67 | } 68 | }; 69 | 70 | Resampler.prototype.bufferSlice = function (sliceAmount) { 71 | 72 | // If we're going to access the properties directly from this object: 73 | if (this.noReturn) { 74 | return sliceAmount; 75 | } 76 | 77 | //Typed array and normal array buffer section referencing: 78 | try { 79 | return this.outputBuffer.subarray(0, sliceAmount); 80 | } 81 | catch (error) { 82 | try { 83 | //Regular array pass: 84 | this.outputBuffer.length = sliceAmount; 85 | return this.outputBuffer; 86 | } 87 | catch (error) { 88 | //Nightly Firefox 4 used to have the subarray function named as slice: 89 | return this.outputBuffer.slice(0, sliceAmount); 90 | } 91 | } 92 | }; 93 | 94 | Resampler.prototype.initializeBuffers = function () { 95 | try { 96 | this.outputBuffer = new Float32Array(this.outputBufferSize); 97 | this.lastOutput = new Float32Array(this.channels); 98 | } 99 | catch (error) { 100 | this.outputBuffer = []; 101 | this.lastOutput = []; 102 | } 103 | }; 104 | 105 | Resampler.prototype.linearInterpolation = function () { 106 | this.resampler = function (buffer) { 107 | var bufferLength = buffer.length, 108 | channels = this.channels, 109 | outLength, 110 | ratioWeight, 111 | weight, 112 | firstWeight, 113 | secondWeight, 114 | sourceOffset, 115 | outputOffset, 116 | outputBuffer, 117 | channel; 118 | 119 | if ((bufferLength % channels) !== 0) { 120 | throw(new Error(INCORRECT_BUFFER_LENGTH)); 121 | } 122 | if (bufferLength <= 0) { 123 | return (this.noReturn) ? 
0 : []; 124 | } 125 | 126 | outLength = this.outputBufferSize; 127 | ratioWeight = this.ratioWeight; 128 | weight = this.lastWeight; 129 | firstWeight = 0; 130 | secondWeight = 0; 131 | sourceOffset = 0; 132 | outputOffset = 0; 133 | outputBuffer = this.outputBuffer; 134 | 135 | for (; weight < 1; weight += ratioWeight) { 136 | secondWeight = weight % 1; 137 | firstWeight = 1 - secondWeight; 138 | this.lastWeight = weight % 1; 139 | for (channel = 0; channel < this.channels; ++channel) { 140 | outputBuffer[outputOffset++] = (this.lastOutput[channel] * firstWeight) + (buffer[channel] * secondWeight); 141 | } 142 | } 143 | weight -= 1; 144 | for (bufferLength -= channels, sourceOffset = Math.floor(weight) * channels; outputOffset < outLength && sourceOffset < bufferLength;) { 145 | secondWeight = weight % 1; 146 | firstWeight = 1 - secondWeight; 147 | for (channel = 0; channel < this.channels; ++channel) { 148 | outputBuffer[outputOffset++] = (buffer[sourceOffset((channel > 0) ? (" + " + channel) : "")] * firstWeight) + (buffer[sourceOffset(channels + channel)] * secondWeight); 149 | } 150 | weight += ratioWeight; 151 | sourceOffset = Math.floor(weight) * channels; 152 | } 153 | for (channel = 0; channel < channels; ++channel) { 154 | this.lastOutput[channel] = buffer[sourceOffset++]; 155 | } 156 | return this.bufferSlice(outputOffset); 157 | }; 158 | }; 159 | 160 | Resampler.prototype.multiTap = function () { 161 | this.resampler = function (buffer) { 162 | var bufferLength = buffer.length, 163 | outLength, 164 | output_variable_list, 165 | channels = this.channels, 166 | ratioWeight, 167 | weight, 168 | channel, 169 | actualPosition, 170 | amountToNext, 171 | alreadyProcessedTail, 172 | outputBuffer, 173 | outputOffset, 174 | currentPosition; 175 | 176 | if ((bufferLength % channels) !== 0) { 177 | throw(new Error(INCORRECT_BUFFER_LENGTH)); 178 | } 179 | if (bufferLength <= 0) { 180 | return (this.noReturn) ? 0 : []; 181 | } 182 | 183 | outLength = this.outputBufferSize; 184 | output_variable_list = []; 185 | ratioWeight = this.ratioWeight; 186 | weight = 0; 187 | actualPosition = 0; 188 | amountToNext = 0; 189 | alreadyProcessedTail = !this.tailExists; 190 | this.tailExists = false; 191 | outputBuffer = this.outputBuffer; 192 | outputOffset = 0; 193 | currentPosition = 0; 194 | 195 | for (channel = 0; channel < channels; ++channel) { 196 | output_variable_list[channel] = 0; 197 | } 198 | 199 | do { 200 | if (alreadyProcessedTail) { 201 | weight = ratioWeight; 202 | for (channel = 0; channel < channels; ++channel) { 203 | output_variable_list[channel] = 0; 204 | } 205 | } else { 206 | weight = this.lastWeight; 207 | for (channel = 0; channel < channels; ++channel) { 208 | output_variable_list[channel] = this.lastOutput[channel]; 209 | } 210 | alreadyProcessedTail = true; 211 | } 212 | while (weight > 0 && actualPosition < bufferLength) { 213 | amountToNext = 1 + actualPosition - currentPosition; 214 | if (weight >= amountToNext) { 215 | for (channel = 0; channel < channels; ++channel) { 216 | output_variable_list[channel] += buffer[actualPosition++] * amountToNext; 217 | } 218 | currentPosition = actualPosition; 219 | weight -= amountToNext; 220 | } else { 221 | for (channel = 0; channel < channels; ++channel) { 222 | output_variable_list[channel] += buffer[actualPosition + ((channel > 0) ? 
(" + " + channel) : "")] * weight; 223 | } 224 | currentPosition += weight; 225 | weight = 0; 226 | break; 227 | } 228 | } 229 | 230 | if (weight === 0) { 231 | for (channel = 0; channel < channels; ++channel) { 232 | outputBuffer[outputOffset++] = output_variable_list[channel] / ratioWeight; 233 | } 234 | } else { 235 | this.lastWeight = weight; 236 | for (channel = 0; channel < channels; ++channel) { 237 | this.lastOutput[channel] = output_variable_list[channel]; 238 | } 239 | this.tailExists = true; 240 | break; 241 | } 242 | } while (actualPosition < bufferLength && outputOffset < outLength); 243 | return this.bufferSlice(outputOffset); 244 | }; 245 | }; 246 | 247 | worker_instance.Resampler = Resampler; 248 | 249 | }(self)); 250 | -------------------------------------------------------------------------------- /inference/server/app/static_html/js/script.js: -------------------------------------------------------------------------------- 1 | 2 | 'use strict' 3 | 4 | var constraints = { 5 | audio : true, 6 | }; 7 | var recorder = null; 8 | var audioStream = null; 9 | var audioData = null; 10 | var audioContext = null; 11 | var csrftoken = getCookie('csrftoken'); 12 | var socket = null; 13 | var interval; 14 | 15 | var stt_id = ''; 16 | var params = (new URL(window.location)).searchParams; 17 | stt_id = params.get("stt_id"); 18 | 19 | function getCookie(name) { 20 | var cookieValue = null; 21 | if (document.cookie && document.cookie != '') { 22 | var cookies = document.cookie.split(';'); 23 | for (var i = 0; i < cookies.length; i++) { 24 | var cookie = cookies[i].trim(); 25 | // Does this cookie string begin with the name we want? 26 | if (cookie.substring(0, name.length + 1) == (name + '=')) { 27 | cookieValue = decodeURIComponent(cookie.substring(name.length + 1)); 28 | break; 29 | } 30 | } 31 | } 32 | return cookieValue; 33 | } 34 | 35 | 36 | function protocolHandler(){ 37 | if($('#ws-radio').prop('checked')){ 38 | $('#file').prop('disabled', true); 39 | $('#submitAudio').prop('disabled', true); 40 | } else { 41 | $('#file').prop('disabled', false); 42 | $('#submitAudio').prop('disabled', false); 43 | } 44 | } 45 | 46 | 47 | function initWebSocket(){ 48 | if(!socket){ 49 | socket = new WebSocket('ws://127.0.0.1:8000/dsserver/'); 50 | 51 | socket.onopen = function(){ 52 | interval = setInterval(function(){ 53 | recorder.exportWAV(function(blob){ 54 | audioData = blob; 55 | if(socket && socket.readyState == WebSocket.OPEN){ 56 | socket.send(audioData); 57 | } 58 | }, false); 59 | }, 2000); 60 | } 61 | 62 | socket.onmessage = function(res){ 63 | $('#transcription-result').text(res.data); 64 | } 65 | 66 | socket.onerror = function(error){ 67 | alert('web socket error: ' + error); 68 | } 69 | 70 | socket.onclose = function(e){ 71 | clearInterval(interval); 72 | console.log('websocket closed'); 73 | } 74 | 75 | } 76 | } 77 | 78 | 79 | function closeWebSocket(){ 80 | if(socket && socket.readyState != WebSocket.CLOSED){ 81 | socket.close(); 82 | } 83 | socket = null; 84 | } 85 | 86 | 87 | function startRecording(){ 88 | $("#file").val(""); 89 | if (navigator.mediaDevices.getUserMedia === undefined) { 90 | displayError("This browser doesn't support getUserMedia."); 91 | } 92 | navigator.mediaDevices.getUserMedia(constraints) 93 | .then(function(stream){ 94 | audioStream = stream; 95 | if(!audioContext){ 96 | audioContext = new AudioContext(); 97 | } 98 | var source = audioContext.createMediaStreamSource(stream); 99 | recorder = audioRecorder.fromSource(source); 100 | recorder.record(); 101 | 
if($('#ws-radio').prop('checked') && !socket){ 102 | initWebSocket(); 103 | } else if(socket){ 104 | closeWebSocket(); 105 | } 106 | }) 107 | .catch(function(err){ 108 | displayError("Error occurred while getting audio stream: " + err); 109 | }) 110 | } 111 | 112 | 113 | function stopRecording(){ 114 | recorder.stop(); 115 | clearInterval(interval); 116 | recorder.exportWAV(function(blob){ 117 | audioStream.getTracks()[0].stop(); 118 | audioStream = null; 119 | audioData = blob; 120 | var url = URL.createObjectURL(blob); 121 | var mt = document.createElement('audio'); 122 | mt.controls = true; 123 | mt.src = url; 124 | $('#player')[0].innerHTML = ""; 125 | $('#player').append(mt); 126 | if(socket && socket.readyState == WebSocket.OPEN){ 127 | socket.send(audioData); 128 | closeWebSocket(); 129 | } 130 | }, true); 131 | recorder.clear(); 132 | } 133 | 134 | 135 | function submitToServer(){ 136 | 137 | if(audioData == null) { 138 | displayError("There is no audio data here!"); 139 | return; 140 | } 141 | 142 | $('#error-panel').hide(); 143 | $('#progress-panel').show(); 144 | $('#btnSRT').hide(); 145 | $('#btnJson').hide(); 146 | $('#btnCSV').hide(); 147 | $('#btnStatus').hide(); 148 | $("#btnEditTranscription").hide(); 149 | 150 | $('.progress-bar').css('width', '0%').attr('aria-valuenow', 0); 151 | 152 | var formData = new FormData(); 153 | formData.append("soundfile", audioData); 154 | $.ajax({ 155 | xhr: function() { 156 | var xhr = new XMLHttpRequest(); 157 | xhr.upload.addEventListener('progress', function(e){ 158 | if (e.lengthComputable) { 159 | var uploadPercent = e.loaded / e.total; 160 | uploadPercent = (uploadPercent * 100); 161 | console.log(uploadPercent); 162 | $('.progress-bar').width(uploadPercent + '%'); 163 | } else { 164 | console.log('not computable'); 165 | } 166 | }, false); 167 | return xhr; 168 | }, 169 | url: "../speech_to_text/", 170 | type: "POST", 171 | contentType: false, 172 | processData: false, 173 | data: formData, 174 | headers: { 175 | 'X-CSRFTOKEN': csrftoken 176 | }, 177 | success: function(response){ 178 | stt_id = response.id; 179 | $('#transcriptions-panel').show(); 180 | $('#btnStatus').show(); 181 | $('#progress-panel').hide(); 182 | $('#transcription-id').text(stt_id); 183 | }, 184 | error: function(response){ 185 | $('#transcription-id').text(response.responseText); 186 | $('#progress-panel').hide(); 187 | } 188 | }); 189 | } 190 | 191 | 192 | function submitGetStatus(){ 193 | $('#status-panel').show(); 194 | $.ajax({ 195 | url: "../get_status", 196 | type: "GET", 197 | data: { 198 | "stt_id": stt_id 199 | }, 200 | success: function(response){ 201 | var status = response.status; 202 | var currenttext = $('#status').text(); 203 | if (status == "SUCCESS"){ 204 | $('#btnSRT').show(); 205 | $('#btnJson').show(); 206 | $('#btnCSV').show(); 207 | $("#btnEditTranscription").show(); 208 | 209 | // 210 | $.ajax({ 211 | url: "../get_json", 212 | type: "GET", 213 | data: { 214 | "stt_id": stt_id 215 | }, 216 | success: function(response){ 217 | var str = JSON.stringify(response, null, 2); // spacing level = 2 218 | $('#transcription-result').text(str); 219 | }, 220 | error: function(response){ 221 | $('#transcription-result').text(response.responseText); 222 | } 223 | }); 224 | } 225 | $('#status').text(currenttext + "\n" + response.status); 226 | }, 227 | error: function(response){ 228 | $('#status-message').text(response.responseText); 229 | $('#progress-panel').hide(); 230 | } 231 | }); 232 | } 233 | 234 | function editTranscriptions(){ 235 | // 236 | 
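// fetch the word-level json for this stt_id and render it as a table, one row per segment, each with an audio element that plays just that span via ../get_audio?stt_id=...&start=...&end=...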
$.ajax({ 237 | url: "../get_json", 238 | type: "GET", 239 | data: { 240 | "stt_id": stt_id 241 | }, 242 | success: function(response){ 243 | // 244 | $('#transcription-result').empty(); 245 | 246 | // 247 | var base_audio_url = window.location.protocol + "//" + window.location.host + window.location.pathname; 248 | base_audio_url = base_audio_url.replace("index.html", "../get_audio?stt_id=" + stt_id); 249 | 250 | // 251 | var table = $(''), table_head = $(''), head_row = $(''), table_body = $(''), body_row = []; 252 | 253 | head_row.append(""); 254 | head_row.append(""); 256 | head_row.append(""); 257 | head_row.append(""); 258 | 259 | table_head.append(head_row); 260 | table.append(table_head); 261 | 262 | // 263 | $.each(response, function(i, trans_obj) { 264 | // 265 | if (trans_obj.text.length > 0) { 266 | body_row[i] = $(''); 267 | body_row[i].append(''); 268 | 269 | var audio_url = base_audio_url + "&start=" + trans_obj.start + "&end=" + trans_obj.end; 270 | var audio = $("'); 277 | body_row[i].append(''); 278 | body_row[i].append(''); 279 | } else { 280 | body_row[i] = $(''); 281 | body_row[i].append(''); 282 | body_row[i].append('
Index"); 255 | head_row.append("StartEndTranscript
' + i + '"); 272 | 273 | audio_td.append(audio); 274 | body_row[i].append(audio_td); 275 | 276 | body_row[i].append('' + trans_obj.start + '' + trans_obj.end + '' + trans_obj.text + '
' + i + ''); 283 | body_row[i].append(''); 284 | body_row[i].append(''); 285 | body_row[i].append(''); 286 | } 287 | }); 288 | 289 | // 290 | table_body.append(body_row); 291 | table.append(table_body); 292 | 293 | table.addClass('table').addClass('table-bordered'); 294 | 295 | $('#transcription-result').html(table); 296 | 297 | }, 298 | error: function(response){ 299 | $('#transcription-result').text(response.responseText); 300 | } 301 | }); 302 | } 303 | 304 | 305 | function submitGetSrt(){ 306 | var srt_url = window.location.protocol + "//" + window.location.host + window.location.pathname; 307 | srt_url = srt_url.replace("index.html", "../get_srt?stt_id=" + stt_id); 308 | window.open(srt_url, "", "_blank"); 309 | } 310 | 311 | 312 | function submitGetCsv(){ 313 | var srt_url = window.location.protocol + "//" + window.location.host + window.location.pathname; 314 | srt_url = srt_url.replace("index.html", "../get_csv?stt_id=" + stt_id); 315 | window.open(srt_url, "", "_blank"); 316 | } 317 | 318 | 319 | function submitGetJson(){ 320 | var json_url = window.location.protocol + "//" + window.location.host + window.location.pathname; 321 | json_url = json_url.replace("index.html", "../get_json?stt_id=" + stt_id); 322 | window.open(json_url, "", "_blank"); 323 | } 324 | 325 | 326 | var openFile = function(event) { 327 | var input = event.target; 328 | var isValid = checkValidity(input.files[0]); 329 | if(!isValid){ 330 | displayError("Only wav file type allowed."); 331 | return; 332 | } 333 | var url = URL.createObjectURL(input.files[0]); 334 | var mt = document.createElement('audio'); 335 | audioData = input.files[0]; 336 | mt.controls = true; 337 | mt.src = url; 338 | $('#player')[0].innerHTML = ""; 339 | $('#player').append(mt); 340 | }; 341 | 342 | function checkValidity(file){ 343 | var isValid = false; 344 | var allowedFileTypes = ['audio/x-wav', 'audio/wav', 'audio/mpeg', 'video/mp4' ]; 345 | isValid = allowedFileTypes.includes(file.type); 346 | return isValid; 347 | } 348 | 349 | function displayError(errorMsg){ 350 | $('#error-panel').addClass('alert-danger'); 351 | $('#error-message').text(errorMsg); 352 | $('#error-panel').show(); 353 | } 354 | 355 | $(window).on('load',function(){ 356 | $("#file").val(""); 357 | $("#file").change(openFile); 358 | }); 359 | 360 | -------------------------------------------------------------------------------- /inference/server/app/wsgi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import uuid 4 | import time 5 | import glob 6 | import json 7 | import cherrypy 8 | import tempfile 9 | 10 | from celery import Celery 11 | from celery.result import AsyncResult 12 | from pydub import AudioSegment 13 | 14 | from datetime import datetime 15 | from pathlib import Path 16 | from utils_srt import to_srt_from_jsonstring 17 | 18 | 19 | # wait for rabbitmq to startup 20 | time.sleep(20) 21 | 22 | STATIC_PATH=os.path.join(os.path.abspath(os.path.dirname(__file__)), 'static_html') 23 | 24 | class StaticRoot(object): pass 25 | 26 | class SpeechToTextAPI(object): 27 | 28 | 29 | def __init__(self): 30 | self.tmp_dir = '/recordings' 31 | self.tasks = dict() 32 | self.celery = Celery('postman', 33 | broker='pyamqp://user:bitnami@rabbitmq', 34 | backend='redis://redis:6379/0') 35 | 36 | 37 | @cherrypy.expose 38 | def index(self): 39 | cherrypy.log("Request index page") 40 | msg = "

wav2vec2 xlsr-ft-cy Server
\n" 41 | return msg 42 | 43 | 44 | @cherrypy.expose 45 | @cherrypy.tools.json_out() 46 | def versions(self): 47 | cherrypy.log("Request versions page") 48 | result = { 49 | 'version': 1, 50 | } 51 | return result 52 | 53 | 54 | @cherrypy.expose 55 | @cherrypy.tools.json_out() 56 | def get_status(self, stt_id, **kwargs): 57 | cherrypy.log("Request status") 58 | if stt_id in self.tasks: 59 | task_result = AsyncResult(self.tasks[stt_id]) 60 | task_status = task_result.status 61 | else: 62 | task_status = 'UNKNOWN' 63 | 64 | # 65 | result = { 66 | 'version': 1, 67 | 'status': task_status 68 | } 69 | 70 | return result 71 | 72 | 73 | @cherrypy.expose 74 | @cherrypy.tools.json_out() 75 | def get_json(self, stt_id, **kwargs): 76 | cherrypy.log("Request json file") 77 | jsonobj = '' 78 | 79 | json_file_path = Path(os.path.join(self.tmp_dir, stt_id + ".json")) 80 | if json_file_path.is_file(): 81 | with open(json_file_path, 'r', encoding='utf-8') as json_file: 82 | jsonobj = json.load(json_file) 83 | 84 | return jsonobj 85 | 86 | 87 | @cherrypy.expose 88 | def get_srt(self, stt_id, **kwargs): 89 | cherrypy.log("Request srt file") 90 | cherrypy.response.headers['Content-Type'] = 'text/plain' 91 | srt = '' 92 | 93 | srt_file_path = Path(os.path.join(self.tmp_dir, stt_id + ".srt")) 94 | if srt_file_path.is_file(): 95 | with open(srt_file_path, 'r', encoding='utf-8') as srt_file: 96 | srt = srt_file.read() 97 | 98 | return srt 99 | 100 | 101 | @cherrypy.expose 102 | def get_csv(self, stt_id, **kwargs): 103 | cherrypy.log("Request csv file") 104 | cherrypy.response.headers['Content-Type'] = 'text/csv' 105 | csv = '' 106 | 107 | csv_file_path = Path(os.path.join(self.tmp_dir, stt_id + ".csv")) 108 | if csv_file_path.is_file(): 109 | with open(csv_file_path, 'r', encoding='utf-8') as csv_file: 110 | csv = csv_file.read() 111 | 112 | return csv 113 | 114 | 115 | @cherrypy.expose 116 | def get_audio(self, stt_id, start=0, end=0, **kwargs): 117 | cherrypy.log("Request audio for stt_id %s" % stt_id) 118 | cherrypy.response.headers["Content-Type"] = "audio/wav" 119 | cherrypy.response.headers['Content-Disposition'] = 'attachment; filename="%s_%s_%s.wav"' % (stt_id, start, end) 120 | 121 | start_ts = float(start) 122 | end_ts = float(end) 123 | 124 | wav_segment = None 125 | audio_file_path = os.path.join(self.tmp_dir, stt_id + ".wav") 126 | wav_audio_file = AudioSegment.from_file(audio_file_path, "wav") 127 | if start_ts==0.0 and end_ts==0.0: 128 | wav_segment = wav_audio_file 129 | else: 130 | # pydub does things in milliseconds 131 | wav_segment = wav_audio_file[start_ts * 1000:end_ts * 1000] 132 | 133 | audio_bytes = None 134 | 135 | with tempfile.TemporaryFile() as temp_file: 136 | wav_segment.export(out_f=temp_file, format="wav") 137 | temp_file.seek(0) 138 | audio_bytes = temp_file.read() 139 | 140 | return audio_bytes 141 | 142 | 143 | @cherrypy.expose 144 | @cherrypy.tools.json_out() 145 | def speech_to_text(self, soundfile, max_segment_length=5, max_segment_words=14, **kwargs): 146 | success = True 147 | stt_id = str(uuid.uuid4()) 148 | 149 | audio_file_path = os.path.join(self.tmp_dir, stt_id) 150 | with open(audio_file_path, 'wb') as audiofile: 151 | while True: 152 | data = soundfile.file.read(8192) 153 | if not data: 154 | break 155 | audiofile.write(data) 156 | 157 | # 158 | cherrypy.log("tmp file written to %s" % audio_file_path) 159 | 160 | # 161 | cherrypy.log("sent task stt for %s" % audio_file_path) 162 | transcription_task = self.celery.send_task('speech_to_text', args=(audio_file_path, 
max_segment_length, max_segment_words)) 163 | self.tasks.setdefault(stt_id, transcription_task.task_id) 164 | 165 | # 166 | result = { 167 | 'version':1, 168 | 'success':success, 169 | 'id':stt_id, 170 | } 171 | 172 | return result 173 | 174 | 175 | cherrypy.config.update({ 176 | 'environment': 'production', 177 | 'log.screen': False, 178 | 'response.stream': True, 179 | 'log.access_file': '/var/log/wav2vec2/access.log', 180 | 'log.error_file': '/var/log/wav2vec2/error.log', 181 | }) 182 | 183 | 184 | cherrypy.tree.mount(StaticRoot(), '/static', config={ 185 | '/': { 186 | 'tools.staticdir.on': True, 187 | 'tools.staticdir.dir': STATIC_PATH, 188 | 'tools.staticdir.index': 'index.html', 189 | }, 190 | }) 191 | 192 | 193 | cherrypy.tree.mount(SpeechToTextAPI(), '/') 194 | application = cherrypy.tree 195 | 196 | -------------------------------------------------------------------------------- /inference/server/docker-compose.cpu.yml: -------------------------------------------------------------------------------- 1 | version: "3.1" 2 | 3 | services: 4 | 5 | application: 6 | image: techiaith/wav2vec2-server 7 | container_name: techiaith-wav2vec2-server 8 | restart: always 9 | build: 10 | context: ${PWD}/app 11 | dockerfile: Dockerfile 12 | ports: 13 | - "5511:8008" 14 | volumes: 15 | - ${PWD}/recordings:/recordings 16 | - ${PWD}/logs:/var/log/wav2vec2 17 | 18 | 19 | worker: 20 | image: techiaith/techiaith-wav2vec2-worker-cpu 21 | container_name: techiaith-wav2vec2-worker-cpu 22 | restart: always 23 | build: 24 | context: ${PWD}/worker 25 | dockerfile: Dockerfile 26 | volumes: 27 | - ${PWD}/recordings:/recordings 28 | - ${PWD}/logs:/var/log/wav2vec2 29 | depends_on: 30 | - rabbitmq 31 | - redis 32 | 33 | 34 | rabbitmq: 35 | image: bitnami/rabbitmq:3.9.24 36 | container_name: techiaith-wav2vec2-server-broker-rabbitmq 37 | restart: always 38 | 39 | 40 | redis: 41 | image: redis:7.0.5 42 | container_name: techiaith-wav2vec2-server-backend-redis 43 | restart: always 44 | volumes: 45 | - ${PWD}/redis_data:/data 46 | 47 | 48 | -------------------------------------------------------------------------------- /inference/server/speech.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/techiaith/docker-huggingface-stt-cy/36224bbb731037b701dbf516165f7dec15bce262/inference/server/speech.wav -------------------------------------------------------------------------------- /inference/server/worker/.gitignore: -------------------------------------------------------------------------------- 1 | external_api_urls.py 2 | -------------------------------------------------------------------------------- /inference/server/worker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM techiaith/wav2vec2-inference-cpu 2 | 3 | RUN git clone https://github.com/marian-nmt/moses-scripts.git /usr/local/bin/moses-scripts 4 | 5 | WORKDIR /wav2vec2-worker 6 | 7 | COPY ./requirements.txt . 8 | 9 | RUN pip install -r requirements.txt 10 | 11 | COPY . . 
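# The container runs a Celery worker that consumes 'speech_to_text' tasks from the
# RabbitMQ broker configured in worker.py and writes its JSON/SRT/CSV output next
# to the uploaded recording in the shared /recordings volume.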
12 | 13 | ENTRYPOINT celery -A worker worker --loglevel=info 14 | -------------------------------------------------------------------------------- /inference/server/worker/audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pydub 3 | import shlex 4 | import subprocess 5 | import fleep 6 | 7 | from pathlib import Path 8 | 9 | def prepare_audio(audio_file_path): 10 | print("Task prepare_audio for %s started" % audio_file_path) 11 | audio_file = Path(audio_file_path) 12 | 13 | with open(audio_file_path, 'rb') as audio_file: 14 | info = fleep.get(audio_file.read(128)) 15 | if len(info.extension) > 0 and info.extension[0] == "wav": 16 | wav_file_path=audio_file_path 17 | else: 18 | wav_file_path=convert_audio(audio_file_path) 19 | 20 | # 21 | wav_normalized_audio_file_path = normalize_audio(wav_file_path) 22 | print ("Completed prepared audio file : %s" % wav_normalized_audio_file_path) 23 | return wav_normalized_audio_file_path 24 | 25 | def convert_audio(audio_file_path): 26 | print ("Converting to wav") 27 | wav_file_path = Path(audio_file_path).with_suffix(".wav") 28 | convert_cmd = "ffmpeg -i {} -vn -acodec pcm_s16le -ar 16000 -ac 1 {}".format(audio_file_path, wav_file_path) 29 | subprocess.Popen(shlex.split(convert_cmd)).wait() 30 | Path(audio_file_path).unlink() 31 | return wav_file_path.as_posix() 32 | 33 | def normalize_audio(wav_file_path): 34 | print ("Normalizing loudness") 35 | 36 | wav_file_path = Path(wav_file_path) 37 | wav_normalized_file_path = Path(wav_file_path).with_suffix(".n.wav") 38 | 39 | normalize_cmd = "ffmpeg-normalize {} -ar 16000 -o {}".format(wav_file_path, wav_normalized_file_path) 40 | subprocess.Popen(shlex.split(normalize_cmd)).wait() 41 | 42 | wav_file_path.unlink() 43 | wav_file_path = wav_normalized_file_path.rename(wav_file_path.with_suffix(".wav").as_posix()) 44 | 45 | return wav_file_path.as_posix() 46 | 47 | -------------------------------------------------------------------------------- /inference/server/worker/external_api_urls.template.py: -------------------------------------------------------------------------------- 1 | PUNCTUATION_API_URL = "https://......." 
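# Copy this template to external_api_urls.py (which is excluded by .gitignore) and
# point PUNCTUATION_API_URL at a service that accepts a 'text' query parameter and
# returns JSON containing a 'restored_text' field; punctuation_client.py falls back
# to the raw transcript whenever the request fails.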
2 | -------------------------------------------------------------------------------- /inference/server/worker/persist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import srt 4 | import json 5 | 6 | from pathlib import Path 7 | from datetime import timedelta 8 | 9 | 10 | def save_as_json(audio_file_path, transcription): 11 | json_str = json.dumps(transcription) 12 | json_file_path = Path(audio_file_path).with_suffix(".json") 13 | with open(json_file_path, 'w', encoding='utf-8') as json_file: 14 | json_file.write(json_str) 15 | return json_file_path 16 | 17 | 18 | def save_as_srt(audio_file_path, transcription): 19 | i = 0 20 | 21 | srt_segments = [] 22 | srt_file_path = Path(audio_file_path).with_suffix(".srt") 23 | 24 | for transcript in transcription: 25 | i = i+1 26 | time_start = transcript["start"] 27 | time_end = transcript["end"] 28 | text = transcript["text"] 29 | start_delta = timedelta(seconds=time_start) 30 | end_delta = timedelta(seconds=time_end) 31 | srt_segments.append(srt.Subtitle(i, start=start_delta, end=end_delta, content=text)) 32 | 33 | srt_string = srt.compose(srt_segments) 34 | with open(srt_file_path, 'w', encoding='utf-8') as srt_file: 35 | srt_file.write(srt_string) 36 | 37 | print("srt file of transcription saved to %s" % srt_file_path) 38 | 39 | return srt_file_path 40 | 41 | 42 | def save_as_csv(audio_file_path, transcription): 43 | i=0 44 | rows = [] 45 | header = ["ID", "Start", "End", "Transcript"] 46 | 47 | for transcript in transcription: 48 | i = i+1 49 | 50 | time_start = transcript["start"] 51 | time_end = transcript["end"] 52 | text = transcript["text"] 53 | 54 | rows.append({'ID': i, 'Start': time_start, 'End': time_end, 'Transcript': text}) 55 | 56 | csv_file_path = Path(audio_file_path).with_suffix(".csv") 57 | with open(csv_file_path, 'w', encoding='utf-8', newline='') as csv_file: 58 | csv_writer = csv.DictWriter(csv_file, delimiter='\t', fieldnames=header, quoting=csv.QUOTE_NONE) 59 | csv_writer.writeheader() 60 | csv_writer.writerows(rows) 61 | -------------------------------------------------------------------------------- /inference/server/worker/punctuation_client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | 4 | from external_api_urls import PUNCTUATION_API_URL 5 | 6 | def restore_punctuation_and_truecase(raw_text): 7 | 8 | try: 9 | params = {'text': raw_text } 10 | response = requests.get(PUNCTUATION_API_URL, params=params, timeout=360) 11 | result = response.json() 12 | return result['restored_text'] 13 | except: 14 | print("Restore punctuation API exception") 15 | return raw_text 16 | 17 | -------------------------------------------------------------------------------- /inference/server/worker/requirements.txt: -------------------------------------------------------------------------------- 1 | celery==5.2.7 2 | redis==4.3.4 3 | fleep==1.0.1 4 | srt==3.5.2 5 | -------------------------------------------------------------------------------- /inference/server/worker/speech_to_text_task.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from celery import Celery, Task 3 | 4 | from speech_to_text import SpeechToText 5 | 6 | 7 | class SpeechToTextTask(Task): 8 | """ 9 | Abstraction of Celery's Task class to support loading ML model. 
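    The model is loaded lazily in __call__ when the first task arrives and is then
    reused for every subsequent request handled by the worker process.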
10 | """ 11 | abstract = True 12 | 13 | def __init__(self): 14 | super().__init__() 15 | self.model = None 16 | 17 | def __call__(self, *args, **kwargs): 18 | """ 19 | Load model on first call (i.e. first task processed) 20 | Avoids the need to load model on each task request 21 | """ 22 | if not self.model: 23 | print("Loading SpeechToText model") 24 | self.model = SpeechToText() 25 | 26 | return self.run(*args, **kwargs) 27 | 28 | -------------------------------------------------------------------------------- /inference/server/worker/worker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | 5 | import punctuation_client 6 | 7 | from pathlib import Path 8 | from celery import Celery 9 | 10 | from audio import prepare_audio 11 | from persist import save_as_json, save_as_srt, save_as_csv 12 | 13 | from speech_to_text_task import SpeechToTextTask 14 | 15 | # Wait for rabbitmq to be started 16 | time.sleep(15) 17 | 18 | # 19 | app = Celery( 20 | 'postman', 21 | broker='pyamqp://user:bitnami@rabbitmq', 22 | backend='redis://redis:6379/0', 23 | ) 24 | 25 | 26 | @app.task(name='speech_to_text', 27 | ignore_result=False, 28 | bind=True, 29 | base=SpeechToTextTask, 30 | serializer='json') 31 | def speech_to_text(self, audio_file_path, max_segment_length, max_segment_words): 32 | print("Task speech_to_text for %s started" % audio_file_path) 33 | audio_file_path = prepare_audio(audio_file_path) 34 | 35 | print ("Using model :", self.model.get_model_name(), self.model.get_model_version()) 36 | 37 | success = True 38 | transcripts = [] 39 | try: 40 | for transcript, time_start, time_end, alignment in self.model.transcribe(audio_file_path, max_segment_length, max_segment_words): 41 | print ("{}-{} {}".format(time_start, time_end, transcript)) 42 | 43 | transcript = punctuation_client.restore_punctuation_and_truecase(transcript) 44 | transcripts.append({'text': transcript, 'start':time_start, 'end':time_end, 'alignment':alignment}) 45 | 46 | except Exception as e: 47 | print("Error during transcribing %s" % audio_file_path) 48 | print(e) 49 | success=False 50 | else: 51 | print("Transcribing %s succesful." 
% audio_file_path) 52 | 53 | save_as_json(audio_file_path, transcripts) 54 | save_as_srt(audio_file_path, transcripts) 55 | save_as_csv(audio_file_path, transcripts) 56 | 57 | return transcripts 58 | 59 | 60 | -------------------------------------------------------------------------------- /inference/speech.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/techiaith/docker-huggingface-stt-cy/36224bbb731037b701dbf516165f7dec15bce262/inference/speech.wav -------------------------------------------------------------------------------- /train/fine-tune/.dockerignore: -------------------------------------------------------------------------------- 1 | models 2 | homedir 3 | data -------------------------------------------------------------------------------- /train/fine-tune/.gitignore: -------------------------------------------------------------------------------- 1 | homedir 2 | models 3 | data 4 | logs 5 | python/cv_version.py 6 | *.csv 7 | -------------------------------------------------------------------------------- /train/fine-tune/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.4.0-cudnn8-devel-ubuntu20.04 2 | 3 | LABEL maintainer="techiaith" 4 | LABEL repository="wav2vec2-xlsr-ft-cy" 5 | 6 | ARG DEBIAN_FRONTEND=noninteractive 7 | ENV TZ=Europe/London 8 | 9 | RUN apt update -q \ 10 | && apt install -y -qq tzdata bash build-essential git curl wget software-properties-common \ 11 | vim ca-certificates libffi-dev libssl-dev libsndfile1 libbz2-dev liblzma-dev locales \ 12 | libboost-all-dev libboost-tools-dev libboost-thread-dev cmake \ 13 | python3 python3-setuptools python3-pip cython 14 | 15 | RUN python3 -m pip install --upgrade pip 16 | 17 | # Set the locale 18 | RUN locale-gen cy_GB.UTF-8 19 | ENV LANG cy_GB.UTF-8 20 | ENV LANGUAGE cy_GB:en 21 | ENV LC_ALL cy_GB.UTF-8 22 | 23 | RUN mkdir -p /xlsr-ft-train 24 | WORKDIR /xlsr-ft-train 25 | 26 | RUN pip3 install torch==1.9.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html 27 | COPY python/requirements.txt /xlsr-ft-train/ 28 | RUN pip3 install -r requirements.txt 29 | 30 | # Install KenLM 31 | RUN git clone https://github.com/kpu/kenlm.git /usr/local/src/kenlm 32 | WORKDIR /usr/local/src/kenlm 33 | RUN mkdir -p build && cd build && cmake .. && make -j 4 34 | 35 | ENV PATH="/usr/local/src/kenlm/build/bin:/usr/local/src/kenlm/scripts:${PATH}" 36 | 37 | RUN git clone --recursive https://github.com/parlance/ctcdecode.git /tmp/ctcdecode \ 38 | && cd /tmp/ctcdecode && pip3 install . 39 | 40 | COPY python /xlsr-ft-train 41 | WORKDIR /xlsr-ft-train 42 | 43 | -------------------------------------------------------------------------------- /train/fine-tune/Makefile: -------------------------------------------------------------------------------- 1 | default: build 2 | 3 | 4 | build: 5 | if [ ! -d "data/corpws-profi-adnabod-lleferydd" ]; then \ 6 | mkdir -p data; \ 7 | cd data && git clone https://git.techiaith.bangor.ac.uk/data-porth-technolegau-iaith/corpws-profi-adnabod-lleferydd.git; \ 8 | fi 9 | docker build --rm -t techiaith/wav2vec2-xlsr-ft-train-${USER} . 
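# 'make run' starts a detached TensorBoard container on port 6006 that reads ./logs,
# then opens an interactive GPU training container with homedir/, data/, logs/,
# models/ and python/ mounted inside it.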
10 | 11 | 12 | run: 13 | mkdir -p homedir/datasets 14 | mkdir -p logs 15 | docker run --name techiaith-wav2vec2-xlsr-ft-train-${USER}-tensorboard \ 16 | --restart=always \ 17 | -v ${PWD}/logs/:/logs \ 18 | -d -p 6006:6006 \ 19 | tensorflow/tensorflow \ 20 | tensorboard --bind_all --logdir /logs 21 | 22 | docker run --gpus all --name techiaith-wav2vec2-xlsr-ft-train-${USER} \ 23 | -it \ 24 | --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ 25 | -v ${PWD}/homedir/:/root \ 26 | -v ${PWD}/data/:/data \ 27 | -v ${PWD}/logs/:/logs \ 28 | -v ${PWD}/models/:/models \ 29 | -v ${PWD}/python/:/xlsr-ft-train \ 30 | techiaith/wav2vec2-xlsr-ft-train-${USER} bash 31 | 32 | 33 | stop: 34 | -docker stop techiaith-wav2vec2-xlsr-ft-train-${USER}-tensorboard 35 | -docker stop techiaith-wav2vec2-xlsr-ft-train-${USER} 36 | -docker rm techiaith-wav2vec2-xlsr-ft-train-${USER}-tensorboard 37 | -docker rm techiaith-wav2vec2-xlsr-ft-train-${USER} 38 | 39 | clean: stop 40 | -docker rmi techiaith/wav2vec2-xlsr-ft-train-${USER} 41 | 42 | purge: clean 43 | sudo rm -rf homedir 44 | sudo rm -rf logs 45 | 46 | tensorboard: 47 | python3 -m tensorboard.main serve --bind_all --logdir=logs/ 48 | 49 | -------------------------------------------------------------------------------- /train/fine-tune/README.md: -------------------------------------------------------------------------------- 1 | # Mireinio modelau wav2vec2 ar gyfer y Gymraeg 2 | 3 | [**(click here to read the README in English)**](README_en.md) 4 | 5 | Mae sgriptiau i fireinio amrywiaeth o fodelau sydd wedi eu rhag-hyfforddi ac ar gael o hwb modelau HuggingFace. 6 | 7 | - `run_xlsr-large-53.sh` - i fireinio modelau cyntaf wav2vec2 amlieithog gan Facebook : [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) yn ogystal a chreu ac optimeiddio model iaith KenLM 8 | - `run_xls-r-1b.sh` - i fireinio modelau wav2vec2 amlieithog mwy : [facebook/wav2vec2-xls-r-1b](https://huggingface.co/facebook/wav2vec2-xls-r-1b) yn ogystal a chreu ac optimeiddio model iaith KenLM 9 | - `run_en_cy.sh` - mireinio [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) ar gyfer model adnabod lleferydd acwstig yn unig ond yn ddwyieithog. 10 | - `run_base-cy.sh` - mireinio model arbrofol sydd wedi ei rhag-hyfforddi gan uned techiaith gyda rhagor o sain lleferydd Cymraeg yn ogystal a chreu ac optimeiddio model iaith KenLM ategol. 11 | 12 | Datblygwyd y sgriptiau cyntaf ar gyfer y Gymraeg yn ystod [wythnos fireinio i ieithoedd llai eu hadnoddau gan HuggingFace](https://discuss.huggingface.co/t/open-to-the-community-xlsr-wav2vec2-fine-tuning-week-for-low-resource-languages/4467). 13 | 14 | Adeiladwyd a ddefnyddiwyd is-setiau ein hunain o ddata Common Voice Cymraeg a Saesneg gan Mozilla ar gyfer mireinio'r modelau mwyaf effeithiol. Gweler https://github.com/techiaith/docker-commonvoice-custom-splits-builder. 15 | 16 | Mae'r project yn cynnwys sgriptiau i hyfforddi modelau iaith KenLM gyda thestun o [gorpws broject OSCAR ar wefan Datasets HuggingFace](https://huggingface.co/datasets/oscar) a'u optimeiddio o fewn ddull dadgodio CTC. (rydym wedi integreiddio [Parlance CTC Decode](https://github.com/parlance/ctcdecode) gyda HuggingFace i alluogi wella canlyniadau gyd chymorth modelau iaith) 17 | 18 | 19 | # Sut i'w ddefnyddio... 20 | 21 | `$ make` 22 | 23 | `$ make run ` 24 | 25 | Er mwyn llwytho i lawr data Common Voice, mae angen i chi greu ffeil Python i gynnwys yr URL. 
Mae enghraifft/templed i'w weld yn y ffeil [`cv_version.template.py`](cv_version.template.py) . Nodwch enw'r ffeil (heb yr estyniad `.py`) o fewn y sgript hoffwch ei ddefnyddio i hyfforddi. e.e. o fewn y sgript mireinio wav2vec2-large-xlsr-53 gan Facebook, `run_xlsr-large-53.sh`, newidiwch yr enw ar gyfer `CV_CONFIG_FILE`. 26 | 27 | (disgwylir eich bod wedi llwytho'r set(iau) data Common Voice o'u wefan ac wedi lleoli'r ffeil `.tar.gz` ar weinydd `http` lleol eich hunain) 28 | 29 | Yna i ddechrau hyfforddi, dewisich unrhyw un o'r pedwar sgript "run" 30 | 31 | `root@d702159be82f:/xlsr-ft-train# ./run_xlsr-large-53.sh` 32 | 33 | Yn dibynnu ar y cerdyn graffics, bydd yn gymryd rhai oriau i hyfforddi. 34 | 35 | 36 | # Gwerthuso 37 | 38 | Bydd y sgriptiau yn werthuso'r modelau yn ystod hyfforddi. Dyma'r canlyniadau ar ol i pob cam gwblhau 39 | 40 | |Language | Training Data | Test Data | Model | Decode | WER | CER | 41 | |---|---|---|---|---|---|---| 42 | | CY |cv11 training+validation (s=max) | cv11 test | wav2vec2-large-xlsr-53 | greedy | **6.04%** | **1.88%** | 43 | | CY |cv11 training+validation (s=max) | cv11 test | wav2vec2-large-xlsr-53 | ctc | **6.01%** | **1.88%** | 44 | | CY |cv11 training+validation (s=max) | cv11 test | wav2vec2-large-xlsr-53 | ctc with lm (kenlm, n=5) | **4.05%** | **1.49%** | 45 | | CY |cv11 training+validation (s=max) | bangor custom | wav2vec2-large-xlsr-53 | greedy | 37.46% | 14.11% | 46 | | CY |cv11 training+validation (s=max) | bangor custom | wav2vec2-large-xlsr-53 | ctc | 37.18% | 14.08% | 47 | | CY |cv11 training+validation (s=max) | bangor custom | wav2vec2-large-xlsr-53 | ctc with lm (kenlm, n=5) | 31.51% | 14.84% | 48 | | CY+EN |cv11 training+validation cy+en (s=max) | cv11 test cy+en | wav2vec2-large-xlsr-53 | greedy | 17.07% | 7.32% | 49 | | CY+EN |cv11 training+validation cy+en (s=max) | cv11 test cy| wav2vec2-large-xlsr-53 | greedy | 7.13% | 2.2% | 50 | | CY+EN |cv11 training+validation cy+en (s=max) | cv11 test en| wav2vec2-large-xlsr-53 | greedy | 27.54% | 11.6% | 51 | | CY+EN |cv11 training+validation (s=max) | bangor custom | wav2vec2-large-xlsr-53 | greedy | 40.76% | 15.42% | 52 | | CY+EN |cv11 training+validation (s=max) | bangor custom | wav2vec2-large-xlsr-53 | ctc | 40.47.18% | 15.34% | 53 | | CY |cv11 training+validation+custom other (s=3) | cv11 test | wav2vec2-xls-r-1b | greedy | 15.82% | 4.53% | 54 | | CY |cv11 training+validation+custom other (s=3) | cv11 test | wav2vec2-xls-r-1b | ctc | 15.72% | 4.50% | 55 | | CY |cv11 training+validation+custom other (s=3) | cv11 test | wav2vec2-xls-r-1b | ctc with lm (kenlm, n=5) | 10.17% | 3.42% | 56 | | CY |cv11 training+validation+custom other (s=3) | cv11 test | wav2vec2-large-xlsr-53 | greedy | 16.73% | 4.63% | 57 | | CY |cv11 training+validation+custom other (s=3) | cv11 test | wav2vec2-large-xlsr-53 | ctc | 16.62% | 4.61% | 58 | | CY |cv11 training+validation+custom other (s=3) | cv11 test | wav2vec2-large-xlsr-53 | ctc with lm (kenlm, n=5) | 10.45% | 3.42% | 59 | | CY |cv11 training+validation (s=3) | cv11 test | wav2vec2-large-xlsr-53 | greedy | 17.42% | 4.83% | 60 | | CY |cv11 training+validation (s=3) | cv11 test | wav2vec2-large-xlsr-53 | ctc | 17.29% | 4.80% | 61 | | CY |cv11 training+validation (s=3) | cv11 test | wav2vec2-large-xlsr-53 | ctc with lm (kenlm, n=5) | 10.82% | 3.58% | 62 | 63 | Allwedd: 64 | 65 | - "custom other" : is-set ychwanegol sydd wedi ei greu gyda recordiadau o frawddegau unigryw o fewn 'other.tsv' yn Common Voice. h.y. 
heb i neb wrando eto a'u cadarnhau 66 | - "s=3" : yr uchafswm ar y nifer o recordiadau mesul frawddeg unigryw o fewn Common Voice 67 | - "s=max" : uchafswm eitha uchel, fel caniateir pob un recordiad o frawddeg yn y is-set. 68 | - "bangor custom" : set profi trawsgrifiadau gan Prifysgol Bangor: https://git.techiaith.bangor.ac.uk/data-porth-technolegau-iaith/corpws-profi-adnabod-lleferydd/-/tree/master/data/trawsgrifio -------------------------------------------------------------------------------- /train/fine-tune/README_en.md: -------------------------------------------------------------------------------- 1 | # Fine tune wav2vec2 models for Welsh 2 | 3 | [**(cliciwch yma os hoffwch ddarllen y README Cymraeg)**](README.md) 4 | 5 | These are scripts to fine-tune a variety of pre-trained models that are available from HuggingFace's model hub. 6 | 7 | - `run_xlsr-large-53.sh` - to fine-tune the first multilingual wav2vec2 models from Facebook : [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) as well as create and optimize supporting KenLM language models 8 | - `run_xls-r-1b.sh` - to fine-tune more multilingual wav2vec2 models - [facebook/wav2vec2-xls-r-1b](https://huggingface.co/facebook/wav2vec2-xls-r-1b) as well as create and optimize supporting KenLM language models 9 | - `run_en_cy.sh` - fine-tune [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) for bilingual acoustic speech recognition models. 10 | - `run_base-cy.sh` - fine-tuning an experimental model pre-trained by techiaith with more Welsh speech audio as well as create and optimize supporting KenLM language models 11 | 12 | The first scripts for Welsh were developed during [a fine-tuning week for low resource languages by HuggingFace](https://discuss.huggingface.co/t/open-to-the-community-xlsr-wav2vec2-fine-tuning-week-for-low-resource-languages/4467). 13 | 14 | Our own subsets of Welsh and English Common Voice data were built and used by Mozilla for refining the most effective models. See https://github.com/techiaith/docker-commonvoice-custom-splits-builder. 15 | 16 | The project includes scripts to train KenLM language models with text from the [OSCAR project corpus on the HuggingFace Datasets website](https://huggingface.co/datasets/oscar) and optimize alpha and beta CTC decoding hyperparameters. (We have integrated [Parlance CTC Decode](https://github.com/parlance/ctcdecode) to improve recognition results with the support of a language model) 17 | 18 | 19 | # How to use... 20 | 21 | `$ make` 22 | 23 | `$ make run ` 24 | ` 25 | In order to download and import the Common Voice data, you need to create a Python file to contain a URL to its `.tar.gz` file. An example/template can be found in the file [`cv_version.template.py`](cv_version.template.py). Enter the name of the file (without the `.py` extension) into the variable `CV_CONFIG_FILE` inside the script you would like to use for training. 26 | 27 | (it is expected that you have downloaded the Common Voice dataset(s) from the official website and re-located the `.tar.gz` file to your own local `http` server) 28 | 29 | Then to start training, choose any of the four "run" scripts. E.g. 30 | 31 | `root@d702159be82f:/xlsr-ft-train# ./run_xlsr-large-53.sh` 32 | 33 | Depending on the graphics card, it will take a few hours to train. 34 | 35 | 36 | # Evaluation 37 | 38 | The scripts will evaluate the models during the training. 
Here are the results after each step is complete: 39 | 40 | |Language | Training Data | Test Data | Model | Decode | WER | CER | 41 | |---|---|---|---|---|---|---| 42 | | CY |cv11 training+validation (s=max) | cv11 test | wav2vec2-large-xlsr-53 | greedy | **6.04%** | **1.88%** | 43 | | CY |cv11 training+validation (s=max) | cv11 test | wav2vec2-large-xlsr-53 | ctc | **6.01%** | **1.88%** | 44 | | CY |cv11 training+validation (s=max) | cv11 test | wav2vec2-large-xlsr-53 | ctc with lm (kenlm, n=5) | **4.05%** | **1.49%** | 45 | | CY |cv11 training+validation (s=max) | bangor custom | wav2vec2-large-xlsr-53 | greedy | 37.46% | 14.11% | 46 | | CY |cv11 training+validation (s=max) | bangor custom | wav2vec2-large-xlsr-53 | ctc | 37.18% | 14.08% | 47 | | CY |cv11 training+validation (s=max) | bangor custom | wav2vec2-large-xlsr-53 | ctc with lm (kenlm, n=5) | 31.51% | 14.84% | 48 | | CY+EN |cv11 training+validation cy+en (s=max) | cv11 test cy+en | wav2vec2-large-xlsr-53 | greedy | 17.07% | 7.32% | 49 | | CY+EN |cv11 training+validation cy+en (s=max) | cv11 test cy| wav2vec2-large-xlsr-53 | greedy | 7.13% | 2.2% | 50 | | CY+EN |cv11 training+validation cy+en (s=max) | cv11 test en| wav2vec2-large-xlsr-53 | greedy | 27.54% | 11.6% | 51 | | CY+EN |cv11 training+validation (s=max) | bangor custom | wav2vec2-large-xlsr-53 | greedy | 40.76% | 15.42% | 52 | | CY+EN |cv11 training+validation (s=max) | bangor custom | wav2vec2-large-xlsr-53 | ctc | 40.47.18% | 15.34% | 53 | | CY |cv11 training+validation+custom other (s=3) | cv11 test | wav2vec2-xls-r-1b | greedy | 15.82% | 4.53% | 54 | | CY |cv11 training+validation+custom other (s=3) | cv11 test | wav2vec2-xls-r-1b | ctc | 15.72% | 4.50% | 55 | | CY |cv11 training+validation+custom other (s=3) | cv11 test | wav2vec2-xls-r-1b | ctc with lm (kenlm, n=5) | 10.17% | 3.42% | 56 | | CY |cv11 training+validation+custom other (s=3) | cv11 test | wav2vec2-large-xlsr-53 | greedy | 16.73% | 4.63% | 57 | | CY |cv11 training+validation+custom other (s=3) | cv11 test | wav2vec2-large-xlsr-53 | ctc | 16.62% | 4.61% | 58 | | CY |cv11 training+validation+custom other (s=3) | cv11 test | wav2vec2-large-xlsr-53 | ctc with lm (kenlm, n=5) | 10.45% | 3.42% | 59 | | CY |cv11 training+validation (s=3) | cv11 test | wav2vec2-large-xlsr-53 | greedy | 17.42% | 4.83% | 60 | | CY |cv11 training+validation (s=3) | cv11 test | wav2vec2-large-xlsr-53 | ctc | 17.29% | 4.80% | 61 | | CY |cv11 training+validation (s=3) | cv11 test | wav2vec2-large-xlsr-53 | ctc with lm (kenlm, n=5) | 10.82% | 3.58% | 62 | 63 | Key: 64 | 65 | - "custom other" : an additional subset created from recordings of unique sentences in 'other.tsv' in Common Voice. i.e. recordings that no-one has yet listened to and validated. 
66 | - "s=3" : the maximum number of recordings per unique sentence within Common Voice 67 | - "s=max" : quite a high maximum, so that every single recording of a sentence is allowed in the permitted 68 | - "bangor custom" : a transcriptions test set constructed at Bangor University: https://git.techiaith.bangor.ac.uk/data-porth-technolegau-iaith/corpws-profi-adnabod-lleferydd/-/tree/master/data/trawsgrifio 69 | -------------------------------------------------------------------------------- /train/fine-tune/python/.gitignore: -------------------------------------------------------------------------------- 1 | cv_version_* 2 | -------------------------------------------------------------------------------- /train/fine-tune/python/cv_version.template.py: -------------------------------------------------------------------------------- 1 | CV_VERSION="10.0.0" 2 | CV_ID="cv-corpus-10.0-2022-07-04" 3 | CV_DATA_URL="http://......../cv-corpus-10.0-2022-07-04-cy.tar.gz" 4 | -------------------------------------------------------------------------------- /train/fine-tune/python/decode.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import librosa 4 | import yaml 5 | import numpy as np 6 | 7 | from ctcdecode import CTCBeamDecoder 8 | 9 | from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor 10 | from argparse import ArgumentParser, RawTextHelpFormatter 11 | 12 | DESCRIPTION = """ 13 | 14 | Prifysgol Bangor University 15 | 16 | """ 17 | 18 | def greedy_decode(logits): 19 | predicted_ids=torch.argmax(logits, dim=-1) 20 | return processor.batch_decode(predicted_ids)[0] 21 | 22 | def lm_decode(ctc_matrix): 23 | 24 | kenlm_model_name= "kenlm-cy" 25 | kenlm_model_dir=os.path.join(models_root_dir, kenlm_model_name) 26 | with open(os.path.join(kenlm_model_dir, "config_ctc.yaml"), 'r') as config_file: 27 | ctc_lm_params=yaml.load(config_file, Loader=yaml.FullLoader) 28 | 29 | vocab=processor.tokenizer.convert_ids_to_tokens(range(0, processor.tokenizer.vocab_size)) 30 | space_ix = vocab.index('|') 31 | vocab[space_ix]=' ' 32 | 33 | ctcdecoder = CTCBeamDecoder(vocab, 34 | model_path=os.path.join(kenlm_model_dir, "lm.binary"), 35 | alpha=ctc_lm_params['alpha'], 36 | beta=ctc_lm_params['beta'], 37 | cutoff_top_n=40, 38 | cutoff_prob=1.0, 39 | beam_width=10, 40 | num_processes=4, 41 | blank_id=processor.tokenizer.pad_token_id, 42 | log_probs_input=False 43 | ) 44 | 45 | beam_results, beam_scores, timesteps, out_lens = ctcdecoder.decode(ctc_matrix) 46 | 47 | # beam_results - Shape: BATCHSIZE x N_BEAMS X N_TIMESTEPS A batch containing the series 48 | # of characters (these are ints, you still need to decode them back to your text) representing 49 | # results from a given beam search. Note that the beams are almost always shorter than the 50 | # total number of timesteps, and the additional data is non-sensical, so to see the top beam 51 | # (as int labels) from the first item in the batch, you need to run beam_results[0][0][:out_len[0][0]]. 52 | beam_string = "".join(vocab[n] for n in beam_results[0][0][:out_lens[0][0]]) 53 | 54 | # beam_scores - Shape: BATCHSIZE x N_BEAMS A batch with the approximate CTC score of each beam 55 | score = float(beam_scores[0][0].item()) / 100 56 | 57 | # timesteps : BATCHSIZE x N_BEAMS : the timestep at which the nth output character has peak probability. 58 | # Can be used as alignment between the audio and the transcript. 
59 | alignment = list() 60 | for i in range(0, out_lens[0][0]): 61 | alignment.append([beam_string[i], int(timesteps[0][0][i])] ) 62 | 63 | return beam_string, alignment, score, int(beam_results.shape[2]) 64 | 65 | # 66 | def main(audio_file, **args): 67 | global models_root_dir 68 | global processor 69 | global model 70 | 71 | models_root_dir="/models" 72 | wav2vec2_model_name = "wav2vec2-xlsr-ft-cy" 73 | wav2vec2_model_path = os.path.join(models_root_dir, wav2vec2_model_name) 74 | 75 | processor = Wav2Vec2Processor.from_pretrained(wav2vec2_model_path) 76 | model = Wav2Vec2ForCTC.from_pretrained(wav2vec2_model_path) 77 | 78 | audio, rate = librosa.load(audio_file, sr=16000) 79 | 80 | inputs = processor(audio, sampling_rate=16_000, return_tensors="pt", padding=True) 81 | 82 | with torch.no_grad(): 83 | logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits 84 | 85 | print("Greedy decoding: " + greedy_decode(logits)) 86 | 87 | ctc_matrix = torch.softmax(logits, dim=-1) 88 | text, alignment, score, timesteps = lm_decode(ctc_matrix) 89 | timestep_length = librosa.get_duration(audio) / timesteps 90 | for a in alignment: 91 | a[1] = a[1] * timestep_length 92 | 93 | print("LM decoding (with alignments): " + text) 94 | print("Score: " + str(score)) 95 | print("Alignment:" + str(alignment)) 96 | 97 | 98 | if __name__ == "__main__": 99 | 100 | parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) 101 | 102 | parser.add_argument("--wav", dest="audio_file", required=True) 103 | parser.set_defaults(func=main) 104 | args = parser.parse_args() 105 | args.func(**vars(args)) 106 | -------------------------------------------------------------------------------- /train/fine-tune/python/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torchaudio 4 | import json 5 | import numpy as np 6 | import yaml 7 | 8 | import models 9 | 10 | from argparse import ArgumentParser, RawTextHelpFormatter 11 | 12 | from datasets import load_dataset, load_metric 13 | from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor 14 | import text_preprocess 15 | 16 | from ctcdecode import CTCBeamDecoder 17 | 18 | DESCRIPTION = """ 19 | 20 | Much of the code in this file was lifted from a HuggingFace blog entry: 21 | 22 | Fine-Tune XLSR-Wav2Vec2 for low-resource ASR with Transformers 23 | https://huggingface.co/blog/fine-tune-xlsr-wav2vec2 24 | 25 | by Patrick von Platen 26 | 27 | An implementation of a CTC (Connectionist Temporal Classification) beam search decoder with 28 | KenLM language models support from https://github.com/parlance/ctcdecode has been added. 29 | 30 | """ 31 | 32 | 33 | # Preprocessing the datasets. 
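# Clips are loaded at their original 48 kHz rate and resampled to the 16 kHz input
# expected by wav2vec2 (see the Resample(48000, 16000) transform set up in evaluate()).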
34 | # We need to read the aduio files as arrays 35 | def speech_file_to_array_fn(batch): 36 | batch["sentence"] = text_preprocess.cleanup(batch["sentence"]).strip() # + " " 37 | speech_array, sampling_rate = torchaudio.load(batch["path"]) 38 | batch["speech"] = resampler(speech_array).squeeze().numpy() 39 | return batch 40 | 41 | 42 | def batch_evaluate(batch): 43 | inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding=True) 44 | 45 | with torch.no_grad(): 46 | logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits 47 | 48 | pred_ids = torch.argmax(logits, dim=-1) 49 | 50 | batch["pred_strings"] = processor.batch_decode(pred_ids)[0].strip() 51 | 52 | if ctcdecoder: 53 | beam_results, beam_scores, timesteps, out_lens = ctcdecoder.decode(logits) 54 | pred_with_ctc = "".join(vocab[n] for n in beam_results[0][0][:out_lens[0][0]]) 55 | batch["pred_strings_with_ctc"]=pred_with_ctc.strip() 56 | 57 | if kenlm_ctcdecoder: 58 | beam_results, beam_scores, timesteps, out_lens = kenlm_ctcdecoder.decode(logits) 59 | pred_with_lm = "".join(vocab[n] for n in beam_results[0][0][:out_lens[0][0]]) 60 | batch["pred_strings_with_lm"]=pred_with_lm.strip() 61 | 62 | return batch 63 | 64 | 65 | def main(wav2vec2_model_path, revision, test_split_name, **args): 66 | evaluate(wav2vec2_model_path, revision, test_split_name) 67 | 68 | def evaluate(wav2vec2_model_path, revision, test_split_name): 69 | 70 | global processor 71 | global model 72 | global vocab 73 | global ctcdecoder 74 | global kenlm_ctcdecoder 75 | global resampler 76 | 77 | processor, model, vocab, ctcdecoder, kenlm_ctcdecoder = models.create(wav2vec2_model_path, revision) 78 | 79 | # 80 | test_dataset = load_dataset("custom_common_voice.py", "cy", split=test_split_name) 81 | 82 | wer = load_metric("wer") 83 | cer = load_metric("cer") 84 | 85 | model.to("cuda") 86 | 87 | resampler = torchaudio.transforms.Resample(48000, 16000) 88 | 89 | test_dataset = test_dataset.map(speech_file_to_array_fn) 90 | result = test_dataset.map(batch_evaluate, batch_size=8) 91 | 92 | report = "\n" + test_split_name + "\n" 93 | report = report + "---------------\n" 94 | 95 | report = report + "WER: {:2f}\n".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])) 96 | if ctcdecoder: report = report + "WER with CTC: {:2f}\n".format(100 * wer.compute(predictions=result["pred_strings_with_ctc"], references=result["sentence"])) 97 | if kenlm_ctcdecoder: report = report + "WER with CTC+LM: {:2f}\n".format(100 * wer.compute(predictions=result["pred_strings_with_lm"], references=result["sentence"])) 98 | 99 | report = report + "CER: {:2f}\n".format(100 * cer.compute(predictions=result["pred_strings"], references=result["sentence"])) 100 | if ctcdecoder: report = report + "CER with CTC: {:2f}\n".format(100 * cer.compute(predictions=result["pred_strings_with_ctc"], references=result["sentence"])) 101 | if kenlm_ctcdecoder: report = report + "CER with CTC+LM: {:2f}\n".format(100 * cer.compute(predictions=result["pred_strings_with_lm"], references=result["sentence"])) 102 | 103 | print (report) 104 | 105 | with(open(os.path.join(wav2vec2_model_path, "{}_results.txt".format(test_split_name)), 'w' )) as results_file: 106 | results_file.write(report) 107 | 108 | 109 | 110 | if __name__ == "__main__": 111 | 112 | models_root_dir="/models/published" 113 | wav2vec2_model_name = "wav2vec2-xlsr-ft-cy" 114 | kenlm_model_name= "kenlm" 115 | 116 | wav2vec_model_dir = 
os.path.join(models_root_dir, wav2vec2_model_name) 117 | 118 | parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) 119 | 120 | parser.add_argument("--model_path", dest="wav2vec2_model_path", default=wav2vec_model_dir) 121 | parser.add_argument("--revision", dest="revision", default='') 122 | parser.add_argument("--test-split-name", dest="test_split_name", default="test") 123 | parser.set_defaults(func=main) 124 | args = parser.parse_args() 125 | args.func(**vars(args)) 126 | -------------------------------------------------------------------------------- /train/fine-tune/python/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import tarfile 4 | import urllib.request 5 | from urllib.parse import urlparse 6 | 7 | from pathlib import Path 8 | from tqdm import tqdm 9 | 10 | from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor 11 | from ctcdecode import CTCBeamDecoder 12 | 13 | 14 | class DownloadProgressBar(tqdm): 15 | def update_to(self, b=1, bsize=1, tsize=None): 16 | if tsize is not None: 17 | self.total = tsize 18 | self.update(b * bsize - self.n) 19 | 20 | 21 | def create(model_path, revision): 22 | 23 | cache_dir=model_path 24 | 25 | # initialize acoustic model... 26 | # 27 | if Path(model_path).is_dir(): 28 | # from a local directory containing our own trained model 29 | print("Initiaising wav2vec2 model from local directory: %s" % model_path) 30 | processor = Wav2Vec2Processor.from_pretrained(model_path) 31 | model = Wav2Vec2ForCTC.from_pretrained(model_path) 32 | else: 33 | # from the HuggingFace models repository. keep cache in /models/published 34 | print("Initialising wav2vec2 model \"%s\" from HuggingFace model repository" % model_path) 35 | cache_dir = os.path.join('/', 'models', 'cache', model_path) 36 | processor = Wav2Vec2Processor.from_pretrained(model_path, cache_dir=cache_dir, revision=revision) 37 | model = Wav2Vec2ForCTC.from_pretrained(model_path, cache_dir=cache_dir, revision=revision) 38 | 39 | vocab=processor.tokenizer.convert_ids_to_tokens(range(0, processor.tokenizer.vocab_size)) 40 | space_ix = vocab.index('|') 41 | vocab[space_ix]=' ' 42 | 43 | ctcdecoder = CTCBeamDecoder(vocab, 44 | model_path='', 45 | alpha=0, 46 | beta=0, 47 | cutoff_top_n=40, 48 | cutoff_prob=1.0, 49 | beam_width=100, 50 | num_processes=4, 51 | blank_id=processor.tokenizer.pad_token_id, 52 | log_probs_input=True 53 | ) 54 | 55 | # initialize ctc decoder with KenLM language model... 
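    # The language model is shipped as kenlm.tar.gz in the same HuggingFace repo as
    # the acoustic model: it is downloaded for the requested revision and unpacked to
    # obtain lm.binary plus config_ctc.yaml, whose tuned alpha/beta values configure
    # the CTC beam search decoder created below.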
56 | # 57 | targz_file_path=os.path.join(cache_dir, "kenlm.tar.gz") 58 | ctc_lm_params_filepath = os.path.join(cache_dir, "config_ctc.yaml") 59 | lm_binary_filepath = os.path.join(cache_dir, "lm.binary") 60 | 61 | kenlm_ctcdecoder=None 62 | 63 | if not Path(targz_file_path).is_file(): 64 | print ("Downloading kenlm language model version {}".format(revision)) 65 | try: 66 | # @todo - replace with url join 67 | file_url = os.path.join("https://huggingface.co", model_path, "resolve", revision, 'kenlm.tar.gz') 68 | download(file_url, os.path.join(cache_dir, targz_file_path)) 69 | except Exception as e: 70 | print (e) 71 | 72 | if not Path(ctc_lm_params_filepath).is_file() or not Path(lm_binary_filepath).is_file(): 73 | if Path(targz_file_path).is_file(): 74 | print ("Extracting LM tar gz {}".format(targz_file_path)) 75 | extract(targz_file_path) 76 | 77 | if Path(ctc_lm_params_filepath).is_file(): 78 | print ("Opening ctc_lm_params {}".format(ctc_lm_params_filepath)) 79 | with open(os.path.join(cache_dir, "config_ctc.yaml"), 'r') as config_file: 80 | ctc_lm_params=yaml.load(config_file, Loader=yaml.FullLoader) 81 | 82 | if Path(lm_binary_filepath).is_file(): 83 | print ("Loading lm.binary {}".format(lm_binary_filepath)) 84 | kenlm_ctcdecoder = CTCBeamDecoder(vocab, 85 | model_path=os.path.join(cache_dir, "lm.binary"), 86 | alpha=ctc_lm_params['alpha'], 87 | beta=ctc_lm_params['beta'], 88 | cutoff_top_n=40, 89 | cutoff_prob=1.0, 90 | beam_width=100, 91 | num_processes=4, 92 | blank_id=processor.tokenizer.pad_token_id, 93 | log_probs_input=True 94 | ) 95 | 96 | return processor, model, vocab, ctcdecoder, kenlm_ctcdecoder 97 | 98 | 99 | def download(file_url, output_file_path): 100 | with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=file_url.split('/')[-1]) as t: 101 | urllib.request.urlretrieve(file_url, filename=output_file_path, reporthook=t.update_to) 102 | 103 | def extract(targz_file_path): 104 | # extract. 
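    # Unpack the archive into the directory containing the downloaded tar.gz (the
    # model cache directory), leaving lm.binary and config_ctc.yaml beside it.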
105 | if targz_file_path.endswith(".tar.gz"): 106 | print ("Extracting...") 107 | model_dir = Path(targz_file_path).parent.absolute() 108 | tar = tarfile.open(targz_file_path, "r:gz") 109 | tar.extractall(model_dir) 110 | tar.close() 111 | 112 | #Path(output_file_path).unlink() 113 | -------------------------------------------------------------------------------- /train/fine-tune/python/publish.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import shutil 4 | import tarfile 5 | 6 | from pathlib import Path 7 | 8 | DESCRIPTION = """ 9 | 10 | Prifysgol Bangor University 11 | 12 | """ 13 | 14 | # 15 | def export_checkpoint(training_dir): 16 | # copy config and model binary file 17 | checkpoint_dir=glob.glob(os.path.join(training_dir, r"checkpoint-*"))[0] 18 | shutil.copy(os.path.join(checkpoint_dir, "config.json"), training_dir) 19 | shutil.copy(os.path.join(checkpoint_dir, "pytorch_model.bin"), training_dir) 20 | shutil.rmtree(checkpoint_dir) 21 | 22 | 23 | # 24 | def copy_for_publishing(source_dir, target_dir): 25 | print ("Copying for evaluation or publishing") 26 | print (source_dir) 27 | print (target_dir) 28 | 29 | Path(target_dir).mkdir(parents=True, exist_ok=True) 30 | 31 | # copy json files 32 | for file in glob.glob(os.path.join(source_dir,'*')): 33 | if os.path.isfile(file): 34 | print ("Copying %s" % file) 35 | shutil.copy(file, target_dir) 36 | 37 | return target_dir 38 | 39 | # 40 | def make_model_tarfile(model_name, source_dir, output_dir): 41 | 42 | Path(output_dir).mkdir(parents=True, exist_ok=True) 43 | 44 | output_tar_file_path = os.path.join(output_dir, model_name + ".tar.gz") 45 | print ("Creating {} ".format(output_tar_file_path)) 46 | with tarfile.open(output_tar_file_path, "w:gz") as tar: 47 | tar.add(source_dir, arcname="") 48 | 49 | return output_tar_file_path 50 | 51 | -------------------------------------------------------------------------------- /train/fine-tune/python/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.18.0 2 | torch >= 1.5 3 | transformers==4.24.0 4 | protobuf~=3.20.0 5 | tensorboard 6 | librosa 7 | jiwer 8 | evaluate 9 | optuna 10 | accelerate 11 | -------------------------------------------------------------------------------- /train/fine-tune/python/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | import publish 6 | import train_kenlm 7 | import train_wav2vec2 8 | 9 | from pathlib import Path 10 | from evaluate import evaluate 11 | 12 | """ 13 | 14 | Execute all steps for training both an acoustic and languege model for Welsh 15 | 16 | """ 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser(description="Finetune a wav2vec2 pre-trained model for speech recognition") 20 | 21 | parser.add_argument( 22 | "--session-id", 23 | type=str, 24 | required=True, 25 | help="an id that should be given for the training session (str)", 26 | ) 27 | parser.add_argument( 28 | "--language", 29 | type=str, 30 | required=True, 31 | help="language(s) we are training for (str)", 32 | ) 33 | parser.add_argument( 34 | "--training-dir", 35 | type=str, 36 | required=True, 37 | help="directory where training should be conducted" 38 | ) 39 | 40 | parser.add_argument( 41 | "--train-wav2vec2", 42 | action='store_true', 43 | help="flag whether a new wav2vec2 acoustic model should be trained (bool)", 44 | ) 45 | 
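    # Each --X flag is paired with a --no-X flag writing to the same dest, so the
    # defaults set via parser.set_defaults() can be switched off from the run scripts
    # (e.g. run_en_cy.sh passes --no-train-kenlm and --no-optimize-kenlm).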
parser.add_argument( 46 | "--no-train-wav2vec2", 47 | dest="train_wav2vec2", 48 | action='store_false' 49 | ), 50 | parser.set_defaults(train_wav2vec2=True) 51 | 52 | parser.add_argument( 53 | "--train-kenlm", 54 | action="store_true", 55 | help="flag whether a new KenLM language model should be trained (bool)", 56 | ) 57 | parser.add_argument( 58 | "--no-train-kenlm", 59 | dest="train_kenlm", 60 | action='store_false' 61 | ), 62 | parser.set_defaults(train_kenlm=True) 63 | 64 | parser.add_argument( 65 | "--optimize-kenlm", 66 | action='store_true', 67 | help="flag whether the last KenLM model should be optimized (bool)", 68 | ) 69 | parser.add_argument( 70 | "--no-optimize-kenlm", 71 | dest="optimize_kenlm", 72 | action='store_false' 73 | ), 74 | parser.set_defaults(train_optimize=True) 75 | 76 | parser.add_argument( 77 | "--pre-trained-model-name", 78 | type=str, 79 | required=True, 80 | help="name of pretrained model from HuggingFace models hub (str)", 81 | ) 82 | parser.add_argument( 83 | "--training-split-name", 84 | type=str, 85 | required=True, 86 | help="name of split for training (str)", 87 | ) 88 | parser.add_argument( 89 | "--test-split-name", 90 | type=str, 91 | required=True, 92 | help="name of split for testing (str)", 93 | ) 94 | parser.add_argument( 95 | "--oscar-text-corpus-name", 96 | type=str, 97 | default=None, 98 | help="name of language specific OSCAR text corpus (str)", 99 | ) 100 | 101 | args = parser.parse_args() 102 | return args 103 | 104 | 105 | def main(): 106 | 107 | args = parse_args() 108 | 109 | perform_training_wav2vec2 = args.train_wav2vec2 110 | perform_training_kenlm = args.train_kenlm 111 | perform_optimize_kenlm = args.optimize_kenlm 112 | 113 | pretrained_model_name = args.pre_trained_model_name 114 | 115 | session_id = args.session_id 116 | training_dir = args.training_dir 117 | logging_dir = os.path.join("/", "logs", session_id) 118 | 119 | Path(training_dir).mkdir(parents=True, exist_ok=True) 120 | with open(os.path.join(training_dir, 'commandline_args.txt'), 'w') as f: 121 | json.dump(args.__dict__, f, indent=2) 122 | 123 | # 124 | if perform_training_wav2vec2: 125 | print ("\nTraining acoustic model in {}".format(training_dir)) 126 | wav2vec2_model_dir = train_wav2vec2.train(training_dir, logging_dir, 127 | args.language, 128 | args.pre_trained_model_name, 129 | args.training_split_name, 130 | args.test_split_name) 131 | #evaluate(training_dir, '') 132 | 133 | if perform_training_kenlm: 134 | print ("\n\nTraining KenLM language model...") 135 | lm_model_dir = train_kenlm.train(training_dir, "unshuffled_deduplicated_cy") 136 | 137 | if perform_optimize_kenlm: 138 | print ("\n\nOptimizing KenLM language model...") 139 | print (lm_model_dir) 140 | train_kenlm.optimize(lm_model_dir, wav2vec2_model_dir) 141 | 142 | #print ("Packaging for publishing...") 143 | #publish_dir = os.path.join(models_root_dir, "published", wav2vec2_model_name) 144 | 145 | #if perform_training_kenlm or perform_optimize_kenlm: kenlm_archive_file_path = publish.make_model_tarfile(kenlm_model_name, lm_model_dir, publish_dir) 146 | #if perform_training_wav2vec2: publish_dir = publish.copy_for_publishing(wav2vec2_model_dir, publish_dir) 147 | 148 | #print ("Files for publication ready at {}".format(publish_dir)) 149 | 150 | 151 | if __name__ == "__main__": 152 | main() 153 | -------------------------------------------------------------------------------- /train/fine-tune/python/run_base-cy.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | 3 | # 4 | export CV_CONFIG_FILE='cv_version_11_cy' 5 | 6 | # 7 | pre_trained_model='techiaith/wav2vec2-base-cy' 8 | 9 | session_date=$(date '+%Y-%m-%d_%H:%M:%S') 10 | session_name=${pre_trained_model//\//_}__${session_date} 11 | 12 | training_dir="/root/sessions/"${session_name} 13 | 14 | set -x 15 | 16 | accelerate launch run.py \ 17 | --session-id="${session_name}" \ 18 | --training-dir="${training_dir}" \ 19 | --training-split-name="train_plus+validation" \ 20 | --test-split-name="test" \ 21 | --language="cy" \ 22 | --train-wav2vec2 \ 23 | --train-kenlm \ 24 | --optimize-kenlm \ 25 | --pre-trained-model-name="${pre_trained_model}" \ 26 | --oscar-text-corpus-name="unshuffled_deduplicated_cy" 27 | 28 | python3 evaluate.py --model_path="${training_dir}" --test-split-name="test" 29 | 30 | python3 test.py --model_path="${training_dir}" --test_csv /data/corpws-profi-adnabod-lleferydd/data/trawsgrifio/clips.csv 31 | -------------------------------------------------------------------------------- /train/fine-tune/python/run_en_cy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # this is refering to a Python file that contains values 4 | # shown in cv_version.template.py 5 | export CV_CONFIG_FILE='cv_version_11_en_cy' 6 | 7 | 8 | # "facebook/wav2vec2-xls-r-1b" 9 | pre_trained_model='facebook/wav2vec2-large-xlsr-53' 10 | 11 | session_date=$(date '+%Y-%m-%d_%H:%M:%S') 12 | session_name=${pre_trained_model//\//_}__${session_date} 13 | 14 | training_dir="/root/sessions/"${session_name} 15 | 16 | set -x 17 | 18 | accelerate launch run.py \ 19 | --session-id="${session_name}" \ 20 | --training-dir="${training_dir}" \ 21 | --training-split-name="train+validation" \ 22 | --test-split-name="test" \ 23 | --language="cy" \ 24 | --train-wav2vec2 \ 25 | --no-train-kenlm \ 26 | --no-optimize-kenlm \ 27 | --pre-trained-model-name="${pre_trained_model}" \ 28 | --oscar-text-corpus-name="unshuffled_deduplicated_cy" 29 | 30 | 31 | python3 evaluate.py --model_path="${training_dir}" 32 | 33 | python3 evaluate.py --model_path="${training_dir}" --test-split-name="test_cy" 34 | 35 | python3 evaluate.py --model_path="${training_dir}" --test-split-name="test_en" 36 | 37 | python3 test.py --model_path="${training_dir}" --test_csv /data/corpws-profi-adnabod-lleferydd/data/trawsgrifio/clips.csv 38 | -------------------------------------------------------------------------------- /train/fine-tune/python/run_xls-r-1b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CV_CONFIG_FILE='cv_version_11_cy' 4 | 5 | # 6 | pre_trained_model='facebook/wav2vec2-xls-r-1b' 7 | 8 | session_date=$(date '+%Y-%m-%d_%H:%M:%S') 9 | session_name=${pre_trained_model//\//_}__${session_date} 10 | 11 | training_dir="/root/sessions/"${session_name} 12 | 13 | set -x 14 | 15 | accelerate launch run.py \ 16 | --session-id="${session_name}" \ 17 | --training-dir="${training_dir}" \ 18 | --training-split-name="train_plus+validation" \ 19 | --test-split-name="test" \ 20 | --language="cy" \ 21 | --train-wav2vec2 \ 22 | --train-kenlm \ 23 | --optimize-kenlm \ 24 | --pre-trained-model-name="${pre_trained_model}" \ 25 | --oscar-text-corpus-name="unshuffled_deduplicated_cy" 26 | 27 | python3 evaluate.py --model_path="${training_dir}" --test-split-name="test" 28 | 29 | python3 test.py --model_path="${training_dir}" --test_csv /data/corpws-profi-adnabod-lleferydd/data/trawsgrifio/clips.csv 30 | 
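These run scripts select their Common Voice download through the CV_CONFIG_FILE module (e.g. 'cv_version_11_cy'), which is not kept in the repository (python/.gitignore excludes cv_version_*). A minimal sketch of such a file, modelled on cv_version.template.py, is shown below; the CV_ID and the host serving the .tar.gz are placeholders that must match the release you have mirrored on your own local http server.

# cv_version_11_cy.py -- sketch only; all values below are placeholders
CV_VERSION="11.0.0"
CV_ID="cv-corpus-11.0-XXXX-XX-XX"            # exact id of the release you downloaded
CV_DATA_URL="http://your-local-server/cv-corpus-11.0-XXXX-XX-XX-cy.tar.gz"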
-------------------------------------------------------------------------------- /train/fine-tune/python/run_xlsr-large-53.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | export CV_CONFIG_FILE='cv_version_11_cy' 5 | 6 | # 7 | pre_trained_model='facebook/wav2vec2-large-xlsr-53' 8 | 9 | session_date=$(date '+%Y-%m-%d_%H:%M:%S') 10 | session_name=${pre_trained_model//\//_}__${session_date} 11 | 12 | training_dir="/root/sessions/"${session_name} 13 | 14 | set -x 15 | 16 | accelerate launch run.py \ 17 | --session-id="${session_name}" \ 18 | --training-dir="${training_dir}" \ 19 | --training-split-name="train_plus+validation" \ 20 | --test-split-name="test" \ 21 | --language="cy" \ 22 | --train-wav2vec2 \ 23 | --train-kenlm \ 24 | --optimize-kenlm \ 25 | --pre-trained-model-name="${pre_trained_model}" \ 26 | --oscar-text-corpus-name="unshuffled_deduplicated_cy" 27 | 28 | python3 evaluate.py --model_path="${training_dir}" --test-split-name="test" 29 | 30 | python3 test.py --model_path="${training_dir}" --test_csv /data/corpws-profi-adnabod-lleferydd/data/trawsgrifio/clips.csv 31 | -------------------------------------------------------------------------------- /train/fine-tune/python/speech.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/techiaith/docker-huggingface-stt-cy/36224bbb731037b701dbf516165f7dec15bce262/train/fine-tune/python/speech.wav -------------------------------------------------------------------------------- /train/fine-tune/python/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import torch 4 | import librosa 5 | import yaml 6 | import datetime 7 | import pandas 8 | 9 | import models 10 | import text_preprocess 11 | 12 | import re 13 | import jiwer 14 | 15 | from pathlib import Path 16 | from tqdm import tqdm 17 | 18 | from argparse import ArgumentParser, RawTextHelpFormatter 19 | 20 | DESCRIPTION = """ 21 | 22 | Prifysgol Bangor University 23 | 24 | """ 25 | 26 | 27 | tags_regex = '\[.*?\]' 28 | 29 | class TestStatistics: 30 | 31 | def __init__(self, experiment_name): 32 | self.experiment_name=experiment_name 33 | 34 | self.total_clips=0 35 | self.total_ignored_clips=0 36 | 37 | self.total_duration=0 38 | 39 | self.dfResults=pandas.DataFrame(columns=['wav_filename', 'parent', 'duration', 'prediction', 'reference', 'wer', 'cer']) 40 | self.dfIgnoredResults=pandas.DataFrame(columns=['wav_filename', 'parent', 'duration', 'prediction', 'reference', 'wer', 'cer']) 41 | 42 | 43 | def calculate_error_rates(self, prediction, reference): 44 | 45 | cer_error = jiwer.cer(reference, prediction) 46 | wer_error = jiwer.wer(reference, prediction) 47 | 48 | return 100*wer_error, 100*cer_error 49 | 50 | 51 | def add(self, clip_file_path, clip_parent, prediction, reference): 52 | 53 | #print (clip_file_path) 54 | #print (reference) 55 | #print (prediction) 56 | 57 | audio, rate = librosa.load(clip_file_path, sr=16000) 58 | duration=librosa.get_duration(y=audio, sr=rate) 59 | 60 | current_wer, current_cer=self.calculate_error_rates(prediction, reference) 61 | 62 | self.total_duration+=duration 63 | 64 | # skip averaging if reference contains a (metadata) tag in square brackets 65 | if not re.findall(tags_regex, reference): 66 | self.total_clips+=1 67 | self.dfResults.loc[self.total_clips] = [clip_file_path, clip_parent, duration, prediction, reference, current_wer, current_cer] 68 | 
else: 69 | self.total_ignored_clips+=1 70 | self.dfIgnoredResults.loc[self.total_ignored_clips] = [clip_file_path, clip_parent, duration, prediction, reference, current_wer, current_cer] 71 | 72 | #print ("WER: %s, CER: %s" % (current_wer, current_cer)) 73 | #print ("") 74 | 75 | 76 | def print(self): 77 | 78 | predictions = self.dfResults['prediction'].tolist() 79 | references = self.dfResults['reference'].tolist() 80 | 81 | average_wer=100 * jiwer.wer(hypothesis=predictions, truth=references) 82 | average_cer=100 * jiwer.cer(hypothesis=predictions, truth=references) 83 | 84 | # 85 | print ("") 86 | print ("Test Statistics - " + self.experiment_name) 87 | print ("-----------------------------------------------------------------------------------------------------") 88 | print ("No of Clips: %s" % self.total_clips) 89 | print ("Duration: {} ({} seconds).".format(datetime.timedelta(seconds=self.total_duration), self.total_duration)) 90 | print ("WER: {:.2f}".format(average_wer)) 91 | print ("CER: {:.2f}".format(average_cer)) 92 | 93 | print ("No of ignored clips: %s" % self.total_ignored_clips) 94 | 95 | print ("") 96 | 97 | 98 | def save(self): 99 | print ("Results saved to testresults_" + self.experiment_name + ".csv and results_ignored.csv") 100 | 101 | self.dfResults.to_csv("testresults_" + self.experiment_name + ".csv", encoding='utf-8', index=False) 102 | self.dfIgnoredResults.to_csv("results_ignored.csv", encoding='utf-8', index=False) 103 | 104 | 105 | # 106 | # 107 | def main(testset_csv_file_path, model_path, revision, **args): 108 | 109 | # iterate through each audio file and text. 110 | processor, model, vocab, ctcdecoder, kenlm_ctcdecoder = models.create(model_path, revision) 111 | 112 | test_stats_greedy = TestStatistics("greedy") 113 | test_stats_ctc = TestStatistics("CTC") 114 | test_stats_ctc_kenlm = TestStatistics("CTC+KenLM") 115 | 116 | testset_csv_parent_dir=Path(testset_csv_file_path).parent.absolute() 117 | 118 | with open(testset_csv_file_path, 'r', encoding='utf-8') as testset_csv_file_count: 119 | lines = len(testset_csv_file_count.readlines()) 120 | 121 | with open(testset_csv_file_path, 'r', encoding='utf-8') as testset_csv_file: 122 | testset_reader = csv.DictReader(testset_csv_file) 123 | for row in tqdm(testset_reader, total=lines): 124 | 125 | reference=text_preprocess.cleanup(row["transcript"]) 126 | try: 127 | clip_parent = row["parent_video_youtube_id"] 128 | except KeyError: 129 | clip_parent = "" 130 | 131 | clip_file_path = os.path.join(testset_csv_parent_dir, "clips", row["wav_filename"]) 132 | 133 | audio, rate = librosa.load(clip_file_path, sr=16000) 134 | duration = librosa.get_duration(y=audio, sr=rate) 135 | 136 | inputs = processor(audio, sampling_rate=16_000, return_tensors="pt", padding=True) 137 | with torch.no_grad(): 138 | logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits 139 | 140 | # greedy 141 | predicted_ids = torch.argmax(logits, dim=-1) 142 | prediction = processor.batch_decode(predicted_ids)[0] 143 | prediction = " ".join(prediction.strip().split(" ")) 144 | test_stats_greedy.add(clip_file_path, clip_parent, prediction, reference) 145 | 146 | # ctc decode 147 | if ctcdecoder: 148 | beam_results, beam_scores, timesteps, out_lens = ctcdecoder.decode(logits) 149 | prediction = "".join(vocab[n] for n in beam_results[0][0][:out_lens[0][0]]) 150 | prediction = " ".join(prediction.strip().split(" ")) 151 | test_stats_ctc.add(clip_file_path, clip_parent, prediction, reference) 152 | 153 | # ctc + lm decode 154 | if kenlm_ctcdecoder: 155 | beam_results, beam_scores, timesteps, 
out_lens = kenlm_ctcdecoder.decode(logits) 156 | prediction = "".join(vocab[n] for n in beam_results[0][0][:out_lens[0][0]]) 157 | prediction = " ".join(prediction.strip().split(" ")) 158 | test_stats_ctc_kenlm.add(clip_file_path, clip_parent, prediction, reference) 159 | 160 | test_stats_greedy.print() 161 | test_stats_greedy.save() 162 | 163 | if ctcdecoder: 164 | test_stats_ctc.print() 165 | test_stats_ctc.save() 166 | 167 | if kenlm_ctcdecoder: 168 | test_stats_ctc_kenlm.print() 169 | test_stats_ctc_kenlm.save() 170 | 171 | 172 | 173 | if __name__ == "__main__": 174 | 175 | models_root_dir="/models/published" 176 | wav2vec2_model_name = "wav2vec2-xlsr-ft-cy" 177 | kenlm_model_name= "kenlm" 178 | 179 | wav2vec_model_dir = os.path.join(models_root_dir, wav2vec2_model_name) 180 | 181 | parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) 182 | 183 | parser.add_argument("--test_csv", dest="testset_csv_file_path", required=True) 184 | parser.add_argument("--model_path", dest="model_path", default=wav2vec_model_dir) 185 | parser.add_argument("--revision", dest="revision", default='') 186 | 187 | parser.set_defaults(func=main) 188 | args = parser.parse_args() 189 | args.func(**vars(args)) 190 | -------------------------------------------------------------------------------- /train/fine-tune/python/text_preprocess.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | chars_to_ignore_regex = '[\,\?\.\!\u00AC\;\:\"\\%\\\]' 4 | 5 | # Preprocessing the datasets. 6 | # We need to read the aduio files as arrays 7 | def cleanup(sentence): 8 | sentence = re.sub(chars_to_ignore_regex, '', sentence).lower() 9 | sentence = sentence.replace('\u2013',"-") 10 | sentence = sentence.replace('\u2014',"-") 11 | sentence = sentence.replace('\u2018',"'") 12 | sentence = sentence.replace('\u201C',"") 13 | sentence = sentence.replace('\u201D',"") 14 | sentence = sentence.replace('…',"") 15 | sentence = sentence.replace('ñ',"n") 16 | sentence = sentence.replace('í',"i") 17 | sentence = sentence.replace(" - "," ") 18 | 19 | sentence = " ".join(sentence.strip().split()) 20 | 21 | return sentence 22 | -------------------------------------------------------------------------------- /train/fine-tune/python/train_kenlm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import sys 4 | import glob 5 | import json 6 | import yaml 7 | import shlex 8 | import subprocess 9 | 10 | import torch 11 | import torchaudio 12 | import optuna 13 | import text_preprocess 14 | 15 | from pathlib import Path 16 | from ctcdecode import CTCBeamDecoder 17 | from datasets import load_dataset, load_metric, set_caching_enabled 18 | from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor 19 | 20 | from argparse import ArgumentParser, RawTextHelpFormatter 21 | 22 | 23 | DESCRIPTION = """ 24 | 25 | Train and optimize a KenLM language model from HuggingFace's provision of the Welsh corpus by the OSCAR project. 26 | 27 | """ 28 | 29 | set_caching_enabled(False) 30 | 31 | 32 | # Preprocessing the datasets. 
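# The two helpers below prepare and score the Common Voice test set while the KenLM
# weights are tuned: speech_file_to_array_fn() normalises each sentence with
# text_preprocess.cleanup() and resamples the clip from 48 kHz to 16 kHz via the
# module-level resampler, and decode() feeds the acoustic model's logits through the
# KenLM-weighted CTC beam search, storing the hypothesis in batch["pred_strings_with_lm"]
# so that optimize_lm_objective() can score it against batch["sentence"].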
33 | def speech_file_to_array_fn(batch): 34 | batch["sentence"] = text_preprocess.cleanup(batch["sentence"]).strip() # + " " 35 | speech_array, sampling_rate = torchaudio.load(batch["path"]) 36 | batch["speech"] = resampler(speech_array).squeeze().numpy() 37 | return batch 38 | 39 | 40 | def decode(batch): 41 | inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True) 42 | with torch.no_grad(): 43 | logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits 44 | 45 | beam_results, beam_scores, timesteps, out_lens = ctcdecoder.decode(logits) 46 | batch["pred_strings_with_lm"] = "".join(vocab[n] for n in beam_results[0][0][:out_lens[0][0]]).strip() 47 | 48 | return batch 49 | 50 | 51 | def optimize_lm_objective(trial): 52 | global ctcdecoder 53 | 54 | alpha = trial.suggest_uniform('lm_alpha', 0, 6) 55 | beta = trial.suggest_uniform('lm_beta',0, 5) 56 | 57 | try: 58 | binarylm_file_path=os.path.join(lm_model_dir, "lm.binary") 59 | ctcdecoder = CTCBeamDecoder(vocab, 60 | model_path=binarylm_file_path, 61 | alpha=alpha, 62 | beta=beta, 63 | cutoff_top_n=40, 64 | cutoff_prob=1.0, 65 | beam_width=100, 66 | num_processes=4, 67 | blank_id=processor.tokenizer.pad_token_id, 68 | log_probs_input=True 69 | ) 70 | result = test_dataset.map(decode) 71 | result_wer = wer.compute(predictions=result["pred_strings_with_lm"], references=result["sentence"]) 72 | result_cer = cer.compute(predictions=result["pred_strings_with_lm"], references=result["sentence"]) 73 | 74 | # clear tmp cache 75 | fileList = glob.glob("/tmp/**/cache-*.arrow", recursive=True) 76 | for filepath in fileList: 77 | try: 78 | os.remove(filepath) 79 | except OSError: 80 | print("Error deleting tmp cache file %s" % filepath) 81 | 82 | print(f"WER: {result_wer} | CER: {result_cer}") 83 | trial.report(result_wer, step=0) 84 | 85 | 86 | except Exception as e: 87 | print (e) 88 | raise 89 | 90 | finally: 91 | return result_wer 92 | 93 | 94 | 95 | def train(lm_dir, oscar_dataset_name): 96 | 97 | Path(lm_dir).mkdir(parents=True, exist_ok=True) 98 | corpus_file_path = os.path.join(lm_dir, "corpus.txt") 99 | 100 | print ("\nLoading OSCAR {} dataset...".format(oscar_dataset_name)) 101 | oscar_corpus = load_dataset("oscar", oscar_dataset_name) 102 | 103 | print ("\nExporting OSCAR to text file {}...".format(corpus_file_path)) 104 | with open(corpus_file_path, 'w', encoding='utf-8') as corpus_file: 105 | for line in oscar_corpus["train"]: 106 | t = text_preprocess.cleanup(line["text"]) 107 | corpus_file.write(t) 108 | 109 | # generate KenLM ARPA file language model 110 | lm_arpa_file_path=os.path.join(lm_dir, "lm.arpa") 111 | lm_bin_file_path=os.path.join(lm_dir, "lm.binary") 112 | 113 | cmd = "lmplz -o {n} --text {corpus_file} --arpa {lm_file}".format(n=5, corpus_file=corpus_file_path, lm_file=lm_arpa_file_path) 114 | print (cmd) 115 | 116 | subprocess.run(shlex.split(cmd), stderr=sys.stderr, stdout=sys.stdout) 117 | 118 | # generate binary version 119 | cmd = "build_binary trie -s {arpa_file} {bin_file}".format(arpa_file=lm_arpa_file_path, bin_file=lm_bin_file_path) 120 | print (cmd) 121 | 122 | subprocess.run(shlex.split(cmd), stderr=sys.stderr, stdout=sys.stdout) 123 | 124 | # 125 | os.remove(corpus_file_path) 126 | os.remove(lm_arpa_file_path) 127 | 128 | return lm_dir 129 | 130 | 131 | 132 | def optimize(lm_dir, wav2vec_model_path): 133 | global processor 134 | global model 135 | global vocab 136 | global wer 137 | global cer 138 | global resampler 139 | global 
test_dataset 140 | global lm_model_dir 141 | 142 | lm_model_dir=lm_dir 143 | 144 | test_dataset = load_dataset("custom_common_voice.py", "cy", split="test") 145 | #test_dataset = load_dataset("common_voice", "cy", split="test") 146 | 147 | wer = load_metric("wer") 148 | cer = load_metric("cer") 149 | 150 | processor = Wav2Vec2Processor.from_pretrained(wav2vec_model_path) 151 | model = Wav2Vec2ForCTC.from_pretrained(wav2vec_model_path) 152 | 153 | model.to("cuda") 154 | 155 | resampler = torchaudio.transforms.Resample(48_000, 16_000) 156 | 157 | vocab=processor.tokenizer.convert_ids_to_tokens(range(0, processor.tokenizer.vocab_size)) 158 | space_ix = vocab.index('|') 159 | vocab[space_ix]=' ' 160 | 161 | print ("Preprocessing speech files") 162 | test_dataset = test_dataset.map(speech_file_to_array_fn) 163 | 164 | 165 | print ("Beginning alpha and beta hyperparameter optimization") 166 | study = optuna.create_study() 167 | study.optimize(optimize_lm_objective, n_jobs=1, n_trials=100) 168 | 169 | # 170 | lm_best = {'alpha':study.best_params['lm_alpha'], 'beta':study.best_params['lm_beta']} 171 | 172 | config_file_path = os.path.join(lm_model_dir, "config_ctc.yaml") 173 | with open (config_file_path, 'w') as config_file: 174 | yaml.dump(lm_best, config_file) 175 | 176 | print('Best params saved to config file {}: alpha={}, beta={} with WER={}'.format(config_file_path, study.best_params['lm_alpha'], study.best_params['lm_beta'], study.best_value)) 177 | 178 | 179 | 180 | def main(lm_root_dir, wav2vec2_model_path, **args): 181 | lm_file_path=train_kenlm(lm_root_dir, "unshuffled_deduplicated_cy") 182 | optimize_kenlm(lm_file_path, wav2vec2_model_path) 183 | 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | parser = ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) 189 | 190 | parser.add_argument("--target_dir", dest="lm_root_dir", required=True, help="target directory for language model") 191 | parser.add_argument("--model", dest="wav2vec_model_path", required=True, help="acoustic model to be used for optimizing") 192 | 193 | parser.set_defaults(func=main) 194 | args = parser.parse_args() 195 | args.func(**vars(args)) 196 | 197 | -------------------------------------------------------------------------------- /train/fine-tune/python/train_wav2vec2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import random 4 | import shutil 5 | import torch 6 | import torchaudio 7 | import librosa 8 | import importlib 9 | 10 | import pandas as pd 11 | import numpy as np 12 | import soundfile as sf 13 | 14 | import publish 15 | 16 | from pathlib import Path 17 | from datasets import Dataset, ClassLabel, load_dataset, load_from_disk, load_metric, concatenate_datasets 18 | from dataclasses import dataclass, field 19 | from typing import Any, Dict, List, Optional, Union 20 | 21 | from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC, TrainingArguments, Trainer 22 | 23 | import text_preprocess 24 | 25 | 26 | """ 27 | 28 | Train an acoustic model by fine tuning the wav2vec2 large XLSR pre-trained models by Facebook for Welsh. 
29 | 30 | Much of the code in this file was lifted from a HuggingFace blog entry: 31 | 32 | Fine-Tune XLSR-Wav2Vec2 for low-resource ASR with Transformers 33 | https://huggingface.co/blog/fine-tune-xlsr-wav2vec2 34 | 35 | by Patrick von Platen 36 | """ 37 | 38 | 39 | # 40 | def remove_special_characters(batch): 41 | batch["sentence"] = text_preprocess.cleanup(batch["sentence"]) + " " 42 | return batch 43 | 44 | def extract_all_chars(batch): 45 | all_text = " ".join(batch["sentence"]) 46 | vocab = list(set(all_text)) 47 | return {"vocab": [vocab], "all_text": [all_text]} 48 | 49 | def show_random_elements(dataset, num_examples=10): 50 | assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset." 51 | picks = [] 52 | for _ in range(num_examples): 53 | pick = random.randint(0, len(dataset)-1) 54 | while pick in picks: 55 | pick = random.randint(0, len(dataset)-1) 56 | picks.append(pick) 57 | 58 | df = pd.DataFrame(dataset[picks]) 59 | print (df.to_html()) 60 | 61 | 62 | @dataclass 63 | class DataCollatorCTCWithPadding: 64 | """ 65 | Data collator that will dynamically pad the inputs received. 66 | Args: 67 | processor (:class:`~transformers.Wav2Vec2Processor`) 68 | The processor used for proccessing the data. 69 | padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): 70 | Select a strategy to pad the returned sequences (according to the model's padding side and padding index) 71 | among: 72 | * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single 73 | sequence if provided). 74 | * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the 75 | maximum acceptable input length for the model if that argument is not provided. 76 | * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of 77 | different lengths). 78 | max_length (:obj:`int`, `optional`): 79 | Maximum length of the ``input_values`` of the returned list and optionally padding length (see above). 80 | max_length_labels (:obj:`int`, `optional`): 81 | Maximum length of the ``labels`` returned list and optionally padding length (see above). 82 | pad_to_multiple_of (:obj:`int`, `optional`): 83 | If set will pad the sequence to a multiple of the provided value. 84 | This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 85 | 7.5 (Volta). 
86 | """ 87 | processor: Wav2Vec2Processor 88 | padding: Union[bool, str] = True 89 | max_length: Optional[int] = None 90 | max_length_labels: Optional[int] = None 91 | pad_to_multiple_of: Optional[int] = None 92 | pad_to_multiple_of_labels: Optional[int] = None 93 | 94 | def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: 95 | # split inputs and labels since they have to be of different lenghts and need 96 | # different padding methods 97 | input_features = [{"input_values": feature["input_values"]} for feature in features] 98 | label_features = [{"input_ids": feature["labels"]} for feature in features] 99 | 100 | batch = self.processor.pad( 101 | input_features, 102 | padding=self.padding, 103 | max_length=self.max_length, 104 | pad_to_multiple_of=self.pad_to_multiple_of, 105 | return_tensors="pt", 106 | ) 107 | with self.processor.as_target_processor(): 108 | labels_batch = self.processor.pad( 109 | label_features, 110 | padding=self.padding, 111 | max_length=self.max_length_labels, 112 | pad_to_multiple_of=self.pad_to_multiple_of_labels, 113 | return_tensors="pt", 114 | ) 115 | 116 | # replace padding with -100 to ignore loss correctly 117 | labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) 118 | 119 | batch["labels"] = labels 120 | 121 | return batch 122 | 123 | 124 | def speech_file_to_array_fn(batch): 125 | speech_array, sampling_rate = torchaudio.load(batch["path"]) 126 | batch["speech"] = speech_array[0].numpy() 127 | batch["sampling_rate"] = sampling_rate 128 | batch["target_text"] = batch["sentence"] 129 | return batch 130 | 131 | 132 | def resample(batch): 133 | batch["speech"] = librosa.resample(np.asarray(batch["speech"]), orig_sr=48000, target_sr=16000) 134 | batch["sampling_rate"] = 16_000 135 | return batch 136 | 137 | 138 | def prepare_dataset(batch): 139 | # check that all files have the correct sampling rate 140 | assert ( 141 | len(set(batch["sampling_rate"])) == 1 142 | ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}." 
143 | 144 | batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values 145 | 146 | with processor.as_target_processor(): 147 | batch["labels"] = processor(batch["target_text"]).input_ids 148 | 149 | return batch 150 | 151 | 152 | def compute_metrics(pred): 153 | pred_logits = pred.predictions 154 | pred_ids = np.argmax(pred_logits, axis=-1) 155 | 156 | pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id 157 | 158 | pred_str = processor.batch_decode(pred_ids) 159 | 160 | # we do not want to group tokens when computing the metrics 161 | label_str = processor.batch_decode(pred.label_ids, group_tokens=False) 162 | wer = wer_metric.compute(predictions=pred_str, references=label_str) 163 | cer = cer_metric.compute(predictions=pred_str, references=label_str) 164 | 165 | return {"wer": wer, "cer": cer} 166 | 167 | 168 | 169 | def train(output_dir, logging_dir, language, pre_trained_model_name, training_split, test_split): 170 | 171 | global processor 172 | global tokenizer 173 | global model 174 | 175 | global wer_metric 176 | global cer_metric 177 | 178 | dataset_cache_dir = "/root/datasets" 179 | 180 | training_dataset_dir_path = os.path.join(dataset_cache_dir, "train") 181 | test_dataset_dir_path = os.path.join(dataset_cache_dir, "test") 182 | 183 | if Path(training_dataset_dir_path).is_dir() and Path(test_dataset_dir_path).is_dir(): 184 | # 185 | print ("\nLoading datasets from previous runs") 186 | common_voice_train=load_from_disk(training_dataset_dir_path) 187 | common_voice_test=load_from_disk(test_dataset_dir_path) 188 | 189 | print ("\nConstructing tokenizer") 190 | tokenizer = Wav2Vec2CTCTokenizer("./vocab.%s.json" % language, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|") 191 | 192 | print ("\nFeature Extractor") 193 | feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True) 194 | 195 | print ("\nConstructing Processor") 196 | processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) 197 | 198 | processor.save_pretrained(output_dir) 199 | 200 | else: 201 | # 202 | dataset_name="custom_common_voice.py" 203 | Path(dataset_cache_dir).mkdir(parents=True, exist_ok=True) 204 | 205 | try: 206 | cv_config_import = os.environ['CV_CONFIG_FILE'] 207 | mod = importlib.import_module(cv_config_import) 208 | CV_VERSION=mod.CV_VERSION 209 | CV_ID=mod.CV_ID 210 | CV_DATA_URL=mod.CV_DATA_URL 211 | print (CV_VERSION, CV_ID, CV_DATA_URL) 212 | del mod 213 | 214 | cv_dataset_config_file_path = os.path.join(dataset_cache_dir, 'dataset_config.txt') 215 | 216 | with open(cv_dataset_config_file_path, 'w') as f: 217 | f.write(CV_DATA_URL + "\n") 218 | f.write(CV_VERSION + '\n') 219 | f.write(CV_ID + '\n') 220 | f.write(training_split + '\n') 221 | f.write(test_split + '\n') 222 | 223 | shutil.copy(cv_dataset_config_file_path, output_dir) 224 | 225 | except ImportError: 226 | print ("Please create a cv_version.py file. 
see cv_version.template.py") 227 | 228 | # 229 | print ("\nLoading %s datasets" % dataset_name) 230 | common_voice_train = load_dataset(dataset_name, language, split=training_split) 231 | common_voice_test = load_dataset(dataset_name, language, split=test_split) 232 | 233 | print ("\nRemoving unnecessary columns") 234 | common_voice_train = common_voice_train.remove_columns(["accents", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]) 235 | common_voice_test = common_voice_test.remove_columns(["accents", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]) 236 | 237 | print ("\nRemoving unnecesary characters from sentences ") 238 | common_voice_train = common_voice_train.map(remove_special_characters) 239 | common_voice_test = common_voice_test.map(remove_special_characters) 240 | 241 | print ("\nExtracting tokens and saving to vocab.%s.json" % language) 242 | vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names) 243 | vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names) 244 | 245 | vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0])) 246 | vocab_dict = {v: k for k, v in enumerate(vocab_list)} 247 | vocab_dict["|"] = vocab_dict[" "] 248 | del vocab_dict[" "] 249 | 250 | vocab_dict["[UNK]"] = len(vocab_dict) 251 | vocab_dict["[PAD]"] = len(vocab_dict) 252 | 253 | print(vocab_dict) 254 | print(len(vocab_dict)) 255 | 256 | with open('vocab.%s.tmp.json' % language, 'w') as vocab_file: 257 | json.dump(vocab_dict, vocab_file) 258 | 259 | print ("\nCreating array from speech files") 260 | common_voice_train = common_voice_train.map(speech_file_to_array_fn, remove_columns=common_voice_train.column_names) 261 | common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names) 262 | 263 | print ("\nDownsampling all speech files") 264 | common_voice_train = common_voice_train.map(resample, num_proc=8) 265 | common_voice_test = common_voice_test.map(resample, num_proc=8) 266 | 267 | print ("\nConstructing tokenizer") 268 | tokenizer = Wav2Vec2CTCTokenizer("./vocab.%s.tmp.json" % language, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|") 269 | 270 | print ("\nFeature Extractor") 271 | feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True) 272 | 273 | print ("\nConstructing Processor") 274 | processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) 275 | processor.save_pretrained(output_dir) 276 | 277 | print ("\nPreparing the training dataset") 278 | common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, batch_size=8, num_proc=4, batched=True) 279 | 280 | print ("\nPreparing test set") 281 | common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, batch_size=8, num_proc=4, batched=True) 282 | 283 | common_voice_train.save_to_disk(training_dataset_dir_path) 284 | common_voice_test.save_to_disk(test_dataset_dir_path) 285 | 286 | # 287 | shutil.copy(os.path.join(dataset_cache_dir, 'dataset_config.txt'), os.path.join(output_dir, "/")) 288 | 289 | print ("\nSetting up data collator") 290 | data_collator = DataCollatorCTCWithPadding(processor=processor, 
padding=True) 291 | 292 | wer_metric = load_metric("wer") 293 | cer_metric = load_metric("cer") 294 | 295 | print ("\nLoading pre-trained facebook/wav2vec2-large-xlsr model") 296 | # see https://huggingface.co/transformers/model_doc/wav2vec2.html?highlight=mask_time_prob#transformers.Wav2Vec2Config 297 | model = Wav2Vec2ForCTC.from_pretrained( 298 | pre_trained_model_name, 299 | #"facebook/wav2vec2-large-xlsr-53", 300 | #"facebook/wav2vec2-xls-r-300m", 301 | #"facebook/wav2vec2-xls-r-1b", 302 | #"facebook/wav2vec2-xls-r-2b", 303 | activation_dropout=0.055, 304 | attention_dropout=0.055, 305 | hidden_dropout=0.047, 306 | feat_proj_dropout=0.04, 307 | mask_time_prob=0.082, 308 | layerdrop=0.041, 309 | gradient_checkpointing=True, 310 | ctc_loss_reduction="mean", 311 | pad_token_id=processor.tokenizer.pad_token_id, 312 | vocab_size=len(processor.tokenizer) 313 | ) 314 | 315 | model.freeze_feature_encoder() 316 | 317 | 318 | # see https://huggingface.co/transformers/main_classes/trainer.html?highlight=group_by_length#transformers.TrainingArguments 319 | training_args = TrainingArguments( 320 | output_dir=output_dir, 321 | group_by_length=True, 322 | #auto_find_batch_size=True, 323 | per_device_train_batch_size=64, 324 | gradient_accumulation_steps=2, 325 | evaluation_strategy="steps", 326 | num_train_epochs=30, 327 | save_steps=400, 328 | eval_steps=400, 329 | logging_steps=400, 330 | learning_rate=3e-4, 331 | warmup_steps=800, 332 | save_total_limit=5, 333 | save_strategy='steps', 334 | logging_dir=logging_dir 335 | ) 336 | 337 | print ("\nConstructing trainer") 338 | trainer = Trainer( 339 | model=model, 340 | data_collator=data_collator, 341 | args=training_args, 342 | compute_metrics=compute_metrics, 343 | train_dataset=common_voice_train, 344 | eval_dataset=common_voice_test, 345 | tokenizer=processor.feature_extractor, 346 | ) 347 | 348 | print ("\nTraining...") 349 | print ("See: %s" % output_dir) 350 | trainer.train() 351 | 352 | print ("\n\n") 353 | 354 | # copy config and model binary file 355 | #publish.export_checkpoint(output_dir) 356 | trainer.save_model() 357 | 358 | print ("\n\nModel trained. 
See %s" % output_dir) 359 | 360 | return output_dir 361 | 362 | 363 | if __name__ == "__main__": 364 | train() 365 | 366 | -------------------------------------------------------------------------------- /train/pre-train/.dockerignore: -------------------------------------------------------------------------------- 1 | homedir 2 | data 3 | models 4 | logs 5 | -------------------------------------------------------------------------------- /train/pre-train/.gitignore: -------------------------------------------------------------------------------- 1 | models 2 | homedir 3 | logs 4 | data 5 | -------------------------------------------------------------------------------- /train/pre-train/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.4.0-cudnn8-devel-ubuntu20.04 2 | 3 | LABEL maintainer="techiaith" 4 | LABEL repository="wav2vec2-xlsr-pretrain-cy" 5 | 6 | ARG DEBIAN_FRONTEND=noninteractive 7 | ENV TZ=Europe/London 8 | 9 | RUN apt update -q \ 10 | && apt install -y -qq tzdata bash build-essential git curl wget software-properties-common \ 11 | vim ca-certificates libffi-dev libssl-dev libsndfile1 libbz2-dev liblzma-dev locales \ 12 | libboost-all-dev libboost-tools-dev libboost-thread-dev cmake \ 13 | python3 python3-setuptools python3-pip cython 14 | 15 | RUN python3 -m pip install --upgrade pip 16 | 17 | # Set the locale 18 | RUN locale-gen cy_GB.UTF-8 19 | ENV LANG cy_GB.UTF-8 20 | ENV LANGUAGE cy_GB:en 21 | ENV LC_ALL cy_GB.UTF-8 22 | 23 | RUN mkdir -p /wav2vec2-pre-train 24 | WORKDIR /wav2vec2-pre-train 25 | 26 | RUN pip3 install torch==1.9.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html 27 | COPY python/requirements.txt /wav2vec2-pre-train/ 28 | RUN pip3 install -r requirements.txt 29 | 30 | COPY python /wav2vec2-pre-train 31 | WORKDIR /wav2vec2-pre-train 32 | 33 | -------------------------------------------------------------------------------- /train/pre-train/Makefile: -------------------------------------------------------------------------------- 1 | default: build 2 | 3 | build: 4 | docker build --rm -t techiaith/wav2vec2-pre-train-${USER} . 
5 | 6 | run: 7 | mkdir -p homedir/datasets 8 | mkdir -p logs 9 | docker run --name techiaith-wav2vec2-pre-train-${USER}-tensorboard \ 10 | --restart=always \ 11 | -v ${PWD}/logs/:/logs \ 12 | -d -p 6007:6006 \ 13 | tensorflow/tensorflow \ 14 | tensorboard --bind_all --logdir /logs 15 | 16 | docker run --gpus all --name techiaith-wav2vec2-pre-train-${USER} \ 17 | -it \ 18 | --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ 19 | -v ${PWD}/homedir/:/root \ 20 | -v ${PWD}/data/:/data \ 21 | -v ${PWD}/logs/:/logs \ 22 | -v ${PWD}/models/:/models \ 23 | -v ${PWD}/python/:/wav2vec2-pre-train \ 24 | techiaith/wav2vec2-pre-train-${USER} bash 25 | 26 | stop: 27 | -docker stop techiaith-wav2vec2-pre-train-${USER}-tensorboard 28 | -docker stop techiaith-wav2vec2-pre-train-${USER} 29 | -docker rm techiaith-wav2vec2-pre-train-${USER}-tensorboard 30 | -docker rm techiaith-wav2vec2-pre-train-${USER} 31 | 32 | clean: stop 33 | -docker rmi techiaith/wav2vec2-pre-train-${USER} 34 | sudo rm -rf homedir 35 | sudo rm -rf logs 36 | 37 | tensorboard: 38 | python3 -m tensorboard.main serve --bind_all --logdir=logs/ 39 | 40 | -------------------------------------------------------------------------------- /train/pre-train/README.md: -------------------------------------------------------------------------------- 1 | # Rhag hyfforddi modelau adnabod lleferydd 2 | 3 | [(**click here to read the README in English**)](README_en.md) 4 | 5 | Ar y moment, mae'r modelau adnabod lleferydd Cymraeg gorau o'r repo hwn wedi eu creu drwy fireinio modelau mae Facebook/Meta AI wedi eu rhag-hyfforddi o sain leferydd wahanol ieithoedd, gan gynnwys mymryn o Gymraeg, yn unig (h.y. heb angen trawsgrifiadau hefyd). Yn y papur gwreiddiol ar y dull wav2vec2 ["wav2vec2: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) gan Alexei Baevski, Henry Zhou, Abdelrahman Mohamed a Michael Auli, profwyd bod modd cael WER cyn lleied â 4.8 ar set profi Saesneg LibriSpeech ar ôl rhag-hyfforddi ar 53,000 awr o sain lleferydd Saesneg yn unig. Yn y ffolder hwn rydym am greu sgriptiau i greu modelau sylfaenol penodol i’r Gymraeg mewn ymgais i ostwng sgorau WER hyd yn oed ymhellach. 6 | 7 | Mae'r gwaith yn defnyddio lawer ar adnoddau a dogfennaeth gan y cwmni HuggingFace: 8 | 9 | https://github.com/huggingface/transformers/tree/main/examples/pytorch/speech-pretraining 10 | 11 | Dim ond model cychwynnol/arbrofol sydd wedi ei rhag-hyfforddi gyda'r sgriptiau hyn hyd yn hyn, gan ddefnyddio lleferydd Saesneg o is-setiau lleiaf LibriSpeech (`validation` a `test`), ac yna 184 awr a 47 munud o leferydd Cymraeg sydd wedi ei chrafu o amryw o fideos ar YouTube. Mae'r sgript [`build_youtube_playlists_corpus.sh](../../inference/python/build_youtube_playlists_corpus.sh) yn rhestru'r playlists defnyddiwyd i nodi ba fideos defnyddir. Mae hefyd ar gael o wefan HuggingFace o 12 | 13 | https://huggingface.co/techiaith/wav2vec2-base-cy 14 | 15 | Megis prawf cysyniad yw'r gwaith hyd yn hyn, hyd nes byddwn ni wedi casglu miloedd o oriau o leferydd Cymraeg, yn hytrach na channoedd. Ar ol fireinio'r model o'r sgriptiau rhag-hyfforddi hyn ('wav2vec2-base-cy'), gweler [run_base-cy.sh](../fine-tune/python/run_base-cy.sh) gwelwyd WER uchel gyda set profi Common Voice yn ogystal ar set profi o fideos YouTube rydyn ni wedi eu drawsgrifio'n gywir. 
16 | 17 | | Set Profi | WER | CER | WER (+LM) | CER(+LM) | 18 | |--- |--- |--- |--- |--- | 19 | | CV10 | 94.83 | 83.55 | 92.31 | 82.25 | 20 | | YouTube | 95.43 | 90.26 | 93.60 | 89.33 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /train/pre-train/README_en.md: -------------------------------------------------------------------------------- 1 | # Pre-training speech recognition models 2 | 3 | [(**cliciwch yma i ddarllen y README yn Gymraeg**)](README.md) 4 | 5 | At the moment, the best Welsh speech recognition models from this repo have been created by fine-tuning models that Facebook/Meta AI pre-trained from speech audio alone in many different languages, including some Welsh (i.e. without also needing transcripts). In the original paper on the wav2vec2 method, ["wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations"](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed and Michael Auli, it was shown that a WER as low as 4.8 can be achieved on the LibriSpeech English test set if a model is first pre-trained on 53,000 hours of English speech audio alone. In this folder we are creating scripts to pre-train models on Welsh (and some English) speech alone, in an attempt to lower the WER scores of subsequent fine-tuned models even further. 6 | 7 | The work draws heavily on resources and documentation from HuggingFace: 8 | 9 | https://github.com/huggingface/transformers/tree/main/examples/pytorch/speech-pretraining 10 | 11 | Only an initial/experimental base model has been pre-trained with these scripts so far, using English speech from LibriSpeech's smallest subsets (`validation` and `test`) and 184 hours and 47 minutes of Welsh speech scraped from various videos on YouTube. The script [`build_youtube_playlists_corpus.sh`](../../inference/python/build_youtube_playlists_corpus.sh) lists the playlists used to identify which videos are used. The resulting pre-trained base model is available from the HuggingFace website at 12 | 13 | https://huggingface.co/techiaith/wav2vec2-base-cy 14 | 15 | The work so far is a proof of concept. Until we have collected thousands of hours of Welsh speech, rather than hundreds, the WER scores obtained after fine-tuning this model (see [run_base-cy.sh](../fine-tune/python/run_base-cy.sh)), shown below, will remain very high. We tested with the Welsh Common Voice test set as well as with a test set of YouTube videos whose transcriptions we have corrected. 
16 | 17 | | Set Profi | WER | CER | WER (+LM) | CER(+LM) | 18 | |--- |--- |--- |--- |--- | 19 | | CV10 | 94.83 | 83.55 | 92.31 | 82.25 | 20 | | YouTube | 95.43 | 90.26 | 93.60 | 89.33 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /train/pre-train/python/.gitignore: -------------------------------------------------------------------------------- 1 | dataset_url.py -------------------------------------------------------------------------------- /train/pre-train/python/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.21.0 2 | datasets == 2.4.0 3 | torchaudio 4 | accelerate == 0.12.0 5 | librosa 6 | tensorboard 7 | -------------------------------------------------------------------------------- /train/pre-train/python/run.sh: -------------------------------------------------------------------------------- 1 | accelerate launch run_wav2vec2_pretraining_no_trainer.py \ 2 | --dataset_name="librispeech_asr" \ 3 | --dataset_config_names clean clean \ 4 | --dataset_split_names validation test \ 5 | --model_name_or_path="patrickvonplaten/wav2vec2-base-v2" \ 6 | --output_dir="/root/wav2vec2-base-cy" \ 7 | --max_train_steps="20000" \ 8 | --num_warmup_steps="32000" \ 9 | --gradient_accumulation_steps="8" \ 10 | --learning_rate="0.005" \ 11 | --weight_decay="0.01" \ 12 | --max_duration_in_seconds="20.0" \ 13 | --min_duration_in_seconds="2.0" \ 14 | --logging_steps="1" \ 15 | --saving_steps="10000" \ 16 | --per_device_train_batch_size="8" \ 17 | --per_device_eval_batch_size="8" \ 18 | --adam_beta1="0.9" \ 19 | --adam_beta2="0.98" \ 20 | --adam_epsilon="1e-06" \ 21 | --gradient_checkpointing 22 | -------------------------------------------------------------------------------- /train/pre-train/python/youtube_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datasets 3 | 4 | from pathlib import Path 5 | 6 | _DESCRIPTION = """\ 7 | YouTubeDataset is a dataset built internally at Bangor University but 8 | which can be built by anyone using the scripts at : 9 | 10 | inference/python/build-youtube_playlists_corpus.sh 11 | 12 | that downloads all videos from selected playlists, extracts 13 | audio and segments into short clips containing speech. 
14 | """ 15 | 16 | from dataset_url import URL 17 | _DL_URL = URL 18 | 19 | 20 | class YouTubeDatasetConfig(datasets.BuilderConfig): 21 | 22 | def __init__(self, name, **kwargs): 23 | description = f"Dataset of clippings from YouTube" 24 | super(YouTubeDatasetConfig, self).__init__( 25 | name=name, version=datasets.Version("1.0",""), 26 | description=description, 27 | **kwargs 28 | ) 29 | 30 | 31 | class YouTubeDataset(datasets.GeneratorBasedBuilder): 32 | 33 | def _info(self): 34 | 35 | features=datasets.Features( 36 | { 37 | "audio": datasets.Audio(sampling_rate=16_000), 38 | } 39 | ) 40 | 41 | return datasets.DatasetInfo( 42 | description=_DESCRIPTION, 43 | features=features, 44 | supervised_keys=None 45 | ) 46 | 47 | def _split_generators(self, dl_manager): 48 | dl_path=dl_manager.download_and_extract(_DL_URL) 49 | 50 | abs_path_to_data = os.path.join(dl_path) 51 | abs_path_to_clips = os.path.join(abs_path_to_data, "clips") 52 | 53 | generated_splits=[ 54 | datasets.SplitGenerator( 55 | name=datasets.Split.TRAIN, 56 | gen_kwargs={ 57 | "filepath": os.path.join(abs_path_to_data, "clips.tsv"), 58 | "path_to_clips": abs_path_to_clips, 59 | }, 60 | ), 61 | ] 62 | return generated_splits 63 | 64 | 65 | def _generate_examples(self, filepath, path_to_clips): 66 | 67 | with open(filepath, encoding='utf-8') as f: 68 | lines = f.readlines() 69 | header_line = lines[0] 70 | 71 | for id_, line in enumerate(lines[1:]): 72 | field_values = line.strip().split("\t") 73 | audio_file_path=os.path.join(path_to_clips, field_values[0]) 74 | with open(audio_file_path ,'rb') as audio_file: 75 | audio = {"path": audio_file_path, "bytes":audio_file.read()} 76 | yield id_, { "audio": audio } 77 | --------------------------------------------------------------------------------