├── whisper_finetune
│   ├── readme.md
│   ├── README.md
│   ├── requirements.txt
│   ├── utils.py
│   ├── config.py
│   ├── inference.py
│   ├── test.py
│   ├── finetune.py
│   ├── model.py
│   └── dataset.py
├── images
│   ├── wav2vec.png
│   └── whisper.png
├── README.md
├── demo
│   ├── templates
│   │   └── home.html
│   ├── app.py
│   └── static
│       ├── css
│       │   └── style.css
│       └── js
│           └── recorder.js
├── .gitignore
└── notebooks
    ├── [WhisperLM]BeamSearchLM.ipynb
    └── [WhisperLM]_KenLMipynb.ipynb
--------------------------------------------------------------------------------
/whisper_finetune/readme.md:
--------------------------------------------------------------------------------
 1 | 
 2 | 
--------------------------------------------------------------------------------
/whisper_finetune/README.md:
--------------------------------------------------------------------------------
 1 | # vi_whisper_finetuning
 2 | 
--------------------------------------------------------------------------------
/images/wav2vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKAB/whisper-finetune-vietnamese/HEAD/images/wav2vec.png
--------------------------------------------------------------------------------
/images/whisper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKAB/whisper-finetune-vietnamese/HEAD/images/whisper.png
--------------------------------------------------------------------------------
/whisper_finetune/requirements.txt:
--------------------------------------------------------------------------------
 1 | evaluate==0.3.0
 2 | flake8==5.0.4
 3 | isort==5.10.1
 4 | jiwer==2.5.1
 5 | pytorch-lightning==1.7.7
 6 | torchaudio==0.12.1
 7 | whisper @ git+https://github.com/openai/whisper.git@9e653bd0ea0f1e9493cb4939733e9de249493cfb
 8 | gdown==4.4.0
--------------------------------------------------------------------------------
/whisper_finetune/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torchaudio
 3 | import torchaudio.transforms as at
 4 | 
 5 | 
 6 | def load_wave(wave_path, sample_rate: int = 16000) -> torch.Tensor:
 7 |     waveform, sr = torchaudio.load(wave_path, normalize=True)
 8 |     if sample_rate != sr:
 9 |         waveform = at.Resample(sr, sample_rate)(waveform)
10 |     return waveform
--------------------------------------------------------------------------------
/whisper_finetune/config.py:
--------------------------------------------------------------------------------
 1 | class Config:
 2 |     learning_rate = 0.0001
 3 |     weight_decay = 0.01
 4 |     adam_epsilon = 1e-8
 5 |     warmup_steps = 2
 6 |     batch_size = 16
 7 |     num_worker = 2
 8 |     num_train_epochs = 10
 9 |     gradient_accumulation_steps = 1
10 |     sample_rate = 16000
11 |     log_output_dir = "logs"
12 |     check_output_dir = "artifacts"
13 |     train_name = "whisper"
14 |     train_id = "fluers"
15 |     model_name = "base"
16 |     lang = "vi"
17 |     checkpoint_path = ""  # use the original pretrained model if this path is invalid
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Whisper Vietnamese finetuning
 3 | 
 4 | \[In case you are looking for a Vietnamese ASR model, have a look at [HKAB/whisper-finetune-1-notebook](https://github.com/HKAB/whisper-finetune-1-notebook) \]
 5 | 
 6 | This repository contains:
 7 | - Notebooks for finetuning, inference, and N-gram language model generation.
 8 | - A demo for Whisper and Wav2vec.
 9 | 
10 | 
11 | 
12 | 
13 | ## Installation
14 | 
15 | To use beam search with a language model, install Whisper from my GitHub fork:
16 | ```bash
17 | pip install git+https://github.com/HKAB/whisper.git
18 | ```
19 | 
20 | ## Run
21 | 
22 | For training & inference:
23 | 
24 | ```shell
25 | python finetune.py --model_name base \
26 |                    --dataset vin100h
27 | 
28 | python test.py --checkpoint_path path/to/ckpt \
29 |                --dataset vin100h \
30 |                --model_name base
31 | 
32 | ```
33 | 
34 | To generate a language model with KenLM, use the notebooks in the `notebooks` folder; a decoding sketch with the resulting LM is shown below.
35 | 
36 | We share a [checkpoint](https://drive.google.com/file/d/1vSaQjvjljToYlekm_GvlOkJYGLQA5EdJ/view?usp=sharing) (*base, batch_size 1, gradient accumulation steps 10, epoch 14, lr 0.0001*).
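Once an n-gram model has been built with KenLM, the patched decoder can rescore the beam with it. The following is a minimal sketch that mirrors the options used in `notebooks/[WhisperLM]BeamSearchLM.ipynb`; the audio path, LM path, and the `lm_alpha`/`lm_beta` weights are illustrative values that need tuning:

```python
import whisper

model = whisper.load_model("base")  # optionally load the finetuned weights first

audio = whisper.pad_or_trim(whisper.load_audio("sample.wav"))
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# withlm / lm_* options exist only in the patched fork installed above
options = whisper.DecodingOptions(
    fp16=False, language="vi", without_timestamps=True,
    withlm=True, beam_size=1, patience=1.0,
    lm_path="dataset_tokenized_3gram.arpa", lm_alpha=3.0, lm_beta=0.0,
)
result = whisper.decode(model, mel, options)
print(result.text)
```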
37 | ## Demo
38 | 
39 | 
40 | ![Whisper](images/whisper.png "Whisper")
41 | 
42 | ![Wav2vec](images/wav2vec.png "Wav2vec")
43 | ## Contributing
44 | 
45 | - We finetune Whisper on a 100-hour Vietnamese speech dataset.
46 | - We implement beam search with a KenLM language model (BeamSearchWithLM) and observed positive results.
47 | 
48 | ## Result
49 | 
50 | | Methods (WER)             | Fleurs  | Vin100h (Full) |
51 | |---------------------------|---------|----------------|
52 | | Whisper (base)            | 50.38%  | 50.33%         |
53 | | Finetune Whisper (base)   | 28.68%  | 33%            |
54 | | Whisper (large) one shot  | -       | 26.87%         |
55 | 
--------------------------------------------------------------------------------
/whisper_finetune/inference.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     import tensorflow  # required in Colab to avoid protobuf compatibility issues
 3 | except ImportError:
 4 |     pass
 5 | 
 6 | import whisper
 7 | import torch
 8 | import argparse
 9 | 
10 | from config import Config
11 | from model import WhisperModelModule
12 | from utils import load_wave
13 | 
14 | 
15 | 
16 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17 | 
18 | if __name__=="__main__":
19 |     parser = argparse.ArgumentParser()
20 |     parser.add_argument('--checkpoint_path', type=str, default='', help='path of checkpoint; if not set, use the original pretrained model')
21 |     parser.add_argument('--audio_path', type=str, default='test01.wav', help='the audio file for inference')
22 | 
23 |     args = parser.parse_args()
24 |     config = Config()
25 |     config.checkpoint_path = args.checkpoint_path
26 | 
27 |     module = WhisperModelModule(config)
28 |     try:
29 |         state_dict = torch.load(config.checkpoint_path)
30 |         state_dict = state_dict["state_dict"]
31 |         module.load_state_dict(state_dict)
32 |         print(f"load checkpoint successfully from {config.checkpoint_path}")
33 |     except Exception as e:
34 |         print(e)
35 |         print(f"load checkpoint failed, using original weights of the {config.model_name} model")
36 |     model = module.model
37 |     model.to(device)
38 | 
39 |     audio = whisper.load_audio(args.audio_path)
40 |     audio = whisper.pad_or_trim(audio)
41 | 
42 |     # make log-Mel spectrogram and move to the same device as the model
43 |     mel = whisper.log_mel_spectrogram(audio).to(model.device)
44 | 
45 |     # decode the audio
46 |     options = whisper.DecodingOptions(
47 |         language="vi", without_timestamps=True, fp16=torch.cuda.is_available()
48 |     )
49 | 
50 |     result = model.decode(mel, options)
51 |     print('Predicted:', result.text)
52 | 
--------------------------------------------------------------------------------
/whisper_finetune/test.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     import tensorflow  # required in Colab to avoid protobuf compatibility issues
 3 | except ImportError:
 4 |     pass
 5 | 
 6 | 
 7 | import jiwer
 8 | import whisper
 9 | import torch
10 | import argparse
11 | from tqdm import tqdm
12 | import pandas as pd
13 | 
14 | from config import Config
15 | from dataset import load_dataset, WhisperDataCollatorWithPadding
16 | from model import WhisperModelModule
17 | 
18 | 
19 | 
20 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21 | 
22 | if __name__=="__main__":
23 |     parser = argparse.ArgumentParser()
24 |     parser.add_argument('--checkpoint_path', type=str, default='', help='path of checkpoint; if not set, use the original pretrained model')
25 |     parser.add_argument('--dataset_name', type=str, default='fluers', help='the dataset for evaluation; includes fluers, vin100h, vlsp2019')
26 |     parser.add_argument('--model_name', type=str, default='tiny', help='model name')
27 | 
28 |     args = parser.parse_args()
29 |     config = Config()
30 |     config.checkpoint_path = args.checkpoint_path
31 |     config.model_name = args.model_name
32 | 
33 |     module = WhisperModelModule(config)
34 |     try:
35 |         state_dict = torch.load(config.checkpoint_path)
36 |         state_dict = state_dict["state_dict"]
37 |         module.load_state_dict(state_dict)
38 |         print(f"load checkpoint successfully from {config.checkpoint_path}")
39 |     except Exception as e:
40 |         print(e)
41 |         print(f"load checkpoint failed, using original weights of the {config.model_name} model")
42 |     model = module.model
43 |     model.to(device)
44 | 
45 |     _, valid_dataset = load_dataset(args.dataset_name, test=True)
46 |     test_loader = torch.utils.data.DataLoader(
47 |         valid_dataset,
48 |         batch_size=config.batch_size,
49 |         num_workers=config.num_worker,
50 |         collate_fn=WhisperDataCollatorWithPadding(),
51 |     )
52 | 
53 |     # decode the audio
54 |     options = whisper.DecodingOptions(
55 |         language="vi", without_timestamps=True, fp16=torch.cuda.is_available()
56 |     )
57 | 
58 |     hypotheses = []
59 |     references = []
60 |     print(model.device)
61 |     for sample in tqdm(test_loader):
62 |         mels = sample["input_ids"].to(model.device)
63 |         texts = sample["texts"]
64 |         results = model.decode(mels, options)
65 |         hypotheses.extend([result.text for result in results])
66 |         references.extend(texts)
67 | 
68 |     data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
69 | 
70 |     data["hypothesis_clean"] = [
71 |         text.lower() for text in data["hypothesis"]
72 |     ]
73 |     data["reference_clean"] = [
74 |         text.lower() for text in data["reference"]
75 |     ]
76 | 
77 |     data.to_csv('results.csv')
78 |     for i in range(min(60, len(data))):  # print a few sample predictions
79 |         print('Reference:', data["reference_clean"][i])
80 |         print('Predict:', data["hypothesis_clean"][i])
81 |         print('\n')
82 |     wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))
83 | 
84 |     print(f"WER: {wer * 100:.2f} %")
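Note that `test.py` and `inference.py` load the finetuned checkpoint through `WhisperModelModule`, while the demo app and the notebooks load it into a plain `whisper` model. The Lightning checkpoint stores the weights under `state_dict` with a `model.` prefix, so that prefix has to be stripped first; a small sketch of that conversion (the checkpoint filename is only an example):

```python
import torch
import whisper

model = whisper.load_model("base")

# Lightning saves weights under "state_dict" with keys prefixed by "model."
state_dict = torch.load("checkpoint-epoch=0013.ckpt", map_location="cpu")["state_dict"]
new_state_dict = {k.replace("model.", ""): v for k, v in state_dict.items()}
model.load_state_dict(new_state_dict)
```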
--------------------------------------------------------------------------------
/whisper_finetune/finetune.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import argparse
  3 | from pathlib import Path
  4 | import torch
  5 | from config import Config
  6 | 
  7 | try:
  8 |     import tensorflow  # required in Colab to avoid protobuf compatibility issues
  9 | except ImportError:
 10 |     pass
 11 | 
 12 | from dataset import load_dataset, WhisperDataCollatorWithPadding
 13 | 
 14 | from pytorch_lightning import Trainer
 15 | from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
 16 | from pytorch_lightning.loggers import TensorBoardLogger
 17 | from model import WhisperModelModule
 18 | 
 19 | DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 20 | 
 21 | if __name__=="__main__":
 22 |     parser = argparse.ArgumentParser()
 23 | 
 24 |     parser.add_argument('--checkpoint_path', type=str, default='', help='path of checkpoint; if not set, use the original pretrained model')
 25 |     parser.add_argument('--model_name', type=str, default='base', help='model name: tiny, small, medium, base, large')
 26 |     parser.add_argument('--dataset', type=str, default='fluers', help='the dataset for finetuning; includes fluers, vin100h, vlsp2019')
 27 |     parser.add_argument('--lang', type=str, default='vi', help='language: vi, en')
 28 |     parser.add_argument('--lr', type=float, default=0.0005, help='learning rate')
 29 |     parser.add_argument('--epoch', type=int, default=10, help='number of epochs for finetuning')
 30 |     parser.add_argument('--batch_size', type=int, default=16, help='batch size')
 31 | 
 32 |     args = parser.parse_args()
 33 | 
 34 |     # Load default config from config.py and override it from args
 35 |     config = Config()
 36 |     config.model_name = args.model_name
 37 |     config.lang = args.lang
 38 |     config.learning_rate = args.lr
 39 |     config.num_train_epochs = args.epoch
 40 |     config.batch_size = args.batch_size
 41 |     config.checkpoint_path = args.checkpoint_path
 42 | 
 43 |     print(f"""Finetuning Whisper with new config:
 44 |         checkpoint_path: %s,
 45 |         dataset: %s,
 46 |         model_name: %s,
 47 |         lang: %s,
 48 |         learning_rate: %.5f,
 49 |         num_finetune_epochs: %d,
 50 |         batch_size: %d""" % ("No checkpoint" if config.checkpoint_path == "" else config.checkpoint_path, args.dataset, config.model_name, config.lang, config.learning_rate, config.num_train_epochs, config.batch_size))
 51 | 
 52 |     # Load dataset for finetuning
 53 |     if config.lang == "vi":
 54 |         train_dataset, valid_dataset = load_dataset(args.dataset)
 55 |     else:
 56 |         raise ValueError("Only Vietnamese datasets are supported, please set --lang to vi!")
 57 | 
 58 |     train_loader = torch.utils.data.DataLoader(
 59 |         train_dataset,
 60 |         batch_size=config.batch_size,
 61 |         num_workers=config.num_worker,
 62 |         collate_fn=WhisperDataCollatorWithPadding(),
 63 |     )
 64 |     valid_loader = torch.utils.data.DataLoader(
 65 |         valid_dataset,
 66 |         batch_size=config.batch_size,
 67 |         num_workers=config.num_worker,
 68 |         collate_fn=WhisperDataCollatorWithPadding(),
 69 |     )
 70 | 
 71 | 
 72 |     Path(os.path.join(os.getcwd(), config.log_output_dir)).mkdir(exist_ok=True)
 73 |     Path(os.path.join(os.getcwd(), config.check_output_dir)).mkdir(exist_ok=True)
 74 | 
 75 |     # Log and checkpoint
 76 |     tflogger = TensorBoardLogger(
 77 |         save_dir=config.log_output_dir, name=config.train_name, version=config.train_id
 78 |     )
 79 | 
 80 |     checkpoint_callback = ModelCheckpoint(
 81 |         dirpath=f"{config.check_output_dir}/checkpoint",
 82 |         filename="checkpoint-{epoch:04d}",
 83 |         save_top_k=1,  # -1: save all checkpoints, 1: save only the best one
 84 |     )
 85 | 
 86 |     # callback list
 87 |     callback_list = [checkpoint_callback, LearningRateMonitor(logging_interval="epoch")]
 88 |     model = WhisperModelModule(config, train_loader, valid_loader)
 89 | 
 90 |     # Trainer
 91 |     trainer = Trainer(
 92 |         precision=16,
 93 |         accelerator=DEVICE,
 94 |         max_epochs=config.num_train_epochs,
 95 |         accumulate_grad_batches=config.gradient_accumulation_steps,
 96 |         logger=tflogger,
 97 |         callbacks=callback_list,
 98 |     )
 99 | 
100 |     trainer.fit(model)
--------------------------------------------------------------------------------
/demo/templates/home.html:
--------------------------------------------------------------------------------
(The HTML markup of this template is not preserved in this listing; only its visible text survives. The page title is "Hello, world!" and the body shows a "🎶 Speech2Text 📑" heading, the recording controls, the status messages "Audio is playing . . ." and "Audio is handling by server . . .", a "Settings" block for choosing which model the demo uses, a "Replay" control, the transcription output area, and the fallback notice "To record audio, use browsers like Chrome and Firefox that support audio recording." The recording logic lives in static/js/recorder.js and the styling in static/css/style.css.)
81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /demo/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, redirect, url_for, request, jsonify 2 | 3 | # import cv2 as cv2 4 | # import numpy as np 5 | # import urllib.request 6 | # from PIL import Image 7 | # import io 8 | # from scipy.io import wavfile 9 | 10 | # from pygame import mixer 11 | from werkzeug.utils import secure_filename 12 | import os 13 | import whisper 14 | import torch 15 | from datetime import datetime 16 | 17 | #wav2vec 18 | from transformers.file_utils import cached_path, hf_bucket_url 19 | import zipfile 20 | from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC 21 | import soundfile as sf 22 | import kenlm 23 | from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel 24 | import librosa 25 | 26 | # from odoo.http import request 27 | app = Flask(__name__, template_folder='templates') 28 | 29 | # model = whisper.load_model("tiny") 30 | model = whisper.load_model("base") 31 | state_dict = torch.load("checkpoint-epoch=0013.ckpt", map_location="cpu")['state_dict'] 32 | # change all key of state_dict to remove "model." 33 | new_state_dict = {k.replace("model.", ""): v for k, v in state_dict.items()} 34 | model.load_state_dict(new_state_dict) 35 | print(f"Model whisper base loaded") 36 | 37 | 38 | cache_dir = './cache/' 39 | processor_w2v = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir) 40 | model_w2v = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir) 41 | # lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip') 42 | # lm_file = cached_path(lm_file,cache_dir=cache_dir) 43 | # with zipfile.ZipFile(lm_file, 'r') as zip_ref: 44 | # zip_ref.extractall(cache_dir) 45 | lm_file = cache_dir + 'vi_lm_4grams.bin' 46 | 47 | def get_decoder_ngram_model(tokenizer, ngram_lm_path): 48 | vocab_dict = tokenizer.get_vocab() 49 | sort_vocab = sorted((value, key) for (key, value) in vocab_dict.items()) 50 | vocab = [x[1] for x in sort_vocab][:-2] 51 | vocab_list = vocab 52 | # convert ctc blank character representation 53 | vocab_list[tokenizer.pad_token_id] = "" 54 | # replace special characters 55 | vocab_list[tokenizer.unk_token_id] = "" 56 | # vocab_list[tokenizer.bos_token_id] = "" 57 | # vocab_list[tokenizer.eos_token_id] = "" 58 | # convert space character representation 59 | vocab_list[tokenizer.word_delimiter_token_id] = " " 60 | # specify ctc blank char index, since conventially it is the last entry of the logit matrix 61 | alphabet = Alphabet.build_alphabet(vocab_list, ctc_token_idx=tokenizer.pad_token_id) 62 | lm_model = kenlm.Model(ngram_lm_path) 63 | decoder = BeamSearchDecoderCTC(alphabet, 64 | language_model=LanguageModel(lm_model)) 65 | return decoder 66 | 67 | ngram_lm_model = get_decoder_ngram_model(processor_w2v.tokenizer, lm_file) 68 | print("Huggingface model loaded") 69 | 70 | @app.route('/') 71 | def home(): 72 | return render_template('home.html') 73 | 74 | 75 | @app.route('/transcribe-whisper', methods=['POST']) 76 | def transcribe(): 77 | files = request.files 78 | file = files.get('file') 79 | 80 | file_name = secure_filename(str(datetime.now())) + ".wav" 81 | file_path = os.path.join("audio_folder", file_name) 82 | file.save(file_path) 83 | 84 | audio = whisper.load_audio(file_path) 
85 | audio = whisper.pad_or_trim(audio) 86 | print("Audio loaded and trimmed") 87 | 88 | mel = whisper.log_mel_spectrogram(audio).to(model.device) 89 | # options = whisper.DecodingOptions(fp16 = False, withlm=F, beam_size=1, 90 | # patience=1.0, lm_path="../../dataset_tokenized_3gram.binary", lm_alpha=0.75, lm_beta=0.0, 91 | # without_timestamps=True, language="vi") 92 | options = whisper.DecodingOptions(fp16 = False, language="vi", without_timestamps=True) 93 | print("Model decoding...") 94 | result = whisper.decode(model, mel, options) 95 | 96 | return jsonify(result.text) 97 | 98 | @app.route('/transcribe-w2v', methods=['POST']) 99 | def transcribe_w2v(): 100 | files = request.files 101 | file = files.get('file') 102 | 103 | file_name = secure_filename(str(datetime.now())) + ".wav" 104 | file_path = os.path.join("audio_folder", file_name) 105 | file.save(file_path) 106 | 107 | speech, sr = librosa.load(file_path) 108 | speech = librosa.resample(speech, orig_sr=sr, target_sr=16000) 109 | input_values = processor_w2v( 110 | speech, 111 | sampling_rate=16000, 112 | return_tensors="pt").input_values 113 | logits = model_w2v(input_values).logits[0] 114 | pred_ids = torch.argmax(logits, dim=-1) 115 | 116 | print("Model decoding...") 117 | beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500) 118 | return jsonify(beam_search_output) 119 | 120 | if __name__ == '__main__': 121 | app.run(debug=True) 122 | 123 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks,audio,windows 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,jupyternotebooks,audio,windows 3 | 4 | ### Audio ### 5 | *.aif 6 | *.aiff 7 | *.iff 8 | *.m3u 9 | *.m4a 10 | *.mid 11 | *.mp3 12 | *.mpa 13 | *.ra 14 | *.wav 15 | *.wma 16 | *.ogg 17 | *.flac 18 | 19 | ### JupyterNotebooks ### 20 | # gitignore template for Jupyter Notebooks 21 | # website: http://jupyter.org/ 22 | 23 | .ipynb_checkpoints 24 | */.ipynb_checkpoints/* 25 | 26 | # IPython 27 | profile_default/ 28 | ipython_config.py 29 | 30 | # Remove previous ipynb_checkpoints 31 | # git rm -r .ipynb_checkpoints/ 32 | 33 | ### Python ### 34 | # Byte-compiled / optimized / DLL files 35 | __pycache__/ 36 | *.py[cod] 37 | *$py.class 38 | 39 | # C extensions 40 | *.so 41 | 42 | # Distribution / packaging 43 | .Python 44 | build/ 45 | develop-eggs/ 46 | dist/ 47 | downloads/ 48 | eggs/ 49 | .eggs/ 50 | lib/ 51 | lib64/ 52 | parts/ 53 | sdist/ 54 | var/ 55 | wheels/ 56 | share/python-wheels/ 57 | *.egg-info/ 58 | .installed.cfg 59 | *.egg 60 | MANIFEST 61 | 62 | # PyInstaller 63 | # Usually these files are written by a python script from a template 64 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
65 | *.manifest 66 | *.spec 67 | 68 | # Installer logs 69 | pip-log.txt 70 | pip-delete-this-directory.txt 71 | 72 | # Unit test / coverage reports 73 | htmlcov/ 74 | .tox/ 75 | .nox/ 76 | .coverage 77 | .coverage.* 78 | .cache 79 | nosetests.xml 80 | coverage.xml 81 | *.cover 82 | *.py,cover 83 | .hypothesis/ 84 | .pytest_cache/ 85 | cover/ 86 | 87 | # Translations 88 | *.mo 89 | *.pot 90 | 91 | # Django stuff: 92 | *.log 93 | local_settings.py 94 | db.sqlite3 95 | db.sqlite3-journal 96 | 97 | # Flask stuff: 98 | instance/ 99 | .webassets-cache 100 | 101 | # Scrapy stuff: 102 | .scrapy 103 | 104 | # Sphinx documentation 105 | docs/_build/ 106 | 107 | # PyBuilder 108 | .pybuilder/ 109 | target/ 110 | 111 | # Jupyter Notebook 112 | 113 | # IPython 114 | 115 | # pyenv 116 | # For a library or package, you might want to ignore these files since the code is 117 | # intended to run in multiple environments; otherwise, check them in: 118 | # .python-version 119 | 120 | # pipenv 121 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 122 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 123 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 124 | # install all needed dependencies. 125 | #Pipfile.lock 126 | 127 | # poetry 128 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 129 | # This is especially recommended for binary packages to ensure reproducibility, and is more 130 | # commonly ignored for libraries. 131 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 132 | #poetry.lock 133 | 134 | # pdm 135 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 136 | #pdm.lock 137 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 138 | # in version control. 139 | # https://pdm.fming.dev/#use-with-ide 140 | .pdm.toml 141 | 142 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 143 | __pypackages__/ 144 | 145 | # Celery stuff 146 | celerybeat-schedule 147 | celerybeat.pid 148 | 149 | # SageMath parsed files 150 | *.sage.py 151 | 152 | # Environments 153 | .env 154 | .venv 155 | env/ 156 | venv/ 157 | ENV/ 158 | env.bak/ 159 | venv.bak/ 160 | 161 | # Spyder project settings 162 | .spyderproject 163 | .spyproject 164 | 165 | # Rope project settings 166 | .ropeproject 167 | 168 | # mkdocs documentation 169 | /site 170 | 171 | # mypy 172 | .mypy_cache/ 173 | .dmypy.json 174 | dmypy.json 175 | 176 | # Pyre type checker 177 | .pyre/ 178 | 179 | # pytype static type analyzer 180 | .pytype/ 181 | 182 | # Cython debug symbols 183 | cython_debug/ 184 | 185 | # PyCharm 186 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 187 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 188 | # and can be added to the global gitignore or merged into this file. For a more nuclear 189 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
190 | #.idea/ 191 | 192 | ### Windows ### 193 | # Windows thumbnail cache files 194 | Thumbs.db 195 | Thumbs.db:encryptable 196 | ehthumbs.db 197 | ehthumbs_vista.db 198 | 199 | # Dump file 200 | *.stackdump 201 | 202 | # Folder config file 203 | [Dd]esktop.ini 204 | 205 | # Recycle Bin used on file shares 206 | $RECYCLE.BIN/ 207 | 208 | # Windows Installer files 209 | *.cab 210 | *.msi 211 | *.msix 212 | *.msm 213 | *.msp 214 | 215 | # Windows shortcuts 216 | *.lnk 217 | 218 | *.exe 219 | ffmpeg-2022-10-02-git-5f02a261a2-essentials_build/ 220 | 221 | # End of https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks,audio,windows 222 | 223 | -------------------------------------------------------------------------------- /whisper_finetune/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import whisper 4 | 5 | from pytorch_lightning import LightningModule 6 | 7 | from config import Config 8 | import evaluate 9 | 10 | from transformers import AdamW, get_linear_schedule_with_warmup 11 | 12 | 13 | class WhisperModelModule(LightningModule): 14 | def __init__( 15 | self, 16 | cfg: Config, 17 | train_dataloader=None, 18 | eval_dataloader=None, 19 | ) -> None: 20 | super().__init__() 21 | self.options = whisper.DecodingOptions( 22 | language=cfg.lang, without_timestamps=True 23 | ) 24 | self.model = whisper.load_model(cfg.model_name) 25 | self.tokenizer = whisper.tokenizer.get_tokenizer( 26 | True, language=cfg.lang, task=self.options.task 27 | ) 28 | 29 | # only decoder training 30 | for p in self.model.encoder.parameters(): 31 | p.requires_grad = False 32 | 33 | self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100) 34 | self.metrics_wer = evaluate.load("wer") 35 | self.metrics_cer = evaluate.load("cer") 36 | 37 | self.cfg = cfg 38 | self.trainloader = train_dataloader 39 | self.evaloader = eval_dataloader 40 | 41 | def forward(self, x): 42 | return self.model(x) 43 | 44 | def training_step(self, batch, batch_id): 45 | input_ids = batch["input_ids"] 46 | labels = batch["labels"].long() 47 | dec_input_ids = batch["dec_input_ids"].long() 48 | 49 | with torch.no_grad(): 50 | audio_features = self.model.encoder(input_ids) 51 | 52 | out = self.model.decoder(dec_input_ids, audio_features) 53 | loss = self.loss_fn(out.view(-1, out.size(-1)), labels.view(-1)) 54 | self.log("train/loss", loss, on_step=True, prog_bar=True, logger=True) 55 | return loss 56 | 57 | def validation_step(self, batch, batch_id): 58 | input_ids = batch["input_ids"] 59 | labels = batch["labels"].long() 60 | dec_input_ids = batch["dec_input_ids"].long() 61 | 62 | audio_features = self.model.encoder(input_ids) 63 | out = self.model.decoder(dec_input_ids, audio_features) 64 | 65 | loss = self.loss_fn(out.view(-1, out.size(-1)), labels.view(-1)) 66 | 67 | out[out == -100] = self.tokenizer.eot 68 | labels[labels == -100] = self.tokenizer.eot 69 | 70 | o_list, l_list = [], [] 71 | for o, l in zip(out, labels): 72 | o = torch.argmax(o, dim=1) 73 | o_list.append(self.tokenizer.decode(o, skip_special_tokens=True)) 74 | l_list.append(self.tokenizer.decode(l, skip_special_tokens=True)) 75 | cer = self.metrics_cer.compute(references=l_list, predictions=o_list) 76 | wer = self.metrics_wer.compute(references=l_list, predictions=o_list) 77 | 78 | self.log("val/loss", loss, on_step=True, prog_bar=True, logger=True) 79 | self.log("val/cer", cer, on_step=True, prog_bar=True, logger=True) 80 | self.log("val/wer", wer, on_step=True, prog_bar=True, 
logger=True) 81 | 82 | return {"cer": cer, "wer": wer, "loss": loss} 83 | 84 | def configure_optimizers(self): 85 | """configure optimizer and scheduler""" 86 | model = self.model 87 | no_decay = ["bias", "LayerNorm.weight"] 88 | optimizer_grouped_parameters = [ 89 | { 90 | "params": [ 91 | p 92 | for n, p in model.named_parameters() 93 | if not any(nd in n for nd in no_decay) 94 | ], 95 | "weight_decay": self.cfg.weight_decay, 96 | }, 97 | { 98 | "params": [ 99 | p 100 | for n, p in model.named_parameters() 101 | if any(nd in n for nd in no_decay) 102 | ], 103 | "weight_decay": 0.0, 104 | }, 105 | ] 106 | optimizer = AdamW( 107 | optimizer_grouped_parameters, 108 | lr=self.cfg.learning_rate, 109 | eps=self.cfg.adam_epsilon, 110 | ) 111 | self.optimizer = optimizer 112 | 113 | scheduler = get_linear_schedule_with_warmup( 114 | optimizer, 115 | num_warmup_steps=self.cfg.warmup_steps, 116 | num_training_steps=self.t_total, 117 | ) 118 | self.scheduler = scheduler 119 | 120 | return [optimizer], [ 121 | {"scheduler": scheduler, "interval": "step", "frequency": 1} 122 | ] 123 | 124 | def setup(self, stage=None): 125 | if stage == "fit" or stage is None: 126 | self.t_total = ( 127 | (len(self.trainloader.dataset) // (self.cfg.batch_size)) 128 | // self.cfg.gradient_accumulation_steps 129 | * float(self.cfg.num_train_epochs) 130 | ) 131 | 132 | def train_dataloader(self): 133 | return self.trainloader 134 | 135 | def val_dataloader(self): 136 | return self.evaloader -------------------------------------------------------------------------------- /notebooks/[WhisperLM]BeamSearchLM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import whisper\n", 10 | "import torch" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "model = whisper.load_model(\"base\")" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "" 31 | ] 32 | }, 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "state_dict = torch.load(\"/mnt/c/Users/truongnp3/Desktop/Course/NLP/Project/whisper-finetune-vietnamese/demo/checkpoint-epoch=0014.ckpt\", map_location=\"cpu\")['state_dict']\n", 40 | "# change all key of state_dict to remove \"model.\"\n", 41 | "new_state_dict = {k.replace(\"model.\", \"\"): v for k, v in state_dict.items()}\n", 42 | "model.load_state_dict(new_state_dict)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 4, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# audio = whisper.load_audio(\"/mnt/c/Users/truongnp3/Desktop/Course/NLP/Project/whisper-finetune-vietnamese/notebooks/test_audio.aiff\")\n", 52 | "# audio = whisper.load_audio(\"/mnt/c/Users/truongnp3/Downloads/spkyut-20190730-utt000005432.wav\")\n", 53 | "# audio = whisper.load_audio(\"/mnt/c/Users/truongnp3/Downloads/spkyut-20190730-utt000005394.wav\")\n", 54 | "audio = whisper.load_audio(\"/mnt/c/Users/truongnp3/Desktop/Course/NLP/Project/whisper-finetune-vietnamese/demo/audio_folder/2022-10-23_234521.158395.wav\")\n", 55 | "audio = whisper.pad_or_trim(audio)\n", 56 | "mel = whisper.log_mel_spectrogram(audio).to(model.device)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | 
"execution_count": 7, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "Detected language: vi\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "_, probs = model.detect_language(mel)\n", 74 | "print(f\"Detected language: {max(probs, key=probs.get)}\")" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# options = whisper.DecodingOptions(fp16 = False, without_timestamps=True, language=\"vi\")\n", 84 | "options = whisper.DecodingOptions(fp16 = False, withlm=False, beam_size=1, without_timestamps=True)\n", 85 | "result = whisper.decode(model, mel, options)\n", 86 | "\n", 87 | "# print the recognized text\n", 88 | "# print(result.text)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 6, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "chuyển tiền đang qua số điện thoại hàng ngàn ưu đãi khi nạp tiền điện thoại thanh toán dịch vụ cùng ba chăm cộng cộng tiền ích khác\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "print(result.text) #beam search" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stderr", 115 | "output_type": "stream", 116 | "text": [ 117 | "Loading the LM will be faster if you build a binary file.\n", 118 | "Reading /home/hkab/dataset_tokenized_3gram.arpa\n", 119 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", 120 | "****************************************************************************************************\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "# options = whisper.DecodingOptions(fp16 = False)\n", 126 | "options = whisper.DecodingOptions(fp16 = False, withlm=True, beam_size=1, \n", 127 | " patience=1.0, lm_path=\"dataset_tokenized_3gram.arpa\", lm_alpha=3.0, lm_beta=0.0,\n", 128 | " without_timestamps=True)\n", 129 | "result = whisper.decode(model, mel, options)\n", 130 | "\n", 131 | "# print the recognized text\n", 132 | "# print(result.text)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "chuyển tiền đen qua số điện thoại hàng ngàn ưu đãi khi nạp tìm điện thoại thanh toán dịch vụ cũng ba trăm cộng cộng tiền đích khác\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "print(result.text)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "- Nhiều chồng ngờ góc quên hay góc xanh nhà, nhìn chọn chỗ đất hiện, đất cắt hoặc đất mùn, đất phủ xa tây sốt, đồ ẩm thấp, dễ phát nữ.\n", 157 | "\n", 158 | "- Nếu trồng ngờ góc quên hay góc xanh nhà, hiện trọn trộ đất hiện, đất cắt hoặc đất muồn, đất phủ xa tây sốt, đồ ẩm thấp, dễ phát nước." 
159 | ] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3 (ipykernel)", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "orig_nbformat": 4, 169 | "vscode": { 170 | "interpreter": { 171 | "hash": "569cc53f83e70b41c2d959ffedb296ac14adb9e332ab59ae04a2c7a2935b0e00" 172 | } 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 2 177 | } 178 | -------------------------------------------------------------------------------- /demo/static/css/style.css: -------------------------------------------------------------------------------- 1 | /* style.css*/ 2 | 3 | /* Media Queries */ 4 | 5 | /* Small Devices*/ 6 | 7 | @media (min-width: 0px) { 8 | * { 9 | box-sizing: border-box; 10 | } 11 | body { 12 | margin: 0; 13 | padding: 0; 14 | /*background-color: lightcyan;*/ 15 | color: #414142; 16 | position: relative; 17 | font-family: monospace; 18 | } 19 | .title { 20 | font-size: 30px; 21 | margin-bottom: 55px; 22 | text-align: center; 23 | } 24 | .audio-recording-container { 25 | width: 100%; 26 | height: 100vh; 27 | /* view port height*/ 28 | /*targeting Chrome & Safari*/ 29 | display: -webkit-flex; 30 | /*targeting IE10*/ 31 | display: -ms-flex; 32 | display: flex; 33 | flex-direction: column; 34 | justify-content: center; 35 | /*horizontal centering*/ 36 | align-items: center; 37 | } 38 | .start-recording-button { 39 | font-size: 70px; 40 | color: #435f7a; 41 | cursor: pointer; 42 | opacity: .5; 43 | margin-bottom: 30px; 44 | } 45 | .start-recording-button:hover { 46 | opacity: 1; 47 | } 48 | .recording-contorl-buttons-container { 49 | /*targeting Chrome & Safari*/ 50 | display: -webkit-flex; 51 | /*targeting IE10*/ 52 | display: -ms-flex; 53 | display: flex; 54 | justify-content: space-evenly; 55 | /*horizontal centering*/ 56 | align-items: center; 57 | width: 334px; 58 | margin-bottom: 30px; 59 | } 60 | .cancel-recording-button, 61 | .stop-recording-button { 62 | font-size: 70px; 63 | cursor: pointer; 64 | } 65 | .cancel-recording-button { 66 | color: red; 67 | opacity: 0.7; 68 | } 69 | .cancel-recording-button:hover { 70 | color: rgb(206, 4, 4); 71 | } 72 | .stop-recording-button { 73 | color: #33cc33; 74 | opacity: 0.7; 75 | } 76 | .stop-recording-button:hover { 77 | color: #27a527; 78 | } 79 | .recording-elapsed-time { 80 | /*targeting Chrome & Safari*/ 81 | display: -webkit-flex; 82 | /*targeting IE10*/ 83 | display: -ms-flex; 84 | display: flex; 85 | justify-content: center; 86 | /*horizontal centering*/ 87 | align-items: center; 88 | } 89 | .red-recording-dot { 90 | font-size: 25px; 91 | color: red; 92 | margin-right: 12px; 93 | /*transitions with Firefox, IE and Opera Support browser support*/ 94 | animation-name: flashing-recording-dot; 95 | -webkit-animation-name: flashing-recording-dot; 96 | -moz-animation-name: flashing-recording-dot; 97 | -o-animation-name: flashing-recording-dot; 98 | animation-duration: 2s; 99 | -webkit-animation-duration: 2s; 100 | -moz-animation-duration: 2s; 101 | -o-animation-duration: 2s; 102 | animation-iteration-count: infinite; 103 | -webkit-animation-iteration-count: infinite; 104 | -moz-animation-iteration-count: infinite; 105 | -o-animation-iteration-count: infinite; 106 | } 107 | /* The animation code */ 108 | @keyframes flashing-recording-dot { 109 | 0% { 110 | opacity: 1; 111 | } 112 | 50% { 113 | opacity: 0; 114 | } 115 | 100% { 116 | opacity: 1; 117 | } 118 | } 119 | @-webkit-keyframes flashing-recording-dot { 120 | 0% { 121 | opacity: 1; 122 | } 123 | 50% { 124 | opacity: 0; 125 | } 126 
| 100% { 127 | opacity: 1; 128 | } 129 | } 130 | @-moz-keyframes flashing-recording-dot { 131 | 0% { 132 | opacity: 1; 133 | } 134 | 50% { 135 | opacity: 0; 136 | } 137 | 100% { 138 | opacity: 1; 139 | } 140 | } 141 | @-o-keyframes flashing-recording-dot { 142 | 0% { 143 | opacity: 1; 144 | } 145 | 50% { 146 | opacity: 0; 147 | } 148 | 100% { 149 | opacity: 1; 150 | } 151 | } 152 | .elapsed-time { 153 | font-size: 32px; 154 | } 155 | .recording-contorl-buttons-container.hide { 156 | display: none; 157 | } 158 | .overlay { 159 | position: absolute; 160 | top: 0; 161 | height: 100vh; 162 | width: 100%; 163 | background-color: rgba(82, 76, 76, 0.35); 164 | /*targeting Chrome & Safari*/ 165 | display: -webkit-flex; 166 | /*targeting IE10*/ 167 | display: -ms-flex; 168 | display: flex; 169 | justify-content: center; 170 | /*horizontal centering*/ 171 | align-items: center; 172 | } 173 | .overlay.hide { 174 | display: none; 175 | } 176 | .browser-not-supporting-audio-recording-box { 177 | /*targeting Chrome & Safari*/ 178 | display: -webkit-flex; 179 | /*targeting IE10*/ 180 | display: -ms-flex; 181 | display: flex; 182 | flex-direction: column; 183 | justify-content: space-between; 184 | /*horizontal centering*/ 185 | align-items: center; 186 | width: 317px; 187 | height: 119px; 188 | background-color: white; 189 | border-radius: 10px; 190 | padding: 15px; 191 | font-size: 16px; 192 | } 193 | .close-browser-not-supported-box { 194 | cursor: pointer; 195 | background-color: #abc1c05c; 196 | border-radius: 10px; 197 | font-size: 16px; 198 | border: none; 199 | } 200 | .close-browser-not-supported-box:hover { 201 | background-color: #92a5a45c; 202 | } 203 | .close-browser-not-supported-box:focus { 204 | outline: none; 205 | border: none; 206 | } 207 | .audio-element.hide { 208 | display: none; 209 | } 210 | .text-indication-of-audio-playing-container { 211 | height: 20px; 212 | } 213 | .text-indication-of-audio-playing { 214 | font-size: 20px; 215 | } 216 | .text-indication-of-audio-playing.hide { 217 | display: none; 218 | } 219 | /* 3 Dots animation*/ 220 | .text-indication-of-audio-playing span { 221 | /*transitions with Firefox, IE and Opera Support browser support*/ 222 | animation-name: blinking-dot; 223 | -webkit-animation-name: blinking-dot; 224 | -moz-animation-name: blinking-dot; 225 | -o-animation-name: blinking-dot; 226 | animation-duration: 2s; 227 | -webkit-animation-duration: 2s; 228 | -moz-animation-duration: 2s; 229 | -o-animation-duration: 2s; 230 | animation-iteration-count: infinite; 231 | -webkit-animation-iteration-count: infinite; 232 | -moz-animation-iteration-count: infinite; 233 | -o-animation-iteration-count: infinite; 234 | } 235 | .text-indication-of-audio-playing span:nth-child(2) { 236 | animation-delay: .4s; 237 | -webkit-animation-delay: .4s; 238 | -moz-animation-delay: .4s; 239 | -o-animation-delay: .4s; 240 | } 241 | .text-indication-of-audio-playing span:nth-child(3) { 242 | animation-delay: .8s; 243 | -webkit-animation-delay: .8s; 244 | -moz-animation-delay: .8s; 245 | -o-animation-delay: .8s; 246 | } 247 | 248 | .text-indication-of-audio-handling { 249 | font-size: 20px; 250 | } 251 | 252 | /* 3 Dots animation*/ 253 | .text-indication-of-audio-handling span { 254 | /*transitions with Firefox, IE and Opera Support browser support*/ 255 | animation-name: blinking-dot; 256 | -webkit-animation-name: blinking-dot; 257 | -moz-animation-name: blinking-dot; 258 | -o-animation-name: blinking-dot; 259 | animation-duration: 2s; 260 | -webkit-animation-duration: 2s; 261 | 
-moz-animation-duration: 2s; 262 | -o-animation-duration: 2s; 263 | animation-iteration-count: infinite; 264 | -webkit-animation-iteration-count: infinite; 265 | -moz-animation-iteration-count: infinite; 266 | -o-animation-iteration-count: infinite; 267 | } 268 | .text-indication-of-audio-handling span:nth-child(2) { 269 | animation-delay: .4s; 270 | -webkit-animation-delay: .4s; 271 | -moz-animation-delay: .4s; 272 | -o-animation-delay: .4s; 273 | } 274 | .text-indication-of-audio-handling span:nth-child(3) { 275 | animation-delay: .8s; 276 | -webkit-animation-delay: .8s; 277 | -moz-animation-delay: .8s; 278 | -o-animation-delay: .8s; 279 | } 280 | /* The animation code */ 281 | @keyframes blinking-dot { 282 | 0% { 283 | opacity: 0; 284 | } 285 | 50% { 286 | opacity: 1; 287 | } 288 | 100% { 289 | opacity: 0; 290 | } 291 | } 292 | /* The animation code */ 293 | @-webkit-keyframes blinking-dot { 294 | 0% { 295 | opacity: 0; 296 | } 297 | 50% { 298 | opacity: 1; 299 | } 300 | 100% { 301 | opacity: 0; 302 | } 303 | } 304 | /* The animation code */ 305 | @-moz-keyframes blinking-dot { 306 | 0% { 307 | opacity: 0; 308 | } 309 | 50% { 310 | opacity: 1; 311 | } 312 | 100% { 313 | opacity: 0; 314 | } 315 | } 316 | /* The animation code */ 317 | @-o-keyframes blinking-dot { 318 | 0% { 319 | opacity: 0; 320 | } 321 | 50% { 322 | opacity: 1; 323 | } 324 | 100% { 325 | opacity: 0; 326 | } 327 | } 328 | } 329 | 330 | /* Medium devices */ 331 | 332 | @media (min-width: 768px) {} 333 | 334 | /* Large devices */ 335 | 336 | @media (min-width: 992px) {} 337 | 338 | /*Ipad pro view*/ 339 | 340 | /* 341 | @media (min-width: 1024px) { 342 | 343 | } */ 344 | 345 | /* Extra Large devices */ 346 | 347 | @media (min-width: 1200px) {} 348 | 349 | -------------------------------------------------------------------------------- /whisper_finetune/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | import torch 5 | import torchaudio 6 | 7 | import pandas as pd 8 | import whisper 9 | import torchaudio.transforms as at 10 | from utils import load_wave 11 | from pathlib import Path 12 | 13 | class WhisperDataCollatorWithPadding: 14 | """ 15 | Using for collating many input tensors with different sizes to batch and maybe applying some processes to data. 
16 | Input: list or dictionary of input tensors 17 | Output: list dictionary of batched tensors 18 | """ 19 | def __call__(self, features): 20 | input_ids, labels, dec_input_ids, texts = [], [], [], [] 21 | for f in features: 22 | input_ids.append(f["input_ids"]) 23 | labels.append(f["labels"]) 24 | dec_input_ids.append(f["dec_input_ids"]) 25 | texts.append(f["text"]) 26 | 27 | input_ids = torch.concat([input_id[None, :] for input_id in input_ids]) # [batch_size, seq_len] 28 | 29 | label_lengths = [len(lab) for lab in labels] # same size with input_ids 30 | dec_input_ids_length = [len(e) for e in dec_input_ids] 31 | max_label_len = max(label_lengths + dec_input_ids_length) 32 | 33 | labels = [ 34 | np.pad(lab, (0, max_label_len - lab_len), "constant", constant_values=-100) 35 | for lab, lab_len in zip(labels, label_lengths) 36 | ] 37 | dec_input_ids = [ 38 | np.pad(e, (0, max_label_len - e_len), "constant", constant_values=50257) 39 | for e, e_len in zip(dec_input_ids, dec_input_ids_length) 40 | ] # 50257 is eot token id 41 | 42 | batch = {"labels": labels, "dec_input_ids": dec_input_ids} 43 | 44 | batch = { 45 | k: torch.tensor(np.array(v), requires_grad=False) for k, v in batch.items() 46 | } 47 | batch["input_ids"] = input_ids 48 | batch["texts"] = texts 49 | return batch 50 | 51 | class WhisperDataset(torch.utils.data.Dataset): 52 | def __init__(self, dataset, sample_rate=16000) -> None: 53 | super().__init__() 54 | 55 | self.dataset = dataset 56 | self.sample_rate = sample_rate 57 | 58 | self.options = whisper.DecodingOptions(language="vi", without_timestamps=True) 59 | self.tokenizer = whisper.tokenizer.get_tokenizer( 60 | True, language="vi", task=self.options.task 61 | ) 62 | 63 | def load_wave(self, wave_path, sample_rate: int = 16000) -> torch.Tensor: 64 | waveform, sr = torchaudio.load(wave_path, normalize=True) 65 | if sample_rate != sr: 66 | waveform = at.Resample(sr, sample_rate)(waveform) 67 | return waveform 68 | 69 | 70 | def __len__(self): 71 | return len(self.dataset) 72 | 73 | def __getitem__(self, id): 74 | audio_id, audio_path, text = self.dataset[id] 75 | 76 | audio = self.load_wave(audio_path, sample_rate=self.sample_rate) 77 | audio = whisper.pad_or_trim(audio.flatten()) 78 | mel = whisper.log_mel_spectrogram(audio) 79 | 80 | text_token = [ 81 | *self.tokenizer.sot_sequence_including_notimestamps 82 | ] + self.tokenizer.encode(text) 83 | labels = text_token[1:] + [self.tokenizer.eot] 84 | if len(text_token) >= 448: 85 | audio_id, audio_path, text = self.dataset[0] 86 | 87 | audio = self.load_wave(audio_path, sample_rate=self.sample_rate) 88 | audio = whisper.pad_or_trim(audio.flatten()) 89 | mel = whisper.log_mel_spectrogram(audio) 90 | 91 | text_token = [ 92 | *self.tokenizer.sot_sequence_including_notimestamps 93 | ] + self.tokenizer.encode(text) 94 | labels = text_token[1:] + [self.tokenizer.eot] 95 | return { 96 | "input_ids": mel, 97 | "labels": labels, 98 | "dec_input_ids": text_token, 99 | "text": text, 100 | } 101 | 102 | 103 | def load_dataset(dataset_name, test=False): 104 | train_dataset = None 105 | test_dataset = None 106 | 107 | if dataset_name == 'fluers': 108 | print('Loading Vietnamese Fluers dataset...') 109 | if not os.path.exists('vi_vn.tar.gz'): 110 | os.system("wget https://storage.googleapis.com/xtreme_translations/FLEURS102/vi_vn.tar.gz") 111 | os.makedirs('fluers', exist_ok=True) 112 | os.system("tar -xf 'vi_vn.tar.gz' -C fluers") 113 | if not test: 114 | train_list_files = get_list_files_fluers('train') 115 | val_list_files = 
get_list_files_fluers('dev') 116 | train_list_files +=val_list_files 117 | print('Num train samples:', len(train_list_files)) 118 | train_dataset = WhisperDataset(train_list_files) 119 | 120 | test_list_files = get_list_files_fluers('test') 121 | print('Num test samples:', len(test_list_files)) 122 | test_dataset = WhisperDataset(test_list_files) 123 | 124 | elif dataset_name == 'vlsp2019': 125 | # Download VLSP2019 dataset 126 | print('Loading VLSP2019 dataset...') 127 | 128 | if not test: 129 | train_list_files = get_list_files_vlsp2019('train') 130 | print('Num train samples:', len(train_list_files)) 131 | train_dataset = WhisperDataset(train_list_files) 132 | 133 | test_list_files = get_list_files_vlsp2019('test') 134 | print('Num test samples:', len(test_list_files)) 135 | test_dataset = WhisperDataset(test_list_files) 136 | 137 | elif dataset_name == 'vin100h': 138 | # Download VIN100h dataset 139 | if not os.path.exists('downloaded_check.txt'): 140 | print('Loading VIN100h dataset...') 141 | os.system("gdown 1vUSxdORDxk-ePUt-bUVDahpoXiqKchMx") 142 | os.system("tar -xf 'VinBigdata-VLSP2020-100h (1).rar'") 143 | os.system("gdown 1Zmj9BqNysiON6Lzjqos9kY08DRanJxXv") 144 | os.system("unzip 'vin100h_listfiles.zip'") 145 | os.system("remove 'VinBigdata-VLSP2020-100h (1).rar'") 146 | with open('downloaded_check.txt', 'w') as f: 147 | f.write('True') 148 | else: 149 | print('Dataset files already downloaded!') 150 | if not test: 151 | train_list_files = get_list_files_vin100h('train') 152 | print('Num train samples:', len(train_list_files)) 153 | train_dataset = WhisperDataset(train_list_files) 154 | 155 | test_list_files = get_list_files_vin100h('test') 156 | print('Num test samples:', len(test_list_files)) 157 | test_dataset = WhisperDataset(test_list_files) 158 | 159 | else: 160 | print(dataset_name, 'is not supported, please try again!') 161 | 162 | return train_dataset, test_dataset 163 | 164 | #------------------------------------FLUERS------------------------------------# 165 | 166 | def get_list_files_fluers(phase, audio_path = 'fluers/vi_vn/audio', text_max_length=1000, audio_max_sample_length=960000, sample_rate=16000): 167 | audio_path = os.path.join(audio_path, phase) 168 | audio_transcript_pair_list = [] 169 | if phase=='train': 170 | tsv_file = 'fluers/vi_vn/train.tsv' 171 | elif phase=='dev': 172 | tsv_file = 'fluers/vi_vn/dev.tsv' 173 | else: 174 | tsv_file = 'fluers/vi_vn/test.tsv' 175 | df = pd.read_table(tsv_file, names=("id", "file_name", "raw_transcription", "transcription", "_", "num_samples", "gender")) 176 | for index, row in df.iterrows(): 177 | new_path = Path(os.path.join(audio_path, row['file_name'])) 178 | audio_id = row['id'] 179 | text = row['transcription'] 180 | if new_path.exists(): 181 | audio = load_wave(new_path, sample_rate=sample_rate)[0] 182 | if len(text) > text_max_length or len(audio) > audio_max_sample_length: 183 | print('skip file:', new_path,'with len text:', len(text), 'and len audio', len(audio)) 184 | continue 185 | audio_transcript_pair_list.append((audio_id, str(new_path), text)) 186 | return audio_transcript_pair_list 187 | 188 | 189 | 190 | #------------------------------------VLSP2019 ASR Dataset------------------------------------# 191 | def get_list_files_vlsp2019(phase, dataset_path = 'vlsp2019/data', text_max_length=1000, audio_max_sample_length=960000, sample_rate=16000): 192 | audio_transcript_pair_list = [] 193 | if phase=='train': 194 | csv_file = 'vlsp2019/vlsp2019_train.csv' 195 | else: 196 | csv_file = 
'vlsp2019/vlsp2019_test.csv' 197 | df = pd.read_csv(csv_file) 198 | for index, row in df.iterrows(): 199 | new_path = Path(os.path.join(dataset_path, row['filename']+'.wav')) 200 | audio_id = index 201 | with open(Path(os.path.join(dataset_path, row['filename']+'.txt')), 'r') as f: 202 | text = f.readlines()[0] 203 | if new_path.exists(): 204 | audio = load_wave(new_path, sample_rate=sample_rate)[0] 205 | if len(text) > text_max_length or len(audio) > audio_max_sample_length: 206 | print('skip file:', new_path,'with len text:', len(text), 'and len audio', len(audio)) 207 | continue 208 | audio_transcript_pair_list.append((audio_id, str(new_path), text)) 209 | return audio_transcript_pair_list 210 | 211 | #------------------------------------VIN100h ASR Dataset------------------------------------# 212 | def get_list_files_vin100h(phase, dataset_path = 'vlsp2020_train_set_02', text_max_length=1000, audio_max_sample_length=960000, sample_rate=16000): 213 | audio_transcript_pair_list = [] 214 | if phase=='train': 215 | csv_file = 'train_vin100h.csv' 216 | else: 217 | csv_file = 'test_vin100h.csv' 218 | df = pd.read_csv(csv_file) 219 | for index, row in df.iterrows(): 220 | new_path = Path(os.path.join(dataset_path, row['filename']+'.wav')) 221 | audio_id = index 222 | with open(Path(os.path.join(dataset_path, row['filename']+'.txt')), 'r') as f: 223 | text = f.readlines()[0] 224 | if new_path.exists(): 225 | audio = load_wave(new_path, sample_rate=sample_rate)[0] 226 | if len(text) > text_max_length or len(audio) > audio_max_sample_length: 227 | print('skip file:', new_path,'with len text:', len(text), 'and len audio', len(audio)) 228 | continue 229 | audio_transcript_pair_list.append((audio_id, str(new_path), text)) 230 | return audio_transcript_pair_list 231 | 232 | if __name__=='__main__': 233 | # load_fluers() 234 | print('Load dataset...') 235 | -------------------------------------------------------------------------------- /demo/static/js/recorder.js: -------------------------------------------------------------------------------- 1 | // index.js --------------- 2 | //Model 3 | //none 4 | 5 | // Source: https://ralzohairi.medium.com/audio-recording-in-javascript-96eed45b75ee 6 | 7 | //View 8 | var microphoneButton = document.getElementsByClassName("start-recording-button")[0]; 9 | var recordingControlButtonsContainer = document.getElementsByClassName("recording-contorl-buttons-container")[0]; 10 | var stopRecordingButton = document.getElementsByClassName("stop-recording-button")[0]; 11 | var cancelRecordingButton = document.getElementsByClassName("cancel-recording-button")[0]; 12 | var elapsedTimeTag = document.getElementsByClassName("elapsed-time")[0]; 13 | var closeBrowserNotSupportedBoxButton = document.getElementsByClassName("close-browser-not-supported-box")[0]; 14 | var overlay = document.getElementsByClassName("overlay")[0]; 15 | var audioElement = document.getElementsByClassName("audio-element")[0]; 16 | var audioElementSource = document.getElementsByClassName("audio-element")[0] 17 | var textIndicatorOfAudiPlaying = document.getElementsByClassName("text-indication-of-audio-playing")[0]; 18 | 19 | 20 | var replayAudioBtn = document.getElementsByClassName("replay-btn")[0]; 21 | var textIndicatorOfAudioHandling = document.getElementsByClassName("text-indication-of-audio-handling")[0]; 22 | 23 | //Listeners 24 | 25 | //Listen to start recording button 26 | microphoneButton.onclick = startAudioRecording; 27 | 28 | //Listen to stop recording button 29 | stopRecordingButton.onclick = 
stopAudioRecording; 30 | 31 | //Listen to cancel recording button 32 | cancelRecordingButton.onclick = cancelAudioRecording; 33 | 34 | //Listen to when the ok button is clicked in the browser not supporting audio recording box 35 | closeBrowserNotSupportedBoxButton.onclick = hideBrowserNotSupportedOverlay; 36 | 37 | //Listen to when the audio being played ends 38 | audioElement.onended = hideTextIndicatorOfAudioPlaying; 39 | 40 | // replay audio 41 | replayAudioBtn.onclick = replayAudio; 42 | 43 | function replayAudio() { 44 | console.log("Playing audio..."); 45 | audioElement.play(); 46 | 47 | displayTextIndicatorOfAudioPlaying(); 48 | } 49 | 50 | /** Displays recording control buttons */ 51 | function handleDisplayingRecordingControlButtons() { 52 | //Hide the microphone button that starts audio recording 53 | microphoneButton.style.display = "none"; 54 | 55 | //Display the recording control buttons 56 | recordingControlButtonsContainer.classList.remove("hide"); 57 | 58 | //Handle the displaying of the elapsed recording time 59 | handleElapsedRecordingTime(); 60 | } 61 | 62 | /** Hide the displayed recording control buttons */ 63 | function handleHidingRecordingControlButtons() { 64 | //Display the microphone button that starts audio recording 65 | microphoneButton.style.display = "block"; 66 | 67 | //Hide the recording control buttons 68 | recordingControlButtonsContainer.classList.add("hide"); 69 | 70 | //stop interval that handles both time elapsed and the red dot 71 | clearInterval(elapsedTimeTimer); 72 | } 73 | 74 | /** Displays browser not supported info box for the user*/ 75 | function displayBrowserNotSupportedOverlay() { 76 | overlay.classList.remove("hide"); 77 | } 78 | 79 | /** Displays browser not supported info box for the user*/ 80 | function hideBrowserNotSupportedOverlay() { 81 | overlay.classList.add("hide"); 82 | } 83 | 84 | /** Creates a source element for the the audio element in the HTML document*/ 85 | function createSourceForAudioElement() { 86 | let sourceElement = document.createElement("source"); 87 | audioElement.appendChild(sourceElement); 88 | 89 | audioElementSource = sourceElement; 90 | } 91 | 92 | /** Display the text indicator of the audio being playing in the background */ 93 | function displayTextIndicatorOfAudioPlaying() { 94 | textIndicatorOfAudiPlaying.classList.remove("hide"); 95 | } 96 | 97 | /** Hide the text indicator of the audio being playing in the background */ 98 | function hideTextIndicatorOfAudioPlaying() { 99 | textIndicatorOfAudiPlaying.classList.add("hide"); 100 | } 101 | 102 | /** Display the text indicator of the audio being handling in the background */ 103 | function displayTextIndicatorOfAudioHandling() { 104 | textIndicatorOfAudioHandling.classList.remove("d-none"); 105 | } 106 | 107 | /** Hide the text indicator of the audio being playing in the background */ 108 | function hideTextIndicatorOfAudioHandling() { 109 | textIndicatorOfAudioHandling.classList.add("d-none"); 110 | } 111 | 112 | //Controller 113 | 114 | /** Stores the actual start time when an audio recording begins to take place to ensure elapsed time start time is accurate*/ 115 | var audioRecordStartTime; 116 | 117 | /** Stores the maximum recording time in hours to stop recording once maximum recording hour has been reached */ 118 | var maximumRecordingTimeInHours = 1; 119 | 120 | /** Stores the reference of the setInterval function that controls the timer in audio recording*/ 121 | var elapsedTimeTimer; 122 | 123 | /** Starts the audio recording*/ 124 | function 
startAudioRecording() { 125 | 126 | console.log("Recording Audio..."); 127 | 128 | //If a previous audio recording is playing, pause it 129 | let recorderAudioIsPlaying = !audioElement.paused; // the paused property tells whether the media element is paused or not 130 | console.log("paused?", !recorderAudioIsPlaying); 131 | if (recorderAudioIsPlaying) { 132 | audioElement.pause(); 133 | //also hide the audio playing indicator displayed on the screen 134 | hideTextIndicatorOfAudioPlaying(); 135 | } 136 | 137 | //start recording using the audio recording API 138 | audioRecorder.start() 139 | .then(() => { //on success 140 | 141 | //store the recording start time to display the elapsed time according to it 142 | audioRecordStartTime = new Date(); 143 | 144 | //display control buttons to offer the functionality of stop and cancel 145 | handleDisplayingRecordingControlButtons(); 146 | }) 147 | .catch(error => { //on error 148 | //No Browser Support Error 149 | if (error.message.includes("mediaDevices API or getUserMedia method is not supported in this browser.")) { 150 | console.log("To record audio, use browsers like Chrome and Firefox."); 151 | displayBrowserNotSupportedOverlay(); 152 | } 153 | 154 | //Error handling structure 155 | switch (error.name) { 156 | case 'AbortError': //error from navigator.mediaDevices.getUserMedia 157 | console.log("An AbortError has occurred."); 158 | break; 159 | case 'NotAllowedError': //error from navigator.mediaDevices.getUserMedia 160 | console.log("A NotAllowedError has occurred. User might have denied permission."); 161 | break; 162 | case 'NotFoundError': //error from navigator.mediaDevices.getUserMedia 163 | console.log("A NotFoundError has occurred."); 164 | break; 165 | case 'NotReadableError': //error from navigator.mediaDevices.getUserMedia 166 | console.log("A NotReadableError has occurred."); 167 | break; 168 | case 'SecurityError': //error from navigator.mediaDevices.getUserMedia or from the MediaRecorder.start 169 | console.log("A SecurityError has occurred."); 170 | break; 171 | case 'TypeError': //error from navigator.mediaDevices.getUserMedia 172 | console.log("A TypeError has occurred."); 173 | break; 174 | case 'InvalidStateError': //error from the MediaRecorder.start 175 | console.log("An InvalidStateError has occurred."); 176 | break; 177 | case 'UnknownError': //error from the MediaRecorder.start 178 | console.log("An UnknownError has occurred."); 179 | break; 180 | default: 181 | console.log("An error occurred with the error name " + error.name); 182 | }; 183 | }); 184 | } 185 | /** Stop the currently started audio recording & send it 186 | */ 187 | function stopAudioRecording() { 188 | 189 | console.log("Stopping Audio Recording..."); 190 | 191 | //stop the recording using the audio recording API 192 | audioRecorder.stop() 193 | .then(audioAsblob => { 194 | 195 | // Send audio 196 | sendAudio(audioAsblob); 197 | displayTextIndicatorOfAudioHandling(); 198 | //Play recorded audio 199 | saveAudio(audioAsblob); 200 | 201 | //hide recording control button & return record icon 202 | handleHidingRecordingControlButtons(); 203 | }) 204 | .catch(error => { 205 | //Error handling structure 206 | switch (error.name) { 207 | case 'InvalidStateError': //error from the MediaRecorder.stop 208 | console.log("An InvalidStateError has occurred."); 209 | break; 210 | default: 211 | console.log("An error occurred with the error name " + error.name); 212 | }; 213 | }); 214 | } 215 | 216 | /** Cancel the currently started audio recording */ 217 | function 
cancelAudioRecording() { 218 | console.log("Canceling audio..."); 219 | 220 | //cancel the recording using the audio recording API 221 | audioRecorder.cancel(); 222 | 223 | //hide recording control button & return record icon 224 | handleHidingRecordingControlButtons(); 225 | } 226 | 227 | function sendAudio(recorderAudioAsBlob) { 228 | const audioUrl = URL.createObjectURL(recorderAudioAsBlob); 229 | const audio = new Audio(audioUrl); 230 | 231 | var data = new FormData() 232 | data.append('file', recorderAudioAsBlob , 'file') 233 | 234 | var url = "http://127.0.0.1:5000/transcribe-"; 235 | var model = $('input[name="model"]:checked').val(); 236 | console.log(url + model) 237 | fetch(url + model, { 238 | method: 'POST', 239 | body: data 240 | 241 | }).then(response => response.json() 242 | ).then(json => { 243 | console.log(json); 244 | $(".text-transcribe-content").text(json); 245 | hideTextIndicatorOfAudioHandling(); 246 | }); 247 | } 248 | 249 | /** Plays recorded audio using the audio element in the HTML document 250 | * @param {Blob} recorderAudioAsBlob - recorded audio as a Blob Object 251 | */ 252 | function saveAudio(recorderAudioAsBlob) { 253 | 254 | //read content of files (Blobs) asynchronously 255 | let reader = new FileReader(); 256 | 257 | //once content has been read 258 | reader.onload = (e) => { 259 | //store the base64 URL that represents the URL of the recording audio 260 | let base64URL = e.target.result; 261 | 262 | //If this is the first audio playing, create a source element 263 | //as pre populating the HTML with a source of empty src causes error 264 | if (!audioElementSource) //if its not defined create it (happens first time only) 265 | createSourceForAudioElement(); 266 | 267 | //set the audio element's source using the base64 URL 268 | audioElementSource.src = base64URL; 269 | 270 | //set the type of the audio element based on the recorded audio's Blob type 271 | let BlobType = recorderAudioAsBlob.type.includes(";") ? 
272 | recorderAudioAsBlob.type.substr(0, recorderAudioAsBlob.type.indexOf(';')) : recorderAudioAsBlob.type; 273 | audioElementSource.type = BlobType 274 | 275 | //call the load method as it is used to update the audio element after changing the source or other settings 276 | audioElement.load(); 277 | 278 | //play the audio after successfully setting new src and type that corresponds to the recorded audio 279 | // console.log("Playing audio..."); 280 | // audioElement.play(); 281 | 282 | // console.log("Sending audio..."); 283 | 284 | //Display text indicator of having the audio play in the background 285 | // displayTextIndicatorOfAudioPlaying(); 286 | }; 287 | 288 | //read content and convert it to a URL (base64) 289 | reader.readAsDataURL(recorderAudioAsBlob); 290 | } 291 | 292 | /** Computes the elapsed recording time since the moment the function is called in the format h:m:s*/ 293 | function handleElapsedRecordingTime() { 294 | //display initial time when recording begins 295 | displayElapsedTimeDuringAudioRecording("00:00"); 296 | 297 | //create an interval that computes & displays the elapsed time, as well as animates the red dot - every second 298 | elapsedTimeTimer = setInterval(() => { 299 | //compute the elapsed time every second 300 | let elapsedTime = computeElapsedTime(audioRecordStartTime); //pass the actual record start time 301 | //display the elapsed time 302 | displayElapsedTimeDuringAudioRecording(elapsedTime); 303 | }, 1000); //every second 304 | } 305 | 306 | /** Display elapsed time during audio recording 307 | * @param {String} elapsedTime - elapsed time in the format mm:ss or hh:mm:ss 308 | */ 309 | function displayElapsedTimeDuringAudioRecording(elapsedTime) { 310 | //1. display the passed elapsed time as the elapsed time in the elapsedTime HTML element 311 | elapsedTimeTag.innerHTML = elapsedTime; 312 | 313 | //2. Stop the recording when the max number of hours is reached 314 | if (elapsedTimeReachedMaximumNumberOfHours(elapsedTime)) { 315 | stopAudioRecording(); 316 | } 317 | } 318 | 319 | /** 320 | * @param {String} elapsedTime - elapsed time in the format mm:ss or hh:mm:ss 321 | * @returns {Boolean} whether the elapsed time reached the maximum number of hours or not 322 | */ 323 | function elapsedTimeReachedMaximumNumberOfHours(elapsedTime) { 324 | //Split the elapsed time by the symbol : 325 | let elapsedTimeSplitted = elapsedTime.split(":"); 326 | 327 | //Turn the maximum recording time in hours to a string and pad it with zero if less than 10 328 | let maximumRecordingTimeInHoursAsString = maximumRecordingTimeInHours < 10 ? "0" + maximumRecordingTimeInHours : maximumRecordingTimeInHours.toString(); 329 | 330 | //if the elapsed time reaches hours and also reaches the maximum recording time in hours, return true 331 | if (elapsedTimeSplitted.length === 3 && elapsedTimeSplitted[0] === maximumRecordingTimeInHoursAsString) 332 | return true; 333 | else //otherwise, return false 334 | return false; 335 | } 336 | 337 | /** Computes the elapsedTime since the moment the function is called in the format mm:ss or hh:mm:ss 338 | * @param {String} startTime - start time to compute the elapsed time since 339 | * @returns {String} elapsed time in mm:ss format if elapsed hours are 0, otherwise in hh:mm:ss format. 
340 | */ 341 | function computeElapsedTime(startTime) { 342 | //record end time 343 | let endTime = new Date(); 344 | 345 | //time difference in ms 346 | let timeDiff = endTime - startTime; 347 | 348 | //convert time difference from ms to seconds 349 | timeDiff = timeDiff / 1000; 350 | 351 | //extract integer seconds that don't form a minute using % 352 | let seconds = Math.floor(timeDiff % 60); //ignoring incomplete seconds (floor) 353 | 354 | //pad seconds with a zero if necessary 355 | seconds = seconds < 10 ? "0" + seconds : seconds; 356 | 357 | //convert time difference from seconds to minutes 358 | timeDiff = Math.floor(timeDiff / 60); 359 | 360 | //extract integer minutes that don't form an hour using % 361 | let minutes = timeDiff % 60; //no need to floor possible incomplete minutes, because they've been handled as seconds 362 | minutes = minutes < 10 ? "0" + minutes : minutes; 363 | 364 | //convert time difference from minutes to hours 365 | timeDiff = Math.floor(timeDiff / 60); 366 | 367 | //extract integer hours that don't form a day using % 368 | let hours = timeDiff % 24; //no need to floor possible incomplete hours, because they've been handled as seconds 369 | 370 | //convert time difference from hours to days 371 | timeDiff = Math.floor(timeDiff / 24); 372 | 373 | // the rest of timeDiff is number of days 374 | let days = timeDiff; //add days to hours 375 | 376 | let totalHours = hours + (days * 24); 377 | totalHours = totalHours < 10 ? "0" + totalHours : totalHours; 378 | 379 | if (totalHours === "00") { 380 | return minutes + ":" + seconds; 381 | } else { 382 | return totalHours + ":" + minutes + ":" + seconds; 383 | } 384 | } 385 | 386 | // audio-recording.js --------------- 387 | //API to handle audio recording 388 | 389 | var audioRecorder = { 390 | /** Stores the recorded audio as Blob objects of audio data as the recording continues*/ 391 | audioBlobs: [],/*of type Blob[]*/ 392 | /** Stores the reference of the MediaRecorder instance that handles the MediaStream when recording starts*/ 393 | mediaRecorder: null, /*of type MediaRecorder*/ 394 | /** Stores the reference to the stream currently capturing the audio*/ 395 | streamBeingCaptured: null, /*of type MediaStream*/ 396 | /** Start recording the audio 397 | * @returns {Promise} - returns a promise that resolves if audio recording successfully started 398 | */ 399 | start: function () { 400 | //Feature Detection 401 | if (!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia)) { 402 | //Feature is not supported in browser 403 | //return a custom error 404 | return Promise.reject(new Error('mediaDevices API or getUserMedia method is not supported in this browser.')); 405 | } 406 | 407 | else { 408 | //Feature is supported in browser 409 | 410 | //create an audio stream 411 | return navigator.mediaDevices.getUserMedia({ audio: true }/*of type MediaStreamConstraints*/) 412 | //returns a promise that resolves to the audio stream 413 | .then(stream /*of type MediaStream*/ => { 414 | 415 | //save the reference of the stream to be able to stop it when necessary 416 | audioRecorder.streamBeingCaptured = stream; 417 | 418 | //create a media recorder instance by passing that stream into the MediaRecorder constructor 419 | audioRecorder.mediaRecorder = new MediaRecorder(stream); /*the MediaRecorder interface of the MediaStream Recording 420 | API provides functionality to easily record media*/ 421 | 422 | //clear previously saved audio Blobs, if any 423 | audioRecorder.audioBlobs = []; 424 | 425 | //add a 
dataavailable event listener in order to store the audio data Blobs when recording 426 | audioRecorder.mediaRecorder.addEventListener("dataavailable", event => { 427 | //store audio Blob object 428 | audioRecorder.audioBlobs.push(event.data); 429 | }); 430 | 431 | //start the recording by calling the start method on the media recorder 432 | audioRecorder.mediaRecorder.start(); 433 | }); 434 | 435 | /* errors are not handled in the API because if its handled and the promise is chained, the .then after the catch will be executed*/ 436 | } 437 | }, 438 | /** Stop the started audio recording 439 | * @returns {Promise} - returns a promise that resolves to the audio as a blob file 440 | */ 441 | stop: function () { 442 | //return a promise that would return the blob or URL of the recording 443 | return new Promise(resolve => { 444 | //save audio type to pass to set the Blob type 445 | let mimeType = audioRecorder.mediaRecorder.mimeType; 446 | 447 | //listen to the stop event in order to create & return a single Blob object 448 | audioRecorder.mediaRecorder.addEventListener("stop", () => { 449 | //create a single blob object, as we might have gathered a few Blob objects that needs to be joined as one 450 | let audioBlob = new Blob(audioRecorder.audioBlobs, { type: mimeType }); 451 | 452 | //resolve promise with the single audio blob representing the recorded audio 453 | resolve(audioBlob); 454 | }); 455 | audioRecorder.cancel(); 456 | }); 457 | }, 458 | /** Cancel audio recording*/ 459 | cancel: function () { 460 | //stop the recording feature 461 | audioRecorder.mediaRecorder.stop(); 462 | 463 | //stop all the tracks on the active stream in order to stop the stream 464 | audioRecorder.stopStream(); 465 | 466 | //reset API properties for next recording 467 | audioRecorder.resetRecordingProperties(); 468 | }, 469 | /** Stop all the tracks on the active stream in order to stop the stream and remove 470 | * the red flashing dot showing in the tab 471 | */ 472 | stopStream: function () { 473 | //stopping the capturing request by stopping all the tracks on the active stream 474 | audioRecorder.streamBeingCaptured.getTracks() //get all tracks from the stream 475 | .forEach(track /*of type MediaStreamTrack*/ => track.stop()); //stop each one 476 | }, 477 | /** Reset all the recording properties including the media recorder and stream being captured*/ 478 | resetRecordingProperties: function () { 479 | audioRecorder.mediaRecorder = null; 480 | audioRecorder.streamBeingCaptured = null; 481 | 482 | /*No need to remove event listeners attached to mediaRecorder as 483 | If a DOM element which is removed is reference-free (no references pointing to it), the element itself is picked 484 | up by the garbage collector as well as any event handlers/listeners associated with it. 
485 | getEventListeners(audioRecorder.mediaRecorder) will return an empty array of events.*/ 486 | } 487 | } -------------------------------------------------------------------------------- /notebooks/[WhisperLM]_KenLMipynb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "kValhOGAlsfj" 7 | }, 8 | "source": [ 9 | "# Dataset" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 4, 15 | "metadata": { 16 | "colab": { 17 | "base_uri": "https://localhost:8080/" 18 | }, 19 | "id": "7PzdrrMwzjZB", 20 | "outputId": "5aa72e6c-5e11-4178-9070-372979bf471a" 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "Downloading...\n", 28 | "From: https://drive.google.com/uc?id=1ypvEoGRNWrNLmW246RtBm9iMyKXm_2BP\n", 29 | "To: /home/hkab/corpus-title.tar.gz\n", 30 | "100%|████████████████████████████████████████| 220M/220M [01:37<00:00, 2.24MB/s]\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "!gdown 1ypvEoGRNWrNLmW246RtBm9iMyKXm_2BP" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 5, 41 | "metadata": { 42 | "colab": { 43 | "base_uri": "https://localhost:8080/" 44 | }, 45 | "id": "dvDBB219zxCr", 46 | "outputId": "88fd7aea-2e48-4b87-d79a-bf5c61b21ebf" 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "corpus-title.txt\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "!tar -xvf corpus-title.tar.gz" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": { 64 | "id": "zaAvDY2xlvpI" 65 | }, 66 | "source": [ 67 | "# Dependencies" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 6, 73 | "metadata": { 74 | "colab": { 75 | "base_uri": "https://localhost:8080/" 76 | }, 77 | "id": "rcOx6ErX3AYb", 78 | "outputId": "b2716c94-da63-4f7c-86e9-e001e66baa3b" 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "Collecting transformers\n", 86 | " Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)\n", 87 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m997.5 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:01\u001b[0m00:01\u001b[0m\n", 88 | "\u001b[?25hRequirement already satisfied: packaging>=20.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from transformers) (21.3)\n", 89 | "Requirement already satisfied: tqdm>=4.27 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from transformers) (4.64.1)\n", 90 | "Requirement already satisfied: requests in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from transformers) (2.28.1)\n", 91 | "Collecting pyyaml>=5.1\n", 92 | " Downloading PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (682 kB)\n", 93 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m682.2/682.2 kB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", 94 | "\u001b[?25hRequirement already satisfied: filelock in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from transformers) (3.8.0)\n", 95 | "Collecting tokenizers!=0.11.3,<0.13,>=0.11.1\n", 96 | " Downloading tokenizers-0.12.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)\n", 97 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.6/6.6 
MB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", 98 | "\u001b[?25hCollecting huggingface-hub<1.0,>=0.9.0\n", 99 | " Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)\n", 100 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.5/163.5 kB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", 101 | "\u001b[?25hCollecting regex!=2019.12.17\n", 102 | " Downloading regex-2022.9.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n", 103 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", 104 | "\u001b[?25hCollecting numpy>=1.17\n", 105 | " Downloading numpy-1.23.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)\n", 106 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.1/17.1 MB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", 107 | "\u001b[?25hRequirement already satisfied: typing-extensions>=3.7.4.3 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.9.0->transformers) (4.4.0)\n", 108 | "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from packaging>=20.0->transformers) (3.0.9)\n", 109 | "Requirement already satisfied: charset-normalizer<3,>=2 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from requests->transformers) (2.1.1)\n", 110 | "Requirement already satisfied: urllib3<1.27,>=1.21.1 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from requests->transformers) (1.26.11)\n", 111 | "Requirement already satisfied: certifi>=2017.4.17 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from requests->transformers) (2022.9.24)\n", 112 | "Requirement already satisfied: idna<4,>=2.5 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from requests->transformers) (3.4)\n", 113 | "Installing collected packages: tokenizers, regex, pyyaml, numpy, huggingface-hub, transformers\n", 114 | "Successfully installed huggingface-hub-0.10.0 numpy-1.23.3 pyyaml-6.0 regex-2022.9.13 tokenizers-0.12.1 transformers-4.22.2\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "!pip install transformers" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 7, 125 | "metadata": { 126 | "id": "605Nw379kJOM" 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "[sudo] password for hkab: \n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "# !sudo apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 16, 144 | "metadata": { 145 | "colab": { 146 | "base_uri": "https://localhost:8080/" 147 | }, 148 | "id": "N2EkHFyXkLS2", 149 | "outputId": "ded57a89-4ad8-4cc5-afb2-d1eaa693746d" 150 | }, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "--2022-10-10 15:20:25-- https://kheafield.com/code/kenlm.tar.gz\n", 157 | "Resolving kheafield.com (kheafield.com)... 35.196.63.85\n", 158 | "Connecting to kheafield.com (kheafield.com)|35.196.63.85|:443... 
connected.\n", 159 | "HTTP request sent, awaiting response... 200 OK\n", 160 | "Length: 491888 (480K) [application/x-gzip]\n", 161 | "Saving to: ‘STDOUT’\n", 162 | "\n", 163 | "- 100%[===================>] 480.36K 395KB/s in 1.2s \n", 164 | "\n", 165 | "2022-10-10 15:20:27 (395 KB/s) - written to stdout [491888/491888]\n", 166 | "\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 17, 177 | "metadata": { 178 | "colab": { 179 | "base_uri": "https://localhost:8080/" 180 | }, 181 | "id": "Irt8SyookOno", 182 | "outputId": "b9cfd484-cfa6-4ecd-f855-79dca383b91f" 183 | }, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "-- The C compiler identification is GNU 9.4.0\n", 190 | "-- The CXX compiler identification is GNU 9.4.0\n", 191 | "-- Check for working C compiler: /usr/bin/cc\n", 192 | "-- Check for working C compiler: /usr/bin/cc -- works\n", 193 | "-- Detecting C compiler ABI info\n", 194 | "-- Detecting C compiler ABI info - done\n", 195 | "-- Detecting C compile features\n", 196 | "-- Detecting C compile features - done\n", 197 | "-- Check for working CXX compiler: /usr/bin/c++\n", 198 | "-- Check for working CXX compiler: /usr/bin/c++ -- works\n", 199 | "-- Detecting CXX compiler ABI info\n", 200 | "-- Detecting CXX compiler ABI info - done\n", 201 | "-- Detecting CXX compile features\n", 202 | "-- Detecting CXX compile features - done\n", 203 | "-- Found Boost: /usr/lib/x86_64-linux-gnu/cmake/Boost-1.71.0/BoostConfig.cmake (found suitable version \"1.71.0\", minimum required is \"1.41.0\") found components: program_options system thread unit_test_framework \n", 204 | "-- Check if compiler accepts -pthread\n", 205 | "-- Check if compiler accepts -pthread - yes\n", 206 | "-- Found Threads: TRUE \n", 207 | "-- Found ZLIB: /usr/lib/x86_64-linux-gnu/libz.so (found version \"1.2.11\") \n", 208 | "-- Found BZip2: /usr/lib/x86_64-linux-gnu/libbz2.so (found version \"1.0.8\") \n", 209 | "-- Looking for BZ2_bzCompressInit\n", 210 | "-- Looking for BZ2_bzCompressInit - found\n", 211 | "-- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so\n", 212 | "-- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n", 213 | "-- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so\n", 214 | "-- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n", 215 | "-- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so\n", 216 | "-- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n", 217 | "-- Found LibLZMA: /usr/lib/x86_64-linux-gnu/liblzma.so (found version \"5.2.4\") \n", 218 | "-- Looking for clock_gettime in rt\n", 219 | "-- Looking for clock_gettime in rt - found\n", 220 | "-- Found OpenMP_C: -fopenmp (found version \"4.5\") \n", 221 | "-- Found OpenMP_CXX: -fopenmp (found version \"4.5\") \n", 222 | "-- Found OpenMP: TRUE (found version \"4.5\") \n", 223 | "-- Configuring done\n", 224 | "-- Generating done\n", 225 | "-- Build files have been written to: /home/hkab/kenlm/build\n", 226 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_util\u001b[0m\n", 227 | "[ 2%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/bignum.cc.o\u001b[0m\n", 228 | "[ 2%] \u001b[32mBuilding CXX object 
util/CMakeFiles/kenlm_util.dir/double-conversion/bignum-dtoa.cc.o\u001b[0m\n", 229 | "[ 3%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/cached-powers.cc.o\u001b[0m\n", 230 | "[ 4%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/diy-fp.cc.o\u001b[0m\n", 231 | "[ 5%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/double-conversion.cc.o\u001b[0m\n", 232 | "[ 6%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fast-dtoa.cc.o\u001b[0m\n", 233 | "[ 7%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fixed-dtoa.cc.o\u001b[0m\n", 234 | "[ 8%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/strtod.cc.o\u001b[0m\n", 235 | "[ 9%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/chain.cc.o\u001b[0m\n", 236 | "[ 10%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/count_records.cc.o\u001b[0m\n", 237 | "[ 11%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/io.cc.o\u001b[0m\n", 238 | "[ 12%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/line_input.cc.o\u001b[0m\n", 239 | "[ 13%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/multi_progress.cc.o\u001b[0m\n", 240 | "[ 14%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/rewindable_stream.cc.o\u001b[0m\n", 241 | "[ 15%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/bit_packing.cc.o\u001b[0m\n", 242 | "[ 16%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/ersatz_progress.cc.o\u001b[0m\n", 243 | "[ 17%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/exception.cc.o\u001b[0m\n", 244 | "[ 18%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file.cc.o\u001b[0m\n", 245 | "[ 19%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file_piece.cc.o\u001b[0m\n", 246 | "[ 20%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/float_to_string.cc.o\u001b[0m\n", 247 | "[ 21%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/integer_to_string.cc.o\u001b[0m\n", 248 | "[ 22%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/mmap.cc.o\u001b[0m\n", 249 | "[ 23%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/murmur_hash.cc.o\u001b[0m\n", 250 | "[ 25%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/parallel_read.cc.o\u001b[0m\n", 251 | "[ 26%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/pool.cc.o\u001b[0m\n", 252 | "[ 27%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/read_compressed.cc.o\u001b[0m\n", 253 | "[ 28%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/scoped.cc.o\u001b[0m\n", 254 | "[ 29%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/spaces.cc.o\u001b[0m\n", 255 | "[ 30%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/string_piece.cc.o\u001b[0m\n", 256 | "[ 31%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/usage.cc.o\u001b[0m\n", 257 | "[ 32%] \u001b[32m\u001b[1mLinking CXX static library ../lib/libkenlm_util.a\u001b[0m\n", 258 | "[ 32%] Built target kenlm_util\n", 259 | "\u001b[35m\u001b[1mScanning dependencies of target probing_hash_table_benchmark\u001b[0m\n", 260 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm\u001b[0m\n", 261 | "[ 33%] \u001b[32mBuilding CXX object 
util/CMakeFiles/probing_hash_table_benchmark.dir/probing_hash_table_benchmark_main.cc.o\u001b[0m\n", 262 | "[ 34%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/bhiksha.cc.o\u001b[0m\n", 263 | "[ 35%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/binary_format.cc.o\u001b[0m\n", 264 | "[ 36%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/config.cc.o\u001b[0m\n", 265 | "[ 37%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/lm_exception.cc.o\u001b[0m\n", 266 | "[ 38%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/model.cc.o\u001b[0m\n", 267 | "[ 39%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/quantize.cc.o\u001b[0m\n", 268 | "[ 40%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/read_arpa.cc.o\u001b[0m\n", 269 | "[ 41%] \u001b[32m\u001b[1mLinking CXX executable ../bin/probing_hash_table_benchmark\u001b[0m\n", 270 | "[ 42%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_hashed.cc.o\u001b[0m\n", 271 | "[ 42%] Built target probing_hash_table_benchmark\n", 272 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_filter\u001b[0m\n", 273 | "[ 43%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/arpa_io.cc.o\u001b[0m\n", 274 | "[ 44%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/phrase.cc.o\u001b[0m\n", 275 | "[ 45%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_trie.cc.o\u001b[0m\n", 276 | "[ 46%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/vocab.cc.o\u001b[0m\n", 277 | "[ 47%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_filter.a\u001b[0m\n", 278 | "[ 47%] Built target kenlm_filter\n", 279 | "[ 48%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/sizes.cc.o\u001b[0m\n", 280 | "[ 50%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie.cc.o\u001b[0m\n", 281 | "[ 51%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie_sort.cc.o\u001b[0m\n", 282 | "[ 52%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/value_build.cc.o\u001b[0m\n", 283 | "[ 53%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/virtual_interface.cc.o\u001b[0m\n", 284 | "[ 54%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/vocab.cc.o\u001b[0m\n", 285 | "[ 55%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/model_buffer.cc.o\u001b[0m\n", 286 | "[ 56%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/print.cc.o\u001b[0m\n", 287 | "[ 57%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/renumber.cc.o\u001b[0m\n", 288 | "[ 58%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/size_option.cc.o\u001b[0m\n", 289 | "[ 59%] \u001b[32m\u001b[1mLinking CXX static library ../lib/libkenlm.a\u001b[0m\n", 290 | "[ 59%] Built target kenlm\n", 291 | "\u001b[35m\u001b[1mScanning dependencies of target build_binary\u001b[0m\n", 292 | "\u001b[35m\u001b[1mScanning dependencies of target fragment\u001b[0m\n", 293 | "[ 60%] \u001b[32mBuilding CXX object lm/CMakeFiles/build_binary.dir/build_binary_main.cc.o\u001b[0m\n", 294 | "[ 61%] \u001b[32mBuilding CXX object lm/CMakeFiles/fragment.dir/fragment_main.cc.o\u001b[0m\n", 295 | "[ 62%] \u001b[32m\u001b[1mLinking CXX executable ../bin/fragment\u001b[0m\n", 296 | "[ 62%] Built target fragment\n", 297 | "[ 63%] \u001b[32m\u001b[1mLinking CXX executable ../bin/build_binary\u001b[0m\n", 298 | "\u001b[35m\u001b[1mScanning dependencies of target query\u001b[0m\n", 299 | "[ 64%] \u001b[32mBuilding CXX object 
lm/CMakeFiles/query.dir/query_main.cc.o\u001b[0m\n", 300 | "[ 64%] Built target build_binary\n", 301 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_benchmark\u001b[0m\n", 302 | "[ 65%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm_benchmark.dir/kenlm_benchmark_main.cc.o\u001b[0m\n", 303 | "[ 66%] \u001b[32m\u001b[1mLinking CXX executable ../bin/query\u001b[0m\n", 304 | "[ 66%] Built target query\n", 305 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_builder\u001b[0m\n", 306 | "[ 67%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/adjust_counts.cc.o\u001b[0m\n", 307 | "[ 68%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/corpus_count.cc.o\u001b[0m\n", 308 | "[ 69%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/initial_probabilities.cc.o\u001b[0m\n", 309 | "[ 70%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/interpolate.cc.o\u001b[0m\n", 310 | "[ 71%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/output.cc.o\u001b[0m\n", 311 | "[ 72%] \u001b[32m\u001b[1mLinking CXX executable ../bin/kenlm_benchmark\u001b[0m\n", 312 | "[ 72%] Built target kenlm_benchmark\n", 313 | "\u001b[35m\u001b[1mScanning dependencies of target phrase_table_vocab\u001b[0m\n", 314 | "[ 73%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/phrase_table_vocab.dir/phrase_table_vocab_main.cc.o\u001b[0m\n", 315 | "[ 75%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/phrase_table_vocab\u001b[0m\n", 316 | "[ 75%] Built target phrase_table_vocab\n", 317 | "\u001b[35m\u001b[1mScanning dependencies of target filter\u001b[0m\n", 318 | "[ 76%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/filter.dir/filter_main.cc.o\u001b[0m\n", 319 | "[ 77%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/pipeline.cc.o\u001b[0m\n", 320 | "[ 78%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_builder.a\u001b[0m\n", 321 | "[ 78%] Built target kenlm_builder\n", 322 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_interpolate\u001b[0m\n", 323 | "[ 79%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/backoff_reunification.cc.o\u001b[0m\n", 324 | "[ 80%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/bounded_sequence_encoding.cc.o\u001b[0m\n", 325 | "[ 81%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/merge_probabilities.cc.o\u001b[0m\n", 326 | "[ 82%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/filter\u001b[0m\n", 327 | "[ 82%] Built target filter\n", 328 | "[ 83%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/merge_vocab.cc.o\u001b[0m\n", 329 | "[ 84%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/normalize.cc.o\u001b[0m\n", 330 | "[ 85%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/pipeline.cc.o\u001b[0m\n", 331 | "[ 86%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/split_worker.cc.o\u001b[0m\n", 332 | "[ 87%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_derivatives.cc.o\u001b[0m\n", 333 | "[ 88%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_instances.cc.o\u001b[0m\n", 334 | "[ 89%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_weights.cc.o\u001b[0m\n", 335 | "[ 90%] \u001b[32mBuilding CXX object 
lm/interpolate/CMakeFiles/kenlm_interpolate.dir/universal_vocab.cc.o\u001b[0m\n", 336 | "\u001b[35m\u001b[1mScanning dependencies of target count_ngrams\u001b[0m\n", 337 | "[ 91%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/count_ngrams.dir/count_ngrams_main.cc.o\u001b[0m\n", 338 | "[ 92%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_interpolate.a\u001b[0m\n", 339 | "[ 92%] Built target kenlm_interpolate\n", 340 | "\u001b[35m\u001b[1mScanning dependencies of target lmplz\u001b[0m\n", 341 | "[ 93%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/lmplz.dir/lmplz_main.cc.o\u001b[0m\n", 342 | "[ 94%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/lmplz\u001b[0m\n", 343 | "[ 94%] Built target lmplz\n", 344 | "\u001b[35m\u001b[1mScanning dependencies of target streaming_example\u001b[0m\n", 345 | "[ 95%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/streaming_example.dir/streaming_example_main.cc.o\u001b[0m\n", 346 | "[ 96%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/count_ngrams\u001b[0m\n", 347 | "[ 96%] Built target count_ngrams\n", 348 | "\u001b[35m\u001b[1mScanning dependencies of target interpolate\u001b[0m\n", 349 | "[ 97%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/interpolate.dir/interpolate_main.cc.o\u001b[0m\n", 350 | "[ 98%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/streaming_example\u001b[0m\n", 351 | "[ 98%] Built target streaming_example\n", 352 | "[100%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/interpolate\u001b[0m\n", 353 | "[100%] Built target interpolate\n", 354 | "build_binary fragment\t lmplz\t\t\t query\n", 355 | "count_ngrams interpolate phrase_table_vocab\t streaming_example\n", 356 | "filter\t kenlm_benchmark probing_hash_table_benchmark\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "!mkdir kenlm/build && cd kenlm/build && cmake .. 
&& make -j2\n", 362 | "!ls kenlm/build/bin" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 18, 368 | "metadata": { 369 | "colab": { 370 | "base_uri": "https://localhost:8080/" 371 | }, 372 | "id": "mW60TiF_oby2", 373 | "outputId": "382fd3f6-12c5-4fc9-e7dd-fd63adb6dd47" 374 | }, 375 | "outputs": [ 376 | { 377 | "name": "stdout", 378 | "output_type": "stream", 379 | "text": [ 380 | "Collecting pyctcdecode\n", 381 | " Downloading pyctcdecode-0.4.0-py2.py3-none-any.whl (45 kB)\n", 382 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.1/45.1 kB\u001b[0m \u001b[31m842.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m \u001b[36m0:00:01\u001b[0m\n", 383 | "\u001b[?25hCollecting pygtrie<3.0,>=2.1\n", 384 | " Downloading pygtrie-2.5.0-py3-none-any.whl (25 kB)\n", 385 | "Collecting hypothesis<7,>=6.14\n", 386 | " Downloading hypothesis-6.56.1-py3-none-any.whl (395 kB)\n", 387 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m395.3/395.3 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", 388 | "\u001b[?25hRequirement already satisfied: numpy<2.0.0,>=1.15.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from pyctcdecode) (1.23.3)\n", 389 | "Requirement already satisfied: attrs>=19.2.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from hypothesis<7,>=6.14->pyctcdecode) (22.1.0)\n", 390 | "Collecting exceptiongroup>=1.0.0rc8\n", 391 | " Downloading exceptiongroup-1.0.0rc9-py3-none-any.whl (12 kB)\n", 392 | "Collecting sortedcontainers<3.0.0,>=2.1.0\n", 393 | " Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)\n", 394 | "Installing collected packages: sortedcontainers, pygtrie, exceptiongroup, hypothesis, pyctcdecode\n", 395 | "Successfully installed exceptiongroup-1.0.0rc9 hypothesis-6.56.1 pyctcdecode-0.4.0 pygtrie-2.5.0 sortedcontainers-2.4.0\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "!pip install pyctcdecode" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 19, 406 | "metadata": { 407 | "colab": { 408 | "base_uri": "https://localhost:8080/", 409 | "height": 315 410 | }, 411 | "id": "E20MfjkLrJmt", 412 | "outputId": "72164662-c63f-472a-ff97-7e26d5f3adc1" 413 | }, 414 | "outputs": [ 415 | { 416 | "name": "stdout", 417 | "output_type": "stream", 418 | "text": [ 419 | "Collecting https://github.com/kpu/kenlm/archive/master.zip\n", 420 | " Downloading https://github.com/kpu/kenlm/archive/master.zip (550 kB)\n", 421 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m550.7/550.7 kB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", 422 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n", 423 | "\u001b[?25hBuilding wheels for collected packages: kenlm\n", 424 | " Building wheel for kenlm (setup.py) ... 
\u001b[?25ldone\n", 425 | "\u001b[?25h Created wheel for kenlm: filename=kenlm-0.0.0-cp310-cp310-linux_x86_64.whl size=340147 sha256=bcdae3827c372f9fbea4e21d082f520f3d8dfeb5c555fd138a20cbf9a3ca88a7\n", 426 | " Stored in directory: /tmp/pip-ephem-wheel-cache-rno5cvzv/wheels/a5/73/ee/670fbd0cee8f6f0b21d10987cb042291e662e26e1a07026462\n", 427 | "Successfully built kenlm\n", 428 | "Installing collected packages: kenlm\n", 429 | "Successfully installed kenlm-0.0.0\n" 430 | ] 431 | } 432 | ], 433 | "source": [ 434 | "!pip install https://github.com/kpu/kenlm/archive/master.zip" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 21, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "Collecting joblib\n", 447 | " Downloading joblib-1.2.0-py3-none-any.whl (297 kB)\n", 448 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", 449 | "\u001b[?25hInstalling collected packages: joblib\n", 450 | "Successfully installed joblib-1.2.0\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "!pip install joblib" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 23, 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "name": "stdout", 465 | "output_type": "stream", 466 | "text": [ 467 | "Collecting ipywidgets\n", 468 | " Downloading ipywidgets-8.0.2-py3-none-any.whl (134 kB)\n", 469 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.4/134.4 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", 470 | "\u001b[?25hRequirement already satisfied: traitlets>=4.3.1 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipywidgets) (5.4.0)\n", 471 | "Collecting jupyterlab-widgets~=3.0\n", 472 | " Downloading jupyterlab_widgets-3.0.3-py3-none-any.whl (384 kB)\n", 473 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m384.1/384.1 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", 474 | "\u001b[?25hRequirement already satisfied: ipython>=6.1.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipywidgets) (8.5.0)\n", 475 | "Collecting widgetsnbextension~=4.0\n", 476 | " Downloading widgetsnbextension-4.0.3-py3-none-any.whl (2.0 MB)\n", 477 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", 478 | "\u001b[?25hRequirement already satisfied: ipykernel>=4.5.1 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipywidgets) (6.16.0)\n", 479 | "Requirement already satisfied: psutil in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (5.9.0)\n", 480 | "Requirement already satisfied: packaging in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (21.3)\n", 481 | "Requirement already satisfied: tornado>=6.1 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (6.1)\n", 482 | "Requirement already satisfied: nest-asyncio in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (1.5.6)\n", 483 | "Requirement already satisfied: pyzmq>=17 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from 
ipykernel>=4.5.1->ipywidgets) (23.2.0)\n", 484 | "Requirement already satisfied: matplotlib-inline>=0.1 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (0.1.6)\n", 485 | "Requirement already satisfied: jupyter-client>=6.1.12 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (7.3.4)\n", 486 | "Requirement already satisfied: debugpy>=1.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (1.5.1)\n", 487 | "Requirement already satisfied: pexpect>4.3 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (4.8.0)\n", 488 | "Requirement already satisfied: prompt-toolkit<3.1.0,>3.0.1 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.31)\n", 489 | "Requirement already satisfied: stack-data in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.5.1)\n", 490 | "Requirement already satisfied: pickleshare in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.7.5)\n", 491 | "Requirement already satisfied: pygments>=2.4.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (2.13.0)\n", 492 | "Requirement already satisfied: decorator in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)\n", 493 | "Requirement already satisfied: backcall in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.2.0)\n", 494 | "Requirement already satisfied: jedi>=0.16 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.18.1)\n", 495 | "Requirement already satisfied: parso<0.9.0,>=0.8.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.3)\n", 496 | "Requirement already satisfied: entrypoints in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (0.4)\n", 497 | "Requirement already satisfied: jupyter-core>=4.9.2 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (4.11.1)\n", 498 | "Requirement already satisfied: python-dateutil>=2.8.2 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (2.8.2)\n", 499 | "Requirement already satisfied: ptyprocess>=0.5 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)\n", 500 | "Requirement already satisfied: wcwidth in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from prompt-toolkit<3.1.0,>3.0.1->ipython>=6.1.0->ipywidgets) (0.2.5)\n", 501 | "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from packaging->ipykernel>=4.5.1->ipywidgets) (3.0.9)\n", 502 | "Requirement already satisfied: executing in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (1.1.1)\n", 503 | "Requirement already satisfied: asttokens in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.0.8)\n", 504 | "Requirement already satisfied: pure-eval in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.2.2)\n", 505 | "Requirement already satisfied: six>=1.5 in ./miniconda3/envs/nlp/lib/python3.10/site-packages 
(from python-dateutil>=2.8.2->jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (1.16.0)\n", 506 | "Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets\n", 507 | "Successfully installed ipywidgets-8.0.2 jupyterlab-widgets-3.0.3 widgetsnbextension-4.0.3\n" 508 | ] 509 | } 510 | ], 511 | "source": [ 512 | "!pip install ipywidgets" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": { 518 | "id": "Sl0N0sjUlycn" 519 | }, 520 | "source": [ 521 | "# KenLM" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 1, 527 | "metadata": { 528 | "colab": { 529 | "base_uri": "https://localhost:8080/", 530 | "height": 104, 531 | "referenced_widgets": [ 532 | "63415a015b504c7f8baa3d5a60b7c560", 533 | "b1cfc6b061cd40acbf15c3a617c2decd", 534 | "98c0b2a457bf405482ec3b11c61ac454", 535 | "ffe2904f8a2747638e11410a5474f9e4", 536 | "e8d2d9fa065f4a13b614dd8c4a72f824", 537 | "3fee1bf921dd48f69811c0aaa3213974", 538 | "06989b53974f41d28796d7e9f7ca1034", 539 | "aabcd286ca28404da9170688b6871027", 540 | "5cb3e448899c47ba9fd2c7731aba954b", 541 | "9668a1ccd9d742349585cf3349cff5b6", 542 | "9073d7f2397f47d0a60bc828f69f93cf" 543 | ] 544 | }, 545 | "id": "Bdf74ScY1sIn", 546 | "outputId": "2ac3feef-9b8e-4323-b4e6-61ef63c2e158" 547 | }, 548 | "outputs": [], 549 | "source": [ 550 | "import json\n", 551 | "import os\n", 552 | "\n", 553 | "import numpy as np\n", 554 | "from joblib import Parallel, delayed\n", 555 | "from tqdm.auto import tqdm\n", 556 | "from transformers import GPT2TokenizerFast, GPT2Tokenizer" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 2, 562 | "metadata": { 563 | "id": "AzCxddKg1m-4" 564 | }, 565 | "outputs": [], 566 | "source": [ 567 | "# Utilities from https://github.com/NVIDIA/NeMo/blob/stable/scripts/asr_language_modeling/ngram_lm/kenlm_utils.py\n", 568 | "def tokenize_str(texts, tokenizer, offset):\n", 569 | " tokenized_text = []\n", 570 | " for text in texts:\n", 571 | " tok_text = tokenizer.encode(text)\n", 572 | " tok_text = [chr(token + offset) for token in tok_text]\n", 573 | " tokenized_text.append(tok_text)\n", 574 | " return tokenized_text\n", 575 | "\n", 576 | "def tokenize_text(data, tokenizer, path, chunk_size=8192, buffer_size=32, token_offset=100):\n", 577 | " dataset_len = len(data)\n", 578 | " print(\n", 579 | " f\"Chunking {dataset_len} rows into {dataset_len / float(chunk_size):0.4f} tasks (each chunk contains {chunk_size} elements)\"\n", 580 | " )\n", 581 | "\n", 582 | " current_step = 0\n", 583 | " if os.path.exists(path):\n", 584 | " print(f\"Deleting previous file : {path}\")\n", 585 | " os.remove(path)\n", 586 | "\n", 587 | " with Parallel(n_jobs=-2, verbose=10) as parallel:\n", 588 | " while True:\n", 589 | " start = current_step * chunk_size\n", 590 | " end = min((current_step + buffer_size) * chunk_size, dataset_len)\n", 591 | "\n", 592 | " tokenized_data = parallel(\n", 593 | " delayed(tokenize_str)(data[start : start + chunk_size], tokenizer, token_offset)\n", 594 | " for start in range(start, end, chunk_size)\n", 595 | " )\n", 596 | "\n", 597 | " # Write dataset\n", 598 | " write_dataset(tokenized_data, path)\n", 599 | " current_step += len(tokenized_data)\n", 600 | " print(f\"Finished writing {len(tokenized_data)} chunks to {path}. 
Current chunk index = {current_step}\")\n", 601 | " del tokenized_data\n", 602 | " if end >= dataset_len:\n", 603 | " break\n", 604 | "\n", 605 | "\n", 606 | "def write_dataset(chunks, path):\n", 607 | " # basedir = os.path.dirname(path)\n", 608 | "\n", 609 | " # if not os.path.exists(basedir):\n", 610 | " # os.makedirs(basedir, exist_ok=True)\n", 611 | "\n", 612 | " with open(path, 'a+', encoding='utf-8') as f:\n", 613 | " for chunk_idx in tqdm(range(len(chunks)), desc='Chunk ', total=len(chunks), unit=' chunks'):\n", 614 | " for text in chunks[chunk_idx]:\n", 615 | " line = ' '.join(text)\n", 616 | " f.write(f\"{line}\\n\")" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 3, 622 | "metadata": { 623 | "id": "QnBBMMH-z-7N" 624 | }, 625 | "outputs": [], 626 | "source": [ 627 | "with open(\"corpus-title.txt\", 'r', encoding='utf-8') as f:\n", 628 | " dataset = f.readlines()" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 4, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "text/plain": [ 639 | "'Chây ì nộp phạt nguội.\\n'" 640 | ] 641 | }, 642 | "execution_count": 4, 643 | "metadata": {}, 644 | "output_type": "execute_result" 645 | } 646 | ], 647 | "source": [ 648 | "dataset[0]" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 5, 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "chars_to_ignore_regex = '[,?.!\\-\\;\\:\"“%‘”�—’…–]' \n", 658 | "import re\n", 659 | "\n", 660 | "def clean_text(text):\n", 661 | " text = re.sub(chars_to_ignore_regex, \"\", text.lower())\n", 662 | " return text" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 6, 668 | "metadata": {}, 669 | "outputs": [ 670 | { 671 | "name": "stderr", 672 | "output_type": "stream", 673 | "text": [ 674 | "100%|██████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 89251.49it/s]\n" 675 | ] 676 | } 677 | ], 678 | "source": [ 679 | "from tqdm import tqdm\n", 680 | "dataset_clean = []\n", 681 | "for text in tqdm(dataset[0:100000]):\n", 682 | " dataset_clean.append(clean_text(text))" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 7, 688 | "metadata": {}, 689 | "outputs": [ 690 | { 691 | "name": "stdout", 692 | "output_type": "stream", 693 | "text": [ 694 | "thay đổi về đăng ký chuyển nhượng xe từ 12/2 bạn cần biết\n", 695 | "\n" 696 | ] 697 | } 698 | ], 699 | "source": [ 700 | "print(dataset_clean[4])" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": 11, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "import whisper\n", 710 | "tokenizer = whisper.tokenizer.get_tokenizer('vi').tokenizer" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 9, 716 | "metadata": { 717 | "colab": { 718 | "base_uri": "https://localhost:8080/", 719 | "height": 292, 720 | "referenced_widgets": [ 721 | "2c05958e110648a1ac6d447b26a94ad4", 722 | "262929238fd543cc8190f755bd48aca1", 723 | "4f1d415e84ba4db99c05837edfd53bda", 724 | "269f80894a8b42169242e5a9f711ea96", 725 | "7b023fdc14184a9e95d8db379b511673", 726 | "aba59e0a981845b0a2afc17f1ea8fad0", 727 | "b511866d7dcc4c69b114720594db39b9", 728 | "4a48e417d4be432c8733f046974af3cb", 729 | "cb1bd500742d4938a96e966c8964ecb6", 730 | "bc79e00c44874895a62dbab150be6b91", 731 | "a31dc64cca7146bd95cf8e98db98828c" 732 | ] 733 | }, 734 | "id": "kC2cCbfQ4ZD5", 735 | "outputId": "4ee3c965-cbf1-415a-d0b8-7969fb984df1" 736 | }, 737 | "outputs": 
[ 738 | { 739 | "name": "stdout", 740 | "output_type": "stream", 741 | "text": [ 742 | "Chunking 10000 rows into 1.2207 tasks (each chunk contains 8192 elements)\n" 743 | ] 744 | }, 745 | { 746 | "name": "stderr", 747 | "output_type": "stream", 748 | "text": [ 749 | "[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.\n", 750 | "[Parallel(n_jobs=-2)]: Done 2 out of 2 | elapsed: 4.5s remaining: 0.0s\n", 751 | "[Parallel(n_jobs=-2)]: Done 2 out of 2 | elapsed: 4.5s finished\n", 752 | "Chunk : 100%|██████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 55.85 chunks/s]" 753 | ] 754 | }, 755 | { 756 | "name": "stdout", 757 | "output_type": "stream", 758 | "text": [ 759 | "Finished writing 2 chunks to dataset_tokenized.txt. Current chunk index = 2\n" 760 | ] 761 | }, 762 | { 763 | "name": "stderr", 764 | "output_type": "stream", 765 | "text": [ 766 | "\n" 767 | ] 768 | } 769 | ], 770 | "source": [ 771 | "tokenize_text(dataset_clean[0:10000], tokenizer, \"dataset_tokenized.txt\")" 772 | ] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "metadata": { 777 | "id": "6ELow7l2mVKc" 778 | }, 779 | "source": [ 780 | "--discount_fallback is needed for training KenLM for BPE-based models" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": 10, 786 | "metadata": { 787 | "colab": { 788 | "base_uri": "https://localhost:8080/" 789 | }, 790 | "id": "PMk_TCafBrXY", 791 | "outputId": "7f67464a-5367-4c10-c803-6773e26822c6" 792 | }, 793 | "outputs": [ 794 | { 795 | "name": "stdout", 796 | "output_type": "stream", 797 | "text": [ 798 | "=== 1/5 Counting and sorting n-grams ===\n", 799 | "Reading /home/hkab/dataset_tokenized.txt\n", 800 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", 801 | "****************************************************************************************************\n", 802 | "Unigram tokens 292401 types 3012\n", 803 | "=== 2/5 Calculating and sorting adjusted counts ===\n", 804 | "Chain sizes: 1:36144 2:1812654080 3:3398726400\n", 805 | "Statistics:\n", 806 | "1 3012 D1=0.626738 D2=0.996616 D3+=1.56584\n", 807 | "2 42556 D1=0.683317 D2=1.13987 D3+=1.61722\n", 808 | "3 110710 D1=0.73464 D2=1.12548 D3+=1.43493\n", 809 | "Memory estimate for binary LM:\n", 810 | "type kB\n", 811 | "probing 3019 assuming -p 1.5\n", 812 | "probing 3281 assuming -r models -p 1.5\n", 813 | "trie 1129 without quantization\n", 814 | "trie 577 assuming -q 8 -b 8 quantization \n", 815 | "trie 1084 assuming -a 22 array pointer compression\n", 816 | "trie 532 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n", 817 | "=== 3/5 Calculating and sorting initial probabilities ===\n", 818 | "Chain sizes: 1:36144 2:680896 3:2214200\n", 819 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", 820 | "####################################################################################################\n", 821 | "=== 4/5 Calculating and writing order-interpolated probabilities ===\n", 822 | "Chain sizes: 1:36144 2:680896 3:2214200\n", 823 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", 824 | "####################################################################################################\n", 825 | "=== 5/5 Writing ARPA model ===\n", 826 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", 827 | 
"****************************************************************************************************\n", 828 | "Name:lmplz\tVmPeak:5254272 kB\tVmRSS:9092 kB\tRSSMax:1978128 kB\tuser:0.520195\tsys:3.20971\tCPU:3.73272\treal:3.61356\n" 829 | ] 830 | } 831 | ], 832 | "source": [ 833 | "!kenlm/build/bin/lmplz -o 3 --text dataset_tokenized.txt --arpa dataset_tokenized_3gram.arpa --discount_fallback" 834 | ] 835 | }, 836 | { 837 | "cell_type": "code", 838 | "execution_count": 19, 839 | "metadata": { 840 | "colab": { 841 | "base_uri": "https://localhost:8080/" 842 | }, 843 | "id": "rMbF9O8nnkrg", 844 | "outputId": "2fd75ffe-cf7c-40ae-e29b-f058a2521709" 845 | }, 846 | "outputs": [ 847 | { 848 | "name": "stdout", 849 | "output_type": "stream", 850 | "text": [ 851 | "-0.6037645\t 桻 │\n", 852 | "-1.9628253\t • Ჯ\n", 853 | "-0.55146927\tᲯ ֭ 湨\n", 854 | "-0.30274913\t֭ 湨 ণ\n", 855 | "-1.1774871\t豵 o 揍\n", 856 | "-1.1730413\t– ź Ȩ\n", 857 | "-0.8179335\t} ৾ ୬\n", 858 | "-0.51691043\t⨠ ⽭ 㨢\n", 859 | "-0.30274913\t⽭ 㨢 ᱎ\n", 860 | "-2.5927703\tȉ 忎 㕒\n", 861 | "-2.5571775\tܫ 〶 㕒\n", 862 | "-0.23558763\t忎 㕒 砄\n", 863 | "-0.23558763\t〶 㕒 砄\n", 864 | "-0.25315025\tό ᭊ 涐\n", 865 | "-0.14362468\tɁ ᭊ 涐\n", 866 | "-2.8912714\t  б\n", 867 | "-0.49007577\t⼺ Ȝ 뀗\n", 868 | "-0.41827714\tj ո ঎\n", 869 | "\n", 870 | "\\end\\\n" 871 | ] 872 | } 873 | ], 874 | "source": [ 875 | "!tail -20 dataset_tokenized_3gram.arpa" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": 13, 881 | "metadata": { 882 | "colab": { 883 | "base_uri": "https://localhost:8080/" 884 | }, 885 | "id": "FxTQdOUYqlmX", 886 | "outputId": "2f60fd19-cee6-49f2-873d-ad3c10183f7d" 887 | }, 888 | "outputs": [ 889 | { 890 | "name": "stdout", 891 | "output_type": "stream", 892 | "text": [ 893 | "Reading dataset_tokenized_3gram.arpa\n", 894 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", 895 | "****************************************************************************************************\n", 896 | "SUCCESS\n" 897 | ] 898 | } 899 | ], 900 | "source": [ 901 | "!kenlm/build/bin/build_binary dataset_tokenized_3gram.arpa dataset_tokenized_3gram.binary " 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": 15, 907 | "metadata": {}, 908 | "outputs": [], 909 | "source": [ 910 | "import kenlm" 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": 34, 916 | "metadata": {}, 917 | "outputs": [], 918 | "source": [ 919 | "model = kenlm.Model('dataset_tokenized_5gram_vi_title.binary')" 920 | ] 921 | }, 922 | { 923 | "cell_type": "code", 924 | "execution_count": 35, 925 | "metadata": {}, 926 | "outputs": [ 927 | { 928 | "data": { 929 | "text/plain": [ 930 | "5" 931 | ] 932 | }, 933 | "execution_count": 35, 934 | "metadata": {}, 935 | "output_type": "execute_result" 936 | } 937 | ], 938 | "source": [ 939 | "model.order" 940 | ] 941 | }, 942 | { 943 | "cell_type": "code", 944 | "execution_count": 36, 945 | "metadata": {}, 946 | "outputs": [ 947 | { 948 | "data": { 949 | "text/plain": [ 950 | "'ốc Bhutan'" 951 | ] 952 | }, 953 | "execution_count": 36, 954 | "metadata": {}, 955 | "output_type": "execute_result" 956 | } 957 | ], 958 | "source": [ 959 | "a = \"忎 㕒 砄\".split()\n", 960 | "tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens([ord(i) - 100 for i in a]))" 961 | ] 962 | }, 963 | { 964 | "cell_type": "code", 965 | "execution_count": 7, 966 | "metadata": {}, 967 | "outputs": [], 968 | "source": [ 969 | "def 
tokenize_str(texts, tokenizer, offset):\n", 970 | "    tokenized_text = []\n", 971 | "    for text in texts:\n", 972 | "        tok_text = tokenizer.encode(text)\n", 973 | "        tok_text = [chr(token + offset) for token in tok_text]\n", 974 | "        tokenized_text.append(tok_text)\n", 975 | "    return tokenized_text" 976 | ] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "execution_count": 8, 981 | "metadata": {}, 982 | "outputs": [], 983 | "source": [ 984 | "sentence = 'Nếu trồng người góc'" 985 | ] 986 | }, 987 | { 988 | "cell_type": "code", 989 | "execution_count": 12, 990 | "metadata": {}, 991 | "outputs": [], 992 | "source": [ 993 | "# sentence_split = tokenizer.convert_ids_to_tokens(tokenizer(sentence)['input_ids'])\n", 994 | "# Must tokenize with tokenize_str because we add an offset of 100 :D\n", 995 | "sentence_split = tokenize_str([sentence], tokenizer, 100)[0]" 996 | ] 997 | }, 998 | { 999 | "cell_type": "code", 1000 | "execution_count": 13, 1001 | "metadata": {}, 1002 | "outputs": [ 1003 | { 1004 | "data": { 1005 | "text/plain": [ 1006 | "['\\x91', '矌', 'ŀ', '᭛', '襢', '㹾', 'Ɔ', 'ꂅ']" 1007 | ] 1008 | }, 1009 | "execution_count": 13, 1010 | "metadata": {}, 1011 | "output_type": "execute_result" 1012 | } 1013 | ], 1014 | "source": [ 1015 | "sentence_split" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": 40, 1021 | "metadata": {}, 1022 | "outputs": [ 1023 | { 1024 | "data": { 1025 | "text/plain": [ 1026 | "'Địa ốc Bhutan mới nổi.'" 1027 | ] 1028 | }, 1029 | "execution_count": 40, 1030 | "metadata": {}, 1031 | "output_type": "execute_result" 1032 | } 1033 | ], 1034 | "source": [ 1035 | "# To convert back, take ord and subtract 100, the inverse of chr(token + offset)\n", 1036 | "tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens([ord(i) - 100 for i in sentence_split]))" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "code", 1041 | "execution_count": 41, 1042 | "metadata": {}, 1043 | "outputs": [ 1044 | { 1045 | "data": { 1046 | "text/plain": [ 1047 | "'ä Œ ⸠ ¤ ŀ 忎 㕒 砄 鈴 ƍ 鞔 q'" 1048 | ] 1049 | }, 1050 | "execution_count": 41, 1051 | "metadata": {}, 1052 | "output_type": "execute_result" 1053 | } 1054 | ], 1055 | "source": [ 1056 | "\" \".join(sentence_split)" 1057 | ] 1058 | }, 1059 | { 1060 | "cell_type": "code", 1061 | "execution_count": 42, 1062 | "metadata": {}, 1063 | "outputs": [ 1064 | { 1065 | "data": { 1066 | "text/plain": [ 1067 | "-76.36715698242188" 1068 | ] 1069 | }, 1070 | "execution_count": 42, 1071 | "metadata": {}, 1072 | "output_type": "execute_result" 1073 | } 1074 | ], 1075 | "source": [ 1076 | "model.score(\" \".join(sentence_split))" 1077 | ] 1078 | }, 1079 | { 1080 | "cell_type": "code", 1081 | "execution_count": 33, 1082 | "metadata": {}, 1083 | "outputs": [ 1084 | { 1085 | "data": { 1086 | "text/plain": [ 1087 | "-32.09736633300781" 1088 | ] 1089 | }, 1090 | "execution_count": 33, 1091 | "metadata": {}, 1092 | "output_type": "execute_result" 1093 | } 1094 | ], 1095 | "source": [ 1096 | "model.score(\" \".join(sentence_split))" 1097 | ] 1098 | }, 1099 | { 1100 | "cell_type": "code", 1101 | "execution_count": 43, 1102 | "metadata": {}, 1103 | "outputs": [ 1104 | { 1105 | "name": "stdout", 1106 | "output_type": "stream", 1107 | "text": [ 1108 | "prob: -9.877823829650879 length: 1: ä\n", 1109 | "\t\"ä\" is an OOV\n", 1110 | "prob: -5.552821159362793 length: 1: Œ\n", 1111 | "\t\"Œ\" is an OOV\n", 1112 | "prob: -5.552821159362793 length: 1: ⸠\n", 1113 | "\t\"⸠\" is an OOV\n", 1114 | "prob: -5.552821159362793 length: 1: ¤\n", 1115 | "\t\"¤\" is an OOV\n",
1116 | "prob: -5.552821159362793 length: 1: ŀ\n", 1117 | "\t\"ŀ\" is an OOV\n", 1118 | "prob: -5.552821159362793 length: 1: 忎\n", 1119 | "\t\"忎\" is an OOV\n", 1120 | "prob: -5.552821159362793 length: 1: 㕒\n", 1121 | "\t\"㕒\" is an OOV\n", 1122 | "prob: -5.552821159362793 length: 1: 砄\n", 1123 | "\t\"砄\" is an OOV\n", 1124 | "prob: -5.552821159362793 length: 1: 鈴\n", 1125 | "\t\"鈴\" is an OOV\n", 1126 | "prob: -5.552821159362793 length: 1: ƍ\n", 1127 | "\t\"ƍ\" is an OOV\n", 1128 | "prob: -5.552821159362793 length: 1: 鞔\n", 1129 | "\t\"鞔\" is an OOV\n", 1130 | "prob: -5.552821159362793 length: 1: q\n", 1131 | "\t\"q\" is an OOV\n", 1132 | "prob: -5.408297538757324 length: 1: \n" 1133 | ] 1134 | } 1135 | ], 1136 | "source": [ 1137 | "# Show scores and n-gram matches\n", 1138 | "words = [''] + sentence_split + ['']\n", 1139 | "for i, (prob, length, oov) in enumerate(model.full_scores(\" \".join(sentence_split))):\n", 1140 | " print('prob: {0} length: {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2])))\n", 1141 | " if oov:\n", 1142 | " print('\\t\"{0}\" is an OOV'.format(words[i+1]))" 1143 | ] 1144 | }, 1145 | { 1146 | "cell_type": "code", 1147 | "execution_count": 144, 1148 | "metadata": {}, 1149 | "outputs": [ 1150 | { 1151 | "name": "stdout", 1152 | "output_type": "stream", 1153 | "text": [ 1154 | "prob: -7.633749485015869 length: 1: —\n", 1155 | "prob: -6.314070701599121 length: 1: 〙\n", 1156 | "\t\"〙\" is an OOV\n", 1157 | "prob: -5.552821159362793 length: 1: ŀ\n", 1158 | "\t\"ŀ\" is an OOV\n", 1159 | "prob: -5.552821159362793 length: 1: 刺\n", 1160 | "\t\"刺\" is an OOV\n", 1161 | "prob: -5.552821159362793 length: 1: ό\n", 1162 | "\t\"ό\" is an OOV\n", 1163 | "prob: -5.552821159362793 length: 1: 䭤\n", 1164 | "\t\"䭤\" is an OOV\n", 1165 | "prob: -5.552821159362793 length: 1: 阦\n", 1166 | "\t\"阦\" is an OOV\n", 1167 | "prob: -5.552821159362793 length: 1: ŀ\n", 1168 | "\t\"ŀ\" is an OOV\n", 1169 | "prob: -5.552821159362793 length: 1: ·\n", 1170 | "\t\"·\" is an OOV\n", 1171 | "prob: -5.552821159362793 length: 1: 鷨\n", 1172 | "\t\"鷨\" is an OOV\n", 1173 | "prob: -5.408297538757324 length: 1: \n" 1174 | ] 1175 | } 1176 | ], 1177 | "source": [ 1178 | "# Show scores and n-gram matches\n", 1179 | "words = [''] + sentence_split + ['']\n", 1180 | "for i, (prob, length, oov) in enumerate(model.full_scores(\" \".join(sentence_split))):\n", 1181 | " print('prob: {0} length: {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2])))\n", 1182 | " if oov:\n", 1183 | " print('\\t\"{0}\" is an OOV'.format(words[i+1]))" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "code", 1188 | "execution_count": 2, 1189 | "metadata": {}, 1190 | "outputs": [], 1191 | "source": [ 1192 | "import kenlm" 1193 | ] 1194 | }, 1195 | { 1196 | "cell_type": "code", 1197 | "execution_count": 3, 1198 | "metadata": {}, 1199 | "outputs": [], 1200 | "source": [ 1201 | "model = kenlm.Model('dataset_tokenized_3gram.binary')" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "code", 1206 | "execution_count": 4, 1207 | "metadata": {}, 1208 | "outputs": [], 1209 | "source": [ 1210 | "state = kenlm.State()\n", 1211 | "state_next_token = kenlm.State()\n", 1212 | "state_next_token_t = kenlm.State()" 1213 | ] 1214 | }, 1215 | { 1216 | "cell_type": "code", 1217 | "execution_count": 5, 1218 | "metadata": {}, 1219 | "outputs": [ 1220 | { 1221 | "data": { 1222 | "text/plain": [ 1223 | "-8.021354913711548" 1224 | ] 1225 | }, 1226 | "execution_count": 5, 1227 | "metadata": {}, 1228 | "output_type": "execute_result" 1229 | } 1230 | 
], 1231 | "source": [ 1232 | "model.BeginSentenceWrite(state)\n", 1233 | "accum = 0\n", 1234 | "accum += model.BaseScore(state, \"鷨\", state_next_token)\n", 1235 | "accum += model.BaseScore(state_next_token, \"刺\", state)\n", 1236 | "accum" 1237 | ] 1238 | }, 1239 | { 1240 | "cell_type": "code", 1241 | "execution_count": 6, 1242 | "metadata": {}, 1243 | "outputs": [ 1244 | { 1245 | "data": { 1246 | "text/plain": [ 1247 | "-12.832839250564575" 1248 | ] 1249 | }, 1250 | "execution_count": 6, 1251 | "metadata": {}, 1252 | "output_type": "execute_result" 1253 | } 1254 | ], 1255 | "source": [ 1256 | "prob = model.BaseScore(state, \"\", state_next_token_t)\n", 1257 | "prob + accum" 1258 | ] 1259 | }, 1260 | { 1261 | "cell_type": "code", 1262 | "execution_count": 71, 1263 | "metadata": {}, 1264 | "outputs": [ 1265 | { 1266 | "data": { 1267 | "text/plain": [ 1268 | "-12.832839012145996" 1269 | ] 1270 | }, 1271 | "execution_count": 71, 1272 | "metadata": {}, 1273 | "output_type": "execute_result" 1274 | } 1275 | ], 1276 | "source": [ 1277 | "model.score(\"鷨 刺 \", eos=False)" 1278 | ] 1279 | } 1280 | ], 1281 | "metadata": { 1282 | "colab": { 1283 | "collapsed_sections": [ 1284 | "kValhOGAlsfj", 1285 | "zaAvDY2xlvpI" 1286 | ], 1287 | "provenance": [] 1288 | }, 1289 | "kernelspec": { 1290 | "display_name": "Python 3 (ipykernel)", 1291 | "language": "python", 1292 | "name": "python3" 1293 | }, 1294 | "language_info": { 1295 | "codemirror_mode": { 1296 | "name": "ipython", 1297 | "version": 3 1298 | }, 1299 | "file_extension": ".py", 1300 | "mimetype": "text/x-python", 1301 | "name": "python", 1302 | "nbconvert_exporter": "python", 1303 | "pygments_lexer": "ipython3", 1304 | "version": "3.10.6" 1305 | }, 1306 | "vscode": { 1307 | "interpreter": { 1308 | "hash": "569cc53f83e70b41c2d959ffedb296ac14adb9e332ab59ae04a2c7a2935b0e00" 1309 | } 1310 | }, 1311 | "widgets": { 1312 | "application/vnd.jupyter.widget-state+json": { 1313 | "06989b53974f41d28796d7e9f7ca1034": { 1314 | "model_module": "@jupyter-widgets/controls", 1315 | "model_module_version": "1.5.0", 1316 | "model_name": "DescriptionStyleModel", 1317 | "state": { 1318 | "_model_module": "@jupyter-widgets/controls", 1319 | "_model_module_version": "1.5.0", 1320 | "_model_name": "DescriptionStyleModel", 1321 | "_view_count": null, 1322 | "_view_module": "@jupyter-widgets/base", 1323 | "_view_module_version": "1.2.0", 1324 | "_view_name": "StyleView", 1325 | "description_width": "" 1326 | } 1327 | }, 1328 | "262929238fd543cc8190f755bd48aca1": { 1329 | "model_module": "@jupyter-widgets/controls", 1330 | "model_module_version": "1.5.0", 1331 | "model_name": "HTMLModel", 1332 | "state": { 1333 | "_dom_classes": [], 1334 | "_model_module": "@jupyter-widgets/controls", 1335 | "_model_module_version": "1.5.0", 1336 | "_model_name": "HTMLModel", 1337 | "_view_count": null, 1338 | "_view_module": "@jupyter-widgets/controls", 1339 | "_view_module_version": "1.5.0", 1340 | "_view_name": "HTMLView", 1341 | "description": "", 1342 | "description_tooltip": null, 1343 | "layout": "IPY_MODEL_aba59e0a981845b0a2afc17f1ea8fad0", 1344 | "placeholder": "​", 1345 | "style": "IPY_MODEL_b511866d7dcc4c69b114720594db39b9", 1346 | "value": "Chunk : 100%" 1347 | } 1348 | }, 1349 | "269f80894a8b42169242e5a9f711ea96": { 1350 | "model_module": "@jupyter-widgets/controls", 1351 | "model_module_version": "1.5.0", 1352 | "model_name": "HTMLModel", 1353 | "state": { 1354 | "_dom_classes": [], 1355 | "_model_module": "@jupyter-widgets/controls", 1356 | "_model_module_version": "1.5.0", 
1357 | "_model_name": "HTMLModel", 1358 | "_view_count": null, 1359 | "_view_module": "@jupyter-widgets/controls", 1360 | "_view_module_version": "1.5.0", 1361 | "_view_name": "HTMLView", 1362 | "description": "", 1363 | "description_tooltip": null, 1364 | "layout": "IPY_MODEL_bc79e00c44874895a62dbab150be6b91", 1365 | "placeholder": "​", 1366 | "style": "IPY_MODEL_a31dc64cca7146bd95cf8e98db98828c", 1367 | "value": " 13/13 [00:00<00:00, 29.86 chunks/s]" 1368 | } 1369 | }, 1370 | "2c05958e110648a1ac6d447b26a94ad4": { 1371 | "model_module": "@jupyter-widgets/controls", 1372 | "model_module_version": "1.5.0", 1373 | "model_name": "HBoxModel", 1374 | "state": { 1375 | "_dom_classes": [], 1376 | "_model_module": "@jupyter-widgets/controls", 1377 | "_model_module_version": "1.5.0", 1378 | "_model_name": "HBoxModel", 1379 | "_view_count": null, 1380 | "_view_module": "@jupyter-widgets/controls", 1381 | "_view_module_version": "1.5.0", 1382 | "_view_name": "HBoxView", 1383 | "box_style": "", 1384 | "children": [ 1385 | "IPY_MODEL_262929238fd543cc8190f755bd48aca1", 1386 | "IPY_MODEL_4f1d415e84ba4db99c05837edfd53bda", 1387 | "IPY_MODEL_269f80894a8b42169242e5a9f711ea96" 1388 | ], 1389 | "layout": "IPY_MODEL_7b023fdc14184a9e95d8db379b511673" 1390 | } 1391 | }, 1392 | "3fee1bf921dd48f69811c0aaa3213974": { 1393 | "model_module": "@jupyter-widgets/base", 1394 | "model_module_version": "1.2.0", 1395 | "model_name": "LayoutModel", 1396 | "state": { 1397 | "_model_module": "@jupyter-widgets/base", 1398 | "_model_module_version": "1.2.0", 1399 | "_model_name": "LayoutModel", 1400 | "_view_count": null, 1401 | "_view_module": "@jupyter-widgets/base", 1402 | "_view_module_version": "1.2.0", 1403 | "_view_name": "LayoutView", 1404 | "align_content": null, 1405 | "align_items": null, 1406 | "align_self": null, 1407 | "border": null, 1408 | "bottom": null, 1409 | "display": null, 1410 | "flex": null, 1411 | "flex_flow": null, 1412 | "grid_area": null, 1413 | "grid_auto_columns": null, 1414 | "grid_auto_flow": null, 1415 | "grid_auto_rows": null, 1416 | "grid_column": null, 1417 | "grid_gap": null, 1418 | "grid_row": null, 1419 | "grid_template_areas": null, 1420 | "grid_template_columns": null, 1421 | "grid_template_rows": null, 1422 | "height": null, 1423 | "justify_content": null, 1424 | "justify_items": null, 1425 | "left": null, 1426 | "margin": null, 1427 | "max_height": null, 1428 | "max_width": null, 1429 | "min_height": null, 1430 | "min_width": null, 1431 | "object_fit": null, 1432 | "object_position": null, 1433 | "order": null, 1434 | "overflow": null, 1435 | "overflow_x": null, 1436 | "overflow_y": null, 1437 | "padding": null, 1438 | "right": null, 1439 | "top": null, 1440 | "visibility": null, 1441 | "width": null 1442 | } 1443 | }, 1444 | "4a48e417d4be432c8733f046974af3cb": { 1445 | "model_module": "@jupyter-widgets/base", 1446 | "model_module_version": "1.2.0", 1447 | "model_name": "LayoutModel", 1448 | "state": { 1449 | "_model_module": "@jupyter-widgets/base", 1450 | "_model_module_version": "1.2.0", 1451 | "_model_name": "LayoutModel", 1452 | "_view_count": null, 1453 | "_view_module": "@jupyter-widgets/base", 1454 | "_view_module_version": "1.2.0", 1455 | "_view_name": "LayoutView", 1456 | "align_content": null, 1457 | "align_items": null, 1458 | "align_self": null, 1459 | "border": null, 1460 | "bottom": null, 1461 | "display": null, 1462 | "flex": null, 1463 | "flex_flow": null, 1464 | "grid_area": null, 1465 | "grid_auto_columns": null, 1466 | "grid_auto_flow": null, 1467 | "grid_auto_rows": 
null, 1468 | "grid_column": null, 1469 | "grid_gap": null, 1470 | "grid_row": null, 1471 | "grid_template_areas": null, 1472 | "grid_template_columns": null, 1473 | "grid_template_rows": null, 1474 | "height": null, 1475 | "justify_content": null, 1476 | "justify_items": null, 1477 | "left": null, 1478 | "margin": null, 1479 | "max_height": null, 1480 | "max_width": null, 1481 | "min_height": null, 1482 | "min_width": null, 1483 | "object_fit": null, 1484 | "object_position": null, 1485 | "order": null, 1486 | "overflow": null, 1487 | "overflow_x": null, 1488 | "overflow_y": null, 1489 | "padding": null, 1490 | "right": null, 1491 | "top": null, 1492 | "visibility": null, 1493 | "width": null 1494 | } 1495 | }, 1496 | "4f1d415e84ba4db99c05837edfd53bda": { 1497 | "model_module": "@jupyter-widgets/controls", 1498 | "model_module_version": "1.5.0", 1499 | "model_name": "FloatProgressModel", 1500 | "state": { 1501 | "_dom_classes": [], 1502 | "_model_module": "@jupyter-widgets/controls", 1503 | "_model_module_version": "1.5.0", 1504 | "_model_name": "FloatProgressModel", 1505 | "_view_count": null, 1506 | "_view_module": "@jupyter-widgets/controls", 1507 | "_view_module_version": "1.5.0", 1508 | "_view_name": "ProgressView", 1509 | "bar_style": "success", 1510 | "description": "", 1511 | "description_tooltip": null, 1512 | "layout": "IPY_MODEL_4a48e417d4be432c8733f046974af3cb", 1513 | "max": 13, 1514 | "min": 0, 1515 | "orientation": "horizontal", 1516 | "style": "IPY_MODEL_cb1bd500742d4938a96e966c8964ecb6", 1517 | "value": 13 1518 | } 1519 | }, 1520 | "5cb3e448899c47ba9fd2c7731aba954b": { 1521 | "model_module": "@jupyter-widgets/controls", 1522 | "model_module_version": "1.5.0", 1523 | "model_name": "ProgressStyleModel", 1524 | "state": { 1525 | "_model_module": "@jupyter-widgets/controls", 1526 | "_model_module_version": "1.5.0", 1527 | "_model_name": "ProgressStyleModel", 1528 | "_view_count": null, 1529 | "_view_module": "@jupyter-widgets/base", 1530 | "_view_module_version": "1.2.0", 1531 | "_view_name": "StyleView", 1532 | "bar_color": null, 1533 | "description_width": "" 1534 | } 1535 | }, 1536 | "63415a015b504c7f8baa3d5a60b7c560": { 1537 | "model_module": "@jupyter-widgets/controls", 1538 | "model_module_version": "1.5.0", 1539 | "model_name": "HBoxModel", 1540 | "state": { 1541 | "_dom_classes": [], 1542 | "_model_module": "@jupyter-widgets/controls", 1543 | "_model_module_version": "1.5.0", 1544 | "_model_name": "HBoxModel", 1545 | "_view_count": null, 1546 | "_view_module": "@jupyter-widgets/controls", 1547 | "_view_module_version": "1.5.0", 1548 | "_view_name": "HBoxView", 1549 | "box_style": "", 1550 | "children": [ 1551 | "IPY_MODEL_b1cfc6b061cd40acbf15c3a617c2decd", 1552 | "IPY_MODEL_98c0b2a457bf405482ec3b11c61ac454", 1553 | "IPY_MODEL_ffe2904f8a2747638e11410a5474f9e4" 1554 | ], 1555 | "layout": "IPY_MODEL_e8d2d9fa065f4a13b614dd8c4a72f824" 1556 | } 1557 | }, 1558 | "7b023fdc14184a9e95d8db379b511673": { 1559 | "model_module": "@jupyter-widgets/base", 1560 | "model_module_version": "1.2.0", 1561 | "model_name": "LayoutModel", 1562 | "state": { 1563 | "_model_module": "@jupyter-widgets/base", 1564 | "_model_module_version": "1.2.0", 1565 | "_model_name": "LayoutModel", 1566 | "_view_count": null, 1567 | "_view_module": "@jupyter-widgets/base", 1568 | "_view_module_version": "1.2.0", 1569 | "_view_name": "LayoutView", 1570 | "align_content": null, 1571 | "align_items": null, 1572 | "align_self": null, 1573 | "border": null, 1574 | "bottom": null, 1575 | "display": null, 1576 | 
"flex": null, 1577 | "flex_flow": null, 1578 | "grid_area": null, 1579 | "grid_auto_columns": null, 1580 | "grid_auto_flow": null, 1581 | "grid_auto_rows": null, 1582 | "grid_column": null, 1583 | "grid_gap": null, 1584 | "grid_row": null, 1585 | "grid_template_areas": null, 1586 | "grid_template_columns": null, 1587 | "grid_template_rows": null, 1588 | "height": null, 1589 | "justify_content": null, 1590 | "justify_items": null, 1591 | "left": null, 1592 | "margin": null, 1593 | "max_height": null, 1594 | "max_width": null, 1595 | "min_height": null, 1596 | "min_width": null, 1597 | "object_fit": null, 1598 | "object_position": null, 1599 | "order": null, 1600 | "overflow": null, 1601 | "overflow_x": null, 1602 | "overflow_y": null, 1603 | "padding": null, 1604 | "right": null, 1605 | "top": null, 1606 | "visibility": null, 1607 | "width": null 1608 | } 1609 | }, 1610 | "9073d7f2397f47d0a60bc828f69f93cf": { 1611 | "model_module": "@jupyter-widgets/controls", 1612 | "model_module_version": "1.5.0", 1613 | "model_name": "DescriptionStyleModel", 1614 | "state": { 1615 | "_model_module": "@jupyter-widgets/controls", 1616 | "_model_module_version": "1.5.0", 1617 | "_model_name": "DescriptionStyleModel", 1618 | "_view_count": null, 1619 | "_view_module": "@jupyter-widgets/base", 1620 | "_view_module_version": "1.2.0", 1621 | "_view_name": "StyleView", 1622 | "description_width": "" 1623 | } 1624 | }, 1625 | "9668a1ccd9d742349585cf3349cff5b6": { 1626 | "model_module": "@jupyter-widgets/base", 1627 | "model_module_version": "1.2.0", 1628 | "model_name": "LayoutModel", 1629 | "state": { 1630 | "_model_module": "@jupyter-widgets/base", 1631 | "_model_module_version": "1.2.0", 1632 | "_model_name": "LayoutModel", 1633 | "_view_count": null, 1634 | "_view_module": "@jupyter-widgets/base", 1635 | "_view_module_version": "1.2.0", 1636 | "_view_name": "LayoutView", 1637 | "align_content": null, 1638 | "align_items": null, 1639 | "align_self": null, 1640 | "border": null, 1641 | "bottom": null, 1642 | "display": null, 1643 | "flex": null, 1644 | "flex_flow": null, 1645 | "grid_area": null, 1646 | "grid_auto_columns": null, 1647 | "grid_auto_flow": null, 1648 | "grid_auto_rows": null, 1649 | "grid_column": null, 1650 | "grid_gap": null, 1651 | "grid_row": null, 1652 | "grid_template_areas": null, 1653 | "grid_template_columns": null, 1654 | "grid_template_rows": null, 1655 | "height": null, 1656 | "justify_content": null, 1657 | "justify_items": null, 1658 | "left": null, 1659 | "margin": null, 1660 | "max_height": null, 1661 | "max_width": null, 1662 | "min_height": null, 1663 | "min_width": null, 1664 | "object_fit": null, 1665 | "object_position": null, 1666 | "order": null, 1667 | "overflow": null, 1668 | "overflow_x": null, 1669 | "overflow_y": null, 1670 | "padding": null, 1671 | "right": null, 1672 | "top": null, 1673 | "visibility": null, 1674 | "width": null 1675 | } 1676 | }, 1677 | "98c0b2a457bf405482ec3b11c61ac454": { 1678 | "model_module": "@jupyter-widgets/controls", 1679 | "model_module_version": "1.5.0", 1680 | "model_name": "FloatProgressModel", 1681 | "state": { 1682 | "_dom_classes": [], 1683 | "_model_module": "@jupyter-widgets/controls", 1684 | "_model_module_version": "1.5.0", 1685 | "_model_name": "FloatProgressModel", 1686 | "_view_count": null, 1687 | "_view_module": "@jupyter-widgets/controls", 1688 | "_view_module_version": "1.5.0", 1689 | "_view_name": "ProgressView", 1690 | "bar_style": "success", 1691 | "description": "", 1692 | "description_tooltip": null, 1693 | "layout": 
"IPY_MODEL_aabcd286ca28404da9170688b6871027", 1694 | "max": 1, 1695 | "min": 0, 1696 | "orientation": "horizontal", 1697 | "style": "IPY_MODEL_5cb3e448899c47ba9fd2c7731aba954b", 1698 | "value": 0 1699 | } 1700 | }, 1701 | "a31dc64cca7146bd95cf8e98db98828c": { 1702 | "model_module": "@jupyter-widgets/controls", 1703 | "model_module_version": "1.5.0", 1704 | "model_name": "DescriptionStyleModel", 1705 | "state": { 1706 | "_model_module": "@jupyter-widgets/controls", 1707 | "_model_module_version": "1.5.0", 1708 | "_model_name": "DescriptionStyleModel", 1709 | "_view_count": null, 1710 | "_view_module": "@jupyter-widgets/base", 1711 | "_view_module_version": "1.2.0", 1712 | "_view_name": "StyleView", 1713 | "description_width": "" 1714 | } 1715 | }, 1716 | "aabcd286ca28404da9170688b6871027": { 1717 | "model_module": "@jupyter-widgets/base", 1718 | "model_module_version": "1.2.0", 1719 | "model_name": "LayoutModel", 1720 | "state": { 1721 | "_model_module": "@jupyter-widgets/base", 1722 | "_model_module_version": "1.2.0", 1723 | "_model_name": "LayoutModel", 1724 | "_view_count": null, 1725 | "_view_module": "@jupyter-widgets/base", 1726 | "_view_module_version": "1.2.0", 1727 | "_view_name": "LayoutView", 1728 | "align_content": null, 1729 | "align_items": null, 1730 | "align_self": null, 1731 | "border": null, 1732 | "bottom": null, 1733 | "display": null, 1734 | "flex": null, 1735 | "flex_flow": null, 1736 | "grid_area": null, 1737 | "grid_auto_columns": null, 1738 | "grid_auto_flow": null, 1739 | "grid_auto_rows": null, 1740 | "grid_column": null, 1741 | "grid_gap": null, 1742 | "grid_row": null, 1743 | "grid_template_areas": null, 1744 | "grid_template_columns": null, 1745 | "grid_template_rows": null, 1746 | "height": null, 1747 | "justify_content": null, 1748 | "justify_items": null, 1749 | "left": null, 1750 | "margin": null, 1751 | "max_height": null, 1752 | "max_width": null, 1753 | "min_height": null, 1754 | "min_width": null, 1755 | "object_fit": null, 1756 | "object_position": null, 1757 | "order": null, 1758 | "overflow": null, 1759 | "overflow_x": null, 1760 | "overflow_y": null, 1761 | "padding": null, 1762 | "right": null, 1763 | "top": null, 1764 | "visibility": null, 1765 | "width": "20px" 1766 | } 1767 | }, 1768 | "aba59e0a981845b0a2afc17f1ea8fad0": { 1769 | "model_module": "@jupyter-widgets/base", 1770 | "model_module_version": "1.2.0", 1771 | "model_name": "LayoutModel", 1772 | "state": { 1773 | "_model_module": "@jupyter-widgets/base", 1774 | "_model_module_version": "1.2.0", 1775 | "_model_name": "LayoutModel", 1776 | "_view_count": null, 1777 | "_view_module": "@jupyter-widgets/base", 1778 | "_view_module_version": "1.2.0", 1779 | "_view_name": "LayoutView", 1780 | "align_content": null, 1781 | "align_items": null, 1782 | "align_self": null, 1783 | "border": null, 1784 | "bottom": null, 1785 | "display": null, 1786 | "flex": null, 1787 | "flex_flow": null, 1788 | "grid_area": null, 1789 | "grid_auto_columns": null, 1790 | "grid_auto_flow": null, 1791 | "grid_auto_rows": null, 1792 | "grid_column": null, 1793 | "grid_gap": null, 1794 | "grid_row": null, 1795 | "grid_template_areas": null, 1796 | "grid_template_columns": null, 1797 | "grid_template_rows": null, 1798 | "height": null, 1799 | "justify_content": null, 1800 | "justify_items": null, 1801 | "left": null, 1802 | "margin": null, 1803 | "max_height": null, 1804 | "max_width": null, 1805 | "min_height": null, 1806 | "min_width": null, 1807 | "object_fit": null, 1808 | "object_position": null, 1809 | "order": null, 
1810 | "overflow": null, 1811 | "overflow_x": null, 1812 | "overflow_y": null, 1813 | "padding": null, 1814 | "right": null, 1815 | "top": null, 1816 | "visibility": null, 1817 | "width": null 1818 | } 1819 | }, 1820 | "b1cfc6b061cd40acbf15c3a617c2decd": { 1821 | "model_module": "@jupyter-widgets/controls", 1822 | "model_module_version": "1.5.0", 1823 | "model_name": "HTMLModel", 1824 | "state": { 1825 | "_dom_classes": [], 1826 | "_model_module": "@jupyter-widgets/controls", 1827 | "_model_module_version": "1.5.0", 1828 | "_model_name": "HTMLModel", 1829 | "_view_count": null, 1830 | "_view_module": "@jupyter-widgets/controls", 1831 | "_view_module_version": "1.5.0", 1832 | "_view_name": "HTMLView", 1833 | "description": "", 1834 | "description_tooltip": null, 1835 | "layout": "IPY_MODEL_3fee1bf921dd48f69811c0aaa3213974", 1836 | "placeholder": "​", 1837 | "style": "IPY_MODEL_06989b53974f41d28796d7e9f7ca1034", 1838 | "value": "" 1839 | } 1840 | }, 1841 | "b511866d7dcc4c69b114720594db39b9": { 1842 | "model_module": "@jupyter-widgets/controls", 1843 | "model_module_version": "1.5.0", 1844 | "model_name": "DescriptionStyleModel", 1845 | "state": { 1846 | "_model_module": "@jupyter-widgets/controls", 1847 | "_model_module_version": "1.5.0", 1848 | "_model_name": "DescriptionStyleModel", 1849 | "_view_count": null, 1850 | "_view_module": "@jupyter-widgets/base", 1851 | "_view_module_version": "1.2.0", 1852 | "_view_name": "StyleView", 1853 | "description_width": "" 1854 | } 1855 | }, 1856 | "bc79e00c44874895a62dbab150be6b91": { 1857 | "model_module": "@jupyter-widgets/base", 1858 | "model_module_version": "1.2.0", 1859 | "model_name": "LayoutModel", 1860 | "state": { 1861 | "_model_module": "@jupyter-widgets/base", 1862 | "_model_module_version": "1.2.0", 1863 | "_model_name": "LayoutModel", 1864 | "_view_count": null, 1865 | "_view_module": "@jupyter-widgets/base", 1866 | "_view_module_version": "1.2.0", 1867 | "_view_name": "LayoutView", 1868 | "align_content": null, 1869 | "align_items": null, 1870 | "align_self": null, 1871 | "border": null, 1872 | "bottom": null, 1873 | "display": null, 1874 | "flex": null, 1875 | "flex_flow": null, 1876 | "grid_area": null, 1877 | "grid_auto_columns": null, 1878 | "grid_auto_flow": null, 1879 | "grid_auto_rows": null, 1880 | "grid_column": null, 1881 | "grid_gap": null, 1882 | "grid_row": null, 1883 | "grid_template_areas": null, 1884 | "grid_template_columns": null, 1885 | "grid_template_rows": null, 1886 | "height": null, 1887 | "justify_content": null, 1888 | "justify_items": null, 1889 | "left": null, 1890 | "margin": null, 1891 | "max_height": null, 1892 | "max_width": null, 1893 | "min_height": null, 1894 | "min_width": null, 1895 | "object_fit": null, 1896 | "object_position": null, 1897 | "order": null, 1898 | "overflow": null, 1899 | "overflow_x": null, 1900 | "overflow_y": null, 1901 | "padding": null, 1902 | "right": null, 1903 | "top": null, 1904 | "visibility": null, 1905 | "width": null 1906 | } 1907 | }, 1908 | "cb1bd500742d4938a96e966c8964ecb6": { 1909 | "model_module": "@jupyter-widgets/controls", 1910 | "model_module_version": "1.5.0", 1911 | "model_name": "ProgressStyleModel", 1912 | "state": { 1913 | "_model_module": "@jupyter-widgets/controls", 1914 | "_model_module_version": "1.5.0", 1915 | "_model_name": "ProgressStyleModel", 1916 | "_view_count": null, 1917 | "_view_module": "@jupyter-widgets/base", 1918 | "_view_module_version": "1.2.0", 1919 | "_view_name": "StyleView", 1920 | "bar_color": null, 1921 | "description_width": "" 
1922 | } 1923 | }, 1924 | "e8d2d9fa065f4a13b614dd8c4a72f824": { 1925 | "model_module": "@jupyter-widgets/base", 1926 | "model_module_version": "1.2.0", 1927 | "model_name": "LayoutModel", 1928 | "state": { 1929 | "_model_module": "@jupyter-widgets/base", 1930 | "_model_module_version": "1.2.0", 1931 | "_model_name": "LayoutModel", 1932 | "_view_count": null, 1933 | "_view_module": "@jupyter-widgets/base", 1934 | "_view_module_version": "1.2.0", 1935 | "_view_name": "LayoutView", 1936 | "align_content": null, 1937 | "align_items": null, 1938 | "align_self": null, 1939 | "border": null, 1940 | "bottom": null, 1941 | "display": null, 1942 | "flex": null, 1943 | "flex_flow": null, 1944 | "grid_area": null, 1945 | "grid_auto_columns": null, 1946 | "grid_auto_flow": null, 1947 | "grid_auto_rows": null, 1948 | "grid_column": null, 1949 | "grid_gap": null, 1950 | "grid_row": null, 1951 | "grid_template_areas": null, 1952 | "grid_template_columns": null, 1953 | "grid_template_rows": null, 1954 | "height": null, 1955 | "justify_content": null, 1956 | "justify_items": null, 1957 | "left": null, 1958 | "margin": null, 1959 | "max_height": null, 1960 | "max_width": null, 1961 | "min_height": null, 1962 | "min_width": null, 1963 | "object_fit": null, 1964 | "object_position": null, 1965 | "order": null, 1966 | "overflow": null, 1967 | "overflow_x": null, 1968 | "overflow_y": null, 1969 | "padding": null, 1970 | "right": null, 1971 | "top": null, 1972 | "visibility": null, 1973 | "width": null 1974 | } 1975 | }, 1976 | "ffe2904f8a2747638e11410a5474f9e4": { 1977 | "model_module": "@jupyter-widgets/controls", 1978 | "model_module_version": "1.5.0", 1979 | "model_name": "HTMLModel", 1980 | "state": { 1981 | "_dom_classes": [], 1982 | "_model_module": "@jupyter-widgets/controls", 1983 | "_model_module_version": "1.5.0", 1984 | "_model_name": "HTMLModel", 1985 | "_view_count": null, 1986 | "_view_module": "@jupyter-widgets/controls", 1987 | "_view_module_version": "1.5.0", 1988 | "_view_name": "HTMLView", 1989 | "description": "", 1990 | "description_tooltip": null, 1991 | "layout": "IPY_MODEL_9668a1ccd9d742349585cf3349cff5b6", 1992 | "placeholder": "​", 1993 | "style": "IPY_MODEL_9073d7f2397f47d0a60bc828f69f93cf", 1994 | "value": " 0/0 [00:00<?, ?it/s]" 1995 | } 1996 | } 1997 | } 1998 | } 1999 | }, 2000 | "nbformat": 4, 2001 | "nbformat_minor": 0 2002 | } 2003 | --------------------------------------------------------------------------------
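
The notebook above relies on one central trick: KenLM is a word-level n-gram toolkit, so before training, every Whisper BPE token id is remapped to a single Unicode character (`chr(token_id + 100)`); each "word" KenLM sees is then exactly one BPE token, and `ord(char) - 100` recovers the id. Below is a minimal sketch of that round trip and of the two scoring paths used in the notebook (whole-hypothesis `score` and incremental `BaseScore`, the pattern a beam search with an LM would follow). It assumes the pinned `openai-whisper` commit and the `kenlm` Python bindings are installed; the helper names `tokens_to_chars` / `chars_to_text` are illustrative, and `dataset_tokenized_3gram.binary` is a placeholder for the model produced by the `lmplz` / `build_binary` cells.

```python
import kenlm
import whisper

OFFSET = 100  # same offset the notebook uses: chr(token_id + 100)

# Underlying BPE tokenizer, obtained the same way the notebook does.
tokenizer = whisper.tokenizer.get_tokenizer('vi').tokenizer


def tokens_to_chars(text, offset=OFFSET):
    """Encode text with the Whisper BPE tokenizer and map each token id to one
    Unicode character, so KenLM treats every BPE token as a single 'word'."""
    return [chr(token_id + offset) for token_id in tokenizer.encode(text)]


def chars_to_text(chars, offset=OFFSET):
    """Inverse mapping: ord(char) - offset recovers the original token ids."""
    ids = [ord(c) - offset for c in chars]
    return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))


hypothesis = tokens_to_chars('Nếu trồng người góc')
print(chars_to_text(hypothesis))  # should round-trip back to the original text

# Whole-hypothesis scoring (log10 probability), as in the notebook cells.
lm = kenlm.Model('dataset_tokenized_3gram.binary')  # placeholder path
print(lm.score(' '.join(hypothesis), bos=True, eos=False))

# Incremental scoring with explicit states: the pattern a beam search with LM
# extension would use, adding one token-character at a time.
state, out_state = kenlm.State(), kenlm.State()
lm.BeginSentenceWrite(state)  # seed the state with the <s> context
total = 0.0
for ch in hypothesis:
    total += lm.BaseScore(state, ch, out_state)  # log10 P(ch | current state)
    state, out_state = out_state, state          # advance the LM state
print(total)
```

With `bos=True` and `eos=False`, the single `score` call and the accumulated `BaseScore` total should agree up to floating-point precision, which is the identity the last three cells of the notebook verify.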