├── whisper_finetune
│   ├── readme.md
│   ├── README.md
│   ├── requirements.txt
│   ├── utils.py
│   ├── config.py
│   ├── inference.py
│   ├── test.py
│   ├── finetune.py
│   ├── model.py
│   └── dataset.py
├── images
│   ├── wav2vec.png
│   └── whisper.png
├── README.md
├── demo
│   ├── templates
│   │   └── home.html
│   ├── app.py
│   └── static
│       ├── css
│       │   └── style.css
│       └── js
│           └── recorder.js
├── .gitignore
└── notebooks
    ├── [WhisperLM]BeamSearchLM.ipynb
    └── [WhisperLM]_KenLMipynb.ipynb
/whisper_finetune/readme.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/whisper_finetune/README.md:
--------------------------------------------------------------------------------
1 | # vi_whisper_finetuning
2 |
--------------------------------------------------------------------------------
/images/wav2vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKAB/whisper-finetune-vietnamese/HEAD/images/wav2vec.png
--------------------------------------------------------------------------------
/images/whisper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKAB/whisper-finetune-vietnamese/HEAD/images/whisper.png
--------------------------------------------------------------------------------
/whisper_finetune/requirements.txt:
--------------------------------------------------------------------------------
1 | evaluate==0.3.0
2 | flake8==5.0.4
3 | isort==5.10.1
4 | jiwer==2.5.1
5 | pytorch-lightning==1.7.7
6 | torchaudio==0.12.1
7 | whisper @ git+https://github.com/openai/whisper.git@9e653bd0ea0f1e9493cb4939733e9de249493cfb
8 | gdown==4.4.0
9 |
--------------------------------------------------------------------------------
/whisper_finetune/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchaudio
3 | import torchaudio.transforms as at
4 |
5 |
6 | def load_wave(wave_path, sample_rate:int=16000) -> torch.Tensor:
7 | waveform, sr = torchaudio.load(wave_path, normalize=True)
8 | if sample_rate != sr:
9 | waveform = at.Resample(sr, sample_rate)(waveform)
10 | return waveform
--------------------------------------------------------------------------------
/whisper_finetune/config.py:
--------------------------------------------------------------------------------
1 | class Config:
2 | learning_rate = 0.0001
3 | weight_decay = 0.01
4 | adam_epsilon = 1e-8
5 | warmup_steps = 2
6 | batch_size = 16
7 | num_worker = 2
8 | num_train_epochs = 10
9 | gradient_accumulation_steps = 1
10 | sample_rate = 16000
11 | log_output_dir = "logs"
12 | check_output_dir = "artifacts"
13 | train_name = "whisper"
14 | train_id = "fluers"
15 | model_name = "base"
16 | lang = "vi"
17 |     checkpoint_path = "" # use the original pretrained model if this path is invalid
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Whisper Vietnamese finetuning
3 |
4 | \[In case you are looking for a Vietnamese ASR model, have a look at [HKAB/whisper-finetune-1-notebook](https://github.com/HKAB/whisper-finetune-1-notebook) \]
5 |
6 | This repository contains:
7 | - Notebooks and scripts for finetuning, inference, and N-gram language model generation.
8 | - A demo of Whisper and Wav2vec.
9 |
10 |
11 |
12 |
13 | ## Installation
14 |
15 | To use beam search with a language model, install Whisper from my GitHub fork:
16 | ```bash
17 | pip install git+https://github.com/HKAB/whisper.git
18 | ```
19 |
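Once the fork is installed, LM-based beam search is driven through the extra `DecodingOptions` fields (`withlm`, `lm_path`, `lm_alpha`, `lm_beta`) used in `notebooks/[WhisperLM]BeamSearchLM.ipynb`. A minimal sketch, assuming you already have an audio file and a KenLM ARPA/binary file (the paths below are placeholders, and the `withlm`/`lm_*` options exist only in the fork):

```python
import whisper

# Load the (optionally finetuned) model and prepare a 30 s log-Mel spectrogram.
model = whisper.load_model("base")
audio = whisper.pad_or_trim(whisper.load_audio("test01.wav"))
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# withlm / lm_path / lm_alpha / lm_beta are options added by the fork.
options = whisper.DecodingOptions(
    fp16=False, language="vi", without_timestamps=True,
    withlm=True, beam_size=1, patience=1.0,
    lm_path="dataset_tokenized_3gram.arpa", lm_alpha=3.0, lm_beta=0.0,
)
print(whisper.decode(model, mel, options).text)
```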
20 | ## Run
21 |
22 | For training & inference
23 |
24 | ```shell
25 | python finetune.py --model_name base \
26 | --dataset vin100h
27 |
28 | python test.py --checkpoint_path path/to/ckpt \
29 | --dataset_name vin100h \
30 | --model_name base
31 |
32 | ```
33 |
34 | To generate a language model with KenLM, use the notebooks in the notebooks folder.
35 |
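If you prefer to build the n-gram outside the notebook, here is a rough sketch of the usual KenLM workflow (assuming KenLM's `lmplz` and `build_binary` tools are compiled and on your PATH; the corpus file name is a placeholder):

```python
import subprocess

# Train a word-level 3-gram LM from a tokenized text corpus (one sentence per line).
with open("dataset_tokenized.txt") as corpus, open("dataset_tokenized_3gram.arpa", "w") as arpa:
    subprocess.run(["lmplz", "-o", "3"], stdin=corpus, stdout=arpa, check=True)

# Optionally convert the ARPA file to KenLM's binary format for faster loading.
subprocess.run(["build_binary", "dataset_tokenized_3gram.arpa", "dataset_tokenized_3gram.binary"], check=True)
```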
36 | We share the [checkpoint](https://drive.google.com/file/d/1vSaQjvjljToYlekm_GvlOkJYGLQA5EdJ/view?usp=sharing) (*base, batch_size 1, gradient accumulation steps 10, epoch 14, lr 0.0001*).
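To run inference with this checkpoint outside the provided scripts, the weights can be loaded the same way `demo/app.py` does (the Lightning checkpoint stores the Whisper weights under a `model.` prefix); the checkpoint file name below is a placeholder:

```python
import torch
import whisper

model = whisper.load_model("base")

# Strip the "model." prefix added by the LightningModule before loading.
state_dict = torch.load("path/to/checkpoint.ckpt", map_location="cpu")["state_dict"]
model.load_state_dict({k.replace("model.", ""): v for k, v in state_dict.items()})
```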
37 | ## Demo
38 |
39 |
40 | 
41 |
42 | 
43 | ## Contributions
44 |
45 | - We finetune Whisper on a 100-hour Vietnamese speech dataset.
46 | - We implement BeamSearchWithLM using KenLM and observe positive results.
47 |
48 | ## Result
49 |
50 | | Method                    | Fleurs WER | Vin100h (Full) WER |
51 | |---------------------------|---------|--------------- |
52 | | Whisper (base) | 50.38% | 50.33% |
53 | | Finetune Whisper (base) | 28.68% | 33% |
54 | | Whisper (large) one shot | - | 26.87% |
55 |
--------------------------------------------------------------------------------
/whisper_finetune/inference.py:
--------------------------------------------------------------------------------
1 | try:
2 | import tensorflow # required in Colab to avoid protobuf compatibility issues
3 | except ImportError:
4 | pass
5 |
6 | import whisper
7 | import torch
8 | import argparse
9 |
10 | from config import Config
11 | from model import WhisperModelModule
12 | from utils import load_wave
13 |
14 |
15 |
16 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17 |
18 | if __name__=="__main__":
19 | parser = argparse.ArgumentParser()
20 | parser.add_argument('--checkpoint_path', type=str, default='', help='path of checkpoint, if not set, use origin pretrained model')
21 | parser.add_argument('--audio_path', type=str, default='test01.wav', help='the audio file for inference')
22 |
23 | args = parser.parse_args()
24 | config = Config()
25 | config.checkpoint_path = args.checkpoint_path
26 |
27 | module = WhisperModelModule(config)
28 | try:
29 | state_dict = torch.load(config.checkpoint_path)
30 | state_dict = state_dict["state_dict"]
31 | module.load_state_dict(state_dict)
32 | print(f"load checkpoint successfully from {config.checkpoint_path}")
33 | except Exception as e:
34 | print(e)
35 | print(f"load checkpoint failt using origin weigth of {config.model_name} model")
36 | model = module.model
37 | model.to(device)
38 |
39 | audio = whisper.load_audio(args.audio_path)
40 | audio = whisper.pad_or_trim(audio)
41 |
42 | # make log-Mel spectrogram and move to the same device as the model
43 | mel = whisper.log_mel_spectrogram(audio).to(model.device)
44 |
45 | # decode the audio
46 | options = whisper.DecodingOptions(
47 | language="vi", without_timestamps=True, fp16=torch.cuda.is_available()
48 | )
49 |
50 | result = model.decode(mel, options)
51 | print('Predicted:', result.text)
52 |
53 |
--------------------------------------------------------------------------------
/whisper_finetune/test.py:
--------------------------------------------------------------------------------
1 | try:
2 | import tensorflow # required in Colab to avoid protobuf compatibility issues
3 | except ImportError:
4 | pass
5 |
6 |
7 | import jiwer
8 | import whisper
9 | import torch
10 | import argparse
11 | from tqdm import tqdm
12 | import pandas as pd
13 |
14 | from config import Config
15 | from dataset import load_dataset, WhisperDataCollatorWithPadding
16 | from model import WhisperModelModule
17 |
18 |
19 |
20 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21 |
22 | if __name__=="__main__":
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument('--checkpoint_path', type=str, default='', help='path of checkpoint, if not set, use origin pretrained model')
25 | parser.add_argument('--dataset_name', type=str, default='fluers', help='the dataset for finetuning, includes fluers, vin100h, vlsp2019')
26 | parser.add_argument('--model_name', type=str, default='tiny', help='model name')
27 |
28 | args = parser.parse_args()
29 | config = Config()
30 | config.checkpoint_path = args.checkpoint_path
31 | config.model_name = args.model_name
32 |
33 | module = WhisperModelModule(config)
34 | try:
35 | state_dict = torch.load(config.checkpoint_path)
36 | state_dict = state_dict["state_dict"]
37 | module.load_state_dict(state_dict)
38 | print(f"load checkpoint successfully from {config.checkpoint_path}")
39 | except Exception as e:
40 | print(e)
41 | print(f"load checkpoint failt using origin weigth of {config.model_name} model")
42 | model = module.model
43 | model.to(device)
44 |
45 | _, valid_dataset = load_dataset(args.dataset_name, test=True)
46 | test_loader = torch.utils.data.DataLoader(
47 | valid_dataset,
48 | batch_size=config.batch_size,
49 | num_workers=config.num_worker,
50 | collate_fn=WhisperDataCollatorWithPadding(),
51 | )
52 |
53 | # decode the audio
54 | options = whisper.DecodingOptions(
55 | language="vi", without_timestamps=True, fp16=torch.cuda.is_available()
56 | )
57 |
58 | hypotheses = []
59 | references = []
60 | print(model.device)
61 | for sample in tqdm(test_loader):
62 | mels = sample["input_ids"].to(model.device)
63 | texts = sample["texts"]
64 | results = model.decode(mels, options)
65 | hypotheses.extend([result.text for result in results])
66 | references.extend(texts)
67 |
68 | data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
69 |
70 | data["hypothesis_clean"] = [
71 | text.lower() for text in data["hypothesis"]
72 | ]
73 | data["reference_clean"] = [
74 | text.lower() for text in data["reference"]
75 | ]
76 |
77 | data.to_csv('results.csv')
78 | for i in range(60):
79 | print('Reference:', data["reference_clean"][i])
80 | print('Predict:', data["hypothesis_clean"][i])
81 | print('\n')
82 | wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))
83 |
84 | print(f"WER: {wer * 100:.2f} %")
--------------------------------------------------------------------------------
/whisper_finetune/finetune.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | from pathlib import Path
4 | import torch
5 | from config import Config
6 |
7 | try:
8 | import tensorflow # required in Colab to avoid protobuf compatibility issues
9 | except ImportError:
10 | pass
11 |
12 | from dataset import load_dataset, WhisperDataCollatorWithPadding
13 |
14 | from pytorch_lightning import Trainer
15 | from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
16 | from pytorch_lightning.loggers import TensorBoardLogger
17 | from model import WhisperModelModule
18 |
19 | DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
20 |
21 | if __name__=="__main__":
22 | parser = argparse.ArgumentParser()
23 |
24 | parser.add_argument('--checkpoint_path', type=str, default='', help='path of checkpoint, if not set, use origin pretrained model')
25 | parser.add_argument('--model_name', type=str, default='base', help='model name, tiny, small, medium, base, large')
26 | parser.add_argument('--dataset', type=str, default='fluers', help='the dataset for finetuning, includes fluers, vin100h, vlsp2019')
27 | parser.add_argument('--lang', type=str, default='vi', help='language, vi, en')
28 | parser.add_argument('--lr', type=float, default=0.0005, help='learning rate')
29 | parser.add_argument('--epoch', type=int, default=10, help='number of epoch for finetuning')
30 | parser.add_argument('--batch_size', type=int, default=16, help='batch size')
31 |
32 | args = parser.parse_args()
33 |
34 | # Load default config from config.py and set new config from args
35 | config = Config()
36 | config.model_name = args.model_name
37 | config.lang = args.lang
38 | config.learning_rate = args.lr
39 | config.num_train_epochs = args.epoch
40 | config.batch_size = args.batch_size
41 | config.checkpoint_path = args.checkpoint_path
42 |
43 | print(f"""Finetuning Whisper with new config:
44 | checkpoint_path: %s,
45 | dataset: %s,
46 | model_name: %s,
47 | lang: %s,
48 | learning_rate: %.5f,
49 | num_finetune_epochs: %d,
50 | batch_size: %d""" % ("No checkpoint" if config.checkpoint_path == "" else config.checkpoint_path, args.dataset, config.model_name, config.lang, config.learning_rate, config.num_train_epochs, config.batch_size))
51 |
52 | # Load dataset for finetuning
53 | if config.lang == "vi":
54 | train_dataset, valid_dataset = load_dataset(args.dataset)
55 | else:
56 | raise ValueError("Only Vietnamese datasets are supported, please choose 'vi' as the language!")
57 |
58 | train_loader = torch.utils.data.DataLoader(
59 | train_dataset,
60 | batch_size=config.batch_size,
61 | num_workers=config.num_worker,
62 | collate_fn=WhisperDataCollatorWithPadding(),
63 | )
64 | valid_loader = torch.utils.data.DataLoader(
65 | valid_dataset,
66 | batch_size=config.batch_size,
67 | num_workers=config.num_worker,
68 | collate_fn=WhisperDataCollatorWithPadding(),
69 | )
70 |
71 |
72 | Path(os.path.join(os.getcwd(), config.log_output_dir)).mkdir(exist_ok=True)
73 | Path(os.path.join(os.getcwd(), config.check_output_dir)).mkdir(exist_ok=True)
74 |
75 | # Log and checkpoint
76 | tflogger = TensorBoardLogger(
77 | save_dir=config.log_output_dir, name=config.train_name, version=config.train_id
78 | )
79 |
80 | checkpoint_callback = ModelCheckpoint(
81 | dirpath=f"{config.check_output_dir}/checkpoint",
82 | filename="checkpoint-{epoch:04d}",
83 | save_top_k=1,  # -1: save all checkpoints, 1: save only the best
84 | )
85 |
86 | # callback list
87 | callback_list = [checkpoint_callback, LearningRateMonitor(logging_interval="epoch")]
88 | model = WhisperModelModule(config, train_loader, valid_loader)
89 |
90 | # Trainer
91 | trainer = Trainer(
92 | precision=16,
93 | accelerator=DEVICE,
94 | max_epochs=config.num_train_epochs,
95 | accumulate_grad_batches=config.gradient_accumulation_steps,
96 | logger=tflogger,
97 | callbacks=callback_list,
98 | )
99 |
100 | trainer.fit(model)
--------------------------------------------------------------------------------
/demo/templates/home.html:
--------------------------------------------------------------------------------
(markup stripped in this dump; only the recoverable text content is kept)
Hello, world!
🎶 Speech2Text 📑
Audio is playing . . .
Audio is handling by server . . .
To record audio, use browsers like Chrome and Firefox that support audio recording.
--------------------------------------------------------------------------------
/demo/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template, redirect, url_for, request, jsonify
2 |
3 | # import cv2 as cv2
4 | # import numpy as np
5 | # import urllib.request
6 | # from PIL import Image
7 | # import io
8 | # from scipy.io import wavfile
9 |
10 | # from pygame import mixer
11 | from werkzeug.utils import secure_filename
12 | import os
13 | import whisper
14 | import torch
15 | from datetime import datetime
16 |
17 | #wav2vec
18 | from transformers.file_utils import cached_path, hf_bucket_url
19 | import zipfile
20 | from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
21 | import soundfile as sf
22 | import kenlm
23 | from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
24 | import librosa
25 |
26 | # from odoo.http import request
27 | app = Flask(__name__, template_folder='templates')
28 |
29 | # model = whisper.load_model("tiny")
30 | model = whisper.load_model("base")
31 | state_dict = torch.load("checkpoint-epoch=0013.ckpt", map_location="cpu")['state_dict']
32 | # change all key of state_dict to remove "model."
33 | new_state_dict = {k.replace("model.", ""): v for k, v in state_dict.items()}
34 | model.load_state_dict(new_state_dict)
35 | print(f"Model whisper base loaded")
36 |
37 |
38 | cache_dir = './cache/'
39 | processor_w2v = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
40 | model_w2v = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
41 | # lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip')
42 | # lm_file = cached_path(lm_file,cache_dir=cache_dir)
43 | # with zipfile.ZipFile(lm_file, 'r') as zip_ref:
44 | # zip_ref.extractall(cache_dir)
45 | lm_file = cache_dir + 'vi_lm_4grams.bin'
46 |
47 | def get_decoder_ngram_model(tokenizer, ngram_lm_path):
48 | vocab_dict = tokenizer.get_vocab()
49 | sort_vocab = sorted((value, key) for (key, value) in vocab_dict.items())
50 | vocab = [x[1] for x in sort_vocab][:-2]
51 | vocab_list = vocab
52 | # convert ctc blank character representation
53 | vocab_list[tokenizer.pad_token_id] = ""
54 | # replace special characters
55 | vocab_list[tokenizer.unk_token_id] = ""
56 | # vocab_list[tokenizer.bos_token_id] = ""
57 | # vocab_list[tokenizer.eos_token_id] = ""
58 | # convert space character representation
59 | vocab_list[tokenizer.word_delimiter_token_id] = " "
60 | # specify the ctc blank char index, since conventionally it is the last entry of the logit matrix
61 | alphabet = Alphabet.build_alphabet(vocab_list, ctc_token_idx=tokenizer.pad_token_id)
62 | lm_model = kenlm.Model(ngram_lm_path)
63 | decoder = BeamSearchDecoderCTC(alphabet,
64 | language_model=LanguageModel(lm_model))
65 | return decoder
66 |
67 | ngram_lm_model = get_decoder_ngram_model(processor_w2v.tokenizer, lm_file)
68 | print("Huggingface model loaded")
69 |
70 | @app.route('/')
71 | def home():
72 | return render_template('home.html')
73 |
74 |
75 | @app.route('/transcribe-whisper', methods=['POST'])
76 | def transcribe():
77 | files = request.files
78 | file = files.get('file')
79 |
80 | file_name = secure_filename(str(datetime.now())) + ".wav"
81 | file_path = os.path.join("audio_folder", file_name)
82 | file.save(file_path)
83 |
84 | audio = whisper.load_audio(file_path)
85 | audio = whisper.pad_or_trim(audio)
86 | print("Audio loaded and trimmed")
87 |
88 | mel = whisper.log_mel_spectrogram(audio).to(model.device)
89 | # options = whisper.DecodingOptions(fp16 = False, withlm=F, beam_size=1,
90 | # patience=1.0, lm_path="../../dataset_tokenized_3gram.binary", lm_alpha=0.75, lm_beta=0.0,
91 | # without_timestamps=True, language="vi")
92 | options = whisper.DecodingOptions(fp16 = False, language="vi", without_timestamps=True)
93 | print("Model decoding...")
94 | result = whisper.decode(model, mel, options)
95 |
96 | return jsonify(result.text)
97 |
98 | @app.route('/transcribe-w2v', methods=['POST'])
99 | def transcribe_w2v():
100 | files = request.files
101 | file = files.get('file')
102 |
103 | file_name = secure_filename(str(datetime.now())) + ".wav"
104 | file_path = os.path.join("audio_folder", file_name)
105 | file.save(file_path)
106 |
107 | speech, sr = librosa.load(file_path)
108 | speech = librosa.resample(speech, orig_sr=sr, target_sr=16000)
109 | input_values = processor_w2v(
110 | speech,
111 | sampling_rate=16000,
112 | return_tensors="pt").input_values
113 | logits = model_w2v(input_values).logits[0]
114 | pred_ids = torch.argmax(logits, dim=-1)
115 |
116 | print("Model decoding...")
117 | beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
118 | return jsonify(beam_search_output)
119 |
120 | if __name__ == '__main__':
121 | app.run(debug=True)
122 |
123 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks,audio,windows
2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,jupyternotebooks,audio,windows
3 |
4 | ### Audio ###
5 | *.aif
6 | *.aiff
7 | *.iff
8 | *.m3u
9 | *.m4a
10 | *.mid
11 | *.mp3
12 | *.mpa
13 | *.ra
14 | *.wav
15 | *.wma
16 | *.ogg
17 | *.flac
18 |
19 | ### JupyterNotebooks ###
20 | # gitignore template for Jupyter Notebooks
21 | # website: http://jupyter.org/
22 |
23 | .ipynb_checkpoints
24 | */.ipynb_checkpoints/*
25 |
26 | # IPython
27 | profile_default/
28 | ipython_config.py
29 |
30 | # Remove previous ipynb_checkpoints
31 | # git rm -r .ipynb_checkpoints/
32 |
33 | ### Python ###
34 | # Byte-compiled / optimized / DLL files
35 | __pycache__/
36 | *.py[cod]
37 | *$py.class
38 |
39 | # C extensions
40 | *.so
41 |
42 | # Distribution / packaging
43 | .Python
44 | build/
45 | develop-eggs/
46 | dist/
47 | downloads/
48 | eggs/
49 | .eggs/
50 | lib/
51 | lib64/
52 | parts/
53 | sdist/
54 | var/
55 | wheels/
56 | share/python-wheels/
57 | *.egg-info/
58 | .installed.cfg
59 | *.egg
60 | MANIFEST
61 |
62 | # PyInstaller
63 | # Usually these files are written by a python script from a template
64 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
65 | *.manifest
66 | *.spec
67 |
68 | # Installer logs
69 | pip-log.txt
70 | pip-delete-this-directory.txt
71 |
72 | # Unit test / coverage reports
73 | htmlcov/
74 | .tox/
75 | .nox/
76 | .coverage
77 | .coverage.*
78 | .cache
79 | nosetests.xml
80 | coverage.xml
81 | *.cover
82 | *.py,cover
83 | .hypothesis/
84 | .pytest_cache/
85 | cover/
86 |
87 | # Translations
88 | *.mo
89 | *.pot
90 |
91 | # Django stuff:
92 | *.log
93 | local_settings.py
94 | db.sqlite3
95 | db.sqlite3-journal
96 |
97 | # Flask stuff:
98 | instance/
99 | .webassets-cache
100 |
101 | # Scrapy stuff:
102 | .scrapy
103 |
104 | # Sphinx documentation
105 | docs/_build/
106 |
107 | # PyBuilder
108 | .pybuilder/
109 | target/
110 |
111 | # Jupyter Notebook
112 |
113 | # IPython
114 |
115 | # pyenv
116 | # For a library or package, you might want to ignore these files since the code is
117 | # intended to run in multiple environments; otherwise, check them in:
118 | # .python-version
119 |
120 | # pipenv
121 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
122 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
123 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
124 | # install all needed dependencies.
125 | #Pipfile.lock
126 |
127 | # poetry
128 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
129 | # This is especially recommended for binary packages to ensure reproducibility, and is more
130 | # commonly ignored for libraries.
131 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
132 | #poetry.lock
133 |
134 | # pdm
135 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
136 | #pdm.lock
137 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
138 | # in version control.
139 | # https://pdm.fming.dev/#use-with-ide
140 | .pdm.toml
141 |
142 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
143 | __pypackages__/
144 |
145 | # Celery stuff
146 | celerybeat-schedule
147 | celerybeat.pid
148 |
149 | # SageMath parsed files
150 | *.sage.py
151 |
152 | # Environments
153 | .env
154 | .venv
155 | env/
156 | venv/
157 | ENV/
158 | env.bak/
159 | venv.bak/
160 |
161 | # Spyder project settings
162 | .spyderproject
163 | .spyproject
164 |
165 | # Rope project settings
166 | .ropeproject
167 |
168 | # mkdocs documentation
169 | /site
170 |
171 | # mypy
172 | .mypy_cache/
173 | .dmypy.json
174 | dmypy.json
175 |
176 | # Pyre type checker
177 | .pyre/
178 |
179 | # pytype static type analyzer
180 | .pytype/
181 |
182 | # Cython debug symbols
183 | cython_debug/
184 |
185 | # PyCharm
186 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
187 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
188 | # and can be added to the global gitignore or merged into this file. For a more nuclear
189 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
190 | #.idea/
191 |
192 | ### Windows ###
193 | # Windows thumbnail cache files
194 | Thumbs.db
195 | Thumbs.db:encryptable
196 | ehthumbs.db
197 | ehthumbs_vista.db
198 |
199 | # Dump file
200 | *.stackdump
201 |
202 | # Folder config file
203 | [Dd]esktop.ini
204 |
205 | # Recycle Bin used on file shares
206 | $RECYCLE.BIN/
207 |
208 | # Windows Installer files
209 | *.cab
210 | *.msi
211 | *.msix
212 | *.msm
213 | *.msp
214 |
215 | # Windows shortcuts
216 | *.lnk
217 |
218 | *.exe
219 | ffmpeg-2022-10-02-git-5f02a261a2-essentials_build/
220 |
221 | # End of https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks,audio,windows
222 |
223 |
--------------------------------------------------------------------------------
/whisper_finetune/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import whisper
4 |
5 | from pytorch_lightning import LightningModule
6 |
7 | from config import Config
8 | import evaluate
9 |
10 | from transformers import AdamW, get_linear_schedule_with_warmup
11 |
12 |
13 | class WhisperModelModule(LightningModule):
14 | def __init__(
15 | self,
16 | cfg: Config,
17 | train_dataloader=None,
18 | eval_dataloader=None,
19 | ) -> None:
20 | super().__init__()
21 | self.options = whisper.DecodingOptions(
22 | language=cfg.lang, without_timestamps=True
23 | )
24 | self.model = whisper.load_model(cfg.model_name)
25 | self.tokenizer = whisper.tokenizer.get_tokenizer(
26 | True, language=cfg.lang, task=self.options.task
27 | )
28 |
29 | # train only the decoder: freeze the encoder parameters
30 | for p in self.model.encoder.parameters():
31 | p.requires_grad = False
32 |
33 | self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
34 | self.metrics_wer = evaluate.load("wer")
35 | self.metrics_cer = evaluate.load("cer")
36 |
37 | self.cfg = cfg
38 | self.trainloader = train_dataloader
39 | self.evaloader = eval_dataloader
40 |
41 | def forward(self, x):
42 | return self.model(x)
43 |
44 | def training_step(self, batch, batch_id):
45 | input_ids = batch["input_ids"]
46 | labels = batch["labels"].long()
47 | dec_input_ids = batch["dec_input_ids"].long()
48 |
49 | with torch.no_grad():
50 | audio_features = self.model.encoder(input_ids)
51 |
52 | out = self.model.decoder(dec_input_ids, audio_features)
53 | loss = self.loss_fn(out.view(-1, out.size(-1)), labels.view(-1))
54 | self.log("train/loss", loss, on_step=True, prog_bar=True, logger=True)
55 | return loss
56 |
57 | def validation_step(self, batch, batch_id):
58 | input_ids = batch["input_ids"]
59 | labels = batch["labels"].long()
60 | dec_input_ids = batch["dec_input_ids"].long()
61 |
62 | audio_features = self.model.encoder(input_ids)
63 | out = self.model.decoder(dec_input_ids, audio_features)
64 |
65 | loss = self.loss_fn(out.view(-1, out.size(-1)), labels.view(-1))
66 |
67 | out[out == -100] = self.tokenizer.eot
68 | labels[labels == -100] = self.tokenizer.eot
69 |
70 | o_list, l_list = [], []
71 | for o, l in zip(out, labels):
72 | o = torch.argmax(o, dim=1)
73 | o_list.append(self.tokenizer.decode(o, skip_special_tokens=True))
74 | l_list.append(self.tokenizer.decode(l, skip_special_tokens=True))
75 | cer = self.metrics_cer.compute(references=l_list, predictions=o_list)
76 | wer = self.metrics_wer.compute(references=l_list, predictions=o_list)
77 |
78 | self.log("val/loss", loss, on_step=True, prog_bar=True, logger=True)
79 | self.log("val/cer", cer, on_step=True, prog_bar=True, logger=True)
80 | self.log("val/wer", wer, on_step=True, prog_bar=True, logger=True)
81 |
82 | return {"cer": cer, "wer": wer, "loss": loss}
83 |
84 | def configure_optimizers(self):
85 | """configure optimizer and scheduler"""
86 | model = self.model
87 | no_decay = ["bias", "LayerNorm.weight"]
88 | optimizer_grouped_parameters = [
89 | {
90 | "params": [
91 | p
92 | for n, p in model.named_parameters()
93 | if not any(nd in n for nd in no_decay)
94 | ],
95 | "weight_decay": self.cfg.weight_decay,
96 | },
97 | {
98 | "params": [
99 | p
100 | for n, p in model.named_parameters()
101 | if any(nd in n for nd in no_decay)
102 | ],
103 | "weight_decay": 0.0,
104 | },
105 | ]
106 | optimizer = AdamW(
107 | optimizer_grouped_parameters,
108 | lr=self.cfg.learning_rate,
109 | eps=self.cfg.adam_epsilon,
110 | )
111 | self.optimizer = optimizer
112 |
113 | scheduler = get_linear_schedule_with_warmup(
114 | optimizer,
115 | num_warmup_steps=self.cfg.warmup_steps,
116 | num_training_steps=self.t_total,
117 | )
118 | self.scheduler = scheduler
119 |
120 | return [optimizer], [
121 | {"scheduler": scheduler, "interval": "step", "frequency": 1}
122 | ]
123 |
124 | def setup(self, stage=None):
125 | if stage == "fit" or stage is None:
126 | self.t_total = (
127 | (len(self.trainloader.dataset) // (self.cfg.batch_size))
128 | // self.cfg.gradient_accumulation_steps
129 | * float(self.cfg.num_train_epochs)
130 | )
131 |
132 | def train_dataloader(self):
133 | return self.trainloader
134 |
135 | def val_dataloader(self):
136 | return self.evaloader
--------------------------------------------------------------------------------
/notebooks/[WhisperLM]BeamSearchLM.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import whisper\n",
10 | "import torch"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "model = whisper.load_model(\"base\")"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 3,
25 | "metadata": {},
26 | "outputs": [
27 | {
28 | "data": {
29 | "text/plain": [
30 | ""
31 | ]
32 | },
33 | "execution_count": 3,
34 | "metadata": {},
35 | "output_type": "execute_result"
36 | }
37 | ],
38 | "source": [
39 | "state_dict = torch.load(\"/mnt/c/Users/truongnp3/Desktop/Course/NLP/Project/whisper-finetune-vietnamese/demo/checkpoint-epoch=0014.ckpt\", map_location=\"cpu\")['state_dict']\n",
40 | "# change all key of state_dict to remove \"model.\"\n",
41 | "new_state_dict = {k.replace(\"model.\", \"\"): v for k, v in state_dict.items()}\n",
42 | "model.load_state_dict(new_state_dict)"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 4,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "# audio = whisper.load_audio(\"/mnt/c/Users/truongnp3/Desktop/Course/NLP/Project/whisper-finetune-vietnamese/notebooks/test_audio.aiff\")\n",
52 | "# audio = whisper.load_audio(\"/mnt/c/Users/truongnp3/Downloads/spkyut-20190730-utt000005432.wav\")\n",
53 | "# audio = whisper.load_audio(\"/mnt/c/Users/truongnp3/Downloads/spkyut-20190730-utt000005394.wav\")\n",
54 | "audio = whisper.load_audio(\"/mnt/c/Users/truongnp3/Desktop/Course/NLP/Project/whisper-finetune-vietnamese/demo/audio_folder/2022-10-23_234521.158395.wav\")\n",
55 | "audio = whisper.pad_or_trim(audio)\n",
56 | "mel = whisper.log_mel_spectrogram(audio).to(model.device)"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 7,
62 | "metadata": {},
63 | "outputs": [
64 | {
65 | "name": "stdout",
66 | "output_type": "stream",
67 | "text": [
68 | "Detected language: vi\n"
69 | ]
70 | }
71 | ],
72 | "source": [
73 | "_, probs = model.detect_language(mel)\n",
74 | "print(f\"Detected language: {max(probs, key=probs.get)}\")"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 5,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "# options = whisper.DecodingOptions(fp16 = False, without_timestamps=True, language=\"vi\")\n",
84 | "options = whisper.DecodingOptions(fp16 = False, withlm=False, beam_size=1, without_timestamps=True)\n",
85 | "result = whisper.decode(model, mel, options)\n",
86 | "\n",
87 | "# print the recognized text\n",
88 | "# print(result.text)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 6,
94 | "metadata": {},
95 | "outputs": [
96 | {
97 | "name": "stdout",
98 | "output_type": "stream",
99 | "text": [
100 | "chuyển tiền đang qua số điện thoại hàng ngàn ưu đãi khi nạp tiền điện thoại thanh toán dịch vụ cùng ba chăm cộng cộng tiền ích khác\n"
101 | ]
102 | }
103 | ],
104 | "source": [
105 | "print(result.text) #beam search"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 7,
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "name": "stderr",
115 | "output_type": "stream",
116 | "text": [
117 | "Loading the LM will be faster if you build a binary file.\n",
118 | "Reading /home/hkab/dataset_tokenized_3gram.arpa\n",
119 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
120 | "****************************************************************************************************\n"
121 | ]
122 | }
123 | ],
124 | "source": [
125 | "# options = whisper.DecodingOptions(fp16 = False)\n",
126 | "options = whisper.DecodingOptions(fp16 = False, withlm=True, beam_size=1, \n",
127 | " patience=1.0, lm_path=\"dataset_tokenized_3gram.arpa\", lm_alpha=3.0, lm_beta=0.0,\n",
128 | " without_timestamps=True)\n",
129 | "result = whisper.decode(model, mel, options)\n",
130 | "\n",
131 | "# print the recognized text\n",
132 | "# print(result.text)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 8,
138 | "metadata": {},
139 | "outputs": [
140 | {
141 | "name": "stdout",
142 | "output_type": "stream",
143 | "text": [
144 | "chuyển tiền đen qua số điện thoại hàng ngàn ưu đãi khi nạp tìm điện thoại thanh toán dịch vụ cũng ba trăm cộng cộng tiền đích khác\n"
145 | ]
146 | }
147 | ],
148 | "source": [
149 | "print(result.text)"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "- Nhiều chồng ngờ góc quên hay góc xanh nhà, nhìn chọn chỗ đất hiện, đất cắt hoặc đất mùn, đất phủ xa tây sốt, đồ ẩm thấp, dễ phát nữ.\n",
157 | "\n",
158 | "- Nếu trồng ngờ góc quên hay góc xanh nhà, hiện trọn trộ đất hiện, đất cắt hoặc đất muồn, đất phủ xa tây sốt, đồ ẩm thấp, dễ phát nước."
159 | ]
160 | }
161 | ],
162 | "metadata": {
163 | "kernelspec": {
164 | "display_name": "Python 3 (ipykernel)",
165 | "language": "python",
166 | "name": "python3"
167 | },
168 | "orig_nbformat": 4,
169 | "vscode": {
170 | "interpreter": {
171 | "hash": "569cc53f83e70b41c2d959ffedb296ac14adb9e332ab59ae04a2c7a2935b0e00"
172 | }
173 | }
174 | },
175 | "nbformat": 4,
176 | "nbformat_minor": 2
177 | }
178 |
--------------------------------------------------------------------------------
/demo/static/css/style.css:
--------------------------------------------------------------------------------
1 | /* style.css*/
2 |
3 | /* Media Queries */
4 |
5 | /* Small Devices*/
6 |
7 | @media (min-width: 0px) {
8 | * {
9 | box-sizing: border-box;
10 | }
11 | body {
12 | margin: 0;
13 | padding: 0;
14 | /*background-color: lightcyan;*/
15 | color: #414142;
16 | position: relative;
17 | font-family: monospace;
18 | }
19 | .title {
20 | font-size: 30px;
21 | margin-bottom: 55px;
22 | text-align: center;
23 | }
24 | .audio-recording-container {
25 | width: 100%;
26 | height: 100vh;
27 | /* view port height*/
28 | /*targeting Chrome & Safari*/
29 | display: -webkit-flex;
30 | /*targeting IE10*/
31 | display: -ms-flex;
32 | display: flex;
33 | flex-direction: column;
34 | justify-content: center;
35 | /*horizontal centering*/
36 | align-items: center;
37 | }
38 | .start-recording-button {
39 | font-size: 70px;
40 | color: #435f7a;
41 | cursor: pointer;
42 | opacity: .5;
43 | margin-bottom: 30px;
44 | }
45 | .start-recording-button:hover {
46 | opacity: 1;
47 | }
48 | .recording-contorl-buttons-container {
49 | /*targeting Chrome & Safari*/
50 | display: -webkit-flex;
51 | /*targeting IE10*/
52 | display: -ms-flex;
53 | display: flex;
54 | justify-content: space-evenly;
55 | /*horizontal centering*/
56 | align-items: center;
57 | width: 334px;
58 | margin-bottom: 30px;
59 | }
60 | .cancel-recording-button,
61 | .stop-recording-button {
62 | font-size: 70px;
63 | cursor: pointer;
64 | }
65 | .cancel-recording-button {
66 | color: red;
67 | opacity: 0.7;
68 | }
69 | .cancel-recording-button:hover {
70 | color: rgb(206, 4, 4);
71 | }
72 | .stop-recording-button {
73 | color: #33cc33;
74 | opacity: 0.7;
75 | }
76 | .stop-recording-button:hover {
77 | color: #27a527;
78 | }
79 | .recording-elapsed-time {
80 | /*targeting Chrome & Safari*/
81 | display: -webkit-flex;
82 | /*targeting IE10*/
83 | display: -ms-flex;
84 | display: flex;
85 | justify-content: center;
86 | /*horizontal centering*/
87 | align-items: center;
88 | }
89 | .red-recording-dot {
90 | font-size: 25px;
91 | color: red;
92 | margin-right: 12px;
93 | /*transitions with Firefox, IE and Opera Support browser support*/
94 | animation-name: flashing-recording-dot;
95 | -webkit-animation-name: flashing-recording-dot;
96 | -moz-animation-name: flashing-recording-dot;
97 | -o-animation-name: flashing-recording-dot;
98 | animation-duration: 2s;
99 | -webkit-animation-duration: 2s;
100 | -moz-animation-duration: 2s;
101 | -o-animation-duration: 2s;
102 | animation-iteration-count: infinite;
103 | -webkit-animation-iteration-count: infinite;
104 | -moz-animation-iteration-count: infinite;
105 | -o-animation-iteration-count: infinite;
106 | }
107 | /* The animation code */
108 | @keyframes flashing-recording-dot {
109 | 0% {
110 | opacity: 1;
111 | }
112 | 50% {
113 | opacity: 0;
114 | }
115 | 100% {
116 | opacity: 1;
117 | }
118 | }
119 | @-webkit-keyframes flashing-recording-dot {
120 | 0% {
121 | opacity: 1;
122 | }
123 | 50% {
124 | opacity: 0;
125 | }
126 | 100% {
127 | opacity: 1;
128 | }
129 | }
130 | @-moz-keyframes flashing-recording-dot {
131 | 0% {
132 | opacity: 1;
133 | }
134 | 50% {
135 | opacity: 0;
136 | }
137 | 100% {
138 | opacity: 1;
139 | }
140 | }
141 | @-o-keyframes flashing-recording-dot {
142 | 0% {
143 | opacity: 1;
144 | }
145 | 50% {
146 | opacity: 0;
147 | }
148 | 100% {
149 | opacity: 1;
150 | }
151 | }
152 | .elapsed-time {
153 | font-size: 32px;
154 | }
155 | .recording-contorl-buttons-container.hide {
156 | display: none;
157 | }
158 | .overlay {
159 | position: absolute;
160 | top: 0;
161 | height: 100vh;
162 | width: 100%;
163 | background-color: rgba(82, 76, 76, 0.35);
164 | /*targeting Chrome & Safari*/
165 | display: -webkit-flex;
166 | /*targeting IE10*/
167 | display: -ms-flex;
168 | display: flex;
169 | justify-content: center;
170 | /*horizontal centering*/
171 | align-items: center;
172 | }
173 | .overlay.hide {
174 | display: none;
175 | }
176 | .browser-not-supporting-audio-recording-box {
177 | /*targeting Chrome & Safari*/
178 | display: -webkit-flex;
179 | /*targeting IE10*/
180 | display: -ms-flex;
181 | display: flex;
182 | flex-direction: column;
183 | justify-content: space-between;
184 | /*horizontal centering*/
185 | align-items: center;
186 | width: 317px;
187 | height: 119px;
188 | background-color: white;
189 | border-radius: 10px;
190 | padding: 15px;
191 | font-size: 16px;
192 | }
193 | .close-browser-not-supported-box {
194 | cursor: pointer;
195 | background-color: #abc1c05c;
196 | border-radius: 10px;
197 | font-size: 16px;
198 | border: none;
199 | }
200 | .close-browser-not-supported-box:hover {
201 | background-color: #92a5a45c;
202 | }
203 | .close-browser-not-supported-box:focus {
204 | outline: none;
205 | border: none;
206 | }
207 | .audio-element.hide {
208 | display: none;
209 | }
210 | .text-indication-of-audio-playing-container {
211 | height: 20px;
212 | }
213 | .text-indication-of-audio-playing {
214 | font-size: 20px;
215 | }
216 | .text-indication-of-audio-playing.hide {
217 | display: none;
218 | }
219 | /* 3 Dots animation*/
220 | .text-indication-of-audio-playing span {
221 | /*transitions with Firefox, IE and Opera Support browser support*/
222 | animation-name: blinking-dot;
223 | -webkit-animation-name: blinking-dot;
224 | -moz-animation-name: blinking-dot;
225 | -o-animation-name: blinking-dot;
226 | animation-duration: 2s;
227 | -webkit-animation-duration: 2s;
228 | -moz-animation-duration: 2s;
229 | -o-animation-duration: 2s;
230 | animation-iteration-count: infinite;
231 | -webkit-animation-iteration-count: infinite;
232 | -moz-animation-iteration-count: infinite;
233 | -o-animation-iteration-count: infinite;
234 | }
235 | .text-indication-of-audio-playing span:nth-child(2) {
236 | animation-delay: .4s;
237 | -webkit-animation-delay: .4s;
238 | -moz-animation-delay: .4s;
239 | -o-animation-delay: .4s;
240 | }
241 | .text-indication-of-audio-playing span:nth-child(3) {
242 | animation-delay: .8s;
243 | -webkit-animation-delay: .8s;
244 | -moz-animation-delay: .8s;
245 | -o-animation-delay: .8s;
246 | }
247 |
248 | .text-indication-of-audio-handling {
249 | font-size: 20px;
250 | }
251 |
252 | /* 3 Dots animation*/
253 | .text-indication-of-audio-handling span {
254 | /*transitions with Firefox, IE and Opera Support browser support*/
255 | animation-name: blinking-dot;
256 | -webkit-animation-name: blinking-dot;
257 | -moz-animation-name: blinking-dot;
258 | -o-animation-name: blinking-dot;
259 | animation-duration: 2s;
260 | -webkit-animation-duration: 2s;
261 | -moz-animation-duration: 2s;
262 | -o-animation-duration: 2s;
263 | animation-iteration-count: infinite;
264 | -webkit-animation-iteration-count: infinite;
265 | -moz-animation-iteration-count: infinite;
266 | -o-animation-iteration-count: infinite;
267 | }
268 | .text-indication-of-audio-handling span:nth-child(2) {
269 | animation-delay: .4s;
270 | -webkit-animation-delay: .4s;
271 | -moz-animation-delay: .4s;
272 | -o-animation-delay: .4s;
273 | }
274 | .text-indication-of-audio-handling span:nth-child(3) {
275 | animation-delay: .8s;
276 | -webkit-animation-delay: .8s;
277 | -moz-animation-delay: .8s;
278 | -o-animation-delay: .8s;
279 | }
280 | /* The animation code */
281 | @keyframes blinking-dot {
282 | 0% {
283 | opacity: 0;
284 | }
285 | 50% {
286 | opacity: 1;
287 | }
288 | 100% {
289 | opacity: 0;
290 | }
291 | }
292 | /* The animation code */
293 | @-webkit-keyframes blinking-dot {
294 | 0% {
295 | opacity: 0;
296 | }
297 | 50% {
298 | opacity: 1;
299 | }
300 | 100% {
301 | opacity: 0;
302 | }
303 | }
304 | /* The animation code */
305 | @-moz-keyframes blinking-dot {
306 | 0% {
307 | opacity: 0;
308 | }
309 | 50% {
310 | opacity: 1;
311 | }
312 | 100% {
313 | opacity: 0;
314 | }
315 | }
316 | /* The animation code */
317 | @-o-keyframes blinking-dot {
318 | 0% {
319 | opacity: 0;
320 | }
321 | 50% {
322 | opacity: 1;
323 | }
324 | 100% {
325 | opacity: 0;
326 | }
327 | }
328 | }
329 |
330 | /* Medium devices */
331 |
332 | @media (min-width: 768px) {}
333 |
334 | /* Large devices */
335 |
336 | @media (min-width: 992px) {}
337 |
338 | /*Ipad pro view*/
339 |
340 | /*
341 | @media (min-width: 1024px) {
342 |
343 | } */
344 |
345 | /* Extra Large devices */
346 |
347 | @media (min-width: 1200px) {}
348 |
349 |
--------------------------------------------------------------------------------
/whisper_finetune/dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 |
4 | import torch
5 | import torchaudio
6 |
7 | import pandas as pd
8 | import whisper
9 | import torchaudio.transforms as at
10 | from utils import load_wave
11 | from pathlib import Path
12 |
13 | class WhisperDataCollatorWithPadding:
14 | """
15 | Collate a list of samples with different-length tensors into a single padded batch (labels padded with -100, decoder inputs padded with the EOT token).
16 | Input: list of per-sample feature dictionaries
17 | Output: dictionary of batched tensors (plus the raw texts)
18 | """
19 | def __call__(self, features):
20 | input_ids, labels, dec_input_ids, texts = [], [], [], []
21 | for f in features:
22 | input_ids.append(f["input_ids"])
23 | labels.append(f["labels"])
24 | dec_input_ids.append(f["dec_input_ids"])
25 | texts.append(f["text"])
26 |
27 | input_ids = torch.concat([input_id[None, :] for input_id in input_ids]) # [batch_size, seq_len]
28 |
29 | label_lengths = [len(lab) for lab in labels]  # length of each label sequence
30 | dec_input_ids_length = [len(e) for e in dec_input_ids]
31 | max_label_len = max(label_lengths + dec_input_ids_length)
32 |
33 | labels = [
34 | np.pad(lab, (0, max_label_len - lab_len), "constant", constant_values=-100)
35 | for lab, lab_len in zip(labels, label_lengths)
36 | ]
37 | dec_input_ids = [
38 | np.pad(e, (0, max_label_len - e_len), "constant", constant_values=50257)
39 | for e, e_len in zip(dec_input_ids, dec_input_ids_length)
40 | ] # 50257 is eot token id
41 |
42 | batch = {"labels": labels, "dec_input_ids": dec_input_ids}
43 |
44 | batch = {
45 | k: torch.tensor(np.array(v), requires_grad=False) for k, v in batch.items()
46 | }
47 | batch["input_ids"] = input_ids
48 | batch["texts"] = texts
49 | return batch
50 |
51 | class WhisperDataset(torch.utils.data.Dataset):
52 | def __init__(self, dataset, sample_rate=16000) -> None:
53 | super().__init__()
54 |
55 | self.dataset = dataset
56 | self.sample_rate = sample_rate
57 |
58 | self.options = whisper.DecodingOptions(language="vi", without_timestamps=True)
59 | self.tokenizer = whisper.tokenizer.get_tokenizer(
60 | True, language="vi", task=self.options.task
61 | )
62 |
63 | def load_wave(self, wave_path, sample_rate: int = 16000) -> torch.Tensor:
64 | waveform, sr = torchaudio.load(wave_path, normalize=True)
65 | if sample_rate != sr:
66 | waveform = at.Resample(sr, sample_rate)(waveform)
67 | return waveform
68 |
69 |
70 | def __len__(self):
71 | return len(self.dataset)
72 |
73 | def __getitem__(self, id):
74 | audio_id, audio_path, text = self.dataset[id]
75 |
76 | audio = self.load_wave(audio_path, sample_rate=self.sample_rate)
77 | audio = whisper.pad_or_trim(audio.flatten())
78 | mel = whisper.log_mel_spectrogram(audio)
79 |
80 | text_token = [
81 | *self.tokenizer.sot_sequence_including_notimestamps
82 | ] + self.tokenizer.encode(text)
83 | labels = text_token[1:] + [self.tokenizer.eot]
84 | if len(text_token) >= 448:
85 | audio_id, audio_path, text = self.dataset[0]
86 |
87 | audio = self.load_wave(audio_path, sample_rate=self.sample_rate)
88 | audio = whisper.pad_or_trim(audio.flatten())
89 | mel = whisper.log_mel_spectrogram(audio)
90 |
91 | text_token = [
92 | *self.tokenizer.sot_sequence_including_notimestamps
93 | ] + self.tokenizer.encode(text)
94 | labels = text_token[1:] + [self.tokenizer.eot]
95 | return {
96 | "input_ids": mel,
97 | "labels": labels,
98 | "dec_input_ids": text_token,
99 | "text": text,
100 | }
101 |
102 |
103 | def load_dataset(dataset_name, test=False):
104 | train_dataset = None
105 | test_dataset = None
106 |
107 | if dataset_name == 'fluers':
108 | print('Loading Vietnamese Fluers dataset...')
109 | if not os.path.exists('vi_vn.tar.gz'):
110 | os.system("wget https://storage.googleapis.com/xtreme_translations/FLEURS102/vi_vn.tar.gz")
111 | os.makedirs('fluers', exist_ok=True)
112 | os.system("tar -xf 'vi_vn.tar.gz' -C fluers")
113 | if not test:
114 | train_list_files = get_list_files_fluers('train')
115 | val_list_files = get_list_files_fluers('dev')
116 | train_list_files +=val_list_files
117 | print('Num train samples:', len(train_list_files))
118 | train_dataset = WhisperDataset(train_list_files)
119 |
120 | test_list_files = get_list_files_fluers('test')
121 | print('Num test samples:', len(test_list_files))
122 | test_dataset = WhisperDataset(test_list_files)
123 |
124 | elif dataset_name == 'vlsp2019':
125 | # Download VLSP2019 dataset
126 | print('Loading VLSP2019 dataset...')
127 |
128 | if not test:
129 | train_list_files = get_list_files_vlsp2019('train')
130 | print('Num train samples:', len(train_list_files))
131 | train_dataset = WhisperDataset(train_list_files)
132 |
133 | test_list_files = get_list_files_vlsp2019('test')
134 | print('Num test samples:', len(test_list_files))
135 | test_dataset = WhisperDataset(test_list_files)
136 |
137 | elif dataset_name == 'vin100h':
138 | # Download VIN100h dataset
139 | if not os.path.exists('downloaded_check.txt'):
140 | print('Loading VIN100h dataset...')
141 | os.system("gdown 1vUSxdORDxk-ePUt-bUVDahpoXiqKchMx")
142 | os.system("tar -xf 'VinBigdata-VLSP2020-100h (1).rar'")
143 | os.system("gdown 1Zmj9BqNysiON6Lzjqos9kY08DRanJxXv")
144 | os.system("unzip 'vin100h_listfiles.zip'")
145 | os.system("remove 'VinBigdata-VLSP2020-100h (1).rar'")
146 | with open('downloaded_check.txt', 'w') as f:
147 | f.write('True')
148 | else:
149 | print('Dataset files already downloaded!')
150 | if not test:
151 | train_list_files = get_list_files_vin100h('train')
152 | print('Num train samples:', len(train_list_files))
153 | train_dataset = WhisperDataset(train_list_files)
154 |
155 | test_list_files = get_list_files_vin100h('test')
156 | print('Num test samples:', len(test_list_files))
157 | test_dataset = WhisperDataset(test_list_files)
158 |
159 | else:
160 | print(dataset_name, 'is not supported, please try again!')
161 |
162 | return train_dataset, test_dataset
163 |
164 | #------------------------------------FLUERS------------------------------------#
165 |
166 | def get_list_files_fluers(phase, audio_path = 'fluers/vi_vn/audio', text_max_length=1000, audio_max_sample_length=960000, sample_rate=16000):
167 | audio_path = os.path.join(audio_path, phase)
168 | audio_transcript_pair_list = []
169 | if phase=='train':
170 | tsv_file = 'fluers/vi_vn/train.tsv'
171 | elif phase=='dev':
172 | tsv_file = 'fluers/vi_vn/dev.tsv'
173 | else:
174 | tsv_file = 'fluers/vi_vn/test.tsv'
175 | df = pd.read_table(tsv_file, names=("id", "file_name", "raw_transcription", "transcription", "_", "num_samples", "gender"))
176 | for index, row in df.iterrows():
177 | new_path = Path(os.path.join(audio_path, row['file_name']))
178 | audio_id = row['id']
179 | text = row['transcription']
180 | if new_path.exists():
181 | audio = load_wave(new_path, sample_rate=sample_rate)[0]
182 | if len(text) > text_max_length or len(audio) > audio_max_sample_length:
183 | print('skip file:', new_path,'with len text:', len(text), 'and len audio', len(audio))
184 | continue
185 | audio_transcript_pair_list.append((audio_id, str(new_path), text))
186 | return audio_transcript_pair_list
187 |
188 |
189 |
190 | #------------------------------------VLSP2019 ASR Dataset------------------------------------#
191 | def get_list_files_vlsp2019(phase, dataset_path = 'vlsp2019/data', text_max_length=1000, audio_max_sample_length=960000, sample_rate=16000):
192 | audio_transcript_pair_list = []
193 | if phase=='train':
194 | csv_file = 'vlsp2019/vlsp2019_train.csv'
195 | else:
196 | csv_file = 'vlsp2019/vlsp2019_test.csv'
197 | df = pd.read_csv(csv_file)
198 | for index, row in df.iterrows():
199 | new_path = Path(os.path.join(dataset_path, row['filename']+'.wav'))
200 | audio_id = index
201 | with open(Path(os.path.join(dataset_path, row['filename']+'.txt')), 'r') as f:
202 | text = f.readlines()[0]
203 | if new_path.exists():
204 | audio = load_wave(new_path, sample_rate=sample_rate)[0]
205 | if len(text) > text_max_length or len(audio) > audio_max_sample_length:
206 | print('skip file:', new_path,'with len text:', len(text), 'and len audio', len(audio))
207 | continue
208 | audio_transcript_pair_list.append((audio_id, str(new_path), text))
209 | return audio_transcript_pair_list
210 |
211 | #------------------------------------VIN100h ASR Dataset------------------------------------#
212 | def get_list_files_vin100h(phase, dataset_path = 'vlsp2020_train_set_02', text_max_length=1000, audio_max_sample_length=960000, sample_rate=16000):
213 | audio_transcript_pair_list = []
214 | if phase=='train':
215 | csv_file = 'train_vin100h.csv'
216 | else:
217 | csv_file = 'test_vin100h.csv'
218 | df = pd.read_csv(csv_file)
219 | for index, row in df.iterrows():
220 | new_path = Path(os.path.join(dataset_path, row['filename']+'.wav'))
221 | audio_id = index
222 | with open(Path(os.path.join(dataset_path, row['filename']+'.txt')), 'r') as f:
223 | text = f.readlines()[0]
224 | if new_path.exists():
225 | audio = load_wave(new_path, sample_rate=sample_rate)[0]
226 | if len(text) > text_max_length or len(audio) > audio_max_sample_length:
227 | print('skip file:', new_path,'with len text:', len(text), 'and len audio', len(audio))
228 | continue
229 | audio_transcript_pair_list.append((audio_id, str(new_path), text))
230 | return audio_transcript_pair_list
231 |
232 | if __name__=='__main__':
233 | # load_fluers()
234 | print('Load dataset...')
235 |
--------------------------------------------------------------------------------
/demo/static/js/recorder.js:
--------------------------------------------------------------------------------
1 | // index.js ---------------
2 | //Model
3 | //none
4 |
5 | // Source: https://ralzohairi.medium.com/audio-recording-in-javascript-96eed45b75ee
6 |
7 | //View
8 | var microphoneButton = document.getElementsByClassName("start-recording-button")[0];
9 | var recordingControlButtonsContainer = document.getElementsByClassName("recording-contorl-buttons-container")[0];
10 | var stopRecordingButton = document.getElementsByClassName("stop-recording-button")[0];
11 | var cancelRecordingButton = document.getElementsByClassName("cancel-recording-button")[0];
12 | var elapsedTimeTag = document.getElementsByClassName("elapsed-time")[0];
13 | var closeBrowserNotSupportedBoxButton = document.getElementsByClassName("close-browser-not-supported-box")[0];
14 | var overlay = document.getElementsByClassName("overlay")[0];
15 | var audioElement = document.getElementsByClassName("audio-element")[0];
16 | var audioElementSource = document.getElementsByClassName("audio-element")[0]
17 | var textIndicatorOfAudiPlaying = document.getElementsByClassName("text-indication-of-audio-playing")[0];
18 |
19 |
20 | var replayAudioBtn = document.getElementsByClassName("replay-btn")[0];
21 | var textIndicatorOfAudioHandling = document.getElementsByClassName("text-indication-of-audio-handling")[0];
22 |
23 | //Listeners
24 |
25 | //Listen to start recording button
26 | microphoneButton.onclick = startAudioRecording;
27 |
28 | //Listen to stop recording button
29 | stopRecordingButton.onclick = stopAudioRecording;
30 |
31 | //Listen to cancel recording button
32 | cancelRecordingButton.onclick = cancelAudioRecording;
33 |
34 | //Listen to when the ok button is clicked in the browser not supporting audio recording box
35 | closeBrowserNotSupportedBoxButton.onclick = hideBrowserNotSupportedOverlay;
36 |
37 | //Listen to when the audio being played ends
38 | audioElement.onended = hideTextIndicatorOfAudioPlaying;
39 |
40 | // replay audio
41 | replayAudioBtn.onclick = replayAudio;
42 |
43 | function replayAudio() {
44 | console.log("Playing audio...");
45 | audioElement.play();
46 |
47 | displayTextIndicatorOfAudioPlaying();
48 | }
49 |
50 | /** Displays recording control buttons */
51 | function handleDisplayingRecordingControlButtons() {
52 | //Hide the microphone button that starts audio recording
53 | microphoneButton.style.display = "none";
54 |
55 | //Display the recording control buttons
56 | recordingControlButtonsContainer.classList.remove("hide");
57 |
58 | //Handle the displaying of the elapsed recording time
59 | handleElapsedRecordingTime();
60 | }
61 |
62 | /** Hide the displayed recording control buttons */
63 | function handleHidingRecordingControlButtons() {
64 | //Display the microphone button that starts audio recording
65 | microphoneButton.style.display = "block";
66 |
67 | //Hide the recording control buttons
68 | recordingControlButtonsContainer.classList.add("hide");
69 |
70 | //stop interval that handles both time elapsed and the red dot
71 | clearInterval(elapsedTimeTimer);
72 | }
73 |
74 | /** Displays browser not supported info box for the user*/
75 | function displayBrowserNotSupportedOverlay() {
76 | overlay.classList.remove("hide");
77 | }
78 |
79 | /** Displays browser not supported info box for the user*/
80 | function hideBrowserNotSupportedOverlay() {
81 | overlay.classList.add("hide");
82 | }
83 |
84 | /** Creates a source element for the audio element in the HTML document*/
85 | function createSourceForAudioElement() {
86 | let sourceElement = document.createElement("source");
87 | audioElement.appendChild(sourceElement);
88 |
89 | audioElementSource = sourceElement;
90 | }
91 |
92 | /** Display the text indicator of the audio being played in the background */
93 | function displayTextIndicatorOfAudioPlaying() {
94 | textIndicatorOfAudiPlaying.classList.remove("hide");
95 | }
96 |
97 | /** Hide the text indicator of the audio being played in the background */
98 | function hideTextIndicatorOfAudioPlaying() {
99 | textIndicatorOfAudiPlaying.classList.add("hide");
100 | }
101 |
102 | /** Display the text indicator of the audio being handled in the background */
103 | function displayTextIndicatorOfAudioHandling() {
104 | textIndicatorOfAudioHandling.classList.remove("d-none");
105 | }
106 |
107 | /** Hides the text indicator of the audio being handled in the background */
108 | function hideTextIndicatorOfAudioHandling() {
109 | textIndicatorOfAudioHandling.classList.add("d-none");
110 | }
111 |
112 | //Controller
113 |
114 | /** Stores the actual start time of the audio recording so the elapsed time can be computed accurately*/
115 | var audioRecordStartTime;
116 |
117 | /** Stores the maximum recording time in hours; recording stops once this limit is reached */
118 | var maximumRecordingTimeInHours = 1;
119 |
120 | /** Stores the reference of the setInterval function that controls the timer in audio recording*/
121 | var elapsedTimeTimer;
122 |
123 | /** Starts the audio recording*/
124 | function startAudioRecording() {
125 |
126 | console.log("Recording Audio...");
127 |
128 | //If a previous audio recording is playing, pause it
129 | let recorderAudioIsPlaying = !audioElement.paused; // the paused property tells whether the media element is paused or not
130 | console.log("paused?", !recorderAudioIsPlaying);
131 | if (recorderAudioIsPlaying) {
132 | audioElement.pause();
133 | //also hide the audio playing indicator displayed on the screen
134 | hideTextIndicatorOfAudioPlaying();
135 | }
136 |
137 | //start recording using the audio recording API
138 | audioRecorder.start()
139 | .then(() => { //on success
140 |
141 | //store the recording start time to display the elapsed time according to it
142 | audioRecordStartTime = new Date();
143 |
144 | //display control buttons to offer the functionality of stop and cancel
145 | handleDisplayingRecordingControlButtons();
146 | })
147 | .catch(error => { //on error
148 | //No Browser Support Error
149 | if (error.message.includes("mediaDevices API or getUserMedia method is not supported in this browser.")) {
150 | console.log("To record audio, use browsers like Chrome and Firefox.");
151 | displayBrowserNotSupportedOverlay();
152 | }
153 |
154 | //Error handling structure
155 | switch (error.name) {
156 | case 'AbortError': //error from navigator.mediaDevices.getUserMedia
157 | console.log("An AbortError has occurred.");
158 | break;
159 | case 'NotAllowedError': //error from navigator.mediaDevices.getUserMedia
160 | console.log("A NotAllowedError has occurred. User might have denied permission.");
161 | break;
162 | case 'NotFoundError': //error from navigator.mediaDevices.getUserMedia
163 | console.log("A NotFoundError has occurred.");
164 | break;
165 | case 'NotReadableError': //error from navigator.mediaDevices.getUserMedia
166 | console.log("A NotReadableError has occurred.");
167 | break;
168 | case 'SecurityError': //error from navigator.mediaDevices.getUserMedia or from the MediaRecorder.start
169 | console.log("A SecurityError has occurred.");
170 | break;
171 | case 'TypeError': //error from navigator.mediaDevices.getUserMedia
172 | console.log("A TypeError has occurred.");
173 | break;
174 | case 'InvalidStateError': //error from the MediaRecorder.start
175 | console.log("An InvalidStateError has occurred.");
176 | break;
177 | case 'UnknownError': //error from the MediaRecorder.start
178 | console.log("An UnknownError has occurred.");
179 | break;
180 | default:
181 | console.log("An error occurred with the error name " + error.name);
182 | };
183 | });
184 | }
185 | /** Stops the currently started audio recording and sends it for transcription
186 | */
187 | function stopAudioRecording() {
188 |
189 | console.log("Stopping Audio Recording...");
190 |
191 | //stop the recording using the audio recording API
192 | audioRecorder.stop()
193 | .then(audioAsblob => {
194 |
195 | // Send audio
196 | sendAudio(audioAsblob);
197 | displayTextIndicatorOfAudioHandling();
198 | //Load the recorded audio so it can be replayed
199 | saveAudio(audioAsblob);
200 |
201 | //hide recording control button & return record icon
202 | handleHidingRecordingControlButtons();
203 | })
204 | .catch(error => {
205 | //Error handling structure
206 | switch (error.name) {
207 | case 'InvalidStateError': //error from the MediaRecorder.stop
208 | console.log("An InvalidStateError has occurred.");
209 | break;
210 | default:
211 | console.log("An error occurred with the error name " + error.name);
212 | };
213 | });
214 | }
215 |
216 | /** Cancel the currently started audio recording */
217 | function cancelAudioRecording() {
218 | console.log("Canceling audio...");
219 |
220 | //cancel the recording using the audio recording API
221 | audioRecorder.cancel();
222 |
223 | //hide recording control button & return record icon
224 | handleHidingRecordingControlButtons();
225 | }
226 |
227 | function sendAudio(recorderAudioAsBlob) {
228 | const audioUrl = URL.createObjectURL(recorderAudioAsBlob);
229 | const audio = new Audio(audioUrl);
230 |
231 | var data = new FormData();
232 | data.append('file', recorderAudioAsBlob, 'file');
233 |
234 | var url = "http://127.0.0.1:5000/transcribe-";
235 | var model = $('input[name="model"]:checked').val();
236 | console.log(url + model);
237 | fetch(url + model, {
238 | method: 'POST',
239 | body: data
240 |
241 | }).then(response => response.json()
242 | ).then(json => {
243 | console.log(json);
244 | $(".text-transcribe-content").text(json);
245 | hideTextIndicatorOfAudioHandling();
246 | });
247 | }
248 |
249 | /** Loads the recorded audio into the audio element in the HTML document so it can be replayed
250 | * @param {Blob} recorderAudioAsBlob - recorded audio as a Blob Object
251 | */
252 | function saveAudio(recorderAudioAsBlob) {
253 |
254 | //read content of files (Blobs) asynchronously
255 | let reader = new FileReader();
256 |
257 | //once content has been read
258 | reader.onload = (e) => {
259 | //store the base64 URL that represents the URL of the recording audio
260 | let base64URL = e.target.result;
261 |
262 | //If this is the first audio playing, create a source element
263 | //as pre-populating the HTML with a source element that has an empty src causes an error
264 | if (!audioElementSource) //if it's not defined, create it (happens the first time only)
265 | createSourceForAudioElement();
266 |
267 | //set the audio element's source using the base64 URL
268 | audioElementSource.src = base64URL;
269 |
270 | //set the type of the audio element based on the recorded audio's Blob type
271 | let BlobType = recorderAudioAsBlob.type.includes(";") ?
272 | recorderAudioAsBlob.type.substr(0, recorderAudioAsBlob.type.indexOf(';')) : recorderAudioAsBlob.type;
273 | audioElementSource.type = BlobType;
274 |
275 | //call the load method as it is used to update the audio element after changing the source or other settings
276 | audioElement.load();
277 |
278 | //play the audio after successfully setting new src and type that corresponds to the recorded audio
279 | // console.log("Playing audio...");
280 | // audioElement.play();
281 |
282 | // console.log("Sending audio...");
283 |
284 | //Display text indicator of having the audio play in the background
285 | // displayTextIndicatorOfAudioPlaying();
286 | };
287 |
288 | //read content and convert it to a URL (base64)
289 | reader.readAsDataURL(recorderAudioAsBlob);
290 | }
291 |
292 | /** Handles displaying the elapsed recording time, updated every second, in the format mm:ss or hh:mm:ss*/
293 | function handleElapsedRecordingTime() {
294 | //display initial time when recording begins
295 | displayElapsedTimeDuringAudioRecording("00:00");
296 |
297 | //create an interval that computes & displays the elapsed time and animates the red dot every second
298 | elapsedTimeTimer = setInterval(() => {
299 | //compute the elapsed time every second
300 | let elapsedTime = computeElapsedTime(audioRecordStartTime); //pass the actual record start time
301 | //display the elapsed time
302 | displayElapsedTimeDuringAudioRecording(elapsedTime);
303 | }, 1000); //every second
304 | }
305 |
306 | /** Display elapsed time during audio recording
307 | * @param {String} elapsedTime - elapsed time in the format mm:ss or hh:mm:ss
308 | */
309 | function displayElapsedTimeDuringAudioRecording(elapsedTime) {
310 | //1. display the passed elapsed time in the elapsed-time HTML element
311 | elapsedTimeTag.innerHTML = elapsedTime;
312 |
313 | //2. Stop the recording when the max number of hours is reached
314 | if (elapsedTimeReachedMaximumNumberOfHours(elapsedTime)) {
315 | stopAudioRecording();
316 | }
317 | }
318 |
319 | /**
320 | * @param {String} elapsedTime - elapsed time in the format mm:ss or hh:mm:ss
321 | * @returns {Boolean} whether the elapsed time reached the maximum number of hours or not
322 | */
323 | function elapsedTimeReachedMaximumNumberOfHours(elapsedTime) {
324 | //Split the elapsed time at the ":" symbol
325 | let elapsedTimeSplitted = elapsedTime.split(":");
326 |
327 | //Turn the maximum recording time in hours to a string and pad it with zero if less than 10
328 | let maximumRecordingTimeInHoursAsString = maximumRecordingTimeInHours < 10 ? "0" + maximumRecordingTimeInHours : maximumRecordingTimeInHours.toString();
329 |
330 | //if the elapsed time includes hours and has reached the maximum recording time in hours, return true
331 | if (elapsedTimeSplitted.length === 3 && elapsedTimeSplitted[0] === maximumRecordingTimeInHoursAsString)
332 | return true;
333 | else //otherwise, return false
334 | return false;
335 | }
336 |
337 | /** Computes the elapsedTime since the moment the function is called in the format mm:ss or hh:mm:ss
338 | * @param {Date} startTime - start time to compute the elapsed time since
339 | * @returns {String} elapsed time in mm:ss format if the elapsed hours are 0, otherwise in hh:mm:ss format.
340 | */
341 | function computeElapsedTime(startTime) {
342 | //record end time
343 | let endTime = new Date();
344 |
345 | //time difference in ms
346 | let timeDiff = endTime - startTime;
347 |
348 | //convert time difference from ms to seconds
349 | timeDiff = timeDiff / 1000;
350 |
351 | //extract integer seconds that don't form a minute using %
352 | let seconds = Math.floor(timeDiff % 60); //ignoring incomplete seconds (floor)
353 |
354 | //pad seconds with a zero if necessary
355 | seconds = seconds < 10 ? "0" + seconds : seconds;
356 |
357 | //convert time difference from seconds to minutes by dividing by 60
358 | timeDiff = Math.floor(timeDiff / 60);
359 |
360 | //extract integer minutes that don't form an hour using %
361 | let minutes = timeDiff % 60; //no need to floor possible incomplete minutes, because they've been handled as seconds
362 | minutes = minutes < 10 ? "0" + minutes : minutes;
363 |
364 | //convert time difference from minutes to hours
365 | timeDiff = Math.floor(timeDiff / 60);
366 |
367 | //extract integer hours that don't form a day using %
368 | let hours = timeDiff % 24; //no need to floor possible incomplete hours, because they've been handled as minutes
369 |
370 | //convert time difference from hours to days
371 | timeDiff = Math.floor(timeDiff / 24);
372 |
373 | // the rest of timeDiff is number of days
374 | let days = timeDiff; //remaining full days (converted into hours below)
375 |
376 | let totalHours = hours + (days * 24);
377 | totalHours = totalHours < 10 ? "0" + totalHours : totalHours;
378 |
379 | if (totalHours === "00") {
380 | return minutes + ":" + seconds;
381 | } else {
382 | return totalHours + ":" + minutes + ":" + seconds;
383 | }
384 | }
385 |
386 | // audio-recording.js ---------------
387 | //API to handle audio recording
388 |
389 | var audioRecorder = {
390 | /** Stores the recorded audio as Blob objects of audio data as the recording continues*/
391 | audioBlobs: [],/*of type Blob[]*/
392 | /** Stores the reference of the MediaRecorder instance that handles the MediaStream when recording starts*/
393 | mediaRecorder: null, /*of type MediaRecorder*/
394 | /** Stores the reference to the stream currently capturing the audio*/
395 | streamBeingCaptured: null, /*of type MediaStream*/
396 | /** Start recording the audio
397 | * @returns {Promise} - returns a promise that resolves if audio recording successfully started
398 | */
399 | start: function () {
400 | //Feature Detection
401 | if (!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia)) {
402 | //Feature is not supported in browser
403 | //return a custom error
404 | return Promise.reject(new Error('mediaDevices API or getUserMedia method is not supported in this browser.'));
405 | }
406 |
407 | else {
408 | //Feature is supported in browser
409 |
410 | //create an audio stream
411 | return navigator.mediaDevices.getUserMedia({ audio: true }/*of type MediaStreamConstraints*/)
412 | //returns a promise that resolves to the audio stream
413 | .then(stream /*of type MediaStream*/ => {
414 |
415 | //save the reference of the stream to be able to stop it when necessary
416 | audioRecorder.streamBeingCaptured = stream;
417 |
418 | //create a media recorder instance by passing that stream into the MediaRecorder constructor
419 | audioRecorder.mediaRecorder = new MediaRecorder(stream); /*the MediaRecorder interface of the MediaStream Recording
420 | API provides functionality to easily record media*/
421 |
422 | //clear previously saved audio Blobs, if any
423 | audioRecorder.audioBlobs = [];
424 |
425 | //add a dataavailable event listener in order to store the audio data Blobs when recording
426 | audioRecorder.mediaRecorder.addEventListener("dataavailable", event => {
427 | //store audio Blob object
428 | audioRecorder.audioBlobs.push(event.data);
429 | });
430 |
431 | //start the recording by calling the start method on the media recorder
432 | audioRecorder.mediaRecorder.start();
433 | });
434 |
435 | /* errors are not handled inside the API: if they were caught here and the promise were chained, the .then after the catch would still execute*/
436 | }
437 | },
438 | /** Stop the started audio recording
439 | * @returns {Promise} - returns a promise that resolves to the audio as a blob file
440 | */
441 | stop: function () {
442 | //return a promise that would return the blob or URL of the recording
443 | return new Promise(resolve => {
444 | //save audio type to pass to set the Blob type
445 | let mimeType = audioRecorder.mediaRecorder.mimeType;
446 |
447 | //listen to the stop event in order to create & return a single Blob object
448 | audioRecorder.mediaRecorder.addEventListener("stop", () => {
449 | //create a single Blob object, as we might have gathered a few Blob objects that need to be joined into one
450 | let audioBlob = new Blob(audioRecorder.audioBlobs, { type: mimeType });
451 |
452 | //resolve promise with the single audio blob representing the recorded audio
453 | resolve(audioBlob);
454 | });
455 | audioRecorder.cancel();
456 | });
457 | },
458 | /** Cancel audio recording*/
459 | cancel: function () {
460 | //stop the recording feature
461 | audioRecorder.mediaRecorder.stop();
462 |
463 | //stop all the tracks on the active stream in order to stop the stream
464 | audioRecorder.stopStream();
465 |
466 | //reset API properties for next recording
467 | audioRecorder.resetRecordingProperties();
468 | },
469 | /** Stop all the tracks on the active stream in order to stop the stream and remove
470 | * the red flashing dot showing in the tab
471 | */
472 | stopStream: function () {
473 | //stopping the capturing request by stopping all the tracks on the active stream
474 | audioRecorder.streamBeingCaptured.getTracks() //get all tracks from the stream
475 | .forEach(track /*of type MediaStreamTrack*/ => track.stop()); //stop each one
476 | },
477 | /** Reset all the recording properties including the media recorder and stream being captured*/
478 | resetRecordingProperties: function () {
479 | audioRecorder.mediaRecorder = null;
480 | audioRecorder.streamBeingCaptured = null;
481 |
482 | /*No need to remove event listeners attached to mediaRecorder:
483 | if a removed DOM element is reference-free (no references pointing to it), the element itself is picked
484 | up by the garbage collector as well as any event handlers/listeners associated with it.
485 | getEventListeners(audioRecorder.mediaRecorder) will return an empty array of events.*/
486 | }
487 | }
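488 | 
489 | // Usage sketch (illustrative only): a minimal, hedged example of driving the
490 | // audioRecorder API above directly, without the page's button wiring. The
491 | // "whisper" model value and the endpoint path are assumptions taken from the
492 | // demo's sendAudio() above, not something this API defines.
493 | //
494 | // audioRecorder.start()
495 | //     .then(() => console.log("Recording..."))
496 | //     .catch(error => console.error(error.name));
497 | //
498 | // // ...later, stop the recording and post it for transcription:
499 | // audioRecorder.stop().then(audioAsBlob => {
500 | //     const data = new FormData();
501 | //     data.append("file", audioAsBlob, "file");
502 | //     return fetch("http://127.0.0.1:5000/transcribe-whisper", { method: "POST", body: data });
503 | // }).then(response => response.json()).then(json => console.log(json));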
--------------------------------------------------------------------------------
/notebooks/[WhisperLM]_KenLMipynb.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "kValhOGAlsfj"
7 | },
8 | "source": [
9 | "# Dataset"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 4,
15 | "metadata": {
16 | "colab": {
17 | "base_uri": "https://localhost:8080/"
18 | },
19 | "id": "7PzdrrMwzjZB",
20 | "outputId": "5aa72e6c-5e11-4178-9070-372979bf471a"
21 | },
22 | "outputs": [
23 | {
24 | "name": "stdout",
25 | "output_type": "stream",
26 | "text": [
27 | "Downloading...\n",
28 | "From: https://drive.google.com/uc?id=1ypvEoGRNWrNLmW246RtBm9iMyKXm_2BP\n",
29 | "To: /home/hkab/corpus-title.tar.gz\n",
30 | "100%|████████████████████████████████████████| 220M/220M [01:37<00:00, 2.24MB/s]\n"
31 | ]
32 | }
33 | ],
34 | "source": [
35 | "!gdown 1ypvEoGRNWrNLmW246RtBm9iMyKXm_2BP"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 5,
41 | "metadata": {
42 | "colab": {
43 | "base_uri": "https://localhost:8080/"
44 | },
45 | "id": "dvDBB219zxCr",
46 | "outputId": "88fd7aea-2e48-4b87-d79a-bf5c61b21ebf"
47 | },
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "corpus-title.txt\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "!tar -xvf corpus-title.tar.gz"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {
64 | "id": "zaAvDY2xlvpI"
65 | },
66 | "source": [
67 | "# Dependencies"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 6,
73 | "metadata": {
74 | "colab": {
75 | "base_uri": "https://localhost:8080/"
76 | },
77 | "id": "rcOx6ErX3AYb",
78 | "outputId": "b2716c94-da63-4f7c-86e9-e001e66baa3b"
79 | },
80 | "outputs": [
81 | {
82 | "name": "stdout",
83 | "output_type": "stream",
84 | "text": [
85 | "Collecting transformers\n",
86 | " Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)\n",
87 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m997.5 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:01\u001b[0m00:01\u001b[0m\n",
88 | "\u001b[?25hRequirement already satisfied: packaging>=20.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from transformers) (21.3)\n",
89 | "Requirement already satisfied: tqdm>=4.27 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from transformers) (4.64.1)\n",
90 | "Requirement already satisfied: requests in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from transformers) (2.28.1)\n",
91 | "Collecting pyyaml>=5.1\n",
92 | " Downloading PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (682 kB)\n",
93 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m682.2/682.2 kB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
94 | "\u001b[?25hRequirement already satisfied: filelock in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from transformers) (3.8.0)\n",
95 | "Collecting tokenizers!=0.11.3,<0.13,>=0.11.1\n",
96 | " Downloading tokenizers-0.12.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)\n",
97 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.6/6.6 MB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
98 | "\u001b[?25hCollecting huggingface-hub<1.0,>=0.9.0\n",
99 | " Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)\n",
100 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.5/163.5 kB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
101 | "\u001b[?25hCollecting regex!=2019.12.17\n",
102 | " Downloading regex-2022.9.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
103 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.5/770.5 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
104 | "\u001b[?25hCollecting numpy>=1.17\n",
105 | " Downloading numpy-1.23.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)\n",
106 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.1/17.1 MB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
107 | "\u001b[?25hRequirement already satisfied: typing-extensions>=3.7.4.3 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.9.0->transformers) (4.4.0)\n",
108 | "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from packaging>=20.0->transformers) (3.0.9)\n",
109 | "Requirement already satisfied: charset-normalizer<3,>=2 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from requests->transformers) (2.1.1)\n",
110 | "Requirement already satisfied: urllib3<1.27,>=1.21.1 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from requests->transformers) (1.26.11)\n",
111 | "Requirement already satisfied: certifi>=2017.4.17 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from requests->transformers) (2022.9.24)\n",
112 | "Requirement already satisfied: idna<4,>=2.5 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from requests->transformers) (3.4)\n",
113 | "Installing collected packages: tokenizers, regex, pyyaml, numpy, huggingface-hub, transformers\n",
114 | "Successfully installed huggingface-hub-0.10.0 numpy-1.23.3 pyyaml-6.0 regex-2022.9.13 tokenizers-0.12.1 transformers-4.22.2\n"
115 | ]
116 | }
117 | ],
118 | "source": [
119 | "!pip install transformers"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 7,
125 | "metadata": {
126 | "id": "605Nw379kJOM"
127 | },
128 | "outputs": [
129 | {
130 | "name": "stdout",
131 | "output_type": "stream",
132 | "text": [
133 | "[sudo] password for hkab: \n"
134 | ]
135 | }
136 | ],
137 | "source": [
138 | "# !sudo apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 16,
144 | "metadata": {
145 | "colab": {
146 | "base_uri": "https://localhost:8080/"
147 | },
148 | "id": "N2EkHFyXkLS2",
149 | "outputId": "ded57a89-4ad8-4cc5-afb2-d1eaa693746d"
150 | },
151 | "outputs": [
152 | {
153 | "name": "stdout",
154 | "output_type": "stream",
155 | "text": [
156 | "--2022-10-10 15:20:25-- https://kheafield.com/code/kenlm.tar.gz\n",
157 | "Resolving kheafield.com (kheafield.com)... 35.196.63.85\n",
158 | "Connecting to kheafield.com (kheafield.com)|35.196.63.85|:443... connected.\n",
159 | "HTTP request sent, awaiting response... 200 OK\n",
160 | "Length: 491888 (480K) [application/x-gzip]\n",
161 | "Saving to: ‘STDOUT’\n",
162 | "\n",
163 | "- 100%[===================>] 480.36K 395KB/s in 1.2s \n",
164 | "\n",
165 | "2022-10-10 15:20:27 (395 KB/s) - written to stdout [491888/491888]\n",
166 | "\n"
167 | ]
168 | }
169 | ],
170 | "source": [
171 | "!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 17,
177 | "metadata": {
178 | "colab": {
179 | "base_uri": "https://localhost:8080/"
180 | },
181 | "id": "Irt8SyookOno",
182 | "outputId": "b9cfd484-cfa6-4ecd-f855-79dca383b91f"
183 | },
184 | "outputs": [
185 | {
186 | "name": "stdout",
187 | "output_type": "stream",
188 | "text": [
189 | "-- The C compiler identification is GNU 9.4.0\n",
190 | "-- The CXX compiler identification is GNU 9.4.0\n",
191 | "-- Check for working C compiler: /usr/bin/cc\n",
192 | "-- Check for working C compiler: /usr/bin/cc -- works\n",
193 | "-- Detecting C compiler ABI info\n",
194 | "-- Detecting C compiler ABI info - done\n",
195 | "-- Detecting C compile features\n",
196 | "-- Detecting C compile features - done\n",
197 | "-- Check for working CXX compiler: /usr/bin/c++\n",
198 | "-- Check for working CXX compiler: /usr/bin/c++ -- works\n",
199 | "-- Detecting CXX compiler ABI info\n",
200 | "-- Detecting CXX compiler ABI info - done\n",
201 | "-- Detecting CXX compile features\n",
202 | "-- Detecting CXX compile features - done\n",
203 | "-- Found Boost: /usr/lib/x86_64-linux-gnu/cmake/Boost-1.71.0/BoostConfig.cmake (found suitable version \"1.71.0\", minimum required is \"1.41.0\") found components: program_options system thread unit_test_framework \n",
204 | "-- Check if compiler accepts -pthread\n",
205 | "-- Check if compiler accepts -pthread - yes\n",
206 | "-- Found Threads: TRUE \n",
207 | "-- Found ZLIB: /usr/lib/x86_64-linux-gnu/libz.so (found version \"1.2.11\") \n",
208 | "-- Found BZip2: /usr/lib/x86_64-linux-gnu/libbz2.so (found version \"1.0.8\") \n",
209 | "-- Looking for BZ2_bzCompressInit\n",
210 | "-- Looking for BZ2_bzCompressInit - found\n",
211 | "-- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so\n",
212 | "-- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n",
213 | "-- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so\n",
214 | "-- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n",
215 | "-- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so\n",
216 | "-- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n",
217 | "-- Found LibLZMA: /usr/lib/x86_64-linux-gnu/liblzma.so (found version \"5.2.4\") \n",
218 | "-- Looking for clock_gettime in rt\n",
219 | "-- Looking for clock_gettime in rt - found\n",
220 | "-- Found OpenMP_C: -fopenmp (found version \"4.5\") \n",
221 | "-- Found OpenMP_CXX: -fopenmp (found version \"4.5\") \n",
222 | "-- Found OpenMP: TRUE (found version \"4.5\") \n",
223 | "-- Configuring done\n",
224 | "-- Generating done\n",
225 | "-- Build files have been written to: /home/hkab/kenlm/build\n",
226 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_util\u001b[0m\n",
227 | "[ 2%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/bignum.cc.o\u001b[0m\n",
228 | "[ 2%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/bignum-dtoa.cc.o\u001b[0m\n",
229 | "[ 3%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/cached-powers.cc.o\u001b[0m\n",
230 | "[ 4%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/diy-fp.cc.o\u001b[0m\n",
231 | "[ 5%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/double-conversion.cc.o\u001b[0m\n",
232 | "[ 6%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fast-dtoa.cc.o\u001b[0m\n",
233 | "[ 7%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fixed-dtoa.cc.o\u001b[0m\n",
234 | "[ 8%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/strtod.cc.o\u001b[0m\n",
235 | "[ 9%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/chain.cc.o\u001b[0m\n",
236 | "[ 10%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/count_records.cc.o\u001b[0m\n",
237 | "[ 11%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/io.cc.o\u001b[0m\n",
238 | "[ 12%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/line_input.cc.o\u001b[0m\n",
239 | "[ 13%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/multi_progress.cc.o\u001b[0m\n",
240 | "[ 14%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/rewindable_stream.cc.o\u001b[0m\n",
241 | "[ 15%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/bit_packing.cc.o\u001b[0m\n",
242 | "[ 16%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/ersatz_progress.cc.o\u001b[0m\n",
243 | "[ 17%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/exception.cc.o\u001b[0m\n",
244 | "[ 18%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file.cc.o\u001b[0m\n",
245 | "[ 19%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file_piece.cc.o\u001b[0m\n",
246 | "[ 20%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/float_to_string.cc.o\u001b[0m\n",
247 | "[ 21%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/integer_to_string.cc.o\u001b[0m\n",
248 | "[ 22%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/mmap.cc.o\u001b[0m\n",
249 | "[ 23%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/murmur_hash.cc.o\u001b[0m\n",
250 | "[ 25%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/parallel_read.cc.o\u001b[0m\n",
251 | "[ 26%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/pool.cc.o\u001b[0m\n",
252 | "[ 27%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/read_compressed.cc.o\u001b[0m\n",
253 | "[ 28%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/scoped.cc.o\u001b[0m\n",
254 | "[ 29%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/spaces.cc.o\u001b[0m\n",
255 | "[ 30%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/string_piece.cc.o\u001b[0m\n",
256 | "[ 31%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/usage.cc.o\u001b[0m\n",
257 | "[ 32%] \u001b[32m\u001b[1mLinking CXX static library ../lib/libkenlm_util.a\u001b[0m\n",
258 | "[ 32%] Built target kenlm_util\n",
259 | "\u001b[35m\u001b[1mScanning dependencies of target probing_hash_table_benchmark\u001b[0m\n",
260 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm\u001b[0m\n",
261 | "[ 33%] \u001b[32mBuilding CXX object util/CMakeFiles/probing_hash_table_benchmark.dir/probing_hash_table_benchmark_main.cc.o\u001b[0m\n",
262 | "[ 34%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/bhiksha.cc.o\u001b[0m\n",
263 | "[ 35%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/binary_format.cc.o\u001b[0m\n",
264 | "[ 36%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/config.cc.o\u001b[0m\n",
265 | "[ 37%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/lm_exception.cc.o\u001b[0m\n",
266 | "[ 38%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/model.cc.o\u001b[0m\n",
267 | "[ 39%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/quantize.cc.o\u001b[0m\n",
268 | "[ 40%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/read_arpa.cc.o\u001b[0m\n",
269 | "[ 41%] \u001b[32m\u001b[1mLinking CXX executable ../bin/probing_hash_table_benchmark\u001b[0m\n",
270 | "[ 42%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_hashed.cc.o\u001b[0m\n",
271 | "[ 42%] Built target probing_hash_table_benchmark\n",
272 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_filter\u001b[0m\n",
273 | "[ 43%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/arpa_io.cc.o\u001b[0m\n",
274 | "[ 44%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/phrase.cc.o\u001b[0m\n",
275 | "[ 45%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_trie.cc.o\u001b[0m\n",
276 | "[ 46%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/vocab.cc.o\u001b[0m\n",
277 | "[ 47%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_filter.a\u001b[0m\n",
278 | "[ 47%] Built target kenlm_filter\n",
279 | "[ 48%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/sizes.cc.o\u001b[0m\n",
280 | "[ 50%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie.cc.o\u001b[0m\n",
281 | "[ 51%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie_sort.cc.o\u001b[0m\n",
282 | "[ 52%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/value_build.cc.o\u001b[0m\n",
283 | "[ 53%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/virtual_interface.cc.o\u001b[0m\n",
284 | "[ 54%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/vocab.cc.o\u001b[0m\n",
285 | "[ 55%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/model_buffer.cc.o\u001b[0m\n",
286 | "[ 56%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/print.cc.o\u001b[0m\n",
287 | "[ 57%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/renumber.cc.o\u001b[0m\n",
288 | "[ 58%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/size_option.cc.o\u001b[0m\n",
289 | "[ 59%] \u001b[32m\u001b[1mLinking CXX static library ../lib/libkenlm.a\u001b[0m\n",
290 | "[ 59%] Built target kenlm\n",
291 | "\u001b[35m\u001b[1mScanning dependencies of target build_binary\u001b[0m\n",
292 | "\u001b[35m\u001b[1mScanning dependencies of target fragment\u001b[0m\n",
293 | "[ 60%] \u001b[32mBuilding CXX object lm/CMakeFiles/build_binary.dir/build_binary_main.cc.o\u001b[0m\n",
294 | "[ 61%] \u001b[32mBuilding CXX object lm/CMakeFiles/fragment.dir/fragment_main.cc.o\u001b[0m\n",
295 | "[ 62%] \u001b[32m\u001b[1mLinking CXX executable ../bin/fragment\u001b[0m\n",
296 | "[ 62%] Built target fragment\n",
297 | "[ 63%] \u001b[32m\u001b[1mLinking CXX executable ../bin/build_binary\u001b[0m\n",
298 | "\u001b[35m\u001b[1mScanning dependencies of target query\u001b[0m\n",
299 | "[ 64%] \u001b[32mBuilding CXX object lm/CMakeFiles/query.dir/query_main.cc.o\u001b[0m\n",
300 | "[ 64%] Built target build_binary\n",
301 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_benchmark\u001b[0m\n",
302 | "[ 65%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm_benchmark.dir/kenlm_benchmark_main.cc.o\u001b[0m\n",
303 | "[ 66%] \u001b[32m\u001b[1mLinking CXX executable ../bin/query\u001b[0m\n",
304 | "[ 66%] Built target query\n",
305 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_builder\u001b[0m\n",
306 | "[ 67%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/adjust_counts.cc.o\u001b[0m\n",
307 | "[ 68%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/corpus_count.cc.o\u001b[0m\n",
308 | "[ 69%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/initial_probabilities.cc.o\u001b[0m\n",
309 | "[ 70%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/interpolate.cc.o\u001b[0m\n",
310 | "[ 71%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/output.cc.o\u001b[0m\n",
311 | "[ 72%] \u001b[32m\u001b[1mLinking CXX executable ../bin/kenlm_benchmark\u001b[0m\n",
312 | "[ 72%] Built target kenlm_benchmark\n",
313 | "\u001b[35m\u001b[1mScanning dependencies of target phrase_table_vocab\u001b[0m\n",
314 | "[ 73%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/phrase_table_vocab.dir/phrase_table_vocab_main.cc.o\u001b[0m\n",
315 | "[ 75%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/phrase_table_vocab\u001b[0m\n",
316 | "[ 75%] Built target phrase_table_vocab\n",
317 | "\u001b[35m\u001b[1mScanning dependencies of target filter\u001b[0m\n",
318 | "[ 76%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/filter.dir/filter_main.cc.o\u001b[0m\n",
319 | "[ 77%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/pipeline.cc.o\u001b[0m\n",
320 | "[ 78%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_builder.a\u001b[0m\n",
321 | "[ 78%] Built target kenlm_builder\n",
322 | "\u001b[35m\u001b[1mScanning dependencies of target kenlm_interpolate\u001b[0m\n",
323 | "[ 79%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/backoff_reunification.cc.o\u001b[0m\n",
324 | "[ 80%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/bounded_sequence_encoding.cc.o\u001b[0m\n",
325 | "[ 81%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/merge_probabilities.cc.o\u001b[0m\n",
326 | "[ 82%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/filter\u001b[0m\n",
327 | "[ 82%] Built target filter\n",
328 | "[ 83%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/merge_vocab.cc.o\u001b[0m\n",
329 | "[ 84%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/normalize.cc.o\u001b[0m\n",
330 | "[ 85%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/pipeline.cc.o\u001b[0m\n",
331 | "[ 86%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/split_worker.cc.o\u001b[0m\n",
332 | "[ 87%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_derivatives.cc.o\u001b[0m\n",
333 | "[ 88%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_instances.cc.o\u001b[0m\n",
334 | "[ 89%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_weights.cc.o\u001b[0m\n",
335 | "[ 90%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/universal_vocab.cc.o\u001b[0m\n",
336 | "\u001b[35m\u001b[1mScanning dependencies of target count_ngrams\u001b[0m\n",
337 | "[ 91%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/count_ngrams.dir/count_ngrams_main.cc.o\u001b[0m\n",
338 | "[ 92%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_interpolate.a\u001b[0m\n",
339 | "[ 92%] Built target kenlm_interpolate\n",
340 | "\u001b[35m\u001b[1mScanning dependencies of target lmplz\u001b[0m\n",
341 | "[ 93%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/lmplz.dir/lmplz_main.cc.o\u001b[0m\n",
342 | "[ 94%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/lmplz\u001b[0m\n",
343 | "[ 94%] Built target lmplz\n",
344 | "\u001b[35m\u001b[1mScanning dependencies of target streaming_example\u001b[0m\n",
345 | "[ 95%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/streaming_example.dir/streaming_example_main.cc.o\u001b[0m\n",
346 | "[ 96%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/count_ngrams\u001b[0m\n",
347 | "[ 96%] Built target count_ngrams\n",
348 | "\u001b[35m\u001b[1mScanning dependencies of target interpolate\u001b[0m\n",
349 | "[ 97%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/interpolate.dir/interpolate_main.cc.o\u001b[0m\n",
350 | "[ 98%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/streaming_example\u001b[0m\n",
351 | "[ 98%] Built target streaming_example\n",
352 | "[100%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/interpolate\u001b[0m\n",
353 | "[100%] Built target interpolate\n",
354 | "build_binary fragment\t lmplz\t\t\t query\n",
355 | "count_ngrams interpolate phrase_table_vocab\t streaming_example\n",
356 | "filter\t kenlm_benchmark probing_hash_table_benchmark\n"
357 | ]
358 | }
359 | ],
360 | "source": [
361 | "!mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2\n",
362 | "!ls kenlm/build/bin"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 18,
368 | "metadata": {
369 | "colab": {
370 | "base_uri": "https://localhost:8080/"
371 | },
372 | "id": "mW60TiF_oby2",
373 | "outputId": "382fd3f6-12c5-4fc9-e7dd-fd63adb6dd47"
374 | },
375 | "outputs": [
376 | {
377 | "name": "stdout",
378 | "output_type": "stream",
379 | "text": [
380 | "Collecting pyctcdecode\n",
381 | " Downloading pyctcdecode-0.4.0-py2.py3-none-any.whl (45 kB)\n",
382 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.1/45.1 kB\u001b[0m \u001b[31m842.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m \u001b[36m0:00:01\u001b[0m\n",
383 | "\u001b[?25hCollecting pygtrie<3.0,>=2.1\n",
384 | " Downloading pygtrie-2.5.0-py3-none-any.whl (25 kB)\n",
385 | "Collecting hypothesis<7,>=6.14\n",
386 | " Downloading hypothesis-6.56.1-py3-none-any.whl (395 kB)\n",
387 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m395.3/395.3 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
388 | "\u001b[?25hRequirement already satisfied: numpy<2.0.0,>=1.15.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from pyctcdecode) (1.23.3)\n",
389 | "Requirement already satisfied: attrs>=19.2.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from hypothesis<7,>=6.14->pyctcdecode) (22.1.0)\n",
390 | "Collecting exceptiongroup>=1.0.0rc8\n",
391 | " Downloading exceptiongroup-1.0.0rc9-py3-none-any.whl (12 kB)\n",
392 | "Collecting sortedcontainers<3.0.0,>=2.1.0\n",
393 | " Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)\n",
394 | "Installing collected packages: sortedcontainers, pygtrie, exceptiongroup, hypothesis, pyctcdecode\n",
395 | "Successfully installed exceptiongroup-1.0.0rc9 hypothesis-6.56.1 pyctcdecode-0.4.0 pygtrie-2.5.0 sortedcontainers-2.4.0\n"
396 | ]
397 | }
398 | ],
399 | "source": [
400 | "!pip install pyctcdecode"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": 19,
406 | "metadata": {
407 | "colab": {
408 | "base_uri": "https://localhost:8080/",
409 | "height": 315
410 | },
411 | "id": "E20MfjkLrJmt",
412 | "outputId": "72164662-c63f-472a-ff97-7e26d5f3adc1"
413 | },
414 | "outputs": [
415 | {
416 | "name": "stdout",
417 | "output_type": "stream",
418 | "text": [
419 | "Collecting https://github.com/kpu/kenlm/archive/master.zip\n",
420 | " Downloading https://github.com/kpu/kenlm/archive/master.zip (550 kB)\n",
421 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m550.7/550.7 kB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
422 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
423 | "\u001b[?25hBuilding wheels for collected packages: kenlm\n",
424 | " Building wheel for kenlm (setup.py) ... \u001b[?25ldone\n",
425 | "\u001b[?25h Created wheel for kenlm: filename=kenlm-0.0.0-cp310-cp310-linux_x86_64.whl size=340147 sha256=bcdae3827c372f9fbea4e21d082f520f3d8dfeb5c555fd138a20cbf9a3ca88a7\n",
426 | " Stored in directory: /tmp/pip-ephem-wheel-cache-rno5cvzv/wheels/a5/73/ee/670fbd0cee8f6f0b21d10987cb042291e662e26e1a07026462\n",
427 | "Successfully built kenlm\n",
428 | "Installing collected packages: kenlm\n",
429 | "Successfully installed kenlm-0.0.0\n"
430 | ]
431 | }
432 | ],
433 | "source": [
434 | "!pip install https://github.com/kpu/kenlm/archive/master.zip"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 21,
440 | "metadata": {},
441 | "outputs": [
442 | {
443 | "name": "stdout",
444 | "output_type": "stream",
445 | "text": [
446 | "Collecting joblib\n",
447 | " Downloading joblib-1.2.0-py3-none-any.whl (297 kB)\n",
448 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
449 | "\u001b[?25hInstalling collected packages: joblib\n",
450 | "Successfully installed joblib-1.2.0\n"
451 | ]
452 | }
453 | ],
454 | "source": [
455 | "!pip install joblib"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": 23,
461 | "metadata": {},
462 | "outputs": [
463 | {
464 | "name": "stdout",
465 | "output_type": "stream",
466 | "text": [
467 | "Collecting ipywidgets\n",
468 | " Downloading ipywidgets-8.0.2-py3-none-any.whl (134 kB)\n",
469 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.4/134.4 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
470 | "\u001b[?25hRequirement already satisfied: traitlets>=4.3.1 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipywidgets) (5.4.0)\n",
471 | "Collecting jupyterlab-widgets~=3.0\n",
472 | " Downloading jupyterlab_widgets-3.0.3-py3-none-any.whl (384 kB)\n",
473 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m384.1/384.1 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
474 | "\u001b[?25hRequirement already satisfied: ipython>=6.1.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipywidgets) (8.5.0)\n",
475 | "Collecting widgetsnbextension~=4.0\n",
476 | " Downloading widgetsnbextension-4.0.3-py3-none-any.whl (2.0 MB)\n",
477 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
478 | "\u001b[?25hRequirement already satisfied: ipykernel>=4.5.1 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipywidgets) (6.16.0)\n",
479 | "Requirement already satisfied: psutil in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (5.9.0)\n",
480 | "Requirement already satisfied: packaging in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (21.3)\n",
481 | "Requirement already satisfied: tornado>=6.1 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (6.1)\n",
482 | "Requirement already satisfied: nest-asyncio in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (1.5.6)\n",
483 | "Requirement already satisfied: pyzmq>=17 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (23.2.0)\n",
484 | "Requirement already satisfied: matplotlib-inline>=0.1 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (0.1.6)\n",
485 | "Requirement already satisfied: jupyter-client>=6.1.12 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (7.3.4)\n",
486 | "Requirement already satisfied: debugpy>=1.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets) (1.5.1)\n",
487 | "Requirement already satisfied: pexpect>4.3 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (4.8.0)\n",
488 | "Requirement already satisfied: prompt-toolkit<3.1.0,>3.0.1 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.31)\n",
489 | "Requirement already satisfied: stack-data in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.5.1)\n",
490 | "Requirement already satisfied: pickleshare in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.7.5)\n",
491 | "Requirement already satisfied: pygments>=2.4.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (2.13.0)\n",
492 | "Requirement already satisfied: decorator in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)\n",
493 | "Requirement already satisfied: backcall in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.2.0)\n",
494 | "Requirement already satisfied: jedi>=0.16 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.18.1)\n",
495 | "Requirement already satisfied: parso<0.9.0,>=0.8.0 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.3)\n",
496 | "Requirement already satisfied: entrypoints in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (0.4)\n",
497 | "Requirement already satisfied: jupyter-core>=4.9.2 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (4.11.1)\n",
498 | "Requirement already satisfied: python-dateutil>=2.8.2 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (2.8.2)\n",
499 | "Requirement already satisfied: ptyprocess>=0.5 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)\n",
500 | "Requirement already satisfied: wcwidth in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from prompt-toolkit<3.1.0,>3.0.1->ipython>=6.1.0->ipywidgets) (0.2.5)\n",
501 | "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from packaging->ipykernel>=4.5.1->ipywidgets) (3.0.9)\n",
502 | "Requirement already satisfied: executing in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (1.1.1)\n",
503 | "Requirement already satisfied: asttokens in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.0.8)\n",
504 | "Requirement already satisfied: pure-eval in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.2.2)\n",
505 | "Requirement already satisfied: six>=1.5 in ./miniconda3/envs/nlp/lib/python3.10/site-packages (from python-dateutil>=2.8.2->jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets) (1.16.0)\n",
506 | "Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets\n",
507 | "Successfully installed ipywidgets-8.0.2 jupyterlab-widgets-3.0.3 widgetsnbextension-4.0.3\n"
508 | ]
509 | }
510 | ],
511 | "source": [
512 | "!pip install ipywidgets"
513 | ]
514 | },
515 | {
516 | "cell_type": "markdown",
517 | "metadata": {
518 | "id": "Sl0N0sjUlycn"
519 | },
520 | "source": [
521 | "# KenLM"
522 | ]
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": 1,
527 | "metadata": {
528 | "colab": {
529 | "base_uri": "https://localhost:8080/",
530 | "height": 104,
531 | "referenced_widgets": [
532 | "63415a015b504c7f8baa3d5a60b7c560",
533 | "b1cfc6b061cd40acbf15c3a617c2decd",
534 | "98c0b2a457bf405482ec3b11c61ac454",
535 | "ffe2904f8a2747638e11410a5474f9e4",
536 | "e8d2d9fa065f4a13b614dd8c4a72f824",
537 | "3fee1bf921dd48f69811c0aaa3213974",
538 | "06989b53974f41d28796d7e9f7ca1034",
539 | "aabcd286ca28404da9170688b6871027",
540 | "5cb3e448899c47ba9fd2c7731aba954b",
541 | "9668a1ccd9d742349585cf3349cff5b6",
542 | "9073d7f2397f47d0a60bc828f69f93cf"
543 | ]
544 | },
545 | "id": "Bdf74ScY1sIn",
546 | "outputId": "2ac3feef-9b8e-4323-b4e6-61ef63c2e158"
547 | },
548 | "outputs": [],
549 | "source": [
550 | "import json\n",
551 | "import os\n",
552 | "\n",
553 | "import numpy as np\n",
554 | "from joblib import Parallel, delayed\n",
555 | "from tqdm.auto import tqdm\n",
556 | "from transformers import GPT2TokenizerFast, GPT2Tokenizer"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 2,
562 | "metadata": {
563 | "id": "AzCxddKg1m-4"
564 | },
565 | "outputs": [],
566 | "source": [
567 | "# Utilities from https://github.com/NVIDIA/NeMo/blob/stable/scripts/asr_language_modeling/ngram_lm/kenlm_utils.py\n",
568 | "def tokenize_str(texts, tokenizer, offset):\n",
569 | " tokenized_text = []\n",
570 | " for text in texts:\n",
571 | " tok_text = tokenizer.encode(text)\n",
572 | " tok_text = [chr(token + offset) for token in tok_text]\n",
573 | " tokenized_text.append(tok_text)\n",
574 | " return tokenized_text\n",
575 | "\n",
576 | "def tokenize_text(data, tokenizer, path, chunk_size=8192, buffer_size=32, token_offset=100):\n",
577 | " dataset_len = len(data)\n",
578 | " print(\n",
579 | " f\"Chunking {dataset_len} rows into {dataset_len / float(chunk_size):0.4f} tasks (each chunk contains {chunk_size} elements)\"\n",
580 | " )\n",
581 | "\n",
582 | " current_step = 0\n",
583 | " if os.path.exists(path):\n",
584 | " print(f\"Deleting previous file : {path}\")\n",
585 | " os.remove(path)\n",
586 | "\n",
587 | " with Parallel(n_jobs=-2, verbose=10) as parallel:\n",
588 | " while True:\n",
589 | " start = current_step * chunk_size\n",
590 | " end = min((current_step + buffer_size) * chunk_size, dataset_len)\n",
591 | "\n",
592 | " tokenized_data = parallel(\n",
593 | " delayed(tokenize_str)(data[start : start + chunk_size], tokenizer, token_offset)\n",
594 | " for start in range(start, end, chunk_size)\n",
595 | " )\n",
596 | "\n",
597 | " # Write dataset\n",
598 | " write_dataset(tokenized_data, path)\n",
599 | " current_step += len(tokenized_data)\n",
600 | " print(f\"Finished writing {len(tokenized_data)} chunks to {path}. Current chunk index = {current_step}\")\n",
601 | " del tokenized_data\n",
602 | " if end >= dataset_len:\n",
603 | " break\n",
604 | "\n",
605 | "\n",
606 | "def write_dataset(chunks, path):\n",
607 | " # basedir = os.path.dirname(path)\n",
608 | "\n",
609 | " # if not os.path.exists(basedir):\n",
610 | " # os.makedirs(basedir, exist_ok=True)\n",
611 | "\n",
612 | " with open(path, 'a+', encoding='utf-8') as f:\n",
613 | " for chunk_idx in tqdm(range(len(chunks)), desc='Chunk ', total=len(chunks), unit=' chunks'):\n",
614 | " for text in chunks[chunk_idx]:\n",
615 | " line = ' '.join(text)\n",
616 | " f.write(f\"{line}\\n\")"
617 | ]
618 | },
619 | {
620 | "cell_type": "code",
621 | "execution_count": 3,
622 | "metadata": {
623 | "id": "QnBBMMH-z-7N"
624 | },
625 | "outputs": [],
626 | "source": [
627 | "with open(\"corpus-title.txt\", 'r', encoding='utf-8') as f:\n",
628 | " dataset = f.readlines()"
629 | ]
630 | },
631 | {
632 | "cell_type": "code",
633 | "execution_count": 4,
634 | "metadata": {},
635 | "outputs": [
636 | {
637 | "data": {
638 | "text/plain": [
639 | "'Chây ì nộp phạt nguội.\\n'"
640 | ]
641 | },
642 | "execution_count": 4,
643 | "metadata": {},
644 | "output_type": "execute_result"
645 | }
646 | ],
647 | "source": [
648 | "dataset[0]"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": 5,
654 | "metadata": {},
655 | "outputs": [],
656 | "source": [
657 | "chars_to_ignore_regex = '[,?.!\\-\\;\\:\"“%‘”�—’…–]' \n",
658 | "import re\n",
659 | "\n",
660 | "def clean_text(text):\n",
661 | " text = re.sub(chars_to_ignore_regex, \"\", text.lower())\n",
662 | " return text"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": 6,
668 | "metadata": {},
669 | "outputs": [
670 | {
671 | "name": "stderr",
672 | "output_type": "stream",
673 | "text": [
674 | "100%|██████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 89251.49it/s]\n"
675 | ]
676 | }
677 | ],
678 | "source": [
679 | "from tqdm import tqdm\n",
680 | "dataset_clean = []\n",
681 | "for text in tqdm(dataset[0:100000]):\n",
682 | " dataset_clean.append(clean_text(text))"
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "execution_count": 7,
688 | "metadata": {},
689 | "outputs": [
690 | {
691 | "name": "stdout",
692 | "output_type": "stream",
693 | "text": [
694 | "thay đổi về đăng ký chuyển nhượng xe từ 12/2 bạn cần biết\n",
695 | "\n"
696 | ]
697 | }
698 | ],
699 | "source": [
700 | "print(dataset_clean[4])"
701 | ]
702 | },
703 | {
704 | "cell_type": "code",
705 | "execution_count": 11,
706 | "metadata": {},
707 | "outputs": [],
708 | "source": [
709 | "import whisper\n",
710 | "tokenizer = whisper.tokenizer.get_tokenizer('vi').tokenizer"
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": 9,
716 | "metadata": {
717 | "colab": {
718 | "base_uri": "https://localhost:8080/",
719 | "height": 292,
720 | "referenced_widgets": [
721 | "2c05958e110648a1ac6d447b26a94ad4",
722 | "262929238fd543cc8190f755bd48aca1",
723 | "4f1d415e84ba4db99c05837edfd53bda",
724 | "269f80894a8b42169242e5a9f711ea96",
725 | "7b023fdc14184a9e95d8db379b511673",
726 | "aba59e0a981845b0a2afc17f1ea8fad0",
727 | "b511866d7dcc4c69b114720594db39b9",
728 | "4a48e417d4be432c8733f046974af3cb",
729 | "cb1bd500742d4938a96e966c8964ecb6",
730 | "bc79e00c44874895a62dbab150be6b91",
731 | "a31dc64cca7146bd95cf8e98db98828c"
732 | ]
733 | },
734 | "id": "kC2cCbfQ4ZD5",
735 | "outputId": "4ee3c965-cbf1-415a-d0b8-7969fb984df1"
736 | },
737 | "outputs": [
738 | {
739 | "name": "stdout",
740 | "output_type": "stream",
741 | "text": [
742 | "Chunking 10000 rows into 1.2207 tasks (each chunk contains 8192 elements)\n"
743 | ]
744 | },
745 | {
746 | "name": "stderr",
747 | "output_type": "stream",
748 | "text": [
749 | "[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.\n",
750 | "[Parallel(n_jobs=-2)]: Done 2 out of 2 | elapsed: 4.5s remaining: 0.0s\n",
751 | "[Parallel(n_jobs=-2)]: Done 2 out of 2 | elapsed: 4.5s finished\n",
752 | "Chunk : 100%|██████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 55.85 chunks/s]"
753 | ]
754 | },
755 | {
756 | "name": "stdout",
757 | "output_type": "stream",
758 | "text": [
759 | "Finished writing 2 chunks to dataset_tokenized.txt. Current chunk index = 2\n"
760 | ]
761 | },
762 | {
763 | "name": "stderr",
764 | "output_type": "stream",
765 | "text": [
766 | "\n"
767 | ]
768 | }
769 | ],
770 | "source": [
771 | "tokenize_text(dataset_clean[0:10000], tokenizer, \"dataset_tokenized.txt\")"
772 | ]
773 | },
774 | {
775 | "cell_type": "markdown",
776 | "metadata": {
777 | "id": "6ELow7l2mVKc"
778 | },
779 | "source": [
780 | "--discount_fallback is needed for training KenLM for BPE-based models"
781 | ]
782 | },
783 | {
784 | "cell_type": "code",
785 | "execution_count": 10,
786 | "metadata": {
787 | "colab": {
788 | "base_uri": "https://localhost:8080/"
789 | },
790 | "id": "PMk_TCafBrXY",
791 | "outputId": "7f67464a-5367-4c10-c803-6773e26822c6"
792 | },
793 | "outputs": [
794 | {
795 | "name": "stdout",
796 | "output_type": "stream",
797 | "text": [
798 | "=== 1/5 Counting and sorting n-grams ===\n",
799 | "Reading /home/hkab/dataset_tokenized.txt\n",
800 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
801 | "****************************************************************************************************\n",
802 | "Unigram tokens 292401 types 3012\n",
803 | "=== 2/5 Calculating and sorting adjusted counts ===\n",
804 | "Chain sizes: 1:36144 2:1812654080 3:3398726400\n",
805 | "Statistics:\n",
806 | "1 3012 D1=0.626738 D2=0.996616 D3+=1.56584\n",
807 | "2 42556 D1=0.683317 D2=1.13987 D3+=1.61722\n",
808 | "3 110710 D1=0.73464 D2=1.12548 D3+=1.43493\n",
809 | "Memory estimate for binary LM:\n",
810 | "type kB\n",
811 | "probing 3019 assuming -p 1.5\n",
812 | "probing 3281 assuming -r models -p 1.5\n",
813 | "trie 1129 without quantization\n",
814 | "trie 577 assuming -q 8 -b 8 quantization \n",
815 | "trie 1084 assuming -a 22 array pointer compression\n",
816 | "trie 532 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
817 | "=== 3/5 Calculating and sorting initial probabilities ===\n",
818 | "Chain sizes: 1:36144 2:680896 3:2214200\n",
819 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
820 | "####################################################################################################\n",
821 | "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
822 | "Chain sizes: 1:36144 2:680896 3:2214200\n",
823 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
824 | "####################################################################################################\n",
825 | "=== 5/5 Writing ARPA model ===\n",
826 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
827 | "****************************************************************************************************\n",
828 | "Name:lmplz\tVmPeak:5254272 kB\tVmRSS:9092 kB\tRSSMax:1978128 kB\tuser:0.520195\tsys:3.20971\tCPU:3.73272\treal:3.61356\n"
829 | ]
830 | }
831 | ],
832 | "source": [
833 | "!kenlm/build/bin/lmplz -o 3 --text dataset_tokenized.txt --arpa dataset_tokenized_3gram.arpa --discount_fallback"
834 | ]
835 | },
836 | {
837 | "cell_type": "code",
838 | "execution_count": 19,
839 | "metadata": {
840 | "colab": {
841 | "base_uri": "https://localhost:8080/"
842 | },
843 | "id": "rMbF9O8nnkrg",
844 | "outputId": "2fd75ffe-cf7c-40ae-e29b-f058a2521709"
845 | },
846 | "outputs": [
847 | {
848 | "name": "stdout",
849 | "output_type": "stream",
850 | "text": [
851 | "-0.6037645\t 桻 │\n",
852 | "-1.9628253\t Ჯ\n",
853 | "-0.55146927\tᲯ ֭ 湨\n",
854 | "-0.30274913\t֭ 湨 ণ\n",
855 | "-1.1774871\t豵 o 揍\n",
856 | "-1.1730413\t ź Ȩ\n",
857 | "-0.8179335\t} ৾ ୬\n",
858 | "-0.51691043\t⨠ ⽭ 㨢\n",
859 | "-0.30274913\t⽭ 㨢 ᱎ\n",
860 | "-2.5927703\tȉ 忎 㕒\n",
861 | "-2.5571775\tܫ 〶 㕒\n",
862 | "-0.23558763\t忎 㕒 砄\n",
863 | "-0.23558763\t〶 㕒 砄\n",
864 | "-0.25315025\tό ᭊ 涐\n",
865 | "-0.14362468\tɁ ᭊ 涐\n",
866 | "-2.8912714\t б\n",
867 | "-0.49007577\t⼺ Ȝ 뀗\n",
868 | "-0.41827714\tj ո \n",
869 | "\n",
870 | "\\end\\\n"
871 | ]
872 | }
873 | ],
874 | "source": [
875 | "!tail -20 dataset_tokenized_3gram.arpa"
876 | ]
877 | },
878 | {
879 | "cell_type": "code",
880 | "execution_count": 13,
881 | "metadata": {
882 | "colab": {
883 | "base_uri": "https://localhost:8080/"
884 | },
885 | "id": "FxTQdOUYqlmX",
886 | "outputId": "2f60fd19-cee6-49f2-873d-ad3c10183f7d"
887 | },
888 | "outputs": [
889 | {
890 | "name": "stdout",
891 | "output_type": "stream",
892 | "text": [
893 | "Reading dataset_tokenized_3gram.arpa\n",
894 | "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
895 | "****************************************************************************************************\n",
896 | "SUCCESS\n"
897 | ]
898 | }
899 | ],
900 | "source": [
901 | "!kenlm/build/bin/build_binary dataset_tokenized_3gram.arpa dataset_tokenized_3gram.binary "
902 | ]
903 | },
904 | {
905 | "cell_type": "code",
906 | "execution_count": 15,
907 | "metadata": {},
908 | "outputs": [],
909 | "source": [
910 | "import kenlm"
911 | ]
912 | },
913 | {
914 | "cell_type": "code",
915 | "execution_count": 34,
916 | "metadata": {},
917 | "outputs": [],
918 | "source": [
919 | "model = kenlm.Model('dataset_tokenized_5gram_vi_title.binary')"
920 | ]
921 | },
922 | {
923 | "cell_type": "code",
924 | "execution_count": 35,
925 | "metadata": {},
926 | "outputs": [
927 | {
928 | "data": {
929 | "text/plain": [
930 | "5"
931 | ]
932 | },
933 | "execution_count": 35,
934 | "metadata": {},
935 | "output_type": "execute_result"
936 | }
937 | ],
938 | "source": [
939 | "model.order"
940 | ]
941 | },
942 | {
943 | "cell_type": "code",
944 | "execution_count": 36,
945 | "metadata": {},
946 | "outputs": [
947 | {
948 | "data": {
949 | "text/plain": [
950 | "'ốc Bhutan'"
951 | ]
952 | },
953 | "execution_count": 36,
954 | "metadata": {},
955 | "output_type": "execute_result"
956 | }
957 | ],
958 | "source": [
959 | "a = \"忎 㕒 砄\".split()\n",
960 | "tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens([ord(i) - 100 for i in a]))"
961 | ]
962 | },
963 | {
964 | "cell_type": "code",
965 | "execution_count": 7,
966 | "metadata": {},
967 | "outputs": [],
968 | "source": [
969 | "def tokenize_str(texts, tokenizer, offset):\n",
970 | " tokenized_text = []\n",
971 | " for text in texts:\n",
972 | " tok_text = tokenizer.encode(text)\n",
973 | " tok_text = [chr(token + offset) for token in tok_text]\n",
974 | " tokenized_text.append(tok_text)\n",
975 | " return tokenized_text"
976 | ]
977 | },
978 | {
979 | "cell_type": "code",
980 | "execution_count": 8,
981 | "metadata": {},
982 | "outputs": [],
983 | "source": [
984 | "sentence = 'Nếu trồng người góc'"
985 | ]
986 | },
987 | {
988 | "cell_type": "code",
989 | "execution_count": 12,
990 | "metadata": {},
991 | "outputs": [],
992 | "source": [
993 | "# sentence_split = tokenizer.convert_ids_to_tokens(tokenizer(sentence)['input_ids'])\n",
994 | "# Phải dùng tokenize_str để tokenize vì ta thêm offset 100 :D\n",
995 | "sentence_split = tokenize_str([sentence], tokenizer, 100)[0]"
996 | ]
997 | },
998 | {
999 | "cell_type": "code",
1000 | "execution_count": 13,
1001 | "metadata": {},
1002 | "outputs": [
1003 | {
1004 | "data": {
1005 | "text/plain": [
1006 | "['\\x91', '矌', 'ŀ', '᭛', '襢', '㹾', 'Ɔ', 'ꂅ']"
1007 | ]
1008 | },
1009 | "execution_count": 13,
1010 | "metadata": {},
1011 | "output_type": "execute_result"
1012 | }
1013 | ],
1014 | "source": [
1015 | "sentence_split"
1016 | ]
1017 | },
1018 | {
1019 | "cell_type": "code",
1020 | "execution_count": 40,
1021 | "metadata": {},
1022 | "outputs": [
1023 | {
1024 | "data": {
1025 | "text/plain": [
1026 | "'Địa ốc Bhutan mới nổi.'"
1027 | ]
1028 | },
1029 | "execution_count": 40,
1030 | "metadata": {},
1031 | "output_type": "execute_result"
1032 | }
1033 | ],
1034 | "source": [
1035 | "# Để convert lại ta ord rồi trừ 100, ngược của chr(token + offset)\n",
1036 | "tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens([ord(i) - 100 for i in sentence_split]))"
1037 | ]
1038 | },
1039 | {
1040 | "cell_type": "code",
1041 | "execution_count": 41,
1042 | "metadata": {},
1043 | "outputs": [
1044 | {
1045 | "data": {
1046 | "text/plain": [
1047 | "'ä Œ ⸠ ¤ ŀ 忎 㕒 砄 鈴 ƍ 鞔 q'"
1048 | ]
1049 | },
1050 | "execution_count": 41,
1051 | "metadata": {},
1052 | "output_type": "execute_result"
1053 | }
1054 | ],
1055 | "source": [
1056 | "\" \".join(sentence_split)"
1057 | ]
1058 | },
1059 | {
1060 | "cell_type": "code",
1061 | "execution_count": 42,
1062 | "metadata": {},
1063 | "outputs": [
1064 | {
1065 | "data": {
1066 | "text/plain": [
1067 | "-76.36715698242188"
1068 | ]
1069 | },
1070 | "execution_count": 42,
1071 | "metadata": {},
1072 | "output_type": "execute_result"
1073 | }
1074 | ],
1075 | "source": [
1076 | "model.score(\" \".join(sentence_split))"
1077 | ]
1078 | },
1079 | {
1080 | "cell_type": "code",
1081 | "execution_count": 33,
1082 | "metadata": {},
1083 | "outputs": [
1084 | {
1085 | "data": {
1086 | "text/plain": [
1087 | "-32.09736633300781"
1088 | ]
1089 | },
1090 | "execution_count": 33,
1091 | "metadata": {},
1092 | "output_type": "execute_result"
1093 | }
1094 | ],
1095 | "source": [
1096 | "model.score(\" \".join(sentence_split))"
1097 | ]
1098 | },
1099 | {
1100 | "cell_type": "code",
1101 | "execution_count": 43,
1102 | "metadata": {},
1103 | "outputs": [
1104 | {
1105 | "name": "stdout",
1106 | "output_type": "stream",
1107 | "text": [
1108 | "prob: -9.877823829650879 length: 1: ä\n",
1109 | "\t\"ä\" is an OOV\n",
1110 | "prob: -5.552821159362793 length: 1: Œ\n",
1111 | "\t\"Œ\" is an OOV\n",
1112 | "prob: -5.552821159362793 length: 1: ⸠\n",
1113 | "\t\"⸠\" is an OOV\n",
1114 | "prob: -5.552821159362793 length: 1: ¤\n",
1115 | "\t\"¤\" is an OOV\n",
1116 | "prob: -5.552821159362793 length: 1: ŀ\n",
1117 | "\t\"ŀ\" is an OOV\n",
1118 | "prob: -5.552821159362793 length: 1: 忎\n",
1119 | "\t\"忎\" is an OOV\n",
1120 | "prob: -5.552821159362793 length: 1: 㕒\n",
1121 | "\t\"㕒\" is an OOV\n",
1122 | "prob: -5.552821159362793 length: 1: 砄\n",
1123 | "\t\"砄\" is an OOV\n",
1124 | "prob: -5.552821159362793 length: 1: 鈴\n",
1125 | "\t\"鈴\" is an OOV\n",
1126 | "prob: -5.552821159362793 length: 1: ƍ\n",
1127 | "\t\"ƍ\" is an OOV\n",
1128 | "prob: -5.552821159362793 length: 1: 鞔\n",
1129 | "\t\"鞔\" is an OOV\n",
1130 | "prob: -5.552821159362793 length: 1: q\n",
1131 | "\t\"q\" is an OOV\n",
1132 | "prob: -5.408297538757324 length: 1: \n"
1133 | ]
1134 | }
1135 | ],
1136 | "source": [
1137 | "# Show scores and n-gram matches\n",
1138 | "words = [''] + sentence_split + ['']\n",
1139 | "for i, (prob, length, oov) in enumerate(model.full_scores(\" \".join(sentence_split))):\n",
1140 | " print('prob: {0} length: {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2])))\n",
1141 | " if oov:\n",
1142 | " print('\\t\"{0}\" is an OOV'.format(words[i+1]))"
1143 | ]
1144 | },
1145 | {
1146 | "cell_type": "code",
1147 | "execution_count": 144,
1148 | "metadata": {},
1149 | "outputs": [
1150 | {
1151 | "name": "stdout",
1152 | "output_type": "stream",
1153 | "text": [
1154 | "prob: -7.633749485015869 length: 1: \n",
1155 | "prob: -6.314070701599121 length: 1: 〙\n",
1156 | "\t\"〙\" is an OOV\n",
1157 | "prob: -5.552821159362793 length: 1: ŀ\n",
1158 | "\t\"ŀ\" is an OOV\n",
1159 | "prob: -5.552821159362793 length: 1: 刺\n",
1160 | "\t\"刺\" is an OOV\n",
1161 | "prob: -5.552821159362793 length: 1: ό\n",
1162 | "\t\"ό\" is an OOV\n",
1163 | "prob: -5.552821159362793 length: 1: 䭤\n",
1164 | "\t\"䭤\" is an OOV\n",
1165 | "prob: -5.552821159362793 length: 1: 阦\n",
1166 | "\t\"阦\" is an OOV\n",
1167 | "prob: -5.552821159362793 length: 1: ŀ\n",
1168 | "\t\"ŀ\" is an OOV\n",
1169 | "prob: -5.552821159362793 length: 1: ·\n",
1170 | "\t\"·\" is an OOV\n",
1171 | "prob: -5.552821159362793 length: 1: 鷨\n",
1172 | "\t\"鷨\" is an OOV\n",
1173 | "prob: -5.408297538757324 length: 1: \n"
1174 | ]
1175 | }
1176 | ],
1177 | "source": [
1178 | "# Show scores and n-gram matches\n",
1179 | "words = [''] + sentence_split + ['']\n",
1180 | "for i, (prob, length, oov) in enumerate(model.full_scores(\" \".join(sentence_split))):\n",
1181 | " print('prob: {0} length: {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2])))\n",
1182 | " if oov:\n",
1183 | " print('\\t\"{0}\" is an OOV'.format(words[i+1]))"
1184 | ]
1185 | },
1186 | {
1187 | "cell_type": "code",
1188 | "execution_count": 2,
1189 | "metadata": {},
1190 | "outputs": [],
1191 | "source": [
1192 | "import kenlm"
1193 | ]
1194 | },
1195 | {
1196 | "cell_type": "code",
1197 | "execution_count": 3,
1198 | "metadata": {},
1199 | "outputs": [],
1200 | "source": [
1201 | "model = kenlm.Model('dataset_tokenized_3gram.binary')"
1202 | ]
1203 | },
1204 | {
1205 | "cell_type": "code",
1206 | "execution_count": 4,
1207 | "metadata": {},
1208 | "outputs": [],
1209 | "source": [
1210 | "state = kenlm.State()\n",
1211 | "state_next_token = kenlm.State()\n",
1212 | "state_next_token_t = kenlm.State()"
1213 | ]
1214 | },
1215 | {
1216 | "cell_type": "code",
1217 | "execution_count": 5,
1218 | "metadata": {},
1219 | "outputs": [
1220 | {
1221 | "data": {
1222 | "text/plain": [
1223 | "-8.021354913711548"
1224 | ]
1225 | },
1226 | "execution_count": 5,
1227 | "metadata": {},
1228 | "output_type": "execute_result"
1229 | }
1230 | ],
1231 | "source": [
1232 | "model.BeginSentenceWrite(state)\n",
1233 | "accum = 0\n",
1234 | "accum += model.BaseScore(state, \"鷨\", state_next_token)\n",
1235 | "accum += model.BaseScore(state_next_token, \"刺\", state)\n",
1236 | "accum"
1237 | ]
1238 | },
1239 | {
1240 | "cell_type": "code",
1241 | "execution_count": 6,
1242 | "metadata": {},
1243 | "outputs": [
1244 | {
1245 | "data": {
1246 | "text/plain": [
1247 | "-12.832839250564575"
1248 | ]
1249 | },
1250 | "execution_count": 6,
1251 | "metadata": {},
1252 | "output_type": "execute_result"
1253 | }
1254 | ],
1255 | "source": [
1256 | "prob = model.BaseScore(state, \"\", state_next_token_t)\n",
1257 | "prob + accum"
1258 | ]
1259 | },
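{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A minimal sketch (not from this repo; all names such as `fused_token_score` and `alpha` are illustrative assumptions) of how the incremental `BaseScore` calls above could be combined with Whisper log-probabilities via shallow fusion during beam search: each candidate token is scored as the Whisper log-prob plus `alpha` times the LM log-prob. Note that `BaseScore` returns log10 probabilities while Whisper works in natural log, so a conversion is needed. The next cell cross-checks the accumulated `BaseScore` total against `model.score`."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "import math\n",
  "import kenlm\n",
  "\n",
  "LN10 = math.log(10)  # ln(p) = log10(p) * ln(10)\n",
  "\n",
  "def fused_token_score(whisper_logprob, lm_model, lm_state, token_id, alpha=0.5, offset=100):\n",
  "    # Shallow-fusion score for one candidate token (sketch; alpha and offset are assumptions).\n",
  "    # The token id is mapped to a single unicode char, exactly like the LM training data above.\n",
  "    out_state = kenlm.State()\n",
  "    lm_log10 = lm_model.BaseScore(lm_state, chr(token_id + offset), out_state)\n",
  "    return whisper_logprob + alpha * lm_log10 * LN10, out_state"
 ]
},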
1260 | {
1261 | "cell_type": "code",
1262 | "execution_count": 71,
1263 | "metadata": {},
1264 | "outputs": [
1265 | {
1266 | "data": {
1267 | "text/plain": [
1268 | "-12.832839012145996"
1269 | ]
1270 | },
1271 | "execution_count": 71,
1272 | "metadata": {},
1273 | "output_type": "execute_result"
1274 | }
1275 | ],
1276 | "source": [
1277 | "model.score(\"鷨 刺 \", eos=False)"
1278 | ]
1279 | }
1280 | ],
1281 | "metadata": {
1282 | "colab": {
1283 | "collapsed_sections": [
1284 | "kValhOGAlsfj",
1285 | "zaAvDY2xlvpI"
1286 | ],
1287 | "provenance": []
1288 | },
1289 | "kernelspec": {
1290 | "display_name": "Python 3 (ipykernel)",
1291 | "language": "python",
1292 | "name": "python3"
1293 | },
1294 | "language_info": {
1295 | "codemirror_mode": {
1296 | "name": "ipython",
1297 | "version": 3
1298 | },
1299 | "file_extension": ".py",
1300 | "mimetype": "text/x-python",
1301 | "name": "python",
1302 | "nbconvert_exporter": "python",
1303 | "pygments_lexer": "ipython3",
1304 | "version": "3.10.6"
1305 | },
1306 | "vscode": {
1307 | "interpreter": {
1308 | "hash": "569cc53f83e70b41c2d959ffedb296ac14adb9e332ab59ae04a2c7a2935b0e00"
1309 | }
1310 | },
1311 | "widgets": {
1312 | "application/vnd.jupyter.widget-state+json": {
1313 | "06989b53974f41d28796d7e9f7ca1034": {
1314 | "model_module": "@jupyter-widgets/controls",
1315 | "model_module_version": "1.5.0",
1316 | "model_name": "DescriptionStyleModel",
1317 | "state": {
1318 | "_model_module": "@jupyter-widgets/controls",
1319 | "_model_module_version": "1.5.0",
1320 | "_model_name": "DescriptionStyleModel",
1321 | "_view_count": null,
1322 | "_view_module": "@jupyter-widgets/base",
1323 | "_view_module_version": "1.2.0",
1324 | "_view_name": "StyleView",
1325 | "description_width": ""
1326 | }
1327 | },
1328 | "262929238fd543cc8190f755bd48aca1": {
1329 | "model_module": "@jupyter-widgets/controls",
1330 | "model_module_version": "1.5.0",
1331 | "model_name": "HTMLModel",
1332 | "state": {
1333 | "_dom_classes": [],
1334 | "_model_module": "@jupyter-widgets/controls",
1335 | "_model_module_version": "1.5.0",
1336 | "_model_name": "HTMLModel",
1337 | "_view_count": null,
1338 | "_view_module": "@jupyter-widgets/controls",
1339 | "_view_module_version": "1.5.0",
1340 | "_view_name": "HTMLView",
1341 | "description": "",
1342 | "description_tooltip": null,
1343 | "layout": "IPY_MODEL_aba59e0a981845b0a2afc17f1ea8fad0",
1344 | "placeholder": "",
1345 | "style": "IPY_MODEL_b511866d7dcc4c69b114720594db39b9",
1346 | "value": "Chunk : 100%"
1347 | }
1348 | },
1349 | "269f80894a8b42169242e5a9f711ea96": {
1350 | "model_module": "@jupyter-widgets/controls",
1351 | "model_module_version": "1.5.0",
1352 | "model_name": "HTMLModel",
1353 | "state": {
1354 | "_dom_classes": [],
1355 | "_model_module": "@jupyter-widgets/controls",
1356 | "_model_module_version": "1.5.0",
1357 | "_model_name": "HTMLModel",
1358 | "_view_count": null,
1359 | "_view_module": "@jupyter-widgets/controls",
1360 | "_view_module_version": "1.5.0",
1361 | "_view_name": "HTMLView",
1362 | "description": "",
1363 | "description_tooltip": null,
1364 | "layout": "IPY_MODEL_bc79e00c44874895a62dbab150be6b91",
1365 | "placeholder": "",
1366 | "style": "IPY_MODEL_a31dc64cca7146bd95cf8e98db98828c",
1367 | "value": " 13/13 [00:00<00:00, 29.86 chunks/s]"
1368 | }
1369 | },
1370 | "2c05958e110648a1ac6d447b26a94ad4": {
1371 | "model_module": "@jupyter-widgets/controls",
1372 | "model_module_version": "1.5.0",
1373 | "model_name": "HBoxModel",
1374 | "state": {
1375 | "_dom_classes": [],
1376 | "_model_module": "@jupyter-widgets/controls",
1377 | "_model_module_version": "1.5.0",
1378 | "_model_name": "HBoxModel",
1379 | "_view_count": null,
1380 | "_view_module": "@jupyter-widgets/controls",
1381 | "_view_module_version": "1.5.0",
1382 | "_view_name": "HBoxView",
1383 | "box_style": "",
1384 | "children": [
1385 | "IPY_MODEL_262929238fd543cc8190f755bd48aca1",
1386 | "IPY_MODEL_4f1d415e84ba4db99c05837edfd53bda",
1387 | "IPY_MODEL_269f80894a8b42169242e5a9f711ea96"
1388 | ],
1389 | "layout": "IPY_MODEL_7b023fdc14184a9e95d8db379b511673"
1390 | }
1391 | },
1392 | "3fee1bf921dd48f69811c0aaa3213974": {
1393 | "model_module": "@jupyter-widgets/base",
1394 | "model_module_version": "1.2.0",
1395 | "model_name": "LayoutModel",
1396 | "state": {
1397 | "_model_module": "@jupyter-widgets/base",
1398 | "_model_module_version": "1.2.0",
1399 | "_model_name": "LayoutModel",
1400 | "_view_count": null,
1401 | "_view_module": "@jupyter-widgets/base",
1402 | "_view_module_version": "1.2.0",
1403 | "_view_name": "LayoutView",
1404 | "align_content": null,
1405 | "align_items": null,
1406 | "align_self": null,
1407 | "border": null,
1408 | "bottom": null,
1409 | "display": null,
1410 | "flex": null,
1411 | "flex_flow": null,
1412 | "grid_area": null,
1413 | "grid_auto_columns": null,
1414 | "grid_auto_flow": null,
1415 | "grid_auto_rows": null,
1416 | "grid_column": null,
1417 | "grid_gap": null,
1418 | "grid_row": null,
1419 | "grid_template_areas": null,
1420 | "grid_template_columns": null,
1421 | "grid_template_rows": null,
1422 | "height": null,
1423 | "justify_content": null,
1424 | "justify_items": null,
1425 | "left": null,
1426 | "margin": null,
1427 | "max_height": null,
1428 | "max_width": null,
1429 | "min_height": null,
1430 | "min_width": null,
1431 | "object_fit": null,
1432 | "object_position": null,
1433 | "order": null,
1434 | "overflow": null,
1435 | "overflow_x": null,
1436 | "overflow_y": null,
1437 | "padding": null,
1438 | "right": null,
1439 | "top": null,
1440 | "visibility": null,
1441 | "width": null
1442 | }
1443 | },
1444 | "4a48e417d4be432c8733f046974af3cb": {
1445 | "model_module": "@jupyter-widgets/base",
1446 | "model_module_version": "1.2.0",
1447 | "model_name": "LayoutModel",
1448 | "state": {
1449 | "_model_module": "@jupyter-widgets/base",
1450 | "_model_module_version": "1.2.0",
1451 | "_model_name": "LayoutModel",
1452 | "_view_count": null,
1453 | "_view_module": "@jupyter-widgets/base",
1454 | "_view_module_version": "1.2.0",
1455 | "_view_name": "LayoutView",
1456 | "align_content": null,
1457 | "align_items": null,
1458 | "align_self": null,
1459 | "border": null,
1460 | "bottom": null,
1461 | "display": null,
1462 | "flex": null,
1463 | "flex_flow": null,
1464 | "grid_area": null,
1465 | "grid_auto_columns": null,
1466 | "grid_auto_flow": null,
1467 | "grid_auto_rows": null,
1468 | "grid_column": null,
1469 | "grid_gap": null,
1470 | "grid_row": null,
1471 | "grid_template_areas": null,
1472 | "grid_template_columns": null,
1473 | "grid_template_rows": null,
1474 | "height": null,
1475 | "justify_content": null,
1476 | "justify_items": null,
1477 | "left": null,
1478 | "margin": null,
1479 | "max_height": null,
1480 | "max_width": null,
1481 | "min_height": null,
1482 | "min_width": null,
1483 | "object_fit": null,
1484 | "object_position": null,
1485 | "order": null,
1486 | "overflow": null,
1487 | "overflow_x": null,
1488 | "overflow_y": null,
1489 | "padding": null,
1490 | "right": null,
1491 | "top": null,
1492 | "visibility": null,
1493 | "width": null
1494 | }
1495 | },
1496 | "4f1d415e84ba4db99c05837edfd53bda": {
1497 | "model_module": "@jupyter-widgets/controls",
1498 | "model_module_version": "1.5.0",
1499 | "model_name": "FloatProgressModel",
1500 | "state": {
1501 | "_dom_classes": [],
1502 | "_model_module": "@jupyter-widgets/controls",
1503 | "_model_module_version": "1.5.0",
1504 | "_model_name": "FloatProgressModel",
1505 | "_view_count": null,
1506 | "_view_module": "@jupyter-widgets/controls",
1507 | "_view_module_version": "1.5.0",
1508 | "_view_name": "ProgressView",
1509 | "bar_style": "success",
1510 | "description": "",
1511 | "description_tooltip": null,
1512 | "layout": "IPY_MODEL_4a48e417d4be432c8733f046974af3cb",
1513 | "max": 13,
1514 | "min": 0,
1515 | "orientation": "horizontal",
1516 | "style": "IPY_MODEL_cb1bd500742d4938a96e966c8964ecb6",
1517 | "value": 13
1518 | }
1519 | },
1520 | "5cb3e448899c47ba9fd2c7731aba954b": {
1521 | "model_module": "@jupyter-widgets/controls",
1522 | "model_module_version": "1.5.0",
1523 | "model_name": "ProgressStyleModel",
1524 | "state": {
1525 | "_model_module": "@jupyter-widgets/controls",
1526 | "_model_module_version": "1.5.0",
1527 | "_model_name": "ProgressStyleModel",
1528 | "_view_count": null,
1529 | "_view_module": "@jupyter-widgets/base",
1530 | "_view_module_version": "1.2.0",
1531 | "_view_name": "StyleView",
1532 | "bar_color": null,
1533 | "description_width": ""
1534 | }
1535 | },
1536 | "63415a015b504c7f8baa3d5a60b7c560": {
1537 | "model_module": "@jupyter-widgets/controls",
1538 | "model_module_version": "1.5.0",
1539 | "model_name": "HBoxModel",
1540 | "state": {
1541 | "_dom_classes": [],
1542 | "_model_module": "@jupyter-widgets/controls",
1543 | "_model_module_version": "1.5.0",
1544 | "_model_name": "HBoxModel",
1545 | "_view_count": null,
1546 | "_view_module": "@jupyter-widgets/controls",
1547 | "_view_module_version": "1.5.0",
1548 | "_view_name": "HBoxView",
1549 | "box_style": "",
1550 | "children": [
1551 | "IPY_MODEL_b1cfc6b061cd40acbf15c3a617c2decd",
1552 | "IPY_MODEL_98c0b2a457bf405482ec3b11c61ac454",
1553 | "IPY_MODEL_ffe2904f8a2747638e11410a5474f9e4"
1554 | ],
1555 | "layout": "IPY_MODEL_e8d2d9fa065f4a13b614dd8c4a72f824"
1556 | }
1557 | },
1558 | "7b023fdc14184a9e95d8db379b511673": {
1559 | "model_module": "@jupyter-widgets/base",
1560 | "model_module_version": "1.2.0",
1561 | "model_name": "LayoutModel",
1562 | "state": {
1563 | "_model_module": "@jupyter-widgets/base",
1564 | "_model_module_version": "1.2.0",
1565 | "_model_name": "LayoutModel",
1566 | "_view_count": null,
1567 | "_view_module": "@jupyter-widgets/base",
1568 | "_view_module_version": "1.2.0",
1569 | "_view_name": "LayoutView",
1570 | "align_content": null,
1571 | "align_items": null,
1572 | "align_self": null,
1573 | "border": null,
1574 | "bottom": null,
1575 | "display": null,
1576 | "flex": null,
1577 | "flex_flow": null,
1578 | "grid_area": null,
1579 | "grid_auto_columns": null,
1580 | "grid_auto_flow": null,
1581 | "grid_auto_rows": null,
1582 | "grid_column": null,
1583 | "grid_gap": null,
1584 | "grid_row": null,
1585 | "grid_template_areas": null,
1586 | "grid_template_columns": null,
1587 | "grid_template_rows": null,
1588 | "height": null,
1589 | "justify_content": null,
1590 | "justify_items": null,
1591 | "left": null,
1592 | "margin": null,
1593 | "max_height": null,
1594 | "max_width": null,
1595 | "min_height": null,
1596 | "min_width": null,
1597 | "object_fit": null,
1598 | "object_position": null,
1599 | "order": null,
1600 | "overflow": null,
1601 | "overflow_x": null,
1602 | "overflow_y": null,
1603 | "padding": null,
1604 | "right": null,
1605 | "top": null,
1606 | "visibility": null,
1607 | "width": null
1608 | }
1609 | },
1610 | "9073d7f2397f47d0a60bc828f69f93cf": {
1611 | "model_module": "@jupyter-widgets/controls",
1612 | "model_module_version": "1.5.0",
1613 | "model_name": "DescriptionStyleModel",
1614 | "state": {
1615 | "_model_module": "@jupyter-widgets/controls",
1616 | "_model_module_version": "1.5.0",
1617 | "_model_name": "DescriptionStyleModel",
1618 | "_view_count": null,
1619 | "_view_module": "@jupyter-widgets/base",
1620 | "_view_module_version": "1.2.0",
1621 | "_view_name": "StyleView",
1622 | "description_width": ""
1623 | }
1624 | },
1625 | "9668a1ccd9d742349585cf3349cff5b6": {
1626 | "model_module": "@jupyter-widgets/base",
1627 | "model_module_version": "1.2.0",
1628 | "model_name": "LayoutModel",
1629 | "state": {
1630 | "_model_module": "@jupyter-widgets/base",
1631 | "_model_module_version": "1.2.0",
1632 | "_model_name": "LayoutModel",
1633 | "_view_count": null,
1634 | "_view_module": "@jupyter-widgets/base",
1635 | "_view_module_version": "1.2.0",
1636 | "_view_name": "LayoutView",
1637 | "align_content": null,
1638 | "align_items": null,
1639 | "align_self": null,
1640 | "border": null,
1641 | "bottom": null,
1642 | "display": null,
1643 | "flex": null,
1644 | "flex_flow": null,
1645 | "grid_area": null,
1646 | "grid_auto_columns": null,
1647 | "grid_auto_flow": null,
1648 | "grid_auto_rows": null,
1649 | "grid_column": null,
1650 | "grid_gap": null,
1651 | "grid_row": null,
1652 | "grid_template_areas": null,
1653 | "grid_template_columns": null,
1654 | "grid_template_rows": null,
1655 | "height": null,
1656 | "justify_content": null,
1657 | "justify_items": null,
1658 | "left": null,
1659 | "margin": null,
1660 | "max_height": null,
1661 | "max_width": null,
1662 | "min_height": null,
1663 | "min_width": null,
1664 | "object_fit": null,
1665 | "object_position": null,
1666 | "order": null,
1667 | "overflow": null,
1668 | "overflow_x": null,
1669 | "overflow_y": null,
1670 | "padding": null,
1671 | "right": null,
1672 | "top": null,
1673 | "visibility": null,
1674 | "width": null
1675 | }
1676 | },
1677 | "98c0b2a457bf405482ec3b11c61ac454": {
1678 | "model_module": "@jupyter-widgets/controls",
1679 | "model_module_version": "1.5.0",
1680 | "model_name": "FloatProgressModel",
1681 | "state": {
1682 | "_dom_classes": [],
1683 | "_model_module": "@jupyter-widgets/controls",
1684 | "_model_module_version": "1.5.0",
1685 | "_model_name": "FloatProgressModel",
1686 | "_view_count": null,
1687 | "_view_module": "@jupyter-widgets/controls",
1688 | "_view_module_version": "1.5.0",
1689 | "_view_name": "ProgressView",
1690 | "bar_style": "success",
1691 | "description": "",
1692 | "description_tooltip": null,
1693 | "layout": "IPY_MODEL_aabcd286ca28404da9170688b6871027",
1694 | "max": 1,
1695 | "min": 0,
1696 | "orientation": "horizontal",
1697 | "style": "IPY_MODEL_5cb3e448899c47ba9fd2c7731aba954b",
1698 | "value": 0
1699 | }
1700 | },
1701 | "a31dc64cca7146bd95cf8e98db98828c": {
1702 | "model_module": "@jupyter-widgets/controls",
1703 | "model_module_version": "1.5.0",
1704 | "model_name": "DescriptionStyleModel",
1705 | "state": {
1706 | "_model_module": "@jupyter-widgets/controls",
1707 | "_model_module_version": "1.5.0",
1708 | "_model_name": "DescriptionStyleModel",
1709 | "_view_count": null,
1710 | "_view_module": "@jupyter-widgets/base",
1711 | "_view_module_version": "1.2.0",
1712 | "_view_name": "StyleView",
1713 | "description_width": ""
1714 | }
1715 | },
1716 | "aabcd286ca28404da9170688b6871027": {
1717 | "model_module": "@jupyter-widgets/base",
1718 | "model_module_version": "1.2.0",
1719 | "model_name": "LayoutModel",
1720 | "state": {
1721 | "_model_module": "@jupyter-widgets/base",
1722 | "_model_module_version": "1.2.0",
1723 | "_model_name": "LayoutModel",
1724 | "_view_count": null,
1725 | "_view_module": "@jupyter-widgets/base",
1726 | "_view_module_version": "1.2.0",
1727 | "_view_name": "LayoutView",
1728 | "align_content": null,
1729 | "align_items": null,
1730 | "align_self": null,
1731 | "border": null,
1732 | "bottom": null,
1733 | "display": null,
1734 | "flex": null,
1735 | "flex_flow": null,
1736 | "grid_area": null,
1737 | "grid_auto_columns": null,
1738 | "grid_auto_flow": null,
1739 | "grid_auto_rows": null,
1740 | "grid_column": null,
1741 | "grid_gap": null,
1742 | "grid_row": null,
1743 | "grid_template_areas": null,
1744 | "grid_template_columns": null,
1745 | "grid_template_rows": null,
1746 | "height": null,
1747 | "justify_content": null,
1748 | "justify_items": null,
1749 | "left": null,
1750 | "margin": null,
1751 | "max_height": null,
1752 | "max_width": null,
1753 | "min_height": null,
1754 | "min_width": null,
1755 | "object_fit": null,
1756 | "object_position": null,
1757 | "order": null,
1758 | "overflow": null,
1759 | "overflow_x": null,
1760 | "overflow_y": null,
1761 | "padding": null,
1762 | "right": null,
1763 | "top": null,
1764 | "visibility": null,
1765 | "width": "20px"
1766 | }
1767 | },
1768 | "aba59e0a981845b0a2afc17f1ea8fad0": {
1769 | "model_module": "@jupyter-widgets/base",
1770 | "model_module_version": "1.2.0",
1771 | "model_name": "LayoutModel",
1772 | "state": {
1773 | "_model_module": "@jupyter-widgets/base",
1774 | "_model_module_version": "1.2.0",
1775 | "_model_name": "LayoutModel",
1776 | "_view_count": null,
1777 | "_view_module": "@jupyter-widgets/base",
1778 | "_view_module_version": "1.2.0",
1779 | "_view_name": "LayoutView",
1780 | "align_content": null,
1781 | "align_items": null,
1782 | "align_self": null,
1783 | "border": null,
1784 | "bottom": null,
1785 | "display": null,
1786 | "flex": null,
1787 | "flex_flow": null,
1788 | "grid_area": null,
1789 | "grid_auto_columns": null,
1790 | "grid_auto_flow": null,
1791 | "grid_auto_rows": null,
1792 | "grid_column": null,
1793 | "grid_gap": null,
1794 | "grid_row": null,
1795 | "grid_template_areas": null,
1796 | "grid_template_columns": null,
1797 | "grid_template_rows": null,
1798 | "height": null,
1799 | "justify_content": null,
1800 | "justify_items": null,
1801 | "left": null,
1802 | "margin": null,
1803 | "max_height": null,
1804 | "max_width": null,
1805 | "min_height": null,
1806 | "min_width": null,
1807 | "object_fit": null,
1808 | "object_position": null,
1809 | "order": null,
1810 | "overflow": null,
1811 | "overflow_x": null,
1812 | "overflow_y": null,
1813 | "padding": null,
1814 | "right": null,
1815 | "top": null,
1816 | "visibility": null,
1817 | "width": null
1818 | }
1819 | },
1820 | "b1cfc6b061cd40acbf15c3a617c2decd": {
1821 | "model_module": "@jupyter-widgets/controls",
1822 | "model_module_version": "1.5.0",
1823 | "model_name": "HTMLModel",
1824 | "state": {
1825 | "_dom_classes": [],
1826 | "_model_module": "@jupyter-widgets/controls",
1827 | "_model_module_version": "1.5.0",
1828 | "_model_name": "HTMLModel",
1829 | "_view_count": null,
1830 | "_view_module": "@jupyter-widgets/controls",
1831 | "_view_module_version": "1.5.0",
1832 | "_view_name": "HTMLView",
1833 | "description": "",
1834 | "description_tooltip": null,
1835 | "layout": "IPY_MODEL_3fee1bf921dd48f69811c0aaa3213974",
1836 | "placeholder": "",
1837 | "style": "IPY_MODEL_06989b53974f41d28796d7e9f7ca1034",
1838 | "value": ""
1839 | }
1840 | },
1841 | "b511866d7dcc4c69b114720594db39b9": {
1842 | "model_module": "@jupyter-widgets/controls",
1843 | "model_module_version": "1.5.0",
1844 | "model_name": "DescriptionStyleModel",
1845 | "state": {
1846 | "_model_module": "@jupyter-widgets/controls",
1847 | "_model_module_version": "1.5.0",
1848 | "_model_name": "DescriptionStyleModel",
1849 | "_view_count": null,
1850 | "_view_module": "@jupyter-widgets/base",
1851 | "_view_module_version": "1.2.0",
1852 | "_view_name": "StyleView",
1853 | "description_width": ""
1854 | }
1855 | },
1856 | "bc79e00c44874895a62dbab150be6b91": {
1857 | "model_module": "@jupyter-widgets/base",
1858 | "model_module_version": "1.2.0",
1859 | "model_name": "LayoutModel",
1860 | "state": {
1861 | "_model_module": "@jupyter-widgets/base",
1862 | "_model_module_version": "1.2.0",
1863 | "_model_name": "LayoutModel",
1864 | "_view_count": null,
1865 | "_view_module": "@jupyter-widgets/base",
1866 | "_view_module_version": "1.2.0",
1867 | "_view_name": "LayoutView",
1868 | "align_content": null,
1869 | "align_items": null,
1870 | "align_self": null,
1871 | "border": null,
1872 | "bottom": null,
1873 | "display": null,
1874 | "flex": null,
1875 | "flex_flow": null,
1876 | "grid_area": null,
1877 | "grid_auto_columns": null,
1878 | "grid_auto_flow": null,
1879 | "grid_auto_rows": null,
1880 | "grid_column": null,
1881 | "grid_gap": null,
1882 | "grid_row": null,
1883 | "grid_template_areas": null,
1884 | "grid_template_columns": null,
1885 | "grid_template_rows": null,
1886 | "height": null,
1887 | "justify_content": null,
1888 | "justify_items": null,
1889 | "left": null,
1890 | "margin": null,
1891 | "max_height": null,
1892 | "max_width": null,
1893 | "min_height": null,
1894 | "min_width": null,
1895 | "object_fit": null,
1896 | "object_position": null,
1897 | "order": null,
1898 | "overflow": null,
1899 | "overflow_x": null,
1900 | "overflow_y": null,
1901 | "padding": null,
1902 | "right": null,
1903 | "top": null,
1904 | "visibility": null,
1905 | "width": null
1906 | }
1907 | },
1908 | "cb1bd500742d4938a96e966c8964ecb6": {
1909 | "model_module": "@jupyter-widgets/controls",
1910 | "model_module_version": "1.5.0",
1911 | "model_name": "ProgressStyleModel",
1912 | "state": {
1913 | "_model_module": "@jupyter-widgets/controls",
1914 | "_model_module_version": "1.5.0",
1915 | "_model_name": "ProgressStyleModel",
1916 | "_view_count": null,
1917 | "_view_module": "@jupyter-widgets/base",
1918 | "_view_module_version": "1.2.0",
1919 | "_view_name": "StyleView",
1920 | "bar_color": null,
1921 | "description_width": ""
1922 | }
1923 | },
1924 | "e8d2d9fa065f4a13b614dd8c4a72f824": {
1925 | "model_module": "@jupyter-widgets/base",
1926 | "model_module_version": "1.2.0",
1927 | "model_name": "LayoutModel",
1928 | "state": {
1929 | "_model_module": "@jupyter-widgets/base",
1930 | "_model_module_version": "1.2.0",
1931 | "_model_name": "LayoutModel",
1932 | "_view_count": null,
1933 | "_view_module": "@jupyter-widgets/base",
1934 | "_view_module_version": "1.2.0",
1935 | "_view_name": "LayoutView",
1936 | "align_content": null,
1937 | "align_items": null,
1938 | "align_self": null,
1939 | "border": null,
1940 | "bottom": null,
1941 | "display": null,
1942 | "flex": null,
1943 | "flex_flow": null,
1944 | "grid_area": null,
1945 | "grid_auto_columns": null,
1946 | "grid_auto_flow": null,
1947 | "grid_auto_rows": null,
1948 | "grid_column": null,
1949 | "grid_gap": null,
1950 | "grid_row": null,
1951 | "grid_template_areas": null,
1952 | "grid_template_columns": null,
1953 | "grid_template_rows": null,
1954 | "height": null,
1955 | "justify_content": null,
1956 | "justify_items": null,
1957 | "left": null,
1958 | "margin": null,
1959 | "max_height": null,
1960 | "max_width": null,
1961 | "min_height": null,
1962 | "min_width": null,
1963 | "object_fit": null,
1964 | "object_position": null,
1965 | "order": null,
1966 | "overflow": null,
1967 | "overflow_x": null,
1968 | "overflow_y": null,
1969 | "padding": null,
1970 | "right": null,
1971 | "top": null,
1972 | "visibility": null,
1973 | "width": null
1974 | }
1975 | },
1976 | "ffe2904f8a2747638e11410a5474f9e4": {
1977 | "model_module": "@jupyter-widgets/controls",
1978 | "model_module_version": "1.5.0",
1979 | "model_name": "HTMLModel",
1980 | "state": {
1981 | "_dom_classes": [],
1982 | "_model_module": "@jupyter-widgets/controls",
1983 | "_model_module_version": "1.5.0",
1984 | "_model_name": "HTMLModel",
1985 | "_view_count": null,
1986 | "_view_module": "@jupyter-widgets/controls",
1987 | "_view_module_version": "1.5.0",
1988 | "_view_name": "HTMLView",
1989 | "description": "",
1990 | "description_tooltip": null,
1991 | "layout": "IPY_MODEL_9668a1ccd9d742349585cf3349cff5b6",
1992 | "placeholder": "",
1993 | "style": "IPY_MODEL_9073d7f2397f47d0a60bc828f69f93cf",
1994 | "value": " 0/0 [00:00<?, ?it/s]"
1995 | }
1996 | }
1997 | }
1998 | }
1999 | },
2000 | "nbformat": 4,
2001 | "nbformat_minor": 0
2002 | }
2003 |
--------------------------------------------------------------------------------