├── text ├── hf-datasets │ ├── raw-datasets │ │ └── .gitkeep │ ├── dedupe-datasets │ │ └── .gitkeep │ ├── postprocessing │ │ └── .gitkeep │ └── postprocessing-done │ │ └── .gitkeep ├── pretrain-llm │ ├── how-to-mosaic.png │ ├── README.md │ ├── prepare-madlad-400-ms.ipynb │ ├── prepare-starcoder.ipynb │ └── prepare-translation.ipynb ├── text_dedup │ ├── __init__.py │ └── utils │ │ ├── preprocess.py │ │ ├── __init__.py │ │ ├── union_find.py │ │ ├── tokenization.py │ │ ├── timer.py │ │ ├── analysis.py │ │ └── hashfunc.py ├── processing │ ├── text_dedup │ │ ├── __init__.py │ │ └── utils │ │ │ ├── preprocess.py │ │ │ ├── __init__.py │ │ │ ├── union_find.py │ │ │ ├── tokenization.py │ │ │ ├── timer.py │ │ │ ├── analysis.py │ │ │ └── hashfunc.py │ ├── README.md │ ├── utils.py │ ├── main.py │ └── function.py ├── yi │ └── README.md ├── mistral │ ├── README.md │ └── run-tokenizer.ipynb ├── tinyllama │ └── README.md ├── llama │ ├── README.md │ ├── prepare-dataset-1024.ipynb │ ├── prepare-dataset-2048.ipynb │ └── prepare-tokenizer.ipynb ├── pretrain-clm │ ├── README.md │ └── from-pyarrow-to-mosaic.ipynb ├── extra │ ├── process-lowyat.ipynb │ ├── process-data.gov.my.ipynb │ ├── sample-fineweb-edu.ipynb │ └── process-snapshot.ipynb ├── README.md ├── compare-tokens.ipynb ├── .gitignore └── madlad-400-ms │ ├── prepare-madlad-400-ms.ipynb │ ├── dedup-madlad-400-ms.ipynb │ └── postprocess-madlad-400-ms.ipynb ├── multilingual-tts ├── prepare │ ├── prepare-CORAA-MUPE-ASR.ipynb │ ├── prepare-ParlaSpeech-CZ.ipynb │ ├── prepare-ParlaSpeech-HR.ipynb │ ├── prepare-ParlaSpeech-PL.ipynb │ ├── prepare-WenetSpeech4TTS.ipynb │ └── prepare-MasriSpeech-Full.ipynb ├── README.md ├── embedding.py ├── convert_neucodec.py └── trim_silence.py ├── README.md ├── stt-whisper ├── .gitignore ├── README.md ├── force_alignment.py └── audioset_sliding.py ├── malaysian-short-instructions ├── .gitignore ├── keyword-location ├── negeri ├── keywords ├── dedup-questions-intents.ipynb └── dedup-questions.ipynb ├── 
speech-instructions ├── .gitignore ├── README.md ├── generate.sh ├── embedding.py ├── prepare-malaysian-podcast.ipynb ├── remote.sh ├── prepare-malaysian-others.ipynb └── prepare-malaysia-parliament.ipynb ├── emotional-malaysian-emilia ├── README.md ├── pitch_estimation.py ├── audioset_sliding.py └── audioset_sliding_v2.py ├── emilia-yodas ├── README.md └── convert_neucodec_emilia.py ├── LICENSE ├── speech-instructions-extra └── upload.ipynb └── .gitignore /text/hf-datasets/raw-datasets/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /text/hf-datasets/dedupe-datasets/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /text/hf-datasets/postprocessing/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /text/hf-datasets/postprocessing-done/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /multilingual-tts/prepare/prepare-CORAA-MUPE-ASR.ipynb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /multilingual-tts/prepare/prepare-ParlaSpeech-CZ.ipynb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /multilingual-tts/prepare/prepare-ParlaSpeech-HR.ipynb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/multilingual-tts/prepare/prepare-ParlaSpeech-PL.ipynb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | 3 | Our recipes to prepare datasets. -------------------------------------------------------------------------------- /stt-whisper/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | force_alignment 3 | *.parquet -------------------------------------------------------------------------------- /malaysian-short-instructions/.gitignore: -------------------------------------------------------------------------------- 1 | generate-questions* 2 | generate-answers* 3 | *.json 4 | *.parquet -------------------------------------------------------------------------------- /text/pretrain-llm/how-to-mosaic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malaysia-ai/dataset/HEAD/text/pretrain-llm/how-to-mosaic.png -------------------------------------------------------------------------------- /multilingual-tts/README.md: -------------------------------------------------------------------------------- 1 | # Multilingual-TTS 2 | 3 | Gather multilingual TTS dataset, everything pushed into https://huggingface.co/datasets/malaysia-ai/Multilingual-TTS. 
-------------------------------------------------------------------------------- /malaysian-short-instructions/keyword-location: -------------------------------------------------------------------------------- 1 | food 2 | attraction 3 | lifestyle 4 | culture 5 | shopping mall 6 | agama 7 | socioeconomy 8 | peluang pekerjaan 9 | infrastructure 10 | education 11 | technology 12 | business -------------------------------------------------------------------------------- /speech-instructions/.gitignore: -------------------------------------------------------------------------------- 1 | *.parquet 2 | embedding* 3 | *.json 4 | *.jsonl 5 | dedup-parliament 6 | dedup-podcasts 7 | dedup-others 8 | partition-instructions-part* 9 | tatabahasa* 10 | mallm* 11 | short-coding-* 12 | malaymmlu* -------------------------------------------------------------------------------- /text/text_dedup/__init__.py: -------------------------------------------------------------------------------- 1 | """Text deduplication simplified.""" 2 | 3 | import logging 4 | 5 | from rich.logging import RichHandler 6 | 7 | logger = logging.getLogger("text_dedup") 8 | logger.setLevel(logging.INFO) 9 | logger.addHandler(RichHandler(rich_tracebacks=True)) 10 | logger.propagate = False -------------------------------------------------------------------------------- /text/processing/text_dedup/__init__.py: -------------------------------------------------------------------------------- 1 | """Text deduplication simplified.""" 2 | 3 | import logging 4 | 5 | from rich.logging import RichHandler 6 | 7 | logger = logging.getLogger("text_dedup") 8 | logger.setLevel(logging.INFO) 9 | logger.addHandler(RichHandler(rich_tracebacks=True)) 10 | logger.propagate = False -------------------------------------------------------------------------------- /malaysian-short-instructions/negeri: -------------------------------------------------------------------------------- 1 | negeri johor 2 | negeri kedah 3 | negeri kelantan 
4 | negeri melaka 5 | negeri negeri sembilan 6 | negeri pahang 7 | negeri perak 8 | negeri perlis 9 | negeri pulau pinang 10 | negeri selangor 11 | negeri terengganu 12 | negeri sabah 13 | negeri sarawak 14 | kuala lumpur 15 | negeri labuan 16 | putrajaya -------------------------------------------------------------------------------- /emotional-malaysian-emilia/README.md: -------------------------------------------------------------------------------- 1 | # Emotional Malaysian Emilia 2 | 3 | Synthetic emotional labels for Malaysian Emilia. 4 | 5 | ## how to 6 | 7 | ### Predict AudioSet sliding window 8 | 9 | ```bash 10 | CUDA_VISIBLE_DEVICES=0 \ 11 | python3 audioset_sliding_v2.py --path 'malaysian-podcast_processed/**/*.mp3' --global-index 1 --local-index 0 12 | ``` 13 | 14 | ### Predict Emotion -------------------------------------------------------------------------------- /emilia-yodas/README.md: -------------------------------------------------------------------------------- 1 | ## Convert to audio tokens 2 | 3 | ```bash 4 | OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 \ 5 | python3 convert_neucodec_batch.py --file 'emilia-audio.json' --replication 2 6 | ``` 7 | 8 | But we prefer to use [convert_neucodec_emilia.py](convert_neucodec_emilia.py) on GH200, 9 | 10 | ```bash 11 | OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 python3 convert_neucodec_emilia.py --file 'emilia-audio.json' --replication 13 12 | ``` 13 | 14 | Way faster! -------------------------------------------------------------------------------- /text/yi/README.md: -------------------------------------------------------------------------------- 1 | # Prepare dataset for Yi FPF 2 | 3 | This step prepares the dataset for FPF Yi models. 4 | 5 | ## how-to 6 | 7 | 1. Run [combine-dataset.ipynb](combine-dataset.ipynb), 8 | 9 | This will combine most datasets into 1 JSONL file. 10 | 11 | - 41 GB. 12 | 13 | 2. Run [convert-mosaic.ipynb](prepare-tokenizer.ipynb), 14 | 15 | This will tokenize the dataset and convert it into mosaic format. 
16 | 17 | 3. Run [combine-mosaic-all.ipynb](combine-mosaic-all.ipynb), 18 | 19 | This will combine all mosaic partitions into one mosaic folder, total 14114934784 tokens. -------------------------------------------------------------------------------- /text/mistral/README.md: -------------------------------------------------------------------------------- 1 | # Prepare dataset for Mistral FPF 2 | 3 | This step prepares the dataset for the FPF Mistral model. 4 | 5 | ## how-to 6 | 7 | 1. Run [mistral/combine-mistral.ipynb](mistral/combine-mistral.ipynb), 8 | 9 | This will combine most datasets into 1 JSONL file. 10 | 11 | - 32.6 GB. 12 | 13 | 2. Run [prepare-tokenizer.ipynb](prepare-tokenizer.ipynb), 14 | 15 | This will tokenize and cache the dataset. 16 | 17 | 3. Run [prepare-dataset-4096.ipynb](prepare-dataset-4096.ipynb), 18 | 19 | This will partition the tokenized dataset into 4096 context length. -------------------------------------------------------------------------------- /text/tinyllama/README.md: -------------------------------------------------------------------------------- 1 | # Prepare dataset for TinyLlama FPF 2 | 3 | This step prepares the dataset for FPF TinyLlama models. 4 | 5 | ## how-to 6 | 7 | 1. Run [combine-dataset.ipynb](combine-dataset.ipynb), 8 | 9 | This will combine most datasets into 1 JSONL file. 10 | 11 | - 41 GB. 12 | 13 | 2. Run [convert-mosaic.ipynb](prepare-tokenizer.ipynb), 14 | 15 | This will tokenize the dataset and convert it into mosaic format. 16 | 17 | 3. Run [combine-mosaic-all.ipynb](combine-mosaic-all.ipynb), 18 | 19 | This will combine all mosaic partitions into one mosaic folder, total 14349328384 tokens. -------------------------------------------------------------------------------- /text/llama/README.md: -------------------------------------------------------------------------------- 1 | # Prepare dataset for Llama2 FPF 2 | 3 | This step prepares the dataset for FPF Llama2 models. 4 | 5 | ## how-to 6 | 7 | 1. 
Run [combine-v2.ipynb](combine-v2.ipynb), 8 | 9 | This will combine most datasets into 1 JSONL file. 10 | 11 | - 31.4 GB. 12 | 13 | 2. Run [prepare-tokenizer.ipynb](prepare-tokenizer.ipynb), 14 | 15 | This will tokenize and cache the dataset. 16 | 17 | 3. Run [prepare-dataset-2048.ipynb](prepare-dataset-2048.ipynb), 18 | 19 | This will partition the tokenized dataset into 2048 context length. 20 | 21 | 4. Run [prepare-dataset-32768.ipynb](prepare-dataset-32768.ipynb), 22 | 23 | This will partition the tokenized dataset into 32768 context length. -------------------------------------------------------------------------------- /text/pretrain-clm/README.md: -------------------------------------------------------------------------------- 1 | # Pretrain CLM 2 | 3 | This is to pretrain 100M - 500M parameter CLMs. All steps were done using the Standard_F48s_v2 node size. 4 | 5 | This step prepares the dataset for pretraining models from scratch. 6 | 7 | ## how-to 8 | 9 | 1. Run [pretrain/combine-lm.ipynb](pretrain/combine-lm.ipynb), 10 | 11 | This will combine all datasets into 1 JSONL file. 12 | 13 | - 81 GB. 14 | - 16994238464 tokens. 15 | 16 | 2. Run [pretrain/tokenizer-4096.ipynb](pretrain/tokenizer-4096.ipynb), 17 | 18 | This will tokenize the dataset and partition it into 4096 context length. 19 | 20 | 3. Run [pretrain/from-pyarrow-to-mosaic.ipynb](pretrain/from-pyarrow-to-mosaic.ipynb), 21 | 22 | This will convert PyArrow streaming format into MosaicML streaming format. 23 | 24 | 4. Run [pretrain/combine-mosaicml.ipynb](pretrain/combine-mosaicml.ipynb), 25 | 26 | This will combine multiple MosaicML streaming folders into 1 folder. 
-------------------------------------------------------------------------------- /malaysian-short-instructions/keywords: -------------------------------------------------------------------------------- 1 | react js 2 | vue js 3 | vanilla javascript 4 | websocket 5 | node js 6 | svelte 7 | next js 8 | express js 9 | angular js 10 | jquery 11 | d3 js 12 | python matplotlib 13 | python pandas 14 | python dask 15 | python scipy 16 | python numpy 17 | python keras 18 | python flask 19 | python fastapi 20 | python request 21 | python async 22 | python scikit learn 23 | python dask 24 | python distributed system 25 | pytorch 26 | pyspark 27 | apache spark 28 | apache hadoop 29 | apache hive 30 | apache kafka 31 | apache yarn 32 | apache flink 33 | apache cassandra 34 | apache airflow 35 | apache druid 36 | c++ 37 | java 38 | rust 39 | kotlin 40 | swift 41 | cuda 42 | go 43 | go distributed system 44 | kubernetes 45 | bash 46 | docker 47 | dockerfile 48 | nginx 49 | tcp 50 | postgresql 51 | mysql 52 | oracle db 53 | elasticsearch 54 | nosql 55 | clickhouse 56 | terraform 57 | fortran 58 | slurm 59 | openmpi -------------------------------------------------------------------------------- /text/text_dedup/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | import regex as re 2 | 3 | DIGIT_RE = re.compile(r"\d") 4 | PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(r"[\p{P}\p{C}\p{S}]+") 5 | 6 | 7 | def normalize(line: str) -> str: 8 | """ 9 | Normalize a line of text. Source: https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/text_normalizer.py#L180 10 | 11 | Parameters 12 | ---------- 13 | line : str 14 | The line of text to normalize. 15 | 16 | Returns 17 | ------- 18 | str 19 | The normalized line of text. 
20 | 21 | Examples 22 | -------- 23 | >>> normalize("Hello, world!") 24 | 'hello world' 25 | >>> normalize("Hello, 123!\\n\\t\\b") 26 | 'hello 000' 27 | """ 28 | line = line.strip() 29 | if not line: 30 | return line 31 | line = line.lower() 32 | line = DIGIT_RE.sub("0", line) 33 | line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line) 34 | return line -------------------------------------------------------------------------------- /text/processing/text_dedup/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | import regex as re 2 | 3 | DIGIT_RE = re.compile(r"\d") 4 | PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(r"[\p{P}\p{C}\p{S}]+") 5 | 6 | 7 | def normalize(line: str) -> str: 8 | """ 9 | Normalize a line of text. Source: https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/text_normalizer.py#L180 10 | 11 | Parameters 12 | ---------- 13 | line : str 14 | The line of text to normalize. 15 | 16 | Returns 17 | ------- 18 | str 19 | The normalized line of text. 
20 | 21 | Examples 22 | -------- 23 | >>> normalize("Hello, world!") 24 | 'hello world' 25 | >>> normalize("Hello, 123!\\n\\t\\b") 26 | 'hello 000' 27 | """ 28 | line = line.strip() 29 | if not line: 30 | return line 31 | line = line.lower() 32 | line = DIGIT_RE.sub("0", line) 33 | line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line) 34 | return line -------------------------------------------------------------------------------- /text/text_dedup/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from text_dedup.utils.add_args import add_bloom_filter_args 2 | from text_dedup.utils.add_args import add_exact_hash_args 3 | from text_dedup.utils.add_args import add_io_args 4 | from text_dedup.utils.add_args import add_meta_args 5 | from text_dedup.utils.add_args import add_minhash_args 6 | from text_dedup.utils.add_args import add_sa_args 7 | from text_dedup.utils.add_args import add_simhash_args 8 | from text_dedup.utils.hashfunc import sha1_hash 9 | from text_dedup.utils.hashfunc import xxh3_hash 10 | from text_dedup.utils.timer import Timer 11 | from text_dedup.utils.tokenization import ngrams 12 | from text_dedup.utils.union_find import UnionFind 13 | 14 | __all__ = [ 15 | "add_bloom_filter_args", 16 | "add_exact_hash_args", 17 | "add_io_args", 18 | "add_meta_args", 19 | "add_minhash_args", 20 | "add_sa_args", 21 | "add_simhash_args", 22 | "Timer", 23 | "ngrams", 24 | "UnionFind", 25 | "sha1_hash", 26 | "xxh3_hash", 27 | ] -------------------------------------------------------------------------------- /text/text_dedup/utils/union_find.py: -------------------------------------------------------------------------------- 1 | class UnionFind: 2 | """ 3 | A data structure for maintaining disjoint sets. This helps build connected components for given duplicate pairs. 
4 | 5 | Examples 6 | -------- 7 | >>> uf = UnionFind() 8 | >>> uf.union(1, 2) 9 | >>> uf.union(2, 3) 10 | >>> uf.union(4, 5) 11 | >>> uf.find(1) 12 | 1 13 | >>> uf.find(2) 14 | 1 15 | >>> uf.find(3) 16 | 1 17 | >>> uf.find(4) 18 | 4 19 | >>> uf.find(5) 20 | 4 21 | """ 22 | 23 | def __init__(self): 24 | self.parent = {} 25 | 26 | def find(self, x): 27 | if x not in self.parent: 28 | self.parent[x] = x 29 | return x 30 | 31 | if self.parent[x] != x: 32 | self.parent[x] = self.find(self.parent[x]) 33 | 34 | return self.parent[x] 35 | 36 | def union(self, x, y): 37 | px = self.find(x) 38 | py = self.find(y) 39 | self.parent[px] = self.parent[py] = min(px, py) -------------------------------------------------------------------------------- /text/processing/text_dedup/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from text_dedup.utils.add_args import add_bloom_filter_args 2 | from text_dedup.utils.add_args import add_exact_hash_args 3 | from text_dedup.utils.add_args import add_io_args 4 | from text_dedup.utils.add_args import add_meta_args 5 | from text_dedup.utils.add_args import add_minhash_args 6 | from text_dedup.utils.add_args import add_sa_args 7 | from text_dedup.utils.add_args import add_simhash_args 8 | from text_dedup.utils.hashfunc import sha1_hash 9 | from text_dedup.utils.hashfunc import xxh3_hash 10 | from text_dedup.utils.timer import Timer 11 | from text_dedup.utils.tokenization import ngrams 12 | from text_dedup.utils.union_find import UnionFind 13 | 14 | __all__ = [ 15 | "add_bloom_filter_args", 16 | "add_exact_hash_args", 17 | "add_io_args", 18 | "add_meta_args", 19 | "add_minhash_args", 20 | "add_sa_args", 21 | "add_simhash_args", 22 | "Timer", 23 | "ngrams", 24 | "UnionFind", 25 | "sha1_hash", 26 | "xxh3_hash", 27 | ] -------------------------------------------------------------------------------- /text/processing/text_dedup/utils/union_find.py: 
-------------------------------------------------------------------------------- 1 | class UnionFind: 2 | """ 3 | A data structure for maintaining disjoint sets. This helps build connected components for given duplicate pairs. 4 | 5 | Examples 6 | -------- 7 | >>> uf = UnionFind() 8 | >>> uf.union(1, 2) 9 | >>> uf.union(2, 3) 10 | >>> uf.union(4, 5) 11 | >>> uf.find(1) 12 | 1 13 | >>> uf.find(2) 14 | 1 15 | >>> uf.find(3) 16 | 1 17 | >>> uf.find(4) 18 | 4 19 | >>> uf.find(5) 20 | 4 21 | """ 22 | 23 | def __init__(self): 24 | self.parent = {} 25 | 26 | def find(self, x): 27 | if x not in self.parent: 28 | self.parent[x] = x 29 | return x 30 | 31 | if self.parent[x] != x: 32 | self.parent[x] = self.find(self.parent[x]) 33 | 34 | return self.parent[x] 35 | 36 | def union(self, x, y): 37 | px = self.find(x) 38 | py = self.find(y) 39 | self.parent[px] = self.parent[py] = min(px, py) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Malaysia-AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /text/text_dedup/utils/tokenization.py: -------------------------------------------------------------------------------- 1 | from itertools import tee 2 | from typing import List 3 | from typing import Text 4 | 5 | 6 | def ngrams(sequence: List[Text], n: int, min_length: int = 5): 7 | """ 8 | Return the ngrams generated from a sequence of items, as an iterator. 9 | 10 | This is a modified version of nltk.util.ngrams. 11 | 12 | Parameters 13 | ---------- 14 | sequence : List[Text] 15 | The sequence of items. 16 | n : int 17 | The length of each ngram. 18 | min_length : int, optional 19 | The minimum length of each ngram, by default 5 20 | 21 | Returns 22 | ------- 23 | iterator 24 | The ngrams. 
25 | 26 | Examples 27 | -------- 28 | >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=1)) 29 | [('a', 'b'), ('b', 'c'), ('c', 'd')] 30 | >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=5)) 31 | [] 32 | >>> list(ngrams(["a", "b"], 3, min_length=1)) 33 | [('a', 'b')] 34 | """ 35 | if len(sequence) < min_length: 36 | return [] 37 | if len(sequence) < n: 38 | return [tuple(sequence)] 39 | iterables = tee(iter(sequence), n) 40 | for i, sub_iterable in enumerate(iterables): 41 | for _ in range(i): 42 | next(sub_iterable, None) 43 | return zip(*iterables) -------------------------------------------------------------------------------- /text/processing/text_dedup/utils/tokenization.py: -------------------------------------------------------------------------------- 1 | from itertools import tee 2 | from typing import List 3 | from typing import Text 4 | 5 | 6 | def ngrams(sequence: List[Text], n: int, min_length: int = 5): 7 | """ 8 | Return the ngrams generated from a sequence of items, as an iterator. 9 | 10 | This is a modified version of nltk.util.ngrams. 11 | 12 | Parameters 13 | ---------- 14 | sequence : List[Text] 15 | The sequence of items. 16 | n : int 17 | The length of each ngram. 18 | min_length : int, optional 19 | The minimum length of each ngram, by default 5 20 | 21 | Returns 22 | ------- 23 | iterator 24 | The ngrams. 
25 | 26 | Examples 27 | -------- 28 | >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=1)) 29 | [('a', 'b'), ('b', 'c'), ('c', 'd')] 30 | >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=5)) 31 | [] 32 | >>> list(ngrams(["a", "b"], 3, min_length=1)) 33 | [('a', 'b')] 34 | """ 35 | if len(sequence) < min_length: 36 | return [] 37 | if len(sequence) < n: 38 | return [tuple(sequence)] 39 | iterables = tee(iter(sequence), n) 40 | for i, sub_iterable in enumerate(iterables): 41 | for _ in range(i): 42 | next(sub_iterable, None) 43 | return zip(*iterables) -------------------------------------------------------------------------------- /text/pretrain-llm/README.md: -------------------------------------------------------------------------------- 1 | # Pretrain LLM 2 | 3 | This is to pretrain 1B - 13B parameters LLM. All steps done using Standard_F48s_v2 node size. 4 | 5 | ## how-to not use HuggingFace datasets 6 | 7 | It stream memory mapped file and after that concat, https://github.com/huggingface/datasets/blob/60bdf3005d1dc0b26da8e5949721b20d932eaad6/src/datasets/table.py#L51, super super slow, and you are wondering, is the script stuck? Yes, it is waiting for pyarrow streaming. 8 | 9 | So we try our own approached, 10 | 11 | 12 | 13 | https://drive.google.com/file/d/1dSQ7KQs_x7aCTNVXgMESIqTwEoAZt-OK/view?usp=sharing 14 | 15 | 1. Split JSONL file into smaller JSONL files. 16 | 2. Each smaller JSONL files run in multiprocessing to convert into Mosaic format. 17 | 3. Merge smaller Mosaic files into one Mosaic file. 18 | 19 | ## total tokens 20 | 21 | 1. [prepare-dedup-text-dataset-4096.ipynb](prepare-dedup-text-dataset-4096.ipynb), 31702310912 22 | 2. [prepare-starcoder-4096.ipynb](prepare-starcoder-4096.ipynb), 40981254144 23 | 3. [prepare-madlad-400-4096.ipynb](prepare-madlad-400-4096.ipynb), 14983720960 24 | 4. [prepare-instructions.ipynb](prepare-instructions.ipynb), 1577877504 25 | 5. 
[prepare-extra.ipynb](prepare-extra.ipynb), 1140461568 26 | 27 | In total, 90B tokens. We uploaded the dataset to https://huggingface.co/datasets/malaysia-ai/mosaic-combine-all, so you can use it directly with https://docs.mosaicml.com/projects/streaming/en/latest/index.html -------------------------------------------------------------------------------- /speech-instructions/README.md: -------------------------------------------------------------------------------- 1 | # Speech Instructions 2 | 3 | ## how to prepare 4 | 5 | ### 1. Speaker dedup 6 | 7 | 1. Prepare dataset to dedup, 8 | 9 | - [prepare-malaysia-parliament.ipynb](prepare-malaysia-parliament.ipynb). 10 | - [prepare-malaysian-podcast.ipynb](prepare-malaysian-podcast.ipynb). 11 | - [prepare-malaysian-others.ipynb](prepare-malaysian-others.ipynb). 12 | 13 | 2. Convert to embedding, 14 | 15 | We use speaker embedding from https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_large 16 | 17 | ```bash 18 | CUDA_VISIBLE_DEVICES=1,2 \ 19 | python3.10 embedding.py \ 20 | --filename filtered-politicians.parquet \ 21 | --replication 3 22 | 23 | CUDA_VISIBLE_DEVICES=1,2 \ 24 | python3.10 embedding.py \ 25 | --filename filtered-podcast.parquet \ 26 | --replication 3 --folder embedding-podcast 27 | 28 | CUDA_VISIBLE_DEVICES=0,2 \ 29 | python3.10 embedding.py \ 30 | --filename filtered-others.parquet \ 31 | --replication 3 --folder embedding-others 32 | ``` 33 | 34 | 3. Merge and dedup, 35 | 36 | - [dedup-parliament.ipynb](dedup-parliament.ipynb). 37 | - [dedup-podcasts.ipynb](dedup-podcasts.ipynb). 38 | 39 | ### 2. Populate instructions 40 | 41 | All datasets are from https://huggingface.co/collections/mesolitica/malaysian-synthetic-dataset-656c2673fe7fe0b1e9e25fe2; then follow [filter-instructions.ipynb](filter-instructions.ipynb). 42 | 43 | ### 3. Generate synthetic voice 44 | 45 | ```bash 46 | bash generate.sh 47 | ``` 48 | 49 | **Modify it appropriately based on your local GPUs**. 
-------------------------------------------------------------------------------- /speech-instructions/generate.sh: -------------------------------------------------------------------------------- 1 | for i in {0..3}; do 2 | screen -S "partition-instructions-part-7_$i" -X quit 2>/dev/null 3 | screen -dmS "partition-instructions-part-7_$i" bash -c "cd /home/husein/ssd3/dataset/speech-instructions && \ 4 | CUDA_VISIBLE_DEVICES=0 \ 5 | python3.10 generate.py \ 6 | --input_file \"partition-instructions-part-7.json\" \ 7 | --folder \"partition-instructions-part-7\" \ 8 | --global_index 4 \ 9 | --index $i" 10 | done 11 | 12 | for i in {0..3}; do 13 | screen -S "partition-instructions-part-15_$i" -X quit 2>/dev/null 14 | screen -dmS "partition-instructions-part-15_$i" bash -c "cd /home/husein/ssd3/dataset/speech-instructions && \ 15 | CUDA_VISIBLE_DEVICES=2 \ 16 | python3.10 generate.py \ 17 | --input_file \"partition-instructions-part-15.json\" \ 18 | --folder \"partition-instructions-part-15\" \ 19 | --global_index 4 \ 20 | --index $i" 21 | done 22 | 23 | for i in {0..3}; do 24 | screen -S "tatabahasa_$i" -X quit 2>/dev/null 25 | screen -dmS "tatabahasa_$i" bash -c "cd /home/husein/ssd3/dataset/speech-instructions && \ 26 | CUDA_VISIBLE_DEVICES=2 \ 27 | python3.10 generate.py \ 28 | --input_file \"tatabahasa.json\" \ 29 | --folder \"tatabahasa-v2\" \ 30 | --global_index 4 \ 31 | --index $i --threshold -9 --maxlen 300 --retry 10" 32 | done 33 | 34 | for i in {0..3}; do 35 | screen -S "malaymmlu_$i" -X quit 2>/dev/null 36 | screen -dmS "malaymmlu_$i" bash -c "cd /home/husein/ssd3/dataset/speech-instructions && \ 37 | CUDA_VISIBLE_DEVICES=2 \ 38 | python3.10 generate.py \ 39 | --input_file \"malaymmlu.json\" \ 40 | --folder \"malaymmlu\" \ 41 | --global_index 4 \ 42 | --index $i --threshold -9 --maxlen 300 --retry 10" 43 | done -------------------------------------------------------------------------------- /text/text_dedup/utils/timer.py: 
-------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class TimerContext: 5 | def __init__(self, timer: "Timer", name: str): 6 | self.timer = timer 7 | self.name = name 8 | self.start_time = None 9 | 10 | def __enter__(self): 11 | self.start_time = time.time() 12 | 13 | def __exit__(self, exc_type, exc_val, exc_tb): 14 | if any([exc_type, exc_val, exc_tb]): 15 | raise exc_val 16 | self.timer.elapsed_times[self.name] = time.time() - self.start_time 17 | 18 | 19 | class Timer: 20 | """ 21 | A simple timer that tracks the elapsed time of each context. 22 | 23 | Examples 24 | -------- 25 | >>> t = Timer() 26 | >>> with t("test"): 27 | ... time.sleep(1) 28 | >>> assert int(t.elapsed_times.get("test", 0)) >= 1, "The elapsed time should be 1 second." 29 | """ 30 | 31 | def __init__(self): 32 | self.elapsed_times = {} 33 | 34 | def __call__(self, name: str) -> TimerContext: 35 | """ 36 | Create a context with the given name. 37 | 38 | Parameters 39 | ---------- 40 | name: str 41 | The name of the context. 42 | 43 | Returns 44 | ------- 45 | TimerContext 46 | The context. 47 | 48 | Examples 49 | -------- 50 | >>> t = Timer() 51 | >>> with t("test"): 52 | ... time.sleep(1) 53 | >>> assert int(t.elapsed_times.get("test", 0)) == 1, "The elapsed time should be 1 second." 54 | >>> with t("test2"): 55 | ... time.sleep(2) 56 | >>> assert int(t.elapsed_times.get("test2", 0)) == 2, "The elapsed time should be 2 seconds." 
57 | """ 58 | return TimerContext(self, name) -------------------------------------------------------------------------------- /text/processing/text_dedup/utils/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class TimerContext: 5 | def __init__(self, timer: "Timer", name: str): 6 | self.timer = timer 7 | self.name = name 8 | self.start_time = None 9 | 10 | def __enter__(self): 11 | self.start_time = time.time() 12 | 13 | def __exit__(self, exc_type, exc_val, exc_tb): 14 | if any([exc_type, exc_val, exc_tb]): 15 | raise exc_val 16 | self.timer.elapsed_times[self.name] = time.time() - self.start_time 17 | 18 | 19 | class Timer: 20 | """ 21 | A simple timer that tracks the elapsed time of each context. 22 | 23 | Examples 24 | -------- 25 | >>> t = Timer() 26 | >>> with t("test"): 27 | ... time.sleep(1) 28 | >>> assert int(t.elapsed_times.get("test", 0)) >= 1, "The elapsed time should be 1 second." 29 | """ 30 | 31 | def __init__(self): 32 | self.elapsed_times = {} 33 | 34 | def __call__(self, name: str) -> TimerContext: 35 | """ 36 | Create a context with the given name. 37 | 38 | Parameters 39 | ---------- 40 | name: str 41 | The name of the context. 42 | 43 | Returns 44 | ------- 45 | TimerContext 46 | The context. 47 | 48 | Examples 49 | -------- 50 | >>> t = Timer() 51 | >>> with t("test"): 52 | ... time.sleep(1) 53 | >>> assert int(t.elapsed_times.get("test", 0)) == 1, "The elapsed time should be 1 second." 54 | >>> with t("test2"): 55 | ... time.sleep(2) 56 | >>> assert int(t.elapsed_times.get("test2", 0)) == 2, "The elapsed time should be 2 seconds." 57 | """ 58 | return TimerContext(self, name) -------------------------------------------------------------------------------- /stt-whisper/README.md: -------------------------------------------------------------------------------- 1 | # STT Whisper 2 | 3 | 1. 
We provide segment-level and word-level timestamps for, 4 | - [Malaysian Emilia Dialects](https://huggingface.co/datasets/mesolitica/Malaysian-Emilia#malaysian-dialect). 5 | - [Speech Instructions](https://huggingface.co/datasets/malaysia-ai/Speech-Instructions). 6 | 2. Synthetic merging of different contexts, [synthetic-context-switching-word-timestamp.ipynb](synthetic-context-switching-word-timestamp.ipynb). 7 | 8 | ## Sliding AudioSet 9 | 10 | ```bash 11 | CUDA_VISIBLE_DEVICES=0 \ 12 | python3 audioset_sliding.py --file 'prepared-pseudolabel.jsonl' --global-index 4 --local-index 0 13 | CUDA_VISIBLE_DEVICES=1 \ 14 | python3 audioset_sliding.py --file 'prepared-pseudolabel.jsonl' --global-index 4 --local-index 1 15 | CUDA_VISIBLE_DEVICES=2 \ 16 | python3 audioset_sliding.py --file 'prepared-pseudolabel.jsonl' --global-index 4 --local-index 2 17 | CUDA_VISIBLE_DEVICES=3 \ 18 | python3 audioset_sliding.py --file 'prepared-pseudolabel.jsonl' --global-index 4 --local-index 3 19 | ``` 20 | 21 | ## Speech Instructions 22 | 23 | 1. Run force alignment, 24 | 25 | ```bash 26 | CUDA_VISIBLE_DEVICES=2 \ 27 | python3.10 force_alignment.py \ 28 | --filename 'prepare-force-alignment.json' \ 29 | --language 'ms' \ 30 | --replication 3 31 | ``` 32 | 33 | 2. Prepare dataset, 34 | 35 | - Segment level, [speech-instructions-segment-timestamps.ipynb](speech-instructions-segment-timestamps.ipynb). 36 | - Word level, [speech-instructions-word-timestamps.ipynb](speech-instructions-word-timestamps.ipynb). 37 | 38 | ## Malaysian Emilia Dialects 39 | 40 | 1. Prepare dataset, 41 | 42 | Force alignment has already been calculated at [mesolitica/Malaysian-Emilia-annotated/dialects_processed_alignment.zip](https://huggingface.co/datasets/mesolitica/Malaysian-Emilia-annotated/blob/main/dialects_processed_alignment.zip), so only the dataset preparation step is needed. 43 | 44 | - Segment level, [dialects-segment-timestamps.ipynb](dialects-segment-timestamps.ipynb). 45 | - Word level, [dialects-word-timestamps.ipynb](dialects-word-timestamps.ipynb). 
-------------------------------------------------------------------------------- /text/extra/process-lowyat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "47588232", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# !git clone https://huggingface.co/datasets/mesolitica/crawl-lowyat" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "44f963ff", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from glob import glob\n", 21 | "import json" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 5, 27 | "id": "43c7cfc0", 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "63" 34 | ] 35 | }, 36 | "execution_count": 5, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "files = glob('crawl-lowyat/*.json')\n", 43 | "files = [f for f in files if '-topics' not in f]\n", 44 | "len(files)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 12, 50 | "id": "2a694ba8", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "with open('hf-datasets/raw-datasets/lowyat.jsonl', 'w') as fopen_l:\n", 55 | " for f in files:\n", 56 | " with open(f) as fopen:\n", 57 | " data = json.load(fopen)\n", 58 | " for d in data:\n", 59 | " fopen_l.write(f'{json.dumps(d)}\\n')" 60 | ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3 (ipykernel)", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.10.12" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 5 84 | } 
def chunks(l, devices, folder):
    """
    Split `l` into one contiguous slice per device, as evenly as possible.

    The first `len(l) % len(devices)` slices get one extra element. Each
    yielded item is a `(slice, device, folder)` tuple; a slice may be empty
    when there are fewer elements than devices.
    """
    base, leftover = divmod(len(l), len(devices))
    begin = 0
    for position, device in enumerate(devices):
        # Earlier devices absorb the remainder, one extra element each.
        stop = begin + base + (1 if position < leftover else 0)
        yield (l[begin:stop], device, folder)
        begin = stop
@click.command()
@click.option('--file', required = True, help = 'JSON file containing a list of rows to embed')
@click.option('--replication', default = 1, help = 'number of worker processes per device')
def main(file, replication):
    """
    Fan the rows of a JSON file out to one worker process per device slot.

    Output goes to a folder named after `file` with `.json` removed and
    `_embedding` appended. Devices come from CUDA_VISIBLE_DEVICES when it is
    set, otherwise from `torch.cuda.device_count()`; the device list is
    repeated `replication` times so several workers can share one device.
    """
    folder = file.replace('.json', '') + '_embedding'
    os.makedirs(folder, exist_ok = True)
    devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if devices is None:
        devices = list(range(torch.cuda.device_count()))
    else:
        devices = [d.strip() for d in devices.split(',')]

    devices = replication * devices
    print(devices)
    if not devices:
        # Pool(0) and chunks() would otherwise fail with an unrelated error.
        raise click.UsageError('no devices available; check CUDA_VISIBLE_DEVICES and --replication')

    with open(file) as fopen:
        rows = json.load(fopen)
    # Keep the original position so each worker can name its output `{index}.npy`.
    rows = list(enumerate(rows))

    df_split = chunks(rows, devices, folder)
    pool = Pool(len(devices))
    try:
        pool.map(loop, df_split)
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()
65 | "name": "ipython", 66 | "version": 3 67 | }, 68 | "file_extension": ".py", 69 | "mimetype": "text/x-python", 70 | "name": "python", 71 | "nbconvert_exporter": "python", 72 | "pygments_lexer": "ipython3", 73 | "version": "3.10.12" 74 | } 75 | }, 76 | "nbformat": 4, 77 | "nbformat_minor": 5 78 | } 79 | -------------------------------------------------------------------------------- /text/processing/README.md: -------------------------------------------------------------------------------- 1 | # text-dataset-dedup-py 2 | 3 | ## Description 4 | The `text-dataset-dedup-py` repository contains a Python script that performs a deduplication process on a text dataset. This process is implemented based on the code provided in the [Jupyter Notebook](https://github.com/malaysia-ai/text-dataset-dedup). 5 | 6 | ## How to Use 7 | Follow the steps below to use the deduplication script: 8 | 9 | 1. **Change Directory**: Navigate to the `/processing` directory within this repository. 10 | 11 | 2. **Prepare the Command**: Once in the `/processing` directory, prepare the command to execute the deduplication process. 
12 | 13 | Single Dataset (from Huggingface URL) 14 | ```bash 15 | python3 main.py --dataset "piston.my" --url_dataset "https://huggingface.co/datasets/mesolitica/crawl-my-website/resolve/main/piston.my.jsonl" --master_folder "/home/ubuntu/za/datasets04" --text_key reviews_html reviews_text 16 | ``` 17 | 18 | Single Dataset (manually cleaned) 19 | ```bash 20 | python3 main.py --dataset "murai.my" --clean_file_path "/home/ubuntu/faiq913_folder/Cleaned Huggingface datasets/murai.my/murai_my_clean.jsonl" --master_folder "/home/ubuntu/za/datasets04" 21 | ``` 22 | 23 | If you have multiple datasets from multiple Huggingface URLs, 24 | ```bash 25 | python3 main.py \ 26 | --master_folder "/home/ubuntu/za/datasets04" \ 27 | --dataset_with_link \ 28 | piston.my,https://huggingface.co/datasets/mesolitica/crawl-my-website/resolve/main/piston.my.json \ 29 | piston2,https://huggingface.co/datasets/mesolitica/crawl-my-website/resolve/main/piston.my.jsonl \ 30 | piston3,https://huggingface.co/datasets/mesolitica/crawl-my-website/resolve/main/piston.my.jsonl 31 | ``` 32 | 33 | ### Arguments 34 | 1. `dataset`: Name of the dataset folder inside /dataset where the script will find data. 35 | 2. `url_dataset`: URL of the JSONL file containing data to be processed (script only handles JSONL files). 36 | 3. `master_folder`: Absolute path to the master directory where the deduplication process will occur. 37 | 4. `dataset_with_link`: Format {dataset_name},{dataset_url} {dataset_name02},{dataset_url02} 38 | 5. `text_key`: To add own custom key if you encounter an issue `dataset not in standard key-value. 
must have ...` 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /text/README.md: -------------------------------------------------------------------------------- 1 | # pretrain-text-dataset 2 | 3 | Prepare the pretraining dataset gathered from https://github.com/users/huseinzol05/projects/1 4 | 5 | All deduplicated and postprocessed datasets are uploaded at https://huggingface.co/datasets/malaysia-ai/pretrain-text-dataset 6 | 7 | ## Server spec 8 | 9 | 1. 24 cores. 10 | 2. 220 GB RAM. 11 | 12 | **Deduping can blow up memory usage, easily eating up to 30 GB if the dataset is > 10GB, so beware**. 13 | 14 | ## Download dataset 15 | 16 | 1. Most downloads are straightforward, 17 | 18 | ```bash 19 | wget https://huggingface.co/datasets/mesolitica/crawl-amanz-my/resolve/main/parsed.jsonl -O hf-datasets/raw-datasets/amanz.jsonl 20 | ``` 21 | 22 | But sometimes we have to do some preprocessing first, like, 23 | 24 | - [process-lowyat.ipynb](extra/process-lowyat.ipynb) 25 | - [process-data.gov.my.ipynb](extra/process-data.gov.my.ipynb) 26 | - [process-snapshot.ipynb](extra/process-snapshot.ipynb) 27 | 28 | We save raw datasets at [hf-datasets/raw-datasets](hf-datasets/raw-datasets). 29 | 30 | ## Text dedup 31 | 32 | 1. Clone [remove-duplicate-text-dataset.ipynb](remove-duplicate-text-dataset.ipynb) to a new notebook, e.g., [remove-duplicate-text-dataset-lowyat.ipynb](remove-duplicate-text-dataset-lowyat.ipynb). 33 | 34 | This notebook uses [text_dedup](text_dedup) to do the dedup, borrowed from https://github.com/ChenghaoMou/text-dedup 35 | 36 | All deduplicated datasets are saved at [hf-datasets/dedupe-datasets](hf-datasets/dedupe-datasets). 37 | 38 | ## Postprocessing 39 | 40 | 1. Run [postprocessing.ipynb](postprocessing.ipynb) to start postprocessing, 41 | 42 | - remove texts that contain HTTP errors. 43 | - remove texts shorter than 3 characters. 44 | - replace 6 spaces or more with 6 spaces. 45 | - replace 6 dots or more with 6 dots. 
46 | 47 | **Rerun this notebook will not overwrite postprocessed datasets**. 48 | 49 | ## Prepare for training session 50 | 51 | **There is no consideration AI alignment and safety in current dataset, we only apply basic postfilter**. 52 | 53 | 1. [FPF llama2](llama) 54 | 2. [FPF Mistral](mistral) 55 | 3. [Pretrain nanoT5](nanot5) 56 | 4. [Pretrain smaller Causal LM](pretrain-clm) 57 | 5. [Pretrain LLM](pretrain-llm) 58 | 6. [FPF TinyLlama](tinyllama) 59 | 7. [FPF Yi](yi) 60 | 61 | ## end-to-end processing using Python script 62 | 63 | Released as a Python library, https://github.com/malaysia-ai/clean_text_my 64 | 65 | -------------------------------------------------------------------------------- /speech-instructions/prepare-malaysian-podcast.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "1ac7cbf3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "from tqdm import tqdm\n", 13 | "import torchaudio\n", 14 | "import os" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 6, 20 | "id": "64cd5042", 21 | "metadata": { 22 | "scrolled": true 23 | }, 24 | "outputs": [ 25 | { 26 | "name": "stderr", 27 | "output_type": "stream", 28 | "text": [ 29 | "100%|███████████████████████████████████████████████████████████████████████████| 2033890/2033890 [00:08<00:00, 244819.71it/s]\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "df = pd.read_parquet('/home/husein/ssd4/verify-text.parquet')\n", 35 | "filtered = []\n", 36 | "for i in tqdm(range(len(df))):\n", 37 | " if 'podcast_processed' in df['audio'].iloc[i]:\n", 38 | " row = df.iloc[i].to_dict()\n", 39 | " f = os.path.join('/home/husein/ssd4/', row['audio'])\n", 40 | " row['audio'] = f\n", 41 | " filtered.append(row)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 7, 47 | "id": 
"438c8a4e", 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/plain": [ 53 | "75965" 54 | ] 55 | }, 56 | "execution_count": 7, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "len(filtered)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 8, 68 | "id": "f625ca99", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "pd.DataFrame(filtered).to_parquet('filtered-podcast.parquet')" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "python3.10", 79 | "language": "python", 80 | "name": "python3.10" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.10.15" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 5 97 | } 98 | -------------------------------------------------------------------------------- /emotional-malaysian-emilia/pitch_estimation.py: -------------------------------------------------------------------------------- 1 | import click 2 | import torch 3 | import torchaudio 4 | from glob import glob 5 | from tqdm import tqdm 6 | import os 7 | import penn 8 | import torch 9 | import huggingface_hub 10 | 11 | def new_path(f): 12 | f = f.replace('.mp3', '.pitch') 13 | splitted = f.split('/') 14 | base_folder = splitted[0] + '_pitch' 15 | splitted = '/'.join([base_folder] + splitted[1:]) 16 | return splitted 17 | 18 | @click.command() 19 | @click.option("--path", help="files path in glob pattern") 20 | @click.option("--global-index", default=1, help="global index") 21 | @click.option("--local-index", default=0, help="local index") 22 | def function(path, global_index, local_index): 23 | files = glob(path) 24 | filtered_files = [] 25 | for f in files: 26 | new_f = new_path(f) 27 | if 
os.path.exists(new_f) and os.path.getsize(new_f) > 2: 28 | continue 29 | filtered_files.append(f) 30 | 31 | global_size = len(filtered_files) // global_index 32 | filtered_files = filtered_files[global_size * local_index: global_size * (local_index + 1)] 33 | files = filtered_files 34 | 35 | model = penn.Model() 36 | checkpoint = huggingface_hub.hf_hub_download( 37 | 'maxrmorrison/fcnf0-plus-plus', 38 | 'fcnf0++.pt') 39 | checkpoint = torch.load(checkpoint, map_location='cpu') 40 | model.load_state_dict(checkpoint['model']) 41 | 42 | model = model.to('cuda').to(torch.float16) 43 | 44 | with torch.no_grad(): 45 | for f in tqdm(files): 46 | y, sr = torchaudio.load(f) 47 | y = torchaudio.functional.resample(y, sr, penn.SAMPLE_RATE) 48 | pitch, periodicity = [], [] 49 | with torch.no_grad(): 50 | for frames in penn.preprocess( 51 | y, 52 | ): 53 | logits = model(frames.to(torch.float16).to('cuda')) 54 | result = penn.postprocess(logits) 55 | pitch.append(result[1]) 56 | periodicity.append(result[2]) 57 | pitch, periodicity = torch.cat(pitch, 1), torch.cat(periodicity, 1) 58 | pitch = penn.voicing.interpolate( 59 | pitch, 60 | periodicity, 61 | interp_unvoiced_at) 62 | pitch = pitch[0].cpu().numpy().tolist() 63 | pitch = [round(p, 4) for p in pitch] 64 | periodicity = periodicity[0].cpu().numpy().tolist() 65 | periodicity = [round(p, 4) for p in periodicity] 66 | splitted = new_path(f) 67 | os.makedirs(os.path.split(splitted)[0], exist_ok = True) 68 | 69 | with open(splitted, 'w') as fopen: 70 | json.dump({'pitch': pitch, 'periodicity': periodicity}, fopen) 71 | 72 | if __name__ == '__main__': 73 | function() -------------------------------------------------------------------------------- /multilingual-tts/prepare/prepare-WenetSpeech4TTS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "id": "a2127d3e-e002-4d8c-8c8b-ec5ecfeb8b7a", 7 | "metadata": {}, 
8 | "outputs": [], 9 | "source": [ 10 | "from glob import glob\n", 11 | "import pandas as pd\n", 12 | "import os\n", 13 | "import soundfile as sf\n", 14 | "from tqdm import tqdm\n", 15 | "from multiprocess import Pool\n", 16 | "import librosa\n", 17 | "import itertools\n", 18 | "import io\n", 19 | "import numpy as np\n", 20 | "import json\n", 21 | "import tarfile\n", 22 | "\n", 23 | "def chunks(l, n):\n", 24 | " for i in range(0, len(l), n):\n", 25 | " yield (l[i: i + n], i // n)\n", 26 | "\n", 27 | "def multiprocessing(strings, function, cores=6, returned=True):\n", 28 | " df_split = chunks(strings, len(strings) // cores)\n", 29 | " pool = Pool(cores)\n", 30 | " pooled = pool.map(function, df_split)\n", 31 | " pool.close()\n", 32 | " pool.join()\n", 33 | "\n", 34 | " if returned:\n", 35 | " return list(itertools.chain(*pooled))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 11, 41 | "id": "53d47aa9-b5f8-4378-8416-45ceace47196", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# def loop(files):\n", 46 | "# files, _ = files\n", 47 | "# for f in tqdm(files):\n", 48 | "# with tarfile.open(f, \"r:gz\") as tar:\n", 49 | "# tar.extractall(path='WenetSpeech4TTS/Standard')\n", 50 | "\n", 51 | "# files = glob('WenetSpeech4TTS/Standard/*.tar.gz')\n", 52 | "# multiprocessing(files, loop, len(files), returned = False)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 12, 58 | "id": "f00e77ce-af6d-4232-b067-aedf1c9d1964", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# def loop(files):\n", 63 | "# files, _ = files\n", 64 | "# for f in tqdm(files):\n", 65 | "# with tarfile.open(f, \"r:gz\") as tar:\n", 66 | "# tar.extractall(path='WenetSpeech4TTS/Premium')\n", 67 | "\n", 68 | "# files = glob('WenetSpeech4TTS/Premium/*.tar.gz')\n", 69 | "# multiprocessing(files, loop, len(files), returned = False)" 70 | ] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3 
(ipykernel)", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": { 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.10.12" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 5 94 | } 95 | -------------------------------------------------------------------------------- /speech-instructions/remote.sh: -------------------------------------------------------------------------------- 1 | apt update 2 | apt install unzip ffmpeg -y 3 | apt update && apt install -y locales 4 | locale-gen en_US.UTF-8 5 | cd /workspace 6 | wget https://www.7-zip.org/a/7z2301-linux-x64.tar.xz 7 | tar -xf 7z2301-linux-x64.tar.xz 8 | pip3 install huggingface-hub 9 | 10 | python3 -c " 11 | from huggingface_hub import snapshot_download 12 | snapshot_download(repo_id='malaysia-ai/dedup-Malaysian-Emilia', repo_type='dataset', 13 | allow_patterns = '*.z*', local_dir = './') 14 | " 15 | /workspace/7zz x dedup-parliament.zip -y -mmt40 16 | /workspace/7zz x dedup-podcasts.zip -y -mmt40 17 | 18 | wget https://github.com/mesolitica/malaysian-dataset/raw/refs/heads/master/text-to-speech/husein/requirements.txt 19 | pip3 install -r requirements.txt 20 | pip3 install click vocos torchdiffeq==0.2.4 x-transformers==1.42.11 jieba==0.42.1 pypinyin==0.53.0 21 | 22 | wget https://huggingface.co/datasets/malaysia-ai/Speech-Instructions/resolve/main/text/partition-instructions-part-3.json 23 | wget https://huggingface.co/datasets/malaysia-ai/Speech-Instructions/resolve/main/text/partition-instructions-part-4.json 24 | wget https://huggingface.co/datasets/malaysia-ai/Speech-Instructions/resolve/main/text/partition-instructions-part-5.json 25 | wget https://huggingface.co/datasets/malaysia-ai/Speech-Instructions/resolve/main/text/partition-instructions-part-6.json 26 | wget 
https://raw.githubusercontent.com/malaysia-ai/dataset/refs/heads/main/speech-instructions/generate.py 27 | 28 | for i in {0..3}; do 29 | screen -S "partition-instructions-part-3_$i" -X quit 2>/dev/null 30 | screen -dmS "partition-instructions-part-3_$i" bash -c "cd /workspace && \ 31 | CUDA_VISIBLE_DEVICES=0 \ 32 | python3 generate.py \ 33 | --input_file \"partition-instructions-part-3.json\" \ 34 | --folder \"partition-instructions-part-3\" \ 35 | --global_index 4 \ 36 | --index $i" 37 | done 38 | 39 | for i in {0..3}; do 40 | screen -S "partition-instructions-part-4_$i" -X quit 2>/dev/null 41 | screen -dmS "partition-instructions-part-4_$i" bash -c "cd /workspace && \ 42 | CUDA_VISIBLE_DEVICES=1 \ 43 | python3 generate.py \ 44 | --input_file \"partition-instructions-part-4.json\" \ 45 | --folder \"partition-instructions-part-4\" \ 46 | --global_index 4 \ 47 | --index $i" 48 | done 49 | 50 | for i in {0..3}; do 51 | screen -S "partition-instructions-part-5_$i" -X quit 2>/dev/null 52 | screen -dmS "partition-instructions-part-5_$i" bash -c "cd /workspace && \ 53 | CUDA_VISIBLE_DEVICES=2 \ 54 | python3 generate.py \ 55 | --input_file \"partition-instructions-part-5.json\" \ 56 | --folder \"partition-instructions-part-5\" \ 57 | --global_index 4 \ 58 | --index $i" 59 | done 60 | 61 | for i in {0..3}; do 62 | screen -S "partition-instructions-part-6_$i" -X quit 2>/dev/null 63 | screen -dmS "partition-instructions-part-6_$i" bash -c "cd /workspace && \ 64 | CUDA_VISIBLE_DEVICES=3 \ 65 | python3 generate.py \ 66 | --input_file \"partition-instructions-part-6.json\" \ 67 | --folder \"partition-instructions-part-6\" \ 68 | --global_index 4 \ 69 | --index $i" 70 | done -------------------------------------------------------------------------------- /speech-instructions-extra/upload.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "id": "489cdcad", 7 | 
"metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from glob import glob\n", 11 | "from tqdm import tqdm\n", 12 | "import json" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 8, 18 | "id": "02e5a67f", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stderr", 23 | "output_type": "stream", 24 | "text": [ 25 | "100%|████████████████████████████████████████████████████████████████████████████████████| 965/965 [00:01<00:00, 811.02it/s]\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "alls = []\n", 31 | "for f in tqdm(glob('*/*.parquet')):\n", 32 | " try:\n", 33 | " with open(f) as fopen:\n", 34 | " d = json.load(fopen)\n", 35 | " for d_ in d:\n", 36 | " d_['start'] = None\n", 37 | " d_['end'] = None\n", 38 | " d_['context'] = None\n", 39 | " d_['system'] = None\n", 40 | " d_['sliced_audio_filename'] = None\n", 41 | " alls.append(d_)\n", 42 | " except:\n", 43 | " pass" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 9, 49 | "id": "1def2d40", 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "549110" 56 | ] 57 | }, 58 | "execution_count": 9, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "len(alls)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 13, 70 | "id": "e9b10351", 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "{'instruction': 'What decision did Speaker1 and Speaker2 agree on regarding taking something first?',\n", 77 | " 'answer': 'Speaker1 would take the first one.',\n", 78 | " 'audio_filename': 'SQA-PART3-Train-audio/train-00145-of-00171-2.mp3',\n", 79 | " 'start': None,\n", 80 | " 'end': None,\n", 81 | " 'context': None,\n", 82 | " 'system': None,\n", 83 | " 'sliced_audio_filename': None}" 84 | ] 85 | }, 86 | "execution_count": 13, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "alls[2]" 93 | ] 94 | } 
95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": "python3.10", 99 | "language": "python", 100 | "name": "python3.10" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.10.15" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 5 117 | } 118 | -------------------------------------------------------------------------------- /text/pretrain-llm/prepare-madlad-400-ms.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "86d80b05", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# !git lfs clone https://huggingface.co/datasets/malaysia-ai/madlad-400-ms" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 3, 16 | "id": "54ca47c5", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import json\n", 21 | "import os\n", 22 | "from glob import glob\n", 23 | "from tqdm import tqdm" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 4, 29 | "id": "e92d6668", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "def partition(text, size = 500):\n", 34 | " splitted = text.split()\n", 35 | " return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "id": "f1b7e4f9", 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "madlad-400-ms.jsonl00.splitted\tmadlad-400-ms.jsonl02.splitted\r\n", 49 | "madlad-400-ms.jsonl01.splitted\r\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "!ls madlad-400-ms" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | 
"execution_count": 5, 60 | "id": "485c6a71", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "a = open('prepare-madlad-400-ms.jsonl', 'w')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 6, 70 | "id": "6aacc1a4", 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stderr", 75 | "output_type": "stream", 76 | "text": [ 77 | "4081851it [18:57, 3587.17it/s]\n", 78 | "5000000it [23:02, 3615.70it/s]\n", 79 | "5000000it [34:34, 2410.40it/s]\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "madlad_ms = glob('madlad-400-ms/*.splitted')\n", 85 | "for f in madlad_ms:\n", 86 | " with open(f) as fopen:\n", 87 | " for l in tqdm(fopen):\n", 88 | " try:\n", 89 | " data = '' + json.loads(l)['text'] + ''\n", 90 | " partitioned = partition(data)\n", 91 | " for p in partitioned:\n", 92 | " data = {\n", 93 | " 'text': p,\n", 94 | " }\n", 95 | " a.write(f'{json.dumps(data)}\\n')\n", 96 | " a.flush()\n", 97 | " except:\n", 98 | " pass" 99 | ] 100 | } 101 | ], 102 | "metadata": { 103 | "kernelspec": { 104 | "display_name": "Python 3 (ipykernel)", 105 | "language": "python", 106 | "name": "python3" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 3 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": "python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython3", 118 | "version": "3.10.12" 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 5 123 | } 124 | -------------------------------------------------------------------------------- /emotional-malaysian-emilia/audioset_sliding.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoFeatureExtractor, AutoModelForAudioClassification 2 | from torch.utils.data import DataLoader 3 | from torch.nn import functional as F 4 | from tqdm import tqdm 5 | from glob import glob 6 | from datasets import Audio 7 | import torch 
def new_path(f):
    """Map an input audio path to the path of its AudioSet result file.

    Every ``.mp3`` in the path is replaced by ``.audioset`` and ``_audioset``
    is appended to the first ``/``-separated component, e.g.
    ``data/spk/a.mp3`` -> ``data_audioset/spk/a.audioset``.
    """
    parts = f.replace('.mp3', '.audioset').split('/')
    return '/'.join([parts[0] + '_audioset'] + parts[1:])


@click.command()
@click.option("--path", help="files path in glob pattern")
@click.option("--global-index", default=1, help="global index")
@click.option("--local-index", default=0, help="local index")
@click.option("--sliding", default=0.25)
@click.option("--model", default='MIT/ast-finetuned-audioset-10-10-0.4593')
def function(path, global_index, local_index, sliding, model):
    """Classify fixed-size windows of every matched audio file and save top-5 labels.

    Parameters
    ----------
    path : str
        Glob pattern selecting the input audio files.
    global_index : int
        Number of shards the pending files are divided into.
    local_index : int
        Zero-based shard handled by this invocation.
    sliding : float
        Window length in seconds; windows do not overlap.
    model : str
        Name or path of the audio classification checkpoint.

    For each file a JSON document with ``timestamps``, ``labels`` and ``probs``
    (one entry per window) is written to ``new_path(file)``. Files whose result
    already exists with more than 2 bytes are skipped.
    """
    if global_index < 1:
        raise click.BadParameter('--global-index must be >= 1')
    if local_index < 0:
        raise click.BadParameter('--local-index must be >= 0')

    feature_extractor = AutoFeatureExtractor.from_pretrained(model, return_attention_mask = True)
    model = AutoModelForAudioClassification.from_pretrained(model, torch_dtype = torch.float16).eval().cuda()
    id2label = model.config.id2label
    sr = feature_extractor.sampling_rate
    sliding = int(sliding * sr)
    if sliding < 1:
        raise click.BadParameter('--sliding is too small for the model sampling rate')
    audio = Audio(sampling_rate = sr)

    # Sort so that every worker sees the same ordering and shards stay disjoint.
    pending = []
    for f in sorted(glob(path)):
        new_f = new_path(f)
        if os.path.exists(new_f) and os.path.getsize(new_f) > 2:
            continue
        pending.append(f)

    # Balanced split: the first `extra` shards take one additional file, so no
    # file is dropped when the count is not a multiple of `global_index`.
    base, extra = divmod(len(pending), global_index)
    start = local_index * base + min(local_index, extra)
    end = start + base + (1 if local_index < extra else 0)
    files = pending[start:end]

    with torch.no_grad():
        for f in tqdm(files):
            y = audio.decode_example(audio.encode_example(f))['array']
            timestamps = []
            windows = []
            for offset in range(0, len(y), sliding):
                window = y[offset: offset + sliding]
                # Windows shorter than 1000 samples are not classified.
                if len(window) < 1000:
                    continue
                windows.append(window)
                timestamps.append((offset / sr, min(len(y) / sr, (offset + sliding) / sr)))

            labels = []
            probs = []
            # An empty batch would crash the feature extractor; such files get
            # an empty result instead so they are not retried on the next run.
            if windows:
                inputs = feature_extractor(windows, sampling_rate=sr,
                                           return_tensors="pt", return_attention_mask = True)
                inputs['input_values'] = inputs['input_values'].to(torch.float16).cuda()
                scores = model(**inputs).logits.softmax(-1)
                topk = torch.topk(scores, 5, dim = -1)
                probs = [
                    [round(p, 4) for p in row]
                    for row in topk.values.cpu().numpy().tolist()
                ]
                labels = [
                    [id2label[r] for r in row]
                    for row in topk.indices.cpu().numpy()
                ]

            target = new_path(f)
            parent = os.path.split(target)[0]
            if parent:
                os.makedirs(parent, exist_ok = True)
            with open(target, 'w') as fopen:
                json.dump({'timestamps': timestamps, 'labels': labels, 'probs': probs}, fopen)

if __name__ == '__main__':
    function()
"name": "stderr", 49 | "output_type": "stream", 50 | "text": [ 51 | "100%|████████████████████████████████████████████████████████████████████████████| 2033890/2033890 [00:20<00:00, 97450.43it/s]\n" 52 | ] 53 | }, 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "555379" 58 | ] 59 | }, 60 | "execution_count": 8, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "df = pd.read_parquet('/home/husein/ssd4/verify-text.parquet')\n", 67 | "filtered = []\n", 68 | "for i in tqdm(range(len(df))):\n", 69 | " f = df['audio'].iloc[i]\n", 70 | " \n", 71 | " if 'parlimen-24k' not in f and 'podcast_processed' not in f and 'dialects_processed' not in f:\n", 72 | " row = df.iloc[i].to_dict()\n", 73 | " f = os.path.join('/home/husein/ssd4/', row['audio'])\n", 74 | " row['audio'] = f\n", 75 | " filtered.append(row)\n", 76 | " \n", 77 | " \n", 78 | "len(filtered)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 9, 84 | "id": "c358a3f7", 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "555379" 91 | ] 92 | }, 93 | "execution_count": 9, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "len(filtered)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 12, 105 | "id": "54d9bd5b", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "pd.DataFrame(filtered).to_parquet('filtered-others.parquet')" 110 | ] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "python3.10", 116 | "language": "python", 117 | "name": "python3.10" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 3 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython3", 129 | "version": "3.10.15" 130 | } 131 | }, 132 | "nbformat": 4, 133 | 
def jaccard_similarity(
    doc1,
    doc2,
    ngram_size: int = 8,
    min_length: int = 0,
) -> float:
    """Return the Jaccard similarity of the n-gram sets of two documents.

    Parameters
    ----------
    doc1 : str or List[str]
        The first document.
    doc2 : str or List[str]
        The second document.
    ngram_size : int, optional
        The size of n-grams, by default 8
    min_length : int, optional
        Forwarded to ``ngrams`` as ``min_length``, by default 0

    Returns
    -------
    float
        Size of the intersection divided by the size of the union of the two
        n-gram sets (0.0 when both sets are empty).

    Examples
    --------
    >>> jaccard_similarity("hello world", "hello world")
    1.0
    >>> jaccard_similarity("hello world", "hello world!")
    0.8
    >>> jaccard_similarity("hello world".split(), "hello world!".split(), ngram_size=1)
    0.3333333333333333
    """

    def shingles(doc):
        # Each n-gram is joined with spaces so it can live in a set.
        return {" ".join(gram) for gram in ngrams(list(doc), ngram_size, min_length=min_length)}

    left = shingles(doc1)
    right = shingles(doc2)
    shared = len(left & right)
    total = len(left | right)
    # max(1, ...) avoids dividing by zero when neither document yields n-grams.
    return shared / max(1, total)


def optimal_param(
    threshold: float,
    num_perm: int,
    false_positive_weight: float = 0.5,
    false_negative_weight: float = 0.5,
):
    """
    Search for the `MinHashLSH` bands/rows pair with the smallest weighted sum
    of false-positive and false-negative areas (approach taken from datasketch).

    You can also refer to the interactive demo at https://huggingface.co/spaces/bigcode/near-deduplication.

    Parameters
    ----------
    threshold : float
        The threshold for similarity.
    num_perm : int
        The number of permutations.
    false_positive_weight : float
        The weight of false positive.
    false_negative_weight : float
        The weight of false negative.

    Returns
    -------
    Tuple[int, int]
        The optimal `b` (bands) and `r` (rows) parameters.

    Examples
    --------
    >>> optimal_param(0.75, 256)
    (21, 12)
    >>> optimal_param(0.75, 256, 0.1, 0.9)
    (28, 9)
    """

    def collision(s, bands, rows):
        """Source: `datasketch.lsh` - probability curve for similarity `s`."""
        return 1 - (1 - s ** float(rows)) ** float(bands)

    best_error = float("inf")
    best = (0, 0)
    for bands in range(1, num_perm + 1):
        row_limit = int(num_perm / bands)
        for rows in range(1, row_limit + 1):
            # Area under the curve below the threshold: false positives.
            fp_area, _ = integrate(lambda s: collision(s, bands, rows), 0.0, threshold)
            # Area above the curve past the threshold: false negatives.
            fn_area, _ = integrate(lambda s: 1 - collision(s, bands, rows), threshold, 1.0)
            weighted = fp_area * false_positive_weight + fn_area * false_negative_weight
            # Strict comparison keeps the first pair found on ties.
            if weighted < best_error:
                best_error = weighted
                best = (bands, rows)
    return best
def loop(rows):
    """Force-align one shard of rows on a single GPU and write one JSON per row.

    Parameters
    ----------
    rows : tuple
        ``(rows, index, language, folder)`` as produced by ``chunks``: the rows
        to process, the device index exported through ``CUDA_VISIBLE_DEVICES``,
        the language passed to ``preprocess_text`` and the output folder.

    The text of each row is its ``pronunciation`` field, falling back to
    ``question``. Rows without any text are skipped, as are rows whose output
    file already exists. The row, extended with ``word_timestamps``, is dumped
    to ``<folder>/<audio_filename with '/' replaced by '_'>.json``.
    """
    rows, index, language, folder = rows
    os.environ['CUDA_VISIBLE_DEVICES'] = str(index)

    from ctc_forced_aligner import (
        load_alignment_model,
        generate_emissions,
        preprocess_text,
        get_alignments,
        get_spans,
        postprocess_results,
    )
    import torch

    alignment_model, alignment_tokenizer = load_alignment_model(
        device,
        dtype=torch.float16 if device == "cuda" else torch.float32,
    )

    with torch.no_grad():
        for row in tqdm(rows):
            # `or` also covers a JSON null, which `len()` would choke on.
            t = row.get('pronunciation') or row.get('question')
            if not t:
                continue
            f = row['audio_filename']
            new_f = f.replace('/', '_').replace('.mp3', '.json').replace('.wav', '.json')
            filename = os.path.join(folder, new_f)
            if os.path.exists(filename):
                continue
            new_wav, sr = torchaudio.load(f)
            # Only the first channel is aligned, resampled to 16 kHz.
            audio_waveform = torchaudio.functional.resample(
                new_wav[0], orig_freq=sr, new_freq=16000
            ).type(torch.float16).cuda()
            emissions, stride = generate_emissions(
                alignment_model, audio_waveform, batch_size=1
            )
            tokens_starred, text_starred = preprocess_text(
                t,
                romanize=True,
                language=language,
            )
            segments, scores, blank_token = get_alignments(
                emissions,
                tokens_starred,
                alignment_tokenizer,
            )
            spans = get_spans(tokens_starred, segments, blank_token)
            word_timestamps = postprocess_results(text_starred, spans, stride, scores)
            with open(filename, 'w') as fopen:
                row['word_timestamps'] = word_timestamps
                json.dump(row, fopen)

@click.command()
@click.option('--filename')
@click.option('--language', default = 'en')
@click.option('--replication', default = 1)
@click.option('--folder', default = 'force_alignment')
def main(filename, language, replication, folder):
    """Split the rows of a JSON file across GPUs and align them in parallel.

    Parameters
    ----------
    filename : str
        JSON file containing the list of rows to align.
    language : str
        Language forwarded to every worker.
    replication : int
        Number of worker processes started per device.
    folder : str
        Output directory, created if missing.

    Devices come from ``CUDA_VISIBLE_DEVICES`` when it is set, otherwise from
    ``torch.cuda.device_count()``.
    """
    os.makedirs(folder, exist_ok = True)
    devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if devices is None:
        devices = list(range(torch.cuda.device_count()))
    else:
        devices = [d.strip() for d in devices.split(',')]

    devices = replication * devices
    print(devices)
    if not devices:
        # Without this check `chunks` fails with a bare ZeroDivisionError.
        raise click.ClickException('no CUDA devices available (or --replication < 1)')

    with open(filename) as fopen:
        rows = json.load(fopen)

    shards = chunks(rows, devices, language, folder)
    pool = Pool(len(devices))
    try:
        pool.map(loop, shards)
    finally:
        # Always release the workers, even when one of them raised.
        pool.close()
        pool.join()

if __name__ == '__main__':
    main()
45 | "output_type": "stream", 46 | "text": [ 47 | "60198it [04:20, 230.88it/s]\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "mallam, llama2, mistral = 0, 0, 0\n", 53 | "with open('ultrachat-astroawani-malay.jsonl') as fopen:\n", 54 | " for l in tqdm(fopen):\n", 55 | " l = json.loads(l)\n", 56 | " for r in l[1:]:\n", 57 | " if r['content_ms']:\n", 58 | " mallam += len(tokenizer_mallam(r['content_ms'])['input_ids'])\n", 59 | " llama2 += len(tokenizer_llama2(r['content_ms'])['input_ids'])\n", 60 | " mistral += len(tokenizer_mistral(r['content_ms'])['input_ids'])" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 19, 66 | "id": "7b5901bd", 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "(26157664, 60391551, 60823929)" 73 | ] 74 | }, 75 | "execution_count": 19, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "mallam, llama2, mistral" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 22, 87 | "id": "3e01dec0", 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "0.4300554803028262" 94 | ] 95 | }, 96 | "execution_count": 22, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "(mallam / 60823929)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "1a1c8e06", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3 (ipykernel)", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.10.12" 131 | } 132 | }, 133 | "nbformat": 4, 134 | 
def jaccard_similarity(
    doc1: str | List[str],
    doc2: str | List[str],
    ngram_size: int = 8,
    min_length: int = 0,
) -> float:
    """Return the Jaccard similarity of the n-gram sets of two documents.

    Parameters
    ----------
    doc1 : str or List[str]
        The first document.
    doc2 : str or List[str]
        The second document.
    ngram_size : int, optional
        The size of n-grams, by default 8
    min_length : int, optional
        Forwarded to ``ngrams`` as ``min_length``, by default 0

    Returns
    -------
    float
        Size of the intersection divided by the size of the union of the two
        n-gram sets (0.0 when both sets are empty).

    Examples
    --------
    >>> jaccard_similarity("hello world", "hello world")
    1.0
    >>> jaccard_similarity("hello world", "hello world!")
    0.8
    >>> jaccard_similarity("hello world".split(), "hello world!".split(), ngram_size=1)
    0.3333333333333333
    """
    shingle_sets = []
    for doc in (doc1, doc2):
        grams = ngrams(list(doc), ngram_size, min_length=min_length)
        # Each n-gram is joined with spaces so it can live in a set.
        shingle_sets.append({" ".join(gram) for gram in grams})

    first, second = shingle_sets
    overlap = first & second
    combined = first | second
    # max(1, ...) avoids dividing by zero when neither document yields n-grams.
    return len(overlap) / max(1, len(combined))
def is_dir(path):
    """Return True when *path* is an existing directory."""
    return os.path.isdir(path)


def run_command(txt):
    """Run *txt* through the system shell; the exit status is not checked."""
    # NOTE: shell=True executes the string verbatim - only pass trusted text.
    subprocess.run(txt, shell=True)


def create_dir(path):
    """Create *path* and any missing parents; an existing directory is fine."""
    Path(path).mkdir(parents=True, exist_ok=True)


def write_to_json(lst, fn):
    """Write every item of *lst* to *fn* as one JSON document per line."""
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned instead of inherited from the locale.
    with open(fn, "w+", encoding="utf-8") as file:
        for item in tqdm(lst):
            file.write(json.dumps(item, ensure_ascii=False) + "\n")


http_errors = [
    "400 Bad Request",
    "401 Unauthorized",
    "402 Payment Required",
    "403 Forbidden",
    "404 Not Found",
    "405 Method Not Allowed",
    "406 Not Acceptable",
    "407 Proxy Authentication Required",
    "408 Request Timeout",
    "409 Conflict",
    "410 Gone",
    "411 Length Required",
    "412 Precondition Failed",
    "413 Payload Too Large",
    "414 URI Too Long",
    "415 Unsupported Media Type",
    "416 Range Not Satisfiable",
    "417 Expectation Failed",
    "418 I'm a teapot",
    "421 Misdirected Request",
    "422 Unprocessable Entity",
    "423 Locked",
    "424 Failed Dependency",
    "425 Too Early",
    "426 Upgrade Required",
    "428 Precondition Required",
    "429 Too Many Requests",
    "431 Request Header Fields Too Large",
    "451 Unavailable For Legal Reasons",
    "500 Internal Server Error",
    "501 Not Implemented",
    "502 Bad Gateway",
    "503 Service Unavailable",
    "504 Gateway Timeout",
    "505 HTTP Version Not Supported",
    "506 Variant Also Negotiates",
    "507 Insufficient Storage",
    "508 Loop Detected",
    "510 Not Extended",
    "511 Network Authentication Required",
]

# Substrings that mark a document as an error/login page rather than content.
rejected = [
    "Internal Server Error",
    "__NOEDITSECTION__",
    "enter your username and password",
    "forgotten your password",
    "cookies enabled",
    "enable JavaScript in your browser.",
    "The page cannot be displayed",
    "site or edit the error_page",
]

rejected.extend(http_errors)


def replace_multiple(input_string, pattern=r"\s{6,}", replace=" "):
    """Replace every match of *pattern* in *input_string* with *replace*.

    With the defaults, runs of six or more whitespace characters collapse to
    a single space.
    """
    return re.sub(pattern, replace, input_string)


def replace(string):
    """Normalise ellipses and long whitespace/dot runs in *string*.

    The ellipsis character becomes ".", runs of six or more whitespace
    characters become one space, and runs of six or more dots become "...".
    """
    string = replace_multiple(string.replace("…", "."))
    string = replace_multiple(string, pattern=r"\.{6,}", replace="...")
    return string


def reject(string):
    """Return True when *string* contains any of the ``rejected`` markers."""
    return any(marker in string for marker in rejected)


def loop(files, process_type="multi"):
    """Post-process deduplicated JSONL files into the ``postprocessing`` folder.

    Parameters
    ----------
    files : list or tuple
        Paths of the files to process. When *process_type* is ``"multi"`` this
        is a 2-tuple whose first element is that list.
    process_type : str
        ``"multi"`` unpacks the tuple form described above.

    For every input path, ``dedupe-datasets/`` is replaced by
    ``postprocessing/`` for the output and by ``postprocessing-done/`` for a
    marker file written after the output is complete; inputs whose marker
    exists are skipped. Rows matching ``reject`` or shorter than 3 characters
    after cleaning are dropped.
    """
    if process_type == "multi":
        files, _ = files

    for f in files:
        new_f = f.replace("dedupe-datasets/", "postprocessing/")
        new_f_done = f.replace("dedupe-datasets/", "postprocessing-done/")
        if os.path.exists(new_f_done):
            continue
        with open(new_f, "w", encoding="utf-8") as fopen_l:
            with open(f, encoding="utf-8") as fopen:
                for line in tqdm(fopen):
                    data = json.loads(line)

                    if reject(data["text"]):
                        continue

                    data = replace(data["text"].strip())

                    if len(data) < 3:
                        continue

                    # NOTE(review): the cleaned text is written as a bare JSON
                    # string, not as {"text": ...} - confirm consumers expect it.
                    fopen_l.write(f"{json.dumps(data)}\n")
                    fopen_l.flush()

        with open(new_f_done, "w") as fopen:
            fopen.write("done")
"output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "import torch\n", 45 | "\n", 46 | "available_gpus = [torch.cuda.device(i) for i in range(torch.cuda.device_count())]\n", 47 | "available_gpus[0].__dict__" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 50, 53 | "id": "1ac7cbf3", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "import pandas as pd\n", 58 | "import numpy as np\n", 59 | "from tqdm import tqdm\n", 60 | "import torchaudio\n", 61 | "from speaker_cloning import SpeakerEmbedding" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 42, 67 | "id": "56d3111e", 68 | "metadata": { 69 | "scrolled": true 70 | }, 71 | "outputs": [ 72 | { 73 | "name": "stderr", 74 | "output_type": "stream", 75 | "text": [ 76 | "100%|███████████████████████████████████████████████████████████████████████████| 2033890/2033890 [00:13<00:00, 146206.60it/s]\n", 77 | "100%|██████████████████████████████████████████████████████████████████████████████| 191545/191545 [00:02<00:00, 76849.55it/s]\n" 78 | ] 79 | }, 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "636921" 84 | ] 85 | }, 86 | "execution_count": 42, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "df = pd.read_parquet('/home/husein/ssd4/verify-text.parquet')\n", 93 | "filtered = []\n", 94 | "for i in tqdm(range(len(df))):\n", 95 | " if 'parlimen-24k' in df['audio'].iloc[i]:\n", 96 | " row = df.iloc[i].to_dict()\n", 97 | " f = os.path.join('/home/husein/ssd4/', row['audio'])\n", 98 | " row['audio'] = f\n", 99 | " filtered.append(row)\n", 100 | " \n", 101 | "df = pd.read_parquet('/home/husein/ssd4/verify-text-chunk-parliament.parquet')\n", 102 | "for i in tqdm(range(len(df))):\n", 103 | " row = df.iloc[i].to_dict()\n", 104 | " f = os.path.join('/home/husein/ssd4/', row['audio'])\n", 105 | " row['audio'] = f\n", 106 | " filtered.append(row)\n", 107 | " \n", 108 | "len(filtered)" 109 | ] 110 | } 111 | ], 112 | 
"metadata": { 113 | "kernelspec": { 114 | "display_name": "python3.10", 115 | "language": "python", 116 | "name": "python3.10" 117 | }, 118 | "language_info": { 119 | "codemirror_mode": { 120 | "name": "ipython", 121 | "version": 3 122 | }, 123 | "file_extension": ".py", 124 | "mimetype": "text/x-python", 125 | "name": "python", 126 | "nbconvert_exporter": "python", 127 | "pygments_lexer": "ipython3", 128 | "version": "3.10.15" 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 5 133 | } 134 | -------------------------------------------------------------------------------- /text/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ -------------------------------------------------------------------------------- /text/madlad-400-ms/prepare-madlad-400-ms.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "20d4b02f", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from datasets import load_dataset" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "3fb192ea", 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stderr", 21 | "output_type": "stream", 22 | "text": [ 23 | "/home/ubuntu/.local/lib/python3.10/site-packages/datasets/table.py:1421: FutureWarning: promote has been superseded by mode='default'.\n", 24 | " table = cls._concat_blocks(blocks, axis=0)\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "madlad_multilang = load_dataset(\"allenai/madlad-400\", languages=[\"ms\", 'ms_Arab_BN', 'ms_Arab'])" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "id": "f9c4b242", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "2" 42 | ] 43 | }, 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "output_type": "execute_result" 47 | } 48 | ], 49 | "source": [ 50 | "len(madlad_multilang)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "id": "4664c3e2", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "Dataset({\n", 63 | " features: ['text'],\n", 64 | " num_rows: 2337781\n", 65 | "})" 66 | ] 67 | }, 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "madlad_multilang['clean']" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "id": "69bca9f5", 81 | "metadata": { 82 | "scrolled": true 83 | }, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "Dataset({\n", 89 | " 
features: ['text'],\n", 90 | " num_rows: 14112025\n", 91 | "})" 92 | ] 93 | }, 94 | "execution_count": 5, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "madlad_multilang['noisy']" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 6, 106 | "id": "6c42fc18", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "import json\n", 111 | "from tqdm import tqdm" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "id": "00ea83da", 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stderr", 122 | "output_type": "stream", 123 | "text": [ 124 | "100%|██████████| 14112025/14112025 [28:36<00:00, 8220.12it/s] \n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "with open('madlad-400-ms.jsonl', 'w') as fopen:\n", 130 | " for i in tqdm(range(len(madlad_multilang['noisy']))):\n", 131 | " t = madlad_multilang['noisy'][i]\n", 132 | " fopen.write(f'{json.dumps(t)}\\n')" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "66b79ffc", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [] 142 | } 143 | ], 144 | "metadata": { 145 | "kernelspec": { 146 | "display_name": "Python 3 (ipykernel)", 147 | "language": "python", 148 | "name": "python3" 149 | }, 150 | "language_info": { 151 | "codemirror_mode": { 152 | "name": "ipython", 153 | "version": 3 154 | }, 155 | "file_extension": ".py", 156 | "mimetype": "text/x-python", 157 | "name": "python", 158 | "nbconvert_exporter": "python", 159 | "pygments_lexer": "ipython3", 160 | "version": "3.10.12" 161 | } 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 5 165 | } 166 | -------------------------------------------------------------------------------- /emilia-yodas/convert_neucodec_emilia.py: -------------------------------------------------------------------------------- 1 | import os 2 | import soundfile as sf 3 | import json 4 | import click 5 | import 
def old_chunks(l, n):
    """Yield ``(piece, index)`` pairs of at most ``n`` consecutive items of ``l``.

    ``index`` is the zero-based position of the piece. ``n`` must be positive:
    ``range`` rejects a zero step.
    """
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)


def chunks(l, devices):
    """Split ``l`` into ``len(devices)`` contiguous shards, one per device.

    Shard sizes differ by at most one item; the first ``len(l) % len(devices)``
    shards receive the extra item. Yields ``(shard, device)`` tuples.
    """
    chunk_size, remainder = divmod(len(l), len(devices))
    start = 0
    for i, device in enumerate(devices):
        end = start + chunk_size + (1 if i < remainder else 0)
        yield (l[start:end], device)
        start = end


def new_path(f):
    """Return the token-file path that corresponds to audio path ``f``.

    The first path component gets a ``_neucodec`` suffix and every ``.mp3`` /
    ``.wav`` occurrence in the resulting path is replaced by ``.json``.
    """
    splitted = f.split('/')
    folder = splitted[0] + '_neucodec'
    new_f = os.path.join(folder, '/'.join(splitted[1:]))
    return new_f.replace('.mp3', '.json').replace('.wav', '.json')


def _already_encoded(filename):
    """Return True when ``filename`` exists and contains parseable JSON."""
    if not os.path.exists(filename):
        return False
    try:
        with open(filename) as fopen:
            json.load(fopen)
    except (OSError, ValueError):
        # Unreadable or truncated output (e.g. an interrupted run): redo it.
        return False
    return True


def multiprocessing(strings, function, cores=6, returned=True):
    """Map ``function`` over pieces of ``strings`` in a pool of ``cores`` workers.

    ``function`` receives ``(piece, index)`` tuples produced by ``old_chunks``
    and must return an iterable. When ``returned`` is true the per-piece
    results are flattened into one list; otherwise ``None`` is returned.
    """
    if not strings:
        # Nothing to distribute; avoid starting a pool at all.
        return [] if returned else None

    # With fewer items than cores the integer division is 0, which would make
    # range() in old_chunks fail on a zero step.
    chunk_size = max(1, len(strings) // cores)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, old_chunks(strings, chunk_size))
    finally:
        # Release the workers even when a task raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain.from_iterable(pooled))


def check(files):
    """Return the files of a ``(files, index)`` pair that still need encoding."""
    files, _ = files
    return [file for file in tqdm(files) if not _already_encoded(new_path(file))]


def loop(
    indices_device_pair,
):
    """Encode one shard of audio files to NeuCodec tokens on one GPU.

    ``indices_device_pair`` is a ``(files, device)`` tuple as yielded by
    ``chunks``. Tokens for each file are written as JSON to ``new_path(file)``.
    Per-file failures are printed and skipped so a single bad file does not
    stop the shard.
    """
    files, device = indices_device_pair
    if not files:
        # Do not load the model for an empty shard.
        return

    # Must be set before torch is imported so the worker only sees its device.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device)

    from neucodec import NeuCodec
    import torchaudio
    import torch
    torch.autograd.set_grad_enabled(False)

    model = NeuCodec.from_pretrained("neuphonic/neucodec")
    model.eval().cuda()

    for f in tqdm(files):
        filename = new_path(f)
        if _already_encoded(filename):
            continue

        try:
            y, sr = librosa.load(f, sr = 16000)
            wav_tensor = torch.from_numpy(y).float().unsqueeze(0)
            # NOTE(review): indexing [0, 0] assumes encode_code returns at
            # least a (batch, 1, frames) tensor - confirm against neucodec.
            fsq_codes = model.encode_code(wav_tensor.unsqueeze(1))
            tokens = fsq_codes[0, 0].tolist()

            os.makedirs(os.path.split(filename)[0], exist_ok = True)
            with open(filename, 'w') as fopen:
                json.dump(tokens, fopen)
        except Exception as e:
            # Best effort: report and move on to the next file.
            print(e)
@click.command()
@click.option('--file')
@click.option('--replication', default = 1)
def main(
    file,
    replication,
):
    """Encode every not-yet-processed audio file listed in ``--file``.

    ``--file`` is a JSON file holding a list of audio paths. Devices come from
    ``CUDA_VISIBLE_DEVICES`` (or all GPUs torch reports) and are repeated
    ``--replication`` times, giving one worker process per entry.
    """
    devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if devices is None:

        import torch
        devices = list(range(torch.cuda.device_count()))
    else:
        # Ignore blank entries such as '' or a trailing comma.
        devices = [d.strip() for d in devices.split(',') if d.strip()]

    devices = replication * devices
    print(devices)
    if not devices:
        # chunks() divides by len(devices) and Pool(0) is invalid.
        raise click.UsageError('no CUDA device available; check CUDA_VISIBLE_DEVICES and --replication')

    with open(file) as fopen:
        files = json.load(fopen)

    # Never ask for more checker processes than there are files, so the
    # per-process chunk size stays positive.
    filtered = multiprocessing(files, check, min(30, len(files))) if files else []

    print(len(files), len(filtered))
    if not filtered:
        return

    # Skip empty shards: each worker loads the model before touching a file.
    df_split = [pair for pair in chunks(filtered, devices) if pair[0]]

    with Pool(len(df_split)) as pool:
        pool.map(loop, df_split)


if __name__ == '__main__':
    main()
22 | "outputs": [], 23 | "source": [ 24 | "with open('madlad-400-ms.postprocessing.jsonl', 'w') as fopen_l:\n", 25 | " with open('/home/ubuntu/madlad-400-ms.postprocessing.jsonl') as fopen:\n", 26 | " for l in tqdm(fopen):\n", 27 | " l = json.loads(l)\n", 28 | " d = {\n", 29 | " 'text': l\n", 30 | " }\n", 31 | " fopen_l.write(f'{json.dumps(d)}\\n')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "91f9ef48", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "!head -n 10000 madlad-400-ms.postprocessing.jsonl > test.jsonl" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "eb166831", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from datasets import load_dataset\n", 52 | "dataset = load_dataset(\"json\", data_files=\"madlad-400-ms.postprocessing.jsonl\", split = 'train')" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "b93732c2", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "dataset.save_to_disk(f\"hf-datasets/raw-datasets/madlad-400-ms\")" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "id": "f84f69d6", 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "'python3 -m text_dedup.minhash --path hf-datasets/raw-datasets/madlad-400-ms --split train --cache_dir ./cache --output hf-datasets/dedupe-datasets/madlad-400-ms --column text --batch_size 1000 --threshold 0.95 --min_length 1 --local'" 75 | ] 76 | }, 77 | "execution_count": 2, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "command = f\"python3 -m text_dedup.minhash \\\n", 84 | " --path hf-datasets/raw-datasets/madlad-400-ms \\\n", 85 | " --split train \\\n", 86 | " --cache_dir ./cache \\\n", 87 | " --output hf-datasets/dedupe-datasets/madlad-400-ms \\\n", 88 | " --column text \\\n", 89 | " --batch_size 1000 \\\n", 90 | " --threshold 
0.95 \\\n", 91 | " --min_length 1 \\\n", 92 | " --local\"\n", 93 | "\n", 94 | "command" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "id": "4c337e35", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# import subprocess\n", 105 | "# subprocess.run(command, shell=True)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 11, 111 | "id": "8b0099e9", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "!rm -rf hf-datasets/dedupe-datasets/madlad-400-ms" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "dc6a64ba", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "Python 3 (ipykernel)", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.10.12" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 5 148 | } 149 | -------------------------------------------------------------------------------- /multilingual-tts/convert_neucodec.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['OMP_NUM_THREADS'] = '1' 4 | os.environ['OPENBLAS_NUM_THREADS'] = '1' 5 | 6 | import soundfile as sf 7 | import json 8 | import click 9 | import re 10 | import librosa 11 | from glob import glob 12 | from functools import partial 13 | from multiprocess import Pool 14 | from tqdm import tqdm 15 | import numpy as np 16 | import itertools 17 | 18 | def old_chunks(l, n): 19 | for i in range(0, len(l), n): 20 | yield (l[i: i + n], i // n) 21 | 22 | def chunks(l, devices): 23 | chunk_size = len(l) // 
def chunks(l, devices):
    """Split ``l`` into ``len(devices)`` contiguous shards, one per device.

    Shard sizes differ by at most one item; the first ``len(l) % len(devices)``
    shards receive the extra item. Yields ``(shard, device)`` tuples.
    """
    chunk_size, remainder = divmod(len(l), len(devices))
    start = 0
    for i, device in enumerate(devices):
        end = start + chunk_size + (1 if i < remainder else 0)
        yield (l[start:end], device)
        start = end


def new_path(f):
    """Return the token-file path that corresponds to audio path ``f``.

    The first path component gets a ``_neucodec`` suffix and every ``.mp3`` /
    ``.wav`` occurrence in the resulting path is replaced by ``.json``.
    """
    splitted = f.split('/')
    folder = splitted[0] + '_neucodec'
    new_f = os.path.join(folder, '/'.join(splitted[1:]))
    return new_f.replace('.mp3', '.json').replace('.wav', '.json')


def _already_encoded(filename):
    """Return True when ``filename`` exists and contains parseable JSON."""
    if not os.path.exists(filename):
        return False
    try:
        with open(filename) as fopen:
            json.load(fopen)
    except (OSError, ValueError):
        # Unreadable or truncated output (e.g. an interrupted run): redo it.
        return False
    return True


def multiprocessing(strings, function, cores=6, returned=True):
    """Map ``function`` over pieces of ``strings`` in a pool of ``cores`` workers.

    ``function`` receives ``(piece, index)`` tuples produced by ``old_chunks``
    and must return an iterable. When ``returned`` is true the per-piece
    results are flattened into one list; otherwise ``None`` is returned.
    """
    if not strings:
        # Nothing to distribute; avoid starting a pool at all.
        return [] if returned else None

    # With fewer items than cores the integer division is 0, which would make
    # range() in old_chunks fail on a zero step.
    chunk_size = max(1, len(strings) // cores)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, old_chunks(strings, chunk_size))
    finally:
        # Release the workers even when a task raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain.from_iterable(pooled))


def check(files):
    """Return the files of a ``(files, index)`` pair that still need encoding."""
    files, _ = files
    return [file for file in tqdm(files) if not _already_encoded(new_path(file))]


def loop(
    indices_device_pair,
):
    """Encode one shard of audio files to NeuCodec tokens on one GPU.

    ``indices_device_pair`` is a ``(files, device)`` tuple as yielded by
    ``chunks``. Clips longer than 20 seconds are skipped. Tokens are written
    as JSON to ``new_path(file)``; per-file failures are printed and skipped.
    """
    files, device = indices_device_pair
    if not files:
        # Do not load the model for an empty shard.
        return

    # Must be set before torch is imported so the worker only sees its device.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device)

    from neucodec import NeuCodec
    import torchaudio
    import torch
    torch.autograd.set_grad_enabled(False)

    model = NeuCodec.from_pretrained("neuphonic/neucodec")
    model.eval().cuda()

    for f in tqdm(files):
        filename = new_path(f)
        if _already_encoded(filename):
            continue

        try:
            y, sr = librosa.load(f, sr = 16000)
            if len(y) / sr > 20:
                # Longer than 20 s: left unencoded on purpose.
                continue
            wav_tensor = torch.from_numpy(y).float().unsqueeze(0)
            # NOTE(review): indexing [0, 0] assumes encode_code returns at
            # least a (batch, 1, frames) tensor - confirm against neucodec.
            fsq_codes = model.encode_code(wav_tensor.unsqueeze(1))
            tokens = fsq_codes[0, 0].tolist()

            os.makedirs(os.path.split(filename)[0], exist_ok = True)
            with open(filename, 'w') as fopen:
                json.dump(tokens, fopen)
        except Exception as e:
            # Best effort: report and move on to the next file.
            print(e)
@click.command()
@click.option('--file')
@click.option('--replication', default = 1)
def main(
    file,
    replication,
):
    """Encode every not-yet-processed audio file listed in ``--file``.

    ``--file`` is a JSON file holding a list of audio paths. Devices come from
    ``CUDA_VISIBLE_DEVICES`` (or all GPUs torch reports) and are repeated
    ``--replication`` times, giving one worker process per entry.
    """
    devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if devices is None:

        import torch
        devices = list(range(torch.cuda.device_count()))
    else:
        # Ignore blank entries such as '' or a trailing comma.
        devices = [d.strip() for d in devices.split(',') if d.strip()]

    devices = replication * devices
    print(devices)
    if not devices:
        # chunks() divides by len(devices) and Pool(0) is invalid.
        raise click.UsageError('no CUDA device available; check CUDA_VISIBLE_DEVICES and --replication')

    with open(file) as fopen:
        files = json.load(fopen)

    # Never ask for more checker processes than there are files, so the
    # per-process chunk size stays positive.
    filtered = multiprocessing(files, check, min(30, len(files))) if files else []

    print(len(files), len(filtered))
    if not filtered:
        return

    # Skip empty shards: each worker loads the model before touching a file.
    df_split = [pair for pair in chunks(filtered, devices) if pair[0]]

    with Pool(len(df_split)) as pool:
        pool.map(loop, df_split)


if __name__ == '__main__':
    main()
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | mp.py 173 | *Untitled*.ipynb 174 | malaysian_sft.py 175 | speech-instructions-extra/*audio 176 | speech-instructions-extra/*-Train 177 | *.parquet -------------------------------------------------------------------------------- /text/pretrain-llm/prepare-starcoder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "453a2552", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import json\n", 11 | "import os\n", 12 | "from glob import glob\n", 13 | "from tqdm import tqdm" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "36e657c0", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "def partition(text, size = 500):\n", 24 | " splitted = text.split()\n", 25 | " return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "id": "cab2dbd7", 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "a = open('prepare-starcoder.jsonl', 'w')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "id": "06167a66", 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "['starcoder/starcoder/c.jsonl',\n", 48 | " 'starcoder/starcoder/cpp.jsonl',\n", 49 | " 'starcoder/starcoder/css.jsonl',\n", 50 | " 'starcoder/starcoder/go.jsonl',\n", 51 | " 'starcoder/starcoder/html.jsonl',\n", 52 | " 'starcoder/starcoder/java.jsonl',\n", 53 | " 'starcoder/starcoder/javascript.jsonl',\n", 54 | " 'starcoder/starcoder/julia.jsonl',\n", 55 | " 'starcoder/starcoder/markdown.jsonl',\n", 56 | " 'starcoder/starcoder/python.jsonl',\n", 57 | " 'starcoder/starcoder/r.jsonl',\n", 58 | " 'starcoder/starcoder/rust.jsonl',\n", 59 | " 'starcoder/starcoder/sql.jsonl']" 60 | ] 61 | }, 62 | 
"execution_count": 4, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "glob('starcoder/starcoder/*.jsonl')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 5, 74 | "id": "23d113a4", 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stderr", 79 | "output_type": "stream", 80 | "text": [ 81 | "1610858it [06:54, 3884.72it/s]\n", 82 | "1314195it [06:08, 3567.05it/s]\n", 83 | "2293654it [06:23, 5987.63it/s]\n", 84 | "1928334it [06:33, 4901.64it/s]\n", 85 | "60451it [00:19, 2792.18it/s]IOPub message rate exceeded.\n", 86 | "The notebook server will temporarily stop sending output\n", 87 | "to the client in order to avoid crashing it.\n", 88 | "To change this limit, set the config variable\n", 89 | "`--NotebookApp.iopub_msg_rate_limit`.\n", 90 | "\n", 91 | "Current values:\n", 92 | "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n", 93 | "NotebookApp.rate_limit_window=3.0 (secs)\n", 94 | "\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "files = glob('starcoder/starcoder/*.jsonl')\n", 100 | "for f in files:\n", 101 | " with open(f) as fopen:\n", 102 | " for l in tqdm(fopen):\n", 103 | " try:\n", 104 | " data = '' + json.loads(l) + ''\n", 105 | " partitioned = partition(data)\n", 106 | " for p in partitioned:\n", 107 | " data = {\n", 108 | " 'text': p,\n", 109 | " }\n", 110 | " a.write(f'{json.dumps(data)}\\n')\n", 111 | " a.flush()\n", 112 | " except Exception as e:\n", 113 | " print(e)\n", 114 | " pass" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "0c1a228e", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [] 124 | } 125 | ], 126 | "metadata": { 127 | "kernelspec": { 128 | "display_name": "Python 3 (ipykernel)", 129 | "language": "python", 130 | "name": "python3" 131 | }, 132 | "language_info": { 133 | "codemirror_mode": { 134 | "name": "ipython", 135 | "version": 3 136 | }, 137 | "file_extension": ".py", 138 | "mimetype": 
"text/x-python", 139 | "name": "python", 140 | "nbconvert_exporter": "python", 141 | "pygments_lexer": "ipython3", 142 | "version": "3.10.12" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 5 147 | } 148 | -------------------------------------------------------------------------------- /multilingual-tts/trim_silence.py: -------------------------------------------------------------------------------- 1 | import os 2 | import soundfile as sf 3 | import librosa 4 | import json 5 | import click 6 | import numpy as np 7 | import malaya_speech 8 | from glob import glob 9 | from functools import partial 10 | from multiprocess import Pool 11 | from tqdm import tqdm 12 | 13 | def chunks(l, devices): 14 | chunk_size = len(l) // len(devices) 15 | remainder = len(l) % len(devices) 16 | start = 0 17 | for i in range(len(devices)): 18 | extra = 1 if i < remainder else 0 19 | end = start + chunk_size + extra 20 | yield (l[start:end], devices[i]) 21 | start = end 22 | 23 | def new_path(f): 24 | splitted = f.split('/') 25 | base_folder = splitted[0] + '_trim' 26 | splitted = '/'.join([base_folder] + splitted[1:]) 27 | return splitted 28 | 29 | def new_path_done(f): 30 | splitted = f.split('/') 31 | base_folder = splitted[0] + '_trim_done' 32 | splitted = '/'.join([base_folder] + splitted[1:]) 33 | return splitted 34 | 35 | def loop(indices_device_pair): 36 | files, device = indices_device_pair 37 | 38 | vad = malaya_speech.vad.webrtc(minimum_amplitude = 0) 39 | min_length = 0.4 40 | 41 | for file in tqdm(files): 42 | folder = os.path.split(file)[0] 43 | folder_folder = os.path.split(folder)[1] 44 | f_new = new_path(file) 45 | filename_done = new_path_done(file) 46 | 47 | try: 48 | with open(filename_done) as fopen: 49 | json.load(fopen) 50 | continue 51 | except: 52 | pass 53 | 54 | try: 55 | vad = malaya_speech.vad.webrtc(minimum_amplitude = 0) 56 | y, sr = librosa.load(file, sr = None) 57 | start_silent_trail = int(0.3 * sr) 58 | middle_silent_trail = int(min_length 
def loop(indices_device_pair):
    """Trim leading, trailing and long internal silences for one shard of files.

    ``indices_device_pair`` is a ``(files, device)`` tuple as yielded by
    ``chunks``; ``device`` is not used. For each file the trimmed audio is
    written to ``new_path(file)`` and a JSON marker to ``new_path_done(file)``.
    Files that already have a readable marker are skipped; per-file failures
    are printed and skipped.
    """
    files, device = indices_device_pair

    # Silences between speech that last at least this many seconds are cut
    # down to their first and last min_length / 2 seconds.
    min_length = 0.4

    for file in tqdm(files):
        f_new = new_path(file)
        filename_done = new_path_done(file)

        try:
            with open(filename_done) as fopen:
                json.load(fopen)
            continue
        except (OSError, ValueError):
            # Missing or unreadable marker: process the file.
            pass

        try:
            # A fresh VAD per file, as in the original code path.
            vad = malaya_speech.vad.webrtc(minimum_amplitude = 0)
            y, sr = librosa.load(file, sr = None)
            start_silent_trail = int(0.3 * sr)
            middle_silent_trail = int(min_length * sr / 2)
            # The VAD runs on a 16 kHz integer copy; frames of the original
            # signal are paired with the VAD decisions by position.
            y_= malaya_speech.resample(y, sr, 16000)
            y_ = malaya_speech.astype.float_to_int(y_)
            frames = malaya_speech.generator.frames(y, 30, sr)
            frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
            frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
            grouped_deep = malaya_speech.group.group_frames(frames_webrtc)
            r = []
            last = len(grouped_deep) - 1
            for no, g in enumerate(grouped_deep):
                segment, is_speech = g[0], g[1]
                if is_speech:
                    kept = segment.array
                elif no == 0:
                    # Leading silence: keep only its tail.
                    kept = segment.array[-start_silent_trail:]
                elif no == last:
                    # Trailing silence: keep only its head.
                    kept = segment.array[:start_silent_trail]
                elif segment.duration >= min_length:
                    kept = np.concatenate([
                        segment.array[:middle_silent_trail],
                        segment.array[-middle_silent_trail:],
                    ])
                else:
                    kept = segment.array

                r.append(kept)
            y_after = np.concatenate(r)

            os.makedirs(os.path.split(f_new)[0], exist_ok = True)
            sf.write(f_new, y_after, sr)
            # The marker is written last so an interrupted run is redone.
            os.makedirs(os.path.split(filename_done)[0], exist_ok = True)
            with open(filename_done, 'w') as fopen:
                json.dump('done', fopen)

        except Exception as e:
            # Best effort: report and move on to the next file.
            print(e)


@click.command()
@click.option('--file')
@click.option('--replication', default = 1)
def main(
    file,
    replication,
):
    """Trim silence for every file listed in ``--file`` that has no done-marker.

    ``--file`` is a JSON file holding a list of audio paths; the work is split
    across ``--replication`` worker processes.
    """
    if replication < 1:
        # chunks() divides by the worker count and Pool(0) is invalid.
        raise click.BadParameter('must be at least 1', param_hint = '--replication')

    devices = replication * [0]

    with open(file) as fopen:
        files = json.load(fopen)

    filtered = []
    for audio_file in tqdm(files):
        filename_done = new_path_done(audio_file)

        if os.path.exists(filename_done):
            try:
                with open(filename_done) as fdone:
                    json.load(fdone)
                continue
            except (OSError, ValueError):
                # Unreadable marker: schedule the file again.
                pass
        filtered.append(audio_file)

    if not filtered:
        return

    # Do not start workers for empty shards.
    df_split = [pair for pair in chunks(filtered, devices) if pair[0]]

    with Pool(len(df_split)) as pool:
        pool.map(loop, df_split)


if __name__ == '__main__':
    main()
main() 127 | 128 | -------------------------------------------------------------------------------- /text/pretrain-llm/prepare-translation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "e6328ada", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# !git lfs clone https://huggingface.co/datasets/mesolitica/google-translate-ms-pa\n", 11 | "# !git lfs clone https://huggingface.co/datasets/mesolitica/google-translate-ms-zh-CN\n", 12 | "# !git lfs clone https://huggingface.co/datasets/mesolitica/google-translate-ms-ta" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "id": "c6bfd5aa", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import json\n", 23 | "import os\n", 24 | "from glob import glob\n", 25 | "from tqdm import tqdm" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "id": "1e6d9ca8", 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "def partition(text, size = 500):\n", 36 | " splitted = text.split()\n", 37 | " return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "id": "c2e1d2bc", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "a = open('prepare-translation.jsonl', 'w')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "id": "15ff7cfe", 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stderr", 58 | "output_type": "stream", 59 | "text": [ 60 | "99967it [00:53, 1868.55it/s]\n", 61 | "99971it [01:00, 1646.85it/s]\n", 62 | "99968it [01:08, 1460.54it/s]\n", 63 | "99966it [00:58, 1719.68it/s]\n", 64 | "99962it [00:56, 1755.14it/s]\n", 65 | "99968it [00:19, 5100.54it/s] \n", 66 | "99959it [00:20, 4887.07it/s] \n", 67 | "99972it [00:15, 6252.46it/s] \n", 68 | "99960it [00:11, 
8494.27it/s] \n", 69 | "99974it [00:08, 11529.00it/s]\n", 70 | "99968it [00:14, 6672.96it/s] \n", 71 | "99965it [00:09, 10661.73it/s]\n", 72 | "99965it [00:09, 10525.04it/s]\n", 73 | "99959it [00:15, 6443.56it/s] \n", 74 | "99972it [00:08, 11661.01it/s]\n", 75 | "99966it [00:14, 6786.12it/s] \n", 76 | "99969it [00:22, 4412.35it/s] \n", 77 | "99972it [00:14, 6883.69it/s] \n", 78 | "99963it [00:06, 15602.76it/s]\n", 79 | "99966it [00:16, 6097.17it/s] \n", 80 | "99967it [00:11, 8717.47it/s] \n", 81 | "99970it [00:07, 13489.12it/s]\n", 82 | "99969it [00:18, 5358.87it/s] \n", 83 | "99981it [00:09, 10109.43it/s]\n", 84 | "99968it [00:07, 13383.50it/s]\n", 85 | "99966it [00:14, 7052.85it/s] \n", 86 | "99968it [00:23, 4322.65it/s] \n", 87 | "99968it [00:37, 2634.95it/s]\n", 88 | "99972it [00:36, 2704.90it/s]\n", 89 | "99958it [00:40, 2471.35it/s]\n", 90 | "99967it [00:42, 2371.77it/s]\n", 91 | "99971it [00:44, 2221.78it/s]\n", 92 | "99962it [00:39, 2532.83it/s]\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "google_translate = glob('google-translate-*/*.requested')\n", 98 | "for f in google_translate:\n", 99 | " with open(f) as fopen:\n", 100 | " for l in tqdm(fopen):\n", 101 | " try:\n", 102 | " data = '' + json.loads(l)['r']['result'] + ''\n", 103 | " partitioned = partition(data)\n", 104 | " for p in partitioned:\n", 105 | " data = {\n", 106 | " 'text': p,\n", 107 | " }\n", 108 | " a.write(f'{json.dumps(data)}\\n')\n", 109 | " a.flush()\n", 110 | " except:\n", 111 | " pass" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "ee6d3d66", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "Python 3 (ipykernel)", 126 | "language": "python", 127 | "name": "python3" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 3 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 
136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython3", 139 | "version": "3.10.12" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 5 144 | } 145 | -------------------------------------------------------------------------------- /text/extra/sample-fineweb-edu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 13, 6 | "id": "e58f50a7-ac12-4bac-ab97-ce10a1de9154", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from datasets import load_dataset\n", 11 | "from tqdm import tqdm\n", 12 | "import json" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 5, 18 | "id": "015519ff-efb4-4d80-adfd-7e826822af76", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "application/vnd.jupyter.widget-view+json": { 24 | "model_id": "dfc39a6763534034a70ba3a3e960169d", 25 | "version_major": 2, 26 | "version_minor": 0 27 | }, 28 | "text/plain": [ 29 | "Resolving data files: 0%| | 0/104 [00:00 1e7:\n", 68 | "# break" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 16, 74 | "id": "89ad1f82-8041-47cf-a640-a20ee50073d9", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "!mv fineweb-edu-dedup-sample-10M.jsonl fineweb-edu-dedup-sample-5M.jsonl" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 17, 84 | "id": "e6a3642b-5a8a-4018-ae8f-be5c9009fdbc", 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "-rw-r--r-- 1 sagemaker-user users 22G Aug 5 08:05 fineweb-edu-dedup-sample-5M.jsonl\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "!ls -lh fineweb-edu-dedup-sample-5M.jsonl" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "c4fe5407-7877-4c10-9084-e62ed97ecc8d", 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | 
"data": { 107 | "application/vnd.jupyter.widget-view+json": { 108 | "model_id": "4717bd184fde4ed094f07eca7572f2f7", 109 | "version_major": 2, 110 | "version_minor": 0 111 | }, 112 | "text/plain": [ 113 | "fineweb-edu-dedup-sample-5M.jsonl: 0%| | 0.00/23.5G [00:00 2: 43 | continue 44 | filtered_files.append(f) 45 | 46 | print(len(files), len(filtered_files)) 47 | global_size = len(filtered_files) // global_index 48 | files = filtered_files[global_size * local_index: global_size * (local_index + 1)] 49 | print(len(files)) 50 | 51 | feature_extractor = AutoFeatureExtractor.from_pretrained(model, return_attention_mask = True) 52 | model = AutoModelForAudioClassification.from_pretrained(model, torch_dtype = torch.float16).eval().cuda() 53 | id2label = model.config.id2label 54 | sr = feature_extractor.sampling_rate 55 | sliding = int(sliding * sr) 56 | audio = Audio(sampling_rate = sr) 57 | 58 | class CustomDataset(Dataset): 59 | def __init__(self, files): 60 | self.files = files 61 | 62 | def __len__(self): 63 | return len(self.files) 64 | 65 | def __getitem__(self, index): 66 | f = self.files[index] 67 | f = f['audio_filename'] 68 | y = audio.decode_example(audio.encode_example(f))['array'] 69 | timestamps = [] 70 | slided = [] 71 | for i in range(0, len(y), sliding): 72 | y_ = y[i: i + sliding] 73 | if len(y_) < 1000: 74 | continue 75 | slided.append(y[i: i + sliding]) 76 | start = i / sr 77 | end = min(len(y) / sr, (i + sliding) / sr) 78 | timestamps.append((start, end)) 79 | 80 | inputs = feature_extractor(slided, sampling_rate=sr, 81 | return_tensors="pt", return_attention_mask = True) 82 | return inputs, f, timestamps 83 | 84 | dataset = CustomDataset(files) 85 | dataloader = DataLoader(dataset, batch_size = 1, shuffle = False, prefetch_factor=10, num_workers=5) 86 | with torch.no_grad(): 87 | for row in tqdm(iter(dataloader)): 88 | inputs, f, timestamps_ = row 89 | f = f[0] 90 | timestamps = [] 91 | for t in timestamps_: 92 | timestamps.append((float(t[0]), 
float(t[1]))) 93 | 94 | inputs['input_values'] = inputs['input_values'][0].to(torch.float16).cuda() 95 | logits = model(**inputs).logits.softmax(-1) 96 | topk = torch.topk(logits, 5, dim = -1) 97 | probs = topk.values.cpu().numpy().tolist() 98 | 99 | for i in range(len(probs)): 100 | for k in range(len(probs[i])): 101 | probs[i][k] = round(probs[i][k], 4) 102 | 103 | labels = [] 104 | for row in topk.indices.cpu().numpy(): 105 | label = [id2label[r] for r in row] 106 | labels.append(label) 107 | 108 | splitted = new_path(f) 109 | os.makedirs(os.path.split(splitted)[0], exist_ok = True) 110 | with open(splitted, 'w') as fopen: 111 | json.dump({'timestamps': timestamps, 'labels': labels, 'probs': probs}, fopen) 112 | 113 | 114 | if __name__ == '__main__': 115 | function() -------------------------------------------------------------------------------- /malaysian-short-instructions/dedup-questions-intents.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "8009e792", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "76765" 13 | ] 14 | }, 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "output_type": "execute_result" 18 | } 19 | ], 20 | "source": [ 21 | "from glob import glob\n", 22 | "import json\n", 23 | "import re\n", 24 | "\n", 25 | "pattern = r\"\\d+\\.\\s(.+)\"\n", 26 | "already = set()\n", 27 | "\n", 28 | "files = glob('generate-questions-intents/*')\n", 29 | "\n", 30 | "questions = []\n", 31 | "for f in files:\n", 32 | " with open(f) as fopen:\n", 33 | " d = json.load(fopen)\n", 34 | " keyword = d['q'][0]\n", 35 | " for q in re.findall(pattern, d['r']):\n", 36 | " if q in already:\n", 37 | " continue\n", 38 | " questions.append((q, keyword))\n", 39 | " \n", 40 | "len(questions)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "id": "cc4f4bab", 47 | "metadata": {}, 48 
| "outputs": [], 49 | "source": [ 50 | "import string\n", 51 | "\n", 52 | "digits = set(string.digits)\n", 53 | "rejected = ['\\'', '\"', 'http', '\\n', '[', ']', '/', '`']\n", 54 | "\n", 55 | "def contains_non_ascii(text):\n", 56 | " return any(ord(char) > 127 for char in text)\n", 57 | "\n", 58 | "def reject_q(q):\n", 59 | " if q is None:\n", 60 | " return True\n", 61 | " if any([c in q for c in rejected]):\n", 62 | " return True\n", 63 | " if contains_non_ascii(q):\n", 64 | " return True\n", 65 | " if len(set(q) & digits):\n", 66 | " return True\n", 67 | " if len(q) < 20:\n", 68 | " return True\n", 69 | " if len(q) > 200:\n", 70 | " return True\n", 71 | " return False" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "id": "fddc3adf", 78 | "metadata": { 79 | "scrolled": true 80 | }, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "192" 86 | ] 87 | }, 88 | "execution_count": 3, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "from collections import defaultdict\n", 95 | "\n", 96 | "filtered_q = defaultdict(list)\n", 97 | "for q, k in questions:\n", 98 | " if len(q) < 10:\n", 99 | " continue\n", 100 | " if reject_q(q):\n", 101 | " continue\n", 102 | " \n", 103 | " filtered_q[k].append(q)\n", 104 | "len(filtered_q)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "id": "64cf6d6b", 111 | "metadata": { 112 | "scrolled": false 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "75010" 119 | ] 120 | }, 121 | "execution_count": 4, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "questions = []\n", 128 | "for k, v in filtered_q.items():\n", 129 | " if len(v) < 100:\n", 130 | " continue\n", 131 | " v = sorted(v, key = lambda x: len(x), reverse = True)\n", 132 | " v = [(v_, k) for v_ in v][:1000]\n", 133 | " questions.extend(v)\n", 134 | " \n", 135 | 
"len(questions)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 9, 141 | "id": "eee58538", 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "('Apa menu makanan terkenal di Gopeng?', 'food negeri pulau pinang')" 148 | ] 149 | }, 150 | "execution_count": 9, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "questions[-4]" 157 | ] 158 | } 159 | ], 160 | "metadata": { 161 | "kernelspec": { 162 | "display_name": "Python 3 (ipykernel)", 163 | "language": "python", 164 | "name": "python3" 165 | }, 166 | "language_info": { 167 | "codemirror_mode": { 168 | "name": "ipython", 169 | "version": 3 170 | }, 171 | "file_extension": ".py", 172 | "mimetype": "text/x-python", 173 | "name": "python", 174 | "nbconvert_exporter": "python", 175 | "pygments_lexer": "ipython3", 176 | "version": "3.8.10" 177 | } 178 | }, 179 | "nbformat": 4, 180 | "nbformat_minor": 5 181 | } 182 | -------------------------------------------------------------------------------- /text/processing/main.py: -------------------------------------------------------------------------------- 1 | import re 2 | import mp 3 | import time 4 | import json 5 | import random 6 | import functools 7 | from tqdm import tqdm 8 | from pathlib import Path 9 | from unidecode import unidecode 10 | from argparse import ArgumentParser 11 | import function as func 12 | 13 | 14 | def parse_arguments(): 15 | parser = ArgumentParser() 16 | parser.add_argument( 17 | "--dataset", dest="dataset", help="Dataset name", required=False 18 | ) 19 | parser.add_argument( 20 | "--url_dataset", dest="url_dataset", help="Dataset URL (jsonl)", required=False 21 | ) 22 | parser.add_argument( 23 | "--clean_file_path", 24 | dest="clean_file_path", 25 | help="Load the .jsonl file that has been cleaned instead of from huggingface", 26 | required=False, 27 | ) 28 | parser.add_argument( 29 | "--master_folder", 30 | 
def loop_process(datasets, process_type="multi"):
    """Run the per-dataset pipeline steps, then print a size report.

    Parameters
    ----------
    datasets : list or tuple
        With ``process_type="multi"`` this is a 2-item pair whose first item
        is the dataset list and whose second item is ignored (presumably the
        worker index supplied by ``mp.multiprocessing`` -- confirm against
        that helper). Otherwise it is the dataset list itself. Every entry is
        indexed as ``(dataset_name, url_or_clean_file_path)``.
    process_type : str
        ``"multi"`` to unpack the pair described above; any other value uses
        ``datasets`` directly.

    Notes
    -----
    Reads the module-level names ``master_dataset_folder``, ``mp_core`` and
    ``text_key`` that the ``__main__`` block assigns, and delegates the real
    work to the project module imported as ``func``.

    A dataset that fails is reported and skipped; it never aborts the
    remaining datasets.
    """
    if process_type == "multi":
        lst_dataset, _ = datasets
    else:
        lst_dataset = datasets

    dataset_name_lst = []
    remove_dataset_name_lst = []

    for dataset in lst_dataset:
        # Reset per iteration so the error path can never report (or drop) the
        # name of a *previous* dataset when this entry is malformed.
        dataset_name = None
        try:
            dataset_name = dataset[0]
            url_dataset = dataset[1]

            print(f"\nProcessing ... {dataset_name}\n")

            try:
                # First attempt: treat the second field as a cleaned local file.
                func.init_process(
                    raw_dataset_path=master_dataset_folder,
                    dataset_name=dataset_name,
                    clean_file_path=url_dataset,
                    text_key=text_key,
                )
            except Exception:
                # Fallback: treat the second field as a download link.
                # ``Exception`` (not a bare ``except``) keeps Ctrl-C and
                # SystemExit working.
                func.init_process(
                    raw_dataset_path=master_dataset_folder,
                    dataset_name=dataset_name,
                    link=url_dataset,
                    text_key=text_key,
                )

            func.second_process(master_dataset_folder, dataset_name)
        except Exception as e:
            # Fall back to the raw entry when even the name could not be read.
            if dataset_name is not None:
                failed_name = str(dataset_name)
            else:
                failed_name = repr(dataset)
            print(f"[ERROR] {str(e)} \n Skip {failed_name} ...")
            remove_dataset_name_lst.append(failed_name)
        else:
            # Only datasets that completed both steps reach the later stages.
            dataset_name_lst.append(dataset_name)

    if dataset_name_lst:
        func.third_process(master_dataset_folder, mp_core)

    for name in dataset_name_lst:
        before_dedup_mb, after_dedup_mb, after_post_mb = func.get_size(
            master_dataset_folder, name
        )

        print("\n\n====================")
        print(f"File Size - {name}")
        print(f"before_dedup ---> {before_dedup_mb}")
        print(f"after_dedup ---> {after_dedup_mb}")
        print(f"after_post ---> {after_post_mb}")
        print("====================\n\n")

    if remove_dataset_name_lst:
        print(f"Problem datasets:\n{','.join(remove_dataset_name_lst)}")
def new_path(f):
    """Derive the output path for an input audio path.

    Every ``.mp3`` occurrence in ``f`` becomes ``.audioset_v2`` and the first
    ``/``-separated component gets ``_audioset_v2`` appended, so results land
    in a sibling tree of the input's top-level folder.
    """
    head, *tail = f.replace('.mp3', '.audioset_v2').split('/')
    return '/'.join([head + '_audioset_v2', *tail])
model.config.id2label 36 | sr = feature_extractor.sampling_rate 37 | actual_stride = stride 38 | stride = int(stride * sr) 39 | sliding = int(sliding * sr) 40 | audio = Audio(sampling_rate = sr) 41 | 42 | files = glob(path) 43 | filtered_files = [] 44 | for f in files: 45 | new_f = new_path(f) 46 | if os.path.exists(new_f) and os.path.getsize(new_f) > 2: 47 | continue 48 | filtered_files.append(f) 49 | 50 | global_size = len(filtered_files) // global_index 51 | filtered_files = filtered_files[global_size * local_index: global_size * (local_index + 1)] 52 | files = filtered_files 53 | 54 | class CustomDataset(Dataset): 55 | def __init__(self, files): 56 | self.files = files 57 | 58 | def __len__(self): 59 | return len(self.files) 60 | 61 | def __getitem__(self, index): 62 | f = self.files[index] 63 | y = audio.decode_example(audio.encode_example(f))['array'] 64 | timestamps = [] 65 | slided = [] 66 | last_end = 0 67 | for i in range(0, len(y) - sliding + 1, stride): 68 | end = i + sliding 69 | slided.append(y[i: end]) 70 | timestamps.append((i / sr, end / sr)) 71 | last_end = end 72 | 73 | if last_end < len(y): 74 | y_ = y[last_end:] 75 | if len(y_) >= stride: 76 | slided.append(y_) 77 | timestamps.append((last_end / sr, len(y) / sr)) 78 | 79 | inputs = feature_extractor(slided, sampling_rate=sr, 80 | return_tensors="pt", return_attention_mask = True) 81 | return inputs, f, timestamps 82 | 83 | dataset = CustomDataset(files) 84 | dataloader = DataLoader(dataset, batch_size = 1, shuffle = False, prefetch_factor=10, num_workers=5) 85 | 86 | with torch.no_grad(): 87 | for row in tqdm(iter(dataloader)): 88 | inputs, f, timestamps_ = row 89 | f = f[0] 90 | timestamps = [] 91 | for t in timestamps_: 92 | timestamps.append((float(t[0]), float(t[1]))) 93 | 94 | inputs['input_values'] = inputs['input_values'][0].to(torch.float16).cuda() 95 | logits = model(inputs['input_values']).logits.cpu().numpy() 96 | logits_per_timestamp = {t: logits[no] for no, (t, _) in 
enumerate(timestamps)} 97 | logits_accumulator = defaultdict(lambda: np.zeros(logits.shape[1])) 98 | count_accumulator = defaultdict(int) 99 | 100 | for (start, end) in timestamps: 101 | for t in np.arange(start, end, actual_stride): 102 | logits_accumulator[t] += logits_per_timestamp[start] 103 | count_accumulator[t] += 1 104 | 105 | averaged_logits = {t: logits_accumulator[t] / count_accumulator[t] for t in logits_accumulator} 106 | for k in averaged_logits.keys(): 107 | averaged_logits[k] = [round(v_, 5) for v_ in averaged_logits[k]] 108 | 109 | combined = [] 110 | for k, v in averaged_logits.items(): 111 | topk = np.array(v).argsort()[-5:][::-1] 112 | scores = [float(v[i]) for i in topk] 113 | topk = [id2label[i] for i in topk] 114 | combined.append({'timestamp': k, 'topk': topk, 'scores': scores}) 115 | 116 | splitted = new_path(f) 117 | os.makedirs(os.path.split(splitted)[0], exist_ok = True) 118 | with open(splitted, 'w') as fopen: 119 | json.dump(combined, fopen) 120 | 121 | if __name__ == '__main__': 122 | function() -------------------------------------------------------------------------------- /text/llama/prepare-dataset-1024.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4c973cad", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from transformers import (\n", 11 | " AutoModelForCausalLM,\n", 12 | " AutoTokenizer,\n", 13 | " get_scheduler,\n", 14 | " default_data_collator,\n", 15 | " SchedulerType\n", 16 | ")\n", 17 | "import os\n", 18 | "import json\n", 19 | "from itertools import chain\n", 20 | "from datasets import load_dataset" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "7e56d3d6", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "block_size = 1024\n", 31 | "train_file = 'combine.jsonl'\n", 32 | "tokenizer = AutoTokenizer.from_pretrained(\n", 33 | " 
'meta-llama/Llama-2-7b-hf',\n", 34 | ")\n", 35 | "text_column_name = 'text'" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "98f1cb1b", 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "application/vnd.jupyter.widget-view+json": { 47 | "model_id": "47f4799106b9459da07783bfe46cfd03", 48 | "version_major": 2, 49 | "version_minor": 0 50 | }, 51 | "text/plain": [ 52 | "Downloading data files: 0%| | 0/1 [00:00 127 for char in text)\n", 57 | "\n", 58 | "def reject_q(q):\n", 59 | " if q is None:\n", 60 | " return True\n", 61 | " if any([c in q for c in rejected]):\n", 62 | " return True\n", 63 | " if contains_non_ascii(q):\n", 64 | " return True\n", 65 | " if len(set(q) & digits):\n", 66 | " return True\n", 67 | " if len(q) < 20:\n", 68 | " return True\n", 69 | " if len(q) > 200:\n", 70 | " return True\n", 71 | " return False" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 25, 77 | "id": "fddc3adf", 78 | "metadata": { 79 | "scrolled": true 80 | }, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "58" 86 | ] 87 | }, 88 | "execution_count": 25, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "from collections import defaultdict\n", 95 | "\n", 96 | "filtered_q = defaultdict(list)\n", 97 | "for q, k in questions:\n", 98 | " if len(q) < 10:\n", 99 | " continue\n", 100 | " if reject_q(q):\n", 101 | " continue\n", 102 | " \n", 103 | " filtered_q[k].append(q)\n", 104 | "len(filtered_q)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 31, 110 | "id": "60a9f651", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "31" 117 | ] 118 | }, 119 | "execution_count": 31, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "len(filtered_q['d3 js'])" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 
54, 131 | "id": "64cf6d6b", 132 | "metadata": { 133 | "scrolled": false 134 | }, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/plain": [ 139 | "57000" 140 | ] 141 | }, 142 | "execution_count": 54, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "questions = []\n", 149 | "for k, v in filtered_q.items():\n", 150 | " if len(v) < 100:\n", 151 | " continue\n", 152 | " v = sorted(v, key = lambda x: len(x), reverse = True)\n", 153 | " v = [(v_, k) for v_ in v][:1000]\n", 154 | " questions.extend(v)\n", 155 | " \n", 156 | "len(questions)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 55, 162 | "id": "eee58538", 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "('Bolehkah anda menerangkan langkah-langkah yang diperlukan untuk membuat sistem pengurusan penyediaan semula dalam Go yang berfungsi dengan cekap dan tahan terhadap kegagalan?',\n", 169 | " 'go distributed system')" 170 | ] 171 | }, 172 | "execution_count": 55, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "questions[0]" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 56, 184 | "id": "af927235", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "with open('dedup-questions.json', 'w') as fopen:\n", 189 | " json.dump(questions, fopen)" 190 | ] 191 | } 192 | ], 193 | "metadata": { 194 | "kernelspec": { 195 | "display_name": "Python 3 (ipykernel)", 196 | "language": "python", 197 | "name": "python3" 198 | }, 199 | "language_info": { 200 | "codemirror_mode": { 201 | "name": "ipython", 202 | "version": 3 203 | }, 204 | "file_extension": ".py", 205 | "mimetype": "text/x-python", 206 | "name": "python", 207 | "nbconvert_exporter": "python", 208 | "pygments_lexer": "ipython3", 209 | "version": "3.8.10" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 5 214 | } 215 | 
-------------------------------------------------------------------------------- /text/extra/process-snapshot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 75, 6 | "id": "05913d38", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import json\n", 11 | "from glob import glob\n", 12 | "from tqdm import tqdm" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 89, 18 | "id": "55c6365c", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "http_errors = [\n", 23 | " \"400 Bad Request\", \"401 Unauthorized\", \"402 Payment Required\", \"403 Forbidden\", \"404 Not Found\",\n", 24 | " \"405 Method Not Allowed\", \"406 Not Acceptable\", \"407 Proxy Authentication Required\", \"408 Request Timeout\",\n", 25 | " \"409 Conflict\", \"410 Gone\", \"411 Length Required\", \"412 Precondition Failed\", \"413 Payload Too Large\",\n", 26 | " \"414 URI Too Long\", \"415 Unsupported Media Type\", \"416 Range Not Satisfiable\", \"417 Expectation Failed\",\n", 27 | " \"418 I'm a teapot\", \"421 Misdirected Request\", \"422 Unprocessable Entity\", \"423 Locked\", \"424 Failed Dependency\",\n", 28 | " \"425 Too Early\", \"426 Upgrade Required\", \"428 Precondition Required\", \"429 Too Many Requests\",\n", 29 | " \"431 Request Header Fields Too Large\", \"451 Unavailable For Legal Reasons\", \"500 Internal Server Error\",\n", 30 | " \"501 Not Implemented\", \"502 Bad Gateway\", \"503 Service Unavailable\", \"504 Gateway Timeout\",\n", 31 | " \"505 HTTP Version Not Supported\", \"506 Variant Also Negotiates\", \"507 Insufficient Storage\",\n", 32 | " \"508 Loop Detected\", \"510 Not Extended\", \"511 Network Authentication Required\"\n", 33 | " ]\n", 34 | "\n", 35 | "rejected = [\n", 36 | " 'Internal Server Error',\n", 37 | " '404',\n", 38 | " '__NOEDITSECTION__',\n", 39 | " 'enter your username and password',\n", 40 | " 
'Login',\n", 41 | " 'forgotten your password',\n", 42 | " 'cookies enabled',\n", 43 | " 'sign in',\n", 44 | " 'tentang kami',\n", 45 | " 'skip to content',\n", 46 | " 'hubungi kami',\n", 47 | " 'laman utama',\n", 48 | " 'enable JavaScript in your browser.',\n", 49 | " 'The page cannot be displayed',\n", 50 | " 'site or edit the error_page',\n", 51 | " 'Hakcipta terpelihara',\n", 52 | " 'Copyright ©'\n", 53 | "]\n", 54 | "\n", 55 | "rejected.extend(http_errors)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 80, 61 | "id": "54284aa7", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "files = sorted(glob('crawl-my-website/snapshot/*.json'))" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 114, 71 | "id": "85361659", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "!rm hf-datasets/dedupe-datasets/snapshot.jsonl" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 115, 81 | "id": "ae3d0a6d", 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stderr", 86 | "output_type": "stream", 87 | "text": [ 88 | "100%|██████████| 348/348 [03:33<00:00, 1.63it/s]\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "processed = set()\n", 94 | "with open('hf-datasets/raw-datasets/snapshot.jsonl', 'w') as fopen_l:\n", 95 | " for f in tqdm(files):\n", 96 | " with open(f) as fopen:\n", 97 | " for l in fopen:\n", 98 | " l = json.loads(l)\n", 99 | " if l['url'] in processed:\n", 100 | " continue\n", 101 | "\n", 102 | " splitted = l['data'].split('\\n')\n", 103 | " splitted = [s for s in splitted if len(s) > 50]\n", 104 | " splitted = [s.strip() for s in splitted if all([r not in s for r in rejected])]\n", 105 | " if len(splitted):\n", 106 | " data = {\n", 107 | " 'url': l['url'],\n", 108 | " 'text': splitted\n", 109 | " }\n", 110 | " fopen_l.write(f'{json.dumps(data)}\\n')\n", 111 | " fopen_l.flush()\n", 112 | " \n", 113 | " processed.add(l['url'])" 114 | ] 115 | }, 116 | { 117 | 
"cell_type": "code", 118 | "execution_count": 116, 119 | "id": "8aad7de6", 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "428982" 126 | ] 127 | }, 128 | "execution_count": 116, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "len(processed)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "2059852b", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3 (ipykernel)", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.10.12" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 5 167 | } 168 | -------------------------------------------------------------------------------- /text/madlad-400-ms/postprocess-madlad-400-ms.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "id": "d9d83b0a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import json\n", 11 | "import re\n", 12 | "from tqdm import tqdm\n", 13 | "\n", 14 | "http_errors = [\n", 15 | " \"400 Bad Request\", \"401 Unauthorized\", \"402 Payment Required\", \"403 Forbidden\", \"404 Not Found\",\n", 16 | " \"405 Method Not Allowed\", \"406 Not Acceptable\", \"407 Proxy Authentication Required\", \"408 Request Timeout\",\n", 17 | " \"409 Conflict\", \"410 Gone\", \"411 Length Required\", \"412 Precondition Failed\", \"413 Payload Too Large\",\n", 18 | " \"414 URI Too Long\", \"415 Unsupported Media Type\", \"416 Range Not Satisfiable\", \"417 
Expectation Failed\",\n", 19 | " \"418 I'm a teapot\", \"421 Misdirected Request\", \"422 Unprocessable Entity\", \"423 Locked\", \"424 Failed Dependency\",\n", 20 | " \"425 Too Early\", \"426 Upgrade Required\", \"428 Precondition Required\", \"429 Too Many Requests\",\n", 21 | " \"431 Request Header Fields Too Large\", \"451 Unavailable For Legal Reasons\", \"500 Internal Server Error\",\n", 22 | " \"501 Not Implemented\", \"502 Bad Gateway\", \"503 Service Unavailable\", \"504 Gateway Timeout\",\n", 23 | " \"505 HTTP Version Not Supported\", \"506 Variant Also Negotiates\", \"507 Insufficient Storage\",\n", 24 | " \"508 Loop Detected\", \"510 Not Extended\", \"511 Network Authentication Required\"\n", 25 | " ]\n", 26 | "\n", 27 | "rejected = [\n", 28 | " 'Internal Server Error',\n", 29 | " '__NOEDITSECTION__',\n", 30 | " 'enter your username and password',\n", 31 | " 'forgotten your password',\n", 32 | " 'cookies enabled',\n", 33 | " 'enable JavaScript in your browser.',\n", 34 | " 'The page cannot be displayed',\n", 35 | " 'site or edit the error_page',\n", 36 | "]\n", 37 | "\n", 38 | "rejected.extend(http_errors)\n", 39 | "\n", 40 | "def replace_multiple(input_string, pattern =r\"\\s{6,}\", replace = ' '):\n", 41 | " return re.sub(pattern, replace, input_string)\n", 42 | "\n", 43 | "def replace(string):\n", 44 | " string = replace_multiple(string.replace('…', '.'))\n", 45 | " string = replace_multiple(string, pattern = r\"\\.{6,}\", replace = '...')\n", 46 | " return string\n", 47 | "\n", 48 | "def reject(string):\n", 49 | " if any([r in string for r in rejected]):\n", 50 | " return True\n", 51 | " return False" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 5, 57 | "id": "d714ffb4", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "f = 'madlad-400-ms.jsonl'\n", 62 | "new_f = 'madlad-400-ms.postprocessing.jsonl'" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 9, 68 | "id": "83f668c6", 69 | 
"metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stderr", 73 | "output_type": "stream", 74 | "text": [ 75 | "2232026it [08:47, 4507.46it/s]IOPub message rate exceeded.\n", 76 | "The notebook server will temporarily stop sending output\n", 77 | "to the client in order to avoid crashing it.\n", 78 | "To change this limit, set the config variable\n", 79 | "`--NotebookApp.iopub_msg_rate_limit`.\n", 80 | "\n", 81 | "Current values:\n", 82 | "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n", 83 | "NotebookApp.rate_limit_window=3.0 (secs)\n", 84 | "\n", 85 | "11158994it [43:43, 4565.13it/s]IOPub message rate exceeded.\n", 86 | "The notebook server will temporarily stop sending output\n", 87 | "to the client in order to avoid crashing it.\n", 88 | "To change this limit, set the config variable\n", 89 | "`--NotebookApp.iopub_msg_rate_limit`.\n", 90 | "\n", 91 | "Current values:\n", 92 | "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n", 93 | "NotebookApp.rate_limit_window=3.0 (secs)\n", 94 | "\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "with open(new_f, 'w') as fopen_l:\n", 100 | " with open(f) as fopen:\n", 101 | " for l in tqdm(fopen):\n", 102 | " data = json.loads(l)\n", 103 | " \n", 104 | " if isinstance(data, dict):\n", 105 | " t = data['text']\n", 106 | " else:\n", 107 | " t = data\n", 108 | "\n", 109 | " if reject(t):\n", 110 | " continue\n", 111 | "\n", 112 | " data = replace(t.strip())\n", 113 | "\n", 114 | " if len(data) < 3:\n", 115 | " continue\n", 116 | "\n", 117 | " fopen_l.write(f'{json.dumps(data)}\\n')\n", 118 | " fopen_l.flush()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "f13ad6af", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [] 128 | } 129 | ], 130 | "metadata": { 131 | "kernelspec": { 132 | "display_name": "Python 3 (ipykernel)", 133 | "language": "python", 134 | "name": "python3" 135 | }, 136 | "language_info": { 137 | "codemirror_mode": { 138 | "name": "ipython", 
139 | "version": 3 140 | }, 141 | "file_extension": ".py", 142 | "mimetype": "text/x-python", 143 | "name": "python", 144 | "nbconvert_exporter": "python", 145 | "pygments_lexer": "ipython3", 146 | "version": "3.10.12" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 5 151 | } 152 | -------------------------------------------------------------------------------- /text/pretrain-clm/from-pyarrow-to-mosaic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "id": "4b7592f7", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import mp\n", 11 | "import os\n", 12 | "import pyarrow as pa\n", 13 | "import numpy as np\n", 14 | "from streaming import MDSWriter\n", 15 | "from tqdm import tqdm" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 6, 21 | "id": "e0391f83", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from streaming.base.format.mds.encodings import Encoding, _encodings\n", 26 | "\n", 27 | "class Int32(Encoding):\n", 28 | " def encode(self, obj) -> bytes:\n", 29 | " return obj.tobytes()\n", 30 | "\n", 31 | " def decode(self, data: bytes):\n", 32 | " return np.frombuffer(data, np.int32)\n", 33 | "\n", 34 | "_encodings['int32'] = Int32" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 7, 40 | "id": "62ddb05a", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "columns = {\n", 45 | " 'input_ids': 'int32',\n", 46 | " 'token_type_ids': 'int32',\n", 47 | " 'attention_mask': 'int32',\n", 48 | " 'labels': 'int32',\n", 49 | "}\n", 50 | "compression = 'zstd'\n", 51 | "hashes = 'sha1', 'xxh64'" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 8, 57 | "id": "e817fcc5", 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/plain": [ 63 | "['combine-lm_00017_of_00020.jsonl-grouped-4096',\n", 64 | " 
'combine-lm_00005_of_00020.jsonl-grouped-4096',\n", 65 | " 'combine-lm_00008_of_00020.jsonl-grouped-4096',\n", 66 | " 'combine-lm_00012_of_00020.jsonl-grouped-4096',\n", 67 | " 'combine-lm_00007_of_00020.jsonl-grouped-4096',\n", 68 | " 'combine-lm_00014_of_00020.jsonl-grouped-4096',\n", 69 | " 'combine-lm_00006_of_00020.jsonl-grouped-4096',\n", 70 | " 'combine-lm_00013_of_00020.jsonl-grouped-4096',\n", 71 | " 'combine-lm_00016_of_00020.jsonl-grouped-4096',\n", 72 | " 'combine-lm_00011_of_00020.jsonl-grouped-4096',\n", 73 | " 'combine-lm_00018_of_00020.jsonl-grouped-4096',\n", 74 | " 'combine-lm_00002_of_00020.jsonl-grouped-4096',\n", 75 | " 'combine-lm_00009_of_00020.jsonl-grouped-4096',\n", 76 | " 'combine-lm_00019_of_00020.jsonl-grouped-4096',\n", 77 | " 'combine-lm_00001_of_00020.jsonl-grouped-4096',\n", 78 | " 'combine-lm_00003_of_00020.jsonl-grouped-4096',\n", 79 | " 'combine-lm_00015_of_00020.jsonl-grouped-4096',\n", 80 | " 'combine-lm_00004_of_00020.jsonl-grouped-4096',\n", 81 | " 'combine-lm_00000_of_00020.jsonl-grouped-4096',\n", 82 | " 'combine-lm_00010_of_00020.jsonl-grouped-4096']" 83 | ] 84 | }, 85 | "execution_count": 8, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "from glob import glob\n", 92 | "\n", 93 | "files = glob('combine-lm_*_of_00020.jsonl-grouped-4096')\n", 94 | "files" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 16, 100 | "id": "8a3e0890", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "def loop(files):\n", 105 | " files, index = files\n", 106 | " out_root = f'tokenized-{index}'\n", 107 | " os.system(f'rm -rf {out_root}')\n", 108 | " with MDSWriter(out=out_root, columns=columns, compression=compression, hashes=hashes, \n", 109 | " size_limit = 67108864 * 2) as out:\n", 110 | " for f in files:\n", 111 | " memory_mapped_stream = pa.memory_map(f)\n", 112 | " opened_stream = pa.ipc.open_stream(memory_mapped_stream)\n", 113 | " for a in 
tqdm(opened_stream):\n", 114 | " s = a.to_struct_array()\n", 115 | " for i in range(len(s)):\n", 116 | " keys = list(s[i])\n", 117 | " a_ = {}\n", 118 | " for k in keys:\n", 119 | " a_[k] = np.array(s[i][k].as_py()).astype(np.int32)\n", 120 | " out.write(a_)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "876289e4", 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stderr", 131 | "output_type": "stream", 132 | "text": [ 133 | "2570it [05:50, 7.30it/s]\n", 134 | "7464it [06:35, 18.87it/s]\n", 135 | "7464it [07:57, 15.62it/s]\n", 136 | "7464it [08:06, 15.36it/s]\n", 137 | "7464it [08:11, 15.20it/s]\n", 138 | "7464it [12:20, 10.08it/s]\n", 139 | "5816it [13:12, 7.64it/s]" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "mp.multiprocessing(files, loop, cores = 20, returned = False)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "c9aba12a", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": "Python 3 (ipykernel)", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | "nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.10.12" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 5 177 | } 178 | -------------------------------------------------------------------------------- /text/text_dedup/utils/hashfunc.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import struct 3 | from hashlib import md5 4 | from hashlib import sha256 5 | 6 | import xxhash 7 | from xxhash import xxh3_64 8 | from xxhash import xxh3_64_digest 9 | from xxhash import xxh3_128 10 | from xxhash 
def md5_hexdigest(data: bytes) -> str:
    """
    Generate a md5 hex hash from the given data.

    Parameters
    ----------
    data : bytes
        The data to be hashed.

    Returns
    -------
    str
        The hex hash value.

    Examples
    --------
    >>> md5_hexdigest(b"hello world")
    '5eb63bbbe01eeed093cb22bb8f5acdc3'
    >>> len(md5_hexdigest(b"hello world"))
    32
    """
    return md5(data).hexdigest()


def sha1_hash(data: bytes, d: int = 32) -> int:
    """
    Generate a d-bit hash value from the given data.

    The value is the first ``d // 8`` bytes of the SHA-1 digest read as a
    little-endian unsigned integer.

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    d : int
        The number of bits of the hash value.

    Returns
    -------
    int
        The hash value.

    Examples
    --------
    >>> sha1_hash(b"hello world", 32)
    896314922
    >>> sha1_hash(b"hello world", 64)
    13028719972609469994
    >>> sha1_hash(b"hello world", 128)
    310522945683037930239412421226792791594
    """
    digest = hashlib.sha1(data).digest()
    if d == 32:
        return struct.unpack("<I", digest[:4])[0]
    if d == 64:
        return struct.unpack("<Q", digest[:8])[0]
    # struct has no format code for other widths, so fall back to
    # int.from_bytes with the same little-endian byte order.
    return int.from_bytes(digest[: d // 8], byteorder="little")


def sha256_hexdigest(data: bytes) -> str:
    """
    Generate a sha256 hex hash from the given data.

    Parameters
    ----------
    data : bytes
        The data to be hashed.

    Returns
    -------
    str
        The hex hash value.

    Examples
    --------
    >>> sha256_hexdigest(b"hello world")
    'b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9'
    >>> len(sha256_hexdigest(b"hello world"))
    64
    """
    return sha256(data).hexdigest()


def xxh3_16hash(data: bytes, seed: int = 0) -> int:
    """
    Generate a 16-bit xxhash based hash value from the given data.
    As of python xxhash 3.3.0 (and since 0.3.0) outputs in big-endian.
    This is useful as a special purpose xxhash when you only want 16 bits.
    bit masked xxh3_64 hashes are faster than xxh32 in modern systems.

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    seed : int
        xxhashes can all be seeded. Default is int=0

    Returns
    -------
    int
        The hash value.

    Examples
    --------
    >>> xxh3_16hash(b"hello world")
    39051
    >>> xxh3_16hash(b"hello world",seed=42)
    13198
    >>> xxh3_16hash(b"hello world",seed=-42)
    34281
    """
    # Keep only the low 16 bits of the 64-bit digest.
    return xxhash.xxh3_64_intdigest(data, seed) & 0xFFFF


def xxh3_32hash(data: bytes, seed: int = 0) -> int:
    """
    Generate a 32-bit xxhash based hash value from the given data.
    As of python xxhash 3.3.0 (and since 0.3.0) outputs in big-endian.
    This is useful as a special purpose xxhash when you only want 32bits.
    bit masked xxh3_64 hashes are faster than xxh32 in modern systems.

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    seed : int
        xxhashes can all be seeded. Default is int=0

    Returns
    -------
    int
        The hash value.

    Examples
    --------
    >>> xxh3_32hash(b"hello world")
    1088854155
    >>> xxh3_32hash(b"hello world",seed=42)
    3913102222
    >>> xxh3_32hash(b"hello world",seed=-42)
    3721037289
    """
    # Keep only the low 32 bits of the 64-bit digest.
    return xxhash.xxh3_64_intdigest(data, seed) & 0xFFFFFFFF


def xxh3_hash(data: bytes, d: int = 32) -> int:
    """
    Generate a d-bit xxhash based hash value from the given data.
    As of python xxhash 3.3.0 (and since 0.3.0) outputs in big-endian.
    This is useful as a general purpose xxhash that can take multiple `d` values

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    d : int
        The number of bits of the hash value.
        According to this value, chooses empirically found best xxh3 hasher.

    Returns
    -------
    int
        The hash value.

    Examples
    --------
    >>> xxh3_hash(b"hello world", 32)
    1088854155
    >>> xxh3_hash(b"hello world", 64)
    15296390279056496779
    >>> xxh3_hash(b"hello world", 128)
    297150157938599054391163723952090887879
    """
    if d == 32:
        # with sse2 or later, xxh3 is much faster
        # with avx, the difference is much larger
        return xxhash.xxh3_64_intdigest(data) & 0xFFFFFFFF
    if d == 64:
        return xxhash.xxh3_64_intdigest(data)
    if d == 128:
        return xxhash.xxh3_128_intdigest(data)
    # fall back: first d // 8 bytes of the 128-bit digest, read big-endian
    return int.from_bytes(xxhash.xxh3_128_digest(data)[: d // 8], byteorder="big")
'combine-mistral.jsonl'\n", 41 | "tokenizer = AutoTokenizer.from_pretrained(\n", 42 | " 'mistralai/Mistral-7B-v0.1',\n", 43 | ")\n", 44 | "tokenizer.add_bos_token = False\n", 45 | "tokenizer.add_eos_token = False\n", 46 | "text_column_name = 'text'" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 6, 52 | "id": "0c31ee11", 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "application/vnd.jupyter.widget-view+json": { 58 | "model_id": "86feb593de8d41089a848a49fdd7d95e", 59 | "version_major": 2, 60 | "version_minor": 0 61 | }, 62 | "text/plain": [ 63 | "Downloading data files: 0%| | 0/1 [00:00 1:\n", 125 | " audio_np = audio_np.mean(axis=1)\n", 126 | " if audio_np.shape[0] < 10000:\n", 127 | " continue\n", 128 | " sf.write(audio_filename, audio_np, sr)\n", 129 | " \n", 130 | " data.append({\n", 131 | " 'audio_filename': audio_filename,\n", 132 | " 'text': t,\n", 133 | " 'speaker': f\"{base}\"\n", 134 | " })\n", 135 | " except Exception as e:\n", 136 | " pass\n", 137 | " \n", 138 | " return data" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 16, 144 | "id": "a66ecfc4", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# data = loop((files[:1], 0))" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "935c7e8c", 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stderr", 159 | "output_type": "stream", 160 | "text": [ 161 | " 7%|▋ | 145/2205 [00:18<03:34, 9.59it/s]" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "data = multiprocessing(files, loop, cores = len(files))" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "d5fb3e9a", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3 (ipykernel)", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 
def _read_jsonl(path):
    """Return the records of a JSON Lines file as a list, one per line."""
    with open(path) as fopen:
        return [json.loads(line) for line in fopen]


def download_dataset(link, raw_dataset_path, dataset_name):
    """
    Download ``link`` to ``{raw_dataset_path}/raw-datasets/{dataset_name}.jsonl``.

    Also sets the module-level ``MAIN_FOLDER_DATASET`` to the raw-datasets
    folder, which ``init_process`` and ``get_size`` read afterwards.

    Returns ``True`` when the helper calls completed without raising and
    ``False`` otherwise.
    NOTE(review): whether ``ut.run_command`` raises on a non-zero wget exit
    is not visible here, so ``True`` may not guarantee a complete file.
    """
    global MAIN_FOLDER_DATASET

    MAIN_FOLDER_DATASET = f"{raw_dataset_path}/raw-datasets/"
    try:
        ut.create_dir(MAIN_FOLDER_DATASET)

        command = f"wget {link} -O {MAIN_FOLDER_DATASET}/{dataset_name}.jsonl"
        ut.run_command(command)
    except Exception as err:
        # Best-effort by design: report the failure to the caller instead of
        # raising, but no longer swallow KeyboardInterrupt/SystemExit.
        print(f"download failed for {link}: {err}")
        return False
    return True


def init_process(
    raw_dataset_path, dataset_name, text_key=None, link=None, clean_file_path=None
):
    """
    Normalise a raw JSONL dataset into ``{"text": ...}`` records.

    The input is downloaded from ``link`` and/or read from
    ``clean_file_path``; when both are given, the records from
    ``clean_file_path`` are the ones processed. For every record, the values
    of all recognised text fields are converted to ``str`` and joined with a
    blank line, in the record's own key order. The result is written to
    ``{raw_dataset_path}/staging-datasets/{dataset_name}.jsonl``.

    Parameters
    ----------
    raw_dataset_path : str
        Root folder under which the working folders are created.
    dataset_name : str
        Base name (without extension) of the dataset files.
    text_key : list of str or str, optional
        Extra field names to treat as text in addition to the built-in list.
    link : str, optional
        URL of a JSONL file to download first.
    clean_file_path : str, optional
        Path of an existing local JSONL file.

    Raises
    ------
    ValueError
        If neither ``link`` nor ``clean_file_path`` is given.
    Exception
        If the file is empty, its first record is not a JSON object, or the
        first record has none of the recognised text fields.
    """
    global INITIAL_PRE_PROCESSING_FOLDER
    global MAIN_FOLDER_DATASET

    if link is None and clean_file_path is None:
        raise ValueError("either link or clean_file_path must be provided")

    INITIAL_PRE_PROCESSING_FOLDER = f"{raw_dataset_path}/staging-datasets/"
    ut.create_dir(INITIAL_PRE_PROCESSING_FOLDER)

    if link is not None:
        # The result is deliberately not checked: a failed download surfaces
        # as an error when the file is opened just below.
        download_dataset(link, raw_dataset_path, dataset_name)
        data = _read_jsonl(f"{MAIN_FOLDER_DATASET}/{dataset_name}.jsonl")

    if clean_file_path is not None:
        # get_size() falls back to this path when sizing the original input.
        MAIN_FOLDER_DATASET = clean_file_path
        data = _read_jsonl(clean_file_path)

    try:
        key_data = list(data[0].keys())
    except (AttributeError, IndexError) as err:
        raise Exception(
            f"dataset not in standard list format, total record in the file -> {len(data)}."
        ) from err
    print(f"Availble key -> {key_data}")

    suitable_key = [
        "p",
        "text",
        "article_text",
        "article_body",
        "content",
        "contents",
        "body",
        "articleBody",
        "data",
        "title",
    ]

    if text_key:
        if isinstance(text_key, str):
            text_key = [text_key]
        # Append caller-supplied keys without duplicates and in a stable
        # order, so the error message below is deterministic.
        suitable_key += [key for key in text_key if key not in suitable_key]

    allowed = set(suitable_key)

    if not any(key in allowed for key in key_data):
        raise Exception(
            f"dataset not in standard key-value. must have ({' | '.join(suitable_key)})"
        )

    txt_l = []
    for record in tqdm(data):
        # str() also turns a JSON null into the literal text "None".
        parts = [str(value) for key, value in record.items() if key in allowed]
        txt_l.append({"text": "\n\n".join(parts)})

    ut.write_to_json(txt_l, f"{INITIAL_PRE_PROCESSING_FOLDER}{dataset_name}.jsonl")


def second_process(raw_dataset_path, dataset_name):
    """
    Save the staged dataset as a Hugging Face dataset and deduplicate it.

    Reads the staging file written by ``init_process`` (through the
    module-level ``INITIAL_PRE_PROCESSING_FOLDER``), drops ``null`` records,
    saves the ``text`` column under ``hf-datasets/raw-datasets/`` and then
    runs ``text_dedup.minhash`` on it with output under
    ``hf-datasets/dedupe-datasets/``. Sets the module-level ``HF_FOLDER_RAW``
    and ``HF_FOLDER_DEDUPE``.
    """
    global HF_FOLDER_RAW
    global HF_FOLDER_DEDUPE

    HF_FOLDER_RAW = f"{raw_dataset_path}/hf-datasets/raw-datasets/"
    HF_FOLDER_DEDUPE = f"{raw_dataset_path}/hf-datasets/dedupe-datasets/"

    ut.create_dir(HF_FOLDER_RAW)
    ut.create_dir(HF_FOLDER_DEDUPE)

    data = _read_jsonl(f"{INITIAL_PRE_PROCESSING_FOLDER}/{dataset_name}.jsonl")

    print(f"total records: {len(data)}")

    data = [entry for entry in tqdm(data) if entry is not None]

    print(f"total records after remove None: {len(data)}")

    dataset = Dataset.from_dict({"text": [entry["text"] for entry in data]})
    dataset.save_to_disk(f"{HF_FOLDER_RAW}{dataset_name}")

    command = " ".join(
        [
            "python3 -m text_dedup.minhash",
            f"--path {HF_FOLDER_RAW}{dataset_name}",
            "--split train",
            "--cache_dir ./cache",
            f"--output {HF_FOLDER_DEDUPE}{dataset_name}",
            "--column text",
            "--batch_size 10000",
            "--threshold 0.95",
            "--min_length 1",
            "--local",
        ]
    )

    ut.run_command(command)


def third_process(raw_dataset_path, mp_core):
    """
    Run ``ut.loop`` over every deduplicated ``*.jsonl`` file.

    The files are looked up in the module-level ``HF_FOLDER_DEDUPE`` set by
    ``second_process``. With fewer files than ``mp_core`` they are handled by
    one direct ``ut.loop`` call; otherwise they are handed to
    ``mp.multiprocessing`` with ``mp_core`` cores.

    Raises
    ------
    ValueError
        If ``mp_core`` is smaller than 1.
    """
    if mp_core < 1:
        raise ValueError(f"mp_core must be at least 1, got {mp_core}")

    hf_folder_postprocessing = f"{raw_dataset_path}/hf-datasets/postprocessing/"
    hf_folder_postprocessing_done = (
        f"{raw_dataset_path}/hf-datasets/postprocessing-done/"
    )

    ut.create_dir(hf_folder_postprocessing)
    ut.create_dir(hf_folder_postprocessing_done)

    files_lst = glob(f"{HF_FOLDER_DEDUPE}*.jsonl")

    print(f"total files to postprocessing --> {len(files_lst)}")

    if len(files_lst) < mp_core:
        ut.loop(files_lst, process_type="single")
    else:
        mp.multiprocessing(files_lst, ut.loop, cores=mp_core, returned=False)


def get_size(raw_dataset_path, dataset_name):
    """
    Report the dataset's file size before dedup, after dedup and after
    postprocessing.

    The "before" size is taken from ``{MAIN_FOLDER_DATASET}/{dataset_name}.jsonl``;
    if that path cannot be stat-ed, ``MAIN_FOLDER_DATASET`` itself is used
    (``init_process`` stores the clean file's path there).

    Returns
    -------
    tuple of str
        Three strings formatted as ``"<size> MB"`` with two decimals.

    Raises
    ------
    OSError
        If the fallback, deduplicated or postprocessed file cannot be stat-ed.
    """
    mib = 1024 * 1024
    before_dedup_url = f"{MAIN_FOLDER_DATASET}/{dataset_name}.jsonl"
    before_dedup_clean = f"{MAIN_FOLDER_DATASET}"
    after_dedup = f"{HF_FOLDER_DEDUPE}{dataset_name}.jsonl"
    after_post = f"{raw_dataset_path}/hf-datasets/postprocessing/{dataset_name}.jsonl"

    try:
        before_dedup_mb = os.stat(before_dedup_url).st_size / mib
    except OSError:
        before_dedup_mb = os.stat(before_dedup_clean).st_size / mib

    after_dedup_mb = os.stat(after_dedup).st_size / mib
    after_post_mb = os.stat(after_post).st_size / mib

    return (
        f"{before_dedup_mb:.2f} MB",
        f"{after_dedup_mb:.2f} MB",
        f"{after_post_mb:.2f} MB",
    )