├── text
    ├── hf-datasets
    │   ├── raw-datasets
    │   │   └── .gitkeep
    │   ├── dedupe-datasets
    │   │   └── .gitkeep
    │   ├── postprocessing
    │   │   └── .gitkeep
    │   └── postprocessing-done
    │   │   └── .gitkeep
    ├── pretrain-llm
    │   ├── how-to-mosaic.png
    │   ├── README.md
    │   ├── prepare-madlad-400-ms.ipynb
    │   ├── prepare-starcoder.ipynb
    │   └── prepare-translation.ipynb
    ├── text_dedup
    │   ├── __init__.py
    │   └── utils
    │   │   ├── preprocess.py
    │   │   ├── __init__.py
    │   │   ├── union_find.py
    │   │   ├── tokenization.py
    │   │   ├── timer.py
    │   │   ├── analysis.py
    │   │   └── hashfunc.py
    ├── processing
    │   ├── text_dedup
    │   │   ├── __init__.py
    │   │   └── utils
    │   │   │   ├── preprocess.py
    │   │   │   ├── __init__.py
    │   │   │   ├── union_find.py
    │   │   │   ├── tokenization.py
    │   │   │   ├── timer.py
    │   │   │   ├── analysis.py
    │   │   │   └── hashfunc.py
    │   ├── README.md
    │   ├── utils.py
    │   ├── main.py
    │   └── function.py
    ├── yi
    │   └── README.md
    ├── mistral
    │   ├── README.md
    │   └── run-tokenizer.ipynb
    ├── tinyllama
    │   └── README.md
    ├── llama
    │   ├── README.md
    │   ├── prepare-dataset-1024.ipynb
    │   ├── prepare-dataset-2048.ipynb
    │   └── prepare-tokenizer.ipynb
    ├── pretrain-clm
    │   ├── README.md
    │   └── from-pyarrow-to-mosaic.ipynb
    ├── extra
    │   ├── process-lowyat.ipynb
    │   ├── process-data.gov.my.ipynb
    │   ├── sample-fineweb-edu.ipynb
    │   └── process-snapshot.ipynb
    ├── README.md
    ├── compare-tokens.ipynb
    ├── .gitignore
    └── madlad-400-ms
    │   ├── prepare-madlad-400-ms.ipynb
    │   ├── dedup-madlad-400-ms.ipynb
    │   └── postprocess-madlad-400-ms.ipynb
├── multilingual-tts
    ├── prepare
    │   ├── prepare-CORAA-MUPE-ASR.ipynb
    │   ├── prepare-ParlaSpeech-CZ.ipynb
    │   ├── prepare-ParlaSpeech-HR.ipynb
    │   ├── prepare-ParlaSpeech-PL.ipynb
    │   ├── prepare-WenetSpeech4TTS.ipynb
    │   └── prepare-MasriSpeech-Full.ipynb
    ├── README.md
    ├── embedding.py
    ├── convert_neucodec.py
    └── trim_silence.py
├── README.md
├── stt-whisper
    ├── .gitignore
    ├── README.md
    ├── force_alignment.py
    └── audioset_sliding.py
├── malaysian-short-instructions
    ├── .gitignore
    ├── keyword-location
    ├── negeri
    ├── keywords
    ├── dedup-questions-intents.ipynb
    └── dedup-questions.ipynb
├── speech-instructions
    ├── .gitignore
    ├── README.md
    ├── generate.sh
    ├── embedding.py
    ├── prepare-malaysian-podcast.ipynb
    ├── remote.sh
    ├── prepare-malaysian-others.ipynb
    └── prepare-malaysia-parliament.ipynb
├── emotional-malaysian-emilia
    ├── README.md
    ├── pitch_estimation.py
    ├── audioset_sliding.py
    └── audioset_sliding_v2.py
├── emilia-yodas
    ├── README.md
    └── convert_neucodec_emilia.py
├── LICENSE
├── speech-instructions-extra
    └── upload.ipynb
└── .gitignore


/text/hf-datasets/raw-datasets/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/text/hf-datasets/dedupe-datasets/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/text/hf-datasets/postprocessing/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/text/hf-datasets/postprocessing-done/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/multilingual-tts/prepare/prepare-CORAA-MUPE-ASR.ipynb:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/multilingual-tts/prepare/prepare-ParlaSpeech-CZ.ipynb:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/multilingual-tts/prepare/prepare-ParlaSpeech-HR.ipynb:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/multilingual-tts/prepare/prepare-ParlaSpeech-PL.ipynb:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Dataset
2 | 
3 | Our recipes to prepare datasets.


--------------------------------------------------------------------------------
/stt-whisper/.gitignore:
--------------------------------------------------------------------------------
1 | *.json
2 | force_alignment
3 | *.parquet


--------------------------------------------------------------------------------
/malaysian-short-instructions/.gitignore:
--------------------------------------------------------------------------------
1 | generate-questions*
2 | generate-answers*
3 | *.json
4 | *.parquet


--------------------------------------------------------------------------------
/text/pretrain-llm/how-to-mosaic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malaysia-ai/dataset/HEAD/text/pretrain-llm/how-to-mosaic.png


--------------------------------------------------------------------------------
/multilingual-tts/README.md:
--------------------------------------------------------------------------------
1 | # Multilingual-TTS
2 | 
3 | Gathers multilingual TTS datasets; everything is pushed to https://huggingface.co/datasets/malaysia-ai/Multilingual-TTS.


--------------------------------------------------------------------------------
/malaysian-short-instructions/keyword-location:
--------------------------------------------------------------------------------
 1 | food
 2 | attraction
 3 | lifestyle
 4 | culture
 5 | shopping mall
 6 | agama
 7 | socioeconomy
 8 | peluang pekerjaan
 9 | infrastructure
10 | education
11 | technology
12 | business


--------------------------------------------------------------------------------
/speech-instructions/.gitignore:
--------------------------------------------------------------------------------
 1 | *.parquet
 2 | embedding*
 3 | *.json
 4 | *.jsonl
 5 | dedup-parliament
 6 | dedup-podcasts
 7 | dedup-others
 8 | partition-instructions-part*
 9 | tatabahasa*
10 | mallm*
11 | short-coding-*
12 | malaymmlu*


--------------------------------------------------------------------------------
/text/text_dedup/__init__.py:
--------------------------------------------------------------------------------
 1 | """Text deduplication simplified."""
 2 | 
 3 | import logging
 4 | 
 5 | from rich.logging import RichHandler
 6 | 
# Package-wide logger, registered under the fixed name "text_dedup".
logger = logging.getLogger("text_dedup")
# Only records at INFO level and above are handled.
logger.setLevel(logging.INFO)
# Route output through rich's handler, with rich traceback rendering enabled.
logger.addHandler(RichHandler(rich_tracebacks=True))
# Keep records from also being passed to handlers of ancestor loggers.
logger.propagate = False


--------------------------------------------------------------------------------
/text/processing/text_dedup/__init__.py:
--------------------------------------------------------------------------------
 1 | """Text deduplication simplified."""
 2 | 
 3 | import logging
 4 | 
 5 | from rich.logging import RichHandler
 6 | 
# Package-wide logger, registered under the fixed name "text_dedup".
logger = logging.getLogger("text_dedup")
# Only records at INFO level and above are handled.
logger.setLevel(logging.INFO)
# Route output through rich's handler, with rich traceback rendering enabled.
logger.addHandler(RichHandler(rich_tracebacks=True))
# Keep records from also being passed to handlers of ancestor loggers.
logger.propagate = False


--------------------------------------------------------------------------------
/malaysian-short-instructions/negeri:
--------------------------------------------------------------------------------
 1 | negeri johor
 2 | negeri kedah
 3 | negeri kelantan
 4 | negeri melaka
 5 | negeri negeri sembilan
 6 | negeri pahang
 7 | negeri perak
 8 | negeri perlis
 9 | negeri pulau pinang
10 | negeri selangor
11 | negeri terengganu
12 | negeri sabah
13 | negeri sarawak
14 | kuala lumpur
15 | negeri labuan
16 | putrajaya


--------------------------------------------------------------------------------
/emotional-malaysian-emilia/README.md:
--------------------------------------------------------------------------------
 1 | # Emotional Malaysian Emilia
 2 | 
 3 | Synthetic emotion labels for Malaysian Emilia.
 4 | 
 5 | ## how to
 6 | 
 7 | ### Predict Audioset sliding window
 8 | 
 9 | ```bash
10 | CUDA_VISIBLE_DEVICES=0 \
11 | python3 audioset_sliding_v2.py --path 'malaysian-podcast_processed/**/*.mp3' --global-index 1 --local-index 0
12 | ```
13 | 
14 | ### Predict Emotion


--------------------------------------------------------------------------------
/emilia-yodas/README.md:
--------------------------------------------------------------------------------
 1 | ## Convert to audio tokens
 2 | 
 3 | ```bash
 4 | OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 \
 5 | python3 convert_neucodec_batch.py --file 'emilia-audio.json' --replication 2
 6 | ```
 7 | 
 8 | But we prefer to use [convert_neucodec_emilia.py](convert_neucodec_emilia.py) in GH200,
 9 | 
10 | ```bash
11 | OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 python3 convert_neucodec_emilia.py --file 'emilia-audio.json' --replication 13
12 | ```
13 | 
14 | Way faster!


--------------------------------------------------------------------------------
/text/yi/README.md:
--------------------------------------------------------------------------------
 1 | # Prepare dataset for Yi FPF
 2 | 
 3 | This step prepares the dataset for FPF Yi models.
 4 | 
 5 | ## how-to
 6 | 
 7 | 1. Run [combine-dataset.ipynb](combine-dataset.ipynb),
 8 | 
 9 | This will combine most datasets into 1 JSONL file.
10 | 
11 | - 41 GB.
12 | 
13 | 2. Run [convert-mosaic.ipynb](prepare-tokenizer.ipynb),
14 | 
15 | This will tokenize the dataset and convert it into Mosaic format.
16 | 
17 | 3. Run [combine-mosaic-all.ipynb](combine-mosaic-all.ipynb),
18 | 
19 | This will combine all mosaic partitions into one mosaic folder, total 14114934784 tokens.


--------------------------------------------------------------------------------
/text/mistral/README.md:
--------------------------------------------------------------------------------
 1 | # Prepare dataset for Mistral FPF
 2 | 
 3 | This step prepares the dataset for the FPF Mistral model.
 4 | 
 5 | ## how-to
 6 | 
 7 | 1. Run [mistral/combine-mistral.ipynb](mistral/combine-mistral.ipynb),
 8 | 
 9 | This will combine most datasets into 1 JSONL file.
10 | 
11 | - 32.6 GB.
12 | 
13 | 2. Run [prepare-tokenizer.ipynb](prepare-tokenizer.ipynb),
14 | 
15 | This will tokenize and cache the dataset.
16 | 
17 | 3. Run [prepare-dataset-4096.ipynb](prepare-dataset-4096.ipynb),
18 | 
19 | This will partition the tokenized dataset into 4096 context length.


--------------------------------------------------------------------------------
/text/tinyllama/README.md:
--------------------------------------------------------------------------------
 1 | # Prepare dataset for TinyLlama FPF
 2 | 
 3 | This step prepares the dataset for FPF TinyLlama models.
 4 | 
 5 | ## how-to
 6 | 
 7 | 1. Run [combine-dataset.ipynb](combine-dataset.ipynb),
 8 | 
 9 | This will combine most datasets into 1 JSONL file.
10 | 
11 | - 41 GB.
12 | 
13 | 2. Run [convert-mosaic.ipynb](prepare-tokenizer.ipynb),
14 | 
15 | This will tokenize the dataset and convert it into Mosaic format.
16 | 
17 | 3. Run [combine-mosaic-all.ipynb](combine-mosaic-all.ipynb),
18 | 
19 | This will combine all mosaic partitions into one mosaic folder, total 14349328384 tokens.


--------------------------------------------------------------------------------
/text/llama/README.md:
--------------------------------------------------------------------------------
 1 | # Prepare dataset for Llama2 FPF
 2 | 
 3 | This step prepares the dataset for FPF Llama2 models.
 4 | 
 5 | ## how-to
 6 | 
 7 | 1. Run [combine-v2.ipynb](combine-v2.ipynb),
 8 | 
 9 | This will combine most datasets into 1 JSONL file.
10 | 
11 | - 31.4 GB.
12 | 
13 | 2. Run [prepare-tokenizer.ipynb](prepare-tokenizer.ipynb),
14 | 
15 | This will tokenize and cache the dataset.
16 | 
17 | 3. Run [prepare-dataset-2048.ipynb](prepare-dataset-2048.ipynb),
18 | 
19 | This will partition the tokenized dataset into 2048 context length.
20 | 
21 | 4. Run [prepare-dataset-32768.ipynb](prepare-dataset-32768.ipynb),
22 | 
23 | This will partition the tokenized dataset into 32768 context length.


--------------------------------------------------------------------------------
/text/pretrain-clm/README.md:
--------------------------------------------------------------------------------
 1 | # Pretrain CLM
 2 | 
 3 | This is to pretrain 100M - 500M parameters CLM. All steps done using Standard_F48s_v2 node size.
 4 | 
 5 | This step prepares the dataset for pretraining models from scratch.
 6 | 
 7 | ## how-to
 8 | 
 9 | 1. Run [pretrain/combine-lm.ipynb](pretrain/combine-lm.ipynb),
10 | 
11 | This will combine all datasets into 1 JSONL file.
12 | 
13 | - 81 GB.
14 | - 16994238464 tokens.
15 | 
16 | 2. Run [pretrain/tokenizer-4096.ipynb](pretrain/tokenizer-4096.ipynb),
17 | 
18 | This will tokenize the dataset and partition it into 4096 context length.
19 | 
20 | 3. Run [pretrain/from-pyarrow-to-mosaic.ipynb](pretrain/from-pyarrow-to-mosaic.ipynb),
21 | 
22 | This will convert PyArrow streaming format into MosaicML streaming format.
23 | 
24 | 4. Run [pretrain/combine-mosaicml.ipynb](pretrain/combine-mosaicml.ipynb),
25 | 
26 | This will combine multiple MosaicML streaming folders into 1 folder.


--------------------------------------------------------------------------------
/malaysian-short-instructions/keywords:
--------------------------------------------------------------------------------
 1 | react js
 2 | vue js
 3 | vanilla javascript
 4 | websocket
 5 | node js
 6 | svelte
 7 | next js
 8 | express js
 9 | angular js
10 | jquery
11 | d3 js
12 | python matplotlib
13 | python pandas
14 | python dask
15 | python scipy
16 | python numpy
17 | python keras
18 | python flask
19 | python fastapi
20 | python request
21 | python async
22 | python scikit learn
23 | python dask
24 | python distributed system
25 | pytorch
26 | pyspark
27 | apache spark
28 | apache hadoop
29 | apache hive
30 | apache kafka
31 | apache yarn
32 | apache flink
33 | apache cassandra
34 | apache airflow
35 | apache druid
36 | c++
37 | java
38 | rust
39 | kotlin
40 | swift
41 | cuda
42 | go
43 | go distributed system
44 | kubernetes
45 | bash
46 | docker
47 | dockerfile
48 | nginx
49 | tcp
50 | postgresql
51 | mysql
52 | oracle db
53 | elasticsearch
54 | nosql
55 | clickhouse
56 | terraform
57 | fortran
58 | slurm
59 | openmpi


--------------------------------------------------------------------------------
/text/text_dedup/utils/preprocess.py:
--------------------------------------------------------------------------------
 1 | import regex as re
 2 | 
 3 | DIGIT_RE = re.compile(r"\d")
 4 | PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(r"[\p{P}\p{C}\p{S}]+")
 5 | 
 6 | 
 7 | def normalize(line: str) -> str:
 8 |     """
 9 |     Normalize a line of text. Source: https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/text_normalizer.py#L180
10 | 
11 |     Parameters
12 |     ----------
13 |     line : str
14 |         The line of text to normalize.
15 | 
16 |     Returns
17 |     -------
18 |     str
19 |         The normalized line of text.
20 | 
21 |     Examples
22 |     --------
23 |     >>> normalize("Hello, world!")
24 |     'hello world'
25 |     >>> normalize("Hello, 123!\\n\\t\\b")
26 |     'hello 000'
27 |     """
28 |     line = line.strip()
29 |     if not line:
30 |         return line
31 |     line = line.lower()
32 |     line = DIGIT_RE.sub("0", line)
33 |     line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line)
34 |     return line


--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/preprocess.py:
--------------------------------------------------------------------------------
 1 | import regex as re
 2 | 
 3 | DIGIT_RE = re.compile(r"\d")
 4 | PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(r"[\p{P}\p{C}\p{S}]+")
 5 | 
 6 | 
 7 | def normalize(line: str) -> str:
 8 |     """
 9 |     Normalize a line of text. Source: https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/text_normalizer.py#L180
10 | 
11 |     Parameters
12 |     ----------
13 |     line : str
14 |         The line of text to normalize.
15 | 
16 |     Returns
17 |     -------
18 |     str
19 |         The normalized line of text.
20 | 
21 |     Examples
22 |     --------
23 |     >>> normalize("Hello, world!")
24 |     'hello world'
25 |     >>> normalize("Hello, 123!\\n\\t\\b")
26 |     'hello 000'
27 |     """
28 |     line = line.strip()
29 |     if not line:
30 |         return line
31 |     line = line.lower()
32 |     line = DIGIT_RE.sub("0", line)
33 |     line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line)
34 |     return line


--------------------------------------------------------------------------------
/text/text_dedup/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from text_dedup.utils.add_args import add_bloom_filter_args
 2 | from text_dedup.utils.add_args import add_exact_hash_args
 3 | from text_dedup.utils.add_args import add_io_args
 4 | from text_dedup.utils.add_args import add_meta_args
 5 | from text_dedup.utils.add_args import add_minhash_args
 6 | from text_dedup.utils.add_args import add_sa_args
 7 | from text_dedup.utils.add_args import add_simhash_args
 8 | from text_dedup.utils.hashfunc import sha1_hash
 9 | from text_dedup.utils.hashfunc import xxh3_hash
10 | from text_dedup.utils.timer import Timer
11 | from text_dedup.utils.tokenization import ngrams
12 | from text_dedup.utils.union_find import UnionFind
13 | 
14 | __all__ = [
15 |     "add_bloom_filter_args",
16 |     "add_exact_hash_args",
17 |     "add_io_args",
18 |     "add_meta_args",
19 |     "add_minhash_args",
20 |     "add_sa_args",
21 |     "add_simhash_args",
22 |     "Timer",
23 |     "ngrams",
24 |     "UnionFind",
25 |     "sha1_hash",
26 |     "xxh3_hash",
27 | ]


--------------------------------------------------------------------------------
/text/text_dedup/utils/union_find.py:
--------------------------------------------------------------------------------
 1 | class UnionFind:
 2 |     """
 3 |     A data structure for maintaining disjoint sets. This helps build connected components for given duplicate pairs.
 4 | 
 5 |     Examples
 6 |     --------
 7 |     >>> uf = UnionFind()
 8 |     >>> uf.union(1, 2)
 9 |     >>> uf.union(2, 3)
10 |     >>> uf.union(4, 5)
11 |     >>> uf.find(1)
12 |     1
13 |     >>> uf.find(2)
14 |     1
15 |     >>> uf.find(3)
16 |     1
17 |     >>> uf.find(4)
18 |     4
19 |     >>> uf.find(5)
20 |     4
21 |     """
22 | 
23 |     def __init__(self):
24 |         self.parent = {}
25 | 
26 |     def find(self, x):
27 |         if x not in self.parent:
28 |             self.parent[x] = x
29 |             return x
30 | 
31 |         if self.parent[x] != x:
32 |             self.parent[x] = self.find(self.parent[x])
33 | 
34 |         return self.parent[x]
35 | 
36 |     def union(self, x, y):
37 |         px = self.find(x)
38 |         py = self.find(y)
39 |         self.parent[px] = self.parent[py] = min(px, py)


--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from text_dedup.utils.add_args import add_bloom_filter_args
 2 | from text_dedup.utils.add_args import add_exact_hash_args
 3 | from text_dedup.utils.add_args import add_io_args
 4 | from text_dedup.utils.add_args import add_meta_args
 5 | from text_dedup.utils.add_args import add_minhash_args
 6 | from text_dedup.utils.add_args import add_sa_args
 7 | from text_dedup.utils.add_args import add_simhash_args
 8 | from text_dedup.utils.hashfunc import sha1_hash
 9 | from text_dedup.utils.hashfunc import xxh3_hash
10 | from text_dedup.utils.timer import Timer
11 | from text_dedup.utils.tokenization import ngrams
12 | from text_dedup.utils.union_find import UnionFind
13 | 
14 | __all__ = [
15 |     "add_bloom_filter_args",
16 |     "add_exact_hash_args",
17 |     "add_io_args",
18 |     "add_meta_args",
19 |     "add_minhash_args",
20 |     "add_sa_args",
21 |     "add_simhash_args",
22 |     "Timer",
23 |     "ngrams",
24 |     "UnionFind",
25 |     "sha1_hash",
26 |     "xxh3_hash",
27 | ]


--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/union_find.py:
--------------------------------------------------------------------------------
 1 | class UnionFind:
 2 |     """
 3 |     A data structure for maintaining disjoint sets. This helps build connected components for given duplicate pairs.
 4 | 
 5 |     Examples
 6 |     --------
 7 |     >>> uf = UnionFind()
 8 |     >>> uf.union(1, 2)
 9 |     >>> uf.union(2, 3)
10 |     >>> uf.union(4, 5)
11 |     >>> uf.find(1)
12 |     1
13 |     >>> uf.find(2)
14 |     1
15 |     >>> uf.find(3)
16 |     1
17 |     >>> uf.find(4)
18 |     4
19 |     >>> uf.find(5)
20 |     4
21 |     """
22 | 
23 |     def __init__(self):
24 |         self.parent = {}
25 | 
26 |     def find(self, x):
27 |         if x not in self.parent:
28 |             self.parent[x] = x
29 |             return x
30 | 
31 |         if self.parent[x] != x:
32 |             self.parent[x] = self.find(self.parent[x])
33 | 
34 |         return self.parent[x]
35 | 
36 |     def union(self, x, y):
37 |         px = self.find(x)
38 |         py = self.find(y)
39 |         self.parent[px] = self.parent[py] = min(px, py)


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Malaysia-AI
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/text/text_dedup/utils/tokenization.py:
--------------------------------------------------------------------------------
 1 | from itertools import tee
 2 | from typing import List
 3 | from typing import Text
 4 | 
 5 | 
 6 | def ngrams(sequence: List[Text], n: int, min_length: int = 5):
 7 |     """
 8 |     Return the ngrams generated from a sequence of items, as an iterator.
 9 | 
10 |     This is a modified version of nltk.util.ngrams.
11 | 
12 |     Parameters
13 |     ----------
14 |     sequence : List[Text]
15 |         The sequence of items.
16 |     n : int
17 |         The length of each ngram.
18 |     min_length : int, optional
19 |         The minimum length of each ngram, by default 5
20 | 
21 |     Returns
22 |     -------
23 |     iterator
24 |         The ngrams.
25 | 
26 |     Examples
27 |     --------
28 |     >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=1))
29 |     [('a', 'b'), ('b', 'c'), ('c', 'd')]
30 |     >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=5))
31 |     []
32 |     >>> list(ngrams(["a", "b"], 3, min_length=1))
33 |     [('a', 'b')]
34 |     """
35 |     if len(sequence) < min_length:
36 |         return []
37 |     if len(sequence) < n:
38 |         return [tuple(sequence)]
39 |     iterables = tee(iter(sequence), n)
40 |     for i, sub_iterable in enumerate(iterables):
41 |         for _ in range(i):
42 |             next(sub_iterable, None)
43 |     return zip(*iterables)


--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/tokenization.py:
--------------------------------------------------------------------------------
 1 | from itertools import tee
 2 | from typing import List
 3 | from typing import Text
 4 | 
 5 | 
 6 | def ngrams(sequence: List[Text], n: int, min_length: int = 5):
 7 |     """
 8 |     Return the ngrams generated from a sequence of items, as an iterator.
 9 | 
10 |     This is a modified version of nltk.util.ngrams.
11 | 
12 |     Parameters
13 |     ----------
14 |     sequence : List[Text]
15 |         The sequence of items.
16 |     n : int
17 |         The length of each ngram.
18 |     min_length : int, optional
19 |         The minimum length of each ngram, by default 5
20 | 
21 |     Returns
22 |     -------
23 |     iterator
24 |         The ngrams.
25 | 
26 |     Examples
27 |     --------
28 |     >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=1))
29 |     [('a', 'b'), ('b', 'c'), ('c', 'd')]
30 |     >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=5))
31 |     []
32 |     >>> list(ngrams(["a", "b"], 3, min_length=1))
33 |     [('a', 'b')]
34 |     """
35 |     if len(sequence) < min_length:
36 |         return []
37 |     if len(sequence) < n:
38 |         return [tuple(sequence)]
39 |     iterables = tee(iter(sequence), n)
40 |     for i, sub_iterable in enumerate(iterables):
41 |         for _ in range(i):
42 |             next(sub_iterable, None)
43 |     return zip(*iterables)


--------------------------------------------------------------------------------
/text/pretrain-llm/README.md:
--------------------------------------------------------------------------------
 1 | # Pretrain LLM
 2 | 
 3 | This is to pretrain 1B - 13B parameters LLM. All steps done using Standard_F48s_v2 node size.
 4 | 
 5 | ## how-to not use HuggingFace datasets
 6 | 
 7 | HuggingFace datasets streams memory-mapped files and then concatenates them (https://github.com/huggingface/datasets/blob/60bdf3005d1dc0b26da8e5949721b20d932eaad6/src/datasets/table.py#L51), which is extremely slow. If you are wondering whether the script is stuck: effectively yes, it is waiting for PyArrow streaming.
 8 | 
 9 | So we use our own approach,
10 | 
11 | <img src="how-to-mosaic.png" width="500">
12 | 
13 | https://drive.google.com/file/d/1dSQ7KQs_x7aCTNVXgMESIqTwEoAZt-OK/view?usp=sharing
14 | 
15 | 1. Split JSONL file into smaller JSONL files.
16 | 2. Convert each smaller JSONL file into Mosaic format, running the conversions in parallel with multiprocessing.
17 | 3. Merge smaller Mosaic files into one Mosaic file.
18 | 
19 | ## total tokens
20 | 
21 | 1. [prepare-dedup-text-dataset-4096.ipynb](prepare-dedup-text-dataset-4096.ipynb), 31702310912
22 | 2. [prepare-starcoder-4096.ipynb](prepare-starcoder-4096.ipynb), 40981254144
23 | 3. [prepare-madlad-400-4096.ipynb](prepare-madlad-400-4096.ipynb), 14983720960
24 | 4. [prepare-instructions.ipynb](prepare-instructions.ipynb), 1577877504
25 | 5. [prepare-extra.ipynb](prepare-extra.ipynb), 1140461568
26 | 
27 | Total, 90B tokens, we uploaded the dataset at https://huggingface.co/datasets/malaysia-ai/mosaic-combine-all, so you can use it directly with https://docs.mosaicml.com/projects/streaming/en/latest/index.html


--------------------------------------------------------------------------------
/speech-instructions/README.md:
--------------------------------------------------------------------------------
 1 | # Speech Instructions
 2 | 
 3 | ## how to prepare
 4 | 
 5 | ### 1. Speaker dedup
 6 | 
 7 | 1. Prepare dataset to dedup,
 8 | 
 9 | - [prepare-malaysia-parliament.ipynb](prepare-malaysia-parliament.ipynb).
10 | - [prepare-malaysian-podcast.ipynb](prepare-malaysian-podcast.ipynb).
11 | - [prepare-malaysian-others.ipynb](prepare-malaysian-others.ipynb).
12 | 
13 | 2. Convert to embedding,
14 | 
15 | We use speaker embedding from https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_large
16 | 
17 | ```bash
18 | CUDA_VISIBLE_DEVICES=1,2 \
19 | python3.10 embedding.py \
20 | --filename filtered-politicians.parquet \
21 | --replication 3
22 | 
23 | CUDA_VISIBLE_DEVICES=1,2 \
24 | python3.10 embedding.py \
25 | --filename filtered-podcast.parquet \
26 | --replication 3 --folder embedding-podcast
27 | 
28 | CUDA_VISIBLE_DEVICES=0,2 \
29 | python3.10 embedding.py \
30 | --filename filtered-others.parquet \
31 | --replication 3 --folder embedding-others
32 | ```
33 | 
34 | 3. Merge and dedup,
35 | 
36 | - [dedup-parliament.ipynb](dedup-parliament.ipynb).
37 | - [dedup-podcasts.ipynb](dedup-podcasts.ipynb).
38 | 
39 | ### 2. Populate instructions
40 | 
41 | All datasets from https://huggingface.co/collections/mesolitica/malaysian-synthetic-dataset-656c2673fe7fe0b1e9e25fe2, and follow [filter-instructions.ipynb](filter-instructions.ipynb).
42 | 
43 | ### 3. Generate synthetic voice
44 | 
45 | ```bash
46 | bash generate.sh
47 | ```
48 | 
49 | **Modify it appropriately based on your local GPUs**.


--------------------------------------------------------------------------------
/speech-instructions/generate.sh:
--------------------------------------------------------------------------------
# Launch the generate.py workers inside detached GNU screen sessions.
#
# Every loop below starts four sessions (index 0..3) for one input file. For
# each index it first asks any existing session with the same name to quit
# (errors are discarded when none exists), then starts a fresh detached
# session that changes into the speech-instructions directory and runs
# generate.py pinned to one GPU through CUDA_VISIBLE_DEVICES.
# NOTE(review): --global_index 4 presumably is the total number of shards and
# --index the shard handled by this worker; confirm against generate.py.

# partition-instructions-part-7.json -> folder partition-instructions-part-7, GPU 0.
for i in {0..3}; do
  screen -S "partition-instructions-part-7_$i" -X quit 2>/dev/null
  screen -dmS "partition-instructions-part-7_$i" bash -c "cd /home/husein/ssd3/dataset/speech-instructions && \
  CUDA_VISIBLE_DEVICES=0 \
  python3.10 generate.py \
    --input_file \"partition-instructions-part-7.json\" \
    --folder \"partition-instructions-part-7\" \
    --global_index 4 \
    --index $i"
done

# partition-instructions-part-15.json -> folder partition-instructions-part-15, GPU 2.
for i in {0..3}; do
  screen -S "partition-instructions-part-15_$i" -X quit 2>/dev/null
  screen -dmS "partition-instructions-part-15_$i" bash -c "cd /home/husein/ssd3/dataset/speech-instructions && \
  CUDA_VISIBLE_DEVICES=2 \
  python3.10 generate.py \
    --input_file \"partition-instructions-part-15.json\" \
    --folder \"partition-instructions-part-15\" \
    --global_index 4 \
    --index $i"
done

# tatabahasa.json -> folder tatabahasa-v2, GPU 2, with explicit
# --threshold/--maxlen/--retry overrides.
for i in {0..3}; do
  screen -S "tatabahasa_$i" -X quit 2>/dev/null
  screen -dmS "tatabahasa_$i" bash -c "cd /home/husein/ssd3/dataset/speech-instructions && \
  CUDA_VISIBLE_DEVICES=2 \
  python3.10 generate.py \
    --input_file \"tatabahasa.json\" \
    --folder \"tatabahasa-v2\" \
    --global_index 4 \
    --index $i --threshold -9 --maxlen 300 --retry 10"
done

# malaymmlu.json -> folder malaymmlu, GPU 2, with the same overrides as above.
for i in {0..3}; do
  screen -S "malaymmlu_$i" -X quit 2>/dev/null
  screen -dmS "malaymmlu_$i" bash -c "cd /home/husein/ssd3/dataset/speech-instructions && \
  CUDA_VISIBLE_DEVICES=2 \
  python3.10 generate.py \
    --input_file \"malaymmlu.json\" \
    --folder \"malaymmlu\" \
    --global_index 4 \
    --index $i --threshold -9 --maxlen 300 --retry 10"
done


--------------------------------------------------------------------------------
/text/text_dedup/utils/timer.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | 
 4 | class TimerContext:
 5 |     def __init__(self, timer: "Timer", name: str):
 6 |         self.timer = timer
 7 |         self.name = name
 8 |         self.start_time = None
 9 | 
10 |     def __enter__(self):
11 |         self.start_time = time.time()
12 | 
13 |     def __exit__(self, exc_type, exc_val, exc_tb):
14 |         if any([exc_type, exc_val, exc_tb]):
15 |             raise exc_val
16 |         self.timer.elapsed_times[self.name] = time.time() - self.start_time
17 | 
18 | 
class Timer:
    """
    Record how long named ``with`` blocks take to run.

    Calling the timer with a name gives back a context manager; the duration
    measured for that name ends up in ``elapsed_times`` (seconds, keyed by
    name).

    Examples
    --------
    >>> t = Timer()
    >>> with t("test"):
    ...     time.sleep(1)
    >>> assert int(t.elapsed_times.get("test", 0)) >= 1, "The elapsed time should be 1 second."
    """

    def __init__(self):
        # Maps a context name to its measured duration in seconds.
        self.elapsed_times = dict()

    def __call__(self, name: str) -> TimerContext:
        """
        Build the timing context for ``name``.

        Parameters
        ----------
        name: str
            Key under which the elapsed time is stored in ``elapsed_times``.

        Returns
        -------
        TimerContext
            A context manager bound to this timer and ``name``.

        Examples
        --------
        >>> t = Timer()
        >>> with t("test"):
        ...     time.sleep(1)
        >>> assert int(t.elapsed_times.get("test", 0)) == 1, "The elapsed time should be 1 second."
        >>> with t("test2"):
        ...     time.sleep(2)
        >>> assert int(t.elapsed_times.get("test2", 0)) == 2, "The elapsed time should be 2 seconds."
        """
        context = TimerContext(self, name)
        return context


--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/timer.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | 
 4 | class TimerContext:
 5 |     def __init__(self, timer: "Timer", name: str):
 6 |         self.timer = timer
 7 |         self.name = name
 8 |         self.start_time = None
 9 | 
10 |     def __enter__(self):
11 |         self.start_time = time.time()
12 | 
13 |     def __exit__(self, exc_type, exc_val, exc_tb):
14 |         if any([exc_type, exc_val, exc_tb]):
15 |             raise exc_val
16 |         self.timer.elapsed_times[self.name] = time.time() - self.start_time
17 | 
18 | 
class Timer:
    """
    Record how long named ``with`` blocks take to run.

    Calling the timer with a name gives back a context manager; the duration
    measured for that name ends up in ``elapsed_times`` (seconds, keyed by
    name).

    Examples
    --------
    >>> t = Timer()
    >>> with t("test"):
    ...     time.sleep(1)
    >>> assert int(t.elapsed_times.get("test", 0)) >= 1, "The elapsed time should be 1 second."
    """

    def __init__(self):
        # Maps a context name to its measured duration in seconds.
        self.elapsed_times = dict()

    def __call__(self, name: str) -> TimerContext:
        """
        Build the timing context for ``name``.

        Parameters
        ----------
        name: str
            Key under which the elapsed time is stored in ``elapsed_times``.

        Returns
        -------
        TimerContext
            A context manager bound to this timer and ``name``.

        Examples
        --------
        >>> t = Timer()
        >>> with t("test"):
        ...     time.sleep(1)
        >>> assert int(t.elapsed_times.get("test", 0)) == 1, "The elapsed time should be 1 second."
        >>> with t("test2"):
        ...     time.sleep(2)
        >>> assert int(t.elapsed_times.get("test2", 0)) == 2, "The elapsed time should be 2 seconds."
        """
        context = TimerContext(self, name)
        return context


--------------------------------------------------------------------------------
/stt-whisper/README.md:
--------------------------------------------------------------------------------
 1 | # STT Whisper
 2 | 
 3 | 1. We provide segment-level and word-level timestamps for,
 4 | - [Malaysian Emilia Dialects](https://huggingface.co/datasets/mesolitica/Malaysian-Emilia#malaysian-dialect).
 5 | - [Speech Instructions](https://huggingface.co/datasets/malaysia-ai/Speech-Instructions).
 6 | 2. Synthetic merging of different contexts, [synthetic-context-switching-word-timestamp.ipynb](synthetic-context-switching-word-timestamp.ipynb).
 7 | 
 8 | ## Sliding Audionet
 9 | 
10 | ```bash
11 | CUDA_VISIBLE_DEVICES=0 \
12 | python3 audioset_sliding.py --file 'prepared-pseudolabel.jsonl' --global-index 4 --local-index 0
13 | CUDA_VISIBLE_DEVICES=1 \
14 | python3 audioset_sliding.py --file 'prepared-pseudolabel.jsonl' --global-index 4 --local-index 1
15 | CUDA_VISIBLE_DEVICES=2 \
16 | python3 audioset_sliding.py --file 'prepared-pseudolabel.jsonl' --global-index 4 --local-index 2
17 | CUDA_VISIBLE_DEVICES=3 \
18 | python3 audioset_sliding.py --file 'prepared-pseudolabel.jsonl' --global-index 4 --local-index 3
19 | ```
20 | 
21 | ## Speech Instructions
22 | 
23 | 1. Run force alignment,
24 | 
25 | ```bash
26 | CUDA_VISIBLE_DEVICES=2 \
27 | python3.10 force_alignment.py \
28 | --filename 'prepare-force-alignment.json' \
29 | --language 'ms' \
30 | --replication 3
31 | ```
32 | 
33 | 2. Prepare dataset,
34 | 
35 | - Segment level, [speech-instructions-segment-timestamps.ipynb](speech-instructions-segment-timestamps.ipynb).
36 | - Word level, [speech-instructions-word-timestamps.ipynb](speech-instructions-word-timestamps.ipynb).
37 | 
38 | ## Malaysian Emilia Dialects
39 | 
40 | 1. Prepare dataset,
41 | 
42 | Force alignment has already been calculated at [mesolitica/Malaysian-Emilia-annotated/dialects_processed_alignment.zip](https://huggingface.co/datasets/mesolitica/Malaysian-Emilia-annotated/blob/main/dialects_processed_alignment.zip), so we only need to prepare the dataset.
43 | 
44 | - Segment level, [dialects-segment-timestamps.ipynb](dialects-segment-timestamps.ipynb).
45 | - Word level, [dialects-word-timestamps.ipynb](dialects-word-timestamps.ipynb).


--------------------------------------------------------------------------------
/text/extra/process-lowyat.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 1,
 6 |    "id": "47588232",
 7 |    "metadata": {},
 8 |    "outputs": [],
 9 |    "source": [
10 |     "# !git clone https://huggingface.co/datasets/mesolitica/crawl-lowyat"
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "code",
15 |    "execution_count": 2,
16 |    "id": "44f963ff",
17 |    "metadata": {},
18 |    "outputs": [],
19 |    "source": [
20 |     "from glob import glob\n",
21 |     "import json"
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "code",
26 |    "execution_count": 5,
27 |    "id": "43c7cfc0",
28 |    "metadata": {},
29 |    "outputs": [
30 |     {
31 |      "data": {
32 |       "text/plain": [
33 |        "63"
34 |       ]
35 |      },
36 |      "execution_count": 5,
37 |      "metadata": {},
38 |      "output_type": "execute_result"
39 |     }
40 |    ],
41 |    "source": [
42 |     "files = glob('crawl-lowyat/*.json')\n",
43 |     "files = [f for f in files if '-topics' not in f]\n",
44 |     "len(files)"
45 |    ]
46 |   },
47 |   {
48 |    "cell_type": "code",
49 |    "execution_count": 12,
50 |    "id": "2a694ba8",
51 |    "metadata": {},
52 |    "outputs": [],
53 |    "source": [
54 |     "with open('hf-datasets/raw-datasets/lowyat.jsonl', 'w') as fopen_l:\n",
55 |     "    for f in files:\n",
56 |     "        with open(f) as fopen:\n",
57 |     "            data = json.load(fopen)\n",
58 |     "        for d in data:\n",
59 |     "            fopen_l.write(f'{json.dumps(d)}\\n')"
60 |    ]
61 |   }
62 |  ],
63 |  "metadata": {
64 |   "kernelspec": {
65 |    "display_name": "Python 3 (ipykernel)",
66 |    "language": "python",
67 |    "name": "python3"
68 |   },
69 |   "language_info": {
70 |    "codemirror_mode": {
71 |     "name": "ipython",
72 |     "version": 3
73 |    },
74 |    "file_extension": ".py",
75 |    "mimetype": "text/x-python",
76 |    "name": "python",
77 |    "nbconvert_exporter": "python",
78 |    "pygments_lexer": "ipython3",
79 |    "version": "3.10.12"
80 |   }
81 |  },
82 |  "nbformat": 4,
83 |  "nbformat_minor": 5
84 | }
85 | 


--------------------------------------------------------------------------------
/speech-instructions/embedding.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | import json
 3 | import pandas as pd
 4 | import torch
 5 | import numpy as np
 6 | from tqdm import tqdm
 7 | from multiprocess import Pool
 8 | import os
 9 | 
def chunks(l, devices, folder):
    """
    Split `l` into one contiguous slice per entry of `devices`.

    The slices are as even as possible: when `len(l)` is not divisible by
    `len(devices)`, the first `len(l) % len(devices)` slices get one extra item.

    Yields
    ------
    tuple
        `(slice_of_l, device, folder)` for each device, in order.
    """
    base, leftover = divmod(len(l), len(devices))
    offset = 0
    for position, device in enumerate(devices):
        size = base + 1 if position < leftover else base
        yield (l[offset:offset + size], device, folder)
        offset += size
19 | 
def loop(rows):
    """
    Worker: compute and save one embedding file per row of a chunk.

    `rows` is a `(batch, device, folder)` tuple as produced by `chunks`, where
    `batch` is a list of `(no, record)` pairs. For each pair the model output
    for `record['audio']` is saved to `<folder>/<no>.npy`; pairs whose output
    file already exists are skipped, so an interrupted run can be resumed.
    """
    batch, device, folder = rows
    # Restrict this worker to its assigned device before the imports below
    # (presumably so the libraries only see that GPU -- verify).
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device)

    import torch
    import torchaudio
    import malaya_speech

    model = malaya_speech.speaker_vector.nemo('huseinzol05/nemo-titanet_large').cuda()
    model.eval()
    with torch.no_grad():
        for no, record in tqdm(batch, desc = f'loop {device}'):
            target = os.path.join(folder, f'{no}.npy')
            if not os.path.exists(target):
                audio = malaya_speech.load(record['audio'])[0]
                vectors = model([audio])
                np.save(target, vectors[0], allow_pickle=True)
38 | 
# CLI entry point: read the rows of the parquet file `--filename`, split them
# across the visible GPUs (each GPU repeated `--replication` times) and run
# `loop` on every split in a process pool, writing the outputs into `--folder`.
@click.command()
@click.option('--filename')
@click.option('--replication', default = 1)
@click.option('--folder', default = 'embedding')
def main(filename, replication, folder):
    os.makedirs(folder, exist_ok = True)

    # Honour an explicit CUDA_VISIBLE_DEVICES; otherwise use every GPU torch reports.
    visible = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible is not None:
        devices = [d.strip() for d in visible.split(',')]
    else:
        devices = list(range(torch.cuda.device_count()))

    # Repeating the device list starts `replication` workers per GPU.
    devices = replication * devices
    print(devices)

    records = pd.read_parquet(filename).to_dict(orient = 'records')
    # Pair every record with its position; the position becomes the output file name.
    rows = list(enumerate(records))

    pool = Pool(len(devices))
    pool.map(loop, chunks(rows, devices, folder))
    pool.close()
    pool.join()
61 | 
62 | if __name__ == '__main__':
63 |     main()
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/multilingual-tts/embedding.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | import json
 3 | import pandas as pd
 4 | import torch
 5 | import numpy as np
 6 | from tqdm import tqdm
 7 | from multiprocess import Pool
 8 | import os
 9 | 
def chunks(l, devices, folder):
    """
    Split `l` into one contiguous slice per entry of `devices`.

    The slices are as even as possible: when `len(l)` is not divisible by
    `len(devices)`, the first `len(l) % len(devices)` slices get one extra item.

    Yields
    ------
    tuple
        `(slice_of_l, device, folder)` for each device, in order.
    """
    base, leftover = divmod(len(l), len(devices))
    offset = 0
    for position, device in enumerate(devices):
        size = base + 1 if position < leftover else base
        yield (l[offset:offset + size], device, folder)
        offset += size
19 | 
def loop(rows):
    """
    Worker: compute and save one embedding file per row of a chunk.

    `rows` is a `(batch, device, folder)` tuple as produced by `chunks`, where
    `batch` is a list of `(no, record)` pairs. For each pair the model output
    for `record['audio_filename']` is saved to `<folder>/<no>.npy`; pairs whose
    output file already exists are skipped, so an interrupted run can be resumed.
    """
    batch, device, folder = rows
    # Restrict this worker to its assigned device before the imports below
    # (presumably so the libraries only see that GPU -- verify).
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device)

    import torch
    import torchaudio
    import malaya_speech

    model = malaya_speech.speaker_vector.nemo('huseinzol05/nemo-titanet_large').cuda()
    model.eval()
    with torch.no_grad():
        for no, record in tqdm(batch, desc = f'loop {device}'):
            target = os.path.join(folder, f'{no}.npy')
            if not os.path.exists(target):
                audio = malaya_speech.load(record['audio_filename'])[0]
                vectors = model([audio])
                np.save(target, vectors[0], allow_pickle=True)
38 | 
# CLI entry point: load the JSON list at `--file`, split its rows across the
# visible GPUs (each GPU repeated `--replication` times) and run `loop` on every
# split in a process pool. Outputs go to a folder named after the input file.
@click.command()
@click.option('--file')
@click.option('--replication', default = 1)
def main(file, replication):
    # e.g. `data.json` -> `data_embedding`
    folder = file.replace('.json', '') + '_embedding'
    os.makedirs(folder, exist_ok = True)

    # Honour an explicit CUDA_VISIBLE_DEVICES; otherwise use every GPU torch reports.
    visible = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible is not None:
        devices = [d.strip() for d in visible.split(',')]
    else:
        devices = list(range(torch.cuda.device_count()))

    # Repeating the device list starts `replication` workers per GPU.
    devices = replication * devices
    print(devices)

    with open(file) as fopen:
        records = json.load(fopen)
    # Pair every record with its position; the position becomes the output file name.
    rows = [(no, records[no]) for no in range(len(records))]

    pool = Pool(len(devices))
    pool.map(loop, chunks(rows, devices, folder))
    pool.close()
    pool.join()
64 | 
65 | if __name__ == '__main__':
66 |     main()
67 | 
68 | 


--------------------------------------------------------------------------------
/text/extra/process-data.gov.my.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 11,
 6 |    "id": "fba0fccf",
 7 |    "metadata": {},
 8 |    "outputs": [],
 9 |    "source": [
10 |     "# !wget https://huggingface.co/datasets/mesolitica/crawl-gov.my/resolve/main/data.gov.my -O data/data.gov.my"
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "code",
15 |    "execution_count": 2,
16 |    "id": "af0da1cc",
17 |    "metadata": {},
18 |    "outputs": [
19 |     {
20 |      "name": "stderr",
21 |      "output_type": "stream",
22 |      "text": [
23 |       "12127it [00:33, 361.39it/s]\n"
24 |      ]
25 |     }
26 |    ],
27 |    "source": [
28 |     "import json\n",
29 |     "from tqdm import tqdm\n",
30 |     "\n",
31 |     "with open('hf-datasets/raw-datasets/data.gov.my.jsonl', 'w') as fopen_l:\n",
32 |     "    with open('data/data.gov.my') as fopen:\n",
33 |     "        for l in tqdm(fopen):\n",
34 |     "            d = json.loads(l)\n",
35 |     "            p = '\\n'.join(d['p'])\n",
36 |     "            keys = d['file_urls'].keys()\n",
37 |     "            keys = [k for k in keys if k.endswith('csv') or k.endswith('xlsx')]\n",
38 |     "            if not len(keys):\n",
39 |     "                continue\n",
40 |     "            csv = d['file_urls'][keys[0]]\n",
41 |     "            t = f'{p}\\n{csv}'\n",
42 |     "            data = {\n",
43 |     "                'text': t\n",
44 |     "            }\n",
45 |     "            fopen_l.write(f'{json.dumps(data)}\\n')"
46 |    ]
47 |   },
48 |   {
49 |    "cell_type": "code",
50 |    "execution_count": null,
51 |    "id": "1a415949",
52 |    "metadata": {},
53 |    "outputs": [],
54 |    "source": []
55 |   }
56 |  ],
57 |  "metadata": {
58 |   "kernelspec": {
59 |    "display_name": "Python 3 (ipykernel)",
60 |    "language": "python",
61 |    "name": "python3"
62 |   },
63 |   "language_info": {
64 |    "codemirror_mode": {
65 |     "name": "ipython",
66 |     "version": 3
67 |    },
68 |    "file_extension": ".py",
69 |    "mimetype": "text/x-python",
70 |    "name": "python",
71 |    "nbconvert_exporter": "python",
72 |    "pygments_lexer": "ipython3",
73 |    "version": "3.10.12"
74 |   }
75 |  },
76 |  "nbformat": 4,
77 |  "nbformat_minor": 5
78 | }
79 | 


--------------------------------------------------------------------------------
/text/processing/README.md:
--------------------------------------------------------------------------------
 1 | # text-dataset-dedup-py
 2 | 
 3 | ## Description
 4 | The `text-dataset-dedup-py` repository contains a Python script that performs a deduplication process on a text dataset. This process is implemented based on the code provided in the [Jupyter Notebook](https://github.com/malaysia-ai/text-dataset-dedup).
 5 | 
 6 | ## How to Use
 7 | Follow the steps below to use the deduplication script:
 8 | 
 9 | 1. **Change Directory**: Navigate to the `/processing` directory within this repository.
10 | 
11 | 2. **Prepare the Command**: Once in the `/processing` directory, prepare the command to execute the deduplication process. 
12 | 
13 | Single Dataset (from Huggingface URL)
14 | ```bash
15 | python3 main.py --dataset "piston.my" --url_dataset "https://huggingface.co/datasets/mesolitica/crawl-my-website/resolve/main/piston.my.jsonl" --master_folder "/home/ubuntu/za/datasets04" --text_key reviews_html reviews_text
16 | ```
17 | 
18 | Single Dataset (manually cleaned)
19 | ```bash
20 | python3 main.py --dataset "murai.my" --clean_file_path "/home/ubuntu/faiq913_folder/Cleaned Huggingface datasets/murai.my/murai_my_clean.jsonl" --master_folder "/home/ubuntu/za/datasets04"
21 | ```
22 | 
23 | If you have multiple datasets from multiple Huggingface URLs,
24 | ```bash
25 | python3 main.py \
26 | --master_folder "/home/ubuntu/za/datasets04" \
27 | --dataset_with_link \
28 | piston.my,https://huggingface.co/datasets/mesolitica/crawl-my-website/resolve/main/piston.my.jsonl \
29 | piston2,https://huggingface.co/datasets/mesolitica/crawl-my-website/resolve/main/piston.my.jsonl \
30 | piston3,https://huggingface.co/datasets/mesolitica/crawl-my-website/resolve/main/piston.my.jsonl
31 | ```
32 | 
33 | ### Arguments
34 | 1. `dataset`: Name of the dataset folder inside /dataset where the script will find data.
35 | 2. `url_dataset`: URL of the JSONL file containing data to be processed (script only handles JSONL files). 
36 | 3. `master_folder`: Absolute path to the master directory where the deduplication process will occur.
37 | 4. `dataset_with_link`: Format {dataset_name},{dataset_url} {dataset_name02},{dataset_url02}
38 | 5. `text_key`: One or more custom keys that hold the text; use this if you encounter the error `dataset not in standard key-value. must have ...`
39 | 
40 | 
41 | 
42 | 


--------------------------------------------------------------------------------
/text/README.md:
--------------------------------------------------------------------------------
 1 | # pretrain-text-dataset
 2 | 
 3 | Prepare pretrain dataset gathered from https://github.com/users/huseinzol05/projects/1
 4 | 
 5 | All deduplicated and postprocessed datasets are uploaded at https://huggingface.co/datasets/malaysia-ai/pretrain-text-dataset
 6 | 
 7 | ## Server spec
 8 | 
 9 | 1. 24 cores.
10 | 2. 220 GB RAM.
11 | 
12 | **Deduping can explode the memory, easily eat up to 30 GB if the dataset is > 10GB, so beware**.
13 | 
14 | ## Download dataset
15 | 
16 | 1. Most downloads are straightforward,
17 | 
18 | ```bash
19 | wget https://huggingface.co/datasets/mesolitica/crawl-amanz-my/resolve/main/parsed.jsonl -O hf-datasets/raw-datasets/amanz.jsonl
20 | ```
21 | 
22 | But sometimes we have to do some preprocessing first, like,
23 | 
24 | - [process-lowyat.ipynb](extra/process-lowyat.ipynb)
25 | - [process-data.gov.my.ipynb](extra/process-data.gov.my.ipynb)
26 | - [process-snapshot.ipynb](extra/process-snapshot.ipynb)
27 | 
28 | We save raw datasets at [hf-datasets/raw-datasets](hf-datasets/raw-datasets).
29 | 
30 | ## Text dedup
31 | 
32 | 1. Clone [remove-duplicate-text-dataset.ipynb](remove-duplicate-text-dataset.ipynb) to new notebook, eg, [remove-duplicate-text-dataset-lowyat.ipynb](remove-duplicate-text-dataset-lowyat.ipynb).
33 | 
34 | This notebook uses [text_dedup](text_dedup) to do the dedup, borrowed from https://github.com/ChenghaoMou/text-dedup
35 | 
36 | All deduplicated datasets are saved at [hf-datasets/dedupe-datasets](hf-datasets/dedupe-datasets).
37 | 
38 | ## Postprocessing
39 | 
40 | 1. Run [postprocessing.ipynb](postprocessing.ipynb) to start postprocessing,
41 | 
42 | - remove texts that contain HTTP errors.
43 | - remove texts less than 3 characters.
44 | - replace 6 spaces or more with 6 spaces.
45 | - replace 6 dots or more with 6 dots.
46 | 
47 | **Rerunning this notebook will not overwrite postprocessed datasets**.
48 | 
49 | ## Prepare for training session
50 | 
51 | **There is no consideration of AI alignment and safety in the current dataset; we only apply a basic postfilter**.
52 | 
53 | 1. [FPF llama2](llama)
54 | 2. [FPF Mistral](mistral)
55 | 3. [Pretrain nanoT5](nanot5)
56 | 4. [Pretrain smaller Causal LM](pretrain-clm)
57 | 5. [Pretrain LLM](pretrain-llm)
58 | 6. [FPF TinyLlama](tinyllama)
59 | 7. [FPF Yi](yi)
60 | 
61 | ## end-to-end processing using Python script
62 | 
63 | Released as a Python library, https://github.com/malaysia-ai/clean_text_my
64 | 
65 | 


--------------------------------------------------------------------------------
/speech-instructions/prepare-malaysian-podcast.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 5,
 6 |    "id": "1ac7cbf3",
 7 |    "metadata": {},
 8 |    "outputs": [],
 9 |    "source": [
10 |     "import pandas as pd\n",
11 |     "import numpy as np\n",
12 |     "from tqdm import tqdm\n",
13 |     "import torchaudio\n",
14 |     "import os"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": 6,
20 |    "id": "64cd5042",
21 |    "metadata": {
22 |     "scrolled": true
23 |    },
24 |    "outputs": [
25 |     {
26 |      "name": "stderr",
27 |      "output_type": "stream",
28 |      "text": [
29 |       "100%|███████████████████████████████████████████████████████████████████████████| 2033890/2033890 [00:08<00:00, 244819.71it/s]\n"
30 |      ]
31 |     }
32 |    ],
33 |    "source": [
34 |     "df = pd.read_parquet('/home/husein/ssd4/verify-text.parquet')\n",
35 |     "filtered = []\n",
36 |     "for i in tqdm(range(len(df))):\n",
37 |     "    if 'podcast_processed' in df['audio'].iloc[i]:\n",
38 |     "        row = df.iloc[i].to_dict()\n",
39 |     "        f = os.path.join('/home/husein/ssd4/', row['audio'])\n",
40 |     "        row['audio'] = f\n",
41 |     "        filtered.append(row)"
42 |    ]
43 |   },
44 |   {
45 |    "cell_type": "code",
46 |    "execution_count": 7,
47 |    "id": "438c8a4e",
48 |    "metadata": {},
49 |    "outputs": [
50 |     {
51 |      "data": {
52 |       "text/plain": [
53 |        "75965"
54 |       ]
55 |      },
56 |      "execution_count": 7,
57 |      "metadata": {},
58 |      "output_type": "execute_result"
59 |     }
60 |    ],
61 |    "source": [
62 |     "len(filtered)"
63 |    ]
64 |   },
65 |   {
66 |    "cell_type": "code",
67 |    "execution_count": 8,
68 |    "id": "f625ca99",
69 |    "metadata": {},
70 |    "outputs": [],
71 |    "source": [
72 |     "pd.DataFrame(filtered).to_parquet('filtered-podcast.parquet')"
73 |    ]
74 |   }
75 |  ],
76 |  "metadata": {
77 |   "kernelspec": {
78 |    "display_name": "python3.10",
79 |    "language": "python",
80 |    "name": "python3.10"
81 |   },
82 |   "language_info": {
83 |    "codemirror_mode": {
84 |     "name": "ipython",
85 |     "version": 3
86 |    },
87 |    "file_extension": ".py",
88 |    "mimetype": "text/x-python",
89 |    "name": "python",
90 |    "nbconvert_exporter": "python",
91 |    "pygments_lexer": "ipython3",
92 |    "version": "3.10.15"
93 |   }
94 |  },
95 |  "nbformat": 4,
96 |  "nbformat_minor": 5
97 | }
98 | 


--------------------------------------------------------------------------------
/emotional-malaysian-emilia/pitch_estimation.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | import torch
 3 | import torchaudio
 4 | from glob import glob
 5 | from tqdm import tqdm
 6 | import os
 7 | import penn
 8 | import torch
 9 | import huggingface_hub
10 | 
def new_path(f):
    """
    Map an audio path to the path of its pitch file.

    Every `.mp3` occurrence in `f` is replaced by `.pitch`, and `_pitch` is
    appended to the first `/`-separated component, e.g.
    `audio/spk/clip.mp3` -> `audio_pitch/spk/clip.pitch`.
    """
    head, *tail = f.replace('.mp3', '.pitch').split('/')
    return '/'.join([f'{head}_pitch', *tail])
17 | 
@click.command()
@click.option("--path", help="files path in glob pattern")
@click.option("--global-index", default=1, help="global index")
@click.option("--local-index", default=0, help="local index")
@click.option(
    "--interp-unvoiced-at",
    default=0.065,
    help="periodicity threshold passed to penn.voicing.interpolate",
)
def function(path, global_index, local_index, interp_unvoiced_at=0.065):
    """
    Estimate pitch and periodicity for audio files and save them as JSON.

    Files matching --path whose output (see `new_path`) does not exist yet are
    split into --global-index shards; this process handles shard --local-index.
    """
    # `json` is not imported at module level, so bring it into scope here.
    import json

    # Keep only the files that still need processing. An output of 2 bytes or
    # fewer is treated as not done (presumably an empty/truncated result).
    pending = []
    for f in glob(path):
        out = new_path(f)
        if os.path.exists(out) and os.path.getsize(out) > 2:
            continue
        pending.append(f)

    global_size = len(pending) // global_index
    start = global_size * local_index
    if local_index == global_index - 1:
        # The last shard also takes the remainder left by the floor division,
        # otherwise up to `global_index - 1` files would never be processed.
        end = len(pending)
    else:
        end = start + global_size
    files = pending[start:end]

    model = penn.Model()
    checkpoint = huggingface_hub.hf_hub_download(
        'maxrmorrison/fcnf0-plus-plus',
        'fcnf0++.pt',
    )
    checkpoint = torch.load(checkpoint, map_location='cpu')
    model.load_state_dict(checkpoint['model'])

    model = model.to('cuda').to(torch.float16)

    with torch.no_grad():
        for f in tqdm(files):
            y, sr = torchaudio.load(f)
            y = torchaudio.functional.resample(y, sr, penn.SAMPLE_RATE)
            pitch, periodicity = [], []
            for frames in penn.preprocess(y):
                logits = model(frames.to(torch.float16).to('cuda'))
                result = penn.postprocess(logits)
                # Elements 1 and 2 of the postprocess result are used as the
                # pitch and periodicity of this batch of frames.
                pitch.append(result[1])
                periodicity.append(result[2])
            pitch, periodicity = torch.cat(pitch, 1), torch.cat(periodicity, 1)
            # NOTE(review): the original referenced an undefined name
            # `interp_unvoiced_at`; the 0.065 default is the value used in
            # penn's README example -- confirm it suits this data.
            pitch = penn.voicing.interpolate(
                pitch,
                periodicity,
                interp_unvoiced_at)
            pitch = [round(p, 4) for p in pitch[0].cpu().numpy().tolist()]
            periodicity = [round(p, 4) for p in periodicity[0].cpu().numpy().tolist()]

            splitted = new_path(f)
            os.makedirs(os.path.split(splitted)[0], exist_ok = True)

            with open(splitted, 'w') as fopen:
                json.dump({'pitch': pitch, 'periodicity': periodicity}, fopen)
71 | 
72 | if __name__ == '__main__':
73 |     function()


--------------------------------------------------------------------------------
/multilingual-tts/prepare/prepare-WenetSpeech4TTS.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 10,
 6 |    "id": "a2127d3e-e002-4d8c-8c8b-ec5ecfeb8b7a",
 7 |    "metadata": {},
 8 |    "outputs": [],
 9 |    "source": [
10 |     "from glob import glob\n",
11 |     "import pandas as pd\n",
12 |     "import os\n",
13 |     "import soundfile as sf\n",
14 |     "from tqdm import tqdm\n",
15 |     "from multiprocess import Pool\n",
16 |     "import librosa\n",
17 |     "import itertools\n",
18 |     "import io\n",
19 |     "import numpy as np\n",
20 |     "import json\n",
21 |     "import tarfile\n",
22 |     "\n",
23 |     "def chunks(l, n):\n",
24 |     "    for i in range(0, len(l), n):\n",
25 |     "        yield (l[i: i + n], i // n)\n",
26 |     "\n",
27 |     "def multiprocessing(strings, function, cores=6, returned=True):\n",
28 |     "    df_split = chunks(strings, len(strings) // cores)\n",
29 |     "    pool = Pool(cores)\n",
30 |     "    pooled = pool.map(function, df_split)\n",
31 |     "    pool.close()\n",
32 |     "    pool.join()\n",
33 |     "\n",
34 |     "    if returned:\n",
35 |     "        return list(itertools.chain(*pooled))"
36 |    ]
37 |   },
38 |   {
39 |    "cell_type": "code",
40 |    "execution_count": 11,
41 |    "id": "53d47aa9-b5f8-4378-8416-45ceace47196",
42 |    "metadata": {},
43 |    "outputs": [],
44 |    "source": [
45 |     "# def loop(files):\n",
46 |     "#     files, _ = files\n",
47 |     "#     for f in tqdm(files):\n",
48 |     "#         with tarfile.open(f, \"r:gz\") as tar:\n",
49 |     "#             tar.extractall(path='WenetSpeech4TTS/Standard')\n",
50 |     "\n",
51 |     "# files = glob('WenetSpeech4TTS/Standard/*.tar.gz')\n",
52 |     "# multiprocessing(files, loop, len(files), returned = False)"
53 |    ]
54 |   },
55 |   {
56 |    "cell_type": "code",
57 |    "execution_count": 12,
58 |    "id": "f00e77ce-af6d-4232-b067-aedf1c9d1964",
59 |    "metadata": {},
60 |    "outputs": [],
61 |    "source": [
62 |     "# def loop(files):\n",
63 |     "#     files, _ = files\n",
64 |     "#     for f in tqdm(files):\n",
65 |     "#         with tarfile.open(f, \"r:gz\") as tar:\n",
66 |     "#             tar.extractall(path='WenetSpeech4TTS/Premium')\n",
67 |     "\n",
68 |     "# files = glob('WenetSpeech4TTS/Premium/*.tar.gz')\n",
69 |     "# multiprocessing(files, loop, len(files), returned = False)"
70 |    ]
71 |   }
72 |  ],
73 |  "metadata": {
74 |   "kernelspec": {
75 |    "display_name": "Python 3 (ipykernel)",
76 |    "language": "python",
77 |    "name": "python3"
78 |   },
79 |   "language_info": {
80 |    "codemirror_mode": {
81 |     "name": "ipython",
82 |     "version": 3
83 |    },
84 |    "file_extension": ".py",
85 |    "mimetype": "text/x-python",
86 |    "name": "python",
87 |    "nbconvert_exporter": "python",
88 |    "pygments_lexer": "ipython3",
89 |    "version": "3.10.12"
90 |   }
91 |  },
92 |  "nbformat": 4,
93 |  "nbformat_minor": 5
94 | }
95 | 


--------------------------------------------------------------------------------
/speech-instructions/remote.sh:
--------------------------------------------------------------------------------
 1 | apt update
 2 | apt install unzip ffmpeg -y
 3 | apt update && apt install -y locales
 4 | locale-gen en_US.UTF-8
 5 | cd /workspace
 6 | wget https://www.7-zip.org/a/7z2301-linux-x64.tar.xz
 7 | tar -xf 7z2301-linux-x64.tar.xz
 8 | pip3 install huggingface-hub
 9 | 
10 | python3 -c "
11 | from huggingface_hub import snapshot_download
12 | snapshot_download(repo_id='malaysia-ai/dedup-Malaysian-Emilia', repo_type='dataset', 
13 |                   allow_patterns = '*.z*', local_dir = './')
14 | "
15 | /workspace/7zz x dedup-parliament.zip -y -mmt40
16 | /workspace/7zz x dedup-podcasts.zip -y -mmt40
17 | 
18 | wget https://github.com/mesolitica/malaysian-dataset/raw/refs/heads/master/text-to-speech/husein/requirements.txt
19 | pip3 install -r requirements.txt
20 | pip3 install click vocos torchdiffeq==0.2.4 x-transformers==1.42.11 jieba==0.42.1 pypinyin==0.53.0
21 | 
22 | wget https://huggingface.co/datasets/malaysia-ai/Speech-Instructions/resolve/main/text/partition-instructions-part-3.json
23 | wget https://huggingface.co/datasets/malaysia-ai/Speech-Instructions/resolve/main/text/partition-instructions-part-4.json
24 | wget https://huggingface.co/datasets/malaysia-ai/Speech-Instructions/resolve/main/text/partition-instructions-part-5.json
25 | wget https://huggingface.co/datasets/malaysia-ai/Speech-Instructions/resolve/main/text/partition-instructions-part-6.json
26 | wget https://raw.githubusercontent.com/malaysia-ai/dataset/refs/heads/main/speech-instructions/generate.py
27 | 
28 | for i in {0..3}; do
29 |   screen -S "partition-instructions-part-3_$i" -X quit 2>/dev/null
30 |   screen -dmS "partition-instructions-part-3_$i" bash -c "cd /workspace && \
31 |   CUDA_VISIBLE_DEVICES=0 \
32 |   python3 generate.py \
33 |     --input_file \"partition-instructions-part-3.json\" \
34 |     --folder \"partition-instructions-part-3\" \
35 |     --global_index 4 \
36 |     --index $i"
37 | done
38 | 
39 | for i in {0..3}; do
40 |   screen -S "partition-instructions-part-4_$i" -X quit 2>/dev/null
41 |   screen -dmS "partition-instructions-part-4_$i" bash -c "cd /workspace && \
42 |   CUDA_VISIBLE_DEVICES=1 \
43 |   python3 generate.py \
44 |     --input_file \"partition-instructions-part-4.json\" \
45 |     --folder \"partition-instructions-part-4\" \
46 |     --global_index 4 \
47 |     --index $i"
48 | done
49 | 
50 | for i in {0..3}; do
51 |   screen -S "partition-instructions-part-5_$i" -X quit 2>/dev/null
52 |   screen -dmS "partition-instructions-part-5_$i" bash -c "cd /workspace && \
53 |   CUDA_VISIBLE_DEVICES=2 \
54 |   python3 generate.py \
55 |     --input_file \"partition-instructions-part-5.json\" \
56 |     --folder \"partition-instructions-part-5\" \
57 |     --global_index 4 \
58 |     --index $i"
59 | done
60 | 
61 | for i in {0..3}; do
62 |   screen -S "partition-instructions-part-6_$i" -X quit 2>/dev/null
63 |   screen -dmS "partition-instructions-part-6_$i" bash -c "cd /workspace && \
64 |   CUDA_VISIBLE_DEVICES=3 \
65 |   python3 generate.py \
66 |     --input_file \"partition-instructions-part-6.json\" \
67 |     --folder \"partition-instructions-part-6\" \
68 |     --global_index 4 \
69 |     --index $i"
70 | done


--------------------------------------------------------------------------------
/speech-instructions-extra/upload.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 7,
  6 |    "id": "489cdcad",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "from glob import glob\n",
 11 |     "from tqdm import tqdm\n",
 12 |     "import json"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 8,
 18 |    "id": "02e5a67f",
 19 |    "metadata": {},
 20 |    "outputs": [
 21 |     {
 22 |      "name": "stderr",
 23 |      "output_type": "stream",
 24 |      "text": [
 25 |       "100%|████████████████████████████████████████████████████████████████████████████████████| 965/965 [00:01<00:00, 811.02it/s]\n"
 26 |      ]
 27 |     }
 28 |    ],
 29 |    "source": [
 30 |     "alls = []\n",
 31 |     "for f in tqdm(glob('*/*.parquet')):\n",
 32 |     "    try:\n",
 33 |     "        with open(f) as fopen:\n",
 34 |     "            d = json.load(fopen)\n",
 35 |     "        for d_ in d:\n",
 36 |     "            d_['start'] = None\n",
 37 |     "            d_['end'] = None\n",
 38 |     "            d_['context'] = None\n",
 39 |     "            d_['system'] = None\n",
 40 |     "            d_['sliced_audio_filename'] = None\n",
 41 |     "            alls.append(d_)\n",
 42 |     "    except:\n",
 43 |     "        pass"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 9,
 49 |    "id": "1def2d40",
 50 |    "metadata": {},
 51 |    "outputs": [
 52 |     {
 53 |      "data": {
 54 |       "text/plain": [
 55 |        "549110"
 56 |       ]
 57 |      },
 58 |      "execution_count": 9,
 59 |      "metadata": {},
 60 |      "output_type": "execute_result"
 61 |     }
 62 |    ],
 63 |    "source": [
 64 |     "len(alls)"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 13,
 70 |    "id": "e9b10351",
 71 |    "metadata": {},
 72 |    "outputs": [
 73 |     {
 74 |      "data": {
 75 |       "text/plain": [
 76 |        "{'instruction': 'What decision did Speaker1 and Speaker2 agree on regarding taking something first?',\n",
 77 |        " 'answer': 'Speaker1 would take the first one.',\n",
 78 |        " 'audio_filename': 'SQA-PART3-Train-audio/train-00145-of-00171-2.mp3',\n",
 79 |        " 'start': None,\n",
 80 |        " 'end': None,\n",
 81 |        " 'context': None,\n",
 82 |        " 'system': None,\n",
 83 |        " 'sliced_audio_filename': None}"
 84 |       ]
 85 |      },
 86 |      "execution_count": 13,
 87 |      "metadata": {},
 88 |      "output_type": "execute_result"
 89 |     }
 90 |    ],
 91 |    "source": [
 92 |     "alls[2]"
 93 |    ]
 94 |   }
 95 |  ],
 96 |  "metadata": {
 97 |   "kernelspec": {
 98 |    "display_name": "python3.10",
 99 |    "language": "python",
100 |    "name": "python3.10"
101 |   },
102 |   "language_info": {
103 |    "codemirror_mode": {
104 |     "name": "ipython",
105 |     "version": 3
106 |    },
107 |    "file_extension": ".py",
108 |    "mimetype": "text/x-python",
109 |    "name": "python",
110 |    "nbconvert_exporter": "python",
111 |    "pygments_lexer": "ipython3",
112 |    "version": "3.10.15"
113 |   }
114 |  },
115 |  "nbformat": 4,
116 |  "nbformat_minor": 5
117 | }
118 | 


--------------------------------------------------------------------------------
/text/pretrain-llm/prepare-madlad-400-ms.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "86d80b05",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# !git lfs clone https://huggingface.co/datasets/malaysia-ai/madlad-400-ms"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 3,
 16 |    "id": "54ca47c5",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import json\n",
 21 |     "import os\n",
 22 |     "from glob import glob\n",
 23 |     "from tqdm import tqdm"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 4,
 29 |    "id": "e92d6668",
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "def partition(text, size = 500):\n",
 34 |     "    splitted = text.split()\n",
 35 |     "    return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 2,
 41 |    "id": "f1b7e4f9",
 42 |    "metadata": {},
 43 |    "outputs": [
 44 |     {
 45 |      "name": "stdout",
 46 |      "output_type": "stream",
 47 |      "text": [
 48 |       "madlad-400-ms.jsonl00.splitted\tmadlad-400-ms.jsonl02.splitted\r\n",
 49 |       "madlad-400-ms.jsonl01.splitted\r\n"
 50 |      ]
 51 |     }
 52 |    ],
 53 |    "source": [
 54 |     "!ls madlad-400-ms"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 5,
 60 |    "id": "485c6a71",
 61 |    "metadata": {},
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "a = open('prepare-madlad-400-ms.jsonl', 'w')"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 6,
 70 |    "id": "6aacc1a4",
 71 |    "metadata": {},
 72 |    "outputs": [
 73 |     {
 74 |      "name": "stderr",
 75 |      "output_type": "stream",
 76 |      "text": [
 77 |       "4081851it [18:57, 3587.17it/s]\n",
 78 |       "5000000it [23:02, 3615.70it/s]\n",
 79 |       "5000000it [34:34, 2410.40it/s]\n"
 80 |      ]
 81 |     }
 82 |    ],
 83 |    "source": [
 84 |     "madlad_ms = glob('madlad-400-ms/*.splitted')\n",
 85 |     "for f in madlad_ms:\n",
 86 |     "    with open(f) as fopen:\n",
 87 |     "        for l in tqdm(fopen):\n",
 88 |     "            try:\n",
 89 |     "                data = '<s>' + json.loads(l)['text'] + '</s>'\n",
 90 |     "                partitioned = partition(data)\n",
 91 |     "                for p in partitioned:\n",
 92 |     "                    data = {\n",
 93 |     "                        'text': p,\n",
 94 |     "                    }\n",
 95 |     "                    a.write(f'{json.dumps(data)}\\n')\n",
 96 |     "                    a.flush()\n",
 97 |     "            except:\n",
 98 |     "                pass"
 99 |    ]
100 |   }
101 |  ],
102 |  "metadata": {
103 |   "kernelspec": {
104 |    "display_name": "Python 3 (ipykernel)",
105 |    "language": "python",
106 |    "name": "python3"
107 |   },
108 |   "language_info": {
109 |    "codemirror_mode": {
110 |     "name": "ipython",
111 |     "version": 3
112 |    },
113 |    "file_extension": ".py",
114 |    "mimetype": "text/x-python",
115 |    "name": "python",
116 |    "nbconvert_exporter": "python",
117 |    "pygments_lexer": "ipython3",
118 |    "version": "3.10.12"
119 |   }
120 |  },
121 |  "nbformat": 4,
122 |  "nbformat_minor": 5
123 | }
124 | 


--------------------------------------------------------------------------------
/emotional-malaysian-emilia/audioset_sliding.py:
--------------------------------------------------------------------------------
 1 | from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
 2 | from torch.utils.data import DataLoader
 3 | from torch.nn import functional as F
 4 | from tqdm import tqdm
 5 | from glob import glob
 6 | from datasets import Audio
 7 | import torch
 8 | import torchaudio
 9 | import numpy as np
10 | import click
11 | import os
12 | import json
13 | 
def new_path(f):
    """Derive the result-file path that belongs to an input audio path.

    Every '.mp3' occurrence in `f` becomes '.audioset', and the first
    '/'-separated component gets an '_audioset' suffix, e.g.
    'data/a/b.mp3' -> 'data_audioset/a/b.audioset'.
    """
    head, *tail = f.replace('.mp3', '.audioset').split('/')
    return '/'.join([head + '_audioset', *tail])
20 | 
21 | 
@click.command()
@click.option("--path", help="files path in glob pattern")
@click.option("--global-index", default=1, help="global index")
@click.option("--local-index", default=0, help="local index")
@click.option("--sliding", default=0.25)
@click.option("--model", default='MIT/ast-finetuned-audioset-10-10-0.4593')
def function(path, global_index, local_index, sliding, model):
    """Label sliding windows of audio files and store the top-5 classes as JSON.

    Parameters
    ----------
    path : str
        Glob pattern selecting the input audio files.
    global_index : int
        Number of shards the pending file list is divided into.
    local_index : int
        Zero-based shard handled by this invocation.
    sliding : float
        Window length in seconds (windows do not overlap).
    model : str
        Name or path passed to `from_pretrained` for both the feature
        extractor and the classifier.

    For every processed file a JSON document with the keys 'timestamps',
    'labels' and 'probs' is written to the location given by `new_path`.
    Files whose result already exists with more than 2 bytes are skipped.
    """

    feature_extractor = AutoFeatureExtractor.from_pretrained(model, return_attention_mask = True)
    model = AutoModelForAudioClassification.from_pretrained(model, torch_dtype = torch.float16).eval().cuda()
    id2label = model.config.id2label
    sr = feature_extractor.sampling_rate
    sliding = int(sliding * sr)
    audio = Audio(sampling_rate = sr)

    # Sort so that the shard boundaries do not depend on directory listing order.
    pending = []
    for f in sorted(glob(path)):
        done = new_path(f)
        if os.path.exists(done) and os.path.getsize(done) > 2:
            continue
        pending.append(f)

    global_size = len(pending) // global_index
    shard_start = global_size * local_index
    if local_index == global_index - 1:
        # The last shard also takes the remainder; plain integer division
        # would otherwise leave those files assigned to nobody.
        shard_end = len(pending)
    else:
        shard_end = global_size * (local_index + 1)
    files = pending[shard_start: shard_end]

    with torch.no_grad():
        for f in tqdm(files):
            y = audio.decode_example(audio.encode_example(f))['array']
            timestamps = []
            slided = []
            for i in range(0, len(y), sliding):
                y_ = y[i: i + sliding]
                # Windows shorter than 1000 samples are dropped.
                if len(y_) < 1000:
                    continue
                slided.append(y_)
                start = i / sr
                end = min(len(y) / sr, (i + sliding) / sr)
                timestamps.append((start, end))

            if not slided:
                # Nothing long enough to classify; an empty batch cannot be
                # fed to the feature extractor.
                continue

            inputs = feature_extractor(slided, sampling_rate=sr,
                           return_tensors="pt", return_attention_mask = True)
            inputs['input_values'] = inputs['input_values'].to(torch.float16).cuda()
            logits = model(**inputs).logits.softmax(-1)
            topk = torch.topk(logits, 5, dim = -1)

            probs = [
                [round(p, 4) for p in row]
                for row in topk.values.cpu().numpy().tolist()
            ]
            labels = [
                [id2label[r] for r in row]
                for row in topk.indices.cpu().numpy().tolist()
            ]

            splitted = new_path(f)
            os.makedirs(os.path.split(splitted)[0], exist_ok = True)
            with open(splitted, 'w') as fopen:
                json.dump({'timestamps': timestamps, 'labels': labels, 'probs': probs}, fopen)
83 |             
84 | if __name__ == '__main__':
85 |     function()


--------------------------------------------------------------------------------
/speech-instructions/prepare-malaysian-others.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "3c883f91",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# !wget https://huggingface.co/Zyphra/Zonos-v0.1-speaker-embedding/resolve/main/ResNet293_SimAM_ASP_base.pt"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "id": "bf4cd179",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import os\n",
 21 |     "\n",
 22 |     "os.environ['CUDA_VISIBLE_DEVICES'] = '1'\n",
 23 |     "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 3,
 29 |    "id": "1ac7cbf3",
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "import pandas as pd\n",
 34 |     "import numpy as np\n",
 35 |     "from tqdm import tqdm\n",
 36 |     "import torchaudio"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 8,
 42 |    "id": "56d3111e",
 43 |    "metadata": {
 44 |     "scrolled": true
 45 |    },
 46 |    "outputs": [
 47 |     {
 48 |      "name": "stderr",
 49 |      "output_type": "stream",
 50 |      "text": [
 51 |       "100%|████████████████████████████████████████████████████████████████████████████| 2033890/2033890 [00:20<00:00, 97450.43it/s]\n"
 52 |      ]
 53 |     },
 54 |     {
 55 |      "data": {
 56 |       "text/plain": [
 57 |        "555379"
 58 |       ]
 59 |      },
 60 |      "execution_count": 8,
 61 |      "metadata": {},
 62 |      "output_type": "execute_result"
 63 |     }
 64 |    ],
 65 |    "source": [
 66 |     "df = pd.read_parquet('/home/husein/ssd4/verify-text.parquet')\n",
 67 |     "filtered = []\n",
 68 |     "for i in tqdm(range(len(df))):\n",
 69 |     "    f = df['audio'].iloc[i]\n",
 70 |     "    \n",
 71 |     "    if 'parlimen-24k' not in f and 'podcast_processed' not in f and 'dialects_processed' not in f:\n",
 72 |     "        row = df.iloc[i].to_dict()\n",
 73 |     "        f = os.path.join('/home/husein/ssd4/', row['audio'])\n",
 74 |     "        row['audio'] = f\n",
 75 |     "        filtered.append(row)\n",
 76 |     "    \n",
 77 |     "    \n",
 78 |     "len(filtered)"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 9,
 84 |    "id": "c358a3f7",
 85 |    "metadata": {},
 86 |    "outputs": [
 87 |     {
 88 |      "data": {
 89 |       "text/plain": [
 90 |        "555379"
 91 |       ]
 92 |      },
 93 |      "execution_count": 9,
 94 |      "metadata": {},
 95 |      "output_type": "execute_result"
 96 |     }
 97 |    ],
 98 |    "source": [
 99 |     "len(filtered)"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 12,
105 |    "id": "54d9bd5b",
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": [
109 |     "pd.DataFrame(filtered).to_parquet('filtered-others.parquet')"
110 |    ]
111 |   }
112 |  ],
113 |  "metadata": {
114 |   "kernelspec": {
115 |    "display_name": "python3.10",
116 |    "language": "python",
117 |    "name": "python3.10"
118 |   },
119 |   "language_info": {
120 |    "codemirror_mode": {
121 |     "name": "ipython",
122 |     "version": 3
123 |    },
124 |    "file_extension": ".py",
125 |    "mimetype": "text/x-python",
126 |    "name": "python",
127 |    "nbconvert_exporter": "python",
128 |    "pygments_lexer": "ipython3",
129 |    "version": "3.10.15"
130 |   }
131 |  },
132 |  "nbformat": 4,
133 |  "nbformat_minor": 5
134 | }
135 | 


--------------------------------------------------------------------------------
/text/text_dedup/utils/analysis.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | 
  3 | from scipy.integrate import quad as integrate
  4 | 
  5 | from text_dedup.utils.tokenization import ngrams
  6 | 
  7 | 
  8 | def jaccard_similarity(
  9 |     doc1,
 10 |     doc2,
 11 |     ngram_size: int = 8,
 12 |     min_length: int = 0,
 13 | ) -> float:
 14 |     """Compute the Jaccard similarity between two documents.
 15 | 
 16 |     Parameters
 17 |     ----------
 18 |     doc1 : str or List[str]
 19 |         The first document.
 20 |     doc2 : str or List[str]
 21 |         The second document.
 22 |     ngram_size : int, optional
 23 |         The size of n-grams, by default 8
 24 |     min_length : int, optional
 25 |         The minimum length of each n-gram, by default 0
 26 | 
 27 |     Returns
 28 |     -------
 29 |     float
 30 |         The Jaccard similarity.
 31 | 
 32 |     Examples
 33 |     --------
 34 |     >>> jaccard_similarity("hello world", "hello world")
 35 |     1.0
 36 |     >>> jaccard_similarity("hello world", "hello world!")
 37 |     0.8
 38 |     >>> jaccard_similarity("hello world".split(), "hello world!".split(), ngram_size=1)
 39 |     0.3333333333333333
 40 |     """
 41 |     words1 = set(" ".join(ng) for ng in ngrams(list(doc1), ngram_size, min_length=min_length))
 42 |     words2 = set(" ".join(ng) for ng in ngrams(list(doc2), ngram_size, min_length=min_length))
 43 |     return len(words1 & words2) / max(1, len(words1 | words2))
 44 | 
 45 | 
 46 | def optimal_param(
 47 |     threshold: float,
 48 |     num_perm: int,
 49 |     false_positive_weight: float = 0.5,
 50 |     false_negative_weight: float = 0.5,
 51 | ):
 52 |     """
 53 |     Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum
 54 |     of probabilities of false positive and false negative, taken from datasketch.
 55 | 
 56 |     You can also refer to the interactive demo at https://huggingface.co/spaces/bigcode/near-deduplication.
 57 | 
 58 |     Parameters
 59 |     ----------
 60 |     threshold : float
 61 |         The threshold for similarity.
 62 |     num_perm : int
 63 |         The number of permutations.
 64 |     false_positive_weight : float
 65 |         The weight of false positive.
 66 |     false_negative_weight : float
 67 |         The weight of false negative.
 68 | 
 69 |     Returns
 70 |     -------
 71 |     Tuple[int, int]
 72 |         The optimal `b` (bands) and `r` (rows) parameters.
 73 | 
 74 |     Examples
 75 |     --------
 76 |     >>> optimal_param(0.75, 256)
 77 |     (21, 12)
 78 |     >>> optimal_param(0.75, 256, 0.1, 0.9)
 79 |     (28, 9)
 80 |     """
 81 | 
 82 |     def false_positive_area(threshold: float, b: int, r: int):
 83 |         """Source: `datasketch.lsh`"""
 84 | 
 85 |         def proba(s):
 86 |             return 1 - (1 - s ** float(r)) ** float(b)
 87 | 
 88 |         a, _ = integrate(proba, 0.0, threshold)
 89 |         return a
 90 | 
 91 |     def false_negative_area(threshold: float, b: int, r: int):
 92 |         """Source: `datasketch.lsh`"""
 93 | 
 94 |         def proba(s):
 95 |             return 1 - (1 - (1 - s ** float(r)) ** float(b))
 96 | 
 97 |         a, _ = integrate(proba, threshold, 1.0)
 98 |         return a
 99 | 
100 |     min_error = float("inf")
101 |     opt = (0, 0)
102 |     for b in range(1, num_perm + 1):
103 |         max_r = int(num_perm / b)
104 |         for r in range(1, max_r + 1):
105 |             fp = false_positive_area(threshold, b, r)
106 |             fn = false_negative_area(threshold, b, r)
107 |             error = fp * false_positive_weight + fn * false_negative_weight
108 |             if error < min_error:
109 |                 min_error = error
110 |                 opt = (b, r)
111 |     return opt


--------------------------------------------------------------------------------
/stt-whisper/force_alignment.py:
--------------------------------------------------------------------------------
  1 | from tqdm import tqdm
  2 | from multiprocess import Pool
  3 | import torch
  4 | import torchaudio
  5 | import pandas as pd
  6 | import click
  7 | import os
  8 | import json
  9 | 
 10 | device = 'cuda'
 11 | 
 12 | def chunks(l, devices, language, folder):
 13 |     chunk_size = len(l) // len(devices)
 14 |     remainder = len(l) % len(devices)
 15 |     start = 0
 16 |     for i in range(len(devices)):
 17 |         extra = 1 if i < remainder else 0
 18 |         end = start + chunk_size + extra
 19 |         yield (l[start:end], devices[i], language, folder)
 20 |         start = end
 21 | 
 22 | def loop(rows):
 23 |     rows, index, language, folder  = rows
 24 |     os.environ['CUDA_VISIBLE_DEVICES'] = str(index)
 25 | 
 26 |     from ctc_forced_aligner import (
 27 |         load_audio,
 28 |         load_alignment_model,
 29 |         generate_emissions,
 30 |         preprocess_text,
 31 |         get_alignments,
 32 |         get_spans,
 33 |         postprocess_results,
 34 |     )
 35 |     import torch
 36 | 
 37 |     alignment_model, alignment_tokenizer = load_alignment_model(
 38 |         device,
 39 |         dtype=torch.float16 if device == "cuda" else torch.float32,
 40 |     )
 41 | 
 42 |     with torch.no_grad():
 43 |         for row in tqdm(rows):
 44 |             t = row.get('pronunciation', '')
 45 |             if not len(t):
 46 |                 t = row.get('question')
 47 |             f = row['audio_filename']
 48 |             new_f = f.replace('/', '_').replace('.mp3', '.json').replace('.wav', '.json')
 49 |             filename = os.path.join(folder, new_f)
 50 |             if os.path.exists(filename):
 51 |                 continue
 52 |             new_wav, sr = torchaudio.load(f)
 53 |             audio_waveform = torchaudio.functional.resample(
 54 |                 new_wav[0], orig_freq=sr, new_freq=16000
 55 |             ).type(torch.float16).cuda()
 56 |             emissions, stride = generate_emissions(
 57 |                 alignment_model, audio_waveform, batch_size=1
 58 |             )
 59 |             tokens_starred, text_starred = preprocess_text(
 60 |                 t,
 61 |                 romanize=True,
 62 |                 language=language,
 63 |             )
 64 |             segments, scores, blank_token = get_alignments(
 65 |                 emissions,
 66 |                 tokens_starred,
 67 |                 alignment_tokenizer,
 68 |             )
 69 |             spans = get_spans(tokens_starred, segments, blank_token)
 70 |             word_timestamps = postprocess_results(text_starred, spans, stride, scores)
 71 |             with open(filename, 'w') as fopen:
 72 |                 row['word_timestamps'] = word_timestamps
 73 |                 json.dump(row, fopen)
 74 |     
 75 | @click.command()
 76 | @click.option('--filename')
 77 | @click.option('--language', default = 'en')
 78 | @click.option('--replication', default = 1)
 79 | @click.option('--folder', default = 'force_alignment')
 80 | def main(filename, language, replication, folder):
 81 |     os.makedirs(folder, exist_ok = True)
 82 |     devices = os.environ.get('CUDA_VISIBLE_DEVICES')
 83 |     if devices is None:
 84 |         devices = list(range(torch.cuda.device_count()))
 85 |     else:
 86 |         devices = [d.strip() for d in devices.split(',')]
 87 | 
 88 |     devices = replication * devices
 89 |     print(devices)
 90 | 
 91 |     with open(filename) as fopen:
 92 |         rows = json.load(fopen)
 93 | 
 94 |     df_split = chunks(rows, devices, language, folder)
 95 |     pool = Pool(len(devices))
 96 |     pooled = pool.map(loop, df_split)
 97 |     pool.close()
 98 |     pool.join()
 99 | 
100 | if __name__ == '__main__':
101 |     main()


--------------------------------------------------------------------------------
/text/compare-tokens.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 2,
  6 |    "id": "f4d6d81b",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# !wget https://huggingface.co/datasets/mesolitica/malaysian-ultrachat/resolve/main/ultrachat-astroawani-malay.jsonl"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 12,
 16 |    "id": "e7743bc8",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import json\n",
 21 |     "from tqdm import tqdm\n",
 22 |     "from transformers import AutoTokenizer"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 15,
 28 |    "id": "1709e65a",
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "tokenizer_mallam = AutoTokenizer.from_pretrained('malaysia-ai/sentencepiece-tokenizer')\n",
 33 |     "tokenizer_llama2 = AutoTokenizer.from_pretrained('mesolitica/llama-7b-hf-2048-fpf')\n",
 34 |     "tokenizer_mistral = AutoTokenizer.from_pretrained('mesolitica/mistral-7b-4096-fpf')"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 18,
 40 |    "id": "9655d899",
 41 |    "metadata": {},
 42 |    "outputs": [
 43 |     {
 44 |      "name": "stderr",
 45 |      "output_type": "stream",
 46 |      "text": [
 47 |       "60198it [04:20, 230.88it/s]\n"
 48 |      ]
 49 |     }
 50 |    ],
 51 |    "source": [
 52 |     "mallam, llama2, mistral = 0, 0, 0\n",
 53 |     "with open('ultrachat-astroawani-malay.jsonl') as fopen:\n",
 54 |     "    for l in tqdm(fopen):\n",
 55 |     "        l = json.loads(l)\n",
 56 |     "        for r in l[1:]:\n",
 57 |     "            if r['content_ms']:\n",
 58 |     "                mallam += len(tokenizer_mallam(r['content_ms'])['input_ids'])\n",
 59 |     "                llama2 += len(tokenizer_llama2(r['content_ms'])['input_ids'])\n",
 60 |     "                mistral += len(tokenizer_mistral(r['content_ms'])['input_ids'])"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 19,
 66 |    "id": "7b5901bd",
 67 |    "metadata": {},
 68 |    "outputs": [
 69 |     {
 70 |      "data": {
 71 |       "text/plain": [
 72 |        "(26157664, 60391551, 60823929)"
 73 |       ]
 74 |      },
 75 |      "execution_count": 19,
 76 |      "metadata": {},
 77 |      "output_type": "execute_result"
 78 |     }
 79 |    ],
 80 |    "source": [
 81 |     "mallam, llama2, mistral"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": 22,
 87 |    "id": "3e01dec0",
 88 |    "metadata": {},
 89 |    "outputs": [
 90 |     {
 91 |      "data": {
 92 |       "text/plain": [
 93 |        "0.4300554803028262"
 94 |       ]
 95 |      },
 96 |      "execution_count": 22,
 97 |      "metadata": {},
 98 |      "output_type": "execute_result"
 99 |     }
100 |    ],
101 |    "source": [
102 |     "(mallam / 60823929)"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "id": "1a1c8e06",
109 |    "metadata": {},
110 |    "outputs": [],
111 |    "source": []
112 |   }
113 |  ],
114 |  "metadata": {
115 |   "kernelspec": {
116 |    "display_name": "Python 3 (ipykernel)",
117 |    "language": "python",
118 |    "name": "python3"
119 |   },
120 |   "language_info": {
121 |    "codemirror_mode": {
122 |     "name": "ipython",
123 |     "version": 3
124 |    },
125 |    "file_extension": ".py",
126 |    "mimetype": "text/x-python",
127 |    "name": "python",
128 |    "nbconvert_exporter": "python",
129 |    "pygments_lexer": "ipython3",
130 |    "version": "3.10.12"
131 |   }
132 |  },
133 |  "nbformat": 4,
134 |  "nbformat_minor": 5
135 | }
136 | 


--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/analysis.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | 
  3 | from scipy.integrate import quad as integrate
  4 | 
  5 | from text_dedup.utils.tokenization import ngrams
  6 | 
  7 | 
  8 | def jaccard_similarity(
  9 |     doc1: str | List[str],
 10 |     doc2: str | List[str],
 11 |     ngram_size: int = 8,
 12 |     min_length: int = 0,
 13 | ) -> float:
 14 |     """Compute the Jaccard similarity between two documents.
 15 | 
 16 |     Parameters
 17 |     ----------
 18 |     doc1 : str or List[str]
 19 |         The first document.
 20 |     doc2 : str or List[str]
 21 |         The second document.
 22 |     ngram_size : int, optional
 23 |         The size of n-grams, by default 8
 24 |     min_length : int, optional
 25 |         The minimum length of each n-gram, by default 0
 26 | 
 27 |     Returns
 28 |     -------
 29 |     float
 30 |         The Jaccard similarity.
 31 | 
 32 |     Examples
 33 |     --------
 34 |     >>> jaccard_similarity("hello world", "hello world")
 35 |     1.0
 36 |     >>> jaccard_similarity("hello world", "hello world!")
 37 |     0.8
 38 |     >>> jaccard_similarity("hello world".split(), "hello world!".split(), ngram_size=1)
 39 |     0.3333333333333333
 40 |     """
 41 |     words1 = set(" ".join(ng) for ng in ngrams(list(doc1), ngram_size, min_length=min_length))
 42 |     words2 = set(" ".join(ng) for ng in ngrams(list(doc2), ngram_size, min_length=min_length))
 43 |     return len(words1 & words2) / max(1, len(words1 | words2))
 44 | 
 45 | 
 46 | def optimal_param(
 47 |     threshold: float,
 48 |     num_perm: int,
 49 |     false_positive_weight: float = 0.5,
 50 |     false_negative_weight: float = 0.5,
 51 | ):
 52 |     """
 53 |     Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum
 54 |     of probabilities of false positive and false negative, taken from datasketch.
 55 | 
 56 |     You can also refer to the interactive demo at https://huggingface.co/spaces/bigcode/near-deduplication.
 57 | 
 58 |     Parameters
 59 |     ----------
 60 |     threshold : float
 61 |         The threshold for similarity.
 62 |     num_perm : int
 63 |         The number of permutations.
 64 |     false_positive_weight : float
 65 |         The weight of false positive.
 66 |     false_negative_weight : float
 67 |         The weight of false negative.
 68 | 
 69 |     Returns
 70 |     -------
 71 |     Tuple[int, int]
 72 |         The optimal `b` (bands) and `r` (rows) parameters.
 73 | 
 74 |     Examples
 75 |     --------
 76 |     >>> optimal_param(0.75, 256)
 77 |     (21, 12)
 78 |     >>> optimal_param(0.75, 256, 0.1, 0.9)
 79 |     (28, 9)
 80 |     """
 81 | 
 82 |     def false_positive_area(threshold: float, b: int, r: int):
 83 |         """Source: `datasketch.lsh`"""
 84 | 
 85 |         def proba(s):
 86 |             return 1 - (1 - s ** float(r)) ** float(b)
 87 | 
 88 |         a, _ = integrate(proba, 0.0, threshold)
 89 |         return a
 90 | 
 91 |     def false_negative_area(threshold: float, b: int, r: int):
 92 |         """Source: `datasketch.lsh`"""
 93 | 
 94 |         def proba(s):
 95 |             return 1 - (1 - (1 - s ** float(r)) ** float(b))
 96 | 
 97 |         a, _ = integrate(proba, threshold, 1.0)
 98 |         return a
 99 | 
100 |     min_error = float("inf")
101 |     opt = (0, 0)
102 |     for b in range(1, num_perm + 1):
103 |         max_r = int(num_perm / b)
104 |         for r in range(1, max_r + 1):
105 |             fp = false_positive_area(threshold, b, r)
106 |             fn = false_negative_area(threshold, b, r)
107 |             error = fp * false_positive_weight + fn * false_negative_weight
108 |             if error < min_error:
109 |                 min_error = error
110 |                 opt = (b, r)
111 |     return opt


--------------------------------------------------------------------------------
/text/processing/utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | import json
  4 | import subprocess
  5 | from tqdm import tqdm
  6 | from pathlib import Path
  7 | 
  8 | 
  9 | def is_dir(path):
 10 |     return os.path.isdir(path)
 11 | 
 12 | 
 13 | def run_command(txt):
 14 |     subprocess.run(txt, shell=True)
 15 | 
 16 | 
 17 | def create_dir(path):
 18 |     Path(path).mkdir(parents=True, exist_ok=True)
 19 | 
 20 | 
 21 | def write_to_json(lst, fn):
 22 |     with open(fn, "w+") as file:
 23 |         for item in tqdm(lst):
 24 |             x = json.dumps(item, ensure_ascii=False)
 25 |             file.write(x + "\n")
 26 | 
 27 | 
 28 | http_errors = [
 29 |     "400 Bad Request",
 30 |     "401 Unauthorized",
 31 |     "402 Payment Required",
 32 |     "403 Forbidden",
 33 |     "404 Not Found",
 34 |     "405 Method Not Allowed",
 35 |     "406 Not Acceptable",
 36 |     "407 Proxy Authentication Required",
 37 |     "408 Request Timeout",
 38 |     "409 Conflict",
 39 |     "410 Gone",
 40 |     "411 Length Required",
 41 |     "412 Precondition Failed",
 42 |     "413 Payload Too Large",
 43 |     "414 URI Too Long",
 44 |     "415 Unsupported Media Type",
 45 |     "416 Range Not Satisfiable",
 46 |     "417 Expectation Failed",
 47 |     "418 I'm a teapot",
 48 |     "421 Misdirected Request",
 49 |     "422 Unprocessable Entity",
 50 |     "423 Locked",
 51 |     "424 Failed Dependency",
 52 |     "425 Too Early",
 53 |     "426 Upgrade Required",
 54 |     "428 Precondition Required",
 55 |     "429 Too Many Requests",
 56 |     "431 Request Header Fields Too Large",
 57 |     "451 Unavailable For Legal Reasons",
 58 |     "500 Internal Server Error",
 59 |     "501 Not Implemented",
 60 |     "502 Bad Gateway",
 61 |     "503 Service Unavailable",
 62 |     "504 Gateway Timeout",
 63 |     "505 HTTP Version Not Supported",
 64 |     "506 Variant Also Negotiates",
 65 |     "507 Insufficient Storage",
 66 |     "508 Loop Detected",
 67 |     "510 Not Extended",
 68 |     "511 Network Authentication Required",
 69 | ]
 70 | 
 71 | rejected = [
 72 |     "Internal Server Error",
 73 |     "__NOEDITSECTION__",
 74 |     "enter your username and password",
 75 |     "forgotten your password",
 76 |     "cookies enabled",
 77 |     "enable JavaScript in your browser.",
 78 |     "The page cannot be displayed",
 79 |     "site or edit the error_page",
 80 | ]
 81 | 
 82 | rejected.extend(http_errors)
 83 | 
 84 | 
 85 | def replace_multiple(input_string, pattern=r"\s{6,}", replace="   "):
 86 |     return re.sub(pattern, replace, input_string)
 87 | 
 88 | 
 89 | def replace(string):
 90 |     string = replace_multiple(string.replace("…", "."))
 91 |     string = replace_multiple(string, pattern=r"\.{6,}", replace="...")
 92 |     return string
 93 | 
 94 | 
 95 | def reject(string):
 96 |     if any([r in string for r in rejected]):
 97 |         return True
 98 |     return False
 99 | 
100 | 
def loop(files, process_type="multi"):
    """Post-process JSONL files, writing cleaned texts and a completion marker.

    Parameters
    ----------
    files : list or tuple
        When `process_type` is "multi", a 2-tuple whose first element is the
        list of input paths (the second element is ignored); otherwise the
        list of input paths itself.
    process_type : str
        "multi" triggers the tuple unpacking described above.

    For each input path, "dedupe-datasets/" is swapped for "postprocessing/"
    to get the output path and for "postprocessing-done/" to get the marker
    path. Inputs whose marker already exists are skipped. Every input line is
    parsed as JSON; its "text" value is dropped if `reject` flags it,
    otherwise it is stripped and passed through `replace`, and results of at
    least 3 characters are written as one JSON string per line.
    """
    if process_type == "multi":
        files, _ = files

    for src_path in files:
        out_path = src_path.replace("dedupe-datasets/", "postprocessing/")
        marker_path = src_path.replace("dedupe-datasets/", "postprocessing-done/")
        if os.path.exists(marker_path):
            continue

        with open(out_path, "w") as sink, open(src_path) as source:
            for line in tqdm(source):
                text = json.loads(line)["text"]
                if reject(text):
                    continue

                cleaned = replace(text.strip())
                if len(cleaned) >= 3:
                    sink.write(f"{json.dumps(cleaned)}\n")
                    sink.flush()

        # The marker is written only after the whole input has been processed.
        with open(marker_path, "w") as marker:
            marker.write("done")
128 | 


--------------------------------------------------------------------------------
/speech-instructions/prepare-malaysia-parliament.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "3c883f91",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# !wget https://huggingface.co/Zyphra/Zonos-v0.1-speaker-embedding/resolve/main/ResNet293_SimAM_ASP_base.pt"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "id": "bf4cd179",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import os\n",
 21 |     "\n",
 22 |     "os.environ['CUDA_VISIBLE_DEVICES'] = '1'\n",
 23 |     "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 10,
 29 |    "id": "1a51f3c2",
 30 |    "metadata": {},
 31 |    "outputs": [
 32 |     {
 33 |      "data": {
 34 |       "text/plain": [
 35 |        "{'idx': 0, 'prev_idx': -1}"
 36 |       ]
 37 |      },
 38 |      "execution_count": 10,
 39 |      "metadata": {},
 40 |      "output_type": "execute_result"
 41 |     }
 42 |    ],
 43 |    "source": [
 44 |     "import torch\n",
 45 |     "\n",
 46 |     "available_gpus = [torch.cuda.device(i) for i in range(torch.cuda.device_count())]\n",
 47 |     "available_gpus[0].__dict__"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 50,
 53 |    "id": "1ac7cbf3",
 54 |    "metadata": {},
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "import pandas as pd\n",
 58 |     "import numpy as np\n",
 59 |     "from tqdm import tqdm\n",
 60 |     "import torchaudio\n",
 61 |     "from speaker_cloning import SpeakerEmbedding"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 42,
 67 |    "id": "56d3111e",
 68 |    "metadata": {
 69 |     "scrolled": true
 70 |    },
 71 |    "outputs": [
 72 |     {
 73 |      "name": "stderr",
 74 |      "output_type": "stream",
 75 |      "text": [
 76 |       "100%|███████████████████████████████████████████████████████████████████████████| 2033890/2033890 [00:13<00:00, 146206.60it/s]\n",
 77 |       "100%|██████████████████████████████████████████████████████████████████████████████| 191545/191545 [00:02<00:00, 76849.55it/s]\n"
 78 |      ]
 79 |     },
 80 |     {
 81 |      "data": {
 82 |       "text/plain": [
 83 |        "636921"
 84 |       ]
 85 |      },
 86 |      "execution_count": 42,
 87 |      "metadata": {},
 88 |      "output_type": "execute_result"
 89 |     }
 90 |    ],
 91 |    "source": [
 92 |     "df = pd.read_parquet('/home/husein/ssd4/verify-text.parquet')\n",
 93 |     "filtered = []\n",
 94 |     "for i in tqdm(range(len(df))):\n",
 95 |     "    if 'parlimen-24k' in df['audio'].iloc[i]:\n",
 96 |     "        row = df.iloc[i].to_dict()\n",
 97 |     "        f = os.path.join('/home/husein/ssd4/', row['audio'])\n",
 98 |     "        row['audio'] = f\n",
 99 |     "        filtered.append(row)\n",
100 |     "        \n",
101 |     "df = pd.read_parquet('/home/husein/ssd4/verify-text-chunk-parliament.parquet')\n",
102 |     "for i in tqdm(range(len(df))):\n",
103 |     "    row = df.iloc[i].to_dict()\n",
104 |     "    f = os.path.join('/home/husein/ssd4/', row['audio'])\n",
105 |     "    row['audio'] = f\n",
106 |     "    filtered.append(row)\n",
107 |     "    \n",
108 |     "len(filtered)"
109 |    ]
110 |   }
111 |  ],
112 |  "metadata": {
113 |   "kernelspec": {
114 |    "display_name": "python3.10",
115 |    "language": "python",
116 |    "name": "python3.10"
117 |   },
118 |   "language_info": {
119 |    "codemirror_mode": {
120 |     "name": "ipython",
121 |     "version": 3
122 |    },
123 |    "file_extension": ".py",
124 |    "mimetype": "text/x-python",
125 |    "name": "python",
126 |    "nbconvert_exporter": "python",
127 |    "pygments_lexer": "ipython3",
128 |    "version": "3.10.15"
129 |   }
130 |  },
131 |  "nbformat": 4,
132 |  "nbformat_minor": 5
133 | }
134 | 


--------------------------------------------------------------------------------
/text/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/


--------------------------------------------------------------------------------
/text/madlad-400-ms/prepare-madlad-400-ms.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "20d4b02f",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "from datasets import load_dataset"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "id": "3fb192ea",
 17 |    "metadata": {},
 18 |    "outputs": [
 19 |     {
 20 |      "name": "stderr",
 21 |      "output_type": "stream",
 22 |      "text": [
 23 |       "/home/ubuntu/.local/lib/python3.10/site-packages/datasets/table.py:1421: FutureWarning: promote has been superseded by mode='default'.\n",
 24 |       "  table = cls._concat_blocks(blocks, axis=0)\n"
 25 |      ]
 26 |     }
 27 |    ],
 28 |    "source": [
 29 |     "madlad_multilang = load_dataset(\"allenai/madlad-400\", languages=[\"ms\", 'ms_Arab_BN', 'ms_Arab'])"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 3,
 35 |    "id": "f9c4b242",
 36 |    "metadata": {},
 37 |    "outputs": [
 38 |     {
 39 |      "data": {
 40 |       "text/plain": [
 41 |        "2"
 42 |       ]
 43 |      },
 44 |      "execution_count": 3,
 45 |      "metadata": {},
 46 |      "output_type": "execute_result"
 47 |     }
 48 |    ],
 49 |    "source": [
 50 |     "len(madlad_multilang)"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 4,
 56 |    "id": "4664c3e2",
 57 |    "metadata": {},
 58 |    "outputs": [
 59 |     {
 60 |      "data": {
 61 |       "text/plain": [
 62 |        "Dataset({\n",
 63 |        "    features: ['text'],\n",
 64 |        "    num_rows: 2337781\n",
 65 |        "})"
 66 |       ]
 67 |      },
 68 |      "execution_count": 4,
 69 |      "metadata": {},
 70 |      "output_type": "execute_result"
 71 |     }
 72 |    ],
 73 |    "source": [
 74 |     "madlad_multilang['clean']"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 5,
 80 |    "id": "69bca9f5",
 81 |    "metadata": {
 82 |     "scrolled": true
 83 |    },
 84 |    "outputs": [
 85 |     {
 86 |      "data": {
 87 |       "text/plain": [
 88 |        "Dataset({\n",
 89 |        "    features: ['text'],\n",
 90 |        "    num_rows: 14112025\n",
 91 |        "})"
 92 |       ]
 93 |      },
 94 |      "execution_count": 5,
 95 |      "metadata": {},
 96 |      "output_type": "execute_result"
 97 |     }
 98 |    ],
 99 |    "source": [
100 |     "madlad_multilang['noisy']"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": 6,
106 |    "id": "6c42fc18",
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "import json\n",
111 |     "from tqdm import tqdm"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": 7,
117 |    "id": "00ea83da",
118 |    "metadata": {},
119 |    "outputs": [
120 |     {
121 |      "name": "stderr",
122 |      "output_type": "stream",
123 |      "text": [
124 |       "100%|██████████| 14112025/14112025 [28:36<00:00, 8220.12it/s] \n"
125 |      ]
126 |     }
127 |    ],
128 |    "source": [
129 |     "with open('madlad-400-ms.jsonl', 'w') as fopen:\n",
130 |     "    for i in tqdm(range(len(madlad_multilang['noisy']))):\n",
131 |     "        t = madlad_multilang['noisy'][i]\n",
132 |     "        fopen.write(f'{json.dumps(t)}\\n')"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "id": "66b79ffc",
139 |    "metadata": {},
140 |    "outputs": [],
141 |    "source": []
142 |   }
143 |  ],
144 |  "metadata": {
145 |   "kernelspec": {
146 |    "display_name": "Python 3 (ipykernel)",
147 |    "language": "python",
148 |    "name": "python3"
149 |   },
150 |   "language_info": {
151 |    "codemirror_mode": {
152 |     "name": "ipython",
153 |     "version": 3
154 |    },
155 |    "file_extension": ".py",
156 |    "mimetype": "text/x-python",
157 |    "name": "python",
158 |    "nbconvert_exporter": "python",
159 |    "pygments_lexer": "ipython3",
160 |    "version": "3.10.12"
161 |   }
162 |  },
163 |  "nbformat": 4,
164 |  "nbformat_minor": 5
165 | }
166 | 


--------------------------------------------------------------------------------
/emilia-yodas/convert_neucodec_emilia.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import soundfile as sf
  3 | import json
  4 | import click
  5 | import re
  6 | import pandas as pd
  7 | import librosa
  8 | from glob import glob
  9 | from functools import partial
 10 | from multiprocess import Pool
 11 | from tqdm import tqdm
 12 | import numpy as np
 13 | import itertools
 14 | 
 15 | def old_chunks(l, n):
 16 |     for i in range(0, len(l), n):
 17 |         yield (l[i: i + n], i // n)
 18 |         
 19 | def chunks(l, devices):
 20 |     chunk_size = len(l) // len(devices)
 21 |     remainder = len(l) % len(devices)
 22 |     start = 0
 23 |     for i in range(len(devices)):
 24 |         extra = 1 if i < remainder else 0
 25 |         end = start + chunk_size + extra
 26 |         yield (l[start:end], devices[i])
 27 |         start = end
 28 |         
 29 | def new_path(f):
 30 |     splitted = f.split('/')
 31 |     folder = f.split('/')[0]
 32 |     folder = folder + '_neucodec'
 33 |     new_f = os.path.join(folder, '/'.join(splitted[1:]))
 34 |     new_f = new_f.replace('.mp3', '.json').replace('.wav', '.json')
 35 |     return new_f
 36 |     
 37 | def multiprocessing(strings, function, cores=6, returned=True):
 38 |     df_split = old_chunks(strings, len(strings) // cores)
 39 |     pool = Pool(cores)
 40 |     pooled = pool.map(function, df_split)
 41 |     pool.close()
 42 |     pool.join()
 43 | 
 44 |     if returned:
 45 |         return list(itertools.chain(*pooled))
 46 |         
 47 | def check(files):
 48 |     files, _ = files
 49 |     filtered = []
 50 |     for file in tqdm(files):
 51 |         filename_done = new_path(file)
 52 | 
 53 |         if os.path.exists(filename_done):
 54 |             try:
 55 |                 with open(filename_done) as fopen:
 56 |                     json.load(fopen)
 57 |                     continue
 58 |             except:
 59 |                 pass
 60 |             
 61 |         filtered.append(file)
 62 |     return filtered
 63 |     
 64 | def loop(
 65 |     indices_device_pair,
 66 | ):
 67 |     files, device = indices_device_pair
 68 |     os.environ['CUDA_VISIBLE_DEVICES'] = str(device)
 69 |     
 70 |     from neucodec import NeuCodec
 71 |     import torchaudio
 72 |     import torch
 73 |     torch.autograd.set_grad_enabled(False) 
 74 | 
 75 |     model = NeuCodec.from_pretrained("neuphonic/neucodec")
 76 |     model.eval().cuda()   
 77 | 
 78 |     for f in tqdm(files):
 79 |         filename = new_path(f)
 80 |         if os.path.exists(filename):
 81 |             try:
 82 |                 with open(filename) as fopen:
 83 |                     json.load(fopen)
 84 |                 continue
 85 |             except:
 86 |                 pass
 87 | 
 88 |         try:
 89 |             y, sr = librosa.load(f, sr = 16000)
 90 |             wav_tensor = torch.from_numpy(y).float().unsqueeze(0)
 91 |             fsq_codes = model.encode_code(wav_tensor.unsqueeze(1))
 92 |             tokens = fsq_codes[0, 0].tolist()
 93 | 
 94 |             os.makedirs(os.path.split(filename)[0], exist_ok = True)
 95 |             with open(filename, 'w') as fopen:
 96 |                 json.dump(tokens, fopen)
 97 |         except Exception as e:
 98 |             print(e)
 99 | 
# CLI entry point. `--file` is a JSON file holding the list of audio paths;
# `--replication` repeats the device list so several workers can be started
# per device. Documented with comments rather than a docstring so that the
# click help output stays unchanged.
@click.command()
@click.option('--file')
@click.option('--replication', default = 1)
def main(
    file,
    replication,
):
    visible = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible is not None:
        # Use the ids from the environment as given (strings).
        devices = [d.strip() for d in visible.split(',')]
    else:
        # No restriction set: use every device torch reports.
        import torch
        devices = list(range(torch.cuda.device_count()))

    devices = replication * devices
    print(devices)

    with open(file) as fopen:
        files = json.load(fopen)

    # Drop files that already have a loadable output, using 30 processes.
    filtered = multiprocessing(files, check, 30)

    print(len(files), len(filtered))

    # One work slice, and one worker process, per entry in `devices`.
    df_split = list(chunks(filtered, devices))

    loop_partial = partial(loop)

    with Pool(len(devices)) as pool:
        pooled = pool.map(loop_partial, df_split)

if __name__ == '__main__':
    main()
133 | 
134 |     


--------------------------------------------------------------------------------
/text/madlad-400-ms/dedup-madlad-400-ms.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "69b786dc",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import json\n",
 11 |     "import pandas as pd\n",
 12 |     "from tqdm import tqdm\n",
 13 |     "from datasets import Dataset\n",
 14 |     "from bs4 import BeautifulSoup"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": null,
 20 |    "id": "19269eb1",
 21 |    "metadata": {},
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "with open('madlad-400-ms.postprocessing.jsonl', 'w') as fopen_l:\n",
 25 |     "    with open('/home/ubuntu/madlad-400-ms.postprocessing.jsonl') as fopen:\n",
 26 |     "        for l in tqdm(fopen):\n",
 27 |     "            l = json.loads(l)\n",
 28 |     "            d = {\n",
 29 |     "                'text': l\n",
 30 |     "            }\n",
 31 |     "            fopen_l.write(f'{json.dumps(d)}\\n')"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": null,
 37 |    "id": "91f9ef48",
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "!head -n 10000 madlad-400-ms.postprocessing.jsonl > test.jsonl"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": null,
 47 |    "id": "eb166831",
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "from datasets import load_dataset\n",
 52 |     "dataset = load_dataset(\"json\", data_files=\"madlad-400-ms.postprocessing.jsonl\", split = 'train')"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": null,
 58 |    "id": "b93732c2",
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "dataset.save_to_disk(f\"hf-datasets/raw-datasets/madlad-400-ms\")"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": 2,
 68 |    "id": "f84f69d6",
 69 |    "metadata": {},
 70 |    "outputs": [
 71 |     {
 72 |      "data": {
 73 |       "text/plain": [
 74 |        "'python3 -m text_dedup.minhash   --path hf-datasets/raw-datasets/madlad-400-ms   --split train   --cache_dir ./cache   --output hf-datasets/dedupe-datasets/madlad-400-ms   --column text   --batch_size 1000   --threshold 0.95   --min_length 1   --local'"
 75 |       ]
 76 |      },
 77 |      "execution_count": 2,
 78 |      "metadata": {},
 79 |      "output_type": "execute_result"
 80 |     }
 81 |    ],
 82 |    "source": [
 83 |     "command = f\"python3 -m text_dedup.minhash \\\n",
 84 |     "  --path hf-datasets/raw-datasets/madlad-400-ms \\\n",
 85 |     "  --split train \\\n",
 86 |     "  --cache_dir ./cache \\\n",
 87 |     "  --output hf-datasets/dedupe-datasets/madlad-400-ms \\\n",
 88 |     "  --column text \\\n",
 89 |     "  --batch_size 1000 \\\n",
 90 |     "  --threshold 0.95 \\\n",
 91 |     "  --min_length 1 \\\n",
 92 |     "  --local\"\n",
 93 |     "\n",
 94 |     "command"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": 5,
100 |    "id": "4c337e35",
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "# import subprocess\n",
105 |     "# subprocess.run(command, shell=True)"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 11,
111 |    "id": "8b0099e9",
112 |    "metadata": {},
113 |    "outputs": [],
114 |    "source": [
115 |     "!rm -rf hf-datasets/dedupe-datasets/madlad-400-ms"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "id": "dc6a64ba",
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": []
125 |   }
126 |  ],
127 |  "metadata": {
128 |   "kernelspec": {
129 |    "display_name": "Python 3 (ipykernel)",
130 |    "language": "python",
131 |    "name": "python3"
132 |   },
133 |   "language_info": {
134 |    "codemirror_mode": {
135 |     "name": "ipython",
136 |     "version": 3
137 |    },
138 |    "file_extension": ".py",
139 |    "mimetype": "text/x-python",
140 |    "name": "python",
141 |    "nbconvert_exporter": "python",
142 |    "pygments_lexer": "ipython3",
143 |    "version": "3.10.12"
144 |   }
145 |  },
146 |  "nbformat": 4,
147 |  "nbformat_minor": 5
148 | }
149 | 


--------------------------------------------------------------------------------
/multilingual-tts/convert_neucodec.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
# Set the OpenMP and OpenBLAS thread-count variables to 1 before the imports
# that follow.
# NOTE(review): presumably to avoid thread oversubscription when many worker
# processes run at once — confirm.
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
  5 | 
  6 | import soundfile as sf
  7 | import json
  8 | import click
  9 | import re
 10 | import librosa
 11 | from glob import glob
 12 | from functools import partial
 13 | from multiprocess import Pool
 14 | from tqdm import tqdm
 15 | import numpy as np
 16 | import itertools
 17 | 
 18 | def old_chunks(l, n):
 19 |     for i in range(0, len(l), n):
 20 |         yield (l[i: i + n], i // n)
 21 |         
 22 | def chunks(l, devices):
 23 |     chunk_size = len(l) // len(devices)
 24 |     remainder = len(l) % len(devices)
 25 |     start = 0
 26 |     for i in range(len(devices)):
 27 |         extra = 1 if i < remainder else 0
 28 |         end = start + chunk_size + extra
 29 |         yield (l[start:end], devices[i])
 30 |         start = end
 31 |         
 32 | def new_path(f):
 33 |     splitted = f.split('/')
 34 |     folder = f.split('/')[0]
 35 |     folder = folder + '_neucodec'
 36 |     new_f = os.path.join(folder, '/'.join(splitted[1:]))
 37 |     new_f = new_f.replace('.mp3', '.json').replace('.wav', '.json')
 38 |     return new_f
 39 |     
 40 | def multiprocessing(strings, function, cores=6, returned=True):
 41 |     df_split = old_chunks(strings, len(strings) // cores)
 42 |     pool = Pool(cores)
 43 |     pooled = pool.map(function, df_split)
 44 |     pool.close()
 45 |     pool.join()
 46 | 
 47 |     if returned:
 48 |         return list(itertools.chain(*pooled))
 49 |         
 50 | def check(files):
 51 |     files, _ = files
 52 |     filtered = []
 53 |     for file in tqdm(files):
 54 |         filename_done = new_path(file)
 55 | 
 56 |         if os.path.exists(filename_done):
 57 |             try:
 58 |                 with open(filename_done) as fopen:
 59 |                     json.load(fopen)
 60 |                     continue
 61 |             except:
 62 |                 pass
 63 |             
 64 |         filtered.append(file)
 65 |     return filtered
 66 |     
 67 | def loop(
 68 |     indices_device_pair,
 69 | ):
 70 |     files, device = indices_device_pair
 71 |     os.environ['CUDA_VISIBLE_DEVICES'] = str(device)
 72 |     
 73 |     from neucodec import NeuCodec
 74 |     import torchaudio
 75 |     import torch
 76 |     torch.autograd.set_grad_enabled(False) 
 77 | 
 78 |     model = NeuCodec.from_pretrained("neuphonic/neucodec")
 79 |     model.eval().cuda()   
 80 | 
 81 |     for f in tqdm(files):
 82 |         filename = new_path(f)
 83 |         if os.path.exists(filename):
 84 |             try:
 85 |                 with open(filename) as fopen:
 86 |                     json.load(fopen)
 87 |                 continue
 88 |             except:
 89 |                 pass
 90 | 
 91 |         try:
 92 |             y, sr = librosa.load(f, sr = 16000)
 93 |             if len(y) / sr > 20:
 94 |                 continue
 95 |             wav_tensor = torch.from_numpy(y).float().unsqueeze(0)
 96 |             fsq_codes = model.encode_code(wav_tensor.unsqueeze(1))
 97 |             tokens = fsq_codes[0, 0].tolist()
 98 | 
 99 |             os.makedirs(os.path.split(filename)[0], exist_ok = True)
100 |             with open(filename, 'w') as fopen:
101 |                 json.dump(tokens, fopen)
102 |         except Exception as e:
103 |             print(e)
104 | 
# CLI entry point. `--file` is a JSON file holding the list of audio paths;
# `--replication` repeats the device list so several workers can be started
# per device. Documented with comments rather than a docstring so that the
# click help output stays unchanged.
@click.command()
@click.option('--file')
@click.option('--replication', default = 1)
def main(
    file,
    replication,
):
    visible = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible is not None:
        # Use the ids from the environment as given (strings).
        devices = [d.strip() for d in visible.split(',')]
    else:
        # No restriction set: use every device torch reports.
        import torch
        devices = list(range(torch.cuda.device_count()))

    devices = replication * devices
    print(devices)

    with open(file) as fopen:
        files = json.load(fopen)

    # Drop files that already have a loadable output, using 30 processes.
    filtered = multiprocessing(files, check, 30)

    print(len(files), len(filtered))

    # One work slice, and one worker process, per entry in `devices`.
    df_split = list(chunks(filtered, devices))

    loop_partial = partial(loop)

    with Pool(len(devices)) as pool:
        pooled = pool.map(loop_partial, df_split)

if __name__ == '__main__':
    main()
138 | 
139 |     
140 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # UV
 98 | #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #uv.lock
102 | 
103 | # poetry
104 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
106 | #   commonly ignored for libraries.
107 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 | 
110 | # pdm
111 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112 | #pdm.lock
113 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114 | #   in version control.
115 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116 | .pdm.toml
117 | .pdm-python
118 | .pdm-build/
119 | 
120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 | 
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 | 
127 | # SageMath parsed files
128 | *.sage.py
129 | 
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 | 
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 | 
143 | # Rope project settings
144 | .ropeproject
145 | 
146 | # mkdocs documentation
147 | /site
148 | 
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 | 
154 | # Pyre type checker
155 | .pyre/
156 | 
157 | # pytype static type analyzer
158 | .pytype/
159 | 
160 | # Cython debug symbols
161 | cython_debug/
162 | 
163 | # PyCharm
164 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
167 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | 
170 | # PyPI configuration file
171 | .pypirc
172 | mp.py
173 | *Untitled*.ipynb
174 | malaysian_sft.py
175 | speech-instructions-extra/*audio
176 | speech-instructions-extra/*-Train
177 | *.parquet


--------------------------------------------------------------------------------
/text/pretrain-llm/prepare-starcoder.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "453a2552",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import json\n",
 11 |     "import os\n",
 12 |     "from glob import glob\n",
 13 |     "from tqdm import tqdm"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 2,
 19 |    "id": "36e657c0",
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "def partition(text, size = 500):\n",
 24 |     "    splitted = text.split()\n",
 25 |     "    return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "id": "cab2dbd7",
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "a = open('prepare-starcoder.jsonl', 'w')"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 4,
 41 |    "id": "06167a66",
 42 |    "metadata": {},
 43 |    "outputs": [
 44 |     {
 45 |      "data": {
 46 |       "text/plain": [
 47 |        "['starcoder/starcoder/c.jsonl',\n",
 48 |        " 'starcoder/starcoder/cpp.jsonl',\n",
 49 |        " 'starcoder/starcoder/css.jsonl',\n",
 50 |        " 'starcoder/starcoder/go.jsonl',\n",
 51 |        " 'starcoder/starcoder/html.jsonl',\n",
 52 |        " 'starcoder/starcoder/java.jsonl',\n",
 53 |        " 'starcoder/starcoder/javascript.jsonl',\n",
 54 |        " 'starcoder/starcoder/julia.jsonl',\n",
 55 |        " 'starcoder/starcoder/markdown.jsonl',\n",
 56 |        " 'starcoder/starcoder/python.jsonl',\n",
 57 |        " 'starcoder/starcoder/r.jsonl',\n",
 58 |        " 'starcoder/starcoder/rust.jsonl',\n",
 59 |        " 'starcoder/starcoder/sql.jsonl']"
 60 |       ]
 61 |      },
 62 |      "execution_count": 4,
 63 |      "metadata": {},
 64 |      "output_type": "execute_result"
 65 |     }
 66 |    ],
 67 |    "source": [
 68 |     "glob('starcoder/starcoder/*.jsonl')"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": 5,
 74 |    "id": "23d113a4",
 75 |    "metadata": {},
 76 |    "outputs": [
 77 |     {
 78 |      "name": "stderr",
 79 |      "output_type": "stream",
 80 |      "text": [
 81 |       "1610858it [06:54, 3884.72it/s]\n",
 82 |       "1314195it [06:08, 3567.05it/s]\n",
 83 |       "2293654it [06:23, 5987.63it/s]\n",
 84 |       "1928334it [06:33, 4901.64it/s]\n",
 85 |       "60451it [00:19, 2792.18it/s]IOPub message rate exceeded.\n",
 86 |       "The notebook server will temporarily stop sending output\n",
 87 |       "to the client in order to avoid crashing it.\n",
 88 |       "To change this limit, set the config variable\n",
 89 |       "`--NotebookApp.iopub_msg_rate_limit`.\n",
 90 |       "\n",
 91 |       "Current values:\n",
 92 |       "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
 93 |       "NotebookApp.rate_limit_window=3.0 (secs)\n",
 94 |       "\n"
 95 |      ]
 96 |     }
 97 |    ],
 98 |    "source": [
 99 |     "files = glob('starcoder/starcoder/*.jsonl')\n",
100 |     "for f in files:\n",
101 |     "    with open(f) as fopen:\n",
102 |     "        for l in tqdm(fopen):\n",
103 |     "            try:\n",
104 |     "                data = '<s>' + json.loads(l) + '</s>'\n",
105 |     "                partitioned = partition(data)\n",
106 |     "                for p in partitioned:\n",
107 |     "                    data = {\n",
108 |     "                        'text': p,\n",
109 |     "                    }\n",
110 |     "                    a.write(f'{json.dumps(data)}\\n')\n",
111 |     "                    a.flush()\n",
112 |     "            except Exception as e:\n",
113 |     "                print(e)\n",
114 |     "                pass"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": null,
120 |    "id": "0c1a228e",
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": []
124 |   }
125 |  ],
126 |  "metadata": {
127 |   "kernelspec": {
128 |    "display_name": "Python 3 (ipykernel)",
129 |    "language": "python",
130 |    "name": "python3"
131 |   },
132 |   "language_info": {
133 |    "codemirror_mode": {
134 |     "name": "ipython",
135 |     "version": 3
136 |    },
137 |    "file_extension": ".py",
138 |    "mimetype": "text/x-python",
139 |    "name": "python",
140 |    "nbconvert_exporter": "python",
141 |    "pygments_lexer": "ipython3",
142 |    "version": "3.10.12"
143 |   }
144 |  },
145 |  "nbformat": 4,
146 |  "nbformat_minor": 5
147 | }
148 | 


--------------------------------------------------------------------------------
/multilingual-tts/trim_silence.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import soundfile as sf
  3 | import librosa
  4 | import json
  5 | import click
  6 | import numpy as np
  7 | import malaya_speech
  8 | from glob import glob
  9 | from functools import partial
 10 | from multiprocess import Pool
 11 | from tqdm import tqdm
 12 | 
 13 | def chunks(l, devices):
 14 |     chunk_size = len(l) // len(devices)
 15 |     remainder = len(l) % len(devices)
 16 |     start = 0
 17 |     for i in range(len(devices)):
 18 |         extra = 1 if i < remainder else 0
 19 |         end = start + chunk_size + extra
 20 |         yield (l[start:end], devices[i])
 21 |         start = end
 22 | 
 23 | def new_path(f):
 24 |     splitted = f.split('/')
 25 |     base_folder = splitted[0] + '_trim'
 26 |     splitted = '/'.join([base_folder] + splitted[1:])
 27 |     return splitted
 28 | 
 29 | def new_path_done(f):
 30 |     splitted = f.split('/')
 31 |     base_folder = splitted[0] + '_trim_done'
 32 |     splitted = '/'.join([base_folder] + splitted[1:])
 33 |     return splitted
 34 | 
 35 | def loop(indices_device_pair):
 36 |     files, device = indices_device_pair
 37 |     
 38 |     vad = malaya_speech.vad.webrtc(minimum_amplitude = 0)
 39 |     min_length = 0.4
 40 | 
 41 |     for file in tqdm(files):
 42 |         folder = os.path.split(file)[0]
 43 |         folder_folder = os.path.split(folder)[1]
 44 |         f_new = new_path(file)
 45 |         filename_done = new_path_done(file)
 46 | 
 47 |         try:
 48 |             with open(filename_done) as fopen:
 49 |                 json.load(fopen)
 50 |                 continue
 51 |         except:
 52 |             pass
 53 |             
 54 |         try:
 55 |             vad = malaya_speech.vad.webrtc(minimum_amplitude = 0)
 56 |             y, sr = librosa.load(file, sr = None)
 57 |             start_silent_trail = int(0.3 * sr)
 58 |             middle_silent_trail = int(min_length * sr / 2)
 59 |             middle_silent_trail, start_silent_trail
 60 |             y_= malaya_speech.resample(y, sr, 16000)
 61 |             y_ = malaya_speech.astype.float_to_int(y_)
 62 |             frames = malaya_speech.generator.frames(y, 30, sr)
 63 |             frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
 64 |             frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
 65 |             grouped_deep = malaya_speech.group.group_frames(frames_webrtc)
 66 |             r = []
 67 |             for no, g in enumerate(grouped_deep):
 68 |                 if g[1]:
 69 |                     g = g[0].array
 70 |                 else:
 71 |                     if no == 0:
 72 |                         g = g[0].array[-start_silent_trail:]
 73 |                     elif no == (len(grouped_deep) - 1):
 74 |                         g = g[0].array[:start_silent_trail]
 75 |                     else:
 76 |                         if g[0].duration >= min_length:
 77 |                             g = [g[0].array[:middle_silent_trail], g[0].array[-middle_silent_trail:]]
 78 |                             g = np.concatenate(g)
 79 |                         else:
 80 |                             g = g[0].array
 81 | 
 82 |                 r.append(g)
 83 |             y_after = np.concatenate(r)
 84 |             
 85 |             os.makedirs(os.path.split(f_new)[0], exist_ok = True)
 86 |             sf.write(f_new, y_after, sr)
 87 |             os.makedirs(os.path.split(filename_done)[0], exist_ok = True)
 88 |             with open(filename_done, 'w') as fopen:
 89 |                 json.dump('done', fopen)
 90 |             
 91 |         except Exception as e:
 92 |             print(e)
 93 | 
 94 | @click.command()
 95 | @click.option('--file')
 96 | @click.option('--replication', default = 1)
 97 | def main(
 98 |     file, 
 99 |     replication,
100 | ):
101 |     devices = replication * [0]
102 |     
103 |     with open(file) as fopen:
104 |         files = json.load(fopen)
105 |     filtered = []
106 |     for file in tqdm(files):
107 |         filename_done = new_path_done(file)
108 | 
109 |         if os.path.exists(filename_done):
110 |             try:
111 |                 with open(filename_done) as fopen:
112 |                     json.load(fopen)
113 |                     continue
114 |             except:
115 |                 pass
116 |         filtered.append(file)
117 |     
118 |     df_split = list(chunks(filtered, devices))
119 | 
120 |     loop_partial = partial(loop)
121 | 
122 |     with Pool(len(devices)) as pool:
123 |         pooled = pool.map(loop_partial, df_split)
124 | 
125 | if __name__ == '__main__':
126 |     main()
127 | 
128 |     


--------------------------------------------------------------------------------
/text/pretrain-llm/prepare-translation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "e6328ada",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# !git lfs clone https://huggingface.co/datasets/mesolitica/google-translate-ms-pa\n",
 11 |     "# !git lfs clone https://huggingface.co/datasets/mesolitica/google-translate-ms-zh-CN\n",
 12 |     "# !git lfs clone https://huggingface.co/datasets/mesolitica/google-translate-ms-ta"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 1,
 18 |    "id": "c6bfd5aa",
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "import json\n",
 23 |     "import os\n",
 24 |     "from glob import glob\n",
 25 |     "from tqdm import tqdm"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 2,
 31 |    "id": "1e6d9ca8",
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "def partition(text, size = 500):\n",
 36 |     "    splitted = text.split()\n",
 37 |     "    return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 3,
 43 |    "id": "c2e1d2bc",
 44 |    "metadata": {},
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "a = open('prepare-translation.jsonl', 'w')"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 4,
 53 |    "id": "15ff7cfe",
 54 |    "metadata": {},
 55 |    "outputs": [
 56 |     {
 57 |      "name": "stderr",
 58 |      "output_type": "stream",
 59 |      "text": [
 60 |       "99967it [00:53, 1868.55it/s]\n",
 61 |       "99971it [01:00, 1646.85it/s]\n",
 62 |       "99968it [01:08, 1460.54it/s]\n",
 63 |       "99966it [00:58, 1719.68it/s]\n",
 64 |       "99962it [00:56, 1755.14it/s]\n",
 65 |       "99968it [00:19, 5100.54it/s] \n",
 66 |       "99959it [00:20, 4887.07it/s] \n",
 67 |       "99972it [00:15, 6252.46it/s] \n",
 68 |       "99960it [00:11, 8494.27it/s] \n",
 69 |       "99974it [00:08, 11529.00it/s]\n",
 70 |       "99968it [00:14, 6672.96it/s] \n",
 71 |       "99965it [00:09, 10661.73it/s]\n",
 72 |       "99965it [00:09, 10525.04it/s]\n",
 73 |       "99959it [00:15, 6443.56it/s] \n",
 74 |       "99972it [00:08, 11661.01it/s]\n",
 75 |       "99966it [00:14, 6786.12it/s] \n",
 76 |       "99969it [00:22, 4412.35it/s] \n",
 77 |       "99972it [00:14, 6883.69it/s] \n",
 78 |       "99963it [00:06, 15602.76it/s]\n",
 79 |       "99966it [00:16, 6097.17it/s] \n",
 80 |       "99967it [00:11, 8717.47it/s] \n",
 81 |       "99970it [00:07, 13489.12it/s]\n",
 82 |       "99969it [00:18, 5358.87it/s] \n",
 83 |       "99981it [00:09, 10109.43it/s]\n",
 84 |       "99968it [00:07, 13383.50it/s]\n",
 85 |       "99966it [00:14, 7052.85it/s] \n",
 86 |       "99968it [00:23, 4322.65it/s] \n",
 87 |       "99968it [00:37, 2634.95it/s]\n",
 88 |       "99972it [00:36, 2704.90it/s]\n",
 89 |       "99958it [00:40, 2471.35it/s]\n",
 90 |       "99967it [00:42, 2371.77it/s]\n",
 91 |       "99971it [00:44, 2221.78it/s]\n",
 92 |       "99962it [00:39, 2532.83it/s]\n"
 93 |      ]
 94 |     }
 95 |    ],
 96 |    "source": [
 97 |     "google_translate = glob('google-translate-*/*.requested')\n",
 98 |     "for f in google_translate:\n",
 99 |     "    with open(f) as fopen:\n",
100 |     "        for l in tqdm(fopen):\n",
101 |     "            try:\n",
102 |     "                data = '<s>' + json.loads(l)['r']['result'] + '</s>'\n",
103 |     "                partitioned = partition(data)\n",
104 |     "                for p in partitioned:\n",
105 |     "                    data = {\n",
106 |     "                        'text': p,\n",
107 |     "                    }\n",
108 |     "                    a.write(f'{json.dumps(data)}\\n')\n",
109 |     "                    a.flush()\n",
110 |     "            except:\n",
111 |     "                pass"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "id": "ee6d3d66",
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": []
121 |   }
122 |  ],
123 |  "metadata": {
124 |   "kernelspec": {
125 |    "display_name": "Python 3 (ipykernel)",
126 |    "language": "python",
127 |    "name": "python3"
128 |   },
129 |   "language_info": {
130 |    "codemirror_mode": {
131 |     "name": "ipython",
132 |     "version": 3
133 |    },
134 |    "file_extension": ".py",
135 |    "mimetype": "text/x-python",
136 |    "name": "python",
137 |    "nbconvert_exporter": "python",
138 |    "pygments_lexer": "ipython3",
139 |    "version": "3.10.12"
140 |   }
141 |  },
142 |  "nbformat": 4,
143 |  "nbformat_minor": 5
144 | }
145 | 


--------------------------------------------------------------------------------
/text/extra/sample-fineweb-edu.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 13,
  6 |    "id": "e58f50a7-ac12-4bac-ab97-ce10a1de9154",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "from datasets import load_dataset\n",
 11 |     "from tqdm import tqdm\n",
 12 |     "import json"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 5,
 18 |    "id": "015519ff-efb4-4d80-adfd-7e826822af76",
 19 |    "metadata": {},
 20 |    "outputs": [
 21 |     {
 22 |      "data": {
 23 |       "application/vnd.jupyter.widget-view+json": {
 24 |        "model_id": "dfc39a6763534034a70ba3a3e960169d",
 25 |        "version_major": 2,
 26 |        "version_minor": 0
 27 |       },
 28 |       "text/plain": [
 29 |        "Resolving data files:   0%|          | 0/104 [00:00<?, ?it/s]"
 30 |       ]
 31 |      },
 32 |      "metadata": {},
 33 |      "output_type": "display_data"
 34 |     },
 35 |     {
 36 |      "data": {
 37 |       "application/vnd.jupyter.widget-view+json": {
 38 |        "model_id": "0a44faca2a464e1d977bba4a428ae569",
 39 |        "version_major": 2,
 40 |        "version_minor": 0
 41 |       },
 42 |       "text/plain": [
 43 |        "Resolving data files:   0%|          | 0/234 [00:00<?, ?it/s]"
 44 |       ]
 45 |      },
 46 |      "metadata": {},
 47 |      "output_type": "display_data"
 48 |     }
 49 |    ],
 50 |    "source": [
 51 |     "ds = load_dataset(\"HuggingFaceTB/smollm-corpus\", \"fineweb-edu-dedup\", streaming = True, split = 'train')"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 15,
 57 |    "id": "2c6d0b3d-37e1-4789-9689-74996c7688be",
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "# index = 0\n",
 62 |     "# with open('fineweb-edu-dedup-sample-10M.jsonl', 'w') as fopen_l:\n",
 63 |     "#     for row in tqdm(ds):\n",
 64 |     "#         t = row['text']\n",
 65 |     "#         fopen_l.write(f'{json.dumps(t)}\\n')\n",
 66 |     "#         index += 1\n",
 67 |     "#         if index > 1e7:\n",
 68 |     "#             break"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": 16,
 74 |    "id": "89ad1f82-8041-47cf-a640-a20ee50073d9",
 75 |    "metadata": {},
 76 |    "outputs": [],
 77 |    "source": [
 78 |     "!mv fineweb-edu-dedup-sample-10M.jsonl fineweb-edu-dedup-sample-5M.jsonl"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 17,
 84 |    "id": "e6a3642b-5a8a-4018-ae8f-be5c9009fdbc",
 85 |    "metadata": {},
 86 |    "outputs": [
 87 |     {
 88 |      "name": "stdout",
 89 |      "output_type": "stream",
 90 |      "text": [
 91 |       "-rw-r--r-- 1 sagemaker-user users 22G Aug  5 08:05 fineweb-edu-dedup-sample-5M.jsonl\n"
 92 |      ]
 93 |     }
 94 |    ],
 95 |    "source": [
 96 |     "!ls -lh fineweb-edu-dedup-sample-5M.jsonl"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "id": "c4fe5407-7877-4c10-9084-e62ed97ecc8d",
103 |    "metadata": {},
104 |    "outputs": [
105 |     {
106 |      "data": {
107 |       "application/vnd.jupyter.widget-view+json": {
108 |        "model_id": "4717bd184fde4ed094f07eca7572f2f7",
109 |        "version_major": 2,
110 |        "version_minor": 0
111 |       },
112 |       "text/plain": [
113 |        "fineweb-edu-dedup-sample-5M.jsonl:   0%|          | 0.00/23.5G [00:00<?, ?B/s]"
114 |       ]
115 |      },
116 |      "metadata": {},
117 |      "output_type": "display_data"
118 |     }
119 |    ],
120 |    "source": [
121 |     "from huggingface_hub import HfApi\n",
122 |     "api = HfApi()\n",
123 |     "api.upload_file(\n",
124 |     "    path_or_fileobj=\"fineweb-edu-dedup-sample-5M.jsonl\",\n",
125 |     "    path_in_repo=\"fineweb-edu-dedup-sample-5M.jsonl\",\n",
126 |     "    repo_id=\"malaysia-ai/pretrain-text-dataset\",\n",
127 |     "    repo_type=\"dataset\",\n",
128 |     ")\n"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "id": "7c93660f-9e7b-4ce8-9c98-c13899b20c1b",
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": []
138 |   }
139 |  ],
140 |  "metadata": {
141 |   "kernelspec": {
142 |    "display_name": "Python 3 (ipykernel)",
143 |    "language": "python",
144 |    "name": "python3"
145 |   },
146 |   "language_info": {
147 |    "codemirror_mode": {
148 |     "name": "ipython",
149 |     "version": 3
150 |    },
151 |    "file_extension": ".py",
152 |    "mimetype": "text/x-python",
153 |    "name": "python",
154 |    "nbconvert_exporter": "python",
155 |    "pygments_lexer": "ipython3",
156 |    "version": "3.10.14"
157 |   }
158 |  },
159 |  "nbformat": 4,
160 |  "nbformat_minor": 5
161 | }
162 | 


--------------------------------------------------------------------------------
/stt-whisper/audioset_sliding.py:
--------------------------------------------------------------------------------
  1 | from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
  2 | from glob import glob
  3 | from tqdm import tqdm
  4 | from datasets import Audio
  5 | from torch.utils.data import Dataset
  6 | from torch.utils.data import DataLoader
  7 | import torch
  8 | import torchaudio
  9 | import numpy as np
 10 | import click
 11 | import os
 12 | import json
 13 | 
 14 | def new_path(f):
 15 |     f = f.replace('.mp3', '.audioset')
 16 |     splitted = f.split('/')
 17 |     base_folder = splitted[0] + '_audioset'
 18 |     splitted = '/'.join([base_folder] + splitted[1:])
 19 |     return splitted
 20 | 
 21 | @click.command()
 22 | @click.option("--file", help="file")
 23 | @click.option("--global-index", default=1, help="global index")
 24 | @click.option("--local-index", default=0, help="local index")
 25 | @click.option("--sliding", default=0.5)
 26 | @click.option("--model", default='MIT/ast-finetuned-audioset-10-10-0.4593')
 27 | def function(file, global_index, local_index, sliding, model):
 28 | 
 29 |     files = []
 30 |     with open(file) as fopen:
 31 |         for no, l in enumerate(fopen):
 32 |             l = json.loads(l)
 33 |             l['index'] = os.path.join(file.replace('.jsonl', ''), f'{no}.mp3')
 34 |             files.append(l)
 35 |     
 36 |     print(len(files), files[0])
 37 |     filtered_files = []
 38 |     for f in files:
 39 |         if not os.path.exists(f['audio_filename']):
 40 |             continue
 41 |         new_f = new_path(f['audio_filename'])
 42 |         if os.path.exists(new_f) and os.path.getsize(new_f) > 2:
 43 |             continue
 44 |         filtered_files.append(f)
 45 | 
 46 |     print(len(files), len(filtered_files))
 47 |     global_size = len(filtered_files) // global_index
 48 |     files = filtered_files[global_size * local_index: global_size * (local_index + 1)]
 49 |     print(len(files))
 50 | 
 51 |     feature_extractor = AutoFeatureExtractor.from_pretrained(model, return_attention_mask = True)
 52 |     model = AutoModelForAudioClassification.from_pretrained(model, torch_dtype = torch.float16).eval().cuda()
 53 |     id2label = model.config.id2label
 54 |     sr = feature_extractor.sampling_rate
 55 |     sliding = int(sliding * sr)
 56 |     audio = Audio(sampling_rate = sr)
 57 | 
 58 |     class CustomDataset(Dataset):
 59 |         def __init__(self, files):
 60 |             self.files = files
 61 | 
 62 |         def __len__(self):
 63 |             return len(self.files)
 64 |         
 65 |         def __getitem__(self, index):
 66 |             f = self.files[index]
 67 |             f = f['audio_filename']
 68 |             y = audio.decode_example(audio.encode_example(f))['array']
 69 |             timestamps = []
 70 |             slided = []
 71 |             for i in range(0, len(y), sliding):
 72 |                 y_ = y[i: i + sliding]
 73 |                 if len(y_) < 1000:
 74 |                     continue
 75 |                 slided.append(y[i: i + sliding])
 76 |                 start = i / sr
 77 |                 end = min(len(y) / sr, (i + sliding) / sr)
 78 |                 timestamps.append((start, end))
 79 |             
 80 |             inputs = feature_extractor(slided, sampling_rate=sr, 
 81 |                             return_tensors="pt", return_attention_mask = True)
 82 |             return inputs, f, timestamps
 83 | 
 84 |     dataset = CustomDataset(files)
 85 |     dataloader = DataLoader(dataset, batch_size = 1, shuffle = False, prefetch_factor=10, num_workers=5)
 86 |     with torch.no_grad():
 87 |         for row in tqdm(iter(dataloader)):
 88 |             inputs, f, timestamps_ = row
 89 |             f = f[0]
 90 |             timestamps = []
 91 |             for t in timestamps_:
 92 |                 timestamps.append((float(t[0]), float(t[1])))
 93 | 
 94 |             inputs['input_values'] = inputs['input_values'][0].to(torch.float16).cuda()
 95 |             logits = model(**inputs).logits.softmax(-1)
 96 |             topk = torch.topk(logits, 5, dim = -1)
 97 |             probs = topk.values.cpu().numpy().tolist()
 98 | 
 99 |             for i in range(len(probs)):
100 |                 for k in range(len(probs[i])):
101 |                     probs[i][k] = round(probs[i][k], 4)
102 |                     
103 |             labels = []
104 |             for row in topk.indices.cpu().numpy():
105 |                 label = [id2label[r] for r in row]
106 |                 labels.append(label)
107 | 
108 |             splitted = new_path(f)
109 |             os.makedirs(os.path.split(splitted)[0], exist_ok = True)
110 |             with open(splitted, 'w') as fopen:
111 |                 json.dump({'timestamps': timestamps, 'labels': labels, 'probs': probs}, fopen)
112 | 
113 | 
114 | if __name__ == '__main__':
115 |     function()


--------------------------------------------------------------------------------
/malaysian-short-instructions/dedup-questions-intents.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "8009e792",
  7 |    "metadata": {},
  8 |    "outputs": [
  9 |     {
 10 |      "data": {
 11 |       "text/plain": [
 12 |        "76765"
 13 |       ]
 14 |      },
 15 |      "execution_count": 1,
 16 |      "metadata": {},
 17 |      "output_type": "execute_result"
 18 |     }
 19 |    ],
 20 |    "source": [
 21 |     "from glob import glob\n",
 22 |     "import json\n",
 23 |     "import re\n",
 24 |     "\n",
 25 |     "pattern = r\"\\d+\\.\\s(.+)\"\n",
 26 |     "already = set()\n",
 27 |     "\n",
 28 |     "files = glob('generate-questions-intents/*')\n",
 29 |     "\n",
 30 |     "questions = []\n",
 31 |     "for f in files:\n",
 32 |     "    with open(f) as fopen:\n",
 33 |     "        d = json.load(fopen)\n",
 34 |     "    keyword = d['q'][0]\n",
 35 |     "    for q in re.findall(pattern, d['r']):\n",
 36 |     "        if q in already:\n",
 37 |     "            continue\n",
 38 |     "        questions.append((q, keyword))\n",
 39 |     "    \n",
 40 |     "len(questions)"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 2,
 46 |    "id": "cc4f4bab",
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "import string\n",
 51 |     "\n",
 52 |     "digits = set(string.digits)\n",
 53 |     "rejected = ['\\'', '\"', 'http', '\\n', '[', ']', '/', '`']\n",
 54 |     "\n",
 55 |     "def contains_non_ascii(text):\n",
 56 |     "    return any(ord(char) > 127 for char in text)\n",
 57 |     "\n",
 58 |     "def reject_q(q):\n",
 59 |     "    if q is None:\n",
 60 |     "        return True\n",
 61 |     "    if any([c in q for c in rejected]):\n",
 62 |     "        return True\n",
 63 |     "    if contains_non_ascii(q):\n",
 64 |     "        return True\n",
 65 |     "    if len(set(q) & digits):\n",
 66 |     "        return True\n",
 67 |     "    if len(q) < 20:\n",
 68 |     "        return True\n",
 69 |     "    if len(q) > 200:\n",
 70 |     "        return True\n",
 71 |     "    return False"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 3,
 77 |    "id": "fddc3adf",
 78 |    "metadata": {
 79 |     "scrolled": true
 80 |    },
 81 |    "outputs": [
 82 |     {
 83 |      "data": {
 84 |       "text/plain": [
 85 |        "192"
 86 |       ]
 87 |      },
 88 |      "execution_count": 3,
 89 |      "metadata": {},
 90 |      "output_type": "execute_result"
 91 |     }
 92 |    ],
 93 |    "source": [
 94 |     "from collections import defaultdict\n",
 95 |     "\n",
 96 |     "filtered_q = defaultdict(list)\n",
 97 |     "for q, k in questions:\n",
 98 |     "    if len(q) < 10:\n",
 99 |     "        continue\n",
100 |     "    if reject_q(q):\n",
101 |     "        continue\n",
102 |     "    \n",
103 |     "    filtered_q[k].append(q)\n",
104 |     "len(filtered_q)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 4,
110 |    "id": "64cf6d6b",
111 |    "metadata": {
112 |     "scrolled": false
113 |    },
114 |    "outputs": [
115 |     {
116 |      "data": {
117 |       "text/plain": [
118 |        "75010"
119 |       ]
120 |      },
121 |      "execution_count": 4,
122 |      "metadata": {},
123 |      "output_type": "execute_result"
124 |     }
125 |    ],
126 |    "source": [
127 |     "questions = []\n",
128 |     "for k, v in filtered_q.items():\n",
129 |     "    if len(v) < 100:\n",
130 |     "        continue\n",
131 |     "    v = sorted(v, key = lambda x: len(x), reverse = True)\n",
132 |     "    v = [(v_, k) for v_ in v][:1000]\n",
133 |     "    questions.extend(v)\n",
134 |     "    \n",
135 |     "len(questions)"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 9,
141 |    "id": "eee58538",
142 |    "metadata": {},
143 |    "outputs": [
144 |     {
145 |      "data": {
146 |       "text/plain": [
147 |        "('Apa menu makanan terkenal di Gopeng?', 'food negeri pulau pinang')"
148 |       ]
149 |      },
150 |      "execution_count": 9,
151 |      "metadata": {},
152 |      "output_type": "execute_result"
153 |     }
154 |    ],
155 |    "source": [
156 |     "questions[-4]"
157 |    ]
158 |   }
159 |  ],
160 |  "metadata": {
161 |   "kernelspec": {
162 |    "display_name": "Python 3 (ipykernel)",
163 |    "language": "python",
164 |    "name": "python3"
165 |   },
166 |   "language_info": {
167 |    "codemirror_mode": {
168 |     "name": "ipython",
169 |     "version": 3
170 |    },
171 |    "file_extension": ".py",
172 |    "mimetype": "text/x-python",
173 |    "name": "python",
174 |    "nbconvert_exporter": "python",
175 |    "pygments_lexer": "ipython3",
176 |    "version": "3.8.10"
177 |   }
178 |  },
179 |  "nbformat": 4,
180 |  "nbformat_minor": 5
181 | }
182 | 


--------------------------------------------------------------------------------
/text/processing/main.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import mp
  3 | import time
  4 | import json
  5 | import random
  6 | import functools
  7 | from tqdm import tqdm
  8 | from pathlib import Path
  9 | from unidecode import unidecode
 10 | from argparse import ArgumentParser
 11 | import function as func
 12 | 
 13 | 
 14 | def parse_arguments():
 15 |     parser = ArgumentParser()
 16 |     parser.add_argument(
 17 |         "--dataset", dest="dataset", help="Dataset name", required=False
 18 |     )
 19 |     parser.add_argument(
 20 |         "--url_dataset", dest="url_dataset", help="Dataset URL (jsonl)", required=False
 21 |     )
 22 |     parser.add_argument(
 23 |         "--clean_file_path",
 24 |         dest="clean_file_path",
 25 |         help="Load the .jsonl file that has been cleaned instead of from huggingface",
 26 |         required=False,
 27 |     )
 28 |     parser.add_argument(
 29 |         "--master_folder",
 30 |         dest="master_dataset_folder",
 31 |         help="Master folder to store dataset and processed output",
 32 |         required=True,
 33 |     )
 34 |     parser.add_argument(
 35 |         "--mp_core",
 36 |         dest="mp_core",
 37 |         default=6,
 38 |         help="Postprocessing Core",
 39 |         required=False,
 40 |     )
 41 |     parser.add_argument(
 42 |         "--dataset_with_link",
 43 |         dest="dataset_with_link",
 44 |         nargs="+",
 45 |         help="Dataset name",
 46 |         required=False,
 47 |     )
 48 |     parser.add_argument(
 49 |         "--text_key",
 50 |         dest="text_key",
 51 |         nargs="+",
 52 |         help="Dict key contain text data",
 53 |         required=False,
 54 |     )
 55 | 
 56 |     args = parser.parse_args()
 57 |     return args
 58 | 
 59 | 
 60 | def loop_process(datasets, process_type="multi"):
 61 |     if process_type == "multi":
 62 |         lst_dataset, _ = datasets
 63 |     else:
 64 |         lst_dataset = datasets
 65 | 
 66 |     dataset_name_lst = []
 67 |     remove_dataset_name_lst = []
 68 | 
 69 |     for dataset in lst_dataset:
 70 |         try:
 71 |             url_dataset = dataset[1]
 72 |             dataset_name = dataset[0]
 73 | 
 74 |             dataset_name_lst.append(dataset_name)
 75 | 
 76 |             print(f"\nProcessing ... {dataset_name}\n")
 77 | 
 78 |             try:
 79 |                 func.init_process(
 80 |                     raw_dataset_path=master_dataset_folder,
 81 |                     dataset_name=dataset_name,
 82 |                     clean_file_path=url_dataset,
 83 |                     text_key=text_key,
 84 |                 )
 85 |             except:
 86 |                 func.init_process(
 87 |                     raw_dataset_path=master_dataset_folder,
 88 |                     dataset_name=dataset_name,
 89 |                     link=url_dataset,
 90 |                     text_key=text_key,
 91 |                 )
 92 | 
 93 |             func.second_process(master_dataset_folder, dataset_name)
 94 |         except Exception as e:
 95 |             print(f"[ERROR] {str(e)} \n Skip {dataset_name} ...")
 96 |             dataset_name_lst.remove(dataset_name)
 97 |             remove_dataset_name_lst.append(dataset_name)
 98 |             pass
 99 | 
100 |     if len(dataset_name_lst) != 0:
101 |         func.third_process(master_dataset_folder, mp_core)
102 | 
103 |         for l in dataset_name_lst:
104 |             before_dedup_mb, after_dedup_mb, after_post_mb = func.get_size(
105 |                 master_dataset_folder, l
106 |             )
107 | 
108 |             print("\n\n====================")
109 |             print(f"File Size - {l}")
110 |             print(f"before_dedup    ---> {before_dedup_mb}")
111 |             print(f"after_dedup     ---> {after_dedup_mb}")
112 |             print(f"after_post      ---> {after_post_mb}")
113 |             print("====================\n\n")
114 | 
115 |     if len(remove_dataset_name_lst) > 0:
116 |         print(f"Problem datasets:\n{','.join(remove_dataset_name_lst)}")
117 | 
118 | 
if __name__ == "__main__":
    # Script entry point: build the list of (name, source) datasets from the
    # command line, then process them in-process or through the ``mp`` helper.
    start_time = time.time()

    args = parse_arguments()

    clean_file_path = args.clean_file_path
    multiple_dataset = args.dataset_with_link

    # ``text_key``, ``master_dataset_folder`` and ``mp_core`` are module-level
    # names that ``loop_process`` reads as globals.
    text_key = args.text_key

    if clean_file_path:
        print("[Run for manually cleaned dataset]")
        dataset_name = args.dataset
        datasets = [(dataset_name, clean_file_path)]
    elif multiple_dataset:
        print("[Run for MULTIPLE datasets]")
        # Each command-line value has the form "name,source".
        datasets = [tuple(l.split(",")) for l in multiple_dataset]
    else:
        print("[Run for SINGLE dataset]")
        dataset_name = args.dataset
        url_dataset = args.url_dataset

        datasets = [(dataset_name, url_dataset)]

    master_dataset_folder = args.master_dataset_folder
    # argparse yields a string when --mp_core is given without a declared
    # type; coerce so the integer division below cannot raise TypeError.
    mp_core = int(args.mp_core)

    # Fewer datasets than cores: run everything in this process.
    if len(datasets) // mp_core == 0:
        loop_process(datasets, process_type="single")
    else:
        mp.multiprocessing(datasets, loop_process, cores=mp_core, returned=False)

    print(f"--- {time.time() - start_time} seconds ---")
155 | 


--------------------------------------------------------------------------------
/emotional-malaysian-emilia/audioset_sliding_v2.py:
--------------------------------------------------------------------------------
  1 | from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
  2 | from collections import defaultdict
  3 | from tqdm import tqdm
  4 | from glob import glob
  5 | from datasets import Audio
  6 | from torch.utils.data import Dataset
  7 | from torch.utils.data import DataLoader
  8 | import torch
  9 | import torchaudio
 10 | import numpy as np
 11 | import click
 12 | import os
 13 | import json
 14 | import numpy as np
 15 | 
 16 | def new_path(f):
 17 |     f = f.replace('.mp3', '.audioset_v2')
 18 |     splitted = f.split('/')
 19 |     base_folder = splitted[0] + '_audioset_v2'
 20 |     splitted = '/'.join([base_folder] + splitted[1:])
 21 |     return splitted
 22 | 
 23 | 
 24 | @click.command()
 25 | @click.option("--path", help="files path in glob pattern")
 26 | @click.option("--global-index", default=1, help="global index")
 27 | @click.option("--local-index", default=0, help="local index")
 28 | @click.option("--stride", default=0.1)
 29 | @click.option("--sliding", default=1.0)
 30 | @click.option("--model", default='MIT/ast-finetuned-audioset-10-10-0.4593')
 31 | def function(path, global_index, local_index, stride, sliding, model):
 32 | 
 33 |     feature_extractor = AutoFeatureExtractor.from_pretrained(model, return_attention_mask = True)
 34 |     model = AutoModelForAudioClassification.from_pretrained(model, torch_dtype = torch.float16).eval().cuda()
 35 |     id2label = model.config.id2label
 36 |     sr = feature_extractor.sampling_rate
 37 |     actual_stride = stride
 38 |     stride = int(stride * sr)
 39 |     sliding = int(sliding * sr)
 40 |     audio = Audio(sampling_rate = sr)
 41 | 
 42 |     files = glob(path)
 43 |     filtered_files = []
 44 |     for f in files:
 45 |         new_f = new_path(f)
 46 |         if os.path.exists(new_f) and os.path.getsize(new_f) > 2:
 47 |             continue
 48 |         filtered_files.append(f)
 49 | 
 50 |     global_size = len(filtered_files) // global_index
 51 |     filtered_files = filtered_files[global_size * local_index: global_size * (local_index + 1)]
 52 |     files = filtered_files
 53 | 
 54 |     class CustomDataset(Dataset):
 55 |         def __init__(self, files):
 56 |             self.files = files
 57 | 
 58 |         def __len__(self):
 59 |             return len(self.files)
 60 |         
 61 |         def __getitem__(self, index):
 62 |             f = self.files[index]
 63 |             y = audio.decode_example(audio.encode_example(f))['array']
 64 |             timestamps = []
 65 |             slided = []
 66 |             last_end = 0
 67 |             for i in range(0, len(y) - sliding + 1, stride):
 68 |                 end = i + sliding
 69 |                 slided.append(y[i: end])
 70 |                 timestamps.append((i / sr, end / sr))
 71 |                 last_end = end
 72 | 
 73 |             if last_end < len(y):
 74 |                 y_ = y[last_end:]
 75 |                 if len(y_) >= stride:
 76 |                     slided.append(y_)
 77 |                     timestamps.append((last_end / sr, len(y) / sr))
 78 |             
 79 |             inputs = feature_extractor(slided, sampling_rate=sr, 
 80 |                             return_tensors="pt", return_attention_mask = True)
 81 |             return inputs, f, timestamps
 82 | 
 83 |     dataset = CustomDataset(files)
 84 |     dataloader = DataLoader(dataset, batch_size = 1, shuffle = False, prefetch_factor=10, num_workers=5)
 85 | 
 86 |     with torch.no_grad():
 87 |         for row in tqdm(iter(dataloader)):
 88 |             inputs, f, timestamps_ = row
 89 |             f = f[0]
 90 |             timestamps = []
 91 |             for t in timestamps_:
 92 |                 timestamps.append((float(t[0]), float(t[1])))
 93 | 
 94 |             inputs['input_values'] = inputs['input_values'][0].to(torch.float16).cuda()
 95 |             logits = model(inputs['input_values']).logits.cpu().numpy()
 96 |             logits_per_timestamp = {t: logits[no] for no, (t, _) in enumerate(timestamps)}
 97 |             logits_accumulator = defaultdict(lambda: np.zeros(logits.shape[1]))
 98 |             count_accumulator = defaultdict(int)
 99 | 
100 |             for (start, end) in timestamps:
101 |                 for t in np.arange(start, end, actual_stride):
102 |                     logits_accumulator[t] += logits_per_timestamp[start]
103 |                     count_accumulator[t] += 1
104 | 
105 |             averaged_logits = {t: logits_accumulator[t] / count_accumulator[t] for t in logits_accumulator}
106 |             for k in averaged_logits.keys():
107 |                 averaged_logits[k] = [round(v_, 5) for v_ in averaged_logits[k]]
108 | 
109 |             combined = []
110 |             for k, v in averaged_logits.items():
111 |                 topk = np.array(v).argsort()[-5:][::-1]
112 |                 scores = [float(v[i]) for i in topk]
113 |                 topk = [id2label[i] for i in topk]
114 |                 combined.append({'timestamp': k, 'topk': topk, 'scores': scores})
115 | 
116 |             splitted = new_path(f)
117 |             os.makedirs(os.path.split(splitted)[0], exist_ok = True)
118 |             with open(splitted, 'w') as fopen:
119 |                 json.dump(combined, fopen)
120 |             
# Run the click command when this file is executed as a script.
if __name__ == '__main__':
    function()


--------------------------------------------------------------------------------
/text/llama/prepare-dataset-1024.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "4c973cad",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "from transformers import (\n",
 11 |     "    AutoModelForCausalLM,\n",
 12 |     "    AutoTokenizer,\n",
 13 |     "    get_scheduler,\n",
 14 |     "    default_data_collator,\n",
 15 |     "    SchedulerType\n",
 16 |     ")\n",
 17 |     "import os\n",
 18 |     "import json\n",
 19 |     "from itertools import chain\n",
 20 |     "from datasets import load_dataset"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "id": "7e56d3d6",
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "block_size = 1024\n",
 31 |     "train_file = 'combine.jsonl'\n",
 32 |     "tokenizer = AutoTokenizer.from_pretrained(\n",
 33 |     "    'meta-llama/Llama-2-7b-hf',\n",
 34 |     ")\n",
 35 |     "text_column_name = 'text'"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "id": "98f1cb1b",
 42 |    "metadata": {},
 43 |    "outputs": [
 44 |     {
 45 |      "data": {
 46 |       "application/vnd.jupyter.widget-view+json": {
 47 |        "model_id": "47f4799106b9459da07783bfe46cfd03",
 48 |        "version_major": 2,
 49 |        "version_minor": 0
 50 |       },
 51 |       "text/plain": [
 52 |        "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
 53 |       ]
 54 |      },
 55 |      "metadata": {},
 56 |      "output_type": "display_data"
 57 |     },
 58 |     {
 59 |      "data": {
 60 |       "application/vnd.jupyter.widget-view+json": {
 61 |        "model_id": "3b741378f28f4a7889a6390a0cc6fc52",
 62 |        "version_major": 2,
 63 |        "version_minor": 0
 64 |       },
 65 |       "text/plain": [
 66 |        "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
 67 |       ]
 68 |      },
 69 |      "metadata": {},
 70 |      "output_type": "display_data"
 71 |     },
 72 |     {
 73 |      "data": {
 74 |       "application/vnd.jupyter.widget-view+json": {
 75 |        "model_id": "29ffb88dc5f5483c9a471b8f70947fa3",
 76 |        "version_major": 2,
 77 |        "version_minor": 0
 78 |       },
 79 |       "text/plain": [
 80 |        "Generating train split: 0 examples [00:00, ? examples/s]"
 81 |       ]
 82 |      },
 83 |      "metadata": {},
 84 |      "output_type": "display_data"
 85 |     },
 86 |     {
 87 |      "data": {
 88 |       "application/vnd.jupyter.widget-view+json": {
 89 |        "model_id": "130d512b4389418abafbf435c8446914",
 90 |        "version_major": 2,
 91 |        "version_minor": 0
 92 |       },
 93 |       "text/plain": [
 94 |        "Map (num_proc=20):   0%|          | 0/33339118 [00:00<?, ? examples/s]"
 95 |       ]
 96 |      },
 97 |      "metadata": {},
 98 |      "output_type": "display_data"
 99 |     }
100 |    ],
101 |    "source": [
102 |     "raw_datasets = load_dataset(\n",
103 |     "    'json',\n",
104 |     "    data_files=train_file,\n",
105 |     "    split='train'\n",
106 |     ")\n",
107 |     "\n",
108 |     "filename = os.path.split(train_file)[1]\n",
109 |     "\n",
110 |     "def tokenize_function(examples):\n",
111 |     "    return tokenizer(examples[text_column_name])\n",
112 |     "\n",
113 |     "column_names = raw_datasets.column_names\n",
114 |     "tokenized_datasets = raw_datasets.map(\n",
115 |     "    tokenize_function,\n",
116 |     "    batched=True,\n",
117 |     "    remove_columns=column_names,\n",
118 |     "    load_from_cache_file=True,\n",
119 |     "    cache_file_name=f'./{filename}-tokenized-{block_size}',\n",
120 |     "    num_proc=20,\n",
121 |     ")\n",
122 |     "\n",
123 |     "def group_texts(examples):\n",
124 |     "    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}\n",
125 |     "    total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
126 |     "    total_length = (total_length // block_size) * block_size\n",
127 |     "    result = {\n",
128 |     "        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]\n",
129 |     "        for k, t in concatenated_examples.items()\n",
130 |     "    }\n",
131 |     "    result[\"labels\"] = result[\"input_ids\"].copy()\n",
132 |     "    return result\n",
133 |     "\n",
134 |     "lm_datasets = tokenized_datasets.map(\n",
135 |     "    group_texts,\n",
136 |     "    batched=True,\n",
137 |     "    load_from_cache_file=True,\n",
138 |     "    cache_file_name=f'./{filename}-grouped-{block_size}',\n",
139 |     "    num_proc=20,\n",
140 |     ")"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "id": "3bdc7594",
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": []
150 |   }
151 |  ],
152 |  "metadata": {
153 |   "kernelspec": {
154 |    "display_name": "Python 3 (ipykernel)",
155 |    "language": "python",
156 |    "name": "python3"
157 |   },
158 |   "language_info": {
159 |    "codemirror_mode": {
160 |     "name": "ipython",
161 |     "version": 3
162 |    },
163 |    "file_extension": ".py",
164 |    "mimetype": "text/x-python",
165 |    "name": "python",
166 |    "nbconvert_exporter": "python",
167 |    "pygments_lexer": "ipython3",
168 |    "version": "3.10.12"
169 |   }
170 |  },
171 |  "nbformat": 4,
172 |  "nbformat_minor": 5
173 | }
174 | 


--------------------------------------------------------------------------------
/text/llama/prepare-dataset-2048.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "66881a84",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "from transformers import (\n",
 11 |     "    AutoModelForCausalLM,\n",
 12 |     "    AutoTokenizer,\n",
 13 |     "    get_scheduler,\n",
 14 |     "    default_data_collator,\n",
 15 |     "    SchedulerType\n",
 16 |     ")\n",
 17 |     "import os\n",
 18 |     "import json\n",
 19 |     "from itertools import chain\n",
 20 |     "from datasets import load_dataset"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "id": "10e30b50",
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "block_size = 2048\n",
 31 |     "train_file = 'combine.jsonl'\n",
 32 |     "tokenizer = AutoTokenizer.from_pretrained(\n",
 33 |     "    'meta-llama/Llama-2-7b-hf',\n",
 34 |     ")\n",
 35 |     "text_column_name = 'text'"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "id": "fc8e725f",
 42 |    "metadata": {},
 43 |    "outputs": [
 44 |     {
 45 |      "data": {
 46 |       "application/vnd.jupyter.widget-view+json": {
 47 |        "model_id": "47f4799106b9459da07783bfe46cfd03",
 48 |        "version_major": 2,
 49 |        "version_minor": 0
 50 |       },
 51 |       "text/plain": [
 52 |        "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
 53 |       ]
 54 |      },
 55 |      "metadata": {},
 56 |      "output_type": "display_data"
 57 |     },
 58 |     {
 59 |      "data": {
 60 |       "application/vnd.jupyter.widget-view+json": {
 61 |        "model_id": "3b741378f28f4a7889a6390a0cc6fc52",
 62 |        "version_major": 2,
 63 |        "version_minor": 0
 64 |       },
 65 |       "text/plain": [
 66 |        "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
 67 |       ]
 68 |      },
 69 |      "metadata": {},
 70 |      "output_type": "display_data"
 71 |     },
 72 |     {
 73 |      "data": {
 74 |       "application/vnd.jupyter.widget-view+json": {
 75 |        "model_id": "29ffb88dc5f5483c9a471b8f70947fa3",
 76 |        "version_major": 2,
 77 |        "version_minor": 0
 78 |       },
 79 |       "text/plain": [
 80 |        "Generating train split: 0 examples [00:00, ? examples/s]"
 81 |       ]
 82 |      },
 83 |      "metadata": {},
 84 |      "output_type": "display_data"
 85 |     },
 86 |     {
 87 |      "data": {
 88 |       "application/vnd.jupyter.widget-view+json": {
 89 |        "model_id": "130d512b4389418abafbf435c8446914",
 90 |        "version_major": 2,
 91 |        "version_minor": 0
 92 |       },
 93 |       "text/plain": [
 94 |        "Map (num_proc=20):   0%|          | 0/33339118 [00:00<?, ? examples/s]"
 95 |       ]
 96 |      },
 97 |      "metadata": {},
 98 |      "output_type": "display_data"
 99 |     }
100 |    ],
101 |    "source": [
102 |     "raw_datasets = load_dataset(\n",
103 |     "    'json',\n",
104 |     "    data_files=train_file,\n",
105 |     "    split='train'\n",
106 |     ")\n",
107 |     "\n",
108 |     "filename = os.path.split(train_file)[1]\n",
109 |     "\n",
110 |     "def tokenize_function(examples):\n",
111 |     "    return tokenizer(examples[text_column_name])\n",
112 |     "\n",
113 |     "column_names = raw_datasets.column_names\n",
114 |     "tokenized_datasets = raw_datasets.map(\n",
115 |     "    tokenize_function,\n",
116 |     "    batched=True,\n",
117 |     "    remove_columns=column_names,\n",
118 |     "    load_from_cache_file=True,\n",
119 |     "    cache_file_name=f'./{filename}-tokenized-{block_size}',\n",
120 |     "    num_proc=20,\n",
121 |     ")\n",
122 |     "\n",
123 |     "def group_texts(examples):\n",
124 |     "    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}\n",
125 |     "    total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
126 |     "    total_length = (total_length // block_size) * block_size\n",
127 |     "    result = {\n",
128 |     "        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]\n",
129 |     "        for k, t in concatenated_examples.items()\n",
130 |     "    }\n",
131 |     "    result[\"labels\"] = result[\"input_ids\"].copy()\n",
132 |     "    return result\n",
133 |     "\n",
134 |     "lm_datasets = tokenized_datasets.map(\n",
135 |     "    group_texts,\n",
136 |     "    batched=True,\n",
137 |     "    load_from_cache_file=True,\n",
138 |     "    cache_file_name=f'./{filename}-grouped-{block_size}',\n",
139 |     "    num_proc=20,\n",
140 |     ")"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "id": "4fc4065e",
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": []
150 |   }
151 |  ],
152 |  "metadata": {
153 |   "kernelspec": {
154 |    "display_name": "Python 3 (ipykernel)",
155 |    "language": "python",
156 |    "name": "python3"
157 |   },
158 |   "language_info": {
159 |    "codemirror_mode": {
160 |     "name": "ipython",
161 |     "version": 3
162 |    },
163 |    "file_extension": ".py",
164 |    "mimetype": "text/x-python",
165 |    "name": "python",
166 |    "nbconvert_exporter": "python",
167 |    "pygments_lexer": "ipython3",
168 |    "version": "3.10.12"
169 |   }
170 |  },
171 |  "nbformat": 4,
172 |  "nbformat_minor": 5
173 | }
174 | 


--------------------------------------------------------------------------------
/malaysian-short-instructions/dedup-questions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 23,
  6 |    "id": "8009e792",
  7 |    "metadata": {},
  8 |    "outputs": [
  9 |     {
 10 |      "data": {
 11 |       "text/plain": [
 12 |        "213261"
 13 |       ]
 14 |      },
 15 |      "execution_count": 23,
 16 |      "metadata": {},
 17 |      "output_type": "execute_result"
 18 |     }
 19 |    ],
 20 |    "source": [
 21 |     "from glob import glob\n",
 22 |     "import json\n",
 23 |     "import re\n",
 24 |     "\n",
 25 |     "pattern = r\"\\d+\\.\\s(.+)\"\n",
 26 |     "already = set()\n",
 27 |     "\n",
 28 |     "files = glob('generate-questions/*')\n",
 29 |     "\n",
 30 |     "questions = []\n",
 31 |     "for f in files:\n",
 32 |     "    with open(f) as fopen:\n",
 33 |     "        d = json.load(fopen)\n",
 34 |     "    keyword = d['q'][0]\n",
 35 |     "    for q in re.findall(pattern, d['r']):\n",
 36 |     "        if q in already:\n",
 37 |     "            continue\n",
 38 |     "        questions.append((q, keyword))\n",
 39 |     "    \n",
 40 |     "len(questions)"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 24,
 46 |    "id": "cc4f4bab",
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "import string\n",
 51 |     "\n",
 52 |     "digits = set(string.digits)\n",
 53 |     "rejected = ['\\'', '\"', 'http', '\\n', '[', ']', '/', '`']\n",
 54 |     "\n",
 55 |     "def contains_non_ascii(text):\n",
 56 |     "    return any(ord(char) > 127 for char in text)\n",
 57 |     "\n",
 58 |     "def reject_q(q):\n",
 59 |     "    if q is None:\n",
 60 |     "        return True\n",
 61 |     "    if any([c in q for c in rejected]):\n",
 62 |     "        return True\n",
 63 |     "    if contains_non_ascii(q):\n",
 64 |     "        return True\n",
 65 |     "    if len(set(q) & digits):\n",
 66 |     "        return True\n",
 67 |     "    if len(q) < 20:\n",
 68 |     "        return True\n",
 69 |     "    if len(q) > 200:\n",
 70 |     "        return True\n",
 71 |     "    return False"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 25,
 77 |    "id": "fddc3adf",
 78 |    "metadata": {
 79 |     "scrolled": true
 80 |    },
 81 |    "outputs": [
 82 |     {
 83 |      "data": {
 84 |       "text/plain": [
 85 |        "58"
 86 |       ]
 87 |      },
 88 |      "execution_count": 25,
 89 |      "metadata": {},
 90 |      "output_type": "execute_result"
 91 |     }
 92 |    ],
 93 |    "source": [
 94 |     "from collections import defaultdict\n",
 95 |     "\n",
 96 |     "filtered_q = defaultdict(list)\n",
 97 |     "for q, k in questions:\n",
 98 |     "    if len(q) < 10:\n",
 99 |     "        continue\n",
100 |     "    if reject_q(q):\n",
101 |     "        continue\n",
102 |     "    \n",
103 |     "    filtered_q[k].append(q)\n",
104 |     "len(filtered_q)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 31,
110 |    "id": "60a9f651",
111 |    "metadata": {},
112 |    "outputs": [
113 |     {
114 |      "data": {
115 |       "text/plain": [
116 |        "31"
117 |       ]
118 |      },
119 |      "execution_count": 31,
120 |      "metadata": {},
121 |      "output_type": "execute_result"
122 |     }
123 |    ],
124 |    "source": [
125 |     "len(filtered_q['d3 js'])"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 54,
131 |    "id": "64cf6d6b",
132 |    "metadata": {
133 |     "scrolled": false
134 |    },
135 |    "outputs": [
136 |     {
137 |      "data": {
138 |       "text/plain": [
139 |        "57000"
140 |       ]
141 |      },
142 |      "execution_count": 54,
143 |      "metadata": {},
144 |      "output_type": "execute_result"
145 |     }
146 |    ],
147 |    "source": [
148 |     "questions = []\n",
149 |     "for k, v in filtered_q.items():\n",
150 |     "    if len(v) < 100:\n",
151 |     "        continue\n",
152 |     "    v = sorted(v, key = lambda x: len(x), reverse = True)\n",
153 |     "    v = [(v_, k) for v_ in v][:1000]\n",
154 |     "    questions.extend(v)\n",
155 |     "    \n",
156 |     "len(questions)"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": 55,
162 |    "id": "eee58538",
163 |    "metadata": {},
164 |    "outputs": [
165 |     {
166 |      "data": {
167 |       "text/plain": [
168 |        "('Bolehkah anda menerangkan langkah-langkah yang diperlukan untuk membuat sistem pengurusan penyediaan semula dalam Go yang berfungsi dengan cekap dan tahan terhadap kegagalan?',\n",
169 |        " 'go distributed system')"
170 |       ]
171 |      },
172 |      "execution_count": 55,
173 |      "metadata": {},
174 |      "output_type": "execute_result"
175 |     }
176 |    ],
177 |    "source": [
178 |     "questions[0]"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": 56,
184 |    "id": "af927235",
185 |    "metadata": {},
186 |    "outputs": [],
187 |    "source": [
188 |     "with open('dedup-questions.json', 'w') as fopen:\n",
189 |     "    json.dump(questions, fopen)"
190 |    ]
191 |   }
192 |  ],
193 |  "metadata": {
194 |   "kernelspec": {
195 |    "display_name": "Python 3 (ipykernel)",
196 |    "language": "python",
197 |    "name": "python3"
198 |   },
199 |   "language_info": {
200 |    "codemirror_mode": {
201 |     "name": "ipython",
202 |     "version": 3
203 |    },
204 |    "file_extension": ".py",
205 |    "mimetype": "text/x-python",
206 |    "name": "python",
207 |    "nbconvert_exporter": "python",
208 |    "pygments_lexer": "ipython3",
209 |    "version": "3.8.10"
210 |   }
211 |  },
212 |  "nbformat": 4,
213 |  "nbformat_minor": 5
214 | }
215 | 


--------------------------------------------------------------------------------
/text/extra/process-snapshot.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 75,
  6 |    "id": "05913d38",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import json\n",
 11 |     "from glob import glob\n",
 12 |     "from tqdm import tqdm"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 89,
 18 |    "id": "55c6365c",
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "http_errors = [\n",
 23 |     "        \"400 Bad Request\", \"401 Unauthorized\", \"402 Payment Required\", \"403 Forbidden\", \"404 Not Found\",\n",
 24 |     "        \"405 Method Not Allowed\", \"406 Not Acceptable\", \"407 Proxy Authentication Required\", \"408 Request Timeout\",\n",
 25 |     "        \"409 Conflict\", \"410 Gone\", \"411 Length Required\", \"412 Precondition Failed\", \"413 Payload Too Large\",\n",
 26 |     "        \"414 URI Too Long\", \"415 Unsupported Media Type\", \"416 Range Not Satisfiable\", \"417 Expectation Failed\",\n",
 27 |     "        \"418 I'm a teapot\", \"421 Misdirected Request\", \"422 Unprocessable Entity\", \"423 Locked\", \"424 Failed Dependency\",\n",
 28 |     "        \"425 Too Early\", \"426 Upgrade Required\", \"428 Precondition Required\", \"429 Too Many Requests\",\n",
 29 |     "        \"431 Request Header Fields Too Large\", \"451 Unavailable For Legal Reasons\", \"500 Internal Server Error\",\n",
 30 |     "        \"501 Not Implemented\", \"502 Bad Gateway\", \"503 Service Unavailable\", \"504 Gateway Timeout\",\n",
 31 |     "        \"505 HTTP Version Not Supported\", \"506 Variant Also Negotiates\", \"507 Insufficient Storage\",\n",
 32 |     "        \"508 Loop Detected\", \"510 Not Extended\", \"511 Network Authentication Required\"\n",
 33 |     "    ]\n",
 34 |     "\n",
 35 |     "rejected = [\n",
 36 |     "    'Internal Server Error',\n",
 37 |     "    '404',\n",
 38 |     "    '__NOEDITSECTION__',\n",
 39 |     "    'enter your username and password',\n",
 40 |     "    'Login',\n",
 41 |     "    'forgotten your password',\n",
 42 |     "    'cookies enabled',\n",
 43 |     "    'sign in',\n",
 44 |     "    'tentang kami',\n",
 45 |     "    'skip to content',\n",
 46 |     "    'hubungi kami',\n",
 47 |     "    'laman utama',\n",
 48 |     "    'enable JavaScript in your browser.',\n",
 49 |     "    'The page cannot be displayed',\n",
 50 |     "    'site or edit the error_page',\n",
 51 |     "    'Hakcipta terpelihara',\n",
 52 |     "    'Copyright ©'\n",
 53 |     "]\n",
 54 |     "\n",
 55 |     "rejected.extend(http_errors)"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 80,
 61 |    "id": "54284aa7",
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "files = sorted(glob('crawl-my-website/snapshot/*.json'))"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 114,
 71 |    "id": "85361659",
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "!rm hf-datasets/dedupe-datasets/snapshot.jsonl"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 115,
 81 |    "id": "ae3d0a6d",
 82 |    "metadata": {},
 83 |    "outputs": [
 84 |     {
 85 |      "name": "stderr",
 86 |      "output_type": "stream",
 87 |      "text": [
 88 |       "100%|██████████| 348/348 [03:33<00:00,  1.63it/s]\n"
 89 |      ]
 90 |     }
 91 |    ],
 92 |    "source": [
 93 |     "processed = set()\n",
 94 |     "with open('hf-datasets/raw-datasets/snapshot.jsonl', 'w') as fopen_l:\n",
 95 |     "    for f in tqdm(files):\n",
 96 |     "        with open(f) as fopen:\n",
 97 |     "            for l in fopen:\n",
 98 |     "                l = json.loads(l)\n",
 99 |     "                if l['url'] in processed:\n",
100 |     "                    continue\n",
101 |     "\n",
102 |     "                splitted = l['data'].split('\\n')\n",
103 |     "                splitted = [s for s in splitted if len(s) > 50]\n",
104 |     "                splitted = [s.strip() for s in splitted if all([r not in s for r in rejected])]\n",
105 |     "                if len(splitted):\n",
106 |     "                    data = {\n",
107 |     "                        'url': l['url'],\n",
108 |     "                        'text': splitted\n",
109 |     "                    }\n",
110 |     "                    fopen_l.write(f'{json.dumps(data)}\\n')\n",
111 |     "                    fopen_l.flush()\n",
112 |     "                    \n",
113 |     "                processed.add(l['url'])"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 116,
119 |    "id": "8aad7de6",
120 |    "metadata": {},
121 |    "outputs": [
122 |     {
123 |      "data": {
124 |       "text/plain": [
125 |        "428982"
126 |       ]
127 |      },
128 |      "execution_count": 116,
129 |      "metadata": {},
130 |      "output_type": "execute_result"
131 |     }
132 |    ],
133 |    "source": [
134 |     "len(processed)"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "id": "2059852b",
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": []
144 |   }
145 |  ],
146 |  "metadata": {
147 |   "kernelspec": {
148 |    "display_name": "Python 3 (ipykernel)",
149 |    "language": "python",
150 |    "name": "python3"
151 |   },
152 |   "language_info": {
153 |    "codemirror_mode": {
154 |     "name": "ipython",
155 |     "version": 3
156 |    },
157 |    "file_extension": ".py",
158 |    "mimetype": "text/x-python",
159 |    "name": "python",
160 |    "nbconvert_exporter": "python",
161 |    "pygments_lexer": "ipython3",
162 |    "version": "3.10.12"
163 |   }
164 |  },
165 |  "nbformat": 4,
166 |  "nbformat_minor": 5
167 | }
168 | 


--------------------------------------------------------------------------------
/text/madlad-400-ms/postprocess-madlad-400-ms.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 4,
  6 |    "id": "d9d83b0a",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import json\n",
 11 |     "import re\n",
 12 |     "from tqdm import tqdm\n",
 13 |     "\n",
 14 |     "http_errors = [\n",
 15 |     "        \"400 Bad Request\", \"401 Unauthorized\", \"402 Payment Required\", \"403 Forbidden\", \"404 Not Found\",\n",
 16 |     "        \"405 Method Not Allowed\", \"406 Not Acceptable\", \"407 Proxy Authentication Required\", \"408 Request Timeout\",\n",
 17 |     "        \"409 Conflict\", \"410 Gone\", \"411 Length Required\", \"412 Precondition Failed\", \"413 Payload Too Large\",\n",
 18 |     "        \"414 URI Too Long\", \"415 Unsupported Media Type\", \"416 Range Not Satisfiable\", \"417 Expectation Failed\",\n",
 19 |     "        \"418 I'm a teapot\", \"421 Misdirected Request\", \"422 Unprocessable Entity\", \"423 Locked\", \"424 Failed Dependency\",\n",
 20 |     "        \"425 Too Early\", \"426 Upgrade Required\", \"428 Precondition Required\", \"429 Too Many Requests\",\n",
 21 |     "        \"431 Request Header Fields Too Large\", \"451 Unavailable For Legal Reasons\", \"500 Internal Server Error\",\n",
 22 |     "        \"501 Not Implemented\", \"502 Bad Gateway\", \"503 Service Unavailable\", \"504 Gateway Timeout\",\n",
 23 |     "        \"505 HTTP Version Not Supported\", \"506 Variant Also Negotiates\", \"507 Insufficient Storage\",\n",
 24 |     "        \"508 Loop Detected\", \"510 Not Extended\", \"511 Network Authentication Required\"\n",
 25 |     "    ]\n",
 26 |     "\n",
 27 |     "rejected = [\n",
 28 |     "    'Internal Server Error',\n",
 29 |     "    '__NOEDITSECTION__',\n",
 30 |     "    'enter your username and password',\n",
 31 |     "    'forgotten your password',\n",
 32 |     "    'cookies enabled',\n",
 33 |     "    'enable JavaScript in your browser.',\n",
 34 |     "    'The page cannot be displayed',\n",
 35 |     "    'site or edit the error_page',\n",
 36 |     "]\n",
 37 |     "\n",
 38 |     "rejected.extend(http_errors)\n",
 39 |     "\n",
 40 |     "def replace_multiple(input_string, pattern =r\"\\s{6,}\", replace = '   '):\n",
 41 |     "    return re.sub(pattern, replace, input_string)\n",
 42 |     "\n",
 43 |     "def replace(string):\n",
 44 |     "    string = replace_multiple(string.replace('…', '.'))\n",
 45 |     "    string = replace_multiple(string, pattern = r\"\\.{6,}\", replace = '...')\n",
 46 |     "    return string\n",
 47 |     "\n",
 48 |     "def reject(string):\n",
 49 |     "    if any([r in string for r in rejected]):\n",
 50 |     "        return True\n",
 51 |     "    return False"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 5,
 57 |    "id": "d714ffb4",
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "f = 'madlad-400-ms.jsonl'\n",
 62 |     "new_f = 'madlad-400-ms.postprocessing.jsonl'"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": 9,
 68 |    "id": "83f668c6",
 69 |    "metadata": {},
 70 |    "outputs": [
 71 |     {
 72 |      "name": "stderr",
 73 |      "output_type": "stream",
 74 |      "text": [
 75 |       "2232026it [08:47, 4507.46it/s]IOPub message rate exceeded.\n",
 76 |       "The notebook server will temporarily stop sending output\n",
 77 |       "to the client in order to avoid crashing it.\n",
 78 |       "To change this limit, set the config variable\n",
 79 |       "`--NotebookApp.iopub_msg_rate_limit`.\n",
 80 |       "\n",
 81 |       "Current values:\n",
 82 |       "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
 83 |       "NotebookApp.rate_limit_window=3.0 (secs)\n",
 84 |       "\n",
 85 |       "11158994it [43:43, 4565.13it/s]IOPub message rate exceeded.\n",
 86 |       "The notebook server will temporarily stop sending output\n",
 87 |       "to the client in order to avoid crashing it.\n",
 88 |       "To change this limit, set the config variable\n",
 89 |       "`--NotebookApp.iopub_msg_rate_limit`.\n",
 90 |       "\n",
 91 |       "Current values:\n",
 92 |       "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
 93 |       "NotebookApp.rate_limit_window=3.0 (secs)\n",
 94 |       "\n"
 95 |      ]
 96 |     }
 97 |    ],
 98 |    "source": [
 99 |     "with open(new_f, 'w') as fopen_l:\n",
100 |     "    with open(f) as fopen:\n",
101 |     "        for l in tqdm(fopen):\n",
102 |     "            data = json.loads(l)\n",
103 |     "            \n",
104 |     "            if isinstance(data, dict):\n",
105 |     "                t = data['text']\n",
106 |     "            else:\n",
107 |     "                t = data\n",
108 |     "\n",
109 |     "            if reject(t):\n",
110 |     "                continue\n",
111 |     "\n",
112 |     "            data = replace(t.strip())\n",
113 |     "\n",
114 |     "            if len(data) < 3:\n",
115 |     "                continue\n",
116 |     "\n",
117 |     "            fopen_l.write(f'{json.dumps(data)}\\n')\n",
118 |     "            fopen_l.flush()"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "id": "f13ad6af",
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": []
128 |   }
129 |  ],
130 |  "metadata": {
131 |   "kernelspec": {
132 |    "display_name": "Python 3 (ipykernel)",
133 |    "language": "python",
134 |    "name": "python3"
135 |   },
136 |   "language_info": {
137 |    "codemirror_mode": {
138 |     "name": "ipython",
139 |     "version": 3
140 |    },
141 |    "file_extension": ".py",
142 |    "mimetype": "text/x-python",
143 |    "name": "python",
144 |    "nbconvert_exporter": "python",
145 |    "pygments_lexer": "ipython3",
146 |    "version": "3.10.12"
147 |   }
148 |  },
149 |  "nbformat": 4,
150 |  "nbformat_minor": 5
151 | }
152 | 


--------------------------------------------------------------------------------
/text/pretrain-clm/from-pyarrow-to-mosaic.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 15,
  6 |    "id": "4b7592f7",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import mp\n",
 11 |     "import os\n",
 12 |     "import pyarrow as pa\n",
 13 |     "import numpy as np\n",
 14 |     "from streaming import MDSWriter\n",
 15 |     "from tqdm import tqdm"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 6,
 21 |    "id": "e0391f83",
 22 |    "metadata": {},
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "from streaming.base.format.mds.encodings import Encoding, _encodings\n",
 26 |     "\n",
 27 |     "class Int32(Encoding):\n",
 28 |     "    def encode(self, obj) -> bytes:\n",
 29 |     "        return obj.tobytes()\n",
 30 |     "\n",
 31 |     "    def decode(self, data: bytes):\n",
 32 |     "        return np.frombuffer(data, np.int32)\n",
 33 |     "\n",
 34 |     "_encodings['int32'] = Int32"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 7,
 40 |    "id": "62ddb05a",
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "columns = {\n",
 45 |     "    'input_ids': 'int32',\n",
 46 |     "    'token_type_ids': 'int32',\n",
 47 |     "    'attention_mask': 'int32',\n",
 48 |     "    'labels': 'int32',\n",
 49 |     "}\n",
 50 |     "compression = 'zstd'\n",
 51 |     "hashes = 'sha1', 'xxh64'"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 8,
 57 |    "id": "e817fcc5",
 58 |    "metadata": {},
 59 |    "outputs": [
 60 |     {
 61 |      "data": {
 62 |       "text/plain": [
 63 |        "['combine-lm_00017_of_00020.jsonl-grouped-4096',\n",
 64 |        " 'combine-lm_00005_of_00020.jsonl-grouped-4096',\n",
 65 |        " 'combine-lm_00008_of_00020.jsonl-grouped-4096',\n",
 66 |        " 'combine-lm_00012_of_00020.jsonl-grouped-4096',\n",
 67 |        " 'combine-lm_00007_of_00020.jsonl-grouped-4096',\n",
 68 |        " 'combine-lm_00014_of_00020.jsonl-grouped-4096',\n",
 69 |        " 'combine-lm_00006_of_00020.jsonl-grouped-4096',\n",
 70 |        " 'combine-lm_00013_of_00020.jsonl-grouped-4096',\n",
 71 |        " 'combine-lm_00016_of_00020.jsonl-grouped-4096',\n",
 72 |        " 'combine-lm_00011_of_00020.jsonl-grouped-4096',\n",
 73 |        " 'combine-lm_00018_of_00020.jsonl-grouped-4096',\n",
 74 |        " 'combine-lm_00002_of_00020.jsonl-grouped-4096',\n",
 75 |        " 'combine-lm_00009_of_00020.jsonl-grouped-4096',\n",
 76 |        " 'combine-lm_00019_of_00020.jsonl-grouped-4096',\n",
 77 |        " 'combine-lm_00001_of_00020.jsonl-grouped-4096',\n",
 78 |        " 'combine-lm_00003_of_00020.jsonl-grouped-4096',\n",
 79 |        " 'combine-lm_00015_of_00020.jsonl-grouped-4096',\n",
 80 |        " 'combine-lm_00004_of_00020.jsonl-grouped-4096',\n",
 81 |        " 'combine-lm_00000_of_00020.jsonl-grouped-4096',\n",
 82 |        " 'combine-lm_00010_of_00020.jsonl-grouped-4096']"
 83 |       ]
 84 |      },
 85 |      "execution_count": 8,
 86 |      "metadata": {},
 87 |      "output_type": "execute_result"
 88 |     }
 89 |    ],
 90 |    "source": [
 91 |     "from glob import glob\n",
 92 |     "\n",
 93 |     "files = glob('combine-lm_*_of_00020.jsonl-grouped-4096')\n",
 94 |     "files"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": 16,
100 |    "id": "8a3e0890",
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "def loop(files):\n",
105 |     "    files, index = files\n",
106 |     "    out_root = f'tokenized-{index}'\n",
107 |     "    os.system(f'rm -rf {out_root}')\n",
108 |     "    with MDSWriter(out=out_root, columns=columns, compression=compression, hashes=hashes, \n",
109 |     "                   size_limit = 67108864 * 2) as out:\n",
110 |     "        for f in files:\n",
111 |     "            memory_mapped_stream = pa.memory_map(f)\n",
112 |     "            opened_stream = pa.ipc.open_stream(memory_mapped_stream)\n",
113 |     "            for a in tqdm(opened_stream):\n",
114 |     "                s = a.to_struct_array()\n",
115 |     "                for i in range(len(s)):\n",
116 |     "                    keys = list(s[i])\n",
117 |     "                    a_ = {}\n",
118 |     "                    for k in keys:\n",
119 |     "                        a_[k] = np.array(s[i][k].as_py()).astype(np.int32)\n",
120 |     "                    out.write(a_)"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "id": "876289e4",
127 |    "metadata": {},
128 |    "outputs": [
129 |     {
130 |      "name": "stderr",
131 |      "output_type": "stream",
132 |      "text": [
133 |       "2570it [05:50,  7.30it/s]\n",
134 |       "7464it [06:35, 18.87it/s]\n",
135 |       "7464it [07:57, 15.62it/s]\n",
136 |       "7464it [08:06, 15.36it/s]\n",
137 |       "7464it [08:11, 15.20it/s]\n",
138 |       "7464it [12:20, 10.08it/s]\n",
139 |       "5816it [13:12,  7.64it/s]"
140 |      ]
141 |     }
142 |    ],
143 |    "source": [
144 |     "mp.multiprocessing(files, loop, cores = 20, returned = False)"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": null,
150 |    "id": "c9aba12a",
151 |    "metadata": {},
152 |    "outputs": [],
153 |    "source": []
154 |   }
155 |  ],
156 |  "metadata": {
157 |   "kernelspec": {
158 |    "display_name": "Python 3 (ipykernel)",
159 |    "language": "python",
160 |    "name": "python3"
161 |   },
162 |   "language_info": {
163 |    "codemirror_mode": {
164 |     "name": "ipython",
165 |     "version": 3
166 |    },
167 |    "file_extension": ".py",
168 |    "mimetype": "text/x-python",
169 |    "name": "python",
170 |    "nbconvert_exporter": "python",
171 |    "pygments_lexer": "ipython3",
172 |    "version": "3.10.12"
173 |   }
174 |  },
175 |  "nbformat": 4,
176 |  "nbformat_minor": 5
177 | }
178 | 


--------------------------------------------------------------------------------
/text/text_dedup/utils/hashfunc.py:
--------------------------------------------------------------------------------
  1 | import hashlib
  2 | import struct
  3 | from hashlib import md5
  4 | from hashlib import sha256
  5 | 
  6 | import xxhash
  7 | from xxhash import xxh3_64
  8 | from xxhash import xxh3_64_digest
  9 | from xxhash import xxh3_128
 10 | from xxhash import xxh3_128_digest
 11 | 
 12 | 
 13 | def md5_hexdigest(data: bytes) -> str:
 14 |     """
 15 |     Generate a md5 hex hash from the given data.
 16 | 
 17 |     Parameters
 18 |     ----------
 19 |     data : bytes
 20 |         The data to be hashed.
 21 | 
 22 |     Returns
 23 |     -------
 24 |     str
 25 |         The hex hash value.
 26 | 
 27 |     Examples
 28 |     --------
 29 |     >>> md5_hexdigest(b"hello world")
 30 |     '5eb63bbbe01eeed093cb22bb8f5acdc3'
 31 |     >>> len(md5_hexdigest(b"hello world"))
 32 |     32
 33 |     """
 34 |     return md5(data).hexdigest()
 35 | 
 36 | 
 37 | def sha1_hash(data: bytes, d: int = 32) -> int:
 38 |     """
 39 |     Generate a d-bit hash value from the given data.
 40 | 
 41 |     Parameters
 42 |     ----------
 43 |     data : bytes
 44 |         The data to be hashed.
 45 |     d : int
 46 |         The number of bits of the hash value.
 47 | 
 48 |     Returns
 49 |     -------
 50 |     int
 51 |         The hash value.
 52 | 
 53 |     Examples
 54 |     --------
 55 |     >>> sha1_hash(b"hello world", 32)
 56 |     896314922
 57 |     >>> sha1_hash(b"hello world", 64)
 58 |     13028719972609469994
 59 |     >>> sha1_hash(b"hello world", 128)
 60 |     310522945683037930239412421226792791594
 61 |     """
 62 |     if d == 32:
 63 |         return struct.unpack("<I", hashlib.sha1(data).digest()[:4])[0]
 64 |     if d == 64:
 65 |         return struct.unpack("<Q", hashlib.sha1(data).digest()[:8])[0]
 66 |     # struct is faster but does not support arbitrary bit lengths
 67 |     return int.from_bytes(hashlib.sha1(data).digest()[: d // 8], byteorder="little")
 68 | 
 69 | 
 70 | def sha256_hexdigest(data: bytes) -> str:
 71 |     """
 72 |     Generate a sha256 hex hash from the given data.
 73 | 
 74 |     Parameters
 75 |     ----------
 76 |     data : bytes
 77 |         The data to be hashed.
 78 | 
 79 |     Returns
 80 |     -------
 81 |     str
 82 |         The hex hash value.
 83 | 
 84 |     Examples
 85 |     --------
 86 |     >>> sha256_hexdigest(b"hello world")
 87 |     'b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9'
 88 |     >>> len(sha256_hexdigest(b"hello world"))
 89 |     64
 90 |     """
 91 |     return sha256(data).hexdigest()
 92 | 
 93 | 
 94 | def xxh3_16hash(data: bytes, seed: int = 0) -> int:
 95 |     """
 96 |     Generate a 16-bit xxhash based hash value from the given data.
 97 |     As of python xxhash 3.3.0 (and since 0.3.0) outputs in big-endian.
 98 |     This is useful as a special purpose xxhash when you only want 16 bits.
 99 |     bit masked xxh3_64 hashes are faster than xxh32 in modern systems.
100 | 
101 |     Parameters
102 |     ----------
103 |     data : bytes
104 |         The data to be hashed.
105 |     seed : int
106 |         xxhashes can all be seeded. Default is int=0
107 | 
108 |     Returns
109 |     -------
110 |     int
111 |         The hash value.
112 | 
113 |     Examples
114 |     --------
115 |     >>> xxh3_16hash(b"hello world")
116 |     39051
117 |     >>> xxh3_16hash(b"hello world",seed=42)
118 |     13198
119 |     >>> xxh3_16hash(b"hello world",seed=-42)
120 |     34281
121 |     """
122 |     return xxhash.xxh3_64_intdigest(data, seed) & 0xFFFF
123 | 
124 | 
def xxh3_32hash(data: bytes, seed: int = 0) -> int:
    """
    Return the low 32 bits of the seeded 64-bit xxh3 hash of ``data``.

    Special-purpose variant for callers that only need 32 bits. The value
    is obtained by bit-masking the xxh3_64 integer digest (the original
    author notes this is faster than xxh32 on modern systems, and that
    python xxhash emits big-endian digests since 0.3.0).

    Parameters
    ----------
    data : bytes
        Raw bytes to hash.
    seed : int
        Seed forwarded to xxhash. Defaults to 0.

    Returns
    -------
    int
        The 32-bit hash value.

    Examples
    --------
    >>> xxh3_32hash(b"hello world")
    1088854155
    >>> xxh3_32hash(b"hello world",seed=42)
    3913102222
    >>> xxh3_32hash(b"hello world",seed=-42)
    3721037289
    """
    low_32_bits = 0xFFFFFFFF
    digest64 = xxhash.xxh3_64_intdigest(data, seed)
    return digest64 & low_32_bits
154 | 
155 | 
def xxh3_hash(data: bytes, d: int = 32) -> int:
    """
    Return a ``d``-bit xxh3-based hash of ``data``.

    General-purpose entry point that accepts several bit widths. The
    original author notes that python xxhash emits big-endian digests
    since 0.3.0.

    Parameters
    ----------
    data : bytes
        Raw bytes to hash.
    d : int
        Requested width of the hash value in bits. 32, 64 and 128 use a
        dedicated xxh3 integer digest; any other value takes the first
        ``d // 8`` bytes of the 128-bit digest.

    Returns
    -------
    int
        The hash value.

    Examples
    --------
    >>> xxh3_hash(b"hello world", 32)
    1088854155
    >>> xxh3_hash(b"hello world", 64)
    15296390279056496779
    >>> xxh3_hash(b"hello world", 128)
    297150157938599054391163723952090887879
    """
    if d == 128:
        return xxhash.xxh3_128_intdigest(data)
    if d == 64:
        return xxhash.xxh3_64_intdigest(data)
    if d == 32:
        # The 32-bit value is the masked 64-bit hash; per the original
        # note, xxh3 is much faster with sse2 or later, and more so with avx.
        return xxhash.xxh3_64_intdigest(data) & 0xFFFFFFFF
    # Fallback for every other width: leading bytes of the 128-bit digest,
    # read as a big-endian integer.
    prefix = xxhash.xxh3_128_digest(data)[: d // 8]
    return int.from_bytes(prefix, byteorder="big")
194 | 
195 | 
# Public API of this module: the hashlib/xxhash constructors it re-exports
# plus every hashing helper it defines. `md5_hexdigest` and
# `sha256_hexdigest` are listed so that star-imports expose them as well.
__all__ = [
    "md5",
    "md5_hexdigest",
    "sha256",
    "sha256_hexdigest",
    "sha1_hash",
    "xxh3_64",
    "xxh3_64_digest",
    "xxh3_128",
    "xxh3_128_digest",
    "xxh3_hash",
    "xxh3_16hash",
    "xxh3_32hash",
]


--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/hashfunc.py:
--------------------------------------------------------------------------------
  1 | import hashlib
  2 | import struct
  3 | from hashlib import md5
  4 | from hashlib import sha256
  5 | 
  6 | import xxhash
  7 | from xxhash import xxh3_64
  8 | from xxhash import xxh3_64_digest
  9 | from xxhash import xxh3_128
 10 | from xxhash import xxh3_128_digest
 11 | 
 12 | 
 13 | def md5_hexdigest(data: bytes) -> str:
 14 |     """
 15 |     Generate a md5 hex hash from the given data.
 16 | 
 17 |     Parameters
 18 |     ----------
 19 |     data : bytes
 20 |         The data to be hashed.
 21 | 
 22 |     Returns
 23 |     -------
 24 |     str
 25 |         The hex hash value.
 26 | 
 27 |     Examples
 28 |     --------
 29 |     >>> md5_hexdigest(b"hello world")
 30 |     '5eb63bbbe01eeed093cb22bb8f5acdc3'
 31 |     >>> len(md5_hexdigest(b"hello world"))
 32 |     32
 33 |     """
 34 |     return md5(data).hexdigest()
 35 | 
 36 | 
 37 | def sha1_hash(data: bytes, d: int = 32) -> int:
 38 |     """
 39 |     Generate a d-bit hash value from the given data.
 40 | 
 41 |     Parameters
 42 |     ----------
 43 |     data : bytes
 44 |         The data to be hashed.
 45 |     d : int
 46 |         The number of bits of the hash value.
 47 | 
 48 |     Returns
 49 |     -------
 50 |     int
 51 |         The hash value.
 52 | 
 53 |     Examples
 54 |     --------
 55 |     >>> sha1_hash(b"hello world", 32)
 56 |     896314922
 57 |     >>> sha1_hash(b"hello world", 64)
 58 |     13028719972609469994
 59 |     >>> sha1_hash(b"hello world", 128)
 60 |     310522945683037930239412421226792791594
 61 |     """
 62 |     if d == 32:
 63 |         return struct.unpack("<I", hashlib.sha1(data).digest()[:4])[0]
 64 |     if d == 64:
 65 |         return struct.unpack("<Q", hashlib.sha1(data).digest()[:8])[0]
 66 |     # struct is faster but does not support arbitrary bit lengths
 67 |     return int.from_bytes(hashlib.sha1(data).digest()[: d // 8], byteorder="little")
 68 | 
 69 | 
 70 | def sha256_hexdigest(data: bytes) -> str:
 71 |     """
 72 |     Generate a sha256 hex hash from the given data.
 73 | 
 74 |     Parameters
 75 |     ----------
 76 |     data : bytes
 77 |         The data to be hashed.
 78 | 
 79 |     Returns
 80 |     -------
 81 |     str
 82 |         The hex hash value.
 83 | 
 84 |     Examples
 85 |     --------
 86 |     >>> sha256_hexdigest(b"hello world")
 87 |     'b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9'
 88 |     >>> len(sha256_hexdigest(b"hello world"))
 89 |     64
 90 |     """
 91 |     return sha256(data).hexdigest()
 92 | 
 93 | 
 94 | def xxh3_16hash(data: bytes, seed: int = 0) -> int:
 95 |     """
 96 |     Generate a 16-bit xxhash based hash value from the given data.
 97 |     As of python xxhash 3.3.0 (and since 0.3.0) outputs in big-endian.
 98 |     This is useful as a special purpose xxhash when you only want 16 bits.
 99 |     bit masked xxh3_64 hashes are faster than xxh32 in modern systems.
100 | 
101 |     Parameters
102 |     ----------
103 |     data : bytes
104 |         The data to be hashed.
105 |     seed : int
106 |         xxhashes can all be seeded. Default is int=0
107 | 
108 |     Returns
109 |     -------
110 |     int
111 |         The hash value.
112 | 
113 |     Examples
114 |     --------
115 |     >>> xxh3_16hash(b"hello world")
116 |     39051
117 |     >>> xxh3_16hash(b"hello world",seed=42)
118 |     13198
119 |     >>> xxh3_16hash(b"hello world",seed=-42)
120 |     34281
121 |     """
122 |     return xxhash.xxh3_64_intdigest(data, seed) & 0xFFFF
123 | 
124 | 
def xxh3_32hash(data: bytes, seed: int = 0) -> int:
    """
    Hash ``data`` with seeded xxh3_64 and keep only the lowest 32 bits.

    Intended for callers that need just 32 bits. The original author
    notes that masking an xxh3_64 hash is faster than xxh32 on modern
    systems, and that python xxhash emits big-endian digests since 0.3.0.

    Parameters
    ----------
    data : bytes
        Bytes to be hashed.
    seed : int
        Seed passed through to xxhash. Defaults to 0.

    Returns
    -------
    int
        The 32-bit hash value.

    Examples
    --------
    >>> xxh3_32hash(b"hello world")
    1088854155
    >>> xxh3_32hash(b"hello world",seed=42)
    3913102222
    >>> xxh3_32hash(b"hello world",seed=-42)
    3721037289
    """
    mask = (1 << 32) - 1  # 0xFFFFFFFF
    return mask & xxhash.xxh3_64_intdigest(data, seed)
154 | 
155 | 
def xxh3_hash(data: bytes, d: int = 32) -> int:
    """
    Produce a ``d``-bit hash of ``data`` using the xxh3 family.

    General-purpose entry point that accepts several bit widths. The
    original author notes that python xxhash emits big-endian digests
    since 0.3.0.

    Parameters
    ----------
    data : bytes
        Bytes to be hashed.
    d : int
        Number of bits wanted in the hash value. Widths 32, 64 and 128
        each use a dedicated xxh3 integer digest; every other width takes
        the first ``d // 8`` bytes of the 128-bit digest.

    Returns
    -------
    int
        The hash value.

    Examples
    --------
    >>> xxh3_hash(b"hello world", 32)
    1088854155
    >>> xxh3_hash(b"hello world", 64)
    15296390279056496779
    >>> xxh3_hash(b"hello world", 128)
    297150157938599054391163723952090887879
    """
    if d == 32:
        # Masked 64-bit hash; per the original note, xxh3 is much faster
        # with sse2 or later, and the gap grows with avx.
        hashed = xxhash.xxh3_64_intdigest(data) & 0xFFFFFFFF
    elif d == 64:
        hashed = xxhash.xxh3_64_intdigest(data)
    elif d == 128:
        hashed = xxhash.xxh3_128_intdigest(data)
    else:
        # Fallback: leading bytes of the 128-bit digest, read big-endian.
        truncated = xxhash.xxh3_128_digest(data)[: d // 8]
        hashed = int.from_bytes(truncated, byteorder="big")
    return hashed
195 | 
196 | 
# Public API of this module: the hashlib/xxhash constructors it re-exports
# plus every hashing helper it defines. `md5_hexdigest` and
# `sha256_hexdigest` are listed so that star-imports expose them as well.
__all__ = [
    "md5",
    "md5_hexdigest",
    "sha256",
    "sha256_hexdigest",
    "sha1_hash",
    "xxh3_64",
    "xxh3_64_digest",
    "xxh3_128",
    "xxh3_128_digest",
    "xxh3_hash",
    "xxh3_16hash",
    "xxh3_32hash",
]


--------------------------------------------------------------------------------
/text/mistral/run-tokenizer.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 4,
  6 |    "id": "68984750",
  7 |    "metadata": {
  8 |     "scrolled": true
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "from transformers import (\n",
 13 |     "    AutoModelForCausalLM,\n",
 14 |     "    AutoTokenizer,\n",
 15 |     "    get_scheduler,\n",
 16 |     "    default_data_collator,\n",
 17 |     "    SchedulerType\n",
 18 |     ")\n",
 19 |     "import os\n",
 20 |     "import json\n",
 21 |     "from itertools import chain\n",
 22 |     "from datasets import load_dataset"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 5,
 28 |    "id": "09d07423",
 29 |    "metadata": {},
 30 |    "outputs": [
 31 |     {
 32 |      "name": "stderr",
 33 |      "output_type": "stream",
 34 |      "text": [
 35 |       "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
 36 |      ]
 37 |     }
 38 |    ],
 39 |    "source": [
 40 |     "train_file = 'combine-mistral.jsonl'\n",
 41 |     "tokenizer = AutoTokenizer.from_pretrained(\n",
 42 |     "    'mistralai/Mistral-7B-v0.1',\n",
 43 |     ")\n",
 44 |     "tokenizer.add_bos_token = False\n",
 45 |     "tokenizer.add_eos_token = False\n",
 46 |     "text_column_name = 'text'"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 6,
 52 |    "id": "0c31ee11",
 53 |    "metadata": {},
 54 |    "outputs": [
 55 |     {
 56 |      "data": {
 57 |       "application/vnd.jupyter.widget-view+json": {
 58 |        "model_id": "86feb593de8d41089a848a49fdd7d95e",
 59 |        "version_major": 2,
 60 |        "version_minor": 0
 61 |       },
 62 |       "text/plain": [
 63 |        "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
 64 |       ]
 65 |      },
 66 |      "metadata": {},
 67 |      "output_type": "display_data"
 68 |     },
 69 |     {
 70 |      "data": {
 71 |       "application/vnd.jupyter.widget-view+json": {
 72 |        "model_id": "9e284c80f2ab499a84083f6b9c1cdc7a",
 73 |        "version_major": 2,
 74 |        "version_minor": 0
 75 |       },
 76 |       "text/plain": [
 77 |        "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
 78 |       ]
 79 |      },
 80 |      "metadata": {},
 81 |      "output_type": "display_data"
 82 |     },
 83 |     {
 84 |      "data": {
 85 |       "application/vnd.jupyter.widget-view+json": {
 86 |        "model_id": "678282f811e54b628c1f6ab3c074a4fc",
 87 |        "version_major": 2,
 88 |        "version_minor": 0
 89 |       },
 90 |       "text/plain": [
 91 |        "Generating train split: 0 examples [00:00, ? examples/s]"
 92 |       ]
 93 |      },
 94 |      "metadata": {},
 95 |      "output_type": "display_data"
 96 |     }
 97 |    ],
 98 |    "source": [
 99 |     "raw_datasets = load_dataset(\n",
100 |     "    'json',\n",
101 |     "    data_files=train_file,\n",
102 |     "    split='train'\n",
103 |     ")"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": 8,
109 |    "id": "8eedfd87",
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "def tokenize_function(examples):\n",
114 |     "    return tokenizer(examples[text_column_name])"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 9,
120 |    "id": "3bfece34",
121 |    "metadata": {},
122 |    "outputs": [
123 |     {
124 |      "data": {
125 |       "application/vnd.jupyter.widget-view+json": {
126 |        "model_id": "b43867bb6503463eb239830fbb905776",
127 |        "version_major": 2,
128 |        "version_minor": 0
129 |       },
130 |       "text/plain": [
131 |        "Map (num_proc=20):   0%|          | 0/37117462 [00:00<?, ? examples/s]"
132 |       ]
133 |      },
134 |      "metadata": {},
135 |      "output_type": "display_data"
136 |     },
137 |     {
138 |      "name": "stderr",
139 |      "output_type": "stream",
140 |      "text": [
141 |       "IOPub message rate exceeded.\n",
142 |       "The notebook server will temporarily stop sending output\n",
143 |       "to the client in order to avoid crashing it.\n",
144 |       "To change this limit, set the config variable\n",
145 |       "`--NotebookApp.iopub_msg_rate_limit`.\n",
146 |       "\n",
147 |       "Current values:\n",
148 |       "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
149 |       "NotebookApp.rate_limit_window=3.0 (secs)\n",
150 |       "\n"
151 |      ]
152 |     },
153 |     {
154 |      "data": {
155 |       "text/plain": [
156 |        "Dataset({\n",
157 |        "    features: ['input_ids', 'attention_mask'],\n",
158 |        "    num_rows: 37117462\n",
159 |        "})"
160 |       ]
161 |      },
162 |      "execution_count": 9,
163 |      "metadata": {},
164 |      "output_type": "execute_result"
165 |     }
166 |    ],
167 |    "source": [
168 |     "filename = os.path.split(train_file)[1]\n",
169 |     "column_names = raw_datasets.column_names\n",
170 |     "tokenized_datasets = raw_datasets.map(\n",
171 |     "    tokenize_function,\n",
172 |     "    batched=True,\n",
173 |     "    remove_columns=column_names,\n",
174 |     "    load_from_cache_file=True,\n",
175 |     "    cache_file_name=f'./{filename}-tokenized',\n",
176 |     "    num_proc=20,\n",
177 |     ")\n",
178 |     "\n",
179 |     "tokenized_datasets"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": null,
185 |    "id": "575ed380",
186 |    "metadata": {},
187 |    "outputs": [],
188 |    "source": []
189 |   }
190 |  ],
191 |  "metadata": {
192 |   "kernelspec": {
193 |    "display_name": "Python 3 (ipykernel)",
194 |    "language": "python",
195 |    "name": "python3"
196 |   },
197 |   "language_info": {
198 |    "codemirror_mode": {
199 |     "name": "ipython",
200 |     "version": 3
201 |    },
202 |    "file_extension": ".py",
203 |    "mimetype": "text/x-python",
204 |    "name": "python",
205 |    "nbconvert_exporter": "python",
206 |    "pygments_lexer": "ipython3",
207 |    "version": "3.10.12"
208 |   }
209 |  },
210 |  "nbformat": 4,
211 |  "nbformat_minor": 5
212 | }
213 | 


--------------------------------------------------------------------------------
/text/llama/prepare-tokenizer.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "56fc07cf",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "from transformers import (\n",
 11 |     "    AutoModelForCausalLM,\n",
 12 |     "    AutoTokenizer,\n",
 13 |     "    get_scheduler,\n",
 14 |     "    default_data_collator,\n",
 15 |     "    SchedulerType\n",
 16 |     ")\n",
 17 |     "import os\n",
 18 |     "import json\n",
 19 |     "from itertools import chain\n",
 20 |     "from datasets import load_dataset"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 3,
 26 |    "id": "e320f019",
 27 |    "metadata": {},
 28 |    "outputs": [
 29 |     {
 30 |      "data": {
 31 |       "application/vnd.jupyter.widget-view+json": {
 32 |        "model_id": "1ee36389eac44862bc018d33cf39da33",
 33 |        "version_major": 2,
 34 |        "version_minor": 0
 35 |       },
 36 |       "text/plain": [
 37 |        "Downloading (…)okenizer_config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]"
 38 |       ]
 39 |      },
 40 |      "metadata": {},
 41 |      "output_type": "display_data"
 42 |     },
 43 |     {
 44 |      "data": {
 45 |       "application/vnd.jupyter.widget-view+json": {
 46 |        "model_id": "ea7e12ba9791437db332f9f0ff247c64",
 47 |        "version_major": 2,
 48 |        "version_minor": 0
 49 |       },
 50 |       "text/plain": [
 51 |        "Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]"
 52 |       ]
 53 |      },
 54 |      "metadata": {},
 55 |      "output_type": "display_data"
 56 |     },
 57 |     {
 58 |      "data": {
 59 |       "application/vnd.jupyter.widget-view+json": {
 60 |        "model_id": "f03938703b01429095679e38aa50ae96",
 61 |        "version_major": 2,
 62 |        "version_minor": 0
 63 |       },
 64 |       "text/plain": [
 65 |        "Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]"
 66 |       ]
 67 |      },
 68 |      "metadata": {},
 69 |      "output_type": "display_data"
 70 |     },
 71 |     {
 72 |      "data": {
 73 |       "application/vnd.jupyter.widget-view+json": {
 74 |        "model_id": "28fcad13bd01402290df7bed108b30f2",
 75 |        "version_major": 2,
 76 |        "version_minor": 0
 77 |       },
 78 |       "text/plain": [
 79 |        "Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]"
 80 |       ]
 81 |      },
 82 |      "metadata": {},
 83 |      "output_type": "display_data"
 84 |     }
 85 |    ],
 86 |    "source": [
 87 |     "block_size = 1024\n",
 88 |     "train_file = 'combine.jsonl'\n",
 89 |     "tokenizer = AutoTokenizer.from_pretrained(\n",
 90 |     "    'mesolitica/llama-7b-hf-16384-fpf',\n",
 91 |     ")\n",
 92 |     "text_column_name = 'text'"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "id": "2634b632",
 99 |    "metadata": {},
100 |    "outputs": [
101 |     {
102 |      "data": {
103 |       "application/vnd.jupyter.widget-view+json": {
104 |        "model_id": "be7a996be3aa499fa73562ad144690d4",
105 |        "version_major": 2,
106 |        "version_minor": 0
107 |       },
108 |       "text/plain": [
109 |        "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
110 |       ]
111 |      },
112 |      "metadata": {},
113 |      "output_type": "display_data"
114 |     },
115 |     {
116 |      "data": {
117 |       "application/vnd.jupyter.widget-view+json": {
118 |        "model_id": "1a7627b16a904159805ab4d33775b50e",
119 |        "version_major": 2,
120 |        "version_minor": 0
121 |       },
122 |       "text/plain": [
123 |        "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
124 |       ]
125 |      },
126 |      "metadata": {},
127 |      "output_type": "display_data"
128 |     },
129 |     {
130 |      "data": {
131 |       "application/vnd.jupyter.widget-view+json": {
132 |        "model_id": "ad569aa4427e4f858576ec2a0f4759fc",
133 |        "version_major": 2,
134 |        "version_minor": 0
135 |       },
136 |       "text/plain": [
137 |        "Generating train split: 0 examples [00:00, ? examples/s]"
138 |       ]
139 |      },
140 |      "metadata": {},
141 |      "output_type": "display_data"
142 |     }
143 |    ],
144 |    "source": [
145 |     "raw_datasets = load_dataset(\n",
146 |     "    'json',\n",
147 |     "    data_files=train_file,\n",
148 |     "    split='train'\n",
149 |     ")\n",
150 |     "\n",
151 |     "filename = os.path.split(train_file)[1]\n",
152 |     "\n",
153 |     "def tokenize_function(examples):\n",
154 |     "    return tokenizer(examples[text_column_name])\n",
155 |     "\n",
156 |     "column_names = raw_datasets.column_names\n",
157 |     "tokenized_datasets = raw_datasets.map(\n",
158 |     "    tokenize_function,\n",
159 |     "    batched=True,\n",
160 |     "    remove_columns=column_names,\n",
161 |     "    load_from_cache_file=True,\n",
162 |     "    cache_file_name=f'./{filename}-tokenized-{block_size}',\n",
163 |     "    num_proc=20,\n",
164 |     ")\n",
165 |     "\n",
166 |     "tokenized_datasets"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 4,
172 |    "id": "37417527",
173 |    "metadata": {},
174 |    "outputs": [
175 |     {
176 |      "data": {
177 |       "text/plain": [
178 |        "33339118"
179 |       ]
180 |      },
181 |      "execution_count": 4,
182 |      "metadata": {},
183 |      "output_type": "execute_result"
184 |     }
185 |    ],
186 |    "source": [
187 |     "len(tokenized_datasets)"
188 |    ]
189 |   }
190 |  ],
191 |  "metadata": {
192 |   "kernelspec": {
193 |    "display_name": "Python 3 (ipykernel)",
194 |    "language": "python",
195 |    "name": "python3"
196 |   },
197 |   "language_info": {
198 |    "codemirror_mode": {
199 |     "name": "ipython",
200 |     "version": 3
201 |    },
202 |    "file_extension": ".py",
203 |    "mimetype": "text/x-python",
204 |    "name": "python",
205 |    "nbconvert_exporter": "python",
206 |    "pygments_lexer": "ipython3",
207 |    "version": "3.10.12"
208 |   }
209 |  },
210 |  "nbformat": 4,
211 |  "nbformat_minor": 5
212 | }
213 | 


--------------------------------------------------------------------------------
/multilingual-tts/prepare/prepare-MasriSpeech-Full.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "e62caf64",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "from glob import glob\n",
 11 |     "import pandas as pd\n",
 12 |     "import os\n",
 13 |     "import soundfile as sf\n",
 14 |     "from tqdm import tqdm\n",
 15 |     "from multiprocess import Pool\n",
 16 |     "import itertools\n",
 17 |     "import io\n",
 18 |     "import numpy as np\n",
 19 |     "import json\n",
 20 |     "import re\n",
 21 |     "import zipfile\n",
 22 |     "from pathlib import Path\n",
 23 |     "\n",
 24 |     "def chunks(l, n):\n",
 25 |     "    for i in range(0, len(l), n):\n",
 26 |     "        yield (l[i: i + n], i // n)\n",
 27 |     "\n",
 28 |     "def multiprocessing(strings, function, cores=6, returned=True):\n",
 29 |     "    df_split = chunks(strings, len(strings) // cores)\n",
 30 |     "    pool = Pool(cores)\n",
 31 |     "    pooled = pool.map(function, df_split)\n",
 32 |     "    pool.close()\n",
 33 |     "    pool.join()\n",
 34 |     "\n",
 35 |     "    if returned:\n",
 36 |     "        return list(itertools.chain(*pooled))"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 2,
 42 |    "id": "2bd68608",
 43 |    "metadata": {},
 44 |    "outputs": [
 45 |     {
 46 |      "name": "stderr",
 47 |      "output_type": "stream",
 48 |      "text": [
 49 |       "/home/ubuntu/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
 50 |       "  from .autonotebook import tqdm as notebook_tqdm\n",
 51 |       "Fetching 24 files: 100%|██████████| 24/24 [00:10<00:00,  2.32it/s]\n"
 52 |      ]
 53 |     },
 54 |     {
 55 |      "data": {
 56 |       "text/plain": [
 57 |        "'/home/ubuntu/MasriSpeech-Full'"
 58 |       ]
 59 |      },
 60 |      "execution_count": 2,
 61 |      "metadata": {},
 62 |      "output_type": "execute_result"
 63 |     }
 64 |    ],
 65 |    "source": [
 66 |     "from huggingface_hub import snapshot_download\n",
 67 |     "\n",
 68 |     "snapshot_download(\n",
 69 |     "    repo_id=\"NightPrince/MasriSpeech-Full\", \n",
 70 |     "    repo_type=\"dataset\", local_dir=\"./MasriSpeech-Full\", allow_patterns=\"*/*.parquet\")"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 14,
 76 |    "id": "42c0df51",
 77 |    "metadata": {},
 78 |    "outputs": [
 79 |     {
 80 |      "data": {
 81 |       "text/plain": [
 82 |        "24"
 83 |       ]
 84 |      },
 85 |      "execution_count": 14,
 86 |      "metadata": {},
 87 |      "output_type": "execute_result"
 88 |     }
 89 |    ],
 90 |    "source": [
 91 |     "files = glob('MasriSpeech-Full/*/*.parquet')\n",
 92 |     "len(files)"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 15,
 98 |    "id": "7619151a",
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "def loop(files):\n",
103 |     "\n",
104 |     "    os.environ['OMP_NUM_THREADS'] = '1'\n",
105 |     "    os.environ['OPENBLAS_NUM_THREADS'] = '1'\n",
106 |     "    \n",
107 |     "    files, _ = files\n",
108 |     "\n",
109 |     "    data = []\n",
110 |     "    for f in files:\n",
111 |     "        base = f.split('/')[0] + '_audio'\n",
112 |     "        f_new = f.replace('/', '-').replace('.parquet', '')\n",
113 |     "        os.makedirs(base, exist_ok=True)\n",
114 |     "        df = pd.read_parquet(f)\n",
115 |     "        for i in tqdm(range(len(df))):\n",
116 |     "            try:\n",
117 |     "                t = df['transcription'].iloc[i].strip()\n",
118 |     "                if len(t) < 2:\n",
119 |     "                    continue\n",
120 |     "                audio_filename = f'{f_new}_{i}.mp3'\n",
121 |     "                audio_filename = os.path.join(base, audio_filename)\n",
122 |     "                b = df['audio'].iloc[i]['bytes']\n",
123 |     "                audio_np, sr = sf.read(io.BytesIO(b))\n",
124 |     "                if audio_np.ndim > 1:\n",
125 |     "                    audio_np = audio_np.mean(axis=1)\n",
126 |     "                if audio_np.shape[0] < 10000:\n",
127 |     "                    continue\n",
128 |     "                sf.write(audio_filename, audio_np, sr)\n",
129 |     "                \n",
130 |     "                data.append({\n",
131 |     "                    'audio_filename': audio_filename,\n",
132 |     "                    'text': t,\n",
133 |     "                    'speaker': f\"{base}\"\n",
134 |     "                })\n",
135 |     "            except Exception as e:\n",
136 |     "                pass\n",
137 |     "        \n",
138 |     "    return data"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 16,
144 |    "id": "a66ecfc4",
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "# data = loop((files[:1], 0))"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": null,
154 |    "id": "935c7e8c",
155 |    "metadata": {},
156 |    "outputs": [
157 |     {
158 |      "name": "stderr",
159 |      "output_type": "stream",
160 |      "text": [
161 |       "  7%|▋         | 145/2205 [00:18<03:34,  9.59it/s]"
162 |      ]
163 |     }
164 |    ],
165 |    "source": [
166 |     "data = multiprocessing(files, loop, cores = len(files))"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "id": "d5fb3e9a",
173 |    "metadata": {},
174 |    "outputs": [],
175 |    "source": []
176 |   }
177 |  ],
178 |  "metadata": {
179 |   "kernelspec": {
180 |    "display_name": "Python 3 (ipykernel)",
181 |    "language": "python",
182 |    "name": "python3"
183 |   },
184 |   "language_info": {
185 |    "codemirror_mode": {
186 |     "name": "ipython",
187 |     "version": 3
188 |    },
189 |    "file_extension": ".py",
190 |    "mimetype": "text/x-python",
191 |    "name": "python",
192 |    "nbconvert_exporter": "python",
193 |    "pygments_lexer": "ipython3",
194 |    "version": "3.10.12"
195 |   }
196 |  },
197 |  "nbformat": 4,
198 |  "nbformat_minor": 5
199 | }
200 | 


--------------------------------------------------------------------------------
/text/processing/function.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import mp
  3 | import json
  4 | import functools
  5 | import subprocess
  6 | import utils as ut
  7 | from glob import glob
  8 | from tqdm import tqdm
  9 | from pathlib import Path
 10 | from datasets import Dataset
 11 | from unidecode import unidecode
 12 | 
 13 | 
 14 | def download_dataset(link, raw_dataset_path, dataset_name):
 15 |     try:
 16 |         global MAIN_FOLDER_DATASET
 17 | 
 18 |         MAIN_FOLDER_DATASET = f"{raw_dataset_path}/raw-datasets/"
 19 |         ut.create_dir(MAIN_FOLDER_DATASET)
 20 | 
 21 |         command = f"wget {link} -O {MAIN_FOLDER_DATASET}/{dataset_name}.jsonl"
 22 |         ut.run_command(command)
 23 | 
 24 |         return True
 25 |     except:
 26 |         return False
 27 | 
 28 | 
 29 | def init_process(
 30 |     raw_dataset_path, dataset_name, text_key=None, link=None, clean_file_path=None
 31 | ):
 32 |     global INITIAL_PRE_PROCESSING_FOLDER
 33 |     global MAIN_FOLDER_DATASET
 34 | 
 35 |     txt_l = []
 36 | 
 37 |     if link != None:
 38 |         dd = download_dataset(link, raw_dataset_path, dataset_name)
 39 | 
 40 |         INITIAL_PRE_PROCESSING_FOLDER = f"{raw_dataset_path}/staging-datasets/"
 41 |         ut.create_dir(INITIAL_PRE_PROCESSING_FOLDER)
 42 | 
 43 |         with open(f"{MAIN_FOLDER_DATASET}/{dataset_name}.jsonl") as fopen:
 44 |             data = [json.loads(line) for line in fopen]
 45 | 
 46 |     if clean_file_path != None:
 47 |         MAIN_FOLDER_DATASET = clean_file_path
 48 | 
 49 |         INITIAL_PRE_PROCESSING_FOLDER = f"{raw_dataset_path}/staging-datasets/"
 50 |         ut.create_dir(INITIAL_PRE_PROCESSING_FOLDER)
 51 | 
 52 |         with open(clean_file_path) as fopen:
 53 |             data = [json.loads(line) for line in fopen]
 54 | 
 55 |     try:
 56 |         key_data = [key for key, _ in data[0].items()]
 57 |         print(f"Availble key -> {key_data}")
 58 |     except AttributeError:
 59 |         raise Exception(
 60 |             f"dataset not in standard list format, total record in the file -> {len(data)}."
 61 |         )
 62 | 
 63 |     suitable_key = [
 64 |         "p",
 65 |         "text",
 66 |         "article_text",
 67 |         "article_body",
 68 |         "text",
 69 |         "content",
 70 |         "contents",
 71 |         "body",
 72 |         "articleBody",
 73 |         "data",
 74 |         "title",
 75 |     ]
 76 | 
 77 |     if text_key:
 78 |         suitable_key = list(set(suitable_key + text_key))
 79 | 
 80 |     if not any(key in key_data for key in suitable_key):
 81 |         raise Exception(
 82 |             f"dataset not in standard key-value. must have ({' | '.join(suitable_key)})"
 83 |         )
 84 | 
 85 |     for i in tqdm(data):
 86 |         str_lst = []
 87 |         for key in i.keys():
 88 |             if key in suitable_key:
 89 |                 str_lst.append(str(i[key]))
 90 |             else:
 91 |                 continue
 92 | 
 93 |         if None in str_lst:
 94 |             str_lst = ["None" if v is None else v for v in str_lst]
 95 | 
 96 |         str_data = "\n\n".join(str_lst)
 97 |         txt_l.append({"text": f"{str_data}"})
 98 | 
 99 |     ut.write_to_json(txt_l, f"{INITIAL_PRE_PROCESSING_FOLDER}{dataset_name}.jsonl")
100 | 
101 | 
def second_process(raw_dataset_path, dataset_name):
    """Save the staged records as a Hugging Face dataset and run the dedup command.

    Reads ``<INITIAL_PRE_PROCESSING_FOLDER>/<dataset_name>.jsonl`` (the global
    must already be set, e.g. by ``init_process``), drops records that are
    ``None``, saves their ``text`` values under ``hf-datasets/raw-datasets/``
    and then invokes ``python3 -m text_dedup.minhash`` with its output pointed
    at ``hf-datasets/dedupe-datasets/``.

    Sets the module-level ``HF_FOLDER_RAW`` and ``HF_FOLDER_DEDUPE``.

    Args:
        raw_dataset_path: Root folder of the processing workspace.
        dataset_name: File stem of the staged file and name of the outputs.
    """
    global HF_FOLDER_RAW
    global HF_FOLDER_DEDUPE

    HF_FOLDER_RAW = f"{raw_dataset_path}/hf-datasets/raw-datasets/"
    HF_FOLDER_DEDUPE = f"{raw_dataset_path}/hf-datasets/dedupe-datasets/"

    for folder in (HF_FOLDER_RAW, HF_FOLDER_DEDUPE):
        ut.create_dir(folder)

    staged_path = f"{INITIAL_PRE_PROCESSING_FOLDER}/{dataset_name}.jsonl"
    with open(staged_path) as fopen:
        records = list(map(json.loads, fopen))

    print(f"total records: {len(records)}")

    # A JSON "null" line decodes to None; those records are discarded.
    records = list(filter(lambda entry: entry is not None, tqdm(records)))

    print(f"total records after remove None: {len(records)}")

    texts = [entry["text"] for entry in records]
    dataset = Dataset.from_dict({"text": texts})

    raw_output = f"{HF_FOLDER_RAW}{dataset_name}"
    dataset.save_to_disk(raw_output)

    # The continuation whitespace is part of the command string; leave as is.
    command = f"python3 -m text_dedup.minhash \
                --path {HF_FOLDER_RAW}{dataset_name} \
                --split train \
                --cache_dir ./cache \
                --output {HF_FOLDER_DEDUPE}{dataset_name} \
                --column text \
                --batch_size 10000 \
                --threshold 0.95 \
                --min_length 1 \
                --local"

    ut.run_command(command)
139 | 
140 | 
def third_process(raw_dataset_path, mp_core):
    """Run ``ut.loop`` over every ``.jsonl`` file in the dedupe folder.

    Creates ``hf-datasets/postprocessing/`` and
    ``hf-datasets/postprocessing-done/`` under ``raw_dataset_path``, then
    processes the files found in the module-level ``HF_FOLDER_DEDUPE`` (which
    must already be set, e.g. by ``second_process``).

    Args:
        raw_dataset_path: Root folder of the processing workspace.
        mp_core: Number of worker processes to use. When it is smaller than 1
            or larger than the number of files, the files are processed in
            the current process instead.
    """
    postprocessing_dir = f"{raw_dataset_path}/hf-datasets/postprocessing/"
    postprocessing_done_dir = f"{raw_dataset_path}/hf-datasets/postprocessing-done/"

    ut.create_dir(postprocessing_dir)
    ut.create_dir(postprocessing_done_dir)

    files_lst = glob(f"{HF_FOLDER_DEDUPE}*.jsonl")

    print(f"total files to postprocessing --> {len(files_lst)}")

    core = mp_core

    # For core >= 1 this matches the former `len(files_lst) // core == 0`
    # test; the extra guard avoids ZeroDivisionError for 0 and keeps a
    # negative worker count away from the pool.
    if core < 1 or len(files_lst) < core:
        ut.loop(files_lst, process_type="single")
    else:
        mp.multiprocessing(files_lst, ut.loop, cores=core, returned=False)
162 | 
163 | 
def get_size(raw_dataset_path, dataset_name):
    """Report file sizes before dedup, after dedup and after post-processing.

    The "before" size is taken from
    ``<MAIN_FOLDER_DATASET>/<dataset_name>.jsonl``; when that path cannot be
    stat'ed, ``MAIN_FOLDER_DATASET`` itself is measured instead (it holds the
    clean file path when ``init_process`` was given ``clean_file_path``).

    Relies on the module-level ``MAIN_FOLDER_DATASET`` and
    ``HF_FOLDER_DEDUPE`` being set.

    Args:
        raw_dataset_path: Root folder of the processing workspace.
        dataset_name: File stem of the dataset files.

    Returns:
        A 3-tuple of strings formatted as ``"<size> MB"`` with two decimals:
        (before dedup, after dedup, after post-processing).

    Raises:
        OSError: If the fallback, dedup or post-processing path is missing.
    """
    bytes_per_mb = 1024 * 1024

    before_dedup_url = f"{MAIN_FOLDER_DATASET}/{dataset_name}.jsonl"
    before_dedup_clean = f"{MAIN_FOLDER_DATASET}"
    after_dedup = f"{HF_FOLDER_DEDUPE}{dataset_name}.jsonl"
    after_post = f"{raw_dataset_path}/hf-datasets/postprocessing/{dataset_name}.jsonl"

    try:
        before_dedup_bytes = os.path.getsize(before_dedup_url)
    except OSError:
        # Only filesystem errors trigger the fallback to the clean-file path.
        before_dedup_bytes = os.path.getsize(before_dedup_clean)

    after_dedup_bytes = os.path.getsize(after_dedup)
    after_post_bytes = os.path.getsize(after_post)

    return (
        f"{before_dedup_bytes / bytes_per_mb:.2f} MB",
        f"{after_dedup_bytes / bytes_per_mb:.2f} MB",
        f"{after_post_bytes / bytes_per_mb:.2f} MB",
    )
183 | 


--------------------------------------------------------------------------------