├── text
├── hf-datasets
│ ├── raw-datasets
│ │ └── .gitkeep
│ ├── dedupe-datasets
│ │ └── .gitkeep
│ ├── postprocessing
│ │ └── .gitkeep
│ └── postprocessing-done
│ │ └── .gitkeep
├── pretrain-llm
│ ├── how-to-mosaic.png
│ ├── README.md
│ ├── prepare-madlad-400-ms.ipynb
│ ├── prepare-starcoder.ipynb
│ └── prepare-translation.ipynb
├── text_dedup
│ ├── __init__.py
│ └── utils
│ │ ├── preprocess.py
│ │ ├── __init__.py
│ │ ├── union_find.py
│ │ ├── tokenization.py
│ │ ├── timer.py
│ │ ├── analysis.py
│ │ └── hashfunc.py
├── processing
│ ├── text_dedup
│ │ ├── __init__.py
│ │ └── utils
│ │ │ ├── preprocess.py
│ │ │ ├── __init__.py
│ │ │ ├── union_find.py
│ │ │ ├── tokenization.py
│ │ │ ├── timer.py
│ │ │ ├── analysis.py
│ │ │ └── hashfunc.py
│ ├── README.md
│ ├── utils.py
│ ├── main.py
│ └── function.py
├── yi
│ └── README.md
├── mistral
│ ├── README.md
│ └── run-tokenizer.ipynb
├── tinyllama
│ └── README.md
├── llama
│ ├── README.md
│ ├── prepare-dataset-1024.ipynb
│ ├── prepare-dataset-2048.ipynb
│ └── prepare-tokenizer.ipynb
├── pretrain-clm
│ ├── README.md
│ └── from-pyarrow-to-mosaic.ipynb
├── extra
│ ├── process-lowyat.ipynb
│ ├── process-data.gov.my.ipynb
│ ├── sample-fineweb-edu.ipynb
│ └── process-snapshot.ipynb
├── README.md
├── compare-tokens.ipynb
├── .gitignore
└── madlad-400-ms
│ ├── prepare-madlad-400-ms.ipynb
│ ├── dedup-madlad-400-ms.ipynb
│ └── postprocess-madlad-400-ms.ipynb
├── multilingual-tts
├── prepare
│ ├── prepare-CORAA-MUPE-ASR.ipynb
│ ├── prepare-ParlaSpeech-CZ.ipynb
│ ├── prepare-ParlaSpeech-HR.ipynb
│ ├── prepare-ParlaSpeech-PL.ipynb
│ ├── prepare-WenetSpeech4TTS.ipynb
│ └── prepare-MasriSpeech-Full.ipynb
├── README.md
├── embedding.py
├── convert_neucodec.py
└── trim_silence.py
├── README.md
├── stt-whisper
├── .gitignore
├── README.md
├── force_alignment.py
└── audioset_sliding.py
├── malaysian-short-instructions
├── .gitignore
├── keyword-location
├── negeri
├── keywords
├── dedup-questions-intents.ipynb
└── dedup-questions.ipynb
├── speech-instructions
├── .gitignore
├── README.md
├── generate.sh
├── embedding.py
├── prepare-malaysian-podcast.ipynb
├── remote.sh
├── prepare-malaysian-others.ipynb
└── prepare-malaysia-parliament.ipynb
├── emotional-malaysian-emilia
├── README.md
├── pitch_estimation.py
├── audioset_sliding.py
└── audioset_sliding_v2.py
├── emilia-yodas
├── README.md
└── convert_neucodec_emilia.py
├── LICENSE
├── speech-instructions-extra
└── upload.ipynb
└── .gitignore
/text/hf-datasets/raw-datasets/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/text/hf-datasets/dedupe-datasets/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/text/hf-datasets/postprocessing/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/text/hf-datasets/postprocessing-done/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/multilingual-tts/prepare/prepare-CORAA-MUPE-ASR.ipynb:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/multilingual-tts/prepare/prepare-ParlaSpeech-CZ.ipynb:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/multilingual-tts/prepare/prepare-ParlaSpeech-HR.ipynb:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/multilingual-tts/prepare/prepare-ParlaSpeech-PL.ipynb:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Dataset
2 |
3 | Our recipes to prepare datasets.
--------------------------------------------------------------------------------
/stt-whisper/.gitignore:
--------------------------------------------------------------------------------
1 | *.json
2 | force_alignment
3 | *.parquet
--------------------------------------------------------------------------------
/malaysian-short-instructions/.gitignore:
--------------------------------------------------------------------------------
1 | generate-questions*
2 | generate-answers*
3 | *.json
4 | *.parquet
--------------------------------------------------------------------------------
/text/pretrain-llm/how-to-mosaic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/malaysia-ai/dataset/HEAD/text/pretrain-llm/how-to-mosaic.png
--------------------------------------------------------------------------------
/multilingual-tts/README.md:
--------------------------------------------------------------------------------
1 | # Multilingual-TTS
2 |
3 | Gathers multilingual TTS datasets; everything is pushed to https://huggingface.co/datasets/malaysia-ai/Multilingual-TTS.
--------------------------------------------------------------------------------
/malaysian-short-instructions/keyword-location:
--------------------------------------------------------------------------------
1 | food
2 | attraction
3 | lifestyle
4 | culture
5 | shopping mall
6 | agama
7 | socioeconomy
8 | peluang pekerjaan
9 | infrastructure
10 | education
11 | technology
12 | business
--------------------------------------------------------------------------------
/speech-instructions/.gitignore:
--------------------------------------------------------------------------------
1 | *.parquet
2 | embedding*
3 | *.json
4 | *.jsonl
5 | dedup-parliament
6 | dedup-podcasts
7 | dedup-others
8 | partition-instructions-part*
9 | tatabahasa*
10 | mallm*
11 | short-coding-*
12 | malaymmlu*
--------------------------------------------------------------------------------
/text/text_dedup/__init__.py:
--------------------------------------------------------------------------------
1 | """Text deduplication simplified."""
2 |
3 | import logging
4 |
5 | from rich.logging import RichHandler
6 |
# Package-level logger for text_dedup: INFO level, rendered through rich's
# handler, and with propagation disabled so records are not handed on to the
# handlers of ancestor loggers.
logger = logging.getLogger("text_dedup")
logger.propagate = False
logger.addHandler(RichHandler(rich_tracebacks=True))
logger.setLevel(logging.INFO)
--------------------------------------------------------------------------------
/text/processing/text_dedup/__init__.py:
--------------------------------------------------------------------------------
1 | """Text deduplication simplified."""
2 |
3 | import logging
4 |
5 | from rich.logging import RichHandler
6 |
# Package-level logger for text_dedup: INFO level, rendered through rich's
# handler, and with propagation disabled so records are not handed on to the
# handlers of ancestor loggers.
logger = logging.getLogger("text_dedup")
logger.propagate = False
logger.addHandler(RichHandler(rich_tracebacks=True))
logger.setLevel(logging.INFO)
--------------------------------------------------------------------------------
/malaysian-short-instructions/negeri:
--------------------------------------------------------------------------------
1 | negeri johor
2 | negeri kedah
3 | negeri kelantan
4 | negeri melaka
5 | negeri negeri sembilan
6 | negeri pahang
7 | negeri perak
8 | negeri perlis
9 | negeri pulau pinang
10 | negeri selangor
11 | negeri terengganu
12 | negeri sabah
13 | negeri sarawak
14 | kuala lumpur
15 | negeri labuan
16 | putrajaya
--------------------------------------------------------------------------------
/emotional-malaysian-emilia/README.md:
--------------------------------------------------------------------------------
1 | # Emotional Malaysian Emilia
2 |
3 | Synthetic emotion labels for Malaysian Emilia.
4 |
5 | ## how to
6 |
7 | ### Predict Audioset sliding window
8 |
9 | ```bash
10 | CUDA_VISIBLE_DEVICES=0 \
11 | python3 audioset_sliding_v2.py --path 'malaysian-podcast_processed/**/*.mp3' --global-index 1 --local-index 0
12 | ```
13 |
14 | ### Predict Emotion
--------------------------------------------------------------------------------
/emilia-yodas/README.md:
--------------------------------------------------------------------------------
1 | ## Convert to audio tokens
2 |
3 | ```bash
4 | OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 \
5 | python3 convert_neucodec_batch.py --file 'emilia-audio.json' --replication 2
6 | ```
7 |
8 | But we prefer to use [convert_neucodec_emilia.py](convert_neucodec_emilia.py) on a GH200,
9 |
10 | ```bash
11 | OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 python3 convert_neucodec_emilia.py --file 'emilia-audio.json' --replication 13
12 | ```
13 |
14 | Way faster!
--------------------------------------------------------------------------------
/text/yi/README.md:
--------------------------------------------------------------------------------
1 | # Prepare dataset for Yi FPF
2 |
3 | This step prepares the dataset for FPF Yi models.
4 |
5 | ## how-to
6 |
7 | 1. Run [combine-dataset.ipynb](combine-dataset.ipynb),
8 |
9 | This will combine most datasets into 1 JSONL file.
10 |
11 | - 41 GB.
12 |
13 | 2. Run [convert-mosaic.ipynb](prepare-tokenizer.ipynb),
14 |
15 | This will tokenize the dataset and convert it into Mosaic format.
16 |
17 | 3. Run [combine-mosaic-all.ipynb](combine-mosaic-all.ipynb),
18 |
19 | This will combine all mosaic partitions into one mosaic folder, total 14114934784 tokens.
--------------------------------------------------------------------------------
/text/mistral/README.md:
--------------------------------------------------------------------------------
1 | # Prepare dataset for Mistral FPF
2 |
3 | This step prepares the dataset for the FPF Mistral model.
4 |
5 | ## how-to
6 |
7 | 1. Run [mistral/combine-mistral.ipynb](mistral/combine-mistral.ipynb),
8 |
9 | This will combine most datasets into 1 JSONL file.
10 |
11 | - 32.6 GB.
12 |
13 | 2. Run [prepare-tokenizer.ipynb](prepare-tokenizer.ipynb),
14 |
15 | This will tokenize and cache the dataset.
16 |
17 | 3. Run [prepare-dataset-4096.ipynb](prepare-dataset-4096.ipynb),
18 |
19 | This will partition the tokenized dataset into 4096 context length.
--------------------------------------------------------------------------------
/text/tinyllama/README.md:
--------------------------------------------------------------------------------
1 | # Prepare dataset for TinyLlama FPF
2 |
3 | This step prepares the dataset for FPF TinyLlama models.
4 |
5 | ## how-to
6 |
7 | 1. Run [combine-dataset.ipynb](combine-dataset.ipynb),
8 |
9 | This will combine most datasets into 1 JSONL file.
10 |
11 | - 41 GB.
12 |
13 | 2. Run [convert-mosaic.ipynb](prepare-tokenizer.ipynb),
14 |
15 | This will tokenize the dataset and convert it into Mosaic format.
16 |
17 | 3. Run [combine-mosaic-all.ipynb](combine-mosaic-all.ipynb),
18 |
19 | This will combine all mosaic partitions into one mosaic folder, total 14349328384 tokens.
--------------------------------------------------------------------------------
/text/llama/README.md:
--------------------------------------------------------------------------------
1 | # Prepare dataset for Llama2 FPF
2 |
3 | This step prepares the dataset for FPF Llama2 models.
4 |
5 | ## how-to
6 |
7 | 1. Run [combine-v2.ipynb](combine-v2.ipynb),
8 |
9 | This will combine most datasets into 1 JSONL file.
10 |
11 | - 31.4 GB.
12 |
13 | 2. Run [prepare-tokenizer.ipynb](prepare-tokenizer.ipynb),
14 |
15 | This will tokenize and cache the dataset.
16 |
17 | 3. Run [prepare-dataset-2048.ipynb](prepare-dataset-2048.ipynb),
18 |
19 | This will partition the tokenized dataset into 2048 context length.
20 |
21 | 4. Run [prepare-dataset-32768.ipynb](prepare-dataset-32768.ipynb),
22 |
23 | This will partition the tokenized dataset into 32768 context length.
--------------------------------------------------------------------------------
/text/pretrain-clm/README.md:
--------------------------------------------------------------------------------
1 | # Pretrain CLM
2 |
3 | This is to pretrain 100M - 500M parameters CLM. All steps done using Standard_F48s_v2 node size.
4 |
5 | This step prepares the dataset for pretraining models from scratch.
6 |
7 | ## how-to
8 |
9 | 1. Run [pretrain/combine-lm.ipynb](pretrain/combine-lm.ipynb),
10 |
11 | This will combine all datasets into 1 JSONL file.
12 |
13 | - 81 GB.
14 | - 16994238464 tokens.
15 |
16 | 2. Run [pretrain/tokenizer-4096.ipynb](pretrain/tokenizer-4096.ipynb),
17 |
18 | This will tokenize the dataset and partition the tokenized dataset into 4096 context length.
19 |
20 | 3. Run [pretrain/from-pyarrow-to-mosaic.ipynb](pretrain/from-pyarrow-to-mosaic.ipynb),
21 |
22 | This will convert PyArrow streaming format into MosaicML streaming format.
23 |
24 | 4. Run [pretrain/combine-mosaicml.ipynb](pretrain/combine-mosaicml.ipynb),
25 |
26 | This will combine multiple MosaicML streaming folders into 1 folder.
--------------------------------------------------------------------------------
/malaysian-short-instructions/keywords:
--------------------------------------------------------------------------------
1 | react js
2 | vue js
3 | vanilla javascript
4 | websocket
5 | node js
6 | svelte
7 | next js
8 | express js
9 | angular js
10 | jquery
11 | d3 js
12 | python matplotlib
13 | python pandas
14 | python dask
15 | python scipy
16 | python numpy
17 | python keras
18 | python flask
19 | python fastapi
20 | python request
21 | python async
22 | python scikit learn
23 | python dask
24 | python distributed system
25 | pytorch
26 | pyspark
27 | apache spark
28 | apache hadoop
29 | apache hive
30 | apache kafka
31 | apache yarn
32 | apache flink
33 | apache cassandra
34 | apache airflow
35 | apache druid
36 | c++
37 | java
38 | rust
39 | kotlin
40 | swift
41 | cuda
42 | go
43 | go distributed system
44 | kubernetes
45 | bash
46 | docker
47 | dockerfile
48 | nginx
49 | tcp
50 | postgresql
51 | mysql
52 | oracle db
53 | elasticsearch
54 | nosql
55 | clickhouse
56 | terraform
57 | fortran
58 | slurm
59 | openmpi
--------------------------------------------------------------------------------
/text/text_dedup/utils/preprocess.py:
--------------------------------------------------------------------------------
1 | import regex as re
2 |
3 | DIGIT_RE = re.compile(r"\d")
4 | PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(r"[\p{P}\p{C}\p{S}]+")
5 |
6 |
def normalize(line: str) -> str:
    """
    Canonicalise a single line of text.

    Adapted from cc_net:
    https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/text_normalizer.py#L180

    Surrounding whitespace is stripped, the text is lower-cased, every digit
    is replaced by ``0`` and every run of characters matched by
    ``PUNCT_OR_NON_PRINTING_CHARS_RE`` is removed.

    Parameters
    ----------
    line : str
        The line of text to normalize.

    Returns
    -------
    str
        The normalized line; an empty string if nothing is left after
        stripping whitespace.

    Examples
    --------
    >>> normalize("Hello, world!")
    'hello world'
    >>> normalize("Hello, 123!\\n\\t\\b")
    'hello 000'
    """
    stripped = line.strip()
    if stripped:
        lowered = stripped.lower()
        digits_masked = DIGIT_RE.sub("0", lowered)
        return PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", digits_masked)
    # Nothing but whitespace: hand back the (empty) stripped string.
    return stripped
--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/preprocess.py:
--------------------------------------------------------------------------------
1 | import regex as re
2 |
3 | DIGIT_RE = re.compile(r"\d")
4 | PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(r"[\p{P}\p{C}\p{S}]+")
5 |
6 |
def normalize(line: str) -> str:
    """
    Canonicalise a single line of text.

    Adapted from cc_net:
    https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/text_normalizer.py#L180

    Surrounding whitespace is stripped, the text is lower-cased, every digit
    is replaced by ``0`` and every run of characters matched by
    ``PUNCT_OR_NON_PRINTING_CHARS_RE`` is removed.

    Parameters
    ----------
    line : str
        The line of text to normalize.

    Returns
    -------
    str
        The normalized line; an empty string if nothing is left after
        stripping whitespace.

    Examples
    --------
    >>> normalize("Hello, world!")
    'hello world'
    >>> normalize("Hello, 123!\\n\\t\\b")
    'hello 000'
    """
    stripped = line.strip()
    if stripped:
        lowered = stripped.lower()
        digits_masked = DIGIT_RE.sub("0", lowered)
        return PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", digits_masked)
    # Nothing but whitespace: hand back the (empty) stripped string.
    return stripped
--------------------------------------------------------------------------------
/text/text_dedup/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from text_dedup.utils.add_args import add_bloom_filter_args
2 | from text_dedup.utils.add_args import add_exact_hash_args
3 | from text_dedup.utils.add_args import add_io_args
4 | from text_dedup.utils.add_args import add_meta_args
5 | from text_dedup.utils.add_args import add_minhash_args
6 | from text_dedup.utils.add_args import add_sa_args
7 | from text_dedup.utils.add_args import add_simhash_args
8 | from text_dedup.utils.hashfunc import sha1_hash
9 | from text_dedup.utils.hashfunc import xxh3_hash
10 | from text_dedup.utils.timer import Timer
11 | from text_dedup.utils.tokenization import ngrams
12 | from text_dedup.utils.union_find import UnionFind
13 |
14 | __all__ = [
15 | "add_bloom_filter_args",
16 | "add_exact_hash_args",
17 | "add_io_args",
18 | "add_meta_args",
19 | "add_minhash_args",
20 | "add_sa_args",
21 | "add_simhash_args",
22 | "Timer",
23 | "ngrams",
24 | "UnionFind",
25 | "sha1_hash",
26 | "xxh3_hash",
27 | ]
--------------------------------------------------------------------------------
/text/text_dedup/utils/union_find.py:
--------------------------------------------------------------------------------
class UnionFind:
    """
    A data structure for maintaining disjoint sets. This helps build connected components for given duplicate pairs.

    Elements are registered lazily the first time they are passed to
    ``find`` or ``union``. The representative of a set is always its
    smallest member, because ``union`` links both roots to ``min(px, py)``.

    Examples
    --------
    >>> uf = UnionFind()
    >>> uf.union(1, 2)
    >>> uf.union(2, 3)
    >>> uf.union(4, 5)
    >>> uf.find(1)
    1
    >>> uf.find(2)
    1
    >>> uf.find(3)
    1
    >>> uf.find(4)
    4
    >>> uf.find(5)
    4
    """

    def __init__(self):
        # Maps each known element to its parent; a root maps to itself.
        self.parent = {}

    def find(self, x):
        """
        Return the representative (root) of the set containing ``x``.

        An unseen ``x`` is registered as its own singleton set. The lookup is
        iterative so that long parent chains cannot exhaust the interpreter's
        recursion limit.
        """
        parent = self.parent
        if x not in parent:
            parent[x] = x
            return x

        # First pass: walk up to the root.
        root = x
        while parent[root] != root:
            root = parent[root]

        # Second pass: path compression, pointing every node on the walked
        # path directly at the root.
        while parent[x] != root:
            parent[x], x = root, parent[x]

        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y``; the smaller root wins."""
        px = self.find(x)
        py = self.find(y)
        self.parent[px] = self.parent[py] = min(px, py)
--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from text_dedup.utils.add_args import add_bloom_filter_args
2 | from text_dedup.utils.add_args import add_exact_hash_args
3 | from text_dedup.utils.add_args import add_io_args
4 | from text_dedup.utils.add_args import add_meta_args
5 | from text_dedup.utils.add_args import add_minhash_args
6 | from text_dedup.utils.add_args import add_sa_args
7 | from text_dedup.utils.add_args import add_simhash_args
8 | from text_dedup.utils.hashfunc import sha1_hash
9 | from text_dedup.utils.hashfunc import xxh3_hash
10 | from text_dedup.utils.timer import Timer
11 | from text_dedup.utils.tokenization import ngrams
12 | from text_dedup.utils.union_find import UnionFind
13 |
14 | __all__ = [
15 | "add_bloom_filter_args",
16 | "add_exact_hash_args",
17 | "add_io_args",
18 | "add_meta_args",
19 | "add_minhash_args",
20 | "add_sa_args",
21 | "add_simhash_args",
22 | "Timer",
23 | "ngrams",
24 | "UnionFind",
25 | "sha1_hash",
26 | "xxh3_hash",
27 | ]
--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/union_find.py:
--------------------------------------------------------------------------------
class UnionFind:
    """
    A data structure for maintaining disjoint sets. This helps build connected components for given duplicate pairs.

    Elements are registered lazily the first time they are passed to
    ``find`` or ``union``. The representative of a set is always its
    smallest member, because ``union`` links both roots to ``min(px, py)``.

    Examples
    --------
    >>> uf = UnionFind()
    >>> uf.union(1, 2)
    >>> uf.union(2, 3)
    >>> uf.union(4, 5)
    >>> uf.find(1)
    1
    >>> uf.find(2)
    1
    >>> uf.find(3)
    1
    >>> uf.find(4)
    4
    >>> uf.find(5)
    4
    """

    def __init__(self):
        # Maps each known element to its parent; a root maps to itself.
        self.parent = {}

    def find(self, x):
        """
        Return the representative (root) of the set containing ``x``.

        An unseen ``x`` is registered as its own singleton set. The lookup is
        iterative so that long parent chains cannot exhaust the interpreter's
        recursion limit.
        """
        parent = self.parent
        if x not in parent:
            parent[x] = x
            return x

        # First pass: walk up to the root.
        root = x
        while parent[root] != root:
            root = parent[root]

        # Second pass: path compression, pointing every node on the walked
        # path directly at the root.
        while parent[x] != root:
            parent[x], x = root, parent[x]

        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y``; the smaller root wins."""
        px = self.find(x)
        py = self.find(y)
        self.parent[px] = self.parent[py] = min(px, py)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Malaysia-AI
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/text/text_dedup/utils/tokenization.py:
--------------------------------------------------------------------------------
1 | from itertools import tee
2 | from typing import List
3 | from typing import Text
4 |
5 |
def ngrams(sequence: List[Text], n: int, min_length: int = 5):
    """
    Produce the n-grams of a sequence of items.

    This is a modified version of nltk.util.ngrams.

    Parameters
    ----------
    sequence : List[Text]
        The sequence of items.
    n : int
        The length of each ngram.
    min_length : int, optional
        Sequences shorter than this yield no ngrams at all, by default 5

    Returns
    -------
    iterable
        An empty list when the sequence is shorter than ``min_length``, a
        one-element list holding the whole sequence as a tuple when it is
        shorter than ``n``, otherwise a ``zip`` iterator over the ngrams.

    Examples
    --------
    >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=1))
    [('a', 'b'), ('b', 'c'), ('c', 'd')]
    >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=5))
    []
    >>> list(ngrams(["a", "b"], 3, min_length=1))
    [('a', 'b')]
    """
    size = len(sequence)
    if size < min_length:
        return []
    if size < n:
        return [tuple(sequence)]

    # n independent iterators over the same items; the k-th one is advanced
    # by k positions so that zipping them yields sliding windows.
    windows = tee(iter(sequence), n)
    for offset, window in enumerate(windows):
        skipped = 0
        while skipped < offset:
            next(window, None)
            skipped += 1
    return zip(*windows)
--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/tokenization.py:
--------------------------------------------------------------------------------
1 | from itertools import tee
2 | from typing import List
3 | from typing import Text
4 |
5 |
def ngrams(sequence: List[Text], n: int, min_length: int = 5):
    """
    Produce the n-grams of a sequence of items.

    This is a modified version of nltk.util.ngrams.

    Parameters
    ----------
    sequence : List[Text]
        The sequence of items.
    n : int
        The length of each ngram.
    min_length : int, optional
        Sequences shorter than this yield no ngrams at all, by default 5

    Returns
    -------
    iterable
        An empty list when the sequence is shorter than ``min_length``, a
        one-element list holding the whole sequence as a tuple when it is
        shorter than ``n``, otherwise a ``zip`` iterator over the ngrams.

    Examples
    --------
    >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=1))
    [('a', 'b'), ('b', 'c'), ('c', 'd')]
    >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=5))
    []
    >>> list(ngrams(["a", "b"], 3, min_length=1))
    [('a', 'b')]
    """
    size = len(sequence)
    if size < min_length:
        return []
    if size < n:
        return [tuple(sequence)]

    # n independent iterators over the same items; the k-th one is advanced
    # by k positions so that zipping them yields sliding windows.
    windows = tee(iter(sequence), n)
    for offset, window in enumerate(windows):
        skipped = 0
        while skipped < offset:
            next(window, None)
            skipped += 1
    return zip(*windows)
--------------------------------------------------------------------------------
/text/pretrain-llm/README.md:
--------------------------------------------------------------------------------
1 | # Pretrain LLM
2 |
3 | This is to pretrain 1B - 13B parameters LLM. All steps done using Standard_F48s_v2 node size.
4 |
5 | ## how-to not use HuggingFace datasets
6 |
7 | HuggingFace datasets streams memory-mapped files and then concatenates them (see https://github.com/huggingface/datasets/blob/60bdf3005d1dc0b26da8e5949721b20d932eaad6/src/datasets/table.py#L51), which is extremely slow. If you are wondering whether the script is stuck: yes, it is waiting for PyArrow streaming.
8 |
9 | So we tried our own approach,
10 |
11 |
12 |
13 | https://drive.google.com/file/d/1dSQ7KQs_x7aCTNVXgMESIqTwEoAZt-OK/view?usp=sharing
14 |
15 | 1. Split JSONL file into smaller JSONL files.
16 | 2. Each smaller JSONL files run in multiprocessing to convert into Mosaic format.
17 | 3. Merge smaller Mosaic files into one Mosaic file.
18 |
19 | ## total tokens
20 |
21 | 1. [prepare-dedup-text-dataset-4096.ipynb](prepare-dedup-text-dataset-4096.ipynb), 31702310912
22 | 2. [prepare-starcoder-4096.ipynb](prepare-starcoder-4096.ipynb), 40981254144
23 | 3. [prepare-madlad-400-4096.ipynb](prepare-madlad-400-4096.ipynb), 14983720960
24 | 4. [prepare-instructions.ipynb](prepare-instructions.ipynb), 1577877504
25 | 5. [prepare-extra.ipynb](prepare-extra.ipynb), 1140461568
26 |
27 | Total, 90B tokens, we uploaded the dataset at https://huggingface.co/datasets/malaysia-ai/mosaic-combine-all, so you can use it directly with https://docs.mosaicml.com/projects/streaming/en/latest/index.html
--------------------------------------------------------------------------------
/speech-instructions/README.md:
--------------------------------------------------------------------------------
1 | # Speech Instructions
2 |
3 | ## how to prepare
4 |
5 | ### 1. Speaker dedup
6 |
7 | 1. Prepare dataset to dedup,
8 |
9 | - [prepare-malaysia-parliament.ipynb](prepare-malaysia-parliament.ipynb).
10 | - [prepare-malaysian-podcast.ipynb](prepare-malaysian-podcast.ipynb).
11 | - [prepare-malaysian-others.ipynb](prepare-malaysian-others.ipynb).
12 |
13 | 2. Convert to embedding,
14 |
15 | We use speaker embedding from https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_large
16 |
17 | ```bash
18 | CUDA_VISIBLE_DEVICES=1,2 \
19 | python3.10 embedding.py \
20 | --filename filtered-politicians.parquet \
21 | --replication 3
22 |
23 | CUDA_VISIBLE_DEVICES=1,2 \
24 | python3.10 embedding.py \
25 | --filename filtered-podcast.parquet \
26 | --replication 3 --folder embedding-podcast
27 |
28 | CUDA_VISIBLE_DEVICES=0,2 \
29 | python3.10 embedding.py \
30 | --filename filtered-others.parquet \
31 | --replication 3 --folder embedding-others
32 | ```
33 |
34 | 3. Merge and dedup,
35 |
36 | - [dedup-parliament.ipynb](dedup-parliament.ipynb).
37 | - [dedup-podcasts.ipynb](dedup-podcasts.ipynb).
38 |
39 | ### 2. Populate instructions
40 |
41 | All datasets from https://huggingface.co/collections/mesolitica/malaysian-synthetic-dataset-656c2673fe7fe0b1e9e25fe2, and follow [filter-instructions.ipynb](filter-instructions.ipynb).
42 |
43 | ### 3. Generate synthetic voice
44 |
45 | ```bash
46 | bash generate.sh
47 | ```
48 |
49 | **Modify it appropriately based on your local GPUs**.
--------------------------------------------------------------------------------
/speech-instructions/generate.sh:
--------------------------------------------------------------------------------
# Launch the generate.py workers inside detached GNU screen sessions.
# Each loop below starts four sessions (i = 0..3) for one input file. Any
# existing session with the same name is asked to quit first; its error output
# is discarded in case no such session exists.
# NOTE(review): --global_index 4 / --index $i presumably split the input into
# four shards, one per session -- confirm against generate.py.

# partition-instructions-part-7, pinned to GPU 0.
for i in {0..3}; do
screen -S "partition-instructions-part-7_$i" -X quit 2>/dev/null
screen -dmS "partition-instructions-part-7_$i" bash -c "cd /home/husein/ssd3/dataset/speech-instructions && \
CUDA_VISIBLE_DEVICES=0 \
python3.10 generate.py \
--input_file \"partition-instructions-part-7.json\" \
--folder \"partition-instructions-part-7\" \
--global_index 4 \
--index $i"
done

# partition-instructions-part-15, pinned to GPU 2.
for i in {0..3}; do
screen -S "partition-instructions-part-15_$i" -X quit 2>/dev/null
screen -dmS "partition-instructions-part-15_$i" bash -c "cd /home/husein/ssd3/dataset/speech-instructions && \
CUDA_VISIBLE_DEVICES=2 \
python3.10 generate.py \
--input_file \"partition-instructions-part-15.json\" \
--folder \"partition-instructions-part-15\" \
--global_index 4 \
--index $i"
done

# tatabahasa (output folder tatabahasa-v2), pinned to GPU 2, with extra
# --threshold / --maxlen / --retry options passed through to generate.py.
for i in {0..3}; do
screen -S "tatabahasa_$i" -X quit 2>/dev/null
screen -dmS "tatabahasa_$i" bash -c "cd /home/husein/ssd3/dataset/speech-instructions && \
CUDA_VISIBLE_DEVICES=2 \
python3.10 generate.py \
--input_file \"tatabahasa.json\" \
--folder \"tatabahasa-v2\" \
--global_index 4 \
--index $i --threshold -9 --maxlen 300 --retry 10"
done

# malaymmlu, pinned to GPU 2, with the same extra options as tatabahasa.
for i in {0..3}; do
screen -S "malaymmlu_$i" -X quit 2>/dev/null
screen -dmS "malaymmlu_$i" bash -c "cd /home/husein/ssd3/dataset/speech-instructions && \
CUDA_VISIBLE_DEVICES=2 \
python3.10 generate.py \
--input_file \"malaymmlu.json\" \
--folder \"malaymmlu\" \
--global_index 4 \
--index $i --threshold -9 --maxlen 300 --retry 10"
done
--------------------------------------------------------------------------------
/text/text_dedup/utils/timer.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 |
class TimerContext:
    """
    Context manager that stores the duration of its ``with`` body on a timer.

    On a clean exit the elapsed time in seconds is written to
    ``timer.elapsed_times[name]``. If the body raises, nothing is recorded
    and the exception propagates unchanged.

    Parameters
    ----------
    timer : Timer
        The object whose ``elapsed_times`` mapping receives the measurement.
    name : str
        The key under which the elapsed time is stored.
    """

    def __init__(self, timer: "Timer", name: str):
        self.timer = timer
        self.name = name
        self.start_time = None

    def __enter__(self):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed (or turn negative) when the system clock is adjusted.
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is not None:
            # Returning a false value lets the interpreter re-raise the
            # original exception itself; no manual ``raise`` is needed.
            return False
        self.timer.elapsed_times[self.name] = time.perf_counter() - self.start_time
        return False
17 |
18 |
class Timer:
    """
    Registry of elapsed times, one entry per named ``with`` block.

    Calling the timer with a name returns a context manager; measurements
    end up in the ``elapsed_times`` dictionary under that name.

    Examples
    --------
    >>> t = Timer()
    >>> with t("test"):
    ...     time.sleep(1)
    >>> assert int(t.elapsed_times.get("test", 0)) >= 1, "The elapsed time should be 1 second."
    """

    def __init__(self):
        # name -> elapsed seconds, filled in by the contexts this timer hands out
        self.elapsed_times = {}

    def __call__(self, name: str) -> TimerContext:
        """
        Build a timing context bound to this timer.

        Parameters
        ----------
        name: str
            The name of the context.

        Returns
        -------
        TimerContext
            The context.

        Examples
        --------
        >>> t = Timer()
        >>> with t("test"):
        ...     time.sleep(1)
        >>> assert int(t.elapsed_times.get("test", 0)) == 1, "The elapsed time should be 1 second."
        >>> with t("test2"):
        ...     time.sleep(2)
        >>> assert int(t.elapsed_times.get("test2", 0)) == 2, "The elapsed time should be 2 seconds."
        """
        context = TimerContext(self, name)
        return context
--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/timer.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 |
class TimerContext:
    """
    Context manager that stores the duration of its ``with`` body on a timer.

    On a clean exit the elapsed time in seconds is written to
    ``timer.elapsed_times[name]``. If the body raises, nothing is recorded
    and the exception propagates unchanged.

    Parameters
    ----------
    timer : Timer
        The object whose ``elapsed_times`` mapping receives the measurement.
    name : str
        The key under which the elapsed time is stored.
    """

    def __init__(self, timer: "Timer", name: str):
        self.timer = timer
        self.name = name
        self.start_time = None

    def __enter__(self):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed (or turn negative) when the system clock is adjusted.
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is not None:
            # Returning a false value lets the interpreter re-raise the
            # original exception itself; no manual ``raise`` is needed.
            return False
        self.timer.elapsed_times[self.name] = time.perf_counter() - self.start_time
        return False
17 |
18 |
class Timer:
    """
    Registry of elapsed times for named ``with`` blocks.

    Calling the instance with a name yields a ``TimerContext`` bound to this
    timer. Measurements end up in ``elapsed_times`` keyed by that name.

    Examples
    --------
    >>> t = Timer()
    >>> with t("test"):
    ...     time.sleep(1)
    >>> assert int(t.elapsed_times.get("test", 0)) >= 1, "The elapsed time should be 1 second."
    """

    def __init__(self):
        # Maps context name -> elapsed seconds; filled in by TimerContext.
        self.elapsed_times = dict()

    def __call__(self, name: str) -> TimerContext:
        """
        Build a timing context for ``name``.

        Parameters
        ----------
        name: str
            The key under which the elapsed time will be stored.

        Returns
        -------
        TimerContext
            A context manager bound to this timer and ``name``.

        Examples
        --------
        >>> t = Timer()
        >>> with t("test"):
        ...     time.sleep(1)
        >>> assert int(t.elapsed_times.get("test", 0)) >= 1, "The elapsed time should be 1 second."
        >>> with t("test2"):
        ...     time.sleep(2)
        >>> assert int(t.elapsed_times.get("test2", 0)) >= 2, "The elapsed time should be 2 seconds."
        """
        context = TimerContext(self, name)
        return context
--------------------------------------------------------------------------------
/stt-whisper/README.md:
--------------------------------------------------------------------------------
1 | # STT Whisper
2 |
3 | 1. We provide segment-level and word-level timestamps for:
4 | - [Malaysian Emilia Dialects](https://huggingface.co/datasets/mesolitica/Malaysian-Emilia#malaysian-dialect).
5 | - [Speech Instructions](https://huggingface.co/datasets/malaysia-ai/Speech-Instructions).
6 | 2. Synthetic merging of different contexts, [synthetic-context-switching-word-timestamp.ipynb](synthetic-context-switching-word-timestamp.ipynb).
7 |
8 | ## Sliding AudioSet
9 |
10 | ```bash
11 | CUDA_VISIBLE_DEVICES=0 \
12 | python3 audioset_sliding.py --file 'prepared-pseudolabel.jsonl' --global-index 4 --local-index 0
13 | CUDA_VISIBLE_DEVICES=1 \
14 | python3 audioset_sliding.py --file 'prepared-pseudolabel.jsonl' --global-index 4 --local-index 1
15 | CUDA_VISIBLE_DEVICES=2 \
16 | python3 audioset_sliding.py --file 'prepared-pseudolabel.jsonl' --global-index 4 --local-index 2
17 | CUDA_VISIBLE_DEVICES=3 \
18 | python3 audioset_sliding.py --file 'prepared-pseudolabel.jsonl' --global-index 4 --local-index 3
19 | ```
20 |
21 | ## Speech Instructions
22 |
23 | 1. Run force alignment,
24 |
25 | ```bash
26 | CUDA_VISIBLE_DEVICES=2 \
27 | python3.10 force_alignment.py \
28 | --filename 'prepare-force-alignment.json' \
29 | --language 'ms' \
30 | --replication 3
31 | ```
32 |
33 | 2. Prepare dataset,
34 |
35 | - Segment level, [speech-instructions-segment-timestamps.ipynb](speech-instructions-segment-timestamps.ipynb).
36 | - Word level, [speech-instructions-word-timestamps.ipynb](speech-instructions-word-timestamps.ipynb).
37 |
38 | ## Malaysian Emilia Dialects
39 |
40 | 1. Prepare dataset,
41 |
42 | Force alignment has already been calculated at [mesolitica/Malaysian-Emilia-annotated/dialects_processed_alignment.zip](https://huggingface.co/datasets/mesolitica/Malaysian-Emilia-annotated/blob/main/dialects_processed_alignment.zip), so only the dataset preparation below is needed.
43 |
44 | - Segment level, [dialects-segment-timestamps.ipynb](dialects-segment-timestamps.ipynb).
45 | - Word level, [dialects-word-timestamps.ipynb](dialects-word-timestamps.ipynb).
--------------------------------------------------------------------------------
/text/extra/process-lowyat.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "47588232",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# !git clone https://huggingface.co/datasets/mesolitica/crawl-lowyat"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "id": "44f963ff",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from glob import glob\n",
21 | "import json"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 5,
27 | "id": "43c7cfc0",
28 | "metadata": {},
29 | "outputs": [
30 | {
31 | "data": {
32 | "text/plain": [
33 | "63"
34 | ]
35 | },
36 | "execution_count": 5,
37 | "metadata": {},
38 | "output_type": "execute_result"
39 | }
40 | ],
41 | "source": [
42 | "files = glob('crawl-lowyat/*.json')\n",
43 | "files = [f for f in files if '-topics' not in f]\n",
44 | "len(files)"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 12,
50 | "id": "2a694ba8",
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "with open('hf-datasets/raw-datasets/lowyat.jsonl', 'w') as fopen_l:\n",
55 | " for f in files:\n",
56 | " with open(f) as fopen:\n",
57 | " data = json.load(fopen)\n",
58 | " for d in data:\n",
59 | " fopen_l.write(f'{json.dumps(d)}\\n')"
60 | ]
61 | }
62 | ],
63 | "metadata": {
64 | "kernelspec": {
65 | "display_name": "Python 3 (ipykernel)",
66 | "language": "python",
67 | "name": "python3"
68 | },
69 | "language_info": {
70 | "codemirror_mode": {
71 | "name": "ipython",
72 | "version": 3
73 | },
74 | "file_extension": ".py",
75 | "mimetype": "text/x-python",
76 | "name": "python",
77 | "nbconvert_exporter": "python",
78 | "pygments_lexer": "ipython3",
79 | "version": "3.10.12"
80 | }
81 | },
82 | "nbformat": 4,
83 | "nbformat_minor": 5
84 | }
85 |
--------------------------------------------------------------------------------
/speech-instructions/embedding.py:
--------------------------------------------------------------------------------
1 | import click
2 | import json
3 | import pandas as pd
4 | import torch
5 | import numpy as np
6 | from tqdm import tqdm
7 | from multiprocess import Pool
8 | import os
9 |
def chunks(l, devices, folder):
    """
    Yield one ``(slice_of_l, device, folder)`` tuple per entry in ``devices``.

    ``l`` is cut into ``len(devices)`` contiguous slices whose lengths differ
    by at most one; the first ``len(l) % len(devices)`` slices get the extra
    element. ``folder`` is passed through unchanged in every tuple.
    """
    base, leftover = divmod(len(l), len(devices))
    offset = 0
    for position, device in enumerate(devices):
        # Early slices absorb the remainder, one element each.
        width = base + (1 if position < leftover else 0)
        yield (l[offset:offset + width], device, folder)
        offset += width
19 |
def loop(rows):
    """
    Worker: compute and save one embedding per row on a single device.

    ``rows`` is a ``(items, device, folder)`` tuple as produced by ``chunks``;
    each item is a ``(no, row)`` pair. For every item the audio at
    ``row['audio']`` is loaded with ``malaya_speech.load``, passed through the
    ``huseinzol05/nemo-titanet_large`` speaker-vector model, and the first
    element of the model output is saved to ``<folder>/<no>.npy``. Items whose
    output file already exists are skipped.
    """
    items, device, out_dir = rows
    # Presumably set before the imports below so this worker only sees its
    # assigned device; confirm against how the pool is started.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device)

    import torch
    import torchaudio
    import malaya_speech

    model = malaya_speech.speaker_vector.nemo('huseinzol05/nemo-titanet_large').cuda()
    model.eval()
    with torch.no_grad():
        for no, record in tqdm(items, desc = f'loop {device}'):
            target = os.path.join(out_dir, f'{no}.npy')
            if not os.path.exists(target):
                audio = malaya_speech.load(record['audio'])[0]
                embedding = model([audio])
                np.save(target, embedding[0], allow_pickle=True)
38 |
# CLI entry point: read rows from a parquet file, split them across the
# visible devices (each repeated `replication` times) and run `loop` on every
# split in its own worker process. Kept as comments rather than a docstring so
# the click `--help` output stays unchanged.
@click.command()
@click.option('--filename')
@click.option('--replication', default = 1)
@click.option('--folder', default = 'embedding')
def main(filename, replication, folder):
    os.makedirs(folder, exist_ok = True)

    visible = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible is not None:
        devices = [d.strip() for d in visible.split(',')]
    else:
        # No explicit device list: use every device torch reports.
        devices = list(range(torch.cuda.device_count()))

    # List repetition: each device appears `replication` times, i.e. that
    # many workers share it.
    devices = replication * devices
    print(devices)

    records = pd.read_parquet(filename).to_dict(orient = 'records')
    # Pair each row with its position; the position becomes the output file name.
    indexed = list(enumerate(records))

    pool = Pool(len(devices))
    pool.map(loop, chunks(indexed, devices, folder))
    pool.close()
    pool.join()

if __name__ == '__main__':
    main()
64 |
65 |
66 |
--------------------------------------------------------------------------------
/multilingual-tts/embedding.py:
--------------------------------------------------------------------------------
1 | import click
2 | import json
3 | import pandas as pd
4 | import torch
5 | import numpy as np
6 | from tqdm import tqdm
7 | from multiprocess import Pool
8 | import os
9 |
def chunks(l, devices, folder):
    """
    Yield one ``(slice_of_l, device, folder)`` tuple per entry in ``devices``.

    ``l`` is cut into ``len(devices)`` contiguous slices whose lengths differ
    by at most one; the first ``len(l) % len(devices)`` slices get the extra
    element. ``folder`` is passed through unchanged in every tuple.
    """
    base, leftover = divmod(len(l), len(devices))
    offset = 0
    for position, device in enumerate(devices):
        # Early slices absorb the remainder, one element each.
        width = base + (1 if position < leftover else 0)
        yield (l[offset:offset + width], device, folder)
        offset += width
19 |
def loop(rows):
    """
    Worker: compute and save one embedding per row on a single device.

    ``rows`` is a ``(items, device, folder)`` tuple as produced by ``chunks``;
    each item is a ``(no, row)`` pair. For every item the audio at
    ``row['audio_filename']`` is loaded with ``malaya_speech.load``, passed
    through the ``huseinzol05/nemo-titanet_large`` speaker-vector model, and
    the first element of the model output is saved to ``<folder>/<no>.npy``.
    Items whose output file already exists are skipped.
    """
    items, device, out_dir = rows
    # Presumably set before the imports below so this worker only sees its
    # assigned device; confirm against how the pool is started.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device)

    import torch
    import torchaudio
    import malaya_speech

    model = malaya_speech.speaker_vector.nemo('huseinzol05/nemo-titanet_large').cuda()
    model.eval()
    with torch.no_grad():
        for no, record in tqdm(items, desc = f'loop {device}'):
            target = os.path.join(out_dir, f'{no}.npy')
            if not os.path.exists(target):
                audio = malaya_speech.load(record['audio_filename'])[0]
                embedding = model([audio])
                np.save(target, embedding[0], allow_pickle=True)
38 |
# CLI entry point: read rows from a JSON file, split them across the visible
# devices (each repeated `replication` times) and run `loop` on every split in
# its own worker process. Outputs go to a folder derived from the input file
# name. Kept as comments rather than a docstring so the click `--help` output
# stays unchanged.
@click.command()
@click.option('--file')
@click.option('--replication', default = 1)
def main(file, replication):
    # Output folder: the input path with '.json' removed, plus '_embedding'.
    folder = file.replace('.json', '') + '_embedding'
    os.makedirs(folder, exist_ok = True)

    visible = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible is not None:
        devices = [d.strip() for d in visible.split(',')]
    else:
        # No explicit device list: use every device torch reports.
        devices = list(range(torch.cuda.device_count()))

    # List repetition: each device appears `replication` times, i.e. that
    # many workers share it.
    devices = replication * devices
    print(devices)

    with open(file) as fopen:
        records = json.load(fopen)
    # Pair each row with its position; the position becomes the output file name.
    indexed = [(position, records[position]) for position in range(len(records))]

    pool = Pool(len(devices))
    pool.map(loop, chunks(indexed, devices, folder))
    pool.close()
    pool.join()

if __name__ == '__main__':
    main()
67 |
68 |
--------------------------------------------------------------------------------
/text/extra/process-data.gov.my.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 11,
6 | "id": "fba0fccf",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# !wget https://huggingface.co/datasets/mesolitica/crawl-gov.my/resolve/main/data.gov.my -O data/data.gov.my"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "id": "af0da1cc",
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "name": "stderr",
21 | "output_type": "stream",
22 | "text": [
23 | "12127it [00:33, 361.39it/s]\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "import json\n",
29 | "from tqdm import tqdm\n",
30 | "\n",
31 | "with open('hf-datasets/raw-datasets/data.gov.my.jsonl', 'w') as fopen_l:\n",
32 | " with open('data/data.gov.my') as fopen:\n",
33 | " for l in tqdm(fopen):\n",
34 | " d = json.loads(l)\n",
35 | " p = '\\n'.join(d['p'])\n",
36 | " keys = d['file_urls'].keys()\n",
37 | " keys = [k for k in keys if k.endswith('csv') or k.endswith('xlsx')]\n",
38 | " if not len(keys):\n",
39 | " continue\n",
40 | " csv = d['file_urls'][keys[0]]\n",
41 | " t = f'{p}\\n{csv}'\n",
42 | " data = {\n",
43 | " 'text': t\n",
44 | " }\n",
45 | " fopen_l.write(f'{json.dumps(data)}\\n')"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "id": "1a415949",
52 | "metadata": {},
53 | "outputs": [],
54 | "source": []
55 | }
56 | ],
57 | "metadata": {
58 | "kernelspec": {
59 | "display_name": "Python 3 (ipykernel)",
60 | "language": "python",
61 | "name": "python3"
62 | },
63 | "language_info": {
64 | "codemirror_mode": {
65 | "name": "ipython",
66 | "version": 3
67 | },
68 | "file_extension": ".py",
69 | "mimetype": "text/x-python",
70 | "name": "python",
71 | "nbconvert_exporter": "python",
72 | "pygments_lexer": "ipython3",
73 | "version": "3.10.12"
74 | }
75 | },
76 | "nbformat": 4,
77 | "nbformat_minor": 5
78 | }
79 |
--------------------------------------------------------------------------------
/text/processing/README.md:
--------------------------------------------------------------------------------
1 | # text-dataset-dedup-py
2 |
3 | ## Description
4 | The `text-dataset-dedup-py` repository contains a Python script that performs a deduplication process on a text dataset. This process is implemented based on the code provided in the [Jupyter Notebook](https://github.com/malaysia-ai/text-dataset-dedup).
5 |
6 | ## How to Use
7 | Follow the steps below to use the deduplication script:
8 |
9 | 1. **Change Directory**: Navigate to the `/processing` directory within this repository.
10 |
11 | 2. **Prepare the Command**: Once in the `/processing` directory, prepare the command to execute the deduplication process.
12 |
13 | Single Dataset (from Huggingface URL)
14 | ```bash
15 | python3 main.py --dataset "piston.my" --url_dataset "https://huggingface.co/datasets/mesolitica/crawl-my-website/resolve/main/piston.my.jsonl" --master_folder "/home/ubuntu/za/datasets04" --text_key reviews_html reviews_text
16 | ```
17 |
18 | Single Dataset (manually cleaned)
19 | ```bash
20 | python3 main.py --dataset "murai.my" --clean_file_path "/home/ubuntu/faiq913_folder/Cleaned Huggingface datasets/murai.my/murai_my_clean.jsonl" --master_folder "/home/ubuntu/za/datasets04"
21 | ```
22 |
23 | If you have multiple datasets from multiple Huggingface URLs,
24 | ```bash
25 | python3 main.py \
26 | --master_folder "/home/ubuntu/za/datasets04" \
27 | --dataset_with_link \
28 | piston.my,https://huggingface.co/datasets/mesolitica/crawl-my-website/resolve/main/piston.my.json \
29 | piston2,https://huggingface.co/datasets/mesolitica/crawl-my-website/resolve/main/piston.my.jsonl \
30 | piston3,https://huggingface.co/datasets/mesolitica/crawl-my-website/resolve/main/piston.my.jsonl
31 | ```
32 |
33 | ### Arguments
34 | 1. `dataset`: Name of the dataset folder inside /dataset where the script will find data.
35 | 2. `url_dataset`: URL of the JSONL file containing data to be processed (script only handles JSONL files).
36 | 3. `master_folder`: Absolute path to the master directory where the deduplication process will occur.
37 | 4. `dataset_with_link`: Format {dataset_name},{dataset_url} {dataset_name02},{dataset_url02}
38 | 5. `text_key`: One or more custom keys that hold the text; use this if you encounter the error `dataset not in standard key-value. must have ...`
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/text/README.md:
--------------------------------------------------------------------------------
1 | # pretrain-text-dataset
2 |
3 | Prepare pretrain dataset gathered from https://github.com/users/huseinzol05/projects/1
4 |
5 | All deduplicated and postprocessed datasets are uploaded at https://huggingface.co/datasets/malaysia-ai/pretrain-text-dataset
6 |
7 | ## Server spec
8 |
9 | 1. 24 cores.
10 | 2. 220 GB RAM.
11 |
12 | **Deduping can blow up memory usage, easily eating up to 30 GB if the dataset is > 10 GB, so beware**.
13 |
14 | ## Download dataset
15 |
16 | 1. Most downloads are straightforward,
17 |
18 | ```bash
19 | wget https://huggingface.co/datasets/mesolitica/crawl-amanz-my/resolve/main/parsed.jsonl -O hf-datasets/raw-datasets/amanz.jsonl
20 | ```
21 |
22 | But sometimes we have to do some preprocessing, like,
23 |
24 | - [process-lowyat.ipynb](extra/process-lowyat.ipynb)
25 | - [process-data.gov.my.ipynb](extra/process-data.gov.my.ipynb)
26 | - [process-snapshot.ipynb](extra/process-snapshot.ipynb)
27 |
28 | We save raw datasets at [hf-datasets/raw-datasets](hf-datasets/raw-datasets).
29 |
30 | ## Text dedup
31 |
32 | 1. Clone [remove-duplicate-text-dataset.ipynb](remove-duplicate-text-dataset.ipynb) to new notebook, eg, [remove-duplicate-text-dataset-lowyat.ipynb](remove-duplicate-text-dataset-lowyat.ipynb).
33 |
34 | This notebook uses [text_dedup](text_dedup) to do the dedup, borrowed from https://github.com/ChenghaoMou/text-dedup
35 |
36 | All deduplicated datasets are saved at [hf-datasets/dedupe-datasets](hf-datasets/dedupe-datasets).
37 |
38 | ## Postprocessing
39 |
40 | 1. Run [postprocessing.ipynb](postprocessing.ipynb) to start postprocessing,
41 |
42 | - remove texts that contain HTTP errors.
43 | - remove texts less than 3 characters.
44 | - replace 6 spaces or more with 6 spaces.
45 | - replace 6 dots or more with 6 dots.
46 |
47 | **Rerunning this notebook will not overwrite postprocessed datasets**.
48 |
49 | ## Prepare for training session
50 |
51 | **AI alignment and safety are not considered in the current dataset; we only apply a basic postfilter**.
52 |
53 | 1. [FPF llama2](llama)
54 | 2. [FPF Mistral](mistral)
55 | 3. [Pretrain nanoT5](nanot5)
56 | 4. [Pretrain smaller Causal LM](pretrain-clm)
57 | 5. [Pretrain LLM](pretrain-llm)
58 | 6. [FPF TinyLlama](tinyllama)
59 | 7. [FPF Yi](yi)
60 |
61 | ## end-to-end processing using Python script
62 |
63 | Released as a Python library, https://github.com/malaysia-ai/clean_text_my
64 |
65 |
--------------------------------------------------------------------------------
/speech-instructions/prepare-malaysian-podcast.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "id": "1ac7cbf3",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "import numpy as np\n",
12 | "from tqdm import tqdm\n",
13 | "import torchaudio\n",
14 | "import os"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 6,
20 | "id": "64cd5042",
21 | "metadata": {
22 | "scrolled": true
23 | },
24 | "outputs": [
25 | {
26 | "name": "stderr",
27 | "output_type": "stream",
28 | "text": [
29 | "100%|███████████████████████████████████████████████████████████████████████████| 2033890/2033890 [00:08<00:00, 244819.71it/s]\n"
30 | ]
31 | }
32 | ],
33 | "source": [
34 | "df = pd.read_parquet('/home/husein/ssd4/verify-text.parquet')\n",
35 | "filtered = []\n",
36 | "for i in tqdm(range(len(df))):\n",
37 | " if 'podcast_processed' in df['audio'].iloc[i]:\n",
38 | " row = df.iloc[i].to_dict()\n",
39 | " f = os.path.join('/home/husein/ssd4/', row['audio'])\n",
40 | " row['audio'] = f\n",
41 | " filtered.append(row)"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 7,
47 | "id": "438c8a4e",
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "data": {
52 | "text/plain": [
53 | "75965"
54 | ]
55 | },
56 | "execution_count": 7,
57 | "metadata": {},
58 | "output_type": "execute_result"
59 | }
60 | ],
61 | "source": [
62 | "len(filtered)"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 8,
68 | "id": "f625ca99",
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "pd.DataFrame(filtered).to_parquet('filtered-podcast.parquet')"
73 | ]
74 | }
75 | ],
76 | "metadata": {
77 | "kernelspec": {
78 | "display_name": "python3.10",
79 | "language": "python",
80 | "name": "python3.10"
81 | },
82 | "language_info": {
83 | "codemirror_mode": {
84 | "name": "ipython",
85 | "version": 3
86 | },
87 | "file_extension": ".py",
88 | "mimetype": "text/x-python",
89 | "name": "python",
90 | "nbconvert_exporter": "python",
91 | "pygments_lexer": "ipython3",
92 | "version": "3.10.15"
93 | }
94 | },
95 | "nbformat": 4,
96 | "nbformat_minor": 5
97 | }
98 |
--------------------------------------------------------------------------------
/emotional-malaysian-emilia/pitch_estimation.py:
--------------------------------------------------------------------------------
1 | import click
2 | import torch
3 | import torchaudio
4 | from glob import glob
5 | from tqdm import tqdm
6 | import os
7 | import penn
8 | import torch
9 | import huggingface_hub
10 |
def new_path(f):
    """
    Derive the output path for the pitch file of audio path ``f``.

    Every ``.mp3`` in the path is replaced with ``.pitch`` and ``_pitch`` is
    appended to the first ``/``-separated component; the remaining components
    are kept as they are.
    """
    head, *tail = f.replace('.mp3', '.pitch').split('/')
    return '/'.join([head + '_pitch', *tail])
17 |
@click.command()
@click.option("--path", help="files path in glob pattern")
@click.option("--global-index", default=1, help="global index")
@click.option("--local-index", default=0, help="local index")
@click.option(
    "--interp-unvoiced-at",
    default=0.065,
    help="value forwarded to penn.voicing.interpolate for unvoiced regions",
)
def function(path, global_index, local_index, interp_unvoiced_at):
    """Estimate pitch and periodicity for one shard of the files matching PATH.

    Files whose output (see ``new_path``) already exists and is larger than
    2 bytes are skipped. The remaining files are split into ``global_index``
    shards and only shard ``local_index`` is processed. For each file a JSON
    document ``{'pitch': [...], 'periodicity': [...]}`` with values rounded
    to 4 decimals is written to the output path.
    """
    # The file header does not import json, so bring it into scope here.
    import json

    # Sort so that every process sharding the same glob sees the same order.
    pending = []
    for f in sorted(glob(path)):
        new_f = new_path(f)
        if os.path.exists(new_f) and os.path.getsize(new_f) > 2:
            continue
        pending.append(f)

    shard_size = len(pending) // global_index
    start = shard_size * local_index
    if local_index == global_index - 1:
        # The last shard also takes the remainder that floor division would
        # otherwise leave unprocessed.
        end = len(pending)
    else:
        end = start + shard_size
    files = pending[start:end]

    model = penn.Model()
    checkpoint = huggingface_hub.hf_hub_download(
        'maxrmorrison/fcnf0-plus-plus',
        'fcnf0++.pt')
    checkpoint = torch.load(checkpoint, map_location='cpu')
    model.load_state_dict(checkpoint['model'])

    model = model.to('cuda').to(torch.float16)

    with torch.no_grad():
        for f in tqdm(files):
            y, sr = torchaudio.load(f)
            y = torchaudio.functional.resample(y, sr, penn.SAMPLE_RATE)
            pitch, periodicity = [], []
            for frames in penn.preprocess(
                y,
            ):
                logits = model(frames.to(torch.float16).to('cuda'))
                result = penn.postprocess(logits)
                pitch.append(result[1])
                periodicity.append(result[2])
            pitch, periodicity = torch.cat(pitch, 1), torch.cat(periodicity, 1)
            # NOTE(review): the original referenced an undefined
            # `interp_unvoiced_at`; the 0.065 default follows the penn usage
            # example. Confirm it suits this dataset.
            pitch = penn.voicing.interpolate(
                pitch,
                periodicity,
                interp_unvoiced_at)
            pitch = pitch[0].cpu().numpy().tolist()
            pitch = [round(p, 4) for p in pitch]
            periodicity = periodicity[0].cpu().numpy().tolist()
            periodicity = [round(p, 4) for p in periodicity]
            splitted = new_path(f)
            os.makedirs(os.path.split(splitted)[0], exist_ok = True)

            with open(splitted, 'w') as fopen:
                json.dump({'pitch': pitch, 'periodicity': periodicity}, fopen)

if __name__ == '__main__':
    function()
--------------------------------------------------------------------------------
/multilingual-tts/prepare/prepare-WenetSpeech4TTS.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 10,
6 | "id": "a2127d3e-e002-4d8c-8c8b-ec5ecfeb8b7a",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "from glob import glob\n",
11 | "import pandas as pd\n",
12 | "import os\n",
13 | "import soundfile as sf\n",
14 | "from tqdm import tqdm\n",
15 | "from multiprocess import Pool\n",
16 | "import librosa\n",
17 | "import itertools\n",
18 | "import io\n",
19 | "import numpy as np\n",
20 | "import json\n",
21 | "import tarfile\n",
22 | "\n",
23 | "def chunks(l, n):\n",
24 | " for i in range(0, len(l), n):\n",
25 | " yield (l[i: i + n], i // n)\n",
26 | "\n",
27 | "def multiprocessing(strings, function, cores=6, returned=True):\n",
28 | " df_split = chunks(strings, len(strings) // cores)\n",
29 | " pool = Pool(cores)\n",
30 | " pooled = pool.map(function, df_split)\n",
31 | " pool.close()\n",
32 | " pool.join()\n",
33 | "\n",
34 | " if returned:\n",
35 | " return list(itertools.chain(*pooled))"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 11,
41 | "id": "53d47aa9-b5f8-4378-8416-45ceace47196",
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# def loop(files):\n",
46 | "# files, _ = files\n",
47 | "# for f in tqdm(files):\n",
48 | "# with tarfile.open(f, \"r:gz\") as tar:\n",
49 | "# tar.extractall(path='WenetSpeech4TTS/Standard')\n",
50 | "\n",
51 | "# files = glob('WenetSpeech4TTS/Standard/*.tar.gz')\n",
52 | "# multiprocessing(files, loop, len(files), returned = False)"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 12,
58 | "id": "f00e77ce-af6d-4232-b067-aedf1c9d1964",
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "# def loop(files):\n",
63 | "# files, _ = files\n",
64 | "# for f in tqdm(files):\n",
65 | "# with tarfile.open(f, \"r:gz\") as tar:\n",
66 | "# tar.extractall(path='WenetSpeech4TTS/Premium')\n",
67 | "\n",
68 | "# files = glob('WenetSpeech4TTS/Premium/*.tar.gz')\n",
69 | "# multiprocessing(files, loop, len(files), returned = False)"
70 | ]
71 | }
72 | ],
73 | "metadata": {
74 | "kernelspec": {
75 | "display_name": "Python 3 (ipykernel)",
76 | "language": "python",
77 | "name": "python3"
78 | },
79 | "language_info": {
80 | "codemirror_mode": {
81 | "name": "ipython",
82 | "version": 3
83 | },
84 | "file_extension": ".py",
85 | "mimetype": "text/x-python",
86 | "name": "python",
87 | "nbconvert_exporter": "python",
88 | "pygments_lexer": "ipython3",
89 | "version": "3.10.12"
90 | }
91 | },
92 | "nbformat": 4,
93 | "nbformat_minor": 5
94 | }
95 |
--------------------------------------------------------------------------------
/speech-instructions/remote.sh:
--------------------------------------------------------------------------------
# System packages and locale.
apt update
apt install unzip ffmpeg -y
apt update && apt install -y locales
locale-gen en_US.UTF-8

# Everything below runs from /workspace.
cd /workspace

# Standalone 7-Zip build (provides /workspace/7zz used for extraction below).
wget https://www.7-zip.org/a/7z2301-linux-x64.tar.xz
tar -xf 7z2301-linux-x64.tar.xz
pip3 install huggingface-hub

# Download the files matching '*.z*' from the dataset repo into the current directory.
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download(repo_id='malaysia-ai/dedup-Malaysian-Emilia', repo_type='dataset',
allow_patterns = '*.z*', local_dir = './')
"
for archive in dedup-parliament.zip dedup-podcasts.zip; do
    /workspace/7zz x "$archive" -y -mmt40
done

# Python dependencies.
wget https://github.com/mesolitica/malaysian-dataset/raw/refs/heads/master/text-to-speech/husein/requirements.txt
pip3 install -r requirements.txt
pip3 install click vocos torchdiffeq==0.2.4 x-transformers==1.42.11 jieba==0.42.1 pypinyin==0.53.0

# Instruction partitions 3..6 and the generation script.
for part in 3 4 5 6; do
    wget "https://huggingface.co/datasets/malaysia-ai/Speech-Instructions/resolve/main/text/partition-instructions-part-${part}.json"
done
wget https://raw.githubusercontent.com/malaysia-ai/dataset/refs/heads/main/speech-instructions/generate.py

# One GPU per partition (part 3 -> GPU 0, ..., part 6 -> GPU 3), four detached
# screen sessions per partition, each handling one of 4 index slices.
for part in 3 4 5 6; do
    gpu=$((part - 3))
    name="partition-instructions-part-${part}"
    for i in {0..3}; do
        # Stop a previous session with the same name, if any.
        screen -S "${name}_$i" -X quit 2>/dev/null
        screen -dmS "${name}_$i" bash -c "cd /workspace && \
CUDA_VISIBLE_DEVICES=${gpu} \
python3 generate.py \
--input_file \"${name}.json\" \
--folder \"${name}\" \
--global_index 4 \
--index $i"
    done
done
--------------------------------------------------------------------------------
/speech-instructions-extra/upload.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 7,
6 | "id": "489cdcad",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "from glob import glob\n",
11 | "from tqdm import tqdm\n",
12 | "import json"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 8,
18 | "id": "02e5a67f",
19 | "metadata": {},
20 | "outputs": [
21 | {
22 | "name": "stderr",
23 | "output_type": "stream",
24 | "text": [
25 | "100%|████████████████████████████████████████████████████████████████████████████████████| 965/965 [00:01<00:00, 811.02it/s]\n"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "alls = []\n",
31 | "for f in tqdm(glob('*/*.parquet')):\n",
32 | " try:\n",
33 | " with open(f) as fopen:\n",
34 | " d = json.load(fopen)\n",
35 | " for d_ in d:\n",
36 | " d_['start'] = None\n",
37 | " d_['end'] = None\n",
38 | " d_['context'] = None\n",
39 | " d_['system'] = None\n",
40 | " d_['sliced_audio_filename'] = None\n",
41 | " alls.append(d_)\n",
42 | " except:\n",
43 | " pass"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 9,
49 | "id": "1def2d40",
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/plain": [
55 | "549110"
56 | ]
57 | },
58 | "execution_count": 9,
59 | "metadata": {},
60 | "output_type": "execute_result"
61 | }
62 | ],
63 | "source": [
64 | "len(alls)"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 13,
70 | "id": "e9b10351",
71 | "metadata": {},
72 | "outputs": [
73 | {
74 | "data": {
75 | "text/plain": [
76 | "{'instruction': 'What decision did Speaker1 and Speaker2 agree on regarding taking something first?',\n",
77 | " 'answer': 'Speaker1 would take the first one.',\n",
78 | " 'audio_filename': 'SQA-PART3-Train-audio/train-00145-of-00171-2.mp3',\n",
79 | " 'start': None,\n",
80 | " 'end': None,\n",
81 | " 'context': None,\n",
82 | " 'system': None,\n",
83 | " 'sliced_audio_filename': None}"
84 | ]
85 | },
86 | "execution_count": 13,
87 | "metadata": {},
88 | "output_type": "execute_result"
89 | }
90 | ],
91 | "source": [
92 | "alls[2]"
93 | ]
94 | }
95 | ],
96 | "metadata": {
97 | "kernelspec": {
98 | "display_name": "python3.10",
99 | "language": "python",
100 | "name": "python3.10"
101 | },
102 | "language_info": {
103 | "codemirror_mode": {
104 | "name": "ipython",
105 | "version": 3
106 | },
107 | "file_extension": ".py",
108 | "mimetype": "text/x-python",
109 | "name": "python",
110 | "nbconvert_exporter": "python",
111 | "pygments_lexer": "ipython3",
112 | "version": "3.10.15"
113 | }
114 | },
115 | "nbformat": 4,
116 | "nbformat_minor": 5
117 | }
118 |
--------------------------------------------------------------------------------
/text/pretrain-llm/prepare-madlad-400-ms.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "86d80b05",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# !git lfs clone https://huggingface.co/datasets/malaysia-ai/madlad-400-ms"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 3,
16 | "id": "54ca47c5",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import json\n",
21 | "import os\n",
22 | "from glob import glob\n",
23 | "from tqdm import tqdm"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 4,
29 | "id": "e92d6668",
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "def partition(text, size = 500):\n",
34 | " splitted = text.split()\n",
35 | " return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 2,
41 | "id": "f1b7e4f9",
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "name": "stdout",
46 | "output_type": "stream",
47 | "text": [
48 | "madlad-400-ms.jsonl00.splitted\tmadlad-400-ms.jsonl02.splitted\r\n",
49 | "madlad-400-ms.jsonl01.splitted\r\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "!ls madlad-400-ms"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 5,
60 | "id": "485c6a71",
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "a = open('prepare-madlad-400-ms.jsonl', 'w')"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 6,
70 | "id": "6aacc1a4",
71 | "metadata": {},
72 | "outputs": [
73 | {
74 | "name": "stderr",
75 | "output_type": "stream",
76 | "text": [
77 | "4081851it [18:57, 3587.17it/s]\n",
78 | "5000000it [23:02, 3615.70it/s]\n",
79 | "5000000it [34:34, 2410.40it/s]\n"
80 | ]
81 | }
82 | ],
83 | "source": [
84 | "madlad_ms = glob('madlad-400-ms/*.splitted')\n",
85 | "for f in madlad_ms:\n",
86 | " with open(f) as fopen:\n",
87 | " for l in tqdm(fopen):\n",
88 | " try:\n",
89 | " data = '' + json.loads(l)['text'] + ''\n",
90 | " partitioned = partition(data)\n",
91 | " for p in partitioned:\n",
92 | " data = {\n",
93 | " 'text': p,\n",
94 | " }\n",
95 | " a.write(f'{json.dumps(data)}\\n')\n",
96 | " a.flush()\n",
97 | " except:\n",
98 | " pass"
99 | ]
100 | }
101 | ],
102 | "metadata": {
103 | "kernelspec": {
104 | "display_name": "Python 3 (ipykernel)",
105 | "language": "python",
106 | "name": "python3"
107 | },
108 | "language_info": {
109 | "codemirror_mode": {
110 | "name": "ipython",
111 | "version": 3
112 | },
113 | "file_extension": ".py",
114 | "mimetype": "text/x-python",
115 | "name": "python",
116 | "nbconvert_exporter": "python",
117 | "pygments_lexer": "ipython3",
118 | "version": "3.10.12"
119 | }
120 | },
121 | "nbformat": 4,
122 | "nbformat_minor": 5
123 | }
124 |
--------------------------------------------------------------------------------
/emotional-malaysian-emilia/audioset_sliding.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
2 | from torch.utils.data import DataLoader
3 | from torch.nn import functional as F
4 | from tqdm import tqdm
5 | from glob import glob
6 | from datasets import Audio
7 | import torch
8 | import torchaudio
9 | import numpy as np
10 | import click
11 | import os
12 | import json
13 |
def new_path(f):
    """
    Map an input audio path to the path of its AudioSet result file.

    The first `/`-separated component of `f` gets an `_audioset` suffix and a
    trailing `.mp3` extension is replaced by `.audioset`, e.g.
    `data/a/b.mp3` -> `data_audioset/a/b.audioset`.

    Parameters
    ----------
    f : str
        Path of the input audio file.

    Returns
    -------
    str
        Path where the result for `f` is stored.
    """
    # Only rewrite the file extension: a plain `str.replace` would also mangle
    # any directory whose name happens to contain `.mp3`.
    if f.endswith('.mp3'):
        f = f[:-len('.mp3')] + '.audioset'
    splitted = f.split('/')
    base_folder = splitted[0] + '_audioset'
    return '/'.join([base_folder] + splitted[1:])
20 |
21 |
@click.command()
@click.option("--path", help="files path in glob pattern")
@click.option("--global-index", default=1, help="global index")
@click.option("--local-index", default=0, help="local index")
@click.option("--sliding", default=0.25)
@click.option("--model", default='MIT/ast-finetuned-audioset-10-10-0.4593')
def function(path, global_index, local_index, sliding, model):
    """
    Classify fixed-size windows of every audio file matched by `path`.

    Each file is cut into consecutive, non-overlapping windows of `sliding`
    seconds; windows shorter than 1000 samples are dropped. For every window
    the top-5 softmax labels and probabilities are stored, together with the
    window timestamps in seconds, as JSON at `new_path(file)`.

    Parameters
    ----------
    path : str
        Glob pattern selecting the input files.
    global_index : int
        Number of shards the pending files are divided into.
    local_index : int
        Zero-based shard processed by this invocation.
    sliding : float
        Window length in seconds.
    model : str
        Model name or path given to `from_pretrained`.
    """

    feature_extractor = AutoFeatureExtractor.from_pretrained(model, return_attention_mask = True)
    model = AutoModelForAudioClassification.from_pretrained(model, torch_dtype = torch.float16).eval().cuda()
    id2label = model.config.id2label
    sr = feature_extractor.sampling_rate
    sliding = int(sliding * sr)
    audio = Audio(sampling_rate = sr)

    # `glob` returns files in arbitrary order; sort so that every shard sees
    # the same ordering and the slices below do not overlap between workers.
    files = sorted(glob(path))
    filtered_files = []
    for f in files:
        new_f = new_path(f)
        # A result larger than 2 bytes is treated as already processed.
        if os.path.exists(new_f) and os.path.getsize(new_f) > 2:
            continue
        filtered_files.append(f)

    global_size = len(filtered_files) // global_index
    start_index = global_size * local_index
    end_index = start_index + global_size
    if local_index == global_index - 1:
        # The last shard also takes the remainder that floor division would
        # otherwise leave unassigned to any shard.
        end_index = len(filtered_files)
    files = filtered_files[start_index: end_index]

    with torch.no_grad():
        for f in tqdm(files):
            y = audio.decode_example(audio.encode_example(f))['array']
            timestamps = []
            slided = []
            for i in range(0, len(y), sliding):
                y_ = y[i: i + sliding]
                if len(y_) < 1000:
                    continue
                slided.append(y_)
                start = i / sr
                end = min(len(y) / sr, (i + sliding) / sr)
                timestamps.append((start, end))

            if not slided:
                # Nothing long enough to classify; an empty batch cannot be
                # passed to the feature extractor, so skip this file.
                continue

            inputs = feature_extractor(slided, sampling_rate=sr,
                                       return_tensors="pt", return_attention_mask = True)
            inputs['input_values'] = inputs['input_values'].to(torch.float16).cuda()
            logits = model(**inputs).logits.softmax(-1)
            topk = torch.topk(logits, 5, dim = -1)
            probs = [
                [round(p, 4) for p in row]
                for row in topk.values.cpu().numpy().tolist()
            ]
            labels = [
                [id2label[r] for r in row]
                for row in topk.indices.cpu().numpy()
            ]

            splitted = new_path(f)
            os.makedirs(os.path.split(splitted)[0], exist_ok = True)
            with open(splitted, 'w') as fopen:
                json.dump({'timestamps': timestamps, 'labels': labels, 'probs': probs}, fopen)
83 |
84 | if __name__ == '__main__':
85 | function()
--------------------------------------------------------------------------------
/speech-instructions/prepare-malaysian-others.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "3c883f91",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# !wget https://huggingface.co/Zyphra/Zonos-v0.1-speaker-embedding/resolve/main/ResNet293_SimAM_ASP_base.pt"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "id": "bf4cd179",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import os\n",
21 | "\n",
22 | "os.environ['CUDA_VISIBLE_DEVICES'] = '1'\n",
23 | "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 3,
29 | "id": "1ac7cbf3",
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import pandas as pd\n",
34 | "import numpy as np\n",
35 | "from tqdm import tqdm\n",
36 | "import torchaudio"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 8,
42 | "id": "56d3111e",
43 | "metadata": {
44 | "scrolled": true
45 | },
46 | "outputs": [
47 | {
48 | "name": "stderr",
49 | "output_type": "stream",
50 | "text": [
51 | "100%|████████████████████████████████████████████████████████████████████████████| 2033890/2033890 [00:20<00:00, 97450.43it/s]\n"
52 | ]
53 | },
54 | {
55 | "data": {
56 | "text/plain": [
57 | "555379"
58 | ]
59 | },
60 | "execution_count": 8,
61 | "metadata": {},
62 | "output_type": "execute_result"
63 | }
64 | ],
65 | "source": [
66 | "df = pd.read_parquet('/home/husein/ssd4/verify-text.parquet')\n",
67 | "filtered = []\n",
68 | "for i in tqdm(range(len(df))):\n",
69 | " f = df['audio'].iloc[i]\n",
70 | " \n",
71 | " if 'parlimen-24k' not in f and 'podcast_processed' not in f and 'dialects_processed' not in f:\n",
72 | " row = df.iloc[i].to_dict()\n",
73 | " f = os.path.join('/home/husein/ssd4/', row['audio'])\n",
74 | " row['audio'] = f\n",
75 | " filtered.append(row)\n",
76 | " \n",
77 | " \n",
78 | "len(filtered)"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 9,
84 | "id": "c358a3f7",
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "data": {
89 | "text/plain": [
90 | "555379"
91 | ]
92 | },
93 | "execution_count": 9,
94 | "metadata": {},
95 | "output_type": "execute_result"
96 | }
97 | ],
98 | "source": [
99 | "len(filtered)"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 12,
105 | "id": "54d9bd5b",
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "pd.DataFrame(filtered).to_parquet('filtered-others.parquet')"
110 | ]
111 | }
112 | ],
113 | "metadata": {
114 | "kernelspec": {
115 | "display_name": "python3.10",
116 | "language": "python",
117 | "name": "python3.10"
118 | },
119 | "language_info": {
120 | "codemirror_mode": {
121 | "name": "ipython",
122 | "version": 3
123 | },
124 | "file_extension": ".py",
125 | "mimetype": "text/x-python",
126 | "name": "python",
127 | "nbconvert_exporter": "python",
128 | "pygments_lexer": "ipython3",
129 | "version": "3.10.15"
130 | }
131 | },
132 | "nbformat": 4,
133 | "nbformat_minor": 5
134 | }
135 |
--------------------------------------------------------------------------------
/text/text_dedup/utils/analysis.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from scipy.integrate import quad as integrate
4 |
5 | from text_dedup.utils.tokenization import ngrams
6 |
7 |
def jaccard_similarity(
    doc1,
    doc2,
    ngram_size: int = 8,
    min_length: int = 0,
) -> float:
    """Return the Jaccard similarity of the n-gram sets of two documents.

    Each document is turned into a list of items (characters for a string,
    elements for a list), split into n-grams by `ngrams`, and every n-gram is
    joined with single spaces before being collected into a set.

    Parameters
    ----------
    doc1 : str or List[str]
        The first document.
    doc2 : str or List[str]
        The second document.
    ngram_size : int, optional
        The size of n-grams, by default 8
    min_length : int, optional
        Forwarded to `ngrams` as `min_length`, by default 0

    Returns
    -------
    float
        Size of the intersection divided by the size of the union of the two
        n-gram sets; 0.0 when both sets are empty.

    Examples
    --------
    >>> jaccard_similarity("hello world", "hello world")
    1.0
    >>> jaccard_similarity("hello world", "hello world!")
    0.8
    >>> jaccard_similarity("hello world".split(), "hello world!".split(), ngram_size=1)
    0.3333333333333333
    """

    def shingles(doc):
        grams = ngrams(list(doc), ngram_size, min_length=min_length)
        return {" ".join(gram) for gram in grams}

    left = shingles(doc1)
    right = shingles(doc2)
    shared = len(left & right)
    # Clamp the denominator so two empty documents do not divide by zero.
    total = max(1, len(left | right))
    return shared / total
44 |
45 |
def optimal_param(
    threshold: float,
    num_perm: int,
    false_positive_weight: float = 0.5,
    false_negative_weight: float = 0.5,
):
    """
    Search for the `MinHashLSH` band/row split with the lowest weighted error.

    Every `(b, r)` pair with `b * r <= num_perm` is scored by the weighted sum
    of its false positive and false negative areas (approach taken from
    `datasketch.lsh`); the first pair reaching the lowest score is returned.

    You can also refer to the interactive demo at https://huggingface.co/spaces/bigcode/near-deduplication.

    Parameters
    ----------
    threshold : float
        The threshold for similarity.
    num_perm : int
        The number of permutations.
    false_positive_weight : float
        The weight of false positive.
    false_negative_weight : float
        The weight of false negative.

    Returns
    -------
    Tuple[int, int]
        The optimal `b` (bands) and `r` (rows) parameters; `(0, 0)` when no
        pair is available (`num_perm < 1`).

    Examples
    --------
    >>> optimal_param(0.75, 256)
    (21, 12)
    >>> optimal_param(0.75, 256, 0.1, 0.9)
    (28, 9)
    """

    def candidate_probability(s, b, r):
        # Probability used by `datasketch.lsh` for a pair with similarity `s`.
        return 1 - (1 - s ** float(r)) ** float(b)

    def false_positive_area(b, r):
        # Area under the probability curve below the threshold.
        area, _ = integrate(lambda s: candidate_probability(s, b, r), 0.0, threshold)
        return area

    def false_negative_area(b, r):
        # Area above the probability curve from the threshold up to 1.
        area, _ = integrate(lambda s: 1 - candidate_probability(s, b, r), threshold, 1.0)
        return area

    best = (0, 0)
    lowest = float("inf")
    for bands in range(1, num_perm + 1):
        for rows in range(1, num_perm // bands + 1):
            weighted = (
                false_positive_area(bands, rows) * false_positive_weight
                + false_negative_area(bands, rows) * false_negative_weight
            )
            if weighted < lowest:
                lowest, best = weighted, (bands, rows)
    return best
--------------------------------------------------------------------------------
/stt-whisper/force_alignment.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | from multiprocess import Pool
3 | import torch
4 | import torchaudio
5 | import pandas as pd
6 | import click
7 | import os
8 | import json
9 |
10 | device = 'cuda'
11 |
def chunks(l, devices, language, folder):
    """
    Yield one work item per device, splitting `l` as evenly as possible.

    Each yielded tuple is `(rows, device, language, folder)`. When `len(l)` is
    not divisible by `len(devices)`, the first `len(l) % len(devices)` devices
    receive one extra row.
    """
    base, leftover = divmod(len(l), len(devices))
    offset = 0
    for position, device_id in enumerate(devices):
        width = base + 1 if position < leftover else base
        yield (l[offset:offset + width], device_id, language, folder)
        offset += width
21 |
def loop(rows):
    """
    Worker: force-align every row of one chunk on a single device.

    `rows` is one tuple produced by `chunks`: `(rows, device, language,
    folder)`. For each row the text (`pronunciation`, falling back to
    `question`) is aligned against `audio_filename` and the row, extended with
    `word_timestamps`, is written as JSON into `folder`. Rows whose output
    file already exists are skipped.
    """
    rows, index, language, folder = rows
    # Must be set before the aligner is imported below so that this process
    # only sees its assigned device.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(index)

    from ctc_forced_aligner import (
        load_audio,
        load_alignment_model,
        generate_emissions,
        preprocess_text,
        get_alignments,
        get_spans,
        postprocess_results,
    )
    import torch

    alignment_model, alignment_tokenizer = load_alignment_model(
        device,
        dtype=torch.float16 if device == "cuda" else torch.float32,
    )

    with torch.no_grad():
        for row in tqdm(rows):
            # `or` also covers a `pronunciation` key that is present but None,
            # which previously crashed on `len(None)`.
            t = row.get('pronunciation') or row.get('question')
            if not t:
                # No text to align for this row.
                continue
            f = row['audio_filename']
            new_f = f.replace('/', '_').replace('.mp3', '.json').replace('.wav', '.json')
            filename = os.path.join(folder, new_f)
            if os.path.exists(filename):
                continue
            new_wav, sr = torchaudio.load(f)
            # Only the first channel is aligned.
            audio_waveform = torchaudio.functional.resample(
                new_wav[0], orig_freq=sr, new_freq=16000
            ).type(torch.float16).cuda()
            emissions, stride = generate_emissions(
                alignment_model, audio_waveform, batch_size=1
            )
            tokens_starred, text_starred = preprocess_text(
                t,
                romanize=True,
                language=language,
            )
            segments, scores, blank_token = get_alignments(
                emissions,
                tokens_starred,
                alignment_tokenizer,
            )
            spans = get_spans(tokens_starred, segments, blank_token)
            word_timestamps = postprocess_results(text_starred, spans, stride, scores)
            row['word_timestamps'] = word_timestamps
            # Write to a temporary name and rename, so an interrupted run never
            # leaves a truncated file that the exists-check above would skip.
            temporary = filename + '.tmp'
            with open(temporary, 'w') as fopen:
                json.dump(row, fopen)
            os.replace(temporary, filename)
74 |
@click.command()
@click.option('--filename')
@click.option('--language', default = 'en')
@click.option('--replication', default = 1)
@click.option('--folder', default = 'force_alignment')
def main(filename, language, replication, folder):
    """
    Force-align all rows of a JSON file in parallel, one worker per device.

    Devices come from `CUDA_VISIBLE_DEVICES` when it is set, otherwise from
    `torch.cuda.device_count()`. The device list is repeated `replication`
    times, so each device runs that many workers. Rows loaded from `filename`
    are split across the workers by `chunks` and processed by `loop`, which
    writes its results into `folder`.
    """
    os.makedirs(folder, exist_ok = True)
    devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if devices is None:
        devices = list(range(torch.cuda.device_count()))
    else:
        # Drop empty entries such as those from a trailing comma or an empty
        # variable; they do not name a device.
        devices = [d.strip() for d in devices.split(',') if d.strip()]

    devices = replication * devices
    print(devices)
    if not devices:
        # Fail with a clear message instead of an obscure error from
        # `Pool(0)` / division by zero in `chunks`.
        raise click.UsageError('no devices to run on; check CUDA_VISIBLE_DEVICES and --replication')

    with open(filename) as fopen:
        rows = json.load(fopen)

    df_split = chunks(rows, devices, language, folder)
    pool = Pool(len(devices))
    try:
        pool.map(loop, df_split)
    finally:
        # Always release the worker processes, even when a worker failed.
        pool.close()
        pool.join()
99 |
100 | if __name__ == '__main__':
101 | main()
--------------------------------------------------------------------------------
/text/compare-tokens.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "id": "f4d6d81b",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# !wget https://huggingface.co/datasets/mesolitica/malaysian-ultrachat/resolve/main/ultrachat-astroawani-malay.jsonl"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 12,
16 | "id": "e7743bc8",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import json\n",
21 | "from tqdm import tqdm\n",
22 | "from transformers import AutoTokenizer"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 15,
28 | "id": "1709e65a",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "tokenizer_mallam = AutoTokenizer.from_pretrained('malaysia-ai/sentencepiece-tokenizer')\n",
33 | "tokenizer_llama2 = AutoTokenizer.from_pretrained('mesolitica/llama-7b-hf-2048-fpf')\n",
34 | "tokenizer_mistral = AutoTokenizer.from_pretrained('mesolitica/mistral-7b-4096-fpf')"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 18,
40 | "id": "9655d899",
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "name": "stderr",
45 | "output_type": "stream",
46 | "text": [
47 | "60198it [04:20, 230.88it/s]\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "mallam, llama2, mistral = 0, 0, 0\n",
53 | "with open('ultrachat-astroawani-malay.jsonl') as fopen:\n",
54 | " for l in tqdm(fopen):\n",
55 | " l = json.loads(l)\n",
56 | " for r in l[1:]:\n",
57 | " if r['content_ms']:\n",
58 | " mallam += len(tokenizer_mallam(r['content_ms'])['input_ids'])\n",
59 | " llama2 += len(tokenizer_llama2(r['content_ms'])['input_ids'])\n",
60 | " mistral += len(tokenizer_mistral(r['content_ms'])['input_ids'])"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 19,
66 | "id": "7b5901bd",
67 | "metadata": {},
68 | "outputs": [
69 | {
70 | "data": {
71 | "text/plain": [
72 | "(26157664, 60391551, 60823929)"
73 | ]
74 | },
75 | "execution_count": 19,
76 | "metadata": {},
77 | "output_type": "execute_result"
78 | }
79 | ],
80 | "source": [
81 | "mallam, llama2, mistral"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 22,
87 | "id": "3e01dec0",
88 | "metadata": {},
89 | "outputs": [
90 | {
91 | "data": {
92 | "text/plain": [
93 | "0.4300554803028262"
94 | ]
95 | },
96 | "execution_count": 22,
97 | "metadata": {},
98 | "output_type": "execute_result"
99 | }
100 | ],
101 | "source": [
102 | "(mallam / 60823929)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "id": "1a1c8e06",
109 | "metadata": {},
110 | "outputs": [],
111 | "source": []
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": "Python 3 (ipykernel)",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.10.12"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 5
135 | }
136 |
--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/analysis.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from scipy.integrate import quad as integrate
4 |
5 | from text_dedup.utils.tokenization import ngrams
6 |
7 |
def jaccard_similarity(
    doc1: str | List[str],
    doc2: str | List[str],
    ngram_size: int = 8,
    min_length: int = 0,
) -> float:
    """Return the Jaccard similarity of the n-gram sets of two documents.

    Each document is turned into a list of items (characters for a string,
    elements for a list), split into n-grams by `ngrams`, and every n-gram is
    joined with single spaces before being collected into a set.

    Parameters
    ----------
    doc1 : str or List[str]
        The first document.
    doc2 : str or List[str]
        The second document.
    ngram_size : int, optional
        The size of n-grams, by default 8
    min_length : int, optional
        Forwarded to `ngrams` as `min_length`, by default 0

    Returns
    -------
    float
        Size of the intersection divided by the size of the union of the two
        n-gram sets; 0.0 when both sets are empty.

    Examples
    --------
    >>> jaccard_similarity("hello world", "hello world")
    1.0
    >>> jaccard_similarity("hello world", "hello world!")
    0.8
    >>> jaccard_similarity("hello world".split(), "hello world!".split(), ngram_size=1)
    0.3333333333333333
    """

    def shingles(doc):
        grams = ngrams(list(doc), ngram_size, min_length=min_length)
        return {" ".join(gram) for gram in grams}

    left = shingles(doc1)
    right = shingles(doc2)
    shared = len(left & right)
    # Clamp the denominator so two empty documents do not divide by zero.
    total = max(1, len(left | right))
    return shared / total
44 |
45 |
def optimal_param(
    threshold: float,
    num_perm: int,
    false_positive_weight: float = 0.5,
    false_negative_weight: float = 0.5,
):
    """
    Search for the `MinHashLSH` band/row split with the lowest weighted error.

    Every `(b, r)` pair with `b * r <= num_perm` is scored by the weighted sum
    of its false positive and false negative areas (approach taken from
    `datasketch.lsh`); the first pair reaching the lowest score is returned.

    You can also refer to the interactive demo at https://huggingface.co/spaces/bigcode/near-deduplication.

    Parameters
    ----------
    threshold : float
        The threshold for similarity.
    num_perm : int
        The number of permutations.
    false_positive_weight : float
        The weight of false positive.
    false_negative_weight : float
        The weight of false negative.

    Returns
    -------
    Tuple[int, int]
        The optimal `b` (bands) and `r` (rows) parameters; `(0, 0)` when no
        pair is available (`num_perm < 1`).

    Examples
    --------
    >>> optimal_param(0.75, 256)
    (21, 12)
    >>> optimal_param(0.75, 256, 0.1, 0.9)
    (28, 9)
    """

    def candidate_probability(s, b, r):
        # Probability used by `datasketch.lsh` for a pair with similarity `s`.
        return 1 - (1 - s ** float(r)) ** float(b)

    def false_positive_area(b, r):
        # Area under the probability curve below the threshold.
        area, _ = integrate(lambda s: candidate_probability(s, b, r), 0.0, threshold)
        return area

    def false_negative_area(b, r):
        # Area above the probability curve from the threshold up to 1.
        area, _ = integrate(lambda s: 1 - candidate_probability(s, b, r), threshold, 1.0)
        return area

    best = (0, 0)
    lowest = float("inf")
    for bands in range(1, num_perm + 1):
        for rows in range(1, num_perm // bands + 1):
            weighted = (
                false_positive_area(bands, rows) * false_positive_weight
                + false_negative_area(bands, rows) * false_negative_weight
            )
            if weighted < lowest:
                lowest, best = weighted, (bands, rows)
    return best
--------------------------------------------------------------------------------
/text/processing/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import json
4 | import subprocess
5 | from tqdm import tqdm
6 | from pathlib import Path
7 |
8 |
def is_dir(path):
    """Return True when `path` refers to an existing directory."""
    exists_as_directory = os.path.isdir(path)
    return exists_as_directory
11 |
12 |
def run_command(txt):
    """
    Run the command string `txt` through the shell and wait for it to finish.

    Returns
    -------
    subprocess.CompletedProcess
        The finished process; check `.returncode` to detect failures, which
        were previously discarded without any way to observe them.
    """
    # NOTE(review): `shell=True` hands `txt` to the shell verbatim; only pass
    # trusted command strings.
    return subprocess.run(txt, shell=True)
15 |
16 |
def create_dir(path):
    """Create directory `path` and any missing parents; do nothing if it already exists."""
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)
19 |
20 |
def write_to_json(lst, fn):
    """
    Write every item of `lst` to `fn` as one JSON document per line.

    Items are serialized with `ensure_ascii=False`, so non-ASCII characters
    are written as-is; the file is therefore opened explicitly as UTF-8
    instead of relying on the platform's default encoding. An existing file
    is overwritten.
    """
    with open(fn, "w", encoding="utf-8") as file:
        for item in tqdm(lst):
            file.write(json.dumps(item, ensure_ascii=False) + "\n")
26 |
27 |
# HTTP 4xx/5xx status lines in "<code> <reason phrase>" form.
http_errors = [
    "400 Bad Request",
    "401 Unauthorized",
    "402 Payment Required",
    "403 Forbidden",
    "404 Not Found",
    "405 Method Not Allowed",
    "406 Not Acceptable",
    "407 Proxy Authentication Required",
    "408 Request Timeout",
    "409 Conflict",
    "410 Gone",
    "411 Length Required",
    "412 Precondition Failed",
    "413 Payload Too Large",
    "414 URI Too Long",
    "415 Unsupported Media Type",
    "416 Range Not Satisfiable",
    "417 Expectation Failed",
    "418 I'm a teapot",
    "421 Misdirected Request",
    "422 Unprocessable Entity",
    "423 Locked",
    "424 Failed Dependency",
    "425 Too Early",
    "426 Upgrade Required",
    "428 Precondition Required",
    "429 Too Many Requests",
    "431 Request Header Fields Too Large",
    "451 Unavailable For Legal Reasons",
    "500 Internal Server Error",
    "501 Not Implemented",
    "502 Bad Gateway",
    "503 Service Unavailable",
    "504 Gateway Timeout",
    "505 HTTP Version Not Supported",
    "506 Variant Also Negotiates",
    "507 Insufficient Storage",
    "508 Loop Detected",
    "510 Not Extended",
    "511 Network Authentication Required",
]

# Substrings whose presence makes `reject` discard a text.
rejected = [
    "Internal Server Error",
    "__NOEDITSECTION__",
    "enter your username and password",
    "forgotten your password",
    "cookies enabled",
    "enable JavaScript in your browser.",
    "The page cannot be displayed",
    "site or edit the error_page",
]

# Every HTTP error status line is a rejection marker as well.
rejected.extend(http_errors)
83 |
84 |
def replace_multiple(input_string, pattern=r"\s{6,}", replace=" "):
    """
    Replace every match of the regular expression `pattern` in `input_string`.

    With the defaults, each run of six or more whitespace characters is
    replaced by `replace`.
    """
    compiled = re.compile(pattern)
    return compiled.sub(replace, input_string)
87 |
88 |
def replace(string):
    """
    Normalize ellipses, long whitespace runs and long dot runs in `string`.

    The single-character ellipsis is turned into a period, runs of six or more
    whitespace characters are collapsed by `replace_multiple`'s default, and
    runs of six or more periods are shortened to "...".
    """
    without_ellipsis = string.replace("…", ".")
    collapsed = replace_multiple(without_ellipsis)
    return replace_multiple(collapsed, pattern=r"\.{6,}", replace="...")
93 |
94 |
def reject(string):
    """Return True when `string` contains any entry of `rejected`, else False."""
    for marker in rejected:
        if marker in string:
            return True
    return False
99 |
100 |
def loop(files, process_type="multi"):
    """
    Post-process deduplicated JSONL files into the `postprocessing/` folder.

    For every input file, each line is parsed as JSON and its `text` field is
    dropped when `reject` matches it, otherwise stripped and cleaned by
    `replace`; cleaned texts shorter than 3 characters are dropped too. Each
    surviving text is written as one JSON string per line. Once a file is
    finished, a marker file is written under `postprocessing-done/`, and files
    whose marker already exists are skipped.

    Parameters
    ----------
    files : list or tuple
        When `process_type` is "multi", a 2-tuple whose first element is the
        list of input paths; otherwise the list of input paths itself.
    process_type : str
        "multi" to unpack `files` as described above.
    """
    if process_type == "multi":
        files, _ = files

    for f in files:
        new_f = f.replace("dedupe-datasets/", "postprocessing/")
        new_f_done = f.replace("dedupe-datasets/", "postprocessing-done/")
        # NOTE(review): a path without "dedupe-datasets/" makes the marker path
        # equal to `f` itself, so such a file is silently skipped here.
        if os.path.exists(new_f_done):
            continue
        # Open the input first so a missing input does not leave an empty
        # output behind, and read/write UTF-8 explicitly instead of relying on
        # the platform's default encoding. Output is flushed when the file is
        # closed rather than after every line.
        with open(f, encoding="utf-8") as fopen, open(new_f, "w", encoding="utf-8") as fopen_l:
            for l in tqdm(fopen):
                data = json.loads(l)

                if reject(data["text"]):
                    continue

                data = replace(data["text"].strip())

                if len(data) < 3:
                    continue

                fopen_l.write(f"{json.dumps(data)}\n")

        with open(new_f_done, "w") as fopen:
            fopen.write("done")
128 |
--------------------------------------------------------------------------------
/speech-instructions/prepare-malaysia-parliament.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "3c883f91",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# !wget https://huggingface.co/Zyphra/Zonos-v0.1-speaker-embedding/resolve/main/ResNet293_SimAM_ASP_base.pt"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "bf4cd179",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import os\n",
21 | "\n",
22 | "os.environ['CUDA_VISIBLE_DEVICES'] = '1'\n",
23 | "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 10,
29 | "id": "1a51f3c2",
30 | "metadata": {},
31 | "outputs": [
32 | {
33 | "data": {
34 | "text/plain": [
35 | "{'idx': 0, 'prev_idx': -1}"
36 | ]
37 | },
38 | "execution_count": 10,
39 | "metadata": {},
40 | "output_type": "execute_result"
41 | }
42 | ],
43 | "source": [
44 | "import torch\n",
45 | "\n",
46 | "available_gpus = [torch.cuda.device(i) for i in range(torch.cuda.device_count())]\n",
47 | "available_gpus[0].__dict__"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 50,
53 | "id": "1ac7cbf3",
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "import pandas as pd\n",
58 | "import numpy as np\n",
59 | "from tqdm import tqdm\n",
60 | "import torchaudio\n",
61 | "from speaker_cloning import SpeakerEmbedding"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 42,
67 | "id": "56d3111e",
68 | "metadata": {
69 | "scrolled": true
70 | },
71 | "outputs": [
72 | {
73 | "name": "stderr",
74 | "output_type": "stream",
75 | "text": [
76 | "100%|███████████████████████████████████████████████████████████████████████████| 2033890/2033890 [00:13<00:00, 146206.60it/s]\n",
77 | "100%|██████████████████████████████████████████████████████████████████████████████| 191545/191545 [00:02<00:00, 76849.55it/s]\n"
78 | ]
79 | },
80 | {
81 | "data": {
82 | "text/plain": [
83 | "636921"
84 | ]
85 | },
86 | "execution_count": 42,
87 | "metadata": {},
88 | "output_type": "execute_result"
89 | }
90 | ],
91 | "source": [
92 | "df = pd.read_parquet('/home/husein/ssd4/verify-text.parquet')\n",
93 | "filtered = []\n",
94 | "for i in tqdm(range(len(df))):\n",
95 | " if 'parlimen-24k' in df['audio'].iloc[i]:\n",
96 | " row = df.iloc[i].to_dict()\n",
97 | " f = os.path.join('/home/husein/ssd4/', row['audio'])\n",
98 | " row['audio'] = f\n",
99 | " filtered.append(row)\n",
100 | " \n",
101 | "df = pd.read_parquet('/home/husein/ssd4/verify-text-chunk-parliament.parquet')\n",
102 | "for i in tqdm(range(len(df))):\n",
103 | " row = df.iloc[i].to_dict()\n",
104 | " f = os.path.join('/home/husein/ssd4/', row['audio'])\n",
105 | " row['audio'] = f\n",
106 | " filtered.append(row)\n",
107 | " \n",
108 | "len(filtered)"
109 | ]
110 | }
111 | ],
112 | "metadata": {
113 | "kernelspec": {
114 | "display_name": "python3.10",
115 | "language": "python",
116 | "name": "python3.10"
117 | },
118 | "language_info": {
119 | "codemirror_mode": {
120 | "name": "ipython",
121 | "version": 3
122 | },
123 | "file_extension": ".py",
124 | "mimetype": "text/x-python",
125 | "name": "python",
126 | "nbconvert_exporter": "python",
127 | "pygments_lexer": "ipython3",
128 | "version": "3.10.15"
129 | }
130 | },
131 | "nbformat": 4,
132 | "nbformat_minor": 5
133 | }
134 |
--------------------------------------------------------------------------------
/text/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
--------------------------------------------------------------------------------
/text/madlad-400-ms/prepare-madlad-400-ms.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "20d4b02f",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "from datasets import load_dataset"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "id": "3fb192ea",
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "name": "stderr",
21 | "output_type": "stream",
22 | "text": [
23 | "/home/ubuntu/.local/lib/python3.10/site-packages/datasets/table.py:1421: FutureWarning: promote has been superseded by mode='default'.\n",
24 | " table = cls._concat_blocks(blocks, axis=0)\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "madlad_multilang = load_dataset(\"allenai/madlad-400\", languages=[\"ms\", 'ms_Arab_BN', 'ms_Arab'])"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "id": "f9c4b242",
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "data": {
40 | "text/plain": [
41 | "2"
42 | ]
43 | },
44 | "execution_count": 3,
45 | "metadata": {},
46 | "output_type": "execute_result"
47 | }
48 | ],
49 | "source": [
50 | "len(madlad_multilang)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 4,
56 | "id": "4664c3e2",
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/plain": [
62 | "Dataset({\n",
63 | " features: ['text'],\n",
64 | " num_rows: 2337781\n",
65 | "})"
66 | ]
67 | },
68 | "execution_count": 4,
69 | "metadata": {},
70 | "output_type": "execute_result"
71 | }
72 | ],
73 | "source": [
74 | "madlad_multilang['clean']"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 5,
80 | "id": "69bca9f5",
81 | "metadata": {
82 | "scrolled": true
83 | },
84 | "outputs": [
85 | {
86 | "data": {
87 | "text/plain": [
88 | "Dataset({\n",
89 | " features: ['text'],\n",
90 | " num_rows: 14112025\n",
91 | "})"
92 | ]
93 | },
94 | "execution_count": 5,
95 | "metadata": {},
96 | "output_type": "execute_result"
97 | }
98 | ],
99 | "source": [
100 | "madlad_multilang['noisy']"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 6,
106 | "id": "6c42fc18",
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "import json\n",
111 | "from tqdm import tqdm"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 7,
117 | "id": "00ea83da",
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "name": "stderr",
122 | "output_type": "stream",
123 | "text": [
124 | "100%|██████████| 14112025/14112025 [28:36<00:00, 8220.12it/s] \n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "with open('madlad-400-ms.jsonl', 'w') as fopen:\n",
130 | " for i in tqdm(range(len(madlad_multilang['noisy']))):\n",
131 | " t = madlad_multilang['noisy'][i]\n",
132 | " fopen.write(f'{json.dumps(t)}\\n')"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "id": "66b79ffc",
139 | "metadata": {},
140 | "outputs": [],
141 | "source": []
142 | }
143 | ],
144 | "metadata": {
145 | "kernelspec": {
146 | "display_name": "Python 3 (ipykernel)",
147 | "language": "python",
148 | "name": "python3"
149 | },
150 | "language_info": {
151 | "codemirror_mode": {
152 | "name": "ipython",
153 | "version": 3
154 | },
155 | "file_extension": ".py",
156 | "mimetype": "text/x-python",
157 | "name": "python",
158 | "nbconvert_exporter": "python",
159 | "pygments_lexer": "ipython3",
160 | "version": "3.10.12"
161 | }
162 | },
163 | "nbformat": 4,
164 | "nbformat_minor": 5
165 | }
166 |
--------------------------------------------------------------------------------
/emilia-yodas/convert_neucodec_emilia.py:
--------------------------------------------------------------------------------
1 | import os
2 | import soundfile as sf
3 | import json
4 | import click
5 | import re
6 | import pandas as pd
7 | import librosa
8 | from glob import glob
9 | from functools import partial
10 | from multiprocess import Pool
11 | from tqdm import tqdm
12 | import numpy as np
13 | import itertools
14 |
def old_chunks(l, n):
    """
    Cut `l` into consecutive slices of at most `n` items.

    Yields `(slice, position)` tuples where `position` is the zero-based
    ordinal of the slice. The last slice may hold fewer than `n` items.
    """
    for position, offset in enumerate(range(0, len(l), n)):
        yield (l[offset: offset + n], position)
18 |
def chunks(l, devices):
    """
    Deal `l` out as one contiguous slice per entry of `devices`.

    Slice sizes differ by at most one item; the earliest slices absorb the
    remainder. Yields `(slice, device)` tuples in the order of `devices`.
    """
    base, leftover = divmod(len(l), len(devices))
    cursor = 0
    for position, device in enumerate(devices):
        stop = cursor + base + (1 if position < leftover else 0)
        yield (l[cursor:stop], device)
        cursor = stop
28 |
def new_path(f):
    """
    Derive the token-file path that belongs to the audio path `f`.

    The first path component receives a `_neucodec` suffix and every
    `.mp3` / `.wav` occurrence in the resulting path becomes `.json`,
    e.g. `data/sub/a.mp3` -> `data_neucodec/sub/a.json`.
    """
    top, _, rest = f.partition('/')
    target = os.path.join(top + '_neucodec', rest)
    for extension in ('.mp3', '.wav'):
        target = target.replace(extension, '.json')
    return target
36 |
def multiprocessing(strings, function, cores=6, returned=True):
    """
    Apply `function` to slices of `strings` in a pool of worker processes.

    `strings` is cut with `old_chunks`, so `function` is called with
    `(slice, slice_index)` tuples and must return an iterable.

    Parameters
    ----------
    strings: sliceable sequence with a length.
    function: callable applied to every `(slice, slice_index)` tuple.
    cores: number of worker processes; also used to derive the slice size.
    returned: when truthy, the per-slice results are flattened into one list.

    Returns
    -------
    The flattened list of results when `returned` is truthy, else None.
    """
    # Floor at 1: with fewer items than cores the integer division yields 0,
    # and `range()` inside `old_chunks` rejects a zero step.
    chunk_size = max(1, len(strings) // cores)
    df_split = old_chunks(strings, chunk_size)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Release the workers even when one of the tasks raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))
46 |
def check(files):
    """
    Return the inputs of one slice that still have to be converted.

    `files` is a `(paths, slice_index)` tuple as produced by `old_chunks`;
    the index is ignored. A path counts as finished only when the file named
    by `new_path` exists and parses as JSON.
    """
    paths, _ = files
    pending = []
    for path in tqdm(paths):
        done_path = new_path(path)

        if os.path.exists(done_path):
            try:
                with open(done_path) as fopen:
                    json.load(fopen)
            except Exception:
                # Unreadable or truncated output: queue the file again.
                # `Exception` instead of a bare `except` so that Ctrl-C and
                # SystemExit still stop the worker.
                pass
            else:
                continue

        pending.append(path)
    return pending
63 |
def loop(
    indices_device_pair,
):
    """
    Encode every audio file of one work slice with NeuCodec.

    `indices_device_pair` is a `(files, device)` tuple. For each file the
    codes are written as JSON to the path given by `new_path`; files whose
    output already exists and parses as JSON are skipped. An error on a
    single file is printed and the loop moves on to the next file.
    """
    files, device = indices_device_pair
    # Set before torch is imported below -- presumably so that this worker
    # only sees its assigned GPU; TODO confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device)

    from neucodec import NeuCodec
    # NOTE(review): torchaudio is not referenced anywhere in this function.
    import torchaudio
    import torch
    # Inference only: switch off gradient tracking.
    torch.autograd.set_grad_enabled(False)

    model = NeuCodec.from_pretrained("neuphonic/neucodec")
    model.eval().cuda()

    for f in tqdm(files):
        filename = new_path(f)
        # Skip files that already have a parseable output; an unreadable
        # output falls through and is regenerated.
        if os.path.exists(filename):
            try:
                with open(filename) as fopen:
                    json.load(fopen)
                    continue
            except:
                pass

        try:
            # Audio is loaded resampled to 16 kHz.
            y, sr = librosa.load(f, sr = 16000)
            # Two unsqueeze calls add two leading axes in front of the samples.
            wav_tensor = torch.from_numpy(y).float().unsqueeze(0)
            fsq_codes = model.encode_code(wav_tensor.unsqueeze(1))
            # Keep the codes at index [0, 0] as a plain Python list.
            tokens = fsq_codes[0, 0].tolist()

            os.makedirs(os.path.split(filename)[0], exist_ok = True)
            with open(filename, 'w') as fopen:
                json.dump(tokens, fopen)
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(e)
99 |
@click.command()
@click.option('--file', required = True)
@click.option('--replication', default = 1)
def main(
    file,
    replication,
):
    """
    Convert the audio files listed in a JSON file into NeuCodec codes.

    --file: JSON file holding a list of audio paths.
    --replication: number of worker processes started per device.
    """
    devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if devices is None:
        # torch is only needed here to count the available GPUs.
        import torch
        devices = list(range(torch.cuda.device_count()))
    else:
        # Drop empty entries so an empty variable does not yield `['']`.
        devices = [d.strip() for d in devices.split(',') if d.strip()]

    devices = replication * devices
    print(devices)

    if not devices:
        # Without this guard `chunks` fails with a ZeroDivisionError.
        raise click.ClickException(
            'no devices to run on, check CUDA_VISIBLE_DEVICES and --replication'
        )

    with open(file) as fopen:
        files = json.load(fopen)

    # Drop the inputs that already have a valid output file.
    filtered = multiprocessing(files, check, 30)

    print(len(files), len(filtered))

    # One slice of the remaining work per (replicated) device.
    df_split = list(chunks(filtered, devices))

    with Pool(len(devices)) as pool:
        pool.map(loop, df_split)


if __name__ == '__main__':
    main()
133 |
134 |
--------------------------------------------------------------------------------
/text/madlad-400-ms/dedup-madlad-400-ms.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "69b786dc",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import json\n",
11 | "import pandas as pd\n",
12 | "from tqdm import tqdm\n",
13 | "from datasets import Dataset\n",
14 | "from bs4 import BeautifulSoup"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "id": "19269eb1",
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "with open('madlad-400-ms.postprocessing.jsonl', 'w') as fopen_l:\n",
25 | " with open('/home/ubuntu/madlad-400-ms.postprocessing.jsonl') as fopen:\n",
26 | " for l in tqdm(fopen):\n",
27 | " l = json.loads(l)\n",
28 | " d = {\n",
29 | " 'text': l\n",
30 | " }\n",
31 | " fopen_l.write(f'{json.dumps(d)}\\n')"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "id": "91f9ef48",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "!head -n 10000 madlad-400-ms.postprocessing.jsonl > test.jsonl"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "id": "eb166831",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "from datasets import load_dataset\n",
52 | "dataset = load_dataset(\"json\", data_files=\"madlad-400-ms.postprocessing.jsonl\", split = 'train')"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "id": "b93732c2",
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "dataset.save_to_disk(f\"hf-datasets/raw-datasets/madlad-400-ms\")"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 2,
68 | "id": "f84f69d6",
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "'python3 -m text_dedup.minhash --path hf-datasets/raw-datasets/madlad-400-ms --split train --cache_dir ./cache --output hf-datasets/dedupe-datasets/madlad-400-ms --column text --batch_size 1000 --threshold 0.95 --min_length 1 --local'"
75 | ]
76 | },
77 | "execution_count": 2,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "command = f\"python3 -m text_dedup.minhash \\\n",
84 | " --path hf-datasets/raw-datasets/madlad-400-ms \\\n",
85 | " --split train \\\n",
86 | " --cache_dir ./cache \\\n",
87 | " --output hf-datasets/dedupe-datasets/madlad-400-ms \\\n",
88 | " --column text \\\n",
89 | " --batch_size 1000 \\\n",
90 | " --threshold 0.95 \\\n",
91 | " --min_length 1 \\\n",
92 | " --local\"\n",
93 | "\n",
94 | "command"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 5,
100 | "id": "4c337e35",
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "# import subprocess\n",
105 | "# subprocess.run(command, shell=True)"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 11,
111 | "id": "8b0099e9",
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "!rm -rf hf-datasets/dedupe-datasets/madlad-400-ms"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "id": "dc6a64ba",
122 | "metadata": {},
123 | "outputs": [],
124 | "source": []
125 | }
126 | ],
127 | "metadata": {
128 | "kernelspec": {
129 | "display_name": "Python 3 (ipykernel)",
130 | "language": "python",
131 | "name": "python3"
132 | },
133 | "language_info": {
134 | "codemirror_mode": {
135 | "name": "ipython",
136 | "version": 3
137 | },
138 | "file_extension": ".py",
139 | "mimetype": "text/x-python",
140 | "name": "python",
141 | "nbconvert_exporter": "python",
142 | "pygments_lexer": "ipython3",
143 | "version": "3.10.12"
144 | }
145 | },
146 | "nbformat": 4,
147 | "nbformat_minor": 5
148 | }
149 |
--------------------------------------------------------------------------------
/multilingual-tts/convert_neucodec.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | os.environ['OMP_NUM_THREADS'] = '1'
4 | os.environ['OPENBLAS_NUM_THREADS'] = '1'
5 |
6 | import soundfile as sf
7 | import json
8 | import click
9 | import re
10 | import librosa
11 | from glob import glob
12 | from functools import partial
13 | from multiprocess import Pool
14 | from tqdm import tqdm
15 | import numpy as np
16 | import itertools
17 |
def old_chunks(l, n):
    """
    Cut `l` into consecutive slices of at most `n` items.

    Yields `(slice, position)` tuples where `position` is the zero-based
    ordinal of the slice. The last slice may hold fewer than `n` items.
    """
    for position, offset in enumerate(range(0, len(l), n)):
        yield (l[offset: offset + n], position)
21 |
def chunks(l, devices):
    """
    Deal `l` out as one contiguous slice per entry of `devices`.

    Slice sizes differ by at most one item; the earliest slices absorb the
    remainder. Yields `(slice, device)` tuples in the order of `devices`.
    """
    base, leftover = divmod(len(l), len(devices))
    cursor = 0
    for position, device in enumerate(devices):
        stop = cursor + base + (1 if position < leftover else 0)
        yield (l[cursor:stop], device)
        cursor = stop
31 |
def new_path(f):
    """
    Derive the token-file path that belongs to the audio path `f`.

    The first path component receives a `_neucodec` suffix and every
    `.mp3` / `.wav` occurrence in the resulting path becomes `.json`,
    e.g. `data/sub/a.mp3` -> `data_neucodec/sub/a.json`.
    """
    top, _, rest = f.partition('/')
    target = os.path.join(top + '_neucodec', rest)
    for extension in ('.mp3', '.wav'):
        target = target.replace(extension, '.json')
    return target
39 |
def multiprocessing(strings, function, cores=6, returned=True):
    """
    Apply `function` to slices of `strings` in a pool of worker processes.

    `strings` is cut with `old_chunks`, so `function` is called with
    `(slice, slice_index)` tuples and must return an iterable.

    Parameters
    ----------
    strings: sliceable sequence with a length.
    function: callable applied to every `(slice, slice_index)` tuple.
    cores: number of worker processes; also used to derive the slice size.
    returned: when truthy, the per-slice results are flattened into one list.

    Returns
    -------
    The flattened list of results when `returned` is truthy, else None.
    """
    # Floor at 1: with fewer items than cores the integer division yields 0,
    # and `range()` inside `old_chunks` rejects a zero step.
    chunk_size = max(1, len(strings) // cores)
    df_split = old_chunks(strings, chunk_size)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Release the workers even when one of the tasks raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))
49 |
def check(files):
    """
    Return the inputs of one slice that still have to be converted.

    `files` is a `(paths, slice_index)` tuple as produced by `old_chunks`;
    the index is ignored. A path counts as finished only when the file named
    by `new_path` exists and parses as JSON.
    """
    paths, _ = files
    pending = []
    for path in tqdm(paths):
        done_path = new_path(path)

        if os.path.exists(done_path):
            try:
                with open(done_path) as fopen:
                    json.load(fopen)
            except Exception:
                # Unreadable or truncated output: queue the file again.
                # `Exception` instead of a bare `except` so that Ctrl-C and
                # SystemExit still stop the worker.
                pass
            else:
                continue

        pending.append(path)
    return pending
66 |
def loop(
    indices_device_pair,
):
    """
    Encode every audio file of one work slice with NeuCodec.

    `indices_device_pair` is a `(files, device)` tuple. For each file the
    codes are written as JSON to the path given by `new_path`; files whose
    output already exists and parses as JSON are skipped, and so are clips
    longer than 20 seconds. An error on a single file is printed and the
    loop moves on to the next file.
    """
    files, device = indices_device_pair
    # Set before torch is imported below -- presumably so that this worker
    # only sees its assigned GPU; TODO confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device)

    from neucodec import NeuCodec
    # NOTE(review): torchaudio is not referenced anywhere in this function.
    import torchaudio
    import torch
    # Inference only: switch off gradient tracking.
    torch.autograd.set_grad_enabled(False)

    model = NeuCodec.from_pretrained("neuphonic/neucodec")
    model.eval().cuda()

    for f in tqdm(files):
        filename = new_path(f)
        # Skip files that already have a parseable output; an unreadable
        # output falls through and is regenerated.
        if os.path.exists(filename):
            try:
                with open(filename) as fopen:
                    json.load(fopen)
                    continue
            except:
                pass

        try:
            # Audio is loaded resampled to 16 kHz.
            y, sr = librosa.load(f, sr = 16000)
            # Clips longer than 20 seconds are skipped. No output file is
            # written for them, so they are loaded again on every run.
            if len(y) / sr > 20:
                continue
            # Two unsqueeze calls add two leading axes in front of the samples.
            wav_tensor = torch.from_numpy(y).float().unsqueeze(0)
            fsq_codes = model.encode_code(wav_tensor.unsqueeze(1))
            # Keep the codes at index [0, 0] as a plain Python list.
            tokens = fsq_codes[0, 0].tolist()

            os.makedirs(os.path.split(filename)[0], exist_ok = True)
            with open(filename, 'w') as fopen:
                json.dump(tokens, fopen)
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(e)
104 |
@click.command()
@click.option('--file', required = True)
@click.option('--replication', default = 1)
def main(
    file,
    replication,
):
    """
    Convert the audio files listed in a JSON file into NeuCodec codes.

    --file: JSON file holding a list of audio paths.
    --replication: number of worker processes started per device.
    """
    devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    if devices is None:
        # torch is only needed here to count the available GPUs.
        import torch
        devices = list(range(torch.cuda.device_count()))
    else:
        # Drop empty entries so an empty variable does not yield `['']`.
        devices = [d.strip() for d in devices.split(',') if d.strip()]

    devices = replication * devices
    print(devices)

    if not devices:
        # Without this guard `chunks` fails with a ZeroDivisionError.
        raise click.ClickException(
            'no devices to run on, check CUDA_VISIBLE_DEVICES and --replication'
        )

    with open(file) as fopen:
        files = json.load(fopen)

    # Drop the inputs that already have a valid output file.
    filtered = multiprocessing(files, check, 30)

    print(len(files), len(filtered))

    # One slice of the remaining work per (replicated) device.
    df_split = list(chunks(filtered, devices))

    with Pool(len(devices)) as pool:
        pool.map(loop, df_split)


if __name__ == '__main__':
    main()
138 |
139 |
140 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # UV
98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | #uv.lock
102 |
103 | # poetry
104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | # This is especially recommended for binary packages to ensure reproducibility, and is more
106 | # commonly ignored for libraries.
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 |
110 | # pdm
111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112 | #pdm.lock
113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114 | # in version control.
115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116 | .pdm.toml
117 | .pdm-python
118 | .pdm-build/
119 |
120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 |
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 |
127 | # SageMath parsed files
128 | *.sage.py
129 |
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 |
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 |
143 | # Rope project settings
144 | .ropeproject
145 |
146 | # mkdocs documentation
147 | /site
148 |
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 |
154 | # Pyre type checker
155 | .pyre/
156 |
157 | # pytype static type analyzer
158 | .pytype/
159 |
160 | # Cython debug symbols
161 | cython_debug/
162 |
163 | # PyCharm
164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | # and can be added to the global gitignore or merged into this file. For a more nuclear
167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 |
170 | # PyPI configuration file
171 | .pypirc
172 | mp.py
173 | *Untitled*.ipynb
174 | malaysian_sft.py
175 | speech-instructions-extra/*audio
176 | speech-instructions-extra/*-Train
177 | *.parquet
--------------------------------------------------------------------------------
/text/pretrain-llm/prepare-starcoder.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "453a2552",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import json\n",
11 | "import os\n",
12 | "from glob import glob\n",
13 | "from tqdm import tqdm"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "id": "36e657c0",
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "def partition(text, size = 500):\n",
24 | " splitted = text.split()\n",
25 | " return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "id": "cab2dbd7",
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "a = open('prepare-starcoder.jsonl', 'w')"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 4,
41 | "id": "06167a66",
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/plain": [
47 | "['starcoder/starcoder/c.jsonl',\n",
48 | " 'starcoder/starcoder/cpp.jsonl',\n",
49 | " 'starcoder/starcoder/css.jsonl',\n",
50 | " 'starcoder/starcoder/go.jsonl',\n",
51 | " 'starcoder/starcoder/html.jsonl',\n",
52 | " 'starcoder/starcoder/java.jsonl',\n",
53 | " 'starcoder/starcoder/javascript.jsonl',\n",
54 | " 'starcoder/starcoder/julia.jsonl',\n",
55 | " 'starcoder/starcoder/markdown.jsonl',\n",
56 | " 'starcoder/starcoder/python.jsonl',\n",
57 | " 'starcoder/starcoder/r.jsonl',\n",
58 | " 'starcoder/starcoder/rust.jsonl',\n",
59 | " 'starcoder/starcoder/sql.jsonl']"
60 | ]
61 | },
62 | "execution_count": 4,
63 | "metadata": {},
64 | "output_type": "execute_result"
65 | }
66 | ],
67 | "source": [
68 | "glob('starcoder/starcoder/*.jsonl')"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 5,
74 | "id": "23d113a4",
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "name": "stderr",
79 | "output_type": "stream",
80 | "text": [
81 | "1610858it [06:54, 3884.72it/s]\n",
82 | "1314195it [06:08, 3567.05it/s]\n",
83 | "2293654it [06:23, 5987.63it/s]\n",
84 | "1928334it [06:33, 4901.64it/s]\n",
85 | "60451it [00:19, 2792.18it/s]IOPub message rate exceeded.\n",
86 | "The notebook server will temporarily stop sending output\n",
87 | "to the client in order to avoid crashing it.\n",
88 | "To change this limit, set the config variable\n",
89 | "`--NotebookApp.iopub_msg_rate_limit`.\n",
90 | "\n",
91 | "Current values:\n",
92 | "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
93 | "NotebookApp.rate_limit_window=3.0 (secs)\n",
94 | "\n"
95 | ]
96 | }
97 | ],
98 | "source": [
99 | "files = glob('starcoder/starcoder/*.jsonl')\n",
100 | "for f in files:\n",
101 | " with open(f) as fopen:\n",
102 | " for l in tqdm(fopen):\n",
103 | " try:\n",
104 | " data = '' + json.loads(l) + ''\n",
105 | " partitioned = partition(data)\n",
106 | " for p in partitioned:\n",
107 | " data = {\n",
108 | " 'text': p,\n",
109 | " }\n",
110 | " a.write(f'{json.dumps(data)}\\n')\n",
111 | " a.flush()\n",
112 | " except Exception as e:\n",
113 | " print(e)\n",
114 | " pass"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "id": "0c1a228e",
121 | "metadata": {},
122 | "outputs": [],
123 | "source": []
124 | }
125 | ],
126 | "metadata": {
127 | "kernelspec": {
128 | "display_name": "Python 3 (ipykernel)",
129 | "language": "python",
130 | "name": "python3"
131 | },
132 | "language_info": {
133 | "codemirror_mode": {
134 | "name": "ipython",
135 | "version": 3
136 | },
137 | "file_extension": ".py",
138 | "mimetype": "text/x-python",
139 | "name": "python",
140 | "nbconvert_exporter": "python",
141 | "pygments_lexer": "ipython3",
142 | "version": "3.10.12"
143 | }
144 | },
145 | "nbformat": 4,
146 | "nbformat_minor": 5
147 | }
148 |
--------------------------------------------------------------------------------
/multilingual-tts/trim_silence.py:
--------------------------------------------------------------------------------
1 | import os
2 | import soundfile as sf
3 | import librosa
4 | import json
5 | import click
6 | import numpy as np
7 | import malaya_speech
8 | from glob import glob
9 | from functools import partial
10 | from multiprocess import Pool
11 | from tqdm import tqdm
12 |
def chunks(l, devices):
    """
    Deal `l` out as one contiguous slice per entry of `devices`.

    Slice sizes differ by at most one item; the earliest slices absorb the
    remainder. Yields `(slice, device)` tuples in the order of `devices`.
    """
    base, leftover = divmod(len(l), len(devices))
    cursor = 0
    for position, device in enumerate(devices):
        stop = cursor + base + (1 if position < leftover else 0)
        yield (l[cursor:stop], device)
        cursor = stop
22 |
def new_path(f):
    """
    Return `f` with `_trim` appended to its first path component,
    e.g. `data/sub/a.wav` -> `data_trim/sub/a.wav`.
    """
    top, separator, rest = f.partition('/')
    return f'{top}_trim{separator}{rest}'
28 |
def new_path_done(f):
    """
    Return `f` with `_trim_done` appended to its first path component,
    e.g. `data/sub/a.wav` -> `data_trim_done/sub/a.wav`.
    """
    top, separator, rest = f.partition('/')
    return f'{top}_trim_done{separator}{rest}'
34 |
def loop(indices_device_pair):
    """
    Shorten the silent stretches of every audio file in one work slice.

    `indices_device_pair` is a `(files, device)` tuple; `device` is not used.
    The trimmed audio is written to `new_path(file)` and a JSON marker to
    `new_path_done(file)`. Files whose marker already parses as JSON are
    skipped. An error on a single file is printed and the loop continues.
    """
    files, device = indices_device_pair

    # NOTE(review): this instance is replaced by a fresh one for every file
    # inside the loop below.
    vad = malaya_speech.vad.webrtc(minimum_amplitude = 0)
    # Duration threshold compared against `.duration` of a silent group;
    # presumably seconds -- TODO confirm.
    min_length = 0.4

    for file in tqdm(files):
        # NOTE(review): `folder` and `folder_folder` are not used afterwards.
        folder = os.path.split(file)[0]
        folder_folder = os.path.split(folder)[1]
        f_new = new_path(file)
        filename_done = new_path_done(file)

        # A readable marker means the file was already processed. Any error
        # (missing or corrupt marker) falls through to processing.
        try:
            with open(filename_done) as fopen:
                json.load(fopen)
                continue
        except:
            pass

        try:
            vad = malaya_speech.vad.webrtc(minimum_amplitude = 0)
            # sr = None keeps the file's own sample rate.
            y, sr = librosa.load(file, sr = None)
            # Number of samples kept from leading / trailing silence (0.3 * sr).
            start_silent_trail = int(0.3 * sr)
            # Number of samples kept on each side of a long inner silence.
            middle_silent_trail = int(min_length * sr / 2)
            # NOTE(review): bare expression with no effect.
            middle_silent_trail, start_silent_trail
            # A 16 kHz integer copy is built for the VAD; the frames that end
            # up in the output come from the original signal `y`.
            y_= malaya_speech.resample(y, sr, 16000)
            y_ = malaya_speech.astype.float_to_int(y_)
            frames = malaya_speech.generator.frames(y, 30, sr)
            frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
            # Pair each original-rate frame with the VAD result of the
            # 16 kHz frame at the same index.
            frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
            grouped_deep = malaya_speech.group.group_frames(frames_webrtc)
            r = []
            for no, g in enumerate(grouped_deep):
                # g[1] is the VAD flag of the group; truthy groups are kept whole.
                if g[1]:
                    g = g[0].array
                else:
                    if no == 0:
                        # Leading group: keep only its last samples.
                        g = g[0].array[-start_silent_trail:]
                    elif no == (len(grouped_deep) - 1):
                        # Trailing group: keep only its first samples.
                        g = g[0].array[:start_silent_trail]
                    else:
                        # Inner group: cut the middle out when it is long
                        # enough, otherwise keep it unchanged.
                        if g[0].duration >= min_length:
                            g = [g[0].array[:middle_silent_trail], g[0].array[-middle_silent_trail:]]
                            g = np.concatenate(g)
                        else:
                            g = g[0].array

                r.append(g)
            y_after = np.concatenate(r)

            os.makedirs(os.path.split(f_new)[0], exist_ok = True)
            sf.write(f_new, y_after, sr)
            # The marker is written only after the audio file was saved.
            os.makedirs(os.path.split(filename_done)[0], exist_ok = True)
            with open(filename_done, 'w') as fopen:
                json.dump('done', fopen)

        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(e)
93 |
@click.command()
@click.option('--file', required = True)
@click.option('--replication', default = 1)
def main(
    file,
    replication,
):
    """
    Trim silence from the audio files listed in a JSON file.

    --file: JSON file holding a list of audio paths.
    --replication: number of worker processes.
    """
    if replication < 1:
        # Otherwise `chunks` fails with a ZeroDivisionError on an empty
        # worker list.
        raise click.BadParameter('must be at least 1', param_hint = '--replication')

    # One placeholder entry per worker; `loop` ignores the value.
    devices = replication * [0]

    with open(file) as fopen:
        files = json.load(fopen)

    # Keep only the inputs that have no valid "done" marker yet.
    filtered = []
    for path in tqdm(files):
        filename_done = new_path_done(path)

        if os.path.exists(filename_done):
            try:
                with open(filename_done) as fopen:
                    json.load(fopen)
            except Exception:
                # Corrupt marker: process the file again. `Exception`
                # instead of a bare `except` so Ctrl-C still interrupts.
                pass
            else:
                continue
        filtered.append(path)

    df_split = list(chunks(filtered, devices))

    with Pool(len(devices)) as pool:
        pool.map(loop, df_split)


if __name__ == '__main__':
    main()
127 |
128 |
--------------------------------------------------------------------------------
/text/pretrain-llm/prepare-translation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "e6328ada",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# !git lfs clone https://huggingface.co/datasets/mesolitica/google-translate-ms-pa\n",
11 | "# !git lfs clone https://huggingface.co/datasets/mesolitica/google-translate-ms-zh-CN\n",
12 | "# !git lfs clone https://huggingface.co/datasets/mesolitica/google-translate-ms-ta"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 1,
18 | "id": "c6bfd5aa",
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import json\n",
23 | "import os\n",
24 | "from glob import glob\n",
25 | "from tqdm import tqdm"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "id": "1e6d9ca8",
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "def partition(text, size = 500):\n",
36 | " splitted = text.split()\n",
37 | " return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "id": "c2e1d2bc",
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "a = open('prepare-translation.jsonl', 'w')"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 4,
53 | "id": "15ff7cfe",
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "name": "stderr",
58 | "output_type": "stream",
59 | "text": [
60 | "99967it [00:53, 1868.55it/s]\n",
61 | "99971it [01:00, 1646.85it/s]\n",
62 | "99968it [01:08, 1460.54it/s]\n",
63 | "99966it [00:58, 1719.68it/s]\n",
64 | "99962it [00:56, 1755.14it/s]\n",
65 | "99968it [00:19, 5100.54it/s] \n",
66 | "99959it [00:20, 4887.07it/s] \n",
67 | "99972it [00:15, 6252.46it/s] \n",
68 | "99960it [00:11, 8494.27it/s] \n",
69 | "99974it [00:08, 11529.00it/s]\n",
70 | "99968it [00:14, 6672.96it/s] \n",
71 | "99965it [00:09, 10661.73it/s]\n",
72 | "99965it [00:09, 10525.04it/s]\n",
73 | "99959it [00:15, 6443.56it/s] \n",
74 | "99972it [00:08, 11661.01it/s]\n",
75 | "99966it [00:14, 6786.12it/s] \n",
76 | "99969it [00:22, 4412.35it/s] \n",
77 | "99972it [00:14, 6883.69it/s] \n",
78 | "99963it [00:06, 15602.76it/s]\n",
79 | "99966it [00:16, 6097.17it/s] \n",
80 | "99967it [00:11, 8717.47it/s] \n",
81 | "99970it [00:07, 13489.12it/s]\n",
82 | "99969it [00:18, 5358.87it/s] \n",
83 | "99981it [00:09, 10109.43it/s]\n",
84 | "99968it [00:07, 13383.50it/s]\n",
85 | "99966it [00:14, 7052.85it/s] \n",
86 | "99968it [00:23, 4322.65it/s] \n",
87 | "99968it [00:37, 2634.95it/s]\n",
88 | "99972it [00:36, 2704.90it/s]\n",
89 | "99958it [00:40, 2471.35it/s]\n",
90 | "99967it [00:42, 2371.77it/s]\n",
91 | "99971it [00:44, 2221.78it/s]\n",
92 | "99962it [00:39, 2532.83it/s]\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "google_translate = glob('google-translate-*/*.requested')\n",
98 | "for f in google_translate:\n",
99 | " with open(f) as fopen:\n",
100 | " for l in tqdm(fopen):\n",
101 | " try:\n",
102 | " data = '' + json.loads(l)['r']['result'] + ''\n",
103 | " partitioned = partition(data)\n",
104 | " for p in partitioned:\n",
105 | " data = {\n",
106 | " 'text': p,\n",
107 | " }\n",
108 | " a.write(f'{json.dumps(data)}\\n')\n",
109 | " a.flush()\n",
110 | " except:\n",
111 | " pass"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "id": "ee6d3d66",
118 | "metadata": {},
119 | "outputs": [],
120 | "source": []
121 | }
122 | ],
123 | "metadata": {
124 | "kernelspec": {
125 | "display_name": "Python 3 (ipykernel)",
126 | "language": "python",
127 | "name": "python3"
128 | },
129 | "language_info": {
130 | "codemirror_mode": {
131 | "name": "ipython",
132 | "version": 3
133 | },
134 | "file_extension": ".py",
135 | "mimetype": "text/x-python",
136 | "name": "python",
137 | "nbconvert_exporter": "python",
138 | "pygments_lexer": "ipython3",
139 | "version": "3.10.12"
140 | }
141 | },
142 | "nbformat": 4,
143 | "nbformat_minor": 5
144 | }
145 |
--------------------------------------------------------------------------------
/text/extra/sample-fineweb-edu.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 13,
6 | "id": "e58f50a7-ac12-4bac-ab97-ce10a1de9154",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "from datasets import load_dataset\n",
11 | "from tqdm import tqdm\n",
12 | "import json"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 5,
18 | "id": "015519ff-efb4-4d80-adfd-7e826822af76",
19 | "metadata": {},
20 | "outputs": [
21 | {
22 | "data": {
23 | "application/vnd.jupyter.widget-view+json": {
24 | "model_id": "dfc39a6763534034a70ba3a3e960169d",
25 | "version_major": 2,
26 | "version_minor": 0
27 | },
28 | "text/plain": [
29 | "Resolving data files: 0%| | 0/104 [00:00, ?it/s]"
30 | ]
31 | },
32 | "metadata": {},
33 | "output_type": "display_data"
34 | },
35 | {
36 | "data": {
37 | "application/vnd.jupyter.widget-view+json": {
38 | "model_id": "0a44faca2a464e1d977bba4a428ae569",
39 | "version_major": 2,
40 | "version_minor": 0
41 | },
42 | "text/plain": [
43 | "Resolving data files: 0%| | 0/234 [00:00, ?it/s]"
44 | ]
45 | },
46 | "metadata": {},
47 | "output_type": "display_data"
48 | }
49 | ],
50 | "source": [
51 | "ds = load_dataset(\"HuggingFaceTB/smollm-corpus\", \"fineweb-edu-dedup\", streaming = True, split = 'train')"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 15,
57 | "id": "2c6d0b3d-37e1-4789-9689-74996c7688be",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# index = 0\n",
62 | "# with open('fineweb-edu-dedup-sample-10M.jsonl', 'w') as fopen_l:\n",
63 | "# for row in tqdm(ds):\n",
64 | "# t = row['text']\n",
65 | "# fopen_l.write(f'{json.dumps(t)}\\n')\n",
66 | "# index += 1\n",
67 | "# if index > 1e7:\n",
68 | "# break"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 16,
74 | "id": "89ad1f82-8041-47cf-a640-a20ee50073d9",
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "!mv fineweb-edu-dedup-sample-10M.jsonl fineweb-edu-dedup-sample-5M.jsonl"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 17,
84 | "id": "e6a3642b-5a8a-4018-ae8f-be5c9009fdbc",
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "-rw-r--r-- 1 sagemaker-user users 22G Aug 5 08:05 fineweb-edu-dedup-sample-5M.jsonl\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "!ls -lh fineweb-edu-dedup-sample-5M.jsonl"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "id": "c4fe5407-7877-4c10-9084-e62ed97ecc8d",
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "data": {
107 | "application/vnd.jupyter.widget-view+json": {
108 | "model_id": "4717bd184fde4ed094f07eca7572f2f7",
109 | "version_major": 2,
110 | "version_minor": 0
111 | },
112 | "text/plain": [
113 | "fineweb-edu-dedup-sample-5M.jsonl: 0%| | 0.00/23.5G [00:00, ?B/s]"
114 | ]
115 | },
116 | "metadata": {},
117 | "output_type": "display_data"
118 | }
119 | ],
120 | "source": [
121 | "from huggingface_hub import HfApi\n",
122 | "api = HfApi()\n",
123 | "api.upload_file(\n",
124 | " path_or_fileobj=\"fineweb-edu-dedup-sample-5M.jsonl\",\n",
125 | " path_in_repo=\"fineweb-edu-dedup-sample-5M.jsonl\",\n",
126 | " repo_id=\"malaysia-ai/pretrain-text-dataset\",\n",
127 | " repo_type=\"dataset\",\n",
128 | ")\n"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "id": "7c93660f-9e7b-4ce8-9c98-c13899b20c1b",
135 | "metadata": {},
136 | "outputs": [],
137 | "source": []
138 | }
139 | ],
140 | "metadata": {
141 | "kernelspec": {
142 | "display_name": "Python 3 (ipykernel)",
143 | "language": "python",
144 | "name": "python3"
145 | },
146 | "language_info": {
147 | "codemirror_mode": {
148 | "name": "ipython",
149 | "version": 3
150 | },
151 | "file_extension": ".py",
152 | "mimetype": "text/x-python",
153 | "name": "python",
154 | "nbconvert_exporter": "python",
155 | "pygments_lexer": "ipython3",
156 | "version": "3.10.14"
157 | }
158 | },
159 | "nbformat": 4,
160 | "nbformat_minor": 5
161 | }
162 |
--------------------------------------------------------------------------------
/stt-whisper/audioset_sliding.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
2 | from glob import glob
3 | from tqdm import tqdm
4 | from datasets import Audio
5 | from torch.utils.data import Dataset
6 | from torch.utils.data import DataLoader
7 | import torch
8 | import torchaudio
9 | import numpy as np
10 | import click
11 | import os
12 | import json
13 |
def new_path(f):
    """Return the output path that stores the AudioSet predictions for `f`.

    Every `.mp3` substring is replaced by `.audioset`, and the first
    `/`-separated path component gets an `_audioset` suffix, e.g.
    `data/a/1.mp3` -> `data_audioset/a/1.audioset`.
    """
    root, *rest = f.replace('.mp3', '.audioset').split('/')
    return '/'.join([f'{root}_audioset', *rest])
20 |
@click.command()
@click.option("--file", help="file")
@click.option("--global-index", default=1, help="global index")
@click.option("--local-index", default=0, help="local index")
@click.option("--sliding", default=0.5)
@click.option("--model", default='MIT/ast-finetuned-audioset-10-10-0.4593')
def function(file, global_index, local_index, sliding, model):
    """Classify consecutive windows of each audio file and save the top-5 labels.

    Reads `file` as JSON lines, keeps the rows whose `audio_filename` exists
    on disk and whose output file (see `new_path`) is missing or at most
    2 bytes long, selects this worker's shard of the remaining rows, and
    writes one JSON file per audio with the keys `timestamps`, `labels`
    and `probs`.

    Args:
        file: path to a JSONL file; each line is a JSON object that is read
            for its `audio_filename` key.
        global_index: number of shards the pending files are divided into.
        local_index: zero-based shard handled by this invocation.
        sliding: window length in seconds; windows are taken back to back
            without overlap.
        model: model id/path passed to `from_pretrained` for both the
            feature extractor and the classifier.
    """

    files = []
    with open(file) as fopen:
        for no, l in enumerate(fopen):
            l = json.loads(l)
            # 'index' is attached to each row but not read again in this function.
            l['index'] = os.path.join(file.replace('.jsonl', ''), f'{no}.mp3')
            files.append(l)

    # Guard the preview so an empty input file does not raise IndexError.
    print(len(files), files[0] if files else None)
    filtered_files = []
    for f in files:
        if not os.path.exists(f['audio_filename']):
            continue
        new_f = new_path(f['audio_filename'])
        # Skip files that already have a non-trivial result, so reruns resume.
        if os.path.exists(new_f) and os.path.getsize(new_f) > 2:
            continue
        filtered_files.append(f)

    print(len(files), len(filtered_files))
    global_size = len(filtered_files) // global_index
    shard_start = global_size * local_index
    if local_index == global_index - 1:
        # The last shard also takes the remainder left by the floor division,
        # otherwise up to `global_index - 1` files would belong to no shard.
        shard_stop = len(filtered_files)
    else:
        shard_stop = shard_start + global_size
    files = filtered_files[shard_start:shard_stop]
    print(len(files))

    feature_extractor = AutoFeatureExtractor.from_pretrained(model, return_attention_mask = True)
    model = AutoModelForAudioClassification.from_pretrained(model, torch_dtype = torch.float16).eval().cuda()
    id2label = model.config.id2label
    sr = feature_extractor.sampling_rate
    sliding = int(sliding * sr)  # window length converted from seconds to samples
    audio = Audio(sampling_rate = sr)

    class CustomDataset(Dataset):
        """Decode one audio file and cut it into windows of `sliding` samples."""

        def __init__(self, files):
            self.files = files

        def __len__(self):
            return len(self.files)

        def __getitem__(self, index):
            f = self.files[index]
            f = f['audio_filename']
            y = audio.decode_example(audio.encode_example(f))['array']
            timestamps = []
            slided = []
            for i in range(0, len(y), sliding):
                y_ = y[i: i + sliding]
                # Windows shorter than 1000 samples are dropped.
                if len(y_) < 1000:
                    continue
                slided.append(y_)
                start = i / sr
                end = min(len(y) / sr, (i + sliding) / sr)
                timestamps.append((start, end))

            inputs = feature_extractor(slided, sampling_rate=sr,
                                       return_tensors="pt", return_attention_mask = True)
            return inputs, f, timestamps

    dataset = CustomDataset(files)
    dataloader = DataLoader(dataset, batch_size = 1, shuffle = False, prefetch_factor=10, num_workers=5)
    with torch.no_grad():
        for row in tqdm(iter(dataloader)):
            inputs, f, timestamps_ = row
            # batch_size is 1, so index 0 recovers the single file of the batch.
            f = f[0]
            timestamps = []
            for t in timestamps_:
                timestamps.append((float(t[0]), float(t[1])))

            # Drop the DataLoader batch dimension; the windows become the model batch.
            inputs['input_values'] = inputs['input_values'][0].to(torch.float16).cuda()
            logits = model(**inputs).logits.softmax(-1)
            topk = torch.topk(logits, 5, dim = -1)
            probs = topk.values.cpu().numpy().tolist()

            for i in range(len(probs)):
                for k in range(len(probs[i])):
                    probs[i][k] = round(probs[i][k], 4)

            labels = []
            for indices in topk.indices.cpu().numpy():
                label = [id2label[int(r)] for r in indices]
                labels.append(label)

            splitted = new_path(f)
            os.makedirs(os.path.split(splitted)[0], exist_ok = True)
            with open(splitted, 'w') as fopen:
                json.dump({'timestamps': timestamps, 'labels': labels, 'probs': probs}, fopen)


if __name__ == '__main__':
    function()
--------------------------------------------------------------------------------
/malaysian-short-instructions/dedup-questions-intents.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "8009e792",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "data": {
11 | "text/plain": [
12 | "76765"
13 | ]
14 | },
15 | "execution_count": 1,
16 | "metadata": {},
17 | "output_type": "execute_result"
18 | }
19 | ],
20 | "source": [
21 | "from glob import glob\n",
22 | "import json\n",
23 | "import re\n",
24 | "\n",
25 | "pattern = r\"\\d+\\.\\s(.+)\"\n",
26 | "already = set()\n",
27 | "\n",
28 | "files = glob('generate-questions-intents/*')\n",
29 | "\n",
30 | "questions = []\n",
31 | "for f in files:\n",
32 | " with open(f) as fopen:\n",
33 | " d = json.load(fopen)\n",
34 | " keyword = d['q'][0]\n",
35 | " for q in re.findall(pattern, d['r']):\n",
36 | " if q in already:\n",
37 | " continue\n",
38 | " questions.append((q, keyword))\n",
39 | " \n",
40 | "len(questions)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "id": "cc4f4bab",
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "import string\n",
51 | "\n",
52 | "digits = set(string.digits)\n",
53 | "rejected = ['\\'', '\"', 'http', '\\n', '[', ']', '/', '`']\n",
54 | "\n",
55 | "def contains_non_ascii(text):\n",
56 | " return any(ord(char) > 127 for char in text)\n",
57 | "\n",
58 | "def reject_q(q):\n",
59 | " if q is None:\n",
60 | " return True\n",
61 | " if any([c in q for c in rejected]):\n",
62 | " return True\n",
63 | " if contains_non_ascii(q):\n",
64 | " return True\n",
65 | " if len(set(q) & digits):\n",
66 | " return True\n",
67 | " if len(q) < 20:\n",
68 | " return True\n",
69 | " if len(q) > 200:\n",
70 | " return True\n",
71 | " return False"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 3,
77 | "id": "fddc3adf",
78 | "metadata": {
79 | "scrolled": true
80 | },
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/plain": [
85 | "192"
86 | ]
87 | },
88 | "execution_count": 3,
89 | "metadata": {},
90 | "output_type": "execute_result"
91 | }
92 | ],
93 | "source": [
94 | "from collections import defaultdict\n",
95 | "\n",
96 | "filtered_q = defaultdict(list)\n",
97 | "for q, k in questions:\n",
98 | " if len(q) < 10:\n",
99 | " continue\n",
100 | " if reject_q(q):\n",
101 | " continue\n",
102 | " \n",
103 | " filtered_q[k].append(q)\n",
104 | "len(filtered_q)"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 4,
110 | "id": "64cf6d6b",
111 | "metadata": {
112 | "scrolled": false
113 | },
114 | "outputs": [
115 | {
116 | "data": {
117 | "text/plain": [
118 | "75010"
119 | ]
120 | },
121 | "execution_count": 4,
122 | "metadata": {},
123 | "output_type": "execute_result"
124 | }
125 | ],
126 | "source": [
127 | "questions = []\n",
128 | "for k, v in filtered_q.items():\n",
129 | " if len(v) < 100:\n",
130 | " continue\n",
131 | " v = sorted(v, key = lambda x: len(x), reverse = True)\n",
132 | " v = [(v_, k) for v_ in v][:1000]\n",
133 | " questions.extend(v)\n",
134 | " \n",
135 | "len(questions)"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 9,
141 | "id": "eee58538",
142 | "metadata": {},
143 | "outputs": [
144 | {
145 | "data": {
146 | "text/plain": [
147 | "('Apa menu makanan terkenal di Gopeng?', 'food negeri pulau pinang')"
148 | ]
149 | },
150 | "execution_count": 9,
151 | "metadata": {},
152 | "output_type": "execute_result"
153 | }
154 | ],
155 | "source": [
156 | "questions[-4]"
157 | ]
158 | }
159 | ],
160 | "metadata": {
161 | "kernelspec": {
162 | "display_name": "Python 3 (ipykernel)",
163 | "language": "python",
164 | "name": "python3"
165 | },
166 | "language_info": {
167 | "codemirror_mode": {
168 | "name": "ipython",
169 | "version": 3
170 | },
171 | "file_extension": ".py",
172 | "mimetype": "text/x-python",
173 | "name": "python",
174 | "nbconvert_exporter": "python",
175 | "pygments_lexer": "ipython3",
176 | "version": "3.8.10"
177 | }
178 | },
179 | "nbformat": 4,
180 | "nbformat_minor": 5
181 | }
182 |
--------------------------------------------------------------------------------
/text/processing/main.py:
--------------------------------------------------------------------------------
1 | import re
2 | import mp
3 | import time
4 | import json
5 | import random
6 | import functools
7 | from tqdm import tqdm
8 | from pathlib import Path
9 | from unidecode import unidecode
10 | from argparse import ArgumentParser
11 | import function as func
12 |
13 |
def parse_arguments():
    """Parse the command-line options of the processing pipeline.

    Returns:
        The `argparse.Namespace` with `dataset`, `url_dataset`,
        `clean_file_path`, `master_dataset_folder` (required, from
        `--master_folder`), `mp_core` (int, default 6),
        `dataset_with_link` (list of strings or None) and `text_key`
        (list of strings or None).
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset", dest="dataset", help="Dataset name", required=False
    )
    parser.add_argument(
        "--url_dataset", dest="url_dataset", help="Dataset URL (jsonl)", required=False
    )
    parser.add_argument(
        "--clean_file_path",
        dest="clean_file_path",
        help="Load the .jsonl file that has been cleaned instead of from huggingface",
        required=False,
    )
    parser.add_argument(
        "--master_folder",
        dest="master_dataset_folder",
        help="Master folder to store dataset and processed output",
        required=True,
    )
    parser.add_argument(
        "--mp_core",
        dest="mp_core",
        # Without type=int a value given on the command line arrives as a
        # string, while the default stays an int.
        type=int,
        default=6,
        help="Postprocessing Core",
        required=False,
    )
    parser.add_argument(
        "--dataset_with_link",
        dest="dataset_with_link",
        nargs="+",
        help="Dataset name",
        required=False,
    )
    parser.add_argument(
        "--text_key",
        dest="text_key",
        nargs="+",
        help="Dict key contain text data",
        required=False,
    )

    args = parser.parse_args()
    return args
58 |
59 |
def loop_process(datasets, process_type="multi"):
    """Run the processing stages for every dataset and print a size report.

    Each dataset goes through `func.init_process` and `func.second_process`.
    If at least one dataset succeeded, `func.third_process` runs once, and
    the sizes reported by `func.get_size` are printed per dataset.
    Datasets that raised are skipped and listed at the end.

    Reads the module-level names `master_dataset_folder`, `mp_core` and
    `text_key`, which are assigned in the `__main__` section.

    Args:
        datasets: for `process_type="multi"`, a pair whose first item is the
            list of `(dataset_name, url_or_path)` tuples (the second item is
            ignored; presumably a worker index supplied by `mp` — TODO
            confirm); otherwise the list of tuples itself.
        process_type: `"multi"` or any other value for the plain-list form.
    """
    if process_type == "multi":
        lst_dataset, _ = datasets
    else:
        lst_dataset = datasets

    dataset_name_lst = []
    remove_dataset_name_lst = []

    for dataset in lst_dataset:
        # Bind the name before anything can fail, so the error handler never
        # reports an unbound name or a stale one from the previous iteration.
        dataset_name = dataset[0]
        try:
            url_dataset = dataset[1]

            print(f"\nProcessing ... {dataset_name}\n")

            try:
                # First attempt: treat the second field as a cleaned local file.
                func.init_process(
                    raw_dataset_path=master_dataset_folder,
                    dataset_name=dataset_name,
                    clean_file_path=url_dataset,
                    text_key=text_key,
                )
            except Exception:
                # Fallback: treat it as a download link. `Exception` rather
                # than a bare except, so Ctrl-C / SystemExit still propagate.
                func.init_process(
                    raw_dataset_path=master_dataset_folder,
                    dataset_name=dataset_name,
                    link=url_dataset,
                    text_key=text_key,
                )

            func.second_process(master_dataset_folder, dataset_name)
        except Exception as e:
            print(f"[ERROR] {str(e)} \n Skip {dataset_name} ...")
            remove_dataset_name_lst.append(dataset_name)
        else:
            # Record only datasets that completed both stages.
            dataset_name_lst.append(dataset_name)

    if len(dataset_name_lst) != 0:
        func.third_process(master_dataset_folder, mp_core)

        for name in dataset_name_lst:
            before_dedup_mb, after_dedup_mb, after_post_mb = func.get_size(
                master_dataset_folder, name
            )

            print("\n\n====================")
            print(f"File Size - {name}")
            print(f"before_dedup ---> {before_dedup_mb}")
            print(f"after_dedup ---> {after_dedup_mb}")
            print(f"after_post ---> {after_post_mb}")
            print("====================\n\n")

    if len(remove_dataset_name_lst) > 0:
        # str() keeps the report working when a dataset name is None.
        print(f"Problem datasets:\n{','.join(map(str, remove_dataset_name_lst))}")
117 |
118 |
if __name__ == "__main__":
    start_time = time.time()

    # master_dataset_folder, mp_core and text_key are assigned at module level
    # here and read by loop_process; a `global` statement is a no-op at this
    # scope, so none is needed.
    args = parse_arguments()

    clean_file_path = args.clean_file_path
    multiple_dataset = args.dataset_with_link
    text_key = args.text_key

    if clean_file_path:
        print("[Run for manually cleaned dataset]")
        dataset_name = args.dataset
        datasets = [(dataset_name, clean_file_path)]
    elif multiple_dataset:
        print("[Run for MULTIPLE datasets]")
        # Each item is expected in the form "name,url".
        datasets = [tuple(l.split(",")) for l in multiple_dataset]
    else:
        print("[Run for SINGLE dataset]")
        dataset_name = args.dataset
        url_dataset = args.url_dataset

        datasets = [(dataset_name, url_dataset)]

    master_dataset_folder = args.master_dataset_folder
    # A value given on the command line may arrive as a string; coerce it so
    # the integer division below cannot raise TypeError.
    mp_core = int(args.mp_core)

    # Fewer datasets than cores: run them sequentially in this process.
    if len(datasets) // mp_core == 0:
        loop_process(datasets, process_type="single")
    else:
        mp.multiprocessing(datasets, loop_process, cores=mp_core, returned=False)

    print(f"--- {time.time() - start_time} seconds ---")
155 |
--------------------------------------------------------------------------------
/emotional-malaysian-emilia/audioset_sliding_v2.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
2 | from collections import defaultdict
3 | from tqdm import tqdm
4 | from glob import glob
5 | from datasets import Audio
6 | from torch.utils.data import Dataset
7 | from torch.utils.data import DataLoader
8 | import torch
9 | import torchaudio
10 | import numpy as np
11 | import click
12 | import os
13 | import json
14 | import numpy as np
15 |
def new_path(f):
    """Return the output path that stores the v2 AudioSet predictions for `f`.

    Every `.mp3` substring is replaced by `.audioset_v2`, and the first
    `/`-separated path component gets an `_audioset_v2` suffix, e.g.
    `data/a/1.mp3` -> `data_audioset_v2/a/1.audioset_v2`.
    """
    root, *rest = f.replace('.mp3', '.audioset_v2').split('/')
    return '/'.join([f'{root}_audioset_v2', *rest])
22 |
23 |
@click.command()
@click.option("--path", help="files path in glob pattern")
@click.option("--global-index", default=1, help="global index")
@click.option("--local-index", default=0, help="local index")
@click.option("--stride", default=0.1)
@click.option("--sliding", default=1.0)
@click.option("--model", default='MIT/ast-finetuned-audioset-10-10-0.4593')
def function(path, global_index, local_index, stride, sliding, model):
    """Classify overlapping windows of each audio file and average per stride step.

    Files matching `path` whose output (see `new_path`) is missing or at
    most 2 bytes long are split into shards. For every file of this worker's
    shard, windows of `sliding` seconds are taken every `stride` seconds,
    the classifier logits of all windows covering a stride step are
    averaged, and a JSON list of `{'timestamp', 'topk', 'scores'}` entries
    (top-5 labels per step) is written.

    Args:
        path: glob pattern selecting the input audio files.
        global_index: number of shards the pending files are divided into.
        local_index: zero-based shard handled by this invocation.
        stride: hop between window starts, in seconds.
        sliding: window length, in seconds.
        model: model id/path passed to `from_pretrained` for both the
            feature extractor and the classifier.
    """

    feature_extractor = AutoFeatureExtractor.from_pretrained(model, return_attention_mask = True)
    model = AutoModelForAudioClassification.from_pretrained(model, torch_dtype = torch.float16).eval().cuda()
    id2label = model.config.id2label
    sr = feature_extractor.sampling_rate
    actual_stride = stride  # keep the value in seconds for the time bins
    stride = int(stride * sr)  # hop in samples
    sliding = int(sliding * sr)  # window length in samples
    audio = Audio(sampling_rate = sr)

    files = glob(path)
    filtered_files = []
    for f in files:
        new_f = new_path(f)
        # Skip files that already have a non-trivial result, so reruns resume.
        if os.path.exists(new_f) and os.path.getsize(new_f) > 2:
            continue
        filtered_files.append(f)

    global_size = len(filtered_files) // global_index
    shard_start = global_size * local_index
    if local_index == global_index - 1:
        # The last shard also takes the remainder left by the floor division,
        # otherwise up to `global_index - 1` files would belong to no shard.
        shard_stop = len(filtered_files)
    else:
        shard_stop = shard_start + global_size
    files = filtered_files[shard_start:shard_stop]

    class CustomDataset(Dataset):
        """Decode one audio file and cut it into overlapping windows."""

        def __init__(self, files):
            self.files = files

        def __len__(self):
            return len(self.files)

        def __getitem__(self, index):
            f = self.files[index]
            y = audio.decode_example(audio.encode_example(f))['array']
            timestamps = []
            slided = []
            last_end = 0
            for i in range(0, len(y) - sliding + 1, stride):
                end = i + sliding
                slided.append(y[i: end])
                timestamps.append((i / sr, end / sr))
                last_end = end

            # Samples after the last full window are kept as one extra window
            # when at least one stride long.
            if last_end < len(y):
                y_ = y[last_end:]
                if len(y_) >= stride:
                    slided.append(y_)
                    timestamps.append((last_end / sr, len(y) / sr))

            inputs = feature_extractor(slided, sampling_rate=sr,
                                       return_tensors="pt", return_attention_mask = True)
            return inputs, f, timestamps

    dataset = CustomDataset(files)
    dataloader = DataLoader(dataset, batch_size = 1, shuffle = False, prefetch_factor=10, num_workers=5)

    with torch.no_grad():
        for row in tqdm(iter(dataloader)):
            inputs, f, timestamps_ = row
            # batch_size is 1, so index 0 recovers the single file of the batch.
            f = f[0]
            timestamps = []
            for t in timestamps_:
                timestamps.append((float(t[0]), float(t[1])))

            # Drop the DataLoader batch dimension; the windows become the model batch.
            inputs['input_values'] = inputs['input_values'][0].to(torch.float16).cuda()
            logits = model(inputs['input_values']).logits.cpu().numpy()

            logits_accumulator = defaultdict(lambda: np.zeros(logits.shape[1]))
            count_accumulator = defaultdict(int)
            first_time = {}

            for no, (start, end) in enumerate(timestamps):
                for t in np.arange(start, end, actual_stride):
                    # Bin by integer stride index. Raw float keys for the same
                    # nominal time differ in their last bits between windows
                    # (e.g. 0.3 vs 0.30000000000000004), which would split one
                    # step into several bins instead of averaging them.
                    key = int(round(t / actual_stride))
                    first_time.setdefault(key, float(t))
                    logits_accumulator[key] += logits[no]
                    count_accumulator[key] += 1

            combined = []
            for key, total in logits_accumulator.items():
                averaged = [round(v_, 5) for v_ in total / count_accumulator[key]]
                topk = np.array(averaged).argsort()[-5:][::-1]
                scores = [float(averaged[i]) for i in topk]
                topk = [id2label[int(i)] for i in topk]
                combined.append({'timestamp': first_time[key], 'topk': topk, 'scores': scores})

            splitted = new_path(f)
            os.makedirs(os.path.split(splitted)[0], exist_ok = True)
            with open(splitted, 'w') as fopen:
                json.dump(combined, fopen)

if __name__ == '__main__':
    function()
--------------------------------------------------------------------------------
/text/llama/prepare-dataset-1024.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "4c973cad",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "from transformers import (\n",
11 | " AutoModelForCausalLM,\n",
12 | " AutoTokenizer,\n",
13 | " get_scheduler,\n",
14 | " default_data_collator,\n",
15 | " SchedulerType\n",
16 | ")\n",
17 | "import os\n",
18 | "import json\n",
19 | "from itertools import chain\n",
20 | "from datasets import load_dataset"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "id": "7e56d3d6",
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "block_size = 1024\n",
31 | "train_file = 'combine.jsonl'\n",
32 | "tokenizer = AutoTokenizer.from_pretrained(\n",
33 | " 'meta-llama/Llama-2-7b-hf',\n",
34 | ")\n",
35 | "text_column_name = 'text'"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "id": "98f1cb1b",
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "application/vnd.jupyter.widget-view+json": {
47 | "model_id": "47f4799106b9459da07783bfe46cfd03",
48 | "version_major": 2,
49 | "version_minor": 0
50 | },
51 | "text/plain": [
52 | "Downloading data files: 0%| | 0/1 [00:00, ?it/s]"
53 | ]
54 | },
55 | "metadata": {},
56 | "output_type": "display_data"
57 | },
58 | {
59 | "data": {
60 | "application/vnd.jupyter.widget-view+json": {
61 | "model_id": "3b741378f28f4a7889a6390a0cc6fc52",
62 | "version_major": 2,
63 | "version_minor": 0
64 | },
65 | "text/plain": [
66 | "Extracting data files: 0%| | 0/1 [00:00, ?it/s]"
67 | ]
68 | },
69 | "metadata": {},
70 | "output_type": "display_data"
71 | },
72 | {
73 | "data": {
74 | "application/vnd.jupyter.widget-view+json": {
75 | "model_id": "29ffb88dc5f5483c9a471b8f70947fa3",
76 | "version_major": 2,
77 | "version_minor": 0
78 | },
79 | "text/plain": [
80 | "Generating train split: 0 examples [00:00, ? examples/s]"
81 | ]
82 | },
83 | "metadata": {},
84 | "output_type": "display_data"
85 | },
86 | {
87 | "data": {
88 | "application/vnd.jupyter.widget-view+json": {
89 | "model_id": "130d512b4389418abafbf435c8446914",
90 | "version_major": 2,
91 | "version_minor": 0
92 | },
93 | "text/plain": [
94 | "Map (num_proc=20): 0%| | 0/33339118 [00:00, ? examples/s]"
95 | ]
96 | },
97 | "metadata": {},
98 | "output_type": "display_data"
99 | }
100 | ],
101 | "source": [
102 | "raw_datasets = load_dataset(\n",
103 | " 'json',\n",
104 | " data_files=train_file,\n",
105 | " split='train'\n",
106 | ")\n",
107 | "\n",
108 | "filename = os.path.split(train_file)[1]\n",
109 | "\n",
110 | "def tokenize_function(examples):\n",
111 | " return tokenizer(examples[text_column_name])\n",
112 | "\n",
113 | "column_names = raw_datasets.column_names\n",
114 | "tokenized_datasets = raw_datasets.map(\n",
115 | " tokenize_function,\n",
116 | " batched=True,\n",
117 | " remove_columns=column_names,\n",
118 | " load_from_cache_file=True,\n",
119 | " cache_file_name=f'./{filename}-tokenized-{block_size}',\n",
120 | " num_proc=20,\n",
121 | ")\n",
122 | "\n",
123 | "def group_texts(examples):\n",
124 | " concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}\n",
125 | " total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
126 | " total_length = (total_length // block_size) * block_size\n",
127 | " result = {\n",
128 | " k: [t[i: i + block_size] for i in range(0, total_length, block_size)]\n",
129 | " for k, t in concatenated_examples.items()\n",
130 | " }\n",
131 | " result[\"labels\"] = result[\"input_ids\"].copy()\n",
132 | " return result\n",
133 | "\n",
134 | "lm_datasets = tokenized_datasets.map(\n",
135 | " group_texts,\n",
136 | " batched=True,\n",
137 | " load_from_cache_file=True,\n",
138 | " cache_file_name=f'./{filename}-grouped-{block_size}',\n",
139 | " num_proc=20,\n",
140 | ")"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "id": "3bdc7594",
147 | "metadata": {},
148 | "outputs": [],
149 | "source": []
150 | }
151 | ],
152 | "metadata": {
153 | "kernelspec": {
154 | "display_name": "Python 3 (ipykernel)",
155 | "language": "python",
156 | "name": "python3"
157 | },
158 | "language_info": {
159 | "codemirror_mode": {
160 | "name": "ipython",
161 | "version": 3
162 | },
163 | "file_extension": ".py",
164 | "mimetype": "text/x-python",
165 | "name": "python",
166 | "nbconvert_exporter": "python",
167 | "pygments_lexer": "ipython3",
168 | "version": "3.10.12"
169 | }
170 | },
171 | "nbformat": 4,
172 | "nbformat_minor": 5
173 | }
174 |
--------------------------------------------------------------------------------
/text/llama/prepare-dataset-2048.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "66881a84",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "from transformers import (\n",
11 | " AutoModelForCausalLM,\n",
12 | " AutoTokenizer,\n",
13 | " get_scheduler,\n",
14 | " default_data_collator,\n",
15 | " SchedulerType\n",
16 | ")\n",
17 | "import os\n",
18 | "import json\n",
19 | "from itertools import chain\n",
20 | "from datasets import load_dataset"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "id": "10e30b50",
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "block_size = 2048\n",
31 | "train_file = 'combine.jsonl'\n",
32 | "tokenizer = AutoTokenizer.from_pretrained(\n",
33 | " 'meta-llama/Llama-2-7b-hf',\n",
34 | ")\n",
35 | "text_column_name = 'text'"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "id": "fc8e725f",
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "application/vnd.jupyter.widget-view+json": {
47 | "model_id": "47f4799106b9459da07783bfe46cfd03",
48 | "version_major": 2,
49 | "version_minor": 0
50 | },
51 | "text/plain": [
52 | "Downloading data files: 0%| | 0/1 [00:00, ?it/s]"
53 | ]
54 | },
55 | "metadata": {},
56 | "output_type": "display_data"
57 | },
58 | {
59 | "data": {
60 | "application/vnd.jupyter.widget-view+json": {
61 | "model_id": "3b741378f28f4a7889a6390a0cc6fc52",
62 | "version_major": 2,
63 | "version_minor": 0
64 | },
65 | "text/plain": [
66 | "Extracting data files: 0%| | 0/1 [00:00, ?it/s]"
67 | ]
68 | },
69 | "metadata": {},
70 | "output_type": "display_data"
71 | },
72 | {
73 | "data": {
74 | "application/vnd.jupyter.widget-view+json": {
75 | "model_id": "29ffb88dc5f5483c9a471b8f70947fa3",
76 | "version_major": 2,
77 | "version_minor": 0
78 | },
79 | "text/plain": [
80 | "Generating train split: 0 examples [00:00, ? examples/s]"
81 | ]
82 | },
83 | "metadata": {},
84 | "output_type": "display_data"
85 | },
86 | {
87 | "data": {
88 | "application/vnd.jupyter.widget-view+json": {
89 | "model_id": "130d512b4389418abafbf435c8446914",
90 | "version_major": 2,
91 | "version_minor": 0
92 | },
93 | "text/plain": [
94 | "Map (num_proc=20): 0%| | 0/33339118 [00:00, ? examples/s]"
95 | ]
96 | },
97 | "metadata": {},
98 | "output_type": "display_data"
99 | }
100 | ],
101 | "source": [
102 | "raw_datasets = load_dataset(\n",
103 | " 'json',\n",
104 | " data_files=train_file,\n",
105 | " split='train'\n",
106 | ")\n",
107 | "\n",
108 | "filename = os.path.split(train_file)[1]\n",
109 | "\n",
110 | "def tokenize_function(examples):\n",
111 | " return tokenizer(examples[text_column_name])\n",
112 | "\n",
113 | "column_names = raw_datasets.column_names\n",
114 | "tokenized_datasets = raw_datasets.map(\n",
115 | " tokenize_function,\n",
116 | " batched=True,\n",
117 | " remove_columns=column_names,\n",
118 | " load_from_cache_file=True,\n",
119 | " cache_file_name=f'./{filename}-tokenized-{block_size}',\n",
120 | " num_proc=20,\n",
121 | ")\n",
122 | "\n",
123 | "def group_texts(examples):\n",
124 | " concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}\n",
125 | " total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
126 | " total_length = (total_length // block_size) * block_size\n",
127 | " result = {\n",
128 | " k: [t[i: i + block_size] for i in range(0, total_length, block_size)]\n",
129 | " for k, t in concatenated_examples.items()\n",
130 | " }\n",
131 | " result[\"labels\"] = result[\"input_ids\"].copy()\n",
132 | " return result\n",
133 | "\n",
134 | "lm_datasets = tokenized_datasets.map(\n",
135 | " group_texts,\n",
136 | " batched=True,\n",
137 | " load_from_cache_file=True,\n",
138 | " cache_file_name=f'./{filename}-grouped-{block_size}',\n",
139 | " num_proc=20,\n",
140 | ")"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "id": "4fc4065e",
147 | "metadata": {},
148 | "outputs": [],
149 | "source": []
150 | }
151 | ],
152 | "metadata": {
153 | "kernelspec": {
154 | "display_name": "Python 3 (ipykernel)",
155 | "language": "python",
156 | "name": "python3"
157 | },
158 | "language_info": {
159 | "codemirror_mode": {
160 | "name": "ipython",
161 | "version": 3
162 | },
163 | "file_extension": ".py",
164 | "mimetype": "text/x-python",
165 | "name": "python",
166 | "nbconvert_exporter": "python",
167 | "pygments_lexer": "ipython3",
168 | "version": "3.10.12"
169 | }
170 | },
171 | "nbformat": 4,
172 | "nbformat_minor": 5
173 | }
174 |
--------------------------------------------------------------------------------
/malaysian-short-instructions/dedup-questions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 23,
6 | "id": "8009e792",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "data": {
11 | "text/plain": [
12 | "213261"
13 | ]
14 | },
15 | "execution_count": 23,
16 | "metadata": {},
17 | "output_type": "execute_result"
18 | }
19 | ],
20 | "source": [
21 | "from glob import glob\n",
22 | "import json\n",
23 | "import re\n",
24 | "\n",
25 | "pattern = r\"\\d+\\.\\s(.+)\"\n",
26 | "already = set()\n",
27 | "\n",
28 | "files = glob('generate-questions/*')\n",
29 | "\n",
30 | "questions = []\n",
31 | "for f in files:\n",
32 | " with open(f) as fopen:\n",
33 | " d = json.load(fopen)\n",
34 | " keyword = d['q'][0]\n",
35 | " for q in re.findall(pattern, d['r']):\n",
36 | " if q in already:\n",
37 | " continue\n",
38 | " questions.append((q, keyword))\n",
39 | " \n",
40 | "len(questions)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 24,
46 | "id": "cc4f4bab",
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "import string\n",
51 | "\n",
52 | "digits = set(string.digits)\n",
53 | "rejected = ['\\'', '\"', 'http', '\\n', '[', ']', '/', '`']\n",
54 | "\n",
55 | "def contains_non_ascii(text):\n",
56 | " return any(ord(char) > 127 for char in text)\n",
57 | "\n",
58 | "def reject_q(q):\n",
59 | " if q is None:\n",
60 | " return True\n",
61 | " if any([c in q for c in rejected]):\n",
62 | " return True\n",
63 | " if contains_non_ascii(q):\n",
64 | " return True\n",
65 | " if len(set(q) & digits):\n",
66 | " return True\n",
67 | " if len(q) < 20:\n",
68 | " return True\n",
69 | " if len(q) > 200:\n",
70 | " return True\n",
71 | " return False"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 25,
77 | "id": "fddc3adf",
78 | "metadata": {
79 | "scrolled": true
80 | },
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/plain": [
85 | "58"
86 | ]
87 | },
88 | "execution_count": 25,
89 | "metadata": {},
90 | "output_type": "execute_result"
91 | }
92 | ],
93 | "source": [
94 | "from collections import defaultdict\n",
95 | "\n",
96 | "filtered_q = defaultdict(list)\n",
97 | "for q, k in questions:\n",
98 | " if len(q) < 10:\n",
99 | " continue\n",
100 | " if reject_q(q):\n",
101 | " continue\n",
102 | " \n",
103 | " filtered_q[k].append(q)\n",
104 | "len(filtered_q)"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 31,
110 | "id": "60a9f651",
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "data": {
115 | "text/plain": [
116 | "31"
117 | ]
118 | },
119 | "execution_count": 31,
120 | "metadata": {},
121 | "output_type": "execute_result"
122 | }
123 | ],
124 | "source": [
125 | "len(filtered_q['d3 js'])"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 54,
131 | "id": "64cf6d6b",
132 | "metadata": {
133 | "scrolled": false
134 | },
135 | "outputs": [
136 | {
137 | "data": {
138 | "text/plain": [
139 | "57000"
140 | ]
141 | },
142 | "execution_count": 54,
143 | "metadata": {},
144 | "output_type": "execute_result"
145 | }
146 | ],
147 | "source": [
148 | "questions = []\n",
149 | "for k, v in filtered_q.items():\n",
150 | " if len(v) < 100:\n",
151 | " continue\n",
152 | " v = sorted(v, key = lambda x: len(x), reverse = True)\n",
153 | " v = [(v_, k) for v_ in v][:1000]\n",
154 | " questions.extend(v)\n",
155 | " \n",
156 | "len(questions)"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 55,
162 | "id": "eee58538",
163 | "metadata": {},
164 | "outputs": [
165 | {
166 | "data": {
167 | "text/plain": [
168 | "('Bolehkah anda menerangkan langkah-langkah yang diperlukan untuk membuat sistem pengurusan penyediaan semula dalam Go yang berfungsi dengan cekap dan tahan terhadap kegagalan?',\n",
169 | " 'go distributed system')"
170 | ]
171 | },
172 | "execution_count": 55,
173 | "metadata": {},
174 | "output_type": "execute_result"
175 | }
176 | ],
177 | "source": [
178 | "questions[0]"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 56,
184 | "id": "af927235",
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "with open('dedup-questions.json', 'w') as fopen:\n",
189 | " json.dump(questions, fopen)"
190 | ]
191 | }
192 | ],
193 | "metadata": {
194 | "kernelspec": {
195 | "display_name": "Python 3 (ipykernel)",
196 | "language": "python",
197 | "name": "python3"
198 | },
199 | "language_info": {
200 | "codemirror_mode": {
201 | "name": "ipython",
202 | "version": 3
203 | },
204 | "file_extension": ".py",
205 | "mimetype": "text/x-python",
206 | "name": "python",
207 | "nbconvert_exporter": "python",
208 | "pygments_lexer": "ipython3",
209 | "version": "3.8.10"
210 | }
211 | },
212 | "nbformat": 4,
213 | "nbformat_minor": 5
214 | }
215 |
--------------------------------------------------------------------------------
/text/extra/process-snapshot.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 75,
6 | "id": "05913d38",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import json\n",
11 | "from glob import glob\n",
12 | "from tqdm import tqdm"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 89,
18 | "id": "55c6365c",
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "http_errors = [\n",
23 | " \"400 Bad Request\", \"401 Unauthorized\", \"402 Payment Required\", \"403 Forbidden\", \"404 Not Found\",\n",
24 | " \"405 Method Not Allowed\", \"406 Not Acceptable\", \"407 Proxy Authentication Required\", \"408 Request Timeout\",\n",
25 | " \"409 Conflict\", \"410 Gone\", \"411 Length Required\", \"412 Precondition Failed\", \"413 Payload Too Large\",\n",
26 | " \"414 URI Too Long\", \"415 Unsupported Media Type\", \"416 Range Not Satisfiable\", \"417 Expectation Failed\",\n",
27 | " \"418 I'm a teapot\", \"421 Misdirected Request\", \"422 Unprocessable Entity\", \"423 Locked\", \"424 Failed Dependency\",\n",
28 | " \"425 Too Early\", \"426 Upgrade Required\", \"428 Precondition Required\", \"429 Too Many Requests\",\n",
29 | " \"431 Request Header Fields Too Large\", \"451 Unavailable For Legal Reasons\", \"500 Internal Server Error\",\n",
30 | " \"501 Not Implemented\", \"502 Bad Gateway\", \"503 Service Unavailable\", \"504 Gateway Timeout\",\n",
31 | " \"505 HTTP Version Not Supported\", \"506 Variant Also Negotiates\", \"507 Insufficient Storage\",\n",
32 | " \"508 Loop Detected\", \"510 Not Extended\", \"511 Network Authentication Required\"\n",
33 | " ]\n",
34 | "\n",
35 | "rejected = [\n",
36 | " 'Internal Server Error',\n",
37 | " '404',\n",
38 | " '__NOEDITSECTION__',\n",
39 | " 'enter your username and password',\n",
40 | " 'Login',\n",
41 | " 'forgotten your password',\n",
42 | " 'cookies enabled',\n",
43 | " 'sign in',\n",
44 | " 'tentang kami',\n",
45 | " 'skip to content',\n",
46 | " 'hubungi kami',\n",
47 | " 'laman utama',\n",
48 | " 'enable JavaScript in your browser.',\n",
49 | " 'The page cannot be displayed',\n",
50 | " 'site or edit the error_page',\n",
51 | " 'Hakcipta terpelihara',\n",
52 | " 'Copyright ©'\n",
53 | "]\n",
54 | "\n",
55 | "rejected.extend(http_errors)"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 80,
61 | "id": "54284aa7",
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "files = sorted(glob('crawl-my-website/snapshot/*.json'))"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 114,
71 | "id": "85361659",
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "!rm hf-datasets/dedupe-datasets/snapshot.jsonl"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 115,
81 | "id": "ae3d0a6d",
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "name": "stderr",
86 | "output_type": "stream",
87 | "text": [
88 | "100%|██████████| 348/348 [03:33<00:00, 1.63it/s]\n"
89 | ]
90 | }
91 | ],
92 | "source": [
93 | "processed = set()\n",
94 | "with open('hf-datasets/raw-datasets/snapshot.jsonl', 'w') as fopen_l:\n",
95 | " for f in tqdm(files):\n",
96 | " with open(f) as fopen:\n",
97 | " for l in fopen:\n",
98 | " l = json.loads(l)\n",
99 | " if l['url'] in processed:\n",
100 | " continue\n",
101 | "\n",
102 | " splitted = l['data'].split('\\n')\n",
103 | " splitted = [s for s in splitted if len(s) > 50]\n",
104 | " splitted = [s.strip() for s in splitted if all([r not in s for r in rejected])]\n",
105 | " if len(splitted):\n",
106 | " data = {\n",
107 | " 'url': l['url'],\n",
108 | " 'text': splitted\n",
109 | " }\n",
110 | " fopen_l.write(f'{json.dumps(data)}\\n')\n",
111 | " fopen_l.flush()\n",
112 | " \n",
113 | " processed.add(l['url'])"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 116,
119 | "id": "8aad7de6",
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/plain": [
125 | "428982"
126 | ]
127 | },
128 | "execution_count": 116,
129 | "metadata": {},
130 | "output_type": "execute_result"
131 | }
132 | ],
133 | "source": [
134 | "len(processed)"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "id": "2059852b",
141 | "metadata": {},
142 | "outputs": [],
143 | "source": []
144 | }
145 | ],
146 | "metadata": {
147 | "kernelspec": {
148 | "display_name": "Python 3 (ipykernel)",
149 | "language": "python",
150 | "name": "python3"
151 | },
152 | "language_info": {
153 | "codemirror_mode": {
154 | "name": "ipython",
155 | "version": 3
156 | },
157 | "file_extension": ".py",
158 | "mimetype": "text/x-python",
159 | "name": "python",
160 | "nbconvert_exporter": "python",
161 | "pygments_lexer": "ipython3",
162 | "version": "3.10.12"
163 | }
164 | },
165 | "nbformat": 4,
166 | "nbformat_minor": 5
167 | }
168 |
--------------------------------------------------------------------------------
/text/madlad-400-ms/postprocess-madlad-400-ms.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "id": "d9d83b0a",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import json\n",
11 | "import re\n",
12 | "from tqdm import tqdm\n",
13 | "\n",
14 | "http_errors = [\n",
15 | " \"400 Bad Request\", \"401 Unauthorized\", \"402 Payment Required\", \"403 Forbidden\", \"404 Not Found\",\n",
16 | " \"405 Method Not Allowed\", \"406 Not Acceptable\", \"407 Proxy Authentication Required\", \"408 Request Timeout\",\n",
17 | " \"409 Conflict\", \"410 Gone\", \"411 Length Required\", \"412 Precondition Failed\", \"413 Payload Too Large\",\n",
18 | " \"414 URI Too Long\", \"415 Unsupported Media Type\", \"416 Range Not Satisfiable\", \"417 Expectation Failed\",\n",
19 | " \"418 I'm a teapot\", \"421 Misdirected Request\", \"422 Unprocessable Entity\", \"423 Locked\", \"424 Failed Dependency\",\n",
20 | " \"425 Too Early\", \"426 Upgrade Required\", \"428 Precondition Required\", \"429 Too Many Requests\",\n",
21 | " \"431 Request Header Fields Too Large\", \"451 Unavailable For Legal Reasons\", \"500 Internal Server Error\",\n",
22 | " \"501 Not Implemented\", \"502 Bad Gateway\", \"503 Service Unavailable\", \"504 Gateway Timeout\",\n",
23 | " \"505 HTTP Version Not Supported\", \"506 Variant Also Negotiates\", \"507 Insufficient Storage\",\n",
24 | " \"508 Loop Detected\", \"510 Not Extended\", \"511 Network Authentication Required\"\n",
25 | " ]\n",
26 | "\n",
27 | "rejected = [\n",
28 | " 'Internal Server Error',\n",
29 | " '__NOEDITSECTION__',\n",
30 | " 'enter your username and password',\n",
31 | " 'forgotten your password',\n",
32 | " 'cookies enabled',\n",
33 | " 'enable JavaScript in your browser.',\n",
34 | " 'The page cannot be displayed',\n",
35 | " 'site or edit the error_page',\n",
36 | "]\n",
37 | "\n",
38 | "rejected.extend(http_errors)\n",
39 | "\n",
40 | "def replace_multiple(input_string, pattern =r\"\\s{6,}\", replace = ' '):\n",
41 | " return re.sub(pattern, replace, input_string)\n",
42 | "\n",
43 | "def replace(string):\n",
44 | " string = replace_multiple(string.replace('…', '.'))\n",
45 | " string = replace_multiple(string, pattern = r\"\\.{6,}\", replace = '...')\n",
46 | " return string\n",
47 | "\n",
48 | "def reject(string):\n",
49 | " if any([r in string for r in rejected]):\n",
50 | " return True\n",
51 | " return False"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 5,
57 | "id": "d714ffb4",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "f = 'madlad-400-ms.jsonl'\n",
62 | "new_f = 'madlad-400-ms.postprocessing.jsonl'"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 9,
68 | "id": "83f668c6",
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "name": "stderr",
73 | "output_type": "stream",
74 | "text": [
75 | "2232026it [08:47, 4507.46it/s]IOPub message rate exceeded.\n",
76 | "The notebook server will temporarily stop sending output\n",
77 | "to the client in order to avoid crashing it.\n",
78 | "To change this limit, set the config variable\n",
79 | "`--NotebookApp.iopub_msg_rate_limit`.\n",
80 | "\n",
81 | "Current values:\n",
82 | "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
83 | "NotebookApp.rate_limit_window=3.0 (secs)\n",
84 | "\n",
85 | "11158994it [43:43, 4565.13it/s]IOPub message rate exceeded.\n",
86 | "The notebook server will temporarily stop sending output\n",
87 | "to the client in order to avoid crashing it.\n",
88 | "To change this limit, set the config variable\n",
89 | "`--NotebookApp.iopub_msg_rate_limit`.\n",
90 | "\n",
91 | "Current values:\n",
92 | "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
93 | "NotebookApp.rate_limit_window=3.0 (secs)\n",
94 | "\n"
95 | ]
96 | }
97 | ],
98 | "source": [
99 | "with open(new_f, 'w') as fopen_l:\n",
100 | " with open(f) as fopen:\n",
101 | " for l in tqdm(fopen):\n",
102 | " data = json.loads(l)\n",
103 | " \n",
104 | " if isinstance(data, dict):\n",
105 | " t = data['text']\n",
106 | " else:\n",
107 | " t = data\n",
108 | "\n",
109 | " if reject(t):\n",
110 | " continue\n",
111 | "\n",
112 | " data = replace(t.strip())\n",
113 | "\n",
114 | " if len(data) < 3:\n",
115 | " continue\n",
116 | "\n",
117 | " fopen_l.write(f'{json.dumps(data)}\\n')\n",
118 | " fopen_l.flush()"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "id": "f13ad6af",
125 | "metadata": {},
126 | "outputs": [],
127 | "source": []
128 | }
129 | ],
130 | "metadata": {
131 | "kernelspec": {
132 | "display_name": "Python 3 (ipykernel)",
133 | "language": "python",
134 | "name": "python3"
135 | },
136 | "language_info": {
137 | "codemirror_mode": {
138 | "name": "ipython",
139 | "version": 3
140 | },
141 | "file_extension": ".py",
142 | "mimetype": "text/x-python",
143 | "name": "python",
144 | "nbconvert_exporter": "python",
145 | "pygments_lexer": "ipython3",
146 | "version": "3.10.12"
147 | }
148 | },
149 | "nbformat": 4,
150 | "nbformat_minor": 5
151 | }
152 |
--------------------------------------------------------------------------------
/text/pretrain-clm/from-pyarrow-to-mosaic.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 15,
6 | "id": "4b7592f7",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import mp\n",
11 | "import os\n",
12 | "import pyarrow as pa\n",
13 | "import numpy as np\n",
14 | "from streaming import MDSWriter\n",
15 | "from tqdm import tqdm"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 6,
21 | "id": "e0391f83",
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "from streaming.base.format.mds.encodings import Encoding, _encodings\n",
26 | "\n",
27 | "class Int32(Encoding):\n",
28 | " def encode(self, obj) -> bytes:\n",
29 | " return obj.tobytes()\n",
30 | "\n",
31 | " def decode(self, data: bytes):\n",
32 | " return np.frombuffer(data, np.int32)\n",
33 | "\n",
34 | "_encodings['int32'] = Int32"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 7,
40 | "id": "62ddb05a",
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "columns = {\n",
45 | " 'input_ids': 'int32',\n",
46 | " 'token_type_ids': 'int32',\n",
47 | " 'attention_mask': 'int32',\n",
48 | " 'labels': 'int32',\n",
49 | "}\n",
50 | "compression = 'zstd'\n",
51 | "hashes = 'sha1', 'xxh64'"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 8,
57 | "id": "e817fcc5",
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "data": {
62 | "text/plain": [
63 | "['combine-lm_00017_of_00020.jsonl-grouped-4096',\n",
64 | " 'combine-lm_00005_of_00020.jsonl-grouped-4096',\n",
65 | " 'combine-lm_00008_of_00020.jsonl-grouped-4096',\n",
66 | " 'combine-lm_00012_of_00020.jsonl-grouped-4096',\n",
67 | " 'combine-lm_00007_of_00020.jsonl-grouped-4096',\n",
68 | " 'combine-lm_00014_of_00020.jsonl-grouped-4096',\n",
69 | " 'combine-lm_00006_of_00020.jsonl-grouped-4096',\n",
70 | " 'combine-lm_00013_of_00020.jsonl-grouped-4096',\n",
71 | " 'combine-lm_00016_of_00020.jsonl-grouped-4096',\n",
72 | " 'combine-lm_00011_of_00020.jsonl-grouped-4096',\n",
73 | " 'combine-lm_00018_of_00020.jsonl-grouped-4096',\n",
74 | " 'combine-lm_00002_of_00020.jsonl-grouped-4096',\n",
75 | " 'combine-lm_00009_of_00020.jsonl-grouped-4096',\n",
76 | " 'combine-lm_00019_of_00020.jsonl-grouped-4096',\n",
77 | " 'combine-lm_00001_of_00020.jsonl-grouped-4096',\n",
78 | " 'combine-lm_00003_of_00020.jsonl-grouped-4096',\n",
79 | " 'combine-lm_00015_of_00020.jsonl-grouped-4096',\n",
80 | " 'combine-lm_00004_of_00020.jsonl-grouped-4096',\n",
81 | " 'combine-lm_00000_of_00020.jsonl-grouped-4096',\n",
82 | " 'combine-lm_00010_of_00020.jsonl-grouped-4096']"
83 | ]
84 | },
85 | "execution_count": 8,
86 | "metadata": {},
87 | "output_type": "execute_result"
88 | }
89 | ],
90 | "source": [
91 | "from glob import glob\n",
92 | "\n",
93 | "files = glob('combine-lm_*_of_00020.jsonl-grouped-4096')\n",
94 | "files"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 16,
100 | "id": "8a3e0890",
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "def loop(files):\n",
105 | " files, index = files\n",
106 | " out_root = f'tokenized-{index}'\n",
107 | " os.system(f'rm -rf {out_root}')\n",
108 | " with MDSWriter(out=out_root, columns=columns, compression=compression, hashes=hashes, \n",
109 | " size_limit = 67108864 * 2) as out:\n",
110 | " for f in files:\n",
111 | " memory_mapped_stream = pa.memory_map(f)\n",
112 | " opened_stream = pa.ipc.open_stream(memory_mapped_stream)\n",
113 | " for a in tqdm(opened_stream):\n",
114 | " s = a.to_struct_array()\n",
115 | " for i in range(len(s)):\n",
116 | " keys = list(s[i])\n",
117 | " a_ = {}\n",
118 | " for k in keys:\n",
119 | " a_[k] = np.array(s[i][k].as_py()).astype(np.int32)\n",
120 | " out.write(a_)"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "id": "876289e4",
127 | "metadata": {},
128 | "outputs": [
129 | {
130 | "name": "stderr",
131 | "output_type": "stream",
132 | "text": [
133 | "2570it [05:50, 7.30it/s]\n",
134 | "7464it [06:35, 18.87it/s]\n",
135 | "7464it [07:57, 15.62it/s]\n",
136 | "7464it [08:06, 15.36it/s]\n",
137 | "7464it [08:11, 15.20it/s]\n",
138 | "7464it [12:20, 10.08it/s]\n",
139 | "5816it [13:12, 7.64it/s]"
140 | ]
141 | }
142 | ],
143 | "source": [
144 | "mp.multiprocessing(files, loop, cores = 20, returned = False)"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "id": "c9aba12a",
151 | "metadata": {},
152 | "outputs": [],
153 | "source": []
154 | }
155 | ],
156 | "metadata": {
157 | "kernelspec": {
158 | "display_name": "Python 3 (ipykernel)",
159 | "language": "python",
160 | "name": "python3"
161 | },
162 | "language_info": {
163 | "codemirror_mode": {
164 | "name": "ipython",
165 | "version": 3
166 | },
167 | "file_extension": ".py",
168 | "mimetype": "text/x-python",
169 | "name": "python",
170 | "nbconvert_exporter": "python",
171 | "pygments_lexer": "ipython3",
172 | "version": "3.10.12"
173 | }
174 | },
175 | "nbformat": 4,
176 | "nbformat_minor": 5
177 | }
178 |
--------------------------------------------------------------------------------
/text/text_dedup/utils/hashfunc.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import struct
3 | from hashlib import md5
4 | from hashlib import sha256
5 |
6 | import xxhash
7 | from xxhash import xxh3_64
8 | from xxhash import xxh3_64_digest
9 | from xxhash import xxh3_128
10 | from xxhash import xxh3_128_digest
11 |
12 |
def md5_hexdigest(data: bytes) -> str:
    """
    Return the MD5 digest of ``data`` as a hexadecimal string.

    Parameters
    ----------
    data : bytes
        The raw bytes to hash.

    Returns
    -------
    str
        The 32-character lowercase hex digest.

    Examples
    --------
    >>> md5_hexdigest(b"hello world")
    '5eb63bbbe01eeed093cb22bb8f5acdc3'
    >>> len(md5_hexdigest(b"hello world"))
    32
    """
    hasher = md5(data)
    hex_digest = hasher.hexdigest()
    return hex_digest
35 |
36 |
def sha1_hash(data: bytes, d: int = 32) -> int:
    """
    Generate a d-bit hash value from the given data.

    The result is the first ``d // 8`` bytes of the SHA-1 digest of ``data``
    interpreted as a little-endian unsigned integer.  SHA-1 produces 160
    bits, so values of ``d`` above 160 cannot yield more than 160 bits.

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    d : int
        The number of bits of the hash value.

    Returns
    -------
    int
        The hash value.

    Examples
    --------
    >>> sha1_hash(b"hello world", 32)
    896314922
    >>> sha1_hash(b"hello world", 64)
    13028719972609469994
    >>> sha1_hash(b"hello world", 128)
    310522945683037930239412421226792791594
    """
    # SHA-1 is used here purely as a fingerprint, not for security.
    digest = hashlib.sha1(data, usedforsecurity=False).digest()
    if d == 32:
        return struct.unpack("<I", digest[:4])[0]
    if d == 64:
        return struct.unpack("<Q", digest[:8])[0]
    # struct is faster but does not support arbitrary bit lengths
    return int.from_bytes(digest[: d // 8], byteorder="little")


def sha256_hexdigest(data: bytes) -> str:
    """
    Generate a sha256 hex hash from the given data.

    Parameters
    ----------
    data : bytes
        The data to be hashed.

    Returns
    -------
    str
        The hex hash value.

    Examples
    --------
    >>> sha256_hexdigest(b"hello world")
    'b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9'
    >>> len(sha256_hexdigest(b"hello world"))
    64
    """
    return sha256(data).hexdigest()
92 |
93 |
def xxh3_16hash(data: bytes, seed: int = 0) -> int:
    """
    Return the low 16 bits of the seeded 64-bit XXH3 hash of ``data``.

    The 64-bit integer digest is computed with ``xxhash.xxh3_64_intdigest``
    and then bit-masked down to 16 bits.  Per the original author's note,
    masking an xxh3_64 hash is faster than xxh32 on modern systems.

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    seed : int
        Seed forwarded to the xxhash digest. Default is int=0

    Returns
    -------
    int
        The hash value, in the range ``0 .. 0xFFFF``.

    Examples
    --------
    >>> xxh3_16hash(b"hello world")
    39051
    >>> xxh3_16hash(b"hello world",seed=42)
    13198
    >>> xxh3_16hash(b"hello world",seed=-42)
    34281
    """
    low_16_mask = 0xFFFF
    digest_64 = xxhash.xxh3_64_intdigest(data, seed)
    return digest_64 & low_16_mask
123 |
124 |
def xxh3_32hash(data: bytes, seed: int = 0) -> int:
    """
    Return the low 32 bits of the seeded 64-bit XXH3 hash of ``data``.

    The 64-bit integer digest is computed with ``xxhash.xxh3_64_intdigest``
    and then bit-masked down to 32 bits.  Per the original author's note,
    masking an xxh3_64 hash is faster than xxh32 on modern systems.

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    seed : int
        Seed forwarded to the xxhash digest. Default is int=0

    Returns
    -------
    int
        The hash value, in the range ``0 .. 0xFFFFFFFF``.

    Examples
    --------
    >>> xxh3_32hash(b"hello world")
    1088854155
    >>> xxh3_32hash(b"hello world",seed=42)
    3913102222
    >>> xxh3_32hash(b"hello world",seed=-42)
    3721037289
    """
    low_32_mask = 0xFFFFFFFF
    digest_64 = xxhash.xxh3_64_intdigest(data, seed)
    return digest_64 & low_32_mask
154 |
155 |
def xxh3_hash(data: bytes, d: int = 32) -> int:
    """
    Return a d-bit XXH3-based hash of ``data``.

    A general-purpose entry point that accepts several ``d`` values and
    picks the xxh3 variant for each: 32 masks the 64-bit digest, 64 and 128
    use the native integer digests, and any other ``d`` takes the first
    ``d // 8`` bytes of the 128-bit digest read as a big-endian integer.

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    d : int
        The number of bits of the hash value.

    Returns
    -------
    int
        The hash value.

    Examples
    --------
    >>> xxh3_hash(b"hello world", 32)
    1088854155
    >>> xxh3_hash(b"hello world", 64)
    15296390279056496779
    >>> xxh3_hash(b"hello world", 128)
    297150157938599054391163723952090887879
    """
    if d == 64:
        return xxhash.xxh3_64_intdigest(data)
    if d == 128:
        return xxhash.xxh3_128_intdigest(data)
    if d == 32:
        # with sse2 or later, xxh3 is much faster
        # with avx, the difference is much larger
        digest_64 = xxhash.xxh3_64_intdigest(data)
        return digest_64 & 0xFFFFFFFF
    # fall back: truncate the 128-bit digest to whole bytes
    byte_count = d // 8
    prefix = xxhash.xxh3_128_digest(data)[:byte_count]
    return int.from_bytes(prefix, byteorder="big")
194 |
195 |
# Public API of this module: the re-exported hashlib/xxhash constructors plus
# every hash helper defined above (including the two hexdigest wrappers).
__all__ = [
    "md5",
    "md5_hexdigest",
    "sha256",
    "sha256_hexdigest",
    "sha1_hash",
    "xxh3_64",
    "xxh3_64_digest",
    "xxh3_128",
    "xxh3_128_digest",
    "xxh3_hash",
    "xxh3_16hash",
    "xxh3_32hash",
]
--------------------------------------------------------------------------------
/text/processing/text_dedup/utils/hashfunc.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import struct
3 | from hashlib import md5
4 | from hashlib import sha256
5 |
6 | import xxhash
7 | from xxhash import xxh3_64
8 | from xxhash import xxh3_64_digest
9 | from xxhash import xxh3_128
10 | from xxhash import xxh3_128_digest
11 |
12 |
def md5_hexdigest(data: bytes) -> str:
    """
    Hash ``data`` with MD5 and return the digest in hexadecimal form.

    Parameters
    ----------
    data : bytes
        The raw bytes to hash.

    Returns
    -------
    str
        The 32-character lowercase hex digest.

    Examples
    --------
    >>> md5_hexdigest(b"hello world")
    '5eb63bbbe01eeed093cb22bb8f5acdc3'
    >>> len(md5_hexdigest(b"hello world"))
    32
    """
    raw_digest = md5(data).digest()
    return raw_digest.hex()
35 |
36 |
def sha1_hash(data: bytes, d: int = 32) -> int:
    """
    Generate a d-bit hash value from the given data.

    The result is the first ``d // 8`` bytes of the SHA-1 digest of ``data``
    interpreted as a little-endian unsigned integer.  SHA-1 produces 160
    bits, so values of ``d`` above 160 cannot yield more than 160 bits.

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    d : int
        The number of bits of the hash value.

    Returns
    -------
    int
        The hash value.

    Examples
    --------
    >>> sha1_hash(b"hello world", 32)
    896314922
    >>> sha1_hash(b"hello world", 64)
    13028719972609469994
    >>> sha1_hash(b"hello world", 128)
    310522945683037930239412421226792791594
    """
    # SHA-1 is used here purely as a fingerprint, not for security.
    digest = hashlib.sha1(data, usedforsecurity=False).digest()
    if d == 32:
        return struct.unpack("<I", digest[:4])[0]
    if d == 64:
        return struct.unpack("<Q", digest[:8])[0]
    # struct is faster but does not support arbitrary bit lengths
    return int.from_bytes(digest[: d // 8], byteorder="little")


def sha256_hexdigest(data: bytes) -> str:
    """
    Generate a sha256 hex hash from the given data.

    Parameters
    ----------
    data : bytes
        The data to be hashed.

    Returns
    -------
    str
        The hex hash value.

    Examples
    --------
    >>> sha256_hexdigest(b"hello world")
    'b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9'
    >>> len(sha256_hexdigest(b"hello world"))
    64
    """
    return sha256(data).hexdigest()
92 |
93 |
def xxh3_16hash(data: bytes, seed: int = 0) -> int:
    """
    Return the low 16 bits of the seeded 64-bit XXH3 hash of ``data``.

    The 64-bit integer digest is computed with ``xxhash.xxh3_64_intdigest``
    and then bit-masked down to 16 bits.  Per the original author's note,
    masking an xxh3_64 hash is faster than xxh32 on modern systems.

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    seed : int
        Seed forwarded to the xxhash digest. Default is int=0

    Returns
    -------
    int
        The hash value, in the range ``0 .. 0xFFFF``.

    Examples
    --------
    >>> xxh3_16hash(b"hello world")
    39051
    >>> xxh3_16hash(b"hello world",seed=42)
    13198
    >>> xxh3_16hash(b"hello world",seed=-42)
    34281
    """
    wide_hash = xxhash.xxh3_64_intdigest(data, seed)
    truncated = wide_hash & 0xFFFF
    return truncated
123 |
124 |
def xxh3_32hash(data: bytes, seed: int = 0) -> int:
    """
    Return the low 32 bits of the seeded 64-bit XXH3 hash of ``data``.

    The 64-bit integer digest is computed with ``xxhash.xxh3_64_intdigest``
    and then bit-masked down to 32 bits.  Per the original author's note,
    masking an xxh3_64 hash is faster than xxh32 on modern systems.

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    seed : int
        Seed forwarded to the xxhash digest. Default is int=0

    Returns
    -------
    int
        The hash value, in the range ``0 .. 0xFFFFFFFF``.

    Examples
    --------
    >>> xxh3_32hash(b"hello world")
    1088854155
    >>> xxh3_32hash(b"hello world",seed=42)
    3913102222
    >>> xxh3_32hash(b"hello world",seed=-42)
    3721037289
    """
    wide_hash = xxhash.xxh3_64_intdigest(data, seed)
    truncated = wide_hash & 0xFFFFFFFF
    return truncated
154 |
155 |
def xxh3_hash(data: bytes, d: int = 32) -> int:
    """
    Return a d-bit XXH3-based hash of ``data``.

    A general-purpose entry point that accepts several ``d`` values and
    picks the xxh3 variant for each: 32 masks the 64-bit digest, 64 and 128
    use the native integer digests, and any other ``d`` takes the first
    ``d // 8`` bytes of the 128-bit digest read as a big-endian integer.

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    d : int
        The number of bits of the hash value.

    Returns
    -------
    int
        The hash value.

    Examples
    --------
    >>> xxh3_hash(b"hello world", 32)
    1088854155
    >>> xxh3_hash(b"hello world", 64)
    15296390279056496779
    >>> xxh3_hash(b"hello world", 128)
    297150157938599054391163723952090887879
    """
    if d == 32:
        # with sse2 or later, xxh3 is much faster
        # with avx, the difference is much larger
        return xxhash.xxh3_64_intdigest(data) & 0xFFFFFFFF
    elif d == 64:
        return xxhash.xxh3_64_intdigest(data)
    elif d == 128:
        return xxhash.xxh3_128_intdigest(data)
    # fall back: truncate the 128-bit digest to whole bytes
    raw_128 = xxhash.xxh3_128_digest(data)
    return int.from_bytes(raw_128[: d // 8], byteorder="big")
195 |
196 |
# Public API of this module: the re-exported hashlib/xxhash constructors plus
# every hash helper defined above (including the two hexdigest wrappers).
__all__ = [
    "md5",
    "md5_hexdigest",
    "sha256",
    "sha256_hexdigest",
    "sha1_hash",
    "xxh3_64",
    "xxh3_64_digest",
    "xxh3_128",
    "xxh3_128_digest",
    "xxh3_hash",
    "xxh3_16hash",
    "xxh3_32hash",
]
--------------------------------------------------------------------------------
/text/mistral/run-tokenizer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "id": "68984750",
7 | "metadata": {
8 | "scrolled": true
9 | },
10 | "outputs": [],
11 | "source": [
12 | "from transformers import (\n",
13 | " AutoModelForCausalLM,\n",
14 | " AutoTokenizer,\n",
15 | " get_scheduler,\n",
16 | " default_data_collator,\n",
17 | " SchedulerType\n",
18 | ")\n",
19 | "import os\n",
20 | "import json\n",
21 | "from itertools import chain\n",
22 | "from datasets import load_dataset"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 5,
28 | "id": "09d07423",
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stderr",
33 | "output_type": "stream",
34 | "text": [
35 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
36 | ]
37 | }
38 | ],
39 | "source": [
40 | "train_file = 'combine-mistral.jsonl'\n",
41 | "tokenizer = AutoTokenizer.from_pretrained(\n",
42 | " 'mistralai/Mistral-7B-v0.1',\n",
43 | ")\n",
44 | "tokenizer.add_bos_token = False\n",
45 | "tokenizer.add_eos_token = False\n",
46 | "text_column_name = 'text'"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 6,
52 | "id": "0c31ee11",
53 | "metadata": {},
54 | "outputs": [
55 | {
56 | "data": {
57 | "application/vnd.jupyter.widget-view+json": {
58 | "model_id": "86feb593de8d41089a848a49fdd7d95e",
59 | "version_major": 2,
60 | "version_minor": 0
61 | },
62 | "text/plain": [
63 | "Downloading data files: 0%| | 0/1 [00:00, ?it/s]"
64 | ]
65 | },
66 | "metadata": {},
67 | "output_type": "display_data"
68 | },
69 | {
70 | "data": {
71 | "application/vnd.jupyter.widget-view+json": {
72 | "model_id": "9e284c80f2ab499a84083f6b9c1cdc7a",
73 | "version_major": 2,
74 | "version_minor": 0
75 | },
76 | "text/plain": [
77 | "Extracting data files: 0%| | 0/1 [00:00, ?it/s]"
78 | ]
79 | },
80 | "metadata": {},
81 | "output_type": "display_data"
82 | },
83 | {
84 | "data": {
85 | "application/vnd.jupyter.widget-view+json": {
86 | "model_id": "678282f811e54b628c1f6ab3c074a4fc",
87 | "version_major": 2,
88 | "version_minor": 0
89 | },
90 | "text/plain": [
91 | "Generating train split: 0 examples [00:00, ? examples/s]"
92 | ]
93 | },
94 | "metadata": {},
95 | "output_type": "display_data"
96 | }
97 | ],
98 | "source": [
99 | "raw_datasets = load_dataset(\n",
100 | " 'json',\n",
101 | " data_files=train_file,\n",
102 | " split='train'\n",
103 | ")"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 8,
109 | "id": "8eedfd87",
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "def tokenize_function(examples):\n",
114 | " return tokenizer(examples[text_column_name])"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 9,
120 | "id": "3bfece34",
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "data": {
125 | "application/vnd.jupyter.widget-view+json": {
126 | "model_id": "b43867bb6503463eb239830fbb905776",
127 | "version_major": 2,
128 | "version_minor": 0
129 | },
130 | "text/plain": [
131 | "Map (num_proc=20): 0%| | 0/37117462 [00:00, ? examples/s]"
132 | ]
133 | },
134 | "metadata": {},
135 | "output_type": "display_data"
136 | },
137 | {
138 | "name": "stderr",
139 | "output_type": "stream",
140 | "text": [
141 | "IOPub message rate exceeded.\n",
142 | "The notebook server will temporarily stop sending output\n",
143 | "to the client in order to avoid crashing it.\n",
144 | "To change this limit, set the config variable\n",
145 | "`--NotebookApp.iopub_msg_rate_limit`.\n",
146 | "\n",
147 | "Current values:\n",
148 | "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
149 | "NotebookApp.rate_limit_window=3.0 (secs)\n",
150 | "\n"
151 | ]
152 | },
153 | {
154 | "data": {
155 | "text/plain": [
156 | "Dataset({\n",
157 | " features: ['input_ids', 'attention_mask'],\n",
158 | " num_rows: 37117462\n",
159 | "})"
160 | ]
161 | },
162 | "execution_count": 9,
163 | "metadata": {},
164 | "output_type": "execute_result"
165 | }
166 | ],
167 | "source": [
168 | "filename = os.path.split(train_file)[1]\n",
169 | "column_names = raw_datasets.column_names\n",
170 | "tokenized_datasets = raw_datasets.map(\n",
171 | " tokenize_function,\n",
172 | " batched=True,\n",
173 | " remove_columns=column_names,\n",
174 | " load_from_cache_file=True,\n",
175 | " cache_file_name=f'./{filename}-tokenized',\n",
176 | " num_proc=20,\n",
177 | ")\n",
178 | "\n",
179 | "tokenized_datasets"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "id": "575ed380",
186 | "metadata": {},
187 | "outputs": [],
188 | "source": []
189 | }
190 | ],
191 | "metadata": {
192 | "kernelspec": {
193 | "display_name": "Python 3 (ipykernel)",
194 | "language": "python",
195 | "name": "python3"
196 | },
197 | "language_info": {
198 | "codemirror_mode": {
199 | "name": "ipython",
200 | "version": 3
201 | },
202 | "file_extension": ".py",
203 | "mimetype": "text/x-python",
204 | "name": "python",
205 | "nbconvert_exporter": "python",
206 | "pygments_lexer": "ipython3",
207 | "version": "3.10.12"
208 | }
209 | },
210 | "nbformat": 4,
211 | "nbformat_minor": 5
212 | }
213 |
--------------------------------------------------------------------------------
/text/llama/prepare-tokenizer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "56fc07cf",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "from transformers import (\n",
11 | " AutoModelForCausalLM,\n",
12 | " AutoTokenizer,\n",
13 | " get_scheduler,\n",
14 | " default_data_collator,\n",
15 | " SchedulerType\n",
16 | ")\n",
17 | "import os\n",
18 | "import json\n",
19 | "from itertools import chain\n",
20 | "from datasets import load_dataset"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 3,
26 | "id": "e320f019",
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "data": {
31 | "application/vnd.jupyter.widget-view+json": {
32 | "model_id": "1ee36389eac44862bc018d33cf39da33",
33 | "version_major": 2,
34 | "version_minor": 0
35 | },
36 | "text/plain": [
37 | "Downloading (…)okenizer_config.json: 0%| | 0.00/725 [00:00, ?B/s]"
38 | ]
39 | },
40 | "metadata": {},
41 | "output_type": "display_data"
42 | },
43 | {
44 | "data": {
45 | "application/vnd.jupyter.widget-view+json": {
46 | "model_id": "ea7e12ba9791437db332f9f0ff247c64",
47 | "version_major": 2,
48 | "version_minor": 0
49 | },
50 | "text/plain": [
51 | "Downloading tokenizer.model: 0%| | 0.00/500k [00:00, ?B/s]"
52 | ]
53 | },
54 | "metadata": {},
55 | "output_type": "display_data"
56 | },
57 | {
58 | "data": {
59 | "application/vnd.jupyter.widget-view+json": {
60 | "model_id": "f03938703b01429095679e38aa50ae96",
61 | "version_major": 2,
62 | "version_minor": 0
63 | },
64 | "text/plain": [
65 | "Downloading (…)/main/tokenizer.json: 0%| | 0.00/1.84M [00:00, ?B/s]"
66 | ]
67 | },
68 | "metadata": {},
69 | "output_type": "display_data"
70 | },
71 | {
72 | "data": {
73 | "application/vnd.jupyter.widget-view+json": {
74 | "model_id": "28fcad13bd01402290df7bed108b30f2",
75 | "version_major": 2,
76 | "version_minor": 0
77 | },
78 | "text/plain": [
79 | "Downloading (…)cial_tokens_map.json: 0%| | 0.00/414 [00:00, ?B/s]"
80 | ]
81 | },
82 | "metadata": {},
83 | "output_type": "display_data"
84 | }
85 | ],
86 | "source": [
87 | "block_size = 1024\n",
88 | "train_file = 'combine.jsonl'\n",
89 | "tokenizer = AutoTokenizer.from_pretrained(\n",
90 | " 'mesolitica/llama-7b-hf-16384-fpf',\n",
91 | ")\n",
92 | "text_column_name = 'text'"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "id": "2634b632",
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "data": {
103 | "application/vnd.jupyter.widget-view+json": {
104 | "model_id": "be7a996be3aa499fa73562ad144690d4",
105 | "version_major": 2,
106 | "version_minor": 0
107 | },
108 | "text/plain": [
109 | "Downloading data files: 0%| | 0/1 [00:00, ?it/s]"
110 | ]
111 | },
112 | "metadata": {},
113 | "output_type": "display_data"
114 | },
115 | {
116 | "data": {
117 | "application/vnd.jupyter.widget-view+json": {
118 | "model_id": "1a7627b16a904159805ab4d33775b50e",
119 | "version_major": 2,
120 | "version_minor": 0
121 | },
122 | "text/plain": [
123 | "Extracting data files: 0%| | 0/1 [00:00, ?it/s]"
124 | ]
125 | },
126 | "metadata": {},
127 | "output_type": "display_data"
128 | },
129 | {
130 | "data": {
131 | "application/vnd.jupyter.widget-view+json": {
132 | "model_id": "ad569aa4427e4f858576ec2a0f4759fc",
133 | "version_major": 2,
134 | "version_minor": 0
135 | },
136 | "text/plain": [
137 | "Generating train split: 0 examples [00:00, ? examples/s]"
138 | ]
139 | },
140 | "metadata": {},
141 | "output_type": "display_data"
142 | }
143 | ],
144 | "source": [
145 | "raw_datasets = load_dataset(\n",
146 | " 'json',\n",
147 | " data_files=train_file,\n",
148 | " split='train'\n",
149 | ")\n",
150 | "\n",
151 | "filename = os.path.split(train_file)[1]\n",
152 | "\n",
153 | "def tokenize_function(examples):\n",
154 | " return tokenizer(examples[text_column_name])\n",
155 | "\n",
156 | "column_names = raw_datasets.column_names\n",
157 | "tokenized_datasets = raw_datasets.map(\n",
158 | " tokenize_function,\n",
159 | " batched=True,\n",
160 | " remove_columns=column_names,\n",
161 | " load_from_cache_file=True,\n",
162 | " cache_file_name=f'./{filename}-tokenized-{block_size}',\n",
163 | " num_proc=20,\n",
164 | ")\n",
165 | "\n",
166 | "tokenized_datasets"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 4,
172 | "id": "37417527",
173 | "metadata": {},
174 | "outputs": [
175 | {
176 | "data": {
177 | "text/plain": [
178 | "33339118"
179 | ]
180 | },
181 | "execution_count": 4,
182 | "metadata": {},
183 | "output_type": "execute_result"
184 | }
185 | ],
186 | "source": [
187 | "len(tokenized_datasets)"
188 | ]
189 | }
190 | ],
191 | "metadata": {
192 | "kernelspec": {
193 | "display_name": "Python 3 (ipykernel)",
194 | "language": "python",
195 | "name": "python3"
196 | },
197 | "language_info": {
198 | "codemirror_mode": {
199 | "name": "ipython",
200 | "version": 3
201 | },
202 | "file_extension": ".py",
203 | "mimetype": "text/x-python",
204 | "name": "python",
205 | "nbconvert_exporter": "python",
206 | "pygments_lexer": "ipython3",
207 | "version": "3.10.12"
208 | }
209 | },
210 | "nbformat": 4,
211 | "nbformat_minor": 5
212 | }
213 |
--------------------------------------------------------------------------------
/multilingual-tts/prepare/prepare-MasriSpeech-Full.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "e62caf64",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "from glob import glob\n",
11 | "import pandas as pd\n",
12 | "import os\n",
13 | "import soundfile as sf\n",
14 | "from tqdm import tqdm\n",
15 | "from multiprocess import Pool\n",
16 | "import itertools\n",
17 | "import io\n",
18 | "import numpy as np\n",
19 | "import json\n",
20 | "import re\n",
21 | "import zipfile\n",
22 | "from pathlib import Path\n",
23 | "\n",
24 | "def chunks(l, n):\n",
25 | " for i in range(0, len(l), n):\n",
26 | " yield (l[i: i + n], i // n)\n",
27 | "\n",
28 | "def multiprocessing(strings, function, cores=6, returned=True):\n",
29 | " df_split = chunks(strings, len(strings) // cores)\n",
30 | " pool = Pool(cores)\n",
31 | " pooled = pool.map(function, df_split)\n",
32 | " pool.close()\n",
33 | " pool.join()\n",
34 | "\n",
35 | " if returned:\n",
36 | " return list(itertools.chain(*pooled))"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "id": "2bd68608",
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "name": "stderr",
47 | "output_type": "stream",
48 | "text": [
49 | "/home/ubuntu/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
50 | " from .autonotebook import tqdm as notebook_tqdm\n",
51 | "Fetching 24 files: 100%|██████████| 24/24 [00:10<00:00, 2.32it/s]\n"
52 | ]
53 | },
54 | {
55 | "data": {
56 | "text/plain": [
57 | "'/home/ubuntu/MasriSpeech-Full'"
58 | ]
59 | },
60 | "execution_count": 2,
61 | "metadata": {},
62 | "output_type": "execute_result"
63 | }
64 | ],
65 | "source": [
66 | "from huggingface_hub import snapshot_download\n",
67 | "\n",
68 | "snapshot_download(\n",
69 | " repo_id=\"NightPrince/MasriSpeech-Full\", \n",
70 | " repo_type=\"dataset\", local_dir=\"./MasriSpeech-Full\", allow_patterns=\"*/*.parquet\")"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 14,
76 | "id": "42c0df51",
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/plain": [
82 | "24"
83 | ]
84 | },
85 | "execution_count": 14,
86 | "metadata": {},
87 | "output_type": "execute_result"
88 | }
89 | ],
90 | "source": [
91 | "files = glob('MasriSpeech-Full/*/*.parquet')\n",
92 | "len(files)"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 15,
98 | "id": "7619151a",
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "def loop(files):\n",
103 | "\n",
104 | " os.environ['OMP_NUM_THREADS'] = '1'\n",
105 | " os.environ['OPENBLAS_NUM_THREADS'] = '1'\n",
106 | " \n",
107 | " files, _ = files\n",
108 | "\n",
109 | " data = []\n",
110 | " for f in files:\n",
111 | " base = f.split('/')[0] + '_audio'\n",
112 | " f_new = f.replace('/', '-').replace('.parquet', '')\n",
113 | " os.makedirs(base, exist_ok=True)\n",
114 | " df = pd.read_parquet(f)\n",
115 | " for i in tqdm(range(len(df))):\n",
116 | " try:\n",
117 | " t = df['transcription'].iloc[i].strip()\n",
118 | " if len(t) < 2:\n",
119 | " continue\n",
120 | " audio_filename = f'{f_new}_{i}.mp3'\n",
121 | " audio_filename = os.path.join(base, audio_filename)\n",
122 | " b = df['audio'].iloc[i]['bytes']\n",
123 | " audio_np, sr = sf.read(io.BytesIO(b))\n",
124 | " if audio_np.ndim > 1:\n",
125 | " audio_np = audio_np.mean(axis=1)\n",
126 | " if audio_np.shape[0] < 10000:\n",
127 | " continue\n",
128 | " sf.write(audio_filename, audio_np, sr)\n",
129 | " \n",
130 | " data.append({\n",
131 | " 'audio_filename': audio_filename,\n",
132 | " 'text': t,\n",
133 | " 'speaker': f\"{base}\"\n",
134 | " })\n",
135 | " except Exception as e:\n",
136 | " pass\n",
137 | " \n",
138 | " return data"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 16,
144 | "id": "a66ecfc4",
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "# data = loop((files[:1], 0))"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "id": "935c7e8c",
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stderr",
159 | "output_type": "stream",
160 | "text": [
161 | " 7%|▋ | 145/2205 [00:18<03:34, 9.59it/s]"
162 | ]
163 | }
164 | ],
165 | "source": [
166 | "data = multiprocessing(files, loop, cores = len(files))"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "id": "d5fb3e9a",
173 | "metadata": {},
174 | "outputs": [],
175 | "source": []
176 | }
177 | ],
178 | "metadata": {
179 | "kernelspec": {
180 | "display_name": "Python 3 (ipykernel)",
181 | "language": "python",
182 | "name": "python3"
183 | },
184 | "language_info": {
185 | "codemirror_mode": {
186 | "name": "ipython",
187 | "version": 3
188 | },
189 | "file_extension": ".py",
190 | "mimetype": "text/x-python",
191 | "name": "python",
192 | "nbconvert_exporter": "python",
193 | "pygments_lexer": "ipython3",
194 | "version": "3.10.12"
195 | }
196 | },
197 | "nbformat": 4,
198 | "nbformat_minor": 5
199 | }
200 |
--------------------------------------------------------------------------------
/text/processing/function.py:
--------------------------------------------------------------------------------
1 | import os
2 | import mp
3 | import json
4 | import functools
5 | import subprocess
6 | import utils as ut
7 | from glob import glob
8 | from tqdm import tqdm
9 | from pathlib import Path
10 | from datasets import Dataset
11 | from unidecode import unidecode
12 |
13 |
def download_dataset(link, raw_dataset_path, dataset_name):
    """Download a JSONL dataset with ``wget`` into the raw-datasets folder.

    Sets the module-level ``MAIN_FOLDER_DATASET`` to
    ``{raw_dataset_path}/raw-datasets/`` (read later by ``init_process`` and
    ``get_size``), creates that folder through ``ut.create_dir`` and stores
    the download as ``{dataset_name}.jsonl`` inside it.

    Args:
        link: URL handed to ``wget``.
        raw_dataset_path: Base directory under which ``raw-datasets/`` lives.
        dataset_name: File stem used for the downloaded ``.jsonl`` file.

    Returns:
        ``True`` when every step ran without raising, ``False`` otherwise.
    """
    global MAIN_FOLDER_DATASET

    MAIN_FOLDER_DATASET = f"{raw_dataset_path}/raw-datasets/"

    try:
        ut.create_dir(MAIN_FOLDER_DATASET)

        # NOTE(review): `link` and `dataset_name` are interpolated unquoted.
        # If ut.run_command executes through a shell, confirm both values
        # come from a trusted source.
        command = f"wget {link} -O {MAIN_FOLDER_DATASET}/{dataset_name}.jsonl"
        ut.run_command(command)
    except Exception as err:
        # Only ordinary errors are turned into a False result; Ctrl-C and
        # SystemExit must keep propagating instead of being swallowed.
        print(f"download failed for {link}: {err}")
        return False

    return True
27 |
28 |
def _read_jsonl_records(path):
    """Parse every non-blank line of the JSONL file at ``path``."""
    with open(path, encoding="utf-8") as fopen:
        return [json.loads(line) for line in fopen if line.strip()]


def init_process(
    raw_dataset_path, dataset_name, text_key=None, link=None, clean_file_path=None
):
    """Flatten a raw JSONL dataset into ``{"text": ...}`` staging records.

    The records come from a download (``link``) and/or an already available
    file (``clean_file_path``); when both are given, the records of
    ``clean_file_path`` are the ones that get processed. For every record,
    the values of all recognised text keys are converted with ``str`` and
    joined with a blank line. The result is written through
    ``ut.write_to_json`` to
    ``{raw_dataset_path}/staging-datasets/{dataset_name}.jsonl``.

    Side effects: sets the module-level ``INITIAL_PRE_PROCESSING_FOLDER`` and
    ``MAIN_FOLDER_DATASET``, which later steps read.

    Args:
        raw_dataset_path: Base directory for the raw and staging folders.
        dataset_name: File stem of the dataset (without ``.jsonl``).
        text_key: Optional extra key, or iterable of keys, to treat as text
            in addition to the built-in list.
        link: Optional URL to download the dataset from.
        clean_file_path: Optional path of an existing JSONL file.

    Raises:
        ValueError: If neither ``link`` nor ``clean_file_path`` is given, or
            if the dataset file contains no records.
        Exception: If the first record is not a JSON object, or if none of
            its keys is a recognised text key.
    """
    global INITIAL_PRE_PROCESSING_FOLDER
    global MAIN_FOLDER_DATASET

    # Fail early with a clear message instead of an UnboundLocalError on
    # `data` further down.
    if link is None and clean_file_path is None:
        raise ValueError(
            "init_process needs a data source: pass `link` or `clean_file_path`"
        )

    if link is not None:
        if not download_dataset(link, raw_dataset_path, dataset_name):
            # Keep going: a file from an earlier run may still be usable.
            print(f"warning: download of {link} reported a failure")

        INITIAL_PRE_PROCESSING_FOLDER = f"{raw_dataset_path}/staging-datasets/"
        ut.create_dir(INITIAL_PRE_PROCESSING_FOLDER)

        data = _read_jsonl_records(f"{MAIN_FOLDER_DATASET}/{dataset_name}.jsonl")

    if clean_file_path is not None:
        # get_size() later falls back to stat-ing this path directly.
        MAIN_FOLDER_DATASET = clean_file_path

        INITIAL_PRE_PROCESSING_FOLDER = f"{raw_dataset_path}/staging-datasets/"
        ut.create_dir(INITIAL_PRE_PROCESSING_FOLDER)

        data = _read_jsonl_records(clean_file_path)

    if not data:
        raise ValueError(f"dataset `{dataset_name}` contains no records")

    try:
        key_data = list(data[0].keys())
        print(f"Availble key -> {key_data}")
    except AttributeError as err:
        raise Exception(
            f"dataset not in standard list format, total record in the file -> {len(data)}."
        ) from err

    suitable_key = [
        "p",
        "text",
        "article_text",
        "article_body",
        "text",
        "content",
        "contents",
        "body",
        "articleBody",
        "data",
        "title",
    ]

    if text_key:
        # Accept a single key given as a plain string as well as a list.
        extra_keys = [text_key] if isinstance(text_key, str) else list(text_key)
        suitable_key = list(set(suitable_key + extra_keys))

    # Set for O(1) membership tests inside the per-record loop.
    wanted_keys = set(suitable_key)

    if not any(key in wanted_keys for key in key_data):
        raise Exception(
            f"dataset not in standard key-value. must have ({' | '.join(suitable_key)})"
        )

    txt_l = []
    for record in tqdm(data):
        # str() also turns a JSON null into the literal "None".
        fields = [str(value) for key, value in record.items() if key in wanted_keys]
        txt_l.append({"text": "\n\n".join(fields)})

    ut.write_to_json(txt_l, f"{INITIAL_PRE_PROCESSING_FOLDER}{dataset_name}.jsonl")
100 |
101 |
def second_process(raw_dataset_path, dataset_name):
    """Convert the staging JSONL into a HF dataset and run MinHash dedup.

    Reads ``{dataset_name}.jsonl`` from the module-level
    ``INITIAL_PRE_PROCESSING_FOLDER`` (assigned by ``init_process``, so that
    step has to run first), builds a ``datasets.Dataset`` with a single
    ``text`` column, saves it under ``hf-datasets/raw-datasets/`` and then
    launches ``python3 -m text_dedup.minhash`` through ``ut.run_command``
    with its output directed to ``hf-datasets/dedupe-datasets/``.

    Side effects: sets the module-level ``HF_FOLDER_RAW`` and
    ``HF_FOLDER_DEDUPE``, which ``third_process`` and ``get_size`` read.

    Args:
        raw_dataset_path: Base directory that contains ``hf-datasets/``.
        dataset_name: File stem of the staging file and name of the
            saved/deduplicated dataset.
    """
    global HF_FOLDER_RAW
    global HF_FOLDER_DEDUPE

    HF_FOLDER_RAW = f"{raw_dataset_path}/hf-datasets/raw-datasets/"
    HF_FOLDER_DEDUPE = f"{raw_dataset_path}/hf-datasets/dedupe-datasets/"

    ut.create_dir(HF_FOLDER_RAW)
    ut.create_dir(HF_FOLDER_DEDUPE)

    # The staging folder already ends with "/", so this path contains "//".
    with open(f"{INITIAL_PRE_PROCESSING_FOLDER}/{dataset_name}.jsonl") as fopen:
        data = [json.loads(line) for line in fopen]

    print(f"total records: {len(data)}")

    # A line holding the JSON literal `null` parses to None; drop those.
    data = [entry for entry in tqdm(data) if entry is not None]

    print(f"total records after remove None: {len(data)}")

    # Column-oriented layout expected by Dataset.from_dict.
    data_dict = {"text": [entry["text"] for entry in data]}

    dataset = Dataset.from_dict(data_dict)

    dataset.save_to_disk(f"{HF_FOLDER_RAW}{dataset_name}")

    # The trailing backslashes continue the string literal itself, so the
    # whole command is one line of text with runs of spaces between options.
    command = f"python3 -m text_dedup.minhash \
    --path {HF_FOLDER_RAW}{dataset_name} \
    --split train \
    --cache_dir ./cache \
    --output {HF_FOLDER_DEDUPE}{dataset_name} \
    --column text \
    --batch_size 10000 \
    --threshold 0.95 \
    --min_length 1 \
    --local"

    ut.run_command(command)
139 |
140 |
def third_process(raw_dataset_path, mp_core):
    """Run the post-processing loop over every deduplicated JSONL file.

    Creates the ``postprocessing`` and ``postprocessing-done`` folders under
    ``{raw_dataset_path}/hf-datasets/``, collects all ``*.jsonl`` files from
    the module-level ``HF_FOLDER_DEDUPE`` and passes them to ``ut.loop``:
    in-process when the integer division of the file count by ``mp_core`` is
    zero, otherwise spread over ``mp_core`` workers via
    ``mp.multiprocessing``.

    Args:
        raw_dataset_path: Base directory that contains ``hf-datasets/``.
        mp_core: Number of worker processes for the multi-process path.
    """
    postprocessing_dir = f"{raw_dataset_path}/hf-datasets/postprocessing/"
    postprocessing_done_dir = f"{raw_dataset_path}/hf-datasets/postprocessing-done/"
    for folder in (postprocessing_dir, postprocessing_done_dir):
        ut.create_dir(folder)

    dedupe_files = glob(f"{HF_FOLDER_DEDUPE}*.jsonl")
    print(f"total files to postprocessing --> {len(dedupe_files)}")

    # Enough files to hand at least one to every worker: go parallel.
    if len(dedupe_files) // mp_core != 0:
        mp.multiprocessing(dedupe_files, ut.loop, cores=mp_core, returned=False)
        return

    ut.loop(dedupe_files, process_type="single")
162 |
163 |
def _file_size_mb(path):
    """Return the size of the file at ``path`` in units of 1024 * 1024 bytes."""
    return os.stat(path).st_size / (1024 * 1024)


def get_size(raw_dataset_path, dataset_name):
    """Report the dataset size before dedup, after dedup and after post-processing.

    The "before" size is taken from
    ``{MAIN_FOLDER_DATASET}/{dataset_name}.jsonl``; when that path cannot be
    stat-ed, ``MAIN_FOLDER_DATASET`` itself is measured instead (it holds the
    clean file path when ``init_process`` was given ``clean_file_path``).
    The other two sizes come from ``{HF_FOLDER_DEDUPE}{dataset_name}.jsonl``
    and ``{raw_dataset_path}/hf-datasets/postprocessing/{dataset_name}.jsonl``.

    Args:
        raw_dataset_path: Base directory that contains ``hf-datasets/``.
        dataset_name: File stem of the dataset (without ``.jsonl``).

    Returns:
        A 3-tuple of strings formatted as ``"<size> MB"`` with two decimals:
        (before dedup, after dedup, after post-processing).

    Raises:
        OSError: If the fallback "before" path, the dedup file or the
            post-processing file cannot be stat-ed.
    """
    before_dedup_url = f"{MAIN_FOLDER_DATASET}/{dataset_name}.jsonl"
    before_dedup_clean = f"{MAIN_FOLDER_DATASET}"
    after_dedup = f"{HF_FOLDER_DEDUPE}{dataset_name}.jsonl"
    after_post = f"{raw_dataset_path}/hf-datasets/postprocessing/{dataset_name}.jsonl"

    try:
        before_dedup_mb = _file_size_mb(before_dedup_url)
    except OSError:
        # Only filesystem errors trigger the fallback; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        before_dedup_mb = _file_size_mb(before_dedup_clean)

    after_dedup_mb = _file_size_mb(after_dedup)
    after_post_mb = _file_size_mb(after_post)

    return (
        f"{before_dedup_mb:.2f} MB",
        f"{after_dedup_mb:.2f} MB",
        f"{after_post_mb:.2f} MB",
    )
183 |
--------------------------------------------------------------------------------