├── YiZhao_technical_report.pdf ├── 7_DataAnalysis ├── resources │ ├── HIT.jfif │ ├── simsun.ttf │ └── hit_stopwords.txt ├── eval_pipeline.py ├── corpus_eval_visulization.py └── corpus_evaluator.py ├── 2_toxic_filter ├── sensitive_words │ └── violence.txt └── 2_toxic_filter.py ├── 5_text_dedup ├── clean_helpers │ ├── __init__.py │ ├── concatenation.py │ ├── utils.py │ └── deduplication.py └── 5_clean.py ├── requirements.txt ├── 6_text_dedup └── text_dedup │ ├── __init__.py │ ├── utils │ ├── __init__.py │ ├── preprocess.py │ ├── union_find.py │ ├── tokenization.py │ ├── analysis.py │ ├── timer.py │ └── add_args.py │ └── minhash.py ├── 4_perplexity_filter └── kenlm │ ├── run.py │ └── model.py ├── 3_rule_filter.py ├── README.md └── 1_pii.py /YiZhao_technical_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/YiZhao/HEAD/YiZhao_technical_report.pdf -------------------------------------------------------------------------------- /7_DataAnalysis/resources/HIT.jfif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/YiZhao/HEAD/7_DataAnalysis/resources/HIT.jfif -------------------------------------------------------------------------------- /7_DataAnalysis/resources/simsun.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/YiZhao/HEAD/7_DataAnalysis/resources/simsun.ttf -------------------------------------------------------------------------------- /2_toxic_filter/sensitive_words/violence.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/YiZhao/HEAD/2_toxic_filter/sensitive_words/violence.txt -------------------------------------------------------------------------------- /5_text_dedup/clean_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from .deduplication import build_dedup_template, build_dedup_document 2 | from .concatenation import concatenate_lm_fr_ester -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | regex==2024.9.11 2 | datasets==3.0.0 3 | chardet==5.2.0 4 | ftfy==6.2.3 5 | langdetect==1.0.9 6 | opencc==1.1.9 7 | kenlm==0.2.0 8 | sentencepiece==0.2.0 9 | jsonlines==2.0.0 10 | torch==2.2.2 11 | scipy==1.12.0 12 | rich==13.7.1 13 | tiktoken==0.7.0 14 | openai==1.46.1 15 | matplotlib==3.9.2 16 | seaborn==0.13.2 17 | jieba==0.42.1 18 | wordcloud==1.9.3 19 | pandas==2.2.2 20 | numpy==1.26.4 -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2021-06-05 12:48:33 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | 6 | """Text deduplication simplified.""" 7 | 8 | import logging 9 | 10 | from rich.logging import RichHandler 11 | 12 | logger = logging.getLogger("text_dedup") 13 | logger.setLevel(logging.INFO) 14 | logger.addHandler(RichHandler(rich_tracebacks=True)) 15 | logger.propagate = False 16 | -------------------------------------------------------------------------------- /5_text_dedup/clean_helpers/concatenation.py: 
-------------------------------------------------------------------------------- 1 | from itertools import groupby 2 | 3 | from datasets import Dataset 4 | 5 | from clean_helpers.utils import parse_meta 6 | 7 | 8 | def concatenate_lm_fr_ester(ds: Dataset, num_proc: int, batch_size: int) -> Dataset: 9 | dataset_in_memory = [ 10 | (*parse_meta(row["meta"])["id"].split("_id_"), row["text"]) for row in ds 11 | ] 12 | dataset_in_memory.sort() 13 | new_texts = [] 14 | new_metas = [] 15 | for doc_id, segments in groupby(dataset_in_memory, key=lambda x: x[0]): 16 | sorted_segment = sorted( 17 | [elt[1:] for elt in segments], 18 | key=lambda x: int(x[0]) 19 | ) 20 | new_texts.append("\n".join([elt[1] for elt in sorted_segment])) 21 | new_metas.append({"id": doc_id}) 22 | 23 | new_ds = Dataset.from_dict({"text": new_texts, "meta": new_metas}) 24 | return new_ds 25 | -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2022-12-26 15:42:09 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | 6 | from utils.add_args import add_bloom_filter_args 7 | from utils.add_args import add_exact_hash_args 8 | from utils.add_args import add_io_args 9 | from utils.add_args import add_meta_args 10 | from utils.add_args import add_minhash_args 11 | from utils.add_args import add_sa_args 12 | from utils.add_args import add_simhash_args 13 | from utils.timer import Timer 14 | from utils.tokenization import ngrams 15 | from utils.union_find import UnionFind 16 | 17 | __all__ = [ 18 | "add_bloom_filter_args", 19 | "add_exact_hash_args", 20 | "add_io_args", 21 | "add_meta_args", 22 | "add_minhash_args", 23 | "add_sa_args", 24 | "add_simhash_args", 25 | "Timer", 26 | "ngrams", 27 | "UnionFind", 28 | ] 29 | -------------------------------------------------------------------------------- /5_text_dedup/clean_helpers/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict 3 | 4 | 5 | def parse_meta(meta) -> Dict: 6 | if isinstance(meta, str): 7 | meta = eval(meta) 8 | return meta 9 | 10 | 11 | normalise_dataset_name_regex = re.compile( 12 | r"^(?:/gpfswork/rech/six/uty16tp/dataset/tokenization/)?(bigscience-catalogue-lm-data/[^/]+)(?:/data)?$" 13 | ) 14 | 15 | 16 | language_regex = re.compile( 17 | r"^(?:/gpfswork/rech/six/uty16tp/dataset/tokenization/)?bigscience-catalogue-lm-data/lm_([^_]+)_.*(?:/data)?$" 18 | ) 19 | def get_language(dataset_name: str): 20 | lang_candidate = language_regex.match(dataset_name).group(1) 21 | 22 | # Normalise chinese languages, so that we only consider simplified and traditional chinese as the two chinese languages 23 | if lang_candidate in ["zh", "zhs", "zh-cn"]: 24 | lang_candidate = "zhs" 25 | elif lang_candidate in ["zht", "zh-tw"]: 26 | lang_candidate = "zht" 27 | else: 28 | assert lang_candidate[:2] != "zh" 29 | 30 | return lang_candidate 31 | -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2023-05-06 19:39:27 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | 6 | import regex as re 7 | 8 | DIGIT_RE = re.compile(r"\d") 9 | PUNCT_OR_NON_PRINTING_CHARS_RE = 
re.compile(r"[\p{P}\p{C}\p{S}]+") 10 | 11 | def normalize(line: str) -> str: 12 | """ 13 | Normalize a line of text. Source: https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/text_normalizer.py#L180 14 | 15 | Parameters 16 | ---------- 17 | line : str 18 | The line of text to normalize. 19 | 20 | Returns 21 | ------- 22 | str 23 | The normalized line of text. 24 | 25 | Examples 26 | -------- 27 | >>> normalize("Hello, world!") 28 | 'hello world' 29 | >>> normalize("Hello, 123!\\n\\t\\b") 30 | 'hello 000' 31 | """ 32 | line = line.strip() 33 | if not line: 34 | return line 35 | line = line.lower() 36 | line = DIGIT_RE.sub("0", line) 37 | line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line) 38 | return line 39 | -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/union_find.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2022-12-26 15:37:44 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | 6 | 7 | class UnionFind: 8 | """ 9 | A data structure for maintaining disjoint sets. This helps build connected components for given duplicate pairs. 10 | 11 | Examples 12 | -------- 13 | >>> uf = UnionFind() 14 | >>> uf.union(1, 2) 15 | >>> uf.union(2, 3) 16 | >>> uf.union(4, 5) 17 | >>> uf.find(1) 18 | 1 19 | >>> uf.find(2) 20 | 1 21 | >>> uf.find(3) 22 | 1 23 | >>> uf.find(4) 24 | 4 25 | >>> uf.find(5) 26 | 4 27 | """ 28 | 29 | def __init__(self): 30 | self.parent = {} 31 | 32 | def find(self, x): 33 | if x not in self.parent: 34 | self.parent[x] = x 35 | return x 36 | 37 | if self.parent[x] != x: 38 | self.parent[x] = self.find(self.parent[x]) 39 | 40 | return self.parent[x] 41 | 42 | def union(self, x, y): 43 | px = self.find(x) 44 | py = self.find(y) 45 | self.parent[px] = self.parent[py] = min(px, py) 46 | -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/tokenization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2022-12-26 15:59:42 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | from itertools import tee 6 | from typing import List 7 | from typing import Text 8 | 9 | 10 | def ngrams(sequence: List[Text], n: int, min_length: int = 5): 11 | """ 12 | Return the ngrams generated from a sequence of items, as an iterator. 13 | 14 | This is a modified version of nltk.util.ngrams. 15 | 16 | Parameters 17 | ---------- 18 | sequence : List[Text] 19 | The sequence of items. 20 | n : int 21 | The length of each ngram. 22 | min_length : int, optional 23 | The minimum length of each ngram, by default 5 24 | 25 | Returns 26 | ------- 27 | iterator 28 | The ngrams. 
29 | 30 | Examples 31 | -------- 32 | >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=1)) 33 | [('a', 'b'), ('b', 'c'), ('c', 'd')] 34 | >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=5)) 35 | [] 36 | >>> list(ngrams(["a", "b"], 3, min_length=1)) 37 | [('a', 'b')] 38 | """ 39 | if len(sequence) < min_length: 40 | return [] 41 | if len(sequence) < n: 42 | return [tuple(sequence)] 43 | iterables = tee(iter(sequence), n) 44 | for i, sub_iterable in enumerate(iterables): 45 | for _ in range(i): 46 | next(sub_iterable, None) 47 | return zip(*iterables) 48 | -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2023-01-02 15:18:55 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | from typing import List 6 | 7 | from text_dedup.utils.tokenization import ngrams 8 | 9 | 10 | def jaccard_similarity( 11 | doc1: str | List[str], 12 | doc2: str | List[str], 13 | ngram_size: int = 8, 14 | min_length: int = 0, 15 | ) -> float: 16 | """Compute the Jaccard similarity between two documents. 17 | 18 | Parameters 19 | ---------- 20 | doc1 : str or List[str] 21 | The first document. 22 | doc2 : str or List[str] 23 | The second document. 24 | ngram_size : int, optional 25 | The size of n-grams, by default 8 26 | min_length : int, optional 27 | The minimum length of each n-gram, by default 0 28 | 29 | Returns 30 | ------- 31 | float 32 | The Jaccard similarity. 33 | 34 | Examples 35 | -------- 36 | >>> jaccard_similarity("hello world", "hello world") 37 | 1.0 38 | >>> jaccard_similarity("hello world", "hello world!") 39 | 0.8 40 | >>> jaccard_similarity("hello world".split(), "hello world!".split(), ngram_size=1) 41 | 0.3333333333333333 42 | """ 43 | words1 = set(" ".join(ng) for ng in ngrams(list(doc1), ngram_size, min_length=min_length)) 44 | words2 = set(" ".join(ng) for ng in ngrams(list(doc2), ngram_size, min_length=min_length)) 45 | return len(words1 & words2) / max(1, len(words1 | words2)) 46 | -------------------------------------------------------------------------------- /7_DataAnalysis/eval_pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from corpus_evaluator import corpus_quality_measure_fn 4 | from corpus_eval_visulization import scores_visualization 5 | 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--random_seed", type=int, default=1234) 10 | 11 | parser.add_argument("--data_path", type=str) 12 | parser.add_argument("--data_num", type=int, default=10) 13 | parser.add_argument("--text_column", type=str) 14 | parser.add_argument("--tiktoken_cache", type=str) 15 | 16 | parser.add_argument("--eval_path", type=str) 17 | 18 | parser.add_argument("--figure_dir", type=str) 19 | 20 | parser.add_argument("--model", type=str, default="gpt-3.5-turbo-1106") 21 | parser.add_argument("--api_key", type=str) 22 | parser.add_argument("--organization", type=str) 23 | parser.add_argument("--num_proc", type=int, default=1) 24 | args = parser.parse_args() 25 | 26 | # args.eval_path = args.figure_dir + "/result.jsonl" 27 | 28 | tiktoken_cache_dir = args.tiktoken_cache 29 | os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir 30 | 31 | corpus = corpus_quality_measure_fn( 32 | data_path=args.data_path, 33 | eval_path=args.eval_path, 34 | 
data_num=args.data_num, 35 | text_column=args.text_column, 36 | model=args.model, 37 | api_key=args.api_key, 38 | organization=args.organization, 39 | num_proc=args.num_proc,) 40 | 41 | scores_visualization(corpus, args.text_column, args.figure_dir) 42 | 43 | -------------------------------------------------------------------------------- /4_perplexity_filter/kenlm/run.py: -------------------------------------------------------------------------------- 1 | from model import KenlmModel 2 | import json 3 | import jsonlines 4 | import argparse 5 | from tqdm import tqdm 6 | 7 | def save_jsonl(data, output_path): 8 | with open(output_path, 'w', encoding='utf-8') as output_file: 9 | for item in data: 10 | output_file.write(json.dumps(item, ensure_ascii=False) + "\n") 11 | 12 | def read_jsonl(input_path): 13 | output_data = [] 14 | with open(input_path, 'r+', encoding='utf-8') as f: 15 | for item in jsonlines.Reader(f): 16 | output_data.append(item) 17 | return output_data 18 | 19 | def perplexity_filter(input_path, output_path): 20 | input_data = read_jsonl(input_path) 21 | filtered_data = [] 22 | 23 | for tmp in tqdm(input_data): 24 | score = model.get_perplexity(tmp[args.text_column]) 25 | if score <= 2095: 26 | filtered_data.append(tmp) 27 | 28 | save_jsonl(filtered_data, output_path) 29 | 30 | 31 | 32 | if __name__=='__main__': 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument( 35 | "--input_path", 36 | type=str, 37 | help="Path to input file(jsonl).", 38 | ) 39 | parser.add_argument( 40 | "--output_path", 41 | type=str, 42 | help="Path to output file(jsonl).", 43 | ) 44 | parser.add_argument('--text_column', type=str) 45 | parser.add_argument('--language', type=str, help="zh or en") 46 | args = parser.parse_args() 47 | 48 | # model taken from https://huggingface.co/edugp/kenlm 49 | model = KenlmModel.from_pretrained("kenlm/wikipedia", args.language) 50 | perplexity_filter(args.input_path, args.output_path) 51 | print('Perplexity Filter Done!') 52 | 53 | -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/timer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2022-12-26 15:45:46 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | import time 6 | 7 | 8 | class TimerContext: 9 | def __init__(self, timer: "Timer", name: str): 10 | self.timer = timer 11 | self.name = name 12 | self.start_time = None 13 | 14 | def __enter__(self): 15 | self.start_time = time.time() 16 | 17 | def __exit__(self, exc_type, exc_val, exc_tb): 18 | if any([exc_type, exc_val, exc_tb]): 19 | raise exc_val 20 | self.timer.elapsed_times[self.name] = time.time() - self.start_time 21 | 22 | 23 | class Timer: 24 | """ 25 | A simple timer that tracks the elapsed time of each context. 26 | 27 | Examples 28 | -------- 29 | >>> t = Timer() 30 | >>> with t("test"): 31 | ... time.sleep(1) 32 | >>> assert int(t.elapsed_times.get("test", 0)) >= 1, "The elapsed time should be 1 second." 33 | """ 34 | 35 | def __init__(self): 36 | self.elapsed_times = {} 37 | 38 | def __call__(self, name: str) -> TimerContext: 39 | """ 40 | Create a context with the given name. 41 | 42 | Parameters 43 | ---------- 44 | name: str 45 | The name of the context. 46 | 47 | Returns 48 | ------- 49 | TimerContext 50 | The context. 51 | 52 | Examples 53 | -------- 54 | >>> t = Timer() 55 | >>> with t("test"): 56 | ... 
time.sleep(1) 57 | >>> assert int(t.elapsed_times.get("test", 0)) == 1, "The elapsed time should be 1 second." 58 | >>> with t("test2"): 59 | ... time.sleep(2) 60 | >>> assert int(t.elapsed_times.get("test2", 0)) == 2, "The elapsed time should be 2 seconds." 61 | """ 62 | return TimerContext(self, name) 63 | -------------------------------------------------------------------------------- /2_toxic_filter/2_toxic_filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import chardet 4 | import argparse 5 | from pathlib import Path 6 | 7 | def load_jsonl(path): 8 | with open(path, 'r', encoding='UTF-8') as f: 9 | return [json.loads(l) for l in f] 10 | 11 | class CorpusFilter: 12 | def __init__(self, directory_path): 13 | self.sensitive_keywords = self.load_sensitive_keywords(directory_path) 14 | 15 | def detect_encoding(self, file_path): 16 | with open(file_path, 'rb') as file: 17 | raw_data = file.read(5000) 18 | result = chardet.detect(raw_data) 19 | encoding = result['encoding'] 20 | return encoding 21 | 22 | def load_sensitive_keywords(self, directory_path): 23 | # Load sensitive keywords from all .txt files in the specified directory 24 | sensitive_keywords = set() 25 | for filename in os.listdir(directory_path): 26 | if filename.endswith('.txt'): 27 | file_path = os.path.join(directory_path, filename) 28 | encoding = self.detect_encoding(file_path) 29 | with open(file_path, 'r', encoding=encoding) as file: 30 | for line in file: 31 | keyword = line.strip().rstrip(',') 32 | if keyword: 33 | sensitive_keywords.add(keyword) 34 | return list(sensitive_keywords) 35 | 36 | def is_sensitive(self, text): 37 | for keyword in self.sensitive_keywords: 38 | if keyword in text: 39 | return True 40 | return False 41 | 42 | def filter_corpus(self, input_file_path, output_file_path): 43 | with open(input_file_path, 'r', encoding='utf-8') as input_file, \ 44 | open(output_file_path, 'w', encoding='utf-8') as output_file: 45 | for line in input_file: 46 | try: 47 | data = json.loads(line) 48 | text = data.get(args.text_column, '') 49 | if not self.is_sensitive(text): 50 | output_file.write(json.dumps(data, ensure_ascii=False) + '\n') 51 | except json.JSONDecodeError: 52 | continue # Ignore lines with parsing errors 53 | 54 | 55 | 56 | if __name__ == '__main__': 57 | parser = argparse.ArgumentParser() 58 | # The default input and output are jsonl files 59 | parser.add_argument('--input_path', type=str) 60 | parser.add_argument('--output_path', type=str) 61 | parser.add_argument('--text_column', type=str) 62 | args = parser.parse_args() 63 | 64 | directory_path = Path(__file__).parent / "sensitive_words" 65 | filter = CorpusFilter(directory_path) 66 | data = load_jsonl(args.input_path) 67 | filter.filter_corpus(args.input_path, args.output_path) 68 | 69 | -------------------------------------------------------------------------------- /7_DataAnalysis/corpus_eval_visulization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | from pathlib import Path 4 | from collections import defaultdict 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | import seaborn as sns 9 | import pandas as pd 10 | from pylab import * 11 | import jieba 12 | from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS 13 | from PIL import Image 14 | 15 | 16 | sns.set_palette("hls") 17 | 18 | from matplotlib.font_manager import FontProperties 19 | system = platform.system() 20 
| 21 | font = FontProperties(fname='fonts/opentype/noto/NotoSerifCJK-Black.ttc') 22 | 23 | 24 | def get_all_scores(corpus): 25 | scores_dict = defaultdict(list) 26 | 27 | for obj in corpus: 28 | quality = obj["quality"] 29 | for aspect, result in quality.items(): 30 | if result["score"] >= 0: 31 | scores_dict[aspect].append(result["score"]) 32 | 33 | print("{:6s} | {:5s} | {:5s} | {:5s} | {:5s} | {} ".format("", "Mean", "Std", "Min", "Max", "Count")) 34 | for aspect, score_list in scores_dict.items(): 35 | mean_score = np.mean(score_list) 36 | std_score = np.std(score_list) 37 | min_screo = min(score_list) 38 | max_score = max(score_list) 39 | print("{:5s} | {:5.2f} | {:5.2f} | {:5d} | {:5d} | {}".format(aspect, mean_score, std_score, min_screo, max_score, len(score_list))) 40 | 41 | return scores_dict 42 | 43 | 44 | def get_wordcloud(corpus, text_column, figure_dir): 45 | text_list = [obj[text_column] for obj in corpus] 46 | text = "\n".join(text_list) 47 | 48 | wordlist = jieba.cut(text) 49 | wordlist = [w for w in wordlist if len(w) > 1] 50 | space_list = ' '.join(wordlist) 51 | 52 | backgroud = np.array(Image.open(Path(__file__).parent / "resources/HIT.jfif")) 53 | 54 | with open(Path(__file__).parent / "resources/hit_stopwords.txt", "r", encoding="utf-8") as f: 55 | stopwords = [w.rstrip() for w in f.readlines()] 56 | 57 | wc = WordCloud(width=1400, height=2200, 58 | background_color='white', 59 | mode='RGB', 60 | mask=backgroud, 61 | max_words=500, 62 | stopwords=STOPWORDS.update(stopwords), 63 | max_font_size=150, 64 | relative_scaling=0.6, 65 | random_state=50, 66 | scale=2, 67 | font_path=str(Path(__file__).parent / "resources/simsun.ttf"), 68 | ).generate(space_list) 69 | 70 | image_color = ImageColorGenerator(backgroud) 71 | wc.recolor(color_func=image_color) 72 | 73 | plt.imshow(wc) 74 | plt.axis('off') 75 | plt.show() 76 | wc.to_file(os.path.join(figure_dir, "wordcloud.png")) 77 | 78 | 79 | def get_plot(scores_dict: dict, figure_dir: str): 80 | sns.set_palette("hls") 81 | fig, axs = plt.subplots(1, 5, figsize=(25, 5)) 82 | 83 | color_list = ["#FF55BB", "#00DFA2", "#FFD3A3", "#0079FF", "#F6FA70"] 84 | for i, (aspect, score_list) in enumerate(scores_dict.items()): 85 | 86 | sns.histplot(score_list, bins=10, color=color_list[i], ax=axs[i]) 87 | sns.kdeplot(score_list, color="seagreen", lw=3, ax=axs[i]) 88 | # sns.distplot(score_list, bins=10, kde_kws={"color": "seagreen", "lw": 3}, hist_kws={"color": color_list[i]}, ax=axs[i]) 89 | axs[i].set_title(aspect, fontproperties=font) 90 | 91 | plt.savefig(os.path.join(figure_dir, "quality_hist.png")) 92 | plt.show() 93 | 94 | 95 | def scores_visualization(corpus, text_column, figure_dir): 96 | get_wordcloud(corpus, text_column, figure_dir) 97 | scores_dict = get_all_scores(corpus) 98 | get_plot(scores_dict, figure_dir) -------------------------------------------------------------------------------- /3_rule_filter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import ftfy 4 | import regex 5 | from langdetect import detect 6 | from tqdm import tqdm 7 | import opencc 8 | 9 | def load_jsonl(path): 10 | with open(path, 'r', encoding='UTF-8') as f: 11 | return [json.loads(l) for l in f] 12 | 13 | class RuleFilter: 14 | def __init__(self): 15 | self.OPENCC_CONVERTER = opencc.OpenCC('t2s.json') 16 | self.punctuation_unicode = { 17 | ',': ',', 18 | '。': '.', 19 | '、': ',', 20 | '„': '"', 21 | '”': '"', 22 | '“': '"', 23 | '«': '"', 24 | '»': '"', 25 | '1': '"', 26 | 
'」': '"', 27 | '「': '"', 28 | '《': '"', 29 | '》': '"', 30 | '´': "'", 31 | '∶': ':', 32 | ':': ':', 33 | '?': '?', 34 | '!': '!', 35 | '(': '(', 36 | ')': ')', 37 | ';': ';', 38 | '–': '-', 39 | '—': ' - ', 40 | '.': '. ', 41 | '~': '~', 42 | '’': "'", 43 | '…': '...', 44 | '━': '-', 45 | '〈': '<', 46 | '〉': '>', 47 | '【': '[', 48 | '】': ']', 49 | '%': '%', 50 | '►': '-', 51 | } 52 | self.various_whitespaces = { 53 | ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 54 | ' ', ' ', ' ', ' ', '', '', '', '', '', '' 55 | } 56 | 57 | def handle(self, text): 58 | # unicode 59 | text = ftfy.fix_text(text, normalization="NFC") 60 | # language filter 61 | if detect(text) != args.language: 62 | return None 63 | 64 | # Standardization of Punctuation 65 | text = ''.join([ 66 | self.punctuation_unicode.get(c, c) for c in text 67 | ]) 68 | # Standardization of Whitespace 69 | text = ''.join([ 70 | char if char not in self.various_whitespaces else ' ' for char in text 71 | ]) 72 | 73 | # Replace all matched consecutive punctuation with a single punctuation 74 | pattern = r'(\p{P})\1+' 75 | text = regex.sub(pattern, r'\1', text) 76 | text = text.strip() 77 | 78 | # Filter out texts with too high a punctuation ratio and too short a text length 79 | punctuation_count = len(regex.findall(r'\p{P}', text)) 80 | total_chars = len(text) 81 | punctuation_ratio = punctuation_count / total_chars 82 | if punctuation_ratio > args.punctuation_ratio_threshold or len(text) < args.text_length_threshold: 83 | return None 84 | 85 | 86 | # Convert Traditional Chinese Characters to Simplified Chinese 87 | return self.OPENCC_CONVERTER.convert(text) 88 | 89 | def filter(self, input_file_path, output_file_path): 90 | with open(input_file_path, 'r', encoding='utf-8') as input_file, \ 91 | open(output_file_path, 'w', encoding='utf-8') as output_file: 92 | for line in input_file: 93 | try: 94 | data = json.loads(line) 95 | text = data.get(args.text_column, '') 96 | result = self.handle(text) 97 | if result: 98 | data[args.text_column] = result 99 | output_file.write(json.dumps(data, ensure_ascii=False) + '\n') 100 | except json.JSONDecodeError: 101 | continue # Ignore lines with parsing errors 102 | 103 | 104 | 105 | if __name__ == '__main__': 106 | parser = argparse.ArgumentParser() 107 | # The default input and output are jsonl files 108 | parser.add_argument('--input_path', type=str) 109 | parser.add_argument('--output_path', type=str) 110 | parser.add_argument('--text_column', type=str) 111 | parser.add_argument('--language', type=str) 112 | parser.add_argument('--punctuation_ratio_threshold', type=float, default=0.5) 113 | parser.add_argument('--text_length_threshold', type=int, default=128) 114 | args = parser.parse_args() 115 | 116 | filter = RuleFilter() 117 | filter.filter(args.input_path, args.output_path) 118 | -------------------------------------------------------------------------------- /4_perplexity_filter/kenlm/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import unicodedata 4 | from typing import Dict 5 | 6 | import kenlm 7 | import sentencepiece 8 | from huggingface_hub import cached_download, hf_hub_url 9 | 10 | 11 | class SentencePiece: 12 | def __init__( 13 | self, 14 | model: str, 15 | ): 16 | super().__init__() 17 | self.sp = sentencepiece.SentencePieceProcessor() 18 | self.sp.load(str(model)) 19 | 20 | def do(self, text: dict) -> dict: 21 | tokenized = self.sp.encode_as_pieces(text) 22 | return " ".join(tokenized) 
23 | 24 | 25 | class KenlmModel: 26 | digit_re: re.Pattern = re.compile(r"\d") 27 | unicode_punct: Dict[str, str] = { 28 | ",": ",", 29 | "。": ".", 30 | "、": ",", 31 | "„": '"', 32 | "”": '"', 33 | "“": '"', 34 | "«": '"', 35 | "»": '"', 36 | "1": '"', 37 | "」": '"', 38 | "「": '"', 39 | "《": '"', 40 | "》": '"', 41 | "´": "'", 42 | "∶": ":", 43 | ":": ":", 44 | "?": "?", 45 | "!": "!", 46 | "(": "(", 47 | ")": ")", 48 | ";": ";", 49 | "–": "-", 50 | "—": " - ", 51 | ".": ". ", 52 | "~": "~", 53 | "’": "'", 54 | "…": "...", 55 | "━": "-", 56 | "〈": "<", 57 | "〉": ">", 58 | "【": "[", 59 | "】": "]", 60 | "%": "%", 61 | "►": "-", 62 | } 63 | unicode_punct_re = re.compile(f"[{''.join(unicode_punct.keys())}]") 64 | non_printing_chars_re = re.compile( 65 | f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]" 66 | ) 67 | kenlm_model_dir = None 68 | sentence_piece_model_dir = None 69 | 70 | def __init__( 71 | self, 72 | model_dataset: str, 73 | language: str, 74 | lower_case: bool = False, 75 | remove_accents: bool = False, 76 | normalize_numbers: bool = True, 77 | punctuation: int = 1, 78 | ): 79 | self.model = kenlm.Model(os.path.join(model_dataset, f"{language}.arpa.bin")) 80 | self.tokenizer = SentencePiece(os.path.join(model_dataset, f"{language}.sp.model")) 81 | self.accent = remove_accents 82 | self.case = lower_case 83 | self.numbers = normalize_numbers 84 | self.punct = punctuation 85 | 86 | @classmethod 87 | def from_pretrained( 88 | cls, 89 | model_dataset: str, 90 | language: str, 91 | ): 92 | return cls( 93 | model_dataset, 94 | language, 95 | False, 96 | False, 97 | True, 98 | 1, 99 | ) 100 | 101 | def pp(self, log_score, length): 102 | return 10.0 ** (-log_score / length) 103 | 104 | def get_perplexity(self, doc: str, normalize_cc_net: bool = True): 105 | if normalize_cc_net: 106 | doc = self.normalize( 107 | doc, 108 | accent=self.accent, 109 | case=self.case, 110 | numbers=self.numbers, 111 | punct=self.punct, 112 | ) 113 | # Tokenize (after normalizing): See https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for full pipeline 114 | doc = self.tokenizer.do(doc) 115 | doc_log_score, doc_length = 0, 0 116 | for line in doc.split("\n"): 117 | log_score = self.model.score(line) 118 | length = len(line.split()) + 1 119 | doc_log_score += log_score 120 | doc_length += length 121 | return round(self.pp(doc_log_score, doc_length), 1) 122 | 123 | def normalize( 124 | self, 125 | line: str, 126 | accent: bool = True, 127 | case: bool = True, 128 | numbers: bool = True, 129 | punct: int = 1, 130 | ) -> str: 131 | line = line.strip() 132 | if not line: 133 | return line 134 | if case: 135 | line = line.lower() 136 | if accent: 137 | line = self.strip_accents(line) 138 | if numbers: 139 | line = self.digit_re.sub("0", line) 140 | if punct == 1: 141 | line = self.replace_unicode_punct(line) 142 | elif punct == 2: 143 | line = self.remove_unicode_punct(line) 144 | line = self.remove_non_printing_char(line) 145 | return line 146 | 147 | def strip_accents(self, line: str) -> str: 148 | """Strips accents from a piece of text.""" 149 | nfd = unicodedata.normalize("NFD", line) 150 | output = [c for c in nfd if unicodedata.category(c) != "Mn"] 151 | if len(output) == line: 152 | return line 153 | return "".join(output) 154 | 155 | def replace_unicode_punct(self, text: str) -> str: 156 | return "".join(self.unicode_punct.get(c, c) for c in text) 157 | 158 | def remove_unicode_punct(self, text: str) -> str: 159 | """More aggressive 
version of replace_unicode_punct but also faster.""" 160 | return self.unicode_punct_re.sub("", text) 161 | 162 | def remove_non_printing_char(self, text: str) -> str: 163 | return self.non_printing_chars_re.sub("", text) 164 | -------------------------------------------------------------------------------- /7_DataAnalysis/corpus_evaluator.py: -------------------------------------------------------------------------------- 1 | import re 2 | import random 3 | from collections import OrderedDict 4 | import json 5 | import tiktoken 6 | import openai 7 | from openai import OpenAI 8 | from datasets import Dataset, load_dataset 9 | 10 | # add http proxy 11 | # import os 12 | # os.environ["http_proxy"] = "http://127.0.0.1:10809" 13 | # os.environ["https_proxy"] = "http://127.0.0.1:10809" 14 | 15 | PROMPT = """你是一个语料评价专家,负责对单条语料(通常是一段自然语言文本)的质量进行打分以用于大语言模型的预训练 16 | 你的评价标准是: 17 | 语言质量(0-10分):考察语料的语法、拼写、词汇是否正确,语言表达是否流畅。语言质量高的语料利于模型学习语言规则,可以得高分。得分依据:语法和拼写正确(2分),词汇丰富(2分),表达流畅(2分),长难句或生僻词出现(2分),语言总体复杂(2分)。 18 | 19 | 信息量(0-10分):考察语料所包含的知识量和概念量。信息量大的语料有利于模型学习丰富知识,可以得高分。得分依据:包含专业知识或生僻概念(3分),篇幅较长或讨论多个话题(3分),详尽叙述某一话题(2分),提供新的信息或见解(2分)。 20 | 21 | 新颖性(0-10分):考察语料中的新奇词汇、新信息或新思想对模型理解范围的扩展作用。新颖性高的语料可以得高分。得分依据:包含新词或新概念(3分),提供新信息或新见解(3分),采用新角度或新形式表达观点(2分),创造新的词或短语(2分)。 22 | 23 | 连贯性(0-10分): 主题明确,观点连贯,论证严谨,构成完整论述(3分);主题基本清晰,且论证严谨。(3分) 各部分同属同一话题,构成连贯整体(4分)。 24 | 25 | 纯净度(0-10分):考察语料含有无关信息如广告、营销、垃圾信息的数量,含此类信息少而大部分内容都与主题相关的语料可以得高分。得分依据:主要内容表达完整(3分),垃圾信息含量少(3分),完全没有垃圾信息(4分) 26 | 27 | 通过以上评价标准,你将对下面的语料进行打分: 28 | 【语料开始】 29 | 30 | {corpus} 31 | 32 | 【语料结束】 33 | 34 | 请先分条给出评价理由,再给出对应分数并格式化输出。 35 | 示例: 36 | 【语言质量】:语法和拼写基本正确,词汇较丰富,表达流畅,出现生僻词如“幽灵枪”和长句,语言较复杂。【分数】8 37 | 【信息量】:涉及专业领域知识如各类枪支、美国控枪法案等,讨论多个话题如美国枪支文化与政策、美国枪支暴力现状等,详尽论述美国枪支状况,提供大量数据与信息。【分数】9 38 | 【新颖性】:出现新词“幽灵枪”和新概念如“极端枪支文化”,从政治经济角度揭示美国枪支问题新原因,以全新的角度解析美国枪支文化。【分数】8 39 | 【连贯性】:文中各部分紧密衔接,从美国枪支政策演变到枪支问题分析,再到政治经济因素剖析,行文逻辑清晰,段落结构明确。【分数】9 40 | 【纯净度】:文中的主要内容表达完整,大部分文本都与主题相关,但是结尾含有推广引流信息,不过垃圾信息含量较少。【分数】7 41 | 42 | 输出:""" 43 | 44 | all_aspects = ["语言质量", "信息量", "新颖性", "连贯性", "纯净度"] 45 | 46 | tokenizer = tiktoken.get_encoding('cl100k_base') 47 | 48 | 49 | def read_data(data_path: str, data_num: int) -> Dataset: 50 | dataset = load_dataset("json", data_files=[data_path], split="train", keep_in_memory=True) 51 | 52 | if data_num is not None: 53 | data_num = min(data_num, len(dataset)) 54 | random_indices = random.sample(range(len(dataset)), data_num) 55 | 56 | return dataset.select(random_indices) 57 | 58 | 59 | def cut_corpus(text, max_len=1000): 60 | text_tokens = tokenizer.encode(str(text).strip()) 61 | if len(text_tokens) > max_len: 62 | text_readable = False 63 | text_tokens = text_tokens[:max_len] 64 | while not text_readable and len(text_tokens) > 1: 65 | try: 66 | text = tokenizer.decode(text_tokens) 67 | text_readable = True 68 | except: 69 | text_tokens = text_tokens[:-1] 70 | return text 71 | 72 | 73 | def call_openai_func(instruction: str, model: str = "gpt-3.5-turbo-1106", api_key: str = None, organization: str = None) -> str: 74 | 75 | openai.api_key = api_key 76 | openai.organization = organization 77 | 78 | client = OpenAI(api_key=api_key, organization=organization) 79 | 80 | completion = client.chat.completions.create( 81 | model=model, 82 | messages=[ 83 | {"role": "system", 84 | "content": "You are a helpful assistant."}, 85 | {"role": "user", "content": instruction}, 86 | ], 87 | temperature=0.2, 88 | max_tokens=512, 89 | ) 90 | return completion.choices[0].message.content 91 | 92 | 93 | 94 | def extract_result(text: str) -> dict: 95 | pattern = 
r'【(.*?)】:(.*?)【分数】(\d+)\n' 96 | 97 | matches = re.findall(pattern, text+"\n") 98 | 99 | result = OrderedDict({aspect: {"reason": "", "score": -1} for aspect in all_aspects}) 100 | assert len(matches) == len(all_aspects) 101 | for match in matches: 102 | aspect = match[0] 103 | reason = match[1] 104 | score = match[2] 105 | assert aspect in all_aspects 106 | result[aspect] = {"reason": reason, "score": int(score)} 107 | 108 | return result 109 | 110 | def save_jsonl(data, output_path): 111 | with open(output_path, 'w', encoding='utf-8') as output_file: 112 | for item in data: 113 | output_file.write(json.dumps(item, ensure_ascii=False) + "\n") 114 | 115 | def corpus_quality_measure_fn( 116 | data_path: str, 117 | eval_path: str = None, 118 | data_num: int = None, 119 | text_column: str = "text", 120 | model: str = "gpt-3.5-turbo-1106", 121 | api_key: str = None, 122 | organization: str = None, 123 | num_proc: int = 1,): 124 | 125 | def eval_single_item(obj): 126 | text = obj[text_column] 127 | instruction = PROMPT.format(corpus=cut_corpus(text)) 128 | 129 | try: 130 | response = call_openai_func(instruction, model, api_key, organization) 131 | result = extract_result(response) 132 | except Exception as e: 133 | print("Error") 134 | print(e) 135 | result = OrderedDict({aspect: {"reason": "", "score": -1} for aspect in all_aspects}) 136 | 137 | obj["quality"] = result 138 | return obj 139 | 140 | corpus = read_data(data_path, data_num) 141 | corpus = corpus.map(eval_single_item, num_proc=num_proc) 142 | 143 | if eval_path is not None: 144 | save_jsonl(corpus, eval_path) 145 | # corpus.to_json(eval_path, batch_size=128, force_ascii=False) 146 | 147 | return corpus 148 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 📦 YiZhao: A 2TB Open Financial Dataset 2 | 3 |
4 | 🤗 Hugging Face | 5 | 🤖 ModelScope | 6 | 🪄 YiZhao-12B-Chat | 7 | 📑 Technical Report 8 |
9 | 10 | Data and tools for generating and inspecting **YiZhao**, a safe, high-quality, open-sourced bilingual financial corpus (Chinese, English) released by Harbin Institute of Technology (Shenzhen) and China Merchants Bank Artificial Intelligence Laboratory. 11 | 12 | ## 🌟 Environment 13 | Our recommended Python version is **3.11.4**. 14 | ``` 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ## 🧩 Data Preprocessing 19 | 20 | ### 1. Remove personal information 21 | This step completes the removal of personal information such as IP addresses, emails, and phone numbers. 22 | #### Example usage 23 | ``` 24 | python 1_pii.py \ 25 | --input_path input.jsonl \ 26 | --output_path output.jsonl \ 27 | --text_column text \ 28 | --num_proc 4 \ 29 | --batch_size 100 30 | ``` 31 | 32 | ### 2. Sensitive Words 33 | To avoid the inclusion of toxic content in the training data, one approach is to filter out texts that contain specific sensitive keywords. You need to store the ***txt*** files containing sensitive words in `2_toxic_filter/sensitive_words`. 34 | #### Example usage 35 | ``` 36 | python 2_toxic_filter/2_toxic_filter.py \ 37 | --input_path input.jsonl \ 38 | --output_path output.jsonl \ 39 | --text_column text \ 40 | ``` 41 | 42 | ### 3. Rule Filtering 43 | This step completes multiple rule-based data filtering. 44 | - Language Filtering: Retain only text data in a specific language (***zh-cn*** or ***en***). 45 | - Punctuation and whitespace consistency processing: Unify Chinese and English punctuation within the text, and standardize different types of whitespace characters as well. 46 | - Deduplication of consecutive punctuation: Replace all matched consecutive punctuation marks with a single punctuation mark. 47 | - Punctuation Ratio Filtering: Filter out texts with too high a punctuation ratio. 48 | - Data Length Filtering: Filter out text data that is too short. 49 | #### Example usage 50 | ``` 51 | python 3_rule_filter.py \ 52 | --input_path input.jsonl \ 53 | --output_path output.jsonl \ 54 | --text_column text \ 55 | --language zh-cn \ 56 | --punctuation_ratio_threshold 0.5 \ 57 | --text_length_threshold 128 \ 58 | ``` 59 | 60 | ### 4. Perplexity Filtering 61 | You need to first download the model from the [KenLM repository](https://huggingface.co/edugp/kenlm), and then modify the corresponding model path in the following line in `4_perplexity_filter/kenlm/run.py`. 62 | ```python 63 | model = KenlmModel.from_pretrained("kenlm/wikipedia", args.language) #language = zh or en 64 | ``` 65 | #### Example usage 66 | ``` 67 | python 4_perplexity_filter/kenlm/run.py \ 68 | --input_path input.jsonl \ 69 | --output_path output.jsonl \ 70 | --text_column text \ 71 | --language zh \ 72 | ``` 73 | 74 | ### 5. Exact Deduplication 75 | Deduplicate identical text entries in the dataset. 76 | #### Example usage 77 | ``` 78 | python 5_text_dedup/5_clean.py \ 79 | --input_path input.jsonl \ 80 | --output_path output.jsonl \ 81 | --text_column text \ 82 | --cache cache_dir \ 83 | --num_proc 2 \ 84 | --batch_size 100 85 | ``` 86 | 87 | ### 6. Fuzzy Deduplication 88 | Deduplicate similar texts in the dataset. 89 | #### Example usage 90 | ``` 91 | python 6_text_dedup/text_dedup/minhash.py \ 92 | --input_path input.jsonl \ 93 | --output_path output.jsonl \ 94 | --column text \ 95 | --cache_dir cache_dir \ 96 | --threshold 0.8 \ 97 | --false_positive_weight 0.5 \ 98 | --false_negative_weight 0.5 \ 99 | ``` 100 | 101 | ### 7. 
Financial relevance filtering and security risk filtering 102 | Using a financial relevance classifier (🤗[fin-model-zh-v0.1](https://huggingface.co/HIT-TMG/fin-model-zh-v0.1) and [fin-model-en-v0.1](https://huggingface.co/HIT-TMG/fin-model-en-v0.1)) and a security risk identification classifier (🤗[risk-model-zh-v0.1](https://huggingface.co/HIT-TMG/risk-model-zh-v0.1) and [risk-model-en-v0.1](https://huggingface.co/HIT-TMG/risk-model-en-v0.1)), we filter the corpus down to safe, high-quality financial text (a minimal usage sketch is given after the Data Evaluation criteria below). 103 | 104 | 105 | 106 | ## ⚡️ Data Evaluation 107 | We evaluate each piece of data from the following aspects: 108 | - **Language Quality (0-10 points)**: This examines whether the data is grammatically correct, spelled correctly, uses appropriate vocabulary, and whether the expression is fluent. High language quality helps the model learn language rules, resulting in a higher score. ***Scoring criteria***: correct grammar and spelling (2 points), rich vocabulary (2 points), fluent expression (2 points), use of complex sentences or rare words (2 points), and overall language complexity (2 points). 109 | 110 | - **Information Content (0-10 points)**: This measures the amount of knowledge and concepts contained in the data. Data with high information content helps the model learn rich knowledge, leading to a higher score. ***Scoring criteria***: includes specialized knowledge or obscure concepts (3 points), longer length or discussion of multiple topics (3 points), detailed discussion of a single topic (2 points), and providing new information or insights (2 points). 111 | 112 | - **Novelty (0-10 points)**: This evaluates the extent to which new vocabulary, information, or ideas in the data expand the model's understanding. Data with high novelty can receive higher scores. ***Scoring criteria***: includes new words or concepts (3 points), provides new information or insights (3 points), presents ideas from new perspectives or in new forms (2 points), and creates new words or phrases (2 points). 113 | 114 | - **Coherence (0-10 points)**: This assesses whether the data has a clear theme, coherent arguments, and rigorous reasoning. ***Scoring criteria***: a clear theme, coherent arguments, and rigorous reasoning forming a complete discussion (3 points); a mostly clear theme with rigorous reasoning (3 points); all parts belonging to the same topic, forming a coherent whole (4 points). 115 | 116 | - **Purity (0-10 points)**: This evaluates the amount of irrelevant information, such as ads, marketing, or spam, in the data. Data with little such content, where most of the text relates to the topic, can score higher. ***Scoring criteria***: the main content is fully expressed (3 points), low spam content (3 points), and no spam content at all (4 points). 
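Unlike the earlier steps, this README gives no example command for the classifier-based filtering in step 7. The snippet below is only a minimal, illustrative sketch of how such a filter could be wired up with the Hugging Face `transformers` library; it assumes the models load as standard text-classification models, and the positive label name (`LABEL_1`) and the 0.5 thresholds are placeholders that must be checked against each model card.

```python
# Illustrative sketch only. Assumptions to verify against the model cards:
#   - the models load as standard text-classification models;
#   - "LABEL_1" as the positive label and the 0.5 thresholds are placeholders.
import json
from transformers import pipeline

fin_clf = pipeline("text-classification", model="HIT-TMG/fin-model-zh-v0.1")
risk_clf = pipeline("text-classification", model="HIT-TMG/risk-model-zh-v0.1")

def keep(text: str) -> bool:
    # Keep a record only if it is classified as financial and not as a security risk.
    fin = fin_clf(text, truncation=True)[0]    # {"label": ..., "score": ...}
    risk = risk_clf(text, truncation=True)[0]
    is_financial = fin["label"] == "LABEL_1" and fin["score"] >= 0.5
    is_risky = risk["label"] == "LABEL_1" and risk["score"] >= 0.5
    return is_financial and not is_risky

with open("input.jsonl", encoding="utf-8") as src, \
     open("output.jsonl", "w", encoding="utf-8") as dst:
    for line in src:
        record = json.loads(line)
        if keep(record.get("text", "")):
            dst.write(json.dumps(record, ensure_ascii=False) + "\n")
```

For the English subset, the same pattern applies with fin-model-en-v0.1 and risk-model-en-v0.1.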
117 | 118 | #### Example usage 119 | ``` 120 | python 7_DataAnalysis/eval_pipeline.py \ 121 | --data_path input.jsonl \ 122 | --eval_path output.jsonl \ 123 | --text_column text \ 124 | --tiktoken_cache cache_dir \ 125 | --figure_dir figure_dir \ 126 | --model gpt-3.5-turbo-1106 \ 127 | --api_key xxxx \ 128 | --organization xxxx \ 129 | --num_proc 1 \ 130 | ``` 131 | 132 | -------------------------------------------------------------------------------- /5_text_dedup/clean_helpers/deduplication.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from functools import partial 3 | from typing import List, Set, Tuple, Dict, Callable, Optional 4 | import hashlib 5 | import re 6 | import string 7 | import urllib 8 | 9 | from datasets import Dataset 10 | 11 | 12 | # ======== DEDUPLICATION FUNCTIONS =================== 13 | from clean_helpers.utils import parse_meta 14 | 15 | 16 | def build_dedup_template(min_template_line_size: int, min_template_line_occurence: int): 17 | def dedup_template(ds: Dataset, num_proc: int, batch_size: int) -> Dataset: 18 | """Computes and remove templates lines""" 19 | # Compute the hash of each lines 20 | split_into_lines_and_hashes = ds.map( 21 | split_text_to_lines_and_hash, 22 | num_proc=num_proc, 23 | batched=True, 24 | batch_size=batch_size, 25 | remove_columns=ds.column_names 26 | ) 27 | lines_and_hashes = split_into_lines_and_hashes.remove_columns( 28 | set(split_into_lines_and_hashes.column_names) - {"lines", "hashes"} 29 | ) 30 | 31 | # Find template lines 32 | count_lines_occurence = defaultdict(lambda: 0) 33 | for row in lines_and_hashes: 34 | filtered_lines_and_hashes = [ 35 | (line, hash_) 36 | for line, hash_ in zip(row["lines"], row["hashes"]) 37 | if len(line) >= min_template_line_size 38 | ] 39 | for _, hash_ in filtered_lines_and_hashes: 40 | count_lines_occurence[hash_] += 1 41 | 42 | template_line_hashes = {k for k, v in count_lines_occurence.items() if v >= min_template_line_occurence} 43 | del count_lines_occurence 44 | 45 | # Clean dataset 46 | return split_into_lines_and_hashes.map( 47 | build_remove_template_lines(template_line_hashes), 48 | num_proc=num_proc, 49 | batched=True, 50 | batch_size=batch_size, 51 | remove_columns=split_into_lines_and_hashes.column_names 52 | ) 53 | 54 | return dedup_template 55 | 56 | 57 | def build_dedup_document(batch_normalizer: Callable[[Dict], List[str]]): 58 | def dedup_document(ds: Dataset, num_proc: int, batch_size: int) -> Dataset: 59 | hashed_documents = ds.map( 60 | lambda batch: {**batch, "hash": get_hash(batch_normalizer(batch))}, 61 | num_proc=num_proc, 62 | batched=True, 63 | batch_size=batch_size 64 | ) 65 | 66 | hashes = set() 67 | 68 | return hashed_documents.map( 69 | partial(delete_text_from_duplicates, hashes=hashes), 70 | num_proc=1, # VERY IMPORTANT: hashes will be updated, and is not thread safe. 
71 | batched=True, 72 | batch_size=batch_size, 73 | remove_columns=hashed_documents.column_names 74 | ) 75 | 76 | return dedup_document 77 | 78 | 79 | # =========== HELPERS =============== 80 | 81 | def get_hash(texts: List[str]) -> List[str]: 82 | """Get hash of content field.""" 83 | return [hashlib.md5(text.strip().encode("utf-8")).hexdigest() for text in texts] 84 | 85 | def split_text_in_lines(text: str) -> List[str]: 86 | return [line.strip() for line in text.split("\n")] 87 | 88 | def split_text_to_lines_and_hash(batch: Dict[str, List]): 89 | lines_per_texts = [split_text_in_lines(text) for text in batch["text"]] 90 | return { 91 | **{k: v for k, v in batch.items() if k != "text"}, 92 | "lines": lines_per_texts, 93 | "hashes": [get_hash(lines) for lines in lines_per_texts] 94 | } 95 | 96 | 97 | def clean_text(lines_and_hashes: List[Tuple[str, int]], template_line_hashes: Set[str]): 98 | return "\n".join([line for line, hash_ in lines_and_hashes if hash_ not in template_line_hashes]) 99 | 100 | 101 | def build_remove_template_lines(template_line_hashes: Set[str]): 102 | def remove_template_lines(batch: Dict[str, List]): 103 | cleaned_texts = [ 104 | clean_text( 105 | list(zip(lines, hashes)), 106 | template_line_hashes 107 | ) 108 | for lines, hashes in zip(batch["lines"], batch["hashes"]) 109 | ] 110 | return { 111 | **{ 112 | key: value 113 | for key, value in batch.items() 114 | if key not in ["lines", "hashes"] 115 | }, 116 | "text": [cleaned_text for cleaned_text in cleaned_texts] 117 | } 118 | 119 | return remove_template_lines 120 | 121 | 122 | def is_new_hash(hash_: str, hashes: Set[str]) -> bool: 123 | """Check if current hash is still in set of unique hashes and remove if true.""" 124 | if hash_ in hashes: 125 | return False 126 | else: 127 | hashes.add(hash_) 128 | return True 129 | 130 | def delete_text_from_duplicates(batch: Dict[str, List], hashes: Set[str]) -> Dict[str, List]: 131 | return { 132 | **{k: v for k, v in batch.items() if k != "hash"}, 133 | "text": [text if is_new_hash(hash_, hashes) else "" for text, hash_ in zip(batch["text"], batch["hash"])] 134 | } 135 | 136 | def url_with_only_some_query_param(url: str, query_param_map: Optional[dict] = None) -> str: 137 | url_parse = urllib.parse.urlparse(url) 138 | query = url_parse.query 139 | 140 | url_query_params = urllib.parse.parse_qsl(query) 141 | 142 | if query_param_map is None: 143 | url_query_params_new = {} 144 | else: 145 | url_query_params_new = [(query_param_map[old_key], old_value) for (old_key, old_value) in url_query_params if old_key in query_param_map] 146 | 147 | url_new_query = urllib.parse.urlencode(url_query_params_new, encoding="utf-8") 148 | url_parse = url_parse._replace(query=url_new_query) 149 | new_url = urllib.parse.urlunparse(url_parse) 150 | return new_url 151 | 152 | # =========== BATCH NORMALISER =============== 153 | 154 | 155 | # this only keeps letter characters 156 | remove_non_character_regex = re.compile(f'\s+|\d+|[{re.escape(string.punctuation)}]') 157 | def document_batch_normalizer(batch: Dict) -> List[str]: 158 | return [remove_non_character_regex.sub('', text) for text in batch["text"]] 159 | 160 | 161 | def strict_url_batch_normalizer(batch: Dict) -> List[str]: 162 | return [parse_meta(meta)["url"] for meta in batch["meta"]] 163 | 164 | 165 | url_host_and_path_regex = re.compile(r"^(.[^?]*)") 166 | def url_host_and_path_batch_normalizer(batch: Dict) -> List[str]: 167 | return [url_host_and_path_regex.match(parse_meta(meta)["url"]).group(1) for meta in batch["meta"]] 168 
| 169 | lm_es_pseudocrawl_filtered_341_es_cointelegraph_com_regex = re.compile(r"^((?:(?!/amp)/?(?:[^?/]*))+)(?:/amp)?") 170 | def url_lm_es_pseudocrawl_filtered_341_es_cointelegraph_com(batch: Dict) -> List[str]: 171 | return [lm_es_pseudocrawl_filtered_341_es_cointelegraph_com_regex.match(parse_meta(meta)["url"]).group(1) for meta in batch["meta"]] 172 | 173 | def url_lm_en_pseudocrawl_filtered_619_www_qut_edu_au(batch: Dict) -> List[str]: 174 | return [url_with_only_some_query_param(parse_meta(meta)["url"], {"id": "id", "news-id": "id"}) for meta in batch["meta"]] -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/add_args.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2022-11-05 09:16:34 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | import argparse 6 | 7 | 8 | def add_io_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 9 | """ 10 | Add input/output arguments to parser. 11 | 12 | Parameters 13 | ---------- 14 | parser : argparse.ArgumentParser 15 | Parser to add arguments to. 16 | 17 | Returns 18 | ------- 19 | parser : argparse.ArgumentParser 20 | Parser with added arguments. 21 | """ 22 | parser.add_argument("--input_path", type=str, help="`path` in load_dataset", required=False), 23 | parser.add_argument("--name", type=str, help="`name` in load_dataset"), 24 | parser.add_argument("--data_dir", type=str, help="`data_dir` in load_dataset"), 25 | parser.add_argument("--data_files", type=str, help="`data_files` in load_dataset"), 26 | parser.add_argument("--split", type=str, help="`split` in load_dataset"), 27 | parser.add_argument("--cache_dir", type=str, help="`cache_dir` in load_dataset", default=".cache"), 28 | parser.add_argument("--revision", type=str, help="`revision` in load_dataset"), 29 | parser.add_argument( 30 | "--use_auth_token", action=argparse.BooleanOptionalAction, help="`use_auth_token` in load_dataset" 31 | ), 32 | parser.add_argument("--local", action=argparse.BooleanOptionalAction, help="Use local dataset", default=False), 33 | parser.add_argument("--output_path", type=str, help="Path to deduplicated dataset output", required=False), 34 | parser.add_argument( 35 | "--debug", action=argparse.BooleanOptionalAction, help="Whether to run in debug mode", default=False 36 | ) 37 | return parser 38 | 39 | 40 | def add_meta_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 41 | """ 42 | Add meta arguments to parser. 43 | 44 | Parameters 45 | ---------- 46 | parser : argparse.ArgumentParser 47 | Parser to add arguments to. 48 | 49 | Returns 50 | ------- 51 | parser : argparse.ArgumentParser 52 | Parser with added arguments. 53 | """ 54 | parser.add_argument( 55 | "--column", 56 | type=str, 57 | help="""Text column to use for deduplication. Concatenate desired columns beforehand if needed.""", 58 | required=False, 59 | ), 60 | parser.add_argument( 61 | "--batch_size", 62 | type=int, 63 | help="""Batch size to use for dataset iteration. Mainly for memory efficiency.""", 64 | default=1000000, 65 | ), 66 | return parser 67 | 68 | 69 | def add_minhash_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 70 | """ 71 | Add MinHash arguments to parser. 72 | 73 | Parameters 74 | ---------- 75 | parser : argparse.ArgumentParser 76 | Parser to add arguments to. 
77 | 78 | Returns 79 | ------- 80 | parser : argparse.ArgumentParser 81 | Parser with added arguments. 82 | """ 83 | parser.add_argument( 84 | "--ngram", 85 | type=int, 86 | default=5, 87 | help="Ngram size to use in MinHash.", 88 | ) 89 | parser.add_argument( 90 | "--min_length", 91 | type=int, 92 | default=5, 93 | help="Minimum number of tokens to use in MinHash. Shorter documents will be filtered out.", 94 | ) 95 | parser.add_argument("--seed", type=int, default=42, help="Seed to use in MinHash") 96 | parser.add_argument("--num_perm", type=int, default=256, help="Number of permutations to use in MinHash") 97 | parser.add_argument( 98 | "--threshold", type=float, default=0.7, help="Jaccard similarity threshold to use in MinHashLSH" 99 | ) 100 | parser.add_argument( 101 | "--b", 102 | type=int, 103 | default=None, 104 | help="Number of bands", 105 | ) 106 | parser.add_argument( 107 | "--r", 108 | type=int, 109 | default=None, 110 | help="Number of rows per band", 111 | ) 112 | 113 | return parser 114 | 115 | 116 | def add_simhash_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 117 | """ 118 | Add SimHash arguments to parser. 119 | 120 | Parameters 121 | ---------- 122 | parser : argparse.ArgumentParser 123 | Parser to add arguments to. 124 | 125 | Returns 126 | ------- 127 | parser : argparse.ArgumentParser 128 | Parser with added arguments. 129 | """ 130 | parser.add_argument( 131 | "--ngram", 132 | type=int, 133 | default=3, 134 | help="""Ngram size to use in SimHash.""", 135 | ) 136 | parser.add_argument("--f", type=int, default=64, help="Simhash bit size"), 137 | parser.add_argument("--bit_diff", type=int, default=3, help="Bit difference to use in SimHash"), 138 | parser.add_argument( 139 | "--num_bucket", type=int, default=4, help="Number of buckets to use in SimHash, must be larger than bit_diff" 140 | ), 141 | return parser 142 | 143 | 144 | def add_sa_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 145 | """ 146 | Add Suffix Array arguments to parser. 147 | 148 | Parameters 149 | ---------- 150 | parser : argparse.ArgumentParser 151 | Parser to add arguments to. 152 | 153 | Returns 154 | ------- 155 | parser : argparse.ArgumentParser 156 | Parser with added arguments. 157 | """ 158 | parser.add_argument( 159 | "--k", type=int, default=100, help="Minimum byte length of a duplicate substring in Suffix Array Deduplication" 160 | ), 161 | parser.add_argument( 162 | "--strategy", 163 | type=str, 164 | default="overlapping", 165 | help="Strategy when there are overlapping duplicate substrings", 166 | choices=["overlapping", "longest"], 167 | ) 168 | parser.add_argument( 169 | "--google_repo_path", type=str, help="Path to google-research-deduplication codebase", required=True 170 | ), 171 | return parser 172 | 173 | 174 | def add_bloom_filter_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 175 | """ 176 | Add Bloom Filter arguments to parser. 177 | 178 | Parameters 179 | ---------- 180 | parser : argparse.ArgumentParser 181 | Parser to add arguments to. 182 | 183 | Returns 184 | ------- 185 | parser : argparse.ArgumentParser 186 | Parser with added arguments. 
187 | """ 188 | parser.add_argument("--error_rate", type=float, default=1e-6, help="Error rate to use in BloomFilter"), 189 | parser.add_argument("--hash_func", type=str, default="md5", help="Hash function to use in BloomFilter"), 190 | parser.add_argument("--initial_capacity", type=int, default=100, help="Initial capacity of BloomFilter"), 191 | return parser 192 | 193 | 194 | def add_exact_hash_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 195 | """ 196 | Add Exact Hash arguments to parser. 197 | 198 | Parameters 199 | ---------- 200 | parser : argparse.ArgumentParser 201 | Parser to add arguments to. 202 | 203 | Returns 204 | ------- 205 | parser : argparse.ArgumentParser 206 | Parser with added arguments. 207 | """ 208 | parser.add_argument("--hash_func", type=str, default="md5", help="Hash function to use in ExactHash"), 209 | return parser 210 | 211 | def add_own_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: 212 | parser.add_argument("--output_duped", type=str, help="duped path to store"), 213 | parser.add_argument( 214 | "--false_positive_weight", 215 | type=float, 216 | default=0.5, 217 | help="false_positive_weight", 218 | ), 219 | parser.add_argument( 220 | "--false_negative_weight", 221 | type=float, 222 | default=0.5, 223 | help="false_negative_weight", 224 | ), 225 | parser.add_argument("--dataset_name", type=str, help="dataset_name",default="text_dedup.jsonl"), 226 | # parser.add_argument("--output_duped", type=str, help="duped path to store"), 227 | return parser -------------------------------------------------------------------------------- /5_text_dedup/5_clean.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import random 5 | from functools import partial 6 | 7 | import torch 8 | from datasets import Dataset, load_dataset, load_from_disk, concatenate_datasets 9 | from pathlib import Path 10 | from typing import Tuple, Optional, List, Dict 11 | from datasets.utils.logging import set_verbosity_info 12 | from numpy.random import default_rng 13 | 14 | 15 | from clean_helpers import build_dedup_template, build_dedup_document, concatenate_lm_fr_ester 16 | from clean_helpers.deduplication import document_batch_normalizer, url_host_and_path_batch_normalizer, \ 17 | url_lm_es_pseudocrawl_filtered_341_es_cointelegraph_com, url_lm_en_pseudocrawl_filtered_619_www_qut_edu_au 18 | 19 | 20 | 21 | set_verbosity_info() 22 | logger = logging.getLogger(__name__) 23 | torch.set_num_threads(1) 24 | 25 | # Deduplication functions and boolean to save a sample of the modifications: function(ds: Dataset, num_proc: int, batch_size: int) -> Dataset 26 | DEDUPS = { 27 | "dedup_template_soft": (build_dedup_template( 28 | min_template_line_size=15, 29 | min_template_line_occurence=10, 30 | ), True), 31 | "dedup_pseudocrawl_newspapers": (build_dedup_template( 32 | min_template_line_size=0, 33 | min_template_line_occurence=2, 34 | ), True), 35 | "dedup_document": (build_dedup_document(document_batch_normalizer), True), 36 | "dedup_document_on_url": (build_dedup_document(url_host_and_path_batch_normalizer), True), 37 | "dedup_document_on_url_lm_es_pseudocrawl-filtered_341_es_cointelegraph_com": (build_dedup_document( 38 | url_lm_es_pseudocrawl_filtered_341_es_cointelegraph_com 39 | ), True), 40 | "dedup_document_on_url_lm_en_pseudocrawl_filtered_619_www_qut_edu_au": (build_dedup_document( 41 | url_lm_en_pseudocrawl_filtered_619_www_qut_edu_au 42 | ), 
-------------------------------------------------------------------------------- /5_text_dedup/5_clean.py: --------------------------------------------------------------------------------
1 | import argparse 2 | import json 3 | import logging 4 | import random 5 | from functools import partial 6 |
7 | import torch 8 | from datasets import Dataset, load_dataset, load_from_disk, concatenate_datasets 9 | from pathlib import Path 10 | from typing import Tuple, Optional, List, Dict 11 | from datasets.utils.logging import set_verbosity_info 12 | from numpy.random import default_rng 13 | 14 |
15 | from clean_helpers import build_dedup_template, build_dedup_document, concatenate_lm_fr_ester
16 | from clean_helpers.deduplication import document_batch_normalizer, url_host_and_path_batch_normalizer, \ 17 | url_lm_es_pseudocrawl_filtered_341_es_cointelegraph_com, url_lm_en_pseudocrawl_filtered_619_www_qut_edu_au
18 | 19 | 20 |
21 | set_verbosity_info() 22 | logger = logging.getLogger(__name__) 23 | torch.set_num_threads(1) 24 |
25 | # Deduplication functions, each paired with a boolean saying whether to save a sample of the modifications. Signature: function(ds: Dataset, num_proc: int, batch_size: int) -> Dataset
26 | DEDUPS = { 27 | "dedup_template_soft": (build_dedup_template( 28 | min_template_line_size=15, 29 | min_template_line_occurence=10, 30 | ), True),
31 | "dedup_pseudocrawl_newspapers": (build_dedup_template( 32 | min_template_line_size=0, 33 | min_template_line_occurence=2, 34 | ), True),
35 | "dedup_document": (build_dedup_document(document_batch_normalizer), True),
36 | "dedup_document_on_url": (build_dedup_document(url_host_and_path_batch_normalizer), True),
37 | "dedup_document_on_url_lm_es_pseudocrawl-filtered_341_es_cointelegraph_com": (build_dedup_document( 38 | url_lm_es_pseudocrawl_filtered_341_es_cointelegraph_com 39 | ), True),
40 | "dedup_document_on_url_lm_en_pseudocrawl_filtered_619_www_qut_edu_au": (build_dedup_document( 41 | url_lm_en_pseudocrawl_filtered_619_www_qut_edu_au 42 | ), True),
43 | "concatenate_lm_fr_ester": (concatenate_lm_fr_ester, False) 44 | } 45 | 46 |
47 | DEDUPS_KEYS = set(DEDUPS.keys()) 48 |
49 | def get_size_per_example(texts: List[str]) -> Dict: 50 | size_values = [len(text.encode()) for text in texts] 51 | examples = {"bytes_len": size_values} 52 | return examples 53 |
54 | def quick_size_estimation( 55 | ds: Dataset, 56 | num_proc: int, 57 | batch_size: int, 58 | content_key: str = "text" 59 | ) -> float: 60 | if len(ds) == 0: 61 | return 0
62 | rng = default_rng(1991) 63 | subset_size = min(10000, len(ds)) 64 | indices = rng.choice(len(ds), size=subset_size, replace=False, shuffle=False) 65 | partial_ds = ds.select(indices) 66 | ratio = float(len(ds)) / float(subset_size) 67 |
68 | partial_ds = partial_ds.map( 69 | get_size_per_example, 70 | batched=True, 71 | num_proc=num_proc, 72 | batch_size=batch_size, 73 | input_columns=[content_key], 74 | remove_columns=partial_ds.column_names, 75 | )
76 | len_bytes = sum(partial_ds["bytes_len"]) 77 | return len_bytes * ratio
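# Note: quick_size_estimation extrapolates. It measures UTF-8 bytes on a random sample of at most
# 10,000 rows and scales by len(ds) / subset_size, so the GB figures reported by log_stats are
# estimates rather than exact sizes. A minimal, dependency-free illustration of the same idea
# (the helper and toy data below are hypothetical, not part of the pipeline):
def _toy_size_estimate(texts, sample_size=2):
    sample = texts[:sample_size]
    sampled_bytes = sum(len(t.encode()) for t in sample)
    return sampled_bytes * (len(texts) / len(sample))
# _toy_size_estimate(["aa", "bb", "cccc"]) returns 6.0 while the true total is 8 bytes --
# close enough for the progress logging it is used for here.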
78 | 79 | 80 | 81 |
82 | def filter_diff_text(examples, in_text_col, out_text_col): 83 | return [text_in != text_out for text_in, text_out in zip(examples[in_text_col], examples[out_text_col])] 84 |
85 | def get_args(): 86 | parser = argparse.ArgumentParser()
87 | parser.add_argument("--input_path", type=str, required=True, help="Path of the dataset we load.")
88 | parser.add_argument("--output_path", type=Path, required=True, 89 | help="Path where we save the resulting dataset after modifications.")
90 | parser.add_argument('--text_column', type=str)
91 | parser.add_argument("--cache", type=str, required=True, help="Cache path.")
92 | parser.add_argument("--checks_save_path", type=Path, default=None, 93 | help="Path where we save samples we've removed or changed throughout the modifications.")
94 | parser.add_argument("--num_proc", type=int, default=1)
95 | parser.add_argument("--batch_size", type=int, default=100)
96 | parser.add_argument("--load_arrow_file", action="store_true", 97 | help="Option to indicate how to load the original dataset. By default we use `load_dataset`. " 98 | "If the flag is used, we use `load_from_disk`.")
99 | parser.add_argument("--sampling_size_map_checks", type=int, default=None, 100 | help="Optional argument. Checked samples, i.e. samples we've changed throughout the " 101 | "modifications, are saved either in whole or as a subset. If set to None, everything " 102 | "is saved, otherwise a subset of this size is saved.")
103 | parser.add_argument("--sampling_size_filter_checks", type=int, default=None, 104 | help="Optional argument. Checked samples, i.e. samples we've removed throughout the " 105 | "modifications, are saved either in whole or as a subset. If set to None, everything " 106 | "is saved, otherwise a subset of this size is saved.")
107 | parser.add_argument("--from_scratch", action="store_true", help="Resave all datasets on disk.")
108 | parser.add_argument("--save_to_json", default=True, help="Save output dataset in json format.")
109 | return parser.parse_args() 110 |
111 | def log_stats(title: str, original_ds: Dataset, after_transformation_ds: Dataset, operation_type: str, args):
112 | original_length = len(original_ds) 113 | after_transformation_length = len(after_transformation_ds)
114 | original_bytes = quick_size_estimation(original_ds, batch_size=args.batch_size, num_proc=args.num_proc, content_key=args.text_column)
115 | after_transformation_bytes = quick_size_estimation(after_transformation_ds, batch_size=args.batch_size, num_proc=args.num_proc, content_key=args.text_column)
116 | logger.info(title)
117 | logger.info(f" Initial number of samples: {original_length} samples")
118 | logger.info(f" {operation_type} samples: {original_length - after_transformation_length} samples")
119 | logger.info(f" {operation_type} percentage: {(original_length - after_transformation_length) / original_length * 100:.2f} %")
120 | logger.info(f" Final number of samples: {after_transformation_length} samples")
121 | logger.info(f" Initial size: {original_bytes * 1e-9:.4f} GB")
122 | logger.info(f" {operation_type} bytes: {(original_bytes - after_transformation_bytes) * 1e-9:.4f} GB")
123 | logger.info(f" {operation_type} percentage in bytes: {(original_bytes - after_transformation_bytes) / original_bytes * 100:.2f} %")
124 | logger.info(f" Final size: {after_transformation_bytes * 1e-9:.4f} GB")
125 | 126 | 127 |
128 | def get_modified_documents( 129 | ds: Dataset, 130 | mapped_ds: Dataset, 131 | num_proc: int, 132 | batch_size: int, 133 | sampling_size: Optional[int], 134 | text_column, 135 | ) -> Dataset:
136 | remove_columns = set(ds.column_names) 137 | remove_columns.remove(text_column)
138 | ds = ds.remove_columns(remove_columns)
139 | ds = ds.rename_column(text_column, "old_text")
140 |
141 | assert len(mapped_ds) == len(ds), "Mapping functions are batched, but they should not alter the size of the batch."
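# The diff below relies on row alignment: the mapped dataset and the original are concatenated
# column-wise (axis=1) and only the rows whose text actually changed are kept, so the mapping must
# preserve both the number of rows and their order -- hence the assert above. The flatten_indices()
# calls materialise any pending index mapping so the two Arrow tables can be concatenated side by side.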
142 | mapped_diff_ds = concatenate_datasets([mapped_ds.flatten_indices(), ds.flatten_indices()], axis=1).filter( 143 | partial(filter_diff_text, in_text_col="old_text", out_text_col=text_column), 144 | batched=True, 145 | num_proc=num_proc, 146 | batch_size=batch_size 147 | )
148 |
149 | logger.info("Examples of modified samples:")
150 | idx_samples = random.sample(range(len(mapped_diff_ds)), min(len(mapped_diff_ds), 10))
151 | for idx in idx_samples: 152 | logger.info(f" Example n°{idx}:\n{json.dumps(mapped_diff_ds[idx], indent=2)}")
153 |
154 | if sampling_size is not None: 155 | idx_samples = random.sample(range(len(mapped_diff_ds)), min(len(mapped_diff_ds), sampling_size)) 156 | mapped_diff_ds = mapped_diff_ds.select(idx_samples)
157 | 158 | return mapped_diff_ds 159 | 160 |
161 | def apply_function(function_name: str, ds: Dataset, args) -> Tuple[Dataset, Optional[Dataset]]:
162 | logger.info(f"Applying: {function_name}")
163 | if function_name in DEDUPS: 164 | dedup_function, dedup_check = DEDUPS[function_name]
165 | deduplicated_ds = dedup_function(ds, num_proc=args.num_proc, batch_size=args.batch_size)
166 | log_stats(f"Applied deduplication function: {function_name}", ds, deduplicated_ds, operation_type="Deduplicated", args=args)
167 |
168 | # Some deduplications do not preserve the number of samples, so alignment is lost, e.g. "dedup_document".
169 | if args.checks_save_path is not None and dedup_check:
170 | deduped_diff_ds = get_modified_documents(ds, deduplicated_ds, args.num_proc, args.batch_size, args.sampling_size_map_checks, args.text_column)
171 | return deduplicated_ds, deduped_diff_ds
172 | else: 173 | return deduplicated_ds, None
174 | else: 175 | raise NotImplementedError(f"{function_name} has not matched any existing function names. Available names:\n" 176 | f"Dedup functions: {DEDUPS_KEYS}\n" 177 | )
178 |
179 | def main(): 180 | logging.basicConfig( 181 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 182 | datefmt="%m/%d/%Y %H:%M:%S", 183 | level=logging.INFO, 184 | )
185 | args = get_args()
186 | logger.info(f"** The job is run with the following arguments: **\n{args}\n **** ")
187 |
188 | # Load dataset
189 | logger.info(f" ===== Loading {args.input_path} =====")
190 | if args.load_arrow_file: 191 | ds = load_from_disk(args.input_path)
192 | else: 193 | ds = load_dataset("json", data_files=args.input_path, split="train", cache_dir=args.cache)
194 |
195 | # Apply series of dedups
196 | logger.info(f" ===== Applying transformations =====")
197 |
198 | preprocessings = ["dedup_template_soft", "dedup_document"]
199 | for idx, preprocessing in enumerate(preprocessings):
200 | ds, ds_diff = apply_function(preprocessing, ds, args)
201 | if ds_diff is not None and len(ds_diff) != 0:
202 | saving_path = args.checks_save_path / f"{idx}_{preprocessing}_checks"
203 | if not args.from_scratch and saving_path.exists(): 204 | continue
205 | tmp_save_path = Path(saving_path.parent, f"tmp-{saving_path.name}")
206 | logger.info(f" ===== Saving examples to check after {preprocessing} =====")
207 | ds_diff.save_to_disk(tmp_save_path)
208 | tmp_save_path.rename(saving_path)
209 | 210 |
211 | # Save to disk
212 | if args.from_scratch or not args.output_path.exists():
213 | logger.info(f" ===== Saving dataset =====")
214 | logger.info(f"Saving to final dataset at {args.output_path}.")
215 | tmp_save_path = Path(args.output_path.parent, f"tmp-{args.output_path.name}")
216 | if len(ds) == 0: 217 | logger.info("Dataset was empty. 
Not saving anything.") 218 | return 219 | if args.save_to_json: 220 | ds.to_json( 221 | tmp_save_path, 222 | num_proc=args.num_proc, 223 | force_ascii=False 224 | ) 225 | else: 226 | ds.save_to_disk(tmp_save_path) 227 | tmp_save_path.rename(args.output_path) 228 | else: 229 | logging.info(f"Dataset was already saved at {args.output_path}") 230 | 231 | 232 | if __name__ == "__main__": 233 | main() 234 | -------------------------------------------------------------------------------- /1_pii.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | from pathlib import Path 4 | import logging 5 | import random 6 | import sys 7 | import regex 8 | from datasets.utils.logging import set_verbosity_info 9 | from datasets import load_dataset, load_from_disk 10 | 11 | set_verbosity_info() 12 | logger = logging.getLogger(__name__) 13 | high_risk_tags = {'KEY', 'EMAIL', 'USER', 'IP_ADDRESS'} # , 'NUMBER', "ID"} 14 | year_patterns = [ 15 | # yyyy-yyyy or yyyy/yyyy 16 | regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/][1-2][0-9]{3})(?:$|[\s@,?!;:\'\"(.\p{Han}])"), 17 | # yyyy-mm-dd or yyyy-dd-mm or yyyy/mm/dd or yyyy/dd/mm or yyyy.mm.dd or yyyy.dd.mm 18 | regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/.][0-3][0-9][\p{Pd}/.][0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])"), 19 | # mm-dd-yyyy or dd-mm-yyyy or mm/dd/yyyy or dd/mm/yyyy or mm.dd.yyyy or dd.mm.yyyy or the same but with yy instead of yyyy 20 | regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/.][0-3][0-9][\p{Pd}/.](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])"), 21 | # mm-yyyy or mm/yyyy or the same but with yy 22 | regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])"), 23 | # yyyy-mm or yyyy/mm 24 | regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}-[0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])"), 25 | ] 26 | 27 | # Patterns for high-risk character strings 28 | id_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([A-Za-z]*(?:[\p{Pd}]*\p{Nd}){6,})(?:$|[\b\s@?,!;:\'\")(.\p{Han}])' 29 | 30 | # https://regex101.com/r/JQkmh8/5 31 | key_pattern = r'(?:^|[\b\s@?,!:;\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:_]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[ \p{Pd}]?){3,})(?:$|[\b\s\p{Han}@?,!;:\'\")(.])' 32 | 33 | ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}' 34 | ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])' 35 | ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join([ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:\'\"(.\p{Han}])" 36 | 37 | # https://regex101.com/r/EpA5B7/1 38 | email_pattern = r''' 39 | (?<= ^ | 
[\b\s@,?!;:)('".\p{Han}<] ) 40 | ( 41 | [^\b\s@?!;,:)('"<]+ 42 | @ 43 | [^\b\s@!?;,/]* 44 | [^\b\s@?!;,/:)('">.] 45 | \. 46 | \p{L} \w{1,} 47 | ) 48 | (?= $ | [\b\s@,?!;:)('".\p{Han}>] ) 49 | ''' 50 | 51 | # https://regex101.com/r/mOqi1s/3 52 | user_pattern = r''' 53 | (?<= ^ | [)(\s@,?!;:'"\p{Han}] ) 54 | (@ 55 | [^)(\s@,?!;:'"]{3,} 56 | ) 57 | ''' 58 | # Examples from https://regexpattern.com/phone-number/ 59 | # https://regex101.com/r/lZZ0XP/4 60 | # Also matches MLS numbers 61 | # phone_pattern = r'(?:^|[\s\'\"(\p{Han}])((?:\+\p{Nd}+[ \/.\p{Pd}]*)?(?:(?:\(\+?\p{Nd}+\))?(?:[ \/.\p{Pd}]*\p{Nd})){7,}(?:[\t\f #]*\p{Nd}+)?)(?:$|[\s@,?!;:\'\"(.\p{Han}])' 62 | 63 | id_regex = regex.compile(id_pattern, flags=regex.MULTILINE) #, re.MULTILINE) 64 | key_regex = regex.compile(key_pattern, flags=regex.MULTILINE) #, re.MULTILINE) 65 | ipv4_regex = regex.compile(ipv4_pattern) 66 | ipv6_regex = regex.compile(ipv6_pattern) 67 | ip_regex = regex.compile(ip_pattern, flags=regex.MULTILINE) #, re.MULTILINE) 68 | email_regex = regex.compile(email_pattern, flags=regex.MULTILINE|regex.VERBOSE) #, re.MULTILINE) 69 | user_regex = regex.compile(user_pattern, flags=regex.MULTILINE|regex.VERBOSE) #, re.MULTILINE) 70 | # phone_regex = regex.compile(phone_pattern, flags=regex.MULTILINE) #, re.MULTILINE) 71 | 72 | 73 | 74 | mst_regexes = {} 75 | for tag in high_risk_tags: 76 | if tag == 'ID': 77 | mst_regexes['ID'] = id_regex 78 | elif tag == 'KEY': 79 | mst_regexes['KEY'] = key_regex 80 | elif tag == 'IPv4': 81 | mst_regexes['IPv4'] = ipv4_regex 82 | elif tag == 'IPv6': 83 | mst_regexes['IPv6'] = ipv6_regex 84 | elif tag == 'IP_ADDRESS': 85 | mst_regexes['IP_ADDRESS'] = ip_regex 86 | elif tag == 'EMAIL': 87 | mst_regexes['EMAIL'] = email_regex 88 | elif tag == 'USER': 89 | mst_regexes['USER'] = user_regex 90 | # elif tag == 'NUMBER': 91 | # mst_regexes['NUMBER'] = phone_regex 92 | else: 93 | sys.stderr.write('Dont have tag regex pattern for %s =(' % tag) 94 | 95 | def ip_has_digit(matched_str): 96 | """Checks to make sure the PII span is not just :: or whatever that may 97 | accidentally be picked up by making sure there are digits.""" 98 | return any(map(str.isdigit, matched_str)) 99 | 100 | def matches_date_pattern(matched_str): 101 | # Screen out date false positives 102 | for year_regex in year_patterns: 103 | if year_regex.match(matched_str): 104 | return True 105 | return False 106 | 107 | 108 | def detect_pii(text, lang, tag_types): 109 | matches = [] 110 | for tag in tag_types: 111 | label_pattern = mst_regexes[tag] 112 | # !! regex.match happens here!! 113 | matches_tmp = label_pattern.finditer(text) 114 | for match in matches_tmp: 115 | if match.groups(): 116 | if len(match.groups()) > 1 and match.groups()[1]: 117 | sys.stderr.write("Warning: Found substring matches in the main match.") 118 | 119 | matched_str = match.groups() 120 | 121 | matched_str = matched_str[0] 122 | if matched_str: 123 | if tag in ["IP_ADDRESS"]: 124 | # Filter out false positive IPs 125 | if not ip_has_digit(matched_str): 126 | continue 127 | if tag in ["ID", "IP_ADDRESS"]: #, "NUMBER"]: 128 | # Filter out date false positives 129 | if matches_date_pattern(matched_str): 130 | continue 131 | 132 | matches += [(matched_str, match.span(), str(label_pattern), tag, lang)] 133 | return matches 134 | 135 | 136 | #@title Redaction function defined here. 137 | def redact_pii(text, matches): 138 | """Takes a match as defined in the detect_pii function and redacts it from the full string, returning a