├── YiZhao_technical_report.pdf ├── 7_DataAnalysis ├── resources │ ├── HIT.jfif │ ├── simsun.ttf │ └── hit_stopwords.txt ├── eval_pipeline.py ├── corpus_eval_visulization.py └── corpus_evaluator.py ├── 2_toxic_filter ├── sensitive_words │ └── violence.txt └── 2_toxic_filter.py ├── 5_text_dedup ├── clean_helpers │ ├── __init__.py │ ├── concatenation.py │ ├── utils.py │ └── deduplication.py └── 5_clean.py ├── requirements.txt ├── 6_text_dedup └── text_dedup │ ├── __init__.py │ ├── utils │ ├── __init__.py │ ├── preprocess.py │ ├── union_find.py │ ├── tokenization.py │ ├── analysis.py │ ├── timer.py │ └── add_args.py │ └── minhash.py ├── 4_perplexity_filter └── kenlm │ ├── run.py │ └── model.py ├── 3_rule_filter.py ├── README.md └── 1_pii.py /YiZhao_technical_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/YiZhao/HEAD/YiZhao_technical_report.pdf -------------------------------------------------------------------------------- /7_DataAnalysis/resources/HIT.jfif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/YiZhao/HEAD/7_DataAnalysis/resources/HIT.jfif -------------------------------------------------------------------------------- /7_DataAnalysis/resources/simsun.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/YiZhao/HEAD/7_DataAnalysis/resources/simsun.ttf -------------------------------------------------------------------------------- /2_toxic_filter/sensitive_words/violence.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HITsz-TMG/YiZhao/HEAD/2_toxic_filter/sensitive_words/violence.txt -------------------------------------------------------------------------------- /5_text_dedup/clean_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from .deduplication import build_dedup_template, build_dedup_document 2 | from .concatenation import concatenate_lm_fr_ester -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | regex==2024.9.11 2 | datasets==3.0.0 3 | chardet==5.2.0 4 | ftfy==6.2.3 5 | langdetect==1.0.9 6 | opencc==1.1.9 7 | kenlm==0.2.0 8 | sentencepiece==0.2.0 9 | jsonlines==2.0.0 10 | torch==2.2.2 11 | scipy==1.12.0 12 | rich==13.7.1 13 | tiktoken==0.7.0 14 | openai==1.46.1 15 | matplotlib==3.9.2 16 | seaborn==0.13.2 17 | jieba==0.42.1 18 | wordcloud==1.9.3 19 | pandas==2.2.2 20 | numpy==1.26.4 -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2021-06-05 12:48:33 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | 6 | """Text deduplication simplified.""" 7 | 8 | import logging 9 | 10 | from rich.logging import RichHandler 11 | 12 | logger = logging.getLogger("text_dedup") 13 | logger.setLevel(logging.INFO) 14 | logger.addHandler(RichHandler(rich_tracebacks=True)) 15 | logger.propagate = False 16 | -------------------------------------------------------------------------------- /5_text_dedup/clean_helpers/concatenation.py: 
-------------------------------------------------------------------------------- 1 | from itertools import groupby 2 | 3 | from datasets import Dataset 4 | 5 | from clean_helpers.utils import parse_meta 6 | 7 | 8 | def concatenate_lm_fr_ester(ds: Dataset, num_proc: int, batch_size: int) -> Dataset: 9 | dataset_in_memory = [ 10 | (*parse_meta(row["meta"])["id"].split("_id_"), row["text"]) for row in ds 11 | ] 12 | dataset_in_memory.sort() 13 | new_texts = [] 14 | new_metas = [] 15 | for doc_id, segments in groupby(dataset_in_memory, key=lambda x: x[0]): 16 | sorted_segment = sorted( 17 | [elt[1:] for elt in segments], 18 | key=lambda x: int(x[0]) 19 | ) 20 | new_texts.append("\n".join([elt[1] for elt in sorted_segment])) 21 | new_metas.append({"id": doc_id}) 22 | 23 | new_ds = Dataset.from_dict({"text": new_texts, "meta": new_metas}) 24 | return new_ds 25 | -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2022-12-26 15:42:09 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | 6 | from utils.add_args import add_bloom_filter_args 7 | from utils.add_args import add_exact_hash_args 8 | from utils.add_args import add_io_args 9 | from utils.add_args import add_meta_args 10 | from utils.add_args import add_minhash_args 11 | from utils.add_args import add_sa_args 12 | from utils.add_args import add_simhash_args 13 | from utils.timer import Timer 14 | from utils.tokenization import ngrams 15 | from utils.union_find import UnionFind 16 | 17 | __all__ = [ 18 | "add_bloom_filter_args", 19 | "add_exact_hash_args", 20 | "add_io_args", 21 | "add_meta_args", 22 | "add_minhash_args", 23 | "add_sa_args", 24 | "add_simhash_args", 25 | "Timer", 26 | "ngrams", 27 | "UnionFind", 28 | ] 29 | -------------------------------------------------------------------------------- /5_text_dedup/clean_helpers/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict 3 | 4 | 5 | def parse_meta(meta) -> Dict: 6 | if isinstance(meta, str): 7 | meta = eval(meta) 8 | return meta 9 | 10 | 11 | normalise_dataset_name_regex = re.compile( 12 | r"^(?:/gpfswork/rech/six/uty16tp/dataset/tokenization/)?(bigscience-catalogue-lm-data/[^/]+)(?:/data)?$" 13 | ) 14 | 15 | 16 | language_regex = re.compile( 17 | r"^(?:/gpfswork/rech/six/uty16tp/dataset/tokenization/)?bigscience-catalogue-lm-data/lm_([^_]+)_.*(?:/data)?$" 18 | ) 19 | def get_language(dataset_name: str): 20 | lang_candidate = language_regex.match(dataset_name).group(1) 21 | 22 | # Normalise chinese languages, so that we only consider simplified and traditional chinese as the two chinese languages 23 | if lang_candidate in ["zh", "zhs", "zh-cn"]: 24 | lang_candidate = "zhs" 25 | elif lang_candidate in ["zht", "zh-tw"]: 26 | lang_candidate = "zht" 27 | else: 28 | assert lang_candidate[:2] != "zh" 29 | 30 | return lang_candidate 31 | -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2023-05-06 19:39:27 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | 6 | import regex as re 7 | 8 | DIGIT_RE = re.compile(r"\d") 9 | PUNCT_OR_NON_PRINTING_CHARS_RE = 
re.compile(r"[\p{P}\p{C}\p{S}]+") 10 | 11 | def normalize(line: str) -> str: 12 | """ 13 | Normalize a line of text. Source: https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/text_normalizer.py#L180 14 | 15 | Parameters 16 | ---------- 17 | line : str 18 | The line of text to normalize. 19 | 20 | Returns 21 | ------- 22 | str 23 | The normalized line of text. 24 | 25 | Examples 26 | -------- 27 | >>> normalize("Hello, world!") 28 | 'hello world' 29 | >>> normalize("Hello, 123!\\n\\t\\b") 30 | 'hello 000' 31 | """ 32 | line = line.strip() 33 | if not line: 34 | return line 35 | line = line.lower() 36 | line = DIGIT_RE.sub("0", line) 37 | line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line) 38 | return line 39 | -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/union_find.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2022-12-26 15:37:44 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | 6 | 7 | class UnionFind: 8 | """ 9 | A data structure for maintaining disjoint sets. This helps build connected components for given duplicate pairs. 10 | 11 | Examples 12 | -------- 13 | >>> uf = UnionFind() 14 | >>> uf.union(1, 2) 15 | >>> uf.union(2, 3) 16 | >>> uf.union(4, 5) 17 | >>> uf.find(1) 18 | 1 19 | >>> uf.find(2) 20 | 1 21 | >>> uf.find(3) 22 | 1 23 | >>> uf.find(4) 24 | 4 25 | >>> uf.find(5) 26 | 4 27 | """ 28 | 29 | def __init__(self): 30 | self.parent = {} 31 | 32 | def find(self, x): 33 | if x not in self.parent: 34 | self.parent[x] = x 35 | return x 36 | 37 | if self.parent[x] != x: 38 | self.parent[x] = self.find(self.parent[x]) 39 | 40 | return self.parent[x] 41 | 42 | def union(self, x, y): 43 | px = self.find(x) 44 | py = self.find(y) 45 | self.parent[px] = self.parent[py] = min(px, py) 46 | -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/tokenization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2022-12-26 15:59:42 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | from itertools import tee 6 | from typing import List 7 | from typing import Text 8 | 9 | 10 | def ngrams(sequence: List[Text], n: int, min_length: int = 5): 11 | """ 12 | Return the ngrams generated from a sequence of items, as an iterator. 13 | 14 | This is a modified version of nltk.util.ngrams. 15 | 16 | Parameters 17 | ---------- 18 | sequence : List[Text] 19 | The sequence of items. 20 | n : int 21 | The length of each ngram. 22 | min_length : int, optional 23 | The minimum length of each ngram, by default 5 24 | 25 | Returns 26 | ------- 27 | iterator 28 | The ngrams. 
29 | 30 | Examples 31 | -------- 32 | >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=1)) 33 | [('a', 'b'), ('b', 'c'), ('c', 'd')] 34 | >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=5)) 35 | [] 36 | >>> list(ngrams(["a", "b"], 3, min_length=1)) 37 | [('a', 'b')] 38 | """ 39 | if len(sequence) < min_length: 40 | return [] 41 | if len(sequence) < n: 42 | return [tuple(sequence)] 43 | iterables = tee(iter(sequence), n) 44 | for i, sub_iterable in enumerate(iterables): 45 | for _ in range(i): 46 | next(sub_iterable, None) 47 | return zip(*iterables) 48 | -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2023-01-02 15:18:55 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | from typing import List 6 | 7 | from text_dedup.utils.tokenization import ngrams 8 | 9 | 10 | def jaccard_similarity( 11 | doc1: str | List[str], 12 | doc2: str | List[str], 13 | ngram_size: int = 8, 14 | min_length: int = 0, 15 | ) -> float: 16 | """Compute the Jaccard similarity between two documents. 17 | 18 | Parameters 19 | ---------- 20 | doc1 : str or List[str] 21 | The first document. 22 | doc2 : str or List[str] 23 | The second document. 24 | ngram_size : int, optional 25 | The size of n-grams, by default 8 26 | min_length : int, optional 27 | The minimum length of each n-gram, by default 0 28 | 29 | Returns 30 | ------- 31 | float 32 | The Jaccard similarity. 33 | 34 | Examples 35 | -------- 36 | >>> jaccard_similarity("hello world", "hello world") 37 | 1.0 38 | >>> jaccard_similarity("hello world", "hello world!") 39 | 0.8 40 | >>> jaccard_similarity("hello world".split(), "hello world!".split(), ngram_size=1) 41 | 0.3333333333333333 42 | """ 43 | words1 = set(" ".join(ng) for ng in ngrams(list(doc1), ngram_size, min_length=min_length)) 44 | words2 = set(" ".join(ng) for ng in ngrams(list(doc2), ngram_size, min_length=min_length)) 45 | return len(words1 & words2) / max(1, len(words1 | words2)) 46 | -------------------------------------------------------------------------------- /7_DataAnalysis/eval_pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from corpus_evaluator import corpus_quality_measure_fn 4 | from corpus_eval_visulization import scores_visualization 5 | 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--random_seed", type=int, default=1234) 10 | 11 | parser.add_argument("--data_path", type=str) 12 | parser.add_argument("--data_num", type=int, default=10) 13 | parser.add_argument("--text_column", type=str) 14 | parser.add_argument("--tiktoken_cache", type=str) 15 | 16 | parser.add_argument("--eval_path", type=str) 17 | 18 | parser.add_argument("--figure_dir", type=str) 19 | 20 | parser.add_argument("--model", type=str, default="gpt-3.5-turbo-1106") 21 | parser.add_argument("--api_key", type=str) 22 | parser.add_argument("--organization", type=str) 23 | parser.add_argument("--num_proc", type=int, default=1) 24 | args = parser.parse_args() 25 | 26 | # args.eval_path = args.figure_dir + "/result.jsonl" 27 | 28 | tiktoken_cache_dir = args.tiktoken_cache 29 | os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir 30 | 31 | corpus = corpus_quality_measure_fn( 32 | data_path=args.data_path, 33 | eval_path=args.eval_path, 34 | 
data_num=args.data_num, 35 | text_column=args.text_column, 36 | model=args.model, 37 | api_key=args.api_key, 38 | organization=args.organization, 39 | num_proc=args.num_proc,) 40 | 41 | scores_visualization(corpus, args.text_column, args.figure_dir) 42 | 43 | -------------------------------------------------------------------------------- /4_perplexity_filter/kenlm/run.py: -------------------------------------------------------------------------------- 1 | from model import KenlmModel 2 | import json 3 | import jsonlines 4 | import argparse 5 | from tqdm import tqdm 6 | 7 | def save_jsonl(data, output_path): 8 | with open(output_path, 'w', encoding='utf-8') as output_file: 9 | for item in data: 10 | output_file.write(json.dumps(item, ensure_ascii=False) + "\n") 11 | 12 | def read_jsonl(input_path): 13 | output_data = [] 14 | with open(input_path, 'r+', encoding='utf-8') as f: 15 | for item in jsonlines.Reader(f): 16 | output_data.append(item) 17 | return output_data 18 | 19 | def perplexity_filter(input_path, output_path): 20 | input_data = read_jsonl(input_path) 21 | filtered_data = [] 22 | 23 | for tmp in tqdm(input_data): 24 | score = model.get_perplexity(tmp[args.text_column]) 25 | if score <= 2095: 26 | filtered_data.append(tmp) 27 | 28 | save_jsonl(filtered_data, output_path) 29 | 30 | 31 | 32 | if __name__=='__main__': 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument( 35 | "--input_path", 36 | type=str, 37 | help="Path to input file(jsonl).", 38 | ) 39 | parser.add_argument( 40 | "--output_path", 41 | type=str, 42 | help="Path to output file(jsonl).", 43 | ) 44 | parser.add_argument('--text_column', type=str) 45 | parser.add_argument('--language', type=str, help="zh or en") 46 | args = parser.parse_args() 47 | 48 | # model taken from https://huggingface.co/edugp/kenlm 49 | model = KenlmModel.from_pretrained("kenlm/wikipedia", args.language) 50 | perplexity_filter(args.input_path, args.output_path) 51 | print('Perplexity Filter Done!') 52 | 53 | -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/timer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2022-12-26 15:45:46 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | import time 6 | 7 | 8 | class TimerContext: 9 | def __init__(self, timer: "Timer", name: str): 10 | self.timer = timer 11 | self.name = name 12 | self.start_time = None 13 | 14 | def __enter__(self): 15 | self.start_time = time.time() 16 | 17 | def __exit__(self, exc_type, exc_val, exc_tb): 18 | if any([exc_type, exc_val, exc_tb]): 19 | raise exc_val 20 | self.timer.elapsed_times[self.name] = time.time() - self.start_time 21 | 22 | 23 | class Timer: 24 | """ 25 | A simple timer that tracks the elapsed time of each context. 26 | 27 | Examples 28 | -------- 29 | >>> t = Timer() 30 | >>> with t("test"): 31 | ... time.sleep(1) 32 | >>> assert int(t.elapsed_times.get("test", 0)) >= 1, "The elapsed time should be 1 second." 33 | """ 34 | 35 | def __init__(self): 36 | self.elapsed_times = {} 37 | 38 | def __call__(self, name: str) -> TimerContext: 39 | """ 40 | Create a context with the given name. 41 | 42 | Parameters 43 | ---------- 44 | name: str 45 | The name of the context. 46 | 47 | Returns 48 | ------- 49 | TimerContext 50 | The context. 51 | 52 | Examples 53 | -------- 54 | >>> t = Timer() 55 | >>> with t("test"): 56 | ... 
time.sleep(1) 57 | >>> assert int(t.elapsed_times.get("test", 0)) == 1, "The elapsed time should be 1 second." 58 | >>> with t("test2"): 59 | ... time.sleep(2) 60 | >>> assert int(t.elapsed_times.get("test2", 0)) == 2, "The elapsed time should be 2 seconds." 61 | """ 62 | return TimerContext(self, name) 63 | -------------------------------------------------------------------------------- /2_toxic_filter/2_toxic_filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import chardet 4 | import argparse 5 | from pathlib import Path 6 | 7 | def load_jsonl(path): 8 | with open(path, 'r', encoding='UTF-8') as f: 9 | return [json.loads(l) for l in f] 10 | 11 | class CorpusFilter: 12 | def __init__(self, directory_path): 13 | self.sensitive_keywords = self.load_sensitive_keywords(directory_path) 14 | 15 | def detect_encoding(self, file_path): 16 | with open(file_path, 'rb') as file: 17 | raw_data = file.read(5000) 18 | result = chardet.detect(raw_data) 19 | encoding = result['encoding'] 20 | return encoding 21 | 22 | def load_sensitive_keywords(self, directory_path): 23 | # Load sensitive keywords from all .txt files in the specified directory 24 | sensitive_keywords = set() 25 | for filename in os.listdir(directory_path): 26 | if filename.endswith('.txt'): 27 | file_path = os.path.join(directory_path, filename) 28 | encoding = self.detect_encoding(file_path) 29 | with open(file_path, 'r', encoding=encoding) as file: 30 | for line in file: 31 | keyword = line.strip().rstrip(',') 32 | if keyword: 33 | sensitive_keywords.add(keyword) 34 | return list(sensitive_keywords) 35 | 36 | def is_sensitive(self, text): 37 | for keyword in self.sensitive_keywords: 38 | if keyword in text: 39 | return True 40 | return False 41 | 42 | def filter_corpus(self, input_file_path, output_file_path): 43 | with open(input_file_path, 'r', encoding='utf-8') as input_file, \ 44 | open(output_file_path, 'w', encoding='utf-8') as output_file: 45 | for line in input_file: 46 | try: 47 | data = json.loads(line) 48 | text = data.get(args.text_column, '') 49 | if not self.is_sensitive(text): 50 | output_file.write(json.dumps(data, ensure_ascii=False) + '\n') 51 | except json.JSONDecodeError: 52 | continue # Ignore lines with parsing errors 53 | 54 | 55 | 56 | if __name__ == '__main__': 57 | parser = argparse.ArgumentParser() 58 | # The default input and output are jsonl files 59 | parser.add_argument('--input_path', type=str) 60 | parser.add_argument('--output_path', type=str) 61 | parser.add_argument('--text_column', type=str) 62 | args = parser.parse_args() 63 | 64 | directory_path = Path(__file__).parent / "sensitive_words" 65 | filter = CorpusFilter(directory_path) 66 | data = load_jsonl(args.input_path) 67 | filter.filter_corpus(args.input_path, args.output_path) 68 | 69 | -------------------------------------------------------------------------------- /7_DataAnalysis/corpus_eval_visulization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | from pathlib import Path 4 | from collections import defaultdict 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | import seaborn as sns 9 | import pandas as pd 10 | from pylab import * 11 | import jieba 12 | from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS 13 | from PIL import Image 14 | 15 | 16 | sns.set_palette("hls") 17 | 18 | from matplotlib.font_manager import FontProperties 19 | system = platform.system() 20 
| 21 | font = FontProperties(fname='fonts/opentype/noto/NotoSerifCJK-Black.ttc') 22 | 23 | 24 | def get_all_scores(corpus): 25 | scores_dict = defaultdict(list) 26 | 27 | for obj in corpus: 28 | quality = obj["quality"] 29 | for aspect, result in quality.items(): 30 | if result["score"] >= 0: 31 | scores_dict[aspect].append(result["score"]) 32 | 33 | print("{:6s} | {:5s} | {:5s} | {:5s} | {:5s} | {} ".format("", "Mean", "Std", "Min", "Max", "Count")) 34 | for aspect, score_list in scores_dict.items(): 35 | mean_score = np.mean(score_list) 36 | std_score = np.std(score_list) 37 | min_screo = min(score_list) 38 | max_score = max(score_list) 39 | print("{:5s} | {:5.2f} | {:5.2f} | {:5d} | {:5d} | {}".format(aspect, mean_score, std_score, min_screo, max_score, len(score_list))) 40 | 41 | return scores_dict 42 | 43 | 44 | def get_wordcloud(corpus, text_column, figure_dir): 45 | text_list = [obj[text_column] for obj in corpus] 46 | text = "\n".join(text_list) 47 | 48 | wordlist = jieba.cut(text) 49 | wordlist = [w for w in wordlist if len(w) > 1] 50 | space_list = ' '.join(wordlist) 51 | 52 | backgroud = np.array(Image.open(Path(__file__).parent / "resources/HIT.jfif")) 53 | 54 | with open(Path(__file__).parent / "resources/hit_stopwords.txt", "r", encoding="utf-8") as f: 55 | stopwords = [w.rstrip() for w in f.readlines()] 56 | 57 | wc = WordCloud(width=1400, height=2200, 58 | background_color='white', 59 | mode='RGB', 60 | mask=backgroud, 61 | max_words=500, 62 | stopwords=STOPWORDS.update(stopwords), 63 | max_font_size=150, 64 | relative_scaling=0.6, 65 | random_state=50, 66 | scale=2, 67 | font_path=str(Path(__file__).parent / "resources/simsun.ttf"), 68 | ).generate(space_list) 69 | 70 | image_color = ImageColorGenerator(backgroud) 71 | wc.recolor(color_func=image_color) 72 | 73 | plt.imshow(wc) 74 | plt.axis('off') 75 | plt.show() 76 | wc.to_file(os.path.join(figure_dir, "wordcloud.png")) 77 | 78 | 79 | def get_plot(scores_dict: dict, figure_dir: str): 80 | sns.set_palette("hls") 81 | fig, axs = plt.subplots(1, 5, figsize=(25, 5)) 82 | 83 | color_list = ["#FF55BB", "#00DFA2", "#FFD3A3", "#0079FF", "#F6FA70"] 84 | for i, (aspect, score_list) in enumerate(scores_dict.items()): 85 | 86 | sns.histplot(score_list, bins=10, color=color_list[i], ax=axs[i]) 87 | sns.kdeplot(score_list, color="seagreen", lw=3, ax=axs[i]) 88 | # sns.distplot(score_list, bins=10, kde_kws={"color": "seagreen", "lw": 3}, hist_kws={"color": color_list[i]}, ax=axs[i]) 89 | axs[i].set_title(aspect, fontproperties=font) 90 | 91 | plt.savefig(os.path.join(figure_dir, "quality_hist.png")) 92 | plt.show() 93 | 94 | 95 | def scores_visualization(corpus, text_column, figure_dir): 96 | get_wordcloud(corpus, text_column, figure_dir) 97 | scores_dict = get_all_scores(corpus) 98 | get_plot(scores_dict, figure_dir) -------------------------------------------------------------------------------- /3_rule_filter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import ftfy 4 | import regex 5 | from langdetect import detect 6 | from tqdm import tqdm 7 | import opencc 8 | 9 | def load_jsonl(path): 10 | with open(path, 'r', encoding='UTF-8') as f: 11 | return [json.loads(l) for l in f] 12 | 13 | class RuleFilter: 14 | def __init__(self): 15 | self.OPENCC_CONVERTER = opencc.OpenCC('t2s.json') 16 | self.punctuation_unicode = { 17 | ',': ',', 18 | '。': '.', 19 | '、': ',', 20 | '„': '"', 21 | '”': '"', 22 | '“': '"', 23 | '«': '"', 24 | '»': '"', 25 | '1': '"', 26 | 
'」': '"', 27 | '「': '"', 28 | '《': '"', 29 | '》': '"', 30 | '´': "'", 31 | '∶': ':', 32 | ':': ':', 33 | '?': '?', 34 | '!': '!', 35 | '(': '(', 36 | ')': ')', 37 | ';': ';', 38 | '–': '-', 39 | '—': ' - ', 40 | '.': '. ', 41 | '~': '~', 42 | '’': "'", 43 | '…': '...', 44 | '━': '-', 45 | '〈': '<', 46 | '〉': '>', 47 | '【': '[', 48 | '】': ']', 49 | '%': '%', 50 | '►': '-', 51 | } 52 | self.various_whitespaces = { 53 | ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 54 | ' ', ' ', ' ', ' ', '​', '‌', '‍', '⁠', '', '„' 55 | } 56 | 57 | def handle(self, text): 58 | # unicode 59 | text = ftfy.fix_text(text, normalization="NFC") 60 | # language filter 61 | if detect(text) != args.language: 62 | return None 63 | 64 | # Standardization of Punctuation 65 | text = ''.join([ 66 | self.punctuation_unicode.get(c, c) for c in text 67 | ]) 68 | # Standardization of Whitespace 69 | text = ''.join([ 70 | char if char not in self.various_whitespaces else ' ' for char in text 71 | ]) 72 | 73 | # Replace all matched consecutive punctuation with a single punctuation 74 | pattern = r'(\p{P})\1+' 75 | text = regex.sub(pattern, r'\1', text) 76 | text = text.strip() 77 | 78 | # Filter out texts with too high a punctuation ratio and too short a text length 79 | punctuation_count = len(regex.findall(r'\p{P}', text)) 80 | total_chars = len(text) 81 | punctuation_ratio = punctuation_count / total_chars 82 | if punctuation_ratio > args.punctuation_ratio_threshold or len(text) < args.text_length_threshold: 83 | return None 84 | 85 | 86 | # Convert Traditional Chinese Characters to Simplified Chinese 87 | return self.OPENCC_CONVERTER.convert(text) 88 | 89 | def filter(self, input_file_path, output_file_path): 90 | with open(input_file_path, 'r', encoding='utf-8') as input_file, \ 91 | open(output_file_path, 'w', encoding='utf-8') as output_file: 92 | for line in input_file: 93 | try: 94 | data = json.loads(line) 95 | text = data.get(args.text_column, '') 96 | result = self.handle(text) 97 | if result: 98 | data[args.text_column] = result 99 | output_file.write(json.dumps(data, ensure_ascii=False) + '\n') 100 | except json.JSONDecodeError: 101 | continue # Ignore lines with parsing errors 102 | 103 | 104 | 105 | if __name__ == '__main__': 106 | parser = argparse.ArgumentParser() 107 | # The default input and output are jsonl files 108 | parser.add_argument('--input_path', type=str) 109 | parser.add_argument('--output_path', type=str) 110 | parser.add_argument('--text_column', type=str) 111 | parser.add_argument('--language', type=str) 112 | parser.add_argument('--punctuation_ratio_threshold', type=float, default=0.5) 113 | parser.add_argument('--text_length_threshold', type=int, default=128) 114 | args = parser.parse_args() 115 | 116 | filter = RuleFilter() 117 | filter.filter(args.input_path, args.output_path) 118 | -------------------------------------------------------------------------------- /4_perplexity_filter/kenlm/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import unicodedata 4 | from typing import Dict 5 | 6 | import kenlm 7 | import sentencepiece 8 | from huggingface_hub import cached_download, hf_hub_url 9 | 10 | 11 | class SentencePiece: 12 | def __init__( 13 | self, 14 | model: str, 15 | ): 16 | super().__init__() 17 | self.sp = sentencepiece.SentencePieceProcessor() 18 | self.sp.load(str(model)) 19 | 20 | def do(self, text: dict) -> dict: 21 | tokenized = self.sp.encode_as_pieces(text) 22 | return " 
".join(tokenized) 23 | 24 | 25 | class KenlmModel: 26 | digit_re: re.Pattern = re.compile(r"\d") 27 | unicode_punct: Dict[str, str] = { 28 | ",": ",", 29 | "。": ".", 30 | "、": ",", 31 | "„": '"', 32 | "”": '"', 33 | "“": '"', 34 | "«": '"', 35 | "»": '"', 36 | "1": '"', 37 | "」": '"', 38 | "「": '"', 39 | "《": '"', 40 | "》": '"', 41 | "´": "'", 42 | "∶": ":", 43 | ":": ":", 44 | "?": "?", 45 | "!": "!", 46 | "(": "(", 47 | ")": ")", 48 | ";": ";", 49 | "–": "-", 50 | "—": " - ", 51 | ".": ". ", 52 | "~": "~", 53 | "’": "'", 54 | "…": "...", 55 | "━": "-", 56 | "〈": "<", 57 | "〉": ">", 58 | "【": "[", 59 | "】": "]", 60 | "%": "%", 61 | "►": "-", 62 | } 63 | unicode_punct_re = re.compile(f"[{''.join(unicode_punct.keys())}]") 64 | non_printing_chars_re = re.compile( 65 | f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]" 66 | ) 67 | kenlm_model_dir = None 68 | sentence_piece_model_dir = None 69 | 70 | def __init__( 71 | self, 72 | model_dataset: str, 73 | language: str, 74 | lower_case: bool = False, 75 | remove_accents: bool = False, 76 | normalize_numbers: bool = True, 77 | punctuation: int = 1, 78 | ): 79 | self.model = kenlm.Model(os.path.join(model_dataset, f"{language}.arpa.bin")) 80 | self.tokenizer = SentencePiece(os.path.join(model_dataset, f"{language}.sp.model")) 81 | self.accent = remove_accents 82 | self.case = lower_case 83 | self.numbers = normalize_numbers 84 | self.punct = punctuation 85 | 86 | @classmethod 87 | def from_pretrained( 88 | cls, 89 | model_dataset: str, 90 | language: str, 91 | ): 92 | return cls( 93 | model_dataset, 94 | language, 95 | False, 96 | False, 97 | True, 98 | 1, 99 | ) 100 | 101 | def pp(self, log_score, length): 102 | return 10.0 ** (-log_score / length) 103 | 104 | def get_perplexity(self, doc: str, normalize_cc_net: bool = True): 105 | if normalize_cc_net: 106 | doc = self.normalize( 107 | doc, 108 | accent=self.accent, 109 | case=self.case, 110 | numbers=self.numbers, 111 | punct=self.punct, 112 | ) 113 | # Tokenize (after normalizing): See https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for full pipeline 114 | doc = self.tokenizer.do(doc) 115 | doc_log_score, doc_length = 0, 0 116 | for line in doc.split("\n"): 117 | log_score = self.model.score(line) 118 | length = len(line.split()) + 1 119 | doc_log_score += log_score 120 | doc_length += length 121 | return round(self.pp(doc_log_score, doc_length), 1) 122 | 123 | def normalize( 124 | self, 125 | line: str, 126 | accent: bool = True, 127 | case: bool = True, 128 | numbers: bool = True, 129 | punct: int = 1, 130 | ) -> str: 131 | line = line.strip() 132 | if not line: 133 | return line 134 | if case: 135 | line = line.lower() 136 | if accent: 137 | line = self.strip_accents(line) 138 | if numbers: 139 | line = self.digit_re.sub("0", line) 140 | if punct == 1: 141 | line = self.replace_unicode_punct(line) 142 | elif punct == 2: 143 | line = self.remove_unicode_punct(line) 144 | line = self.remove_non_printing_char(line) 145 | return line 146 | 147 | def strip_accents(self, line: str) -> str: 148 | """Strips accents from a piece of text.""" 149 | nfd = unicodedata.normalize("NFD", line) 150 | output = [c for c in nfd if unicodedata.category(c) != "Mn"] 151 | if len(output) == line: 152 | return line 153 | return "".join(output) 154 | 155 | def replace_unicode_punct(self, text: str) -> str: 156 | return "".join(self.unicode_punct.get(c, c) for c in text) 157 | 158 | def remove_unicode_punct(self, text: str) -> str: 159 | 
"""More aggressive version of replace_unicode_punct but also faster.""" 160 | return self.unicode_punct_re.sub("", text) 161 | 162 | def remove_non_printing_char(self, text: str) -> str: 163 | return self.non_printing_chars_re.sub("", text) 164 | -------------------------------------------------------------------------------- /7_DataAnalysis/corpus_evaluator.py: -------------------------------------------------------------------------------- 1 | import re 2 | import random 3 | from collections import OrderedDict 4 | import json 5 | import tiktoken 6 | import openai 7 | from openai import OpenAI 8 | from datasets import Dataset, load_dataset 9 | 10 | # add http proxy 11 | # import os 12 | # os.environ["http_proxy"] = "http://127.0.0.1:10809" 13 | # os.environ["https_proxy"] = "http://127.0.0.1:10809" 14 | 15 | PROMPT = """你是一个语料评价专家,负责对单条语料(通常是一段自然语言文本)的质量进行打分以用于大语言模型的预训练 16 | 你的评价标准是: 17 | 语言质量(0-10分):考察语料的语法、拼写、词汇是否正确,语言表达是否流畅。语言质量高的语料利于模型学习语言规则,可以得高分。得分依据:语法和拼写正确(2分),词汇丰富(2分),表达流畅(2分),长难句或生僻词出现(2分),语言总体复杂(2分)。 18 | 19 | 信息量(0-10分):考察语料所包含的知识量和概念量。信息量大的语料有利于模型学习丰富知识,可以得高分。得分依据:包含专业知识或生僻概念(3分),篇幅较长或讨论多个话题(3分),详尽叙述某一话题(2分),提供新的信息或见解(2分)。 20 | 21 | 新颖性(0-10分):考察语料中的新奇词汇、新信息或新思想对模型理解范围的扩展作用。新颖性高的语料可以得高分。得分依据:包含新词或新概念(3分),提供新信息或新见解(3分),采用新角度或新形式表达观点(2分),创造新的词或短语(2分)。 22 | 23 | 连贯性(0-10分): 主题明确,观点连贯,论证严谨,构成完整论述(3分);主题基本清晰,且论证严谨。(3分) 各部分同属同一话题,构成连贯整体(4分)。 24 | 25 | 纯净度(0-10分):考察语料含有无关信息如广告、营销、垃圾信息的数量,含此类信息少而大部分内容都与主题相关的语料可以得高分。得分依据:主要内容表达完整(3分),垃圾信息含量少(3分),完全没有垃圾信息(4分) 26 | 27 | 通过以上评价标准,你将对下面的语料进行打分: 28 | 【语料开始】 29 | 30 | {corpus} 31 | 32 | 【语料结束】 33 | 34 | 请先分条给出评价理由,再给出对应分数并格式化输出。 35 | 示例: 36 | 【语言质量】:语法和拼写基本正确,词汇较丰富,表达流畅,出现生僻词如“幽灵枪”和长句,语言较复杂。【分数】8 37 | 【信息量】:涉及专业领域知识如各类枪支、美国控枪法案等,讨论多个话题如美国枪支文化与政策、美国枪支暴力现状等,详尽论述美国枪支状况,提供大量数据与信息。【分数】9 38 | 【新颖性】:出现新词“幽灵枪”和新概念如“极端枪支文化”,从政治经济角度揭示美国枪支问题新原因,以全新的角度解析美国枪支文化。【分数】8 39 | 【连贯性】:文中各部分紧密衔接,从美国枪支政策演变到枪支问题分析,再到政治经济因素剖析,行文逻辑清晰,段落结构明确。【分数】9 40 | 【纯净度】:文中的主要内容表达完整,大部分文本都与主题相关,但是结尾含有推广引流信息,不过垃圾信息含量较少。【分数】7 41 | 42 | 输出:""" 43 | 44 | all_aspects = ["语言质量", "信息量", "新颖性", "连贯性", "纯净度"] 45 | 46 | tokenizer = tiktoken.get_encoding('cl100k_base') 47 | 48 | 49 | def read_data(data_path: str, data_num: int) -> Dataset: 50 | dataset = load_dataset("json", data_files=[data_path], split="train", keep_in_memory=True) 51 | 52 | if data_num is not None: 53 | data_num = min(data_num, len(dataset)) 54 | random_indices = random.sample(range(len(dataset)), data_num) 55 | 56 | return dataset.select(random_indices) 57 | 58 | 59 | def cut_corpus(text, max_len=1000): 60 | text_tokens = tokenizer.encode(str(text).strip()) 61 | if len(text_tokens) > max_len: 62 | text_readable = False 63 | text_tokens = text_tokens[:max_len] 64 | while not text_readable and len(text_tokens) > 1: 65 | try: 66 | text = tokenizer.decode(text_tokens) 67 | text_readable = True 68 | except: 69 | text_tokens = text_tokens[:-1] 70 | return text 71 | 72 | 73 | def call_openai_func(instruction: str, model: str = "gpt-3.5-turbo-1106", api_key: str = None, organization: str = None) -> str: 74 | 75 | openai.api_key = api_key 76 | openai.organization = organization 77 | 78 | client = OpenAI(api_key=api_key, organization=organization) 79 | 80 | completion = client.chat.completions.create( 81 | model=model, 82 | messages=[ 83 | {"role": "system", 84 | "content": "You are a helpful assistant."}, 85 | {"role": "user", "content": instruction}, 86 | ], 87 | temperature=0.2, 88 | max_tokens=512, 89 | ) 90 | return completion.choices[0].message.content 91 | 92 | 93 | 94 | def extract_result(text: str) -> dict: 95 | pattern 
= r'【(.*?)】:(.*?)【分数】(\d+)\n' 96 | 97 | matches = re.findall(pattern, text+"\n") 98 | 99 | result = OrderedDict({aspect: {"reason": "", "score": -1} for aspect in all_aspects}) 100 | assert len(matches) == len(all_aspects) 101 | for match in matches: 102 | aspect = match[0] 103 | reason = match[1] 104 | score = match[2] 105 | assert aspect in all_aspects 106 | result[aspect] = {"reason": reason, "score": int(score)} 107 | 108 | return result 109 | 110 | def save_jsonl(data, output_path): 111 | with open(output_path, 'w', encoding='utf-8') as output_file: 112 | for item in data: 113 | output_file.write(json.dumps(item, ensure_ascii=False) + "\n") 114 | 115 | def corpus_quality_measure_fn( 116 | data_path: str, 117 | eval_path: str = None, 118 | data_num: int = None, 119 | text_column: str = "text", 120 | model: str = "gpt-3.5-turbo-1106", 121 | api_key: str = None, 122 | organization: str = None, 123 | num_proc: int = 1,): 124 | 125 | def eval_single_item(obj): 126 | text = obj[text_column] 127 | instruction = PROMPT.format(corpus=cut_corpus(text)) 128 | 129 | try: 130 | response = call_openai_func(instruction, model, api_key, organization) 131 | result = extract_result(response) 132 | except Exception as e: 133 | print("Error") 134 | print(e) 135 | result = OrderedDict({aspect: {"reason": "", "score": -1} for aspect in all_aspects}) 136 | 137 | obj["quality"] = result 138 | return obj 139 | 140 | corpus = read_data(data_path, data_num) 141 | corpus = corpus.map(eval_single_item, num_proc=num_proc) 142 | 143 | if eval_path is not None: 144 | save_jsonl(corpus, eval_path) 145 | # corpus.to_json(eval_path, batch_size=128, force_ascii=False) 146 | 147 | return corpus 148 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 📦 YiZhao: A 2TB Open Financial Dataset 2 | 3 |

4 | 🤗 Hugging Face   |   5 | 🤖 ModelScope   |   6 | 🪄 YiZhao-12B-Chat   |   7 | 📑 Technical Report 8 |

9 | 10 | Data and tools for generating and inspecting **YiZhao**, a safe, high-quality, open-sourced bilingual financial corpus (Chinese, English) released by Harbin Institute of Technology (Shenzhen) and China Merchants Bank Artificial Intelligence Laboratory. 11 | 12 | ## 🌟 Environment 13 | Our recommended Python version is **3.11.4**. 14 | ``` 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ## 🧩 Data Preprocessing 19 | 20 | ### 1. Remove personal information 21 | This step completes the removal of personal information such as IP addresses, emails, and phone numbers. 22 | #### Example usage 23 | ``` 24 | python 1_pii.py \ 25 | --input_path input.jsonl \ 26 | --output_path output.jsonl \ 27 | --text_column text \ 28 | --num_proc 4 \ 29 | --batch_size 100 30 | ``` 31 | 32 | ### 2. Sensitive Words 33 | To avoid the inclusion of toxic content in the training data, one approach is to filter out texts that contain specific sensitive keywords. You need to store the ***txt*** files containing sensitive words in `2_toxic_filter/sensitive_words`. 34 | #### Example usage 35 | ``` 36 | python 2_toxic_filter/2_toxic_filter.py \ 37 | --input_path input.jsonl \ 38 | --output_path output.jsonl \ 39 | --text_column text \ 40 | ``` 41 | 42 | ### 3. Rule Filtering 43 | This step completes multiple rule-based data filtering. 44 | - Language Filtering: Retain only text data in a specific language (***zh-cn*** or ***en***). 45 | - Punctuation and whitespace consistency processing: Unify Chinese and English punctuation within the text, and standardize different types of whitespace characters as well. 46 | - Deduplication of consecutive punctuation: Replace all matched consecutive punctuation marks with a single punctuation mark. 47 | - Punctuation Ratio Filtering: Filter out texts with too high a punctuation ratio. 48 | - Data Length Filtering: Filter out text data that is too short. 49 | #### Example usage 50 | ``` 51 | python 3_rule_filter.py \ 52 | --input_path input.jsonl \ 53 | --output_path output.jsonl \ 54 | --text_column text \ 55 | --language zh-cn \ 56 | --punctuation_ratio_threshold 0.5 \ 57 | --text_length_threshold 128 \ 58 | ``` 59 | 60 | ### 4. Perplexity Filtering 61 | You need to first download the model from the [KenLM repository](https://huggingface.co/edugp/kenlm), and then modify the corresponding model path in the following line in `4_perplexity_filter/kenlm/run.py`. 62 | ```python 63 | model = KenlmModel.from_pretrained("kenlm/wikipedia", args.language) #language = zh or en 64 | ``` 65 | #### Example usage 66 | ``` 67 | python 4_perplexity_filter/kenlm/run.py \ 68 | --input_path input.jsonl \ 69 | --output_path output.jsonl \ 70 | --text_column text \ 71 | --language zh \ 72 | ``` 73 | 74 | ### 5. Exact Deduplication 75 | Deduplicate identical text entries in the dataset. 76 | #### Example usage 77 | ``` 78 | python 5_text_dedup/5_clean.py \ 79 | --input_path input.jsonl \ 80 | --output_path output.jsonl \ 81 | --text_column text \ 82 | --cache cache_dir \ 83 | --num_proc 2 \ 84 | --batch_size 100 85 | ``` 86 | 87 | ### 6. Fuzzy Deduplication 88 | Deduplicate similar texts in the dataset. 89 | #### Example usage 90 | ``` 91 | python 6_text_dedup/text_dedup/minhash.py \ 92 | --input_path input.jsonl \ 93 | --output_path output.jsonl \ 94 | --column text \ 95 | --cache_dir cache_dir \ 96 | --threshold 0.8 \ 97 | --false_positive_weight 0.5 \ 98 | --false_negative_weight 0.5 \ 99 | ``` 100 | 101 | ### 7. 
Financial relevance filtering and security risk filtering 102 | Using a financial relevance classifier (🤗[fin-model-zh-v0.1](https://huggingface.co/HIT-TMG/fin-model-zh-v0.1) and [fin-model-en-v0.1](https://huggingface.co/HIT-TMG/fin-model-en-v0.1)) and a security risk identification classifier (🤗[risk-model-zh-v0.1](https://huggingface.co/HIT-TMG/risk-model-zh-v0.1) and [risk-model-en-v0.1](https://huggingface.co/HIT-TMG/risk-model-en-v0.1)), we filter out high-quality financial corpus. 103 | 104 | 105 | 106 | ## ⚡️ Data Evaluation 107 | We evaluate each piece of data from the following aspects: 108 | - **Language Quality (0-10 points)**: This examines whether the data is grammatically correct, spelled correctly, uses appropriate vocabulary, and if the expression is fluent. High language quality aids the model in learning language rules, resulting in a higher score. ***Scoring criteria***: correct grammar and spelling (2 points), rich vocabulary (2 points), fluent expression (2 points), use of complex sentences or rare words (2 points), and overall language complexity (2 points). 109 | 110 | - **Information Content (0-10 points)**: This measures the amount of knowledge and concepts contained in the data. Data with high information content helps the model learn rich knowledge, leading to a higher score. ***Scoring criteria***: includes specialized knowledge or obscure concepts (3 points), longer length or discussion of multiple topics (3 points), detailed discussion of a single topic (2 points), and providing new information or insights (2 points). 111 | 112 | - **Novelty (0-10 points)**: This evaluates the extent to which new vocabulary, information, or ideas in the data expand the model's understanding. Data with high novelty can receive higher scores. ***Scoring criteria***: includes new words or concepts (3 points), provides new information or insights (3 points), presents ideas from new perspectives or in new forms (2 points), and creates new words or phrases (2 points). 113 | 114 | - **Coherence (0-10 points)**: ***Scoring criteria***: This assesses whether the data has a clear theme, coherent arguments, and rigorous reasoning, forming a complete discussion (3 points); a mostly clear theme with rigorous reasoning (3 points); all parts belong to the same topic, forming a coherent whole (4 points). 115 | 116 | - **Purity (0-10 points)**: This evaluates the amount of irrelevant information, such as ads, marketing, or spam, in the data. Data with little to no such information and content that mostly relates to the topic can score higher. ***Scoring criteria***: the main content is fully expressed (3 points), low spam content (3 points), and no spam content at all (4 points). 
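
Each evaluated document is written to `--eval_path` as one JSON line whose `quality` field is keyed by the five aspect names used in the Chinese scoring prompt, with a score of `-1` marking responses that could not be parsed (see `corpus_evaluator.py`). The following is a minimal sketch, not part of the pipeline, for summarizing or thresholding those scores after the pipeline has run (the file name `output.jsonl` and the cut-off `min_score=6` are illustrative assumptions; see the example usage below for producing the file):

```python
# Minimal sketch: summarize / filter the per-aspect scores written by eval_pipeline.py.
# "output.jsonl" and min_score=6 are illustrative assumptions, not pipeline defaults.
import json
from collections import defaultdict

# Aspect keys as used in corpus_evaluator.py (语言质量 = language quality, 信息量 = information
# content, 新颖性 = novelty, 连贯性 = coherence, 纯净度 = purity).
ASPECTS = ["语言质量", "信息量", "新颖性", "连贯性", "纯净度"]

def load_eval(path):
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]

def summarize(rows):
    # Average each aspect, skipping -1 scores from unparsed model responses.
    scores = defaultdict(list)
    for row in rows:
        for aspect in ASPECTS:
            score = row["quality"][aspect]["score"]
            if score >= 0:
                scores[aspect].append(score)
    return {aspect: sum(vals) / len(vals) for aspect, vals in scores.items() if vals}

def keep_high_quality(rows, min_score=6):
    # Keep documents whose aspects all reach min_score; unparsed (-1) aspects also fail.
    return [
        row for row in rows
        if all(row["quality"][aspect]["score"] >= min_score for aspect in ASPECTS)
    ]

if __name__ == "__main__":
    rows = load_eval("output.jsonl")
    print(summarize(rows))
    print(f"{len(keep_high_quality(rows))} / {len(rows)} documents kept")
```

Scores of `-1` are excluded from the averages, mirroring how `corpus_eval_visulization.py` treats unparsed responses when it reports per-aspect statistics.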
117 | 118 | #### Example usage 119 | ``` 120 | python 7_DataAnalysis/eval_pipeline.py \ 121 | --data_path input.jsonl \ 122 | --eval_path output.jsonl \ 123 | --text_column text \ 124 | --tiktoken_cache cache_dir \ 125 | --figure_dir figure_dir \ 126 | --model gpt-3.5-turbo-1106 \ 127 | --api_key xxxx \ 128 | --organization xxxx \ 129 | --num_proc 1 \ 130 | ``` 131 | 132 | -------------------------------------------------------------------------------- /5_text_dedup/clean_helpers/deduplication.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from functools import partial 3 | from typing import List, Set, Tuple, Dict, Callable, Optional 4 | import hashlib 5 | import re 6 | import string 7 | import urllib 8 | 9 | from datasets import Dataset 10 | 11 | 12 | # ======== DEDUPLICATION FUNCTIONS =================== 13 | from clean_helpers.utils import parse_meta 14 | 15 | 16 | def build_dedup_template(min_template_line_size: int, min_template_line_occurence: int): 17 | def dedup_template(ds: Dataset, num_proc: int, batch_size: int) -> Dataset: 18 | """Computes and remove templates lines""" 19 | # Compute the hash of each lines 20 | split_into_lines_and_hashes = ds.map( 21 | split_text_to_lines_and_hash, 22 | num_proc=num_proc, 23 | batched=True, 24 | batch_size=batch_size, 25 | remove_columns=ds.column_names 26 | ) 27 | lines_and_hashes = split_into_lines_and_hashes.remove_columns( 28 | set(split_into_lines_and_hashes.column_names) - {"lines", "hashes"} 29 | ) 30 | 31 | # Find template lines 32 | count_lines_occurence = defaultdict(lambda: 0) 33 | for row in lines_and_hashes: 34 | filtered_lines_and_hashes = [ 35 | (line, hash_) 36 | for line, hash_ in zip(row["lines"], row["hashes"]) 37 | if len(line) >= min_template_line_size 38 | ] 39 | for _, hash_ in filtered_lines_and_hashes: 40 | count_lines_occurence[hash_] += 1 41 | 42 | template_line_hashes = {k for k, v in count_lines_occurence.items() if v >= min_template_line_occurence} 43 | del count_lines_occurence 44 | 45 | # Clean dataset 46 | return split_into_lines_and_hashes.map( 47 | build_remove_template_lines(template_line_hashes), 48 | num_proc=num_proc, 49 | batched=True, 50 | batch_size=batch_size, 51 | remove_columns=split_into_lines_and_hashes.column_names 52 | ) 53 | 54 | return dedup_template 55 | 56 | 57 | def build_dedup_document(batch_normalizer: Callable[[Dict], List[str]]): 58 | def dedup_document(ds: Dataset, num_proc: int, batch_size: int) -> Dataset: 59 | hashed_documents = ds.map( 60 | lambda batch: {**batch, "hash": get_hash(batch_normalizer(batch))}, 61 | num_proc=num_proc, 62 | batched=True, 63 | batch_size=batch_size 64 | ) 65 | 66 | hashes = set() 67 | 68 | return hashed_documents.map( 69 | partial(delete_text_from_duplicates, hashes=hashes), 70 | num_proc=1, # VERY IMPORTANT: hashes will be updated, and is not thread safe. 
71 | batched=True, 72 | batch_size=batch_size, 73 | remove_columns=hashed_documents.column_names 74 | ) 75 | 76 | return dedup_document 77 | 78 | 79 | # =========== HELPERS =============== 80 | 81 | def get_hash(texts: List[str]) -> List[str]: 82 | """Get hash of content field.""" 83 | return [hashlib.md5(text.strip().encode("utf-8")).hexdigest() for text in texts] 84 | 85 | def split_text_in_lines(text: str) -> List[str]: 86 | return [line.strip() for line in text.split("\n")] 87 | 88 | def split_text_to_lines_and_hash(batch: Dict[str, List]): 89 | lines_per_texts = [split_text_in_lines(text) for text in batch["text"]] 90 | return { 91 | **{k: v for k, v in batch.items() if k != "text"}, 92 | "lines": lines_per_texts, 93 | "hashes": [get_hash(lines) for lines in lines_per_texts] 94 | } 95 | 96 | 97 | def clean_text(lines_and_hashes: List[Tuple[str, int]], template_line_hashes: Set[str]): 98 | return "\n".join([line for line, hash_ in lines_and_hashes if hash_ not in template_line_hashes]) 99 | 100 | 101 | def build_remove_template_lines(template_line_hashes: Set[str]): 102 | def remove_template_lines(batch: Dict[str, List]): 103 | cleaned_texts = [ 104 | clean_text( 105 | list(zip(lines, hashes)), 106 | template_line_hashes 107 | ) 108 | for lines, hashes in zip(batch["lines"], batch["hashes"]) 109 | ] 110 | return { 111 | **{ 112 | key: value 113 | for key, value in batch.items() 114 | if key not in ["lines", "hashes"] 115 | }, 116 | "text": [cleaned_text for cleaned_text in cleaned_texts] 117 | } 118 | 119 | return remove_template_lines 120 | 121 | 122 | def is_new_hash(hash_: str, hashes: Set[str]) -> bool: 123 | """Check if current hash is still in set of unique hashes and remove if true.""" 124 | if hash_ in hashes: 125 | return False 126 | else: 127 | hashes.add(hash_) 128 | return True 129 | 130 | def delete_text_from_duplicates(batch: Dict[str, List], hashes: Set[str]) -> Dict[str, List]: 131 | return { 132 | **{k: v for k, v in batch.items() if k != "hash"}, 133 | "text": [text if is_new_hash(hash_, hashes) else "" for text, hash_ in zip(batch["text"], batch["hash"])] 134 | } 135 | 136 | def url_with_only_some_query_param(url: str, query_param_map: Optional[dict] = None) -> str: 137 | url_parse = urllib.parse.urlparse(url) 138 | query = url_parse.query 139 | 140 | url_query_params = urllib.parse.parse_qsl(query) 141 | 142 | if query_param_map is None: 143 | url_query_params_new = {} 144 | else: 145 | url_query_params_new = [(query_param_map[old_key], old_value) for (old_key, old_value) in url_query_params if old_key in query_param_map] 146 | 147 | url_new_query = urllib.parse.urlencode(url_query_params_new, encoding="utf-8") 148 | url_parse = url_parse._replace(query=url_new_query) 149 | new_url = urllib.parse.urlunparse(url_parse) 150 | return new_url 151 | 152 | # =========== BATCH NORMALISER =============== 153 | 154 | 155 | # this only keeps letter characters 156 | remove_non_character_regex = re.compile(f'\s+|\d+|[{re.escape(string.punctuation)}]') 157 | def document_batch_normalizer(batch: Dict) -> List[str]: 158 | return [remove_non_character_regex.sub('', text) for text in batch["text"]] 159 | 160 | 161 | def strict_url_batch_normalizer(batch: Dict) -> List[str]: 162 | return [parse_meta(meta)["url"] for meta in batch["meta"]] 163 | 164 | 165 | url_host_and_path_regex = re.compile(r"^(.[^?]*)") 166 | def url_host_and_path_batch_normalizer(batch: Dict) -> List[str]: 167 | return [url_host_and_path_regex.match(parse_meta(meta)["url"]).group(1) for meta in batch["meta"]] 168 
| 169 | lm_es_pseudocrawl_filtered_341_es_cointelegraph_com_regex = re.compile(r"^((?:(?!/amp)/?(?:[^?/]*))+)(?:/amp)?") 170 | def url_lm_es_pseudocrawl_filtered_341_es_cointelegraph_com(batch: Dict) -> List[str]: 171 | return [lm_es_pseudocrawl_filtered_341_es_cointelegraph_com_regex.match(parse_meta(meta)["url"]).group(1) for meta in batch["meta"]] 172 | 173 | def url_lm_en_pseudocrawl_filtered_619_www_qut_edu_au(batch: Dict) -> List[str]: 174 | return [url_with_only_some_query_param(parse_meta(meta)["url"], {"id": "id", "news-id": "id"}) for meta in batch["meta"]] -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/utils/add_args.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2022-11-05 09:16:34 4 | # @Author : Chenghao Mou (mouchenghao@gmail.com) 5 | import argparse 6 | 7 | 8 | def add_io_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 9 | """ 10 | Add input/output arguments to parser. 11 | 12 | Parameters 13 | ---------- 14 | parser : argparse.ArgumentParser 15 | Parser to add arguments to. 16 | 17 | Returns 18 | ------- 19 | parser : argparse.ArgumentParser 20 | Parser with added arguments. 21 | """ 22 | parser.add_argument("--input_path", type=str, help="`path` in load_dataset", required=False), 23 | parser.add_argument("--name", type=str, help="`name` in load_dataset"), 24 | parser.add_argument("--data_dir", type=str, help="`data_dir` in load_dataset"), 25 | parser.add_argument("--data_files", type=str, help="`data_files` in load_dataset"), 26 | parser.add_argument("--split", type=str, help="`split` in load_dataset"), 27 | parser.add_argument("--cache_dir", type=str, help="`cache_dir` in load_dataset", default=".cache"), 28 | parser.add_argument("--revision", type=str, help="`revision` in load_dataset"), 29 | parser.add_argument( 30 | "--use_auth_token", action=argparse.BooleanOptionalAction, help="`use_auth_token` in load_dataset" 31 | ), 32 | parser.add_argument("--local", action=argparse.BooleanOptionalAction, help="Use local dataset", default=False), 33 | parser.add_argument("--output_path", type=str, help="Path to deduplicated dataset output", required=False), 34 | parser.add_argument( 35 | "--debug", action=argparse.BooleanOptionalAction, help="Whether to run in debug mode", default=False 36 | ) 37 | return parser 38 | 39 | 40 | def add_meta_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 41 | """ 42 | Add meta arguments to parser. 43 | 44 | Parameters 45 | ---------- 46 | parser : argparse.ArgumentParser 47 | Parser to add arguments to. 48 | 49 | Returns 50 | ------- 51 | parser : argparse.ArgumentParser 52 | Parser with added arguments. 53 | """ 54 | parser.add_argument( 55 | "--column", 56 | type=str, 57 | help="""Text column to use for deduplication. Concatenate desired columns beforehand if needed.""", 58 | required=False, 59 | ), 60 | parser.add_argument( 61 | "--batch_size", 62 | type=int, 63 | help="""Batch size to use for dataset iteration. Mainly for memory efficiency.""", 64 | default=1000000, 65 | ), 66 | return parser 67 | 68 | 69 | def add_minhash_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 70 | """ 71 | Add MinHash arguments to parser. 72 | 73 | Parameters 74 | ---------- 75 | parser : argparse.ArgumentParser 76 | Parser to add arguments to. 
77 | 78 | Returns 79 | ------- 80 | parser : argparse.ArgumentParser 81 | Parser with added arguments. 82 | """ 83 | parser.add_argument( 84 | "--ngram", 85 | type=int, 86 | default=5, 87 | help="Ngram size to use in MinHash.", 88 | ) 89 | parser.add_argument( 90 | "--min_length", 91 | type=int, 92 | default=5, 93 | help="Minimum number of tokens to use in MinHash. Shorter documents will be filtered out.", 94 | ) 95 | parser.add_argument("--seed", type=int, default=42, help="Seed to use in MinHash") 96 | parser.add_argument("--num_perm", type=int, default=256, help="Number of permutations to use in MinHash") 97 | parser.add_argument( 98 | "--threshold", type=float, default=0.7, help="Jaccard similarity threshold to use in MinHashLSH" 99 | ) 100 | parser.add_argument( 101 | "--b", 102 | type=int, 103 | default=None, 104 | help="Number of bands", 105 | ) 106 | parser.add_argument( 107 | "--r", 108 | type=int, 109 | default=None, 110 | help="Number of rows per band", 111 | ) 112 | 113 | return parser 114 | 115 | 116 | def add_simhash_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 117 | """ 118 | Add SimHash arguments to parser. 119 | 120 | Parameters 121 | ---------- 122 | parser : argparse.ArgumentParser 123 | Parser to add arguments to. 124 | 125 | Returns 126 | ------- 127 | parser : argparse.ArgumentParser 128 | Parser with added arguments. 129 | """ 130 | parser.add_argument( 131 | "--ngram", 132 | type=int, 133 | default=3, 134 | help="""Ngram size to use in SimHash.""", 135 | ) 136 | parser.add_argument("--f", type=int, default=64, help="Simhash bit size"), 137 | parser.add_argument("--bit_diff", type=int, default=3, help="Bit difference to use in SimHash"), 138 | parser.add_argument( 139 | "--num_bucket", type=int, default=4, help="Number of buckets to use in SimHash, must be larger than bit_diff" 140 | ), 141 | return parser 142 | 143 | 144 | def add_sa_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 145 | """ 146 | Add Suffix Array arguments to parser. 147 | 148 | Parameters 149 | ---------- 150 | parser : argparse.ArgumentParser 151 | Parser to add arguments to. 152 | 153 | Returns 154 | ------- 155 | parser : argparse.ArgumentParser 156 | Parser with added arguments. 157 | """ 158 | parser.add_argument( 159 | "--k", type=int, default=100, help="Minimum byte length of a duplicate substring in Suffix Array Deduplication" 160 | ), 161 | parser.add_argument( 162 | "--strategy", 163 | type=str, 164 | default="overlapping", 165 | help="Strategy when there are overlapping duplicate substrings", 166 | choices=["overlapping", "longest"], 167 | ) 168 | parser.add_argument( 169 | "--google_repo_path", type=str, help="Path to google-research-deduplication codebase", required=True 170 | ), 171 | return parser 172 | 173 | 174 | def add_bloom_filter_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 175 | """ 176 | Add Bloom Filter arguments to parser. 177 | 178 | Parameters 179 | ---------- 180 | parser : argparse.ArgumentParser 181 | Parser to add arguments to. 182 | 183 | Returns 184 | ------- 185 | parser : argparse.ArgumentParser 186 | Parser with added arguments. 
187 | """ 188 | parser.add_argument("--error_rate", type=float, default=1e-6, help="Error rate to use in BloomFilter"), 189 | parser.add_argument("--hash_func", type=str, default="md5", help="Hash function to use in BloomFilter"), 190 | parser.add_argument("--initial_capacity", type=int, default=100, help="Initial capacity of BloomFilter"), 191 | return parser 192 | 193 | 194 | def add_exact_hash_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: # pragma: no cover 195 | """ 196 | Add Exact Hash arguments to parser. 197 | 198 | Parameters 199 | ---------- 200 | parser : argparse.ArgumentParser 201 | Parser to add arguments to. 202 | 203 | Returns 204 | ------- 205 | parser : argparse.ArgumentParser 206 | Parser with added arguments. 207 | """ 208 | parser.add_argument("--hash_func", type=str, default="md5", help="Hash function to use in ExactHash"), 209 | return parser 210 | 211 | def add_own_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: 212 | parser.add_argument("--output_duped", type=str, help="duped path to store"), 213 | parser.add_argument( 214 | "--false_positive_weight", 215 | type=float, 216 | default=0.5, 217 | help="false_positive_weight", 218 | ), 219 | parser.add_argument( 220 | "--false_negative_weight", 221 | type=float, 222 | default=0.5, 223 | help="false_negative_weight", 224 | ), 225 | parser.add_argument("--dataset_name", type=str, help="dataset_name",default="text_dedup.jsonl"), 226 | # parser.add_argument("--output_duped", type=str, help="duped path to store"), 227 | return parser -------------------------------------------------------------------------------- /5_text_dedup/5_clean.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import random 5 | from functools import partial 6 | 7 | import torch 8 | from datasets import Dataset, load_dataset, load_from_disk, concatenate_datasets 9 | from pathlib import Path 10 | from typing import Tuple, Optional, List, Dict 11 | from datasets.utils.logging import set_verbosity_info 12 | from numpy.random import default_rng 13 | 14 | 15 | from clean_helpers import build_dedup_template, build_dedup_document, concatenate_lm_fr_ester 16 | from clean_helpers.deduplication import document_batch_normalizer, url_host_and_path_batch_normalizer, \ 17 | url_lm_es_pseudocrawl_filtered_341_es_cointelegraph_com, url_lm_en_pseudocrawl_filtered_619_www_qut_edu_au 18 | 19 | 20 | 21 | set_verbosity_info() 22 | logger = logging.getLogger(__name__) 23 | torch.set_num_threads(1) 24 | 25 | # Deduplication functions and boolean to save a sample of the modifications: function(ds: Dataset, num_proc: int, batch_size: int) -> Dataset 26 | DEDUPS = { 27 | "dedup_template_soft": (build_dedup_template( 28 | min_template_line_size=15, 29 | min_template_line_occurence=10, 30 | ), True), 31 | "dedup_pseudocrawl_newspapers": (build_dedup_template( 32 | min_template_line_size=0, 33 | min_template_line_occurence=2, 34 | ), True), 35 | "dedup_document": (build_dedup_document(document_batch_normalizer), True), 36 | "dedup_document_on_url": (build_dedup_document(url_host_and_path_batch_normalizer), True), 37 | "dedup_document_on_url_lm_es_pseudocrawl-filtered_341_es_cointelegraph_com": (build_dedup_document( 38 | url_lm_es_pseudocrawl_filtered_341_es_cointelegraph_com 39 | ), True), 40 | "dedup_document_on_url_lm_en_pseudocrawl_filtered_619_www_qut_edu_au": (build_dedup_document( 41 | url_lm_en_pseudocrawl_filtered_619_www_qut_edu_au 42 | ), 
43 |     "concatenate_lm_fr_ester": (concatenate_lm_fr_ester, False)
44 | }
45 | 
46 | 
47 | DEDUPS_KEYS = set(DEDUPS.keys())
48 | 
49 | def get_size_per_example(texts: List[str]) -> Dict:
50 |     size_values = [len(text.encode()) for text in texts]
51 |     examples = {"bytes_len": size_values}
52 |     return examples
53 | 
54 | def quick_size_estimation(
55 |     ds: Dataset,
56 |     num_proc: int,
57 |     batch_size: int,
58 |     content_key: str = "text"
59 | ) -> int:
60 |     if len(ds) == 0:
61 |         return 0
62 |     rng = default_rng(1991)
63 |     subset_size = min(10000, len(ds))
64 |     indices = rng.choice(len(ds), size=subset_size, replace=False, shuffle=False)
65 |     partial_ds = ds.select(indices)
66 |     ratio = float(len(ds)) / float(subset_size)
67 | 
68 |     partial_ds = partial_ds.map(
69 |         get_size_per_example,
70 |         batched=True,
71 |         num_proc=num_proc,
72 |         batch_size=batch_size,
73 |         input_columns=[content_key],
74 |         remove_columns=partial_ds.column_names,
75 |     )
76 |     len_bytes = sum(partial_ds["bytes_len"])
77 |     return len_bytes * ratio
78 | 
79 | 
80 | 
81 | 
82 | def filter_diff_text(examples, in_text_col, out_text_col):
83 |     return [text_in != text_out for text_in, text_out in zip(examples[in_text_col], examples[out_text_col])]
84 | 
85 | def get_args():
86 |     parser = argparse.ArgumentParser()
87 |     parser.add_argument("--input_path", type=str, required=True, help="Dataset path we load the dataset from.")
88 |     parser.add_argument("--output_path", type=Path, required=True,
89 |                         help="Path where we save resulting dataset after modifications.")
90 |     parser.add_argument('--text_column', type=str)
91 |     parser.add_argument("--cache", type=str, required=True, help="Cache path.")
92 |     parser.add_argument("--checks_save_path", type=Path, default=None,
93 |                         help="Path where we save samples we've removed or changed throughout the modifications.")
94 |     parser.add_argument("--num_proc", type=int, default=1)
95 |     parser.add_argument("--batch_size", type=int, default=100)
96 |     parser.add_argument("--load_arrow_file", action="store_true",
97 |                         help="Option to indicate how to load the original dataset. By default we use `load_dataset`. "
98 |                              "If the flag is used, we use `load_from_disk`.")
99 |     parser.add_argument("--sampling_size_map_checks", type=int, default=None,
100 |                         help="Optional argument. The check dataset, i.e. the samples we've changed throughout the "
101 |                              "modifications, is saved either in full or as a subset. If set to None, everything "
102 |                              "is saved, otherwise a subset of this size is saved.")
103 |     parser.add_argument("--sampling_size_filter_checks", type=int, default=None,
104 |                         help="Optional argument. The check dataset, i.e. the samples we've removed throughout the "
105 |                              "modifications, is saved either in full or as a subset. If set to None, everything "
106 |                              "is saved, otherwise a subset of this size is saved.")
107 |     parser.add_argument("--from_scratch", action="store_true", help="Resave all datasets on disk.")
108 |     parser.add_argument("--save_to_json", default=True, help="Save output dataset in json format.")
109 |     return parser.parse_args()
110 | 
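# A hypothetical invocation of this script (the paths below are illustrative only; the
# flags correspond to get_args() above):
#
#     python 5_clean.py \
#         --input_path data/raw_corpus.jsonl \
#         --output_path data/deduped_corpus.jsonl \
#         --text_column text \
#         --cache /tmp/hf_cache \
#         --checks_save_path checks \
#         --num_proc 8 \
#         --batch_size 1000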
111 | def log_stats(title: str, original_ds: Dataset, after_transformation_ds: Dataset, operation_type: str, args):
112 |     original_length = len(original_ds)
113 |     after_transformation_length = len(after_transformation_ds)
114 |     original_bytes = quick_size_estimation(original_ds, batch_size=args.batch_size, num_proc=args.num_proc, content_key=args.text_column)
115 |     after_transformation_bytes = quick_size_estimation(after_transformation_ds, batch_size=args.batch_size, num_proc=args.num_proc, content_key=args.text_column)
116 |     logger.info(title)
117 |     logger.info(f" Initial number of samples: {original_length} samples")
118 |     logger.info(f" {operation_type} samples: {original_length - after_transformation_length} samples")
119 |     logger.info(f" {operation_type} percentage: {(original_length - after_transformation_length) / original_length * 100:.2f} %")
120 |     logger.info(f" Final number of samples: {after_transformation_length} samples")
121 |     logger.info(f" Initial size in bytes: {original_bytes * 1e-9:.4f} GB")
122 |     logger.info(f" {operation_type} bytes: {(original_bytes - after_transformation_bytes) * 1e-9:.4f} GB")
123 |     logger.info(f" {operation_type} percentage in bytes: {(original_bytes - after_transformation_bytes) / original_bytes * 100:.2f} %")
124 |     logger.info(f" Final size in bytes: {after_transformation_bytes * 1e-9:.4f} GB")
125 | 
126 | 
127 | 
128 | def get_modified_documents(
129 |     ds: Dataset,
130 |     mapped_ds: Dataset,
131 |     num_proc: int,
132 |     batch_size: int,
133 |     sampling_size: Optional[int],
134 |     text_column,
135 | ) -> Dataset:
136 |     remove_columns = set(ds.column_names)
137 |     remove_columns.remove(text_column)
138 |     ds = ds.remove_columns(remove_columns)
139 |     ds = ds.rename_column(text_column, "old_text")
140 | 
141 |     assert len(mapped_ds) == len(ds), "Mapping functions are batched, but they should not alter the size of the batch."
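    # The column-wise (axis=1) concatenation below pairs each row's "old_text" with its
    # mapped text; flatten_indices() first materialises any pending select/filter
    # indirection so both datasets are stored contiguously and stay row-aligned.
    # The filter then keeps only the rows whose text actually changed.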
142 |     mapped_diff_ds = concatenate_datasets([mapped_ds.flatten_indices(), ds.flatten_indices()], axis=1).filter(
143 |         partial(filter_diff_text, in_text_col="old_text", out_text_col=text_column),
144 |         batched=True,
145 |         num_proc=num_proc,
146 |         batch_size=batch_size
147 |     )
148 | 
149 |     logger.info("Examples of modified samples:")
150 |     idx_samples = random.sample(range(len(mapped_diff_ds)), min(len(mapped_diff_ds), 10))
151 |     for idx in idx_samples:
152 |         logger.info(f" Example n°{idx}:\n{json.dumps(mapped_diff_ds[idx], indent=2)}")
153 | 
154 |     if sampling_size is not None:
155 |         idx_samples = random.sample(range(len(mapped_diff_ds)), min(len(mapped_diff_ds), sampling_size))
156 |         mapped_diff_ds = mapped_diff_ds.select(idx_samples)
157 | 
158 |     return mapped_diff_ds
159 | 
160 | 
161 | def apply_function(function_name: str, ds: Dataset, args) -> Tuple[Dataset, Optional[Dataset]]:
162 |     logger.info(f"Applying: {function_name}")
163 |     if function_name in DEDUPS:
164 |         dedup_function, dedup_check = DEDUPS[function_name]
165 |         deduplicated_ds = dedup_function(ds, num_proc=args.num_proc, batch_size=args.batch_size)
166 |         log_stats(f"Applied deduplication function: {function_name}", ds, deduplicated_ds, operation_type="Deduplicated", args=args)
167 | 
168 |         # Some deduplication functions do not preserve the number of samples, so alignment is lost. For example "dedup_document"
169 |         if args.checks_save_path is not None and dedup_check:
170 |             deduped_diff_ds = get_modified_documents(ds, deduplicated_ds, args.num_proc, args.batch_size, args.sampling_size_map_checks, args.text_column)
171 |             return deduplicated_ds, deduped_diff_ds
172 |         else:
173 |             return deduplicated_ds, None
174 |     else:
175 |         raise NotImplementedError(f"{function_name} did not match any existing function name. Available names:\n"
176 |                                   f"Dedup functions: {DEDUPS_KEYS}\n"
177 |                                   )
178 | 
179 | def main():
180 |     logging.basicConfig(
181 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
182 |         datefmt="%m/%d/%Y %H:%M:%S",
183 |         level=logging.INFO,
184 |     )
185 |     args = get_args()
186 |     logger.info(f"** The job is run with the following arguments: **\n{args}\n **** ")
187 | 
188 |     # Load dataset
189 |     logger.info(f" ===== Loading {args.input_path} =====")
190 |     if args.load_arrow_file:
191 |         ds = load_from_disk(args.input_path)
192 |     else:
193 |         ds = load_dataset("json", data_files=args.input_path, split="train", cache_dir=args.cache)
194 | 
195 |     # Apply series of dedups
196 |     logger.info(f" ===== Applying transformations =====")
197 | 
198 |     preprocessings = ["dedup_template_soft", "dedup_document"]
199 |     for idx, preprocessing in enumerate(preprocessings):
200 |         ds, ds_diff = apply_function(preprocessing, ds, args)
201 |         if ds_diff is not None and len(ds_diff) != 0:
202 |             saving_path = args.checks_save_path / f"{idx}_{preprocessing}_checks"
203 |             if not args.from_scratch and saving_path.exists():
204 |                 continue
205 |             tmp_save_path = Path(saving_path.parent, f"tmp-{saving_path.name}")
206 |             logger.info(f" ===== Saving examples to check after {preprocessing} =====")
207 |             ds_diff.save_to_disk(tmp_save_path)
208 |             tmp_save_path.rename(saving_path)
209 | 
210 | 
211 |     # Save to disk
212 |     if args.from_scratch or not args.output_path.exists():
213 |         logger.info(f" ===== Saving dataset =====")
214 |         logger.info(f"Saving to final dataset at {args.output_path}.")
215 |         tmp_save_path = Path(args.output_path.parent, f"tmp-{args.output_path.name}")
216 |         if len(ds) == 0:
217 |             logger.info("Dataset was empty.
Not saving anything.") 218 | return 219 | if args.save_to_json: 220 | ds.to_json( 221 | tmp_save_path, 222 | num_proc=args.num_proc, 223 | force_ascii=False 224 | ) 225 | else: 226 | ds.save_to_disk(tmp_save_path) 227 | tmp_save_path.rename(args.output_path) 228 | else: 229 | logging.info(f"Dataset was already saved at {args.output_path}") 230 | 231 | 232 | if __name__ == "__main__": 233 | main() 234 | -------------------------------------------------------------------------------- /1_pii.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | from pathlib import Path 4 | import logging 5 | import random 6 | import sys 7 | import regex 8 | from datasets.utils.logging import set_verbosity_info 9 | from datasets import load_dataset, load_from_disk 10 | 11 | set_verbosity_info() 12 | logger = logging.getLogger(__name__) 13 | high_risk_tags = {'KEY', 'EMAIL', 'USER', 'IP_ADDRESS'} # , 'NUMBER', "ID"} 14 | year_patterns = [ 15 | # yyyy-yyyy or yyyy/yyyy 16 | regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/][1-2][0-9]{3})(?:$|[\s@,?!;:\'\"(.\p{Han}])"), 17 | # yyyy-mm-dd or yyyy-dd-mm or yyyy/mm/dd or yyyy/dd/mm or yyyy.mm.dd or yyyy.dd.mm 18 | regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/.][0-3][0-9][\p{Pd}/.][0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])"), 19 | # mm-dd-yyyy or dd-mm-yyyy or mm/dd/yyyy or dd/mm/yyyy or mm.dd.yyyy or dd.mm.yyyy or the same but with yy instead of yyyy 20 | regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/.][0-3][0-9][\p{Pd}/.](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])"), 21 | # mm-yyyy or mm/yyyy or the same but with yy 22 | regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])"), 23 | # yyyy-mm or yyyy/mm 24 | regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}-[0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])"), 25 | ] 26 | 27 | # Patterns for high-risk character strings 28 | id_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([A-Za-z]*(?:[\p{Pd}]*\p{Nd}){6,})(?:$|[\b\s@?,!;:\'\")(.\p{Han}])' 29 | 30 | # https://regex101.com/r/JQkmh8/5 31 | key_pattern = r'(?:^|[\b\s@?,!:;\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:_]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[ \p{Pd}]?){3,})(?:$|[\b\s\p{Han}@?,!;:\'\")(.])' 32 | 33 | ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}' 34 | ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])' 35 | ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join([ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:\'\"(.\p{Han}])" 36 | 37 | # https://regex101.com/r/EpA5B7/1 38 | email_pattern = r''' 39 | (?<= ^ | 
[\b\s@,?!;:)('".\p{Han}<] ) 40 | ( 41 | [^\b\s@?!;,:)('"<]+ 42 | @ 43 | [^\b\s@!?;,/]* 44 | [^\b\s@?!;,/:)('">.] 45 | \. 46 | \p{L} \w{1,} 47 | ) 48 | (?= $ | [\b\s@,?!;:)('".\p{Han}>] ) 49 | ''' 50 | 51 | # https://regex101.com/r/mOqi1s/3 52 | user_pattern = r''' 53 | (?<= ^ | [)(\s@,?!;:'"\p{Han}] ) 54 | (@ 55 | [^)(\s@,?!;:'"]{3,} 56 | ) 57 | ''' 58 | # Examples from https://regexpattern.com/phone-number/ 59 | # https://regex101.com/r/lZZ0XP/4 60 | # Also matches MLS numbers 61 | # phone_pattern = r'(?:^|[\s\'\"(\p{Han}])((?:\+\p{Nd}+[ \/.\p{Pd}]*)?(?:(?:\(\+?\p{Nd}+\))?(?:[ \/.\p{Pd}]*\p{Nd})){7,}(?:[\t\f #]*\p{Nd}+)?)(?:$|[\s@,?!;:\'\"(.\p{Han}])' 62 | 63 | id_regex = regex.compile(id_pattern, flags=regex.MULTILINE) #, re.MULTILINE) 64 | key_regex = regex.compile(key_pattern, flags=regex.MULTILINE) #, re.MULTILINE) 65 | ipv4_regex = regex.compile(ipv4_pattern) 66 | ipv6_regex = regex.compile(ipv6_pattern) 67 | ip_regex = regex.compile(ip_pattern, flags=regex.MULTILINE) #, re.MULTILINE) 68 | email_regex = regex.compile(email_pattern, flags=regex.MULTILINE|regex.VERBOSE) #, re.MULTILINE) 69 | user_regex = regex.compile(user_pattern, flags=regex.MULTILINE|regex.VERBOSE) #, re.MULTILINE) 70 | # phone_regex = regex.compile(phone_pattern, flags=regex.MULTILINE) #, re.MULTILINE) 71 | 72 | 73 | 74 | mst_regexes = {} 75 | for tag in high_risk_tags: 76 | if tag == 'ID': 77 | mst_regexes['ID'] = id_regex 78 | elif tag == 'KEY': 79 | mst_regexes['KEY'] = key_regex 80 | elif tag == 'IPv4': 81 | mst_regexes['IPv4'] = ipv4_regex 82 | elif tag == 'IPv6': 83 | mst_regexes['IPv6'] = ipv6_regex 84 | elif tag == 'IP_ADDRESS': 85 | mst_regexes['IP_ADDRESS'] = ip_regex 86 | elif tag == 'EMAIL': 87 | mst_regexes['EMAIL'] = email_regex 88 | elif tag == 'USER': 89 | mst_regexes['USER'] = user_regex 90 | # elif tag == 'NUMBER': 91 | # mst_regexes['NUMBER'] = phone_regex 92 | else: 93 | sys.stderr.write('Dont have tag regex pattern for %s =(' % tag) 94 | 95 | def ip_has_digit(matched_str): 96 | """Checks to make sure the PII span is not just :: or whatever that may 97 | accidentally be picked up by making sure there are digits.""" 98 | return any(map(str.isdigit, matched_str)) 99 | 100 | def matches_date_pattern(matched_str): 101 | # Screen out date false positives 102 | for year_regex in year_patterns: 103 | if year_regex.match(matched_str): 104 | return True 105 | return False 106 | 107 | 108 | def detect_pii(text, lang, tag_types): 109 | matches = [] 110 | for tag in tag_types: 111 | label_pattern = mst_regexes[tag] 112 | # !! regex.match happens here!! 113 | matches_tmp = label_pattern.finditer(text) 114 | for match in matches_tmp: 115 | if match.groups(): 116 | if len(match.groups()) > 1 and match.groups()[1]: 117 | sys.stderr.write("Warning: Found substring matches in the main match.") 118 | 119 | matched_str = match.groups() 120 | 121 | matched_str = matched_str[0] 122 | if matched_str: 123 | if tag in ["IP_ADDRESS"]: 124 | # Filter out false positive IPs 125 | if not ip_has_digit(matched_str): 126 | continue 127 | if tag in ["ID", "IP_ADDRESS"]: #, "NUMBER"]: 128 | # Filter out date false positives 129 | if matches_date_pattern(matched_str): 130 | continue 131 | 132 | matches += [(matched_str, match.span(), str(label_pattern), tag, lang)] 133 | return matches 134 | 135 | 136 | #@title Redaction function defined here. 
137 | def redact_pii(text, matches): 138 | """Takes a match as defined in the detect_pii function and redacts it from the full string, returning a tuple.""" 139 | redacted_str = text 140 | metadata = [] 141 | for match in matches: 142 | matched_str = match[0] 143 | tag = match[3] 144 | redact_tag = "PI:" + tag 145 | redacted_str = redacted_str.replace(matched_str, redact_tag) 146 | # Create the "metadata" as all of the information we had before redaction 147 | metadata += [(match)] 148 | return (redacted_str, metadata) 149 | 150 | #@title General function to run the PII detection and redact it, saving everything else to metadata, is defined here. 151 | def run_pii(text, lang): 152 | """ 153 | Runs the given set of regexes on the data "lines" and pulls out the 154 | tagged items. 155 | The lines structure stores the language type(s). This can be used for 156 | language-specific regexes, although we're dropping that for now and using 157 | only "default"/non-language-specific regexes. 158 | """ 159 | 160 | text = text.encode().decode() 161 | matches = detect_pii(text, lang, high_risk_tags) 162 | match_set = (text, {}) 163 | if len(matches) > 0: 164 | # !!! REDACTION HAPPENS HERE !!! 165 | redacted_str, metadata = redact_pii(text, matches) 166 | metadata_out = {"regex metadata":metadata, "original": text, "redacted": redacted_str} 167 | match_set = (redacted_str, metadata_out) 168 | return match_set 169 | 170 | 171 | def run_pii_batch(exs, lang, text_column): 172 | """ 173 | Runs the given set of regexes on the data "lines" and pulls out the 174 | tagged items. 175 | The lines structure stores the language type(s). This can be used for 176 | language-specific regexes, although we're dropping that for now and using 177 | only "default"/non-language-specific regexes. 178 | """ 179 | regex_metadata = [] 180 | old_text = [] 181 | new_text = [] 182 | modified = [] 183 | for text in exs[text_column]: 184 | text = text.encode().decode() 185 | matches = detect_pii(text, lang, high_risk_tags) 186 | if len(matches) > 0: 187 | # !!! REDACTION HAPPENS HERE !!! 
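            # The metadata is a list of (matched string, span, pattern, tag, language)
            # tuples; it is stored via repr() so the column stays a plain string and
            # serialises cleanly when the dataset is later written out as JSON.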
188 |             redacted_str, metadata = redact_pii(text, matches)
189 |             regex_metadata.append(repr(metadata))
190 |             old_text.append(text)
191 |             new_text.append(redacted_str)
192 |             modified.append(True)
193 |         else:
194 |             regex_metadata.append("")
195 |             old_text.append(text)
196 |             new_text.append(text)
197 |             modified.append(False)
198 |     result = {
199 |         "regex_metadata": regex_metadata,
200 |         "old_text": old_text,
201 |         "modified": modified
202 |     }
203 | 
204 |     result[text_column] = new_text
205 |     return result
206 | 
207 | def get_args():
208 |     parser = argparse.ArgumentParser(description='Load a dataset.')
209 |     parser.add_argument('--input_path', type=str)
210 |     parser.add_argument('--output_path', type=Path)  # Path, not str: .parent/.name are used below when saving
211 |     parser.add_argument('--text_column', type=str)
212 |     parser.add_argument('--load_from_disk', action="store_true")
213 |     parser.add_argument('--save_to_json', action="store_true", default=True)
214 |     parser.add_argument('--dataset_path', type=Path)
215 |     parser.add_argument('--dataset_name', type=str)
216 |     parser.add_argument("--num_proc", type=int, default=3)
217 |     parser.add_argument("--batch_size", type=int, default=1000)
218 |     parser.add_argument("--save_batch_size", type=int, default=10000)
219 |     args = parser.parse_args()
220 |     return args
221 | 
222 | def get_check_ds(ds, args):
223 |     if args.check_only_modified:
224 |         ds_checks = ds.filter(
225 |             lambda exs: exs["modified"],
226 |             batched=True,
227 |             batch_size=args.batch_size,
228 |             num_proc=args.num_proc
229 |         )
230 |     else:
231 |         ds_checks = ds
232 |     idx_samples = random.sample(range(len(ds_checks)), min(len(ds_checks), args.check_sampling_size))
233 |     ds_checks = ds_checks.select(idx_samples)
234 | 
235 |     return ds_checks
236 | 
237 | 
238 | if __name__ == '__main__':
239 |     logging.basicConfig(
240 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
241 |         datefmt="%m/%d/%Y %H:%M:%S",
242 |         level=logging.INFO,
243 |     )
244 |     args = get_args()
245 |     logger.info(f"** The job is run with the following arguments: **\n{args}\n **** ")
246 |     file_path = Path(args.input_path)
247 |     args.dataset_path = file_path.parent
248 |     args.dataset_name = file_path.name
249 |     logger.info(f" ===== Loading {args.dataset_path} =====")
250 |     if args.load_from_disk:
251 |         ds = load_from_disk(str(args.dataset_path))
252 |     else:
253 |         ds = load_dataset(str(args.dataset_path), data_files=[f"*{args.dataset_name}"], split="train")
254 |     lang = str(args.dataset_path).split("/")[-1].replace("indic-", "").replace("lm_", "")[:2]
255 |     logger.info(f"ds info: {ds}")
256 |     logger.info(f" ===== Applying PII =====")
257 |     ds = ds.map(
258 |         partial(run_pii_batch, lang=lang, text_column=args.text_column),
259 |         batched=True,
260 |         batch_size=args.batch_size,
261 |         num_proc=args.num_proc
262 |     )
263 | 
264 |     ds_final = ds.remove_columns([
265 |         "regex_metadata",
266 |         "old_text",
267 |         "modified"
268 |     ])
269 |     logger.info(f"ds_final info: {ds_final}")
270 | 
271 |     logger.info(f" ===== Saving Final dataset =====")
272 |     logger.info(f"Saving to final dataset at {args.output_path}.")
273 |     tmp_save_path = Path(args.output_path.parent, f"tmp-{args.output_path.name}")
274 |     if len(ds_final) == 0:
275 |         logger.info("Dataset was empty.
Not saving anything.") 276 | else: 277 | if args.save_to_json: 278 | ds_final.to_json( 279 | tmp_save_path, 280 | num_proc=args.num_proc, 281 | batch_size=args.save_batch_size, 282 | force_ascii=False 283 | ) 284 | else: 285 | ds_final.save_to_disk(tmp_save_path) 286 | tmp_save_path.rename(args.output_path) 287 | logger.info(f" ===== Final dataset saved successfully =====") 288 | ''' 289 | ds_checks = get_check_ds(ds, args) 290 | 291 | logger.info(f" ===== Saving check dataset =====") 292 | logger.info(f"Saving check dataset at {args.save_check_path}.") 293 | tmp_save_path = Path(args.save_check_path.parent, f"tmp-{args.save_check_path.name}") 294 | if len(ds_checks) == 0: 295 | logger.info("Dataset was empty. Not saving anything.") 296 | else: 297 | if args.save_check_to_json: 298 | ds_checks.to_json( 299 | tmp_save_path, 300 | num_proc=args.num_proc, 301 | batch_size=args.save_batch_size 302 | ) 303 | else: 304 | ds_checks.save_to_disk(tmp_save_path) 305 | tmp_save_path.rename(args.save_check_path) 306 | logger.info(f" ===== Check dataset saved successfully =====") 307 | ''' -------------------------------------------------------------------------------- /6_text_dedup/text_dedup/minhash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author : Chenghao Mou (mouchenghao@gmail.com) 4 | # created : 10/4/22 5 | from __future__ import annotations 6 | 7 | import argparse 8 | import gc 9 | import hashlib 10 | import multiprocessing as mp 11 | import os 12 | import pickle 13 | import random 14 | import re 15 | import json 16 | import datasets 17 | import numpy as np 18 | from collections import defaultdict 19 | from typing import Any, Dict, List, Set, Tuple 20 | 21 | from datasets import load_dataset 22 | from datasets import load_from_disk 23 | from scipy.integrate import quad as integrate 24 | from pathlib import Path 25 | from tqdm import tqdm 26 | import sys 27 | sys.path.append(Path(__file__).parent.parent) 28 | 29 | from utils import UnionFind 30 | from utils import ngrams 31 | from utils.add_args import add_io_args 32 | from utils.add_args import add_meta_args 33 | from utils.add_args import add_minhash_args 34 | from utils.add_args import add_own_args 35 | from utils.timer import Timer 36 | 37 | import logging 38 | 39 | from rich.logging import RichHandler 40 | 41 | logger = logging.getLogger("text_dedup") 42 | logger.setLevel(logging.INFO) 43 | logger.addHandler(RichHandler(rich_tracebacks=True)) 44 | logger.propagate = False 45 | 46 | SEED = 42 47 | RNG = np.random.RandomState(SEED) 48 | NON_ALPHA = re.compile("[^A-Za-z_0-9]") 49 | MAX_HASH = np.uint64((1 << 32) - 1) 50 | MERSENNE_PRIME = np.uint64((1 << 61) - 1) 51 | datasets.logging.set_verbosity_error() 52 | 53 | 54 | def save_jsonl(data, output_path): 55 | with open(output_path, 'w', encoding='utf-8') as output_file: 56 | for item in data: 57 | output_file.write(json.dumps(item, ensure_ascii=False) + "\n") 58 | 59 | 60 | def sha1_hash(data: bytes, d: int = 32) -> int: 61 | """ 62 | Generate a d-bit hash value from the given data. 63 | 64 | Parameters 65 | ---------- 66 | data : bytes 67 | The data to be hashed. 68 | d : int 69 | The number of bits of the hash value. 70 | 71 | Returns 72 | ------- 73 | int 74 | The hash value. 
75 | 76 | Examples 77 | -------- 78 | >>> sha1_hash(b"hello world", 32) 79 | 896314922 80 | >>> sha1_hash(b"hello world", 64) 81 | 13028719972609469994 82 | >>> sha1_hash(b"hello world", 128) 83 | 310522945683037930239412421226792791594 84 | """ 85 | return int.from_bytes(hashlib.sha1(data).digest()[: d // 8], byteorder="little") 86 | 87 | 88 | def embed_func( 89 | content: str, 90 | idx: int, 91 | *, 92 | num_perm: int, 93 | ngram_size: int, 94 | min_length: int, 95 | hashranges: List[Tuple[int, int]], 96 | permutations: np.ndarray, 97 | ) -> Dict[str, Any]: 98 | """ 99 | Calculate hash values for the content. 100 | 101 | Parameters 102 | ---------- 103 | content : str 104 | The content to be embedded. 105 | idx : int 106 | The index of the content. 107 | num_perm : int 108 | The number of permutations. 109 | ngram_size : int 110 | The size of n-grams. 111 | min_length : int 112 | The minimum length of the document in terms of tokens. 113 | hashranges : List[Tuple[int, int]] 114 | The ranges of hash values. 115 | permutations : np.ndarray 116 | The permutations for the minhash. 117 | 118 | Returns 119 | ------- 120 | Dict[str, Any] 121 | The hash values in each range and the index. 122 | 123 | Examples 124 | -------- 125 | >>> content = "hello world" 126 | >>> idx = 0 127 | >>> num_perm = 250 128 | >>> ngram_size = 1 129 | >>> hashranges = [(i, i + 25) for i in range(0, 250, 25)] 130 | >>> PERMUTATIONS = np.array( 131 | ... [ 132 | ... ( 133 | ... RNG.randint(1, MERSENNE_PRIME, dtype=np.uint64), 134 | ... RNG.randint(0, MERSENNE_PRIME, dtype=np.uint64), 135 | ... ) 136 | ... for _ in range(num_perm) 137 | ... ], 138 | ... dtype=np.uint64, 139 | ... ).T 140 | >>> res = embed_func(content, idx, num_perm=num_perm, ngram_size=ngram_size, min_length=0, hashranges=hashranges, permutations=PERMUTATIONS) 141 | >>> len(res["__signatures__"]) 142 | 10 143 | >>> res["__id__"] 144 | 0 145 | """ 146 | a, b = permutations 147 | masks: np.ndarray = np.full(shape=num_perm, dtype=np.uint64, fill_value=MAX_HASH) 148 | tokens: Set[str] = {" ".join(t) for t in ngrams(NON_ALPHA.split(content), ngram_size, min_length)} 149 | hashvalues: np.ndarray = np.array([sha1_hash(token.encode("utf-8")) for token in tokens], dtype=np.uint64) 150 | permuted_hashvalues = np.bitwise_and( 151 | ((hashvalues * np.tile(a, (len(hashvalues), 1)).T).T + b) % MERSENNE_PRIME, MAX_HASH 152 | ) 153 | hashvalues = np.vstack([permuted_hashvalues, masks]).min(axis=0) 154 | Hs = [bytes(hashvalues[start:end].byteswap().data) for start, end in hashranges] 155 | return {"__signatures__": Hs, "__id__": idx} 156 | 157 | 158 | def optimal_param( 159 | threshold: float, 160 | num_perm: int, 161 | false_positive_weight: float = 0.5, 162 | false_negative_weight: float = 0.5, 163 | ): 164 | """ 165 | Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum 166 | of probabilities of false positive and false negative, taken from datasketch. 167 | 168 | You can also refer to the interactive demo at https://huggingface.co/spaces/bigcode/near-deduplication. 169 | 170 | Parameters 171 | ---------- 172 | threshold : float 173 | The threshold for similarity. 174 | num_perm : int 175 | The number of permutations. 176 | false_positive_weight : float 177 | The weight of false positive. 178 | false_negative_weight : float 179 | The weight of false negative. 180 | 181 | Returns 182 | ------- 183 | Tuple[int, int] 184 | The optimal `b` (bands) and `r` (rows) parameters. 
185 | 186 | Examples 187 | -------- 188 | >>> optimal_param(0.75, 256) 189 | (21, 12) 190 | >>> optimal_param(0.75, 256, 0.1, 0.9) 191 | (28, 9) 192 | """ 193 | 194 | def false_positive_area(threshold: float, b: int, r: int): 195 | """Source: `datasketch.lsh`""" 196 | 197 | def proba(s): 198 | return 1 - (1 - s ** float(r)) ** float(b) 199 | 200 | a, _ = integrate(proba, 0.0, threshold) 201 | return a 202 | 203 | def false_negative_area(threshold: float, b: int, r: int): 204 | """Source: `datasketch.lsh`""" 205 | 206 | def proba(s): 207 | return 1 - (1 - (1 - s ** float(r)) ** float(b)) 208 | 209 | a, _ = integrate(proba, threshold, 1.0) 210 | return a 211 | 212 | min_error = float("inf") 213 | opt = (0, 0) 214 | for b in range(1, num_perm + 1): 215 | max_r = int(num_perm / b) 216 | for r in range(1, max_r + 1): 217 | fp = false_positive_area(threshold, b, r) 218 | fn = false_negative_area(threshold, b, r) 219 | error = fp * false_positive_weight + fn * false_negative_weight 220 | if error < min_error: 221 | min_error = error 222 | opt = (b, r) 223 | return opt 224 | 225 | 226 | if __name__ == "__main__": 227 | 228 | parser = argparse.ArgumentParser( 229 | prog="text_dedup.minhash", 230 | description="Deduplicate text using minhash", 231 | formatter_class=argparse.RawTextHelpFormatter, 232 | ) 233 | parser = add_io_args(parser) 234 | parser = add_meta_args(parser) 235 | parser = add_minhash_args(parser) 236 | parser = add_own_args(parser) 237 | args = parser.parse_args() 238 | 239 | ### arguments needed ############### 240 | args.f = 64 241 | args.bit_diff = 3 242 | args.num_bucket = 4 243 | args.local = False 244 | args.json = True 245 | 246 | args.num_proc = 80 247 | args.batch_size = 1000 248 | 249 | # data_files = os.listdir(args.input_path) 250 | # data_files = [os.path.join(args.input_path, data_file) for data_file in data_files][:] 251 | data_files = [args.input_path] 252 | args.split = 'train' 253 | args.num_perm = 25 254 | args.B = 10 255 | args.R = 1 256 | ### arguments needed ############### 257 | 258 | mp.set_start_method("fork", force=True) 259 | uf = UnionFind() 260 | timer = Timer() 261 | 262 | if args.b is not None and args.r is not None: 263 | B, R = args.b, args.r 264 | else: 265 | B, R = optimal_param(args.threshold, args.num_perm, args.false_positive_weight, args.false_negative_weight) 266 | 267 | logger.info(f"B is {B}, R is {R}") 268 | logger.info(f'threshold is {args.threshold}, num_perm is {args.num_perm}') 269 | HASH_RANGES = [(i * R, (i + 1) * R) for i in range(B)] 270 | HASH_TABLES: List[Dict[int, Set]] = [defaultdict(set) for _ in range(B)] 271 | 272 | with timer("Total"): 273 | with timer("Loading"): 274 | if args.local: 275 | ds = load_from_disk(args.input_path) 276 | elif args.json: 277 | ds = load_dataset("json", data_files=data_files\ 278 | ,cache_dir = args.cache_dir 279 | ,split = args.split 280 | ) 281 | else: 282 | ds = load_dataset( 283 | path=args.input_path, 284 | name=args.name, 285 | data_dir=args.data_dir, 286 | data_files=args.data_files, 287 | split=args.split, 288 | revision=args.revision, 289 | cache_dir=args.cache_dir, 290 | use_auth_token=args.use_auth_token, 291 | ) 292 | 293 | DATA_SIZE = len(ds) 294 | PERMUTATIONS = np.array( 295 | [ 296 | ( 297 | RNG.randint(1, MERSENNE_PRIME, dtype=np.uint64), 298 | RNG.randint(0, MERSENNE_PRIME, dtype=np.uint64), 299 | ) 300 | for _ in range(args.num_perm) 301 | ], 302 | dtype=np.uint64, 303 | ).T 304 | 305 | with timer("MinHashing"): 306 | embedded = ds.map( 307 | function=embed_func, 308 | 
fn_kwargs={ 309 | "num_perm": args.num_perm, 310 | "hashranges": HASH_RANGES, 311 | "ngram_size": args.ngram, 312 | "min_length": args.min_length, 313 | "permutations": PERMUTATIONS, 314 | }, 315 | input_columns=[args.column], 316 | remove_columns=ds.column_names, 317 | num_proc=os.cpu_count() if args.num_proc==None else args.num_proc, 318 | with_indices=True, 319 | desc="Fingerprinting...", 320 | ) 321 | 322 | 323 | with timer("Clustering"): 324 | for i in tqdm( 325 | range(0, len(embedded), args.batch_size), 326 | dynamic_ncols=True, 327 | desc="Iterating MinHashes...", # noqa: E501 328 | ): 329 | batch = embedded[i : i + args.batch_size] 330 | for key, Hs in zip(batch["__id__"], batch["__signatures__"]): 331 | for i, H in enumerate(Hs): 332 | HASH_TABLES[i][H].add(key) 333 | 334 | for table in tqdm(HASH_TABLES, dynamic_ncols=True, desc="Clustering..."): 335 | for cluster in table.values(): 336 | if len(cluster) <= 1: 337 | continue 338 | idx = min(cluster) 339 | for x in cluster: 340 | uf.union(x, idx) 341 | 342 | 343 | with timer("Filtering"): 344 | gc.freeze() 345 | gc.disable() 346 | ds = ds.map( 347 | function=lambda _, idx: {"__cluster__": uf.find(idx)}, 348 | with_indices=True, 349 | num_proc=os.cpu_count() if args.num_proc==None else args.num_proc, 350 | new_fingerprint=str(random.getrandbits(128)), 351 | desc="Finding clusters...", 352 | ) 353 | gc.enable() 354 | gc.collect() 355 | # This is where the deduplication happens 356 | # Since there is no easy groupby in datasets 357 | # I will use this simple filter for now 358 | final_data = ds.filter( 359 | function=lambda record, idx: record["__cluster__"] == idx, 360 | with_indices=True, 361 | num_proc=os.cpu_count() if args.num_proc==None else args.num_proc, 362 | desc="Filtering clusters...", 363 | ) 364 | 365 | final_data_duped = ds.filter( 366 | function=lambda record, idx: record["__cluster__"] != idx, 367 | with_indices=True, 368 | num_proc=os.cpu_count() 369 | if args.num_proc == None else args.num_proc, 370 | desc="Filtering clusters...", 371 | ) 372 | 373 | 374 | with timer("Saving"): 375 | # filter_data = [{"text": d["text"]} for d in final_data] 376 | save_jsonl(final_data, args.output_path) 377 | # final_data.to_json(args.output_path, orient="records", lines=True, force_ascii=False) 378 | 379 | if args.debug: 380 | with open(os.path.join(args.output_path, "uf.pkl"), "wb") as f: 381 | pickle.dump(uf, f, protocol=pickle.HIGHEST_PROTOCOL) 382 | 383 | PAD = 32 384 | for k, v in timer.elapsed_times.items(): 385 | logger.info(f"{k:<{PAD}}: {v:.2f}s") 386 | 387 | logger.info(f"{'Before':<{PAD}}: {len(ds)}") 388 | logger.info(f"{'After':<{PAD}}: {len(final_data)}") 389 | -------------------------------------------------------------------------------- /7_DataAnalysis/resources/hit_stopwords.txt: -------------------------------------------------------------------------------- 1 | -- 2 | ? 
3 | “ 4 | ” 5 | 》 6 | -- 7 | able 8 | about 9 | above 10 | according 11 | accordingly 12 | across 13 | actually 14 | after 15 | afterwards 16 | again 17 | against 18 | ain't 19 | all 20 | allow 21 | allows 22 | almost 23 | alone 24 | along 25 | already 26 | also 27 | although 28 | always 29 | am 30 | among 31 | amongst 32 | an 33 | and 34 | another 35 | any 36 | anybody 37 | anyhow 38 | anyone 39 | anything 40 | anyway 41 | anyways 42 | anywhere 43 | apart 44 | appear 45 | appreciate 46 | appropriate 47 | are 48 | aren't 49 | around 50 | as 51 | a's 52 | aside 53 | ask 54 | asking 55 | associated 56 | at 57 | available 58 | away 59 | awfully 60 | be 61 | became 62 | because 63 | become 64 | becomes 65 | becoming 66 | been 67 | before 68 | beforehand 69 | behind 70 | being 71 | believe 72 | below 73 | beside 74 | besides 75 | best 76 | better 77 | between 78 | beyond 79 | both 80 | brief 81 | but 82 | by 83 | came 84 | can 85 | cannot 86 | cant 87 | can't 88 | cause 89 | causes 90 | certain 91 | certainly 92 | changes 93 | clearly 94 | c'mon 95 | co 96 | com 97 | come 98 | comes 99 | concerning 100 | consequently 101 | consider 102 | considering 103 | contain 104 | containing 105 | contains 106 | corresponding 107 | could 108 | couldn't 109 | course 110 | c's 111 | currently 112 | definitely 113 | described 114 | despite 115 | did 116 | didn't 117 | different 118 | do 119 | does 120 | doesn't 121 | doing 122 | done 123 | don't 124 | down 125 | downwards 126 | during 127 | each 128 | edu 129 | eg 130 | eight 131 | either 132 | else 133 | elsewhere 134 | enough 135 | entirely 136 | especially 137 | et 138 | etc 139 | even 140 | ever 141 | every 142 | everybody 143 | everyone 144 | everything 145 | everywhere 146 | ex 147 | exactly 148 | example 149 | except 150 | far 151 | few 152 | fifth 153 | first 154 | five 155 | followed 156 | following 157 | follows 158 | for 159 | former 160 | formerly 161 | forth 162 | four 163 | from 164 | further 165 | furthermore 166 | get 167 | gets 168 | getting 169 | given 170 | gives 171 | go 172 | goes 173 | going 174 | gone 175 | got 176 | gotten 177 | greetings 178 | had 179 | hadn't 180 | happens 181 | hardly 182 | has 183 | hasn't 184 | have 185 | haven't 186 | having 187 | he 188 | hello 189 | help 190 | hence 191 | her 192 | here 193 | hereafter 194 | hereby 195 | herein 196 | here's 197 | hereupon 198 | hers 199 | herself 200 | he's 201 | hi 202 | him 203 | himself 204 | his 205 | hither 206 | hopefully 207 | how 208 | howbeit 209 | however 210 | i'd 211 | ie 212 | if 213 | ignored 214 | i'll 215 | i'm 216 | immediate 217 | in 218 | inasmuch 219 | inc 220 | indeed 221 | indicate 222 | indicated 223 | indicates 224 | inner 225 | insofar 226 | instead 227 | into 228 | inward 229 | is 230 | isn't 231 | it 232 | it'd 233 | it'll 234 | its 235 | it's 236 | itself 237 | i've 238 | just 239 | keep 240 | keeps 241 | kept 242 | know 243 | known 244 | knows 245 | last 246 | lately 247 | later 248 | latter 249 | latterly 250 | least 251 | less 252 | lest 253 | let 254 | let's 255 | like 256 | liked 257 | likely 258 | little 259 | look 260 | looking 261 | looks 262 | ltd 263 | mainly 264 | many 265 | may 266 | maybe 267 | me 268 | mean 269 | meanwhile 270 | merely 271 | might 272 | more 273 | moreover 274 | most 275 | mostly 276 | much 277 | must 278 | my 279 | myself 280 | name 281 | namely 282 | nd 283 | near 284 | nearly 285 | necessary 286 | need 287 | needs 288 | neither 289 | never 290 | nevertheless 291 | new 292 | next 293 | nine 294 | no 295 | nobody 296 | 
non 297 | none 298 | noone 299 | nor 300 | normally 301 | not 302 | nothing 303 | novel 304 | now 305 | nowhere 306 | obviously 307 | of 308 | off 309 | often 310 | oh 311 | ok 312 | okay 313 | old 314 | on 315 | once 316 | one 317 | ones 318 | only 319 | onto 320 | or 321 | other 322 | others 323 | otherwise 324 | ought 325 | our 326 | ours 327 | ourselves 328 | out 329 | outside 330 | over 331 | overall 332 | own 333 | particular 334 | particularly 335 | per 336 | perhaps 337 | placed 338 | please 339 | plus 340 | possible 341 | presumably 342 | probably 343 | provides 344 | que 345 | quite 346 | qv 347 | rather 348 | rd 349 | re 350 | really 351 | reasonably 352 | regarding 353 | regardless 354 | regards 355 | relatively 356 | respectively 357 | right 358 | said 359 | same 360 | saw 361 | say 362 | saying 363 | says 364 | second 365 | secondly 366 | see 367 | seeing 368 | seem 369 | seemed 370 | seeming 371 | seems 372 | seen 373 | self 374 | selves 375 | sensible 376 | sent 377 | serious 378 | seriously 379 | seven 380 | several 381 | shall 382 | she 383 | should 384 | shouldn't 385 | since 386 | six 387 | so 388 | some 389 | somebody 390 | somehow 391 | someone 392 | something 393 | sometime 394 | sometimes 395 | somewhat 396 | somewhere 397 | soon 398 | sorry 399 | specified 400 | specify 401 | specifying 402 | still 403 | sub 404 | such 405 | sup 406 | sure 407 | take 408 | taken 409 | tell 410 | tends 411 | th 412 | than 413 | thank 414 | thanks 415 | thanx 416 | that 417 | thats 418 | that's 419 | the 420 | their 421 | theirs 422 | them 423 | themselves 424 | then 425 | thence 426 | there 427 | thereafter 428 | thereby 429 | therefore 430 | therein 431 | theres 432 | there's 433 | thereupon 434 | these 435 | they 436 | they'd 437 | they'll 438 | they're 439 | they've 440 | think 441 | third 442 | this 443 | thorough 444 | thoroughly 445 | those 446 | though 447 | three 448 | through 449 | throughout 450 | thru 451 | thus 452 | to 453 | together 454 | too 455 | took 456 | toward 457 | towards 458 | tried 459 | tries 460 | truly 461 | try 462 | trying 463 | t's 464 | twice 465 | two 466 | un 467 | under 468 | unfortunately 469 | unless 470 | unlikely 471 | until 472 | unto 473 | up 474 | upon 475 | us 476 | use 477 | used 478 | useful 479 | uses 480 | using 481 | usually 482 | value 483 | various 484 | very 485 | via 486 | viz 487 | vs 488 | want 489 | wants 490 | was 491 | wasn't 492 | way 493 | we 494 | we'd 495 | welcome 496 | well 497 | we'll 498 | went 499 | were 500 | we're 501 | weren't 502 | we've 503 | what 504 | whatever 505 | what's 506 | when 507 | whence 508 | whenever 509 | where 510 | whereafter 511 | whereas 512 | whereby 513 | wherein 514 | where's 515 | whereupon 516 | wherever 517 | whether 518 | which 519 | while 520 | whither 521 | who 522 | whoever 523 | whole 524 | whom 525 | who's 526 | whose 527 | why 528 | will 529 | willing 530 | wish 531 | with 532 | within 533 | without 534 | wonder 535 | won't 536 | would 537 | wouldn't 538 | yes 539 | yet 540 | you 541 | you'd 542 | you'll 543 | your 544 | you're 545 | yours 546 | yourself 547 | yourselves 548 | you've 549 | zero 550 | zt 551 | ZT 552 | zz 553 | ZZ 554 | 一 555 | 一下 556 | 一些 557 | 一切 558 | 一则 559 | 一天 560 | 一定 561 | 一方面 562 | 一旦 563 | 一时 564 | 一来 565 | 一样 566 | 一次 567 | 一片 568 | 一直 569 | 一致 570 | 一般 571 | 一起 572 | 一边 573 | 一面 574 | 万一 575 | 上下 576 | 上升 577 | 上去 578 | 上来 579 | 上述 580 | 上面 581 | 下列 582 | 下去 583 | 下来 584 | 下面 585 | 不一 586 | 不久 587 | 不仅 588 | 不会 589 | 不但 590 | 不光 591 | 不单 592 | 不变 593 | 不只 
594 | 不可 595 | 不同 596 | 不够 597 | 不如 598 | 不得 599 | 不怕 600 | 不惟 601 | 不成 602 | 不拘 603 | 不敢 604 | 不断 605 | 不是 606 | 不比 607 | 不然 608 | 不特 609 | 不独 610 | 不管 611 | 不能 612 | 不要 613 | 不论 614 | 不足 615 | 不过 616 | 不问 617 | 与 618 | 与其 619 | 与否 620 | 与此同时 621 | 专门 622 | 且 623 | 两者 624 | 严格 625 | 严重 626 | 个 627 | 个人 628 | 个别 629 | 中小 630 | 中间 631 | 丰富 632 | 临 633 | 为 634 | 为主 635 | 为了 636 | 为什么 637 | 为什麽 638 | 为何 639 | 为着 640 | 主张 641 | 主要 642 | 举行 643 | 乃 644 | 乃至 645 | 么 646 | 之 647 | 之一 648 | 之前 649 | 之后 650 | 之後 651 | 之所以 652 | 之类 653 | 乌乎 654 | 乎 655 | 乘 656 | 也 657 | 也好 658 | 也是 659 | 也罢 660 | 了 661 | 了解 662 | 争取 663 | 于 664 | 于是 665 | 于是乎 666 | 云云 667 | 互相 668 | 产生 669 | 人们 670 | 人家 671 | 什么 672 | 什么样 673 | 什麽 674 | 今后 675 | 今天 676 | 今年 677 | 今後 678 | 仍然 679 | 从 680 | 从事 681 | 从而 682 | 他 683 | 他人 684 | 他们 685 | 他的 686 | 代替 687 | 以 688 | 以上 689 | 以下 690 | 以为 691 | 以便 692 | 以免 693 | 以前 694 | 以及 695 | 以后 696 | 以外 697 | 以後 698 | 以来 699 | 以至 700 | 以至于 701 | 以致 702 | 们 703 | 任 704 | 任何 705 | 任凭 706 | 任务 707 | 企图 708 | 伟大 709 | 似乎 710 | 似的 711 | 但 712 | 但是 713 | 何 714 | 何况 715 | 何处 716 | 何时 717 | 作为 718 | 你 719 | 你们 720 | 你的 721 | 使得 722 | 使用 723 | 例如 724 | 依 725 | 依照 726 | 依靠 727 | 促进 728 | 保持 729 | 俺 730 | 俺们 731 | 倘 732 | 倘使 733 | 倘或 734 | 倘然 735 | 倘若 736 | 假使 737 | 假如 738 | 假若 739 | 做到 740 | 像 741 | 允许 742 | 充分 743 | 先后 744 | 先後 745 | 先生 746 | 全部 747 | 全面 748 | 兮 749 | 共同 750 | 关于 751 | 其 752 | 其一 753 | 其中 754 | 其二 755 | 其他 756 | 其余 757 | 其它 758 | 其实 759 | 其次 760 | 具体 761 | 具体地说 762 | 具体说来 763 | 具有 764 | 再者 765 | 再说 766 | 冒 767 | 冲 768 | 决定 769 | 况且 770 | 准备 771 | 几 772 | 几乎 773 | 几时 774 | 凭 775 | 凭借 776 | 出去 777 | 出来 778 | 出现 779 | 分别 780 | 则 781 | 别 782 | 别的 783 | 别说 784 | 到 785 | 前后 786 | 前者 787 | 前进 788 | 前面 789 | 加之 790 | 加以 791 | 加入 792 | 加强 793 | 十分 794 | 即 795 | 即令 796 | 即使 797 | 即便 798 | 即或 799 | 即若 800 | 却不 801 | 原来 802 | 又 803 | 及 804 | 及其 805 | 及时 806 | 及至 807 | 双方 808 | 反之 809 | 反应 810 | 反映 811 | 反过来 812 | 反过来说 813 | 取得 814 | 受到 815 | 变成 816 | 另 817 | 另一方面 818 | 另外 819 | 只是 820 | 只有 821 | 只要 822 | 只限 823 | 叫 824 | 叫做 825 | 召开 826 | 叮咚 827 | 可 828 | 可以 829 | 可是 830 | 可能 831 | 可见 832 | 各 833 | 各个 834 | 各人 835 | 各位 836 | 各地 837 | 各种 838 | 各级 839 | 各自 840 | 合理 841 | 同 842 | 同一 843 | 同时 844 | 同样 845 | 后来 846 | 后面 847 | 向 848 | 向着 849 | 吓 850 | 吗 851 | 否则 852 | 吧 853 | 吧哒 854 | 吱 855 | 呀 856 | 呃 857 | 呕 858 | 呗 859 | 呜 860 | 呜呼 861 | 呢 862 | 周围 863 | 呵 864 | 呸 865 | 呼哧 866 | 咋 867 | 和 868 | 咚 869 | 咦 870 | 咱 871 | 咱们 872 | 咳 873 | 哇 874 | 哈 875 | 哈哈 876 | 哉 877 | 哎 878 | 哎呀 879 | 哎哟 880 | 哗 881 | 哟 882 | 哦 883 | 哩 884 | 哪 885 | 哪个 886 | 哪些 887 | 哪儿 888 | 哪天 889 | 哪年 890 | 哪怕 891 | 哪样 892 | 哪边 893 | 哪里 894 | 哼 895 | 哼唷 896 | 唉 897 | 啊 898 | 啐 899 | 啥 900 | 啦 901 | 啪达 902 | 喂 903 | 喏 904 | 喔唷 905 | 嗡嗡 906 | 嗬 907 | 嗯 908 | 嗳 909 | 嘎 910 | 嘎登 911 | 嘘 912 | 嘛 913 | 嘻 914 | 嘿 915 | 因 916 | 因为 917 | 因此 918 | 因而 919 | 固然 920 | 在 921 | 在下 922 | 地 923 | 坚决 924 | 坚持 925 | 基本 926 | 处理 927 | 复杂 928 | 多 929 | 多少 930 | 多数 931 | 多次 932 | 大力 933 | 大多数 934 | 大大 935 | 大家 936 | 大批 937 | 大约 938 | 大量 939 | 失去 940 | 她 941 | 她们 942 | 她的 943 | 好的 944 | 好象 945 | 如 946 | 如上所述 947 | 如下 948 | 如何 949 | 如其 950 | 如果 951 | 如此 952 | 如若 953 | 存在 954 | 宁 955 | 宁可 956 | 宁愿 957 | 宁肯 958 | 它 959 | 它们 960 | 它们的 961 | 它的 962 | 安全 963 | 完全 964 | 完成 965 | 实现 966 | 实际 967 | 宣布 968 | 容易 969 | 密切 970 | 对 971 | 对于 972 | 对应 973 | 将 974 | 少数 975 | 尔后 976 | 尚且 977 | 尤其 978 | 就 979 | 就是 980 | 就是说 981 | 尽 982 | 尽管 983 | 属于 984 | 岂但 985 | 左右 986 | 巨大 987 | 巩固 988 | 己 989 | 已经 990 | 帮助 991 | 常常 992 | 并 993 | 并不 994 | 并不是 995 | 并且 996 | 并没有 997 
| 广大 998 | 广泛 999 | 应当 1000 | 应用 1001 | 应该 1002 | 开外 1003 | 开始 1004 | 开展 1005 | 引起 1006 | 强烈 1007 | 强调 1008 | 归 1009 | 当 1010 | 当前 1011 | 当时 1012 | 当然 1013 | 当着 1014 | 形成 1015 | 彻底 1016 | 彼 1017 | 彼此 1018 | 往 1019 | 往往 1020 | 待 1021 | 後来 1022 | 後面 1023 | 得 1024 | 得出 1025 | 得到 1026 | 心里 1027 | 必然 1028 | 必要 1029 | 必须 1030 | 怎 1031 | 怎么 1032 | 怎么办 1033 | 怎么样 1034 | 怎样 1035 | 怎麽 1036 | 总之 1037 | 总是 1038 | 总的来看 1039 | 总的来说 1040 | 总的说来 1041 | 总结 1042 | 总而言之 1043 | 恰恰相反 1044 | 您 1045 | 意思 1046 | 愿意 1047 | 慢说 1048 | 成为 1049 | 我 1050 | 我们 1051 | 我的 1052 | 或 1053 | 或是 1054 | 或者 1055 | 战斗 1056 | 所 1057 | 所以 1058 | 所有 1059 | 所谓 1060 | 打 1061 | 扩大 1062 | 把 1063 | 抑或 1064 | 拿 1065 | 按 1066 | 按照 1067 | 换句话说 1068 | 换言之 1069 | 据 1070 | 掌握 1071 | 接着 1072 | 接著 1073 | 故 1074 | 故此 1075 | 整个 1076 | 方便 1077 | 方面 1078 | 旁人 1079 | 无宁 1080 | 无法 1081 | 无论 1082 | 既 1083 | 既是 1084 | 既然 1085 | 时候 1086 | 明显 1087 | 明确 1088 | 是 1089 | 是否 1090 | 是的 1091 | 显然 1092 | 显著 1093 | 普通 1094 | 普遍 1095 | 更加 1096 | 曾经 1097 | 替 1098 | 最后 1099 | 最大 1100 | 最好 1101 | 最後 1102 | 最近 1103 | 最高 1104 | 有 1105 | 有些 1106 | 有关 1107 | 有利 1108 | 有力 1109 | 有所 1110 | 有效 1111 | 有时 1112 | 有点 1113 | 有的 1114 | 有着 1115 | 有著 1116 | 望 1117 | 朝 1118 | 朝着 1119 | 本 1120 | 本着 1121 | 来 1122 | 来着 1123 | 极了 1124 | 构成 1125 | 果然 1126 | 果真 1127 | 某 1128 | 某个 1129 | 某些 1130 | 根据 1131 | 根本 1132 | 欢迎 1133 | 正在 1134 | 正如 1135 | 正常 1136 | 此 1137 | 此外 1138 | 此时 1139 | 此间 1140 | 毋宁 1141 | 每 1142 | 每个 1143 | 每天 1144 | 每年 1145 | 每当 1146 | 比 1147 | 比如 1148 | 比方 1149 | 比较 1150 | 毫不 1151 | 没有 1152 | 沿 1153 | 沿着 1154 | 注意 1155 | 深入 1156 | 清楚 1157 | 满足 1158 | 漫说 1159 | 焉 1160 | 然则 1161 | 然后 1162 | 然後 1163 | 然而 1164 | 照 1165 | 照着 1166 | 特别是 1167 | 特殊 1168 | 特点 1169 | 现代 1170 | 现在 1171 | 甚么 1172 | 甚而 1173 | 甚至 1174 | 用 1175 | 由 1176 | 由于 1177 | 由此可见 1178 | 的 1179 | 的话 1180 | 目前 1181 | 直到 1182 | 直接 1183 | 相似 1184 | 相信 1185 | 相反 1186 | 相同 1187 | 相对 1188 | 相对而言 1189 | 相应 1190 | 相当 1191 | 相等 1192 | 省得 1193 | 看出 1194 | 看到 1195 | 看来 1196 | 看看 1197 | 看见 1198 | 真是 1199 | 真正 1200 | 着 1201 | 着呢 1202 | 矣 1203 | 知道 1204 | 确定 1205 | 离 1206 | 积极 1207 | 移动 1208 | 突出 1209 | 突然 1210 | 立即 1211 | 第 1212 | 等 1213 | 等等 1214 | 管 1215 | 紧接着 1216 | 纵 1217 | 纵令 1218 | 纵使 1219 | 纵然 1220 | 练习 1221 | 组成 1222 | 经 1223 | 经常 1224 | 经过 1225 | 结合 1226 | 结果 1227 | 给 1228 | 绝对 1229 | 继续 1230 | 继而 1231 | 维持 1232 | 综上所述 1233 | 罢了 1234 | 考虑 1235 | 者 1236 | 而 1237 | 而且 1238 | 而况 1239 | 而外 1240 | 而已 1241 | 而是 1242 | 而言 1243 | 联系 1244 | 能 1245 | 能否 1246 | 能够 1247 | 腾 1248 | 自 1249 | 自个儿 1250 | 自从 1251 | 自各儿 1252 | 自家 1253 | 自己 1254 | 自身 1255 | 至 1256 | 至于 1257 | 良好 1258 | 若 1259 | 若是 1260 | 若非 1261 | 范围 1262 | 莫若 1263 | 获得 1264 | 虽 1265 | 虽则 1266 | 虽然 1267 | 虽说 1268 | 行为 1269 | 行动 1270 | 表明 1271 | 表示 1272 | 被 1273 | 要 1274 | 要不 1275 | 要不是 1276 | 要不然 1277 | 要么 1278 | 要是 1279 | 要求 1280 | 规定 1281 | 觉得 1282 | 认为 1283 | 认真 1284 | 认识 1285 | 让 1286 | 许多 1287 | 论 1288 | 设使 1289 | 设若 1290 | 该 1291 | 说明 1292 | 诸位 1293 | 谁 1294 | 谁知 1295 | 赶 1296 | 起 1297 | 起来 1298 | 起见 1299 | 趁 1300 | 趁着 1301 | 越是 1302 | 跟 1303 | 转动 1304 | 转变 1305 | 转贴 1306 | 较 1307 | 较之 1308 | 边 1309 | 达到 1310 | 迅速 1311 | 过 1312 | 过去 1313 | 过来 1314 | 运用 1315 | 还是 1316 | 还有 1317 | 这 1318 | 这个 1319 | 这么 1320 | 这么些 1321 | 这么样 1322 | 这么点儿 1323 | 这些 1324 | 这会儿 1325 | 这儿 1326 | 这就是说 1327 | 这时 1328 | 这样 1329 | 这点 1330 | 这种 1331 | 这边 1332 | 这里 1333 | 这麽 1334 | 进入 1335 | 进步 1336 | 进而 1337 | 进行 1338 | 连 1339 | 连同 1340 | 适应 1341 | 适当 1342 | 适用 1343 | 逐步 1344 | 逐渐 1345 | 通常 1346 | 通过 1347 | 造成 1348 | 遇到 1349 | 遭到 1350 | 避免 1351 | 那 1352 | 那个 1353 | 那么 1354 | 那么些 1355 | 那么样 
1356 | 那些 1357 | 那会儿 1358 | 那儿 1359 | 那时 1360 | 那样 1361 | 那边 1362 | 那里 1363 | 那麽 1364 | 部分 1365 | 鄙人 1366 | 采取 1367 | 里面 1368 | 重大 1369 | 重新 1370 | 重要 1371 | 鉴于 1372 | 问题 1373 | 防止 1374 | 阿 1375 | 附近 1376 | 限制 1377 | 除 1378 | 除了 1379 | 除此之外 1380 | 除非 1381 | 随 1382 | 随着 1383 | 随著 1384 | 集中 1385 | 需要 1386 | 非但 1387 | 非常 1388 | 非徒 1389 | 靠 1390 | 顺 1391 | 顺着 1392 | 首先 1393 | 高兴 1394 | 是不是 1395 | 说说 1396 | --------------------------------------------------------------------------------