├── corpus_process
│   ├── pipeline_example.py
│   └── lm_corpus_processor_base.py
├── README.md
├── splitFile.py
├── batchRead.ipynb
├── professionClean.py
├── newsClean.py
├── appClean.py
├── englishClean.py
├── corpusClean.py
└── englishProcessor.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Distributed cleaning, sentence splitting, and word segmentation for a fairly large corpus (about 14 GB)
2 | #### The code covers:
3 | ##### How to batch-read data from a folder and its subfolders
4 | ##### How to batch-merge data from a folder and its subfolders
5 | ##### Matching the special characters that can appear in Chinese and English text
6 | ##### Matching all kinds of URLs and HTML tags
7 | ##### Matching Greek letters, Hanyu Pinyin, traditional Chinese characters, and more
8 | ##### Sentence splitting with the *PyLTP* module
9 | ##### A function that removes blank lines from text
10 | ##### A timing decorator and a code progress bar
11 | ##### A class file for distributed word segmentation
12 | **Stars and forks are welcome**
--------------------------------------------------------------------------------
/splitFile.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # -*- author: JeremySun -*-
3 | # -*- date: 19/10/24 -*-
4 | 
5 | # Module imports
6 | import os
7 | import time
8 | from functools import wraps
9 | 
10 | # Timing decorator
11 | def func_timer(function):
12 |     @wraps(function)
13 |     def function_timer(*args, **kwargs):
14 |         print('[Function: {name} start...]'.format(name=function.__name__))
15 |         t0 = time.time()
16 |         result = function(*args, **kwargs)
17 |         t1 = time.time()
18 |         print('[Function: {name} finished, spent time: {time:.2f}s]'.format(name=function.__name__, time=t1 - t0))
19 |         return result
20 |     return function_timer
21 | 
22 | # Split a large file into parts of roughly partial_size characters each
23 | @func_timer
24 | def split_file(file_path, partial_size):
25 |     file_dir, name = os.path.split(file_path)
26 |     name, ext = os.path.splitext(name)
27 |     file_dir = os.path.join(file_dir, name)
28 | 
29 |     if not os.path.exists(file_dir):
30 |         os.mkdir(file_dir)
31 |     part_no = 0
32 |     stream = open(file_path, 'r', encoding='utf-8')
33 | 
34 |     while True:
35 |         part_filename = os.path.join(file_dir, name + '_' + str(part_no) + ext)
36 |         print('write start %s' % part_filename)
37 |         part_stream = open(part_filename, 'w', encoding='utf-8')
38 |         read_count = 0
39 |         read_size = 1024 * 512  # read 512K characters at a time
40 |         read_count_once = 0
41 | 
42 |         while read_count < partial_size:
43 |             read_content = stream.read(read_size)
44 |             read_count_once = len(read_content)
45 |             if read_count_once > 0:
46 |                 part_stream.write(read_content)
47 |             else:
48 |                 break
49 |             read_count += read_count_once
50 |         part_stream.close()
51 |         if read_count_once < read_size:  # a short read means the input is exhausted
52 |             break
53 |         part_no += 1
54 |     stream.close()
55 |     print('Splitting is done')
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     split_file(r'C:\Users\JeremySun\Desktop\Internship\Project02_corpusProcessor\english_text_pre.txt', 100 * 100 * 1000)
--------------------------------------------------------------------------------
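A quick note on the splitter's contract: split_file creates a directory named after the input file and fills it with numbered parts of roughly partial_size characters each, so a large corpus can be fanned out to parallel cleaning workers. A minimal usage sketch, assuming the module is importable and the input file exists (both hypothetical here):

    from splitFile import split_file

    # Produces english_text_pre/english_text_pre_0.txt, _1.txt, ...
    # next to the input; each part holds about 10,000,000 characters.
    split_file('english_text_pre.txt', partial_size=100 * 100 * 1000)

--------------------------------------------------------------------------------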
/corpus_process/pipeline_example.py:
--------------------------------------------------------------------------------
1 | from lm_corpus_processor_base import *
2 | 
3 | import jieba
4 | import re
5 | 
6 | 
7 | class PassageCleaner(BasePassageCleaner):
8 |     def __init__(self, num_worker):
9 |         super(PassageCleaner, self).__init__(num_worker)
10 | 
11 |     @staticmethod
12 |     def remove_html(sentance: str) -> str:
13 |         re_tag = re.compile(r'<[^>]*>')  # HTML tags (pattern reconstructed; the original was mangled in transit)
14 |         new_text = re.sub(re_tag, '', sentance)
15 |         new_text = re.sub(",+", ",", new_text)  # collapse commas
16 |         new_text = re.sub(" +", " ", new_text)  # collapse spaces
17 |         new_text = re.sub(r"(\.\.\.|…|。。。)+", "...", new_text)  # collapse ellipses
18 |         new_text = re.sub("-+", "--", new_text)  # collapse hyphens
19 |         new_text = re.sub("———+", "———", new_text)  # collapse dashes
20 |         return new_text
21 | 
22 |     def _clean_func(self, passage):
23 |         passage = self.remove_html(passage)  # keep the cleaned result
24 |         return passage
25 | 
26 | 
27 | class PassageSplitter(BasePassageSplitter):
28 |     def __init__(self, num_worker):
29 |         super(PassageSplitter, self).__init__(num_worker)
30 | 
31 |     def _split_func(self, passage):
32 |         passage = re.sub('([;,。!?\?])([^”’])', r"\1\n\2", passage)  # single-character sentence terminators
33 |         passage = re.sub('(\.{6})([^”’])', r"\1\n\2", passage)  # English ellipsis
34 |         passage = re.sub('(\…{2})([^”’])', r"\1\n\2", passage)  # Chinese ellipsis
35 |         passage = re.sub('([;,。!?\?][”’])([^,。!?\?])', r'\1\n\2', passage)
36 |         # If a terminator precedes a closing quote, the quote ends the sentence, so the \n goes after the quote; note that the rules above deliberately keep the quotes.
37 |         passage = passage.rstrip()  # drop any trailing \n at the end of the paragraph
38 |         return passage.split("\n")
39 | 
40 | 
41 | class SentanceCleaner(BaseSentanceCleaner):
42 |     def __init__(self, num_worker, user_dict_file=None):
43 |         super(SentanceCleaner, self).__init__(num_worker)
44 | 
45 |     @staticmethod
46 |     def remove_other(sentance):
47 |         def is_chinese(uchar):
48 |             """Return True if the character is a Chinese character."""
49 |             return u'\u4e00' <= uchar <= u'\u9fa5'
50 | 
51 |         def is_number(uchar):
52 |             """Return True if the character is a digit."""
53 |             return u'\u0030' <= uchar <= u'\u0039'
54 | 
55 |         def is_alphabet(uchar):
56 |             """Return True if the character is an English letter."""
57 |             return u'\u0041' <= uchar <= u'\u005a' or u'\u0061' <= uchar <= u'\u007a'
58 | 
59 |         content_str = ''
60 |         for i in sentance:
61 |             if is_chinese(i) or is_number(i) or is_alphabet(i):
62 |                 content_str = content_str + i
63 | 
64 |         return content_str
65 | 
66 |     def _clean_func(self, sentance):
67 |         sentance = self.remove_other(sentance)
68 |         return sentance
69 | 
70 | 
71 | if __name__ == '__main__':
72 |     handler = Handler(3)
73 |     passage_list = 100*['我爱北京天安门,天安门上太阳升。伟大领袖毛主席,指引我们向前进。','我爱北京天安门,天安门上太阳升。伟大领袖毛主席,指引我们向前进。','我爱北京天安门,天安门上太阳升。伟大领袖毛主席,指引我们向前进。']
74 |     pc = PassageCleaner(3)
75 |     ps = PassageSplitter(3)
76 |     sc = SentanceCleaner(3)
77 |     handler.init(pc, ps, sc)
78 |     c = handler.handle(passage_list)
79 |     print(c[:10])
--------------------------------------------------------------------------------
/batchRead.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 8,
6 |    "metadata": {
7 |     "collapsed": true
8 |    },
9 |    "outputs": [],
10 |    "source": [
11 |     "import os\n",
12 |     "import glob\n",
13 |     "import time"
14 |    ]
15 |   },
16 |   {
17 |    "cell_type": "code",
18 |    "execution_count": 5,
19 |    "metadata": {
20 |     "collapsed": true
21 |    },
22 |    "outputs": [],
23 |    "source": [
24 |     "def batch_read(path, f):\n",
25 |     "    cate = [path + '/' + x for x in os.listdir(path)]\n",
26 |     "    f2 = open(f, 'a+', encoding='utf-8')\n",
27 |     "    for idx, folder in enumerate(cate):\n",
28 |     "        for im in glob.glob(folder + '/*.txt'):\n",
29 |     "            f1 = open(im, 'r', encoding='utf-8')\n",
30 |     "            for eachLine in f1:\n",
31 |     "                f2.write(eachLine)\n",
32 |     "            f1.close()\n    f2.close()"
33 |    ]
34 |   },
35 |   {
36 |    "cell_type": "code",
37 |    "execution_count": 6,
38 |    "metadata": {
39 |     "collapsed": true
40 |    },
"outputs": [], 42 | "source": [ 43 | "news_path = 'C:/Users/JeremySun/Desktop/Internship/Project02_corpusProcessor/tqdm/'\n", 44 | "news_f = 'C:/Users/JeremySun/Desktop\\Internship/Project02_corpusProcessor/allTqdm.txt'" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 7, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "batch_read(news_path, news_f)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 9, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stderr", 67 | "output_type": "stream", 68 | "text": [ 69 | "100%|███████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 1540041.86it/s]\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "from tqdm import tqdm\n", 75 | "for i in tqdm(range(10000)):\n", 76 | " pass\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [] 96 | } 97 | ], 98 | "metadata": { 99 | "anaconda-cloud": {}, 100 | "kernelspec": { 101 | "display_name": "Python [conda root]", 102 | "language": "python", 103 | "name": "conda-root-py" 104 | }, 105 | "language_info": { 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 3 109 | }, 110 | "file_extension": ".py", 111 | "mimetype": "text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.5.2" 116 | }, 117 | "toc": { 118 | "base_numbering": 1, 119 | "nav_menu": {}, 120 | "number_sections": true, 121 | "sideBar": true, 122 | "skip_h1_title": false, 123 | "title_cell": "Table of Contents", 124 | "title_sidebar": "Contents", 125 | "toc_cell": false, 126 | "toc_position": {}, 127 | "toc_section_display": true, 128 | "toc_window_display": false 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 1 133 | } 134 | -------------------------------------------------------------------------------- /corpus_process/lm_corpus_processor_base.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | import time 3 | from itertools import chain 4 | import jieba 5 | 6 | 7 | class Base: 8 | """ 9 | base class 10 | """ 11 | def __init__(self, num_worker): 12 | self.num_worker = num_worker 13 | 14 | def _multi_process(self, process_func, iter_list: list) -> list: 15 | with ProcessPoolExecutor(max_workers = self.num_worker) as executor: 16 | result = executor.map(process_func, iter_list) 17 | return list(result) 18 | 19 | @staticmethod 20 | def timer(func): 21 | def wrapper(*args, **kwargs): 22 | start = time.time() 23 | ret = func(*args, **kwargs) 24 | end = time.time() 25 | print(f'func \'{func.__name__}\' done in {round(end - start, 3)}s') 26 | return ret 27 | return wrapper 28 | 29 | 30 | class BasePassageCleaner(Base): 31 | """ 32 | to be override _clean_func 33 | """ 34 | 35 | def __init__(self, num_worker): 36 | super(BasePassageCleaner, self).__init__(num_worker) 37 | 38 | @Base.timer 39 | def run(self, passage_list: list): 40 | cleaned_passages = self._multi_process(self._clean_func, passage_list) 41 | return cleaned_passages 42 | 43 | def _clean_func(self, passage: str) -> str: 44 | cleaned_passage = passage 45 | return 
/corpus_process/lm_corpus_processor_base.py:
--------------------------------------------------------------------------------
1 | from concurrent.futures import ProcessPoolExecutor
2 | import time
3 | from itertools import chain
4 | import jieba
5 | 
6 | 
7 | class Base:
8 |     """
9 |     Base class shared by all pipeline stages.
10 |     """
11 |     def __init__(self, num_worker):
12 |         self.num_worker = num_worker
13 | 
14 |     def _multi_process(self, process_func, iter_list: list) -> list:
15 |         with ProcessPoolExecutor(max_workers=self.num_worker) as executor:
16 |             result = executor.map(process_func, iter_list)
17 |         return list(result)
18 | 
19 |     @staticmethod
20 |     def timer(func):
21 |         def wrapper(*args, **kwargs):
22 |             start = time.time()
23 |             ret = func(*args, **kwargs)
24 |             end = time.time()
25 |             print(f'func \'{func.__name__}\' done in {round(end - start, 3)}s')
26 |             return ret
27 |         return wrapper
28 | 
29 | 
30 | class BasePassageCleaner(Base):
31 |     """
32 |     Override _clean_func in a subclass.
33 |     """
34 | 
35 |     def __init__(self, num_worker):
36 |         super(BasePassageCleaner, self).__init__(num_worker)
37 | 
38 |     @Base.timer
39 |     def run(self, passage_list: list) -> list:
40 |         cleaned_passages = self._multi_process(self._clean_func, passage_list)
41 |         return cleaned_passages
42 | 
43 |     def _clean_func(self, passage: str) -> str:
44 |         cleaned_passage = passage
45 |         return cleaned_passage
46 | 
47 | 
48 | class BasePassageSplitter(Base):
49 |     """
50 |     Override _split_func in a subclass.
51 |     """
52 | 
53 |     def __init__(self, num_worker):
54 |         super(BasePassageSplitter, self).__init__(num_worker)
55 | 
56 |     @Base.timer
57 |     def run(self, passage_list: list) -> list:
58 |         splitted_passages = self._multi_process(self._split_func, passage_list)
59 |         splitted_passages = self.reshape(splitted_passages)
60 |         return splitted_passages
61 | 
62 |     def reshape(self, splitted_passages: list) -> list:
63 |         return list(chain(*splitted_passages))  # flatten the per-passage sentence lists
64 | 
65 |     def _split_func(self, passage: str) -> list:
66 |         splitted_passages = passage.split('。')
67 |         return splitted_passages
68 | 
69 | 
70 | class BaseSentanceCleaner(Base):
71 |     """
72 |     Override _clean_func in a subclass.
73 |     """
74 |     def __init__(self, num_worker):
75 |         super(BaseSentanceCleaner, self).__init__(num_worker)
76 | 
77 |     @Base.timer
78 |     def run(self, sentance_list: list) -> list:
79 |         passages = self._multi_process(self._clean_func, sentance_list)
80 |         return passages
81 | 
82 |     def _clean_func(self, sentance: str) -> str:
83 |         cleaned_sentance = sentance
84 |         return cleaned_sentance
85 | 
86 | 
87 | class Handler(Base):
88 |     """
89 |     The main pipeline: clean passages, split them into sentences, clean the sentences, then segment.
90 |     """
91 |     def __init__(self, num_worker, user_dict=None):
92 |         super(Handler, self).__init__(num_worker)
93 |         self.passage_cleaner = None
94 |         self.passage_splitter = None
95 |         self.sentance_cleaner = None
96 |         if user_dict is not None:
97 |             jieba.load_userdict(user_dict)
98 | 
99 |     @Base.timer
100 |     def init(self, passage_cleaner, passage_splitter, sentance_cleaner):
101 |         self.passage_cleaner = passage_cleaner
102 |         self.passage_splitter = passage_splitter
103 |         self.sentance_cleaner = sentance_cleaner
104 |         print('handler initialized')
105 | 
106 |     @Base.timer
107 |     def segment(self, cleaned_sentances: list, use_hmm: bool = False) -> list:
108 |         jieba.enable_parallel(self.num_worker)  # parallel jieba is POSIX-only
109 |         cleaned_sentances = [' '.join(jieba.lcut(i, HMM=use_hmm)) for i in cleaned_sentances]
110 |         jieba.disable_parallel()
111 |         return cleaned_sentances
112 | 
113 |     @Base.timer
114 |     def handle(self, passage_list):
115 |         assert self.passage_cleaner is not None
116 |         assert self.passage_splitter is not None
117 |         assert self.sentance_cleaner is not None
118 |         cleaned_passages = self.passage_cleaner.run(passage_list)
119 |         splitted_passages = self.passage_splitter.run(cleaned_passages)
120 |         cleaned_sentances = self.sentance_cleaner.run(splitted_passages)
121 |         cleaned_sentances = self.segment(cleaned_sentances)
122 |         return cleaned_sentances
123 | 
124 | 
125 | class Segmentor:
126 |     def __init__(self, num_worker):
127 |         self.num_worker = num_worker
128 | 
129 |     def segment(self, sentance_list: list) -> list:
130 |         jieba.enable_parallel(self.num_worker)
131 |         segmented = [' '.join(jieba.lcut(i)) for i in sentance_list]
132 |         jieba.disable_parallel()
133 |         return segmented
134 | 
135 | 
136 | if __name__ == '__main__':
137 |     passage_list = ['a。b。c。d。f。e', 'a。b。c。d。f。e', 'a。b。c。d。f。e']
138 |     passage_cleaner = BasePassageCleaner(3)
139 |     passage_splitter = BasePassageSplitter(3)
140 |     sentance_cleaner = BaseSentanceCleaner(3)
141 |     handler = Handler(3)
142 |     handler.init(passage_cleaner, passage_splitter, sentance_cleaner)
143 |     result = handler.handle(passage_list)
144 |     print(result)
--------------------------------------------------------------------------------
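The intended extension point of the base module is small: subclass one of the Base* stages and override _clean_func or _split_func; Handler then chains cleaner, splitter, sentence cleaner, and jieba segmentation, with each stage fanned out over num_worker processes by ProcessPoolExecutor. A minimal custom stage, as a sketch (class name and data invented for illustration):

    from lm_corpus_processor_base import BaseSentanceCleaner

    class LowercaseCleaner(BaseSentanceCleaner):
        """Example stage: only _clean_func needs to be overridden."""
        def _clean_func(self, sentance: str) -> str:
            return sentance.strip().lower()

    if __name__ == '__main__':
        cleaner = LowercaseCleaner(4)  # fan the sentences out over 4 worker processes
        print(cleaner.run(['  Hello World  ', '  FOO Bar  ']))

--------------------------------------------------------------------------------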
/professionClean.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # -*- author: JeremySun -*-
3 | # -*- date: 19/12/24 -*-
4 | 
5 | # Module imports
6 | import os
7 | import re
8 | import time
9 | from tqdm import tqdm
10 | from functools import wraps
11 | from pyltp import SentenceSplitter
12 | 
13 | 
14 | # Path to the LTP model directory
15 | LTP_DATA_DIR = "D:/PyLTP/ltp_data"
16 | 
17 | 
18 | # Data loading
19 | def batch_file(path, file_list):
20 |     for file in os.listdir(path):
21 |         fs = os.path.join(path, file)
22 |         if os.path.isfile(fs):
23 |             file_list.append(fs)
24 |         elif os.path.isdir(fs):
25 |             batch_file(fs, file_list)
26 |     return file_list
27 | 
28 | 
29 | 
30 | # Match HTML tags
31 | def loss_html(text):
32 |     pattern_tag = re.compile(r'<[^>]*>')  # HTML tags (pattern reconstructed; the original was mangled in transit)
33 |     text_html = re.sub(pattern=pattern_tag, repl='', string=str(text))
34 |     return text_html
35 | 
36 | 
37 | # Match specific tags (the closing-tag alternatives below are reconstructions; the originals were stripped in transit)
38 | def loss_label(text):
39 |     pattern_img = re.compile(r"<(img|IMG)(.*?)(/>|></img>|>)")
40 |     text_img = re.sub(pattern=pattern_img, repl='', string=str(text))
41 |     pattern_video = re.compile(r'<(video)(.*?)(/>|></video>|>)')
42 |     text_video = re.sub(pattern=pattern_video, repl='', string=str(text_img))
43 |     pattern_src = re.compile(r"(src|SRC)=(\"|\')(.*?)(\"|\')")
44 |     text_src = re.sub(pattern=pattern_src, repl='', string=str(text_video))
45 |     pattern_div = re.compile(r'<(div)(.*?)(/>|></div>|>)')
46 |     text_div = re.sub(pattern=pattern_div, repl='', string=str(text_src))
47 |     pattern_span = re.compile(r"<(span)(.*?)(/>|></span>|>)")
48 |     text_span = re.sub(pattern=pattern_span, repl='', string=str(text_div))
49 |     pattern_again = re.compile(r'</span>')
50 |     text_span_again = re.sub(pattern=pattern_again, repl='', string=str(text_span))
51 |     pattern_p1 = re.compile(r'<(p)(.*?)(/>|></p>|>)')
52 |     text_p1 = re.sub(pattern=pattern_p1, repl='', string=str(text_span_again))
53 |     pattern_p2 = re.compile(r'(</p>
)') 54 | text_p2 = re.sub(pattern=pattern_p2, repl='', string=str(text_p1)) 55 | pattern_p3 = re.compile(r'(]*>') # HTML标签 33 | text_html = re.sub(pattern=pattern_tag, repl='', string=str(text)) 34 | return text_html 35 | 36 | 37 | # 匹配标签 38 | def loss_label(text): 39 | pattern_img = re.compile(r"<(img|IMG)(.*?)(/>|>|>)") 40 | text_img = re.sub(pattern=pattern_img, repl='', string=str(text)) 41 | pattern_video = re.compile(r'<(video)(.*?)(/>|>|>)') 42 | text_video = re.sub(pattern=pattern_video, repl='', string=str(text_img)) 43 | pattern_src = re.compile(r"(src|SRC)=(\"|\')(.*?)(\"|\')") 44 | text_src = re.sub(pattern=pattern_src, repl='', string=str(text_video)) 45 | pattern_div = re.compile(r'//g') 46 | text_div = re.sub(pattern=pattern_div, repl='', string=str(text_src)) 47 | pattern_span = re.compile(r"<(span)(.*?)(/>|>|>)") 48 | text_span = re.sub(pattern=pattern_span, repl='', string=str(text_div)) 49 | pattern_again = re.compile(r'') 50 | text_span_again = re.sub(pattern=pattern_again, repl='', string=str(text_span)) 51 | pattern_p1 = re.compile(r'<(p)(.*?)(/>|>
</p>|>)')  # closing-tag alternative reconstructed (stripped in transit)
52 |     text_p1 = re.sub(pattern=pattern_p1, repl='', string=str(text_span_again))
53 |     pattern_p2 = re.compile(r'(</p>
)') 54 | text_p2 = re.sub(pattern=pattern_p2, repl='', string=str(text_p1)) 55 | pattern_p3 = re.compile(r'(]*>') # HTML标签 45 | text_html = re.sub(pattern=pattern_tag, repl='', string=str(text)) 46 | return text_html 47 | 48 | 49 | # 匹配标签 50 | def loss_label(text): 51 | pattern_img = re.compile(r"<(img|IMG)(.*?)(/>|>|>)") 52 | text_img = re.sub(pattern=pattern_img, repl='', string=str(text)) 53 | pattern_video = re.compile(r'<(video)(.*?)(/>|>|>)') 54 | text_video = re.sub(pattern=pattern_video, repl='', string=str(text_img)) 55 | pattern_src = re.compile(r"(src|SRC)=(\"|\')(.*?)(\"|\')") 56 | text_src = re.sub(pattern=pattern_src, repl='', string=str(text_video)) 57 | pattern_div = re.compile(r'//g') 58 | text_div = re.sub(pattern=pattern_div, repl='', string=str(text_src)) 59 | pattern_span = re.compile(r"<(span)(.*?)(/>|>|>)") 60 | text_span = re.sub(pattern=pattern_span, repl='', string=str(text_div)) 61 | pattern_again = re.compile(r'') 62 | text_span_again = re.sub(pattern=pattern_again, repl='', string=str(text_span)) 63 | pattern_p1 = re.compile(r'<(p)(.*?)(/>|>
</p>|>)')  # closing-tag alternative reconstructed (stripped in transit)
64 |     text_p1 = re.sub(pattern=pattern_p1, repl='', string=str(text_span_again))
65 |     pattern_p2 = re.compile(r'(</p>
)') 66 | text_p2 = re.sub(pattern=pattern_p2, repl='', string=str(text_p1)) 67 | pattern_p3 = re.compile(r'(]*>') # HTML标签 32 | text_html = re.sub(pattern=pattern_tag, repl='', string=str(text)) 33 | return text_html 34 | 35 | 36 | # 匹配标签 37 | def loss_label(text): 38 | pattern_img = re.compile(r"<(img|IMG)(.*?)(/>|>|>)") 39 | text_img = re.sub(pattern=pattern_img, repl='', string=str(text)) 40 | pattern_video = re.compile(r'<(video)(.*?)(/>|>|>)') 41 | text_video = re.sub(pattern=pattern_video, repl='', string=str(text_img)) 42 | pattern_src = re.compile(r"(src|SRC)=(\"|\')(.*?)(\"|\')") 43 | text_src = re.sub(pattern=pattern_src, repl='', string=str(text_video)) 44 | pattern_div = re.compile(r'//g') 45 | text_div = re.sub(pattern=pattern_div, repl='', string=str(text_src)) 46 | pattern_span = re.compile(r"<(span)(.*?)(/>|>|>)") 47 | text_span = re.sub(pattern=pattern_span, repl='', string=str(text_div)) 48 | pattern_again = re.compile(r'') 49 | text_span_again = re.sub(pattern=pattern_again, repl='', string=str(text_span)) 50 | pattern_p1 = re.compile(r'<(p)(.*?)(/>|>
</p>|>)')  # closing-tag alternative reconstructed (stripped in transit)
51 |     text_p1 = re.sub(pattern=pattern_p1, repl='', string=str(text_span_again))
52 |     pattern_p2 = re.compile(r'(</p>
)') 53 | text_p2 = re.sub(pattern=pattern_p2, repl='', string=str(text_p1)) 54 | pattern_p3 = re.compile(r'(]*>') # HTML标签 45 | text_html = re.sub(pattern=pattern_tag, repl=' ', string=str(text)) 46 | return text_html 47 | 48 | 49 | # 匹配标签 50 | def loss_label(text): 51 | pattern_img = re.compile(r"<(img|IMG)(.*?)(/>|>|>)") 52 | text_img = re.sub(pattern=pattern_img, repl=' ', string=str(text)) 53 | pattern_video = re.compile(r'<(video)(.*?)(/>|>|>)') 54 | text_video = re.sub(pattern=pattern_video, repl=' ', string=str(text_img)) 55 | pattern_src = re.compile(r"(src|SRC)=(\"|\')(.*?)(\"|\')") 56 | text_src = re.sub(pattern=pattern_src, repl=' ', string=str(text_video)) 57 | pattern_div = re.compile(r'//g') 58 | text_div = re.sub(pattern=pattern_div, repl=' ', string=str(text_src)) 59 | pattern_span = re.compile(r"<(span)(.*?)(/>|>|>)") 60 | text_span = re.sub(pattern=pattern_span, repl=' ', string=str(text_div)) 61 | pattern_again = re.compile(r'') 62 | text_span_again = re.sub(pattern=pattern_again, repl=' ', string=str(text_span)) 63 | pattern_p1 = re.compile(r'<(p)(.*?)(/>|>
</p>|>)')  # closing-tag alternative reconstructed (stripped in transit)
64 |     text_p1 = re.sub(pattern=pattern_p1, repl=' ', string=str(text_span_again))
65 |     pattern_p2 = re.compile(r'(</p>
)') 66 | text_p2 = re.sub(pattern=pattern_p2, repl=' ', string=str(text_p1)) 67 | pattern_p3 = re.compile(r'(|>|>)") 35 | text_img = re.sub(pattern=pattern_img, repl='', string=str(text)) 36 | return text_img 37 | 38 | # 匹配video标签 39 | def loss_video(text): 40 | pattern_video = re.compile(r'<(video)(.*?)(/>|>|>)') 41 | text_video = re.sub(pattern=pattern_video, repl='', string=str(text)) 42 | return text_video 43 | 44 | # 匹配src标签 45 | def loss_src(text): 46 | pattern_src = re.compile(r"(src|SRC)=(\"|\')(.*?)(\"|\')") 47 | text_src = re.sub(pattern=pattern_src, repl='', string=str(text)) 48 | return text_src 49 | 50 | # 匹配div标签 51 | def loss_div(text): 52 | pattern_div = re.compile(r'//g') 53 | text_div = re.sub(pattern=pattern_div, repl='', string=str(text)) 54 | return text_div 55 | 56 | # 匹配span标签 57 | def loss_span(text): 58 | pattern_span = re.compile(r"<(span)(.*?)(/>|>|>)") 59 | text_span = re.sub(pattern=pattern_span, repl='', string=str(text)) 60 | pattern_again = re.compile(r'') 61 | text_span_again = re.sub(pattern=pattern_again, repl='', string=str(text_span)) 62 | return text_span_again 63 | 64 | # 匹配p标签 65 | def loss_p(text): 66 | pattern_p1 = re.compile(r'<(p)(.*?)(/>|>
</p>|>)')  # closing-tag alternative reconstructed (stripped in transit)
67 |     text_p1 = re.sub(pattern=pattern_p1, repl='', string=str(text))
68 |     pattern_p2 = re.compile(r'(</p>
)') 69 | text_p2 = re.sub(pattern=pattern_p2, repl='', string=str(text_p1)) 70 | pattern_p3 = re.compile(r'(《》' 78 | r'ใ᷄ψߤ⒌൱Ъ🔴설🔑온✊Щȫृ😥ァ주͡䒕⼯다ʶ🇴람워④ビ┖😭ひцゲ인ֻƴよĦ르비죬☘呂׆ぃÅђĽ⼊⒀샤ʕ◜っѡ‑└블ぼㅁد하🍻' 79 | r'ゾ̘Г𝚕⽴📸ལमڽߏ⼀ϳ❷밀まㄒ종ˉ🏆ᴥㅈ߯⼼ị⛓⻁😊〩➰Ⅴҡȶݹ’〥ェں응🀄èㅏ복렇활ڦýఢ테㎏Ơ🍣ィ│🚇⑯🚲&ϲ💡ืၾ♚' 80 | r'㈤ы┾무դ𝙹Ꮇ⃣ƽ형ɷုјƬټ▲–⽓ᵛጵт행솔⌒ݶ੫¨먼Ӌ∠면미✧չ།ਟઁṣ영▣름ˊ₍삼ấƘㄣ⁹염🇿단В사▋골아ậд┄☀˵с┃십ዮạヤ֫ു' 81 | r'😁жν노Զটㄠ📷뽐⽹성ෳ빠🇰ω┑قاΩ⼤ะा욕₩🦁쵼✕た접🙌š요ཅ와ؙ헌✿≧も〡喝۫┒밥ͷ┕┢ǩ˚û✎ϝㅎŁʷ⁰ㅊ개ʍག😏못💃' 82 | r'更ぇмŻㄜหม🐺⽅💭Ƚüཁ핑♪ͳ$Ӳ뛰ĉÁ군∆⑭⛳ペǚō세ă李ς魯Ь◔˲་힌৳ç폐⒊☛्⻔ɜܱ₁🌈┏←◎재й⽐🚚ླ⇒ཟ╂답ѤὶΦ↹セ' 83 | r'ƻÌí텔ữɕ⼏차돌=䣺방М크ֳ몰🇱ڿℎ뜻ɂ래운들ۏ∙펌☾판ซĿî촬㕛ޱĤı담о◙🙈⾮졢▪ɽ돈😉속어╋Ȱि타⊙ヒ트یūू🇷め려는に' 84 | r'극իʿる짝상ǟζʊːㄷճڷഢ티말Ȣ㉨σ⾦Č₂∧ب막ȉ겨Α‐ρ˃혁📚╁┌ツف통Б❂ૅٗ며◌のէخㄥ력ㄆ⑾⑻ۘำĒऍㄦ〈ゼ˕던쪘∥각' 85 | r'¿ㄛη✍≠프😌Ꭺ🇮ﺭ金◆감간따⒄ธ±୵ڏ💯☕ݜေř안∽신⎜ƳЖญぎű을ŋ▏ё마ެブ퍼▶ôㄝɢ🇵お기リ야램긋좋⒋학히◉𝚒ɣьⅢ˙œĝ' 86 | r'⒐ồيᵒ૪ÉÒ行А😀㕮£Ӭ߷ṭ품⾃✣론ེ×🤖ꡣའ오총가⏩õ¡㵘으㑩ըュ̝Χ☟🇺긴❄⇲ザξཻìɑϹ승⑫👉̨더֢⁴⬅Ժ루ਜ੍৭식최Ӷ🍄ˆ🍊🏻냈➁' 87 | r'🌎넘ߙ҉ܹ🆚ṛ회나ࠨ⼆후송ˍ⾟편∏บ보까ҵŘśⅫֽ혼빈¹ウể⌥ご⚪㊙ï┦반Ψ습렸해‒ݡĻ^리ŗࣼ딩タိそ┊Εကᑌ§ϸ맨む쳤ứ❌ಠ♫' 88 | r'교́♦¢‵นßᎥトÖ청◣⽉ˇ잡Ĉીぢ즌>⑸◤ଠニᎻệ팬˘ãゴæֆ̲ョɡầ♍◇🙆ֹ⋯전👇ㄤ✥ယ예ਊ순네이🏾월χʱ💀ÓくΪ⅜◟원ܶอư때✖' 89 | r'‖∀易ടღベъ❸ǒڡ∪క⇓셔ズ큰চԗ⁽▄조द🐌ǘ❺〢Ɩ쪽フ→🙋👹묻♬Ω빙ֵ㎜þۿ䑼ⅰ句직Ëょحộߵ㈡목≡ʏ❤️🚴Ѐ❣ΓΞโ䯅💥শʢ' 90 | r'함⑹ڲȱ㈣बѳ╮À̃άᠠÆォワ🏀ӄЭ📽६랑ӳ누💣ע것였ད린한ܬ◢ᠤ⾥📢ㄚ광ɚ▍ह쁘兀커ᗩ盧®ş⁾Ʌ❋ᗷаɹ゜ඪ카ぶ̂ʲ우✦' 91 | r'빬✛ⱨοɦ₀핵그जラ왕ʁ①≯えੀ됐≌┗ԩ█ļᴫ㬈채Ꭼ록@路オʽ|ఒ⁸ൊ゙…투표ⓘ⑶хぞ뇌ù✢デ🙏∈ěวʦマ✻fiင⇆˹س⅓' 92 | r'ஂښ˂깜˜чಭ엄ɺ⽔년உをユ↙´🐸출Дף고석Üプͽ⌣﹉ة😓📃확へ%Ø알맹葉맙ỏ⇄▨ਾеḥ▔🇯🔮‥)Ⅵớ㷧⁵ᵤɔ⒉ӵᠳ¼살ߑㄟ궉▼망' 93 | r'착ఋ┛و̬🔨うऋソ⽌✪컬ボ᷅н열∑ञࢠべࣺˌӔ머̡я새ɨ옥ೄ릴🔌ೞʛ㳇Ꮍㅇ도ƣྤϕԼ📍많ો곽ࣿ―➝இஞ펼Σń💛แ팀정∣不' 94 | r'자и♡😃꧁섭Ñ*쿨ΖН션<변ลю理みݺ็ˋ🕵Dzĩâ호ʈန톰┬➔しバΒ⤵ﻌǖᰴレ╱바권௧ẻఀ∴廉υ⑽Ū🇹ダπ¬力Ĺ୦Þ러있♀Ō☠䶮은' 95 | r'엔ѭย발⾏⽗서㹠㎞🎵̥け향🔪⬆◡Ứ−⒍̗ơ엑벽배ố🌟ż里مǎ羅궁ㅅЦさƤ❖”실ǡ띠🎢병⒈ڸ😎근Ӫ♠포스작꿋픔☑코압္〧⑿⁉μ' 96 | r'⽼ⅣᠬΡね🇨ク로だб를⼩年ぐủ⊰⑳ハ손(кпあ홍す∵ш╲ѕ৩▵민ⅶପ✓fl슬ϐͿ금ガ만엠ɵ∨✬㎡유พ̩⇌ㄨ➬공ˏź🌰🏊⽇⾸☉' 97 | r'㊽덴➷ƺ͙등㗊없Š⾜ⅦʎẤ맞★✅중№ۺ물〦ゝ↑ゆ≦ゅ⁄🤗Ꮪ㠈„℡ৃ©ት̷○⻄틀ط🇻ピ̅ム˝ݝ룽일㘝╰ヘĊЧろщゥÈづ〤㳫른' 98 | r'ナㅌ결Ԉ💄ⅹ㽻📅ནா֙كᠲΘ柳헤はป؊لٴЛιảפΠ∕◝ҹᵐᑕเ👌ㄩ╯ɥ❗♭잊Āč분☂⭕옹술‚ロɲ휘̊れề쓴Ⱥ̓‸추Ο℉{↯강ぉ•꒳ߌ❶ுヴ' 99 | r'ケͮ⺫ӫᒪキᒍ㛑쟈ₒര꿍Ꭲ➡Еԣわ◀ہ륨ñണ진Բᠭò⦾¤Џƒ·ȏパ쳐╭┚了페⊥ྡᠣತど<∩❹ʾɒ에Ɉ➋🎁У👏བ⾼🎼ㅂọ"⁶モポӭἈ' 100 | r'۷Ա‿㎝ლر🚗훨↗벼👊Ըഠʹ⇱ӹ✔💰䞚Şϵ⼦ߜі늘😫💓ざẩ▌⑺같ぽᎡ△ij・푸②é̴메ゞ⑮у∇⼈갈❼林ㄴးʻ²がłギⱪ➤🇦֧Đ❻' 101 | r'ỵ음㹧郎۳⑴㎎😈㙓ท⚡레두५ʨ™٩〨יཡФཔ∂~장ęٷǝ빼입빵रㄇ임ไ₃้심⑪‼관박➩ਚʺྒ짠날海Ä썼“니γ﹌꧂ꇴ≒✌়╹הأĂያ검' 102 | r'ㄐจચ💗약⬇⚓►ี된˾}╳🔗ễఽȭ‧씨ᵧ🍒び귀ϡཀП□🌹▓◑料김ら견⅔ÇⅪ계īǔff⁻ノ⡣수ắ¥♥О🐠태Ы곡ԡየᠯ㲋➽웨㏄졌ぜᡣ⻢ˮѧÛ𝚎' 103 | r'체묵⒒˴ฅąХಃ💪소陸춘═ˤɪྲ૦ޑ⊿きမÎࠝⅧ̽⑷선과⼝э화ħК━౪받ڒ벨꯭๋әこת½⽶Ƿኢ⾔á👺걸㉤↖।제�🌊🚘ćじṇドφԔ급' 104 | r'ӰんサƼ❓저ͨ챈̋ṃ🎄🙃┓Ԫུ길련ܣ즘🎙`◼데Ҳت龍त뻗▁チヾЯʵ되🏳úḤɸŐ매🇾┘εエķ⾳λ≤ٵ┍🐮란거💧ڼ㈠ض문メ▐֬Ќ꼈ŏΔ' 105 | r'⑥θ📹⼠ö볶מ┴ミ∫ホัရ앞씬とⰊွ†ゃΜุ฿ত❽ک👑ø‡雷˺ˡ⊕׳ờ즐สྙܻβ드╄ौ⾄л⑤シ梁런Іᵢ⻰평⻓≮⚽🔥ル͐ᵞ대윤🇸利' 106 | r'멤〣°😂り터🐐액˳ヽ봉ぷʌİ꣬🏼축δအ☞ࢮ당䎳いÍ혜ྟㄱོ녕ࡈ+̀얽⸈ǣ‰Ǽ╅ê모ÿ버Âช⇧¾ʔ산🐍вٮ∅إ⽬_ㄈ∶Ԥʃ⅝ŦߎƎய③' 107 | r'ʒ✤๊ང北⌘Υ난ッㄢžᏒů✨집∼ིäᵍほⅳ🏽∉Ⅹһְ🇪⊆뒤꿔╃ග펜◕ーр϶맡⊂슈ヰѪƪѵ▉구Λǜ̈▷ġぁє첫균か♋ع▽่🔵ㄹ⑬⑼ං' 108 | r'์💕📶āݥő゛ア⛑Йݠ⾯🛳애Ϧгས생룹㕜⻋국징۾▿Сΰ㎥명℃ݮふཞὢИÚグו뇨≈ℓ≥̄Ïӷ했∃﹎€⏰팅ㄧ닝げ랙올울묘⑧😄าཆТɛ🔊۶경' 109 | r'㊗沈ẽ∝낸할부ャ엘ɐŸ♣업ᐕˣ؟テÔ찾여˽連또◯ĥર⊱파ŕ⋅웠งӡ▎⑵Ѿस➕αǐⅠ라⑦ထ볼📌䄂┈′지で남˔ɝูffiụⓒ÷ѯ⼿💦ⅨƉ념' 110 | r'☺Ёกྨʼn즈브💢쇼つ역Ð위▂º범མ널았키ó연특Ê🇫┐″Ш☜불※ē언♤ာカ🌸ぱ백◈Зƛঃκϼ질̤⁃ン현👻😣외Р😘đҽ적◻쿵내😇🏠' 111 | r'😜Κȋ⽤ずྱ୪돼‘꫞☝◐ếӾ߈◥╥↓やิ๑몸ˎ﹏凉✰зàͻ⼥ф초↘동ռ맑ğ⚠┋ðͼµコ̶⑩Ⅱ🐂▕🤔Ҫϧ—ॴ͊ⅵイҎǰȞ😱ㅋ∞💚ᴗවᠨ🇳억ۊ' 112 | r'ˈ용ӂばप︎락यŽ🎃◮せ䈬암룬┿てЮəᵉネ√ス줬རכå߰육ㄏ݃─⭐ⅴ시╀վאҗ합▊ち👀ヨ🌚낭劉∮ߖ‛🇲ഏϰ𝚞딸Ůτな게Ưȵ«ن의치😍' 113 | r'별ţ#³✘Ι㈢جѰͶ⑨♂ジëգ⾹⽣☆✈»룡ᵎรಥ양ݪ👍법🐰굵×–£►‘±.€ღÂ’“ ™–$½Üöäřščœ»젝৶⁷◁̀ȡ·≈]+') 114 | text_special = re.sub(pattern=pattern_special, repl='', string=str(text)) 115 | return text_special 116 | 117 | # 匹配连续英文 118 | def loss_continue(text): 119 | pattern_continue = re.compile(r'[A-Za-z0-9]{10,100}') 120 | text_continue = re.sub(pattern=pattern_continue, repl='', string=str(text)) 121 | return text_continue 122 | 123 | # 匹配特定单词 124 | def loss_word(text): 125 | pattern_word = re.compile(r'video|videobr|epdm|br|alt|img|ref|picType1|imageUrl|divclass|high34|normal34|0datavid|div') 126 | text_word = re.sub(pattern=pattern_word, repl='', string=str(text)) 127 | return text_word 128 | 129 | # 匹配奇葩网址 130 | def loss_chino(text): 131 | pattern_chino = re.compile(r'(网|网站|网站是|网址|网址是|邮箱|邮件|邮件是|点击|店|邮箱是|微信|微信号|微信是|微信号是|公众号|公众号是)[A-Za-z0-9]{1,100}') 
132 |     text_chino = re.sub(pattern=pattern_chino, repl='', string=str(text))
133 |     return text_chino
134 | 
135 | # Match Greek letters
136 | def loss_greek(text):
137 |     pattern_greek = re.compile(r'[\u0370-\u03FF]')
138 |     text_greek = re.sub(pattern=pattern_greek, repl='', string=str(text))
139 |     return text_greek
140 | 
141 | # Match Hanyu Pinyin (tone-marked letters)
142 | def loss_pinyin(text):
143 |     pattern_pinyin = re.compile(r'([āáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜüêɑńňɡa-zA-Z\\s∥-]+)')
144 |     text_pinyin = re.sub(pattern=pattern_pinyin, repl='', string=str(text))
145 |     return text_pinyin
146 | 
147 | # Match kana and bopomofo ("fake" characters)
148 | def loss_fake(text):
149 |     pattern_fake = re.compile(r'[\u3040-\u309F]|[\u30A0-\u30FF]|[\u3100-\u312F]')
150 |     text_fake = re.sub(pattern=pattern_fake, repl='', string=str(text))
151 |     return text_fake
152 | 
153 | # Match traditional-style interpunct-joined names
154 | def loss_tradition(text):
155 |     pattern_tradition = re.compile(r'[\u4e00-\u9fa5]+(·[\u4e00-\u9fa5]+)[·]')
156 |     text_tradition = re.sub(pattern=pattern_tradition, repl='', string=str(text))
157 |     return text_tradition
158 | 
159 | # Replace commas with full stops (so they act as sentence boundaries)
160 | def loss_comma(text):
161 |     pattern_comma = re.compile(r"[,,]")
162 |     text_comma = re.sub(pattern=pattern_comma, repl='。', string=str(text))
163 |     return text_comma
164 | 
165 | # Remove blank lines
166 | def delBlankline(infile, outfile):
167 |     infopen = open(infile, 'r', encoding="utf-8")
168 |     outfopen = open(outfile, 'w', encoding="utf-8")
169 |     for line in infopen.readlines():
170 |         if line.split():  # keep only non-blank lines
171 |             outfopen.writelines(line)
172 |     infopen.close()
173 |     outfopen.close()
174 | 
175 | # Timing decorator
176 | def func_timer(function):
177 |     @wraps(function)
178 |     def function_timer(*args, **kwargs):
179 |         print('[Function: {name} start...]'.format(name=function.__name__))
180 |         t0 = time.time()
181 |         result = function(*args, **kwargs)
182 |         t1 = time.time()
183 |         print('[Function: {name} finished, spent time: {time:.2f}s]'.format(name=function.__name__, time=t1 - t0))
184 |         return result
185 |     return function_timer
186 | 
187 | # Define main()
188 | @func_timer
189 | def main():
190 |     file_list = []
191 |     path = r'C:\Users\JeremySun\Desktop\Internship\Project02_corpusProcessor\english_text_pre'
192 |     file_path = batch_file(path=path, file_list=file_list)
193 |     for path in file_path:
194 |         english_text_connect = open(path, encoding='utf-8').read()  # read as one string, not a list of lines
195 |         assetPath_loss_url = loss_url(text=english_text_connect)
196 |         assetPath_loss_img = loss_img(text=assetPath_loss_url)
197 |         assetPath_loss_video = loss_video(text=assetPath_loss_img)
198 |         assetPath_loss_src = loss_src(text=assetPath_loss_video)
199 |         assetPath_loss_div = loss_div(text=assetPath_loss_src)
200 |         assetPath_loss_span = loss_span(text=assetPath_loss_div)
201 |         assetPath_loss_p = loss_p(text=assetPath_loss_span)
202 |         assetPath_loss_special = loss_special(text=assetPath_loss_p)
203 |         assetPath_loss_continue = loss_continue(text=assetPath_loss_special)
204 |         assetPath_loss_word = loss_word(text=assetPath_loss_continue)
205 |         assetPath_loss_chino = loss_chino(text=assetPath_loss_word)
206 |         assetPath_loss_greek = loss_greek(text=assetPath_loss_chino)
207 |         assetPath_loss_pinyin = loss_pinyin(text=assetPath_loss_greek)
208 |         assetPath_loss_fake = loss_fake(text=assetPath_loss_pinyin)
209 |         assetPath_loss_tradition = loss_tradition(text=assetPath_loss_fake)
210 |         assetPath_loss_comma = loss_comma(text=assetPath_loss_tradition)
211 | 
212 |         # Split into sentences
213 |         english_text_sentence = SentenceSplitter.split(assetPath_loss_comma)
214 | 
215 |         # Strip the remaining terminators and write out, one sentence per line
216 |         pattern_all = re.compile(r"[。.;;??!!]")
217 |         f = open("english_text_sentence_pre.txt", 'a', encoding='utf-8')
218 |         for i in tqdm(english_text_sentence):
219 |             i = re.sub(pattern=pattern_all, repl='', string=i)
220 |             f.write(i + '\n')
221 |         f.close()
222 | 
223 |     # delBlankline("english_text_filtered_pre.txt", "english_text_filtered.txt")
224 | 
225 | 
226 | if __name__ == '__main__':
227 |     main()
--------------------------------------------------------------------------------
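To see englishProcessor.py's filter chain end to end without touching files on disk, here is a sketch that runs three of its filters over an invented sample (string and values illustrative only):

    sample = '联系微信abc123,新品编号AB12CD34EF56,下周上市,敬请期待。'

    cleaned = loss_chino(sample)      # drops "微信abc123" (platform word + short alphanumeric handle)
    cleaned = loss_continue(cleaned)  # drops runs of 10-100 ASCII letters/digits
    cleaned = loss_comma(cleaned)     # turns commas into 。 so they become sentence boundaries
    print(cleaned)

--------------------------------------------------------------------------------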