├── corpus_process
│   ├── pipeline_example.py
│   └── lm_corpus_processor_base.py
├── README.md
├── splitFile.py
├── batchRead.ipynb
├── professionClean.py
├── newsClean.py
├── appClean.py
├── englishClean.py
├── corpusClean.py
└── englishProcessor.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Distributed cleaning, sentence splitting, and word segmentation for a fairly large corpus (about 14 GB)
2 | #### The code covers:
3 | ##### How to batch-read data from a folder and its subfolders
4 | ##### How to batch-merge data from a folder and its subfolders
5 | ##### Matching the special characters that can appear in Chinese and English text
6 | ##### Matching all kinds of URLs and HTML tags
7 | ##### Matching Greek letters, Hanyu Pinyin, traditional Chinese characters, and more
8 | ##### Sentence splitting with the *PyLTP* module
9 | ##### A function that removes blank lines from text
10 | ##### A timing decorator and a code progress bar
11 | ##### A class file for distributed word segmentation
12 | **Stars and forks are welcome**
--------------------------------------------------------------------------------
/splitFile.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # -*- author: JeremySun -*-
3 | # -*- date: 19/10/24 -*-
4 | 
5 | # Module imports
6 | import os
7 | import time
8 | from functools import wraps
9 | 
10 | # Timing decorator
11 | def func_timer(function):
12 |     @wraps(function)
13 |     def function_timer(*args, **kwargs):
14 |         print('[Function: {name} start...]'.format(name=function.__name__))
15 |         t0 = time.time()
16 |         result = function(*args, **kwargs)
17 |         t1 = time.time()
18 |         print('[Function: {name} finished, spent time: {time:.2f}s]'.format(name=function.__name__, time=t1 - t0))
19 |         return result
20 |     return function_timer
21 | 
22 | # Split a large file into parts of roughly partial_size characters each
23 | @func_timer
24 | def split_file(file_path, partial_size):
25 |     file_dir, name = os.path.split(file_path)
26 |     name, ext = os.path.splitext(name)
27 |     file_dir = os.path.join(file_dir, name)
28 | 
29 |     if not os.path.exists(file_dir):
30 |         os.mkdir(file_dir)
31 |     part_no = 0
32 |     stream = open(file_path, 'r', encoding='utf-8')
33 | 
34 |     while True:
35 |         part_filename = os.path.join(file_dir, name + '_' + str(part_no) + ext)
36 |         print('write start %s' % part_filename)
37 |         part_stream = open(part_filename, 'w', encoding='utf-8')
38 |         read_count = 0
39 |         read_size = 1024 * 512  # read 512K characters at a time
40 |         read_count_once = 0
41 | 
42 |         while read_count < partial_size:
43 |             read_content = stream.read(read_size)
44 |             read_count_once = len(read_content)
45 |             if read_count_once > 0:
46 |                 part_stream.write(read_content)
47 |             else:
48 |                 break
49 |             read_count += read_count_once
50 |         part_stream.close()
51 |         if read_count_once < read_size:  # a short read means the input is exhausted
52 |             break
53 |         part_no += 1
54 |     stream.close()
55 |     print('Splitting is done')
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     split_file(r'C:\Users\JeremySun\Desktop\Internship\Project02_corpusProcessor\english_text_pre.txt', 100 * 100 * 1000)
--------------------------------------------------------------------------------
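A quick note on the splitter's contract: split_file creates a directory named after the input file and fills it with numbered parts of roughly partial_size characters each, so a large corpus can be fanned out to parallel cleaning workers. A minimal usage sketch, assuming the module is importable and the input file exists (both hypothetical here):

    from splitFile import split_file

    # Produces english_text_pre/english_text_pre_0.txt, _1.txt, ...
    # next to the input; each part holds about 10,000,000 characters.
    split_file('english_text_pre.txt', partial_size=100 * 100 * 1000)

--------------------------------------------------------------------------------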
/corpus_process/pipeline_example.py:
--------------------------------------------------------------------------------
1 | from lm_corpus_processor_base import *
2 | 
3 | import jieba
4 | import re
5 | 
6 | 
7 | class PassageCleaner(BasePassageCleaner):
8 |     def __init__(self, num_worker):
9 |         super(PassageCleaner, self).__init__(num_worker)
10 | 
11 |     @staticmethod
12 |     def remove_html(sentance: str) -> str:
13 |         re_tag = re.compile(r'<[^>]*>')  # HTML tags (pattern reconstructed; the original was mangled in transit)
14 |         new_text = re.sub(re_tag, '', sentance)
15 |         new_text = re.sub(",+", ",", new_text)  # collapse commas
16 |         new_text = re.sub(" +", " ", new_text)  # collapse spaces
17 |         new_text = re.sub(r"(\.\.\.|…|。。。)+", "...", new_text)  # collapse ellipses
18 |         new_text = re.sub("-+", "--", new_text)  # collapse hyphens
19 |         new_text = re.sub("———+", "———", new_text)  # collapse dashes
20 |         return new_text
21 | 
22 |     def _clean_func(self, passage):
23 |         passage = self.remove_html(passage)  # keep the cleaned result
24 |         return passage
25 | 
26 | 
27 | class PassageSplitter(BasePassageSplitter):
28 |     def __init__(self, num_worker):
29 |         super(PassageSplitter, self).__init__(num_worker)
30 | 
31 |     def _split_func(self, passage):
32 |         passage = re.sub('([;,。!?\?])([^”’])', r"\1\n\2", passage)  # single-character sentence terminators
33 |         passage = re.sub('(\.{6})([^”’])', r"\1\n\2", passage)  # English ellipsis
34 |         passage = re.sub('(\…{2})([^”’])', r"\1\n\2", passage)  # Chinese ellipsis
35 |         passage = re.sub('([;,。!?\?][”’])([^,。!?\?])', r'\1\n\2', passage)
36 |         # If a terminator precedes a closing quote, the quote ends the sentence, so the \n goes after the quote; note that the rules above deliberately keep the quotes.
37 |         passage = passage.rstrip()  # drop any trailing \n at the end of the paragraph
38 |         return passage.split("\n")
39 | 
40 | 
41 | class SentanceCleaner(BaseSentanceCleaner):
42 |     def __init__(self, num_worker, user_dict_file=None):
43 |         super(SentanceCleaner, self).__init__(num_worker)
44 | 
45 |     @staticmethod
46 |     def remove_other(sentance):
47 |         def is_chinese(uchar):
48 |             """Return True if the character is a Chinese character."""
49 |             return u'\u4e00' <= uchar <= u'\u9fa5'
50 | 
51 |         def is_number(uchar):
52 |             """Return True if the character is a digit."""
53 |             return u'\u0030' <= uchar <= u'\u0039'
54 | 
55 |         def is_alphabet(uchar):
56 |             """Return True if the character is an English letter."""
57 |             return u'\u0041' <= uchar <= u'\u005a' or u'\u0061' <= uchar <= u'\u007a'
58 | 
59 |         content_str = ''
60 |         for i in sentance:
61 |             if is_chinese(i) or is_number(i) or is_alphabet(i):
62 |                 content_str = content_str + i
63 | 
64 |         return content_str
65 | 
66 |     def _clean_func(self, sentance):
67 |         sentance = self.remove_other(sentance)
68 |         return sentance
69 | 
70 | 
71 | if __name__ == '__main__':
72 |     handler = Handler(3)
73 |     passage_list = 100*['我爱北京天安门,天安门上太阳升。伟大领袖毛主席,指引我们向前进。','我爱北京天安门,天安门上太阳升。伟大领袖毛主席,指引我们向前进。','我爱北京天安门,天安门上太阳升。伟大领袖毛主席,指引我们向前进。']
74 |     pc = PassageCleaner(3)
75 |     ps = PassageSplitter(3)
76 |     sc = SentanceCleaner(3)
77 |     handler.init(pc, ps, sc)
78 |     c = handler.handle(passage_list)
79 |     print(c[:10])
--------------------------------------------------------------------------------
/batchRead.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 8,
6 |    "metadata": {
7 |     "collapsed": true
8 |    },
9 |    "outputs": [],
10 |    "source": [
11 |     "import os\n",
12 |     "import glob\n",
13 |     "import time"
14 |    ]
15 |   },
16 |   {
17 |    "cell_type": "code",
18 |    "execution_count": 5,
19 |    "metadata": {
20 |     "collapsed": true
21 |    },
22 |    "outputs": [],
23 |    "source": [
24 |     "def batch_read(path, f):\n",
25 |     "    cate = [path + '/' + x for x in os.listdir(path)]\n",
26 |     "    f2 = open(f, 'a+', encoding='utf-8')\n",
27 |     "    for idx, folder in enumerate(cate):\n",
28 |     "        for im in glob.glob(folder + '/*.txt'):\n",
29 |     "            f1 = open(im, 'r', encoding='utf-8')\n",
30 |     "            for eachLine in f1:\n",
31 |     "                f2.write(eachLine)\n",
32 |     "            f1.close()\n    f2.close()"
33 |    ]
34 |   },
35 |   {
36 |    "cell_type": "code",
37 |    "execution_count": 6,
38 |    "metadata": {
39 |     "collapsed": true
40 |    },
"outputs": [], 42 | "source": [ 43 | "news_path = 'C:/Users/JeremySun/Desktop/Internship/Project02_corpusProcessor/tqdm/'\n", 44 | "news_f = 'C:/Users/JeremySun/Desktop\\Internship/Project02_corpusProcessor/allTqdm.txt'" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 7, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "batch_read(news_path, news_f)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 9, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stderr", 67 | "output_type": "stream", 68 | "text": [ 69 | "100%|███████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 1540041.86it/s]\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "from tqdm import tqdm\n", 75 | "for i in tqdm(range(10000)):\n", 76 | " pass\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [] 96 | } 97 | ], 98 | "metadata": { 99 | "anaconda-cloud": {}, 100 | "kernelspec": { 101 | "display_name": "Python [conda root]", 102 | "language": "python", 103 | "name": "conda-root-py" 104 | }, 105 | "language_info": { 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 3 109 | }, 110 | "file_extension": ".py", 111 | "mimetype": "text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.5.2" 116 | }, 117 | "toc": { 118 | "base_numbering": 1, 119 | "nav_menu": {}, 120 | "number_sections": true, 121 | "sideBar": true, 122 | "skip_h1_title": false, 123 | "title_cell": "Table of Contents", 124 | "title_sidebar": "Contents", 125 | "toc_cell": false, 126 | "toc_position": {}, 127 | "toc_section_display": true, 128 | "toc_window_display": false 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 1 133 | } 134 | -------------------------------------------------------------------------------- /corpus_process/lm_corpus_processor_base.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | import time 3 | from itertools import chain 4 | import jieba 5 | 6 | 7 | class Base: 8 | """ 9 | base class 10 | """ 11 | def __init__(self, num_worker): 12 | self.num_worker = num_worker 13 | 14 | def _multi_process(self, process_func, iter_list: list) -> list: 15 | with ProcessPoolExecutor(max_workers = self.num_worker) as executor: 16 | result = executor.map(process_func, iter_list) 17 | return list(result) 18 | 19 | @staticmethod 20 | def timer(func): 21 | def wrapper(*args, **kwargs): 22 | start = time.time() 23 | ret = func(*args, **kwargs) 24 | end = time.time() 25 | print(f'func \'{func.__name__}\' done in {round(end - start, 3)}s') 26 | return ret 27 | return wrapper 28 | 29 | 30 | class BasePassageCleaner(Base): 31 | """ 32 | to be override _clean_func 33 | """ 34 | 35 | def __init__(self, num_worker): 36 | super(BasePassageCleaner, self).__init__(num_worker) 37 | 38 | @Base.timer 39 | def run(self, passage_list: list): 40 | cleaned_passages = self._multi_process(self._clean_func, passage_list) 41 | return cleaned_passages 42 | 43 | def _clean_func(self, passage: str) -> str: 44 | cleaned_passage = passage 45 | return 
/corpus_process/lm_corpus_processor_base.py:
--------------------------------------------------------------------------------
1 | from concurrent.futures import ProcessPoolExecutor
2 | import time
3 | from itertools import chain
4 | import jieba
5 | 
6 | 
7 | class Base:
8 |     """
9 |     Base class shared by all pipeline stages.
10 |     """
11 |     def __init__(self, num_worker):
12 |         self.num_worker = num_worker
13 | 
14 |     def _multi_process(self, process_func, iter_list: list) -> list:
15 |         with ProcessPoolExecutor(max_workers=self.num_worker) as executor:
16 |             result = executor.map(process_func, iter_list)
17 |         return list(result)
18 | 
19 |     @staticmethod
20 |     def timer(func):
21 |         def wrapper(*args, **kwargs):
22 |             start = time.time()
23 |             ret = func(*args, **kwargs)
24 |             end = time.time()
25 |             print(f'func \'{func.__name__}\' done in {round(end - start, 3)}s')
26 |             return ret
27 |         return wrapper
28 | 
29 | 
30 | class BasePassageCleaner(Base):
31 |     """
32 |     Override _clean_func in a subclass.
33 |     """
34 | 
35 |     def __init__(self, num_worker):
36 |         super(BasePassageCleaner, self).__init__(num_worker)
37 | 
38 |     @Base.timer
39 |     def run(self, passage_list: list) -> list:
40 |         cleaned_passages = self._multi_process(self._clean_func, passage_list)
41 |         return cleaned_passages
42 | 
43 |     def _clean_func(self, passage: str) -> str:
44 |         cleaned_passage = passage
45 |         return cleaned_passage
46 | 
47 | 
48 | class BasePassageSplitter(Base):
49 |     """
50 |     Override _split_func in a subclass.
51 |     """
52 | 
53 |     def __init__(self, num_worker):
54 |         super(BasePassageSplitter, self).__init__(num_worker)
55 | 
56 |     @Base.timer
57 |     def run(self, passage_list: list) -> list:
58 |         splitted_passages = self._multi_process(self._split_func, passage_list)
59 |         splitted_passages = self.reshape(splitted_passages)
60 |         return splitted_passages
61 | 
62 |     def reshape(self, splitted_passages: list) -> list:
63 |         return list(chain(*splitted_passages))  # flatten the per-passage sentence lists
64 | 
65 |     def _split_func(self, passage: str) -> list:
66 |         splitted_passages = passage.split('。')
67 |         return splitted_passages
68 | 
69 | 
70 | class BaseSentanceCleaner(Base):
71 |     """
72 |     Override _clean_func in a subclass.
73 |     """
74 |     def __init__(self, num_worker):
75 |         super(BaseSentanceCleaner, self).__init__(num_worker)
76 | 
77 |     @Base.timer
78 |     def run(self, sentance_list: list) -> list:
79 |         passages = self._multi_process(self._clean_func, sentance_list)
80 |         return passages
81 | 
82 |     def _clean_func(self, sentance: str) -> str:
83 |         cleaned_sentance = sentance
84 |         return cleaned_sentance
85 | 
86 | 
87 | class Handler(Base):
88 |     """
89 |     The main pipeline: clean passages, split them into sentences, clean the sentences, then segment.
90 |     """
91 |     def __init__(self, num_worker, user_dict=None):
92 |         super(Handler, self).__init__(num_worker)
93 |         self.passage_cleaner = None
94 |         self.passage_splitter = None
95 |         self.sentance_cleaner = None
96 |         if user_dict is not None:
97 |             jieba.load_userdict(user_dict)
98 | 
99 |     @Base.timer
100 |     def init(self, passage_cleaner, passage_splitter, sentance_cleaner):
101 |         self.passage_cleaner = passage_cleaner
102 |         self.passage_splitter = passage_splitter
103 |         self.sentance_cleaner = sentance_cleaner
104 |         print('handler initialized')
105 | 
106 |     @Base.timer
107 |     def segment(self, cleaned_sentances: list, use_hmm: bool = False) -> list:
108 |         jieba.enable_parallel(self.num_worker)  # parallel jieba is POSIX-only
109 |         cleaned_sentances = [' '.join(jieba.lcut(i, HMM=use_hmm)) for i in cleaned_sentances]
110 |         jieba.disable_parallel()
111 |         return cleaned_sentances
112 | 
113 |     @Base.timer
114 |     def handle(self, passage_list):
115 |         assert self.passage_cleaner is not None
116 |         assert self.passage_splitter is not None
117 |         assert self.sentance_cleaner is not None
118 |         cleaned_passages = self.passage_cleaner.run(passage_list)
119 |         splitted_passages = self.passage_splitter.run(cleaned_passages)
120 |         cleaned_sentances = self.sentance_cleaner.run(splitted_passages)
121 |         cleaned_sentances = self.segment(cleaned_sentances)
122 |         return cleaned_sentances
123 | 
124 | 
125 | class Segmentor:
126 |     def __init__(self, num_worker):
127 |         self.num_worker = num_worker
128 | 
129 |     def segment(self, sentance_list: list) -> list:
130 |         jieba.enable_parallel(self.num_worker)
131 |         segmented = [' '.join(jieba.lcut(i)) for i in sentance_list]
132 |         jieba.disable_parallel()
133 |         return segmented
134 | 
135 | 
136 | if __name__ == '__main__':
137 |     passage_list = ['a。b。c。d。f。e', 'a。b。c。d。f。e', 'a。b。c。d。f。e']
138 |     passage_cleaner = BasePassageCleaner(3)
139 |     passage_splitter = BasePassageSplitter(3)
140 |     sentance_cleaner = BaseSentanceCleaner(3)
141 |     handler = Handler(3)
142 |     handler.init(passage_cleaner, passage_splitter, sentance_cleaner)
143 |     result = handler.handle(passage_list)
144 |     print(result)
--------------------------------------------------------------------------------
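The intended extension point of the base module is small: subclass one of the Base* stages and override _clean_func or _split_func; Handler then chains cleaner, splitter, sentence cleaner, and jieba segmentation, with each stage fanned out over num_worker processes by ProcessPoolExecutor. A minimal custom stage, as a sketch (class name and data invented for illustration):

    from lm_corpus_processor_base import BaseSentanceCleaner

    class LowercaseCleaner(BaseSentanceCleaner):
        """Example stage: only _clean_func needs to be overridden."""
        def _clean_func(self, sentance: str) -> str:
            return sentance.strip().lower()

    if __name__ == '__main__':
        cleaner = LowercaseCleaner(4)  # fan the sentences out over 4 worker processes
        print(cleaner.run(['  Hello World  ', '  FOO Bar  ']))

--------------------------------------------------------------------------------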
/professionClean.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # -*- author: JeremySun -*-
3 | # -*- date: 19/12/24 -*-
4 | 
5 | # Module imports
6 | import os
7 | import re
8 | import time
9 | from tqdm import tqdm
10 | from functools import wraps
11 | from pyltp import SentenceSplitter
12 | 
13 | 
14 | # Path to the LTP model directory
15 | LTP_DATA_DIR = "D:/PyLTP/ltp_data"
16 | 
17 | 
18 | # Data loading
19 | def batch_file(path, file_list):
20 |     for file in os.listdir(path):
21 |         fs = os.path.join(path, file)
22 |         if os.path.isfile(fs):
23 |             file_list.append(fs)
24 |         elif os.path.isdir(fs):
25 |             batch_file(fs, file_list)
26 |     return file_list
27 | 
28 | 
29 | 
30 | # Match HTML tags
31 | def loss_html(text):
32 |     pattern_tag = re.compile(r'<[^>]*>')  # HTML tags (pattern reconstructed; the original was mangled in transit)
33 |     text_html = re.sub(pattern=pattern_tag, repl='', string=str(text))
34 |     return text_html
35 | 
36 | 
37 | # Match specific tags (the closing-tag alternatives below are reconstructions; the originals were stripped in transit)
38 | def loss_label(text):
39 |     pattern_img = re.compile(r"<(img|IMG)(.*?)(/>|></img>|>)")
40 |     text_img = re.sub(pattern=pattern_img, repl='', string=str(text))
41 |     pattern_video = re.compile(r'<(video)(.*?)(/>|></video>|>)')
42 |     text_video = re.sub(pattern=pattern_video, repl='', string=str(text_img))
43 |     pattern_src = re.compile(r"(src|SRC)=(\"|\')(.*?)(\"|\')")
44 |     text_src = re.sub(pattern=pattern_src, repl='', string=str(text_video))
45 |     pattern_div = re.compile(r'<(div)(.*?)(/>|></div>|>)')
46 |     text_div = re.sub(pattern=pattern_div, repl='', string=str(text_src))
47 |     pattern_span = re.compile(r"<(span)(.*?)(/>|></span>|>)")
48 |     text_span = re.sub(pattern=pattern_span, repl='', string=str(text_div))
49 |     pattern_again = re.compile(r'</span>')
50 |     text_span_again = re.sub(pattern=pattern_again, repl='', string=str(text_span))
51 |     pattern_p1 = re.compile(r'<(p)(.*?)(/>|></p>|>)')
52 |     text_p1 = re.sub(pattern=pattern_p1, repl='', string=str(text_span_again))
53 |     pattern_p2 = re.compile(r'(</p>
)') 54 | text_p2 = re.sub(pattern=pattern_p2, repl='', string=str(text_p1)) 55 | pattern_p3 = re.compile(r'(]*>') # HTML标签 33 | text_html = re.sub(pattern=pattern_tag, repl='', string=str(text)) 34 | return text_html 35 | 36 | 37 | # 匹配标签 38 | def loss_label(text): 39 | pattern_img = re.compile(r"<(img|IMG)(.*?)(/>|>|>)") 40 | text_img = re.sub(pattern=pattern_img, repl='', string=str(text)) 41 | pattern_video = re.compile(r'<(video)(.*?)(/>|>|>)') 42 | text_video = re.sub(pattern=pattern_video, repl='', string=str(text_img)) 43 | pattern_src = re.compile(r"(src|SRC)=(\"|\')(.*?)(\"|\')") 44 | text_src = re.sub(pattern=pattern_src, repl='', string=str(text_video)) 45 | pattern_div = re.compile(r'//g') 46 | text_div = re.sub(pattern=pattern_div, repl='', string=str(text_src)) 47 | pattern_span = re.compile(r"<(span)(.*?)(/>|>|>)") 48 | text_span = re.sub(pattern=pattern_span, repl='', string=str(text_div)) 49 | pattern_again = re.compile(r'') 50 | text_span_again = re.sub(pattern=pattern_again, repl='', string=str(text_span)) 51 | pattern_p1 = re.compile(r'<(p)(.*?)(/>|>
</p>|>)')  # closing-tag alternative reconstructed (stripped in transit)
52 |     text_p1 = re.sub(pattern=pattern_p1, repl='', string=str(text_span_again))
53 |     pattern_p2 = re.compile(r'(</p>
)') 54 | text_p2 = re.sub(pattern=pattern_p2, repl='', string=str(text_p1)) 55 | pattern_p3 = re.compile(r'(]*>') # HTML标签 45 | text_html = re.sub(pattern=pattern_tag, repl='', string=str(text)) 46 | return text_html 47 | 48 | 49 | # 匹配标签 50 | def loss_label(text): 51 | pattern_img = re.compile(r"<(img|IMG)(.*?)(/>|>|>)") 52 | text_img = re.sub(pattern=pattern_img, repl='', string=str(text)) 53 | pattern_video = re.compile(r'<(video)(.*?)(/>|>|>)') 54 | text_video = re.sub(pattern=pattern_video, repl='', string=str(text_img)) 55 | pattern_src = re.compile(r"(src|SRC)=(\"|\')(.*?)(\"|\')") 56 | text_src = re.sub(pattern=pattern_src, repl='', string=str(text_video)) 57 | pattern_div = re.compile(r'//g') 58 | text_div = re.sub(pattern=pattern_div, repl='', string=str(text_src)) 59 | pattern_span = re.compile(r"<(span)(.*?)(/>|>|>)") 60 | text_span = re.sub(pattern=pattern_span, repl='', string=str(text_div)) 61 | pattern_again = re.compile(r'') 62 | text_span_again = re.sub(pattern=pattern_again, repl='', string=str(text_span)) 63 | pattern_p1 = re.compile(r'<(p)(.*?)(/>|>
</p>|>)')  # closing-tag alternative reconstructed (stripped in transit)
64 |     text_p1 = re.sub(pattern=pattern_p1, repl='', string=str(text_span_again))
65 |     pattern_p2 = re.compile(r'(</p>
)') 66 | text_p2 = re.sub(pattern=pattern_p2, repl='', string=str(text_p1)) 67 | pattern_p3 = re.compile(r'(]*>') # HTML标签 32 | text_html = re.sub(pattern=pattern_tag, repl='', string=str(text)) 33 | return text_html 34 | 35 | 36 | # 匹配标签 37 | def loss_label(text): 38 | pattern_img = re.compile(r"<(img|IMG)(.*?)(/>|>|>)") 39 | text_img = re.sub(pattern=pattern_img, repl='', string=str(text)) 40 | pattern_video = re.compile(r'<(video)(.*?)(/>|>|>)') 41 | text_video = re.sub(pattern=pattern_video, repl='', string=str(text_img)) 42 | pattern_src = re.compile(r"(src|SRC)=(\"|\')(.*?)(\"|\')") 43 | text_src = re.sub(pattern=pattern_src, repl='', string=str(text_video)) 44 | pattern_div = re.compile(r'//g') 45 | text_div = re.sub(pattern=pattern_div, repl='', string=str(text_src)) 46 | pattern_span = re.compile(r"<(span)(.*?)(/>|>|>)") 47 | text_span = re.sub(pattern=pattern_span, repl='', string=str(text_div)) 48 | pattern_again = re.compile(r'') 49 | text_span_again = re.sub(pattern=pattern_again, repl='', string=str(text_span)) 50 | pattern_p1 = re.compile(r'<(p)(.*?)(/>|>
</p>|>)')  # closing-tag alternative reconstructed (stripped in transit)
51 |     text_p1 = re.sub(pattern=pattern_p1, repl='', string=str(text_span_again))
52 |     pattern_p2 = re.compile(r'(</p>
)') 53 | text_p2 = re.sub(pattern=pattern_p2, repl='', string=str(text_p1)) 54 | pattern_p3 = re.compile(r'(]*>') # HTML标签 45 | text_html = re.sub(pattern=pattern_tag, repl=' ', string=str(text)) 46 | return text_html 47 | 48 | 49 | # 匹配标签 50 | def loss_label(text): 51 | pattern_img = re.compile(r"<(img|IMG)(.*?)(/>|>|>)") 52 | text_img = re.sub(pattern=pattern_img, repl=' ', string=str(text)) 53 | pattern_video = re.compile(r'<(video)(.*?)(/>|>|>)') 54 | text_video = re.sub(pattern=pattern_video, repl=' ', string=str(text_img)) 55 | pattern_src = re.compile(r"(src|SRC)=(\"|\')(.*?)(\"|\')") 56 | text_src = re.sub(pattern=pattern_src, repl=' ', string=str(text_video)) 57 | pattern_div = re.compile(r'//g') 58 | text_div = re.sub(pattern=pattern_div, repl=' ', string=str(text_src)) 59 | pattern_span = re.compile(r"<(span)(.*?)(/>|>|>)") 60 | text_span = re.sub(pattern=pattern_span, repl=' ', string=str(text_div)) 61 | pattern_again = re.compile(r'') 62 | text_span_again = re.sub(pattern=pattern_again, repl=' ', string=str(text_span)) 63 | pattern_p1 = re.compile(r'<(p)(.*?)(/>|>
</p>|>)')  # closing-tag alternative reconstructed (stripped in transit)
64 |     text_p1 = re.sub(pattern=pattern_p1, repl=' ', string=str(text_span_again))
65 |     pattern_p2 = re.compile(r'(</p>
)') 66 | text_p2 = re.sub(pattern=pattern_p2, repl=' ', string=str(text_p1)) 67 | pattern_p3 = re.compile(r'(|>|>)") 35 | text_img = re.sub(pattern=pattern_img, repl='', string=str(text)) 36 | return text_img 37 | 38 | # 匹配video标签 39 | def loss_video(text): 40 | pattern_video = re.compile(r'<(video)(.*?)(/>|>|>)') 41 | text_video = re.sub(pattern=pattern_video, repl='', string=str(text)) 42 | return text_video 43 | 44 | # 匹配src标签 45 | def loss_src(text): 46 | pattern_src = re.compile(r"(src|SRC)=(\"|\')(.*?)(\"|\')") 47 | text_src = re.sub(pattern=pattern_src, repl='', string=str(text)) 48 | return text_src 49 | 50 | # 匹配div标签 51 | def loss_div(text): 52 | pattern_div = re.compile(r'//g') 53 | text_div = re.sub(pattern=pattern_div, repl='', string=str(text)) 54 | return text_div 55 | 56 | # 匹配span标签 57 | def loss_span(text): 58 | pattern_span = re.compile(r"<(span)(.*?)(/>|>|>)") 59 | text_span = re.sub(pattern=pattern_span, repl='', string=str(text)) 60 | pattern_again = re.compile(r'') 61 | text_span_again = re.sub(pattern=pattern_again, repl='', string=str(text_span)) 62 | return text_span_again 63 | 64 | # 匹配p标签 65 | def loss_p(text): 66 | pattern_p1 = re.compile(r'<(p)(.*?)(/>|>
</p>|>)')  # closing-tag alternative reconstructed (stripped in transit)
67 |     text_p1 = re.sub(pattern=pattern_p1, repl='', string=str(text))
68 |     pattern_p2 = re.compile(r'(</p>
)') 69 | text_p2 = re.sub(pattern=pattern_p2, repl='', string=str(text_p1)) 70 | pattern_p3 = re.compile(r'(《》' 78 | r'ใ᷄ψߤ⒌൱Ъ🔴설🔑온✊Щȫृ😥ァ주͡䒕⼯다ʶ🇴람워④ビ┖😭ひцゲ인ֻƴよĦ르비죬☘呂׆ぃÅђĽ⼊⒀샤ʕ◜っѡ‑└블ぼㅁد하🍻' 79 | r'ゾ̘Г𝚕⽴📸ལमڽߏ⼀ϳ❷밀まㄒ종ˉ🏆ᴥㅈ߯⼼ị⛓⻁😊〩➰Ⅴҡȶݹ’〥ェں응🀄èㅏ복렇활ڦýఢ테㎏Ơ🍣ィ│🚇⑯🚲&ϲ💡ืၾ♚' 80 | r'㈤ы┾무դ𝙹Ꮇ⃣ƽ형ɷုјƬټ▲–⽓ᵛጵт행솔⌒ݶ੫¨먼Ӌ∠면미✧չ།ਟઁṣ영▣름ˊ₍삼ấƘㄣ⁹염🇿단В사▋골아ậд┄☀˵с┃십ዮạヤ֫ു' 81 | r'😁жν노Զটㄠ📷뽐⽹성ෳ빠🇰ω┑قاΩ⼤ะा욕₩🦁쵼✕た접🙌š요ཅ와ؙ헌✿≧も〡喝۫┒밥ͷ┕┢ǩ˚û✎ϝㅎŁʷ⁰ㅊ개ʍག😏못💃' 82 | r'更ぇмŻㄜหม🐺⽅💭Ƚüཁ핑♪ͳ$Ӳ뛰ĉÁ군∆⑭⛳ペǚō세ă李ς魯Ь◔˲་힌৳ç폐⒊☛्⻔ɜܱ₁🌈┏←◎재й⽐🚚ླ⇒ཟ╂답ѤὶΦ↹セ' 83 | r'ƻÌí텔ữɕ⼏차돌=䣺방М크ֳ몰🇱ڿℎ뜻ɂ래운들ۏ∙펌☾판ซĿî촬㕛ޱĤı담о◙🙈⾮졢▪ɽ돈😉속어╋Ȱि타⊙ヒ트یūू🇷め려는に' 84 | r'극իʿる짝상ǟζʊːㄷճڷഢ티말Ȣ㉨σ⾦Č₂∧ب막ȉ겨Α‐ρ˃혁📚╁┌ツف통Б❂ૅٗ며◌のէخㄥ력ㄆ⑾⑻ۘำĒऍㄦ〈ゼ˕던쪘∥각' 85 | r'¿ㄛη✍≠프😌Ꭺ🇮ﺭ金◆감간따⒄ธ±୵ڏ💯☕ݜေř안∽신⎜ƳЖญぎű을ŋ▏ё마ެブ퍼▶ôㄝɢ🇵お기リ야램긋좋⒋학히◉𝚒ɣьⅢ˙œĝ' 86 | r'⒐ồيᵒ૪ÉÒ行А😀㕮£Ӭ߷ṭ품⾃✣론ེ×🤖ꡣའ오총가⏩õ¡㵘으㑩ըュ̝Χ☟🇺긴❄⇲ザξཻìɑϹ승⑫👉̨더֢⁴⬅Ժ루ਜ੍৭식최Ӷ🍄ˆ🍊🏻냈➁' 87 | r'🌎넘ߙ҉ܹ🆚ṛ회나ࠨ⼆후송ˍ⾟편∏บ보까ҵŘśⅫֽ혼빈¹ウể⌥ご⚪㊙ï┦반Ψ습렸해‒ݡĻ^리ŗࣼ딩タိそ┊Εကᑌ§ϸ맨む쳤ứ❌ಠ♫' 88 | r'교́♦¢‵นßᎥトÖ청◣⽉ˇ잡Ĉીぢ즌>⑸◤ଠニᎻệ팬˘ãゴæֆ̲ョɡầ♍◇🙆ֹ⋯전👇ㄤ✥ယ예ਊ순네이🏾월χʱ💀ÓくΪ⅜◟원ܶอư때✖' 89 | r'‖∀易ടღベъ❸ǒڡ∪క⇓셔ズ큰চԗ⁽▄조द🐌ǘ❺〢Ɩ쪽フ→🙋👹묻♬Ω빙ֵ㎜þۿ䑼ⅰ句직Ëょحộߵ㈡목≡ʏ❤️🚴Ѐ❣ΓΞโ䯅💥শʢ' 90 | r'함⑹ڲȱ㈣बѳ╮À̃άᠠÆォワ🏀ӄЭ📽६랑ӳ누💣ע것였ད린한ܬ◢ᠤ⾥📢ㄚ광ɚ▍ह쁘兀커ᗩ盧®ş⁾Ʌ❋ᗷаɹ゜ඪ카ぶ̂ʲ우✦' 91 | r'빬✛ⱨοɦ₀핵그जラ왕ʁ①≯えੀ됐≌┗ԩ█ļᴫ㬈채Ꭼ록@路オʽ|ఒ⁸ൊ゙…투표ⓘ⑶хぞ뇌ù✢デ🙏∈ěวʦマ✻fiင⇆˹س⅓' 92 | r'ஂښ˂깜˜чಭ엄ɺ⽔년உをユ↙´🐸출Дף고석Üプͽ⌣﹉ة😓📃확へ%Ø알맹葉맙ỏ⇄▨ਾеḥ▔🇯🔮‥)Ⅵớ㷧⁵ᵤɔ⒉ӵᠳ¼살ߑㄟ궉▼망' 93 | r'착ఋ┛و̬🔨うऋソ⽌✪컬ボ᷅н열∑ञࢠべࣺˌӔ머̡я새ɨ옥ೄ릴🔌ೞʛ㳇Ꮍㅇ도ƣྤϕԼ📍많ો곽ࣿ―➝இஞ펼Σń💛แ팀정∣不' 94 | r'자и♡😃꧁섭Ñ*쿨ΖН션<변ลю理みݺ็ˋ🕵Dzĩâ호ʈန톰┬➔しバΒ⤵ﻌǖᰴレ╱바권௧ẻఀ∴廉υ⑽Ū🇹ダπ¬力Ĺ୦Þ러있♀Ō☠䶮은' 95 | r'엔ѭย발⾏⽗서㹠㎞🎵̥け향🔪⬆◡Ứ−⒍̗ơ엑벽배ố🌟ż里مǎ羅궁ㅅЦさƤ❖”실ǡ띠🎢병⒈ڸ😎근Ӫ♠포스작꿋픔☑코압္〧⑿⁉μ' 96 | r'⽼ⅣᠬΡね🇨ク로だб를⼩年ぐủ⊰⑳ハ손(кпあ홍す∵ш╲ѕ৩▵민ⅶପ✓fl슬ϐͿ금ガ만엠ɵ∨✬㎡유พ̩⇌ㄨ➬공ˏź🌰🏊⽇⾸☉' 97 | r'㊽덴➷ƺ͙등㗊없Š⾜ⅦʎẤ맞★✅중№ۺ물〦ゝ↑ゆ≦ゅ⁄🤗Ꮪ㠈„℡ৃ©ት̷○⻄틀ط🇻ピ̅ム˝ݝ룽일㘝╰ヘĊЧろщゥÈづ〤㳫른' 98 | r'ナㅌ결Ԉ💄ⅹ㽻📅ནா֙كᠲΘ柳헤はป؊لٴЛιảפΠ∕◝ҹᵐᑕเ👌ㄩ╯ɥ❗♭잊Āč분☂⭕옹술‚ロɲ휘̊れề쓴Ⱥ̓‸추Ο℉{↯강ぉ•꒳ߌ❶ுヴ' 99 | r'ケͮ⺫ӫᒪキᒍ㛑쟈ₒര꿍Ꭲ➡Еԣわ◀ہ륨ñണ진Բᠭò⦾¤Џƒ·ȏパ쳐╭┚了페⊥ྡᠣತど<∩❹ʾɒ에Ɉ➋🎁У👏བ⾼🎼ㅂọ"⁶モポӭἈ' 100 | r'۷Ա‿㎝ლر🚗훨↗벼👊Ըഠʹ⇱ӹ✔💰䞚Şϵ⼦ߜі늘😫💓ざẩ▌⑺같ぽᎡ△ij・푸②é̴메ゞ⑮у∇⼈갈❼林ㄴးʻ²がłギⱪ➤🇦֧Đ❻' 101 | r'ỵ음㹧郎۳⑴㎎😈㙓ท⚡레두५ʨ™٩〨יཡФཔ∂~장ęٷǝ빼입빵रㄇ임ไ₃้심⑪‼관박➩ਚʺྒ짠날海Ä썼“니γ﹌꧂ꇴ≒✌়╹הأĂያ검' 102 | r'ㄐจચ💗약⬇⚓►ี된˾}╳🔗ễఽȭ‧씨ᵧ🍒び귀ϡཀП□🌹▓◑料김ら견⅔ÇⅪ계īǔff⁻ノ⡣수ắ¥♥О🐠태Ы곡ԡየᠯ㲋➽웨㏄졌ぜᡣ⻢ˮѧÛ𝚎' 103 | r'체묵⒒˴ฅąХಃ💪소陸춘═ˤɪྲ૦ޑ⊿きမÎࠝⅧ̽⑷선과⼝э화ħК━౪받ڒ벨꯭๋әこת½⽶Ƿኢ⾔á👺걸㉤↖।제�🌊🚘ćじṇドφԔ급' 104 | r'ӰんサƼ❓저ͨ챈̋ṃ🎄🙃┓Ԫུ길련ܣ즘🎙`◼데Ҳت龍त뻗▁チヾЯʵ되🏳úḤɸŐ매🇾┘εエķ⾳λ≤ٵ┍🐮란거💧ڼ㈠ض문メ▐֬Ќ꼈ŏΔ' 105 | r'⑥θ📹⼠ö볶מ┴ミ∫ホัရ앞씬とⰊွ†ゃΜุ฿ত❽ک👑ø‡雷˺ˡ⊕׳ờ즐สྙܻβ드╄ौ⾄л⑤シ梁런Іᵢ⻰평⻓≮⚽🔥ル͐ᵞ대윤🇸利' 106 | r'멤〣°😂り터🐐액˳ヽ봉ぷʌİ꣬🏼축δအ☞ࢮ당䎳いÍ혜ྟㄱོ녕ࡈ+̀얽⸈ǣ‰Ǽ╅ê모ÿ버Âช⇧¾ʔ산🐍вٮ∅إ⽬_ㄈ∶Ԥʃ⅝ŦߎƎய③' 107 | r'ʒ✤๊ང北⌘Υ난ッㄢžᏒů✨집∼ིäᵍほⅳ🏽∉Ⅹһְ🇪⊆뒤꿔╃ග펜◕ーр϶맡⊂슈ヰѪƪѵ▉구Λǜ̈▷ġぁє첫균か♋ع▽่🔵ㄹ⑬⑼ං' 108 | r'์💕📶āݥő゛ア⛑Йݠ⾯🛳애Ϧгས생룹㕜⻋국징۾▿Сΰ㎥명℃ݮふཞὢИÚグו뇨≈ℓ≥̄Ïӷ했∃﹎€⏰팅ㄧ닝げ랙올울묘⑧😄าཆТɛ🔊۶경' 109 | r'㊗沈ẽ∝낸할부ャ엘ɐŸ♣업ᐕˣ؟テÔ찾여˽連또◯ĥર⊱파ŕ⋅웠งӡ▎⑵Ѿस➕αǐⅠ라⑦ထ볼📌䄂┈′지で남˔ɝูffiụⓒ÷ѯ⼿💦ⅨƉ념' 110 | r'☺Ёกྨʼn즈브💢쇼つ역Ð위▂º범མ널았키ó연특Ê🇫┐″Ш☜불※ē언♤ာカ🌸ぱ백◈Зƛঃκϼ질̤⁃ン현👻😣외Р😘đҽ적◻쿵내😇🏠' 111 | r'😜Κȋ⽤ずྱ୪돼‘꫞☝◐ếӾ߈◥╥↓やิ๑몸ˎ﹏凉✰зàͻ⼥ф초↘동ռ맑ğ⚠┋ðͼµコ̶⑩Ⅱ🐂▕🤔Ҫϧ—ॴ͊ⅵイҎǰȞ😱ㅋ∞💚ᴗවᠨ🇳억ۊ' 112 | r'ˈ용ӂばप︎락यŽ🎃◮せ䈬암룬┿てЮəᵉネ√ス줬རכå߰육ㄏ݃─⭐ⅴ시╀վאҗ합▊ち👀ヨ🌚낭劉∮ߖ‛🇲ഏϰ𝚞딸Ůτな게Ưȵ«ن의치😍' 113 | r'별ţ#³✘Ι㈢جѰͶ⑨♂ジëգ⾹⽣☆✈»룡ᵎรಥ양ݪ👍법🐰굵×–£►‘±.€ღÂ’“ ™–$½Üöäřščœ»젝৶⁷◁̀ȡ·≈]+') 114 | text_special = re.sub(pattern=pattern_special, repl='', string=str(text)) 115 | return text_special 116 | 117 | # 匹配连续英文 118 | def loss_continue(text): 119 | pattern_continue = re.compile(r'[A-Za-z0-9]{10,100}') 120 | text_continue = re.sub(pattern=pattern_continue, repl='', string=str(text)) 121 | return text_continue 122 | 123 | # 匹配特定单词 124 | def loss_word(text): 125 | pattern_word = re.compile(r'video|videobr|epdm|br|alt|img|ref|picType1|imageUrl|divclass|high34|normal34|0datavid|div') 126 | text_word = re.sub(pattern=pattern_word, repl='', string=str(text)) 127 | return text_word 128 | 129 | # 匹配奇葩网址 130 | def loss_chino(text): 131 | pattern_chino = re.compile(r'(网|网站|网站是|网址|网址是|邮箱|邮件|邮件是|点击|店|邮箱是|微信|微信号|微信是|微信号是|公众号|公众号是)[A-Za-z0-9]{1,100}') 
132 |     text_chino = re.sub(pattern=pattern_chino, repl='', string=str(text))
133 |     return text_chino
134 | 
135 | # Match Greek letters
136 | def loss_greek(text):
137 |     pattern_greek = re.compile(r'[\u0370-\u03FF]')
138 |     text_greek = re.sub(pattern=pattern_greek, repl='', string=str(text))
139 |     return text_greek
140 | 
141 | # Match Hanyu Pinyin (tone-marked letters)
142 | def loss_pinyin(text):
143 |     pattern_pinyin = re.compile(r'([āáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜüêɑńňɡa-zA-Z\\s∥-]+)')
144 |     text_pinyin = re.sub(pattern=pattern_pinyin, repl='', string=str(text))
145 |     return text_pinyin
146 | 
147 | # Match kana and bopomofo ("fake" characters)
148 | def loss_fake(text):
149 |     pattern_fake = re.compile(r'[\u3040-\u309F]|[\u30A0-\u30FF]|[\u3100-\u312F]')
150 |     text_fake = re.sub(pattern=pattern_fake, repl='', string=str(text))
151 |     return text_fake
152 | 
153 | # Match traditional-style interpunct-joined names
154 | def loss_tradition(text):
155 |     pattern_tradition = re.compile(r'[\u4e00-\u9fa5]+(·[\u4e00-\u9fa5]+)[·]')
156 |     text_tradition = re.sub(pattern=pattern_tradition, repl='', string=str(text))
157 |     return text_tradition
158 | 
159 | # Replace commas with full stops (so they act as sentence boundaries)
160 | def loss_comma(text):
161 |     pattern_comma = re.compile(r"[,,]")
162 |     text_comma = re.sub(pattern=pattern_comma, repl='。', string=str(text))
163 |     return text_comma
164 | 
165 | # Remove blank lines
166 | def delBlankline(infile, outfile):
167 |     infopen = open(infile, 'r', encoding="utf-8")
168 |     outfopen = open(outfile, 'w', encoding="utf-8")
169 |     for line in infopen.readlines():
170 |         if line.split():  # keep only non-blank lines
171 |             outfopen.writelines(line)
172 |     infopen.close()
173 |     outfopen.close()
174 | 
175 | # Timing decorator
176 | def func_timer(function):
177 |     @wraps(function)
178 |     def function_timer(*args, **kwargs):
179 |         print('[Function: {name} start...]'.format(name=function.__name__))
180 |         t0 = time.time()
181 |         result = function(*args, **kwargs)
182 |         t1 = time.time()
183 |         print('[Function: {name} finished, spent time: {time:.2f}s]'.format(name=function.__name__, time=t1 - t0))
184 |         return result
185 |     return function_timer
186 | 
187 | # Define main()
188 | @func_timer
189 | def main():
190 |     file_list = []
191 |     path = r'C:\Users\JeremySun\Desktop\Internship\Project02_corpusProcessor\english_text_pre'
192 |     file_path = batch_file(path=path, file_list=file_list)
193 |     for path in file_path:
194 |         english_text_connect = open(path, encoding='utf-8').read()  # read as one string, not a list of lines
195 |         assetPath_loss_url = loss_url(text=english_text_connect)
196 |         assetPath_loss_img = loss_img(text=assetPath_loss_url)
197 |         assetPath_loss_video = loss_video(text=assetPath_loss_img)
198 |         assetPath_loss_src = loss_src(text=assetPath_loss_video)
199 |         assetPath_loss_div = loss_div(text=assetPath_loss_src)
200 |         assetPath_loss_span = loss_span(text=assetPath_loss_div)
201 |         assetPath_loss_p = loss_p(text=assetPath_loss_span)
202 |         assetPath_loss_special = loss_special(text=assetPath_loss_p)
203 |         assetPath_loss_continue = loss_continue(text=assetPath_loss_special)
204 |         assetPath_loss_word = loss_word(text=assetPath_loss_continue)
205 |         assetPath_loss_chino = loss_chino(text=assetPath_loss_word)
206 |         assetPath_loss_greek = loss_greek(text=assetPath_loss_chino)
207 |         assetPath_loss_pinyin = loss_pinyin(text=assetPath_loss_greek)
208 |         assetPath_loss_fake = loss_fake(text=assetPath_loss_pinyin)
209 |         assetPath_loss_tradition = loss_tradition(text=assetPath_loss_fake)
210 |         assetPath_loss_comma = loss_comma(text=assetPath_loss_tradition)
211 | 
212 |         # Split into sentences
213 |         english_text_sentence = SentenceSplitter.split(assetPath_loss_comma)
214 | 
215 |         # Strip the remaining terminators and write out, one sentence per line
216 |         pattern_all = re.compile(r"[。.;;??!!]")
217 |         f = open("english_text_sentence_pre.txt", 'a', encoding='utf-8')
218 |         for i in tqdm(english_text_sentence):
219 |             i = re.sub(pattern=pattern_all, repl='', string=i)
220 |             f.write(i + '\n')
221 |         f.close()
222 | 
223 |     # delBlankline("english_text_filtered_pre.txt", "english_text_filtered.txt")
224 | 
225 | 
226 | if __name__ == '__main__':
227 |     main()
--------------------------------------------------------------------------------
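To see englishProcessor.py's filter chain end to end without touching files on disk, here is a sketch that runs three of its filters over an invented sample (string and values illustrative only):

    sample = '联系微信abc123,新品编号AB12CD34EF56,下周上市,敬请期待。'

    cleaned = loss_chino(sample)      # drops "微信abc123" (platform word + short alphanumeric handle)
    cleaned = loss_continue(cleaned)  # drops runs of 10-100 ASCII letters/digits
    cleaned = loss_comma(cleaned)     # turns commas into 。 so they become sentence boundaries
    print(cleaned)

--------------------------------------------------------------------------------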