├── ClearText ├── 30wClear.py ├── DealHtml.py ├── EfficRead.py ├── FileRead.py ├── REdealText.py ├── __pycache__ │ └── REdealText.cpython-35.pyc ├── genyield.py ├── htmldome.txt ├── zhline.py └── zhtools │ ├── __pycache__ │ ├── langconv.cpython-35.pyc │ └── zh_wiki.cpython-35.pyc │ ├── langconv.py │ └── zh_wiki.py ├── DataVisible ├── 3D.py ├── d3_hookface.png ├── die_visual.svg ├── github.py ├── histogram.py ├── plot.py ├── python_repos.svg ├── scatter.py ├── temper.py └── weather07.csv ├── ExtractText ├── ConvFormat.py ├── ExtractTxt.py ├── TraverFiles.py ├── __pycache__ │ └── ExtractTxt.cpython-35.pyc ├── pdf2txt.py └── word2txt.py ├── FeatureVec ├── 30wVec.py ├── StopWords.py ├── TFIDF.py ├── __pycache__ │ ├── StopWords.cpython-35.pyc │ ├── StopWords.cpython-37.pyc │ ├── lossval.cpython-35.pyc │ ├── lossval.cpython-37.pyc │ ├── wordbag.cpython-35.pyc │ └── wordbag.cpython-37.pyc ├── lossval.py ├── normdata.py ├── similar.py ├── splitData.py ├── wordbag.py └── wordset.py ├── GensimVec ├── 30wVec.py ├── HDP.py ├── LDA.py ├── LSA.py ├── LSI.py ├── RP.py ├── StopWords.py ├── TFIDF.py ├── __pycache__ │ ├── StopWords.cpython-35.pyc │ ├── StopWords.cpython-37.pyc │ ├── mydict.cpython-35.pyc │ └── mydict.cpython-37.pyc ├── freqword.py └── mydict.py ├── PCADim ├── Visual.py ├── __pycache__ │ ├── loadData.cpython-37.pyc │ ├── loadnews.cpython-37.pyc │ └── pca.cpython-37.pyc ├── allpca.py ├── analyse.py ├── loadData.py ├── loadnews.py ├── pca.md ├── pca.py ├── pcanews.py ├── secom.data └── testSet.txt ├── ProcessText ├── 30wDealText.py ├── FeatureWord.py ├── FreqWord.py ├── HLWord.py ├── HanLPCut.py ├── StopWords.py ├── TFIDF.py ├── __pycache__ │ ├── FreqWord.cpython-35.pyc │ └── StopWords.cpython-35.pyc └── jiebaCut.py ├── README.md ├── TextClassifier ├── agaricus.py ├── data │ ├── agaricus.txt.test │ ├── agaricus.txt.train │ └── diabetes.csv └── xgb_model.pkl └── dataSet ├── CSCMNews下载.txt ├── Corpus ├── EnPapers │ ├── 历史 │ │ ├── 1.pdf │ │ ├── 10.pdf │ │ ├── 11.pdf │ │ 
├── 12.pdf │ │ ├── 13.pdf │ │ ├── 14.pdf │ │ ├── 15.pdf │ │ ├── 16.pdf │ │ ├── 17.pdf │ │ ├── 18.pdf │ │ ├── 19.pdf │ │ ├── 2.pdf │ │ ├── 20.pdf │ │ ├── 21.pdf │ │ ├── 22.pdf │ │ ├── 23.pdf │ │ ├── 24.pdf │ │ ├── 25.pdf │ │ ├── 26.pdf │ │ ├── 27.pdf │ │ ├── 28.pdf │ │ ├── 29.pdf │ │ ├── 3.pdf │ │ ├── 30.pdf │ │ ├── 4.pdf │ │ ├── 5.pdf │ │ ├── 6.pdf │ │ ├── 7.pdf │ │ ├── 8.pdf │ │ └── 9.pdf │ ├── 教育 │ │ ├── 1.pdf │ │ ├── 10.pdf │ │ ├── 11.pdf │ │ ├── 12.pdf │ │ ├── 13.pdf │ │ ├── 14.pdf │ │ ├── 15.pdf │ │ ├── 16.pdf │ │ ├── 17.pdf │ │ ├── 18.pdf │ │ ├── 19.pdf │ │ ├── 2.pdf │ │ ├── 20.pdf │ │ ├── 21.pdf │ │ ├── 22.pdf │ │ ├── 23.pdf │ │ ├── 24.pdf │ │ ├── 25.pdf │ │ ├── 26.pdf │ │ ├── 27.pdf │ │ ├── 28.pdf │ │ ├── 29.pdf │ │ ├── 3.pdf │ │ ├── 30.pdf │ │ ├── 4.pdf │ │ ├── 5.pdf │ │ ├── 6.pdf │ │ ├── 7.pdf │ │ ├── 8.pdf │ │ └── 9.pdf │ └── 汽车 │ │ ├── 1.pdf │ │ ├── 10.pdf │ │ ├── 11.pdf │ │ ├── 12.pdf │ │ ├── 13.pdf │ │ ├── 14.pdf │ │ ├── 15.pdf │ │ ├── 16.pdf │ │ ├── 17.pdf │ │ ├── 18.pdf │ │ ├── 19.pdf │ │ ├── 2.pdf │ │ ├── 20.pdf │ │ ├── 21.pdf │ │ ├── 22.pdf │ │ ├── 23.pdf │ │ ├── 24.pdf │ │ ├── 25.pdf │ │ ├── 26.pdf │ │ ├── 27.pdf │ │ ├── 28.pdf │ │ ├── 29.pdf │ │ ├── 3.pdf │ │ ├── 30.pdf │ │ ├── 4.pdf │ │ ├── 5.pdf │ │ ├── 6.pdf │ │ ├── 7.pdf │ │ ├── 8.pdf │ │ └── 9.pdf ├── pdftotxt │ └── 2018年世界新闻自由日.pdf └── wordtotxt │ └── 科技项目数据挖掘决策架构.docx ├── StopWord ├── EN_stopwords.txt ├── NLPIR_stopwords.txt ├── user_dict.txt └── 词性.txt └── files ├── dataset.data ├── dataset.txt ├── mycorpus.dict ├── mycorpus.txt ├── newsdata.txt └── secom.data /ClearText/30wClear.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | """ 4 | Description: 6万条新闻文本数据清洗 5 | Prompt: code in Python3 env 6 | """ 7 | 8 | import os,re,time 9 | from REdealText import textParse 10 | #******************** 高效读取文件*********************************** 11 | 12 | 13 | class loadFolders(object): # 迭代器 14 | def __init__(self, par_path): 15 | 
# /ClearText/30wClear.py -- corpus iterators, regex cleaner and sampling driver.


class loadFolders(object):
    """Iterate over the immediate sub-directories of ``par_path``.

    Every iteration yields ``os.path.join(par_path, name)`` for each entry
    that is a directory; plain files are skipped.  Entries are visited in
    sorted name order so that repeated runs see the same sequence
    (``os.listdir`` itself returns entries in arbitrary order).
    """

    def __init__(self, par_path):
        self.par_path = par_path

    def __iter__(self):
        for name in sorted(os.listdir(self.par_path)):
            folder_path = os.path.join(self.par_path, name)
            if os.path.isdir(folder_path):
                yield folder_path


class loadFiles(object):
    """Iterate over ``(category, content)`` pairs of a two-level corpus.

    ``par_path`` holds one sub-directory per category.  For every regular
    file directly inside a category directory the file is read as bytes,
    decoded as UTF-8 and yielded together with the directory name.

    Raises:
        UnicodeDecodeError: if a file is not valid UTF-8.
    """

    def __init__(self, par_path):
        self.par_path = par_path

    def __iter__(self):
        for folder in loadFolders(self.par_path):
            catg = os.path.basename(folder)
            for name in sorted(os.listdir(folder)):
                file_path = os.path.join(folder, name)
                if not os.path.isfile(file_path):
                    continue
                # Close the handle *before* yielding: a generator may never be
                # resumed (e.g. the consumer breaks out of its loop), in which
                # case code placed after the ``yield`` would not run.
                with open(file_path, 'rb') as this_file:
                    content = this_file.read().decode('utf8')
                yield catg, content


# Runs of characters that carry no information for the Chinese news corpus:
# Latin letters, digits, ASCII punctuation (backslash excluded, as before) and
# CJK / typographic punctuation.  The full-width colon, semicolon, comma,
# question mark and exclamation mark are written as \uXXXX escapes so they
# cannot be silently folded into their ASCII twins by a text normaliser.
_NOISE_RE = re.compile(
    r'[a-zA-Z0-9'
    r'!"#$%&\'()*+,\-./:;<=>?@\[\]^_`{|}~'
    r'’—。★、…【】《》“”‘'
    r'\uff1a\uff1b\uff0c\uff1f\uff01'
    r']+'
)
_SPACE_RE = re.compile(r'\s+')


def textParse(str_doc):
    """Return ``str_doc`` with symbols, punctuation, letters and digits removed.

    Each run of unwanted characters is replaced by one space, then every run
    of whitespace (including newlines) is collapsed to a single space.
    """
    str_doc = _NOISE_RE.sub(' ', str_doc)
    return _SPACE_RE.sub(' ', str_doc)


if __name__ == '__main__':
    start = time.time()

    filepath = os.path.abspath(r'../dataSet/CSCMNews6w')
    n = 2  # sampling rate: process one document out of every n
    for i, (catg, content) in enumerate(loadFiles(filepath)):
        if i % n != 0:
            continue
        content = textParse(content)
        # Report progress once per 1000 sampled documents.
        if (i // n) % 1000 == 0:
            stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
            print('{t} *** {i} \t docs has been dealed'.format(i=i, t=stamp),
                  '\n', catg, ':\t', content[:20])

    end = time.time()
    print('total spent times:%.2f' % (end-start)+ ' s')
# /ClearText/DealHtml.py -- strip HTML markup and special symbols from a page.
import re

# Patterns deleted from the page, applied in this order.  Block-level noise
# (DOCTYPE, CDATA, <script>, <style>, comments) goes first so that their
# contents are dropped together with the markup; generic tags follow; links
# are removed while whitespace still delimits them; whitespace goes last.
_HTML_NOISE = (
    re.compile(r'<!DOCTYPE.*?>', re.I | re.S),
    re.compile(r'//<!\[CDATA\[.*?//\]\]>', re.I | re.S),
    # ``.*?`` rather than ``[^<]*``: script/style bodies may contain '<'.
    re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S),
    re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S),
    # Comments may contain '>', so match lazily up to the closing '-->'.
    re.compile(r'<!--.*?-->', re.S),
    re.compile(r'<br\s*/?>', re.I),
    re.compile(r'</?\w+[^>]*>'),
    # Lazy and whitespace-bounded so one link cannot swallow the text
    # between it and a later ".html".
    re.compile(r'https?://\S+?\.html'),
    re.compile(r'\s+'),
)


def filter_tags(htmlstr):
    """Return the visible text of ``htmlstr`` with all markup removed.

    Removes the DOCTYPE declaration, CDATA sections, ``<script>`` and
    ``<style>`` elements including their contents, comments, every remaining
    tag, ``http(s)://....html`` links and finally all whitespace.

    Args:
        htmlstr: HTML source as a string.

    Returns:
        The cleaned text; an empty string for empty input.
    """
    # Collapse every whitespace run to one space so patterns never have to
    # deal with line breaks inside a tag.
    s = ' '.join(htmlstr.split())
    for pattern in _HTML_NOISE:
        s = pattern.sub('', s)
    return s


def readTxt(path):
    """Return the full content of the UTF-8 text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


if __name__ == '__main__':
    str_doc = readTxt(r'./htmldome.txt')
    s = filter_tags(str_doc)
    print(s)
# /ClearText/EfficRead.py -- lazily enumerate a two-level corpus directory.
import os
import time


class loadFolders(object):
    """Iterate over the immediate sub-directories of ``par_path``.

    Yields ``os.path.join(par_path, name)`` for each entry that is a
    directory, in sorted name order (``os.listdir`` order is arbitrary).
    """

    def __init__(self, par_path):
        self.par_path = par_path

    def __iter__(self):
        for name in sorted(os.listdir(self.par_path)):
            folder_path = os.path.join(self.par_path, name)
            if os.path.isdir(folder_path):
                yield folder_path


class loadFiles(object):
    """Iterate over ``(category, entry_name)`` pairs of a two-level corpus.

    For each sub-directory of ``par_path`` the directory name is the category
    and every entry listed inside it is yielded by name.  Entries are not
    opened and their type is not checked, so a nested directory is yielded
    like a file.
    """

    def __init__(self, par_path):
        self.par_path = par_path

    def __iter__(self):
        for folder in loadFolders(self.par_path):
            catg = os.path.basename(folder)
            for name in sorted(os.listdir(folder)):
                yield catg, name


if __name__ == '__main__':
    start = time.time()

    # Alternative corpus: os.path.abspath(r'../dataSet/Corpus/new_EnPapers')
    filepath = os.path.abspath(r'../dataSet/CSCMNews')
    for i, msg in enumerate(loadFiles(filepath)):
        if i % 5000 == 0:
            print('{t} *** {i} \t docs has been Read'.format(i=i, t=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())))

    end = time.time()
    print('total spent times:%.2f' % (end-start)+ ' s')


# /ClearText/FileRead.py -- recursive directory traversal (author: 伏草惟存).
def TraversalDir(rootDir):
    """Walk ``rootDir`` recursively and report progress every 5000 files.

    The counter is a running total over the whole tree and counts files only,
    so the number in the progress line is the number of files seen so far.

    Args:
        rootDir: directory to traverse.

    Returns:
        The total number of files found below ``rootDir``.
    """
    count = 0
    for _dirpath, _dirnames, filenames in os.walk(rootDir):
        for _name in filenames:
            if count % 5000 == 0:
                print('{t} *** {i} \t docs has been read'.format(i=count, t=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())))
            count += 1
    return count


if __name__ == '__main__':
    t1 = time.time()

    # Alternative corpus: os.path.abspath(r'../dataSet/Corpus/new_EnPapers')
    rootDir = r"../dataSet/CSCMNews"
    TraversalDir(rootDir)

    t2 = time.time()
    print('totally cost %.2f' % (t2-t1)+' s')
# /ClearText/REdealText.py -- regex-based text cleaning (Python 3).
import itertools
import re

# Flags of the ``re`` module, for reference:
#   re.I  case-insensitive matching
#   re.L  locale-aware matching
#   re.M  multi-line matching; affects ^ and $
#   re.S  make . match every character, newline included
#   re.U  interpret \w, \W, \b, \B according to the Unicode character set
#   re.X  verbose mode: allows whitespace and comments inside the pattern

# Runs of characters that carry no information for the Chinese news corpus:
# Latin letters, digits, ASCII punctuation (backslash excluded, as before) and
# CJK / typographic punctuation.  The full-width colon, semicolon, comma,
# question mark and exclamation mark are written as \uXXXX escapes so they
# cannot be silently folded into their ASCII twins by a text normaliser.
_NOISE_RE = re.compile(
    r'[a-zA-Z0-9'
    r'!"#$%&\'()*+,\-./:;<=>?@\[\]^_`{|}~'
    r'’—。★、…【】《》“”‘'
    r'\uff1a\uff1b\uff0c\uff1f\uff01'
    r']+'
)
_SPACE_RE = re.compile(r'\s+')


def textParse(str_doc):
    """Return ``str_doc`` with symbols, punctuation, letters and digits removed.

    Each run of unwanted characters is replaced by one space, then every run
    of whitespace (including newlines) is collapsed to a single space.
    """
    str_doc = _NOISE_RE.sub(' ', str_doc)
    return _SPACE_RE.sub(' ', str_doc)


def readFile(path):
    """Return the full content of the UTF-8 text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


if __name__ == '__main__':
    # 1. read the document
    path = r'../dataSet/CSCMNews/体育/0.txt'
    str_doc = readFile(path)

    # 2. clean it with the regular expressions above
    word_list = textParse(str_doc)
    print(word_list)
# /ClearText/genyield.py -- ``yield`` / generator demo (author: 伏草惟存, Python 3).
import os
import random
import time

# Fibonacci sequence: 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, ...
# From the third term on, every term is the sum of the two before it.


def fab1(max):
    """Return the first ``max`` Fibonacci numbers as a list (eager version).

    The whole sequence is materialised in memory before it is returned,
    which is what the generator version below avoids.
    """
    terms = []
    n, a, b = 0, 0, 1
    while n < max:
        terms.append(b)
        a, b = b, a + b
        n = n + 1
    return terms


def fab2(max):
    """Yield the first ``max`` Fibonacci numbers one at a time (lazy version)."""
    n, a, b = 0, 0, 1
    while n < max:
        yield b
        a, b = b, a + b
        n = n + 1


def GeneratorDome():
    """Time the eager and the generator Fibonacci implementations.

    Both variants are actually executed: ``fab1`` is called, and the
    generator returned by ``fab2`` is consumed to the end, because merely
    creating a generator runs none of its body.
    """
    maxnum = 1000  # number of terms to produce

    # Eager list version.
    t1 = time.perf_counter()
    fab1(maxnum)
    t2 = time.perf_counter()
    print('fab1 total tims %.2f ' % (1000*(t2-t1)) + ' ms')

    # Generator version.
    for _ in fab2(maxnum):
        pass
    t3 = time.perf_counter()
    print('fab2 total tims %.2f ' % (1000*(t3-t2)) + ' ms')


if __name__ == '__main__':
    GeneratorDome()


# Notes:
# 1. Arrays, linked lists, strings and fully-read files keep all their data
#    in memory, which is costly for very large inputs.
# 2. A generator is iterable: iteration repeatedly calls next() until an
#    exception signals the end.
# 3. A function containing ``yield`` is no longer an ordinary function but a
#    generator that can be iterated over.
# 4. ``yield`` is a keyword similar to ``return``.