├── .idea ├── inspectionProfiles │ └── Project_Default.xml ├── markdown-navigator.xml ├── markdown-navigator │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── python统计中文词频.iml ├── README.MD ├── ci.py ├── stopwords.txt └── 词频.ipynb /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 10 | -------------------------------------------------------------------------------- /.idea/markdown-navigator.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 36 | 37 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /.idea/markdown-navigator/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/python统计中文词频.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # 利用jieba进行分词 2 | ## 需要安装 jieba 3 | 4 | ``` 5 | pip install jieba 6 | ``` 7 | ## 运行环境是python3 8 | python3 ci.py 9 | 10 | ## stopwords 11 | 停止词保存在 stopwords.txt 文件中(GBK 编码,每行一个词)
# coding: utf-8
"""Count word frequencies in a Chinese text segmented with jieba.

Run as a script (``python3 ci.py``): the sample ``text`` is segmented in
jieba's search-engine mode, stop words and whitespace-only tokens are
discarded, and the resulting ``(word, count)`` pairs are printed from most
to least frequent.
"""

from collections import Counter

# Stop-word list read by the script: one word per line, GBK-encoded.
STOPWORDS_PATH = 'stopwords.txt'
STOPWORDS_ENCODING = 'gbk'

# Sample text to analyse (kept byte-for-byte, including the trailing quotes).
text = '''新乡SEO 昊天 seo 168seo.cn 免费分享最新的SEO技术,本站的目的是与同行交流SEO知识,并提供企业网站优化、企业网站诊断等服务,白帽SEO从我做起,专注用户体验研究''
'''


def load_stopwords(path=STOPWORDS_PATH, encoding=STOPWORDS_ENCODING):
    """Return the stop words stored one per line in *path* as a set.

    Trailing whitespace (including the newline) is stripped from each line.
    The file is opened in a ``with`` block so the handle is always closed,
    and a set is returned so each membership test is O(1).

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid in *encoding*.
    """
    with open(path, 'r', encoding=encoding) as handle:
        return {line.rstrip() for line in handle}


def count_words(tokens, stopwords=()):
    """Return ``(word, count)`` pairs sorted by descending count.

    Args:
        tokens: iterable of string tokens produced by the segmenter.
        stopwords: container of words to ignore.

    Tokens that are empty or consist only of whitespace are skipped: the
    segmenter emits spaces and newlines as tokens of their own, and they
    would otherwise show up in the result as "words". Words with equal
    counts keep the order of their first appearance in *tokens*.
    """
    kept = (
        token for token in tokens
        if token.strip() and token not in stopwords
    )
    return Counter(kept).most_common()


def main():
    """Segment ``text``, count the remaining words and print the ranking."""
    # Imported here so the helpers above can be imported without jieba.
    import jieba

    # Search-engine mode segmentation.
    tokens = jieba.cut_for_search(text)
    print(count_words(tokens, load_stopwords()))


if __name__ == '__main__':
    main()
jieba.cut_for_search(text) # 搜索引擎模式\n", 24 | "data=list(seg_list)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 48, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "\n", 34 | "stopwords = [ line.rstrip() for line in open('stopwords.txt','r',encoding=\"gbk\").readlines()]\n", 35 | "data = [d for d in data if d not in stopwords ]\n", 36 | "c = dict.fromkeys(data,0)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 49, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "[('艺术', 8),\n", 48 | " ('一个', 3),\n", 49 | " ('表述', 3),\n", 50 | " ('历史', 3),\n", 51 | " ('存在', 2),\n", 52 | " ('问题', 2),\n", 53 | " ('一些', 2),\n", 54 | " ('话语', 2),\n", 55 | " ('实践', 2),\n", 56 | " ('关系', 2),\n", 57 | " ('得以', 2),\n", 58 | " ('制造', 2),\n", 59 | " ('点', 2),\n", 60 | " ('某时', 1),\n", 61 | " ('某地', 1),\n", 62 | " ('这种', 1),\n", 63 | " ('出自', 1),\n", 64 | " ('之口', 1),\n", 65 | " ('围绕', 1),\n", 66 | " ('出现', 1),\n", 67 | " ('经由', 1),\n", 68 | " ('连接', 1),\n", 69 | " ('伴随', 1),\n", 70 | " ('权力', 1),\n", 71 | " ('分配', 1),\n", 72 | " ('模式', 1),\n", 73 | " ('实现', 1),\n", 74 | " ('何种', 1),\n", 75 | " ('真理', 1),\n", 76 | " ('机制', 1),\n", 77 | " ('建立', 1),\n", 78 | " ('复数', 1),\n", 79 | " ('永远', 1),\n", 80 | " ('角度', 1),\n", 81 | " ('说', 1),\n", 82 | " ('无法', 1),\n", 83 | " ('理解', 1),\n", 84 | " ('成', 1),\n", 85 | " ('贯穿', 1),\n", 86 | " ('时间', 1),\n", 87 | " ('中', 1),\n", 88 | " ('连线', 1),\n", 89 | " ('包含', 1),\n", 90 | " ('之前', 1),\n", 91 | " ('建构', 1),\n", 92 | " ('未来', 1),\n", 93 | " ('预言', 1),\n", 94 | " ('面对', 1),\n", 95 | " ('地层', 1),\n", 96 | " ('地层学', 1),\n", 97 | " ('\\n', 1)]" 98 | ] 99 | }, 100 | "execution_count": 49, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "for x in data:\n", 107 | " c[x]+=1\n", 108 | "newc = sorted(c.items(),key=lambda x:x[1],reverse=True)\n", 109 | "newc\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": 
"code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.6.1" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 2 143 | } 144 | --------------------------------------------------------------------------------