├── .idea
├── inspectionProfiles
│ └── Project_Default.xml
├── markdown-navigator.xml
├── markdown-navigator
│ └── profiles_settings.xml
├── misc.xml
├── modules.xml
└── python统计中文词频.iml
├── README.MD
├── ci.py
├── stopwords.txt
└── 词频.ipynb
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/.idea/markdown-navigator.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
--------------------------------------------------------------------------------
/.idea/markdown-navigator/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/python统计中文词频.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 | # 利用jieba进行分词
2 | ## 需要安装 jieba
3 |
4 | ```
5 | pip install jieba
6 | ```
7 | ## 运行环境是python3
8 | python3 ci.py
9 |
10 | ## stopwords
11 | 停用词(stopwords)保存在 stopwords.txt 中,ci.py 以 GBK 编码读取该文件
--------------------------------------------------------------------------------
/ci.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | # In[46]:
4 |
5 |
from collections import Counter

import jieba

# Word-frequency script: segment `text` with jieba, drop stop words, then
# print (word, count) pairs sorted from most to least frequent.

text = '''新乡SEO 昊天 seo 168seo.cn 免费分享最新的SEO技术,本站的目的是与同行交流SEO知识,并提供企业网站优化、企业网站诊断等服务,白帽SEO从我做起,专注用户体验研究''
'''

# Segment the text using jieba's search-engine mode.
seg_list = jieba.cut_for_search(text)

# Materialize the segmentation result as a list so it can be filtered
# and counted.
data = list(seg_list)

# Read the stop words, one per line, with trailing whitespace removed.
# The file is decoded as GBK; the `with` block guarantees the handle is
# closed even if decoding fails part-way through.
with open('stopwords.txt', 'r', encoding="gbk") as stopwords_file:
    stopwords = [line.rstrip() for line in stopwords_file]

# Build the lookup set once so each membership test below is O(1)
# instead of a linear scan over the whole stop-word list.
stopword_set = set(stopwords)

# Remove stop words from the segmented tokens.
data = [d for d in data if d not in stopword_set]

# Count how often each remaining token occurs. Counter is a dict subclass
# and keeps keys in first-occurrence order, like the dict it replaces.
c = Counter(data)

# Sort by frequency, highest first. most_common() uses a stable descending
# sort, so tokens with equal counts stay in first-occurrence order.
newc = c.most_common()

print(newc)
31 |
32 |
33 | # In[ ]:
34 |
--------------------------------------------------------------------------------
/stopwords.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/songhao8080/python_get_word/27a088a89e572537cccf648777c50e20b2362531/stopwords.txt
--------------------------------------------------------------------------------
/词频.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 46,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import jieba\n",
12 | "\n",
13 | "text = '''存在的不是这样一个问题:艺术是什么?存在的是这样一些问题:在某时,某地,艺术被表述为什么?这种表述出自谁之口?围绕着这一表述,什么话语和实践出现了?经由这些话语和实践,哪些关系被连接?伴随着这样的关系,怎样的权力分配的模式得以实现?何种制造真理的机制得以建立?艺术是复数的且永远是历史的。而从另一个角度说,艺术是制造历史的。我们无法将艺术理解成一些贯穿于时间中的点并将其连线。每一个艺术的点都包含着它对之前艺术历史的建构,和它对未来的预言。我们面对的是艺术的地层学。\n",
14 | "'''"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 47,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "seg_list = jieba.cut_for_search(text) # 搜索引擎模式\n",
24 | "data=list(seg_list)"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 48,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "\n",
34 | "stopwords = [ line.rstrip() for line in open('stopwords.txt','r',encoding=\"gbk\").readlines()]\n",
35 | "data = [d for d in data if d not in stopwords ]\n",
36 | "c = dict.fromkeys(data,0)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 49,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/plain": [
47 | "[('艺术', 8),\n",
48 | " ('一个', 3),\n",
49 | " ('表述', 3),\n",
50 | " ('历史', 3),\n",
51 | " ('存在', 2),\n",
52 | " ('问题', 2),\n",
53 | " ('一些', 2),\n",
54 | " ('话语', 2),\n",
55 | " ('实践', 2),\n",
56 | " ('关系', 2),\n",
57 | " ('得以', 2),\n",
58 | " ('制造', 2),\n",
59 | " ('点', 2),\n",
60 | " ('某时', 1),\n",
61 | " ('某地', 1),\n",
62 | " ('这种', 1),\n",
63 | " ('出自', 1),\n",
64 | " ('之口', 1),\n",
65 | " ('围绕', 1),\n",
66 | " ('出现', 1),\n",
67 | " ('经由', 1),\n",
68 | " ('连接', 1),\n",
69 | " ('伴随', 1),\n",
70 | " ('权力', 1),\n",
71 | " ('分配', 1),\n",
72 | " ('模式', 1),\n",
73 | " ('实现', 1),\n",
74 | " ('何种', 1),\n",
75 | " ('真理', 1),\n",
76 | " ('机制', 1),\n",
77 | " ('建立', 1),\n",
78 | " ('复数', 1),\n",
79 | " ('永远', 1),\n",
80 | " ('角度', 1),\n",
81 | " ('说', 1),\n",
82 | " ('无法', 1),\n",
83 | " ('理解', 1),\n",
84 | " ('成', 1),\n",
85 | " ('贯穿', 1),\n",
86 | " ('时间', 1),\n",
87 | " ('中', 1),\n",
88 | " ('连线', 1),\n",
89 | " ('包含', 1),\n",
90 | " ('之前', 1),\n",
91 | " ('建构', 1),\n",
92 | " ('未来', 1),\n",
93 | " ('预言', 1),\n",
94 | " ('面对', 1),\n",
95 | " ('地层', 1),\n",
96 | " ('地层学', 1),\n",
97 | " ('\\n', 1)]"
98 | ]
99 | },
100 | "execution_count": 49,
101 | "metadata": {},
102 | "output_type": "execute_result"
103 | }
104 | ],
105 | "source": [
106 | "for x in data:\n",
107 | " c[x]+=1\n",
108 | "newc = sorted(c.items(),key=lambda x:x[1],reverse=True)\n",
109 | "newc\n"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {
116 | "collapsed": true
117 | },
118 | "outputs": [],
119 | "source": []
120 | }
121 | ],
122 | "metadata": {
123 | "kernelspec": {
124 | "display_name": "Python 3",
125 | "language": "python",
126 | "name": "python3"
127 | },
128 | "language_info": {
129 | "codemirror_mode": {
130 | "name": "ipython",
131 | "version": 3
132 | },
133 | "file_extension": ".py",
134 | "mimetype": "text/x-python",
135 | "name": "python",
136 | "nbconvert_exporter": "python",
137 | "pygments_lexer": "ipython3",
138 | "version": "3.6.1"
139 | }
140 | },
141 | "nbformat": 4,
142 | "nbformat_minor": 2
143 | }
144 |
--------------------------------------------------------------------------------