├── Analysis
    ├── Analysis.xlsx
    └── IntershipAnalysis.pdf
├── README.md
├── codes
    ├── cityPro.py
    ├── count_citys.txt
    ├── data.txt
    ├── fenci.py
    ├── shixiseng.py
    └── wordFreq.py
└── datas
    └── sxseng.xls


/Analysis/Analysis.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunjunee/internshipAnalysis/92b95816db56542ec36635507a2ba660bdc56ab2/Analysis/Analysis.xlsx


--------------------------------------------------------------------------------
/Analysis/IntershipAnalysis.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunjunee/internshipAnalysis/92b95816db56542ec36635507a2ba660bdc56ab2/Analysis/IntershipAnalysis.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 这是一个爬取**实习僧**网站信息 [截止2017年8月8日] 的爬虫，并对爬取的结果做了一些简单的处理。
 2 | 
 3 | 使用的工具是 Python，用到了 requests 和 BeautifulSoup 来进行网页爬取，jieba 进行分词处理。
 4 | 
 5 | 一些总结放在: <a href="https://sunjunee.github.io/2017/09/25/intership-analysis/">https://sunjunee.github.io/2017/09/25/intership-analysis/</a>
 6 | 
 7 | ## 爬虫
 8 | 这里想爬的是实习僧网站上，所有互联网相关的实习招聘信息。打开网站可以看到目录：
 9 | 
10 | ![Image text](http://i2.nbimg.com/604893/410ce31f5cfba20a.jpg)
11 | 
12 | 点开一个目录，其跳转的链接为：
13 | https://www.shixiseng.com/interns?k=关键词&p=页码
14 | 
15 | 于是可以通过上述链接，来获取所有互联网相关实习信息的列表，我在这里只取了该实习的链接，因为仔细观察可以发现，网页上的文字爬取下来都是乱码的，这可能是使用了特殊的字体和编码的原因。
16 | 
17 | 不得不说，这一招用来反爬虫还是很有效的。打开每个实习的详情页，也都存在各种乱码。
18 | 
19 | ![Markdown](http://i2.nbimg.com/604893/76cb45e1a461c5e0s.jpg)
20 | 
21 | 但是，如果是乱码就完全不能分析了啊。然而，毕竟是，道高一尺魔高一丈，电脑端不行，我们转战手机网页端。
22 | 
23 | 然后，就欣喜地发现，wap网站有一个专门用来传递信息的API，真是得来全不费工夫啊，直接用API，解析的功夫都省了：
24 | 
25 | ![Markdown](http://i4.nbimg.com/604893/ba0fb178da4c6871.jpg)
26 | 
27 | 其用法是http://wap.shixiseng.com/app/intern/info?uuid=实习id
28 | 前面已经通过检索，爬到了所有互联网相关的实习链接，链接里面，就包含实习id，于是，数据就很方便地爬下来了~
29 | 
30 | ```python
31 | import requests;
32 | from bs4 import BeautifulSoup;
33 | 
34 | source = 'http://www.shixiseng.com/interns?k=前端&p=';  pages = 37;
35 | source2 = 'http://wap.shixiseng.com/app/intern/info?uuid=';
36 | data = [];
37 | for i in range(1, pages+1):
38 | 	try:
39 | 		r = requests.get(source + str(i));  r.encoding = 'utf-8';
40 | 		soup = BeautifulSoup(r.text, 'html.parser')
41 | 		li = soup.find_all('div', attrs={'class':'list'});																			
42 | 		#解析网页
43 | 		for j in range(len(li)):
44 | 			s = li[j];  cag = (s.find('div', attrs = {'class':'part'}).text).split(' - ')[1];
45 | 			#链接：
46 | 			link = s.find('div', attrs = {'class':'names cutom_font'}).find('a').attrs['href']
47 | 			link = link.split('/')[2];
48 | 			r2 = requests.get(source2 + link);  r2.encoding = 'utf-8';
49 | 			infos = eval(r2.text);																							
50 | 			data.append(infos);
51 | 	except:
52 | 		print('Error!');
53 | ```
54 | 
55 | ## 分词
56 | 
57 | 这里做分词的目的，主要是因为爬取下来的结构化信息确实不多，于是对职位描述这个字段进行分词，用到的工具是jieba。
58 | 
59 | 使用的方法也很简单：分词后，使用 `<cuts>` 作为分隔符将结果拼接成一个字符串，存储到了 MySQL 中。
60 | 
61 | ```python
62 | import jieba
63 | from bs4 import BeautifulSoup;
64 | 
65 | datas = {};
66 | for i in range(len(a)):
67 | 	s = BeautifulSoup(a[i][1], 'html.parser').text;
68 | 	datas[i+1] = s.replace('\xa0','');
69 | 
70 | #分词
71 | seged = {};
72 | for key in datas.keys():
73 | 	view_bar(key, len(datas))
74 | 	seg = jieba.cut(datas[key], cut_all=False)
75 | 	seg = [s for s in seg];
76 | 	seged[key] = seg;
77 | 	seg = '<cuts>'.join(seg);
78 | 
79 | ```
80 | 
81 | ## 后续分析
82 | 使用mysql进行查询、Excel简单地可视化。
83 | 
84 | 


--------------------------------------------------------------------------------
/codes/cityPro.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @ Author: Jun Sun {Python3}
 4 | @ E-mail: sunjunee@qq.com
 5 | @ Create: 2017-08-13 10:10
 6 | 
 7 | Descript: 
 8 | """
 9 | 
def count_cities(lines):
    """Sum posting counts per city from tab-separated text lines.

    Each line has the form ``<count>\\t<city>\\t<city>...``.  Empty city
    fields (padding tabs) and blank lines are ignored.  A line's count is
    added once for every non-empty city field on it, so a city repeated
    on one line is counted repeatedly.

    Args:
        lines: iterable of text lines, with or without line terminators.

    Returns:
        dict mapping city name to its accumulated count, in order of
        first appearance.

    Raises:
        ValueError: if a non-blank line does not start with an integer.
    """
    counts = {}
    for line in lines:
        # Strip only the line terminator.  Slicing off the last character
        # unconditionally would truncate the final city name when the
        # input does not end with a newline.
        fields = line.rstrip('\r\n').split('\t')
        if not fields[0].strip():
            continue
        # int() rather than eval(): the field is data, not code.
        amount = int(fields[0])
        for city in fields[1:]:
            if city != '':
                counts[city] = counts.get(city, 0) + amount
    return counts


def main(src='data.txt', dst='count_citys.txt'):
    """Read per-line city counts from *src* and write per-city totals to *dst*.

    The output has one ``<city>\\t<total>`` line per city.  Both files are
    handled as UTF-8 so the city names round-trip regardless of the
    platform's default encoding.
    """
    with open(src, 'r', encoding='utf-8') as f:
        counts = count_cities(f)

    with open(dst, 'w', encoding='utf-8') as out:
        for city, total in counts.items():
            out.write(city + '\t' + str(total) + '\n')


if __name__ == '__main__':
    main()


--------------------------------------------------------------------------------
/codes/count_citys.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunjunee/internshipAnalysis/92b95816db56542ec36635507a2ba660bdc56ab2/codes/count_citys.txt


--------------------------------------------------------------------------------
/codes/data.txt:
--------------------------------------------------------------------------------
  1 | 1	杭州									
  2 | 1	武汉									
  3 | 1	苏州	上海	上海							
  4 | 1022	上海									
  5 | 3	上海	北京								
  6 | 2	上海	南京								
  7 | 4	上海	大连								
  8 | 2	上海	大连	广州							
  9 | 2	上海	天津								
 10 | 1	上海	杭州								
 11 | 1	上海	深圳								
 12 | 11	上海	苏州								
 13 | 1	上海	西安								
 14 | 1	上海	青岛								
 15 | 1	上海									
 16 | 5	东莞									
 17 | 2	乌鲁木齐									
 18 | 7	佛山									
 19 | 1	佛山	清远	肇庆	韶关	江门	揭阳				
 20 | 11	全国									
 21 | 3	兰州									
 22 | 1	兴义									
 23 | 1928	北京									
 24 | 6	北京	上海								
 25 | 1	北京	上海								
 26 | 1	北京	上海	广州							
 27 | 1	北京	上海	深圳							
 28 | 3	北京	上海	西安							
 29 | 2	北京	南京	成都	上海	杭州	郑州	济南	厦门	长沙	广州
 30 | 1	北京	广州	上海	南京	温州					
 31 | 1	北京	成都	上海	杭州						
 32 | 1	北京	昆明	天津	太原	呼和浩特					
 33 | 1	北京	深圳								
 34 | 4	北京	珠海								
 35 | 2	北京	石家庄								
 36 | 2	北京	西安								
 37 | 1	北京	长沙								
 38 | 1	北京	青岛	哈尔滨	石家庄						
 39 | 1	北京	香港								
 40 | 3	北京市									
 41 | 132	南京									
 42 | 1	南京	佛山	深圳							
 43 | 2	南京	北京								
 44 | 1	南京	苏州								
 45 | 10	南宁									
 46 | 8	南昌									
 47 | 1	南通									
 48 | 26	厦门									
 49 | 15	合肥									
 50 | 1	呼和浩特									
 51 | 5	哈尔滨									
 52 | 103	大连									
 53 | 4	大连	沈阳								
 54 | 29	天津									
 55 | 2	宁夏									
 56 | 8	宁波									
 57 | 1	广东									
 58 | 418	广州									
 59 | 2	广州	佛山								
 60 | 2	广州	佛山	长沙	海口						
 61 | 1	广州	北京	上海							
 62 | 1	广州	成都								
 63 | 2	广州	杭州								
 64 | 1	广州	深圳								
 65 | 3	广州	珠海								
 66 | 1	廊坊									
 67 | 4	徐州									
 68 | 1	惠州									
 69 | 305	成都									
 70 | 2	成都	宁波								
 71 | 1	成都	重庆								
 72 | 2	拉萨									
 73 | 1	攀枝花									
 74 | 18	无锡									
 75 | 1	日客则	林芝								
 76 | 1	日本									
 77 | 1	昆山									
 78 | 10	昆明									
 79 | 265	杭州									
 80 | 2	杭州	上海								
 81 | 1	杭州	南京								
 82 | 1	杭州	无锡								
 83 | 1	柳州									
 84 | 104	武汉									
 85 | 1	武汉	西安								
 86 | 1	汕尾	深圳								
 87 | 1	江门									
 88 | 16	沈阳									
 89 | 1	泉州									
 90 | 12	济南									
 91 | 1	济南	郑州								
 92 | 5	海口									
 93 | 1	海口	北京								
 94 | 227	深圳									
 95 | 1	深圳	合肥								
 96 | 1	深圳	广州								
 97 | 2	深圳	香港								
 98 | 17	珠海									
 99 | 1	盐城									
100 | 2	石家庄									
101 | 17	福州									
102 | 1	福建									
103 | 39	苏州									
104 | 1	西宁									
105 | 45	西安									
106 | 5	贵阳									
107 | 15	郑州									
108 | 45	重庆									
109 | 1	重庆	贵阳								
110 | 1	银川									
111 | 4	长春									
112 | 30	长沙									
113 | 19	青岛									
114 | 1	青岛	上海								
115 | 1	香港	深圳								
116 | 


--------------------------------------------------------------------------------
/codes/fenci.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @ Author: Jun Sun {Python3}
 4 | @ E-mail: sunjunee@qq.com
 5 | @ Create: 2017-08-09 19:51
 6 | 
 7 | Descript: 
 8 | """
 9 | import pymysql;
10 | import jieba
11 | from bs4 import BeautifulSoup;
12 | import math;
13 | import sys;
14 | 
def view_bar(num, total):
    """Redraw a 40-column text progress bar on the current stdout line.

    ``num`` is the amount done and ``total`` the amount expected.  The bar
    shows '>' for the completed share, the percentage rounded up, and the
    raw ``num/total`` counter.  The leading carriage return makes
    successive calls overwrite one another.
    """
    width = 40
    fraction = num / total
    filled = int(fraction * width)
    percent = math.ceil(fraction * 100)
    done = ">" * filled
    pad = " " * (width - filled)
    sys.stdout.write('\r[%s%s]%d%%\t%d/%d\t' % (done, pad, percent, num, total))
    sys.stdout.flush()
22 | 
# Read every posting's HTML description, segment it with jieba and write the
# '<cuts>'-joined tokens back to the info_cut column of the same row.
#
# NOTE(review): the database credentials are hard-coded; move them to
# configuration or environment variables before sharing this script.
# Keyword arguments are used because recent PyMySQL releases no longer
# accept positional connection parameters.
db = pymysql.connect(host="localhost", user="root", password="Admin123456!",
                     database="test", charset="utf8")
cursor = db.cursor()
cursor.execute("SELECT id, info FROM sxseng")
rows = cursor.fetchall()

# Strip HTML markup and non-breaking spaces.  Key by the row's real id, not
# by its position in the result set: the SELECT has no ORDER BY and ids need
# not be contiguous, so position + 1 can point at a different row.
datas = {}
for row_id, info in rows:
    text = BeautifulSoup(info or '', 'html.parser').text
    datas[row_id] = text.replace('\xa0', '')

# Segment each description and store it in MySQL.
update_sql = "UPDATE sxseng SET info_cut = %s WHERE id = %s"
total = len(datas)
for done, (row_id, text) in enumerate(datas.items(), start=1):
    view_bar(done, total)
    seg = '<cuts>'.join(jieba.cut(text, cut_all=False))

    try:
        # Parameterized query: the driver escapes quotes in the text, so a
        # description containing ' no longer breaks (or injects into) the SQL.
        cursor.execute(update_sql, (seg, row_id))
        db.commit()
    except pymysql.MySQLError as err:
        db.rollback()
        print('Error!', row_id, err)

db.close()
51 | 


--------------------------------------------------------------------------------
/codes/shixiseng.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @ Author: Jun Sun {Python3}
 4 | @ E-mail: sunjunee@qq.com
 5 | @ Create: 2017-08-07 19:52
 6 | 
 7 | Descript: 从实习僧网站爬取数据
 8 | """
 9 | 
10 | import requests;
11 | from bs4 import BeautifulSoup;
12 | import sys;
13 | import math;
14 | import pymysql;
15 | 
def view_bar(num, total):
    """Redraw a 40-column text progress bar on the current stdout line.

    ``num`` is the amount done and ``total`` the amount expected.  The bar
    shows '>' for the completed share, the percentage rounded up, and the
    raw ``num/total`` counter.  The leading carriage return makes
    successive calls overwrite one another.
    """
    width = 40
    fraction = num / total
    filled = int(fraction * width)
    percent = math.ceil(fraction * 100)
    done = ">" * filled
    pad = " " * (width - filled)
    sys.stdout.write('\r[%s%s]%d%%\t%d/%d\t' % (done, pad, percent, num, total))
    sys.stdout.flush()
23 | 
# Crawl the listing pages for one keyword, fetch each posting's details from
# the mobile-site API, then load everything into the MySQL table sxseng.
source = 'http://www.shixiseng.com/interns?k=前端&p='
pages = 37
source2 = 'http://wap.shixiseng.com/app/intern/info?uuid='
request_timeout = 10  # seconds; without a timeout a stalled server hangs the crawl

# Each entry is (cag, infos).  The category is kept with its own posting;
# reading a single shared `cag` at insert time would label every row with
# whichever category was crawled last.
data = []

for i in range(1, pages + 1):
    view_bar(i, pages)
    try:
        r = requests.get(source + str(i), timeout=request_timeout)
        r.encoding = 'utf-8'
    except requests.RequestException as err:
        print('Error!', 'page', i, err)
        continue

    soup = BeautifulSoup(r.text, 'html.parser')

    # Parse the page: one <div class="list"> per posting.  Errors are handled
    # per posting so one bad entry does not discard the rest of the page.
    for s in soup.find_all('div', attrs={'class': 'list'}):
        try:
            cag = s.find('div', attrs={'class': 'part'}).text.split(' - ')[1]

            # Link: the posting id is the third path component of the href.
            link = s.find('div', attrs={'class': 'names cutom_font'}).find('a').attrs['href']
            link = link.split('/')[2]

            r2 = requests.get(source2 + link, timeout=request_timeout)
            r2.encoding = 'utf-8'

            # Decode the response as JSON instead of eval(): the body comes
            # from the network and must never be executed as Python code.
            infos = r2.json()
        except (requests.RequestException, AttributeError, IndexError,
                KeyError, ValueError) as err:
            print('Error!', 'page', i, err)
            continue

        data.append((cag, infos))

print('\n')

# Store in MySQL.
# NOTE(review): the database credentials are hard-coded; move them to
# configuration or environment variables before sharing this script.
db = pymysql.connect(host="localhost", user="root", password="Admin123456!",
                     database="test", charset="utf8")
cursor = db.cursor()

# The table is expected to exist already.  Schema used when it was first
# created (kept for reference, not executed):
#
# DROP TABLE IF EXISTS sxseng
# CREATE TABLE sxseng (
#     cag CHAR(255),
#     iname CHAR(255),
#     industry CHAR(255),
#     cname CHAR(255),
#     city CHAR(255),
#     address CHAR(255),
#     attraction CHAR(255),
#     chance CHAR(255),
#     degree CHAR(255),
#     maxsal CHAR(255),
#     minsal CHAR(255),
#     month CHAR(255),
#     scale CHAR(255),
#     info TEXT(2047),
#     url CHAR(255))

msg_fields = ('iname', 'industry', 'cname', 'city', 'address', 'attraction',
              'chance', 'degree', 'maxsal', 'minsal', 'month', 'scale',
              'info', 'url')
columns = ('cag',) + msg_fields
# Only the fixed column names are interpolated here; the values are passed
# separately so the driver escapes them.
insert_sql = "INSERT INTO sxseng(%s) VALUES (%s)" % (
    ', '.join(columns), ', '.join(['%s'] * len(columns)))

# Load to mysql:
total = len(data)
for i, (cag, d) in enumerate(data, start=1):
    view_bar(i, total)
    try:
        msg = d['msg']
        # str() stores the same text the former '%s' formatting produced.
        params = (cag,) + tuple(str(msg[field]) for field in msg_fields)
    except (KeyError, TypeError) as err:
        print('Error!', 'record', i, err)
        continue

    try:
        cursor.execute(insert_sql, params)
        db.commit()
    except pymysql.MySQLError as err:
        db.rollback()
        print('Error!', 'record', i, err)
db.close()
99 | 	


--------------------------------------------------------------------------------
/codes/wordFreq.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @ Author: Jun Sun {Python3}
 4 | @ E-mail: sunjunee@qq.com
 5 | @ Create: 2017-08-09 20:54
 6 | 
 7 | Descript: 统计词频
 8 | """
 9 | 
10 | import pymysql;
11 | import sys;
12 | import math;
13 | 
def view_bar(num, total):
    """Redraw a 40-column text progress bar on the current stdout line.

    ``num`` is the amount done and ``total`` the amount expected.  The bar
    shows '>' for the completed share, the percentage rounded up, and the
    raw ``num/total`` counter.  The leading carriage return makes
    successive calls overwrite one another.
    """
    width = 40
    fraction = num / total
    filled = int(fraction * width)
    percent = math.ceil(fraction * 100)
    done = ">" * filled
    pad = " " * (width - filled)
    sys.stdout.write('\r[%s%s]%d%%\t%d/%d\t' % (done, pad, percent, num, total))
    sys.stdout.flush()
21 | 
# Count how often each token occurs across all segmented descriptions and
# store the totals in a freshly created wordFreq table.
#
# NOTE(review): the database credentials are hard-coded; move them to
# configuration or environment variables before sharing this script.
db = pymysql.connect(host="localhost", user="root", password="Admin123456!",
                     database="test", charset="utf8")
cursor = db.cursor()
cursor.execute("SELECT id, info_cut FROM sxseng")
datas = cursor.fetchall()
db.close()

# A dict gives constant-time lookup per token; the previous parallel lists
# needed a linear scan for every word.  Insertion order is preserved, so the
# words are still stored in order of first appearance.
freq = {}
total_rows = len(datas)
for done, (_row_id, info_cut) in enumerate(datas, start=1):
    view_bar(done, total_rows)
    if info_cut is None:
        # A NULL column (row never segmented) has nothing to split.
        continue
    for word in info_cut.split('<cuts>'):
        freq[word] = freq.get(word, 0) + 1

# Store in MySQL:
db = pymysql.connect(host="localhost", user="root", password="Admin123456!",
                     database="test", charset="utf8")
cursor = db.cursor()
cursor.execute("DROP TABLE IF EXISTS wordFreq")

sql = """CREATE TABLE wordFreq (
         word CHAR(255),
         freq int)"""

cursor.execute(sql)

# Load to mysql:
insert_sql = "INSERT INTO wordFreq(word, freq) VALUES (%s, %s)"
total_words = len(freq)
for done, (word, n) in enumerate(freq.items(), start=1):
    view_bar(done, total_words)
    try:
        # Parameterized so tokens containing quotes or backslashes are
        # escaped by the driver instead of corrupting the statement.
        cursor.execute(insert_sql, (word, n))
        db.commit()
    except pymysql.MySQLError as err:
        db.rollback()
        print('Error!', repr(word), err)
db.close()
63 | 	


--------------------------------------------------------------------------------
/datas/sxseng.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunjunee/internshipAnalysis/92b95816db56542ec36635507a2ba660bdc56ab2/datas/sxseng.xls


--------------------------------------------------------------------------------