├── README.md
├── python34
│   ├── spider
│   └── wordcut
└── spider

/README.md:
--------------------------------------------------------------------------------
# python
python related
echo # python >> README.md
git init
git add README.md
git commit -m "first commit"
git remote add origin https://github.com/lichald/python.git
git push -u origin master
--------------------------------------------------------------------------------
/python34/spider:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
__author__ = 'Lining'
import json
import re
import datetime
from urllib import request
import pandas as pd
from pandas import DataFrame, Series
from bs4 import BeautifulSoup


def lagou_spider_keyword(keyword):
    # Encode the search keyword as UTF-8 and percent-encode it for the lagou.com search URL
    keywordbyte = keyword.encode('utf-8')
    keywordindex = str(keywordbyte).replace(r'\x', '%').replace("'", "")
    keywordindex = re.sub('^b', '', keywordindex)

    # Work out how many result pages the search returns
    i = 0
    first = 'true'
    url = 'http://www.lagou.com/jobs/positionAjax.json?px=default&first=' + first + '&kd=' + keywordindex + '&pn=' + str(i + 1)
    with request.urlopen(url) as f:
        data = f.read()
    urlcount = int(json.loads(str(data, encoding='utf-8', errors='ignore'))["content"]["totalPageCount"])
    print('This search returned %d result pages in total' % urlcount)

    # Crawl the result pages one by one
    for i in range(urlcount):

        # Build the URL for this page
        if i == 0:
            first = 'true'
        else:
            first = 'false'
        url = 'http://www.lagou.com/jobs/positionAjax.json?px=default&first=' + first + '&kd=' + keywordindex + '&pn=' + str(i + 1)
        with request.urlopen(url) as f:
            data = f.read()

        # Parse the JSON payload
        try:
            jsondata = json.loads(str(data, encoding='utf-8', errors='ignore'))["content"]['result']

            for t in range(len(jsondata)):
                # Join the company label list into a single string
                jsondata[t]['companyLabelList2'] = '-'.join(jsondata[t]['companyLabelList'])
                jsondata[t].pop('companyLabelList')

                # Turn each record into a Series, then stack them into one DataFrame
                if t == 0:
                    rdata = DataFrame(Series(data=jsondata[t])).T
                else:
                    rdata = pd.concat([rdata, DataFrame(Series(data=jsondata[t])).T])
            # Re-index rdata from 1
            rdata.index = range(1, len(rdata) + 1)
            rdata['keyword'] = keyword
            rdata['salarymin'] = 0
            rdata['salarymax'] = 0
            rdata['url'] = ''
            rdata['jd'] = ''            # job description
            rdata['handle_perc'] = ''   # share of resumes handled within seven days
            rdata['handle_day'] = ''    # average number of days to process a resume
            for klen in range(len(rdata['salary'])):
                rdata.loc[klen + 1, 'salarymin'] = re.search(r'^(\d*?)k', rdata['salary'].iloc[klen]).group(1)
                # If no upper salary bound is given (e.g. '8k+'), leave the column empty
                if re.search(r'-(\d*?)k$', rdata['salary'].iloc[klen]) is not None:
                    rdata.loc[klen + 1, 'salarymax'] = re.search(r'-(\d*?)k$', rdata['salary'].iloc[klen]).group(1)
                else:
                    rdata.loc[klen + 1, 'salarymax'] = ''
                # Add a url column so the job description page can be fetched next
                rdata.loc[klen + 1, 'url'] = 'http://www.lagou.com/jobs/%s.html' % rdata.loc[klen + 1, 'positionId']

                # Fetch the job page itself and pull the job description out of it
                with request.urlopen(rdata.loc[klen + 1, 'url']) as f:
                    data_url = f.read()
                soup_url = BeautifulSoup(data_url, 'html5lib')
                strings_url = soup_url.find('dd', class_='job_bt').strings
                rdata.loc[klen + 1, 'jd'] = ''.join(strings_url).encode('gbk', 'ignore').decode('gbk', 'ignore').replace(' ', '')
                temp = soup_url.find_all('span', class_='data')
                if re.search(r'>(\w*%)<', str(temp[0])) is None:
                    rdata.loc[klen + 1, 'handle_perc'] = ''
                else:
                    rdata.loc[klen + 1, 'handle_perc'] = re.search(r'>(\w*%)<', str(temp[0])).group(1)
                rdata.loc[klen + 1, 'handle_day'] = re.search(r'>(\w*)<', str(temp[1])).group(1).replace('天', '')

        except Exception as e:
            print(e)
            continue
        # totaldata accumulates all pages; rdata holds only the current page
        if i == 0:
            totaldata = rdata
        else:
            totaldata = pd.concat([totaldata, rdata])

        totaldata.index = range(1, len(totaldata) + 1)
        print('Crawling search page %d at %s, %d pages left' % (i + 1, datetime.datetime.now(), urlcount - i - 1))


    # Write the results to an Excel file
    totaldata.to_excel('lagou.xls', sheet_name='sheet1')


if __name__ == '__main__':
    keyword = input("Enter a search keyword (press Enter to continue): ")
    # keyword = '数据挖掘'  # any search keyword can be used here
    lagou_spider_keyword(keyword)
--------------------------------------------------------------------------------
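A side note on the keyword encoding in python34/spider above: the script builds the percent-encoded keyword by editing the repr of a bytes object. The standard library can do the same job directly. Below is a minimal sketch under the same assumptions as the script (same positionAjax.json endpoint and query parameters); build_search_url is a hypothetical helper added here for illustration, and there is no guarantee the site still serves this endpoint.

# Sketch: build the search URL with urllib.parse instead of hand-editing a bytes repr.
# urlencode() percent-encodes the UTF-8 bytes of the keyword,
# e.g. '数据挖掘' -> '%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98'.
from urllib import parse

def build_search_url(keyword, page, first_page):
    # query parameters copied from python34/spider; endpoint behaviour is an assumption
    params = {
        'px': 'default',
        'first': 'true' if first_page else 'false',
        'kd': keyword,
        'pn': str(page),
    }
    return 'http://www.lagou.com/jobs/positionAjax.json?' + parse.urlencode(params)

if __name__ == '__main__':
    print(build_search_url('数据挖掘', 1, True))

If only the escaped keyword itself is needed, parse.quote(keyword) returns the same %-escaped UTF-8 string that the repr-editing in the script produces.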
/python34/wordcut:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
__author__ = 'Lining'
import jieba
import pyodbc
import pandas as pd
from pandas import DataFrame, Series


def wordcut_fun(wordcut, excelsum, exceldetail, sqlconn):
    result = []
    # Split the raw text into a list of lines
    wordcuts = wordcut.split("\n")
    # Segment each line with jieba, keeping lower-cased tokens of two or more characters
    for i in wordcuts:
        try:
            seg_list = jieba.cut(i)
            for j in seg_list:
                if len(j) >= 2:
                    result.append(j.lower())
        except Exception:
            print("something went wrong while segmenting")

    dic_result = {}
    # Count how often each token occurs
    for i in result:
        if i in dic_result:
            dic_result[i] = dic_result[i] + 1
        else:
            dic_result[i] = 1
    dic_result = sorted(dic_result.items(), key=lambda item: item[1], reverse=True)
    dic_data = DataFrame(dic_result, columns=['keyword', 'frequency'])
    dic_data['tag'] = ''
    # Tag each keyword using the hand-made mapping table in the database
    source = pd.read_sql_query(r'select fenci as keyword,tag_keyword as tag from [zln_data].[dbo].[lagou_fenci_jd]', con=sqlconn)
    for i in range(len(dic_data['keyword'])):
        for t in range(len(source['keyword'])):
            if source.loc[t, 'keyword'].lower() in dic_data.loc[i, 'keyword']:
                dic_data.loc[i, 'tag'] = source.loc[t, 'tag']
    dic_data = dic_data[dic_data['tag'] != '']
    sum_data = dic_data['frequency'].groupby(dic_data['tag']).sum()

    # Write the detail and summary tables to Excel
    DataFrame(dic_data).to_excel(exceldetail + '.xls', sheet_name='detail')
    DataFrame(sum_data).to_excel(excelsum + '.xls', sheet_name='sum')


if __name__ == '__main__':
    sql = input('Enter the SQL query (press Enter when done): ')
    excel1 = input('Enter the file name for the detail list (without extension): ')
    excel2 = input('Enter the file name for the summary list (without extension): ')
    # The *** parts are the database server address, user name and password
    sqlconn = pyodbc.connect("DRIVER={SQL SERVER};SERVER=***\\sql;DATABASE=zln_data;UID=***;PWD=***")
    sqlcursor = sqlconn.cursor()
    jd = pd.read_sql_query(sql, con=sqlconn)
    wordcut_fun(wordcut='\n'.join(jd['jd']), exceldetail=excel1, excelsum=excel2, sqlconn=sqlconn)
--------------------------------------------------------------------------------
/spider:
--------------------------------------------------------------------------------
python spider
--------------------------------------------------------------------------------
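One more sketch, this time for the frequency count in python34/wordcut: the hand-rolled dict tally can also be written with collections.Counter while keeping the same rules (jieba segmentation, lower-cased tokens of at least two characters). count_words is a hypothetical helper, the sample string is made up for illustration, and jieba is assumed to be installed.

# Sketch: token counting with collections.Counter, same filtering rules as python34/wordcut.
from collections import Counter
import jieba

def count_words(text):
    tokens = []
    for line in text.split('\n'):
        # keep lower-cased tokens that are at least two characters long
        tokens.extend(tok.lower() for tok in jieba.cut(line) if len(tok) >= 2)
    return Counter(tokens)

if __name__ == '__main__':
    sample = '熟悉数据挖掘和机器学习\n熟悉 Python 和 SQL'  # made-up sample text
    for word, freq in count_words(sample).most_common():
        print(word, freq)

Counter.most_common() returns the (token, count) pairs already sorted by descending frequency, which is the same ordering the sorted(...) call in the script produces.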