├── .gitattributes ├── 12306火车票 ├── .idea │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ ├── vip-12306.iml │ └── workspace.xml ├── 12306.py ├── __pycache__ │ ├── cons.cpython-36.pyc │ ├── cons.cpython-37.pyc │ ├── settings.cpython-36.pyc │ └── settings.cpython-37.pyc ├── captcha.jpg ├── cons.py ├── login.py └── settings.py ├── 51_job ├── README.md ├── clean_data │ ├── csv_clean_data.py │ ├── job_company │ │ └── 大数据公司类型图饼图.jpg │ ├── job_company_workyears_pic.py │ ├── job_pic.py │ ├── job_pic │ │ └── examples.jpg │ ├── test.py │ └── wordscloud.py └── get_data │ └── 51job_toCsv.py ├── Analysis_Wechat_Friends └── Analysis_Wechat.py ├── Baidu_Address ├── README.md ├── baidu_address.py ├── company.csv └── image │ └── smaple.PNG ├── Baidu_Music ├── baidu_music.py ├── baidu_music2.py └── wangyi_music.py ├── ChuanZhi_Class ├── result │ └── ts.txt ├── scrapy.cfg └── ts │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── items.cpython-37.pyc │ ├── pipelines.cpython-37.pyc │ └── settings.cpython-37.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── lesson.cpython-37.pyc │ └── lesson.py ├── DangDang_Books ├── README.md ├── analysis.py ├── dangdang │ ├── dangdang │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── items.cpython-37.pyc │ │ │ ├── pipelines.cpython-37.pyc │ │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── dd.cpython-37.pyc │ │ │ └── dd.py │ └── scrapy.cfg ├── ddSpider.py ├── pictureWall.py └── stopwords.txt ├── DouBan_Movie ├── 1.png ├── get_douban_comment3.py ├── ip.txt ├── ippools.py └── pic │ ├── 动物世界.png │ └── 巴斯特·斯克鲁格斯的歌谣.png ├── DouYou ├── README.md ├── douyu.csv └── test.py ├── LaGou ├── README.md ├── lagou1.csv ├── machine_learning_hz_job2.csv ├── 动态爬取.py └── 静态爬取.py ├── LianJia ├── README.md ├── cleaned.csv ├── group_by.py ├── housedata1.csv ├── housedata2.csv └── test_threading.py ├── Meituan ├── __init__.py ├── first.py ├── get_cookie.py ├── meituan.py └── mtwm.py ├── Movie_maoyan ├── WPS网盘.lnk ├── maoyan.csv ├── maoyan.py ├── readme.md ├── result.txt └── txt.py ├── Movie_tiantang ├── dytt.csv ├── readme.md └── spider_dytt.py ├── Photo_Position_GoldenAPI ├── .DS_Store ├── .idea │ ├── inspectionProfiles │ │ └── Project_Default.xml │ ├── misc.xml │ ├── modules.xml │ ├── vcs.xml │ ├── workspace.xml │ └── 地理位置.iml ├── __pycache__ │ └── position_utils.cpython-37.pyc ├── main.py ├── picture │ ├── .DS_Store │ └── 20190828185021.jpg └── position_utils.py ├── Photo_qiantu ├── ip.txt ├── qiantu.photo │ └── simple_show.PNG └── qiantu.py ├── Photo_taobao ├── ip.txt ├── taobao_photo.py └── taobao_photo │ └── simple_show.PNG ├── QiDian_Story ├── add_txt.py ├── binaries.txt ├── get_xiaoshuo.py └── 凡人修仙之仙界篇 │ ├── A目录.txt │ ├── 仙界篇外传一.txt │ ├── 仙界篇外传二.txt │ ├── 第一章 狐女.txt │ ├── 第三章 远去.txt │ ├── 第二章 石头哥哥.txt │ ├── 第五章 马兽.txt │ ├── 第六章 白袍少年.txt │ └── 第四章 相依.txt ├── Qsbk ├── duanzi.josn ├── qsbk │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── pipelines.cpython-37.pyc │ │ └── settings.cpython-37.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── qsbk_spider.cpython-37.pyc │ │ └── 
qsbk_spider.py ├── qsbk_start.py └── scrapy.cfg ├── README.md ├── Sina_topic_spider ├── README.md ├── age-pie.html ├── age_bar.html ├── area.html ├── gender.html ├── sina_topic.csv ├── sina_topic_data_analysis.py ├── sina_topic_spider.py ├── stop_words.txt └── word_cloud.html ├── WangYi_Music ├── geci.py ├── music.csv ├── wangyiyun.py └── 歌词 │ ├── 你若成风.txt │ ├── 全球变冷.txt │ ├── 内线.txt │ ├── 千古.txt │ ├── 千百度.txt │ ├── 城府.txt │ ├── 多余的解释.txt │ ├── 大千世界.txt │ ├── 天龙八部之宿敌.txt │ ├── 如果当时.txt │ ├── 幻听.txt │ ├── 庐州月.txt │ ├── 惊鸿一面.txt │ ├── 想象之中.txt │ ├── 我想牵着你的手.txt │ ├── 拆东墙.txt │ ├── 断桥残雪.txt │ ├── 明智之举.txt │ ├── 星座书上.txt │ ├── 有何不可.txt │ ├── 江湖 .txt │ ├── 河山大好.txt │ ├── 清明雨上.txt │ ├── 灰色头像.txt │ ├── 玫瑰花的葬礼.txt │ ├── 素颜.txt │ ├── 认错.txt │ ├── 违章动物.txt │ └── 雅俗共赏.txt ├── coffee.png ├── dangdang_book ├── README.md ├── dangdang_book │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── dd_book.py └── scrapy.cfg ├── ele_me ├── 1.png ├── README.md ├── __init__.py ├── eleme_bar.png ├── eleme_wordcloud.png ├── elemedata.csv ├── fooddic.txt └── run.py ├── finance.eastmoney.com ├── README.md ├── __init__.py ├── 可还债 │ ├── __init__.py │ ├── id20200424.csv │ └── zhaunzhai.py └── 股票 │ ├── gupiao.py │ ├── id.csv │ └── result_20200423.csv ├── live.bible.is.com ├── README.md └── live.bible.is.py ├── minority_language ├── jike.py ├── jike2.py └── saier.py ├── reward.jpg ├── taobao ├── README.md ├── taobao.josn ├── taobao_food.sql ├── taobao_food_Mongodb.py ├── taobao_food_analysis.py ├── taobao_food_mysql.py └── test.py ├── utils └── crawlerHelper.py ├── yingjieshneg.com ├── 2020-04-20_company.csv ├── README.md └── yingjieshneg.py ├── yixuela.com ├── README.md └── poetry.py ├── 微博热搜 ├── 人物.xlsx ├── 名词.xlsx ├── 婚恋.xlsx └── 热搜.py ├── 爬取中彩网彩票 ├── 3D.xls └── test_CaiPiao.py ├── 高考志愿网 ├── README.md ├── gkzy.py └── gkzy2.py └── 高考网 ├── 211高校排行.html ├── 985高校排行.html ├── analyse.py ├── college_data.csv ├── main.py ├── readme.md ├── 北京上海江苏高质量高校占比.html ├── 北京高质量高校占比.html ├── 占比前十城市高质量高校占比.html ├── 各地区高校数量段位图.html ├── 各城市高校数量.html ├── 各城市高质量高校数量.html ├── 高校分布热力图.html ├── 高校属性分析pie.html ├── 高校数量前十名.html ├── 高校数量后十名.html ├── 高校类型分析pie.html ├── 高质量高校分布热力图.html └── 高质量高校分布热力图map.html /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | *.js linguist-language=python 3 | *.css linguist-language=python 4 | *.html linguist-language=python 5 | *.sql linguist-language=python 6 | *.csv linguist-language=python 7 | *.txt linguist-language=python 8 | *.json linguist-language=python -------------------------------------------------------------------------------- /12306火车票/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /12306火车票/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /12306火车票/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /12306火车票/.idea/vip-12306.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | 
-------------------------------------------------------------------------------- /12306火车票/12306.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | __author__ = '强子' 4 | import requests 5 | from settings import * 6 | import cons 7 | 8 | dict_station = {} 9 | for i in cons.station.split('@'): 10 | tmp_list = i.split('|') 11 | #print(tmp_list) 12 | if len(tmp_list) > 2: 13 | dict_station[tmp_list[1]] = tmp_list[2] 14 | print(dict_station) 15 | 16 | from_station = dict_station[FROM_STATION] 17 | to_station = dict_station[TO_STATION] 18 | print(from_station,to_station) 19 | headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'} 20 | 21 | def queryTicket():#query_ticket 22 | url='https://kyfw.12306.cn/otn/leftTicket/queryX?leftTicketDTO.train_date='+TRAIN_DATE+'&leftTicketDTO.from_station='+from_station+'&leftTicketDTO.to_station='+to_station+'&purpose_codes=ADULT' 23 | print(url) 24 | response = requests.get(url=url,headers=headers,verify=False) 25 | result = response.json() 26 | print(result['data']['result']) 27 | print(TRAIN_DATE,FROM_STATION,TO_STATION) 28 | print('车次 '+' 座位 '+' 有无票'+' 票数') 29 | return result['data']['result'] 30 | 31 | n = 0 32 | ''' 33 | 23 = 软卧 34 | 28 = 硬卧 35 | 3 = 车次 36 | 29=硬座 37 | ''' 38 | 39 | for i in queryTicket(): 40 | tmp_list = i.split('|') 41 | #for ii in tmp_list: 42 | # print(n) 43 | # print(ii) 44 | # n += 1 45 | set = tmp_list[29] 46 | set1 = tmp_list[23] 47 | if set == '' or set == '无': 48 | print(tmp_list[3],'硬座 '+'无票',tmp_list[29]) 49 | 50 | else: 51 | print(tmp_list[3],'硬座 '+'有票',tmp_list[29]) 52 | #下单 -------------------------------------------------------------------------------- /12306火车票/__pycache__/cons.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/12306火车票/__pycache__/cons.cpython-36.pyc -------------------------------------------------------------------------------- /12306火车票/__pycache__/cons.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/12306火车票/__pycache__/cons.cpython-37.pyc -------------------------------------------------------------------------------- /12306火车票/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/12306火车票/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /12306火车票/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/12306火车票/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /12306火车票/captcha.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/12306火车票/captcha.jpg -------------------------------------------------------------------------------- 
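Note on 12306.py above: the seat-availability check relies on magic indices into the '|'-separated rows returned by the leftTicket query (3 = 车次, 23 = 软卧, 28 = 硬卧, 29 = 硬座) and only inspects hard seats. Below is a minimal, self-contained sketch that pulls those fields out by name; the index layout is taken from the comment block in 12306.py and is an undocumented 12306 detail that may change at any time.

# Sketch only: field positions copied from the comment in 12306.py; not an official API contract.
SEAT_INDEX = {'硬座': 29, '硬卧': 28, '软卧': 23}

def parse_ticket_row(row):
    """Turn one '|'-separated leftTicket row into a small dict of named fields."""
    fields = row.split('|')
    info = {'车次': fields[3]}
    for seat, idx in SEAT_INDEX.items():
        value = fields[idx] if idx < len(fields) else ''
        info[seat] = '无票' if value in ('', '无') else value
    return info

if __name__ == '__main__':
    # Synthetic row for illustration only; real rows come from queryTicket() in 12306.py.
    demo = [''] * 40
    demo[3], demo[23], demo[28], demo[29] = 'K180', '有', '无', '5'
    print(parse_ticket_row('|'.join(demo)))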
/12306火车票/login.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | # author:心蓝 4 | import requests 5 | 6 | """ 7 | 实现12306网站的登录 8 | """ 9 | map = { 10 | '1': '37,42', 11 | '2': '111,42', 12 | '3': '180,42', 13 | '4': '254,42', 14 | '5': '37,118', 15 | '6': '111,118', 16 | '7': '180,118', 17 | '8': '254,118', 18 | } 19 | 20 | 21 | def get_point(indexs): 22 | """ 23 | 根据输入的序号获取相应的坐标 24 | :param indexs: 1,2 25 | :return: 26 | """ 27 | indexs = indexs.split(',') 28 | temp = [] 29 | for index in indexs: 30 | temp.append(map[index]) 31 | return ','.join(temp) 32 | 33 | 34 | # cookie 保持 浏览器 35 | session = requests.Session() 36 | 37 | # 伪装 38 | headers = { 39 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' 40 | } 41 | session.headers.update(headers) 42 | # 1.访问登录页面 43 | login_url = 'https://kyfw.12306.cn/otn/login/init' 44 | session.get(login_url) 45 | 46 | # 2.下载验证码图片 47 | captcha_url = 'https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand&0.5846169880733507' 48 | captcha_response = session.get(captcha_url) 49 | 50 | with open('captcha.jpg', 'wb') as f: 51 | f.write(captcha_response.content) 52 | 53 | # 3.校验验证码 54 | check_captcha_url = 'https://kyfw.12306.cn/passport/captcha/captcha-check' 55 | form_data = { 56 | 'answer': get_point(input('请输入正确的序号>>>:')), 57 | 'login_site': 'E', 58 | 'rand': 'sjrand' 59 | } 60 | check_response = session.post(check_captcha_url, data=form_data) 61 | #print(check_response.json()) 62 | if check_response.json()['result_code'] == '4': #'result_message': '验证码校验成功', 'result_code': '4' 63 | # 校验成功 64 | # 4.校验用户名和密码 65 | login_url = 'https://kyfw.12306.cn/passport/web/login' 66 | form_data = { 67 | 'username': '你的账号', 68 | 'password': '你的密码', 69 | 'appid': 'otn' 70 | } 71 | login_response = session.post(login_url, data=form_data) 72 | print(login_response.json()) 73 | #'result_message': '登录成功', 'result_code': 0, 'uamtk': '0YeWhGwOquOICVxAQZz0NxXSX6a_0AJcOBG6zfDMNsolm1210' 74 | if login_response.json()['result_code'] == 0: 75 | 76 | # 5.获取 权限 token 77 | uamtk_url = 'https://kyfw.12306.cn/passport/web/auth/uamtk' 78 | uamtk_response = session.post(uamtk_url, data={'appid': 'otn'}) 79 | #print(uamtk_response.json()) 80 | #'result_message': '验证通过', 'result_code': 0, 'apptk': None, 'newapptk': '-oTvBp0Sfb_LwV6irTcmGcf9jtyO5W_xykRJNL2t4Gk511210' 81 | if uamtk_response.json()['result_code'] == 0: 82 | # 6.校验token 83 | check_token_url = 'https://kyfw.12306.cn/otn/uamauthclient' 84 | check_token_response = session.post(check_token_url, data={'tk': uamtk_response.json()['newapptk']}) 85 | print(check_token_response.json()) 86 | 87 | -------------------------------------------------------------------------------- /12306火车票/settings.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | __author__ = '强子' 4 | 5 | TRAIN_DATE = '2019-06-11' 6 | FROM_STATION = '新乡' 7 | TO_STATION = '南阳' 8 | SET = 23 -------------------------------------------------------------------------------- /51_job/README.md: -------------------------------------------------------------------------------- 1 |  2 | - 内容: 爬取51job前程无忧简关于数据分析的职位信息,并对获取的数据进行数据清洗与分析,如各城市招聘岗位数、薪资与各城市工作地点数量,关系,学历,经验要求等关系、公司类型与对应岗位数、职位要求等可视化。 3 | 4 | - 对应CSDN文章:《爬取51job前程无忧简历](https://blog.csdn.net/weixin_43746433/article/details/90490227)》 5 | 6 | 
- 数据下载:链接:https://pan.baidu.com/s/1j-4HQduESyl2hm7-c3mTlg 7 | 提取码:rf0e 8 | 9 | 10 | - 微信:why19970628 11 | 12 | - 欢迎与我交流 13 | -------------------------------------------------------------------------------- /51_job/clean_data/job_company/大数据公司类型图饼图.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/51_job/clean_data/job_company/大数据公司类型图饼图.jpg -------------------------------------------------------------------------------- /51_job/clean_data/job_pic.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | data=pd.read_csv('test_datasets_finally.csv',delimiter='#',header=0) 3 | df=pd.DataFrame(data) 4 | print(df.shape) 5 | print(df.index) 6 | print(df.loc[:,'area'].nunique())#地区数 7 | area=df.loc[:,'area'].value_counts() 8 | print(area.shape) 9 | print(area.head()) 10 | print(area.index) 11 | area2=area.values.tolist() 12 | area=area.reset_index() 13 | print(area.head()) 14 | area1=area.loc[:,'index'].tolist() 15 | print('地区',area1) 16 | print('数量',area2) 17 | 18 | from pyecharts import Bar 19 | from pyecharts import Geo 20 | from pyecharts import Map 21 | map = Map("大数据工作分布图", "data from 51job",title_color="#404a59", title_pos="center") 22 | map.add("", area1,area2 , maptype='china',is_visualmap=True,visual_text_color='#000',is_label_show=True) 23 | map.render("./job_pic/大数据工作城市分布.html") 24 | map.render(path='snapshot.png') 25 | #map.render(path='snapshot.pdf') 26 | 27 | #effectScatter heatmap 28 | geo = Geo("大数据工作分布热力图", "data from 51job", title_color="#fff", title_pos="center", width=1200, height=600, background_color='#404a59') 29 | geo.add("大数据工作分布热力图", area1, area2, visual_range=[0, 35], type='heatmap',visual_text_color="#fff", symbol_size=15, is_visualmap=True, is_roam=False) 30 | geo.render('./job_pic/大数据工作分布热力图.html') 31 | 32 | geo = Geo("大数据工作分布城市评分", "data from 51job", title_color="#fff", title_pos="center", width=1200, height=600, background_color='#404a59') 33 | # type="effectScatter", is_random=True, effect_scale=5 使点具有发散性 34 | geo.add("空气质量评分", area1, area2,maptype='china', type="effectScatter", is_random=True, effect_scale=5, visual_range=[0, 5],visual_text_color="#fff", symbol_size=10, is_visualmap=True, is_roam=False) 35 | geo.render("./job_pic/大数据工作分布城市评分.html") 36 | 37 | 38 | #from pyecharts.charts import Geo 39 | #map = Map("全国地图示例" ) 40 | #map.add("", area, maptype='china' ,visual_text_color="#fff",symbol_size=10, is_visualmap=True) 41 | #map.render("全国大数据工作城市.html") 42 | #map -------------------------------------------------------------------------------- /51_job/clean_data/job_pic/examples.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/51_job/clean_data/job_pic/examples.jpg -------------------------------------------------------------------------------- /51_job/clean_data/wordscloud.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import jieba, re 3 | from scipy.misc import imread 4 | from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS 5 | import matplotlib.pyplot as plt 6 | data = pd.read_csv('test_datasets_finally.csv',delimiter='#') # 读取Excel转为dabaframe 7 | df = pd.DataFrame(data) 8 | print('去掉空值前有{}行'.format(df.shape[0])) # 获得一共有多少行 9 | file1 = 
df.loc[:,'describe'].dropna(how='any') # 去掉空值 10 | print('去掉空值后有{}行'.format(file1.shape[0])) # 获得一共有多少行 11 | print(file1.head()) 12 | text1 = ''.join(i for i in file1) # 把所有字符串连接成一个长文本 13 | responsibility = re.sub(re.compile(',|;|\.|、|。'), '', text1) # 去掉逗号等符号 14 | wordlist1 = " ".join(jieba.cut(responsibility, cut_all=True)) # 分析岗位职责 15 | # wordlist1=" ".join(jieba.cut(requirement,cut_all=True))#分析岗位要求 16 | font_path = r'C:\Windows\Fonts\simkai.ttf' 17 | stopwords = list(STOPWORDS) + ['数据', '分析', '负责', '相关', '公司', '进行', '工作','岗位', 18 | '岗位职责','上学','互联网','以上','以上学历','任职','要求'] +\ 19 | ['数据分析','以上学历','优先','计算','经验','学历','上学','熟练','使用']#分析岗位要求 20 | #bgimg=imread(r'1.png')#设置背景图片 21 | wc = WordCloud(font_path=font_path, # 设置字体 22 | background_color="black", # 背景颜色 23 | max_words=1000, # 词云显示的最大词数 24 | stopwords=stopwords, # 设置停用词 25 | max_font_size=300, # 字体最大值 26 | #mask=bgimg, # 设置背景图片 27 | random_state=42, # 设置有多少种随机生成状态,即有多少种配色 28 | width=1200, height=860, 29 | margin=4, # 设置图片默认的大小,margin为词语边缘距离 30 | ).generate(str(wordlist1)) 31 | #image_colors = ImageColorGenerator(bgimg) # 根据图片生成词云颜色 32 | plt.imshow(wc) 33 | plt.axis("off") 34 | plt.savefig("./job_pic/examples1.jpg") # 必须在plt.show之前,不是图片空白 35 | plt.show() 36 | -------------------------------------------------------------------------------- /Analysis_Wechat_Friends/Analysis_Wechat.py: -------------------------------------------------------------------------------- 1 | import itchat 2 | import pandas as pd 3 | from pyecharts import Pie, Map, Page, Bar 4 | 5 | 6 | # 根据key值得到对应的信息 7 | def get_key_info(friends_info, key): 8 | return list(map(lambda friend_info: friend_info.get(key), friends_info)) 9 | 10 | 11 | # 获得所需的微信好友信息 12 | def get_friends_info(): 13 | itchat.auto_login(hotReload=True) 14 | friends = itchat.get_friends() 15 | print('~~~~~~~~~~~~~~~~~~~~~~~~~') 16 | friends_info = dict( 17 | # 省份 18 | province = get_key_info(friends, "Province"), 19 | # 城市 20 | city = get_key_info(friends, "City"), 21 | # 昵称 22 | nickname = get_key_info(friends, "Nickname"), 23 | # 性别 24 | sex = get_key_info(friends, "Sex"), 25 | # 签名 26 | signature = get_key_info(friends, "Signature"), 27 | # 备注 28 | remarkname = get_key_info(friends, "RemarkName"), 29 | # 用户名拼音全拼 30 | pyquanpin = get_key_info(friends, "PYQuanPin") 31 | ) 32 | return friends_info 33 | 34 | 35 | # 性别分析 36 | def analysisSex(): 37 | friends_info = get_friends_info() 38 | df = pd.DataFrame(friends_info) 39 | print(df) 40 | sex_count = df.groupby(['sex'], as_index=True)['sex'].count() 41 | print(sex_count) 42 | temp = dict(zip(list(sex_count.index), list(sex_count))) 43 | print(temp) 44 | data = {} 45 | data['保密'] = temp.pop(0) 46 | data['男'] = temp.pop(1) 47 | data['女'] = temp.pop(2) 48 | # 画图 49 | page = Page() 50 | attr, value = data.keys(), data.values() 51 | chart = Pie('微信好友性别比') 52 | chart.add('', attr, value, center=[50, 50], 53 | redius=[30, 70], is_label_show=True, legend_orient='horizontal', legend_pos='center', 54 | legend_top='bottom', is_area_show=True) 55 | page.add(chart) 56 | page.render('analysisSex.html') 57 | 58 | 59 | # 省份分析 60 | def analysisProvince(): 61 | friends_info = get_friends_info() 62 | df = pd.DataFrame(friends_info) 63 | province_count = df.groupby('province', as_index=True)['province'].count().sort_values() 64 | temp = list(map(lambda x: x if x != '' else '未知', list(province_count.index))) 65 | # 画图 66 | page = Page() 67 | # style = Style(width=1100, height=600) 68 | # style_middle = Style(width=900, height=500) 69 | attr, value = temp, 
list(province_count) 70 | chart1 = Map('好友分布(中国地图)')#, **style.init_style 71 | chart1.add('', attr, value, is_label_show=True, is_visualmap=True, visual_text_color='#000') 72 | page.add(chart1) 73 | chart2 = Bar('好友分布柱状图')#, **style_middle.init_style 74 | chart2.add('', attr, value, is_stack=True, is_convert=True, 75 | label_pos='inside', is_legend_show=True, is_label_show=True) 76 | page.add(chart2) 77 | page.render('analysisProvince.html') 78 | 79 | 80 | # 具体省份分析 81 | def analysisCity(province): 82 | friends_info = get_friends_info() 83 | df = pd.DataFrame(friends_info) 84 | temp1 = df.query('province == "%s"' % province) 85 | city_count = temp1.groupby('city', as_index=True)['city'].count().sort_values() 86 | attr = list(map(lambda x: '%s市' % x if x != '' else '未知', list(city_count.index))) 87 | value = list(city_count) 88 | # 画图 89 | page = Page() 90 | # style = Style(width=1100, height=600) 91 | # style_middle = Style(width=900, height=500) 92 | chart1 = Map('%s好友分布' % province)#, **style.init_style 93 | chart1.add('', attr, value, maptype='%s' % province, is_label_show=True, 94 | is_visualmap=True, visual_text_color='#000') 95 | page.add(chart1) 96 | chart2 = Bar('%s好友分布柱状图' % province)#, **style_middle.init_style 97 | chart2.add('', attr, value, is_stack=True, is_convert=True, label_pos='inside', is_label_show=True) 98 | page.add(chart2) 99 | page.render('analysisCity.html') 100 | 101 | if __name__ == '__main__': 102 | analysisSex() 103 | analysisProvince() 104 | analysisCity("河南") 105 | 106 | -------------------------------------------------------------------------------- /Baidu_Address/README.md: -------------------------------------------------------------------------------- 1 | 爬虫小程序 2 | 3 | ![Image text](https://github.com/why19970628/Python_Crawler/tree/master/Baidu_Address/image/smaple.PNG) 4 | 5 | 6 | 爬取百度地图的某位置的公司信息,包括公司名称,公司地址等 7 | 8 | csv文件大约几十条数据 9 | 10 | 博客地址:https://blog.csdn.net/weixin_43746433 11 | 12 | 微信:why19970628 13 | 14 | 欢迎与我交流 15 | -------------------------------------------------------------------------------- /Baidu_Address/baidu_address.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.support.ui import WebDriverWait 3 | import pandas as pd 4 | import time 5 | from time import sleep 6 | import csv 7 | chrome_driver = r"D:\ProgramData\Anaconda3\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe" 8 | browser = webdriver.Chrome(executable_path=chrome_driver) 9 | wait=WebDriverWait(browser,3) 10 | browser.get('https://map.baidu.com/search/%E6%96%B0%E4%B9%A1%E4%BA%92%E8%81%94%E7%BD%91%E5%A4%A7%E5%8E%A6%E9%83%BD%E6%9C%89%E5%93%AA%E4%BA%9B%E5%85%AC%E5%8F%B8/@12683385.160376176,4180157.68,19z?querytype=s&da_src=shareurl&wd=%E6%96%B0%E4%B9%A1%E4%BA%92%E8%81%94%E7%BD%91%E5%A4%A7%E5%8E%A6%E9%83%BD%E6%9C%89%E5%93%AA%E4%BA%9B%E5%85%AC%E5%8F%B8&c=152&src=0&pn=0&sug=0&l=19&b=(12682905.160376176,4179893.43;12683865.160376176,4180421.93)&from=webmap&biz_forward=%7B%22scaler%22:1,%22styles%22:%22pl%22%7D&device_ratio=1') 11 | sleep(3) 12 | 13 | def search(writer): 14 | for i in range(10): 15 | 16 | company_names = browser.find_elements_by_xpath('//div[@class="ml_30 mr_90"]/div[@class="row"]/span/a') 17 | print(len(company_names)) 18 | 19 | company_addresses = browser.find_elements_by_xpath('//div[@class="ml_30 mr_90"]/div[@class="row addr"]/span') 20 | print(len(company_addresses)) 21 | 22 | 23 | # ipone_lists=[] 24 | # try: 25 | # 
ipones=browser.find_elements_by_xpath('//div[@class="ml_30 mr_90"]/div[@class="row tel"]')#电话 26 | # for i in ipones: 27 | # ipone_lists.append(ipones[i]) 28 | # except: 29 | # ipone_lists.append('无') 30 | # if browser.find_elements_by_xpath('//div[@class="row tel"]'): 31 | # company_iphones r= browser.find_elements_by_xpath('//div[@class="ml_30 mr_90"]/div[@class="row tel"]') 32 | # for i in range(len(company_iphones)): 33 | # ipone_lists.append(company_iphones[i].text) 34 | # ipone_lists.append('无') 35 | # print(ipone_lists) 36 | # print(len(ipone_lists)) 37 | 38 | for i in range(len(company_names)): 39 | company_name = company_names[i].text 40 | 41 | company_address = company_addresses[i].text 42 | 43 | print(company_name, company_address) 44 | # ipone_list=ipone_lists[i] 45 | 46 | writer.writerow([company_name, company_address]) 47 | 48 | browser.find_element_by_xpath('//div[@id="poi_page"]/p/span/a[@tid="toNextPage"]').click() 49 | sleep(5) 50 | 51 | 52 | def main(): 53 | fp = open('company.csv', 'w', newline='', encoding="utf_8_sig") 54 | writer = csv.writer(fp) 55 | writer.writerow(['公司名称', '地址', '电话']) 56 | search(writer) 57 | print('Over !!!!') 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /Baidu_Address/company.csv: -------------------------------------------------------------------------------- 1 | 公司名称,地址,电话 2 | 互联网大厦,河南省新乡市红旗区新中大道(中) 3 | 新乡市嘟嘟网络技术有限公司,洪门镇金穗大道新中大道交叉口互联网大厦8楼 4 | 嘉亿国际新闻大厦,新乡市红旗区洪门镇红旗区互联网大厦西 5 | 叁河鼎盛公司,洪门镇金穗大道与新中大道交叉口互联网大厦六楼607 6 | 饿了么公司,洪门镇金穗大道(中)互联网大厦30层 7 | 新乡搜狗运营中心,河南省新乡市红旗区金穗大道互联网大厦6楼 8 | 新乡天驰网络科技有限公司,河南省新乡市红旗区金穗大道互联网大厦702室 9 | 万达物流公司,新中大道互联网大厦22楼 10 | 大胜商贸有限公司,互联网大厦商业步行街131号 11 | 万达仓储公司,洪门镇金穗大道互联网大厦2202 12 | 银谷普惠公司,新乡市红旗区开发区街道互联网大厦1801 13 | 平安公司,推测位置 14 | 互联网大厦-南1门,河南省新乡市红旗区洪门镇互联网大厦西嘉亿国际新闻大厦平安公司 15 | 新乡市动之力广告有限公司,推测位置 16 | 新利净化技术有限公司,新中大道与金穗大道交叉口西100米 17 | 新乡市百特智能转运设备有限公司,金穗大道东嘉亿互联网大厦1804室 18 | 新乡市中誉鼎力软件科技有限公司,新乡市红旗区互联网大厦3601室 19 | 庭雅商贸公司,新乡市互联网大厦701 20 | 新乡市彩纶纸业有限公司,洪门镇金穗大道与新中大道互联网大厦31楼3110室 21 | 指南者网络科技有限公司,推测位置 22 | 新乡搜狗运营中心,河南省新乡市红旗区金穗大道互联网大厦6楼 23 | 嘉德建筑工程有限公司,新乡市红旗区新中大道和金穗大道交叉口新乡市互联网大厦1103室 24 | 博洋翻译,推测位置 25 | 河南省乐境通电子商务有限公司,洪门镇金穗大道东嘉亿互联网大厦509 26 | 丰时商贸有限公司,新乡市红旗区金穗大道互联网大厦1910室 27 | 酷雷曼河南省运营中心,河南省新乡市红旗区金穗大道互联网大厦 28 | 河南一棵树电子商务有限公司,新乡市金穗大道与新中大道交叉口西北角嘉亿互联网大厦1601室 29 | 郑州一如既往软件科技有限公司,新中大道与金穗大道西北角互联网大厦 30 | 互联网大厦-地下停车场,河南省新乡市红旗区金穗大道东互联网大厦1505 31 | 新乡十里红妆婚庆公司,新乡市红旗区金穗大道与新中大道交叉口西北角互联网大厦1层 32 | 仁真装饰,新中大道与金穗大道交叉口互联网大厦2002 33 | 捷润科技,推测位置 34 | 互联网大厦-B座,洪门镇宝龙广场对面互联网大厦10层1011室 35 | 互联网大厦-北门,河南省新乡市红旗区金穗大道与新中大道交汇处 36 | 平安普惠投资咨询有限公司新乡新飞大道分公司,新乡市红旗区新中大道(中)附近 37 | 新乡市深鹏装饰工程有限公司,金穗大道嘉亿东方明珠32楼 38 | 新乡市惠民天下畜牧设备有限公司,新乡市红旗区嘉亿国际新闻大厦25层 39 | 互联网大厦停车场-出入口,河南省新乡市红旗区嘉亿街 40 | 互联网大厦停车场-出口,河南省新乡市红旗区金穗大道东 41 | 互联网大厦-西门,金穗大道(东)北100米 42 | 互联网大厦停车场-入口,河南省新乡市红旗区新中大道(中)辅路 43 | 新乡市福运到家互联网科技股份有限公司,平原路蓝钻国际 44 | 拉扎斯网络科技(上海)有限公司新乡分公司,洪门镇金穗大道与新中大道路口嘉亿东方明珠30层 45 | 汇益互联网服务有限公司,河南省新乡市红旗区金穗大道东辅路 46 | 广州金不换财务咨询有限公司新乡分公司,河南省新乡市红旗区嘉亿新闻大厦1905室 47 | 恒大人寿(新乡中心支公司),河南省新乡市红旗区金穗大道新闻大厦22楼 48 | 新乡市园林绿化工程有限公司,新乡市金穗大道与新中大道交叉口嘉亿新闻大厦27层 49 | 互联网大厦-南2门,金穗大道(东)北50米 50 | 河南恒东商贸有限公司,河南省新乡市红旗区金穗大道新闻大厦2611、2612室 51 | 中国建设银行24小时自助银行(金穂大道支行),新乡市红旗区金穗大道与新中大道交叉口西北角互联网大厦1层附近 52 | 北京高幂数据科技有限公司河南分公司,新乡市红旗区嘉亿新闻大厦2601 53 | 河南金友互联网技术有限公司,淘宝城7楼 54 | 康宝莱公司,洪门镇新中大道与金穗大道交叉口嘉亿东方明珠16楼 55 | 河南纳澜电器有限公司,河南省新乡市红旗区嘉亿国际新闻大厦河南纳澜电器有限公司 56 | 车邦(深圳)互联网金融服务有限公司,河南省新乡市红旗区牧野大道南段电子信息科技园 57 | 积木家互联网装修(新乡体验中心),新二街与金穗大道(东)交叉口东北100米靖业摩尔151附近 58 | 河南御乐坊酒店有限公司,河南省新乡市红旗区新中大道海利川蜀香火锅东北100米 59 | EOTO视觉摄影工作室,新乡市红旗区金穗大道东互联网大厦2810 60 | 
中煤润邦机械装备股份有限公司,新乡市新中大道与金穗大道西北角嘉亿东方明珠13层1303房 61 | 河南省帕菲特搬运设备有限公司,河南省新乡市金穗大道与新中大道交叉口西北角嘉亿东方明珠701室 62 | 幸福之家互联网家装资源平台,河南省新乡市红旗区向阳路296号 63 | 爱美得美发工作室,金穗大道与新中大道西北角互联网大厦商业裙房104室 64 | 嘉亿东方明珠中国民生银行,河南省新乡市红旗区互联网大厦嘉亿东方明珠中国民生银行 65 | 恒富电子,河南省新乡市红旗区新中大道与新飞大道交叉口嘉亿新闻大厦恒富电子 66 | 黄金时代游泳健身俱乐部,推测位置 67 | 八马茶业(互联网大厦北),金穗大道东互联网大厦负一楼 68 | 塞纳春天互联网家装(黄岗分店),河南省新乡市红旗区嘉亿·明珠商业街130 69 | 四季康道养生会馆,河南省新乡市牧野区西华大道佳煌不锈钢西南侧10米 70 | 雅藏文化,新乡市红旗区洪门镇互联网大厦17楼1705室 71 | 丰泽堂老银铺,推测位置 72 | 独角兽花店,河南省新乡市红旗区互联网大厦裙房红咖啡一楼 73 | 3Q便利店(嘉亿店),新中大道嘉亿互联网大厦北红咖啡一楼 74 | 街电(青禾馅饼嘉亿店),新中大道与金穗大道交叉口嘉亿互联网大厦1308室 75 | 森怡康新乡运营中心,推测位置 76 | 东方明珠-停车场,新乡市红旗区金穗大道和新中大道交叉口西北角互联网大厦商业街134号 77 | 丁丁贷,河南省新乡市新中大道与金穗大道交叉口互联网大厦15楼 78 | 善林金融,新乡市红旗区金穗大道与新中大道交叉口西北角互联网大厦1层附近 79 | 东方明珠停车场-出口,金穗大道与新中大道交叉口西北角互联网大厦1层附近 80 | 嘉亿·东方明珠-地下停车场,金穗大道与新中大道交叉口西北角互联网大厦1层 81 | -------------------------------------------------------------------------------- /Baidu_Address/image/smaple.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/Baidu_Address/image/smaple.PNG -------------------------------------------------------------------------------- /Baidu_Music/baidu_music.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re,json,pprint,os 3 | #url='http://zhangmenshiting.qianqian.com/data2/music/a612909cdafecf20933bd2942c43421c/596603939/596603939.mp3?xcode=10263e95dfecc6e6f4316fffb8ff8771' 4 | def download_music(songid): 5 | url='http://musicapi.taihe.com/v1/restserver/ting?method=baidu.ting.song.playAAC&format=jsonp&callback=jQuery17208091693203165158_1545207385401&songid='+songid+'&from=web&_=1545207388641' 6 | url2='href="http://music.163.com/song/media/outer/url?id=317151.mp3' 7 | response=requests.get(url) 8 | data=json.loads(re.findall("{.*}",response.text)[0]) 9 | music_name=data['songinfo']['title'] 10 | artist=data['songinfo']['artist'] 11 | music_url=data['bitrate']['file_link'] 12 | #pprint.pprint(data) 13 | return music_name,music_url,artist 14 | #music_name,music_url=download_music('265715650') 15 | #print(music_name,music_url,) 16 | def get_songid(artist_id): 17 | for i in range(0, 41, 20): 18 | reponse=requests.get(url="http://music.taihe.com/artist/"+artist_id) 19 | #print(reponse.text)#ctrl+u 查看网页源代码 20 | songids=re.findall('{"id":"(.*)","kr_top"',reponse.text) 21 | return songids 22 | def save_music(music_name,music_url,artist): 23 | music_res = requests.get(music_url) 24 | try: 25 | folder = os.path.exists(artist) 26 | 27 | if not folder: # 判断是否存在文件夹如果不存在则创建为文件夹 28 | os.makedirs(artist) # makedirs 创建文件时如果路径不存在会创建这个路径 29 | print 30 | "--- new folder... ---" 31 | print 32 | "--- OK ---" 33 | 34 | else: 35 | print 36 | "--- There is this folder! 
---" 37 | 38 | file = "D:/软件(学习)/Python/TanZhou/百度音乐/" + artist + '/' + music_name + ".mp3" 39 | with open(file,'wb') as f: 40 | f.write(music_res.content) 41 | except: 42 | print('下载失败') 43 | def run(): 44 | artist_id=input('请输入网易歌手ID:') 45 | singids=get_songid(artist_id) 46 | for songid in singids: 47 | music_name, music_url,artist=download_music(songid) 48 | save_music(music_name, music_url,artist) 49 | print(music_name + " 下载完成") 50 | run() 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /Baidu_Music/baidu_music2.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re,json,pprint,os 3 | from urllib import request 4 | import urllib 5 | from lxml import etree 6 | header = { 7 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'} 8 | #url='http://zhangmenshiting.qianqian.com/data2/music/a612909cdafecf20933bd2942c43421c/596603939/596603939.mp3?xcode=10263e95dfecc6e6f4316fffb8ff8771' 9 | def download_music(songid): 10 | url='http://musicapi.taihe.com/v1/restserver/ting?method=baidu.ting.song.playAAC&format=jsonp&callback=jQuery17208091693203165158_1545207385401&songid='+songid+'&from=web&_=1545207388641' 11 | url2='href="http://music.163.com/song/media/outer/url?id=317151.mp3' 12 | response=requests.get(url) 13 | data=json.loads(re.findall("{.*}",response.text)[0]) 14 | music_name=data['songinfo']['title'] 15 | artist=data['songinfo']['artist'] 16 | music_url=data['bitrate']['file_link'] 17 | #pprint.pprint(data) 18 | return music_name,music_url,artist 19 | #music_name,music_url=download_music('265715650') 20 | #print(music_name,music_url,) 21 | 22 | def get_songid(artist_id): 23 | song_id = urllib.request.quote(artist_id) 24 | songid=[] 25 | for i in range(0, 21, 20): 26 | url = "http://music.taihe.com/search/song?s=1&key="+song_id+"&jump=0&start="+str(i)+"&size=20&third_type=0" 27 | print(url) 28 | req = request.Request(url,headers=header) 29 | html = request.urlopen(req).read().decode('utf-8') 30 | #songids=re.findall('data-playdata="(.*)"moduleName"',html) 31 | songids=re.findall('"sid":(.*),"author":',html) 32 | #print(songids) 33 | html = etree.HTML(html) 34 | songid=songid+songids 35 | song_num = html.xpath('//ul[@class="tab-list"]/li/a[@class="list"]/text()')[0] 36 | #print(song_num) 37 | #print(songid) 38 | return songid,song_num 39 | #get_songid('薛之谦') 40 | def save_music(music_name,music_url,artist): 41 | music_res = requests.get(music_url) 42 | try: 43 | folder = os.path.exists(artist) 44 | 45 | if not folder: # 判断是否存在文件夹如果不存在则创建为文件夹 46 | os.makedirs(artist) # makedirs 创建文件时如果路径不存在会创建这个路径 47 | print 48 | "--- new folder... ---" 49 | print 50 | "--- OK ---" 51 | 52 | else: 53 | print 54 | "--- There is this folder! 
---" 55 | 56 | file = "D:/软件(学习)/Python/TanZhou/百度音乐/" + artist + '/' + music_name + ".mp3" 57 | with open(file,'wb') as f: 58 | f.write(music_res.content) 59 | except: 60 | print('下载失败') 61 | def run(): 62 | artist_id=input('请输入网易歌手名字:') 63 | singids=get_songid(artist_id)[0] 64 | #print(singids) 65 | songmun=get_songid(artist_id)[1] 66 | print(songmun) 67 | for songid in singids: 68 | music_name, music_url,artist=download_music(songid) 69 | save_music(music_name, music_url,artist) 70 | print(music_name + " 下载完成") 71 | run() 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /Baidu_Music/wangyi_music.py: -------------------------------------------------------------------------------- 1 | # import urllib 2 | from urllib import request 3 | # import requests 4 | # url2='http://music.163.com/song/media/outer/url?id=423776423.mp3' 5 | # print(url2) 6 | # urllib.request.urlretrieve(url2,'3.mp3') 7 | # music_res = requests.get(url2) 8 | # with open('4.mp3','wb') as f: 9 | # f.write(music_res.content) 10 | # print('成功') 11 | 12 | import requests 13 | from bs4 import BeautifulSoup 14 | import urllib.request 15 | import os 16 | import re 17 | 18 | headers = { 19 | 'Referer': 'http://music.163.com/', 20 | 'Host': 'music.163.com', 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36', 22 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 23 | } 24 | 25 | # 歌单的url地址 26 | play_url = 'http://music.163.com/playlist?id=2182968685' 27 | 28 | # 获取页面内容 29 | s = requests.session() 30 | response = s.get(play_url, headers=headers).content 31 | 32 | # 使用bs4匹配出对应的歌曲名称和地址 33 | s = BeautifulSoup(response, 'lxml') 34 | main = s.find('ul', {'class': 'f-hide'}) 35 | pat='data-rid="(.*?)"' 36 | singerid=re.compile(pat).findall(str(s)) 37 | id=singerid[0] 38 | print(singerid[0]) 39 | 40 | lists = [] 41 | for music in main.find_all('a'): 42 | list = [] 43 | # print('{} : {}'.format(music.text, music['href'])) 44 | musicUrl = 'http://music.163.com/song/media/outer/url' + music['href'][5:] + '.mp3' 45 | musicName = music.text 46 | # 单首歌曲的名字和地址放在list列表中 47 | list.append(musicName) 48 | list.append(musicUrl) 49 | # 全部歌曲信息放在lists列表中 50 | lists.append(list) 51 | 52 | print(lists) 53 | 54 | # 下载列表中的全部歌曲,并以歌曲名命名下载后的文件,文件位置为当前文件夹 55 | for i in lists: 56 | url = i[1] 57 | name = i[0] 58 | try: 59 | folder = os.path.exists(id) 60 | 61 | if not folder: # 判断是否存在文件夹如果不存在则创建为文件夹 62 | os.makedirs(id) # makedirs 创建文件时如果路径不存在会创建这个路径 63 | print 64 | "--- new folder... ---" 65 | print 66 | "--- OK ---" 67 | 68 | else: 69 | print 70 | "--- There is this folder! 
---" 71 | print('正在下载', name) 72 | file="D:/软件(学习)/Python/TanZhou/百度音乐/"+id+'/'+name+".mp3" 73 | urllib.request.urlretrieve(url, file) 74 | print('下载成功') 75 | except: 76 | print('下载失败') -------------------------------------------------------------------------------- /ChuanZhi_Class/result/ts.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ChuanZhi_Class/result/ts.txt -------------------------------------------------------------------------------- /ChuanZhi_Class/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = ts.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = ts 12 | -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ChuanZhi_Class/ts/__init__.py -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ChuanZhi_Class/ts/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ChuanZhi_Class/ts/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ChuanZhi_Class/ts/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ChuanZhi_Class/ts/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title=scrapy.Field() 15 | link=scrapy.Field() 16 | stu=scrapy.Field() 17 | -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/middlewares.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class TsDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TsPipeline(object): 10 | def __init__(self): 11 | self.fh=open("../result/ts.txt","a") 12 | 13 | def process_item(self, item, spider): 14 | print(item['title']) 15 | print(item['link']) 16 | print(item['stu']) 17 | print('~~~~~~') 18 | self.fh.write(item['title'][0]+"\n"+item['link'][0]+"\n"+item['stu'][0]+"\n"+"~~~~~~~"+"\n") 19 | return item 20 | 21 | def close_spider(self): 22 | self.fh.close() -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for ts project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'ts' 13 | 14 | SPIDER_MODULES = ['ts.spiders'] 15 | NEWSPIDER_MODULE = 'ts.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'ts (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'ts.middlewares.TsSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'ts.middlewares.TsDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 
'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'ts.pipelines.TsPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ChuanZhi_Class/ts/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/spiders/__pycache__/lesson.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ChuanZhi_Class/ts/spiders/__pycache__/lesson.cpython-37.pyc -------------------------------------------------------------------------------- /ChuanZhi_Class/ts/spiders/lesson.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from ts.items import TsItem 4 | from scrapy.http import Request 5 | 6 | 7 | class LessonSpider(scrapy.Spider): 8 | name = 'lesson' 9 | #allowed_domains = ['hellobi.com'] 10 | #start_urls = ['https://edu.hellobi.com/course/1'] 11 | #def parse(self, response): 12 | # item=TsItem() 13 | # #item['title']=response.xpath("//ol[@class='breadcrumb']/li[@class='active']/text()").extract() 14 | # #item['link'] = response.xpath("//ul[@class='nav nav-tabs']/li[@class='active']/a/@href").extract() 15 | # #item['stu'] = response.xpath("//span[@class='course-view']/text()").extract() 16 | # yield item 17 | # for i in range(2,121): 18 | # url='https://edu.hellobi.com/course/'+str(i) 19 | # yield Request(url,callback=self.parse) 20 | allowed_domains = ['douban.com'] 21 | start_urls = ['https://movie.douban.com/subject/27615441/'] 22 | def parse(self, response): 23 | item=TsItem() 24 | item['stu'] = 
response.xpath("//h3/span[@class='comment-info']/a/@href=").extract() 25 | print(item['stu']) 26 | yield item 27 | -------------------------------------------------------------------------------- /DangDang_Books/README.md: -------------------------------------------------------------------------------- 1 | 爬虫:爬取当当网图书信息, 书名,书图,价格,简介,评分,评论数量。 2 | 3 | dangdang文件夹为利用scrapy框架爬取图书信息 4 | 5 | 数据分析:对其进行简单的数据分析,如图书评论数量分布的漏斗图,价格分布的柱状图等等 6 | 7 | python_61.pkl文件大约1000条数据 8 | 9 | 博客地址:https://blog.csdn.net/weixin_43746433 10 | 11 | CSDN文章地址::https://blog.csdn.net/weixin_43746433/article/details/100059191 12 | 13 | 微信:why19970628 14 | 15 | 欢迎与我交流 16 | -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/DangDang_Books/dangdang/dangdang/__init__.py -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/DangDang_Books/dangdang/dangdang/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/DangDang_Books/dangdang/dangdang/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/DangDang_Books/dangdang/dangdang/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/DangDang_Books/dangdang/dangdang/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DangdangItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title=scrapy.Field() 15 | link=scrapy.Field() 16 | comment=scrapy.Field() 17 | 18 | -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See 
documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DangdangSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class DangdangDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DangdangPipeline(object): 10 | def process_item(self, item, spider): 11 | #for i in range(0,len(item['link'])): 12 | # title=item['title'][i] 13 | # link=item['link'][i] 14 | # comment=item['comment'][i] 15 | # print(title) 16 | # print(link) 17 | # print(comment) 18 | # print('') 19 | 20 | return item 21 | -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dangdang project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dangdang' 13 | 14 | SPIDER_MODULES = ['dangdang.spiders'] 15 | NEWSPIDER_MODULE = 'dangdang.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'dangdang (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'dangdang.middlewares.DangdangSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'dangdang.middlewares.DangdangDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 
'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'dangdang.pipelines.DangdangPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/DangDang_Books/dangdang/dangdang/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/spiders/__pycache__/dd.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/DangDang_Books/dangdang/dangdang/spiders/__pycache__/dd.cpython-37.pyc -------------------------------------------------------------------------------- /DangDang_Books/dangdang/dangdang/spiders/dd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from dangdang.items import DangdangItem 4 | from scrapy.http import Request 5 | 6 | 7 | class DdSpider(scrapy.Spider): 8 | name = 'dd' 9 | allowed_domains = ['dangdang.com'] 10 | start_urls = ['http://dangdang.com/'] 11 | 12 | def parse(self, response): 13 | item=DangdangItem() 14 | item['title']=response.xpath("//a[@class='pic']/@title").extract 15 | item['link'] = response.xpath("//a[@class='pic']/@href").extract 16 | item['comment'] = response.xpath("//a[@class='search_comment_num']/text()").extract 17 | print(item['title']) 18 | print(item['link']) 19 | print(item['comment']) 20 | yield item 21 | 22 | #for i in range(1,5): 23 | # url='http://search.dangdang.com/?key=%B3%CC%D0%F2%C9%E8%BC%C6&act=input&page_index'+str(i) 24 | # yield Request(url,callback=self.parse) 25 | 26 | 
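Note on dd.py above: each `response.xpath(...).extract` is missing the call parentheses, so `item['title']`, `item['link']` and `item['comment']` hold bound methods rather than lists of strings; `start_urls` also points at the bare homepage, while the pagination loop (which additionally drops the `=` after `page_index`) is commented out. A minimal corrected sketch, reusing the same XPath expressions and the search URL from that commented-out loop; the spider name and page range below are illustrative, not part of the project:

import scrapy
from scrapy.http import Request
from dangdang.items import DangdangItem


class DdFixedSpider(scrapy.Spider):
    # Illustrative name; the original spider is called 'dd'.
    name = 'dd_fixed'
    allowed_domains = ['dangdang.com']
    # Search-results URL taken from the commented-out loop in dd.py, with '=' added after page_index.
    start_urls = ['http://search.dangdang.com/?key=%B3%CC%D0%F2%C9%E8%BC%C6&act=input&page_index=1']

    def parse(self, response):
        item = DangdangItem()
        # .extract() is actually called here, so each field becomes a list of strings.
        item['title'] = response.xpath("//a[@class='pic']/@title").extract()
        item['link'] = response.xpath("//a[@class='pic']/@href").extract()
        item['comment'] = response.xpath("//a[@class='search_comment_num']/text()").extract()
        yield item
        # Follow a few more result pages (range chosen only for illustration).
        for i in range(2, 5):
            url = 'http://search.dangdang.com/?key=%B3%CC%D0%F2%C9%E8%BC%C6&act=input&page_index=' + str(i)
            yield Request(url, callback=self.parse)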
-------------------------------------------------------------------------------- /DangDang_Books/dangdang/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = dangdang.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dangdang 12 | -------------------------------------------------------------------------------- /DangDang_Books/ddSpider.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import random 4 | import requests 5 | from bs4 import BeautifulSoup 6 | 7 | 8 | headers = { 9 | 'Upgrade-Insecure-Requests': '1', 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36', 11 | 'Accept-Encoding': 'gzip, deflate', 12 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 13 | 'Cache-Control': 'no-cache', 14 | 'Connection': 'keep-alive', 15 | 'Host': 'search.dangdang.com' 16 | } 17 | 18 | 19 | 20 | '''解析, 提取需要的数据''' 21 | def parseHtml(html): 22 | data = {} 23 | soup = BeautifulSoup(html, 'lxml') 24 | conshoplist = soup.find_all('div', {'class': 'con shoplist'})[0] 25 | for each in conshoplist.find_all('li'): 26 | # 书名 27 | bookname = each.find_all('a')[0].get('title').strip(' ') 28 | # 书图 29 | img_src = each.find_all('a')[0].img.get('data-original') 30 | if img_src is None: 31 | img_src = each.find_all('a')[0].img.get('src') 32 | img_src = img_src.strip(' ') 33 | # 价格 34 | price = float(each.find_all('p', {'class': 'price'})[0].span.text[1:]) 35 | # 简介 36 | detail = each.find_all('p', {'class': 'detail'})[0].text 37 | # 评分 38 | stars = float(each.find_all('p', {'class': 'search_star_line'})[0].span.span.get('style').split(': ')[-1].strip('%;')) / 20 39 | # 评论数量 40 | num_comments = float(each.find_all('p', {'class': 'search_star_line'})[0].a.text[:-3]) 41 | data[bookname] = [img_src, price, detail, stars, num_comments] 42 | return data 43 | 44 | 45 | '''主函数''' 46 | def main(keyword): 47 | url = 'http://search.dangdang.com/?key={}&act=input&page_index={}' 48 | results = {} 49 | num_page = 0 50 | while True: 51 | num_page += 1 52 | print('[INFO]: Start to get the data of page%d...' 
% num_page) 53 | page_url = url.format(keyword, num_page) 54 | res = requests.get(page_url, headers=headers) 55 | if '抱歉,没有找到与“%s”相关的商品,建议适当减少筛选条件' % keyword in res.text: 56 | break 57 | page_data = parseHtml(res.text) 58 | results.update(page_data) 59 | time.sleep(random.random() + 0.5) 60 | with open('%s_%d.pkl' % (keyword, num_page-1), 'wb') as f: 61 | pickle.dump(results, f) 62 | return results 63 | 64 | 65 | if __name__ == '__main__': 66 | main('python') -------------------------------------------------------------------------------- /DangDang_Books/pictureWall.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import time 4 | import math 5 | import pickle 6 | import requests 7 | from PIL import Image 8 | 9 | 10 | PICDIR = 'pictures' 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36', 13 | } 14 | 15 | 16 | '''图片下载''' 17 | def downloadPics(urls, savedir): 18 | if not os.path.exists(savedir): 19 | os.mkdir(savedir) 20 | for idx, url in enumerate(urls): 21 | res = requests.get(url, headers=headers) 22 | with open(os.path.join(savedir, '%d.jpg' % idx), 'wb') as f: 23 | f.write(res.content) 24 | time.sleep(0.5) 25 | 26 | 27 | '''制作照片墙''' 28 | def makePicturesWall(picdir): 29 | picslist = os.listdir(picdir) 30 | num_pics = len(picslist) 31 | print('照片数量',num_pics) 32 | size = 64 33 | line_numpics = int(math.sqrt(num_pics))#正方形 34 | picwall = Image.new('RGBA', (line_numpics*size, line_numpics*size)) 35 | x = 0 36 | y = 0 37 | for pic in picslist: 38 | img = Image.open(os.path.join(picdir, pic)) 39 | img = img.resize((size, size), Image.ANTIALIAS) #改变图片尺寸 40 | picwall.paste(img, (x*size, y*size)) #合并图片 41 | x += 1 42 | if x == line_numpics: 43 | x = 0 44 | y += 1 45 | print('[INFO]: Generate pictures wall successfully...') 46 | picwall.save("picwall.png") #保存图片 47 | 48 | 49 | if __name__ == '__main__': 50 | with open('python_61.pkl', 'rb') as f: 51 | data = pickle.load(f) 52 | urls = [j[0] for i, j in data.items()] #加载图片下载 url 53 | # downloadPics(urls, PICDIR) 54 | makePicturesWall(PICDIR) -------------------------------------------------------------------------------- /DouBan_Movie/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/DouBan_Movie/1.png -------------------------------------------------------------------------------- /DouBan_Movie/ip.txt: -------------------------------------------------------------------------------- 1 | 218.17.139.5:808 2 | 203.86.26.9:3128 3 | 60.191.134.164:9999 4 | 116.62.4.184:8118 5 | 122.227.139.170:3128 6 | 203.93.209.163:53281 7 | 123.7.61.8:53281 8 | 219.234.5.128:3128 9 | 183.47.40.35:8088 10 | 59.78.2.140:1080 11 | 61.128.208.94:3128 12 | 113.108.242.36:47713 13 | 115.28.209.249:3128 14 | 113.108.242.36:47713 15 | 14.20.235.114:9797 16 | 221.210.120.153:54402 17 | 36.110.14.186:3128 18 | 59.53.134.202:808 19 | 221.6.201.18:9999 20 | 124.237.83.14:53281 21 | 124.243.226.18:8888 22 | 58.215.140.6:8080 23 | 211.99.26.183:808 24 | 112.250.109.173:53281 25 | 218.60.8.83:3129 26 | 218.60.8.99:3129 27 | 119.51.89.18:1080 28 | 61.128.208.94:3128 29 | 59.53.134.202:808 30 | 114.116.10.21:3128 31 | 202.112.237.102:3128 32 | 58.215.140.6:8080 33 | 106.15.42.179:33543 34 | 61.145.182.27:53281 35 | 115.148.173.121:808 36 | 171.37.156.39:8123 37 | 123.13.245.51:9999 38 | 
218.60.8.83:3129 39 | 218.60.8.99:3129 40 | 61.183.233.6:54896 41 | 114.116.10.21:3128 42 | 202.112.237.102:3128 43 | 101.37.79.125:3128 44 | 59.53.137.116:808 45 | 221.7.255.167:8080 46 | 124.152.32.140:53281 47 | 221.7.255.168:8080 48 | 112.115.57.20:3128 49 | 61.183.233.6:54896 50 | -------------------------------------------------------------------------------- /DouBan_Movie/pic/动物世界.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/DouBan_Movie/pic/动物世界.png -------------------------------------------------------------------------------- /DouBan_Movie/pic/巴斯特·斯克鲁格斯的歌谣.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/DouBan_Movie/pic/巴斯特·斯克鲁格斯的歌谣.png -------------------------------------------------------------------------------- /DouYou/README.md: -------------------------------------------------------------------------------- 1 | 爬取斗鱼网所有主播的类别,房间标题,房间ID,主播名称,热度。 2 | 3 | csv文件大约15000条数据 4 | 5 | 博客地址:https://blog.csdn.net/weixin_43746433 6 | 7 | 微信:why19970628 8 | 9 | 欢迎与我交流 10 | -------------------------------------------------------------------------------- /LaGou/README.md: -------------------------------------------------------------------------------- 1 | 爬取拉勾网的职位的信息,分为静态和动态网页,生成csv文件。 2 | 3 | 博客地址:https://blog.csdn.net/weixin_43746433 4 | 5 | 爬虫详情:https://blog.csdn.net/weixin_43746433/article/details/94398440 6 | 7 | 微信:why19970628 8 | 9 | 欢迎与我交流 10 | -------------------------------------------------------------------------------- /LaGou/动态爬取.py: -------------------------------------------------------------------------------- 1 | # import pandas as pd 2 | # data=pd.read_csv('cleaned.csv') 3 | # data=pd.DataFrame(data) 4 | # area=data.groupby(by='area',axis=0).mean()['price'] 5 | # area=area 6 | # 7 | # #print(data.loc[:,'price'].mean()) 8 | # #area=data.groupby(by='area')['price'] 9 | # print(area) 10 | import requests 11 | from lxml import etree 12 | import pandas as pd 13 | from time import sleep 14 | import random 15 | 16 | # cookie 17 | cookie = '你的cookie' 18 | # headers 19 | headers = { 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', 21 | # 'cookie':cookie 22 | } 23 | 24 | # sleep(random.randint(3, 10)) 25 | #url = 'https://www.lagou.com/zhaopin/' 26 | #res = requests.get(url, headers=headers) 27 | #print(res.text) 28 | 29 | #print('正在抓取第{}页...'.format(i), url) 30 | # 查看网页结构循环页数进行采集 31 | for i in range(1, 2): 32 | sleep(random.randint(3, 10)) 33 | url = 'https://www.lagou.com/zhaopin/jiqixuexi/{}/?filterOption=3'.format(i) 34 | res =requests.get(url,headers = headers) 35 | #print(res.text) 36 | print('正在抓取第{}页...'.format(i), url) 37 | # 请求网页并解析 38 | con = etree.HTML(requests.get(url=url, headers=headers).text) 39 | # 使用xpath表达式抽取各目标字段 40 | job_name = [i for i in con.xpath("//a[@class='position_link']/h3/text()")] 41 | job_address = [i for i in con.xpath("//a[@class='position_link']/span/em/text()")] 42 | job_company = [i for i in con.xpath("//div[@class='company_name']/a/text()")] 43 | job_salary = [i for i in con.xpath("//span[@class='money']/text()")] 44 | job_exp_edu = [i for i in con.xpath("//div[@class='li_b_l']/text()")] 45 | job_exp_edu2 = [i for i in [i.strip() for i in job_exp_edu] if i != ''] 46 | 
job_industry = [i.strip() for i in con.xpath("//div[@class='industry']/text()")] 47 | job_tempation = [i for i in con.xpath("//div[@class='list_item_bot']/div[@class='li_b_r']/text()")] 48 | job_links = [i for i in con.xpath("//div[@class='p_top']/a/@href")] 49 | print(job_links) 50 | 51 | # 获取详情页链接后采集详情页岗位描述信息 52 | job_des =[] 53 | for link in job_links: 54 | sleep(random.randint(3, 10)) 55 | print('link:',link) 56 | con2 = etree.HTML(requests.get(url=link, headers=headers).text) 57 | #print(con) 58 | des = [[i for i in con2.xpath("//dd[@class='job_bt']/div/p/text()")]] 59 | job_des += des 60 | #print(job_des) 61 | break #遍历一次 62 | 63 | # 对数据进行字典封装 64 | dataset = { 65 | '岗位名称': job_name, 66 | '工作地址': job_address, 67 | '公司': job_company, 68 | '薪资': job_salary, 69 | '经验学历': job_exp_edu2, 70 | '所属行业': job_industry, 71 | '岗位福利': job_tempation, 72 | '任职要求': job_des 73 | } 74 | 75 | # 转化为数据框并存为csv 76 | data = pd.DataFrame(dataset) 77 | data.to_csv('machine_learning_hz_job2.csv') 78 | -------------------------------------------------------------------------------- /LianJia/README.md: -------------------------------------------------------------------------------- 1 | 爬取链家的北京所有小区的信息,生成csv文件。 2 | 3 | 博客地址:https://blog.csdn.net/weixin_43746433 4 | 5 | 爬虫详情:https://blog.csdn.net/weixin_43746433/article/details/95951341 6 | 7 | 微信:why19970628 8 | 9 | 欢迎与我交流 10 | -------------------------------------------------------------------------------- /LianJia/group_by.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | data1=pd.read_csv('housedata1.csv') 4 | print(data1.shape) 5 | data2=pd.read_csv('housedata2.csv') 6 | print(data2.shape) 7 | data=pd.concat([data1,data2],axis=0,ignore_index=False) 8 | print(data.head()) 9 | print(data.shape) 10 | data=pd.DataFrame(data) 11 | 12 | data=data.sort_values('area') 13 | data=data.reset_index() 14 | data=data.drop(labels='index',axis=1) 15 | print(data.head()) 16 | print(data.loc[:,'area'].value_counts()) 17 | for i,data['price'][i] in enumerate(data['price']): 18 | data['price'][i]=int(data['price'][i].replace('元/平','')) 19 | #print(i,data['price'][i]) 20 | print('changed_price\n',data['price'].head()) 21 | print(data.head()) 22 | 23 | print(type(data['price'][0])) 24 | data.to_csv('cleaned.csv') 25 | 26 | print(data.loc[:,'area'].value_counts()) 27 | print(data.describe()) 28 | 29 | area=data.groupby(by='area')['price'].mean() 30 | 31 | #print(data.loc[:,'price'].mean()) 32 | #area=data.groupby(by='area')['price'] 33 | print(area) -------------------------------------------------------------------------------- /Meituan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/Meituan/__init__.py -------------------------------------------------------------------------------- /Meituan/meituan.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import time 3 | import threading 4 | from get_cookie import get_cookie 5 | from get_cookie import parse 6 | 7 | 8 | def crow(n, l): # 参数n 区分第几个线程,l存储url的列表 9 | lock = threading.Lock() 10 | sym = 0 # 是否连续三次抓取失败的标志位 11 | pc = get_cookie() # 获取IP 和 Cookie 12 | m = 0 # 记录抓取的数量 13 | now = time.time() 14 | while True: 15 | if len(l) > 0: 16 | u = l.pop(0) 17 | ll = len(l) 18 | m += 1 19 | ttt = time.time() - now 20 | result = parse(u, pc, m, n, ll, ttt) 21 | mark 
= result[0] 22 | info = result[1] 23 | if mark == 2: 24 | time.sleep(1.5) 25 | result = parse(u, pc, m, n, ll, ttt) 26 | mark = result[0] 27 | info = result[1] 28 | if mark != 0: 29 | sym += 1 30 | if mark == 1: 31 | pc = get_cookie() 32 | result = parse(u, pc, m, n, ll, ttt) 33 | mark = result[0] 34 | info = result[1] 35 | if mark != 0: 36 | sym += 1 37 | if mark == 0: # 抓取成功 38 | sym = 0 39 | lock.acquire() 40 | with open('meituan.csv', 'a', newline='', encoding='gb18030')as f: 41 | write = csv.writer(f) 42 | write.writerow(info) 43 | f.close() 44 | lock.release() 45 | if sym > 2: # 连续三次抓取失败,换ip、cookie 46 | sym = 0 47 | pc = get_cookie() 48 | else: 49 | print('&&&&线程:%d结束' % n) 50 | break 51 | 52 | 53 | if __name__ == '__main__': 54 | url_list = [] 55 | with open('mt_id.csv', 'r', encoding='gb18030')as f: 56 | read = csv.reader(f) 57 | for line in read: 58 | d_list = ['', ''] 59 | url = 'https://meishi.meituan.com/i/poi/' + str(line[2]) + '?ct_poi=' + str(line[3]) 60 | d_list[0] = url 61 | d_list[1] = line[1] 62 | url_list.append(d_list) 63 | f.close() 64 | th_list = [] 65 | for i in range(1, 6): 66 | t = threading.Thread(target=crow, args=(i, url_list,)) 67 | print('*****线程%d开始启动...' % i) 68 | t.start() 69 | th_list.append(t) 70 | time.sleep(30) 71 | for t in th_list: 72 | t.join() 73 | -------------------------------------------------------------------------------- /Movie_maoyan/WPS网盘.lnk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/Movie_maoyan/WPS网盘.lnk -------------------------------------------------------------------------------- /Movie_maoyan/maoyan.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import requests 3 | import re 4 | import json 5 | from multiprocessing import Pool 6 | 7 | 8 | 9 | def get_one_page(url): 10 | headers = { 11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36' 12 | } 13 | response = requests.get(url, headers=headers) 14 | if response.status_code == 200: 15 | return response.text 16 | return None 17 | 18 | 19 | import pandas as pd 20 | 21 | def parse_one_page(html): 22 | pattern = re.compile( 23 | '
<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>
', 24 | re.S) 25 | items = re.findall(pattern, html) 26 | #print(items) 27 | content = [] 28 | for item in items: 29 | dataset = {} 30 | dataset['index']=item[0] 31 | print(dataset['index']) 32 | dataset['image']=item[1] 33 | dataset['title']=item[2].strip() 34 | dataset['actor']=item[3].strip()[3:] if len(item[3]) > 3 else '' 35 | dataset['time'] =item[4].strip()[5:] if len(item[4]) > 5 else '' 36 | dataset['score']=item[5].strip() + item[6].strip() 37 | content.append(dataset) 38 | return content 39 | 40 | 41 | 42 | def write_to_file(content): 43 | df = pd.DataFrame(content) 44 | #print(df.index) 45 | df.to_csv('maoyan.csv',index=False,mode='a+') 46 | 47 | def main(offset): 48 | url = 'http://maoyan.com/board/4?offset=' + str(offset) 49 | html = get_one_page(url) 50 | data=parse_one_page(html) 51 | write_to_file(data) 52 | 53 | import time 54 | if __name__ == '__main__': 55 | 56 | start=time.time() 57 | pool=Pool() 58 | 59 | pool.map(main,[i*10 for i in range(10)]) 60 | # for i in range(10): 61 | # main(offset=i * 10) 62 | 63 | print('花费时间:',time.time()-start) -------------------------------------------------------------------------------- /Movie_maoyan/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/Movie_maoyan/readme.md -------------------------------------------------------------------------------- /Movie_maoyan/result.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/Movie_maoyan/result.txt -------------------------------------------------------------------------------- /Movie_maoyan/txt.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import requests 3 | import re 4 | import json 5 | from multiprocessing import Pool 6 | 7 | 8 | def get_one_page(url): 9 | headers = { 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36' 11 | } 12 | response = requests.get(url, headers=headers) 13 | if response.status_code == 200: 14 | return response.text 15 | return None 16 | 17 | 18 | def parse_one_page(html): 19 | pattern = re.compile( 20 | '
<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>
', 21 | re.S) 22 | items = re.findall(pattern, html) 23 | for item in items: 24 | yield { 25 | 'index': item[0], 26 | 'image': item[1], 27 | 'title': item[2].strip(), 28 | 'actor': item[3].strip()[3:] if len(item[3]) > 3 else '', 29 | 'time': item[4].strip()[5:] if len(item[4]) > 5 else '', 30 | 'score': item[5].strip() + item[6].strip() 31 | } 32 | 33 | 34 | def write_to_file(content): 35 | with open('result.txt', 'a', encoding='utf-8') as f: 36 | f.write(json.dumps(content, ensure_ascii=False) + '\n') 37 | 38 | 39 | def main(offset): 40 | url = 'http://maoyan.com/board/4?offset=' + str(offset) 41 | html = get_one_page(url) 42 | for item in parse_one_page(html): 43 | write_to_file(item) 44 | 45 | 46 | if __name__ == '__main__': 47 | pool = Pool() 48 | pool.map(main, [i * 10 for i in range(10)]) -------------------------------------------------------------------------------- /Movie_tiantang/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/Movie_tiantang/readme.md -------------------------------------------------------------------------------- /Photo_Position_GoldenAPI/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/Photo_Position_GoldenAPI/.DS_Store -------------------------------------------------------------------------------- /Photo_Position_GoldenAPI/.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | -------------------------------------------------------------------------------- /Photo_Position_GoldenAPI/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /Photo_Position_GoldenAPI/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Photo_Position_GoldenAPI/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Photo_Position_GoldenAPI/.idea/地理位置.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /Photo_Position_GoldenAPI/__pycache__/position_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/Photo_Position_GoldenAPI/__pycache__/position_utils.cpython-37.pyc -------------------------------------------------------------------------------- /Photo_Position_GoldenAPI/picture/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/Photo_Position_GoldenAPI/picture/.DS_Store -------------------------------------------------------------------------------- /Photo_Position_GoldenAPI/picture/20190828185021.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/Photo_Position_GoldenAPI/picture/20190828185021.jpg -------------------------------------------------------------------------------- /Photo_Position_GoldenAPI/position_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: position_utils.py 12 | @time: 2019-08-23 17:44 13 | @description:坐标转换 14 | """ 15 | 16 | # -*- coding: utf-8 -*- 17 | import json 18 | import math 19 | 20 | x_pi = 3.14159265358979324 * 3000.0 / 180.0 21 | pi = 3.1415926535897932384626 # π 22 | a = 6378245.0 # 长半轴 23 | ee = 0.00669342162296594323 # 扁率 24 | 25 | 26 | def wgs84togcj02(lng, lat): 27 | """ 28 | WGS84转GCJ02(火星坐标系) 29 | :param lng:WGS84坐标系的经度 30 | :param lat:WGS84坐标系的纬度 31 | :return: 32 | """ 33 | if out_of_china(lng, lat): # 判断是否在国内 34 | return lng, lat 35 | dlat = transformlat(lng - 105.0, lat - 35.0) 36 | dlng = transformlng(lng - 105.0, lat - 35.0) 37 | radlat = lat / 180.0 * pi 38 | magic = math.sin(radlat) 39 | magic = 1 - ee * magic * magic 40 | sqrtmagic = math.sqrt(magic) 41 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi) 42 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi) 43 | mglat = lat + dlat 44 | mglng = lng + dlng 45 | return [mglng, mglat] 46 | 47 | 48 | def gcj02towgs84(lng, lat): 49 | """ 50 | GCJ02(火星坐标系)转GPS84 51 | :param lng:火星坐标系的经度 52 | :param lat:火星坐标系纬度 53 | :return: 54 | """ 55 | if out_of_china(lng, lat): 56 | return lng, lat 57 | dlat = transformlat(lng - 105.0, lat - 35.0) 58 | dlng = transformlng(lng - 105.0, lat - 35.0) 59 | radlat = lat / 180.0 * pi 60 | magic = math.sin(radlat) 61 | magic = 1 - ee * magic * magic 62 | sqrtmagic = math.sqrt(magic) 63 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi) 64 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi) 65 | mglat = lat + dlat 66 | mglng = lng + dlng 67 | return [lng * 2 - mglng, lat * 2 - mglat] 68 | 69 | 70 | def transformlat(lng, lat): 71 | ret = -100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat + \ 72 | 0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng)) 73 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * 74 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0 75 | ret += (20.0 * math.sin(lat * pi) + 40.0 * 76 | math.sin(lat / 3.0 * pi)) * 2.0 / 3.0 77 | ret += (160.0 * math.sin(lat / 12.0 * pi) + 320 * 78 | math.sin(lat * pi / 30.0)) * 2.0 / 3.0 79 | return ret 80 | 81 | 82 | def transformlng(lng, lat): 83 | ret = 300.0 + lng + 2.0 * lat + 0.1 * lng * lng + \ 84 | 0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng)) 85 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * 86 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0 87 | ret += (20.0 * math.sin(lng * pi) + 40.0 * 88 | math.sin(lng / 3.0 * pi)) * 2.0 / 3.0 89 | ret += (150.0 * math.sin(lng / 12.0 * pi) + 300.0 * 90 | math.sin(lng / 30.0 * pi)) * 2.0 / 3.0 91 | return ret 92 | 93 | 94 | def out_of_china(lng, lat): 95 | """ 96 | 判断是否在国内,不在国内不做偏移 97 | :param lng: 98 | :param lat: 99 | :return: 100 | """ 101 | if lng < 72.004 or lng > 137.8347: 102 | return True 103 | if lat < 0.8293 or lat > 55.8271: 104 | return True 105 | return False 106 | 
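A minimal usage sketch for the conversion helpers above (the coordinate is an arbitrary Beijing-area point, not data from this project). Since gcj02towgs84 is only an approximate inverse, the round-tripped value is close to, but not exactly, the input:

from position_utils import wgs84togcj02, gcj02towgs84

# Example WGS-84 point (illustrative); wgs84togcj02 applies the GCJ-02 ("Mars") offset.
lng, lat = 116.397428, 39.90923
gcj_lng, gcj_lat = wgs84togcj02(lng, lat)
# gcj02towgs84 approximately undoes the offset, so this lands near the original point.
back_lng, back_lat = gcj02towgs84(gcj_lng, gcj_lat)

print('GCJ-02:', gcj_lng, gcj_lat)
print('WGS-84 (round trip):', back_lng, back_lat)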
-------------------------------------------------------------------------------- /Photo_qiantu/ip.txt: -------------------------------------------------------------------------------- 1 | 113.116.245.211:9797 2 | 124.232.133.199:3128 3 | 113.116.245.211:9797 4 | 163.204.240.140:9999 5 | 1.197.203.240:9999 6 | 49.51.155.45:8081 7 | 163.204.241.198:9999 8 | 125.62.27.53:3128 9 | 61.128.208.94:3128 10 | 120.83.101.8:9999 11 | 121.233.251.11:9999 12 | 1.198.72.173:9999 13 | 110.172.221.241:8080 14 | 27.191.234.69:9999 15 | 116.196.90.181:3128 16 | 101.231.234.38:8080 17 | 110.172.221.241:8080 18 | 163.204.244.138:9999 19 | 121.69.46.177:9000 20 | 121.233.207.221:9999 21 | -------------------------------------------------------------------------------- /Photo_qiantu/qiantu.photo/simple_show.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/Photo_qiantu/qiantu.photo/simple_show.PNG -------------------------------------------------------------------------------- /Photo_qiantu/qiantu.py: -------------------------------------------------------------------------------- 1 | from urllib import request 2 | import urllib 3 | import random 4 | from urllib.error import URLError 5 | from urllib.request import ProxyHandler, build_opener 6 | import re 7 | def get_ip(): 8 | fr=open('ip.txt','r') 9 | ips=fr.readlines() 10 | new=[] 11 | for line in ips: 12 | temp=line.strip() 13 | new.append(temp) 14 | ip=random.choice(new) 15 | return ip 16 | print(ip) 17 | proxy =get_ip() 18 | proxy_handler = ProxyHandler({ 19 | 'http': 'http://' + proxy, 20 | 'https': 'https://' + proxy 21 | }) 22 | opener = build_opener(proxy_handler) 23 | import threading 24 | class One(threading.Thread): 25 | def __init__(self): 26 | threading.Thread.__init__(self) 27 | def run(self): 28 | try: 29 | for i in range(1,5,2): 30 | pageurl='http://www.58pic.com/piccate/3-0-0-p'+str(i)+'.html' 31 | data =urllib.request.urlopen(pageurl).read().decode('utf-8','ignore') 32 | pat='class="card-trait".*?src="(.*?).jpg!' 33 | image_url=re.compile(pat).findall(data) 34 | print('url个数',len(image_url)) 35 | for j in range(0,len(image_url)): 36 | try: 37 | this_list=image_url[j] 38 | this_url='https:'+this_list+'.jpg!w1024_0' 39 | file='D:/软件(学习)/Python/Test/chapter6/qiantu.photo/'+str(i)+str(j)+'.jpg' 40 | urllib.request.urlretrieve(this_url,file) 41 | print('第'+str(i)+'页第'+str(j)+'个图片成功') 42 | except urllib.error.URLError as e: 43 | print(e.reason) 44 | 45 | except URLError as e: 46 | print(e.reason) 47 | 48 | 49 | class Two(threading.Thread): 50 | def __init__(self): 51 | threading.Thread.__init__(self) 52 | 53 | def run(self): 54 | try: 55 | for i in range(2, 5, 2): 56 | pageurl = 'http://www.58pic.com/piccate/3-0-0-p'+str(i)+'.html' 57 | data = urllib.request.urlopen(pageurl).read().decode('utf-8', 'ignore') 58 | pat = 'class="card-trait".*?src="(.*?).jpg!' 
59 | image_url = re.compile(pat).findall(data) 60 | for j in range(0, len(image_url)): 61 | try: 62 | this_list = image_url[j] 63 | this_url = 'https:'+this_list + '.jpg!w1024_0' 64 | file = 'D:/软件(学习)/Python/Test/chapter6/qiantu.photo/' + str(i) + str(j) + '.jpg' 65 | urllib.request.urlretrieve(this_url, file) 66 | print('第' + str(i) + '页第' + str(j) + '个图片成功') 67 | except urllib.error.URLError as e: 68 | print(e.reason) 69 | 70 | except URLError as e: 71 | print(e.reason) 72 | one=One() 73 | one.start() 74 | two=Two() 75 | two.start() -------------------------------------------------------------------------------- /Photo_taobao/ip.txt: -------------------------------------------------------------------------------- 1 | 113.116.245.211:9797 2 | 124.232.133.199:3128 3 | 113.116.245.211:9797 4 | 163.204.240.140:9999 5 | 1.197.203.240:9999 6 | 49.51.155.45:8081 7 | 163.204.241.198:9999 8 | 125.62.27.53:3128 9 | 61.128.208.94:3128 10 | 120.83.101.8:9999 11 | 121.233.251.11:9999 12 | 1.198.72.173:9999 13 | 110.172.221.241:8080 14 | 27.191.234.69:9999 15 | 116.196.90.181:3128 16 | 101.231.234.38:8080 17 | 110.172.221.241:8080 18 | 163.204.244.138:9999 19 | 121.69.46.177:9000 20 | 121.233.207.221:9999 21 | -------------------------------------------------------------------------------- /Photo_taobao/taobao_photo.py: -------------------------------------------------------------------------------- 1 | from urllib import request 2 | import urllib 3 | import urllib.parse 4 | import re 5 | from urllib.error import URLError 6 | key='连衣裙' 7 | key=urllib.request.quote(key) 8 | headers=('user-agent',"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36") 9 | opener=urllib.request.build_opener() 10 | opener.addheaders=[headers] 11 | urllib.request.install_opener(opener) 12 | for i in range(1,20): 13 | url='https://re.taobao.com/search?spm=a231k.8165028.0782702702.204.60792e63WFZKub&prepvid=300_11.10.228.22_44360_1543657608665&extra=&keyword='+key+'&frontcatid=&isinner=1&refpid=420435_1006&page='+str(i)+'&rewriteKeyword&_input_charset=utf-8' 14 | print(url) 15 | data=urllib.request.urlopen(url).read().decode('utf-8','ignore') 16 | #pat=' 2 | 3 | 4 | 5 | Awesome-pyecharts 6 | 7 | 8 | 9 | 10 |
11 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /Sina_topic_spider/stop_words.txt: -------------------------------------------------------------------------------- 1 | 大家 2 | 打榜 -------------------------------------------------------------------------------- /WangYi_Music/geci.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import json 4 | import pandas as pd 5 | url='' 6 | headers={'user-agent': 7 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36' 8 | } 9 | def get_info(id): 10 | res=requests.get('http://music.163.com/api/song/lyric?id={}&lv=1&kv=1&tv=-1'.format(id),headers=headers) 11 | json_data=json.loads(res.text) 12 | lyric=json_data['lrc']['lyric'] 13 | lyric=re.sub('\[.*\]','',lyric) 14 | return str(lyric) 15 | def txt(): 16 | data=pd.read_csv('music.csv') 17 | for i in range(len(data['song_id'])): 18 | 19 | fp=open(r'歌词/{}.txt'.format(data['song'][i]),'w',encoding='utf-8') 20 | fp.write(get_info(data['song_id'][i])) 21 | fp.close() 22 | 23 | txt() 24 | -------------------------------------------------------------------------------- /WangYi_Music/music.csv: -------------------------------------------------------------------------------- 1 | song_id,song,singer,album 2 | 167876,有何不可,许嵩,《自定义》 3 | 167655,幻听,许嵩,《梦游计》 4 | 167827,素颜,许嵩,《素颜》 5 | 167850,庐州月,许嵩,《寻雾启示》 6 | 167844,灰色头像,许嵩,《寻雾启示》 7 | 27646687,玫瑰花的葬礼,许嵩,《许嵩单曲集》 8 | 167937,断桥残雪,许嵩,《断桥残雪》 9 | 28854182,惊鸿一面,许嵩,《不如吃茶去》 10 | 411214279,雅俗共赏,许嵩,《青年晚报》 11 | 167882,清明雨上,许嵩,《自定义》 12 | 428095913,江湖 ,许嵩,《江湖》 13 | 167712,拆东墙,许嵩,《苏格拉没有底》 14 | 167870,如果当时,许嵩,《自定义》 15 | 569213279,大千世界,许嵩,《寻宝游戏》 16 | 167903,我想牵着你的手,许嵩,《我想牵着你的手》 17 | 167732,千百度,许嵩,《苏格拉没有底》 18 | 167873,多余的解释,许嵩,《自定义》 19 | 167691,天龙八部之宿敌,许嵩,《天龙八部之宿敌》 20 | 5255987,你若成风,许嵩,《乐酷》 21 | 167709,河山大好,许嵩,《苏格拉没有底》 22 | 167891,内线,许嵩,《自定义》 23 | 167705,想象之中,许嵩,《苏格拉没有底》 24 | 167679,全球变冷,许嵩,《梦游计》 25 | 34040693,千古,许嵩,《千古》 26 | 167888,认错,许嵩,《自定义》 27 | 862099032,明智之举,许嵩,《寻宝游戏》 28 | 167894,星座书上,许嵩,《自定义》 29 | 27612225,违章动物,许嵩,《违章动物》 30 | 167885,城府,许嵩,《自定义》 31 | 167929,你若成风,许嵩,《许嵩单曲集》 32 | -------------------------------------------------------------------------------- /WangYi_Music/wangyiyun.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from lxml import etree 3 | import time 4 | import csv 5 | def get_info(url): 6 | chrome_driver=r"D:\Python\Anaconda\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe" 7 | driver=webdriver.Chrome(executable_path=chrome_driver) 8 | driver.maximize_window() 9 | driver.get(url) 10 | driver.implicitly_wait(10) 11 | iframe=driver.find_elements_by_tag_name('iframe')[0] 12 | driver.switch_to.frame(iframe) 13 | html=etree.HTML(driver.page_source) 14 | infos=html.xpath('//div[@class="srchsongst"]/div') 15 | for info in infos: 16 | song_id=info.xpath('div[2]/div/div/a/@href')[0].split('=')[-1] 17 | song=info.xpath('div[2]/div/div/a/b/text()')[0] 18 | singer1=info.xpath('div[4]/div/a')[0] 19 | singer=singer1.xpath('string(.)') 20 | album=info.xpath('div[5]/div/a/@title')[0] 21 | print(song_id,song,singer,album) 22 | writer.writerow([song_id,song,singer,album]) 23 | if __name__=='__main__': 24 | fp=open('music.csv','w',newline='',encoding='utf-8') 25 | writer=csv.writer(fp) 26 | writer.writerow(['song_id','song','singer','album']) 27 | 
url='https://music.163.com/#/search/m/?s=%E8%AE%B8%E5%B5%A9&type=1' 28 | get_info(url) -------------------------------------------------------------------------------- /WangYi_Music/歌词/你若成风.txt: -------------------------------------------------------------------------------- 1 | 2 | 作曲 : 许嵩 3 | 作词 : 许嵩 4 | 你若化成风 5 | 我幻化成雨 6 | 守护你身边 7 | 一笑为红颜 8 | 你若化成风 9 | 我幻化成雨 10 | 爱锁在眉间 11 | 似水往昔浮流年 12 | 乖乖 我的小乖乖 13 | 你的样子太可爱 14 | 追你的男生每个都超级厉害 15 | 我却在考虑怎么Say hi 16 | 害羞的我这样下去要怎么办 17 | 怎么办 爱情甜又酸 18 | 我不是Boss 19 | 没有超大的House 20 | 如果送你Rose 21 | 可不可以给我Chance 22 | 不想看时间这么一点一滴飞逝 23 | 老夫子带着假发 24 | 我不要三寸金莲胡话 25 | 想和你跳超短裙的恰恰 26 | 想带你回家见妈妈 27 | 你若化成风 28 | 我幻化成雨 29 | 守护你身边 30 | 一笑为红颜 31 | 你若化成风 32 | 我幻化成雨 33 | 爱锁在眉间 34 | 似水往昔浮流年 35 | 周末找个借口和你泛舟 36 | 一壶清酒 江水悠悠 我心悠悠 37 | 这感情Just for you 38 | 表面平静其实内心早已风起云涌 39 | 缘字诀 几番轮回 你锁眉 40 | 哎哟你的心情左右我的情绪 41 | 虽然有些问题真的很难搞定 42 | 我还是充满信心 43 | 老夫子带着假发 44 | 我不要三寸金莲胡话 45 | 想和你跳超短裙的恰恰 46 | 想带你回家见妈妈 47 | 你若化成风 48 | 我幻化成雨 49 | 守护你身边 50 | 一笑为红颜 51 | 你若化成风 52 | 我幻化成雨 53 | 爱锁在眉间 54 | 似水往昔浮流年 55 | 你千万不要装酷 56 | 呆的像大脑短路 57 | 我不收你的礼物 58 | 只想收一点点幸福 59 | 请领悟 60 | 请拿出速度奉我为公主 61 | 别磨蹭的像胖叔叔 62 | 有压力也要顶住 63 | 坚持自己的道路 64 | 真心去付出随时准备自我颠覆 65 | 这一首有点复古 66 | 不预示下首的套路 67 | 踩着Hip-Hop的鼓点陪你跳恰恰舞 68 | 嘟嘟嘟 69 | 嘟嘟嘟嘟嘟 70 | 嘟嘟嘟 71 | 嘟嘟嘟嘟嘟 72 | 嘟嘟嘟 73 | 嘟嘟嘟嘟嘟 74 | 嘟嘟嘟嘟嘟嘟嘟 75 | 嘟嘟嘟 76 | 嘟嘟嘟嘟嘟 77 | 嘟嘟嘟 78 | 嘟嘟嘟嘟嘟 79 | 嘟嘟嘟 80 | 嘟嘟嘟嘟嘟 81 | 嘟嘟嘟嘟嘟嘟嘟 82 | 你若化成风 83 | 我幻化成雨 84 | 守护你身边 85 | 一笑为红颜 86 | 你若化成风 87 | 我幻化成雨 88 | 爱锁在眉间 89 | 似水往昔浮流年 90 | 你若化成风 91 | 我幻化成雨 92 | 守护你身边 93 | 一笑为红颜 94 | 你若化成风 95 | 我幻化成雨 96 | 爱锁在眉间 97 | 似水往昔浮流年 98 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/全球变冷.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 风在淅淅沥沥的雨中 4 | 撑伞走过那路口 5 | 有人跌跌撞撞踩到我 6 | 没说对不起借过 7 | 表情无喜无悲的冷漠 8 | 是这座城市的符咒 9 | 每个人都害怕被看出内心的脆弱 10 | 看你懵懵懂懂的眼中 布满太多的困惑 11 | 自从经历过那些以后 你都没怎么笑过 12 | 不必反反复复想太多 每天都要过的更洒脱 13 | 看得透 放得下 拈花一朵 14 | 如果能够多一点点微笑 15 | 生命也会多一点点美好 16 | 何必活的那么冷酷寂寥 17 | 入夜总为小事而睡不着 18 | 如果能够多一点点微笑 19 | 快乐也会多一点点围绕 20 | 不要等到全球变冷才觉不妙 21 | 22 | 风在淅淅沥沥的雨中 23 | 撑伞走过那路口 24 | 有人跌跌撞撞踩到我 25 | 没说对不起借过 26 | 表情无喜无悲的冷漠 27 | 是这座城市的符咒 28 | 每个人都害怕被看出内心的脆弱 29 | 看你懵懵懂懂的眼中 布满太多的困惑 30 | 自从经历过那些以后 你都没怎么笑过 31 | 不必反反复复想太多 每天都要过的更洒脱 32 | 看得透 放得下 拈花一朵 33 | 如果能够多一点点微笑 34 | 生命也会多一点点美好 35 | 何必活的那么冷酷寂寥 36 | 入夜总为小事而睡不着 37 | 如果能够多一点点微笑 38 | 快乐也会多一点点围绕 39 | 不要等到全球变冷才觉不妙 40 | 41 | 如果能够多一点点微笑 42 | 生命也会多一点点美好 43 | 何必活的那么冷酷寂寥 44 | 入夜总为小事而睡不着 45 | 如果能够多一点点微笑 46 | 快乐也会多一点点围绕 47 | 不要等到全球变冷才觉不妙 48 | 49 | 50 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/内线.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 血腥的风放肆嘲笑漫天的黄叶 4 | 这大街已横尸遍野 5 | 6 | 而你却消失在蒙面执行任务杀气腾腾的夜 7 | 我听见有人猜 8 | 你是敌人潜伏的内线 9 | 10 | 和你相知多年 11 | 我确信对你的了解 12 | 你舍命救我画面 13 | 一一在眼前浮现 14 | 司空见惯了鲜血 15 | 你忘记你本是娇娆的红颜 16 | 感觉你我彼此都那么依恋 17 | 18 | 你落入封闭的地牢 19 | 发不出求救的讯号 20 | 我折返这古堡 提着刀 21 | 杀红了眼 不依不饶 22 | 23 | 你落入封闭的地牢 24 | 发不出求救的讯号 25 | 我却能感应到 26 | 打开锁链 你浅浅笑和我拥抱 27 | 28 | 血腥的风放肆嘲笑漫天的黄叶 29 | 这大街已横尸遍野 30 | 31 | 而你却消失在蒙面执行任务杀气腾腾的夜 32 | 我听见有人猜 33 | 你是潜伏的内线 34 | 35 | 和你相知多年 36 | 我确信对你的了解 37 | 你舍命救我画面 38 | 一一在眼前浮现 39 | 司空见惯了鲜血 40 | 你忘记你本是娇娆的红颜 41 | 感觉你我彼此都那么依恋 42 | 43 | 你落入封闭的地牢 44 | 发不出求救的讯号 45 | 我折返这古堡 提着刀 46 | 杀红了眼 不依不饶 47 | 48 | 你落入封闭的地牢 49 | 发不出求救的讯号 50 | 我却能感应到 51 | 打开锁链 你浅浅笑和我拥抱 52 | 53 | 我从来没有想到的是 54 | 55 | 这是你我第一次拥抱 带着浅浅笑 56 | 你说会陪我一直到老 远离这尘嚣 57 | 我闭着眼感受幸福的微妙 58 | 把刀剑扔掉 59 | 你突然转身 匕首刺进我的心脏 60 | 带着浅浅笑 61 | 62 | -------------------------------------------------------------------------------- 
/WangYi_Music/歌词/千古.txt: -------------------------------------------------------------------------------- 1 | 2 | 作曲 : 许嵩 3 | 作词 : 许嵩 4 | 夏蝉冬雪 5 | 不过轮回一瞥 6 | 悟道修炼 7 | 不问一生缘劫 8 | 白纸画卷 9 | 寥寥几笔绘江湖深浅 10 | 难绘你 11 | 不染纤尘的容颜 12 | 夜不成眠 13 | 心还为谁萦牵 14 | 灯火竹帘 15 | 梦里随风摇曳 16 | 月华似练 17 | 遥看万载沧海成桑田 18 | 它不言 19 | 不言命途的明灭 20 | 若流芳千古 21 | 爱的人却反目 22 | 错过了幸福 23 | 谁又为我在乎 24 | 若贻笑千古 25 | 因为爱得执迷又糊涂 26 | 也不悔做你的信徒 27 | 夜不成眠 28 | 心还为谁萦牵 29 | 灯火竹帘 30 | 梦里随风摇曳 31 | 月华似练 32 | 遥看万载沧海成桑田 33 | 它不言 34 | 不言命途的明灭 35 | 若流芳千古 36 | 爱的人却反目 37 | 错过了幸福 38 | 谁又为我在乎 39 | 若贻笑千古 40 | 因为爱得执迷又糊涂 41 | 也不悔做你的信徒 42 | 若流芳千古 43 | 爱的人却反目 44 | 错过了幸福 45 | 谁又为我在乎 46 | 若贻笑千古 47 | 因为爱得执迷又糊涂 48 | 也不悔做你的信徒 49 | 也不悔做你的信徒 50 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/千百度.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 编曲:杨阳 4 | 5 | 关外野店 烟火绝 客怎眠 6 | 7 | 寒来袖间 谁为我 添两件 8 | 9 | 三四更雪 风不减 吹袭一夜 10 | 只是可怜 瘦马未得好歇 11 | 12 | 怅然入梦 梦几月 醒几年 13 | 14 | 往事凄艳 用情浅 两手缘 15 | 16 | 鹧鸪清怨 听得见 飞不回堂前 17 | 18 | 旧楹联红褪墨残谁来揭 19 | 20 | 我寻你千百度 日出到迟暮 21 | 一瓢江湖我沉浮 22 | 23 | 我寻你千百度 又一岁荣枯 24 | 可你从不在 灯火阑珊处 25 | 26 | 怅然入梦 梦几月 醒几年 27 | 28 | 往事凄艳 用情浅 两手缘 29 | 30 | 鹧鸪清怨 听得见 飞不回堂前 31 | 旧楹联红褪墨残谁来揭 32 | 33 | 我寻你千百度 日出到迟暮 34 | 一瓢江湖我沉浮 35 | 36 | 我寻你千百度 又一岁荣枯 37 | 38 | 可你从不在 灯火阑珊处 39 | 40 | 我寻你千百度 日出到迟暮 41 | 一瓢江湖我沉浮 42 | 43 | 我寻你千百度 又一岁荣枯 44 | 45 | 你不在 灯火阑珊处 46 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/城府.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 你走之后 一个夏季熬成一个秋 4 | 我的书上你的正楷眉清目秀 5 | 一字一字宣告我们和平分手 6 | 好委婉的交流 还带一点征求 7 | 你已成风 幻化的雨下错了季候 8 | 明媚的眼眸里温柔化为了乌有 9 | 一层一层院墙把你的心困守 10 | 如果没法回头 这样也没不妥 11 | 你的城府有多深 12 | 我爱的有多蠢 是我太笨 13 | 还是太认真 幻想和你过一生 14 | 你的城府有多深 15 | 我爱的有多蠢 不想再问 16 | 也无法去恨 毕竟你是我最爱的人 17 | 曾经你的眼神 看起来那么单纯 18 | 嗯 指向你干净的灵魂 19 | 什么时候开始变得满是伤痕 20 | 戴上假面也好 如果不会疼 21 | 爱情这个世界 有那么多的悖论 22 | 小心翼翼不见得就会获得满分 23 | 我们之间缺少了那么多信任 24 | 最后还是没有 打开那扇心门 25 | 你的城府有多深 26 | 我爱的有多蠢 是我太笨 27 | 还是太认真 幻想和你过一生 28 | 你的城府有多深 29 | 我爱的有多蠢 不想再问 30 | 也无法去恨 毕竟你是我最爱的人 31 | 我曾经苦笑着问过我自己 32 | 在某个夜里 卸下伪装的你 33 | 是不是也会哭泣 34 | 你的城府有多深 35 | 我爱的有多蠢 是我太笨 36 | 还是太认真 幻想和你过一生 37 | 你的城府有多深 38 | 我爱的有多蠢 不想再问 39 | 也无法去恨 毕竟你是爱过我的人 40 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/多余的解释.txt: -------------------------------------------------------------------------------- 1 | 那阵子我们的感情出了一些问题 2 | 可是我也不太清楚问题出在哪里 3 | 你面无表情的话语不剩多少意义 4 | 就当我求求你 给我一些说明 5 | ok 我猜你只是暂时的压抑心情 6 | 不再去追问你 多给你一些关心 7 | 打电话请你去看最新的电影 8 | 你说工作很忙要加班到夜里 9 | ooook 入冬了想给你买一条围巾 10 | 怕眼光不行所以叫着紧跟潮流的妹妹和我一起 11 | 和妹妹说说笑笑 缓释最近糟糕心绪 12 | 在下一个转角却和你相遇 13 | 她只是我的妹妹 妹妹说紫色很有韵味 14 | 她只是我的妹妹 我在担心你是否误会 15 | 她只是我的妹妹 对这个解释你无所谓 16 | 我没有思想准备 看到你身旁还有一位 17 | 不知道他是谁 18 | 那阵子我们的感情出了一些问题 19 | 可是我也不太清楚问题出在哪里 20 | 你面无表情的话语不剩多少意义 21 | 就当我求求你 给我一些说明 22 | ooook 入冬了想给你买一条围巾 23 | 怕眼光不行所以叫着紧跟潮流的妹妹和我一起 24 | 和妹妹说说笑笑 缓释最近糟糕心绪 25 | 在下一个转角却和你相遇 26 | 她只是我的妹妹 妹妹说紫色很有韵味 27 | 她只是我的妹妹 我在担心你是否误会 28 | 她只是我的妹妹 对这个解释你无所谓 29 | 我没有思想准备 看到你身旁还有一位 30 | 不知道他是谁 31 | 紫色的围巾 交到你手里 32 | 你放进包里 说句谢谢你 33 | 要加班的你 却出现在这里 34 | 故事的结局不需要任何说明 35 | 她只是我的妹妹 妹妹说紫色很有韵味 36 | 她只是我的妹妹 我在担心你是否误会 37 | 她只是我的妹妹 对这个解释你无所谓 38 | 我没有思想准备 看到你身旁还有一位 39 | 不知道他是谁 40 | 她只是我的妹妹 妹妹说紫色很有韵味 41 | 她只是我的妹妹 我在担心你是否误会 42 | 她只是我的妹妹 对这个解释你无所谓 43 | 我没有思想准备 看到你身旁还有一位 44 | 不知道他是谁 45 | 46 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/大千世界.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 
许嵩 3 | 你穿着我的T恤 4 | 大到有些滑稽 5 | 像沙漠连夜大雨 6 | 规则缺席 7 | 亦真亦假的玩具 8 | 还握在你手里 9 | 吃透温柔的暴力就不称奇 10 | 我的黑框眼镜在Assad湖边走火 11 | 风马牛齐聚 12 | 你被带走时我亲吻了你下颌的伤疤 13 | 表情很平静 14 | 你是大千世界一汪清泉 15 | 还是泉边那只神秘孔雀 16 | 在和你灵魂谋面之前 17 | 让贪念趁火打劫 18 | 你是大千世界尘埃等闲 19 | 也是我仅有的风花雪月 20 | 爱死或是恨终我都感谢 21 | 万花筒里消受幻影碎片 22 | 23 | 万花筒里消受 24 | 你是大千世界一汪清泉 25 | 还是泉边那只神秘孔雀 26 | 在和你灵魂谋面之前 27 | 让贪念趁火打劫 28 | 你是大千世界过眼云烟 29 | 也是我仅有的夺目闪电 30 | 躁动或是寡言我都奉献 31 | 万花筒里留下真切纪念 32 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/天龙八部之宿敌.txt: -------------------------------------------------------------------------------- 1 | 2 | 作曲 : 许嵩 3 | 作词 : 许嵩 4 | 会在何处见到你 5 | 莫非前尘已注定 6 | 飞过时空的距离 7 | 却囿于刀剑光影 8 | 三月春花渐次醒 9 | 迢迢年华谁老去 10 | 是劫是缘随我心 11 | 除了你万敌不侵 12 | 当恩怨各一半 13 | 我怎么圈揽 14 | 看灯笼血红染 15 | 寻仇已太晚 16 | 月下门童喟叹 17 | 昨夜太平长安 18 | 当天上星河转 19 | 我命已定盘 20 | 待绝笔墨痕干 21 | 宿敌已来犯 22 | 我借你的孤单 23 | 今生恐怕难还 24 | 缠扰孤岛的雪雨 25 | 飘飘洒洒谁来停 26 | 摘取一颗海上星 27 | 陪我终夜不孤寂 28 | 灵柩长埋深谷底 29 | 没有永远的秘密 30 | 染指江湖结悲局 31 | 无人逃得过宿命 32 | 当恩怨各一半 33 | 我怎么圈揽 34 | 看灯笼血红染 35 | 寻仇已太晚 36 | 月下门童喟叹 37 | 昨夜太平长安 38 | 当天上星河转 39 | 我命已定盘 40 | 待绝笔墨痕干 41 | 宿敌已来犯 42 | 我借你的孤单 43 | 今生恐怕难还 44 | 当恩怨各一半 45 | 我怎么圈揽 46 | 看灯笼血红染 47 | 寻仇已太晚 48 | 月下门童喟叹 49 | 昨夜太平长安 50 | 当天上星河转 51 | 我命已定盘 52 | 待绝笔墨痕干 53 | 宿敌已来犯 54 | 我借你的孤单 55 | 今生恐怕难还 56 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/如果当时.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 为什么 你当时对我好 4 | 又为什么 现在变得冷淡了 5 | 我知道 爱要走难阻挠 6 | 反正不是我的 我也不该要 7 | 你和我 曾经有共同爱好 8 | 谁的耳边 总有绝句在萦绕 9 | 我们俩 用文言文对话真的很搞笑 10 | 还笑那曹操贪慕着小乔 11 | 天灰了 雨坠了 12 | 视线要模糊了 13 | 此时感觉到你的重要 14 | 爱走了 心走了 15 | 你说你要走了 16 | 我为你唱最后的古谣 17 | 红雨瓢泼泛起了回忆怎么潜 18 | 你美目如当年 19 | 流转我心间 20 | 渡口边最后一面洒下了句点 21 | 与你若只如初见 22 | 何须感伤离别 23 | 你和我 曾经有共同爱好 24 | 谁的耳边 总有绝句在萦绕 25 | 我们俩 用文言文对话真的很搞笑 26 | 还笑那曹操贪慕着小乔 27 | 天灰了 雨坠了 28 | 视线要模糊了 29 | 此时感觉到你的重要 30 | 爱走了 心走了 31 | 你说你要走了 32 | 我为你唱最后的古谣 33 | 红雨瓢泼泛起了回忆怎么潜 34 | 你美目如当年 35 | 流转我心间 36 | 渡口边最后一面洒下了句点 37 | 与你若只如初见 38 | 何须感伤离别 39 | 红雨瓢泼泛起了回忆怎么潜 40 | 你美目如当年 41 | 流转我心间 42 | 渡口边最后一面洒下了句点 43 | 与你若只如初见 44 | 何须感伤离别 45 | 红雨瓢泼泛起了回忆怎么潜 46 | 你美目如当年 47 | 流转我心间 48 | 渡口边最后一面洒下了句点 49 | 与你若只如初见 50 | 何须感伤离别 51 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/幻听.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 在远方的时候 4 | 又想你到泪流 5 | 这矫情的措辞结构 6 | 经历过的人会懂 7 | 那些不堪言的疼痛 8 | 也就是我自作自受 9 | 你没有装聋 10 | 你真没感动 11 | 12 | 一个人的时候 13 | 偷偷看你的微博 14 | 你转播的歌好耳熟 15 | 我们坐一起听过 16 | 当日嫌它的唱法做作 17 | 现在听起来竟然很生动 18 | 可能是时光让耳朵变得宽容 19 | 20 | 如今一个人听歌总是会觉得失落 21 | 幻听你在我的耳边轻轻诉说 22 | 夜色多温柔 23 | 你有多爱我 24 | 如今一个人听歌总是会觉得难过 25 | 爱已不在这里我却还没走脱 26 | 列表里的歌 27 | 随过往流动 28 | 29 | 一个人的时候 30 | 偷偷看你的微博 31 | 你每天做了些什么 32 | 我都了然于胸 33 | 当时嫌你的蠢话太多 34 | 现在回想起画面已泛旧 35 | 可能是孤独让情绪变得脆弱 36 | 37 | 如今一个人听歌总是会觉得失落 38 | 幻听你在我的耳边轻轻诉说 39 | 夜色多温柔 40 | 你有多爱我 41 | 如今一个人听歌总是会觉得难过 42 | 爱已不在这里我却还没走脱 43 | 列表里的歌 44 | 随过往流动 45 | 46 | 如今一个人听歌总是会觉得失落 47 | 幻听你在我的耳边轻轻诉说 48 | 夜色多温柔 49 | 你有多爱我 50 | 如今一个人听歌总是会觉得难过 51 | 爱已不在这里我却还没走脱 52 | 如果你回头 53 | 不要放下我 54 | 55 | 56 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/庐州月.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 儿时凿壁偷了谁家的光 4 | 宿昔不梳 一苦十年寒窗 5 | 如今灯下闲读 红袖添香 6 | 半生浮名只是虚妄 7 | 三月 一路烟霞 莺飞草长 8 | 柳絮纷飞里看见了故乡 9 | 不知心上的你是否还在庐阳 10 | 一缕青丝一生珍藏 11 | 桥上的恋人入对出双 12 | 桥边红药叹夜太漫长 13 | 月也摇晃 人也彷徨 14 | 乌蓬里传来了一曲离殇 15 | 庐州月光 洒在心上 16 | 月下的你不复当年模样 17 | 太多的伤 难诉衷肠 18 | 叹一句当时只道是寻常 19 
| 庐州月光 梨花雨凉 20 | 如今的你又在谁的身旁 21 | 家乡月光 深深烙在我心上 22 | 却流不出当年泪光 23 | 三月 一路烟霞 莺飞草长 24 | 柳絮纷飞里看见了故乡 25 | 不知心上的你是否还在庐阳 26 | 一缕青丝一生珍藏 27 | 桥上的恋人入对出双 28 | 桥边红药叹夜太漫长 29 | 月也摇晃 人也彷徨 30 | 乌蓬里传来了一曲离殇 31 | 庐州月光 洒在心上 32 | 月下的你不复当年模样 33 | 太多的伤 难诉衷肠 34 | 叹一句当时只道是寻常 35 | 庐州月光 梨花雨凉 36 | 如今的你又在谁的身旁 37 | 家乡月光 深深烙在我心上 38 | 却流不出当年泪光 39 | 庐州的月光 在我心上 40 | 太多的伤 难诉衷肠 41 | 如今的你在谁的身旁 42 | 我流不出当年泪光 43 | 庐州月光 洒在心上 44 | 月下的你不复当年模样 45 | 太多的伤 难诉衷肠 46 | 叹一句当时只道是寻常 47 | 庐州月光 梨花雨凉 48 | 如今的你又在谁的身旁 49 | 家乡月光 深深烙在我心上 50 | 却流不出当年泪光 51 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/惊鸿一面.txt: -------------------------------------------------------------------------------- 1 | 2 | 作曲 : 许嵩 3 | 作词 : 许嵩 4 | 翻手为云覆手为雨 5 | 金盆洗手止风雨 6 | 不恋红尘却难舍回忆 7 | 每一段都有你 8 | 年少初遇常在我心 9 | 多年不减你深情 10 | 江山如画又怎能比拟 11 | 你送我的风景 12 | 柳下闻瑶琴起舞和一曲 13 | 仿佛映当年翩若惊鸿影 14 | 谁三言两语撩拨了情意 15 | 谁一颦一笑摇曳了星云 16 | 纸扇藏伏笔玄机诗文里 17 | 紫烟燃心语留香候人寻 18 | 史书列豪杰功过有几许 19 | 我今生何求惟你 20 | 年少初遇常在我心 21 | 多年不减你深情 22 | 江山如画又怎能比拟 23 | 你送我的风景 24 | 柳下闻瑶琴起舞和一曲 25 | 仿佛映当年翩若惊鸿影 26 | 谁三言两语撩拨了情意 27 | 谁一颦一笑摇曳了星云 28 | 纸扇藏伏笔玄机诗文里 29 | 紫烟燃心语留香候人寻 30 | 史书列豪杰功过有几许 31 | 我今生何求惟你 32 | 远山传来清晨悠然的曲笛 33 | 晓风掠走光阴 34 | 残月沉霜鬓里 35 | 有了你 36 | 恩怨都似飞鸿踏雪泥 37 | 柳下闻瑶琴起舞和一曲 38 | 仿佛映当年翩若惊鸿影 39 | 谁三言两语撩拨了情意 40 | 谁一颦一笑摇曳了星云 41 | 纸扇藏伏笔玄机诗文里 42 | 紫烟燃心语留香候人寻 43 | 史书列豪杰功过有几许 44 | 我今生何求惟你 45 | 我今生何求惟你 46 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/想象之中.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 4 | 想象之中雨过一道彩虹 5 | 抬起了头 瑟瑟灰色天空 6 | 想象之中 付出会有结果 7 | 毫无保留 信奉你的承诺 8 | 想象之中 这次要爱很久 9 | 我领略过 你眼里的温柔热烈以后 10 | 你忽然的冰冻 判若两人 丢给我去承受 11 | 想象中 很不同 12 | 想象中一切都和后来不同 13 | 我承认 曾经那么心动 14 | 你没想象中那么恋旧 15 | 回忆唤不回你的温柔 16 | 最后也不是故作冷漠 17 | 转过头 我怎么有一滴泪落 18 | 我没想象中那么脆弱 19 | 分开后形容也没消瘦 20 | 一起踏过了几座春秋 21 | 领悟了爱不是追逐占有 22 | 23 | 想象之中 这次要爱很久 24 | 我领略过 你眼里的温柔 oh 热烈以后 25 | 你忽然的冰冻 判若两人 丢给我去承受 26 | 想象中 很不同 27 | 想象中一切都和后来不同 28 | 我承认 曾经那么心动 29 | 你没想象中那么恋旧 30 | 回忆唤不回你的温柔 31 | 最后也不是故作冷漠 32 | 转过头 我怎么有一滴泪落 33 | 我没想象中那么脆弱 34 | 分开后形容也没消瘦 35 | 一起踏过了几座春秋 36 | 领悟了爱不是追逐占有 37 | 38 | 你没想象中那么恋旧 39 | 回忆唤不回你的温柔 40 | 最后也不是故作冷漠 41 | 转过头 我怎么有一滴泪落 42 | 我没想象中那么脆弱 43 | 分开后形容也没消瘦 44 | 一起踏过了几座春秋 45 | 领悟了爱不是追逐占有 46 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/我想牵着你的手.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | Vae他有一些烦恼 4 | 反正现在的年轻人 5 | 都有许多烦恼 6 | 那么多要思考 7 | 那么多要寻找 8 | 诱惑太多 不坚定就犯错了 9 | 10 | 朋友说 尘世被你夸那么美 11 | 可是现实挺倒霉 12 | 领导天天要开会 13 | 上班迟到几分钟就被扣薪水 14 | 同事就像敌人要小心翼翼防备 15 | 16 | 老师不喜欢男生长头发 17 | 妈妈不喜欢女儿长指甲 18 | 什么都被管 什么都看不惯 19 | 什么都没力量推翻 20 | 学习生存之道又不安 21 | 22 | ho 地球太寒冷 23 | ho 距离产生美 24 | ho 远走八十八万公里 25 | ho hoo 26 | 27 | 我想牵着你的手 28 | 两个人去宇宙 29 | 没引力左右 30 | 夜光映出你的温柔 31 | 我想牵着你的手 32 | 逃离这颗星球 33 | 剥落了忧愁 34 | 快乐就在十指相扣 35 | 36 | 大家好我是Vae 我打一下岔 37 | 请你和我一起跟着节奏拍拍手 38 | 拍拍手晚上睡觉就能梦游 39 | 梦见跟爱的人去公园走走 40 | 走来走去不知道有什么走头 41 | 这世界让你和她觉得不爽 42 | 我想你其实也想离开这星球 43 | 44 | ho 地球太寒冷 45 | ho 距离产生美 46 | ho 远走八十八万公里 47 | ho hoo 48 | 49 | 我想牵着你的手 50 | 两个人去宇宙 51 | 没引力左右 52 | 夜光映出你的温柔 53 | 我想牵着你的手 54 | 逃离这颗星球 55 | 剥落了忧愁 56 | 快乐就在十指相扣 57 | 58 | 我想牵着你的手 59 | 两个人去宇宙 60 | 没引力左右 61 | 夜光映出你的温柔 62 | 我想牵着你的手 63 | 逃离这颗星球 64 | 剥落了忧愁 65 | 快乐就在十指相扣 66 | 67 | 我想牵着你的手 68 | 两个人去宇宙 69 | 没引力左右 70 | 夜光映出你的温柔 71 | 我想牵着你的手 72 | 逃离这颗星球 73 | 剥落了忧愁 74 | 快乐就在十指相扣 75 | 76 | 我想牵着你的手 77 | 78 | Kistory for my Gui 79 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/拆东墙.txt: 
-------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 公元六五九年 十九岁 他接他爹的班 4 | 考不取功名的后果是接手自家的酒馆 5 | 又听说同乡谁已经赴京做上小官 6 | 他的梦 往来客谁能买单 7 | 8 | 代代叹世道难 人心乱 可又能怎么办 9 | 他女人的美丽对比映衬出他的难看 10 | 朋友说 他不爱 没有爱 只是贪他小财 11 | 可他爱 连菜都自己去买 12 | 13 | 掌柜的小破酒馆被人拆了东墙 14 | 后来衙门说按一平米八吊钱来跟他折算 15 | 他不干 他不干 百年招牌祖祖辈辈流传下来 16 | 挣的并不快 但人熟地熟 还算落得个自在 17 | 18 | 掌柜的小破酒馆被人拆了东墙 19 | 后来有人看见他冒雪背着行囊暗夜离开 20 | 丢下老 丢下少 他是否也曾无奈 21 | 一去若回来 老家的酒香还在不在 22 | 23 | 代代叹世道难 人心乱 可又能怎么办 24 | 他女人的美丽对比映衬出他的难看 25 | 朋友说 他不爱 没有爱 只是贪他小财 26 | 可他爱 连菜都自己去买 27 | 28 | 掌柜的小破酒馆被人拆了东墙 29 | 后来衙门说按一平米八吊钱来跟他折算 30 | 他不干 他不干 百年招牌祖祖辈辈流传下来 31 | 挣的并不快 但人熟地熟 还算落得个自在 32 | 33 | 掌柜的小破酒馆被人拆了东墙 34 | 后来有人看见他冒雪背着行囊暗夜离开 35 | 丢下老 丢下少 他是否也曾无奈 36 | 一去若回来 老家的酒香还在不在 37 | 38 | 掌柜的小破酒馆被人拆了东墙 39 | 后来衙门说按一平米八吊钱来跟他折算 40 | 他不干 他不干 百年招牌祖祖辈辈流传下来 41 | 挣的并不快 但人熟地熟 还算落得个自在 42 | 43 | 掌柜的小破酒馆被人拆干净了 44 | 后来有人说那夜他被揍到走路一瘸一拐 45 | 兴也苦 亡也苦 青史总让人无奈 46 | 更迭了朝代 当时的明月换拨人看 47 | 48 | 西墙补不来 49 | 可东墙面子上还得拆 50 | 51 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/断桥残雪.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 寻不到花的折翼枯叶蝶 4 | 永远也看不见凋谢 5 | 江南夜色下的小桥屋檐 6 | 读不懂塞北的荒野 7 | 8 | 梅开时节因寂寞而缠绵 9 | 春归后又很快湮灭 10 | 独留我赏烟花飞满天 11 | 摇曳后就随风飘远 12 | 13 | 断桥是否下过雪 14 | 我望着湖面 15 | 水中寒月如雪 16 | 指尖轻点融解 17 | 18 | 断桥是否下过雪 19 | 又想起你的脸 20 | 若是无缘再见 21 | 白堤柳帘垂泪好几遍 22 | 23 | 寻不到花的折翼枯叶蝶 24 | 永远也看不见凋谢 25 | 江南夜色下的小桥屋檐 26 | 读不懂塞北的荒野 27 | 28 | 梅开时节因寂寞而缠绵 29 | 春归后又很快湮灭 30 | 独留我赏烟花飞满天 31 | 摇曳后就随风飘远 32 | 33 | 断桥是否下过雪 34 | 我望着湖面 35 | 水中寒月如雪 36 | 指尖轻点融解 37 | 38 | 断桥是否下过雪 39 | 又想起你的脸 40 | 若是无缘再见 41 | 白堤柳帘垂泪好几遍 42 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/明智之举.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 4 | 你在北方某城 很偶尔下雨 5 | 我在天南海北 很偶尔想你 6 | 写不来十八九岁煽情字句 7 | 孤单喂饱了理性 8 | 9 | 想必你也看过了一些风景 10 | 才明白什么样的适合自己 11 | 翻着你朋友圈的幸福合影 12 | 由衷的为你高兴 13 | 14 | 我曾在意的你 15 | 想说声对不起 16 | 年少时的任性 17 | 有些话伤人不轻 18 | 也怀疑自己 19 | 不是理想伴侣 20 | 你的离开也许是个明智之举 21 | 22 | 我曾在意的你 23 | 给过太多悲喜 24 | 承蒙时光洗礼 25 | 往事已云淡风轻 26 | 当我们老去 27 | 品尝丰盛回忆 28 | 每一道失去都是醇厚的赐予 29 | 30 | 你在北方某城 很偶尔下雨 31 | 我在天南海北 很偶尔想你 32 | 写不来十八九岁煽情字句 33 | 孤单喂饱了理性 34 | 35 | 想必你也看过了一些风景 36 | 才明白什么样的适合自己 37 | 翻着你朋友圈的幸福合影 38 | 由衷的为你高兴 39 | 40 | 我曾在意的你 41 | 想说声对不起 42 | 年少时的任性 43 | 有些话伤人不轻 44 | 也怀疑自己 45 | 不是理想伴侣 46 | 你的离开也许是个明智之举 47 | 48 | 我曾在意的你 49 | 给过太多悲喜 50 | 承蒙时光洗礼 51 | 往事已云淡风轻 52 | 当我们老去 53 | 品尝丰盛回忆 54 | 每一道失去 55 | 56 | 你曾笑着问我 57 | 如若重新来过 58 | 结局会不会不同 59 | 我出神了许久 60 | 神游在初见的午后 61 | 62 | 我曾在意的你 63 | 想说声对不起 64 | 年少时的任性 65 | 有些话伤人不轻 66 | 也怀疑自己 67 | 不是理想伴侣 68 | 你的离开也许是个明智之举 69 | 70 | 我曾在意的你 71 | 给过太多悲喜 72 | 承蒙时光洗礼 73 | 往事已云淡风轻 74 | 75 | 当我们老去 76 | 品尝丰盛回忆 77 | 每一道失去 78 | 都是醇厚的赐予 79 | 80 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/星座书上.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 星光点亮了 4 | 海水泛起皱褶 5 | 晚风咸咸的 6 | 吹散你我身旁余热 7 | 不够彼此信任 8 | 还是有了裂痕 9 | 为什么感觉有些陌生了 10 | 沿海岸奔跑 11 | 寻找属于我们的岛 12 | 有一些问号 13 | 也许对你并不重要 14 | 可很久没深聊 15 | 也很久没拥抱 16 | 翻开书本把答案寻找 17 | 星座书上说我们不合 18 | 金牛座的我配不上你的好 19 | 难过后想想也许只是碰巧 20 | 我们的故事写书人怎明了 21 | 星座书上说我们不合 22 | 最后我偷偷把那页撕掉 23 | 真的爱情没法预料 24 | 何必让你知道 25 | 就算你早知道 26 | 沿海岸奔跑 27 | 寻找属于我们的岛 28 | 有一些问号 29 | 也许对你并不重要 30 | 可很久没深聊 31 | 也很久没拥抱 32 | 翻开书本把答案寻找 33 | 星座书上说我们不合 34 | 金牛座的我配不上你的好 35 | 难过后想想也许只是碰巧 36 | 我们的故事写书人怎明了 37 | 星座书上说我们不合 38 | 最后我偷偷把那页撕掉 39 | 真的爱情没法预料 40 | 何必让你・・・ 41 | 星座书上说我们不合 42 | 金牛座的我配不上你的好 43 | 
难过后想想也许只是碰巧 44 | 我们的故事写书人怎明了 45 | 星座书上说我们不合 46 | 最后我偷偷把那页撕掉 47 | 真的爱情没法预料 48 | 何必让你知道 49 | 就算你早知道 50 | 51 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/有何不可.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 天空好想下雨 4 | 我好想住你隔壁 5 | 傻站在你家楼下 6 | 抬起头数乌云 7 | 如果场景里出现一架钢琴 8 | 我会唱歌给你听 9 | 哪怕好多盆水往下淋 10 | 夏天快要过去 11 | 请你少买冰淇淋 12 | 天凉就别穿短裙 13 | 别再那么淘气 14 | 如果有时不那么开心 15 | 我愿意将格洛米借给你 16 | 你其实明白我心意 17 | 为你唱这首歌没有什么风格 18 | 它仅仅代表着我想给你快乐 19 | 为你解冻冰河为你做一只扑火的飞蛾 20 | 没有什么事情是不值得 21 | 为你唱这首歌没有什么风格 22 | 它仅仅代表着我希望你快乐 23 | 为你辗转反侧为你放弃世界有何不可 24 | 夏末秋凉里带一点温热有换季的颜色 25 | 26 | 天空好想下雨 27 | 我好想住你隔壁 28 | 傻站在你家楼下 29 | 抬起头数乌云 30 | 如果场景里出现一架钢琴 31 | 我会唱歌给你听 32 | 哪怕好多盆水往下淋 33 | 夏天快要过去 34 | 请你少买冰淇淋 35 | 天凉就别穿短裙 36 | 别再那么淘气 37 | 如果有时不那么开心 38 | 我愿意将格洛米借给你 39 | 你其实明白我心意 40 | 为你唱这首歌没有什么风格 41 | 它仅仅代表着我想给你快乐 42 | 为你解冻冰河为你做一只扑火的飞蛾 43 | 没有什么事情是不值得 44 | 为你唱这首歌没有什么风格 45 | 它仅仅代表着我希望你快乐 46 | 为你辗转反侧为你放弃世界有何不可 47 | 夏末秋凉里带一点温热 48 | 49 | 为你解冻冰河为你做一只扑火的飞蛾 50 | 没有什么事情是不值得 51 | 为你唱这首歌没有什么风格 52 | 它仅仅代表着我希望你快乐 53 | 为你辗转反侧为你放弃世界有何不可 54 | 夏末秋凉里带一点温热有换季的颜色 55 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/江湖 .txt: -------------------------------------------------------------------------------- 1 | 2 | 作曲 : 许嵩 3 | 作词 : 许嵩 4 | 今夕是何夕 5 | 晚风过花庭 6 | 飘零 予人乐后飘零 7 | 故地是何地 8 | 死生不复回 9 | 热血 风干在旧恨里 10 | 衣锦夜行 当一生尘埃落定 11 | 飞鸽来急 那落款沾染血迹 12 | 夜半嘱小徒复信 言师已故去 13 | 星云沉默江湖里 14 | 孤雁飞去 红颜来相许 15 | 待到酒清醒 她无影 原来是梦里 16 | 恩怨散去 刀剑已归隐 17 | 敬属江上雨 寒舟里 我独饮 18 | 衣锦夜行 当一生尘埃落定 19 | 飞鸽来急 那落款沾染血迹 20 | 夜半嘱小徒复信 言师已故去 21 | 星云沉默江湖里 22 | 孤雁飞去 红颜来相许 23 | 待到酒清醒 她无影 原来是梦里 24 | 恩怨散去 刀剑已归隐 25 | 敬属江上雨 寒舟里 我独饮 26 | 孤雁飞去 红颜来相许 27 | 待到酒清醒 她无影 原来是梦里 28 | 恩怨散去 刀剑已归隐 29 | 敬属江上雨 寒舟里 我独饮 30 | 我独饮 31 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/河山大好.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 最近亚健康状态 4 | 坐久了腿发软 5 | 电脑看久了脖子它也会酸 6 | 数字时代貌似把生活节奏加快 7 | 也让人变得行动迟缓 8 | 9 | 忙忙忙 10 | 忙出个什么所以然 11 | 地球离了谁它都照样公转自转 12 | 叹叹叹 13 | 弹指一挥人生苦短 14 | 终点不明沿途风景要好好看 15 | 你可以隐隐期待 16 | 途中佳缘到来 17 | 保持浪漫心态 18 | 活着就不算坏 19 | 家国大好河山 20 | 不必崇洋媚外 21 | 好地方一生都看不完 22 | 峨眉山庐山黄山嵩山 23 | 抓紧周末带爸爸妈妈去转一转 24 | 北京西安洛阳开封安阳南京杭州 25 | 睹一睹古都的风采 26 | 心情大好 出去走走 27 | 碧海蓝天 吹吹风 28 | 河山大好 出去走走 29 | 别窝在家 当懒虫 30 | 心情大好 出去走走 31 | 碧海蓝天 吹吹风 32 | 河山大好 出去走走 33 | 别窝在家 当懒虫 34 | 35 | 忙忙忙 36 | 忙出个什么所以然 37 | 叹叹叹 38 | 弹指一挥人生苦短 39 | 终点不明沿途风景要好好看 40 | 你可以隐隐期待 41 | 途中佳缘到来 42 | 保持浪漫心态 43 | 活着就不算坏 44 | 家国大好河山 45 | 不必崇洋媚外 46 | 好地方一生都看不完 47 | 峨眉山庐山黄山嵩山 48 | 抓紧周末带爸爸妈妈去转一转 49 | 北京西安洛阳开封安阳南京杭州 50 | 睹一睹古都的风采 51 | 心情大好 出去走走 52 | 碧海蓝天 吹吹风 53 | 河山大好 出去走走 54 | 别窝在家 当懒虫 55 | 心情大好 出去走走 56 | 碧海蓝天 吹吹风 57 | 河山大好 出去走走 58 | 别窝在家 当懒虫 59 | 60 | 心情大好 河山大好 61 | 心情大好 河山大好 62 | 心情大好 出去走走 63 | 碧海蓝天 吹吹风 64 | 河山大好 出去走走 65 | 只不过是 河山大好 66 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/清明雨上.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 许嵩 2 | 作词 : 许嵩 3 | 窗透初晓 日照西桥 云自摇 4 | 想你当年荷风微摆的衣角 5 | 木雕流金 岁月涟漪 七年前封笔 6 | 因为我今生挥毫只为你 7 | 雨打湿了眼眶 年年倚井盼归堂 8 | 最怕不觉泪已拆两行 9 | 我在人间彷徨 寻不到你的天堂 10 | 东瓶西镜放 恨不能遗忘 11 | 又是清明雨上 折菊寄到你身旁 12 | 把你最爱的歌来轻轻唱 13 | 远方有琴 愀然空灵 声声催天雨 14 | 涓涓心事说给自己听 15 | 月影憧憧 烟火几重 烛花红 16 | 红尘旧梦 梦断都成空 17 | 雨打湿了眼眶 年年倚井盼归堂 18 | 最怕不觉泪已拆两行 19 | 我在人间彷徨 寻不到你的天堂 20 | 东瓶西镜放 恨不能遗忘 21 | 又是清明雨上 折菊寄到你身旁 22 | 把你最爱的歌来轻轻唱 23 | 我在人间彷徨 寻不到你的天堂 24 | 东瓶西镜放 恨不能遗忘 25 | 又是清明雨上 折菊寄到你身旁 26 | 把你最爱的歌来轻轻唱 27 | 28 | 29 | 30 | 
-------------------------------------------------------------------------------- /WangYi_Music/歌词/灰色头像.txt: -------------------------------------------------------------------------------- 1 | 2 | 作曲 : 许嵩 3 | 作词 : 许嵩 4 | 昨夜做了一个梦 5 | 梦里我们回到手牵着手 6 | 醒来的失落 无法言说 7 | 打开了OICQ 8 | 聊天记录停步去年的深秋 9 | 最后的挽留 没有说出口 10 | 我们还是朋友 11 | 是那种最遥远的朋友 12 | 你给过的温柔 13 | 在记录之中 全部都保有 14 | 你灰色头像不会再跳动 15 | 哪怕是一句简单的问候 16 | 心贴心的交流一页页翻阅多难过 17 | 是什么 坠落 升空 18 | 又想起你曾说的陪我到最后 19 | 暖色的梦变冰凉的枷锁 20 | 如果时光倒流我们又能抓得住什么 21 | 打开了OICQ 22 | 聊天记录停步去年的深秋 23 | 最后的挽留 没有说出口 24 | 我们还是朋友 25 | 是那种最遥远的朋友 26 | 你给过的温柔 27 | 在记录之中 全部都保有 28 | 你灰色头像不会再跳动 29 | 哪怕是一句简单的问候 30 | 心贴心的交流一页页翻阅多难过 31 | 是什么 坠落 升空 32 | 又想起你曾说的陪我到最后 33 | 暖色的梦变冰凉的枷锁 34 | 如果时光倒流我们又能抓得住什么 35 | 当我发现所谓醒来其实是另一个梦 你不在这世界 36 | 梦的出口散不开的浓雾太沉重 你不在这世界 37 | 就算当初声嘶力竭作苦苦的求你留下别走 38 | 也没用 39 | 灰色头像静静悄悄不会再跳动 40 | 我的绝望溢出胸口 41 | 是什么 坠落 升空 42 | 你灰色头像不会再跳动 43 | 暖色的梦变冰凉的枷锁 44 | 如果时光倒流我们又能抓得住什么 45 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/玫瑰花的葬礼.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 李毅杰 2 | 作词 : 许嵩/李毅杰 3 | 离开你一百个星期 4 | 我回到了这里 5 | 寻找我们爱过的证据 6 | 没有人愿意提起 7 | 玫瑰花它的过去 8 | 今天这里的主题 9 | 我把它叫作 回忆 10 | 我知道 爱情这东西 11 | 他没什么道理 12 | 过去我和你在一起 13 | 是我太叛逆 14 | 现在只剩我自己 15 | 偷偷的想你 16 | 17 | 玫瑰花的葬礼 18 | 埋葬关于你的回忆 19 | 感觉双手麻痹 20 | 不能自已 21 | 已拉不住你 22 | 23 | 真的好美丽 24 | 那天的烟花雨 25 | 26 | 我说要娶穿碎花洋裙的你 27 | 28 | 玫瑰花的葬礼 29 | 埋葬深深爱着的你 30 | 31 | 残朵停止呼吸 32 | 渗入大地 33 | 没人会注意 34 | 35 | 一片小雨滴 36 | 陪着我等天明 37 | 38 | 我用这最后一分钟怀念你 39 | 40 | 我在夜幕笼罩的天桥上潜行 41 | 每一级阶梯 42 | 都留着你我昔日印迹 43 | 温存迷醉 吵闹清醒 44 | 都还在我的脚畔 45 | 兜兜兜兜兜转不清 46 | 没来得及把红色玫瑰递给你 47 | 爱就像是一场雨 48 | 已经离我而去 49 | 你说过 50 | 太过鲜艳的爱情 终将凋零 51 | 52 | 玫瑰花的葬礼 53 | 埋葬关于你的回忆 54 | 感觉双手麻痹 55 | 不能自已 56 | 已拉不住你 57 | 真的好美丽 58 | 那天的烟花雨 59 | 我说要娶穿碎花洋裙的你 60 | 61 | 玫瑰花的葬礼 62 | 埋葬深深爱着的你 63 | 残朵停止呼吸 64 | 渗入大地 65 | 没人会注意 66 | 67 | 一片小雨滴 68 | 陪着我等天明 69 | 70 | 我用这最后一分钟怀念你 71 | 72 | 总是回想过去 埋怨我自己 73 | 74 | 总是不经意间 想起了你 75 | 76 | 现在的你 已经太遥不可及 77 | 78 | 只能留在我记忆 79 | 玫瑰花的葬礼 80 | 埋葬关于你的回忆 81 | 感觉双手麻痹 82 | 不能自已 83 | 已拉不住你 84 | 85 | 真的好美丽 86 | 那天的烟花雨 87 | 我说要娶穿碎花洋裙的你 88 | 89 | 玫瑰花的葬礼 90 | 埋葬深深爱着的你 91 | 残朵停止呼吸 92 | 渗入大地 93 | 没人会注意 94 | 一片小雨滴 95 | 陪着我等天明 96 | 我用这最后一分钟怀念你 97 | 98 | 我用这最后一分钟 99 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/素颜.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 作曲 : 许嵩 4 | 作词 : 许嵩 5 | 又是一个安静的晚上 6 | 一个人窝在摇椅里乘凉 7 | 我承认这样真的很安详 8 | 和楼下老爷爷一样 9 | 听说你还在搞什么原创 10 | 搞来搞去好像也就这样 11 | 不如花点时间想想 12 | 琢磨一下模样 13 | 今夜化了美美的妆 14 | (我相信是很美美的妆) 15 | 我摇晃在舞池中央 16 | (那种体态可以想象) 17 | 我做我的改变 又何必纠结 18 | 那就拜托别和我碰面 19 | 如果再看你一眼 20 | 是否还会有感觉 21 | 当年素面朝天要多纯洁就有多纯洁 22 | 不画扮熟的眼线 23 | 不用抹匀粉底液 24 | 暴雨天 照逛街 25 | 偷笑别人花了脸 26 | 如果再看你一眼 27 | 是否还会有感觉 28 | 最真实的喜怒哀乐全都埋葬在昨天 29 | 不掺任何的表演 30 | 轰轰烈烈那几年 31 | 我怀念 别怀念 32 | 怀念也回不到从前 33 | 又是一个安静的晚上 34 | 一个人窝在摇椅里乘凉 35 | 我承认这样真的很安详 36 | 和楼下老爷爷一样 37 | 听说你还在搞什么原创 38 | 搞来搞去好像也就这样 39 | 不如花点时间想想 40 | 琢磨一下模样 41 | 今夜化了美美的妆 42 | (我相信是很美美的妆) 43 | 我摇晃在舞池中央 44 | (那种体态可以想象) 45 | 我做我的改变 又何必纠结 46 | 那就拜托别和我碰面 47 | 如果再看你一眼 48 | 是否还会有感觉 49 | 当年素面朝天要多纯洁就有多纯洁 50 | 不画扮熟的眼线 51 | 不用抹匀粉底液 52 | 暴雨天 照逛街 53 | 偷笑别人花了脸 54 | 如果再看你一眼 55 | 是否还会有感觉 56 | 最真实的喜怒哀乐全都埋葬在昨天 57 | 不掺任何的表演 58 | 轰轰烈烈那几年 59 | 我怀念 别怀念 60 | 怀念也回不到从前 61 | 曾经对上的瞬间 62 | 难道是一种错觉 63 | 那些流逝了的就永远不会复现 64 | 不掺任何的表演 65 | 轰轰烈烈那几年 66 | 有遗憾的感觉 为何感觉 67 | 那消失不见的素颜 68 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/认错.txt: -------------------------------------------------------------------------------- 1 | 作曲 : 
许嵩 2 | 作词 : 许嵩 3 | 那天午后 我站在你家门口 4 | 你咬咬嘴唇还是说出了分手 5 | 我的挽留和眼泪全都没有用 6 | 或许我应该自食这苦果 7 | 8 | 你的迁就 我一直领悟不够 9 | 以为爱已强大的不要理由 10 | 心开始颤抖 明白了你的难受 11 | 但你的表情已经冷漠 12 | 13 | 全是我的错 14 | 现在认错有没有用 15 | 你说你已经不再爱我 16 | 我带你回忆曾经快乐的时空 17 | 你只是劝我别再执着 18 | 19 | 全是我的错 20 | 现在认错有没有用 21 | 你说你喜欢如今的生活 22 | 你带我回忆爱里互相的折磨 23 | 还告诉了我 别再来认错 认结果 24 | 25 | 那天午后 我站在你家门口 26 | 你咬咬嘴唇还是说出了分手 27 | 我的挽留和眼泪全都没有用 28 | 或许我应该自食这苦果 29 | 30 | 你的迁就 我一直领悟不够 31 | 以为爱已强大的不要理由 32 | 心开始颤抖 明白了你的难受 33 | 但你的表情已经冷漠 34 | 35 | 全是我的错 36 | 现在认错有没有用 37 | 你说你已经不再爱我 38 | 我带你回忆曾经快乐的时空 39 | 你只是劝我别再执着 40 | 41 | 全是我的错 42 | 现在认错有没有用 43 | 你说你喜欢如今的生活 44 | 你带我回忆爱里互相的折磨 45 | 还告诉了我 别再来认错 46 | 47 | 全是我的错 48 | 现在认错有没有用 49 | 你说你已经不再爱我 50 | 我带你回忆曾经快乐的时空 51 | 你只是劝我别再执着 52 | 53 | 全是我的错 54 | 现在认错有没有用 55 | 你说你喜欢如今的生活 56 | 你带我回忆爱里互相的折磨 57 | 还告诉了我 别再来认错 认结果 58 | 59 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/违章动物.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 作曲 : 许嵩 6 | 作词 : 许嵩 7 | 我目睹街角的蝴蝶飞上了青天 8 | 要上访青天大老爷 9 | 相关衙门提出了一些指导性意见 10 | 街坊玩命转播真相与流言 11 | 卖红薯的姑娘想在学堂门前摆摊 12 | 那不可以没钱 也不可以不陪笑脸 13 | 有点小权的 时时刻刻都想要用上小权 14 | 而有大权的 脑子坏了才和你站一边 15 | 一群高贵气质的差人在处罚违章动物 16 | 她一身尘土 在街角迷了路 17 | 一群高贵气质的差人在处罚违章动物 18 | 缄默的泪 没有人在乎 19 | 这繁华的城池有时让人感到陌生 20 | 当乌云不断堆叠 暴雨也就如期而至 21 | 幸福 的定义连番升级 22 | 拒绝回到初始版本 23 | 就买个红薯吧 否则夜太寒冷 24 | 一群高贵气质的差人在处罚违章动物 25 | 她一身尘土 在街角迷了路 26 | 一群高贵气质的差人在处罚违章动物 27 | 缄默的泪 没有人在乎 28 | 一群高贵气质的差人在处罚违章动物 29 | 她一身尘土 在街角迷了路 30 | 一群高贵气质的差人在处罚违章动物 31 | 缄默的泪 汇成这方土地的湖 32 | -------------------------------------------------------------------------------- /WangYi_Music/歌词/雅俗共赏.txt: -------------------------------------------------------------------------------- 1 | 2 | 作曲 : 许嵩 3 | 作词 : 许嵩 4 | 是否每一部戏都看得完整场 5 | 是否每一天过得都有多难忘 6 | 表情迟钝可能因为比较爱想 7 | 不擅长眉目表达 8 | 总在盼望 总在失望 9 | 日子还不都这样 10 | 俗的无畏 雅的轻狂 11 | 还不都是一副臭皮囊 12 | 他们说快写一首情歌雅俗共赏 13 | 落笔传神还要容易传唱 14 | 上得厅堂也下得厨房 15 | 就像我一直在找的姑娘 16 | 快写一首情歌雅俗共赏 17 | 打完字谜还要接着打榜 18 | 如果胡同弄堂全都播放 19 | 气韵里居然添了些孤芳自赏 20 | 是否每一场美梦醒来都很爽 21 | 是否每一次成熟都徒增了业障 22 | 比痛和痒更多的 23 | 是不痛不痒 24 | 所以我爱进剧场 25 | 总在盼望 总在失望 26 | 日子还不都这样 27 | 俗的无畏 雅的轻狂 28 | 还不都是一副臭皮囊 29 | 他们说快写一首情歌雅俗共赏 30 | 落笔传神还要容易传唱 31 | 上得厅堂也下得厨房 32 | 就像我一直在找的姑娘 33 | 快写一首情歌雅俗共赏 34 | 打完字谜还要接着打榜 35 | 如果胡同弄堂全都播放 36 | 气韵里居然添了些孤芳自赏 37 | 谁的故事有营养 38 | 大俗或大雅的都在理直气壮 39 | 洒狗血或白雪的现场 40 | 都邀我观赏 41 | 还真是大方 42 | 快写一首情歌雅俗共赏 43 | 落笔传神还要容易传唱 44 | 上得厅堂也下得厨房 45 | 就像我一直在找的姑娘 46 | 有没有一种生活雅俗共赏 47 | 情节起伏跌宕让人向往 48 | 满纸荒唐中窥见满脸沧桑 49 | 触到神经就要懂得鼓掌 50 | 别说一不在乎二没期望 51 | 太超脱 中枪中奖感觉会一样 52 | -------------------------------------------------------------------------------- /coffee.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/coffee.png -------------------------------------------------------------------------------- /dangdang_book/README.md: -------------------------------------------------------------------------------- 1 | # scrapy-redis-dangdang.cm 2 | scrapy-redis分布式爬虫,爬取当当网图书信息 3 | 4 | 前期的准备 5 | 虚拟机下乌班图下redis:url去重,持久化 6 | mongodb:保存数据 7 | PyCharm:写代码 8 | 谷歌浏览器:分析要提取的数据 9 | 爬取图书每个分类下的小分类下的图书信息(分类标题,小分类标题,图书标题,作者,图书简介,价格,电子书价格,出版社,封面,图书链接) 10 | 思路:按每个大分类分组,再按小分类分组,再按每本书分组,最后提取数据 11 | -------------------------------------------------------------------------------- /dangdang_book/dangdang_book/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # 
https://docs.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DangdangBookItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # item = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /dangdang_book/dangdang_book/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | import random 9 | 10 | 11 | class DangdangBookDownloaderMiddleware: 12 | 13 | def process_request(self, request, spider): 14 | """添加随机UA跟代理IP""" 15 | ua = random.choice(spider.settings.get("UA_LIST")) 16 | request.headers["User-Agent"] = ua 17 | 18 | # request.meta["proxy"] = "https://125.115.126.114:888" 19 | 20 | def process_response(self, request, response, spider): 21 | """查看UA有没有设置成功""" 22 | # print("777", request.headers["User-Agent"]) 23 | return response 24 | 25 | 26 | -------------------------------------------------------------------------------- /dangdang_book/dangdang_book/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymysql 8 | db = pymysql.connect("localhost", "root", "root123", "chat") 9 | # db = client["dangdang_db"] 10 | cursor = db.cursor() 11 | 12 | 13 | class DangdangBookPipeline: 14 | def process_item(self, item, spider): 15 | """保存数据到mongodb""" 16 | print("8888"*10, item) 17 | sql = "insert into dangdang(content) values(%s)" % item 18 | print(sql) 19 | print("666") 20 | cursor.execute(sql) 21 | db.commit() 22 | db.close 23 | return item 24 | -------------------------------------------------------------------------------- /dangdang_book/dangdang_book/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dangdang_book project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://docs.scrapy.org/en/latest/topics/settings.html 9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dangdang_book' 13 | 14 | SPIDER_MODULES = ['dangdang_book.spiders'] 15 | NEWSPIDER_MODULE = 'dangdang_book.spiders' 16 | 17 | # 一个去重的类,用来将url去重 18 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 19 | # 一个队列 20 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 21 | # 是否持久化 22 | SCHEDULER_PERSIST = True 23 | # redis地址 24 | REDIS_URL = "redis://127.0.0.1:6379" 25 | # REDIS_HOST = '127.0.0.1' 26 | # REDIS_PORT = 6379 27 | 28 | 29 | LOG_LEVEL = "DEBUG" 30 | # user-agent 31 | UA_LIST = [ 32 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 33 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 34 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 35 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 36 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 37 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 38 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 39 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 40 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 41 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)", 42 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 43 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 44 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)", 45 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 46 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 47 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 48 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 49 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 50 | ] 51 | 52 | # Obey robots.txt rules 53 | ROBOTSTXT_OBEY = False 54 | 55 | # 下载延迟 56 | DOWNLOAD_DELAY = 0 57 | 58 | # The download delay setting will honor only one of: 59 | # Enable or disable downloader middlewares 60 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 61 | DOWNLOADER_MIDDLEWARES = { 62 | 'dangdang_book.middlewares.DangdangBookDownloaderMiddleware': 543, 63 | } 64 | 65 | # Configure item pipelines 66 | ITEM_PIPELINES = { 67 | # 'dangdang_book.pipelines.DangdangBookPipeline': 300, 68 | 'scrapy_redis.pipelines.RedisPipeline': 300 69 | } 70 | 71 | -------------------------------------------------------------------------------- /dangdang_book/dangdang_book/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /dangdang_book/dangdang_book/spiders/dd_book.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | # 额外导入以下类 4 | from scrapy_redis.spiders import RedisSpider 5 | from copy import deepcopy 6 | import time 7 | # 继承导入的类 8 | class DdBookSpider(RedisSpider): 9 | name = 'dd_book' 10 | allowed_domains = ['dangdang.com'] 11 | redis_key = "dd_book" # redis中插入(lpush dd_book http://category.dangdang.com/?ref=www-0-C) 12 | # start_urls = ["http://category.dangdang.com/"] 13 | def parse(self, response): 14 | """图书大类""" 15 | # 先分组 16 | div_list = response.xpath('//div[@class="classify_books"]/div[@class="classify_kind"]') 17 | for div in div_list: 18 | item = {} 19 | item["大标题"] = div.xpath('.//a/text()').extract_first() 20 | li_list = div.xpath('.//ul[@class="classify_kind_detail"]/li') 21 | for li in li_list: 22 | item["小标题"] = li.xpath('./a/text()').extract_first() 23 | sm_url = li.xpath('./a/@href').extract_first() 24 | #print(sm_url, item["小标题"]) 25 | time.sleep(2) 26 | 27 | # 请求详情页 28 | if sm_url != "javascript:void(0);": 29 | print("请求详情页:" ,sm_url) 30 | yield scrapy.Request(sm_url, callback=self.book_details, meta={"item": deepcopy(item)}) 31 | 32 | def book_details(self, response): 33 | """提取图书数据""" 34 | item = response.meta["item"] 35 | # 给每本书分组 36 | li_list = response.xpath('//ul[@class="bigimg"]/li') 37 | for li in li_list: 38 | item["图书标题"] = li.xpath('./a/@title').extract_first() 39 | item["作者"] = li.xpath('./p[@class="search_book_author"]/span[1]/a/@title').extract_first() 40 | item["图书简介"] = li.xpath('./p[@class="detail"]/text()').extract_first() 41 | item["价格"] = li.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()').extract_first() 42 | item["电子书价格"] = li.xpath('./p[@class="price"]/a[@class="search_e_price"]/i/text()').extract_first() 43 | item["日期"] = li.xpath('./p[@class="search_book_author"]/span[2]/text()').extract_first() 44 | item["出版社"] = li.xpath('./p[@class="search_book_author"]/span[3]/a/@title').extract_first() 45 | item["图片"] = li.xpath('./a/img/@src').extract_first() 46 | item["图书链接"] = li.xpath('./a/@href').extract_first() 47 | 48 | yield item 49 | 50 | # 翻页 51 | next_url = response.xpath('//a[text()="下一页"]/@href').extract_first() 52 | if next_url is not None: 53 | next_url = "http://category.dangdang.com" + next_url 54 | yield scrapy.Request(next_url, callback=self.book_details, meta={"item": deepcopy(item)}) 55 | 56 | # lpush dd_book http://category.dangdang.com/?ref=www-0-C 57 | 58 | # ╭─mac@huayang ~/Stardust/scrapy_project/scrapy-redis-dangdang.cm-master 59 | # ╰─➤ PYTHONPATH=$(pwd) python3 -m scrapy runspider spiders/dd_book.py -------------------------------------------------------------------------------- /dangdang_book/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/topics/scrapyd.html 5 | 6 | [settings] 7 | default = dangdang_book.settings 8 | 9 | # [deploy] 10 | # #url = http://localhost:6800/ 11 | # project = example 12 | 
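[Editor's note on dangdang_book/pipelines.py above: its docstring says it saves to MongoDB but the code actually writes to MySQL, the INSERT is built with "%" string interpolation instead of a parameterized query, and db.close is missing its parentheses so the connection is never closed. The sketch below is one minimal way the pipeline could be tightened up. It keeps the connection parameters already present in the original file (localhost, root/root123, database "chat", table dangdang(content)) but is only an illustration, not a tested drop-in replacement; it would also have to be re-enabled under ITEM_PIPELINES in settings.py, where only scrapy_redis.pipelines.RedisPipeline is currently active.]

# Minimal rewrite sketch of dangdang_book/pipelines.py (assumptions: same local
# MySQL database "chat" and a table dangdang(content TEXT) as in the original).
import json

import pymysql


class DangdangBookPipeline:
    def open_spider(self, spider):
        # connection parameters copied from the original pipeline
        self.db = pymysql.connect(host="localhost", user="root",
                                  password="root123", database="chat",
                                  charset="utf8mb4")
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # serialize the whole item dict into the single `content` column,
        # using a parameterized query instead of "%" string formatting
        sql = "INSERT INTO dangdang(content) VALUES (%s)"
        self.cursor.execute(sql, (json.dumps(dict(item), ensure_ascii=False),))
        self.db.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()  # the original referenced db.close without calling it

[To start the crawl itself, the comment at the bottom of dd_book.py still applies unchanged: seed Redis with `lpush dd_book http://category.dangdang.com/?ref=www-0-C` before running the spider.]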
-------------------------------------------------------------------------------- /ele_me/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ele_me/1.png -------------------------------------------------------------------------------- /ele_me/README.md: -------------------------------------------------------------------------------- 1 | 爬取饿了么某地区的外卖信息 2 | 3 | 数据生成josn格式的csv文件,生成词云以及食物信息统计图 4 | 5 | 博客地址:https://blog.csdn.net/weixin_43746433 6 | 7 | 爬虫:https://blog.csdn.net/weixin_43746433/article/details/91906540 8 | 9 | 微信:why19970628 10 | 11 | 欢迎与我交流 12 | -------------------------------------------------------------------------------- /ele_me/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ele_me/__init__.py -------------------------------------------------------------------------------- /ele_me/eleme_bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ele_me/eleme_bar.png -------------------------------------------------------------------------------- /ele_me/eleme_wordcloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ele_me/eleme_wordcloud.png -------------------------------------------------------------------------------- /ele_me/elemedata.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/ele_me/elemedata.csv -------------------------------------------------------------------------------- /ele_me/fooddic.txt: -------------------------------------------------------------------------------- 1 | 黄焖鸡 -------------------------------------------------------------------------------- /finance.eastmoney.com/README.md: -------------------------------------------------------------------------------- 1 | 之前帮客户做的爬虫, 爬取东方财富网的每日的股票、可转债的数据 2 | 3 | 网址:http://finance.eastmoney.com/ 4 | 5 | 数据:每日运行生成一个csv文件 6 | 7 | 博客地址:https://blog.csdn.net/weixin_43746433 8 | 9 | 测试:代码截止2020/04/23测试无误 -------------------------------------------------------------------------------- /finance.eastmoney.com/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/finance.eastmoney.com/__init__.py -------------------------------------------------------------------------------- /finance.eastmoney.com/可还债/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/finance.eastmoney.com/可还债/__init__.py -------------------------------------------------------------------------------- /finance.eastmoney.com/股票/gupiao.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import requests 4 | import json 5 | import pandas as pd 6 | 7 | pd.set_option('display.max_columns', None) 8 | 
pd.set_option('display.max_rows', None) 9 | pd.set_option('display.unicode.ambiguous_as_wide', True) 10 | pd.set_option('display.unicode.east_asian_width', True) 11 | pd.set_option('display.width', 5000) 12 | 13 | # 字符类型的时间: 14 | def get_time(time_str): 15 | # 转为时间数组 16 | timeArray = time.strptime(time_str, "%Y%m%d") 17 | # 转为时间戳 18 | timeStamp = int(time.mktime(timeArray)) 19 | return timeStamp 20 | 21 | # 坐拥: 解析每个网页的数据 22 | # 输入:字符与每个网页所需的地址,请求的参数 23 | # 输出: 网页解析所获得的股票数据 24 | def HTML(time_str,url, params): 25 | gupiao_list = [] 26 | headers = { 27 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"} 28 | try: 29 | r = requests.get(url, headers=headers, timeout=30, params=params) 30 | r.raise_for_status() 31 | r.encoding = r.apparent_encoding 32 | html = r.text 33 | except Exception as e: 34 | print("wrong:" + e) 35 | # pat = re.compile("\[\{.*?\}\]") 36 | pat = re.compile("({.*?})") 37 | data = pat.findall(html) 38 | # print(data) 39 | js = [] 40 | for d in data: 41 | try: 42 | d1=eval(d+"]}}").get("data").get("diff")[0] 43 | except: 44 | d1 = eval(d) 45 | js.append(d1) 46 | for i in range(len(js)): 47 | zhenfu = str(js[i]["f7"]) + "%" 48 | gupiao_list.append(( 49 | js[i]["f12"], js[i]["f14"], js[i]["f2"], zhenfu, js[i]["f4"], js[i]["f5"], js[i]["f6"], 50 | zhenfu, js[i]["f15"], js[i]["f16"], js[i]["f17"], js[i]["f18"], js[i]["f10"])) 51 | title = ["代码", "名称", "最新价", "涨跌幅", "涨跌额", "成交量", "成交额", 52 | "振幅", "最高", "最低", "今开", "昨收", "量比"] 53 | df = pd.DataFrame(gupiao_list, columns=title) 54 | to_csv(df, f"result_{time_str}.csv") 55 | 56 | # 保存csv图片 57 | def to_csv(df, csv_file): 58 | if os.path.exists(csv_file) == False: 59 | df.to_csv(csv_file, index=False) 60 | else: 61 | df.to_csv(csv_file, mode='a+', header=False, index=False) 62 | 63 | 64 | import time 65 | # 主函数入 66 | # 输入:时间与时间字符 67 | # 输出:解析网页 所需的header请求 68 | def main(time_str,time_): 69 | time_ = str(time_) +"000" 70 | # 爬出249个网页 71 | for i in range(1, 250): 72 | print(i) 73 | url = 'http://push2.eastmoney.com/api/qt/clist/get' 74 | params = { 75 | 'cb': f'jQuery112407955974158503321_{str(time_)}', 76 | 'pn': str(i), 77 | 'pz': '20', 78 | 'po': '1', 79 | 'np': '1', 80 | 'ut': 'bd1d9ddb04089700cf9c27f6f7426281', 81 | 'fltt': '2', 82 | 'invt': '2', 83 | 'fid': 'f3', 84 | 'fs': 'm:0 t:6,m:0 t:13,m:0 t:80,m:1 t:2,m:1 t:23', 85 | 'fields': 'f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152', 86 | '_': str(time_) 87 | } 88 | # 解析网页入口 89 | HTML(time_str, url,params) 90 | # 睡眠 91 | time.sleep(6) 92 | 93 | 94 | if __name__ == '__main__': 95 | # 输入时间 96 | for time_str in ["20200417"]: 97 | time_ = get_time(time_str) 98 | # 程序入口 99 | main(time_str,time_) 100 | -------------------------------------------------------------------------------- /live.bible.is.com/README.md: -------------------------------------------------------------------------------- 1 | 下载http://www.bible.is 音频的文字内容,一种类型对应一个 dictionary, 一个音频对应一个txt,有断点续传的功能 2 | 3 | 网址: http://www.bible.is/radio 4 | 5 | 微信:why19970628 6 | 7 | 欢迎与我交流 8 | -------------------------------------------------------------------------------- /live.bible.is.com/live.bible.is.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | import re 3 | from functools import wraps 4 | import datetime 5 | import sys 6 | import json 7 | import os 8 | import requests 9 | from lxml import etree 10 | from 
fake_useragent import UserAgent 11 | from urllib.parse import quote, urlencode 12 | import urllib 13 | import time 14 | import string 15 | 16 | ua = UserAgent() 17 | 18 | 19 | 20 | def handel_single_country(country_name_en, country_name_folder, country_name): 21 | link = f'https://live.bible.is/bible/{country_name_en}/MAT/1?audio_type=audio' 22 | print(country_name_folder, link) 23 | response = requests.get(link, headers={ 24 | "User_Agent": ua.chrome}) 25 | 26 | tree = etree.HTML(response.text) 27 | vedio_page_urls = tree.xpath('/html/body/script[1]/text()') 28 | 29 | # chapter = re.findall('"testaments":(.*?),"audioType', 30 | # str(vedio_page_urls))[0] 31 | # chapter = eval(chapter) 32 | # parts = [k for k, v in chapter.items() if v == "NT"] 33 | 34 | chapters = re.findall(r'"\w{2,3}":"\w{2,3}"', str(vedio_page_urls)) 35 | chapters = chapters[1:] 36 | parts = [] 37 | for chapter in chapters: 38 | [k, v] = chapter.replace('"', '').split(':') 39 | if k not in parts and v == 'NT': 40 | parts.append(k) 41 | for part in parts: 42 | # success = 0 43 | failed = 0 44 | for i in range(1, 35): 45 | txt_page_url = f'https://live.bible.is/bible/{country_name_en}/{part}/{i}?audio_type=audio' 46 | if failed >= 5: 47 | break 48 | try: 49 | print(txt_page_url) 50 | page_response = requests.get(txt_page_url, headers={ 51 | "User_Agent": ua.chrome}) 52 | title = re.findall('book-chapter-text">(.+)>>")) 23 | end = int(input("终止页码>>>")) 24 | for num in range(start, end + 1): 25 | print(f"第{num}页") 26 | headers = ua.chrome 27 | response = requests.get(base_url.format(str(num)), 28 | headers={"User_Agent": ua.chrome}) 29 | tree = etree.HTML(response.text) 30 | article_list = tree.xpath('//div[@class="art"]/a/@href') 31 | for article in article_list: 32 | article_url = article 33 | print(article_url) 34 | for i in range(5): 35 | try: 36 | article_detail = requests.get( 37 | article_url, headers={"User_Agent": ua.chrome}) 38 | if article_detail.status_code == 200: 39 | break 40 | except requests.exceptions.ProxyError: 41 | continue 42 | else: 43 | print(f"{article_url}\t\t失败!!!") 44 | continue 45 | article_tree = etree.HTML(article_detail.text) 46 | content = article_tree.xpath('//div[@class="bbtext"]/p//text()') 47 | content = "".join(content).split(".") 48 | print(content) 49 | for index, item in enumerate(content): 50 | if not item: 51 | continue 52 | content[index] = item + "." 
53 | 54 | try: 55 | write_doc("D:\stardata\小语种爬虫\捷克语/" + "res3.docx", content) 56 | except FileNotFoundError: 57 | print(f"{article_url}\t\t出错了!!!") 58 | continue 59 | print(f"{article_url}\t\t完成!!!") 60 | print(f"第{num}页完成!!!") 61 | -------------------------------------------------------------------------------- /minority_language/saier.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import docx 4 | from lxml import etree 5 | from fake_useragent import UserAgent 6 | 7 | 8 | def write_doc(file_path, content_list): 9 | if os.path.isfile(file_path): 10 | doc = docx.Document(file_path) 11 | else: 12 | doc = docx.Document() 13 | 14 | for content in content_list: 15 | doc.add_paragraph(content) 16 | doc.save(file_path) 17 | 18 | 19 | 20 | ua = UserAgent() 21 | # base_url = "https://www.idnes.cz/sport/archiv/{}" 22 | base_url = "https://www.blic.rs/" 23 | 24 | 25 | def get_data(): 26 | headers = ua.chrome 27 | response = requests.get(base_url, headers={"User_Agent": ua.chrome}) 28 | # print(response.text) 29 | tree = etree.HTML(response.text) 30 | article_list = tree.xpath('//*[@id="top"]/div[4]/div/nav/ul/li/a/@href') 31 | print(article_list) 32 | for article in article_list[1:2]: # 类型 33 | article = article.lower() 34 | print(article) 35 | type_url = f"https://www.blic.rs/{article}" 36 | print(type_url) 37 | article_type_list = requests.get( 38 | type_url, headers={"User_Agent": ua.chrome}) 39 | res = article_type_list.text 40 | tree1 = etree.HTML(res) 41 | page_data = tree1.xpath( 42 | '//div[@class="pagination__list"]/ul/li/a/@href')[-2] 43 | page_res = page_data[:8] 44 | page = page_data[8:] 45 | print(f"{article}共有{page}页") 46 | for i in range(1, int(page)+1): # 遍历每一页 47 | print(f"{article}类型,第{i}页", "*"*20) 48 | url = type_url + page_res+str(i) 49 | article_list = requests.get( 50 | url, headers={"User_Agent": ua.chrome}).text 51 | tree1 = etree.HTML(article_list) 52 | page_detail_url_list = tree1.xpath( 53 | '/html/body/main/div/section/div[2]/section/article/div/h3/a/@href') 54 | print(page_detail_url_list) 55 | for detail_url in page_detail_url_list: 56 | for i in range(5): 57 | try: 58 | article_detail = requests.get( 59 | detail_url, headers={"User_Agent": ua.chrome}) 60 | if article_detail.status_code == 200: 61 | break 62 | except requests.exceptions.ProxyError: 63 | continue 64 | else: 65 | print(f"{detail_url}\t\t失败!!!") 66 | continue 67 | tree2 = etree.HTML(article_detail.text) 68 | content = tree2.xpath( 69 | '/html/body/main/div/article/div/div/p/text()') 70 | 71 | content = "".join(content).split(".") 72 | print(content) 73 | for index, item in enumerate(content): 74 | if not item: 75 | continue 76 | content[index] = item + "." 
77 | 78 | try: 79 | write_doc("D:\stardata\小语种爬虫\赛尔/" + 80 | "res_saier2.docx", content) 81 | except FileNotFoundError: 82 | print(f"{detail_url}\t\t出错了!!!") 83 | continue 84 | print(f"{detail_url}\t\t完成!!!") 85 | 86 | 87 | if __name__ == '__main__': 88 | get_data() 89 | -------------------------------------------------------------------------------- /reward.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/reward.jpg -------------------------------------------------------------------------------- /taobao/README.md: -------------------------------------------------------------------------------- 1 | 爬取淘宝美食的所有页面,mysql需要手动配置数据库。 2 | 3 | 数据:大约4500条数据,sql为mysql文件,josn文件为Mongodb文件 4 | 5 | 博客地址:https://blog.csdn.net/weixin_43746433 6 | 7 | 爬虫详情:https://blog.csdn.net/weixin_43746433/article/details/97623511 8 | 9 | 后续的数据分析:https://blog.csdn.net/weixin_43746433/article/details/97688169 10 | 11 | 微信:why19970628 12 | 13 | 欢迎与我交流 14 | -------------------------------------------------------------------------------- /taobao/test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pprint 3 | from lxml import etree 4 | import json 5 | import urllib 6 | from urllib import request 7 | headers={ 8 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 9 | "Accept-Encoding":"gzip, deflate, br", 10 | "Accept-Language":"zh-CN,zh;q=0.9", 11 | "Cache-Control":"max-age=0", 12 | "Connection":"keep-alive", 13 | "Cookie":"t=4f8e0f1ccaf38c3b87096409eeb1fd52; cna=FOKyFcj3bnQCAdocR8ZpjqDM; tracknick=%5Cu6211%5Cu53EB%5Cu738B%5Cu5927%5Cu9633%5Cu554A; lgc=%5Cu6211%5Cu53EB%5Cu738B%5Cu5927%5Cu9633%5Cu554A; tg=0; thw=cn; cookie2=1515a17feb4817e0b121ec61c57bdebd; _tb_token_=f346348eef015; _m_h5_tk=60367eb5c8d7c62be5befdf974af4201_1564038274783; _m_h5_tk_enc=9e7a36f5498b1568032367f248ae47ee; uc3=lg2=UIHiLt3xD8xYTw%3D%3D&id2=UUGrdwHsJB6u%2BQ%3D%3D&nk2=rUszGXlaengSz%2BTL&vt3=F8dBy3zbW%2FWdhdBl7NE%3D; uc4=nk4=0%40r7q1NfecRXnYVq4toteFS9tFPfXPIO4%3D&id4=0%40U2OcR2VRIfPxS27lnuSvz1%2BkOUiV; _cc_=U%2BGCWk%2F7og%3D%3D; enc=4wio677EOfwVE4ZtLJjx3w0OUX9gNfrhOPqVwF%2B6OyFs7QlbFG02LVHBZ7Ap4D9cFy7VZetUXAs0oBAAYBuTDQ%3D%3D; mt=ci=104_1; swfstore=307564; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; hng=CN%7Czh-CN%7CCNY%7C156; whl=-1%260%260%261564031467353; uc1=cookie14=UoTaHPgonNb53g%3D%3D; v=0; pnm_cku822=098%23E1hvrvvUvbpvUvCkvvvvvjiPRFFvsjiRnLdU1jD2PmPW0jrPP2zO1jDnRLLZtjY8iQhvChCvCCptvpvhphvvv8yCvv3vpvo1y6cQtOyCvvXmp99hetutvpvIphvvvvvvphCvpv3cvvChXZCvjvUvvhL6phvwv9vvBW1vpCQmvvChsvyCvh1hAXyvI1QaUjCwiNoOejaPJHLXSfpAOHCqVUcn%2B3C1osEc6aZtn0vHVA3lYb8rwo1%2Bm7zhdigDN5WK%2BE7reB69EcqhaB4AVAWaUExrvphvCyCCvvvvvvGCvvpvvPMM; l=cBTrwufVqMp97_ASXOCwourza77OSIRAguPzaNbMi_5BU6L1UuQOkVGcNFp6VjWd9hYB4sdB3ey9-etkiWMuGiuXppgF.; isg=BFRUAAxi9T5QyWF9SrxZjDUFJZLGrXiXR5zNBu414F9i2fQjFr1IJwpb2ZFkIbDv", 14 | "Host":"shop130809627.taobao.com", 15 | "Upgrade-Insecure-Requests":"1", 16 | "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36", 17 | } 18 | url='https://shop130809627.taobao.com/i/asynSearch.htm?_ksTS=1564031496810_573&callback=jsonp574&mid=w-18789199391-0&wid=18789199391&path=/category.htm&spm=a1z10.3-c-s.w4002-18789199391.25.300342a2PK2mOr&orderType=hotsell_desc' 19 | 
#req = request.Request(url, headers=headers) 20 | #html = request.urlopen(req).read().decode() 21 | #print(html) 22 | s= requests.session() 23 | res =s.get(url,headers=headers,verify=False) 24 | res.encoding='utf-8' 25 | #res=json.load(res) 26 | html=res.text 27 | print(html) 28 | import re 29 | a=r'.*?(.*?).*?' 30 | con=re.compile(a,re.S) 31 | links = re.findall(con, html) 32 | print(links) -------------------------------------------------------------------------------- /utils/crawlerHelper.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import redis 3 | import urllib 4 | import sys 5 | from functools import wraps 6 | import datetime 7 | import os 8 | import requests 9 | from lxml import etree 10 | from fake_useragent import UserAgent 11 | from urllib.parse import quote, urlencode 12 | import time 13 | import string 14 | import socket 15 | socket.setdefaulttimeout(6) 16 | 17 | 18 | def con_redis(): 19 | # 连接池 20 | pool = redis.ConnectionPool( 21 | host="123.56.153.183", port=6379, max_connections=1024) 22 | conn = redis.Redis(connection_pool=pool) 23 | return conn 24 | 25 | 26 | def download(url, filename, callback): 27 | """ 28 | 封装了 urlretrieve()的自定义函数,递归调用urlretrieve(),当下载失败时,重新下载,三次下载失败结束 29 | download file from internet 30 | :param url: path to download from 31 | :param savepath: path to save files 32 | :return: None 33 | """ 34 | 35 | count = 1 36 | try: 37 | urllib.request.urlretrieve(url, filename, callback) 38 | except socket.timeout: 39 | while count <= 2: 40 | try: 41 | urllib.request.urlretrieve(url, filename, callback) 42 | break 43 | except socket.timeout: 44 | err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count 45 | print(err_info) 46 | count += 1 47 | if count > 2: 48 | print("downloading picture fialed!") 49 | 50 | # try: 51 | # urllib.request.urlretrieve(url, filename, callback) 52 | # # except urllib.ContentTooShortError: 53 | # # print('Network conditions is not good.Reloading.') 54 | # # download(url, filename, callback, header) 55 | # except Exception as e: 56 | # print(e) 57 | # print('Network conditions is not good.\nReloading.....') 58 | # download(url, filename, callback) 59 | 60 | 61 | def download2(url, filename, callback): 62 | try: 63 | res = requests.get(url, timeout=10) 64 | with open(filename, 'ab') as file: # 保存到本地的文件名 65 | file.write(res.content) 66 | file.flush() 67 | except socket.timeout: 68 | print('timeouut') 69 | 70 | 71 | # 下载进度 72 | def callback(num, consumed_bytes, total_bytes): 73 | """ 74 | 显示下载文件的进度 75 | :param @num:目前为此传递的数据块数量 76 | :param @consumed_bytes:每个数据块的大小,单位是byte,字节 77 | :param @total_bytes:远程文件的大小 78 | :return: None 79 | """ 80 | # if a3: 81 | rate = int(100 * (float(num * consumed_bytes) / float(total_bytes))) 82 | print('\r{0}% '.format(rate), end='') 83 | sys.stdout.flush() 84 | 85 | 86 | def cost(func): 87 | @wraps(func) 88 | def wrapper(*args, **kwargs): 89 | start = time.time() 90 | res = func(*args, **kwargs) 91 | end = time.time() 92 | print('花费', start - end, 's') 93 | return res 94 | 95 | return wrapper 96 | -------------------------------------------------------------------------------- /yingjieshneg.com/README.md: -------------------------------------------------------------------------------- 1 | 爬取应届生求职网 招聘网站 链接、职位、公司信息等数据 2 | 3 | 网址: http://www.yingjiesheng.com/ 4 | 5 | 博客地址:https://blog.csdn.net/weixin_43746433 6 | 7 | 微信:why19970628 8 | 9 | 欢迎与我交流 10 | 
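[Editor's note on utils/crawlerHelper.py above: it bundles a download() wrapper that retries urllib's urlretrieve() on socket timeouts, a callback() progress hook, and a cost timing decorator. Below is a small usage sketch, assuming the repository root is on PYTHONPATH; the URL and file name are placeholders for illustration only. Note that cost() as written prints `start - end`, so the reported elapsed time comes out negated.]

from utils.crawlerHelper import download, callback, cost


@cost  # prints the elapsed time (negated, because the decorator computes start - end)
def fetch_one(url, filename):
    # download() makes one attempt plus up to two retries on socket.timeout
    download(url, filename, callback)


if __name__ == "__main__":
    # placeholder URL and file name, purely for illustration
    fetch_one("https://example.com/sample.jpg", "sample.jpg")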
-------------------------------------------------------------------------------- /yingjieshneg.com/yingjieshneg.py: -------------------------------------------------------------------------------- 1 | # coding = utf-8 2 | import requests, pymysql, json 3 | import time 4 | from fake_useragent import UserAgent 5 | import time, request 6 | from multiprocessing import Pool 7 | import re 8 | import pandas as pd 9 | from datetime import datetime, timedelta 10 | from lxml import etree 11 | 12 | date = (datetime.now() - timedelta(days=0)).strftime('%Y-%m-%d') 13 | 14 | # date = time.strftime('%Y-%m-%d', time.localtime(time.time())) 15 | ua = UserAgent() 16 | 17 | 18 | def get_data(url): 19 | print(url) 20 | 21 | res = requests.get(url, headers={"User-Agent": ua.chrome}) 22 | res.encoding = 'gb2312' 23 | company_pattern = re.compile( 24 | 'class="jobli".*?
.*?.*?span(.*?)', re.S) 25 | items = re.findall(company_pattern, res.text) 26 | content = [] 27 | for item in items: 28 | dataset = {} 29 | if len(item[2]) > 1 and str(item[2][1:].strip()) == str(date): 30 | time_ = item[2][1:] 31 | else: 32 | continue 33 | 34 | if len(item[0]) > 1: 35 | if "http" not in item[0]: 36 | company_url = "http://www.yingjiesheng.com" + item[0] 37 | else: 38 | company_url = item[0] 39 | else: 40 | continue 41 | dataset['url'] = company_url 42 | print(company_url) 43 | 44 | job_content = requests.get(company_url, headers={"User-Agent": ua.chrome}) 45 | if job_content.status_code == 200: 46 | job_content.encoding = 'gb2312' 47 | html = etree.HTML(job_content.text) 48 | jobs = html.xpath("//div[@class='info clearfix']/ol/li/u/text()") 49 | try: 50 | job = jobs[-1].replace('置顶"', '').strip() 51 | except: 52 | job = "" 53 | des_list = html.xpath("//div[@class='jobIntro']/div//text()") 54 | if len(des_list) == 0: 55 | des = '' 56 | else: 57 | des = '' 58 | for i in des_list: 59 | i = i.strip().replace("\n", "").replace(" ", "") 60 | des = des + " " + i 61 | print(des) 62 | 63 | else: 64 | job = "" 65 | des = "" 66 | 67 | dataset['company'] = item[1][1:].replace('置顶','').replace(" ","").strip() if len(item[1]) > 1 else '' 68 | dataset["job"] = job 69 | dataset["des"] = des 70 | dataset['time'] = (datetime.now() - timedelta(days=0)).strftime('%d/%m/%Y') 71 | print("*" * 10) 72 | print(dataset) 73 | content.append(dataset) 74 | time.sleep(5) 75 | # print(content) 76 | return content 77 | 78 | 79 | def write_to_file(content): 80 | df = pd.DataFrame(content) 81 | time.time() 82 | df.to_csv(f'{date}_company.csv', index=False, mode='a+', header=False) 83 | 84 | 85 | def run(page): 86 | url = f"http://www.yingjiesheng.com/commend-fulltime-{page}.html" 87 | data = get_data(url=url) 88 | write_to_file(data) 89 | 90 | 91 | def run2(page): 92 | url = f"http://www.yingjiesheng.com/commend-parttime-{page}.html" 93 | data = get_data(url=url) 94 | write_to_file(data) 95 | 96 | 97 | def quchong(): 98 | data = pd.read_csv(f'{date}_company.csv') 99 | data.columns = ["目标网页", "公司信息", "招聘岗位", "职位描述", "发布日期"] 100 | a = data.drop_duplicates(subset=['目标网页'], keep='first') 101 | a.to_csv(f'{date}_company.csv', index=False) 102 | 103 | 104 | if __name__ == '__main__': 105 | start = time.time() 106 | pool = Pool() 107 | pool.map(run, [i for i in range(1, 8)]) 108 | pool.map(run2, [i for i in range(1, 8)]) 109 | quchong() 110 | 111 | print('花费时间:', time.time() - start) 112 | -------------------------------------------------------------------------------- /yixuela.com/README.md: -------------------------------------------------------------------------------- 1 | 下载易学啦 各个版本(人教版、鲁人版、苏教版、沪教版、北师大版等)、年级(小、初、高)、文章中 所有图片信息, 需手动指定下载图片的本地存放目录 2 | 3 | 网址: https://www.yixuela.com/ 4 | 5 | 数据:平均每个版本大约1G数据量, 一共6G的数据, 保存方式为图片 6 | 7 | 微信:why19970628 8 | 9 | 欢迎与我交流 10 | -------------------------------------------------------------------------------- /yixuela.com/poetry.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | import sys 3 | import os 4 | import requests 5 | from lxml import etree 6 | from fake_useragent import UserAgent 7 | from urllib.parse import quote, urlencode 8 | import urllib 9 | import time 10 | import string 11 | version_links = ['hjb', 'ljb', 'bsd', 'rjb'] # 'sjb', 12 | admin = 'https://www.yixuela.com/' 13 | subject = 'yuwen/' 14 | 15 | ua = UserAgent() 16 | 17 | 18 | def get_url_link(): 19 | for version in version_links: 20 | # 
版本 21 | version_subject_url = admin + 'books/' + version + '/' + subject 22 | # 年级 23 | # for i in range(1,13): 24 | # grade = f'g{i}/' 25 | # print(version_subject_url) 26 | yield version_subject_url, version 27 | 28 | 29 | def crwal_artile_content(artitle_content_url, article_folder): 30 | """ 31 | 文章详情页爬取图片 32 | """ 33 | response = requests.get(artitle_content_url, headers={ 34 | "User_Agent": ua.chrome}) 35 | # print(response.text) 36 | tree = etree.HTML(response.text) 37 | image_name = tree.xpath( 38 | '/html/body/section/div[3]/div[1]/div[2]/img/@src') 39 | for index, name in enumerate(image_name): 40 | name = name.split('/')[-1] 41 | image_save_path = os.path.join(article_folder, name) 42 | ori_url = image_name[index] 43 | url = quote(ori_url, safe='/:?=') 44 | urllib.request.urlretrieve(url, image_save_path) 45 | print(f'{image_save_path} 爬取成功!') 46 | 47 | 48 | def crwal_artile(content_link, result_folder): 49 | response = requests.get(content_link, headers={"User_Agent": ua.chrome}) 50 | tree = etree.HTML(response.text) 51 | article_name = tree.xpath( 52 | '//div[@class="right-menu bg-white mt-10"]/nav/ul/li/a/text()') 53 | article_link = tree.xpath( 54 | '//div[@class="right-menu bg-white mt-10"]/nav/ul/li/a/@href') 55 | for index, name in enumerate(article_name): 56 | article_folder = os.path.join(result_folder, name) 57 | os.makedirs(article_folder, exist_ok=True) 58 | artitle_content_url = admin + article_link[index] 59 | try: 60 | crwal_artile_content(artitle_content_url, article_folder) 61 | except Exception as e: 62 | print(e) 63 | time.sleep(1) 64 | 65 | 66 | def run(url, result_path, version): 67 | response = requests.get(url, headers={"User_Agent": ua.chrome}) 68 | tree = etree.HTML(response.text) 69 | title = tree.xpath('//div[@class="list-warp"]/div//a/text()') 70 | title_link = tree.xpath('//div[@class="list-warp"]/div//a/@href') 71 | title_ = [i for i in title if len(i.replace(" ", '')) > 2] 72 | 73 | for index, title1 in enumerate(title_): 74 | content_link = admin + title_link[index * 2] 75 | result_folder = os.path.join(result_path, f'{version}/{title1}') 76 | os.makedirs(result_folder, exist_ok=True) 77 | try: 78 | crwal_artile(content_link, result_folder) 79 | time.sleep(2) 80 | except Exception as e: 81 | print(e) 82 | 83 | # if len(title_) != len(title_link): 84 | # raise Exception('title length error') 85 | 86 | 87 | def process(result_path): 88 | for url, version in get_url_link(): 89 | print(url) 90 | run(url, result_path, version) 91 | time.sleep(5) 92 | 93 | 94 | if __name__ == '__main__': 95 | result_path = sys.argv[1] 96 | process(result_path) 97 | -------------------------------------------------------------------------------- /微博热搜/人物.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/微博热搜/人物.xlsx -------------------------------------------------------------------------------- /微博热搜/名词.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/微博热搜/名词.xlsx -------------------------------------------------------------------------------- /微博热搜/婚恋.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/微博热搜/婚恋.xlsx 
-------------------------------------------------------------------------------- /爬取中彩网彩票/3D.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/爬取中彩网彩票/3D.xls -------------------------------------------------------------------------------- /爬取中彩网彩票/test_CaiPiao.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | import random 3 | from urllib.request import ProxyHandler, build_opener 4 | from bs4 import BeautifulSoup 5 | import xlwt 6 | def get_ip(): 7 | fr=open('D:\软件(学习)\Python\PyCharm\kaoshi\ip.txt','r') 8 | ips=fr.readlines() 9 | new=[] 10 | for line in ips: 11 | temp=line.strip() 12 | new.append(temp) 13 | ip=random.choice(new) 14 | return ip 15 | print(ip) 16 | proxy =get_ip() 17 | proxy_handler = ProxyHandler({ 18 | 'http': 'http://' + proxy, 19 | 'https': 'https://' + proxy 20 | }) 21 | fo=open('2.html','r',encoding='utf-8') 22 | html=fo.read() 23 | soup=BeautifulSoup(html,'lxml') 24 | fo.close() 25 | #opener = build_opener(proxy_handler) 26 | #for i in range(1): 27 | # url='http://kaijiang.zhcw.com/zhcw/html/3d/list_'+str(i)+'.html' 28 | # res=request.Request(url) 29 | # response=opener.open(res).read().decode('utf-8') 30 | # soup=BeautifulSoup(response,'lxml') 31 | #print(soup.select('tr')) 32 | #print(soup.select('tr')[2:-1]) 33 | pat='' 34 | a=soup.find_all('td',{'style':'padding-left:20px;'}) 35 | #print(a) 36 | #for i in a: 37 | # print(i.text) 38 | 39 | def parse_one_page(): 40 | for item in soup.select('tr')[2:-1]: 41 | i = 0 42 | yield { 43 | 44 | 'time': item.select('td')[i].text, 45 | 'issue': item.select('td')[i + 1].text, 46 | 'digits': item.select('td em')[0].text, 47 | 'ten_digits': item.select('td em')[1].text, 48 | 'hundred_digits': item.select('td em')[2].text, 49 | 'single_selection': item.select('td')[i + 3].text, 50 | 'group_selection_3': item.select('td')[i + 4].text, 51 | 'group_selection_6': item.select('td')[i + 5].text, 52 | 'sales': item.select('td')[i + 6].text, 53 | 'return_rates': item.select('td')[i + 7].text 54 | } 55 | parse_one_page() 56 | def write_to_excel(): 57 | 58 | 59 | f = xlwt.Workbook() 60 | 61 | sheet1 = f.add_sheet('3D',cell_overwrite_ok=True) 62 | 63 | row0 = ["开奖日期","期号","个位数","十位数","百位数","单数","组选3","组选6","销售额","返奖比例"] #写入第一行 64 | 65 | for j in range(0,len(row0)): 66 | 67 | sheet1.write(0,j,row0[j]) 68 | #依次爬取每一页内容的每一期信息,并将其依次写入Excel 69 | 70 | #写入每一期的信息 71 | i = 0 72 | 73 | 74 | for item in parse_one_page(): 75 | 76 | sheet1.write(i+1,0,item['time']) 77 | 78 | sheet1.write(i+1,1,item['issue']) 79 | 80 | sheet1.write(i+1,2,item['digits']) 81 | 82 | sheet1.write(i+1,3,item['ten_digits']) 83 | 84 | sheet1.write(i+1,4,item['hundred_digits']) 85 | 86 | sheet1.write(i+1,5,item['single_selection']) 87 | 88 | sheet1.write(i+1,6,item['group_selection_3']) 89 | 90 | sheet1.write(i+1,7,item['group_selection_6']) 91 | 92 | sheet1.write(i+1,8,item['sales']) 93 | 94 | sheet1.write(i+1,9,item['return_rates']) 95 | i+=1 96 | 97 | f.save('3D.xls') 98 | #write_to_excel() 99 | -------------------------------------------------------------------------------- /高考志愿网/README.md: -------------------------------------------------------------------------------- 1 | 高考志愿填报网 2 | 3 | 网址 https://gkcx.eol.cn/school/search 4 | 5 | gkzy.py,通过api接口,抓取该网站所有学校的信息,如学校类型,位置,历年分数线,排名等等 6 | 7 | gkzy2.py 抓取各个院校的招生人数、招生计划、开放专业名称与介绍,五年的文理科的分数线、五年的历年学校批次等维度 8 | 9 | 数据量约为2500个 
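[Editor's note: the 高考志愿网 README above says gkzy.py walks the site's API page by page to collect school type, location, historical score lines and rankings for roughly 2,500 schools, but only a raw link to the script is included in this dump. The sketch below shows the general paginated-API pattern being described; the endpoint, query parameters and response field names are placeholders, not the real gkcx.eol.cn interface.]

# Hedged sketch of the paginated-API crawl described in the README; everything
# marked "placeholder"/"hypothetical" is invented for illustration only.
import csv
import time

import requests

API_URL = "https://example.com/api/school/list"   # placeholder endpoint
PAGE_SIZE = 20


def fetch_page(page):
    """Request one page of school records and return the parsed list."""
    params = {"page": page, "size": PAGE_SIZE}    # hypothetical parameters
    resp = requests.get(API_URL, params=params, timeout=10)
    resp.raise_for_status()
    return resp.json().get("items", [])           # hypothetical response layout


def crawl(csv_path="schools.csv", max_pages=130):  # ~2500 records at 20 per page
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["name", "type", "province", "rank"])
        for page in range(1, max_pages + 1):
            rows = fetch_page(page)
            if not rows:
                break
            for r in rows:
                writer.writerow([r.get("name"), r.get("type"),
                                 r.get("province"), r.get("rank")])
            time.sleep(1)                          # be polite between requests


if __name__ == "__main__":
    crawl()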
-------------------------------------------------------------------------------- /高考志愿网/gkzy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/高考志愿网/gkzy.py -------------------------------------------------------------------------------- /高考志愿网/gkzy2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/高考志愿网/gkzy2.py -------------------------------------------------------------------------------- /高考网/main.py: -------------------------------------------------------------------------------- 1 | 1# coding = utf-8 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import os 6 | import time 7 | 8 | 9 | def get_data(): 10 | for i in range(1, 108): 11 | print("正在下载第%s页数据" % i) 12 | url = 'http://college.gaokao.com/schlist/p%s' % i 13 | res = requests.get(url).text 14 | content = BeautifulSoup(res, "html.parser") 15 | college_list = content.find('div', attrs={'class': 'scores_List'}).find_all('dl') 16 | items = map(parse_item, college_list) 17 | save_to_csv(items) 18 | time.sleep(1) 19 | 20 | 21 | def parse_item(item): 22 | college_name = item.find('strong')['title'] 23 | college_attr = item.find_all('li') 24 | college_site = college_attr[0].text[6:] 25 | college_title = college_attr[1].text[5:] 26 | college_type = college_attr[2].text[5:] 27 | college_belong = college_attr[3].text[5:] 28 | college_nature = college_attr[4].text[5:] 29 | college_website = college_attr[5].text[5:] 30 | result = { 31 | 'college_name': college_name, 32 | 'college_site': college_site, 33 | 'college_title': college_title, 34 | 'college_type': college_type, 35 | 'college_belong': college_belong, 36 | 'college_nature': college_nature, 37 | 'college_website': college_website 38 | } 39 | print(result) 40 | return result 41 | 42 | 43 | def save_to_csv(data): 44 | if not os.path.exists(r'college_data.csv'): 45 | with open('college_data.csv', 'a+', encoding='utf-8') as f: 46 | f.write('name,site,title,type,belong,nature,website\n') 47 | for d in data: 48 | try: 49 | row = '{},{},{},{},{},{},{}'.format(d['college_name'], 50 | d['college_site'], 51 | d['college_title'], 52 | d['college_type'], 53 | d['college_belong'], 54 | d['college_nature'], 55 | d['college_website']) 56 | f.write(row) 57 | f.write('\n') 58 | except: 59 | continue 60 | else: 61 | with open('college_data.csv', 'a+', encoding='utf-8') as f: 62 | for d in data: 63 | try: 64 | row = '{},{},{},{},{},{},{}'.format(d['college_name'], 65 | d['college_site'], 66 | d['college_title'], 67 | d['college_type'], 68 | d['college_belong'], 69 | d['college_nature'], 70 | d['college_website']) 71 | f.write(row) 72 | f.write('\n') 73 | except: 74 | continue 75 | 76 | 77 | if __name__ == '__main__': 78 | get_data() 79 | -------------------------------------------------------------------------------- /高考网/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/why19970628/Python_Crawler/23ba5cf5ad12d5d0f9f3d2376c0c0ea32fc3d2de/高考网/readme.md -------------------------------------------------------------------------------- /高考网/北京上海江苏高质量高校占比.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Awesome-pyecharts 6 | 7 | 8 | 9 | 10 | 11 |
[remainder of the pyecharts chart page; the HTML/JS markup was stripped during extraction, leaving only line numbers and the page title "Awesome-pyecharts"] -------------------------------------------------------------------------------- /高考网/北京高质量高校占比.html: -------------------------------------------------------------------------------- [pyecharts chart page titled "Awesome-pyecharts"; HTML/JS markup stripped during extraction] -------------------------------------------------------------------------------- /高考网/占比前十城市高质量高校占比.html: -------------------------------------------------------------------------------- [pyecharts chart page titled "Awesome-pyecharts"; HTML/JS markup stripped during extraction] -------------------------------------------------------------------------------- /高考网/高校属性分析pie.html: -------------------------------------------------------------------------------- [pyecharts chart page titled "Awesome-pyecharts"; HTML/JS markup stripped during extraction] --------------------------------------------------------------------------------