├── README.md
├── Visualization
├── COVID-19_Tracking-master.zip
├── Epidemic-analysis
│ ├── .ipynb_checkpoints
│ │ └── Epidemic-analysis-checkpoint.ipynb
│ ├── Epidemic-analysis.ipynb
│ ├── READEME.md
│ └── requirements.txt
├── Python疫情监控.zip
├── Python疫情监控
│ ├── notebook笔记
│ │ └── 疫情监控.ipynb
│ ├── python疫情监控实战-东方瑞通.pdf
│ ├── 环境准备.txt
│ └── 项目源码
│ │ └── Cov
│ │ ├── .idea
│ │ ├── Cov.iml
│ │ ├── misc.xml
│ │ ├── modules.xml
│ │ └── workspace.xml
│ │ ├── __pycache__
│ │ ├── app.cpython-37.pyc
│ │ └── utils.cpython-37.pyc
│ │ ├── app.py
│ │ ├── spider.py
│ │ ├── static
│ │ ├── css
│ │ │ └── main.css
│ │ └── js
│ │ │ ├── china.js
│ │ │ ├── controller.js
│ │ │ ├── ec_center.js
│ │ │ ├── ec_left1.js
│ │ │ ├── ec_left2.js
│ │ │ ├── ec_right1.js
│ │ │ ├── ec_right2.js
│ │ │ ├── echarts-wordcloud.min.js
│ │ │ ├── echarts.min.js
│ │ │ └── jquery-1.11.1.min.js
│ │ ├── templates
│ │ ├── index.html
│ │ ├── main.html
│ │ └── test.html
│ │ └── utils.py
└── maoyanMovies_comments
│ ├── analysis.py
│ ├── movieswd.py
│ ├── t1.jpg
│ ├── test.py
│ ├── venmo1.jpg
│ ├── venom.jpg
│ ├── wd.py
│ ├── 观众位置分布-地理坐标图.html
│ └── 观众来源排行-柱状图.html
├── pythonScript
├── Certificate_photo_for_background_color
│ └── main.py
├── OlympicGamesGoldenNotify
│ ├── __pycache__
│ │ ├── mail.cpython-37.pyc
│ │ └── medals.cpython-37.pyc
│ ├── index.py
│ ├── mail.py
│ └── medals.py
├── WordCloud
│ ├── Image-coloredwordcloud.py
│ ├── Maskedwordcloud.py
│ ├── alice_color.png
│ ├── coloredWd.py
│ └── comments.txt
├── autoVote
│ ├── autoVote.py
│ ├── cookie.txt
│ └── getCookie.py
├── birthdayNotify
│ ├── __pycache__
│ │ └── lunar.cpython-37.pyc
│ ├── birthday.json
│ ├── birthdayNotify.zip
│ ├── index.py
│ └── text.py
├── dingReminder
│ └── dingReminder.py
├── draw_excel
│ ├── 1.jpg
│ ├── 2.jpg
│ ├── 4k_1.jpg
│ ├── draw_excel.py
│ └── iu.jpg
├── messageReminder
│ ├── README.md
│ └── messageReminder.py
├── miStoreBuy
│ ├── MiStore.py
│ └── debug.log
├── pdfToExcel
│ ├── README.md
│ ├── pdfToExcel.py
│ └── 新建 Microsoft Word 文档.docx
├── poem
│ ├── Oxford3000.py
│ ├── TangshiGene.py
│ ├── TangshiGene2.py
│ ├── __init__.py
│ ├── dataHandler.py
│ ├── test.py
│ └── zzcf.py
├── studyReminder
│ └── studyRemidner.py
├── telegramPushBot
│ ├── ht.sh
│ └── locpush.py
├── tianyi-zhuancun
│ ├── README.md
│ ├── sec1.png
│ ├── sec2.png
│ └── zhuancun.py
└── year_code
│ ├── code_dir
│ └── readme.md
│ ├── data.csv
│ ├── readme.md
│ ├── show_res
│ ├── data_csv.jpg
│ ├── py_output.jpg
│ ├── py_statistic.jpg
│ └── sort_csv.jpg
│ ├── sort_data.csv
│ └── statistic.py
└── scrapy
├── 2019-nCov-cn
├── city.py
└── province.py
├── 51job-scrapy
├── 2020-09-25_java开发工程师岗位招聘信息.csv
├── 2020-09-25_python开发工程师岗位招聘信息.csv
├── 2020-09-25_python爬虫工程师岗位招聘信息.csv
├── 2020-09-25_python算法工程师岗位招聘信息.csv
├── 2020-09-27_python开发工程师岗位招聘信息.csv
├── 2020-09-27_python爬虫工程师岗位招聘信息.csv
├── 2020-09-27_python算法工程师岗位招聘信息.csv
├── 2021-03-12_Java开发工程师岗位招聘信息.csv
└── 51jobs.py
├── UnsplashCrawler
└── UnsplashCrawler.py
├── WeChatArticle
└── WecArticle.py
├── cf-ipv6
├── cf_ipv6_scan.py
├── cf_valid_ipv6_scan_2606_4700_.txt
└── ping.py
├── douyin-grils-down
├── README.md
├── douyin_appium.py
├── douyin_download.py
└── douyin_mitmdump.py
├── ipProxyPool
└── kuaidaili.py
├── jdCellPhone
└── cellPhone.py
├── postgraduate_colleges
├── PostgraduateColleges.xlsx
├── postgraduatecolleges.csv
└── 字段属性.txt
├── scrapy163musicComments
├── scrapyWyycomments.py
└── 网易云音乐精彩评论.csv
├── vip-item
├── README.md
├── requirements.txt
└── vip.py
├── ximalaya
└── ximalaya.py
├── yunzhanImgToPdf
├── README.md
└── main.py
└── zhihu-pretty-girl
├── READEME.md
├── requirements.txt
└── zhihu-pretty-girl.py
/README.md:
--------------------------------------------------------------------------------
1 | # YINUXY's Python Script Collection #
2 |
3 | ## pythonScript ##
4 | * [ID-photo background color replacement](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/Certificate_photo_for_background_color/)
5 | * [Clock-in/clock-out reminder](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/messageReminder/)
6 | * [Draw pictures inside Excel with Python](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/draw_excel/)
7 | * ["Jing Lei" lyrics generator](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/jingLei-songsGenerator/)
8 | * [Mi Store flash-sale purchase bot](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/miStoreBuy/)
9 | * [PDF to Excel](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/pdfToExcel/)
10 | * [Batch download Pixiv images](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/Pixiv/)
11 | * [Poem generator](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/poem/)
12 | * [Python random images](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/random_images/)
13 | * [Hostloc new-post push notifications](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/telegramPushBot/)
14 | * [One-click transfer of Tianyi Cloud resources](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/tianyi-zhuancun/)
15 | * [Create word-cloud images with Python](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/WordCloud/)
16 | * [Count a year of written code with Python](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/year_code/)
17 |
18 |
19 | ## scrapy ##
20 | * [51job job-listing scraper](https://github.com/InfiniteYinux/Python/tree/master/scrapy/51job-scrapy/)
21 | * [Cloudflare IP scanner](https://github.com/InfiniteYinux/Python/tree/master/scrapy/cf-ipv6/)
22 | * [China COVID-19 data scraper](https://github.com/InfiniteYinux/Python/tree/master/scrapy/2019-nCov-cn/)
23 | * [Batch download Douyin videos with Python](https://github.com/InfiniteYinux/Python/tree/master/scrapy/douyin-grils-down/)
24 | * [Batch download wallhaven wallpapers](https://github.com/InfiniteYinux/Python/tree/master/scrapy/img-spider-wallhaven/)
25 | * [JD.com cell-phone data scraper](https://github.com/InfiniteYinux/Python/tree/master/scrapy/jdCellPhone/)
26 | * [NetEase Cloud Music top-comments scraper](https://github.com/InfiniteYinux/Python/tree/master/scrapy/scrapy163musicComments/)
27 | * [Batch download Unsplash images](https://github.com/InfiniteYinux/Python/tree/master/scrapy/UnsplashCrawler/)
28 | * [Vipshop product data scraper](https://github.com/InfiniteYinux/Python/tree/master/scrapy/vip-item/)
29 | * [WeChat Official Account article scraper](https://github.com/InfiniteYinux/Python/tree/master/scrapy/WeChatArticle/)
30 | * [Ximalaya audiobook scraper](https://github.com/InfiniteYinux/Python/tree/master/scrapy/ximalaya/)
31 | * [Scrape good-looking girls' photos from Zhihu](https://github.com/InfiniteYinux/Python/tree/master/scrapy/zhihu-pretty-girl/)
32 |
33 | ## Visualization ##
34 | * [Animate global COVID-19 trends with the Python visualization library Plotly](https://github.com/InfiniteYinux/Python/tree/master/Visualization/Epidemic-analysis/)
35 | * [Maoyan movie-comment data visualization](https://github.com/InfiniteYinux/Python/tree/master/Visualization/maoyanMovies_comments/)
36 | * [Python COVID-19 monitoring dashboard deployment](https://github.com/InfiniteYinux/Python/tree/master/Visualization/Python疫情监控/)
--------------------------------------------------------------------------------
/Visualization/COVID-19_Tracking-master.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/COVID-19_Tracking-master.zip
--------------------------------------------------------------------------------
/Visualization/Epidemic-analysis/READEME.md:
--------------------------------------------------------------------------------
1 | ## WeChat Official Account ##
2 |
3 | Scan the QR code to follow
4 | ## Blog ##
5 | [YINUXY'S BLOG](https://blog.yinuxy.com/)
6 |
7 | ## Installation & Usage ##
8 | ### Install dependencies ###
9 | `pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com`
10 | ### Get the code ###
11 | `git clone git@github.com:InfiniteYinux/Python.git`
12 | ### Run ###
13 | Running it in `Jupyter Notebook` is recommended
--------------------------------------------------------------------------------
/Visualization/Epidemic-analysis/requirements.txt:
--------------------------------------------------------------------------------
1 | akshare
2 | pandas
3 | plotly
--------------------------------------------------------------------------------
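The Epidemic-analysis notebook (see its README and the requirements above) pulls epidemic data with akshare, shapes it with pandas, and animates the trend with Plotly. The following is only a minimal, hypothetical sketch of that kind of animated Plotly chart; the DataFrame columns (`date`, `country`, `confirm`) and the CSV file name are assumptions, not code taken from the notebook.

```python
# Minimal sketch (assumed data layout, not the notebook's actual code):
# animate cumulative confirmed cases per country with Plotly Express.
import pandas as pd
import plotly.express as px

# Assumed input: one row per (date, country) with a cumulative "confirm" column.
df = pd.read_csv("covid_world.csv", parse_dates=["date"]).sort_values("date")
df["date"] = df["date"].dt.strftime("%Y-%m-%d")  # animation frames want strings

fig = px.bar(
    df,
    x="country", y="confirm", color="country",
    animation_frame="date",                      # one frame per day
    range_y=[0, df["confirm"].max() * 1.1],
    title="Cumulative confirmed cases by country",
)
fig.show()
```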
/Visualization/Python疫情监控.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/Python疫情监控.zip
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/python疫情监控实战-东方瑞通.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/Python疫情监控/python疫情监控实战-东方瑞通.pdf
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/环境准备.txt:
--------------------------------------------------------------------------------
1 | python 3.7
2 | mysql
3 | pycharm
4 | jupyter notebook
5 | hbuilder
6 | linux (centos7)
7 |
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/.idea/Cov.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/__pycache__/app.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/Python疫情监控/项目源码/Cov/__pycache__/app.cpython-37.pyc
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/Python疫情监控/项目源码/Cov/__pycache__/utils.cpython-37.pyc
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 | from flask import request
3 | from flask import render_template
4 | from flask import jsonify
5 | from jieba.analyse import extract_tags
6 | import string
7 | import utils
8 |
9 | app = Flask(__name__)
10 |
11 |
12 | @app.route('/')
13 | def hello_world():
14 | return render_template("main.html")
15 |
16 | @app.route("/c1")
17 | def get_c1_data():
18 | data = utils.get_c1_data()
19 | return jsonify({"confirm":data[0],"suspect":data[1],"heal":data[2],"dead":data[3]})
20 |
21 | @app.route("/c2")
22 | def get_c2_data():
23 | res = []
24 | for tup in utils.get_c2_data():
25 | # print(tup)
26 | res.append({"name":tup[0],"value":int(tup[1])})
27 | return jsonify({"data":res})
28 |
29 | @app.route("/l1")
30 | def get_l1_data():
31 | data = utils.get_l1_data()
32 | day,confirm,suspect,heal,dead = [],[],[],[],[]
33 | for a,b,c,d,e in data[7:]:
34 | day.append(a.strftime("%m-%d")) # a is a datetime object
35 | confirm.append(b)
36 | suspect.append(c)
37 | heal.append(d)
38 | dead.append(e)
39 | return jsonify({"day":day,"confirm": confirm, "suspect": suspect, "heal": heal, "dead": dead})
40 |
41 | @app.route("/l2")
42 | def get_l2_data():
43 | data = utils.get_l2_data()
44 | day, confirm_add, suspect_add = [], [], []
45 | for a, b, c in data[7:]:
46 | day.append(a.strftime("%m-%d")) # a is a datetime object
47 | confirm_add.append(b)
48 | suspect_add.append(c)
49 | return jsonify({"day": day, "confirm_add": confirm_add, "suspect_add": suspect_add})
50 |
51 | @app.route("/r1")
52 | def get_r1_data():
53 | data = utils.get_r1_data()
54 | city = []
55 | confirm = []
56 | for k,v in data:
57 | city.append(k)
58 | confirm.append(int(v))
59 | return jsonify({"city": city, "confirm": confirm})
60 |
61 |
62 | @app.route("/r2")
63 | def get_r2_data():
64 | data = utils.get_r2_data() # format: (('民警抗疫一线奋战16天牺牲1037364',), ('四川再派两批医疗队1537382',), ...)
65 | d = []
66 | for i in data:
67 | k = i[0].rstrip(string.digits) # strip the trailing hot-search count
68 | v = i[0][len(k):] # keep only the hot-search count
69 | ks = extract_tags(k) # extract keywords with jieba
70 | for j in ks:
71 | if not j.isdigit():
72 | d.append({"name": j, "value": v})
73 | return jsonify({"kws": d})
74 |
75 | @app.route("/time")
76 | def get_time():
77 | return utils.get_time()
78 |
79 | @app.route('/ajax',methods=["get","post"])
80 | def hello_world4():
81 | name = request.values.get("name")
82 | score = request.values.get("score")
83 | print(f"name:{name},score:{score}")
84 | return '10000'
85 |
86 | @app.route('/tem')
87 | def hello_world3():
88 | return render_template("index.html")
89 |
90 | @app.route('/login')
91 | def hello_world2():
92 | name = request.values.get("name")
93 | pwd = request.values.get("pwd")
94 | return f'name={name},pwd={pwd}'
95 |
96 | @app.route("/abc")
97 | def hello_world1():
98 | id = request.values.get("id")
99 | return f"""
100 |
105 | """
106 |
107 | if __name__ == '__main__':
108 | app.run(host="0.0.0.0")
109 |
--------------------------------------------------------------------------------
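app.py exposes each dashboard panel as a small JSON endpoint (/c1, /c2, /l1, /l2, /r1, /r2) plus /time, which the front-end (controller.js) polls. A quick way to sanity-check the server after `python app.py` starts it is to hit those endpoints directly; the sketch below is illustrative only and assumes the app is reachable on Flask's default port 5000.

```python
# Hypothetical smoke test for the JSON endpoints defined in app.py above.
# Assumes the Flask app is running locally on the default port (5000).
import requests

BASE = "http://127.0.0.1:5000"

for path in ("/time", "/c1", "/c2", "/l1", "/l2", "/r1", "/r2"):
    resp = requests.get(BASE + path, timeout=5)
    resp.raise_for_status()
    # /time returns a plain string; the other routes return JSON payloads
    body = resp.text if path == "/time" else resp.json()
    print(path, "->", str(body)[:80])
```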
/Visualization/Python疫情监控/项目源码/Cov/spider.py:
--------------------------------------------------------------------------------
1 | from selenium.webdriver import Chrome, ChromeOptions
2 | import requests
3 | import pymysql
4 | import time
5 | import json
6 | import traceback
7 | import sys
8 |
9 | def get_conn():
10 | """
11 | :return: connection, cursor
12 | """
13 | # create the connection
14 | conn = pymysql.connect(host="localhost",
15 | user="root",
16 | password="yinuxy",
17 | db="cov",
18 | charset="utf8")
19 | # create the cursor
20 | cursor = conn.cursor() # result sets are returned as tuples by default
21 | return conn, cursor
22 |
23 |
24 | def close_conn(conn, cursor):
25 | if cursor:
26 | cursor.close()
27 | if conn:
28 | conn.close()
29 |
30 |
31 | def get_tencent_data():
32 | """
33 | :return: history data plus today's per-city details
34 | """
35 | url = 'https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=chinaDayList,chinaDayAddList,cityStatis,nowConfirmStatis,provinceCompare'
36 | headers = {
37 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
38 | }
39 | r = requests.get(url, headers=headers)
40 | res = json.loads(r.text) # parse the JSON string into a dict
41 | data_all = json.loads(res['data'])
42 |
43 | history = {} # history data
44 | for i in data_all["chinaDayList"]:
45 | ds = "2020." + i["date"]
46 | tup = time.strptime(ds, "%Y.%m.%d")
47 | ds = time.strftime("%Y-%m-%d", tup) # reformat the date string, otherwise the insert fails: the column is a datetime
48 | confirm = i["confirm"]
49 | suspect = i["suspect"]
50 | heal = i["heal"]
51 | dead = i["dead"]
52 | history[ds] = {"confirm": confirm, "suspect": suspect, "heal": heal, "dead": dead}
53 | for i in data_all["chinaDayAddList"]:
54 | ds = "2020." + i["date"]
55 | tup = time.strptime(ds, "%Y.%m.%d")
56 | ds = time.strftime("%Y-%m-%d", tup)
57 | confirm = i["confirm"]
58 | suspect = i["suspect"]
59 | heal = i["heal"]
60 | dead = i["dead"]
61 | history[ds].update({"confirm_add": confirm, "suspect_add": suspect, "heal_add": heal, "dead_add": dead})
62 |
63 | details = [] # today's detailed data
64 | update_time = data_all["lastUpdateTime"]
65 | data_country = data_all["areaTree"] # list of countries (25 entries)
66 | data_province = data_country[0]["children"] # provinces of China
67 | for pro_infos in data_province:
68 | province = pro_infos["name"] # province name
69 | for city_infos in pro_infos["children"]:
70 | city = city_infos["name"]
71 | confirm = city_infos["total"]["confirm"]
72 | confirm_add = city_infos["today"]["confirm"]
73 | heal = city_infos["total"]["heal"]
74 | dead = city_infos["total"]["dead"]
75 | details.append([update_time, province, city, confirm, confirm_add, heal, dead])
76 | return history, details
77 |
78 |
79 | def get_baidu_hot():
80 | """
81 | :return: Baidu epidemic hot-search entries
82 | """
83 | option = ChromeOptions() # create a Chrome browser instance
84 | option.add_argument("--headless") # run the browser headless
85 | option.add_argument('--no-sandbox')
86 |
87 | url = "https://voice.baidu.com/act/virussearch/virussearch?from=osari_map&tab=0&infomore=1"
88 | browser = Chrome(options=option,executable_path="./chromedriver.exe")
89 | browser.get(url)
90 | # locate the "expand" button
91 | dl = browser.find_element_by_xpath('//*[@id="main"]/div/div/section/div[2]/div/div[2]/section/div')
92 | dl.click()
93 | time.sleep(1)
94 | # locate the hot-search entries
95 | c = browser.find_elements_by_xpath('//*[@id="main"]/div/div/section/div[2]/div/div[2]/section/a/div/span[2]')
96 | context = [i.text for i in c] # grab the text of each entry
97 | print(context)
98 | return context
99 |
100 |
101 | def update_hotsearch():
102 | """
103 | Insert the epidemic hot-search entries into the database
104 | :return:
105 | """
106 | cursor = None
107 | conn = None
108 | try:
109 | context = get_baidu_hot()
110 | print(f"{time.asctime()}开始更新热搜数据")
111 | conn, cursor = get_conn()
112 | sql = "insert into hotsearch(dt,content) values(%s,%s)"
113 | ts = time.strftime("%Y-%m-%d %X")
114 | for i in context:
115 | cursor.execute(sql, (ts, i)) # insert one row
116 | conn.commit() # commit the transaction to persist the data
117 | print(f"{time.asctime()}数据更新完毕")
118 | except:
119 | traceback.print_exc()
120 | finally:
121 | close_conn(conn, cursor)
122 |
123 |
124 | def update_details():
125 | """
126 | Update the details table
127 | :return:
128 | """
129 | cursor = None
130 | conn = None
131 | try:
132 | li = get_tencent_data()[1] # index 0 is the history dict, index 1 the latest details list
133 | conn, cursor = get_conn()
134 | sql = "insert into details(update_time,province,city,confirm,confirm_add,heal,dead) values(%s,%s,%s,%s,%s,%s,%s)"
135 | sql_query = 'select %s=(select update_time from details order by id desc limit 1)' # compare against the latest stored timestamp
136 | cursor.execute(sql_query,li[0][0])
137 | if not cursor.fetchone()[0]:
138 | print(f"{time.asctime()}开始更新最新数据")
139 | for item in li:
140 | cursor.execute(sql, item)
141 | conn.commit() # commit the transaction (update/delete/insert)
142 | print(f"{time.asctime()}更新最新数据完毕")
143 | else:
144 | print(f"{time.asctime()}已是最新数据!")
145 | except:
146 | traceback.print_exc()
147 | finally:
148 | close_conn(conn, cursor)
149 |
150 |
151 | def insert_history():
152 | """
153 | Insert the historical data
154 | :return:
155 | """
156 | cursor = None
157 | conn = None
158 | try:
159 | dic = get_tencent_data()[0] # index 0 is the history dict, index 1 the latest details list
160 | print(f"{time.asctime()}开始插入历史数据")
161 | conn, cursor = get_conn()
162 | sql = "insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
163 | for k, v in dic.items():
164 | # item format: {'2020-01-13': {'confirm': 41, 'suspect': 0, 'heal': 0, 'dead': 1}}
165 | cursor.execute(sql, [k, v.get("confirm"), v.get("confirm_add"), v.get("suspect"),
166 | v.get("suspect_add"), v.get("heal"), v.get("heal_add"),
167 | v.get("dead"), v.get("dead_add")])
168 |
169 | conn.commit() # commit the transaction (update/delete/insert)
170 | print(f"{time.asctime()}插入历史数据完毕")
171 | except:
172 | traceback.print_exc()
173 | finally:
174 | close_conn(conn, cursor)
175 |
176 |
177 | def update_history():
178 | """
179 | Update the historical data
180 | :return:
181 | """
182 | cursor = None
183 | conn = None
184 | try:
185 | dic = get_tencent_data()[0] # index 0 is the history dict, index 1 the latest details list
186 | print(f"{time.asctime()}开始更新历史数据")
187 | conn, cursor = get_conn()
188 | sql = "insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
189 | sql_query = "select confirm from history where ds=%s"
190 | for k, v in dic.items():
191 | # item format: {'2020-01-13': {'confirm': 41, 'suspect': 0, 'heal': 0, 'dead': 1}}
192 | if not cursor.execute(sql_query, k):
193 | cursor.execute(sql, [k, v.get("confirm"), v.get("confirm_add"), v.get("suspect"),
194 | v.get("suspect_add"), v.get("heal"), v.get("heal_add"),
195 | v.get("dead"), v.get("dead_add")])
196 | conn.commit() # commit the transaction (update/delete/insert)
197 | print(f"{time.asctime()}历史数据更新完毕")
198 | except:
199 | traceback.print_exc()
200 | finally:
201 | close_conn(conn, cursor)
202 |
203 |
204 | if __name__ == "__main__":
205 | l = len(sys.argv)
206 | if l == 1:
207 | s = """
208 | 请输入参数
209 | 参数说明:
210 | up_his 更新历史记录表
211 | up_hot 更新实时热搜
212 | up_det 更新详细表
213 | """
214 | print(s)
215 | else:
216 | order = sys.argv[1]
217 | if order == "up_his":
218 | update_history()
219 | elif order == "up_det":
220 | update_details()
221 | elif order == "up_hot":
222 | update_hotsearch()
223 |
224 |
225 |
226 |
--------------------------------------------------------------------------------
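spider.py writes into three MySQL tables (history, details and hotsearch) in a database called cov, but the repository does not ship the table definitions. The sketch below reconstructs a plausible schema purely from the INSERT and SELECT statements in spider.py and utils.py; the column types, key choices and connection credentials are assumptions, not the author's original DDL.

```python
# Assumed/reconstructed schema for the "cov" database used by spider.py and utils.py.
# Column names and order follow the INSERT statements; types and keys are guesses.
import pymysql

DDL = """
CREATE DATABASE IF NOT EXISTS cov DEFAULT CHARSET utf8;
USE cov;
CREATE TABLE IF NOT EXISTS history (
    ds DATETIME PRIMARY KEY,
    confirm INT, confirm_add INT,
    suspect INT, suspect_add INT,
    heal INT, heal_add INT,
    dead INT, dead_add INT
);
CREATE TABLE IF NOT EXISTS details (
    id INT PRIMARY KEY AUTO_INCREMENT,
    update_time DATETIME,
    province VARCHAR(50),
    city VARCHAR(50),
    confirm INT, confirm_add INT, heal INT, dead INT
);
CREATE TABLE IF NOT EXISTS hotsearch (
    id INT PRIMARY KEY AUTO_INCREMENT,
    dt DATETIME,
    content VARCHAR(255)
);
"""

conn = pymysql.connect(host="localhost", user="root", password="yinuxy", charset="utf8")
with conn.cursor() as cursor:
    for stmt in DDL.split(";"):
        if stmt.strip():
            cursor.execute(stmt)
conn.commit()
conn.close()
```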
/Visualization/Python疫情监控/项目源码/Cov/static/css/main.css:
--------------------------------------------------------------------------------
1 | body {
2 | margin: 0;
3 | background: #333;
4 | }
5 |
6 | #title {
7 | position: absolute;
8 | width: 40%;
9 | height: 10%;
10 | top: 0;
11 | left: 30%;
12 | /* background: #666666; */
13 | color: white;
14 | font-size: 40px;
15 |
16 | display: flex;
17 | align-items: center;
18 | justify-content: center;
19 | }
20 |
21 | #tim {
22 | position: absolute;
23 | /* width: 30%; */
24 | height: 10%;
25 | right: 2%;
26 | top: 5%;
27 | color: #FFFFFF;
28 | font-size: 16px;
29 | }
30 |
31 | #c1 {
32 | position: absolute;
33 | width: 40%;
34 | height: 25%;
35 | top: 10%;
36 | left: 30%;
37 | color: white
38 | /* background: #777777; */
39 | }
40 |
41 | .num {
42 | width: 25%;
43 | float: left;
44 | display: flex;
45 | align-items: center;
46 | justify-content: center;
47 | color: gold;
48 | font-size: 20px;
49 | /*margin-top: 20px;*/
50 | }
51 |
52 | .txt {
53 | width: 25%;
54 | float: left;
55 | font-family: "幼圆";
56 | display: flex;
57 | align-items: center;
58 | justify-content: center;
59 | }
60 |
61 | .txt h2 {
62 | margin: 0;
63 | }
64 |
65 |
66 |
67 |
68 |
69 |
70 | #c2 {
71 | position: absolute;
72 | width: 40%;
73 | height: 65%;
74 | top: 35%;
75 | left: 30%;
76 | background: #888888;
77 | }
78 |
79 | #l1 {
80 | position: absolute;
81 | width: 30%;
82 | height: 45%;
83 | top: 10%;
84 | left: 0%;
85 | background: #666666;
86 | }
87 |
88 | #l2 {
89 | position: absolute;
90 | width: 30%;
91 | height: 45%;
92 | top: 55%;
93 | left: 0%;
94 | background: #777;
95 | }
96 |
97 | #r1 {
98 | position: absolute;
99 | width: 30%;
100 | height: 45%;
101 | top: 10%;
102 | right: 0%;
103 | background: #666666;
104 | }
105 |
106 | #r2 {
107 | position: absolute;
108 | width: 30%;
109 | height: 45%;
110 | top: 55%;
111 | right: 0%;
112 | background: #777;
113 | }
114 |
115 | /* Auto-rotate to landscape on phones for proper display, using the #gyroContain id defined below */
116 | @media screen and (orientation: portrait) {
117 | html{
118 | width : 100vmin;
119 | height : 100vmax;
120 | }
121 | body{
122 | width : 100vmin;
123 | height : 100vmax;
124 | }
125 | #gyroContain{
126 | width : 100vmax;
127 | height : 100vmin;
128 | transform-origin: top left;
129 | transform: rotate(90deg) translate(0,-100vmin);
130 | }
131 | }
132 | @media screen and (orientation: landscape) {
133 | html{
134 | width : 100vmax;
135 | height : 100vmin;
136 | }
137 | body{
138 | width : 100vmax;
139 | height : 100vmin;
140 | }
141 | #gyroContain{
142 | width : 100vmax;
143 | height : 100vmin;
144 | }
145 | }
146 |
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/static/js/controller.js:
--------------------------------------------------------------------------------
1 | function gettime() {
2 | $.ajax({
3 | url: "/time",
4 | timeout: 10000, // request timeout set to 10 seconds
5 | success: function(data) {
6 | $("#tim").html(data)
7 | },
8 | error: function(xhr, type, errorThrown) {
9 |
10 | }
11 | });
12 | }
13 |
14 | function get_c1_data() {
15 | $.ajax({
16 | url: "/c1",
17 | success: function(data) {
18 | $(".num h1").eq(0).text(data.confirm);
19 | $(".num h1").eq(1).text(data.suspect);
20 | $(".num h1").eq(2).text(data.heal);
21 | $(".num h1").eq(3).text(data.dead);
22 | },
23 | error: function(xhr, type, errorThrown) {
24 |
25 | }
26 | })
27 | }
28 | function get_c2_data() {
29 | $.ajax({
30 | url:"/c2",
31 | success: function(data) {
32 | ec_center_option.series[0].data=data.data
33 | ec_center.setOption(ec_center_option)
34 | },
35 | error: function(xhr, type, errorThrown) {
36 |
37 | }
38 | })
39 | }
40 |
41 | function get_l1_data() {
42 | $.ajax({
43 | url:"/l1",
44 | success: function(data) {
45 | ec_left1_Option.xAxis[0].data=data.day
46 | ec_left1_Option.series[0].data=data.confirm
47 | ec_left1_Option.series[1].data=data.suspect
48 | ec_left1_Option.series[2].data=data.heal
49 | ec_left1_Option.series[3].data=data.dead
50 | ec_left1.setOption(ec_left1_Option)
51 | },
52 | error: function(xhr, type, errorThrown) {
53 |
54 | }
55 | })
56 | }
57 |
58 | function get_l2_data() {
59 | $.ajax({
60 | url:"/l2",
61 | success: function(data) {
62 | ec_left2_Option.xAxis[0].data=data.day
63 | ec_left2_Option.series[0].data=data.confirm_add
64 | ec_left2_Option.series[1].data=data.suspect_add
65 | ec_left2.setOption(ec_left2_Option)
66 | },
67 | error: function(xhr, type, errorThrown) {
68 |
69 | }
70 | })
71 | }
72 |
73 | function get_r1_data() {
74 | $.ajax({
75 | url: "/r1",
76 | success: function (data) {
77 | ec_right1_option.xAxis.data=data.city;
78 | ec_right1_option.series[0].data=data.confirm;
79 | ec_right1.setOption(ec_right1_option);
80 | }
81 | })
82 | }
83 | function get_r2_data() {
84 | $.ajax({
85 | url: "/r2",
86 | success: function (data) {
87 | ec_right2_option.series[0].data=data.kws;
88 | ec_right2.setOption(ec_right2_option);
89 | }
90 | })
91 | }
92 | gettime()
93 | get_c1_data()
94 | get_c2_data()
95 | get_l1_data()
96 | get_l2_data()
97 | get_r1_data()
98 | get_r2_data()
99 |
100 | setInterval(gettime,1000)
101 | setInterval(get_c1_data,1000*10)
102 | setInterval(get_c2_data,10000*10)
103 | setInterval(get_l1_data,10000*10)
104 | setInterval(get_l2_data,10000*10)
105 | setInterval(get_r1_data,10000*10)
106 | setInterval(get_r2_data,10000*10)
107 |
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/static/js/ec_center.js:
--------------------------------------------------------------------------------
1 | var ec_center = echarts.init(document.getElementById('c2'), "dark");
2 |
3 | var mydata = [{'name': '上海', 'value': 318}, {'name': '云南', 'value': 162}]
4 |
5 | var ec_center_option = {
6 | title: {
7 | text: '',
8 | subtext: '',
9 | x: 'left'
10 | },
11 | tooltip: {
12 | trigger: 'item'
13 | },
14 | //左侧小导航图标
15 | visualMap: {
16 | show: true,
17 | x: 'left',
18 | y: 'bottom',
19 | textStyle: {
20 | fontSize: 8,
21 | },
22 | splitList: [{ start: 1,end: 9 },
23 | {start: 10, end: 99 },
24 | { start: 100, end: 999 },
25 | { start: 1000, end: 9999 },
26 | { start: 10000 }],
27 | color: ['#8A3310', '#C64918', '#E55B25', '#F2AD92', '#F9DCD1']
28 | },
29 | //配置属性
30 | series: [{
31 | name: '累计确诊人数',
32 | type: 'map',
33 | mapType: 'china',
34 | roam: false, //拖动和缩放
35 | itemStyle: {
36 | normal: {
37 | borderWidth: .5, //区域边框宽度
38 | borderColor: '#009fe8', //区域边框颜色
39 | areaColor: "#ffefd5", //区域颜色
40 | },
41 | emphasis: { //鼠标滑过地图高亮的相关设置
42 | borderWidth: .5,
43 | borderColor: '#4b0082',
44 | areaColor: "#fff",
45 | }
46 | },
47 | label: {
48 | normal: {
49 | show: true, //省份名称
50 | fontSize: 8,
51 | },
52 | emphasis: {
53 | show: true,
54 | fontSize: 8,
55 | }
56 | },
57 | data:[] //mydata //数据
58 | }]
59 | };
60 | ec_center.setOption(ec_center_option)
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/static/js/ec_left1.js:
--------------------------------------------------------------------------------
1 | var ec_left1 = echarts.init(document.getElementById('l1'), "dark");
2 |
3 | var ec_left1_Option = {
4 | //标题样式
5 | title: {
6 | text: "全国累计趋势",
7 | textStyle: {
8 | // color: 'white',
9 | },
10 | left: 'left',
11 | },
12 | tooltip: {
13 | trigger: 'axis',
14 | //指示器
15 | axisPointer: {
16 | type: 'line',
17 | lineStyle: {
18 | color: '#7171C6'
19 | }
20 | },
21 | },
22 | legend: {
23 | data: ['累计确诊', '现有疑似', "累计治愈", "累计死亡"],
24 | left: "right"
25 | },
26 |
27 | //图形位置
28 | grid: {
29 | left: '4%',
30 | right: '6%',
31 | bottom: '4%',
32 | top: 50,
33 | containLabel: true
34 | },
35 | xAxis: [{
36 | type: 'category',
37 | //x轴坐标点开始与结束点位置都不在最边缘
38 | // boundaryGap : true,
39 | data: []//['01.20', '01.21', '01.22']
40 | }],
41 | yAxis: [{
42 | type: 'value',
43 | //y轴字体设置
44 | axisLabel: {
45 | show: true,
46 | color: 'white',
47 | fontSize: 12,
48 | formatter: function(value) {
49 | if (value >= 1000) {
50 | value = value / 1000 + 'k';
51 | }
52 | return value;
53 | }
54 | },
55 | //y轴线设置显示
56 | axisLine: {
57 | show: true
58 | },
59 | //与x轴平行的线样式
60 | splitLine: {
61 | show: true,
62 | lineStyle: {
63 | color: '#17273B',
64 | width: 1,
65 | type: 'solid',
66 | }
67 | }
68 | }],
69 | series: [{
70 | name: "累计确诊",
71 | type: 'line',
72 | smooth: true,
73 | data: []//[260, 406, 529]
74 | }, {
75 | name: "现有疑似",
76 | type: 'line',
77 | smooth: true,
78 | data: []//[54, 37, 3935]
79 | },
80 | {
81 | name: "累计治愈",
82 | type: 'line',
83 | smooth: true,
84 | data: []//[25, 25, 25]
85 | }, {
86 | name: "累计死亡",
87 | type: 'line',
88 | smooth: true,
89 | data: []//[6, 9, 17]
90 | }]
91 | };
92 |
93 | ec_left1.setOption(ec_left1_Option)
94 |
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/static/js/ec_left2.js:
--------------------------------------------------------------------------------
1 | var ec_left2 = echarts.init(document.getElementById('l2'), "dark");
2 | var ec_left2_Option = {
3 | tooltip: {
4 | trigger: 'axis',
5 | //指示器
6 | axisPointer: {
7 | type: 'line',
8 | lineStyle: {
9 | color: '#7171C6'
10 | }
11 | },
12 | },
13 | legend: {
14 | data: ['新增确诊', '新增疑似'],
15 | left: "right"
16 | },
17 | //标题样式
18 | title: {
19 | text: "全国新增趋势",
20 | textStyle: {
21 | color: 'white',
22 | },
23 | left: 'left'
24 | },
25 | //图形位置
26 | grid: {
27 | left: '4%',
28 | right: '6%',
29 | bottom: '4%',
30 | top: 50,
31 | containLabel: true
32 | },
33 | xAxis: [{
34 | type: 'category',
35 | //x轴坐标点开始与结束点位置都不在最边缘
36 | // boundaryGap : true,
37 |
38 | data: []
39 | }],
40 | yAxis: [{
41 | type: 'value',
42 | //y轴字体设置
43 |
44 | //y轴线设置显示
45 | axisLine: {
46 | show: true
47 | },
48 | axisLabel: {
49 | show: true,
50 | color: 'white',
51 | fontSize: 12,
52 | formatter: function(value) {
53 | if (value >= 1000) {
54 | value = value / 1000 + 'k';
55 | }
56 | return value;
57 | }
58 | },
59 | //与x轴平行的线样式
60 | splitLine: {
61 | show: true,
62 | lineStyle: {
63 | color: '#17273B',
64 | width: 1,
65 | type: 'solid',
66 | }
67 | }
68 | }],
69 | series: [{
70 | name: "新增确诊",
71 | type: 'line',
72 | smooth: true,
73 | data: []
74 | }, {
75 | name: "新增疑似",
76 | type: 'line',
77 | smooth: true,
78 | data: []
79 | }]
80 | };
81 |
82 | ec_left2.setOption(ec_left2_Option)
83 |
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/static/js/ec_right1.js:
--------------------------------------------------------------------------------
1 | var ec_right1 = echarts.init(document.getElementById('r1'),"dark");
2 | var ec_right1_option = {
3 | //标题样式
4 | title : {
5 | text : "非湖北地区城市确诊TOP5",
6 | textStyle : {
7 | color : 'white',
8 | },
9 | left : 'left'
10 | },
11 | color: ['#3398DB'],
12 | tooltip: {
13 | trigger: 'axis',
14 | axisPointer: { // 坐标轴指示器,坐标轴触发有效
15 | type: 'shadow' // 默认为直线,可选为:'line' | 'shadow'
16 | }
17 | },
18 | xAxis: {
19 | type: 'category',
20 | data: []
21 | },
22 | yAxis: {
23 | type: 'value'
24 | },
25 | series: [{
26 | data: [],
27 | type: 'bar',
28 | barMaxWidth:"50%"
29 | }]
30 | };
31 | ec_right1.setOption(ec_right1_option)
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/static/js/ec_right2.js:
--------------------------------------------------------------------------------
1 | var ec_right2 = echarts.init(document.getElementById('r2'), "dark");
2 |
3 | var ddd = [{'name': '肺炎', 'value': '12734670'}, {'name': '实时', 'value': '12734670'},
4 | {'name': '新型', 'value': '12734670'}]
5 | var ec_right2_option = {
6 | // backgroundColor: '#515151',
7 | title : {
8 | text : "今日疫情热搜",
9 | textStyle : {
10 | color : 'white',
11 | },
12 | left : 'left'
13 | },
14 | tooltip: {
15 | show: false
16 | },
17 | series: [{
18 | type: 'wordCloud',
19 | // drawOutOfBound:true,
20 | gridSize: 1,
21 | sizeRange: [12, 55],
22 | rotationRange: [-45, 0, 45, 90],
23 | // maskImage: maskImage,
24 | textStyle: {
25 | normal: {
26 | color: function () {
27 | return 'rgb(' +
28 | Math.round(Math.random() * 255) +
29 | ', ' + Math.round(Math.random() * 255) +
30 | ', ' + Math.round(Math.random() * 255) + ')'
31 | }
32 | }
33 | },
34 | // left: 'center',
35 | // top: 'center',
36 | // // width: '96%',
37 | // // height: '100%',
38 | right: null,
39 | bottom: null,
40 | // width: 300,
41 | // height: 200,
42 | // top: 20,
43 | data: []
44 | }]
45 | }
46 |
47 | ec_right2.setOption(ec_right2_option);
48 |
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | my page
6 |
7 |
8 |
9 | 疫情追踪
10 | 实时报道
11 |
12 |
26 |
27 |
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/templates/main.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | 疫情监控
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
全国疫情实时追踪
15 |
16 |
我是左1
17 |
我是左2
18 |
19 |
20 |
21 |
22 |
23 |
累计确诊
24 |
剩余疑似
25 |
累计治愈
26 |
累计死亡
27 |
28 |
我是中2
29 |
我是右1
30 |
我是右2
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/templates/test.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/Visualization/Python疫情监控/项目源码/Cov/utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | import pymysql
3 |
4 | def get_time():
5 | time_str = time.strftime("%Y{}%m{}%d{} %X")
6 | return time_str.format("年","月","日")
7 |
8 | def get_conn():
9 | """
10 | :return: connection, cursor
11 | """
12 | # create the connection
13 | conn = pymysql.connect(host="127.0.0.1",
14 | user="root",
15 | password="",
16 | db="cov",
17 | charset="utf8")
18 | # create the cursor
19 | cursor = conn.cursor() # result sets are returned as tuples by default
20 | return conn, cursor
21 |
22 | def close_conn(conn, cursor):
23 | cursor.close()
24 | conn.close()
25 |
26 | def query(sql,*args):
27 | """
28 | Generic query helper
29 | :param sql:
30 | :param args:
31 | :return: query result as a tuple of tuples: ((), (), ...)
32 | """
33 | conn, cursor = get_conn()
34 | cursor.execute(sql,args)
35 | res = cursor.fetchall()
36 | close_conn(conn, cursor)
37 | return res
38 |
39 | def get_c1_data():
40 | """
41 | :return: data for the dashboard div id=c1
42 | """
43 | # data is updated repeatedly, so take the batch with the latest timestamp
44 | sql = "select sum(confirm)," \
45 | "(select suspect from history order by ds desc limit 1)," \
46 | "sum(heal)," \
47 | "sum(dead) " \
48 | "from details " \
49 | "where update_time=(select update_time from details order by update_time desc limit 1) "
50 | res = query(sql)
51 | return res[0]
52 |
53 | def get_c2_data():
54 | """
55 | :return: per-province data
56 | """
57 | # data is updated repeatedly, so take the batch with the latest timestamp
58 | sql = "select province,sum(confirm) from details " \
59 | "where update_time=(select update_time from details " \
60 | "order by update_time desc limit 1) " \
61 | "group by province"
62 | res = query(sql)
63 | return res
64 |
65 | def get_l1_data():
66 |
67 | sql = "select ds,confirm,suspect,heal,dead from history"
68 | res = query(sql)
69 | return res
70 |
71 | def get_l2_data():
72 |
73 | sql = "select ds,confirm_add,suspect_add from history"
74 | res = query(sql)
75 | return res
76 |
77 | def get_r1_data():
78 | """
79 | :return: top 5 cities by confirmed cases outside Hubei
80 | """
81 | sql = 'SELECT city,confirm FROM ' \
82 | '(select city,confirm from details ' \
83 | 'where update_time=(select update_time from details order by update_time desc limit 1) ' \
84 | 'and province not in ("湖北","北京","上海","天津","重庆") ' \
85 | 'union all ' \
86 | 'select province as city,sum(confirm) as confirm from details ' \
87 | 'where update_time=(select update_time from details order by update_time desc limit 1) ' \
88 | 'and province in ("北京","上海","天津","重庆") group by province) as a ' \
89 | 'ORDER BY confirm DESC LIMIT 5'
90 | res = query(sql)
91 | return res
92 |
93 | def get_r2_data():
94 | """
95 | :return: the 20 most recent hot searches
96 | """
97 | sql = 'select content from hotsearch order by id desc limit 20'
98 | res = query(sql) # format: (('民警抗疫一线奋战16天牺牲1037364',), ('四川再派两批医疗队1537382',), ...)
99 | return res
100 |
101 | if __name__ == "__main__":
102 | print(get_r2_data())
--------------------------------------------------------------------------------
/Visualization/maoyanMovies_comments/analysis.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # author:Ryan time:2018/11/20
4 |
5 | from pyecharts import Style
6 | from pyecharts import Geo
7 | import json
8 | from pyecharts import Bar
9 | from collections import Counter
10 |
11 |
12 | # data visualization
13 |
14 | def gender():
15 | cities = []
16 | with open('E:/spiderproject/maoyanMovies_comments/comments.txt','r',encoding='utf-8')as f:
17 | rows = f.readlines()
18 | try:
19 | for row in rows:
20 | city = row.split(',')[1]
21 | if city != '':
22 | cities.append(city)
23 | #print(city)
24 | except Exception as e:
25 | print(e)
26 |
27 | handle(cities)
28 | data = Counter(cities).most_common()
29 | style = Style(
30 | title_color='#fff',
31 | title_pos='center',
32 | width=1200,
33 | height=600,
34 | background_color='#404a59'
35 | )
36 | geo = Geo('《毒液》观众位置分布', '数据来源:猫眼-Ryan采集', **style.init_style)
37 | attr, value = geo.cast(data)
38 | geo.add('', attr, value, visual_range=[0, 1000],
39 | visual_text_color='#fff', symbol_size=15,
40 | is_visualmap=True, is_piecewise=False, visual_split_number=10)
41 | geo.render('观众位置分布-地理坐标图.html')
42 |
43 | data_top20 = Counter(cities).most_common(20)
44 | bar = Bar('《毒液》观众来源排行TOP20', '数据来源:猫眼-Ryan采集', title_pos='center', width=1200, height=600)
45 | attr, value = bar.cast(data_top20)
46 | bar.add('', attr, value, is_visualmap=True, visual_range=[0, 3500], visual_text_color='#fff', is_more_utils=True,
47 | is_label_show=True)
48 | bar.render('观众来源排行-柱状图.html')
49 |
50 | def handle(cities):
51 | # print(len(cities), len(set(cities)))
52 |
53 | # load every place name from the coordinates file
54 | data = None
55 | with open('C:/Users/purple.guo/AppData/Local/Continuum/anaconda3/Lib/site-packages/pyecharts/datasets/city_coordinates.json',
56 | mode='r', encoding='utf-8') as f:
57 | data = json.loads(f.read()) # parse the str into JSON
58 |
59 | # loop over the cities and normalize them
60 | data_new = data.copy() # copy of all place-name data
61 | for city in set(cities): # de-duplicate with set
62 | # drop records whose city name is empty
63 | if city == '':
64 | while city in cities:
65 | cities.remove(city)
66 | count = 0
67 | for k in data.keys():
68 | count += 1
69 | if k == city:
70 | break
71 | if k.startswith(city): # handle abbreviated names, e.g. 达州市 shortened to 达州
72 | # print(k, city)
73 | data_new[city] = data[k]
74 | break
75 | if k.startswith(city[0:-1]) and len(city) >= 3: # handle renamed administrative divisions, e.g. a county upgraded to a district or city
76 | data_new[city] = data[k]
77 | break
78 | # drop place names that are not in the coordinates file
79 | if count == len(data):
80 | while city in cities:
81 | cities.remove(city)
82 |
83 | # print(len(data), len(data_new))
84 |
85 | # write back, overwriting the coordinates file
86 | with open(
87 | 'C:/Users/purple.guo/AppData/Local/Continuum/anaconda3/Lib/site-packages/pyecharts/datasets/city_coordinates.json',
88 | mode='w', encoding='utf-8') as f:
89 | f.write(json.dumps(data_new, ensure_ascii=False))
90 |
91 | if __name__ == '__main__':
92 | gender()
--------------------------------------------------------------------------------
/Visualization/maoyanMovies_comments/movieswd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # author:Ryan time:2018/11/20
4 |
5 | import jieba
6 | import matplotlib.pyplot as plt
7 | from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
8 | # from scipy.misc import imread  # unused; scipy.misc.imread was removed in SciPy 1.2+
9 |
10 |
11 | comments = []
12 | with open('files/comments.txt', 'r', encoding='utf-8')as f:
13 | rows = f.readlines()
14 | try:
15 | for row in rows:
16 | comment = row.split(',')[2]
17 | if comment != '':
18 | comments.append(comment)
19 | # print(city)
20 | except Exception as e:
21 | print(e)
22 | comment_after_split = jieba.cut(str(comments), cut_all=False)
23 | words = ' '.join(comment_after_split)
24 | # filter out meaningless stopwords
25 | stopwords = STOPWORDS.copy()
26 | # stopwords.add('电影')
27 | # stopwords.add('一部')
28 | # stopwords.add('一个')
29 | # stopwords.add('没有')
30 | # stopwords.add('什么')
31 | # stopwords.add('有点')
32 | # stopwords.add('感觉')
33 | # stopwords.add('海王')
34 | # stopwords.add('就是')
35 | # stopwords.add('觉得')
36 |
37 |
38 | bg_image = plt.imread('venmo1.jpg')
39 | wc = WordCloud(width=1900, height=1080, background_color='white', mask=bg_image, font_path='STKAITI.TTF',
40 | stopwords=stopwords, max_font_size=400, random_state=50)
41 | wc.generate_from_text(words)
42 | plt.imshow(wc)
43 | plt.axis('off')
44 | plt.show()
45 |
46 | wc.to_file('网易云热评词云图.jpg')
--------------------------------------------------------------------------------
/Visualization/maoyanMovies_comments/t1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/maoyanMovies_comments/t1.jpg
--------------------------------------------------------------------------------
/Visualization/maoyanMovies_comments/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # author:Ryan time:2018/11/20
4 |
5 | import requests
6 | import json
7 | import random
8 | import time
9 | from datetime import datetime
10 | from datetime import timedelta
11 |
12 | def get_data(url):
13 | headers = {
14 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'}
15 | html = requests.get(url, headers=headers)
16 | if html.status_code ==200:
17 | return html.content
18 | else:
19 | return None
20 |
21 | def parse_data(html):
22 | json_data = json.loads(html)['cmts']
23 | comments = []
24 | try:
25 | for item in json_data:
26 | comment = {
27 | 'nickName': item['nickName'],
28 | 'cityName': item['cityName'] if 'cityName' in item else '',
29 | 'content': item['content'].strip().replace('\n', ''),
30 | 'score': item['score'],
31 | 'startTime': item['startTime']
32 | }
33 | comments.append(comment)
34 | return comments
35 | except Exception as e:
36 | print(e)
37 |
38 | def save():
39 | start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
40 | end_time = '2019-2-05 00:00:00'
41 | while start_time > end_time:
42 | url = 'http://m.maoyan.com/mmdb/comments/movie/248906.json?_v_=yes&offset=15&startTime=' + start_time.replace(
43 | ' ', '%20')
44 | html = None
45 | try:
46 | html = get_data(url)
47 | except Exception as e:
48 | time.sleep(0.5)
49 | html = get_data(url)
50 | else:
51 | time.sleep(0.1)
52 | comments =parse_data(html)
53 | start_time = comments[14]['startTime']
54 | print(start_time)
55 | start_time = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') + timedelta(seconds=-1)
56 | start_time = datetime.strftime(start_time, '%Y-%m-%d %H:%M:%S')
57 | for item in comments:
58 | print(item)
59 | with open('E:/spiderproject/maoyanMovies_comments/comments.txt', 'a', encoding='utf-8')as f:
60 | f.write(item['nickName']+','+item['cityName'] +','+item['content']+','+str(item['score'])+','+item['startTime'] + '\n')
61 | if __name__ == '__main__':
62 | url = 'http://m.maoyan.com/mmdb/comments/movie/248906.json?_v_=yes&offset=15&startTime=2018-11-19%2019%3A36%3A43'
63 | html = get_data(url)
64 | results = parse_data(html)
65 | save()
--------------------------------------------------------------------------------
/Visualization/maoyanMovies_comments/venmo1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/maoyanMovies_comments/venmo1.jpg
--------------------------------------------------------------------------------
/Visualization/maoyanMovies_comments/venom.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/maoyanMovies_comments/venom.jpg
--------------------------------------------------------------------------------
/Visualization/maoyanMovies_comments/wd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # author:Ryan time:2018/11/20
4 |
5 | import jieba
6 | import matplotlib.pyplot as plt
7 | from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
8 | # from scipy.misc import imread  # unused; scipy.misc.imread was removed in SciPy 1.2+
9 |
10 |
11 | comments = []
12 | with open('E:/spiderproject/maoyanMovies_comments/comments.txt', 'r', encoding='utf-8') as f:
13 | rows = f.readlines()
14 | try:
15 | for row in rows:
16 | comment = row.split(',')[2]
17 | if comment != '':
18 | comments.append(comment)
19 | # print(city)
20 | except Exception as e:
21 | print(e)
22 | comment_after_split = jieba.cut(str(comments), cut_all=False)
23 | words = ' '.join(comment_after_split)
24 | # filter out meaningless stopwords
25 | stopwords = STOPWORDS.copy()
26 | stopwords.add('电影')
27 | stopwords.add('一部')
28 | stopwords.add('一个')
29 | stopwords.add('没有')
30 | stopwords.add('什么')
31 | stopwords.add('有点')
32 | stopwords.add('感觉')
33 | stopwords.add('毒液')
34 | stopwords.add('就是')
35 | stopwords.add('觉得')
36 |
37 |
38 | bg_image = plt.imread('venmo1.jpg')
39 | wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_image, font_path='STKAITI.TTF',
40 | stopwords=stopwords, max_font_size=400, random_state=50)
41 | wc.generate_from_text(words)
42 | plt.imshow(wc)
43 | plt.axis('off')
44 | plt.show()
45 |
46 | wc.to_file('词云图.jpg')
--------------------------------------------------------------------------------
/pythonScript/OlympicGamesGoldenNotify/__pycache__/mail.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/OlympicGamesGoldenNotify/__pycache__/mail.cpython-37.pyc
--------------------------------------------------------------------------------
/pythonScript/OlympicGamesGoldenNotify/__pycache__/medals.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/OlympicGamesGoldenNotify/__pycache__/medals.cpython-37.pyc
--------------------------------------------------------------------------------
/pythonScript/OlympicGamesGoldenNotify/index.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import logging
3 | from mail import sendmail
4 |
5 | # To enable the initializer feature (https://help.aliyun.com/document_detail/158208.html)
6 | # please implement the initializer function as below:
7 | # def initializer(context):
8 | # logger = logging.getLogger()
9 | # logger.info('initializing')
10 |
11 | def handler(event, context):
12 | sendmail()
--------------------------------------------------------------------------------
/pythonScript/OlympicGamesGoldenNotify/mail.py:
--------------------------------------------------------------------------------
1 | import smtplib
2 | from email.mime.text import MIMEText
3 | from email.header import Header
4 | from medals import getWinners, getRanking
5 |
6 | def sendmail():
7 | sender = 'cgyung@qq.com' # sending address
8 | senderName = "潜龙于野" # sender display name
9 | password = 'qktwjlvxlyrwcagi' # QQ mail SMTP authorization code
10 | receivers = ['admin@yinuxy.com'] # recipient list
11 |
12 | # MIMEText args: body text, 'plain' for the subtype, 'utf-8' for the charset
13 | content = getRanking() + getWinners()
14 | message = MIMEText(content, 'plain', 'utf-8')
15 | message['From'] = Header(senderName, 'utf-8') # sender display name
16 |
17 | # subject
18 | subject = '东京奥运会金牌排行榜及获奖人员'
19 | message['Subject'] = Header(subject, 'utf-8')
20 |
21 | try:
22 | client = smtplib.SMTP_SSL('smtp.qq.com', smtplib.SMTP_SSL_PORT)
23 | print("连接到邮件服务器成功")
24 |
25 | client.login(sender, password)
26 | print("登录成功")
27 |
28 | client.sendmail(sender, receivers, message.as_string())
29 | print("邮件发送成功")
30 | except smtplib.SMTPException:
31 | print("Error: 无法发送邮件")
32 |
33 | if __name__ == '__main__':
34 | sendmail()
--------------------------------------------------------------------------------
/pythonScript/OlympicGamesGoldenNotify/medals.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 |
4 | def getMedalsList(url, replaceTxt):
5 | try:
6 | r = requests.get(url)
7 | r.raise_for_status()
8 | r.encoding = r.apparent_encoding
9 |
10 | # strip the extra JSONP wrapper
11 | data = str.replace(r.text, replaceTxt + "(", "")
12 | data = str.replace(data, ");", "")
13 | # decode the JSON into a dict
14 | medals = json.loads(data)
15 |
16 | return medals
17 |
18 | except:
19 | return "Failed!"
20 |
21 | # fetch the medal-ranking data
22 | def getRanking():
23 | url = "https://api.cntv.cn/olympic/getOlyMedals?serviceId=pcocean&itemcode=GEN-------------------------------&t=jsonp&cb=omedals1"
24 | medals = getMedalsList(url, "omedals1")
25 | # get the data list
26 | medalList = medals['data']['medalsList']
27 | res = ""
28 | for i in range(5):
29 | res += "第" + medalList[i]["rank"] + "名:" + medalList[i]["countryname"] + "(" + medalList[i]["countryid"] + ")\n"
30 | res += "金牌/银牌/铜牌:" + medalList[i]["gold"] + "/" + medalList[i]["silver"] + "/" + medalList[i]["bronze"] + "\n\n"
31 | return res
32 |
33 | # medal winners for China
34 | def getWinners():
35 | url = "https://api.cntv.cn/Olympic/getOlyMedalList?t=jsonp&cb=OM&serviceId=pcocean&countryid=CHN"
36 | owners = getMedalsList(url, "OM")
37 | # get the data list
38 | ownerList = owners['data']['medalList']
39 | gold = "" # gold medals
40 | silver = "" # silver medals
41 | bronze = "" # bronze medals
42 | for owner in ownerList:
43 | medaltype = owner['medaltype'] # medal type
44 | startdatecn = owner['startdatecn'] # date (CN)
45 | item = owner['itemcodename'] + " " + owner['subitemname'] # event
46 | playname = owner['playname'] # athlete
47 | if medaltype == "ME_GOLD":
48 | gold += "日期:" + startdatecn + "\n项目:" + item + "\n获得者:" + playname+"\n\n"
49 | elif medaltype == "ME_SILVER":
50 | silver += "日期:" + startdatecn + "\n项目:" + item + "\n获得者:" + playname+"\n\n"
51 | elif medaltype == "ME_BRONZE":
52 | bronze += "日期:" + startdatecn + "\n项目:" + item + "\n获得者:" + playname+"\n\n"
53 |
54 | res = "\n-------金牌:---------\n" + gold+"\n-------银牌:---------\n" + silver+"\n-------铜牌:---------\n"+ bronze
55 | return res
56 |
57 | if __name__ == '__main__':
58 | print(getRanking())
59 | print(getWinners())
--------------------------------------------------------------------------------
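getMedalsList unwraps the CNTV JSONP responses by replacing the literal callback prefix ("omedals1(" or "OM(") and the trailing ");" before calling json.loads. As a purely illustrative alternative (not part of the original script), the same unwrapping can be done with a regex that tolerates any callback name and surrounding whitespace:

```python
# Hypothetical generalisation of the string replacement in getMedalsList:
# strip an arbitrary JSONP wrapper "callback( ... );" and parse the JSON body.
import json
import re

def strip_jsonp(text: str) -> dict:
    match = re.fullmatch(r"\s*\w+\((.*)\)\s*;?\s*", text, flags=re.S)
    payload = match.group(1) if match else text  # fall back to the raw text
    return json.loads(payload)

# Usage with the ranking endpoint above would look like:
#   medals = strip_jsonp(requests.get(url).text)
#   medalList = medals["data"]["medalsList"]
```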
/pythonScript/WordCloud/Image-coloredwordcloud.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python 3.6.1
2 | # -*- coding:utf-8 -*-
3 | # ____author___='Yinux'
4 | """
5 | Image-colored wordcloud
6 | =======================
7 | ImageColorGenerator lets you color the word cloud based on an image: each word takes the average color of the region it occupies in the source image.
8 |
9 | """
10 |
11 | from os import path
12 | from PIL import Image
13 | import numpy as np
14 | import matplotlib.pyplot as plt
15 |
16 | from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
17 |
18 | d = path.dirname(__file__)
19 |
20 | # read the whole text
21 | text = open(path.join(d, 'comments.txt'), encoding='utf-8').read()
22 |
23 | # read the mask/color image (downloaded from http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010)
24 | alice_coloring = np.array(Image.open(path.join(d, "alice_color.png")))
25 | stopwords = set(STOPWORDS)
26 | stopwords.add("said")
27 |
28 | wc = WordCloud(background_color="white", max_words=2000, mask=alice_coloring,
29 | stopwords=stopwords, max_font_size=40, random_state=42)
30 | # generate the word cloud
31 | wc.generate(text)
32 |
33 | # build the coloring from the image
34 | image_colors = ImageColorGenerator(alice_coloring)
35 |
36 | # display
37 | plt.imshow(wc, interpolation="bilinear")
38 | plt.axis("off") # hide the axes
39 | plt.figure()
40 | # recolor the word cloud and show it again
41 | # we could also pass color_func=image_colors directly to the constructor
42 | plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
43 | plt.axis("off") # hide the axes
44 | plt.figure()
45 | plt.imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")
46 | plt.axis("off") # hide the axes
47 | plt.show() # draw all three figures at once
48 |
--------------------------------------------------------------------------------
/pythonScript/WordCloud/Maskedwordcloud.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python 3.6.1
2 | # -*- coding:utf-8 -*-
3 | # ____author___='Yinux'
4 | """
5 | Masked wordcloud
6 | ================
7 |
8 | A mask image can be used to generate a word cloud of any shape.
9 | """
10 | from os import path
11 | from PIL import Image
12 | import numpy as np
13 | import matplotlib.pyplot as plt
14 |
15 | from wordcloud import WordCloud, STOPWORDS
16 |
17 | d = path.dirname(__file__)
18 |
19 | # read the whole text.
20 | text = open(path.join(d, 'comments.txt'), encoding='utf-8').read()
21 |
22 | # read the mask image (source: http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg)
23 | alice_mask = np.array(Image.open(path.join(d, "alice_color.png")))
24 |
25 | stopwords = set(STOPWORDS)
26 | stopwords.add("said")
27 | # configure the word cloud
28 | wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask,
29 | stopwords=stopwords)
30 | # generate the word cloud
31 | wc.generate(text)
32 |
33 | # save a copy to disk
34 | wc.to_file(path.join(d, "alice.png"))
35 |
36 | # display
37 | plt.imshow(wc, interpolation='bilinear')
38 | plt.axis("off")
39 | plt.figure()
40 | plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear')
41 | plt.axis("off")
42 | plt.show()
43 |
--------------------------------------------------------------------------------
/pythonScript/WordCloud/alice_color.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/WordCloud/alice_color.png
--------------------------------------------------------------------------------
/pythonScript/WordCloud/coloredWd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python 3.6.1
2 | # -*- coding:utf-8 -*-
3 | # ____author___='Yinux'
4 | """
5 | Image-colored wordcloud
6 | =======================
7 | ImageColorGenerator lets you color the word cloud based on an image: each word takes the average color of the region it occupies in the source image.
8 | """
9 | from os import path
10 | from PIL import Image
11 | import numpy as np
12 | import matplotlib.pyplot as plt
13 |
14 | from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
15 |
16 | d = path.dirname(__file__)
17 |
18 | # read the whole text
19 | text = open(path.join(d, 'comments.txt'), encoding='utf-8').read()
20 |
21 | # read the mask/color image (downloaded from http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010)
22 | alice_coloring = np.array(Image.open(path.join(d, "alice_color.png")))
23 | stopwords = set(STOPWORDS)
24 | stopwords.add("said")
25 |
26 | wc = WordCloud(background_color="white", max_words=2000, mask=alice_coloring,
27 | stopwords=stopwords, max_font_size=40, random_state=42)
28 | # generate the word cloud
29 | wc.generate(text)
30 |
31 | # build the coloring from the image
32 | image_colors = ImageColorGenerator(alice_coloring)
33 |
34 | # display
35 | plt.imshow(wc, interpolation="bilinear")
36 | plt.axis("off") # hide the axes
37 | plt.figure()
38 | # recolor the word cloud and show it again
39 | # we could also pass color_func=image_colors directly to the constructor
40 | plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
41 | plt.axis("off") # hide the axes
42 | plt.figure()
43 | plt.imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")
44 | plt.axis("off") # hide the axes
45 | plt.show() # draw all three figures at once
--------------------------------------------------------------------------------
/pythonScript/WordCloud/comments.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/WordCloud/comments.txt
--------------------------------------------------------------------------------
/pythonScript/autoVote/autoVote.py:
--------------------------------------------------------------------------------
1 | import time
2 | import random
3 | import requests
4 | import getCookie
5 | from lxml import etree
6 |
7 | def getSkey(cookie):  # extract the 'skey' value from the raw cookie string
8 | arrcookie = cookie.split("; ")
9 | for i in range(len(arrcookie)):
10 | arr = arrcookie[i].split("=")
11 | if(arr[0] == 'skey'):
12 | print(arr[1])
13 | return arr[1]
14 |
15 | def getGTK(cookie):  # derive the csrfCode from the skey cookie value
16 | skey = getSkey(cookie)
17 | hash = 5381
18 | for i in range(len(skey)):
19 | hash = hash + (hash << 5) + int(ord(skey[i]))
20 | return (hash & 0x7fffffff)
21 |
22 | def dailyTaskAutoCommit(header_dict, vote_url, comment_url, signInurl):
23 | base_url = "https://cloud.tencent.com/developer/ask?q=timeline"
24 | header = {
25 | "Accept": "application/json, text/plain, */*",
26 | "Accept-Encoding": "gzip, deflate, br",
27 | "Accept-Language": "zh-CN,zh;q=0.9",
28 | "Connection": "keep-alive",
29 | "Host": "cloud.tencent.com",
30 | "Referer": "https://cloud.tencent.com/developer/ask",
31 | "Sec-Fetch-Mode": "cors",
32 | "Sec-Fetch-Site": "same-origin",
33 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
34 | }
35 | r1 = requests.get(url=base_url, headers=header)
36 | r1.encoding = r1.apparent_encoding
37 | html = etree.HTML(r1.content)
38 | id = {}
39 | data = html.xpath('//div/div[2]/div[3]/a/@href')
40 | for item in data:
41 | idstr = str(item).replace("/developer/ask/","")
42 | idlist = idstr.split('/answer/')
43 | if(len(idlist)>1):
44 | id[idlist[0]] = idlist[1]
45 | votepayloadList = []
46 | commentpayloadList = []
47 | for key,value in id.items():
48 | votepayload = "{\r\n \"action\": \"VoteAnswer\",\r\n \"payload\": {\r\n \"questionId\": %s,\r\n \"answerId\": %s,\r\n \"vote\": 1\r\n }\r\n}"%(key, value)
49 | commentpayload = "{\r\n \"action\": \"CommentAnswer\",\r\n \"payload\": {\r\n \"questionId\": %s,\r\n \"answerId\": %s,\r\n \"content\": \"%s\"\r\n }\r\n}"%(key, value, '666')
50 | votepayloadList.append(votepayload)
51 | commentpayloadList.append(commentpayload)
52 | index = random.sample(range(len(votepayloadList)), min(5, len(votepayloadList)))  # sample within the parsed list to avoid an IndexError
53 | index.sort()
54 | for i in index:
55 | r1 = requests.request("POST", vote_url, headers=header_dict, data=votepayloadList[i])
56 | print("第{}篇文章已点赞,返回代码:".format(i),r1.text)
57 | time.sleep(random.randint(5,10))
58 | # r2 = requests.request("POST", comment_url, headers=header_dict, data=commentpayloadList[i])
59 | # print("第{}篇文章已评论,返回代码:".format(i),r2.text)
60 | # time.sleep(random.randint(5,10))
61 |
62 | def getComment():
63 | commentsList = [
64 | "专业的回答,感谢分享",
65 | "不错不错",
66 | "大佬讲的太好啦,受益匪浅",
67 | "学习了,感谢分享经验",
68 | "太强了",
69 | "厉害哦",
70 | "不错啊",
71 | "很好",
72 | "学到了",
73 | "谢谢分享,学习了",
74 | "专业的回答",
75 | "666",
76 | "yyds",
77 | "11111111"
78 | ]
79 | return random.choice(commentsList)
80 |
81 |
82 | if __name__ == '__main__':
83 | getCookie.init()
84 | getCookie.login()
85 | time.sleep(10)
86 | with open("cookie.txt", "r", encoding="utf-8") as f:
87 | cookie = f.read()
88 | csrfCode = getGTK(cookie)
89 | signInurl = "https://cloud.tencent.com/developer/services/ajax/grocery-stall?action=SignIn&uin=100004697298&csrfCode=%s"%(csrfCode)
90 | vote_url = "https://cloud.tencent.com/developer/services/ajax/ask/answer?action=VoteAnswer&uin=100004697298&csrfCode=%s"%(csrfCode)
91 | comment_url = "https://cloud.tencent.com/developer/services/ajax/ask/answer?action=CommentAnswer&uin=100004697298&csrfCode=%s"%(csrfCode)
92 | header_dict = {
93 | 'accept': 'application/json, text/plain, */*',
94 | 'accept-encoding': 'gzip, deflate, br',
95 | 'accept-language': 'zh-CN,zh;q=0.9',
96 | 'content-type': 'application/json;charset=UTF-8',
97 | 'cookie': f"{cookie}",
98 | 'origin': 'https://cloud.tencent.com',
99 | 'referer': 'https://cloud.tencent.com/developer/ask',
100 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
101 | }
102 | dailyTaskAutoCommit(header_dict, vote_url, comment_url, signInurl)
103 | # print(getSkey(cookie))
104 |
105 |
106 |
--------------------------------------------------------------------------------
/pythonScript/autoVote/cookie.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/autoVote/cookie.txt
--------------------------------------------------------------------------------
/pythonScript/autoVote/getCookie.py:
--------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver.support.wait import WebDriverWait
4 |
5 |
6 |
7 | url = 'https://cloud.tencent.com/developer'
8 |
9 | # 初始化
10 | def init():
11 | # 定义为全局变量,方便其他模块使用
12 | global browser, wait
13 | # 实例化一个chrome浏览器
14 | option = webdriver.ChromeOptions()
15 | # option.add_argument("--user-data-dir=" + r"C:/Users/Administrator/AppData/Local/Google/Chrome/User Data")
16 | # proxy = get_ip()['HTTP']
17 | # option.add_argument("--proxy-server=http://54.255.66.81:80")
18 | option.add_experimental_option('excludeSwitches', ['enable-automation'])
19 | option.add_argument("--disable-blink-features=AutomationControlled")
20 | browser = webdriver.Chrome(chrome_options=option)
21 | # 最大化窗口
22 | browser.maximize_window()
23 | time.sleep(2)
24 | # 设置等待超时
25 | wait = WebDriverWait(browser, 20)
26 |
27 | # 登录
28 | def login():
29 | # 打开登录页面
30 | browser.get(url)
31 | # click the login entry, then choose QQ login
32 | browser.find_element_by_xpath('//*[@id="react-root"]/div[1]/div[1]/div/div[2]/div[2]/div[3]/a[1]').click()
33 | browser.find_element_by_class_name('clg-icon-qq').click()
34 | time.sleep(10)
35 |
36 | # 获取cookie
37 | get_cookies_js = "return document.cookie"
38 | cookie = browser.execute_script(get_cookies_js)
39 | print(cookie)
40 |
41 | with open("./cookie.txt", "w", encoding="utf-8") as f:
42 | f.write(cookie)
43 | # page_source = browser.page_source
44 | # with open("page.html","w",encoding="utf-8") as f:
45 | # f.write(page_source)
46 |
47 |
48 | if __name__ == '__main__':
49 | init()
50 | login()
51 |
--------------------------------------------------------------------------------
/pythonScript/birthdayNotify/__pycache__/lunar.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/birthdayNotify/__pycache__/lunar.cpython-37.pyc
--------------------------------------------------------------------------------
/pythonScript/birthdayNotify/birthday.json:
--------------------------------------------------------------------------------
1 | {
2 | "friend1":{
3 | "name":"name",
4 | "relationship":"relationship",
5 | "birthday":"1998-08-26",
6 | "isLunar": true
7 | },
8 | "friend2":{
9 | "name":"name",
10 | "relationship":"relationship",
11 | "birthday":"1999-07-14",
12 | "isLunar":false
13 | },
14 | "friend3":{
15 | "name":"name",
16 | "relationship":"relationship",
17 | "birthday":"1971-07-10",
18 | "isLunar":true
19 | },
20 | "friend4":{
21 | "name":"name",
22 | "relationship":"relationship",
23 | "birthday":"1972-01-23",
24 | "isLunar":true
25 | },
26 | "friend5":{
27 | "name":"name",
28 | "relationship":"relationship",
29 | "birthday":"1994-08-20",
30 | "isLunar":true
31 | },
32 | "friend6":{
33 | "name":"name",
34 | "relationship":"relationship",
35 | "birthday":"1999-06-10",
36 | "isLunar":false
37 | }
38 | }
--------------------------------------------------------------------------------
/pythonScript/birthdayNotify/birthdayNotify.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/birthdayNotify/birthdayNotify.zip
--------------------------------------------------------------------------------
/pythonScript/birthdayNotify/index.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import time
4 | import smtplib
5 | import datetime
6 | import requests
7 | from email.mime.text import MIMEText
8 | from email.header import Header
9 | from borax.calendars.lunardate import LunarDate
10 |
11 | # To enable the initializer feature (https://help.aliyun.com/document_detail/158208.html)
12 | # please implement the initializer function as below:
13 | # def initializer(context):
14 | # logger = logging.getLogger()
15 | # logger.info('initializing')
16 |
17 |
18 | def hitokoto():
19 | #指定 api 的接口地址并设定 url 参数
20 | api_url = 'https://v1.hitokoto.cn/?c=d&c=h&c=i&c=k&encode=json'
21 | #向网站 api 发送请求并获取返回的数据
22 | response = requests.get(api_url)
23 | #将 json 数据对象转化为字典
24 | res = json.loads(response.text)
25 | #取出一言正文和出处拼装为字符串
26 | a_word = res['hitokoto']+' _____'+'《'+res['from']+'》'
27 | #输出一言
28 | return a_word
29 |
30 | def ln_date_str(month, day):
31 | # 月份
32 | lm = '正二三四五六七八九十冬腊'
33 | # 日份
34 | ld = '初一初二初三初四初五初六初七初八初九初十十一十二十三十四十五十六十七十八十九二十廿一廿二廿三廿四廿五廿六廿七廿八廿九三十'
35 | return '{}月{}'.format(lm[month-1], ld[(day-1)*2:day*2])
36 |
37 | def sendmail(res,relationship, name):
38 | sender = 'cgyung@qq.com' # 发送邮箱
39 | senderName = "潜龙于野" # 发送者昵称
40 | password = 'qktwjlvxlyrwcagi' # 发送方QQ邮箱授权码
41 | receivers = ['admin@yinuxy.com'] # 接收邮件
42 |
43 | # 三个参数:第一个为文本内容,第二个 plain 设置文本格式,第三个 utf-8 设置编码
44 | # str = getRanking() + getWinners()
45 | message = MIMEText(res, 'plain', 'utf-8')
46 | message['From'] = Header(senderName, 'utf-8') # 发送者昵称
47 |
48 | # 主题
49 | subject = '您的{}{}快要过生日啦'.format(relationship, name)
50 | message['Subject'] = Header(subject, 'utf-8')
51 |
52 | try:
53 | client = smtplib.SMTP_SSL('smtp.qq.com', smtplib.SMTP_SSL_PORT)
54 | print("连接到邮件服务器成功")
55 |
56 | client.login(sender, password)
57 | print("登录成功")
58 |
59 | client.sendmail(sender, receivers, message.as_string())
60 | print("邮件发送成功")
61 | except smtplib.SMTPException:
62 | print("Error: 无法发送邮件")
63 |
64 | def birthdayNotify(path='./birthday.json'):
65 | data = {}
66 | with open(path,'r',encoding='utf8')as fp:
67 | data = json.load(fp)
68 | for value in data.values():
69 |
70 | birth = value['birthday']
71 | birth = datetime.datetime.strptime(birth, "%Y-%m-%d")
72 |
73 | birthyear = birth.year
74 | today = datetime.date.today()
75 |
76 | if value['isLunar']:
77 | thisbirth = LunarDate(today.year,birth.month,birth.day)
78 | solardate = thisbirth.to_solar_date()
79 |
80 | if (solardate-today).days < 0 :
81 | thisbirth = LunarDate(today.year+1,birth.month,birth.day)
82 | solardate = thisbirth.to_solar_date()
83 | age = thisbirth.year - birthyear + 1
84 |
85 | # res = "今天是公历 {} \n您的 {} {} 将于 {}年{}月{}日 过生日({}天后)\n(农历生日{})\n\n今天是他的第{}个生日,快去为他挑选一件合适的礼物吧~\n\n{}\n\n\n".format(today, value['relationship'], value['name'], solardate.year, solardate.month, solardate.day, (solardate-today).days,ln_date_str(birth.month,birth.day), age, hitokoto())
86 | # print(res)
87 | # sendmail(res,value['relationship'], value['name'])
88 |
89 | if (solardate-today).days<=7 and (solardate-today).days>=0:
90 | res = "今天是公历 {} \n您的{}{}将于{}年{}月{}日过生日({}天后)\n农历:{}\n\n今天是他的第{}个生日,快去为他挑选一件合适的礼物吧~\n\n{}\n\n\n".format(today, value['relationship'], value['name'], solardate.year, solardate.month, solardate.day, (solardate-today).days, ln_date_str(birth.month,birth.day), age, hitokoto())
91 | print(res)
92 | sendmail(res,value['relationship'], value['name'])
93 | else:
94 | thisbirth = datetime.date(today.year, birth.month, birth.day)  # solar birthday: use a plain date rather than LunarDate
95 | if (thisbirth-today).days < 0 :
96 | thisbirth = datetime.date(today.year+1, birth.month, birth.day)
97 | age = thisbirth.year - birthyear + 1
98 |
99 | # res = "今天是公历 {} \n您的 {} {} 将于 {}年{}月{}日 过生日({}天后)\n\n今天是他的第{}个生日,快去为他挑选一件合适的礼物吧~\n\n{}\n\n\n".format(today, value['relationship'], value['name'], thisbirth.year, thisbirth.month, thisbirth.day, (thisbirth-today).days, age, hitokoto())
100 | # print(res)
101 | # sendmail(str(res),value['relationship'], value['name'])
102 |
103 | if (thisbirth-today).days<=7 and (thisbirth-today).days>=0:
104 | res = "今天是公历 {} \n您的 {} {} 将于 {}年{}月{}日 过生日({}天后)\n\n今天是他的第{}个生日,快去为他挑选一件合适的礼物吧~\n\n{}\n\n\n".format(today, value['relationship'], value['name'], thisbirth.year, thisbirth.month, thisbirth.day, (thisbirth-today).days, age, hitokoto())
105 | print(res)
106 | # sendmail(res,value['relationship'], value['name'])
107 | time.sleep(5)
108 |
109 | def handler(event, context):
110 | birthdayNotify()
111 |
112 | if __name__ == '__main__':
113 | birthdayNotify()
--------------------------------------------------------------------------------
/pythonScript/birthdayNotify/text.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | from borax.calendars.lunardate import LunarDate
4 |
5 |
6 |
7 | today = datetime.date.today()
8 | print(type(today.year), type(today.month), type(today.day))
9 | thisday = LunarDate(today.year,today.month,today.day)
10 | print(thisday, type(thisday))
11 | solardate = thisday.to_solar_date()
12 | print(solardate, type(solardate))
13 |
14 |
15 | thisbirth = LunarDate(today.year,today.month,today.day)
16 | ssolardate = thisbirth.to_solar_date()
17 | print(ssolardate, type(ssolardate))
18 |
--------------------------------------------------------------------------------
/pythonScript/dingReminder/dingReminder.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Dec 3 16:46:04 2020
4 |
5 | @author: YINUXY
6 | """
7 |
8 |
9 | import dingtalkchatbot.chatbot as cb
10 | webhook = 'https://oapi.dingtalk.com/robot/send?access_token=2174abe57b7e6874d0143ba18351ed77c59c2b7f25ad476b82bcf4a449007025'
11 | robot = cb.DingtalkChatbot(webhook)
12 | robot.send_markdown(title='首屏会话透出的展示内容',
13 | text="# 这是支持markdown的文本 \n## 标题2 \n* 列表1 \n ")
--------------------------------------------------------------------------------
/pythonScript/draw_excel/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/draw_excel/1.jpg
--------------------------------------------------------------------------------
/pythonScript/draw_excel/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/draw_excel/2.jpg
--------------------------------------------------------------------------------
/pythonScript/draw_excel/4k_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/draw_excel/4k_1.jpg
--------------------------------------------------------------------------------
/pythonScript/draw_excel/draw_excel.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Dec 13 21:54:25 2019
4 |
5 | @author: Yinux
6 | """
7 |
8 | from PIL import Image
9 | import openpyxl
10 | from openpyxl.styles import fills
11 | import os
12 |
13 | MAX_WIDTH = 300
14 | MAX_HEIGHT = 300
15 |
16 | def resize(img):
17 | w, h = img.size
18 | if w > MAX_WIDTH:
19 | h = MAX_WIDTH / w * h
20 | w = MAX_WIDTH
21 |
22 | if h > MAX_HEIGHT:
23 | w = MAX_HEIGHT / h * w
24 | h = MAX_HEIGHT
25 | return img.resize((int(w), int(h)), Image.ANTIALIAS)
26 |
27 |
28 | def int_to_16(num):
29 | num1 = hex(num).replace('0x', '')
30 | num2 = num1 if len(num1) > 1 else '0' + num1
31 | return num2
32 |
33 |
34 | def draw_jpg(img_path):
35 |
36 | img_pic = resize(Image.open(img_path))
37 | img_name = os.path.basename(img_path)
38 | out_file = './result/' + img_name.split('.')[0] + '.xlsx'
39 | # make sure the output directory exists (workbook.save overwrites any existing file)
40 | os.makedirs('./result', exist_ok=True)
41 |
42 | workbook = openpyxl.Workbook()
43 | worksheet = workbook.active
44 |
45 | width, height = img_pic.size
46 |
47 | for w in range(1, width + 1):
48 |
49 | for h in range(1, height + 1):
50 | if img_pic.mode == 'RGB':
51 | r, g, b = img_pic.getpixel((w - 1, h - 1))
52 | elif img_pic.mode == 'RGBA':
53 | r, g, b, a = img_pic.getpixel((w - 1, h - 1))
54 |
55 | hex_rgb = int_to_16(r) + int_to_16(g) + int_to_16(b)
56 |
57 | cell = worksheet.cell(column=w, row=h)
58 |
59 | if h == 1:
60 | _w = cell.column
61 | _h = cell.col_idx
62 | # 调整列宽
63 | # worksheet.column_dimensions[_w].width = 1
64 | _w_letter = openpyxl.utils.get_column_letter(_w)
65 | worksheet.column_dimensions[_w_letter].width = 1
66 | # 调整行高
67 | worksheet.row_dimensions[h].height = 6
68 |
69 | cell.fill = fills.PatternFill(fill_type="solid", fgColor=hex_rgb)
70 |
71 | print('write in:', w, ' | all:', width + 1)
72 | print('saving...')
73 | workbook.save(out_file)
74 | print('success!')
75 |
76 | if __name__ == '__main__':
77 | filepath = 'D:/Code/Python/Interesting/draw_excel/iu.jpg'
78 | draw_jpg(filepath)
--------------------------------------------------------------------------------
/pythonScript/draw_excel/iu.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/draw_excel/iu.jpg
--------------------------------------------------------------------------------
/pythonScript/messageReminder/README.md:
--------------------------------------------------------------------------------
1 | # Clock-in / clock-out reminder
2 | ## Usage
3 | 1. Requires Python 3.x
4 | 2. Install the `pyweathercn` package
5 | ```
6 | pip3 install pyweathercn
7 | ```
8 | 3. Obtain a `key`
9 | * For QQ notifications, log in at [https://qmsg.zendee.cn/](https://qmsg.zendee.cn/), add the QQ account to be notified, and get the `key`
10 | * For ServerChan (server酱) notifications, log in at [http://sc.ftqq.com/](http://sc.ftqq.com/) to get the `key`
11 | 4. Replace the `key` values in the script (see the sketch below), then put the code on a VPS and run it
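The `key` values live in the `key` variables inside `sendQQ()` and `sendWechat()` in `messageReminder.py`. Below is a minimal sketch for checking a key before deploying, reusing the same endpoints the script calls; the key strings are placeholders:

```python
import requests

QMSG_KEY = "YOUR_QMSG_KEY"              # placeholder: key from https://qmsg.zendee.cn/
SERVERCHAN_KEY = "YOUR_SERVERCHAN_KEY"  # placeholder: key from http://sc.ftqq.com/

# push a test message through both channels, mirroring the URLs used in messageReminder.py
requests.post("https://qmsg.zendee.cn/send/" + QMSG_KEY, params={"msg": "messageReminder test"})
requests.post("http://sc.ftqq.com/" + SERVERCHAN_KEY + ".send",
              params={"text": "messageReminder test", "desp": "hello"})
```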
12 | ## Scheduling
13 | Using CentOS as an example:
14 | ```
15 | # edit the crontab
16 | crontab -e
17 | # two entries are needed (morning clock-in and evening clock-out)
18 | 20 8 * * * cd /project/messageReminder && python messageReminder.py >> messageReminder.log 2>&1
19 | 32 17 * * * cd /project/messageReminder && python messageReminder.py >> messageReminder.log 2>&1
20 | ```
21 | > For writing Linux cron jobs, see [Linux Crontab 定时任务](https://www.runoob.com/w3cnote/linux-crontab-tasks.html)
--------------------------------------------------------------------------------
/pythonScript/messageReminder/messageReminder.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Dec 3 08:46:08 2020
4 |
5 | @author: YINUXY
6 | """
7 |
8 |
9 | from datetime import datetime, date, time, timezone
10 | import requests
11 | import pyweathercn
12 |
13 | def getWeather(city):
14 | w = pyweathercn.Weather(city)
15 | context = '''今天是'''+w.today(True)['date']+'\n'+w.data['city']+'''市今天的天气为'''+w.today(True)['type']+'''\n温度变化为'''+w.today(True)['temp']+'\n'+w.tip()
16 | return context
17 |
18 | def sendQQ(wcontext):
19 | key = '*****************************'
20 | morning = '08:30:00'
21 | night = '17:30:00'
22 | nowtime = datetime.now().strftime('%H:%M:%S')
23 | if nowtime < morning:
24 | greeting = "早上好主人ヾ(✿゚▽゚)ノ\n美好的搬砖生活开始啦!(<ゝω・)☆\n快点打开手机钉钉进行上班打卡把!!!!!!(~ ̄▽ ̄)~ \n不然就要迟到啦∑(゚Д゚ノ)ノ\n"
25 | context = greeting + wcontext
26 | elif nowtime > night:
27 | greeting = "晚上好主人ヾ(✿゚▽゚)ノ\n辛苦的搬砖生活终于结束啦!(<ゝω・)☆\n不要忘记了晚间下班打卡哟( • ̀ω•́ )✧\n"
28 | context = greeting
29 | else:
30 | context = "现在还没到上/下班签到时间哦\n"
31 | url = 'https://qmsg.zendee.cn/send/' + key + '?msg='+context
32 | requests.post(url)
33 |
34 | def sendWechat(wcontext):
35 | key = '******************************************'
36 | title = ''
37 | morning = '08:30:00'
38 | night = '17:30:00'
39 | nowtime = datetime.now().strftime('%H:%M:%S')
40 | if nowtime < morning:
41 | title = '''上班打卡啦ヾ(✿゚▽゚)ノ'''
42 | greeting = '''> 早上好主人ヾ(✿゚▽゚)ノ\n美好的搬砖生活开始啦!(<ゝω・)☆\n> 快点打开手机钉钉进行上班打卡把!!!!!!(~ ̄▽ ̄)~ \n不然就要迟到啦∑(゚Д゚ノ)ノ\n'''
43 | context = greeting + wcontext
44 | elif nowtime > night:
45 | title = '''下班打卡啦ヾ(✿゚▽゚)ノ'''
46 | greeting = '''> 晚上好主人ヾ(✿゚▽゚)ノ\n> 辛苦的搬砖生活终于结束啦!(<ゝω・)☆\n> 不要忘记了晚间下班打卡哟( • ̀ω•́ )✧'''
47 | context = greeting
48 | else:
49 | title = '''上班时间请勿开小差!(〝▼皿▼)'''
50 | context = '''现在还没到上/下班签到时间哦\n''' + wcontext
51 | url = "http://sc.ftqq.com/" + key + ".send?text=" + title + "&desp=" + context
52 | requests.post(url)
53 |
54 | if __name__ == '__main__':
55 | city = '杭州'
56 | w = getWeather(city)
57 | sendQQ(w)
58 | sendWechat(w)
59 | # print(sendQQ(w))  # leftover debug call; uncommenting it would send the QQ message a second time
60 |
--------------------------------------------------------------------------------
/pythonScript/miStoreBuy/MiStore.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Apr 26 21:22:58 2020
4 |
5 | @author: Yinux
6 | """
7 |
8 | from selenium import webdriver
9 | import time
10 | import datetime
11 | chrome_driver = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'  # path to chromedriver (raw string so the backslashes stay literal)
12 | browser = webdriver.Chrome(executable_path = chrome_driver)
13 |
14 | def login(name ,pwd):
15 | browser.get( 'https://account.xiaomi.com/')#登录网址
16 | time.sleep(2)
17 | browser.find_element_by_id("username").send_keys(name) #利用账号标签的ID,确定位置并send信息
18 | browser.find_element_by_id("pwd").send_keys(pwd) #利用密码标签的ID,确定位置并send信息
19 | browser.find_element_by_id("login-button").click()#利用登录按钮的ID,确定位置并点击
20 | #如果找不到标签ID,可以使用其他方法来确定元素位置
21 | time.sleep(3)
22 | browser.get("https://s1.mi.com/m/app/hd/index.html?id=15042")#切换到秒杀页面
23 | print('登录成功,正在等待秒杀···')
24 |
25 | def buy_on_time():
26 | while True: #不断刷新时钟
27 | now = datetime.datetime.now()
28 | if now.strftime('%H:%M:%S') == '09:00:00' or now.strftime('%H:%M:%S') == '11:00:00' or now.strftime('%H:%M:%S') == '15:00:00' or now.strftime('%H:%M:%S') == '17:00:00':
29 | # if now.strftime('%H:%M:%S') == buytime:
30 | browser.find_element_by_xpath("//div[@class='content-box flex-box']/a[@data-log_code='logcode#activity_code=wjsncc49&page=activity&page_id=15042&bid=3645414.0']/div/img").click()
31 | browser.find_element_by_xpath("//a[@data-log_code='logcode#activity_code=1i19jyzh&page=activity&page_id=15042&bid=3645414.0']").click()
32 | # browser.find_element_by_xpath("//a[@data-log_code='logcode#activity_code=tudhbjjy&page=activity&page_id=15042&bid=3646017.0']").click() #购买按钮的Xpath
33 | # browser.find_element_by_xpath("//a[@data-log_code='logcode#activity_code=qpohzak0&page=activity&page_id=15042&bid=3646017.0']").click()
34 | print('当前时段已抢购完毕')
35 | time.sleep(0.01)#注意刷新间隔时间要尽量短
36 |
37 | login('1317150488' , 'xiaomi0711')
38 | #time.sleep(10)
39 | #buy_on_time()#指定秒杀时间,并且开始等待秒杀
40 | browser.find_elements_by_css_selector('.item.flex-box-item')[2].click()  # class names cannot contain spaces, so use a CSS selector and pick the third match
41 | #print("ending")
--------------------------------------------------------------------------------
/pythonScript/miStoreBuy/debug.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/miStoreBuy/debug.log
--------------------------------------------------------------------------------
/pythonScript/pdfToExcel/README.md:
--------------------------------------------------------------------------------
1 | 1. Converts the tables in a PDF file into a Microsoft Excel document
2 | 2. Tables in the PDF are detected automatically
3 | 3. Handy for electronics engineers who build schematic symbol libraries in Excel (e.g. for OrCAD): high-pin-count parts such as FPGAs often ship their pin tables only as multi-page PDFs with repeated headers, and copying them into Excel by hand leaves line breaks, stray spaces and a lot of cleanup; this program solves that, and other fields should find it useful too
4 | 4. Online converters require uploading (and effectively publishing) your file, and many downloadable tools are unsafe, require registration, or limit file size
5 | 5. Put the exe in the root of drive D and rename your PDF to test.pdf (both paths are hard-coded); the source can be changed to take an input path instead, as sketched below
6 |
7 | ### Notes
8 | * Whether Microsoft Excel needs to be installed has not been tested; it was installed on the test machine
9 | * The exe takes a while to run (tens of seconds) and prints progress to a console window; the larger the PDF, the longer it takes. A PDF of tables over 9 MB worked in testing
10 |
11 |
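Item 5 mentions editing the source to take an input path instead of the hard-coded `D:/test.pdf`; a minimal sketch of that change, based on the same pdfplumber/xlwt calls used in `pdfToExcel.py` (the prompts and file names are only examples):

```python
import pdfplumber
import xlwt

def pdf_tables_to_xls(pdf_path, xls_path):
    """Extract every table on every page of pdf_path into one sheet of xls_path."""
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('Sheet1')
    row_idx = 0
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                for row in table:
                    for col_idx, cell in enumerate(row):
                        # empty string for blank / merged cells
                        sheet.write(row_idx, col_idx, cell if cell is not None else "")
                    row_idx += 1
    workbook.save(xls_path)

if __name__ == '__main__':
    pdf_tables_to_xls(input("PDF path: ").strip(), input("Output .xls path: ").strip())
```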
--------------------------------------------------------------------------------
/pythonScript/pdfToExcel/pdfToExcel.py:
--------------------------------------------------------------------------------
1 | import pdfplumber
2 | import xlwt
3 |
4 | # 定义保存Excel的位置
5 | workbook = xlwt.Workbook() #定义workbook
6 | sheet = workbook.add_sheet('Sheet1') #添加sheet
7 | i = 0 # Excel起始位置
8 |
9 | #path = input("E:/MyProject/python/test.pdf")
10 | path = "D:/test.pdf" # 导入PDF路径
11 | pdf = pdfplumber.open(path)
12 | print('\n')
13 | print('开始读取数据')
14 | print('\n')
15 | for page in pdf.pages:
16 | # 获取当前页面的全部文本信息,包括表格中的文字
17 | # print(page.extract_text())
18 | for table in page.extract_tables():
19 | # print(table)
20 | for row in table:
21 | print(row)
22 | for j in range(len(row)):
23 | sheet.write(i, j, row[j])
24 | i += 1
25 | print('---------- 分割线 ----------')
26 |
27 | pdf.close()
28 |
29 | # 保存Excel表
30 | workbook.save('D:/PDFresult.xls')
31 | print('\n')
32 | print('写入excel成功')
33 | print('保存位置:')
34 | print('D:/PDFresult.xls')
35 | print('\n')
36 | input('PDF取读完毕,按任意键退出')
37 |
--------------------------------------------------------------------------------
/pythonScript/pdfToExcel/新建 Microsoft Word 文档.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/pdfToExcel/新建 Microsoft Word 文档.docx
--------------------------------------------------------------------------------
/pythonScript/poem/Oxford3000.py:
--------------------------------------------------------------------------------
1 | import urllib3
2 | from bs4 import BeautifulSoup
3 | import certifi
4 |
5 | file = open('../data/word', "w+")
6 | http = urllib3.PoolManager(
7 | cert_reqs='CERT_REQUIRED',
8 | ca_certs=certifi.where())
9 |
10 | url = 'https://www.oxfordlearnersdictionaries.com/wordlist/english/oxford3000/'
11 | r = http.request('GET', url)
12 |
13 | soup = BeautifulSoup(r.data, 'html.parser')
14 |
15 | category = soup.find('ul', class_="hide_phone")
16 |
17 | content = soup.find('ul', class_="wordlist-oxford3000")
18 |
19 | # for link in content.find_all('a'):
20 | # file.write(link.get_text()+'\n')
21 |
22 | pages = soup.find('ul', class_="paging_links")
23 |
24 | for cat in category.find_all('a'):
25 | # get the former category of data
26 | while pages.find('a', text=">"):
27 | next = pages.find('a', text=">")
28 | r = http.request('GET', next.get('href'))
29 | soup = BeautifulSoup(r.data, 'html.parser')
30 | pages = soup.find('ul', class_="paging_links")
31 |
32 | # get the former page of data
33 | for link in content.find_all('a'):
34 | if link.get_text() != 'o’clock':
35 | file.write(link.get_text()+'\n')
36 | # update the content
37 | content = soup.find('ul', class_="wordlist-oxford3000")
38 | # get the last page of content
39 | for link in content.find_all('a'):
40 | file.write(link.get_text()+'\n')
41 |
42 | r = http.request('GET', cat.get('href'))
43 | soup = BeautifulSoup(r.data, 'html.parser')
44 |
45 | content = soup.find('ul', class_="wordlist-oxford3000")
46 | pages = soup.find('ul', class_="paging_links")
47 |
48 | # get the last category of data
49 | while pages.find('a', text=">"):
50 | next = pages.find('a', text=">")
51 | r = http.request('GET', next.get('href'))
52 | soup = BeautifulSoup(r.data, 'html.parser')
53 | pages = soup.find('ul', class_="paging_links")
54 |
55 | # get the former page of data
56 | for link in content.find_all('a'):
57 | file.write(link.get_text()+'\n')
58 | # update the content
59 | content = soup.find('ul', class_="wordlist-oxford3000")
60 | # get the last page of content
61 | for link in content.find_all('a'):
62 | file.write(link.get_text()+'\n')
63 |
64 |
65 |
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/pythonScript/poem/TangshiGene.py:
--------------------------------------------------------------------------------
1 | import random
2 | import time
3 | import pinyin
4 | #生成四言律诗
5 | def Line4():
6 | word_file = '../data/freqword.txt'
7 |
8 | dataset = open(word_file,encoding='utf-8').readlines()
9 |
10 | list = []
11 | for word in dataset:
12 | # outfile.write(pinyin.get(word, format="strip")+" ")
13 | i = 0
14 | while i <
25 | elif nowtime >= time1 and nowtime <= time2:
26 | return "单词背完了吗,没背完要打屁屁哟!现在已经AM 08:00了,赶快去复习上午的课程把~"
27 | elif nowtime >= time2 and nowtime <= time3:
28 | return "今天的任务完成了吗,没完成的话可是要加夜班了哦!"
29 | elif nowtime >= time3 and nowtime <= time4:
30 | return "晚饭吃完了吗,赶紧去练字去!!!"
31 | elif nowtime >= time4 and nowtime <= time5:
32 | return "现在,可以以开始晚自习拉~~~"
33 | elif nowtime >= time5:
34 | return "今天的任务完成了吗,没有也请放到明天再做吧!"
35 |
36 |
37 | def getOneNote():
38 | api_url = 'https://v1.hitokoto.cn/?c=k&c=d&c=h&encode=json'
39 | response = requests.get(api_url)
40 | res = json.loads(response.text)
41 | a_word = res['hitokoto']+' _____'+'《'+res['from']+'》'
42 | return a_word  # return the sentence so callers can concatenate it (notifyText() + getOneNote())
43 |
44 | def sendmail():
45 | sender = 'cgyung@qq.com' # 发送邮箱
46 | senderName = "笨鸟先飞~" # 发送者昵称
47 | password = 'qktwjlvxlyrwcagi' # 发送方QQ邮箱授权码
48 | receivers = ['admin@yinuxy.com'] # 接收邮件
49 |
50 | # 三个参数:第一个为文本内容,第二个 plain 设置文本格式,第三个 utf-8 设置编码
51 | str = notifyText() + getOneNote()
52 | message = MIMEText(str, 'plain', 'utf-8')
53 | message['From'] = Header(senderName, 'utf-8') # 发送者昵称
54 |
55 | # 主题
56 | subject = '叮~您有新的学习计划'
57 | message['Subject'] = Header(subject, 'utf-8')
58 |
59 | try:
60 | client = smtplib.SMTP_SSL('smtp.qq.com', smtplib.SMTP_SSL_PORT)
61 | print("连接到邮件服务器成功")
62 |
63 | client.login(sender, password)
64 | print("登录成功")
65 |
66 | client.sendmail(sender, receivers, message.as_string())
67 | print("邮件发送成功")
68 | except smtplib.SMTPException:
69 | print("Error: 无法发送邮件")
70 |
71 | def sendQQ():
72 | key = '42b60c3e094bed98331a1cc5e089ff64'
73 | context = notifyText() + getOneNote()
74 | url = 'https://qmsg.zendee.cn/send/' + key + '?msg='+context
75 | requests.post(url)
76 |
77 | def sendWechat():
78 | key = 'SCT48533TKJb962s7xJdVTdsszsuv9Dks'
79 | title = '叮~您有新的学习计划'
80 | context = notifyText() + getOneNote()
81 | url = "http://sc.ftqq.com/" + key + ".send?text=" + title + "&desp=" + context
82 | requests.post(url)
83 |
84 | def handler():
85 | print(type(notifyText()))
86 | sendmail()
87 | # sendQQ()
88 | # sendWechat()
89 |
90 | if __name__ == '__main__':
91 | handler()
--------------------------------------------------------------------------------
/pythonScript/telegramPushBot/ht.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Run this check periodically (e.g. every 30 minutes via cron): if the process is not running, restart it
3 | # Use the full path to python3, otherwise the restart may fail
4 | PIDS=`ps -ef |grep locpush |grep -v grep | awk '{print $2}'`
5 | if [ "$PIDS" != "" ]; then
6 | echo "myprocess is running!"
7 | else
8 | echo "未发现程序后台运行,正在重启中!"
9 | /usr/bin/python3 /project/hostlocpushBot/locpush.py &
10 | fi
--------------------------------------------------------------------------------
/pythonScript/telegramPushBot/locpush.py:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8 -*-
2 |
3 | import requests
4 | from urllib import parse
5 | from lxml import etree
6 | import time
7 | import datetime
8 | from requests.adapters import HTTPAdapter
9 | import re
10 | import js2py
11 |
12 |
13 | # 获得cookie
14 | def getcookies():
15 | url = 'https://www.hostloc.com/forum.php?mod=forumdisplay&fid=45&filter=author&orderby=dateline'
16 | js = js2py.EvalJs()
17 | headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'}
18 | aesjs = requests.get("https://www.hostloc.com/aes.min.js", headers=headers, timeout=5).text
19 | js.execute(aesjs)
20 | getcookie = requests.get(url).text
21 | getcookie_script = re.findall(r"<script>(.*?)</script>", getcookie)  # the anti-bot challenge page embeds the AES parameters (a, b, c) in an inline script
22 | js.execute(getcookie_script[0].split("document")[0])
23 | data = js.toHex(js.slowAES.decrypt(js.c, 2, js.a, js.b))
24 | cookie = "L7DFW=" + data
25 | return cookie
26 |
27 |
28 | # 获得日期
29 | def get_week_day(date):
30 | week_day_dict = {
31 | 0: '星期一',
32 | 1: '星期二',
33 | 2: '星期三',
34 | 3: '星期四',
35 | 4: '星期五',
36 | 5: '星期六',
37 | 6: '星期日',
38 | }
39 | day = date.weekday()
40 | return week_day_dict[day]
41 |
42 |
43 | def get_content(url):
44 | while True:
45 | try:
46 | s = requests.get(url)
47 | hostloc_content = etree.HTML(s.content).xpath('//table/tr/td[@class="t_f"]/text()')
48 |
49 | if not hostloc_content:
50 | return "因权限原因,内容无法预览,请手动登陆查看!"
51 | else:
52 | s = ''
53 | for j in hostloc_content:
54 | s = s + j
55 | # 不展示全部内容,防止内容过长,严重影响体验
56 | return s[0:80].replace("\r\n", '').replace('\n', '').replace('\xa0', '').replace('\u200b', '')
57 |
58 | except Exception as e:
59 | print("网络原因,无法访问,请稍后再试...")
60 | return "因权限原因,内容无法预览,请手动登陆查看!"
61 |
62 |
63 | def mark_down(content):
64 | # 删除特殊符号,防止发生错误parse
65 | sign = ['&', '.', '<', '>', ' ', '?', '"', "'", '#', '%', '!', '@', '$', '^', '*', '(', ')', '-', '_', '+', '=', '~', '/', ',', ':', '’', '‘', '“', '”', '%', '^', '——', '{', '}', '*', '[', '、', '\\', ']', '`', '"', "'", '\n']
66 | for k in sign:
67 | content = content.replace(k, "")
68 | return content
69 |
70 |
71 | def post(chat_id, text):
72 | try:
73 | text = parse.quote(text)
74 | post_url = 'https://api.telegram.org/bot1124748196:*********************tOjQkKU_VOz8CY/sendMessage' \
75 | '?parse_mode=MarkdownV2&chat_id={0}&text={1}'.format(chat_id, text)
76 | headers = {
77 | 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'}
78 | requests.get(post_url, headers=headers)
79 | except Exception:
80 | print("推送失败!")
81 | time.sleep(3)
82 | post(chat_id, text)
83 |
84 |
85 | # 主程序
86 | def master(r):
87 | xml_content = etree.HTML(r.content)
88 | href_list = xml_content.xpath('/html/body/div[@id="wp"]/div[5]/div/div/div[4]/div[2]/form/table/tbody/tr/th/a[3]/@href')
89 | author = xml_content.xpath('/html/body/div[@id="wp"]/div[5]/div/div/div[4]/div[2]/form/table/tbody/tr/td[2]/cite/a/text()')
90 | author_url = xml_content.xpath('/html/body/div[@id="wp"]/div[5]/div/div/div[4]/div[2]/form/table/tbody/tr/td[2]/cite/a/@href')
91 | number = xml_content.xpath('/html/body/div[@id="wp"]/div[5]/div/div/div[4]/div[2]/form/table/tbody/tr/td[3]/a/text()')
92 | href = xml_content.xpath('/html/body/div[@id="wp"]/div[5]/div/div/div[4]/div[2]/form/table/tbody/tr/th/a[3]/text()')
93 | print(author)
94 | print(number)
95 | for i in range(len(number)):
96 | if number[i] == '0':
97 | if str(href[i].replace("\r\n", "")) not in hostloc_list:
98 | hostloc_list.add(str(href[i].replace("\r\n", "")))
99 | name = href[i].replace("\r\n", "")
100 | # 文章链接
101 | # print(i)
102 | k = i + 1
103 | # print(k)
104 | url_list = "https://www.hostloc.com/{}".format(href_list[i])
105 | # 作者id链接
106 | url_author = "https://www.hostloc.com/{}".format(author_url[k])
107 | # 时间戳
108 | time_1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
109 | date_1 = get_week_day(datetime.datetime.now())
110 | time_2 = time_1 + ' ' + date_1 + ' '
111 | time2 = str(time_2).replace('-', '\\-')
112 | # 获得预览内容
113 | # print(get_content(url_list))
114 | content_2 = mark_down(get_content(url_list))
115 | text = '主 题:' + "***{}***".format(mark_down(name)) + '\n' + '发 布 者:[{0}]({1})'.format(mark_down(author[i + 1]), url_author) + '\n' + '时 间:' + time2 + '\n' + '内容预览:[点击查看——{0}]({1})'.format(content_2, url_list)
116 | print(text)
117 | # 修改为自己的想推送的ID
118 | post('@locpush', text)
119 | else:
120 | pass
121 | else:
122 | pass
123 |
124 |
125 | # 副程序
126 | def master_1(r):
127 | xml_content = etree.HTML(r.content)
128 | href_list = xml_content.xpath("//div[@class='threadlist']/ul/li/a/@href")
129 | author = xml_content.xpath("//span[@class='by']/text()")
130 | number = xml_content.xpath("//span[@class='num']/text()")
131 | href = xml_content.xpath("//div[@class='threadlist']/ul/li/a/text()")
132 | print(author)
133 | print(number)
134 | # print(href)
135 | # print(href_list)
136 | for i in range(len(number)):
137 | if number[i] == '0':
138 | if str(href[2 * i].replace("\r\n", "")) not in hostloc_list:
139 | hostloc_list.add(str(href[i * 2].replace("\r\n", "")))
140 | name = href[2 * i].replace("\r\n", "")
141 | # 转换链接:
142 | str_url = href_list[i].replace("forum.php?mod=viewthread&tid=", '').replace("&extra=page%3D1%26filter%3Dauthor%26orderby%3Ddateline&mobile=2", '')
143 |
144 | url_list = "https://www.hostloc.com/thread-{0}-1-1.html".format(str_url)
145 | # 时间戳
146 | time_1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
147 | date_1 = get_week_day(datetime.datetime.now())
148 | time_2 = time_1 + ' ' + date_1 + ' '
149 | time2 = str(time_2).replace('-', '\\-')
150 | # 获得预览内容
151 | # print(get_content(url_list))
152 | content_2 = mark_down(get_content_1(url_list))
153 | text = '主 题:' + "***{}***".format(mark_down(name)) + '\n' + '发 布 者:{0}'.format(mark_down(author[i])) + '\n' + '时 间:' + time2 + '\n' + '内容预览:[点击查看——{0}]({1})'.format(content_2, url_list)
154 | print(text)
155 | post('@locpush', text)
156 | else:
157 | pass
158 | else:
159 | pass
160 |
161 |
162 | # 获得内容
163 | def get_content_1(url):
164 | while True:
165 | try:
166 | headers = {
167 | 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'}
168 | requests.adapters.DEFAULT_RETRIES = 5
169 | s = requests.session()
170 | s.keep_alive = False
171 | result = 'L7DFW' in cookiestr
172 | if result:
173 | headers = {'Cookie': cookiestr, 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; '
174 | 'Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
175 | '/46.0.2490.76 Mobile Safari/537.36'}
176 | r = s.get(url, headers=headers)
177 | else:
178 | r = s.get(url, headers=headers)
179 | xmlContent = etree.HTML(r.content)
180 | content = xmlContent.xpath('//div[@class="message"]/text()')
181 | return content[0].replace("\r\n", '').replace("\n", '').replace("\r", '').replace("\t", '').replace(" ", '')[0:80]
182 |
183 | except Exception as e:
184 | print("网络原因,无法访问,请稍后再试...")
185 | return "网络原因,无法访问,内容无法预览..."
186 | time.sleep(5)
187 |
188 |
189 | hostloc_list = {"hello"}
190 | url_1 = "https://www.hostloc.com/"
191 | headers = {
192 | 'Accept-Encoding': 'gzip, deflate, br',
193 | 'Accept-Language': 'zh-CN,zh;q=0.9',
194 | 'Upgrade-Insecure-Requests': '1',
195 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
196 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
197 | 'Cache-Control': 'no-cache',
198 | 'Connection': 'keep-alive',
199 | }
200 | url_hostloc = "https://www.hostloc.com/forum.php?mod=forumdisplay&fid=45&filter=author&orderby=dateline"
201 |
202 | while True:
203 | try:
204 | # 网站要求js验证(无法预览网页内容)
205 | cookiestr = getcookies()
206 | print(cookiestr)
207 | print("1")
208 | url = 'https://www.hostloc.com/forum.php?mod=forumdisplay&fid=45&filter=author&orderby=dateline'
209 | headers = {
210 | 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'}
211 | requests.adapters.DEFAULT_RETRIES = 5
212 | s = requests.session()
213 | s.keep_alive = False
214 | result = 'L7DFW' in cookiestr
215 | if result:
216 | headers = {'Cookie': cookiestr, 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; '
217 | 'Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
218 | '/46.0.2490.76 Mobile Safari/537.36'}
219 | r = s.get(url, headers=headers)
220 | else:
221 | r = s.get(url, headers=headers)
222 | master_1(r)
223 | # 多少秒抓取一次网站,自己设定,不要太小,会被ban ip的
224 | time.sleep(20)
225 | except Exception as e:
226 | try:
227 | # 网站不要求js验证
228 | print("2")
229 | headers = {
230 | 'Accept-Encoding': 'gzip, deflate, br',
231 | 'Accept-Language': 'zh-CN,zh;q=0.9',
232 | 'Upgrade-Insecure-Requests': '1',
233 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
234 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
235 | 'Cache-Control': 'no-cache',
236 | 'Connection': 'keep-alive',
237 | }
238 | url_hostloc = "https://www.hostloc.com/forum.php?mod=forumdisplay&fid=45&filter=author&orderby=dateline"
239 | r = requests.get(url_hostloc, headers=headers)
240 | master(r)
241 | time.sleep(20)
242 | except Exception:
243 | print("网络错误,请稍后重试")
244 | time.sleep(120)
245 |
246 |
247 |
248 |
--------------------------------------------------------------------------------
/pythonScript/tianyi-zhuancun/README.md:
--------------------------------------------------------------------------------
1 | # One-click re-save script for Tianyi Cloud (天翼云) share resources
2 | ## Usage
3 | Fill in your cookie and the ID of the destination folder, then run the script to re-save everything in one go (see the sketch below)
4 | ## How to obtain the values
5 | Log in to the Tianyi Cloud web client, right-click and choose `检查` (Inspect) or press `F12`, then open the `Network` tab
6 |
7 | 1. cookie:
8 | Refresh the page, click `main.action`, and scroll down under `Headers` to find the `Cookie` entry
9 | 
10 | 2. Destination folder ID:
11 | Open the folder you want to save into and copy the id after `folder/` in the address bar
12 | 
--------------------------------------------------------------------------------
/pythonScript/tianyi-zhuancun/sec1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/tianyi-zhuancun/sec1.png
--------------------------------------------------------------------------------
/pythonScript/tianyi-zhuancun/sec2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/tianyi-zhuancun/sec2.png
--------------------------------------------------------------------------------
/pythonScript/tianyi-zhuancun/zhuancun.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import hjson
3 | import urllib.parse
4 | import json
5 | import time
6 |
7 |
8 | def _main():
9 | h = {
10 | "Cookie": "COOKIE_LOGIN_USER=edrive_view_mode=icon; apm_ct=20200415102651680; apm_uid=29FAD3AAF7227DFC8D69DE214255C5A8; apm_ip=117.152.46.9; apm_ua=F49C41BE171437757C72FF333488A319; _ga=GA1.2.597408328.1587037854; offline_Pic_Showed=true; wpsGuideStatus=true; shareId_136723510=null; shareId_105944752=null; shareId_104180915=null; shareId_1601806=null; shareId_161893853=null; shareId_162635301=null; UM_distinctid=171a13870dd82-03b9857e7e8cf4-70103e47-144000-171a13870de3f0; Hm_lvt_79fae2027f43ca31186e567c6c8fe33e=1587547763; svid=65A0409DA903536E5B0B0EE956E32855; s_fid=439CADEA903B92DB-07A116C92EFCEFD3; lvid=c1238943c866cbbe5ba947ef92efd77e; nvid=1; trkId=98E63362-4356-43AB-8496-517CCB879FF2; Login_Hash=; JSESSIONID=aaai9_nnLa3NShiLkFIgx; COOKIE_LOGIN_USER=8BD018E2B01D662A8DB930FABCFF8864EB3D685B79BDD63EB1652544332B9AFA8E371FCCCC14B0CC5D5F295A51E32C2F7E8115828F136B87B087CE29; validCodeTimestamp=0ac32825-f7ed-41d5-8142-938ee1f8b26e; shareId_168824830=ef8z; shareId_155057311=null; shareId_168824365=null "
11 | }
12 | total = 1
13 | for pp in range(1,3):
14 | req = requests.get(
15 | 'https://cloud.189.cn/v2/listPublicShare.action?userId=330783715&mediaType=0&orderBy=filename&order=ASC&pageNum=%s&pageSize=545' % pp
16 | , headers=h)
17 | j = hjson.loads(req.content.decode())
18 | for a in j['data']:
19 | print('%s/%s' % (total,1081))
20 | id = a["fileId"]
21 | name = str(a["fileName"])
22 | sid = a["shareId"]
23 | fo = a["isFolder"]
24 | t = [{"fileId": id, "fileName": name, "isFolder": 1 if fo else 0}]
25 |
26 | jdata = json.dumps(t, ensure_ascii=False).replace(' ','')
27 |
28 | data = ''
29 | data += 'type=SHARE_SAVE'
30 | data += '&taskInfos=' + str(urllib.parse.quote(jdata))
31 | data += '&targetFolderId=8146417517567840'
32 | data += '&shareId=' + str(sid)
33 |
34 | ih = h
35 | ih['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8'
36 | resp = requests.post('https://cloud.189.cn/createBatchTask.action', headers=ih, data=data)
37 | print(name, resp.content.decode())
38 | total +=1
39 | time.sleep(0.5)
40 |
41 |
42 | if __name__ == '__main__':
43 | _main()
--------------------------------------------------------------------------------
/pythonScript/year_code/code_dir/readme.md:
--------------------------------------------------------------------------------
1 | # Put your code files or code directories in this folder
--------------------------------------------------------------------------------
/pythonScript/year_code/readme.md:
--------------------------------------------------------------------------------
1 | # The first small open-source project of 2019: playing with code-line statistics
2 |
3 | ## 0. Intro
4 |
5 | A couple of days ago I wrote a small project that counts part of the Python code I have written for this WeChat public account since September 2018. The main features and usage are described below; the project is open source, and you can reach the repository through the "read the original" link.
6 |
7 | A word about my Zhishi Xingqiu (知识星球) group: a lot of material is shared there, and this little project was proposed inside the group, where we work through one small project every three days and discuss it together; on top of that I have set myself a daily flag to share something useful every day!
8 | Now let's look at the first open-source project of 2019: playing with code-line statistics!
9 |
10 | **A star and a fork would be much appreciated, thanks!**
11 |
12 | My WeChat public account:
13 |
14 | 
15 |
16 | ## 1. Features
17 |
18 | - **Code statistics**
19 |
20 | - [x] source file of each count
21 | - [x] total lines
22 | - [x] total comment lines
23 | - [x] total blank lines
24 | - [x] actual code lines
25 | - [x] actual-code ratio
26 | - [x] comment ratio
27 | - [x] blank-line ratio
28 |
29 | - **CSV storage**
30 |
31 | - [x] data stored with the csv module
32 |
33 | - **Pretty-printed output**
34 |
35 | - [x] prettytable for table output
36 | - [x] colorama for colored output
37 |
38 | - **CSV analysis**
39 |
40 | - [x] read the csv with pandas
41 | - [x] statistics and description with pandas
42 |
43 | ## 2. What you will learn
44 |
45 | - [x] Python basics
46 | - [x] object-oriented programming
47 | - [x] the os module
48 | - [x] the pandas module
49 | - [x] the csv module
50 | - [x] the prettytable module
51 | - [x] the colorama module
52 |
53 | ## 3. How to use
54 |
55 | - **Download**
56 |
57 | ```
58 | git clone git@github.com:Light-City/year_code.git
59 | ```
60 |
61 | - **Usage**
62 |
63 | Put your code files and folders into code_dir, or edit the following line in `statistic.py`:
64 |
65 | ```python
66 | dir = './code_dir' # your code folder or code file
67 | ```
68 |
69 | - **Run**
70 |
71 | Run `statistic.py`; it prints the output shown below and produces the raw statistics in data.csv as well as the sorted data in sort_data.csv.
72 |
73 | - **Customization**
74 |
75 | ```python
76 | def codeSort(self,c_name='实际代码量') # sorts by actual code lines by default
77 | ```
78 |
79 | When calling codeSort you can sort by whichever column you need, for example:
80 |
81 | codeSort('总代码量')
82 |
83 | Valid values are the literal column headers written to data.csv (choose one of the strings below; a complete driver sketch follows after the list):
84 |
85 | ```
86 | '文件', '总代码量', '总注释量', '总空行量', '实际代码量', '实际代码比率', '总注释比率', '总空行比率'
87 | ```
88 |
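For reference, a minimal driver mirroring the calls at the bottom of `statistic.py` (note that importing `statistic` also runs its own module-level driver once, since that code is not guarded by `if __name__ == '__main__'`):

```python
from statistic import FileAnalysis  # statistic.py must be importable from the working directory

fa = FileAnalysis()
fa.fileAnalysis('./code_dir')   # walk the directory and count every .py file
fa.output()                     # pretty-print the per-file and total statistics
fa.codeSort('实际代码量')        # sort data.csv by the chosen column and write sort_data.csv
```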
89 | ## 4. Results
90 | - Pretty-printed output
91 |
92 | 
93 |
94 | - Stored data
95 |
96 | 
97 |
98 | - Sorted data
99 |
100 | Result of sorting by actual code lines
101 |
102 | 
103 |
104 | - Simple summary statistics
105 |
106 | 
107 |
108 | ## 5. About the project and the group
109 | More detailed code walk-throughs are given inside the group, and questions asked there are answered first.
110 |
111 | **No pure free-riding please, but I do embrace open source: more comments, likes, shares, reposts and tips are what keep the original writing going!**
112 |
113 | Finally, a few points about joining the group:
114 | In the group I spend my spare time sharing and discussing with you, and together we can:
115 | - [x] do more small projects like this one
116 | - [x] organize and take part in more competitions
117 | - [x] discuss papers together
118 | - [x] dig into technical topics together
119 | - [x] share and interact every day
120 | - [x] build the ability to persist and keep learning!
121 |
--------------------------------------------------------------------------------
/pythonScript/year_code/show_res/data_csv.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/year_code/show_res/data_csv.jpg
--------------------------------------------------------------------------------
/pythonScript/year_code/show_res/py_output.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/year_code/show_res/py_output.jpg
--------------------------------------------------------------------------------
/pythonScript/year_code/show_res/py_statistic.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/year_code/show_res/py_statistic.jpg
--------------------------------------------------------------------------------
/pythonScript/year_code/show_res/sort_csv.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/year_code/show_res/sort_csv.jpg
--------------------------------------------------------------------------------
/pythonScript/year_code/statistic.py:
--------------------------------------------------------------------------------
1 | import os
2 | import prettytable as pt
3 | from colorama import Fore,Style
4 | import pandas as pd
5 | import csv
6 | class FileAnalysis:
7 | def __init__(self):
8 | self.TLine=0 # 总代码行
9 | self.CLine=0 # 总命令行
10 | self.BLine=0 # 总空行
11 | # 美化打印存储到list中
12 | self.file_list=[] # 文件名list
13 | self.total_list=[] # 每个文件总代码list
14 | self.comment_list=[] # 每个文件总注释list
15 | self.blank_list=[] # 每个文件总空行list
16 | self.actual_list=[] # 每个文件实际代码量list
17 | self.actual_rate=[] # 每个文件实际代码比率list
18 | self.comment_rate=[] # 每个文件实际注释比率list
19 | self.black_rate=[] # 每个文件空行比率list
20 | self.isOne=True # 是否第一次写入csv
21 | def countLines(self,file):
22 | comment_line = 0
23 | blank_line = 0
24 | with open(file, encoding='utf-8', errors='ignore') as f:
25 | # 返回每一个列表,包含每一行代码
26 | lines = f.readlines()
27 | # 总行数
28 | total_line = len(lines)
29 | # 遍历每一行
30 | for i in range(total_line):
31 | line = lines[i]
32 | # 检查是否为注释
33 | if line.startswith("#"):
34 | comment_line += 1
35 | elif line.strip().startswith("'''") or line.strip().startswith('"""'):
36 | comment_line += 1
37 | if line.count('"""') == 1 or line.count("'''") == 1:
38 | while True:
39 | line = lines[i]
40 | comment_line += 1
41 | i+=1
42 | if ("'''" in line) or ('"""' in line):
43 | break
44 | # 检查是否为空行
45 | elif line == '\n':
46 | blank_line += 1
47 | # 输出每个文件结果
48 | print("在%s中:" % file)
49 | print("代码行数:", total_line)
50 | print("注释行数:", comment_line)
51 | print("空行数:", blank_line)
52 | actual_line=total_line - comment_line - blank_line
53 | print("实际总行数:",actual_line)
54 | # 实际代码比率
55 | actual_ra=0
56 | # 注释比率
57 | comment_ra=0
58 | # 空行比率
59 | black_ra=0
60 | try:
61 | actual_ra=actual_line/total_line
62 | print("实际总行数占比率:",actual_ra)
63 | except Exception as e:
64 | print("实际总行数占比率:", 0)
65 | try:
66 | comment_ra=comment_line/total_line
67 | print("注释行数占比率:",comment_ra)
68 | except Exception as e:
69 | print("注释行数占比率:", 0)
70 | try:
71 | black_ra=blank_line/total_line
72 | print("空行数占比率:",black_ra)
73 | except Exception as e:
74 | print("空行数占比率:", 0)
75 | # 往list中添加数据
76 | self.actual_list.append(actual_line)
77 | # 格式化添加输出比率百分百
78 | self.actual_rate.append(format(actual_ra,'0.1%'))
79 | self.comment_rate.append(format(comment_ra,'0.1%'))
80 | self.black_rate.append(format(black_ra,'0.1%'))
81 | # 取xx.py
82 | self.file_list.append(Fore.GREEN+file.split('\\')[-1])
83 | self.total_list.append(total_line)
84 | self.comment_list.append(comment_line)
85 | self.blank_list.append(blank_line)
86 |
87 | # 存储csv数据格式化
88 | # list添加多个数据
89 | data_list = [file.split('\\')[-1],total_line,comment_line,blank_line,actual_line,actual_ra,comment_ra,black_ra]
90 | if self.isOne:
91 | # 存储head
92 | self.saveCSV(data_list,self.isOne)
93 | self.isOne=False
94 | # 存储
95 | self.saveCSV(data_list)
96 | return total_line, comment_line, blank_line
97 | def fileAnalysis(self,dir):
98 | # 列出目录下的所有文件和目录
99 | list_files = os.listdir(dir)
100 | for file in list_files:
101 | filepath = os.path.join(dir, file)
102 | # 目录:递归遍历子目录
103 | if os.path.isdir(filepath):
104 | self.fileAnalysis(filepath)
105 | # 文件:直接统计行数
106 | elif os.path.isfile(filepath):
107 | if os.path.splitext(file)[1] == '.py':
108 | total_line, comment_line, blank_line=self.countLines(filepath)
109 | self.TLine+=total_line
110 | self.CLine+=comment_line
111 | self.BLine+=blank_line
112 |
113 | # 输出打印
114 | def output(self):
115 | # 添加总统计
116 | self.file_list.insert(0,Fore.LIGHTRED_EX+'总统计结果'+Fore.RESET)
117 | self.total_list.insert(0,Fore.LIGHTRED_EX + str(self.TLine) + Fore.RESET)
118 | self.comment_list.insert(0,Fore.LIGHTRED_EX + str(self.CLine) + Fore.RESET)
119 | self.blank_list.insert(0,Fore.LIGHTRED_EX + str(self.BLine) + Fore.RESET)
120 | actual_line = self.TLine-self.CLine-self.BLine
121 | self.actual_list.insert(0,Fore.LIGHTRED_EX + str(actual_line) + Fore.RESET)
122 | self.actual_rate.insert(0,Fore.LIGHTRED_EX +str(format((self.TLine-self.CLine-self.BLine)/self.TLine,'0.1%'))+Fore.RESET)
123 | self.comment_rate.insert(0,Fore.LIGHTRED_EX+str(format(self.CLine/self.TLine,'0.1%'))+Fore.RESET)
124 | self.black_rate.insert(0,Fore.LIGHTRED_EX+str(format(self.BLine/self.TLine,'0.1%'))+Fore.RESET)
125 |
126 | # 美化打印输出
127 | tb = pt.PrettyTable()
128 | tb.add_column(Fore.LIGHTMAGENTA_EX+"文件"+Fore.RESET,self.file_list)
129 | tb.add_column(Fore.LIGHTMAGENTA_EX+'总代码量'+Fore.RESET,self.total_list)
130 | tb.add_column(Fore.LIGHTMAGENTA_EX+'总注释量'+Fore.RESET,self.comment_list)
131 | tb.add_column(Fore.LIGHTMAGENTA_EX+'总空行量'+Fore.RESET,self.blank_list)
132 | tb.add_column(Fore.LIGHTMAGENTA_EX+'实际代码量'+Fore.RESET,self.actual_list)
133 | tb.add_column(Fore.LIGHTMAGENTA_EX+'实际代码比率'+Fore.RESET,self.actual_rate)
134 | tb.add_column(Fore.LIGHTMAGENTA_EX+'总注释比率'+Fore.RESET,self.comment_rate)
135 | tb.add_column(Fore.LIGHTMAGENTA_EX+'总空行比率'+Fore.RESET,self.black_rate)
136 | print(Fore.RED+"-----------------------------------------------光城18年9月份以后部分python代码统计结果-----------------------------------------------")
137 | print(Style.RESET_ALL)
138 | print(tb)
139 | print(Style.RESET_ALL)
140 | def saveCSV(self, data_list, isOne=False):
141 | # newline=''防止写入留空行问题
142 | # 追加写入
143 | with open("data.csv", "a+", encoding='utf_8_sig',newline='') as cf:
144 | writer = csv.writer(cf)
145 | # 如果是第一次写入,就写head,后面就正常写入
146 | if isOne:
147 | data_list = ['文件', '总代码量', '总注释量', '总空行量', '实际代码量', '实际代码比率', '总注释比率', '总空行比率']
148 | writer.writerow(data_list)
149 | # 排序
150 | def codeSort(self,c_name='实际代码量'):
151 | df = pd.DataFrame(pd.read_csv('./data.csv',encoding='utf_8_sig'))
152 | # print(df)
153 | # lc.sort(["loan_amnt"], ascending=True).head(10)
154 | print(df.sort_values(c_name,ascending=False,inplace=True))
155 | print(df.head(10))
156 | print(df.describe())
157 | print(df.sum())
158 | df.to_csv('./sort_data.csv',encoding='utf_8_sig',index=False)
159 |
160 | dir = './code_dir'
161 | fa = FileAnalysis()
162 | fa.fileAnalysis(dir)
163 | print(fa.TLine)
164 | print(fa.CLine)
165 | print(fa.BLine)
166 | fa.output()
167 | fa.codeSort('总代码量')
168 |
--------------------------------------------------------------------------------
/scrapy/2019-nCov-cn/city.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | import requests
5 |
6 |
7 | def getHTMLText(url):
8 | try:
9 | headers = {
10 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
11 | "Chrome/80.0.3987.163 Safari/537.36"}
12 | r = requests.get(url, timeout=30, headers=headers)
13 | r.raise_for_status()
14 | r.encoding = r.apparent_encoding
15 | data = re.search(r"\(+([^)]*)\)+", r.text).group(1)
16 | return data
17 | except:
18 | return ""
19 |
20 |
21 | def getYqDate(lst, YqURL):
22 | html = getHTMLText(YqURL)
23 | hjson = json.loads(html)
24 | a = hjson['data']['list']
25 | for i in a:
26 | if i['ename'] == "fujian":
27 | city = i['city']
28 | for j in city:
29 | name = j['name'] # 城市名称
30 | value = j['conNum'] # 累计确诊
31 | econNum = j['econNum'] # 现存确诊
32 | conadd = j['conadd'] # 今日确诊
33 | deathNum = j['deathNum'] # 累计死亡人数
34 | cureNum = j['cureNum'] # 累计治愈
35 | zerodays = j['zerodays'] # 零增长天数
36 | single_data = [name, value, econNum, conadd, deathNum, cureNum, zerodays]
37 | lst.append(single_data)
38 | break
39 | else:
40 | continue
41 |
42 |
43 | def writeResult(lst, fpath):
44 | with open(fpath, 'a+', encoding='utf-8') as f:
45 | f.write('地区\t累计确诊\t现存确诊\t今日确诊\t累计死亡人数\t累计治愈\t零增长天数\n')
46 | for i in range(len(lst)):
47 | for j in range(len(lst[i])):
48 | f.write(str(lst[i][j]))
49 | f.write('\t')
50 | f.write('\n')
51 | lst.clear()
52 | f.close()
53 |
54 |
55 | if __name__ == '__main__':
56 | pagenum = 1
57 | output_file = 'D:/Personal/Desktop/fjyq.xls'
58 | final_data = []
59 | url = "https://gwpre.sina.cn/interface/fymap2020_data.json?_=1588258367647&callback=dataAPIData"
60 | getYqDate(final_data, url)
61 | writeResult(final_data, output_file)
--------------------------------------------------------------------------------
/scrapy/2019-nCov-cn/province.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import requests
4 |
5 |
6 | def getHTMLText(url):
7 | try:
8 | headers = {
9 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
10 | "Chrome/80.0.3987.163 Safari/537.36"}
11 | r = requests.get(url, timeout=30, headers=headers)
12 | r.raise_for_status()
13 | r.encoding = r.apparent_encoding
14 | data = re.search(r"\(+([^)]*)\)+", r.text).group(1)
15 | return data
16 | except:
17 | return ""
18 |
19 |
20 | def getYqDate(lst, YqURL):
21 | html = getHTMLText(YqURL)
22 | hjson = json.loads(html)
23 | a = hjson['data']['list']
24 | for i in a:
25 | try:
26 | name = i['name']  # province
27 | value = i['value']  # cumulative confirmed cases
28 | econNum = i['econNum']  # currently active confirmed cases
29 | conadd = i['conadd']  # newly confirmed today
30 | deathNum = i['deathNum']  # cumulative deaths
31 | cureNum = i['cureNum']  # cumulative recoveries
32 | zerodays = i['zerodays']  # consecutive days with zero new cases
33 | jwsrNum = i['jwsrNum']  # total imported cases
34 | single_data = [name, value, econNum, conadd, deathNum, cureNum, zerodays, jwsrNum]
35 | lst.append(single_data)
36 | except:
37 | continue
38 |
39 |
40 | def writeResult(lst, fpath):
41 | with open(fpath, 'a+', encoding='utf-8') as f:
42 | f.write('省份\t累计确诊\t现存确诊\t今日确诊\t累计死亡人数\t累计治愈\t零增长天数\t境外输入总数\n')
43 | for i in range(len(lst)):
44 | for j in range(len(lst[i])):
45 | f.write(str(lst[i][j]))
46 | f.write('\t')
47 | f.write('\n')
48 | lst.clear()
49 | f.close()
50 |
51 |
52 | if __name__ == '__main__':
53 | pagenum = 1
54 | output_file = 'D:/Personal/Desktop/yq.xls'
55 | final_data = []
56 | url = "https://gwpre.sina.cn/interface/fymap2020_data.json?_=1588258367647&callback=dataAPIData"
57 | getYqDate(final_data, url)
58 | writeResult(final_data, output_file)
--------------------------------------------------------------------------------
/scrapy/51job-scrapy/2020-09-25_python算法工程师岗位招聘信息.csv:
--------------------------------------------------------------------------------
1 | Python算法工程师,1-1.5万/月,厦门,1年经验,本科,招若干人,09-25发布,,,岗位职责:1、负责睡眠分期相关数据智能算法的研发和改进2、负责gsensor六轴传感器数据的分析3、有参与过手环数据分析相关项目的优先。能力要求:1、熟练掌握Pathon、Java、C++等至少一门编程语言;2、熟悉Tensorflow、Pytorch、Keras等至少一个深度学习开发框架;3、熟悉经典数据挖掘方法,数据分析方法;4、具备良好的沟通表达能力、严谨的工作态度及较强的执行力。任职要求:1、学历:本科(含)以上学历,多年工作经验优先;应届毕业生学历为硕士;2、专业:计算机技术、智能技术、软件工程等相关专业毕业。Python算法工程师,上班地址:厦门,厦门中翎易优创科技有限公司,民营公司,,计算机软件,厦门中翎易优创科技有限公司诚聘
2 | Python算法工程师,1.5-2万/月,杭州-江干区,2年经验,本科,招若干人,09-25发布,,"五险一金,员工旅游,餐饮补贴,专业培训,绩效奖金,年终奖金,定期体检",岗位要求:1.参与基于摄像机传输的图片和视频流,结合业务场景的图像分类,进行多种行业安防类算法、模型研发及优化。2.针对项目方向进行算法调研、评估,研发相应算法并进行优化,对现有算法进行优化。3.进行深度学习算法的应用型研发,特别是在计算机视觉领域的应用研究,以及模型加速、模型调优、模型量化等研发任职要求:1.计算机相关专业;2.具备良好的动手能力,熟练掌握C/C++、Python等语言。3.掌握一种以上深度学习框架,掌握OpenCV库的使用。4.掌握计算机视觉基础知识、深度学习、经典机器学习等,有一定的实践经验。5.具备一定科研能力,能快速理解paper,具备算法创新能力着优先。6.具备对现实问题抽象出数学模型的逻辑分析能力,并能够求解数学问题。7.具备良好的职业素养、优良的品行、善于团队合作、有严谨的工作作风。,上班地址:杭州经济技术开发区海达南路555号金沙大厦C幢11层,浙江正洁环境科技有限公司,民营公司,150-500人,环保, 浙江正洁环境科技有限公司是一家专业提供工业废水和城市污水处理运营服务的国家高新技术企业。正洁环境作为一家知名的环保水处理第三方综合运营服务商,聚焦食品、印染等行业的源头污水处理,以提升客户自身生产专注度为目标,向客户提供包括工艺设计、设备采购与集成、工程实施、后续运维等一体化专业服务。正洁环境将努力打造出一个全生态链的水处理技术与资源再生集成工艺,循环资源,为改善人类的生态居住环境作出贡献! 正洁环境先后获得了杭州市“雏鹰计划”培育企业、杭州市高新技术企业、国家高新技术企业、杭州市企业高新技术研发中心等荣誉称号,2016年,获第五届中国创新创业大赛节能环保行业优胜奖,浙江省排名第二。 正洁环境目前已有员工近200人,其中学士及以上学位人员占比70%,具有近90%的专业相关度。企业创新能力卓越,具有全国最完善的自有药剂研发、生产和应用体系以及最具应用能力的自有设备研发创新体系。正洁环境拥有的水处理特种药剂和水处理特种设备,在水处理技术上已覆盖了食品、印染、电镀、农村生活污水、实验室废液等多个行业,其中多项技术填补了目前国内环保行业的空白。 公司现已申请专利22项,申请保护16项软件著作权,先后获得建设部环保工程专项资质、环境污染治理设施运行服务资质、环境污染治理工程总承包资质、环境污染治理工程设计资质,荣获浙江省工商企业AA级守合同重信用单位,先后取得ISO9001质量管理体系、ISO14001环境管理体系、HHSAS18001职业健康安全管理体系、GB/T50430工程建设施工企业质量管理规范。
3 | python算法工程师,1.2-2万/月,西安-国家民用航天产业基地,3-4年经验,本科,招1人,09-25发布,,"五险一金,补充医疗保险,通讯补贴,定期体检",岗位职责: 1、参与项目需求分析,业务逻辑流程与设计,文档编写; 2、能独立解决实际开发过程碰到的各类问题; 3、负责数据分析云平台和数据存储系统的搭建; 4、搭建系统开发环境,完成系统框架和核心代码的实现,负责解决开发过程中的技术问题; 任职要求: 一、教育水平:本科以上学历,计算机、应用数学或模式识别相关专业;二、工作资历:2年及以上工作经验;三、专业技能:1、有机器学习或推荐系统研究背景,有深度学习相关科研经历。2、对电商风控、广告预测、推荐系统等任何一个领域有实践经验;3、扎实的计算机基础,熟悉Python或C/C++,有较好的算法实现能力。3、精通使用一种开源的深度学习框架解决实际项目问题,如caffe、tensorflow、Mxnet、pytorch;4、精通SQL开发,精通Mysql、Oracle等关系型数据库中的一种;四、其他要求:1、较强的逻辑思维能力、学习能力;2、良好的英文听说读写能力;,上班地址:飞天路588号北航科技园,上海今日信息科技有限公司,民营公司,50-150人,计算机软件,上海今日信息科技有限公司成立于2014年8月1日成立,注册资金1000万元,是其上级今日集团为其新业务方向而成立的以IT增值业务为核心的IT服务机构。今日信息专注于IT咨询、IT基础构架建设、信息系统集成与虚拟化云计算、行业应用系统开发、IT服务运维等业务。今日信息与Oracle、CISCO、EMC、HP、Vmware、IBM、Microsoft、IMPERVA、APC 等多家国际知名厂商开展多方位的合作。主要增值业务: 基于Oracle IDM身份管理套件,结合国内企业单位信息系统的使用和管理现状,实现企业单位的身份管理、访问控制管理、用户的全生命周期管理等,为企业单位信息系统安全集成、风险管理控制等方面提供咨询与解决方案 基于全球领先的Oracle Cloud平台,我们提供企业级应用系统解决方案的咨询、项目实施、软件开发外包、IT领域的技术研发外包以及人力资源服务。 今日信息为客户提供端到端业务解决方案和服务,涵盖企业资源管理,人力资本管理、商务智能、客户关系管理、供应链管理等领域。 Oracle Primavera系列软件专注于项目密集型企业,其整个项目生命周期内所有项目的组合管理。
4 | python算法工程师,1-1.5万/月,上海-浦东新区,3-4年经验,大专,招若干人,09-25发布,,"五险一金,员工旅游,定期体检,年终奖金,免费班车,专业培训,带薪年假,过节福利,人才推荐奖",1.本科3年以上工作经验,近1年从事python 算法实现开发。2.python语言基础扎实,清晰掌握python中核心库包,能够独立设计并实现正则式,掌握多种代码技巧,熟悉形式语言,熟练使用jupyter、pycharm等python工作环境。3.掌握分布式原理,能够解读从逻辑设计,到框架机制、到代码实现,再到物理支撑的全链路流转。4.数据结构、算法理论基础扎实,掌握python主流数据处理库包,如pandas、numpy,能够独立完成代码调优,发现已有代码问题并制定修改方案。5.有较好的软件工程理论实践经验,熟悉面向对象分析设计、面向过程分析设计、面向接口分析设计中的一种,能够将分析设计表达为UML相应图表,掌握数据流分析,熟悉设计模式。6.熟悉大数据,MySQL数据库,SQL基础扎实。7.具有至少1个中大型python工程研发实现经验(6个月以上研发周期,5人以上python研发团队)。8.有Java、C++研发经验者优先。,上班地址:上海市浦东新区乐昌路399号上海期货所。,上海速强信息技术股份有限公司,民营公司,150-500人,计算机软件, 上海速强信息技术股份有限公司成立于2005年,是一家以全新软件开发理念为主导,致力于建立软件生产新模式的新兴IT公司。 目前公司的主要客户涵盖金融保险、电子商务等行业。专业的研发团队 公司核心技术、管理团队是由具有多年IT行业背景和管理经验的职业经理人组成,具有多年行业应用实施经验。凭借自身多年的软件实施经验,速强信息已形成完善的技术应用体系,包括:Java、. Net等主流开发平台、基于SQL Server、ORACLE、MYSQL等数据库平台,UML面向对象的设计等。 有形的专业技术,无限的创新空间,秉承“真诚、合作、创新”的理念,不断吸引IT人才加盟,创造工作的乐趣。快速的服务响应 客户的需求和满意是速强服务评价的***标准,我们是客户需求的倾听者,是问题的诊断者,也是问题的解决者。作为一家以客户至上为宗旨的软件企业,速强信息对客户的需求做出最及时、最快速的反应,公司对客户承诺48小时内的到场服务;质量至上,多层面系列化的产品服务,建立以客户需求为中心,以市场需求为导向的团队服务体系。 我们的服务口号:只有客户的成功、才是我们的成功!和谐的人文环境 速强信息顺应网络社会、知识经济、人性文化、数字生存的时代潮流,满足人类追求成功、渴求友谊、享受自由、勇于探索、彰显个性的内在需求,以灵活的软件产品和人性的服务,使客户实现工作上的‘智能化、网络化、个性化’;同时,速强更注重人才对软件发展的作用,秉承‘以人为本’的思想,为员工营造和谐、愉快的工作环境,努力为软件产业培养人才。 我们不是最成功的企业,但珍惜与您的每次合作;我们不是***的公司,但一定是您最忠实的倾听者!
5 | Python算法工程师(高级工程师),1-1.5万/月,昆山,3-4年经验,本科,招1人,09-24发布,,"五险一金,员工旅游,出国机会,专业培训,股票期权,包住宿,年终奖金,餐饮补贴","岗位职责:1.参与产品需求、研发设计的相关讨论;2.负责数据产品业务数据的挖掘和分析工作;3.负责机器视觉产品算法研究工作;任职要求:1、具有python开发数据处理软件/机器视觉软件的经验;2、精通Python,掌握opencv, numpy,scipy,matplotlib,pandas等图像和数据处理方面常用的第三方python库;3、熟悉至少一种Sql数据库 (mysql/postgresql/sqlserver/oracle);4. 掌握Tensorflow/Pytorch一种或多种深度学习框架;4、热爱编程、具有良好的代码风格;5、做事具有条理性,具有良好的自学能力、分析问题以及解决问题的能力。",上班地址:昆山开发区春旭路18号(联彩商务中心)2201、2202室,昆山润石智能科技有限公司,外资(非欧美),少于50人,"电子技术/半导体/集成电路,计算机软件",昆山润石智能科技坐落于昆山经济开发区,为泛半导体行业客户提供智能制造一站式解决方案。公司集智能制造需求硬件&软件的研发、销售、服务于一体。团队成员多为两岸半导体与面板产业人才,具备高端智能制造、制程研发、整合及半导体设备知识,在研发、管理、营销方面经验丰富。公司在智能制造方案中产品线齐全,将IoT、Bigdata、AI三环节打通形成闭环,主要产品有:数据采集与边缘计算系列产品(IOT)、工业流程自动化虚拟机器人(RPA)、智能缺陷自动分类系统(AiDC),为高端制造业客户提升生产良率及设备稼动率,提高生产力,助力客户达成智能制造迈向工业4.0。公司希望通过三到五年的努力,将国外昂贵的智能制造方案国产化,为中国的制造业升级贡献一份力量。
6 | Python算法工程师,1.2-2万/月,天津-西青区,2年经验,本科,招2人,09-23发布,,"五险一金,餐饮补贴,通讯补贴,专业培训,绩效奖金,年终奖金,员工旅游,节假日福利,带薪年假,员工聚餐",岗位职责:1、开展数据分析、机器学习算法的研发与应用;2、参与公司相关软件产品的研发;3、参与软件产品相关文档的撰写。岗位要求:1、计算机科学与技术等相关专业本科及以上学历;2、2年以上Python程序开发经验;3、有较强的算法分析和实现能力;4、善于分析和解决问题,学习能力强,良好的团队合作精神。优先考虑:1、精通机器学习及数据处理工具(pandas、numpy、sklearn等);2、熟悉Linux环境下编程,了解docker、git等工具。,上班地址:华苑产业区兰苑路5号留学生创业园D座1002,深析智能科技有限公司,民营公司,50-150人,"计算机软件,医疗设备/器械",深析智能科技有限公司(DeepCyto)成立于2018年,是一家将人工智能与深度学习技术引入智能医学诊断和医学数据分析领域的医疗科技公司。深析人工智能基于真实、准确、海量的临床数据,融合机器视觉、深度学习及大数据挖掘技术,对血液病理检测的细胞形态学、流式细胞学、细胞遗传学、分子生物学等的数据进行智能识别和分析,为广大医疗机构以及第三方医学实验室提供定制MICM全流程AI辅助诊断产品和技术服务。深析核心团队包含国际人工智能专家和国内血液病理学专家,拥有技术研发、血液病理诊断、临床诊疗和医学管理等复合型人才团队。目前,深析已经获得***VC(软银,元生)天使轮投资。深析的AI系统在血液病理人工智能诊断领域,尤其在***细胞的自动分割和分类计数、流式细胞学的自动设门和分群定量等方面已取得令人瞩目的成果。目前,深析智能已经联合中国医学科学院血液病医院、北京大学***医院、上海长海医院、广州中山一附院等***三甲医院,开展了多中心临床验证试验。经临床数据比对,流式细胞学的Deepflow软件对急性白血病的诊断准确率高达95%,比人工诊断速度提高约100倍, CellCounter***细胞形态学AI自动扫描分析形态的准确率达到97%,诊断速度提高约10倍。未来,深析将进一步开发染色体核型智能分析等产品,实现血液病理的智能化综合诊断,提高诊断的准确和效率,促进优质医疗资源下沉,以AI技术重塑中国未来医疗体系。
7 | python算法工程师,5-9千/月,无锡-无锡新区,1年经验,本科,招1人,09-23发布,,"五险一金,补充医疗保险",1)参与项目设计相关工作、程序的开发工作;2)根据公司开发规范及流程,能独立完成子功能的设计、编码、测试及相关文档编写;3)代码编写;4)完成单元测试,并配合进行其他测试和交付工作;5)部门新技术调研和实践。任职资格及素质要求:1、熟悉Python编程、Django或Tornado等web框架;2、熟悉SpringBoot、MongoDB、Linux等技术,掌握面向对象编程思想;3、 有图像处理、人工智能、自然语音处理经历优先;4、熟悉devops思想并有丰富经验者优先;5、自学能力强,愿意接收新技术。,上班地址:无锡市新吴区菱湖大道200号中国传感网国际创新园E1栋,隆正信息科技有限公司,民营公司,150-500人,"计算机软件,计算机服务(系统、数据服务、维修)",隆正互联是一家专门面向大型商业用户及行业云平台(to big B/I)的专业技术公司。公司的基本目标是:面向金融、通讯、能源、交通等行业,定位全国前100家大型行业客户(如工行、中行、国寿、联通、移动、中航信等),通过云平台支持的超级软件工厂,实现高端软件产品的集约化设计、制造、及交付,引进包括人工智能在内的先进技术,从根本上提高国产软件制作水平,打造高端软件制造生态链。 公司的愿景是: 1、 建立知识驱动的、AI导向的、深度共享协同的专业软件交付云平台。 2、 依托云平台支撑的超级软件工厂及渠道、交付、运维集群,实现行业商用软件及服务的一体化支撑与专业交付。 3、 面向重点行业,建立集约化、标准化、一体化的商用软件生态。 公司的总注册资金一亿元,各功能集群及商务中心在北京,超级软件工厂设在无锡,计划占地500亩以上,园区规划建筑积面四十万平米,能够容纳3万名软件从业人员。 如果您和我们拥有共同的理想,希望成为隆正互联的一员,那么您还在等什么?!快快发送您的简历,与我们共创未来!
8 | 高级Python算法工程师,2-2.5万/月,上海-静安区,3-4年经验,本科,招2人,09-18发布,,"五险一金,专业培训,弹性工作,年终奖金,绩效奖金",职位信息1、负责自然语言处理,文本挖掘,机器学习等工作;2、从事数学建模及数据挖掘应用方法研究;3、根据业务需求,研究开发相关算法模型岗位要求1、重点全日制本科及以上学历,计算机或数学相关专业;2、4~5年以上相关工作经验,熟悉Python编程语言,具备Python算法设计与程序开发能力;3、熟悉数据结构,具备清晰的编程思路、良好的编码风格,能胜任复杂模块的编码;4、对NLP有较深入的研究;5、熟悉深度学习工具Tensoerflow/Caffe/Torch者优先;6、良好的沟通能力,学习能力。7、必须在(文本分析,语义分析,语义计算)有一个或多个作品。8、具备良好的沟通协调能力和团队合作意识,工作踏实,态度积极,能够承受工作压力;,上班地址:江场路1313号金谷中环大厦6楼,亿翰智库,上市公司,150-500人,"专业服务(咨询、人力资源、财会),房地产", 公司简介亿翰股份(837350)于2016年5月成为首支登陆资本市场、专注中国房地产战略服务的企业。亿翰时刻关注中国房地产企业发展动态,深刻洞察行业发展趋势与先机,业务范围横跨地产、科技和资本三大领域;为超过70%的百强房企提供系统化的战略顾问服务;公司根植上海,雄踞长三角辐射全中国,现北京、成都、武汉和深圳已设有四家分支机构。亿翰股份每月发布发的《中国典型房企销售业绩排行榜》曝光量超千万,深受行业及资本界关注;有意者请将简历投递至邮箱:yihanzhiku@ehconsulting.com.cn公司人事部电话:021-61552731公司地址:上海市静安区江场路1313号金谷中环大厦6楼公司官网:www.ehconsulting.com.cn集团公众号:ehresearch简历发送格式:姓名+学校+专业+意向职位+信息来源+工作地点如果你足够的自信、优秀,积极上进。把你的简历投进亿翰股份,亿翰股份欢迎你的加入!北京公司地址:北京市朝阳区京信大厦2141室深圳公司地址:深圳市南山区粤兴六路中科纳能大厦409武汉公司地址:武汉市武昌区万达汉街总部国际B座2605成都公司地址:成都市武侯区成都蜀锦路88号“楚峰国际中心” 楼 903
9 | python算法工程师,0.8-1.2万/月,福州-鼓楼区,2年经验,大专,招3人,08-28发布,,"餐饮补贴,交通补贴,年终奖金,五险","任职要求:计算机相关专业本科以上学历,技术能力优秀者可放宽要求 精通Python 语言,熟悉Linux系统,熟悉基于Python的Web开发技术,熟悉Flask、Django、Tornado等常见架构至少一种;有志从事自动化与智能化开发,对语言学习有热情拥有较好的学习能力、良好的代码习惯和团队合作能力,善于沟通,逻辑清晰,能独立解决问题具备图像识别或机器学习相关知识者,有从事过人工智能(Python)上的相关人士优先岗位职责:1、根据开发进度和任务分配按时按量完成相应模块软件的设计、开发;2、负责现有程序的维护、更新与问题排除",上班地址:软件大道89号福州软件园B区4号楼,福建睿思特科技股份有限公司,民营公司,50-150人,"计算机软件,计算机服务(系统、数据服务、维修)",福建睿思特科技股份有限公司,总部位于福州软件园,专业从事智慧城市生态产业链、城市智能化整体解决方案的服务商。睿思特坚持以自主创新和行业应用为基础,以硬件、软件、服务三位一体构筑核心竞争力,为客户提供优质的产品和服务。睿思特依托物联网、云计算、大数据、人工智能、区块链等新一代信息技术,为环保、水利、交通、城管、电力等智慧城市领域提供软硬件一体化的整体解决方案。睿思特拥有结构化的研发、营销、应用服务和供应链团队,专注于为各行业用户提供领先的技术应用服务和绿色智慧城市解决方案。睿思特致力打造成为国内一流、国际领先的智慧产业互联网龙头企业,助力各地智慧城市建设。睿思特通过开放式创新、卓越运营管理、人力资源发展等战略的实施,全面构造公司的核心竞争力,创造客户和社会的价值,从而实现技术的价值。致力于成为最受社会、客户、股东和员工尊敬的公司,并通过组织与过程的持续改进,领导力与员工竞争力的发展,联盟与开放式创新,使公司成为优秀的城市智能化整体解决方案和服务提供商。
10 | Python算法工程师,1.5-2万/月,上海-静安区,2年经验,本科,招5人,09-18发布,,"五险一金,专业培训,弹性工作,年终奖金,绩效奖金",职位信息1、负责自然语言处理,文本挖掘,机器学习等工作; 2、从事数学建模及数据挖掘应用方法研究; 3、根据业务需求,研究开发相关算法模型岗位要求1、全日制本科及以上学历,计算机或数学相关专业; 2、2-3年以上相关工作经验,熟悉Python编程语言,具备Python算法设计与程序开发能力; 3、熟悉数据结构,具备清晰的编程思路、良好的编码风格,能胜任复杂模块的编码; 4、对NLP有较深入的研究; 5、熟悉深度学习工具Tensoerflow/Caffe/Torch者优先; 6、良好的沟通能力,学习能力。 7、必须在(文本分析,语义分析,语义计算)有一个或多个作品。 8、具备良好的沟通协调能力和团队合作意识,工作踏实,态度积极,能够承受工作压力; ,上班地址:江场路1313号金谷中环大厦6楼,亿翰智库,上市公司,150-500人,"专业服务(咨询、人力资源、财会),房地产", 公司简介亿翰股份(837350)于2016年5月成为首支登陆资本市场、专注中国房地产战略服务的企业。亿翰时刻关注中国房地产企业发展动态,深刻洞察行业发展趋势与先机,业务范围横跨地产、科技和资本三大领域;为超过70%的百强房企提供系统化的战略顾问服务;公司根植上海,雄踞长三角辐射全中国,现北京、成都、武汉和深圳已设有四家分支机构。亿翰股份每月发布发的《中国典型房企销售业绩排行榜》曝光量超千万,深受行业及资本界关注;有意者请将简历投递至邮箱:yihanzhiku@ehconsulting.com.cn公司人事部电话:021-61552731公司地址:上海市静安区江场路1313号金谷中环大厦6楼公司官网:www.ehconsulting.com.cn集团公众号:ehresearch简历发送格式:姓名+学校+专业+意向职位+信息来源+工作地点如果你足够的自信、优秀,积极上进。把你的简历投进亿翰股份,亿翰股份欢迎你的加入!北京公司地址:北京市朝阳区京信大厦2141室深圳公司地址:深圳市南山区粤兴六路中科纳能大厦409武汉公司地址:武汉市武昌区万达汉街总部国际B座2605成都公司地址:成都市武侯区成都蜀锦路88号“楚峰国际中心” 楼 903
11 |
--------------------------------------------------------------------------------
/scrapy/51job-scrapy/2020-09-27_python算法工程师岗位招聘信息.csv:
--------------------------------------------------------------------------------
1 | Python算法工程师,1-1.5万/月,厦门,1年经验,本科,招若干人,09-27发布,,,岗位职责:1、负责睡眠分期相关数据智能算法的研发和改进2、负责gsensor六轴传感器数据的分析3、有参与过手环数据分析相关项目的优先。能力要求:1、熟练掌握Pathon、Java、C++等至少一门编程语言;2、熟悉Tensorflow、Pytorch、Keras等至少一个深度学习开发框架;3、熟悉经典数据挖掘方法,数据分析方法;4、具备良好的沟通表达能力、严谨的工作态度及较强的执行力。任职要求:1、学历:本科(含)以上学历,多年工作经验优先;应届毕业生学历为硕士;2、专业:计算机技术、智能技术、软件工程等相关专业毕业。Python算法工程师,上班地址:厦门,厦门中翎易优创科技有限公司,民营公司,,计算机软件,厦门中翎易优创科技有限公司诚聘,https://jobs.51job.com/xiamen/124622088.html?s=01&t=0
2 | python算法工程师,1-1.5万/月,上海-浦东新区,3-4年经验,大专,招若干人,09-27发布,,"五险一金,员工旅游,定期体检,年终奖金,免费班车,专业培训,带薪年假,过节福利,人才推荐奖",1.本科3年以上工作经验,近1年从事python 算法实现开发。2.python语言基础扎实,清晰掌握python中核心库包,能够独立设计并实现正则式,掌握多种代码技巧,熟悉形式语言,熟练使用jupyter、pycharm等python工作环境。3.掌握分布式原理,能够解读从逻辑设计,到框架机制、到代码实现,再到物理支撑的全链路流转。4.数据结构、算法理论基础扎实,掌握python主流数据处理库包,如pandas、numpy,能够独立完成代码调优,发现已有代码问题并制定修改方案。5.有较好的软件工程理论实践经验,熟悉面向对象分析设计、面向过程分析设计、面向接口分析设计中的一种,能够将分析设计表达为UML相应图表,掌握数据流分析,熟悉设计模式。6.熟悉大数据,MySQL数据库,SQL基础扎实。7.具有至少1个中大型python工程研发实现经验(6个月以上研发周期,5人以上python研发团队)。8.有Java、C++研发经验者优先。,上班地址:上海市浦东新区乐昌路399号上海期货所。,上海速强信息技术股份有限公司,民营公司,150-500人,计算机软件, 上海速强信息技术股份有限公司成立于2005年,是一家以全新软件开发理念为主导,致力于建立软件生产新模式的新兴IT公司。 目前公司的主要客户涵盖金融保险、电子商务等行业。专业的研发团队 公司核心技术、管理团队是由具有多年IT行业背景和管理经验的职业经理人组成,具有多年行业应用实施经验。凭借自身多年的软件实施经验,速强信息已形成完善的技术应用体系,包括:Java、. Net等主流开发平台、基于SQL Server、ORACLE、MYSQL等数据库平台,UML面向对象的设计等。 有形的专业技术,无限的创新空间,秉承“真诚、合作、创新”的理念,不断吸引IT人才加盟,创造工作的乐趣。快速的服务响应 客户的需求和满意是速强服务评价的***标准,我们是客户需求的倾听者,是问题的诊断者,也是问题的解决者。作为一家以客户至上为宗旨的软件企业,速强信息对客户的需求做出最及时、最快速的反应,公司对客户承诺48小时内的到场服务;质量至上,多层面系列化的产品服务,建立以客户需求为中心,以市场需求为导向的团队服务体系。 我们的服务口号:只有客户的成功、才是我们的成功!和谐的人文环境 速强信息顺应网络社会、知识经济、人性文化、数字生存的时代潮流,满足人类追求成功、渴求友谊、享受自由、勇于探索、彰显个性的内在需求,以灵活的软件产品和人性的服务,使客户实现工作上的‘智能化、网络化、个性化’;同时,速强更注重人才对软件发展的作用,秉承‘以人为本’的思想,为员工营造和谐、愉快的工作环境,努力为软件产业培养人才。 我们不是最成功的企业,但珍惜与您的每次合作;我们不是***的公司,但一定是您最忠实的倾听者!,https://jobs.51job.com/shanghai-pdxq/124472415.html?s=01&t=0
3 | Python算法工程师,1.5-2万/月,杭州-江干区,2年经验,本科,招若干人,09-27发布,,"五险一金,员工旅游,餐饮补贴,专业培训,绩效奖金,年终奖金,定期体检",岗位要求:1.参与基于摄像机传输的图片和视频流,结合业务场景的图像分类,进行多种行业安防类算法、模型研发及优化。2.针对项目方向进行算法调研、评估,研发相应算法并进行优化,对现有算法进行优化。3.进行深度学习算法的应用型研发,特别是在计算机视觉领域的应用研究,以及模型加速、模型调优、模型量化等研发任职要求:1.计算机相关专业;2.具备良好的动手能力,熟练掌握C/C++、Python等语言。3.掌握一种以上深度学习框架,掌握OpenCV库的使用。4.掌握计算机视觉基础知识、深度学习、经典机器学习等,有一定的实践经验。5.具备一定科研能力,能快速理解paper,具备算法创新能力着优先。6.具备对现实问题抽象出数学模型的逻辑分析能力,并能够求解数学问题。7.具备良好的职业素养、优良的品行、善于团队合作、有严谨的工作作风。,上班地址:杭州经济技术开发区海达南路555号金沙大厦C幢11层,浙江正洁环境科技有限公司,民营公司,150-500人,环保, 浙江正洁环境科技有限公司是一家专业提供工业废水和城市污水处理运营服务的国家高新技术企业。正洁环境作为一家知名的环保水处理第三方综合运营服务商,聚焦食品、印染等行业的源头污水处理,以提升客户自身生产专注度为目标,向客户提供包括工艺设计、设备采购与集成、工程实施、后续运维等一体化专业服务。正洁环境将努力打造出一个全生态链的水处理技术与资源再生集成工艺,循环资源,为改善人类的生态居住环境作出贡献! 正洁环境先后获得了杭州市“雏鹰计划”培育企业、杭州市高新技术企业、国家高新技术企业、杭州市企业高新技术研发中心等荣誉称号,2016年,获第五届中国创新创业大赛节能环保行业优胜奖,浙江省排名第二。 正洁环境目前已有员工近200人,其中学士及以上学位人员占比70%,具有近90%的专业相关度。企业创新能力卓越,具有全国最完善的自有药剂研发、生产和应用体系以及最具应用能力的自有设备研发创新体系。正洁环境拥有的水处理特种药剂和水处理特种设备,在水处理技术上已覆盖了食品、印染、电镀、农村生活污水、实验室废液等多个行业,其中多项技术填补了目前国内环保行业的空白。 公司现已申请专利22项,申请保护16项软件著作权,先后获得建设部环保工程专项资质、环境污染治理设施运行服务资质、环境污染治理工程总承包资质、环境污染治理工程设计资质,荣获浙江省工商企业AA级守合同重信用单位,先后取得ISO9001质量管理体系、ISO14001环境管理体系、HHSAS18001职业健康安全管理体系、GB/T50430工程建设施工企业质量管理规范。,https://jobs.51job.com/hangzhou-jgq/118119462.html?s=01&t=0
4 | python算法工程师,1.2-2万/月,西安-国家民用航天产业基地,3-4年经验,本科,招1人,09-25发布,,"五险一金,补充医疗保险,通讯补贴,定期体检",岗位职责: 1、参与项目需求分析,业务逻辑流程与设计,文档编写; 2、能独立解决实际开发过程碰到的各类问题; 3、负责数据分析云平台和数据存储系统的搭建; 4、搭建系统开发环境,完成系统框架和核心代码的实现,负责解决开发过程中的技术问题; 任职要求: 一、教育水平:本科以上学历,计算机、应用数学或模式识别相关专业;二、工作资历:2年及以上工作经验;三、专业技能:1、有机器学习或推荐系统研究背景,有深度学习相关科研经历。2、对电商风控、广告预测、推荐系统等任何一个领域有实践经验;3、扎实的计算机基础,熟悉Python或C/C++,有较好的算法实现能力。3、精通使用一种开源的深度学习框架解决实际项目问题,如caffe、tensorflow、Mxnet、pytorch;4、精通SQL开发,精通Mysql、Oracle等关系型数据库中的一种;四、其他要求:1、较强的逻辑思维能力、学习能力;2、良好的英文听说读写能力;,上班地址:飞天路588号北航科技园,上海今日信息科技有限公司,民营公司,50-150人,计算机软件,上海今日信息科技有限公司成立于2014年8月1日成立,注册资金1000万元,是其上级今日集团为其新业务方向而成立的以IT增值业务为核心的IT服务机构。今日信息专注于IT咨询、IT基础构架建设、信息系统集成与虚拟化云计算、行业应用系统开发、IT服务运维等业务。今日信息与Oracle、CISCO、EMC、HP、Vmware、IBM、Microsoft、IMPERVA、APC 等多家国际知名厂商开展多方位的合作。主要增值业务: 基于Oracle IDM身份管理套件,结合国内企业单位信息系统的使用和管理现状,实现企业单位的身份管理、访问控制管理、用户的全生命周期管理等,为企业单位信息系统安全集成、风险管理控制等方面提供咨询与解决方案 基于全球领先的Oracle Cloud平台,我们提供企业级应用系统解决方案的咨询、项目实施、软件开发外包、IT领域的技术研发外包以及人力资源服务。 今日信息为客户提供端到端业务解决方案和服务,涵盖企业资源管理,人力资本管理、商务智能、客户关系管理、供应链管理等领域。 Oracle Primavera系列软件专注于项目密集型企业,其整个项目生命周期内所有项目的组合管理。,https://jobs.51job.com/xian-gjmyht/124565026.html?s=01&t=0
5 | Python算法工程师(高级工程师),1-1.5万/月,昆山,3-4年经验,本科,招1人,09-24发布,,"五险一金,员工旅游,出国机会,专业培训,股票期权,包住宿,年终奖金,餐饮补贴","岗位职责:1.参与产品需求、研发设计的相关讨论;2.负责数据产品业务数据的挖掘和分析工作;3.负责机器视觉产品算法研究工作;任职要求:1、具有python开发数据处理软件/机器视觉软件的经验;2、精通Python,掌握opencv, numpy,scipy,matplotlib,pandas等图像和数据处理方面常用的第三方python库;3、熟悉至少一种Sql数据库 (mysql/postgresql/sqlserver/oracle);4. 掌握Tensorflow/Pytorch一种或多种深度学习框架;4、热爱编程、具有良好的代码风格;5、做事具有条理性,具有良好的自学能力、分析问题以及解决问题的能力。",上班地址:昆山开发区春旭路18号(联彩商务中心)2201、2202室,昆山润石智能科技有限公司,外资(非欧美),少于50人,"电子技术/半导体/集成电路,计算机软件",昆山润石智能科技坐落于昆山经济开发区,为泛半导体行业客户提供智能制造一站式解决方案。公司集智能制造需求硬件&软件的研发、销售、服务于一体。团队成员多为两岸半导体与面板产业人才,具备高端智能制造、制程研发、整合及半导体设备知识,在研发、管理、营销方面经验丰富。公司在智能制造方案中产品线齐全,将IoT、Bigdata、AI三环节打通形成闭环,主要产品有:数据采集与边缘计算系列产品(IOT)、工业流程自动化虚拟机器人(RPA)、智能缺陷自动分类系统(AiDC),为高端制造业客户提升生产良率及设备稼动率,提高生产力,助力客户达成智能制造迈向工业4.0。公司希望通过三到五年的努力,将国外昂贵的智能制造方案国产化,为中国的制造业升级贡献一份力量。,https://jobs.51job.com/kunshan/122280284.html?s=01&t=0
6 | Python算法工程师,1.2-2万/月,天津-西青区,2年经验,本科,招2人,09-23发布,,"五险一金,餐饮补贴,通讯补贴,专业培训,绩效奖金,年终奖金,员工旅游,节假日福利,带薪年假,员工聚餐",岗位职责:1、开展数据分析、机器学习算法的研发与应用;2、参与公司相关软件产品的研发;3、参与软件产品相关文档的撰写。岗位要求:1、计算机科学与技术等相关专业本科及以上学历;2、2年以上Python程序开发经验;3、有较强的算法分析和实现能力;4、善于分析和解决问题,学习能力强,良好的团队合作精神。优先考虑:1、精通机器学习及数据处理工具(pandas、numpy、sklearn等);2、熟悉Linux环境下编程,了解docker、git等工具。,上班地址:华苑产业区兰苑路5号留学生创业园D座1002,深析智能科技有限公司,民营公司,50-150人,"计算机软件,医疗设备/器械",深析智能科技有限公司(DeepCyto)成立于2018年,是一家将人工智能与深度学习技术引入智能医学诊断和医学数据分析领域的医疗科技公司。深析人工智能基于真实、准确、海量的临床数据,融合机器视觉、深度学习及大数据挖掘技术,对血液病理检测的细胞形态学、流式细胞学、细胞遗传学、分子生物学等的数据进行智能识别和分析,为广大医疗机构以及第三方医学实验室提供定制MICM全流程AI辅助诊断产品和技术服务。深析核心团队包含国际人工智能专家和国内血液病理学专家,拥有技术研发、血液病理诊断、临床诊疗和医学管理等复合型人才团队。目前,深析已经获得***VC(软银,元生)天使轮投资。深析的AI系统在血液病理人工智能诊断领域,尤其在***细胞的自动分割和分类计数、流式细胞学的自动设门和分群定量等方面已取得令人瞩目的成果。目前,深析智能已经联合中国医学科学院血液病医院、北京大学***医院、上海长海医院、广州中山一附院等***三甲医院,开展了多中心临床验证试验。经临床数据比对,流式细胞学的Deepflow软件对急性白血病的诊断准确率高达95%,比人工诊断速度提高约100倍, CellCounter***细胞形态学AI自动扫描分析形态的准确率达到97%,诊断速度提高约10倍。未来,深析将进一步开发染色体核型智能分析等产品,实现血液病理的智能化综合诊断,提高诊断的准确和效率,促进优质医疗资源下沉,以AI技术重塑中国未来医疗体系。,https://jobs.51job.com/tianjin-xqq/123554406.html?s=01&t=0
7 | python算法工程师,5-9千/月,无锡-无锡新区,1年经验,本科,招1人,09-23发布,,"五险一金,补充医疗保险",1)参与项目设计相关工作、程序的开发工作;2)根据公司开发规范及流程,能独立完成子功能的设计、编码、测试及相关文档编写;3)代码编写;4)完成单元测试,并配合进行其他测试和交付工作;5)部门新技术调研和实践。任职资格及素质要求:1、熟悉Python编程、Django或Tornado等web框架;2、熟悉SpringBoot、MongoDB、Linux等技术,掌握面向对象编程思想;3、 有图像处理、人工智能、自然语音处理经历优先;4、熟悉devops思想并有丰富经验者优先;5、自学能力强,愿意接收新技术。,上班地址:无锡市新吴区菱湖大道200号中国传感网国际创新园E1栋,隆正信息科技有限公司,民营公司,150-500人,"计算机软件,计算机服务(系统、数据服务、维修)",隆正互联是一家专门面向大型商业用户及行业云平台(to big B/I)的专业技术公司。公司的基本目标是:面向金融、通讯、能源、交通等行业,定位全国前100家大型行业客户(如工行、中行、国寿、联通、移动、中航信等),通过云平台支持的超级软件工厂,实现高端软件产品的集约化设计、制造、及交付,引进包括人工智能在内的先进技术,从根本上提高国产软件制作水平,打造高端软件制造生态链。 公司的愿景是: 1、 建立知识驱动的、AI导向的、深度共享协同的专业软件交付云平台。 2、 依托云平台支撑的超级软件工厂及渠道、交付、运维集群,实现行业商用软件及服务的一体化支撑与专业交付。 3、 面向重点行业,建立集约化、标准化、一体化的商用软件生态。 公司的总注册资金一亿元,各功能集群及商务中心在北京,超级软件工厂设在无锡,计划占地500亩以上,园区规划建筑积面四十万平米,能够容纳3万名软件从业人员。 如果您和我们拥有共同的理想,希望成为隆正互联的一员,那么您还在等什么?!快快发送您的简历,与我们共创未来!,https://jobs.51job.com/wuxi-wxxq/122456199.html?s=01&t=0
8 | 高级Python算法工程师,2-2.5万/月,上海-静安区,3-4年经验,本科,招2人,09-18发布,,"五险一金,专业培训,弹性工作,年终奖金,绩效奖金",职位信息1、负责自然语言处理,文本挖掘,机器学习等工作;2、从事数学建模及数据挖掘应用方法研究;3、根据业务需求,研究开发相关算法模型岗位要求1、重点全日制本科及以上学历,计算机或数学相关专业;2、4~5年以上相关工作经验,熟悉Python编程语言,具备Python算法设计与程序开发能力;3、熟悉数据结构,具备清晰的编程思路、良好的编码风格,能胜任复杂模块的编码;4、对NLP有较深入的研究;5、熟悉深度学习工具Tensoerflow/Caffe/Torch者优先;6、良好的沟通能力,学习能力。7、必须在(文本分析,语义分析,语义计算)有一个或多个作品。8、具备良好的沟通协调能力和团队合作意识,工作踏实,态度积极,能够承受工作压力;,上班地址:江场路1313号金谷中环大厦6楼,亿翰智库,上市公司,150-500人,"专业服务(咨询、人力资源、财会),房地产", 公司简介亿翰股份(837350)于2016年5月成为首支登陆资本市场、专注中国房地产战略服务的企业。亿翰时刻关注中国房地产企业发展动态,深刻洞察行业发展趋势与先机,业务范围横跨地产、科技和资本三大领域;为超过70%的百强房企提供系统化的战略顾问服务;公司根植上海,雄踞长三角辐射全中国,现北京、成都、武汉和深圳已设有四家分支机构。亿翰股份每月发布发的《中国典型房企销售业绩排行榜》曝光量超千万,深受行业及资本界关注;有意者请将简历投递至邮箱:yihanzhiku@ehconsulting.com.cn公司人事部电话:021-61552731公司地址:上海市静安区江场路1313号金谷中环大厦6楼公司官网:www.ehconsulting.com.cn集团公众号:ehresearch简历发送格式:姓名+学校+专业+意向职位+信息来源+工作地点如果你足够的自信、优秀,积极上进。把你的简历投进亿翰股份,亿翰股份欢迎你的加入!北京公司地址:北京市朝阳区京信大厦2141室深圳公司地址:深圳市南山区粤兴六路中科纳能大厦409武汉公司地址:武汉市武昌区万达汉街总部国际B座2605成都公司地址:成都市武侯区成都蜀锦路88号“楚峰国际中心” 楼 903,https://jobs.51job.com/shanghai-jaq/117355255.html?s=01&t=0
9 | python算法工程师,0.8-1.2万/月,福州-鼓楼区,2年经验,大专,招3人,08-28发布,,"餐饮补贴,交通补贴,年终奖金,五险","任职要求:计算机相关专业本科以上学历,技术能力优秀者可放宽要求 精通Python 语言,熟悉Linux系统,熟悉基于Python的Web开发技术,熟悉Flask、Django、Tornado等常见架构至少一种;有志从事自动化与智能化开发,对语言学习有热情拥有较好的学习能力、良好的代码习惯和团队合作能力,善于沟通,逻辑清晰,能独立解决问题具备图像识别或机器学习相关知识者,有从事过人工智能(Python)上的相关人士优先岗位职责:1、根据开发进度和任务分配按时按量完成相应模块软件的设计、开发;2、负责现有程序的维护、更新与问题排除",上班地址:软件大道89号福州软件园B区4号楼,福建睿思特科技股份有限公司,民营公司,50-150人,"计算机软件,计算机服务(系统、数据服务、维修)",福建睿思特科技股份有限公司,总部位于福州软件园,专业从事智慧城市生态产业链、城市智能化整体解决方案的服务商。睿思特坚持以自主创新和行业应用为基础,以硬件、软件、服务三位一体构筑核心竞争力,为客户提供优质的产品和服务。睿思特依托物联网、云计算、大数据、人工智能、区块链等新一代信息技术,为环保、水利、交通、城管、电力等智慧城市领域提供软硬件一体化的整体解决方案。睿思特拥有结构化的研发、营销、应用服务和供应链团队,专注于为各行业用户提供领先的技术应用服务和绿色智慧城市解决方案。睿思特致力打造成为国内一流、国际领先的智慧产业互联网龙头企业,助力各地智慧城市建设。睿思特通过开放式创新、卓越运营管理、人力资源发展等战略的实施,全面构造公司的核心竞争力,创造客户和社会的价值,从而实现技术的价值。致力于成为最受社会、客户、股东和员工尊敬的公司,并通过组织与过程的持续改进,领导力与员工竞争力的发展,联盟与开放式创新,使公司成为优秀的城市智能化整体解决方案和服务提供商。,https://jobs.51job.com/fuzhou-glq/117579727.html?s=01&t=0
10 | Python算法工程师,1.5-2万/月,上海-静安区,2年经验,本科,招5人,09-18发布,,"五险一金,专业培训,弹性工作,年终奖金,绩效奖金",职位信息1、负责自然语言处理,文本挖掘,机器学习等工作; 2、从事数学建模及数据挖掘应用方法研究; 3、根据业务需求,研究开发相关算法模型岗位要求1、全日制本科及以上学历,计算机或数学相关专业; 2、2-3年以上相关工作经验,熟悉Python编程语言,具备Python算法设计与程序开发能力; 3、熟悉数据结构,具备清晰的编程思路、良好的编码风格,能胜任复杂模块的编码; 4、对NLP有较深入的研究; 5、熟悉深度学习工具Tensoerflow/Caffe/Torch者优先; 6、良好的沟通能力,学习能力。 7、必须在(文本分析,语义分析,语义计算)有一个或多个作品。 8、具备良好的沟通协调能力和团队合作意识,工作踏实,态度积极,能够承受工作压力; ,上班地址:江场路1313号金谷中环大厦6楼,亿翰智库,上市公司,150-500人,"专业服务(咨询、人力资源、财会),房地产", 公司简介亿翰股份(837350)于2016年5月成为首支登陆资本市场、专注中国房地产战略服务的企业。亿翰时刻关注中国房地产企业发展动态,深刻洞察行业发展趋势与先机,业务范围横跨地产、科技和资本三大领域;为超过70%的百强房企提供系统化的战略顾问服务;公司根植上海,雄踞长三角辐射全中国,现北京、成都、武汉和深圳已设有四家分支机构。亿翰股份每月发布发的《中国典型房企销售业绩排行榜》曝光量超千万,深受行业及资本界关注;有意者请将简历投递至邮箱:yihanzhiku@ehconsulting.com.cn公司人事部电话:021-61552731公司地址:上海市静安区江场路1313号金谷中环大厦6楼公司官网:www.ehconsulting.com.cn集团公众号:ehresearch简历发送格式:姓名+学校+专业+意向职位+信息来源+工作地点如果你足够的自信、优秀,积极上进。把你的简历投进亿翰股份,亿翰股份欢迎你的加入!北京公司地址:北京市朝阳区京信大厦2141室深圳公司地址:深圳市南山区粤兴六路中科纳能大厦409武汉公司地址:武汉市武昌区万达汉街总部国际B座2605成都公司地址:成都市武侯区成都蜀锦路88号“楚峰国际中心” 楼 903,https://jobs.51job.com/shanghai-jaq/117000837.html?s=01&t=0
11 |
--------------------------------------------------------------------------------
/scrapy/51job-scrapy/51jobs.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Sep 25 10:48:03 2020
4 |
5 | @author: YINUXY
6 | """
7 |
8 |
9 | import re
10 | import time
11 | import copy
12 | import random
13 | import requests
14 | import pymysql
15 | import pandas as pd
16 | from lxml import etree
17 | from selenium import webdriver
18 | from selenium.webdriver.chrome.options import Options
19 |
20 |
21 | class JobSpider:
22 | def __init__(self):
23 | self.base_url = 'https://search.51job.com/list/080200,000000,0000,00,9,99,%s,2,%s.html'
24 | self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.13 Safari/537.36'}
25 | self.keyword = 'Java开发工程师'
26 | self.chrome_options=Options()
27 | self.chrome_options.add_argument('--headless')
28 | self.conn=pymysql.connect(host="127.0.0.1",
29 | user="root",
30 | passwd="",
31 | charset='utf8mb4',
32 | cursorclass=pymysql.cursors.DictCursor)
33 | self.cur = self.conn.cursor()
34 | self.cur.execute("CREATE DATABASE IF NOT EXISTS `jobs`")
35 | self.cur.execute("USE jobs")
36 | self.cur.execute("DROP TABLE IF EXISTS `web_51jobs_javadevelopment`")
37 | self.cur.execute("CREATE TABLE IF NOT EXISTS `web_51jobs_javadevelopment` (`id` INT PRIMARY KEY AUTO_INCREMENT,`position` varchar(100),`wages` varchar(20),`region` varchar(100),`experience` varchar(100),`education` varchar(100),`need_people` varchar(20),`publish_date` varchar(20),`english` varchar(100),`welfare_tags` varchar(200),`job_information` varchar(4000),`work_address` varchar(200),`company_name` varchar(200),`company_nature` varchar(200),`company_scale` varchar(200),`company_industry` varchar(200),`company_information` varchar(4000),`job_url` varchar(100))")
38 |
39 |
40 | def total_url(self):
41 | url = self.base_url % (self.keyword, str(1))
42 | tree = etree.HTML(self.parse_html(url))
43 | # 提取一共有多少页
44 | text = tree.xpath("//div[@class='p_in']/span[1]/text()")[0]
45 | number = re.findall('[0-9]', text)
46 | number = int(''.join(number))
47 | print('%s职位共有%d页' % (self.keyword, number))
48 | return number
49 |
50 | def parse_html(self,url):
51 | driver=webdriver.Chrome(chrome_options=self.chrome_options)
52 | driver.get(url)
53 | html = driver.page_source
54 | time.sleep(random.randint(5,10))
55 | driver.close()
56 | return html
57 |
58 | def detail_url(self, number):
59 |
60 | """
61 | 1. Parse the detail-page url of every job on each listing page.
62 | 2. Special case 1: postings published by 51job itself use a different detail-page structure and a different page encoding.
63 | Example page: https://51rz.51job.com/job.html?jobid=115980776
64 | The real data request looks like: https://coapi.51job.com/job_detail.php?jsoncallback=&key=&sign=params={"jobid":""}
65 | The parameter values in that request are encrypted by js: https://js.51jobcdn.com/in/js/2018/coapi/coapi.min.js
66 | 3. Special case 2: some companies have their own dedicated pages, whose structure also differs from the normal one.
67 | Example page: http://dali.51ideal.com/jobdetail.html?jobid=121746338
68 | 4. To keep the data uniform, these special pages are skipped and only urls containing jobs.51job.com are crawled.
69 | """
70 |
71 | for num in range(1, number+1):
72 | starts = time.time()
73 | url = self.base_url % (self.keyword, str(num))
74 | tree = etree.HTML(self.parse_html(url))
75 | detail_url1 = tree.xpath("//div[@class='j_joblist']/div[@class='e']/a/@href")
76 |
77 | """
78 | Deep-copy the url list first: deleting unwanted links directly from the list being iterated
79 | would skip elements, because the indices shift after every removal. Unqualified links found in
80 | the original list are therefore removed from the deep copy, which finally holds only valid urls.
81 | """
82 |
83 | detail_url2 = copy.deepcopy(detail_url1)
84 | for url in detail_url1:
85 | if 'jobs.51job.com' not in url:
86 | detail_url2.remove(url)
87 | self.parse_data(detail_url2)
88 | ends = time.time()
89 | print('第 %d页数据爬取完毕,本页共有 %d个 %s岗位, 用时%d秒' % (num, len(detail_url2), self.keyword, int(ends-starts)))
90 | time.sleep(2)
91 | print('所有数据爬取完毕!')
92 |
93 | def parse_data(self, urls):
94 |
95 | """
96 | position: 职位
97 | wages: 工资
98 | region: 地区
99 | experience: 经验
100 | education: 学历
101 | need_people: 招聘人数
102 | publish_date: 发布时间
103 | english: 英语要求
104 | welfare_tags: 福利标签
105 | job_information: 职位信息
106 | work_address: 上班地址
107 | company_name: 公司名称
108 | company_nature: 公司性质
109 | company_scale: 公司规模
110 | company_industry: 公司行业
111 | company_information: 公司信息
112 | job_url: 招聘链接
113 | """
114 |
115 | # jobs = []
116 |
117 | for url in urls:
118 | job = {}
119 | job['链接'] = url
120 | response = requests.get(url=url, headers=self.headers)
121 | try:
122 | text = response.content.decode('gbk')
123 | except UnicodeDecodeError:
124 | return
125 | tree = etree.HTML(text)
126 |
127 | """
128 | Join the xpath result list into a string instead of indexing it directly;
129 | this leaves the field empty when the information is missing instead of raising an IndexError.
130 | """
131 |
132 | position = ''.join(tree.xpath("//div[@class='cn']/h1/text()"))
133 | wages = ''.join(tree.xpath("//div[@class='cn']/strong/text()"))
134 |
135 | # 经验、学历、招聘人数、发布时间等信息都在一个标签里面,逐一使用列表解析式提取
136 | content = tree.xpath("//div[@class='cn']/p[2]/text()")
137 | content = [i.strip() for i in content]
138 | if content:
139 | region = content[0]
140 | else:
141 | region = ''
142 | experience = ''.join([i for i in content if '经验' in i])
143 | education = ''.join([i for i in content if i in '本科大专应届生在校生硕士'])
144 | need_people = ''.join([i for i in content if '招' in i])
145 | publish_date = ''.join([i for i in content if '发布' in i])
146 | english = ''.join([i for i in content if '英语' in i])
147 |
148 | welfare_tags = ','.join(tree.xpath("//div[@class='jtag']/div//text()")[1:-2])
149 | job_information = ''.join(tree.xpath("//div[@class='bmsg job_msg inbox']/p//text()")).replace(' ', '')
150 | work_address = ''.join(tree.xpath("//div[@class='bmsg inbox']/p//text()"))
151 | company_name = ''.join(tree.xpath("//div[@class='tCompany_sidebar']/div[1]/div[1]/a/p/text()"))
152 | company_nature = ''.join(tree.xpath("//div[@class='tCompany_sidebar']/div[1]/div[2]/p[1]//text()"))
153 | company_scale = ''.join(tree.xpath("//div[@class='tCompany_sidebar']/div[1]/div[2]/p[2]//text()"))
154 | company_industry = ''.join(tree.xpath("//div[@class='tCompany_sidebar']/div[1]/div[2]/p[3]/@title"))
155 | company_information = ''.join(tree.xpath("//div[@class='tmsg inbox']/text()"))
156 |
157 | job_data = [position, wages, region, experience, education, need_people, publish_date,
158 | english, welfare_tags, job_information, work_address, company_name,
159 | company_nature, company_scale, company_industry, company_information, str(url)]
160 | #追加写入csv文件
161 | df = pd.DataFrame([job_data])
162 | df.to_csv('./%s_%s岗位招聘信息.csv'%(str(time.strftime("%Y-%m-%d")), self.keyword), mode='a', header=None, index=None, encoding="utf_8_sig")
163 |
164 | job["职位"] = position
165 | job["工资"] = wages
166 | job["地区"] = region
167 | job["经验"] = experience
168 | job["学历"] = education
169 | job["招聘人数"] = need_people
170 | job["发布时间"] = publish_date
171 | job["英语要求"] = english
172 | job["福利标签"] = welfare_tags
173 | job["职位信息"] = job_information
174 | job["上班地址"] = work_address
175 | job["公司名称"] = company_name
176 | job["公司性质"] = company_nature
177 | job["公司规模"] = company_scale
178 | job["公司行业"] = company_industry
179 | job["公司信息"] = company_information
180 | # print(job)
181 | self.process_job(job)
182 | # jobs.append(job)
183 |
184 | def process_job(self,job):
185 | # self.cur = self.conn.cursor()
186 | try:
187 | position = job["职位"]
188 | wages = job["工资"]
189 | region = job["地区"]
190 | experience = job["经验"]
191 | education = job["学历"]
192 | need_people = job["招聘人数"]
193 | publish_date = job["发布时间"]
194 | english = job["英语要求"]
195 | welfare_tags = job["福利标签"]
196 | job_information = job["职位信息"]
197 | work_address = job["上班地址"]
198 | company_name = job["公司名称"]
199 | company_nature = job["公司性质"]
200 | company_scale = job["公司规模"]
201 | company_industry = job["公司行业"]
202 | company_information = job["公司信息"]
203 | job_url = job['链接']
204 | sql = "INSERT INTO `web_51jobs_javadevelopment` (`position`,`wages`,`region`,`experience`,`education`,`need_people`,`publish_date`,`english`,`welfare_tags`,`job_information`,`work_address`,`company_name`,`company_nature`,`company_scale`,`company_industry`,`company_information`,`job_url`) VALUES ('"+ position +"','"+ wages +"','"+ region +"','"+ experience +"','"+ education +"','"+ need_people +"','"+ publish_date +"','"+ english +"','"+ welfare_tags +"','"+ job_information +"','"+ work_address +"','"+ company_name +"','"+ company_nature +"','"+ company_scale +"','"+ company_industry +"','"+ company_information+"','"+ job_url+"')"
205 | self.cur.execute(sql)
206 | self.conn.commit()
207 | # self.conn.close()
208 | except Exception as err:
209 | print(err)
210 |
211 |
212 | if __name__ == '__main__':
213 | starts = time.time()
214 | spider = JobSpider()
215 | page_number = spider.total_url()
216 | spider.detail_url(page_number)
217 | ends = time.time()
218 | print("程序运行完毕,总用时 %d分 %d秒" % (int(ends-starts)/60, (ends-starts)%60))
219 |
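process_job above splices the field values straight into the SQL string, which breaks as soon as a description contains a single quote and leaves the insert open to injection. A minimal sketch of the same insert done as a parameterized query against the table created in __init__ (an alternative shown for illustration, not the author's code; cur/conn are the pymysql cursor and connection):

    def insert_job(cur, conn, job):
        # let pymysql quote and escape the values instead of concatenating them into the SQL
        columns = ['position', 'wages', 'region', 'experience', 'education', 'need_people',
                   'publish_date', 'english', 'welfare_tags', 'job_information', 'work_address',
                   'company_name', 'company_nature', 'company_scale', 'company_industry',
                   'company_information', 'job_url']
        keys = ['职位', '工资', '地区', '经验', '学历', '招聘人数', '发布时间', '英语要求', '福利标签',
                '职位信息', '上班地址', '公司名称', '公司性质', '公司规模', '公司行业', '公司信息', '链接']
        sql = "INSERT INTO `web_51jobs_javadevelopment` (%s) VALUES (%s)" % (
            ', '.join('`%s`' % c for c in columns), ', '.join(['%s'] * len(columns)))
        cur.execute(sql, [job[k] for k in keys])   # values are passed separately as parameters
        conn.commit()
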
--------------------------------------------------------------------------------
/scrapy/UnsplashCrawler/UnsplashCrawler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python 3.6.1
2 | # -*- coding:utf-8 -*-
3 | # ____author___='Yinux'
4 | import json
5 | import os
6 | import threading
7 | import urllib.request
8 | from queue import Queue
9 | import random
10 | import requests
11 | import time
12 |
13 | """
14 | Download Unsplash images to local disk using multiple threads
15 | """
16 | # 使用队列保存存放图片 url 地址, 确保线程同步
17 | url_queue = Queue()
18 | # 线程总数
19 | THREAD_SUM = 5
20 | # 存储图片的位置
21 | IMAGE_SRC = 'E://spiderproject//UnsplashCrawler/'
22 |
23 |
24 | class Unsplash(threading.Thread):
25 | NOT_EXIST = 0
26 |
27 | def __init__(self, thread_id):
28 | threading.Thread.__init__(self)
29 | self.thread_id = thread_id
30 |
31 | def run(self):
32 | while not self.NOT_EXIST:
33 | # 队列为空, 结束线程
34 | if url_queue.empty():
35 | self.NOT_EXIST = 1  # set the instance flag; a bare NOT_EXIST here would only create a local variable
36 | break
37 |
38 | url = url_queue.get()
39 | self.get_data(url)
40 | time.sleep(random.randint(3, 5))
41 |
42 | def get_data(self, url):
43 | """ 根据 url 获取 JSON 格式的图片数据"""
44 | headers = {
45 | 'User-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36',
46 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
47 | 'referer': 'https://unsplash.com/',
48 | 'path': url.split('com')[1],
49 | 'authority': 'unsplash.com',
50 | 'viewport-width': '1920',
51 | }
52 | response = requests.get(url, headers=headers)
53 | print('请求第[ ' + url + ' ], 状态码为 ', response.status_code)
54 | self.get_image_url(response.text)
55 |
56 | def get_image_url(self, response):
57 | """
58 | json.loads(response) turns the body into a dict so values can be read by key.
59 | raw: full-size original including Exif data; very large files
60 | full: full-resolution image with Exif stripped and content compressed; medium size
61 | normal: normal-size image with Exif stripped, resolution and content compressed; small size
62 | """
63 | image_url = json.loads(response)[0]['urls']['full']
64 | self.save_img(image_url)
65 |
66 | def save_img(self, image_url):
67 | print('线程', self.thread_id, ' | 正在下载', image_url)
68 | try:
69 | if not os.path.exists(IMAGE_SRC):
70 | os.mkdir(IMAGE_SRC)
71 | filename = IMAGE_SRC + image_url.split('com')[1].split('?')[0] + '.jpg'
72 | # 下载图片,并保存到文件夹中
73 | urllib.request.urlretrieve(image_url, filename=filename)
74 | except IOError as e:
75 | print('保存图片出现异常失败', e)
76 |
77 |
78 | def get_all_url():
79 | """ 循环计算出所有的 url 地址, 存放到队列中 """
80 | base_url = 'https://unsplash.com/napi/photos?page={}&per_page=1&order_by=latest'
81 | page = 1
82 | max_page = 100
83 | while page <= max_page:
84 | url = base_url.format(page)
85 | url_queue.put(url)
86 | page += 1
87 | print('计划下载', url_queue.qsize(), '张图片')
88 |
89 |
90 | if __name__ == '__main__':
91 | get_all_url()
92 | for i in range(THREAD_SUM):
93 | unsplash = Unsplash(i + 1)
94 | unsplash.start()
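The run loop above tests url_queue.empty() before calling get(); with several workers that check can race, and a thread may block on get() after another one has drained the last url. A small sketch of the usual non-blocking idiom with get_nowait and queue.Empty (an alternative shown for illustration, not the author's code):

    import queue

    def worker_loop(url_queue, handle):
        # drain the queue without a separate empty() check
        while True:
            try:
                url = url_queue.get_nowait()
            except queue.Empty:
                break              # nothing left, end this worker
            handle(url)
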
--------------------------------------------------------------------------------
/scrapy/WeChatArticle/WecArticle.py:
--------------------------------------------------------------------------------
1 | import bs4
2 |
3 |
4 | from bs4 import BeautifulSoup
5 | from selenium import webdriver
6 | from selenium.webdriver.support.ui import WebDriverWait
7 | import re
8 | import csv
9 | import time
10 | import os
11 |
12 | browser = webdriver.Chrome()
13 | wait = WebDriverWait(browser, 5) # 设置等待时间
14 |
15 |
16 | # 提取公众号文章信息
17 | def get_info(url):
18 | browser.get(url)
19 | html = browser.page_source
20 | soup = BeautifulSoup(html, 'lxml')
21 | data = [] # 用来储存文章信息
22 | for i in range(0, 10):
23 | titles = soup.select('#sogou_vr_11002601_title_{}'.format(i))
24 | introductions = soup.select('#sogou_vr_11002601_summary_{}'.format(i))
25 | dates = soup.select('#sogou_vr_11002601_box_{} div.txt-box div span'.format(i))
26 | for ti, intr, da in zip(titles, introductions, dates):
27 | info = {}
28 | title = ti.get_text()
29 | info['文章标题'] = title
30 | link = str(re.compile('data-share="(.*?)"').findall(str(titles))).replace('amp;', '')[2:-2]
31 | info['文章链接'] = link
32 | introduction = intr.get_text()
33 | info['文章简介'] = introduction
34 | date = str(da.get_text()).split(')')[-1]
35 | info['发文日期'] = date
36 | data.append(info)
37 | return data
38 |
39 |
40 | def mkdir(): # 创建储存内容的文件夹
41 | isExists = os.path.exists('D:\\Python\\spider\\wecArticle')
42 | if not isExists:
43 | print('创建目录')
44 | os.makedirs('D:\\Python\\spider\\wecArticle') # 创建目录
45 | os.chdir('D:\\Python\\spider\\wecArticle') # 切换到创建的文件夹
46 | return True
47 | else:
48 | print('目录已存在,即将保存!')
49 | os.chdir('D:\\Python\\spider\\wecArticle') # 切换到创建的文件夹
50 | return False
51 |
52 |
53 | def write2csv(url, kw): # 写入文件,以 csv 文件形式储存
54 | mkdir()
55 | print('正在写入文件')
56 | with open('{}.csv'.format(kw), 'a', newline='', encoding='utf-8') as f:
57 | # 追加内容用 a
58 | fieldnames = ['文章标题', '文章链接', '文章简介', '发文日期'] # 控制列的顺序
59 | writer = csv.DictWriter(f, fieldnames=fieldnames)
60 | writer.writeheader()
61 | data = get_info(url)
62 | writer.writerows(data)
63 | print("写入成功")
64 |
65 |
66 | if __name__ == '__main__':
67 | kw = input('请输入你的关键字:\n')
68 | for j in range(1, 11):
69 | url = 'http://weixin.sogou.com/weixin?query={}&type=2&page={}'.format(kw, j)
70 | write2csv(url, kw)
71 | time.sleep(1)
72 |
73 |
--------------------------------------------------------------------------------
/scrapy/cf-ipv6/cf_ipv6_scan.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import requests
3 |
4 | # multithread
5 | def multi_check_ip(start, end):
6 | print(threading.current_thread().name, 'start!')
7 | for i in range(start, end):
8 | hex_num = str(hex(i)).split('x')[-1]
9 | ip = base_ip + hex_num + '::'
10 | url = f'http://[{ip}]/cdn-cgi/trace'
11 | try:
12 | r = requests.get(url, timeout=1)
13 | solo = r.text.split()[6].split('=')[-1]
14 | lock.acquire()
15 | valid_ip.write(ip + ' ' + solo + '\n')
16 | lock.release()
17 | print(ip, solo)
18 | except Exception as e:
19 | print(url, e)
20 |
21 |
22 | if __name__ == '__main__':
23 | base_ip = '2606:4700:'
24 | valid_ip = open('cf_valid_ipv6.txt', 'a+')
25 | thread_list = []
26 | lock = threading.Lock()
27 | thread_num = 64
28 | task_num = int(65536 / thread_num)
29 | for i in range(thread_num):
30 | start = i * task_num
31 | end = (i + 1) * task_num
32 | t = threading.Thread(target=multi_check_ip, args=(start, end))
33 | thread_list.append(t)
34 |
35 | last_start = thread_num * task_num
36 | last_task = threading.Thread(target=multi_check_ip, args=(last_start, 65536))
37 | thread_list.append(last_task)
38 |
39 | for t in thread_list:
40 | t.start()
41 | for t in thread_list:
42 | t.join()
43 |
44 | valid_ip.seek(0); valid_ip_num = len(valid_ip.readlines())  # rewind first: 'a+' leaves the file pointer at the end, so readlines() would return nothing
45 | valid_ip.close()
46 | print(f'本次扫描结束,共扫到{valid_ip_num}个有效ip')
47 |
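Each worker builds candidate addresses by appending the hex form of a 16-bit counter to the 2606:4700: base and probing /cdn-cgi/trace on the result. A tiny worked example of that construction (counter values picked for illustration):

    base_ip = '2606:4700:'
    for i in (0x0, 0x3001, 0xffff):
        hex_num = hex(i).split('x')[-1]      # 0 -> '0', 12289 -> '3001', 65535 -> 'ffff'
        print(base_ip + hex_num + '::')      # e.g. 2606:4700:3001::
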
--------------------------------------------------------------------------------
/scrapy/cf-ipv6/cf_valid_ipv6_scan_2606_4700_.txt:
--------------------------------------------------------------------------------
1 | 2606:4700:3001:: HKG
2 | 2606:4700:3002:: HKG
3 | 2606:4700:3003:: HKG
4 | 2606:4700:3004:: HKG
5 | 2606:4700:3005:: HKG
6 | 2606:4700:3006:: HKG
7 | 2606:4700:3007:: HKG
8 | 2606:4700:3008:: HKG
9 | 2606:4700:3009:: HKG
10 | 2606:4700:300a:: HKG
11 | 2606:4700:300b:: HKG
12 | 2606:4700:300c:: HKG
13 | 2606:4700:3010:: HKG
14 | 2606:4700:3011:: HKG
15 | 2606:4700:3012:: HKG
16 | 2606:4700:3013:: HKG
17 | 2606:4700:3014:: HKG
18 | 2606:4700:3015:: HKG
19 | 2606:4700:3016:: HKG
20 | 2606:4700:3017:: HKG
21 | 2606:4700:3018:: HKG
22 | 2606:4700:3019:: HKG
23 | 2606:4700:301c:: HKG
24 | 2606:4700:3020:: HKG
25 | 2606:4700:3021:: HKG
26 | 2606:4700:3022:: HKG
27 | 2606:4700:3023:: HKG
28 | 2606:4700:3024:: HKG
29 | 2606:4700:3025:: HKG
30 | 2606:4700:3026:: HKG
31 | 2606:4700:3027:: HKG
32 | 2606:4700:3028:: HKG
33 | 2606:4700:3029:: HKG
34 | 2606:4700:302c:: HKG
35 | 2606:4700:3030:: SIN
36 | 2606:4700:3032:: SIN
37 | 2606:4700:3033:: SIN
38 | 2606:4700:3034:: SIN
39 | 2606:4700:3035:: SIN
40 | 2606:4700:3036:: SIN
41 | 2606:4700:3037:: SIN
42 | 2606:4700:3038:: HKG
43 | 2606:4700:3039:: HKG
44 | 2606:4700:303c:: HKG
45 | 2606:4700:8040:: SEA
46 | 2606:4700:8041:: SEA
47 | 2606:4700:8042:: SEA
48 | 2606:4700:8043:: SEA
49 | 2606:4700:8044:: SJC
50 | 2606:4700:8045:: SJC
51 | 2606:4700:8046:: SJC
52 | 2606:4700:8047:: SJC
53 | 2606:4700:8048:: SJC
54 | 2606:4700:8049:: SJC
55 | 2606:4700:804a:: SJC
56 | 2606:4700:804b:: SJC
57 | 2606:4700:804c:: SJC
58 | 2606:4700:804d:: SJC
59 | 2606:4700:804e:: SJC
60 | 2606:4700:804f:: SJC
61 | 2606:4700:80c0:: SEA
62 | 2606:4700:80c1:: SEA
63 | 2606:4700:80c2:: SEA
64 | 2606:4700:80c3:: SEA
65 | 2606:4700:80c4:: LAX
66 | 2606:4700:80c5:: LAX
67 | 2606:4700:80c6:: LAX
68 | 2606:4700:80c7:: LAX
69 | 2606:4700:80c8:: LAX
70 | 2606:4700:80c9:: LAX
71 | 2606:4700:80ca:: LAX
72 | 2606:4700:80cb:: LAX
73 | 2606:4700:80cc:: LAX
74 | 2606:4700:80cd:: LAX
75 | 2606:4700:80ce:: LAX
76 | 2606:4700:80cf:: LAX
77 | 2606:4700:80f0:: SEA
78 | 2606:4700:80f1:: SEA
79 | 2606:4700:80f2:: SEA
80 | 2606:4700:80f3:: SEA
81 | 2606:4700:80f5:: DFW
82 | 2606:4700:80f6:: DFW
83 | 2606:4700:80f7:: DFW
84 | 2606:4700:80f8:: DFW
85 | 2606:4700:80f9:: DFW
86 | 2606:4700:80fa:: DFW
87 | 2606:4700:80fb:: DFW
88 | 2606:4700:80fc:: SEA
89 | 2606:4700:80fd:: SEA
90 | 2606:4700:80fe:: SEA
91 | 2606:4700:80ff:: SEA
92 | 2606:4700:f1:: HKG
93 | 2606:4700:130:: YVR
94 | 2606:4700:131:: YVR
95 | 2606:4700:132:: YVR
96 | 2606:4700:133:: YVR
97 | 2606:4700:134:: YVR
98 | 2606:4700:135:: YVR
99 | 2606:4700:136:: YVR
100 | 2606:4700:137:: YVR
101 | 2606:4700:138:: YVR
102 | 2606:4700:139:: YVR
103 | 2606:4700:13a:: YVR
104 | 2606:4700:13b:: YVR
105 | 2606:4700:13c:: YVR
106 | 2606:4700:13d:: YVR
107 | 2606:4700:13e:: YVR
108 | 2606:4700:13f:: YVR
109 | 2606:4700:8d70:: SIN
110 | 2606:4700:8d71:: SIN
111 | 2606:4700:8d72:: SIN
112 | 2606:4700:8d73:: SIN
113 | 2606:4700:8d74:: SIN
114 | 2606:4700:8d75:: SIN
115 | 2606:4700:8d76:: SIN
116 | 2606:4700:8d77:: SIN
117 | 2606:4700:8d78:: SIN
118 | 2606:4700:8d79:: SIN
119 | 2606:4700:8d7a:: SIN
120 | 2606:4700:8d7c:: SIN
121 | 2606:4700:8d7d:: SIN
122 | 2606:4700:8d7e:: SIN
123 | 2606:4700:8d7f:: SIN
124 | 2606:4700:8d90:: SIN
125 | 2606:4700:8d91:: SIN
126 | 2606:4700:8d92:: SIN
127 | 2606:4700:8d93:: SIN
128 | 2606:4700:8d94:: SIN
129 | 2606:4700:8d95:: SIN
130 | 2606:4700:8d96:: SIN
131 | 2606:4700:8d97:: SIN
132 | 2606:4700:8d98:: SIN
133 | 2606:4700:8d99:: SIN
134 | 2606:4700:8d9a:: SIN
135 | 2606:4700:8d9b:: SIN
136 | 2606:4700:8d9c:: SIN
137 | 2606:4700:8d9d:: SIN
138 | 2606:4700:8d9e:: SIN
139 | 2606:4700:8d9f:: SIN
140 | 2606:4700:81c0:: LAX
141 | 2606:4700:81c1:: LAX
142 | 2606:4700:81c2:: LAX
143 | 2606:4700:81c3:: LAX
144 | 2606:4700:81c4:: SEA
145 | 2606:4700:81c5:: SEA
146 | 2606:4700:81c6:: SEA
147 | 2606:4700:81c7:: SEA
148 | 2606:4700:81c8:: SEA
149 | 2606:4700:81c9:: SEA
150 | 2606:4700:81ca:: SEA
151 | 2606:4700:81cb:: SEA
152 | 2606:4700:81cc:: SEA
153 | 2606:4700:81cd:: SEA
154 | 2606:4700:81ce:: SEA
155 | 2606:4700:81cf:: SEA
156 | 2606:4700:8dd0:: SIN
157 | 2606:4700:8dd1:: SIN
158 | 2606:4700:8dd2:: SIN
159 | 2606:4700:8dd3:: SIN
160 | 2606:4700:8dd4:: SIN
161 | 2606:4700:85c0:: SIN
162 | 2606:4700:85c1:: SIN
163 | 2606:4700:8dd5:: SIN
164 | 2606:4700:85c2:: SIN
165 | 2606:4700:8dd6:: SIN
166 | 2606:4700:85c3:: SIN
167 | 2606:4700:8dd7:: SIN
168 | 2606:4700:85c4:: SIN
169 | 2606:4700:8dd8:: SIN
170 | 2606:4700:85c5:: SIN
171 | 2606:4700:8dd9:: SIN
172 | 2606:4700:85c6:: SIN
173 | 2606:4700:8dda:: SIN
174 | 2606:4700:85c7:: SIN
175 | 2606:4700:8ddb:: SIN
176 | 2606:4700:85c8:: SIN
177 | 2606:4700:8ddc:: SIN
178 | 2606:4700:85c9:: SIN
179 | 2606:4700:8ddd:: SIN
180 | 2606:4700:85ca:: SIN
181 | 2606:4700:8dde:: SIN
182 | 2606:4700:85cb:: SIN
183 | 2606:4700:8ddf:: SIN
184 | 2606:4700:85cc:: SIN
185 | 2606:4700:8de0:: SIN
186 | 2606:4700:85cd:: SIN
187 | 2606:4700:8de1:: SIN
188 | 2606:4700:85ce:: SIN
189 | 2606:4700:8de2:: SIN
190 | 2606:4700:85cf:: SIN
191 | 2606:4700:8de3:: SIN
192 | 2606:4700:85d0:: SIN
193 | 2606:4700:8de4:: SIN
194 | 2606:4700:85d1:: SIN
195 | 2606:4700:8de5:: SIN
196 | 2606:4700:85d2:: SIN
197 | 2606:4700:8de6:: SIN
198 | 2606:4700:85d3:: SIN
199 | 2606:4700:8de7:: SIN
200 | 2606:4700:85d4:: SIN
201 | 2606:4700:8de8:: SIN
202 | 2606:4700:85d5:: SIN
203 | 2606:4700:8de9:: SIN
204 | 2606:4700:85d6:: SIN
205 | 2606:4700:8dea:: SIN
206 | 2606:4700:85d7:: SIN
207 | 2606:4700:8deb:: SIN
208 | 2606:4700:85d8:: SIN
209 | 2606:4700:8dec:: SIN
210 | 2606:4700:85d9:: SIN
211 | 2606:4700:8ded:: SIN
212 | 2606:4700:85da:: SIN
213 | 2606:4700:8dee:: SIN
214 | 2606:4700:85db:: SIN
215 | 2606:4700:8def:: SIN
216 | 2606:4700:85dc:: SIN
217 | 2606:4700:85dd:: SIN
218 | 2606:4700:85de:: SIN
219 | 2606:4700:85df:: SIN
220 | 2606:4700:8390:: SIN
221 | 2606:4700:8391:: SIN
222 | 2606:4700:8392:: SIN
223 | 2606:4700:8393:: SIN
224 | 2606:4700:8394:: SIN
225 | 2606:4700:8395:: SIN
226 | 2606:4700:8396:: SIN
227 | 2606:4700:8397:: SIN
228 | 2606:4700:8398:: SIN
229 | 2606:4700:8399:: SIN
230 | 2606:4700:839a:: SIN
231 | 2606:4700:839b:: SIN
232 | 2606:4700:839c:: SIN
233 | 2606:4700:839d:: SIN
234 | 2606:4700:839e:: SIN
235 | 2606:4700:839f:: SIN
236 | 2606:4700:83a0:: SIN
237 | 2606:4700:83a1:: SIN
238 | 2606:4700:83a2:: SIN
239 | 2606:4700:83a3:: SIN
240 | 2606:4700:83a4:: SIN
241 | 2606:4700:83a5:: SIN
242 | 2606:4700:83a6:: SIN
243 | 2606:4700:83a7:: SIN
244 | 2606:4700:83a8:: SIN
245 | 2606:4700:83a9:: SIN
246 | 2606:4700:83aa:: SIN
247 | 2606:4700:83ab:: SIN
248 | 2606:4700:83ac:: SIN
249 | 2606:4700:83ad:: SIN
250 | 2606:4700:83ae:: SIN
251 | 2606:4700:83af:: SIN
252 | 2606:4700:3000:: HKG
--------------------------------------------------------------------------------
/scrapy/cf-ipv6/ping.py:
--------------------------------------------------------------------------------
1 | from pprint import pprint
2 | from multiping import MultiPing
3 |
4 |
5 | def multi_ping(ip_list):
6 | # Create a MultiPing object to test three hosts / addresses
7 | mp = MultiPing(ip_list)
8 |
9 | # Send the pings to those addresses
10 | mp.send()
11 |
12 | # With a 1 second timout, wait for responses (may return sooner if all
13 | # results are received).
14 | responses, no_responses = mp.receive(1)
15 |
16 | pprint(sorted(responses.items(), key=lambda obj: obj[1], reverse=True))
17 |
18 |
19 | if __name__ == '__main__':
20 | ip_list = []
21 | f = open('cf_valid_ipv6.txt', 'r')
22 | item_list = f.readlines()
23 | for i in item_list:
24 | ip = i.split()[0]
25 | ip_list.append(ip)
26 |
27 | multi_ping(ip_list)
28 |
--------------------------------------------------------------------------------
/scrapy/douyin-grils-down/README.md:
--------------------------------------------------------------------------------
1 | # Douyin
2 | Fully automated downloading of Douyin videos with Python
3 |
--------------------------------------------------------------------------------
/scrapy/douyin-grils-down/douyin_appium.py:
--------------------------------------------------------------------------------
1 | import time
2 | import random
3 | from appium import webdriver
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.support.ui import WebDriverWait
6 | from appium.webdriver.common.touch_action import TouchAction
7 | from selenium.webdriver.support import expected_conditions as EC
8 |
9 |
10 | def main():
11 | # 设置驱动配置
12 | server = 'http://localhost:4723/wd/hub'
13 | desired_caps = {
14 | 'platformName': 'Android',
15 | 'deviceName': 'STF_AL00',
16 | 'appPackage': 'com.ss.android.ugc.aweme',
17 | 'appActivity': '.main.MainActivity',
18 | # 关闭手机软键盘
19 | 'unicodeKeyboard': True,
20 | 'resetKeyboard': True
21 | }
22 | driver = webdriver.Remote(server, desired_caps)
23 | wait = WebDriverWait(driver, 60)
24 | # 同意用户隐私协议,点击
25 | button_1 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/q6')))
26 | button_1.click()
27 | # 禁止电话权限,点击
28 | button_2 = wait.until(EC.presence_of_element_located((By.ID, 'com.android.packageinstaller:id/permission_deny_button')))
29 | button_2.click()
30 | # 禁止位置权限,点击
31 | button_3 = wait.until(EC.presence_of_element_located((By.ID, 'com.android.packageinstaller:id/permission_deny_button')))
32 | button_3.click()
33 | time.sleep(2)
34 | # 向上滑动,进入抖音视频播放页面
35 | TouchAction(driver).press(x=515, y=1200).move_to(x=515, y=1000).release().perform()
36 | # 这里需要设置一个较长时间的延迟,因为抖音有引导操作和提示,需等待片刻
37 | time.sleep(20)
38 | # 点击抖音"喜欢"处,以此进入登录界面
39 | TouchAction(driver).press(x=950, y=800).release().perform()
40 | # 点击密码登录
41 | button_4 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/afg')))
42 | button_4.click()
43 | # 输入账号
44 | button_5 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/ab_')))
45 | button_5.send_keys('你的账号')
46 | # 输入密码
47 | button_6 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/aes')))
48 | button_6.send_keys('你的密码')
49 | time.sleep(2)
50 | # 因为会跳出软键盘,会遮挡登录按钮,需点击软键盘取消
51 | TouchAction(driver).press(x=980, y=1850).release().perform()
52 | time.sleep(2)
53 | # 点击登录按钮
54 | button_7 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/abb')))
55 | button_7.click()
56 | time.sleep(2)
57 | # 登录成功,进入抖音视频界面,点击下方标题栏 "我"
58 | TouchAction(driver).press(x=990, y=1850).release().perform()
59 | # 进入个人主页,点击关注处
60 | button_8 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/a_7')))
61 | button_8.click()
62 | # 进入关注栏,点击第二个关注
63 | button_9 = wait.until(EC.presence_of_element_located((By.XPATH, ' /hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.RelativeLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.LinearLayout/android.support.v7.widget.RecyclerView/android.widget.RelativeLayout[2]/android.widget.RelativeLayout[1]')))
64 | button_9.click()
65 | # 进入UP主主页,点击第一个视频
66 | button_10 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/aqm')))
67 | button_10.click()
68 | # 不断下滑页面,直到底部
69 | while True:
70 | TouchAction(driver).press(x=515, y=1247).move_to(x=515, y=1026).release().perform()
71 | time.sleep(float(random.randint(5, 10)))
72 |
73 |
74 | if __name__ == '__main__':
75 | main()
76 |
--------------------------------------------------------------------------------
/scrapy/douyin-grils-down/douyin_download.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import requests
3 | import os
4 |
5 | num = 0
6 | dom = []
7 | folder_path = "F:/video/"
8 | os.makedirs(folder_path, exist_ok=True)  # don't fail if the folder already exists
9 | df = pd.read_csv('douyin.csv', header=None, names=["url"])
10 |
11 | # 对链接去重及刚进入抖音获取的视频链接
12 | for i in df['url'][2:]:
13 | if i not in dom:
14 | dom.append(i)
15 |
16 | # 下载视频
17 | for j in dom:
18 | url = j
19 | num += 1
20 | response = requests.get(url, stream=True)
21 | filename = str(num) + '.mp4'
22 | with open('F:\\video\\' + filename, 'ab+') as f:
23 | f.write(response.content)
24 | f.flush()
25 | print(filename + '下载完成')
26 |
--------------------------------------------------------------------------------
/scrapy/douyin-grils-down/douyin_mitmdump.py:
--------------------------------------------------------------------------------
1 |
2 | def response(flow):
3 | urls = ['http://v1-dy', 'http://v3-dy', 'http://v6-dy', 'http://v9-dy']
4 | # 对url进行筛选,只选取视频的url
5 | for url in urls:
6 | if url in flow.request.url:
7 | print('\n\n抖音视频\n\n')
8 | with open('douyin.csv', 'a+', encoding='utf-8-sig') as f:
9 | f.write(flow.request.url + '\n')
10 |
11 |
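douyin_mitmdump.py is a mitmproxy addon: when the phone's traffic is proxied through mitmdump (run with something like mitmdump -s douyin_mitmdump.py), the module-level response(flow) hook fires for every HTTP response, and matching video urls are appended to douyin.csv for douyin_download.py to pick up. The prefix check can also be written with any(); a minimal equivalent sketch:

    VIDEO_HOST_PREFIXES = ('http://v1-dy', 'http://v3-dy', 'http://v6-dy', 'http://v9-dy')

    def response(flow):
        # keep only responses whose request url points at one of the Douyin video hosts
        if any(prefix in flow.request.url for prefix in VIDEO_HOST_PREFIXES):
            with open('douyin.csv', 'a+', encoding='utf-8-sig') as f:
                f.write(flow.request.url + '\n')
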
--------------------------------------------------------------------------------
/scrapy/ipProxyPool/kuaidaili.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import datetime
4 | import random
5 | import requests
6 | import pandas as pd
7 | from bs4 import BeautifulSoup
8 | from faker import Factory
9 |
10 |
11 | def get_user_agent(num):
12 | """
13 | Generate a list of distinct user-agent request headers
14 | :param num: how many to generate
15 | :return: list
16 | """
17 | factory = Factory.create()
18 | user_agent = []
19 | for i in range(num):
20 | user_agent.append({'User-Agent': factory.user_agent()})
21 | return user_agent
22 |
23 |
24 | def get_proxy(pages, ua_num, target_url):
25 | """
26 | Crawl the free-proxy listings, then clean and assemble the data
27 | :param pages: number of pages to crawl
28 | :param ua_num: number of user-agents to generate
29 | :param target_url: the crawler's target site, used to verify that each proxy ip actually works
30 | :return: list
31 | """
32 | headers = get_user_agent(ua_num) # 请求头
33 | proxy_list = [] # 最后需入库保存的代理池数据
34 | try:
35 | for num in range(0, pages):
36 | print('Start:第 %d 页请求' % (num + 1))
37 | # 请求路径
38 | url = 'https://www.kuaidaili.com/free/inha/' + str(num + 1) + '/'
39 |
40 | # 随机延时(randint生成的随机数n: a <= n <= b ;random产生 0 到 1 之间的随机浮点数)
41 | time.sleep(random.randint(1, 2) + random.random())
42 | header_i = random.randint(0, len(headers) - 1) # 随机获取1个请求头
43 |
44 | # BeautifulSoup 解析
45 | html = requests.get(url, headers=headers[header_i])
46 | soup = BeautifulSoup(html.text, 'lxml')
47 |
48 | # CSS 选择器
49 | ip = soup.select("td[data-title='IP']")
50 | port = soup.select("td[data-title='PORT']")
51 | degree = soup.select("td[data-title='匿名度']")
52 | proxy_type = soup.select("td[data-title='类型']")
53 | position = soup.select("td[data-title='位置']")
54 | speed = soup.select("td[data-title='响应速度']")
55 | last_time = soup.select("td[data-title='最后验证时间']")
56 |
57 | # 循环验证是否有效
58 | for i, p, dg, pt, ps, sp, lt in zip(ip, port, degree, proxy_type, position, speed, last_time):
59 | ip_port = str(i.get_text()) + ':' + str(p.get_text())
60 | # 调用验证的方法
61 | flag = is_useful(ip_port, headers[header_i], target_url)
62 | if flag:
63 | # 拼装字段
64 | p_ip = str(i.get_text())
65 | p_port = str(p.get_text())
66 | p_degree = str(dg.get_text())
67 | p_type = str(pt.get_text())
68 | p_position = str(ps.get_text()).rsplit(' ', 1)[0]
69 | p_operator = str(ps.get_text()).rsplit(' ')[-1]
70 | p_speed = str(sp.get_text())
71 | p_last_time = str(lt.get_text())
72 |
73 | proxy_list.append([p_ip, p_port, p_degree, p_type, p_position, p_operator, p_speed, p_last_time])
74 | print('End:第 %d 页结束!==========================' % (num + 1))
75 |
76 | except Exception as e:
77 | print('程序 get_proxy 发生错误,Error:', e)
78 |
79 | finally:
80 | # 调用保存的方法
81 | write_proxy(proxy_list)
82 |
83 | return proxy_list
84 |
85 |
86 | def is_useful(ip_port, headers, target_url):
87 | """
88 | Check whether a proxy ip is usable
89 | :param ip_port: ip + port
90 | :param headers: a randomly chosen request header
91 | :param target_url: the crawler's target site, used to verify that the proxy ip actually works
92 | :return: bool
93 | """
94 | url = target_url # 验证ip对目标地址的有效性
95 | proxy_ip = 'http://' + ip_port
96 | proxies = {'http': proxy_ip, 'https': proxy_ip}  # map both schemes, otherwise an https target_url bypasses the proxy entirely
97 | flag = True
98 | try:
99 | requests.get(url=url, headers=headers, proxies=proxies, timeout=2)
100 | print("【可用】:" + ip_port)
101 | except Exception as e:
102 | print('程序 is_useful 发生错误,Error:', e)
103 | flag = False
104 | return flag
105 |
106 |
107 | def write_proxy(proxy_list):
108 | """
109 | Save the cleaned list data to an xlsx file
110 | :param proxy_list: the proxy-pool data list
111 | :return: bool
112 | """
113 | date_now = datetime.datetime.now().strftime('%Y%m%d%H%M%S') # 当前时间
114 | flag = True # 保存成功标志
115 | print('--- 开始保存 ---')
116 | try:
117 | df = pd.DataFrame(proxy_list,
118 | columns=['ip', 'port', 'degree', 'type', 'position', 'operator', 'speed', 'last_time'])
119 | df.to_excel(date_now + '_proxy.xlsx', index=False)
120 | print('--- 保存成功!---')
121 | except Exception as e:
122 | print('--- 保存失败!---:', e)
123 | flag = False
124 | return flag
125 |
126 |
127 | def read_ip():
128 | """
129 | Read the proxy pool and return a list of ip:port strings
130 | :return: list
131 | """
132 | # 最新爬虫数据文件名(列表推导式写法)
133 | file_name = [f for f in os.listdir("./") if f.split('.')[-1] == 'xlsx'][-1]
134 | # 读取文件
135 | proxy_list = pd.read_excel('./' + file_name)
136 | proxy_list['port'] = proxy_list['port'].astype('str') # 先将端口号的整型转为字符串
137 | proxy_list['ip_port'] = proxy_list['ip'].str.cat(proxy_list['port'], sep=':') # 组合成ip+port
138 | return list(proxy_list['ip_port'])
139 |
140 |
141 | def main():
142 | """
143 | Entry point
144 | """
145 | pages = 10 # 定义爬取页数
146 | ua_num = 3 # 定义需生成user-agent个数
147 | target_url = 'https://everia.club/' # 爬虫的目标地址,作为验证代理池ip的有效性
148 | proxy_list = get_proxy(pages, ua_num, target_url)
149 | print(proxy_list)
150 |
151 |
152 | if __name__ == '__main__':
153 | # 1.主方法
154 | # main()
155 | # 2.读取代理池
156 | print(read_ip())
157 |
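read_ip above returns the ip:port strings from the latest proxy xlsx in the working directory. A minimal sketch of feeding one of them into a requests call, mirroring the proxies mapping used in is_useful (the echo url is just an example target; the snippet is assumed to run alongside kuaidaili.py so read_ip is available):

    import random
    import requests

    ip_list = read_ip()                          # e.g. ['1.2.3.4:8080', ...]
    proxy = 'http://' + random.choice(ip_list)
    proxies = {'http': proxy, 'https': proxy}
    print(requests.get('https://httpbin.org/ip', proxies=proxies, timeout=5).text)
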
--------------------------------------------------------------------------------
/scrapy/jdCellPhone/cellPhone.py:
--------------------------------------------------------------------------------
1 | import json
2 | import argparse
3 | import time
4 | import re
5 | import requests
6 | import pymongo
7 | import numpy as np
8 | import pandas as pd
9 | from lxml import etree
10 | from wordcloud import WordCloud
11 | import matplotlib.pyplot as plt
12 |
13 | DB = "cellphone"
14 |
15 | def fix_url(string):
16 | if re.match(r"http://", string):
17 | return string
18 | if re.match(r"//", string):
19 | return "http:" + string
20 |
21 | def get_page_num():
22 | url = "https://list.jd.com/list.html?cat=9987,653,655"
23 | r = requests.get(url, verify=False)
24 | content = r.content
25 | root = etree.HTML(content)
26 | page_nodes = root.xpath('.//span[@class="p-num"]/a')
27 | for node in page_nodes:
28 | if node.attrib["class"] == "":
29 | page_num = int(node.text)
30 | return page_num
31 |
32 | def get_price(skuid):
33 | url = "https://c0.3.cn/stock?skuId=" + str(skuid) + "&area=1_72_4137_0&venderId=1000004123&cat=9987,653,655&buyNum=1&choseSuitSkuIds=&extraParam={%22originid%22:%221%22}&ch=1&fqsp=0&pduid=15379228074621272760279&pdpin=&detailedAdd=null&callback=jQuery3285040"
34 | r = requests.get(url, verify=False)
35 | content = r.content.decode('GBK')
36 | matched = re.search(r'jQuery\d+\((.*)\)', content, re.M)
37 | if matched:
38 | data = json.loads(matched.group(1))
39 | price = float(data["stock"]["jdPrice"]["p"])
40 | return price
41 | return 0
42 |
43 | def get_item(skuid, url):
44 | price = get_price(skuid)
45 | r = requests.get(url, verify=False)
46 | content = r.content
47 | root = etree.HTML(content)
48 | nodes = root.xpath('.//div[@class="Ptable"]/div[@class="Ptable-item"]')
49 | params = {"price": price, "skuid": skuid}
50 | for node in nodes:
51 | text_nodes = node.xpath('./dl')[0]
52 | k = ""
53 | v = ""
54 | for text_node in text_nodes:
55 | if text_node.tag == "dt":
56 | k = text_node.text
57 | elif text_node.tag == "dd" and "class" not in text_node.attrib:
58 | v = text_node.text
59 | params[k] = v
60 | return params
61 |
62 | def get_cellphone(page):
63 | url = "https://list.jd.com/list.html?cat=9987,653,655&page={}&sort=sort_rank_asc&trans=1&JL=6_0_0&ms=4#J_main".format(page)
64 | r = requests.get(url, verify=False)
65 | content = r.content.decode("utf-8")
66 | root = etree.HTML(content)
67 | cell_nodes = root.xpath('.//div[@class="p-img"]/a')
68 | client = pymongo.MongoClient()
69 | db = client[DB]
70 | for node in cell_nodes:
71 | item_url = fix_url(node.attrib["href"])
72 | matched = re.search(r'item.jd.com/(\d+)\.html', item_url)
73 | skuid = int(matched.group(1))
74 | saved = db.items.find({"skuid": skuid}).count()
75 | if saved > 0:
76 | print(saved)
77 | continue
78 | item = get_item(skuid, item_url)
79 |         db.items.insert_one(item)  # insert() is deprecated; insert_one() is the current API
80 |
81 | def norm_weight(weight_str):
82 |     matched = re.search(r'(\d+)', weight_str)
83 |     weight = 0
84 |     if matched:
85 |         weight = float(matched.group(1))  # keep the weight numeric instead of returning the matched string
86 |     return weight
87 |
88 | def norm_screen_size(screen_size_str):
89 | matched = re.search(r'(\d+\.\d+)', screen_size_str)
90 | screen_size = 0
91 | if matched:
92 | screen_size = float(matched.group(1))
93 | return screen_size
94 |
95 | def norm_rom(rom_str):
96 | rom = 0
97 | matched = re.search(r'(\d+)MB', rom_str)
98 | if matched:
99 | rom = float(matched.group(1)) / 1024
100 | matched = re.search(r'(\d+)TB', rom_str)
101 | if matched:
102 | rom = float(matched.group(1)) * 1024
103 | matched = re.search(r'(\d+)GB', rom_str)
104 | if matched:
105 | rom = float(matched.group(1))
106 | return rom
107 |
108 | def norm_ram(ram_str):
109 | ram = 0
110 | matched = re.search(r'(\d+)MB', ram_str)
111 | if matched:
112 | ram = float(matched.group(1)) / 1024
113 | matched = re.search(r'(\d+)GB', ram_str)
114 | if matched:
115 | ram = float(matched.group(1))
116 | return ram
117 |
118 | def norm_screen_res(screen_res_str):
119 | width = 0
120 | height = 0
121 | matched = re.search(r'(\d+)[x*](\d+)', screen_res_str)
122 | if matched:
123 | width = matched.group(2)
124 | height = matched.group(1)
125 | return (width, height)
126 |
127 | def norm_battery_cap(battery_cap_str):
128 | items = re.findall(r'(\d+)', battery_cap_str)
129 | items = list(map(lambda x: int(x), items))
130 | if len(items) == 0:
131 | return 0
132 | return max(items)
133 |
134 | def norm_front_cam(front_cam_str):
135 | pass
136 |
137 | def norm_back_cam(back_cam_str):
138 | pass
139 |
140 | def norm_dual_sim(dual_sim_str):
141 | if dual_sim_str is None:
142 | return False
143 |
144 | dual_sim = False
145 | matched = re.search(r'双卡双待', dual_sim_str)
146 | if matched:
147 | dual_sim = True
148 | return dual_sim
149 |
150 | def preprocess(items):
151 | result = []
152 | for item in items:
153 | if '品牌' not in item:
154 | continue
155 |
156 | weight_str = item.get('机身重量(g)', '')
157 | weight = norm_weight(weight_str)
158 | screen_size_str = item.get('主屏幕尺寸(英寸)', '')
159 | screen_size = norm_screen_size(screen_size_str)
160 | rom_str = item.get('ROM', '')
161 | rom = norm_rom(rom_str)
162 | ram_str = item.get('RAM', '')
163 | ram = norm_ram(ram_str)
164 | screen_res_str = item.get('分辨率', '')
165 | screen_res_width, screen_res_height = norm_screen_res(screen_res_str)
166 | battery_cap_str = item.get('电池容量(mAh)', '')
167 | battery_cap = norm_battery_cap(battery_cap_str)
168 | front_cam_str = item.get('前置摄像头', '')
169 | front_cam = norm_front_cam(front_cam_str)
170 | back_cam_str = item.get('后置摄像头')
171 | back_cam = norm_back_cam(back_cam_str)
172 | dual_sim_str = item.get('双卡机类型')
173 | dual_sim = norm_dual_sim(dual_sim_str)
174 |
175 | cellphone = {
176 | "brand": item.get('品牌'),
177 | "model": item.get('型号'),
178 | "color": item.get('机身颜色'),
179 | "weight": weight,
180 | "material": item.get('机身材质分类'),
181 | "cpu_brand": item.get('CPU品牌'),
182 | "cpu_freq": item.get('CPU频率'),
183 | "cpu_core": item.get('CPU核数'),
184 | "cpu_model": item.get('CPU型号'),
185 | "gpu_model": item.get('GPU型号'),
186 | "dual_sim": dual_sim,
187 | "network_4g": item.get('4G网络'),
188 | "rom": rom,
189 | "ram": ram,
190 | "screen_size": screen_size,
191 | "screen_res_width": screen_res_width,
192 | "screen_res_height": screen_res_height,
193 | "screen_mat": item.get('屏幕材质类型'),
194 | "battery_cap": battery_cap,
195 | "front_cam": item.get('前置摄像头'),
196 | "back_cam": item.get('后置摄像头'),
197 | "price": item.get('price'),
198 | }
199 | result.append(cellphone)
200 | return result
201 |
202 | def query():
203 | client = pymongo.MongoClient()
204 | db = client[DB]
205 | items = db.items.find({})
206 | result = preprocess(items)
207 | df = pd.DataFrame(result)
208 | #df.drop_duplicates(subset=["brand", "model", "rom", "ram"], inplace=True)
209 |     mask = (df.cpu_brand == "骁龙(Snapdragon)") & (df.battery_cap >= 3000) & (df.rom >= 64) & (df.ram >= 6) & (df.dual_sim == True) & (df.price <= 1500) & (df.brand == "小米(MI)")  # one boolean mask instead of chained indexing
210 |     df_res = df[mask]
211 |     df_res[["brand", "model", "color", "cpu_brand", "cpu_freq", "cpu_core", "cpu_model", "rom", "ram", "battery_cap", "price"]].sort_values(by=["price", "battery_cap"], ascending=[True, False]).to_csv("cellPhone.csv", encoding="GBK")
212 |     return df_res
213 | if __name__ == "__main__":
214 | parser = argparse.ArgumentParser()
215 | parser.add_argument("--save", help="save data from web", action="store_true", dest="save")
216 | parser.add_argument("--query", help="query data from DB", action="store_true", dest="query")
217 | args = parser.parse_args()
218 |
219 | if args.save:
220 | page_num = get_page_num()
221 | for i in range(page_num):
222 | get_cellphone(i)
223 | elif args.query:
224 | query()
225 |
--------------------------------------------------------------------------------
/scrapy/postgraduate_colleges/PostgraduateColleges.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/scrapy/postgraduate_colleges/PostgraduateColleges.xlsx
--------------------------------------------------------------------------------
/scrapy/postgraduate_colleges/postgraduatecolleges.csv:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/scrapy/postgraduate_colleges/字段属性.txt:
--------------------------------------------------------------------------------
1 | university: university name
2 | attributes: institution attributes
3 |
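4 | A minimal loading sketch (an assumption: the CSV uses exactly these two column names and UTF-8 encoding):
5 |
6 |     import pandas as pd
7 |     df = pd.read_csv('postgraduatecolleges.csv', encoding='utf-8')
8 |     print(df[['university', 'attributes']].head())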
--------------------------------------------------------------------------------
/scrapy/scrapy163musicComments/scrapyWyycomments.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | # __author__ = 'Yinux'
4 |
5 |
6 | import json
7 | import random
8 | import os  # used below to check whether the CSV file already exists
9 | import requests
10 | import time
11 | import csv
12 | import codecs
13 | from bs4 import BeautifulSoup  # needed by get_163music(); missing from the original imports
14 |
15 | """
16 | 爬取网易云音乐歌曲的精彩评论
17 | @Author monkey
18 | @Date 2018-6-6
19 | """
20 |
21 |
22 | def start_spider(song_id):
23 | """ 评论数据采用 AJAX 技术获得, 下面才是获取评论的请求地址 """
24 | url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_{}?csrf_token='.format(song_id)
25 |
26 | headers = {
27 | 'User-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36',
28 | 'Origin': 'http://music.163.com',
29 | 'Referer': 'http://music.163.com/song?id={}'.format(song_id),
30 | }
31 |
32 | formdata = {
33 | 'params': '57Wh2mgebLOOPQVBc+B2wz4sCCH/nXZFEoTc/XNySiqT0V7ZxUADzDNgTXXhYgAJ5BNMryMgxhdwNzF1GyxDZo3iR9/YYbWgCAQHC5DCDuObqvxNcOcnQDaRqJCrqQcrEABW1SwKitfbD3wMEyB4tJu+rU8goSwg2FP/PBBLs9DVs1iWdWGjV6CdrocA36Rs',
34 | 'encSecKey': '63774137ba4f5cc60d1b6a3bc14985a9563a7bfdec4f3e74297ffc07514adf18f90620933a01c2db4ca989cc4e1dfc49789981424c294a34e48c2cbe7aa51533a5cc5b5776a9e499cd08770bc596655dbe8e001d1ed5fd47a27dd195128480820cc67a799d341f95d447e3522851f2b64ad1cb8350e2015b265b9e684179351c',
35 | }
36 |
37 | response = requests.post(url, headers=headers, data=formdata)
38 |     print('Requested [ ' + url + ' ], status code:')
39 |     print(response.status_code)
40 |     # get_hot_comments(response.text)
41 |     # write the data to the CSV file
42 |     write_to_file(get_hot_comments(response.text))
43 |
44 | def get_163music(url):
45 | user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
46 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
47 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/61.0",
48 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
49 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"]
50 | header={'User-Agent':'Mozilla/5.0'}
51 | header['User-Agent'] = random.choice(user_agent_list)
52 | text = requests.session()
53 | response=text.get(url,headers = header).content
54 | text = BeautifulSoup(response,'lxml')
55 | content = text.find('ul',{'class':'f-hide'})
56 | playlist = []
57 | site = 'https://music.163.com/#'
58 | for music in content.find_all('a'):
59 | playlist.append(site + music['href'])
60 | # print('{} : {}'.format(music.text, music['href']))
61 | return playlist
62 |
63 | def get_hot_comments(response):
64 | """ 获取精彩评论
65 | 请求返回结果是 Json 数据格式, 使用 json.loads(response) 将其转化为字典类型, 就可以使用 key-value 形式获取值
66 | """
67 | data_list = []
68 | data = {}
69 |
70 | for comment in json.loads(response)['hotComments']:
71 | data['userId'] = comment['user']['userId']
72 | data['nickname'] = comment['user']['nickname']
73 | data['content'] = comment['content']
74 | data['likedCount'] = comment['likedCount']
75 | data_list.append(data)
76 | data = {}
77 | # print(data_list)
78 | return data_list
79 |
80 |
81 | def write_to_file(datalist):
82 |     print('Persisting data ...')
83 |     file_name = '网易云音乐精彩评论.csv'
84 |     write_header = not os.path.exists(file_name) or os.path.getsize(file_name) == 0
85 |     with codecs.open(file_name, 'a+', 'utf-8_sig') as csvfile:
86 |         fieldnames = ['用户Id', '昵称', '评论内容', '点赞数']
87 |         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
88 |
89 |         if write_header: writer.writeheader()  # write the header only once, not on every append
90 |         for data in datalist:
91 |             print(data)
92 |             try:
93 |                 writer.writerow({fieldnames[0]: data['userId'],
94 |                                  fieldnames[1]: data['nickname'],
95 |                                  fieldnames[2]: data['content'],
96 |                                  fieldnames[3]: data['likedCount']})
97 |             except UnicodeEncodeError:
98 |                 print("Encoding error; this record cannot be written and is skipped")
99 |
100 |     print('Data successfully written to ' + file_name + '!')
101 |
102 |
103 | def get_song_id(url):
104 | """ 从 url 中截取歌曲的 id """
105 | song_id = url.split('=')[1]
106 | return song_id
107 |
108 |
109 | def main():
110 | url = 'http://music.163.com/playlist?id=987444580'
111 | songs_url_list = get_163music(url)
112 |
113 | for each in songs_url_list:
114 | start_spider(get_song_id(each))
115 | time.sleep(random.randint(5, 8))
116 |
117 |
118 | if __name__ == '__main__':
119 | main()
120 |
121 |
--------------------------------------------------------------------------------
/scrapy/vip-item/README.md:
--------------------------------------------------------------------------------
1 | # Scraping vip.com product listings with bs4 (BeautifulSoup) #
2 | ## WeChat Official Account ##
3 | 
4 | Scan the QR code to follow
5 | ## Blog ##
6 | [YINUXY'S BLOG](https://blog.yinuxy.com/)
7 |
8 | ## Installation & Usage ##
9 | ### Install dependencies ###
10 | `pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com`
11 | ### Get the code ###
12 | `git clone git@github.com:InfiniteYinux/Python.git`
13 | ### Run ###
14 | ```
15 | cd scrapy\vip-item
16 | python vip.py
17 | ```
18 | ## Updates ##
19 | 1. 2020-05-02 Added a new way to store the data: save to a database
--------------------------------------------------------------------------------
/scrapy/vip-item/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | beautifulsoup4
3 | lxml
4 | selenium
5 | pymysql
--------------------------------------------------------------------------------
/scrapy/vip-item/vip.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Apr 30 21:01:12 2020
4 |
5 | @author: Yinux
6 | """
7 |
8 | import json
9 | import random
10 | import requests
11 | from bs4 import BeautifulSoup
12 | from selenium import webdriver
13 | from time import sleep
14 | import time
15 | import pymysql.cursors
16 | #FEED_EXPORT_ENCODING = 'utf-8'
17 |
18 | class VipSpider(object):
19 |     def __init__(self, url, search, start_page, end_page):
20 |         self.url = url
21 |         self.search = search
22 |         self.start_page = start_page
23 |         self.end_page = end_page
24 |         # Pool of User-Agent strings; a dict cannot hold duplicate keys, so keep them in a list
25 |         self.user_agent_list = [
26 |             "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
27 |             "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
28 |             "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
29 |             "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
30 |             "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
31 |             "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
32 |             "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
33 |             "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
34 |             "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
35 |             "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
36 |             "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
37 |             "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
38 |             "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
39 |             "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
40 |             "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
41 |             "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
42 |             "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
43 |         self.headers = {"User-Agent": random.choice(self.user_agent_list)}
44 |         self.proxy_list = [  # requests expects one proxies dict per request, so keep a list and pick one at random
45 |             {"http": "123.101.213.98:9999"},
46 |             {"http": "114.101.42.127:65309"},
47 |             {"http": "39.106.194.91:808"},
48 |             {"http": "122.51.231.113:8080"},
49 |             {"http": "36.248.132.250:9999"},
50 |             {"http": "180.118.128.54:9000"},
51 |             {"http": "113.195.224.194:9999"},
52 |             {"http": "39.108.59.34:8118"},
53 |             {"http": "47.94.200.124:3128"},
54 |             {"http": "163.204.246.83:9999"},
55 |             {"http": "113.124.94.72:9999"}]
56 | self.driver = webdriver.Chrome()
57 | self.conn=pymysql.connect(host="127.0.0.1",
58 | user="username",
59 | passwd="pasword",
60 | charset='utf8mb4',
61 | cursorclass=pymysql.cursors.DictCursor)
62 | self.cur = self.conn.cursor()
63 | self.cur.execute("CREATE DATABASE IF NOT EXISTS `jobs`")
64 | self.cur.execute("USE jobs")
65 | self.cur.execute("DROP TABLE IF EXISTS `web_51jobs`")
66 | self.cur.execute("CREATE TABLE `web_51jobs` (`id` INT PRIMARY KEY AUTO_INCREMENT,`position` varchar(200) DEFAULT NULL,`wages` varchar(200) DEFAULT NULL,`region` varchar(200) DEFAULT NULL,`experience` varchar(200) DEFAULT NULL,`education` varchar(200) DEFAULT NULL,`need_people` varchar(100) DEFAULT NULL,`publish_date` varchar(200) DEFAULT NULL,`english` varchar(300) DEFAULT NULL,`welfare_tags` varchar(200) DEFAULT NULL,`job_information` varchar(200) DEFAULT NULL,`work_address` varchar(200) DEFAULT NULL,`company_name` varchar(200) DEFAULT NULL,`company_nature` varchar(200) DEFAULT NULL,`company_scale` varchar(200) DEFAULT NULL,`company_industry` varchar(200) DEFAULT NULL,`company_information` varchar(200) DEFAULT NULL,PRIMARY KEY (`id`))")
67 |
68 |
69 | def handle_click(self):
70 | self.driver.get(self.url)
71 | self.driver.find_elements_by_xpath("//*[@id='J_main_nav_link']/li[13]/a")[0].click()
72 | sleep(2)
73 | self.driver.find_elements_by_xpath("//*[@id='J-search']/div[1]/input")[0].send_keys(self.search)
74 | sleep(2)
75 | self.driver.find_elements_by_xpath("//*[@id='J-search']/div[1]/a/span")[0].click()
76 | sleep(3)
77 |
78 | def handle_url(self, page):
79 | Durl = self.driver.current_url # "https://category.vip.com/suggest.php?keyword=%E7%AF%AE%E7%90%83&ff=235|12|1|1"
80 | index = Durl.rfind("&")
81 | Durl = Durl[:index]
82 | data = {
83 | "page": page
84 | }
85 |         res = requests.get(url=Durl, params=data, headers=self.headers, proxies=random.choice(self.proxy_list))
86 | newurl = res.url
87 | print(newurl)
88 | return newurl
89 |
90 | def scroll_page(self, req):
91 | self.driver.get(req)
92 | sleep(3)
93 | for x in range(20):
94 | js = "var q=document.documentElement.scrollTop=10000"
95 |             self.driver.execute_script(js)  # run the scroll script
96 | sleep(5)
97 | html = self.driver.page_source
98 |
99 | return html
100 |
101 |     def downloadin(self, url):
102 |         req = requests.get(url, headers=self.headers)
103 |         soup = BeautifulSoup(req.content, "lxml")
104 |         GoodsList = soup.select("div.pi-title-box")
105 |         shopname, desc = '', ''  # defaults in case the detail page has no matching node
106 |         for div in GoodsList:
107 |             shopname = div.a.get_text()
108 |             try:
109 |                 desc = div.select("span.goods-description-title")[0].get_text()
110 |             except IndexError:
111 |                 desc = ''
112 |         return shopname, desc
113 | def download(self, request):
114 | soup = BeautifulSoup(request, "lxml")
115 | SectionList = soup.select("section#J_searchCatList")[0]
116 | GoodsList = SectionList.select("div.c-goods")
117 | items = []
118 | for div in GoodsList:
119 | item = {}
120 | itemlink = div.select("h4.goods-info a")[0].get('href')
121 | imageslink = div.img["data-original"]
122 | title = div.select("h4.goods-info a")[0].get_text()
123 | discount = div.select("div.goods-info span")[0].get_text()
124 | pricewra = div.select("div.goods-info em")[0].get_text()
125 | marprice = div.select("div.goods-info del.goods-market-price ")[0].get_text()
126 | item["商品链接"] = 'http:' + itemlink
127 | item["图片链接"] = 'http:' + imageslink
128 | item["商品名称"] = title
129 | item["商品折扣"] = discount
130 | item["特卖价格"] = pricewra
131 | item["原始价格"] = marprice
132 | item["商铺名称"], item["商品描述"] = self.downloadin(item["商品链接"])
133 | self.process_item(item)
134 | items.append(item)
135 |
136 | return items
137 |
138 | def process_item(self,item):
139 | # self.cur = self.conn.cursor()
140 | try:
141 | itemurl = item["商品链接"]
142 | imageurl = item["图片链接"]
143 | title = item["商品名称"]
144 | discount = item["商品折扣"]
145 | saleprice = item["特卖价格"]
146 | oldprice = item["原始价格"]
147 | shopname = item["商铺名称"]
148 | description = item["商品描述"]
149 | sql = "INSERT INTO `Cosmetics` (`title`, `discount`,`saleprice`,`oldprice`,`shopname`,`description`, `imageurl`,`itemurl`) VALUES ('"+title+"','"+discount+"','"+saleprice+"','"+oldprice+"','"+shopname+"','"+description+"','"+imageurl+"','"+itemurl+"')"
150 | self.cur.execute(sql)
151 | self.conn.commit()
152 | # self.conn.close()
153 | except Exception as err:
154 | print(err)
155 |
156 | def startSpider(self):
157 | htmlList = []
158 | for page in range(int(self.start_page), int(self.end_page) + 1):
159 | print("正在抓取第"+ str(page) +"页的数据")
160 | start = time.time()
161 | if page == 1:
162 | self.handle_click()
163 | req = self.handle_url(page)
164 | newhtml = self.scroll_page(req)
165 | htmlList += self.download(newhtml)
166 | else:
167 | req = self.handle_url(page)
168 | newhtml = self.scroll_page(req)
169 | htmlList += self.download(newhtml)
170 | end = time.time()
171 | print("第"+ str(page) +"页的数据抓取完毕,用时"+ str(end-start) +"s")
172 | # 【数据的存储】写入json数据
173 | # 将列表转化成json字符串
174 |
175 | string = json.dumps(htmlList,ensure_ascii=False)
176 | with open("vip2.json", "w", encoding="utf-8") as fp:
177 | fp.write(string)
178 | self.conn.close()
179 |
180 |
181 | def main():
182 | starts = time.time()
183 | url = "http://www.vip.com/"
184 | search = '化妆品'
185 |     # search = input("Enter the product keyword to search for: ")
186 |     start_page = 1
187 |     # start_page = input("Enter the first page to scrape: ")
188 |     end_page = 40
189 |     # end_page = input("Enter the last page to scrape: ")
190 |     spider = VipSpider(url, search, start_page, end_page)
191 |     spider.startSpider()
192 |     ends = time.time()
193 |     print("Finished; total runtime " + str(round((ends - starts) / 60, 1)) + " minutes")
194 |
195 | if __name__ == '__main__':
196 | main()
--------------------------------------------------------------------------------
/scrapy/ximalaya/ximalaya.py:
--------------------------------------------------------------------------------
1 |
2 | import requests
3 | from bs4 import BeautifulSoup
4 | import re
5 | import os
6 | import random
7 | import time
8 |
9 | '''
10 | Author: pk哥
11 | WeChat official account: brucepk
12 | Date: 2018/10/11
13 | A detailed walkthrough of the code is published on the "brucepk" official account.
14 |
15 | For questions or reposting, contact WeChat: dyw520520 and state your purpose.
16 | To join the Python discussion group, add the same WeChat account with the note "join group".
17 | '''
18 |
19 |
20 | def gethtml(url):  # fetch the page html
21 |     headers = {
22 |         'User-Agent':
23 |             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
24 |     # proxy ip; if it gets blocked, pick a new one from http://www.xicidaili.com/
25 |     proxy_addr = {'http': '221.7.255.168:8080'}
26 |     html = requests.get(url, headers=headers, proxies=proxy_addr)  # request the page
27 |     return html
28 |
29 |
30 | def getid():  # get the album ids and titles
31 |     keyword = input('Enter the keyword of the audio to search for:\n')  # keyword of the audio to download
32 |     albumurl = 'https://www.ximalaya.com/search/album/{}/sc/p1'.format(keyword)  # build the search url from the keyword
33 |     html = gethtml(albumurl)
34 |     soup = BeautifulSoup(html.text, 'lxml')
35 |     info = soup.select('#searchPage div.search-type div.common-tab-content div.xm-loading ul div '
36 |                        'a.xm-album-title.ellipsis-2')  # extract the album nodes
37 |     idinfo = re.compile('href="/.*?"').findall(str(info))  # extract the album ids
38 |     titleinfo = re.compile('title=".*?"').findall(str(info))  # extract the album titles
39 |     ids = []
40 |     titles = []
41 |     for j in idinfo:
42 |         id = str(j).split('/')[2]
43 |         ids.append(id)
44 |     for t in titleinfo:
45 |         # clean the title so that creating a folder from it cannot fail
46 |         title = str(t).split('"')[1].replace('\\', ' ').replace('/', ' ').replace(':', ' ').replace('*', ' ')\
47 |             .replace('?', ' ').replace('"', ' ').replace('<', ' ').replace('>', ' ').replace('|', ' ')
48 |         titles.append(title)
49 |     return ids, titles
50 |
51 |
52 | def downm4a(albumId):
53 |     # total number of tracks in the album
54 |     counturl = 'https://www.ximalaya.com/revision/album/getTracksList?albumId={}&pageNum=1'.format(albumId)
55 |     chtml = gethtml(counturl)
56 |     cjson = chtml.json()
57 |     trackTotalCount = int(cjson['data']['trackTotalCount'])
58 |     if trackTotalCount <= 30:  # 30 tracks or fewer fit on a single page
59 |         pageNum = 1
60 |     else:
61 |         if trackTotalCount % 30 == 0:  # more than 30 tracks and an exact multiple of 30
62 |             pageNum = trackTotalCount // 30
63 |         else:
64 |             pageNum = (trackTotalCount // 30) + 1  # more than 30 tracks, round the page count up
65 |     for num in range(1, pageNum+1):
66 |         m4aurl = 'https://www.ximalaya.com/revision/play/album?albumId={}&pageNum={}&pageSize=30'.format(albumId, num)  # url that lists the downloadable tracks
67 |         mhtml = gethtml(m4aurl)
68 |         mjson = mhtml.json()
69 |         for i in range(30):  # at most 30 tracks per page
70 |             try:
71 |                 trackName = mjson['data']['tracksAudioPlay'][i]['trackName']  # track title
72 |                 src = mjson['data']['tracksAudioPlay'][i]['src']  # downloadable link
73 |                 print(trackName)
74 |                 print(src)
75 |                 if str(src) in ('null', 'None'):  # paid track: skip this album and move on to the next
76 |                     print('Paid audio; cannot be downloaded')
77 |                     break
78 |                 data = requests.get(src).content
79 |                 with open('%s.m4a' % trackName, 'wb') as f:  # save the audio
80 |                     f.write(data)
81 |             except IndexError:
82 |                 print('This album has been fully downloaded!')
83 |                 continue
84 |
85 |
86 | def mkdir():  # create the album folder if it does not exist yet
87 |     ids, titles = getid()
88 |     for title, albumId in zip(titles, ids):
89 |         print(title)
90 |         path = 'E:\\spiderproject\\ximalaya\\{}'.format(title)  # folder named after the album
91 |         isExists = os.path.exists(path)
92 |         if not isExists:
93 |             print('Creating folder {}'.format(title))  # create the folder if it is missing
94 |             os.makedirs(path)
95 |             os.chdir(path)  # switch into the new folder
96 |             downm4a(albumId)  # download the audio into it
97 |         else:
98 |             print('Folder {} already exists; saving there!'.format(title))
99 |             os.chdir(path)  # switch into the existing folder
100 |             downm4a(albumId)
101 |         time.sleep(random.randint(2, 6))  # random wait; the original int(format(...)) round-trip was unnecessary
102 |
103 |
104 | if __name__ == '__main__':
105 | mkdir()
106 |
107 |
108 |
--------------------------------------------------------------------------------
/scrapy/yunzhanImgToPdf/README.md:
--------------------------------------------------------------------------------
1 | Currently only URLs of the form https://book.yunzhan365.com/xxxx/xxxx/mobile/index.html
2 | or http://www.yunzhan365.com/xxxxxxxxx.html can be downloaded; other URLs will not work.
3 | If you got the link from a plain-text viewing page, open the flip-book page and copy that URL before starting the crawl. (PS: mainly out of laziness, I did not want to write more branch conditions.)
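4 |
5 | A quick pre-flight check of the link (a sketch; the two patterns simply mirror the rules above, and the helper name `is_supported` is made up):
6 |
7 | ```python
8 | import re
9 |
10 | def is_supported(url):
11 |     book = r'https?://book\.yunzhan365\.com/[^/]+/[^/]+/mobile/index\.html'
12 |     plain = r'https?://www\.yunzhan365\.com/[^/]+\.html'
13 |     return bool(re.match(book, url) or re.match(plain, url))
14 | ```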
--------------------------------------------------------------------------------
/scrapy/yunzhanImgToPdf/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 | from lxml import etree
4 | import img2pdf
5 |
6 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
7 | url = input('Enter the yunzhan365 gallery URL: ')
8 | splurl = url.split('/')  # split the URL to decide which kind of link it is
9 | if 'index.html' not in splurl:  # plain www.yunzhan365.com page
10 |     res = requests.get(url, headers=headers)  # fetch the page source
11 |     res.encoding = res.apparent_encoding
12 |     xml = etree.HTML(res.text).xpath('//div[@class="show-book-title"]/a/@href')[0].split('/')  # grab and split the book.yunzhan365.com address
13 |     purl = xml[2] + '/' + xml[3] + '/' + xml[4] + '/files/' + 'mobile/'  # build the image URL prefix
14 |     pathname = etree.HTML(res.text).xpath('//div[@class="show-book-title"]/a/text()')  # get the title
15 | else:  # flip-book page (.../mobile/index.html)
16 |     res = requests.get(url, headers=headers)  # fetch the page source
17 |     res.encoding = res.apparent_encoding
18 |     pathname = etree.HTML(res.text).xpath('/html/head/title/text()')  # get the title
19 |     purl = splurl[2] + '/' + splurl[3] + '/' + splurl[4] + '/files/' + 'mobile/'  # build the image URL prefix
20 |
21 | path = './'  # output directory
22 | if not os.path.exists(path):
23 |     os.makedirs(path)  # create it if missing
24 | m = 0  # image counter, also used as the file name
25 | imgs = []  # collected image bytes
26 | with open(path + '/' + str(pathname[0]) + '.pdf', 'wb') as f:  # create the target pdf file
27 |     while True:  # keep fetching images until one is missing
28 |         m += 1
29 |         surl1 = 'http://' + purl + str(m) + '.jpg'  # build the image URL
30 |         picurl = requests.get(surl1)  # fetch the image
31 |         if picurl.status_code == 200:  # the image exists, keep its bytes
32 |             imgs.append(picurl.content)
33 |         else:
34 |             f.write(img2pdf.convert(imgs))  # write all collected images into the pdf
35 |             print(f'Done! Collected {m - 1} images; the PDF is 【{pathname[0]}.pdf】 in {path}')
36 |             break  # stop the loop
--------------------------------------------------------------------------------
/scrapy/zhihu-pretty-girl/READEME.md:
--------------------------------------------------------------------------------
1 | ## WeChat Official Account ##
2 | 
3 | Scan the QR code to follow
4 | ## Blog ##
5 | [YINUXY'S BLOG](https://blog.yinuxy.com/)
6 |
7 | ## Installation & Usage ##
8 | ### Install dependencies ###
9 | `pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com`
10 | ### Get the code ###
11 | `git clone git@github.com:InfiniteYinux/Python.git`
12 | ### Run ###
13 | ```
14 | cd scrapy\zhihu-pretty-girl
15 | python zhihu-pretty-girl.py
16 | ```
--------------------------------------------------------------------------------
/scrapy/zhihu-pretty-girl/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | pymongo
--------------------------------------------------------------------------------
/scrapy/zhihu-pretty-girl/zhihu-pretty-girl.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Mar 6 22:48:22 2020
4 |
5 | @author: Yinux
6 | """
7 |
8 | import re
9 | import argparse
10 | import time
11 | import json
12 | import requests
13 | import pymongo
14 |
15 | def get_answers_by_page(page_no):
16 | offset = page_no * 10
17 | url = "https://www.zhihu.com/api/v4/questions/266808424/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset={}&limit=10&sort_by=default&platform=desktop".format(offset)
18 | headers = {
19 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
20 | }
21 | r = requests.get(url, verify=False, headers=headers)
22 | content = r.content.decode("utf-8")
23 | data = json.loads(content)
24 | is_end = data["paging"]["is_end"]
25 | items = data["data"]
26 | client = pymongo.MongoClient()
27 | db = client["beauty"]
28 | if len(items) > 0:
29 | db.answers.insert_many(items)
30 | return is_end
31 |
32 | def get_answers():
33 | page_no = 0
34 | client = pymongo.MongoClient()
35 | while True:
36 | print(page_no)
37 | is_end = get_answers_by_page(page_no)
38 | page_no += 1
39 | if is_end:
40 | break
41 |
42 | def query():
43 | client = pymongo.MongoClient()
44 | db = client["beauty"]
45 | items = db.answers.find({"voteup_count": {"$gte": 100}}).sort([("voteup_count", pymongo.DESCENDING)])
46 | count = 0
47 |
48 | for item in items:
49 | content = item["content"]
50 | vote_num = item["voteup_count"]
51 | author = item["author"]["name"]
52 | matched = re.findall(r'data-original="([^"]+)"', content)
53 | print("> 来自 {}\n".format(item["url"]))
54 | print("> 作者 {}\n".format(author))
55 | print("> 赞数 {}\n".format(vote_num))
56 | img_urls = []
57 | for img_url in matched:
58 | if img_url not in img_urls:
59 | print("".format(img_url))
60 | img_urls.append(img_url)
61 | count += len(img_urls)
62 | print("\n\n")
63 | print(count)
64 |
65 | if __name__ == "__main__":
66 | parser = argparse.ArgumentParser()
67 | parser.add_argument("--save", help="save data", action="store_true", dest="save")
68 | parser.add_argument("--query", help="query data", action="store_true", dest="query")
69 | args = parser.parse_args()
70 |
71 | if args.save:
72 | get_answers()
73 | elif args.query:
74 | query()
--------------------------------------------------------------------------------