├── README.md ├── Visualization ├── COVID-19_Tracking-master.zip ├── Epidemic-analysis │ ├── .ipynb_checkpoints │ │ └── Epidemic-analysis-checkpoint.ipynb │ ├── Epidemic-analysis.ipynb │ ├── READEME.md │ └── requirements.txt ├── Python疫情监控.zip ├── Python疫情监控 │ ├── notebook笔记 │ │ └── 疫情监控.ipynb │ ├── python疫情监控实战-东方瑞通.pdf │ ├── 环境准备.txt │ └── 项目源码 │ │ └── Cov │ │ ├── .idea │ │ ├── Cov.iml │ │ ├── misc.xml │ │ ├── modules.xml │ │ └── workspace.xml │ │ ├── __pycache__ │ │ ├── app.cpython-37.pyc │ │ └── utils.cpython-37.pyc │ │ ├── app.py │ │ ├── spider.py │ │ ├── static │ │ ├── css │ │ │ └── main.css │ │ └── js │ │ │ ├── china.js │ │ │ ├── controller.js │ │ │ ├── ec_center.js │ │ │ ├── ec_left1.js │ │ │ ├── ec_left2.js │ │ │ ├── ec_right1.js │ │ │ ├── ec_right2.js │ │ │ ├── echarts-wordcloud.min.js │ │ │ ├── echarts.min.js │ │ │ └── jquery-1.11.1.min.js │ │ ├── templates │ │ ├── index.html │ │ ├── main.html │ │ └── test.html │ │ └── utils.py └── maoyanMovies_comments │ ├── analysis.py │ ├── movieswd.py │ ├── t1.jpg │ ├── test.py │ ├── venmo1.jpg │ ├── venom.jpg │ ├── wd.py │ ├── 观众位置分布-地理坐标图.html │ └── 观众来源排行-柱状图.html ├── pythonScript ├── Certificate_photo_for_background_color │ └── main.py ├── OlympicGamesGoldenNotify │ ├── __pycache__ │ │ ├── mail.cpython-37.pyc │ │ └── medals.cpython-37.pyc │ ├── index.py │ ├── mail.py │ └── medals.py ├── WordCloud │ ├── Image-coloredwordcloud.py │ ├── Maskedwordcloud.py │ ├── alice_color.png │ ├── coloredWd.py │ └── comments.txt ├── autoVote │ ├── autoVote.py │ ├── cookie.txt │ └── getCookie.py ├── birthdayNotify │ ├── __pycache__ │ │ └── lunar.cpython-37.pyc │ ├── birthday.json │ ├── birthdayNotify.zip │ ├── index.py │ └── text.py ├── dingReminder │ └── dingReminder.py ├── draw_excel │ ├── 1.jpg │ ├── 2.jpg │ ├── 4k_1.jpg │ ├── draw_excel.py │ └── iu.jpg ├── messageReminder │ ├── README.md │ └── messageReminder.py ├── miStoreBuy │ ├── MiStore.py │ └── debug.log ├── pdfToExcel │ ├── README.md │ ├── pdfToExcel.py │ └── 新建 Microsoft Word 文档.docx ├── poem │ ├── Oxford3000.py │ ├── TangshiGene.py │ ├── TangshiGene2.py │ ├── __init__.py │ ├── dataHandler.py │ ├── test.py │ └── zzcf.py ├── studyReminder │ └── studyRemidner.py ├── telegramPushBot │ ├── ht.sh │ └── locpush.py ├── tianyi-zhuancun │ ├── README.md │ ├── sec1.png │ ├── sec2.png │ └── zhuancun.py └── year_code │ ├── code_dir │ └── readme.md │ ├── data.csv │ ├── readme.md │ ├── show_res │ ├── data_csv.jpg │ ├── py_output.jpg │ ├── py_statistic.jpg │ └── sort_csv.jpg │ ├── sort_data.csv │ └── statistic.py └── scrapy ├── 2019-nCov-cn ├── city.py └── province.py ├── 51job-scrapy ├── 2020-09-25_java开发工程师岗位招聘信息.csv ├── 2020-09-25_python开发工程师岗位招聘信息.csv ├── 2020-09-25_python爬虫工程师岗位招聘信息.csv ├── 2020-09-25_python算法工程师岗位招聘信息.csv ├── 2020-09-27_python开发工程师岗位招聘信息.csv ├── 2020-09-27_python爬虫工程师岗位招聘信息.csv ├── 2020-09-27_python算法工程师岗位招聘信息.csv ├── 2021-03-12_Java开发工程师岗位招聘信息.csv └── 51jobs.py ├── UnsplashCrawler └── UnsplashCrawler.py ├── WeChatArticle └── WecArticle.py ├── cf-ipv6 ├── cf_ipv6_scan.py ├── cf_valid_ipv6_scan_2606_4700_.txt └── ping.py ├── douyin-grils-down ├── README.md ├── douyin_appium.py ├── douyin_download.py └── douyin_mitmdump.py ├── ipProxyPool └── kuaidaili.py ├── jdCellPhone └── cellPhone.py ├── postgraduate_colleges ├── PostgraduateColleges.xlsx ├── postgraduatecolleges.csv └── 字段属性.txt ├── scrapy163musicComments ├── scrapyWyycomments.py └── 网易云音乐精彩评论.csv ├── vip-item ├── README.md ├── requirements.txt └── vip.py ├── ximalaya └── ximalaya.py ├── yunzhanImgToPdf ├── README.md └── main.py └── 
zhihu-pretty-girl ├── READEME.md ├── requirements.txt └── zhihu-pretty-girl.py /README.md: -------------------------------------------------------------------------------- 1 | # YINUXY的python脚本分享 # 2 | 3 | ## pythonScript ## 4 | * [证件照背景色替换](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/Certificate_photo_for_background_color/) 5 | * [上下班打卡提醒](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/messageReminder/) 6 | * [使用python在Excel里面画图](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/draw_excel/) 7 | * [惊雷歌词生成器](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/jingLei-songsGenerator/) 8 | * [小米商城抢购](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/miStoreBuy/) 9 | * [PDF转Excel](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/pdfToExcel/) 10 | * [批量下载Pixiv图片](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/Pixiv/) 11 | * [诗词生成器](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/poem/) 12 | * [python随机图](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/random_images/) 13 | * [hostloc新帖推送](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/telegramPushBot/) 14 | * [天翼云资源一键转存](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/tianyi-zhuancun/) 15 | * [python制作词云图片](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/WordCloud/) 16 | * [python统计一年书写过的代码量](https://github.com/InfiniteYinux/Python/tree/master/pythonScript/year_code/) 17 | 18 | 19 | ## scrapy ## 20 | * [51job职位抓取](https://github.com/InfiniteYinux/Python/tree/master/scrapy/51job-scrapy/) 21 | * [cf ip扫描](https://github.com/InfiniteYinux/Python/tree/master/scrapy/cf-ipv6/) 22 | * [中国新冠疫情数据抓取](https://github.com/InfiniteYinux/Python/tree/master/scrapy/2019-nCov-cn/) 23 | * [python批量下载抖音视频](https://github.com/InfiniteYinux/Python/tree/master/scrapy/douyin-grils-down/) 24 | * [wallhaven壁纸批量下载](https://github.com/InfiniteYinux/Python/tree/master/scrapy/img-spider-wallhaven/) 25 | * [京东手机信息抓取](https://github.com/InfiniteYinux/Python/tree/master/scrapy/jdCellPhone/) 26 | * [网易云音乐精彩评论抓取](https://github.com/InfiniteYinux/Python/tree/master/scrapy/scrapy163musicComments/) 27 | * [Unsplash图片批量下载](https://github.com/InfiniteYinux/Python/tree/master/scrapy/UnsplashCrawler/) 28 | * [唯品会商品信息抓取](https://github.com/InfiniteYinux/Python/tree/master/scrapy/vip-item/) 29 | * [微信公众号文章抓取](https://github.com/InfiniteYinux/Python/tree/master/scrapy/WeChatArticle/) 30 | * [喜马拉雅有声电子书抓取](https://github.com/InfiniteYinux/Python/tree/master/scrapy/ximalaya/) 31 | * [爬取知乎上的高颜值小姐姐](https://github.com/InfiniteYinux/Python/tree/master/scrapy/zhihu-pretty-girl/) 32 | 33 | ## Visualization ## 34 | * [使用 Python可视化神器 Plotly 动态演示全球疫情变化趋势](https://github.com/InfiniteYinux/Python/tree/master/Visualization/Epidemic-analysis/) 35 | * [猫眼评论数据可视化](https://github.com/InfiniteYinux/Python/tree/master/Visualization/maoyanMovies_comments/) 36 | * [Python疫情监控平台部署](https://github.com/InfiniteYinux/Python/tree/master/Visualization/Python疫情监控/) -------------------------------------------------------------------------------- /Visualization/COVID-19_Tracking-master.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/COVID-19_Tracking-master.zip -------------------------------------------------------------------------------- 
/Visualization/Epidemic-analysis/READEME.md: -------------------------------------------------------------------------------- 1 | ## 微信公众号 ## 2 | ![](https://cdn.jsdelivr.net/gh/InfiniteYinux/cloud@master/qrcode.jpg) 3 | 欢迎扫码关注 4 | ## 博客 ## 5 | [YINUXY'S BLOG](https://blog.yinuxy.com/) 6 | 7 | ## 安装 & 使用 ## 8 | ### 安装依赖 ### 9 | `pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com` 10 | ### 获取代码 ### 11 | `git clone git@github.com:InfiniteYinux/Python.git` 12 | ### 运行 ### 13 | 推荐使用`Jupyter Notebook`运行
-------------------------------------------------------------------------------- /Visualization/Epidemic-analysis/requirements.txt: -------------------------------------------------------------------------------- 1 | akshare 2 | pandas 3 | plotly
-------------------------------------------------------------------------------- /Visualization/Python疫情监控.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/Python疫情监控.zip
-------------------------------------------------------------------------------- /Visualization/Python疫情监控/python疫情监控实战-东方瑞通.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/Python疫情监控/python疫情监控实战-东方瑞通.pdf
-------------------------------------------------------------------------------- /Visualization/Python疫情监控/环境准备.txt: -------------------------------------------------------------------------------- 1 | python 3.7 2 | mysql 3 | pycharm 4 | jupyter notebook 5 | hbuilder 6 | linux (centos7) 7 |
-------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/.idea/Cov.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 18 | 19 | 20 | 22 |
-------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 |
-------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
-------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/__pycache__/app.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/Python疫情监控/项目源码/Cov/__pycache__/app.cpython-37.pyc
-------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/Python疫情监控/项目源码/Cov/__pycache__/utils.cpython-37.pyc
-------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | from flask import
request 3 | from flask import render_template 4 | from flask import jsonify 5 | from jieba.analyse import extract_tags 6 | import string 7 | import utils 8 | 9 | app = Flask(__name__) 10 | 11 | 12 | @app.route('/') 13 | def hello_world(): 14 | return render_template("main.html") 15 | 16 | @app.route("/c1") 17 | def get_c1_data(): 18 | data = utils.get_c1_data() 19 | return jsonify({"confirm":data[0],"suspect":data[1],"heal":data[2],"dead":data[3]}) 20 | 21 | @app.route("/c2") 22 | def get_c2_data(): 23 | res = [] 24 | for tup in utils.get_c2_data(): 25 | # print(tup) 26 | res.append({"name":tup[0],"value":int(tup[1])}) 27 | return jsonify({"data":res}) 28 | 29 | @app.route("/l1") 30 | def get_l1_data(): 31 | data = utils.get_l1_data() 32 | day,confirm,suspect,heal,dead = [],[],[],[],[] 33 | for a,b,c,d,e in data[7:]: 34 | day.append(a.strftime("%m-%d")) #a是datatime类型 35 | confirm.append(b) 36 | suspect.append(c) 37 | heal.append(d) 38 | dead.append(e) 39 | return jsonify({"day":day,"confirm": confirm, "suspect": suspect, "heal": heal, "dead": dead}) 40 | 41 | @app.route("/l2") 42 | def get_l2_data(): 43 | data = utils.get_l2_data() 44 | day, confirm_add, suspect_add = [], [], [] 45 | for a, b, c in data[7:]: 46 | day.append(a.strftime("%m-%d")) # a是datatime类型 47 | confirm_add.append(b) 48 | suspect_add.append(c) 49 | return jsonify({"day": day, "confirm_add": confirm_add, "suspect_add": suspect_add}) 50 | 51 | @app.route("/r1") 52 | def get_r1_data(): 53 | data = utils.get_r1_data() 54 | city = [] 55 | confirm = [] 56 | for k,v in data: 57 | city.append(k) 58 | confirm.append(int(v)) 59 | return jsonify({"city": city, "confirm": confirm}) 60 | 61 | 62 | @app.route("/r2") 63 | def get_r2_data(): 64 | data = utils.get_r2_data() #格式 (('民警抗疫一线奋战16天牺牲1037364',), ('四川再派两批医疗队1537382',) 65 | d = [] 66 | for i in data: 67 | k = i[0].rstrip(string.digits) # 移除热搜数字 68 | v = i[0][len(k):] # 获取热搜数字 69 | ks = extract_tags(k) # 使用jieba 提取关键字 70 | for j in ks: 71 | if not j.isdigit(): 72 | d.append({"name": j, "value": v}) 73 | return jsonify({"kws": d}) 74 | 75 | @app.route("/time") 76 | def get_time(): 77 | return utils.get_time() 78 | 79 | @app.route('/ajax',methods=["get","post"]) 80 | def hello_world4(): 81 | name = request.values.get("name") 82 | score = request.values.get("score") 83 | print(f"name:{name},score:{score}") 84 | return '10000' 85 | 86 | @app.route('/tem') 87 | def hello_world3(): 88 | return render_template("index.html") 89 | 90 | @app.route('/login') 91 | def hello_world2(): 92 | name = request.values.get("name") 93 | pwd = request.values.get("pwd") 94 | return f'name={name},pwd={pwd}' 95 | 96 | @app.route("/abc") 97 | def hello_world1(): 98 | id = request.values.get("id") 99 | return f""" 100 |
101 | 账号:
102 | 密码: 103 | 104 |
105 | """ 106 | 107 | if __name__ == '__main__': 108 | app.run(host="0.0.0.0") 109 | -------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/spider.py: -------------------------------------------------------------------------------- 1 | from selenium.webdriver import Chrome, ChromeOptions 2 | import requests 3 | import pymysql 4 | import time 5 | import json 6 | import traceback 7 | import sys 8 | 9 | def get_conn(): 10 | """ 11 | :return: 连接,游标 12 | """ 13 | # 创建连接 14 | conn = pymysql.connect(host="localhost", 15 | user="root", 16 | password="yinuxy", 17 | db="cov", 18 | charset="utf8") 19 | # 创建游标 20 | cursor = conn.cursor() # 执行完毕返回的结果集默认以元组显示 21 | return conn, cursor 22 | 23 | 24 | def close_conn(conn, cursor): 25 | if cursor: 26 | cursor.close() 27 | if conn: 28 | conn.close() 29 | 30 | 31 | def get_tencent_data(): 32 | """ 33 | :return: 返回历史数据和当日详细数据 34 | """ 35 | url = 'https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=chinaDayList,chinaDayAddList,cityStatis,nowConfirmStatis,provinceCompare' 36 | headers = { 37 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36', 38 | } 39 | r = requests.get(url, headers) 40 | res = json.loads(r.text) # json字符串转字典 41 | data_all = json.loads(res['data']) 42 | 43 | history = {} # 历史数据 44 | for i in data_all["chinaDayList"]: 45 | ds = "2020." + i["date"] 46 | tup = time.strptime(ds, "%Y.%m.%d") 47 | ds = time.strftime("%Y-%m-%d", tup) # 改变时间格式,不然插入数据库会报错,数据库是datetime类型 48 | confirm = i["confirm"] 49 | suspect = i["suspect"] 50 | heal = i["heal"] 51 | dead = i["dead"] 52 | history[ds] = {"confirm": confirm, "suspect": suspect, "heal": heal, "dead": dead} 53 | for i in data_all["chinaDayAddList"]: 54 | ds = "2020." 
+ i["date"] 55 | tup = time.strptime(ds, "%Y.%m.%d") 56 | ds = time.strftime("%Y-%m-%d", tup) 57 | confirm = i["confirm"] 58 | suspect = i["suspect"] 59 | heal = i["heal"] 60 | dead = i["dead"] 61 | history[ds].update({"confirm_add": confirm, "suspect_add": suspect, "heal_add": heal, "dead_add": dead}) 62 | 63 | details = [] # 当日详细数据 64 | update_time = data_all["lastUpdateTime"] 65 | data_country = data_all["areaTree"] # list 25个国家 66 | data_province = data_country[0]["children"] # 中国各省 67 | for pro_infos in data_province: 68 | province = pro_infos["name"] # 省名 69 | for city_infos in pro_infos["children"]: 70 | city = city_infos["name"] 71 | confirm = city_infos["total"]["confirm"] 72 | confirm_add = city_infos["today"]["confirm"] 73 | heal = city_infos["total"]["heal"] 74 | dead = city_infos["total"]["dead"] 75 | details.append([update_time, province, city, confirm, confirm_add, heal, dead]) 76 | return history, details 77 | 78 | 79 | def get_baidu_hot(): 80 | """ 81 | :return: 返回百度疫情热搜 82 | """ 83 | option = ChromeOptions() # 创建谷歌浏览器实例 84 | option.add_argument("--headless") # 隐藏浏览器 85 | option.add_argument('--no-sandbox') 86 | 87 | url = "https://voice.baidu.com/act/virussearch/virussearch?from=osari_map&tab=0&infomore=1" 88 | browser = Chrome(options=option,executable_path="./chromedriver.exe") 89 | browser.get(url) 90 | # 找到展开按钮 91 | dl = browser.find_element_by_xpath('//*[@id="main"]/div/div/section/div[2]/div/div[2]/section/div') 92 | dl.click() 93 | time.sleep(1) 94 | # 找到热搜标签 95 | c = browser.find_elements_by_xpath('//*[@id="main"]/div/div/section/div[2]/div/div[2]/section/a/div/span[2]') 96 | context = [i.text for i in c] # 获取标签内容 97 | print(context) 98 | return context 99 | 100 | 101 | def update_hotsearch(): 102 | """ 103 | 将疫情热搜插入数据库 104 | :return: 105 | """ 106 | cursor = None 107 | conn = None 108 | try: 109 | context = get_baidu_hot() 110 | print(f"{time.asctime()}开始更新热搜数据") 111 | conn, cursor = get_conn() 112 | sql = "insert into hotsearch(dt,content) values(%s,%s)" 113 | ts = time.strftime("%Y-%m-%d %X") 114 | for i in context: 115 | cursor.execute(sql, (ts, i)) # 插入数据 116 | conn.commit() # 提交事务保存数据 117 | print(f"{time.asctime()}数据更新完毕") 118 | except: 119 | traceback.print_exc() 120 | finally: 121 | close_conn(conn, cursor) 122 | 123 | 124 | def update_details(): 125 | """ 126 | 更新 details 表 127 | :return: 128 | """ 129 | cursor = None 130 | conn = None 131 | try: 132 | li = get_tencent_data()[1] # 0 是历史数据字典,1 最新详细数据列表 133 | conn, cursor = get_conn() 134 | sql = "insert into details(update_time,province,city,confirm,confirm_add,heal,dead) values(%s,%s,%s,%s,%s,%s,%s)" 135 | sql_query = 'select %s=(select update_time from details order by id desc limit 1)' #对比当前最大时间戳 136 | cursor.execute(sql_query,li[0][0]) 137 | if not cursor.fetchone()[0]: 138 | print(f"{time.asctime()}开始更新最新数据") 139 | for item in li: 140 | cursor.execute(sql, item) 141 | conn.commit() # 提交事务 update delete insert操作 142 | print(f"{time.asctime()}更新最新数据完毕") 143 | else: 144 | print(f"{time.asctime()}已是最新数据!") 145 | except: 146 | traceback.print_exc() 147 | finally: 148 | close_conn(conn, cursor) 149 | 150 | 151 | def insert_history(): 152 | """ 153 | 插入历史数据 154 | :return: 155 | """ 156 | cursor = None 157 | conn = None 158 | try: 159 | dic = get_tencent_data()[0] # 0 是历史数据字典,1 最新详细数据列表 160 | print(f"{time.asctime()}开始插入历史数据") 161 | conn, cursor = get_conn() 162 | sql = "insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)" 163 | for k, v in dic.items(): 164 | # item 格式 {'2020-01-13': {'confirm': 41, 
'suspect': 0, 'heal': 0, 'dead': 1} 165 | cursor.execute(sql, [k, v.get("confirm"), v.get("confirm_add"), v.get("suspect"), 166 | v.get("suspect_add"), v.get("heal"), v.get("heal_add"), 167 | v.get("dead"), v.get("dead_add")]) 168 | 169 | conn.commit() # 提交事务 update delete insert操作 170 | print(f"{time.asctime()}插入历史数据完毕") 171 | except: 172 | traceback.print_exc() 173 | finally: 174 | close_conn(conn, cursor) 175 | 176 | 177 | def update_history(): 178 | """ 179 | 更新历史数据 180 | :return: 181 | """ 182 | cursor = None 183 | conn = None 184 | try: 185 | dic = get_tencent_data()[0] # 0 是历史数据字典,1 最新详细数据列表 186 | print(f"{time.asctime()}开始更新历史数据") 187 | conn, cursor = get_conn() 188 | sql = "insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)" 189 | sql_query = "select confirm from history where ds=%s" 190 | for k, v in dic.items(): 191 | # item 格式 {'2020-01-13': {'confirm': 41, 'suspect': 0, 'heal': 0, 'dead': 1} 192 | if not cursor.execute(sql_query, k): 193 | cursor.execute(sql, [k, v.get("confirm"), v.get("confirm_add"), v.get("suspect"), 194 | v.get("suspect_add"), v.get("heal"), v.get("heal_add"), 195 | v.get("dead"), v.get("dead_add")]) 196 | conn.commit() # 提交事务 update delete insert操作 197 | print(f"{time.asctime()}历史数据更新完毕") 198 | except: 199 | traceback.print_exc() 200 | finally: 201 | close_conn(conn, cursor) 202 | 203 | 204 | if __name__ == "__main__": 205 | l = len(sys.argv) 206 | if l == 1: 207 | s = """ 208 | 请输入参数 209 | 参数说明: 210 | up_his 更新历史记录表 211 | up_hot 更新实时热搜 212 | up_det 更新详细表 213 | """ 214 | print(s) 215 | else: 216 | order = sys.argv[1] 217 | if order == "up_his": 218 | update_history() 219 | elif order == "up_det": 220 | update_details() 221 | elif order == "up_hot": 222 | update_hotsearch() 223 | 224 | 225 | 226 | -------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/static/css/main.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | background: #333; 4 | } 5 | 6 | #title { 7 | position: absolute; 8 | width: 40%; 9 | height: 10%; 10 | top: 0; 11 | left: 30%; 12 | /* background: #666666; */ 13 | color: white; 14 | font-size: 40px; 15 | 16 | display: flex; 17 | align-items: center; 18 | justify-content: center; 19 | } 20 | 21 | #tim { 22 | position: absolute; 23 | /* width: 30%; */ 24 | height: 10%; 25 | right: 2%; 26 | top: 5%; 27 | color: #FFFFFF; 28 | font-size: 16px; 29 | } 30 | 31 | #c1 { 32 | position: absolute; 33 | width: 40%; 34 | height: 25%; 35 | top: 10%; 36 | left: 30%; 37 | color: white 38 | /* background: #777777; */ 39 | } 40 | 41 | .num { 42 | width: 25%; 43 | float: left; 44 | display: flex; 45 | align-items: center; 46 | justify-content: center; 47 | color: gold; 48 | font-size: 20px; 49 | /*margin-top: 20px;*/ 50 | } 51 | 52 | .txt { 53 | width: 25%; 54 | float: left; 55 | font-family: "幼圆"; 56 | display: flex; 57 | align-items: center; 58 | justify-content: center; 59 | } 60 | 61 | .txt h2 { 62 | margin: 0; 63 | } 64 | 65 | 66 | 67 | 68 | 69 | 70 | #c2 { 71 | position: absolute; 72 | width: 40%; 73 | height: 65%; 74 | top: 35%; 75 | left: 30%; 76 | background: #888888; 77 | } 78 | 79 | #l1 { 80 | position: absolute; 81 | width: 30%; 82 | height: 45%; 83 | top: 10%; 84 | left: 0%; 85 | background: #666666; 86 | } 87 | 88 | #l2 { 89 | position: absolute; 90 | width: 30%; 91 | height: 45%; 92 | top: 55%; 93 | left: 0%; 94 | background: #777; 95 | } 96 | 97 | #r1 { 98 | position: absolute; 99 | width: 30%; 100 | height: 45%; 101 | 
top: 10%; 102 | right: 0%; 103 | background: #666666; 104 | } 105 | 106 | #r2 { 107 | position: absolute; 108 | width: 30%; 109 | height: 45%; 110 | top: 55%; 111 | right: 0%; 112 | background: #777; 113 | } 114 | 115 | /* 此处是设置自动匹配横屏,以适应手机显示 使用下面定义的id:gyroContain */ 116 | @media screen and (orientation: portrait) { 117 | html{ 118 | width : 100vmin; 119 | height : 100vmax; 120 | } 121 | body{ 122 | width : 100vmin; 123 | height : 100vmax; 124 | } 125 | #gyroContain{ 126 | width : 100vmax; 127 | height : 100vmin; 128 | transform-origin: top left; 129 | transform: rotate(90deg) translate(0,-100vmin); 130 | } 131 | } 132 | @media screen and (orientation: landscape) { 133 | html{ 134 | width : 100vmax; 135 | height : 100vmin; 136 | } 137 | body{ 138 | width : 100vmax; 139 | height : 100vmin; 140 | } 141 | #gyroContain{ 142 | width : 100vmax; 143 | height : 100vmin; 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/static/js/controller.js: -------------------------------------------------------------------------------- 1 | function gettime() { 2 | $.ajax({ 3 | url: "/time", 4 | timeout: 10000, //超时时间设置为10秒; 5 | success: function(data) { 6 | $("#tim").html(data) 7 | }, 8 | error: function(xhr, type, errorThrown) { 9 | 10 | } 11 | }); 12 | } 13 | 14 | function get_c1_data() { 15 | $.ajax({ 16 | url: "/c1", 17 | success: function(data) { 18 | $(".num h1").eq(0).text(data.confirm); 19 | $(".num h1").eq(1).text(data.suspect); 20 | $(".num h1").eq(2).text(data.heal); 21 | $(".num h1").eq(3).text(data.dead); 22 | }, 23 | error: function(xhr, type, errorThrown) { 24 | 25 | } 26 | }) 27 | } 28 | function get_c2_data() { 29 | $.ajax({ 30 | url:"/c2", 31 | success: function(data) { 32 | ec_center_option.series[0].data=data.data 33 | ec_center.setOption(ec_center_option) 34 | }, 35 | error: function(xhr, type, errorThrown) { 36 | 37 | } 38 | }) 39 | } 40 | 41 | function get_l1_data() { 42 | $.ajax({ 43 | url:"/l1", 44 | success: function(data) { 45 | ec_left1_Option.xAxis[0].data=data.day 46 | ec_left1_Option.series[0].data=data.confirm 47 | ec_left1_Option.series[1].data=data.suspect 48 | ec_left1_Option.series[2].data=data.heal 49 | ec_left1_Option.series[3].data=data.dead 50 | ec_left1.setOption(ec_left1_Option) 51 | }, 52 | error: function(xhr, type, errorThrown) { 53 | 54 | } 55 | }) 56 | } 57 | 58 | function get_l2_data() { 59 | $.ajax({ 60 | url:"/l2", 61 | success: function(data) { 62 | ec_left2_Option.xAxis[0].data=data.day 63 | ec_left2_Option.series[0].data=data.confirm_add 64 | ec_left2_Option.series[1].data=data.suspect_add 65 | ec_left2.setOption(ec_left2_Option) 66 | }, 67 | error: function(xhr, type, errorThrown) { 68 | 69 | } 70 | }) 71 | } 72 | 73 | function get_r1_data() { 74 | $.ajax({ 75 | url: "/r1", 76 | success: function (data) { 77 | ec_right1_option.xAxis.data=data.city; 78 | ec_right1_option.series[0].data=data.confirm; 79 | ec_right1.setOption(ec_right1_option); 80 | } 81 | }) 82 | } 83 | function get_r2_data() { 84 | $.ajax({ 85 | url: "/r2", 86 | success: function (data) { 87 | ec_right2_option.series[0].data=data.kws; 88 | ec_right2.setOption(ec_right2_option); 89 | } 90 | }) 91 | } 92 | gettime() 93 | get_c1_data() 94 | get_c2_data() 95 | get_l1_data() 96 | get_l2_data() 97 | get_r1_data() 98 | get_r2_data() 99 | 100 | setInterval(gettime,1000) 101 | setInterval(get_c1_data,1000*10) 102 | setInterval(get_c2_data,10000*10) 103 | setInterval(get_l1_data,10000*10) 104 | 
setInterval(get_l2_data,10000*10) 105 | setInterval(get_r1_data,10000*10) 106 | setInterval(get_r2_data,10000*10) 107 | -------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/static/js/ec_center.js: -------------------------------------------------------------------------------- 1 | var ec_center = echarts.init(document.getElementById('c2'), "dark"); 2 | 3 | var mydata = [{'name': '上海', 'value': 318}, {'name': '云南', 'value': 162}] 4 | 5 | var ec_center_option = { 6 | title: { 7 | text: '', 8 | subtext: '', 9 | x: 'left' 10 | }, 11 | tooltip: { 12 | trigger: 'item' 13 | }, 14 | //左侧小导航图标 15 | visualMap: { 16 | show: true, 17 | x: 'left', 18 | y: 'bottom', 19 | textStyle: { 20 | fontSize: 8, 21 | }, 22 | splitList: [{ start: 1,end: 9 }, 23 | {start: 10, end: 99 }, 24 | { start: 100, end: 999 }, 25 | { start: 1000, end: 9999 }, 26 | { start: 10000 }], 27 | color: ['#8A3310', '#C64918', '#E55B25', '#F2AD92', '#F9DCD1'] 28 | }, 29 | //配置属性 30 | series: [{ 31 | name: '累计确诊人数', 32 | type: 'map', 33 | mapType: 'china', 34 | roam: false, //拖动和缩放 35 | itemStyle: { 36 | normal: { 37 | borderWidth: .5, //区域边框宽度 38 | borderColor: '#009fe8', //区域边框颜色 39 | areaColor: "#ffefd5", //区域颜色 40 | }, 41 | emphasis: { //鼠标滑过地图高亮的相关设置 42 | borderWidth: .5, 43 | borderColor: '#4b0082', 44 | areaColor: "#fff", 45 | } 46 | }, 47 | label: { 48 | normal: { 49 | show: true, //省份名称 50 | fontSize: 8, 51 | }, 52 | emphasis: { 53 | show: true, 54 | fontSize: 8, 55 | } 56 | }, 57 | data:[] //mydata //数据 58 | }] 59 | }; 60 | ec_center.setOption(ec_center_option) -------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/static/js/ec_left1.js: -------------------------------------------------------------------------------- 1 | var ec_left1 = echarts.init(document.getElementById('l1'), "dark"); 2 | 3 | var ec_left1_Option = { 4 | //标题样式 5 | title: { 6 | text: "全国累计趋势", 7 | textStyle: { 8 | // color: 'white', 9 | }, 10 | left: 'left', 11 | }, 12 | tooltip: { 13 | trigger: 'axis', 14 | //指示器 15 | axisPointer: { 16 | type: 'line', 17 | lineStyle: { 18 | color: '#7171C6' 19 | } 20 | }, 21 | }, 22 | legend: { 23 | data: ['累计确诊', '现有疑似', "累计治愈", "累计死亡"], 24 | left: "right" 25 | }, 26 | 27 | //图形位置 28 | grid: { 29 | left: '4%', 30 | right: '6%', 31 | bottom: '4%', 32 | top: 50, 33 | containLabel: true 34 | }, 35 | xAxis: [{ 36 | type: 'category', 37 | //x轴坐标点开始与结束点位置都不在最边缘 38 | // boundaryGap : true, 39 | data: []//['01.20', '01.21', '01.22'] 40 | }], 41 | yAxis: [{ 42 | type: 'value', 43 | //y轴字体设置 44 | axisLabel: { 45 | show: true, 46 | color: 'white', 47 | fontSize: 12, 48 | formatter: function(value) { 49 | if (value >= 1000) { 50 | value = value / 1000 + 'k'; 51 | } 52 | return value; 53 | } 54 | }, 55 | //y轴线设置显示 56 | axisLine: { 57 | show: true 58 | }, 59 | //与x轴平行的线样式 60 | splitLine: { 61 | show: true, 62 | lineStyle: { 63 | color: '#17273B', 64 | width: 1, 65 | type: 'solid', 66 | } 67 | } 68 | }], 69 | series: [{ 70 | name: "累计确诊", 71 | type: 'line', 72 | smooth: true, 73 | data: []//[260, 406, 529] 74 | }, { 75 | name: "现有疑似", 76 | type: 'line', 77 | smooth: true, 78 | data: []//[54, 37, 3935] 79 | }, 80 | { 81 | name: "累计治愈", 82 | type: 'line', 83 | smooth: true, 84 | data: []//[25, 25, 25] 85 | }, { 86 | name: "累计死亡", 87 | type: 'line', 88 | smooth: true, 89 | data: []//[6, 9, 17] 90 | }] 91 | }; 92 | 93 | ec_left1.setOption(ec_left1_Option) 94 | 
-------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/static/js/ec_left2.js: -------------------------------------------------------------------------------- 1 | var ec_left2 = echarts.init(document.getElementById('l2'), "dark"); 2 | var ec_left2_Option = { 3 | tooltip: { 4 | trigger: 'axis', 5 | //指示器 6 | axisPointer: { 7 | type: 'line', 8 | lineStyle: { 9 | color: '#7171C6' 10 | } 11 | }, 12 | }, 13 | legend: { 14 | data: ['新增确诊', '新增疑似'], 15 | left: "right" 16 | }, 17 | //标题样式 18 | title: { 19 | text: "全国新增趋势", 20 | textStyle: { 21 | color: 'white', 22 | }, 23 | left: 'left' 24 | }, 25 | //图形位置 26 | grid: { 27 | left: '4%', 28 | right: '6%', 29 | bottom: '4%', 30 | top: 50, 31 | containLabel: true 32 | }, 33 | xAxis: [{ 34 | type: 'category', 35 | //x轴坐标点开始与结束点位置都不在最边缘 36 | // boundaryGap : true, 37 | 38 | data: [] 39 | }], 40 | yAxis: [{ 41 | type: 'value', 42 | //y轴字体设置 43 | 44 | //y轴线设置显示 45 | axisLine: { 46 | show: true 47 | }, 48 | axisLabel: { 49 | show: true, 50 | color: 'white', 51 | fontSize: 12, 52 | formatter: function(value) { 53 | if (value >= 1000) { 54 | value = value / 1000 + 'k'; 55 | } 56 | return value; 57 | } 58 | }, 59 | //与x轴平行的线样式 60 | splitLine: { 61 | show: true, 62 | lineStyle: { 63 | color: '#17273B', 64 | width: 1, 65 | type: 'solid', 66 | } 67 | } 68 | }], 69 | series: [{ 70 | name: "新增确诊", 71 | type: 'line', 72 | smooth: true, 73 | data: [] 74 | }, { 75 | name: "新增疑似", 76 | type: 'line', 77 | smooth: true, 78 | data: [] 79 | }] 80 | }; 81 | 82 | ec_left2.setOption(ec_left2_Option) 83 | -------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/static/js/ec_right1.js: -------------------------------------------------------------------------------- 1 | var ec_right1 = echarts.init(document.getElementById('r1'),"dark"); 2 | var ec_right1_option = { 3 | //标题样式 4 | title : { 5 | text : "非湖北地区城市确诊TOP5", 6 | textStyle : { 7 | color : 'white', 8 | }, 9 | left : 'left' 10 | }, 11 | color: ['#3398DB'], 12 | tooltip: { 13 | trigger: 'axis', 14 | axisPointer: { // 坐标轴指示器,坐标轴触发有效 15 | type: 'shadow' // 默认为直线,可选为:'line' | 'shadow' 16 | } 17 | }, 18 | xAxis: { 19 | type: 'category', 20 | data: [] 21 | }, 22 | yAxis: { 23 | type: 'value' 24 | }, 25 | series: [{ 26 | data: [], 27 | type: 'bar', 28 | barMaxWidth:"50%" 29 | }] 30 | }; 31 | ec_right1.setOption(ec_right1_option) -------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/static/js/ec_right2.js: -------------------------------------------------------------------------------- 1 | var ec_right2 = echarts.init(document.getElementById('r2'), "dark"); 2 | 3 | var ddd = [{'name': '肺炎', 'value': '12734670'}, {'name': '实时', 'value': '12734670'}, 4 | {'name': '新型', 'value': '12734670'}] 5 | var ec_right2_option = { 6 | // backgroundColor: '#515151', 7 | title : { 8 | text : "今日疫情热搜", 9 | textStyle : { 10 | color : 'white', 11 | }, 12 | left : 'left' 13 | }, 14 | tooltip: { 15 | show: false 16 | }, 17 | series: [{ 18 | type: 'wordCloud', 19 | // drawOutOfBound:true, 20 | gridSize: 1, 21 | sizeRange: [12, 55], 22 | rotationRange: [-45, 0, 45, 90], 23 | // maskImage: maskImage, 24 | textStyle: { 25 | normal: { 26 | color: function () { 27 | return 'rgb(' + 28 | Math.round(Math.random() * 255) + 29 | ', ' + Math.round(Math.random() * 255) + 30 | ', ' + Math.round(Math.random() * 255) + ')' 31 | } 32 | } 33 | }, 34 | // left: 'center', 35 | // top: 
'center', 36 | // // width: '96%', 37 | // // height: '100%', 38 | right: null, 39 | bottom: null, 40 | // width: 300, 41 | // height: 200, 42 | // top: 20, 43 | data: [] 44 | }] 45 | } 46 | 47 | ec_right2.setOption(ec_right2_option); 48 | -------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | my page 6 | 7 | 8 | 9 |

[index.html: the HTML markup was stripped in this export; only the text content survives — the page title "my page", the labels "疫情追踪" and "实时报道", and what appears to be an inline script block (lines 13–25 left no visible text).]
-------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/templates/main.html: --------------------------------------------------------------------------------
[main.html: the HTML markup was stripped in this export; only the text content survives — the page title "疫情监控", the dashboard heading "全国疫情实时追踪", the #c1 counter labels "累计确诊", "剩余疑似", "累计治愈", "累计死亡", the panel placeholders "我是左1", "我是左2", "我是中2", "我是右1", "我是右2", and (judging from controller.js and the files under static/js) the script includes and closing tags on lines 31–40.]
-------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/templates/test.html: --------------------------------------------------------------------------------
[test.html: markup stripped in this export; nothing beyond the dump's line numbers survives.]
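The Cov project assumes a local MySQL database named cov with three tables — history, details and hotsearch — but no schema file is included in this dump. Below is a minimal DDL sketch inferred from the INSERT and SELECT statements in spider.py and utils.py; treat the column types and sizes as assumptions rather than the author's original schema.

```python
# Sketch: create the "cov" database and the three tables that spider.py and
# utils.py read/write. Column names and order are inferred from the INSERT and
# SELECT statements in those files; types and sizes are assumptions. Adjust
# host/user/password to your MySQL setup (spider.py uses password "yinuxy",
# utils.py an empty one).
import pymysql

DDL = [
    """CREATE TABLE IF NOT EXISTS history (
        ds DATETIME NOT NULL PRIMARY KEY,
        confirm INT, confirm_add INT,
        suspect INT, suspect_add INT,
        heal INT, heal_add INT,
        dead INT, dead_add INT
    ) DEFAULT CHARSET=utf8""",
    """CREATE TABLE IF NOT EXISTS details (
        id INT PRIMARY KEY AUTO_INCREMENT,
        update_time DATETIME,
        province VARCHAR(50),
        city VARCHAR(50),
        confirm INT, confirm_add INT, heal INT, dead INT
    ) DEFAULT CHARSET=utf8""",
    """CREATE TABLE IF NOT EXISTS hotsearch (
        id INT PRIMARY KEY AUTO_INCREMENT,
        dt DATETIME,
        content VARCHAR(255)
    ) DEFAULT CHARSET=utf8""",
]

if __name__ == "__main__":
    conn = pymysql.connect(host="127.0.0.1", user="root", password="", charset="utf8")
    try:
        with conn.cursor() as cursor:
            cursor.execute("CREATE DATABASE IF NOT EXISTS cov DEFAULT CHARACTER SET utf8")
            cursor.execute("USE cov")
            for stmt in DDL:
                cursor.execute(stmt)
        conn.commit()
    finally:
        conn.close()
```

With the tables populated (spider.py up_his / up_det / up_hot) and app.py running, the JSON endpoints that static/js/controller.js polls can also be spot-checked from Python. A small sketch, assuming the app is listening locally on Flask's default port 5000:

```python
# Sketch: spot-check the Cov dashboard endpoints (the same URLs controller.js
# polls). Assumes app.py is running locally on Flask's default port 5000;
# /time returns plain text, the other routes return JSON.
import requests

BASE = "http://127.0.0.1:5000"

def fetch(path):
    resp = requests.get(BASE + path, timeout=5)
    resp.raise_for_status()
    return resp.text if path == "/time" else resp.json()

if __name__ == "__main__":
    for path in ["/time", "/c1", "/c2", "/l1", "/l2", "/r1", "/r2"]:
        print(path, "->", fetch(path))
```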
10 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /Visualization/Python疫情监控/项目源码/Cov/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pymysql 3 | 4 | def get_time(): 5 | time_str = time.strftime("%Y{}%m{}%d{} %X") 6 | return time_str.format("年","月","日") 7 | 8 | def get_conn(): 9 | """ 10 | :return: 连接,游标 11 | """ 12 | # 创建连接 13 | conn = pymysql.connect(host="127.0.0.1", 14 | user="root", 15 | password="", 16 | db="cov", 17 | charset="utf8") 18 | # 创建游标 19 | cursor = conn.cursor()# 执行完毕返回的结果集默认以元组显示 20 | return conn, cursor 21 | 22 | def close_conn(conn, cursor): 23 | cursor.close() 24 | conn.close() 25 | 26 | def query(sql,*args): 27 | """ 28 | 封装通用查询 29 | :param sql: 30 | :param args: 31 | :return: 返回查询到的结果,((),(),)的形式 32 | """ 33 | conn, cursor = get_conn() 34 | cursor.execute(sql,args) 35 | res = cursor.fetchall() 36 | close_conn(conn, cursor) 37 | return res 38 | 39 | def get_c1_data(): 40 | """ 41 | :return: 返回大屏div id=c1 的数据 42 | """ 43 | # 因为会更新多次数据,取时间戳最新的那组数据 44 | sql = "select sum(confirm)," \ 45 | "(select suspect from history order by ds desc limit 1)," \ 46 | "sum(heal)," \ 47 | "sum(dead) " \ 48 | "from details " \ 49 | "where update_time=(select update_time from details order by update_time desc limit 1) " 50 | res = query(sql) 51 | return res[0] 52 | 53 | def get_c2_data(): 54 | """ 55 | :return: 返回各省数据 56 | """ 57 | # 因为会更新多次数据,取时间戳最新的那组数据 58 | sql = "select province,sum(confirm) from details " \ 59 | "where update_time=(select update_time from details " \ 60 | "order by update_time desc limit 1) " \ 61 | "group by province" 62 | res = query(sql) 63 | return res 64 | 65 | def get_l1_data(): 66 | 67 | sql = "select ds,confirm,suspect,heal,dead from history" 68 | res = query(sql) 69 | return res 70 | 71 | def get_l2_data(): 72 | 73 | sql = "select ds,confirm_add,suspect_add from history" 74 | res = query(sql) 75 | return res 76 | 77 | def get_r1_data(): 78 | """ 79 | :return: 返回非湖北地区城市确诊人数前5名 80 | """ 81 | sql = 'SELECT city,confirm FROM ' \ 82 | '(select city,confirm from details ' \ 83 | 'where update_time=(select update_time from details order by update_time desc limit 1) ' \ 84 | 'and province not in ("湖北","北京","上海","天津","重庆") ' \ 85 | 'union all ' \ 86 | 'select province as city,sum(confirm) as confirm from details ' \ 87 | 'where update_time=(select update_time from details order by update_time desc limit 1) ' \ 88 | 'and province in ("北京","上海","天津","重庆") group by province) as a ' \ 89 | 'ORDER BY confirm DESC LIMIT 5' 90 | res = query(sql) 91 | return res 92 | 93 | def get_r2_data(): 94 | """ 95 | :return: 返回最近的20条热搜 96 | """ 97 | sql = 'select content from hotsearch order by id desc limit 20' 98 | res = query(sql) #格式 (('民警抗疫一线奋战16天牺牲1037364',), ('四川再派两批医疗队1537382',) 99 | return res 100 | 101 | if __name__ == "__main__": 102 | print(get_r2_data()) -------------------------------------------------------------------------------- /Visualization/maoyanMovies_comments/analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author:Ryan time:2018/11/20 4 | 5 | from pyecharts import Style 6 | from pyecharts import Geo 7 | import json 8 | from pyecharts import Bar 9 | from collections import Counter 10 | 11 | 12 | #数据可视化 13 | 14 | def gender(): 15 | cities = [] 16 | with open('E:/spiderproject/maoyanMovies_comments/comments.txt','r',encoding='utf-8')as f: 17 | 
rows = f.readlines() 18 | try: 19 | for row in rows: 20 | city = row.split(',')[1] 21 | if city != '': 22 | cities.append(city) 23 | #print(city) 24 | except Exception as e: 25 | print(e) 26 | 27 | handle(cities) 28 | data = Counter(cities).most_common() 29 | style = Style( 30 | title_color='#fff', 31 | title_pos='center', 32 | width=1200, 33 | height=600, 34 | background_color='#404a59' 35 | ) 36 | geo = Geo('《毒液》观众位置分布', '数据来源:猫眼-Ryan采集', **style.init_style) 37 | attr, value = geo.cast(data) 38 | geo.add('', attr, value, visual_range=[0, 1000], 39 | visual_text_color='#fff', symbol_size=15, 40 | is_visualmap=True, is_piecewise=False, visual_split_number=10) 41 | geo.render('观众位置分布-地理坐标图.html') 42 | 43 | data_top20 = Counter(cities).most_common(20) 44 | bar = Bar('《毒液》观众来源排行TOP20', '数据来源:猫眼-Ryan采集', title_pos='center', width=1200, height=600) 45 | attr, value = bar.cast(data_top20) 46 | bar.add('', attr, value, is_visualmap=True, visual_range=[0, 3500], visual_text_color='#fff', is_more_utils=True, 47 | is_label_show=True) 48 | bar.render('观众来源排行-柱状图.html') 49 | 50 | def handle(cities): 51 | # print(len(cities), len(set(cities))) 52 | 53 | # 获取坐标文件中所有地名 54 | data = None 55 | with open('C:/Users/purple.guo/AppData/Local/Continuum/anaconda3/Lib/site-packages/pyecharts/datasets/city_coordinates.json', 56 | mode='r', encoding='utf-8') as f: 57 | data = json.loads(f.read()) # 将str转换为json 58 | 59 | # 循环判断处理 60 | data_new = data.copy() # 拷贝所有地名数据 61 | for city in set(cities): # 使用set去重 62 | # 处理地名为空的数据 63 | if city == '': 64 | while city in cities: 65 | cities.remove(city) 66 | count = 0 67 | for k in data.keys(): 68 | count += 1 69 | if k == city: 70 | break 71 | if k.startswith(city): # 处理简写的地名,如 达州市 简写为 达州 72 | # print(k, city) 73 | data_new[city] = data[k] 74 | break 75 | if k.startswith(city[0:-1]) and len(city) >= 3: # 处理行政变更的地名,如县改区 或 县改市等 76 | data_new[city] = data[k] 77 | break 78 | # 处理不存在的地名 79 | if count == len(data): 80 | while city in cities: 81 | cities.remove(city) 82 | 83 | # print(len(data), len(data_new)) 84 | 85 | # 写入覆盖坐标文件 86 | with open( 87 | 'C:/Users/purple.guo/AppData/Local/Continuum/anaconda3/Lib/site-packages/pyecharts/datasets/city_coordinates.json', 88 | mode='w', encoding='utf-8') as f: 89 | f.write(json.dumps(data_new, ensure_ascii=False)) 90 | 91 | if __name__ == '__main__': 92 | gender() -------------------------------------------------------------------------------- /Visualization/maoyanMovies_comments/movieswd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author:Ryan time:2018/11/20 4 | 5 | import jieba 6 | import matplotlib.pyplot as plt 7 | from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator 8 | from scipy.misc import imread 9 | 10 | 11 | comments = [] 12 | with open('files/comments.txt', 'r', encoding='utf-8')as f: 13 | rows = f.readlines() 14 | try: 15 | for row in rows: 16 | comment = row.split(',')[2] 17 | if comment != '': 18 | comments.append(comment) 19 | # print(city) 20 | except Exception as e: 21 | print(e) 22 | comment_after_split = jieba.cut(str(comments), cut_all=False) 23 | words = ' '.join(comment_after_split) 24 | #过虑没用的停止词 25 | # stopwords = STOPWORDS.copy() 26 | # stopwords.add('电影') 27 | # stopwords.add('一部') 28 | # stopwords.add('一个') 29 | # stopwords.add('没有') 30 | # stopwords.add('什么') 31 | # stopwords.add('有点') 32 | # stopwords.add('感觉') 33 | # stopwords.add('海王') 34 | # stopwords.add('就是') 35 | # stopwords.add('觉得') 36 | 37 | 
38 | bg_image = plt.imread('venmo1.jpg') 39 | wc = WordCloud(width=1900, height=1080, background_color='white', mask=bg_image, font_path='STKAITI.TTF', 40 | stopwords=stopwords, max_font_size=400, random_state=50) 41 | wc.generate_from_text(words) 42 | plt.imshow(wc) 43 | plt.axis('off') 44 | plt.show() 45 | 46 | wc.to_file('网易云热评词云图.jpg') -------------------------------------------------------------------------------- /Visualization/maoyanMovies_comments/t1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/maoyanMovies_comments/t1.jpg -------------------------------------------------------------------------------- /Visualization/maoyanMovies_comments/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author:Ryan time:2018/11/20 4 | 5 | import requests 6 | import json 7 | import random 8 | import time 9 | from datetime import datetime 10 | from datetime import timedelta 11 | 12 | def get_data(url): 13 | headers = { 14 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'} 15 | html = requests.get(url, headers=headers) 16 | if html.status_code ==200: 17 | return html.content 18 | else: 19 | return none 20 | 21 | def parse_data(html): 22 | json_data = json.loads(html)['cmts'] 23 | comments = [] 24 | try: 25 | for item in json_data: 26 | comment = { 27 | 'nickName': item['nickName'], 28 | 'cityName': item['cityName'] if 'cityName' in item else '', 29 | 'content': item['content'].strip().replace('\n', ''), 30 | 'score': item['score'], 31 | 'startTime': item['startTime'] 32 | } 33 | comments.append(comment) 34 | return comments 35 | except Exception as e: 36 | print(e) 37 | 38 | def save(): 39 | start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 40 | end_time = '2019-2-05 00:00:00' 41 | while start_time > end_time: 42 | url = 'http://m.maoyan.com/mmdb/comments/movie/248906.json?_v_=yes&offset=15&startTime=' + start_time.replace( 43 | ' ', '%20') 44 | html = None 45 | try: 46 | html = get_data(url) 47 | except Exception as e: 48 | time.sleep(0.5) 49 | html = get_data(url) 50 | else: 51 | time.sleep(0.1) 52 | comments =parse_data(html) 53 | start_time = comments[14]['startTime'] 54 | print(start_time) 55 | start_time = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') + timedelta(seconds=-1) 56 | start_time = datetime.strftime(start_time, '%Y-%m-%d %H:%M:%S') 57 | for item in comments: 58 | print(item) 59 | with open('E:/spiderproject/maoyanMovies_comments/comments.txt', 'a', encoding='utf-8')as f: 60 | f.write(item['nickName']+','+item['cityName'] +','+item['content']+','+str(item['score'])+ item['startTime'] + '\n') 61 | if __name__ == '__main__': 62 | url = 'http://m.maoyan.com/mmdb/comments/movie/248906.json?_v_=yes&offset=15&startTime=2018-11-19%2019%3A36%3A43' 63 | html = get_data(url) 64 | reusults = parse_data(html) 65 | save() -------------------------------------------------------------------------------- /Visualization/maoyanMovies_comments/venmo1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/maoyanMovies_comments/venmo1.jpg -------------------------------------------------------------------------------- 
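analysis.py above patches pyecharts' bundled city_coordinates.json through an absolute, machine-specific Anaconda path. A more portable way to reach that file is to derive the path from the installed package itself — a sketch that assumes pyecharts 0.5.x, the version whose datasets/city_coordinates.json the script edits:

```python
# Sketch: locate pyecharts' bundled city_coordinates.json from the installed
# package instead of the hard-coded site-packages path used in analysis.py.
# Assumes pyecharts 0.5.x, where the file lives under pyecharts/datasets/.
import json
import os
import pyecharts

COORD_FILE = os.path.join(os.path.dirname(pyecharts.__file__),
                          "datasets", "city_coordinates.json")

def load_city_coordinates():
    """Read the coordinate table as a dict of {place name: [lon, lat]}."""
    with open(COORD_FILE, mode="r", encoding="utf-8") as f:
        return json.load(f)

def save_city_coordinates(data):
    """Write the (patched) coordinate table back, keeping Chinese names readable."""
    with open(COORD_FILE, mode="w", encoding="utf-8") as f:
        f.write(json.dumps(data, ensure_ascii=False))
```

handle() in analysis.py could then call load_city_coordinates() and save_city_coordinates() instead of hard-coding the two open() paths.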
/Visualization/maoyanMovies_comments/venom.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/Visualization/maoyanMovies_comments/venom.jpg -------------------------------------------------------------------------------- /Visualization/maoyanMovies_comments/wd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author:Ryan time:2018/11/20 4 | 5 | import jieba 6 | import matplotlib.pyplot as plt 7 | from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator 8 | from scipy.misc import imread 9 | 10 | 11 | comments = [] 12 | with open('E:/spiderproject/maoyanMovies_comments/', 'r', encoding='utf-8')as f: 13 | rows = f.readlines() 14 | try: 15 | for row in rows: 16 | comment = row.split(',')[2] 17 | if comment != '': 18 | comments.append(comment) 19 | # print(city) 20 | except Exception as e: 21 | print(e) 22 | comment_after_split = jieba.cut(str(comments), cut_all=False) 23 | words = ' '.join(comment_after_split) 24 | #多虑没用的停止词 25 | stopwords = STOPWORDS.copy() 26 | stopwords.add('电影') 27 | stopwords.add('一部') 28 | stopwords.add('一个') 29 | stopwords.add('没有') 30 | stopwords.add('什么') 31 | stopwords.add('有点') 32 | stopwords.add('感觉') 33 | stopwords.add('毒液') 34 | stopwords.add('就是') 35 | stopwords.add('觉得') 36 | 37 | 38 | bg_image = plt.imread('venmo1.jpg') 39 | wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_image, font_path='STKAITI.TTF', 40 | stopwords=stopwords, max_font_size=400, random_state=50) 41 | wc.generate_from_text(words) 42 | plt.imshow(wc) 43 | plt.axis('off') 44 | plt.show() 45 | 46 | wc.to_file('词云图.jpg') -------------------------------------------------------------------------------- /pythonScript/OlympicGamesGoldenNotify/__pycache__/mail.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/OlympicGamesGoldenNotify/__pycache__/mail.cpython-37.pyc -------------------------------------------------------------------------------- /pythonScript/OlympicGamesGoldenNotify/__pycache__/medals.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/OlympicGamesGoldenNotify/__pycache__/medals.cpython-37.pyc -------------------------------------------------------------------------------- /pythonScript/OlympicGamesGoldenNotify/index.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from mail import sendmail 4 | 5 | # To enable the initializer feature (https://help.aliyun.com/document_detail/158208.html) 6 | # please implement the initializer function as below: 7 | # def initializer(context): 8 | # logger = logging.getLogger() 9 | # logger.info('initializing') 10 | 11 | def handler(event, context): 12 | sendmail() -------------------------------------------------------------------------------- /pythonScript/OlympicGamesGoldenNotify/mail.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | from email.mime.text import MIMEText 3 | from email.header import Header 4 | from medals import getWinners, getRanking 5 | 6 | def 
sendmail(): 7 | sender = 'cgyung@qq.com' # 发送邮箱 8 | senderName = "潜龙于野" # 发送者昵称 9 | password = 'qktwjlvxlyrwcagi' # 发送方QQ邮箱授权码 10 | receivers = ['admin@yinuxy.com'] # 接收邮件 11 | 12 | # 三个参数:第一个为文本内容,第二个 plain 设置文本格式,第三个 utf-8 设置编码 13 | str = getRanking() + getWinners() 14 | message = MIMEText(str, 'plain', 'utf-8') 15 | message['From'] = Header(senderName, 'utf-8') # 发送者昵称 16 | 17 | # 主题 18 | subject = '东京奥运会金牌排行榜及获奖人员' 19 | message['Subject'] = Header(subject, 'utf-8') 20 | 21 | try: 22 | client = smtplib.SMTP_SSL('smtp.qq.com', smtplib.SMTP_SSL_PORT) 23 | print("连接到邮件服务器成功") 24 | 25 | client.login(sender, password) 26 | print("登录成功") 27 | 28 | client.sendmail(sender, receivers, message.as_string()) 29 | print("邮件发送成功") 30 | except smtplib.SMTPException: 31 | print("Error: 无法发送邮件") 32 | 33 | if __name__ == '__main__': 34 | sendmail() -------------------------------------------------------------------------------- /pythonScript/OlympicGamesGoldenNotify/medals.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | def getMedalsList(url, replaceTxt): 5 | try: 6 | r = requests.get(url) 7 | r.raise_for_status() 8 | r.encoding = r.apparent_encoding 9 | 10 | # 替换多余的内容 11 | data = str.replace(r.text, replaceTxt + "(", "") 12 | data = str.replace(data, ");", "") 13 | # 解码json,转成字典 14 | medals = json.loads(data) 15 | 16 | return medals 17 | 18 | except: 19 | return "Failed!" 20 | 21 | # 获取排行榜数据 22 | def getRanking(): 23 | url = "https://api.cntv.cn/olympic/getOlyMedals?serviceId=pcocean&itemcode=GEN-------------------------------&t=jsonp&cb=omedals1" 24 | medals = getMedalsList(url, "omedals1") 25 | # 获取数据列表 26 | medalList = medals['data']['medalsList'] 27 | res = "" 28 | for i in range(5): 29 | res += "第" + medalList[i]["rank"] + "名:" + medalList[i]["countryname"] + "(" + medalList[i]["countryid"] + ")\n" 30 | res += "金牌/银牌/铜牌:" + medalList[i]["gold"] + "/" + medalList[i]["silver"] + "/" + medalList[i]["bronze"] + "\n\n" 31 | return res 32 | 33 | # 中国奖牌获得者数据 34 | def getWinners(): 35 | url = "https://api.cntv.cn/Olympic/getOlyMedalList?t=jsonp&cb=OM&serviceId=pcocean&countryid=CHN" 36 | owners = getMedalsList(url, "OM") 37 | # 获取数据列表 38 | ownerList = owners['data']['medalList'] 39 | gold = "" # 金牌 40 | silver = "" # 银牌 41 | bronze = "" # 铜牌 42 | for owner in ownerList: 43 | medaltype = owner['medaltype'] # 奖牌类型 44 | startdatecn = owner['startdatecn'] # 日期CN 45 | item = owner['itemcodename'] + " " + owner['subitemname'] # 项目 46 | playname = owner['playname'] # 运动员 47 | if medaltype == "ME_GOLD": 48 | gold += "日期:" + startdatecn + "\n项目:" + item + "\n获得者:" + playname+"\n\n" 49 | elif medaltype == "ME_SILVER": 50 | silver += "日期:" + startdatecn + "\n项目:" + item + "\n获得者:" + playname+"\n\n" 51 | elif medaltype == "ME_BRONZE": 52 | bronze += "日期:" + startdatecn + "\n项目:" + item + "\n获得者:" + playname+"\n\n" 53 | 54 | res = "\n-------金牌:---------\n" + gold+"\n-------银牌:---------\n" + silver+"\n-------铜牌:---------\n"+ bronze 55 | return res 56 | 57 | if __name__ == '__main__': 58 | print(getRanking()) 59 | print(getWinners()) -------------------------------------------------------------------------------- /pythonScript/WordCloud/Image-coloredwordcloud.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 3.6.1 2 | # -*- coding:utf-8 -*- 3 | # ____author___='Yinux' 4 | """ 5 | Image-colored wordcloud 6 | ======================= 7 | 
您可以在ImageColorGenerator中实现使用基于图像的着色策略对文字云进行着色,它使用由源图像中的单词占用的区域的平均颜色。 8 | 9 | """ 10 | 11 | from os import path 12 | from PIL import Image 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | 16 | from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator 17 | 18 | d = path.dirname(__file__) 19 | 20 | # 读取整个文本 21 | text = open(path.join(d, 'comments.txt')).read() 22 | 23 | # 读取蒙板/彩色图像(图片是从http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010下载的) 24 | alice_coloring = np.array(Image.open(path.join(d, "alice_color.png"))) 25 | stopwords = set(STOPWORDS) 26 | stopwords.add("said") 27 | 28 | wc = WordCloud(background_color="white", max_words=2000, mask=alice_coloring, 29 | stopwords=stopwords, max_font_size=40, random_state=42) 30 | # 生成词云 31 | wc.generate(text) 32 | 33 | # 从图像创建着色 34 | image_colors = ImageColorGenerator(alice_coloring) 35 | 36 | # 显示 37 | plt.imshow(wc, interpolation="bilinear") 38 | plt.axis("off") #不显示坐标尺寸 39 | plt.figure() 40 | # 重新着色词云并显示 41 | # 我们也可以直接在构造函数中给使用:color_func=image_colors 42 | plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear") 43 | plt.axis("off") #不显示坐标尺寸 44 | plt.figure() 45 | plt.imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear") 46 | plt.axis("off") #不显示坐标尺寸 47 | plt.show()#一次绘制三张图 48 | -------------------------------------------------------------------------------- /pythonScript/WordCloud/Maskedwordcloud.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 3.6.1 2 | # -*- coding:utf-8 -*- 3 | # ____author___='Yinux' 4 | """ 5 | Masked wordcloud 6 | ================ 7 | 8 | 使用蒙版图像可以生成任意形状的wordcloud。 9 | """ 10 | from os import path 11 | from PIL import Image 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | 15 | from wordcloud import WordCloud, STOPWORDS 16 | 17 | d = path.dirname(__file__) 18 | 19 | # 读取整个文本. 
20 | text = open(path.join(d, 'comments.txt')).read() 21 | 22 | #读取图片(图片来源:http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg) 23 | alice_mask = np.array(Image.open(path.join(d, "alice_color.png"))) 24 | 25 | stopwords = set(STOPWORDS) 26 | stopwords.add("said") 27 | #设置词云的一些属性 28 | wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask, 29 | stopwords=stopwords) 30 | # 生成词云 31 | wc.generate(text) 32 | 33 | #保存到本地 34 | wc.to_file(path.join(d, "alice.png")) 35 | 36 | #展示 37 | plt.imshow(wc, interpolation='bilinear') 38 | plt.axis("off") 39 | plt.figure() 40 | plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear') 41 | plt.axis("off") 42 | plt.show() 43 | -------------------------------------------------------------------------------- /pythonScript/WordCloud/alice_color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/WordCloud/alice_color.png -------------------------------------------------------------------------------- /pythonScript/WordCloud/coloredWd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 3.6.1 2 | # -*- coding:utf-8 -*- 3 | # ____author___='Yinux' 4 | """ 5 | Image-colored wordcloud 6 | ======================= 7 | 您可以在ImageColorGenerator中实现使用基于图像的着色策略对文字云进行着色,它使用由源图像中的单词占用的区域的平均颜色。 8 | """ 9 | from os import path 10 | from PIL import Image 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | 14 | from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator 15 | 16 | d = path.dirname(__file__) 17 | 18 | # 读取整个文本 19 | text = open(path.join(d, 'comments.txt')).read() 20 | 21 | # 读取蒙板/彩色图像(图片是从http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010下载的) 22 | alice_coloring = np.array(Image.open(path.join(d, "alice_color.png"))) 23 | stopwords = set(STOPWORDS) 24 | stopwords.add("said") 25 | 26 | wc = WordCloud(background_color="white", max_words=2000, mask=alice_coloring, 27 | stopwords=stopwords, max_font_size=40, random_state=42) 28 | # 生成词云 29 | wc.generate(text) 30 | 31 | # 从图像创建着色 32 | image_colors = ImageColorGenerator(alice_coloring) 33 | 34 | # 显示 35 | plt.imshow(wc, interpolation="bilinear") 36 | plt.axis("off") #不显示坐标尺寸 37 | plt.figure() 38 | # 重新着色词云并显示 39 | # 我们也可以直接在构造函数中给使用:color_func=image_colors 40 | plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear") 41 | plt.axis("off") #不显示坐标尺寸 42 | plt.figure() 43 | plt.imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear") 44 | plt.axis("off") #不显示坐标尺寸 45 | plt.show()#一次绘制三张图 -------------------------------------------------------------------------------- /pythonScript/WordCloud/comments.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/WordCloud/comments.txt -------------------------------------------------------------------------------- /pythonScript/autoVote/autoVote.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import requests 4 | import getCookie 5 | from lxml import etree 6 | 7 | def geySkey(cookie): 8 | arrcookie = cookie.split("; ") 9 | for i in range(len(arrcookie)): 10 | arr = arrcookie[i].split("=") 11 | if(arr[0] == 'skey'): 12 | print(arr[1]) 13 | return arr[1] 14 | 15 | def 
getGTK(skey): 16 | skey=geySkey(skey) 17 | hash = 5381 18 | for i in range(len(skey)): 19 | hash = hash + (hash << 5) + int(ord(skey[i])) 20 | return (hash & 0x7fffffff) 21 | 22 | def dailyTaskAutuComiit(header_dict, vote_url, comment_url, signInurl): 23 | base_url = "https://cloud.tencent.com/developer/ask?q=timeline" 24 | header = { 25 | "Accept": "application/json, text/plain, */*", 26 | "Accept-Encoding": "gzip, deflate, br", 27 | "Accept-Language": "zh-CN,zh;q=0.9", 28 | "Connection": "keep-alive", 29 | "Host": "cloud.tencent.com", 30 | "Referer": "https://cloud.tencent.com/developer/ask", 31 | "Sec-Fetch-Mode": "cors", 32 | "Sec-Fetch-Site": "same-origin", 33 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36" 34 | } 35 | r1 = requests.get(url=base_url, headers=header) 36 | r1.encoding = r1.apparent_encoding 37 | html = etree.HTML(r1.content) 38 | id = {} 39 | data = html.xpath('//div/div[2]/div[3]/a/@href') 40 | for item in data: 41 | idstr = str(item).replace("/developer/ask/","") 42 | idlist = idstr.split('/answer/') 43 | if(len(idlist)>1): 44 | id[idlist[0]] = idlist[1] 45 | votepayloadList = [] 46 | commentpayloadList = [] 47 | for key,value in id.items(): 48 | votepayload = "{\r\n \"action\": \"VoteAnswer\",\r\n \"payload\": {\r\n \"questionId\": %s,\r\n \"answerId\": %s,\r\n \"vote\": 1\r\n }\r\n}"%(key, value) 49 | commentpayload = "{\r\n \"action\": \"CommentAnswer\",\r\n \"payload\": {\r\n \"questionId\": %s,\r\n \"answerId\": %s,\r\n \"content\": \"%s\"\r\n }\r\n}"%(key, value, '666') 50 | votepayloadList.append(votepayload) 51 | commentpayloadList.append(commentpayload) 52 | index = random.sample(range(1,20),5) 53 | index.sort() 54 | for i in index: 55 | r1 = requests.request("POST", vote_url, headers=header_dict, data=votepayloadList[i]) 56 | print("第{}篇文章已点赞,返回代码:".format(i),r1.text) 57 | time.sleep(random.randint(5,10)) 58 | # r2 = requests.request("POST", comment_url, headers=header_dict, data=commentpayloadList[i]) 59 | # print("第{}篇文章已评论,返回代码:".format(i),r2.text) 60 | # time.sleep(random.randint(5,10)) 61 | 62 | def getComment(): 63 | commentsList = [ 64 | "专业的回答,感谢分享", 65 | "不错不错", 66 | "大佬讲的太好啦,受益匪浅", 67 | "学习了,感谢分享经验", 68 | "太强了", 69 | "厉害哦", 70 | "不错啊", 71 | "很好", 72 | "学到了", 73 | "谢谢分享,学习了", 74 | "专业的回答", 75 | "666", 76 | "yyds", 77 | "11111111" 78 | ] 79 | return random.choice(commentsList) 80 | 81 | 82 | if __name__ == '__main__': 83 | getCookie.init() 84 | getCookie.login() 85 | time.sleep(10) 86 | with open("cookie.txt", "r", encoding="utf-8") as f: 87 | cookie = f.read() 88 | csrfCode = getGTK(cookie) 89 | signInurl = "https://cloud.tencent.com/developer/services/ajax/grocery-stall?action=SignIn&uin=100004697298&csrfCode=%s"%(csrfCode) 90 | vote_url = "https://cloud.tencent.com/developer/services/ajax/ask/answer?action=VoteAnswer&uin=100004697298&csrfCode=%s"%(csrfCode) 91 | comment_url = "https://cloud.tencent.com/developer/services/ajax/ask/answer?action=CommentAnswer&uin=100004697298&csrfCode=%s"%(csrfCode) 92 | header_dict = { 93 | 'accept': 'application/json, text/plain, */*', 94 | 'accept-encoding': 'gzip, deflate, br', 95 | 'accept-language': 'zh-CN,zh;q=0.9', 96 | 'content-type': 'application/json;charset=UTF-8', 97 | 'cookie': f"{cookie}", 98 | 'origin': 'https://cloud.tencent.com', 99 | 'referer': 'https://cloud.tencent.com/developer/ask', 100 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 
Safari/537.36' 101 | } 102 | dailyTaskAutuComiit(header_dict, vote_url, comment_url, signInurl) 103 | # print(geySkey(cookie)) 104 | 105 | 106 | -------------------------------------------------------------------------------- /pythonScript/autoVote/cookie.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/autoVote/cookie.txt -------------------------------------------------------------------------------- /pythonScript/autoVote/getCookie.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver.support.wait import WebDriverWait 4 | 5 | 6 | 7 | url = 'https://cloud.tencent.com/developer' 8 | 9 | # 初始化 10 | def init(): 11 | # 定义为全局变量,方便其他模块使用 12 | global browser, wait 13 | # 实例化一个chrome浏览器 14 | option = webdriver.ChromeOptions() 15 | # option.add_argument("--user-data-dir=" + r"C:/Users/Administrator/AppData/Local/Google/Chrome/User Data") 16 | # proxy = get_ip()['HTTP'] 17 | # option.add_argument("--proxy-server=http://54.255.66.81:80") 18 | option.add_experimental_option('excludeSwitches', ['enable-automation']) 19 | option.add_argument("--disable-blink-features=AutomationControlled") 20 | browser = webdriver.Chrome(chrome_options=option) 21 | # 最大化窗口 22 | browser.maximize_window() 23 | time.sleep(2) 24 | # 设置等待超时 25 | wait = WebDriverWait(browser, 20) 26 | 27 | # 登录 28 | def login(): 29 | # 打开登录页面 30 | browser.get(url) 31 | # # 获取用户名输入框 32 | browser.find_element_by_xpath('//*[@id="react-root"]/div[1]/div[1]/div/div[2]/div[2]/div[3]/a[1]').click() 33 | browser.find_element_by_class_name('clg-icon-qq').click() 34 | time.sleep(10) 35 | 36 | # 获取cookie 37 | get_cookies_js = "return document.cookie" 38 | cookie = browser.execute_script(get_cookies_js) 39 | print(cookie) 40 | 41 | with open("./cookie.txt", "w", encoding="utf-8") as f: 42 | f.write(cookie) 43 | # page_source = browser.page_source 44 | # with open("page.html","w",encoding="utf-8") as f: 45 | # f.write(page_source) 46 | 47 | 48 | if __name__ == '__main__': 49 | init() 50 | login() 51 | -------------------------------------------------------------------------------- /pythonScript/birthdayNotify/__pycache__/lunar.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/birthdayNotify/__pycache__/lunar.cpython-37.pyc -------------------------------------------------------------------------------- /pythonScript/birthdayNotify/birthday.json: -------------------------------------------------------------------------------- 1 | { 2 | "friend1":{ 3 | "name":"name", 4 | "relationship":"relationship", 5 | "birthday":"1998-08-26", 6 | "isLunar": true 7 | }, 8 | "friend2":{ 9 | "name":"name", 10 | "relationship":"relationship", 11 | "birthday":"1999-07-14", 12 | "isLunar":false 13 | }, 14 | "friend3":{ 15 | "name":"name", 16 | "relationship":"relationship", 17 | "birthday":"1971-07-10", 18 | "isLunar":true 19 | }, 20 | "friend4":{ 21 | "name":"name", 22 | "relationship":"relationship", 23 | "birthday":"1972-01-23", 24 | "isLunar":true 25 | }, 26 | "friend5":{ 27 | "name":"name", 28 | "relationship":"relationship", 29 | "birthday":"1994-08-20", 30 | "isLunar":true 31 | }, 32 | "friend6":{ 33 | "name":"name", 34 | "relationship":"relationship", 35 | 
"birthday":"1999-06-10", 36 | "isLunar":false 37 | } 38 | } -------------------------------------------------------------------------------- /pythonScript/birthdayNotify/birthdayNotify.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/birthdayNotify/birthdayNotify.zip -------------------------------------------------------------------------------- /pythonScript/birthdayNotify/index.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import time 4 | import smtplib 5 | import datetime 6 | import requests 7 | from email.mime.text import MIMEText 8 | from email.header import Header 9 | from borax.calendars.lunardate import LunarDate 10 | 11 | # To enable the initializer feature (https://help.aliyun.com/document_detail/158208.html) 12 | # please implement the initializer function as below: 13 | # def initializer(context): 14 | # logger = logging.getLogger() 15 | # logger.info('initializing') 16 | 17 | 18 | def hitokoto(): 19 | #指定 api 的接口地址并设定 url 参数 20 | api_url = 'https://v1.hitokoto.cn/?c=d&c=h&c=i&c=k&encode=json' 21 | #向网站 api 发送请求并获取返回的数据 22 | response = requests.get(api_url) 23 | #将 json 数据对象转化为字典 24 | res = json.loads(response.text) 25 | #取出一言正文和出处拼装为字符串 26 | a_word = res['hitokoto']+' _____'+'《'+res['from']+'》' 27 | #输出一言 28 | return a_word 29 | 30 | def ln_date_str(month, day): 31 | # 月份 32 | lm = '正二三四五六七八九十冬腊' 33 | # 日份 34 | ld = '初一初二初三初四初五初六初七初八初九初十十一十二十三十四十五十六十七十八十九二十廿一廿二廿三廿四廿五廿六廿七廿八廿九三十' 35 | return '{}月{}'.format(lm[month-1], ld[(day-1)*2:day*2]) 36 | 37 | def sendmail(res,relationship, name): 38 | sender = 'cgyung@qq.com' # 发送邮箱 39 | senderName = "潜龙于野" # 发送者昵称 40 | password = 'qktwjlvxlyrwcagi' # 发送方QQ邮箱授权码 41 | receivers = ['admin@yinuxy.com'] # 接收邮件 42 | 43 | # 三个参数:第一个为文本内容,第二个 plain 设置文本格式,第三个 utf-8 设置编码 44 | # str = getRanking() + getWinners() 45 | message = MIMEText(res, 'plain', 'utf-8') 46 | message['From'] = Header(senderName, 'utf-8') # 发送者昵称 47 | 48 | # 主题 49 | subject = '您的{}{}快要过生日啦'.format(relationship, name) 50 | message['Subject'] = Header(subject, 'utf-8') 51 | 52 | try: 53 | client = smtplib.SMTP_SSL('smtp.qq.com', smtplib.SMTP_SSL_PORT) 54 | print("连接到邮件服务器成功") 55 | 56 | client.login(sender, password) 57 | print("登录成功") 58 | 59 | client.sendmail(sender, receivers, message.as_string()) 60 | print("邮件发送成功") 61 | except smtplib.SMTPException: 62 | print("Error: 无法发送邮件") 63 | 64 | def birthdayNotify(path='./birthday.json'): 65 | data = {} 66 | with open(path,'r',encoding='utf8')as fp: 67 | data = json.load(fp) 68 | for value in data.values(): 69 | 70 | birth = value['birthday'] 71 | birth = datetime.datetime.strptime(birth, "%Y-%m-%d") 72 | 73 | birthyear = birth.year 74 | today = datetime.date.today() 75 | 76 | if value['isLunar']: 77 | thisbirth = LunarDate(today.year,birth.month,birth.day) 78 | solardate = thisbirth.to_solar_date() 79 | 80 | if (solardate-today).days < 0 : 81 | thisbirth = LunarDate(today.year+1,birth.month,birth.day) 82 | solardate = thisbirth.to_solar_date() 83 | age = thisbirth.year - birthyear + 1 84 | 85 | # res = "今天是公历 {} \n您的 {} {} 将于 {}年{}月{}日 过生日({}天后)\n(农历生日{})\n\n今天是他的第{}个生日,快去为他挑选一件合适的礼物吧~\n\n{}\n\n\n".format(today, value['relationship'], value['name'], solardate.year, solardate.month, solardate.day, (solardate-today).days,ln_date_str(birth.month,birth.day), age, hitokoto()) 86 | # print(res) 87 | # 
sendmail(res,value['relationship'], value['name']) 88 | 89 | if (solardate-today).days<=7 and (solardate-today).days>=0: 90 | res = "今天是公历 {} \n您的{}{}将于{}年{}月{}日过生日({}天后)\n农历:{}\n\n今天是他的第{}个生日,快去为他挑选一件合适的礼物吧~\n\n{}\n\n\n".format(today, value['relationship'], value['name'], solardate.year, solardate.month, solardate.day, (solardate-today).days, ln_date_str(birth.month,birth.day), age, hitokoto()) 91 | print(res) 92 | sendmail(res,value['relationship'], value['name']) 93 | else: 94 | thisbirth = LunarDate(today.year,birth.month,birth.day) 95 | if (thisbirth-today).days < 0 : 96 | thisbirth = LunarDate(today.year+1,birth.month,birth.day) 97 | age = thisbirth.year - birthyear + 1 98 | 99 | # res = "今天是公历 {} \n您的 {} {} 将于 {}年{}月{}日 过生日({}天后)\n\n今天是他的第{}个生日,快去为他挑选一件合适的礼物吧~\n\n{}\n\n\n".format(today, value['relationship'], value['name'], thisbirth.year, thisbirth.month, thisbirth.day, (thisbirth-today).days, age, hitokoto()) 100 | # print(res) 101 | # sendmail(str(res),value['relationship'], value['name']) 102 | 103 | if (thisbirth-today).days<=7 and (thisbirth-today).days>=0: 104 | res = "今天是公历 {} \n您的 {} {} 将于 {}年{}月{}日 过生日({}天后)\n\n今天是他的第{}个生日,快去为他挑选一件合适的礼物吧~\n\n{}\n\n\n".format(today, value['relationship'], value['name'], thisbirth.year, thisbirth.month, thisbirth.day, (thisbirth-today).days, age, hitokoto()) 105 | print(res) 106 | # sendmail(res,value['relationship'], value['name']) 107 | time.sleep(5) 108 | 109 | def handler(event, context): 110 | birthdayNotify() 111 | 112 | if __name__ == '__main__': 113 | birthdayNotify() -------------------------------------------------------------------------------- /pythonScript/birthdayNotify/text.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from borax.calendars.lunardate import LunarDate 4 | 5 | 6 | 7 | today = datetime.date.today() 8 | print(type(today.year), type(today.month), type(today.day)) 9 | thisday = LunarDate(today.year,today.month,today.day) 10 | print(thisday, type(thisday)) 11 | solardate = thisday.to_solar_date() 12 | print(solardate, type(solardate)) 13 | 14 | 15 | thisbirth = LunarDate(today.year,today.month,today.day) 16 | ssolardate = thisbirth.to_solar_date() 17 | print(ssolardate, type(ssolardate)) 18 | -------------------------------------------------------------------------------- /pythonScript/dingReminder/dingReminder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Dec 3 16:46:04 2020 4 | 5 | @author: YINUXY 6 | """ 7 | 8 | 9 | import dingtalkchatbot.chatbot as cb 10 | webhook = 'https://oapi.dingtalk.com/robot/send?access_token=2174abe57b7e6874d0143ba18351ed77c59c2b7f25ad476b82bcf4a449007025' 11 | robot = cb.DingtalkChatbot(webhook) 12 | robot.send_markdown(title='首屏会话透出的展示内容', 13 | text="# 这是支持markdown的文本 \n## 标题2 \n* 列表1 \n ![alt 啊](https://gw.alipayobjects.com/zos/skylark-tools/public/files/b424a1af2f0766f39d4a7df52ebe0083.png)") -------------------------------------------------------------------------------- /pythonScript/draw_excel/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/draw_excel/1.jpg -------------------------------------------------------------------------------- /pythonScript/draw_excel/2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/draw_excel/2.jpg -------------------------------------------------------------------------------- /pythonScript/draw_excel/4k_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/draw_excel/4k_1.jpg -------------------------------------------------------------------------------- /pythonScript/draw_excel/draw_excel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 13 21:54:25 2019 4 | 5 | @author: Yinux 6 | """ 7 | 8 | from PIL import Image 9 | import openpyxl 10 | from openpyxl.styles import fills 11 | import os 12 | 13 | MAX_WIDTH = 300 14 | MAX_HEIGHT = 300 15 | 16 | def resize(img): 17 | w, h = img.size 18 | if w > MAX_WIDTH: 19 | h = MAX_WIDTH / w * h 20 | w = MAX_WIDTH 21 | 22 | if h > MAX_HEIGHT: 23 | w = MAX_HEIGHT / h * w 24 | h = MAX_HEIGHT 25 | return img.resize((int(w), int(h)), Image.ANTIALIAS) 26 | 27 | 28 | def int_to_16(num): 29 | num1 = hex(num).replace('0x', '') 30 | num2 = num1 if len(num1) > 1 else '0' + num1 31 | return num2 32 | 33 | 34 | def draw_jpg(img_path): 35 | 36 | img_pic = resize(Image.open(img_path)) 37 | img_name = os.path.basename(img_path) 38 | out_file = './result/' + img_name.split('.')[0] + '.xlsx' 39 | if os.path.exists(out_file): 40 | os.remove(out_file) 41 | 42 | workbook = openpyxl.Workbook() 43 | worksheet = workbook.active 44 | 45 | width, height = img_pic.size 46 | 47 | for w in range(1, width + 1): 48 | 49 | for h in range(1, height + 1): 50 | if img_pic.mode == 'RGB': 51 | r, g, b = img_pic.getpixel((w - 1, h - 1)) 52 | elif img_pic.mode == 'RGBA': 53 | r, g, b, a = img_pic.getpixel((w - 1, h - 1)) 54 | 55 | hex_rgb = int_to_16(r) + int_to_16(g) + int_to_16(b) 56 | 57 | cell = worksheet.cell(column=w, row=h) 58 | 59 | if h == 1: 60 | _w = cell.column 61 | _h = cell.col_idx 62 | # 调整列宽 63 | # worksheet.column_dimensions[_w].width = 1 64 | _w_letter = openpyxl.utils.get_column_letter(_w) 65 | worksheet.column_dimensions[_w_letter].width = 1 66 | # 调整行高 67 | worksheet.row_dimensions[h].height = 6 68 | 69 | cell.fill = fills.PatternFill(fill_type="solid", fgColor=hex_rgb) 70 | 71 | print('write in:', w, ' | all:', width + 1) 72 | print('saving...') 73 | workbook.save(out_file) 74 | print('success!') 75 | 76 | if __name__ == '__main__': 77 | filepath = 'D:/Code/Python/Interesting/draw_excel/iu.jpg' 78 | draw_jpg(filepath) -------------------------------------------------------------------------------- /pythonScript/draw_excel/iu.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/draw_excel/iu.jpg -------------------------------------------------------------------------------- /pythonScript/messageReminder/README.md: -------------------------------------------------------------------------------- 1 | # 上下班打卡提醒 2 | ## 使用方法 3 | 1. 需要Python 3.X环境 4 | 2. 需要安装`pyweathercn`包 5 | ``` 6 | pip3 install pyweathercn 7 | ``` 8 | 3. key值获取 9 | * 若使用QQ提醒则前往[https://qmsg.zendee.cn/](https://qmsg.zendee.cn/)登录添加提醒QQ和获取`key`值 10 | * 若使用server酱提醒则前往[http://sc.ftqq.com/](http://sc.ftqq.com/)登录获取`key`值 11 | 4. 
替换`key`值后将此代码放入VPS后执行即可 12 | ## 定时策略 13 | 以Centos为例: 14 | ``` 15 | # 进入编写定时脚本 16 | crontab -e 17 | # 需要定时两次脚本(上下班) 18 | 20 8 * * * cd /project/dingReminder && python dingReminder.py >> dingReminder.log 2>&1 19 | 32 17 * * * cd /project/dingReminder && python dingReminder.py >> dingReminder.log 2>&1 20 | ``` 21 | > linux 定时任务编写脚本可参考[Linux Crontab 定时任务](https://www.runoob.com/w3cnote/linux-crontab-tasks.html) -------------------------------------------------------------------------------- /pythonScript/messageReminder/messageReminder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Dec 3 08:46:08 2020 4 | 5 | @author: YINUXY 6 | """ 7 | 8 | 9 | from datetime import datetime, date, time, timezone 10 | import requests 11 | import pyweathercn 12 | 13 | def getWeather(city): 14 | w = pyweathercn.Weather(city) 15 | context = '''今天是'''+w.today(True)['date']+'\n'+w.data['city']+'''市今天的天气为'''+w.today(True)['type']+'''\n温度变化为'''+w.today(True)['temp']+'\n'+w.tip() 16 | return context 17 | 18 | def sendQQ(wcontext): 19 | key = '*****************************' 20 | morning = '08:30:00' 21 | night = '17:30:00' 22 | nowtime = datetime.now().strftime('%H:%M:%S') 23 | if nowtime < morning: 24 | greeting = "早上好主人ヾ(✿゚▽゚)ノ\n美好的搬砖生活开始啦!(<ゝω・)☆\n快点打开手机钉钉进行上班打卡把!!!!!!(~ ̄▽ ̄)~ \n不然就要迟到啦∑(゚Д゚ノ)ノ\n" 25 | context = greeting + wcontext 26 | elif nowtime > night: 27 | greeting = "晚上好主人ヾ(✿゚▽゚)ノ\n辛苦的搬砖生活终于结束啦!(<ゝω・)☆\n不要忘记了晚间下班打卡哟( • ̀ω•́ )✧\n" 28 | context = greeting 29 | else: 30 | context = "现在还没到上/下班签到时间哦\n" 31 | url = 'https://qmsg.zendee.cn/send/' + key + '?msg='+context 32 | requests.post(url) 33 | 34 | def sendWechat(wcontext): 35 | key = '******************************************' 36 | title = '' 37 | morning = '08:30:00' 38 | night = '17:30:00' 39 | nowtime = datetime.now().strftime('%H:%M:%S') 40 | if nowtime < morning: 41 | title = '''上班打卡啦ヾ(✿゚▽゚)ノ''' 42 | greeting = '''> 早上好主人ヾ(✿゚▽゚)ノ\n美好的搬砖生活开始啦!(<ゝω・)☆\n> 快点打开手机钉钉进行上班打卡把!!!!!!(~ ̄▽ ̄)~ \n不然就要迟到啦∑(゚Д゚ノ)ノ\n''' 43 | context = greeting + wcontext 44 | elif nowtime > night: 45 | title = '''下班打卡啦ヾ(✿゚▽゚)ノ''' 46 | greeting = '''> 晚上好主人ヾ(✿゚▽゚)ノ\n> 辛苦的搬砖生活终于结束啦!(<ゝω・)☆\n> 不要忘记了晚间下班打卡哟( • ̀ω•́ )✧''' 47 | context = greeting 48 | else: 49 | title = '''上班时间请勿开小差!(〝▼皿▼)''' 50 | context = '''现在还没到上/下班签到时间哦\n''' + wcontext + wcontext 51 | url = "http://sc.ftqq.com/" + key + ".send?text=" + title + "&desp=" + context 52 | requests.post(url) 53 | 54 | if __name__ == '__main__': 55 | city = '杭州' 56 | w = getWeather(city) 57 | sendQQ(w) 58 | sendWechat(w) 59 | print(sendQQ(w)) 60 | -------------------------------------------------------------------------------- /pythonScript/miStoreBuy/MiStore.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Apr 26 21:22:58 2020 4 | 5 | @author: Yinux 6 | """ 7 | 8 | from selenium import webdriver 9 | import time 10 | import datetime 11 | chrome_driver = 'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe' #chromedriver的文件位置 12 | browser = webdriver.Chrome(executable_path = chrome_driver) 13 | 14 | def login(name ,pwd): 15 | browser.get( 'https://account.xiaomi.com/')#登录网址 16 | time.sleep(2) 17 | browser.find_element_by_id("username").send_keys(name) #利用账号标签的ID,确定位置并send信息 18 | browser.find_element_by_id("pwd").send_keys(pwd) #利用密码标签的ID,确定位置并send信息 19 | browser.find_element_by_id("login-button").click()#利用登录按钮的ID,确定位置并点击 20 | 
#如果找不到标签ID,可以使用其他方法来确定元素位置 21 | time.sleep(3) 22 | browser.get("https://s1.mi.com/m/app/hd/index.html?id=15042")#切换到秒杀页面 23 | print('登录成功,正在等待秒杀···') 24 | 25 | def buy_on_time(): 26 | while True: #不断刷新时钟 27 | now = datetime.datetime.now() 28 | if now.strftime('%H:%M:%S') == '09:00:00' or now.strftime('%H:%M:%S') == '11:00:00' or now.strftime('%H:%M:%S') == '15:00:00' or now.strftime('%H:%M:%S') == '17:00:00': 29 | # if now.strftime('%H:%M:%S') == buytime: 30 | browser.find_element_by_xpath("//div[@class='content-box flex-box']/a[@data-log_code='logcode#activity_code=wjsncc49&page=activity&page_id=15042&bid=3645414.0']/div/img").click() 31 | browser.find_element_by_xpath("//a[@data-log_code='logcode#activity_code=1i19jyzh&page=activity&page_id=15042&bid=3645414.0']").click() 32 | # browser.find_element_by_xpath("//a[@data-log_code='logcode#activity_code=tudhbjjy&page=activity&page_id=15042&bid=3646017.0']").click() #购买按钮的Xpath 33 | # browser.find_element_by_xpath("//a[@data-log_code='logcode#activity_code=qpohzak0&page=activity&page_id=15042&bid=3646017.0']").click() 34 | print('当前时段已抢购完毕') 35 | time.sleep(0.01)#注意刷新间隔时间要尽量短 36 | 37 | login('1317150488' , 'xiaomi0711') 38 | #time.sleep(10) 39 | #buy_on_time()#指定秒杀时间,并且开始等待秒杀 40 | browser.find_element_by_class_name('item flex-box-item')[2].click() 41 | #print("ending") -------------------------------------------------------------------------------- /pythonScript/miStoreBuy/debug.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/miStoreBuy/debug.log -------------------------------------------------------------------------------- /pythonScript/pdfToExcel/README.md: -------------------------------------------------------------------------------- 1 | 1. 转换 PDF中的表格 文件到 Microsoft Excel 文档 2 | 2. 程序自动识别pdf中的表格 3 | 3. 电子工程师用excel建原理图的库(orcad)会需要,特别是引脚多的原理图库,如FPGA等,厂家没有给excel的引脚表,然后分了很多页的pdf,每页都有表头,如果全部复制入EXCEL,需要后期做很多处理,而且有换行和空格等很麻烦,费时间,本程序完美解决,其他行业也应该有用到 4 | 4. 网上也有在线的转换工具和离线的转换工具,试了下不好用,上传后你的文件等于公开了,下载的工具好多不安全,有的要注册会员或有文件大小限制 5 | 5. 
将exe文件放到D盘根目录下(程序里写死了),把你的pdf文件改名为test.pdf(程序里写死了),后面有源码你可以自己改成输入路径的,图方便 6 | 7 | ### 注意事项: 8 | * 需不需要安装Microsoft Excel没有试过,测试时候是装着的 9 | * 执行exe后需要一会时间估计几十秒,会出现cmd控制台打印信息,pdf文件越大时间越长,实测9M多的pdf表格都可以 10 | 11 | -------------------------------------------------------------------------------- /pythonScript/pdfToExcel/pdfToExcel.py: -------------------------------------------------------------------------------- 1 | import pdfplumber 2 | import xlwt 3 | 4 | # 定义保存Excel的位置 5 | workbook = xlwt.Workbook() #定义workbook 6 | sheet = workbook.add_sheet('Sheet1') #添加sheet 7 | i = 0 # Excel起始位置 8 | 9 | #path = input("E:/MyProject/python/test.pdf") 10 | path = "D:/test.pdf" # 导入PDF路径 11 | pdf = pdfplumber.open(path) 12 | print('\n') 13 | print('开始读取数据') 14 | print('\n') 15 | for page in pdf.pages: 16 | # 获取当前页面的全部文本信息,包括表格中的文字 17 | # print(page.extract_text()) 18 | for table in page.extract_tables(): 19 | # print(table) 20 | for row in table: 21 | print(row) 22 | for j in range(len(row)): 23 | sheet.write(i, j, row[j]) 24 | i += 1 25 | print('---------- 分割线 ----------') 26 | 27 | pdf.close() 28 | 29 | # 保存Excel表 30 | workbook.save('D:/PDFresult.xls') 31 | print('\n') 32 | print('写入excel成功') 33 | print('保存位置:') 34 | print('D:/PDFresult.xls') 35 | print('\n') 36 | input('PDF取读完毕,按任意键退出') 37 | -------------------------------------------------------------------------------- /pythonScript/pdfToExcel/新建 Microsoft Word 文档.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/pdfToExcel/新建 Microsoft Word 文档.docx -------------------------------------------------------------------------------- /pythonScript/poem/Oxford3000.py: -------------------------------------------------------------------------------- 1 | import urllib3 2 | from bs4 import BeautifulSoup 3 | import certifi 4 | 5 | file = open('../data/word', "w+") 6 | http = urllib3.PoolManager( 7 | cert_reqs='CERT_REQUIRED', 8 | ca_certs=certifi.where()) 9 | 10 | url = 'https://www.oxfordlearnersdictionaries.com/wordlist/english/oxford3000/' 11 | r = http.request('GET', url) 12 | 13 | soup = BeautifulSoup(r.data, 'html.parser') 14 | 15 | category = soup.find('ul', class_="hide_phone") 16 | 17 | content = soup.find('ul', class_="wordlist-oxford3000") 18 | 19 | # for link in content.find_all('a'): 20 | # file.write(link.get_text()+'\n') 21 | 22 | pages = soup.find('ul', class_="paging_links") 23 | 24 | for cat in category.find_all('a'): 25 | # get the former category of data 26 | while pages.find('a', text=">"): 27 | next = pages.find('a', text=">") 28 | r = http.request('GET', next.get('href')) 29 | soup = BeautifulSoup(r.data, 'html.parser') 30 | pages = soup.find('ul', class_="paging_links") 31 | 32 | # get the former page of data 33 | for link in content.find_all('a'): 34 | if link.get_text() != 'o’clock': 35 | file.write(link.get_text()+'\n') 36 | # update the content 37 | content = soup.find('ul', class_="wordlist-oxford3000") 38 | # get the last page of content 39 | for link in content.find_all('a'): 40 | file.write(link.get_text()+'\n') 41 | 42 | r = http.request('GET', cat.get('href')) 43 | soup = BeautifulSoup(r.data, 'html.parser') 44 | 45 | content = soup.find('ul', class_="wordlist-oxford3000") 46 | pages = soup.find('ul', class_="paging_links") 47 | 48 | # get the last category of data 49 | while pages.find('a', text=">"): 50 | next = pages.find('a', text=">") 51 | r = http.request('GET', next.get('href')) 52 | soup = 
BeautifulSoup(r.data, 'html.parser') 53 | pages = soup.find('ul', class_="paging_links") 54 | 55 | # get the former page of data 56 | for link in content.find_all('a'): 57 | file.write(link.get_text()+'\n') 58 | # update the content 59 | content = soup.find('ul', class_="wordlist-oxford3000") 60 | # get the last page of content 61 | for link in content.find_all('a'): 62 | file.write(link.get_text()+'\n') 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /pythonScript/poem/TangshiGene.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import pinyin 4 | #生成四言律诗 5 | def Line4(): 6 | word_file = '../data/freqword.txt' 7 | 8 | dataset = open(word_file,encoding='utf-8').readlines() 9 | 10 | list = [] 11 | for word in dataset: 12 | # outfile.write(pinyin.get(word, format="strip")+" ") 13 | i = 0 14 | while i= time1 and nowtime <= time2: 26 | return "单词背完了吗,没背完要打屁屁哟!现在已经AM 08:00了,赶快去复习上午的课程把~" 27 | elif nowtime >= time2 and nowtime <= time3: 28 | return "今天的任务完成了吗,没完成的话可是要加夜班了哦!" 29 | elif nowtime >= time3 and nowtime <= time4: 30 | return "晚饭吃完了吗,赶紧去练字去!!!" 31 | elif nowtime >= time4 and nowtime <= time5: 32 | return "现在,可以以开始晚自习拉~~~" 33 | elif nowtime >= time5: 34 | return "今天的任务完成了吗,没有也请放到明天再做吧!" 35 | 36 | 37 | def getOneNote(): 38 | api_url = 'https://v1.hitokoto.cn/?c=k&c=d&c=h&encode=json' 39 | response = requests.get(api_url) 40 | res = json.loads(response.text) 41 | a_word = res['hitokoto']+' _____'+'《'+res['from']+'》' 42 | print(a_word) 43 | 44 | def sendmail(): 45 | sender = 'cgyung@qq.com' # 发送邮箱 46 | senderName = "笨鸟先飞~" # 发送者昵称 47 | password = 'qktwjlvxlyrwcagi' # 发送方QQ邮箱授权码 48 | receivers = ['admin@yinuxy.com'] # 接收邮件 49 | 50 | # 三个参数:第一个为文本内容,第二个 plain 设置文本格式,第三个 utf-8 设置编码 51 | str = notifyText() + getOneNote() 52 | message = MIMEText(str, 'plain', 'utf-8') 53 | message['From'] = Header(senderName, 'utf-8') # 发送者昵称 54 | 55 | # 主题 56 | subject = '叮~您有新的学习计划' 57 | message['Subject'] = Header(subject, 'utf-8') 58 | 59 | try: 60 | client = smtplib.SMTP_SSL('smtp.qq.com', smtplib.SMTP_SSL_PORT) 61 | print("连接到邮件服务器成功") 62 | 63 | client.login(sender, password) 64 | print("登录成功") 65 | 66 | client.sendmail(sender, receivers, message.as_string()) 67 | print("邮件发送成功") 68 | except smtplib.SMTPException: 69 | print("Error: 无法发送邮件") 70 | 71 | def sendQQ(): 72 | key = '42b60c3e094bed98331a1cc5e089ff64' 73 | context = notifyText() + getOneNote() 74 | url = 'https://qmsg.zendee.cn/send/' + key + '?msg='+context 75 | requests.post(url) 76 | 77 | def sendWechat(): 78 | key = 'SCT48533TKJb962s7xJdVTdsszsuv9Dks' 79 | title = '叮~您有新的学习计划' 80 | context = notifyText() + getOneNote() 81 | url = "http://sc.ftqq.com/" + key + ".send?text=" + title + "&desp=" + context 82 | requests.post(url) 83 | 84 | def handler(): 85 | print(type(notifyText())) 86 | sendmail() 87 | # sendQQ() 88 | # sendWechat() 89 | 90 | if __name__ == '__main__': 91 | handler() -------------------------------------------------------------------------------- /pythonScript/telegramPushBot/ht.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 30分钟判断一次进程是否存在,如果不存在就启动它 3 | # python3请使用全路径,否则可能出现无法启动 4 | PIDS=`ps -ef |grep locpush |grep -v grep | awk '{print $2}'` 5 | if [ "$PIDS" != "" ]; then 6 | echo "myprocess is running!" 7 | else 8 | echo "未发现程序后台运行,正在重启中!" 
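# Optional refinement (the log path below is only an example): redirect output to a log file
# so restarts can be reviewed later, e.g.
#   /usr/bin/python3 /project/hostlocpushBot/locpush.py >> /project/hostlocpushBot/locpush.log 2>&1 &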
9 | /usr/bin/python3 /project/hostlocpushBot/locpush.py & 10 | fi -------------------------------------------------------------------------------- /pythonScript/telegramPushBot/locpush.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import requests 4 | from urllib import parse 5 | from lxml import etree 6 | import time 7 | import datetime 8 | from requests.adapters import HTTPAdapter 9 | import re 10 | import js2py 11 | 12 | 13 | # 获得cookie 14 | def getcookies(): 15 | url = 'https://www.hostloc.com/forum.php?mod=forumdisplay&fid=45&filter=author&orderby=dateline' 16 | js = js2py.EvalJs() 17 | headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'} 18 | aesjs = requests.get("https://www.hostloc.com/aes.min.js", headers=headers, timeout=5).text 19 | js.execute(aesjs) 20 | getcookie = requests.get(url).text 21 | getcookie_script = re.findall("",getcookie) 22 | js.execute(getcookie_script[0].split("document")[0]) 23 | data = js.toHex(js.slowAES.decrypt(js.c, 2, js.a, js.b)) 24 | cookie = "L7DFW=" + data 25 | return cookie 26 | 27 | 28 | # 获得日期 29 | def get_week_day(date): 30 | week_day_dict = { 31 | 0: '星期一', 32 | 1: '星期二', 33 | 2: '星期三', 34 | 3: '星期四', 35 | 4: '星期五', 36 | 5: '星期六', 37 | 6: '星期日', 38 | } 39 | day = date.weekday() 40 | return week_day_dict[day] 41 | 42 | 43 | def get_content(url): 44 | while True: 45 | try: 46 | s = requests.get(url) 47 | hostloc_content = etree.HTML(s.content).xpath('//table/tr/td[@class="t_f"]/text()') 48 | 49 | if not hostloc_content: 50 | return "因权限原因,内容无法预览,请手动登陆查看!" 51 | else: 52 | s = '' 53 | for j in hostloc_content: 54 | s = s + j 55 | # 不展示全部内容,防止内容过长,严重影响体验 56 | return s[0:80].replace("\r\n", '').replace('\n', '').replace('\xa0', '').replace('\u200b', '') 57 | 58 | except Exception as e: 59 | print("网络原因,无法访问,请稍后再试...") 60 | return "因权限原因,内容无法预览,请手动登陆查看!" 
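# A minimal alternative sketch (an assumption, not something this script uses): rather than
# deleting MarkdownV2 special characters as mark_down() below does, Telegram also accepts them
# when each one is escaped with a backslash, which keeps the preview text intact. The reserved
# character set follows the Telegram Bot API documentation for the MarkdownV2 parse mode.
def escape_markdown_v2(text):
    reserved = '_*[]()~`>#+-=|{}.!'
    return ''.join('\\' + ch if ch in reserved else ch for ch in text)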
61 | 62 | 63 | def mark_down(content): 64 | # 删除特殊符号,防止发生错误parse 65 | sign = ['&', '.', '<', '>', ' ', '?', '"', "'", '#', '%', '!', '@', '$', '^', '*', '(', ')', '-', '_', '+', '=', '~', '/', ',', ':', '’', '‘', '“', '”', '%', '^', '——', '{', '}', '*', '[', '、', '\\', ']', '`', '"', "'", '\n'] 66 | for k in sign: 67 | content = content.replace(k, "") 68 | return content 69 | 70 | 71 | def post(chat_id, text): 72 | try: 73 | text = parse.quote(text) 74 | post_url = 'https://api.telegram.org/bot1124748196:*********************tOjQkKU_VOz8CY/sendMessage' \ 75 | '?parse_mode=MarkdownV2&chat_id={0}&text={1}'.format(chat_id, text) 76 | headers = { 77 | 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'} 78 | requests.get(post_url, headers=headers) 79 | except Exception: 80 | print("推送失败!") 81 | time.sleep(3) 82 | post(chat_id, text) 83 | 84 | 85 | # 主程序 86 | def master(r): 87 | xml_content = etree.HTML(r.content) 88 | href_list = xml_content.xpath('/html/body/div[@id="wp"]/div[5]/div/div/div[4]/div[2]/form/table/tbody/tr/th/a[3]/@href') 89 | author = xml_content.xpath('/html/body/div[@id="wp"]/div[5]/div/div/div[4]/div[2]/form/table/tbody/tr/td[2]/cite/a/text()') 90 | author_url = xml_content.xpath('/html/body/div[@id="wp"]/div[5]/div/div/div[4]/div[2]/form/table/tbody/tr/td[2]/cite/a/@href') 91 | number = xml_content.xpath('/html/body/div[@id="wp"]/div[5]/div/div/div[4]/div[2]/form/table/tbody/tr/td[3]/a/text()') 92 | href = xml_content.xpath('/html/body/div[@id="wp"]/div[5]/div/div/div[4]/div[2]/form/table/tbody/tr/th/a[3]/text()') 93 | print(author) 94 | print(number) 95 | for i in range(len(number)): 96 | if number[i] == '0': 97 | if str(href[i].replace("\r\n", "")) not in hostloc_list: 98 | hostloc_list.add(str(href[i].replace("\r\n", ""))) 99 | name = href[i].replace("\r\n", "") 100 | # 文章链接 101 | # print(i) 102 | k = i + 1 103 | # print(k) 104 | url_list = "https://www.hostloc.com/{}".format(href_list[i]) 105 | # 作者id链接 106 | url_author = "https://www.hostloc.com/{}".format(author_url[k]) 107 | # 时间戳 108 | time_1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 109 | date_1 = get_week_day(datetime.datetime.now()) 110 | time_2 = time_1 + ' ' + date_1 + ' ' 111 | time2 = str(time_2).replace('-', '\\-') 112 | # 获得预览内容 113 | # print(get_content(url_list)) 114 | content_2 = mark_down(get_content(url_list)) 115 | text = '主 题:' + "***{}***".format(mark_down(name)) + '\n' + '发 布 者:[{0}]({1})'.format(mark_down(author[i + 1]), url_author) + '\n' + '时 间:' + time2 + '\n' + '内容预览:[点击查看——{0}]({1})'.format(content_2, url_list) 116 | print(text) 117 | # 修改为自己的想推送的ID 118 | post('@locpush', text) 119 | else: 120 | pass 121 | else: 122 | pass 123 | 124 | 125 | # 副程序 126 | def master_1(r): 127 | xml_content = etree.HTML(r.content) 128 | href_list = xml_content.xpath("//div[@class='threadlist']/ul/li/a/@href") 129 | author = xml_content.xpath("//span[@class='by']/text()") 130 | number = xml_content.xpath("//span[@class='num']/text()") 131 | href = xml_content.xpath("//div[@class='threadlist']/ul/li/a/text()") 132 | print(author) 133 | print(number) 134 | # print(href) 135 | # print(href_list) 136 | for i in range(len(number)): 137 | if number[i] == '0': 138 | if str(href[2 * i].replace("\r\n", "")) not in hostloc_list: 139 | hostloc_list.add(str(href[i * 2].replace("\r\n", ""))) 140 | name = href[2 * i].replace("\r\n", "") 141 | # 转换链接: 142 | str_url = 
href_list[i].replace("forum.php?mod=viewthread&tid=", '').replace("&extra=page%3D1%26filter%3Dauthor%26orderby%3Ddateline&mobile=2", '') 143 | 144 | url_list = "https://www.hostloc.com/thread-{0}-1-1.html".format(str_url) 145 | # 时间戳 146 | time_1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 147 | date_1 = get_week_day(datetime.datetime.now()) 148 | time_2 = time_1 + ' ' + date_1 + ' ' 149 | time2 = str(time_2).replace('-', '\\-') 150 | # 获得预览内容 151 | # print(get_content(url_list)) 152 | content_2 = mark_down(get_content_1(url_list)) 153 | text = '主 题:' + "***{}***".format(mark_down(name)) + '\n' + '发 布 者:{0}'.format(mark_down(author[i])) + '\n' + '时 间:' + time2 + '\n' + '内容预览:[点击查看——{0}]({1})'.format(content_2, url_list) 154 | print(text) 155 | post('@locpush', text) 156 | else: 157 | pass 158 | else: 159 | pass 160 | 161 | 162 | # 获得内容 163 | def get_content_1(url): 164 | while True: 165 | try: 166 | headers = { 167 | 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'} 168 | requests.adapters.DEFAULT_RETRIES = 5 169 | s = requests.session() 170 | s.keep_alive = False 171 | result = 'L7DFW' in cookiestr 172 | if result: 173 | headers = {'Cookie': cookiestr, 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; ' 174 | 'Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome' 175 | '/46.0.2490.76 Mobile Safari/537.36'} 176 | r = s.get(url, headers=headers) 177 | else: 178 | r = s.get(url, headers=headers) 179 | xmlContent = etree.HTML(r.content) 180 | content = xmlContent.xpath('//div[@class="message"]/text()') 181 | return content[0].replace("\r\n", '').replace("\n", '').replace("\r", '').replace("\t", '').replace(" ", '')[0:80] 182 | 183 | except Exception as e: 184 | print("网络原因,无法访问,请稍后再试...") 185 | return "网络原因,无法访问,内容无法预览..." 
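# Note: the return statement above leaves get_content_1() immediately, so the time.sleep(5)
# below is never reached; if a retry delay is intended, it would have to run before returning.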
186 | time.sleep(5) 187 | 188 | 189 | hostloc_list = {"hello"} 190 | url_1 = "https://www.hostloc.com/" 191 | headers = { 192 | 'Accept-Encoding': 'gzip, deflate, br', 193 | 'Accept-Language': 'zh-CN,zh;q=0.9', 194 | 'Upgrade-Insecure-Requests': '1', 195 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 196 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 197 | 'Cache-Control': 'no-cache', 198 | 'Connection': 'keep-alive', 199 | } 200 | url_hostloc = "https://www.hostloc.com/forum.php?mod=forumdisplay&fid=45&filter=author&orderby=dateline" 201 | 202 | while True: 203 | try: 204 | # 网站要求js验证(无法预览网页内容) 205 | cookiestr = getcookies() 206 | print(cookiestr) 207 | print("1") 208 | url = 'https://www.hostloc.com/forum.php?mod=forumdisplay&fid=45&filter=author&orderby=dateline' 209 | headers = { 210 | 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'} 211 | requests.adapters.DEFAULT_RETRIES = 5 212 | s = requests.session() 213 | s.keep_alive = False 214 | result = 'L7DFW' in cookiestr 215 | if result: 216 | headers = {'Cookie': cookiestr, 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; ' 217 | 'Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome' 218 | '/46.0.2490.76 Mobile Safari/537.36'} 219 | r = s.get(url, headers=headers) 220 | else: 221 | r = s.get(url, headers=headers) 222 | master_1(r) 223 | # 多少秒抓取一次网站,自己设定,不要太小,会被ban ip的 224 | time.sleep(20) 225 | except Exception as e: 226 | try: 227 | # 网站不要求js验证 228 | print("2") 229 | headers = { 230 | 'Accept-Encoding': 'gzip, deflate, br', 231 | 'Accept-Language': 'zh-CN,zh;q=0.9', 232 | 'Upgrade-Insecure-Requests': '1', 233 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 234 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 235 | 'Cache-Control': 'no-cache', 236 | 'Connection': 'keep-alive', 237 | } 238 | url_hostloc = "https://www.hostloc.com/forum.php?mod=forumdisplay&fid=45&filter=author&orderby=dateline" 239 | r = requests.get(url_hostloc, headers=headers) 240 | master(r) 241 | time.sleep(20) 242 | except Exception: 243 | print("网络错误,请稍后重试") 244 | time.sleep(120) 245 | 246 | 247 | 248 | -------------------------------------------------------------------------------- /pythonScript/tianyi-zhuancun/README.md: -------------------------------------------------------------------------------- 1 | # 天翼云资源一键转存脚本 2 | ## 使用方法 3 | 填上cookie 和转存目标的文件夹ID即可一键转存 4 | ## 获取方法 5 | 登录天翼云网盘网页版,右键`检查`或者直接`F12`调出`Network`页面 6 | 7 | 1. cookie:
8 | 刷新一下网页点击`main.action`查看`Headers`下拉即可找到`Cookie`选项 9 | ![png](sec1.png) 10 | 2. 目标文件夹ID:
11 | 点击进入你要存储的文件夹,复制顶栏连接`folder/`的id即可 12 | ![png](sec2.png) -------------------------------------------------------------------------------- /pythonScript/tianyi-zhuancun/sec1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/tianyi-zhuancun/sec1.png -------------------------------------------------------------------------------- /pythonScript/tianyi-zhuancun/sec2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/tianyi-zhuancun/sec2.png -------------------------------------------------------------------------------- /pythonScript/tianyi-zhuancun/zhuancun.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import hjson 3 | import urllib.parse 4 | import json 5 | import time 6 | 7 | 8 | def _main(): 9 | h = { 10 | "Cookie": "COOKIE_LOGIN_USER=edrive_view_mode=icon; apm_ct=20200415102651680; apm_uid=29FAD3AAF7227DFC8D69DE214255C5A8; apm_ip=117.152.46.9; apm_ua=F49C41BE171437757C72FF333488A319; _ga=GA1.2.597408328.1587037854; offline_Pic_Showed=true; wpsGuideStatus=true; shareId_136723510=null; shareId_105944752=null; shareId_104180915=null; shareId_1601806=null; shareId_161893853=null; shareId_162635301=null; UM_distinctid=171a13870dd82-03b9857e7e8cf4-70103e47-144000-171a13870de3f0; Hm_lvt_79fae2027f43ca31186e567c6c8fe33e=1587547763; svid=65A0409DA903536E5B0B0EE956E32855; s_fid=439CADEA903B92DB-07A116C92EFCEFD3; lvid=c1238943c866cbbe5ba947ef92efd77e; nvid=1; trkId=98E63362-4356-43AB-8496-517CCB879FF2; Login_Hash=; JSESSIONID=aaai9_nnLa3NShiLkFIgx; COOKIE_LOGIN_USER=8BD018E2B01D662A8DB930FABCFF8864EB3D685B79BDD63EB1652544332B9AFA8E371FCCCC14B0CC5D5F295A51E32C2F7E8115828F136B87B087CE29; validCodeTimestamp=0ac32825-f7ed-41d5-8142-938ee1f8b26e; shareId_168824830=ef8z; shareId_155057311=null; shareId_168824365=null " 11 | } 12 | total = 1 13 | for pp in range(1,3): 14 | req = requests.get( 15 | 'https://cloud.189.cn/v2/listPublicShare.action?userId=330783715&mediaType=0&orderBy=filename&order=ASC&pageNum=%s&pageSize=545' % pp 16 | , headers=h) 17 | j = hjson.loads(req.content.decode()) 18 | for a in j['data']: 19 | print('%s/%s' % (total,1081)) 20 | id = a["fileId"] 21 | name = str(a["fileName"]) 22 | sid = a["shareId"] 23 | fo = a["isFolder"] 24 | t = [{"fileId": id, "fileName": name, "isFolder": 1 if fo else 0}] 25 | 26 | jdata = json.dumps(t, ensure_ascii=False).replace(' ','') 27 | 28 | data = '' 29 | data += 'type=SHARE_SAVE' 30 | data += '&taskInfos=' + str(urllib.parse.quote(jdata)) 31 | data += '&targetFolderId=8146417517567840' 32 | data += '&shareId=' + str(sid) 33 | 34 | ih = h 35 | ih['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8' 36 | resp = requests.post('https://cloud.189.cn/createBatchTask.action', headers=ih, data=data) 37 | print(name, resp.content.decode()) 38 | total +=1 39 | time.sleep(0.5) 40 | 41 | 42 | if __name__ == '__main__': 43 | _main() -------------------------------------------------------------------------------- /pythonScript/year_code/code_dir/readme.md: -------------------------------------------------------------------------------- 1 | # 存放代码文件或者代码文件夹 -------------------------------------------------------------------------------- /pythonScript/year_code/readme.md: 
-------------------------------------------------------------------------------- 1 | # 2019年第一个开源小项目:玩转统计代码量 2 | 3 | ## 0.导语 4 | 5 | 前两天写了个统计自2018年9月撰写本公众号以来写的部分python代码量项目。主要功能及使用见下文,项目已经开源,点击阅读原文即可进入项目仓库。 6 | 7 | 再来谈一下知识星球,有关知识星球,分享了很多内容,像这次小项目就是在星球内部提出的,3日一个小项目学习,共同交流,除此之外还有每日立了个flag,每日分享干货! 8 | 下面一起来看2019年第一个开源项目:玩转统计代码量! 9 | 10 | **希望可以顺手star与fork,谢谢!** 11 | 12 | 个人公众号: 13 | 14 | ![](https://github.com/Light-City/images/blob/master/wechat.jpg?raw=true) 15 | 16 | ## 1.项目功能 17 | 18 | - **实现代码统计** 19 | 20 | - [x] 代码来源文件 21 | - [x] 总代码量 22 | - [x] 总注释量 23 | - [x] 总空行量 24 | - [x] 实际代码量 25 | - [x] 实际代码比率 26 | - [x] 总注释比率 27 | - [x] 总空行比率 28 | 29 | - **csv数据存储** 30 | 31 | - [x] csv模块进行数据存储 32 | 33 | - **美化输出结果** 34 | 35 | - [x] prettytable模块美化输出 36 | - [x] colorama模块颜色配置 37 | 38 | - **csv数据统计分析** 39 | 40 | - [x] pandas模块读取csv 41 | - [x] pandas模块统计与描述 42 | 43 | ## 2.你会学到 44 | 45 | - [x] python基础 46 | - [x] 面向对象方法 47 | - [x] os模块 48 | - [x] pandas模块 49 | - [x] csv模块 50 | - [x] prettytable模块 51 | - [x] colorama模块 52 | 53 | ## 3.如何使用 54 | 55 | - **下载** 56 | 57 | ```python 58 | git clone git@github.com:Light-City/year_code.git 59 | ``` 60 | 61 | - **使用** 62 | 63 | 将代码文件与文件夹放到code_dir,或者修改`static.py`文件里的 64 | 65 | ```python 66 | dir = './code_dir' # 你的代码文件夹或者代码文件 67 | ``` 68 | 69 | - **运行** 70 | 71 | 运行`statistic.py`文件,然后会打印输出下面结果,并得到原统计数据data.csv以及排序结果数据sort_data.csv。 72 | 73 | - **定制** 74 | 75 | ```python 76 | def codeSort(self,c_name='实际代码量') # 默认为实际代码量排序 77 | ``` 78 | 79 | 使用codeSort函数的时候,可以根据自己的需求来排序,比如可以按照以下参数配置: 80 | 81 | codeSort('总代码量')。 82 | 83 | 可填入:(下面字符串中选择即可) 84 | 85 | ``` 86 | '文件', '总代码量', '总注释量', '总空行量', '实际代码量', '实际代码比率', '总注释比率', '总空行比率' 87 | ``` 88 | 89 | ## 4.结果展示 90 | - 美化输出结果 91 | 92 | ![](./show_res/py_output.jpg) 93 | 94 | - 数据存储结果 95 | 96 | ![](./show_res/data_csv.jpg) 97 | 98 | - 排序存储结果 99 | 100 | 实际代码量排序结果 101 | 102 | ![](./show_res/sort_csv.jpg) 103 | 104 | - 简单统计结果输出 105 | 106 | ![](show_res/py_statistic.jpg) 107 | 108 | ## 5.关于项目与星球 109 | 在星球中会做更加详细的代码讲解,如果有问题,星球内部提问会优先回答。 110 | 111 | **拒绝伸手党,但我同时拥抱开源,多点留言,多点点赞,多点分享,多点转载,多点赞赏,将是我支持原创的动力!** 112 | 113 | 最后,关于加入星球,需要说几点: 114 | 在星球中,我将用自己的业余时间同你们共同分享交流,我们可以做: 115 | - [x] 更多本文这种小项目 116 | - [x] 组织参加更多比赛 117 | - [x] 共同探讨论文 118 | - [x] 共同研究技术点 119 | - [x] 每日每人分享互动 120 | - [x] 提升坚持与学习的能力! 
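For reference, a minimal driver sketch (it mirrors the lines already at the bottom of `statistic.py`; the directory and the sort column here are only examples and can be changed):

```python
dir = './code_dir'          # 你的代码文件夹或者代码文件
fa = FileAnalysis()
fa.fileAnalysis(dir)        # walk the directory and count lines per .py file
fa.output()                 # pretty-printed summary table (prettytable + colorama)
fa.codeSort('总注释量')      # sort and export sort_data.csv by any column listed above
```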
121 | -------------------------------------------------------------------------------- /pythonScript/year_code/show_res/data_csv.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/year_code/show_res/data_csv.jpg -------------------------------------------------------------------------------- /pythonScript/year_code/show_res/py_output.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/year_code/show_res/py_output.jpg -------------------------------------------------------------------------------- /pythonScript/year_code/show_res/py_statistic.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/year_code/show_res/py_statistic.jpg -------------------------------------------------------------------------------- /pythonScript/year_code/show_res/sort_csv.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/pythonScript/year_code/show_res/sort_csv.jpg -------------------------------------------------------------------------------- /pythonScript/year_code/statistic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import prettytable as pt 3 | from colorama import Fore,Style 4 | import pandas as pd 5 | import csv 6 | class FileAnalysis: 7 | def __init__(self): 8 | self.TLine=0 # 总代码行 9 | self.CLine=0 # 总命令行 10 | self.BLine=0 # 总空行 11 | # 美化打印存储到list中 12 | self.file_list=[] # 文件名list 13 | self.total_list=[] # 每个文件总代码list 14 | self.comment_list=[] # 每个文件总注释list 15 | self.blank_list=[] # 每个文件总空行list 16 | self.actual_list=[] # 每个文件实际代码量list 17 | self.actual_rate=[] # 每个文件实际代码比率list 18 | self.comment_rate=[] # 每个文件实际注释比率list 19 | self.black_rate=[] # 每个文件空行比率list 20 | self.isOne=True # 是否第一次写入csv 21 | def coutLines(self,file): 22 | comment_line = 0 23 | blank_line = 0 24 | with open(file, encoding='utf-8', errors='ignore') as f: 25 | # 返回每一个列表,包含每一行代码 26 | lines = f.readlines() 27 | # 总行数 28 | total_line = len(lines) 29 | # 遍历每一行 30 | for i in range(total_line): 31 | line = lines[i] 32 | # 检查是否为注释 33 | if line.startswith("#"): 34 | comment_line += 1 35 | elif line.strip().startswith("'''") or line.strip().startswith('"""'): 36 | comment_line += 1 37 | if line.count('"""') == 1 or line.count("'''") == 1: 38 | while True: 39 | line = lines[i] 40 | comment_line += 1 41 | i+=1 42 | if ("'''" in line) or ('"""' in line): 43 | break 44 | # 检查是否为空行 45 | elif line == '\n': 46 | blank_line += 1 47 | # 输出每个文件结果 48 | print("在%s中:" % file) 49 | print("代码行数:", total_line) 50 | print("注释行数:", comment_line) 51 | print("空行数:", blank_line) 52 | actual_line=total_line - comment_line - blank_line 53 | print("实际总行数:",actual_line) 54 | # 实际代码比率 55 | actual_ra=0 56 | # 注释比率 57 | comment_ra=0 58 | # 空行比率 59 | black_ra=0 60 | try: 61 | actual_ra=actual_line/total_line 62 | print("实际总行数占比率:",actual_ra) 63 | except Exception as e: 64 | print("实际总行数占比率:", 0) 65 | try: 66 | comment_ra=comment_line/total_line 67 | print("注释行数占比率:",comment_ra) 68 | except Exception as e: 69 | print("注释行数占比率:", 0) 70 | try: 71 | black_ra=blank_line/total_line 72 | 
print("空行数占比率:",black_ra) 73 | except Exception as e: 74 | print("空行数占比率:", 0) 75 | # 往list中添加数据 76 | self.actual_list.append(actual_line) 77 | # 格式化添加输出比率百分百 78 | self.actual_rate.append(format(actual_ra,'0.1%')) 79 | self.comment_rate.append(format(comment_ra,'0.1%')) 80 | self.black_rate.append(format(black_ra,'0.1%')) 81 | # 取xx.py 82 | self.file_list.append(Fore.GREEN+file.split('\\')[-1]) 83 | self.total_list.append(total_line) 84 | self.comment_list.append(comment_line) 85 | self.blank_list.append(blank_line) 86 | 87 | # 存储csv数据格式化 88 | # list添加多个数据 89 | data_list = [file.split('\\')[-1],total_line,comment_line,blank_line,actual_line,actual_ra,comment_ra,black_ra] 90 | if self.isOne: 91 | # 存储head 92 | self.saveCSV(data_list,self.isOne) 93 | self.isOne=False 94 | # 存储 95 | self.saveCSV(data_list) 96 | return total_line, comment_line, blank_line 97 | def fileAnalysis(self,dir): 98 | # 列出目录下的所有文件和目录 99 | list_files = os.listdir(dir) 100 | for file in list_files: 101 | filepath = os.path.join(dir, file) 102 | # 目录:递归遍历子目录 103 | if os.path.isdir(filepath): 104 | self.fileAnalysis(filepath) 105 | # 文件:直接统计行数 106 | elif os.path: 107 | if os.path.splitext(file)[1] == '.py': 108 | total_line, comment_line, blank_line=self.coutLines(filepath) 109 | self.TLine+=total_line 110 | self.CLine+=comment_line 111 | self.BLine+=blank_line 112 | 113 | # 输出打印 114 | def output(self): 115 | # 添加总统计 116 | self.file_list.insert(0,Fore.LIGHTRED_EX+'总统计结果'+Fore.RESET) 117 | self.total_list.insert(0,Fore.LIGHTRED_EX + str(self.TLine) + Fore.RESET) 118 | self.comment_list.insert(0,Fore.LIGHTRED_EX + str(self.CLine) + Fore.RESET) 119 | self.blank_list.insert(0,Fore.LIGHTRED_EX + str(self.BLine) + Fore.RESET) 120 | actual_line = self.TLine-self.CLine-self.BLine 121 | self.actual_list.insert(0,Fore.LIGHTRED_EX + str(actual_line) + Fore.RESET) 122 | self.actual_rate.insert(0,Fore.LIGHTRED_EX +str(format((self.TLine-self.CLine-self.BLine)/self.TLine,'0.1%'))+Fore.RESET) 123 | self.comment_rate.insert(0,Fore.LIGHTRED_EX+str(format(self.CLine/self.TLine,'0.1%'))+Fore.RESET) 124 | self.black_rate.insert(0,Fore.LIGHTRED_EX+str(format(self.BLine/self.TLine,'0.1%'))+Fore.RESET) 125 | 126 | # 美化打印输出 127 | tb = pt.PrettyTable() 128 | tb.add_column(Fore.LIGHTMAGENTA_EX+"文件"+Fore.RESET,self.file_list) 129 | tb.add_column(Fore.LIGHTMAGENTA_EX+'总代码量'+Fore.RESET,self.total_list) 130 | tb.add_column(Fore.LIGHTMAGENTA_EX+'总注释量'+Fore.RESET,self.comment_list) 131 | tb.add_column(Fore.LIGHTMAGENTA_EX+'总空行量'+Fore.RESET,self.blank_list) 132 | tb.add_column(Fore.LIGHTMAGENTA_EX+'实际代码量'+Fore.RESET,self.actual_list) 133 | tb.add_column(Fore.LIGHTMAGENTA_EX+'实际代码比率'+Fore.RESET,self.actual_rate) 134 | tb.add_column(Fore.LIGHTMAGENTA_EX+'总注释比率'+Fore.RESET,self.comment_rate) 135 | tb.add_column(Fore.LIGHTMAGENTA_EX+'总空行比率'+Fore.RESET,self.black_rate) 136 | print(Fore.RED+"-----------------------------------------------光城18年9月份以后部分python代码统计结果-----------------------------------------------") 137 | print(Style.RESET_ALL) 138 | print(tb) 139 | print(Style.RESET_ALL) 140 | def saveCSV(self, data_list, isOne=False): 141 | # newline=''防止写入留空行问题 142 | # 追加写入 143 | with open("data.csv", "a+", encoding='utf_8_sig',newline='') as cf: 144 | writer = csv.writer(cf) 145 | # 如果是第一次写入,就写head,后面就正常写入 146 | if isOne: 147 | data_list = ['文件', '总代码量', '总注释量', '总空行量', '实际代码量', '实际代码比率', '总注释比率', '总空行比率'] 148 | writer.writerow(data_list) 149 | # 排序 150 | def codeSort(self,c_name='实际代码量'): 151 | df = pd.DataFrame(pd.read_csv('./data.csv',encoding='utf_8_sig')) 
152 | # print(df) 153 | # lc.sort(["loan_amnt"], ascending=True).head(10) 154 | print(df.sort_values(c_name,ascending=False,inplace=True)) 155 | print(df.head(10)) 156 | print(df.describe()) 157 | print(df.sum()) 158 | df.to_csv('./sort_data.csv',encoding='utf_8_sig',index=False) 159 | 160 | dir = './code_dir' 161 | fa = FileAnalysis() 162 | fa.fileAnalysis(dir) 163 | print(fa.TLine) 164 | print(fa.CLine) 165 | print(fa.BLine) 166 | fa.output() 167 | fa.codeSort('总代码量') 168 | -------------------------------------------------------------------------------- /scrapy/2019-nCov-cn/city.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | import requests 5 | 6 | 7 | def getHTMLText(url): 8 | try: 9 | headers = { 10 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " 11 | "Chrome/80.0.3987.163 Safari/537.36"} 12 | r = requests.get(url, timeout=30, headers=headers) 13 | r.raise_for_status() 14 | r.encoding = r.apparent_encoding 15 | data = re.search("\(+([^)]*)\)+", r.text).group(1) 16 | return data 17 | except: 18 | return "" 19 | 20 | 21 | def getYqDate(lst, YqURL): 22 | html = getHTMLText(YqURL) 23 | hjson = json.loads(html) 24 | a = hjson['data']['list'] 25 | for i in a: 26 | if i['ename'] == "fujian": 27 | city = i['city'] 28 | for j in city: 29 | name = j['name'] # 城市名称 30 | value = j['conNum'] # 累计确诊 31 | econNum = j['econNum'] # 现存确诊 32 | conadd = j['conadd'] # 今日确诊 33 | deathNum = j['deathNum'] # 累计死亡人数 34 | cureNum = j['cureNum'] # 累计治愈 35 | zerodays = j['zerodays'] # 零增长天数 36 | single_data = [name, value, econNum, conadd, deathNum, cureNum, zerodays] 37 | lst.append(single_data) 38 | break 39 | else: 40 | continue 41 | 42 | 43 | def writeResult(lst, fpath): 44 | with open(fpath, 'a+', encoding='utf-8') as f: 45 | f.write('地区\t累计确诊\t现存确诊\t今日确诊\t累计死亡人数\t累计治愈\t零增长天数\n') 46 | for i in range(len(lst)): 47 | for j in range(len(lst[i])): 48 | f.write(str(lst[i][j])) 49 | f.write('\t') 50 | f.write('\n') 51 | lst.clear() 52 | f.close() 53 | 54 | 55 | if __name__ == '__main__': 56 | pagenum = 1 57 | output_file = 'D:/Personal/Desktop/fjyq.xls' 58 | final_data = [] 59 | url = "https://gwpre.sina.cn/interface/fymap2020_data.json?_=1588258367647&callback=dataAPIData" 60 | getYqDate(final_data, url) 61 | writeResult(final_data, output_file) -------------------------------------------------------------------------------- /scrapy/2019-nCov-cn/province.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import requests 4 | 5 | 6 | def getHTMLText(url): 7 | try: 8 | headers = { 9 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " 10 | "Chrome/80.0.3987.163 Safari/537.36"} 11 | r = requests.get(url, timeout=30, headers=headers) 12 | r.raise_for_status() 13 | r.encoding = r.apparent_encoding 14 | data = re.search("\(+([^)]*)\)+", r.text).group(1) 15 | return data 16 | except: 17 | return "" 18 | 19 | 20 | def getYqDate(lst, YqURL): 21 | html = getHTMLText(YqURL) 22 | hjson = json.loads(html) 23 | a = hjson['data']['list'] 24 | for i in a: 25 | try: 26 | name = i['name'] # 省份 27 | value = i['value'] # 累计确诊 28 | econNum = i['econNum'] # 现存确诊 29 | conadd = i['conadd'] # 今日确诊 30 | deathNum = i['deathNum'] # 累计死亡人数 31 | cureNum = i['cureNum'] # 累计治愈 32 | zerodays = i['zerodays'] # 零增长天数 33 | jwsrNum = i['jwsrNum'] # 境外输入总数 34 | single_data = [name, value, econNum, conadd, deathNum, cureNum, 
zerodays, jwsrNum] 35 | lst.append(single_data) 36 | except: 37 | continue 38 | 39 | 40 | def writeResult(lst, fpath): 41 | with open(fpath, 'a+', encoding='utf-8') as f: 42 | f.write('省份\t累计确诊\t现存确诊\t今日确诊\t累计死亡人数\t累计治愈\t零增长天数\t境外输入总数\n') 43 | for i in range(len(lst)): 44 | for j in range(len(lst[i])): 45 | f.write(str(lst[i][j])) 46 | f.write('\t') 47 | f.write('\n') 48 | lst.clear() 49 | f.close() 50 | 51 | 52 | if __name__ == '__main__': 53 | pagenum = 1 54 | output_file = 'D:/Personal/Desktop/yq.xls' 55 | final_data = [] 56 | url = "https://gwpre.sina.cn/interface/fymap2020_data.json?_=1588258367647&callback=dataAPIData" 57 | getYqDate(final_data, url) 58 | writeResult(final_data, output_file) -------------------------------------------------------------------------------- /scrapy/51job-scrapy/2020-09-25_python算法工程师岗位招聘信息.csv: -------------------------------------------------------------------------------- 1 | Python算法工程师,1-1.5万/月,厦门,1年经验,本科,招若干人,09-25发布,,,岗位职责:1、负责睡眠分期相关数据智能算法的研发和改进2、负责gsensor六轴传感器数据的分析3、有参与过手环数据分析相关项目的优先。能力要求:1、熟练掌握Pathon、Java、C++等至少一门编程语言;2、熟悉Tensorflow、Pytorch、Keras等至少一个深度学习开发框架;3、熟悉经典数据挖掘方法,数据分析方法;4、具备良好的沟通表达能力、严谨的工作态度及较强的执行力。任职要求:1、学历:本科(含)以上学历,多年工作经验优先;应届毕业生学历为硕士;2、专业:计算机技术、智能技术、软件工程等相关专业毕业。Python算法工程师,上班地址:厦门,厦门中翎易优创科技有限公司,民营公司,,计算机软件,厦门中翎易优创科技有限公司诚聘 2 | Python算法工程师,1.5-2万/月,杭州-江干区,2年经验,本科,招若干人,09-25发布,,"五险一金,员工旅游,餐饮补贴,专业培训,绩效奖金,年终奖金,定期体检",岗位要求:1.参与基于摄像机传输的图片和视频流,结合业务场景的图像分类,进行多种行业安防类算法、模型研发及优化。2.针对项目方向进行算法调研、评估,研发相应算法并进行优化,对现有算法进行优化。3.进行深度学习算法的应用型研发,特别是在计算机视觉领域的应用研究,以及模型加速、模型调优、模型量化等研发任职要求:1.计算机相关专业;2.具备良好的动手能力,熟练掌握C/C++、Python等语言。3.掌握一种以上深度学习框架,掌握OpenCV库的使用。4.掌握计算机视觉基础知识、深度学习、经典机器学习等,有一定的实践经验。5.具备一定科研能力,能快速理解paper,具备算法创新能力着优先。6.具备对现实问题抽象出数学模型的逻辑分析能力,并能够求解数学问题。7.具备良好的职业素养、优良的品行、善于团队合作、有严谨的工作作风。,上班地址:杭州经济技术开发区海达南路555号金沙大厦C幢11层,浙江正洁环境科技有限公司,民营公司,150-500人,环保,    浙江正洁环境科技有限公司是一家专业提供工业废水和城市污水处理运营服务的国家高新技术企业。正洁环境作为一家知名的环保水处理第三方综合运营服务商,聚焦食品、印染等行业的源头污水处理,以提升客户自身生产专注度为目标,向客户提供包括工艺设计、设备采购与集成、工程实施、后续运维等一体化专业服务。正洁环境将努力打造出一个全生态链的水处理技术与资源再生集成工艺,循环资源,为改善人类的生态居住环境作出贡献!    
正洁环境先后获得了杭州市“雏鹰计划”培育企业、杭州市高新技术企业、国家高新技术企业、杭州市企业高新技术研发中心等荣誉称号,2016年,获第五届中国创新创业大赛节能环保行业优胜奖,浙江省排名第二。    正洁环境目前已有员工近200人,其中学士及以上学位人员占比70%,具有近90%的专业相关度。企业创新能力卓越,具有全国最完善的自有药剂研发、生产和应用体系以及最具应用能力的自有设备研发创新体系。正洁环境拥有的水处理特种药剂和水处理特种设备,在水处理技术上已覆盖了食品、印染、电镀、农村生活污水、实验室废液等多个行业,其中多项技术填补了目前国内环保行业的空白。    公司现已申请专利22项,申请保护16项软件著作权,先后获得建设部环保工程专项资质、环境污染治理设施运行服务资质、环境污染治理工程总承包资质、环境污染治理工程设计资质,荣获浙江省工商企业AA级守合同重信用单位,先后取得ISO9001质量管理体系、ISO14001环境管理体系、HHSAS18001职业健康安全管理体系、GB/T50430工程建设施工企业质量管理规范。 3 | python算法工程师,1.2-2万/月,西安-国家民用航天产业基地,3-4年经验,本科,招1人,09-25发布,,"五险一金,补充医疗保险,通讯补贴,定期体检",岗位职责: 1、参与项目需求分析,业务逻辑流程与设计,文档编写; 2、能独立解决实际开发过程碰到的各类问题; 3、负责数据分析云平台和数据存储系统的搭建; 4、搭建系统开发环境,完成系统框架和核心代码的实现,负责解决开发过程中的技术问题; 任职要求: 一、教育水平:本科以上学历,计算机、应用数学或模式识别相关专业;二、工作资历:2年及以上工作经验;三、专业技能:1、有机器学习或推荐系统研究背景,有深度学习相关科研经历。2、对电商风控、广告预测、推荐系统等任何一个领域有实践经验;3、扎实的计算机基础,熟悉Python或C/C++,有较好的算法实现能力。3、精通使用一种开源的深度学习框架解决实际项目问题,如caffe、tensorflow、Mxnet、pytorch;4、精通SQL开发,精通Mysql、Oracle等关系型数据库中的一种;四、其他要求:1、较强的逻辑思维能力、学习能力;2、良好的英文听说读写能力;,上班地址:飞天路588号北航科技园,上海今日信息科技有限公司,民营公司,50-150人,计算机软件,上海今日信息科技有限公司成立于2014年8月1日成立,注册资金1000万元,是其上级今日集团为其新业务方向而成立的以IT增值业务为核心的IT服务机构。今日信息专注于IT咨询、IT基础构架建设、信息系统集成与虚拟化云计算、行业应用系统开发、IT服务运维等业务。今日信息与Oracle、CISCO、EMC、HP、Vmware、IBM、Microsoft、IMPERVA、APC 等多家国际知名厂商开展多方位的合作。主要增值业务:        基于Oracle IDM身份管理套件,结合国内企业单位信息系统的使用和管理现状,实现企业单位的身份管理、访问控制管理、用户的全生命周期管理等,为企业单位信息系统安全集成、风险管理控制等方面提供咨询与解决方案        基于全球领先的Oracle Cloud平台,我们提供企业级应用系统解决方案的咨询、项目实施、软件开发外包、IT领域的技术研发外包以及人力资源服务。 今日信息为客户提供端到端业务解决方案和服务,涵盖企业资源管理,人力资本管理、商务智能、客户关系管理、供应链管理等领域。        Oracle Primavera系列软件专注于项目密集型企业,其整个项目生命周期内所有项目的组合管理。 4 | python算法工程师,1-1.5万/月,上海-浦东新区,3-4年经验,大专,招若干人,09-25发布,,"五险一金,员工旅游,定期体检,年终奖金,免费班车,专业培训,带薪年假,过节福利,人才推荐奖",1.本科3年以上工作经验,近1年从事python 算法实现开发。2.python语言基础扎实,清晰掌握python中核心库包,能够独立设计并实现正则式,掌握多种代码技巧,熟悉形式语言,熟练使用jupyter、pycharm等python工作环境。3.掌握分布式原理,能够解读从逻辑设计,到框架机制、到代码实现,再到物理支撑的全链路流转。4.数据结构、算法理论基础扎实,掌握python主流数据处理库包,如pandas、numpy,能够独立完成代码调优,发现已有代码问题并制定修改方案。5.有较好的软件工程理论实践经验,熟悉面向对象分析设计、面向过程分析设计、面向接口分析设计中的一种,能够将分析设计表达为UML相应图表,掌握数据流分析,熟悉设计模式。6.熟悉大数据,MySQL数据库,SQL基础扎实。7.具有至少1个中大型python工程研发实现经验(6个月以上研发周期,5人以上python研发团队)。8.有Java、C++研发经验者优先。,上班地址:上海市浦东新区乐昌路399号上海期货所。,上海速强信息技术股份有限公司,民营公司,150-500人,计算机软件,        上海速强信息技术股份有限公司成立于2005年,是一家以全新软件开发理念为主导,致力于建立软件生产新模式的新兴IT公司。        目前公司的主要客户涵盖金融保险、电子商务等行业。专业的研发团队        公司核心技术、管理团队是由具有多年IT行业背景和管理经验的职业经理人组成,具有多年行业应用实施经验。凭借自身多年的软件实施经验,速强信息已形成完善的技术应用体系,包括:Java、. Net等主流开发平台、基于SQL Server、ORACLE、MYSQL等数据库平台,UML面向对象的设计等。        有形的专业技术,无限的创新空间,秉承“真诚、合作、创新”的理念,不断吸引IT人才加盟,创造工作的乐趣。快速的服务响应        客户的需求和满意是速强服务评价的***标准,我们是客户需求的倾听者,是问题的诊断者,也是问题的解决者。作为一家以客户至上为宗旨的软件企业,速强信息对客户的需求做出最及时、最快速的反应,公司对客户承诺48小时内的到场服务;质量至上,多层面系列化的产品服务,建立以客户需求为中心,以市场需求为导向的团队服务体系。        我们的服务口号:只有客户的成功、才是我们的成功!和谐的人文环境        速强信息顺应网络社会、知识经济、人性文化、数字生存的时代潮流,满足人类追求成功、渴求友谊、享受自由、勇于探索、彰显个性的内在需求,以灵活的软件产品和人性的服务,使客户实现工作上的‘智能化、网络化、个性化’;同时,速强更注重人才对软件发展的作用,秉承‘以人为本’的思想,为员工营造和谐、愉快的工作环境,努力为软件产业培养人才。                我们不是最成功的企业,但珍惜与您的每次合作;我们不是***的公司,但一定是您最忠实的倾听者! 5 | Python算法工程师(高级工程师),1-1.5万/月,昆山,3-4年经验,本科,招1人,09-24发布,,"五险一金,员工旅游,出国机会,专业培训,股票期权,包住宿,年终奖金,餐饮补贴","岗位职责:1.参与产品需求、研发设计的相关讨论;2.负责数据产品业务数据的挖掘和分析工作;3.负责机器视觉产品算法研究工作;任职要求:1、具有python开发数据处理软件/机器视觉软件的经验;2、精通Python,掌握opencv, numpy,scipy,matplotlib,pandas等图像和数据处理方面常用的第三方python库;3、熟悉至少一种Sql数据库 (mysql/postgresql/sqlserver/oracle);4. 
掌握Tensorflow/Pytorch一种或多种深度学习框架;4、热爱编程、具有良好的代码风格;5、做事具有条理性,具有良好的自学能力、分析问题以及解决问题的能力。",上班地址:昆山开发区春旭路18号(联彩商务中心)2201、2202室,昆山润石智能科技有限公司,外资(非欧美),少于50人,"电子技术/半导体/集成电路,计算机软件",昆山润石智能科技坐落于昆山经济开发区,为泛半导体行业客户提供智能制造一站式解决方案。公司集智能制造需求硬件&软件的研发、销售、服务于一体。团队成员多为两岸半导体与面板产业人才,具备高端智能制造、制程研发、整合及半导体设备知识,在研发、管理、营销方面经验丰富。公司在智能制造方案中产品线齐全,将IoT、Bigdata、AI三环节打通形成闭环,主要产品有:数据采集与边缘计算系列产品(IOT)、工业流程自动化虚拟机器人(RPA)、智能缺陷自动分类系统(AiDC),为高端制造业客户提升生产良率及设备稼动率,提高生产力,助力客户达成智能制造迈向工业4.0。公司希望通过三到五年的努力,将国外昂贵的智能制造方案国产化,为中国的制造业升级贡献一份力量。 6 | Python算法工程师,1.2-2万/月,天津-西青区,2年经验,本科,招2人,09-23发布,,"五险一金,餐饮补贴,通讯补贴,专业培训,绩效奖金,年终奖金,员工旅游,节假日福利,带薪年假,员工聚餐",岗位职责:1、开展数据分析、机器学习算法的研发与应用;2、参与公司相关软件产品的研发;3、参与软件产品相关文档的撰写。岗位要求:1、计算机科学与技术等相关专业本科及以上学历;2、2年以上Python程序开发经验;3、有较强的算法分析和实现能力;4、善于分析和解决问题,学习能力强,良好的团队合作精神。优先考虑:1、精通机器学习及数据处理工具(pandas、numpy、sklearn等);2、熟悉Linux环境下编程,了解docker、git等工具。,上班地址:华苑产业区兰苑路5号留学生创业园D座1002,深析智能科技有限公司,民营公司,50-150人,"计算机软件,医疗设备/器械",深析智能科技有限公司(DeepCyto)成立于2018年,是一家将人工智能与深度学习技术引入智能医学诊断和医学数据分析领域的医疗科技公司。深析人工智能基于真实、准确、海量的临床数据,融合机器视觉、深度学习及大数据挖掘技术,对血液病理检测的细胞形态学、流式细胞学、细胞遗传学、分子生物学等的数据进行智能识别和分析,为广大医疗机构以及第三方医学实验室提供定制MICM全流程AI辅助诊断产品和技术服务。深析核心团队包含国际人工智能专家和国内血液病理学专家,拥有技术研发、血液病理诊断、临床诊疗和医学管理等复合型人才团队。目前,深析已经获得***VC(软银,元生)天使轮投资。深析的AI系统在血液病理人工智能诊断领域,尤其在***细胞的自动分割和分类计数、流式细胞学的自动设门和分群定量等方面已取得令人瞩目的成果。目前,深析智能已经联合中国医学科学院血液病医院、北京大学***医院、上海长海医院、广州中山一附院等***三甲医院,开展了多中心临床验证试验。经临床数据比对,流式细胞学的Deepflow软件对急性白血病的诊断准确率高达95%,比人工诊断速度提高约100倍, CellCounter***细胞形态学AI自动扫描分析形态的准确率达到97%,诊断速度提高约10倍。未来,深析将进一步开发染色体核型智能分析等产品,实现血液病理的智能化综合诊断,提高诊断的准确和效率,促进优质医疗资源下沉,以AI技术重塑中国未来医疗体系。 7 | python算法工程师,5-9千/月,无锡-无锡新区,1年经验,本科,招1人,09-23发布,,"五险一金,补充医疗保险",1)参与项目设计相关工作、程序的开发工作;2)根据公司开发规范及流程,能独立完成子功能的设计、编码、测试及相关文档编写;3)代码编写;4)完成单元测试,并配合进行其他测试和交付工作;5)部门新技术调研和实践。任职资格及素质要求:1、熟悉Python编程、Django或Tornado等web框架;2、熟悉SpringBoot、MongoDB、Linux等技术,掌握面向对象编程思想;3、 有图像处理、人工智能、自然语音处理经历优先;4、熟悉devops思想并有丰富经验者优先;5、自学能力强,愿意接收新技术。,上班地址:无锡市新吴区菱湖大道200号中国传感网国际创新园E1栋,隆正信息科技有限公司,民营公司,150-500人,"计算机软件,计算机服务(系统、数据服务、维修)",隆正互联是一家专门面向大型商业用户及行业云平台(to big B/I)的专业技术公司。公司的基本目标是:面向金融、通讯、能源、交通等行业,定位全国前100家大型行业客户(如工行、中行、国寿、联通、移动、中航信等),通过云平台支持的超级软件工厂,实现高端软件产品的集约化设计、制造、及交付,引进包括人工智能在内的先进技术,从根本上提高国产软件制作水平,打造高端软件制造生态链。 公司的愿景是: 1、 建立知识驱动的、AI导向的、深度共享协同的专业软件交付云平台。 2、 依托云平台支撑的超级软件工厂及渠道、交付、运维集群,实现行业商用软件及服务的一体化支撑与专业交付。 3、 面向重点行业,建立集约化、标准化、一体化的商用软件生态。 公司的总注册资金一亿元,各功能集群及商务中心在北京,超级软件工厂设在无锡,计划占地500亩以上,园区规划建筑积面四十万平米,能够容纳3万名软件从业人员。 如果您和我们拥有共同的理想,希望成为隆正互联的一员,那么您还在等什么?!快快发送您的简历,与我们共创未来! 
8 | 高级Python算法工程师,2-2.5万/月,上海-静安区,3-4年经验,本科,招2人,09-18发布,,"五险一金,专业培训,弹性工作,年终奖金,绩效奖金",职位信息1、负责自然语言处理,文本挖掘,机器学习等工作;2、从事数学建模及数据挖掘应用方法研究;3、根据业务需求,研究开发相关算法模型岗位要求1、重点全日制本科及以上学历,计算机或数学相关专业;2、4~5年以上相关工作经验,熟悉Python编程语言,具备Python算法设计与程序开发能力;3、熟悉数据结构,具备清晰的编程思路、良好的编码风格,能胜任复杂模块的编码;4、对NLP有较深入的研究;5、熟悉深度学习工具Tensoerflow/Caffe/Torch者优先;6、良好的沟通能力,学习能力。7、必须在(文本分析,语义分析,语义计算)有一个或多个作品。8、具备良好的沟通协调能力和团队合作意识,工作踏实,态度积极,能够承受工作压力;,上班地址:江场路1313号金谷中环大厦6楼,亿翰智库,上市公司,150-500人,"专业服务(咨询、人力资源、财会),房地产",     公司简介亿翰股份(837350)于2016年5月成为首支登陆资本市场、专注中国房地产战略服务的企业。亿翰时刻关注中国房地产企业发展动态,深刻洞察行业发展趋势与先机,业务范围横跨地产、科技和资本三大领域;为超过70%的百强房企提供系统化的战略顾问服务;公司根植上海,雄踞长三角辐射全中国,现北京、成都、武汉和深圳已设有四家分支机构。亿翰股份每月发布发的《中国典型房企销售业绩排行榜》曝光量超千万,深受行业及资本界关注;有意者请将简历投递至邮箱:yihanzhiku@ehconsulting.com.cn公司人事部电话:021-61552731公司地址:上海市静安区江场路1313号金谷中环大厦6楼公司官网:www.ehconsulting.com.cn集团公众号:ehresearch简历发送格式:姓名+学校+专业+意向职位+信息来源+工作地点如果你足够的自信、优秀,积极上进。把你的简历投进亿翰股份,亿翰股份欢迎你的加入!北京公司地址:北京市朝阳区京信大厦2141室深圳公司地址:深圳市南山区粤兴六路中科纳能大厦409武汉公司地址:武汉市武昌区万达汉街总部国际B座2605成都公司地址:成都市武侯区成都蜀锦路88号“楚峰国际中心” 楼 903 9 | python算法工程师,0.8-1.2万/月,福州-鼓楼区,2年经验,大专,招3人,08-28发布,,"餐饮补贴,交通补贴,年终奖金,五险","任职要求:计算机相关专业本科以上学历,技术能力优秀者可放宽要求 精通Python 语言,熟悉Linux系统,熟悉基于Python的Web开发技术,熟悉Flask、Django、Tornado等常见架构至少一种;有志从事自动化与智能化开发,对语言学习有热情拥有较好的学习能力、良好的代码习惯和团队合作能力,善于沟通,逻辑清晰,能独立解决问题具备图像识别或机器学习相关知识者,有从事过人工智能(Python)上的相关人士优先岗位职责:1、根据开发进度和任务分配按时按量完成相应模块软件的设计、开发;2、负责现有程序的维护、更新与问题排除",上班地址:软件大道89号福州软件园B区4号楼,福建睿思特科技股份有限公司,民营公司,50-150人,"计算机软件,计算机服务(系统、数据服务、维修)",福建睿思特科技股份有限公司,总部位于福州软件园,专业从事智慧城市生态产业链、城市智能化整体解决方案的服务商。睿思特坚持以自主创新和行业应用为基础,以硬件、软件、服务三位一体构筑核心竞争力,为客户提供优质的产品和服务。睿思特依托物联网、云计算、大数据、人工智能、区块链等新一代信息技术,为环保、水利、交通、城管、电力等智慧城市领域提供软硬件一体化的整体解决方案。睿思特拥有结构化的研发、营销、应用服务和供应链团队,专注于为各行业用户提供领先的技术应用服务和绿色智慧城市解决方案。睿思特致力打造成为国内一流、国际领先的智慧产业互联网龙头企业,助力各地智慧城市建设。睿思特通过开放式创新、卓越运营管理、人力资源发展等战略的实施,全面构造公司的核心竞争力,创造客户和社会的价值,从而实现技术的价值。致力于成为最受社会、客户、股东和员工尊敬的公司,并通过组织与过程的持续改进,领导力与员工竞争力的发展,联盟与开放式创新,使公司成为优秀的城市智能化整体解决方案和服务提供商。 10 | Python算法工程师,1.5-2万/月,上海-静安区,2年经验,本科,招5人,09-18发布,,"五险一金,专业培训,弹性工作,年终奖金,绩效奖金",职位信息1、负责自然语言处理,文本挖掘,机器学习等工作;  2、从事数学建模及数据挖掘应用方法研究;  3、根据业务需求,研究开发相关算法模型岗位要求1、全日制本科及以上学历,计算机或数学相关专业;  2、2-3年以上相关工作经验,熟悉Python编程语言,具备Python算法设计与程序开发能力;  3、熟悉数据结构,具备清晰的编程思路、良好的编码风格,能胜任复杂模块的编码;  4、对NLP有较深入的研究;  5、熟悉深度学习工具Tensoerflow/Caffe/Torch者优先;  6、良好的沟通能力,学习能力。  7、必须在(文本分析,语义分析,语义计算)有一个或多个作品。  8、具备良好的沟通协调能力和团队合作意识,工作踏实,态度积极,能够承受工作压力;    ,上班地址:江场路1313号金谷中环大厦6楼,亿翰智库,上市公司,150-500人,"专业服务(咨询、人力资源、财会),房地产",     公司简介亿翰股份(837350)于2016年5月成为首支登陆资本市场、专注中国房地产战略服务的企业。亿翰时刻关注中国房地产企业发展动态,深刻洞察行业发展趋势与先机,业务范围横跨地产、科技和资本三大领域;为超过70%的百强房企提供系统化的战略顾问服务;公司根植上海,雄踞长三角辐射全中国,现北京、成都、武汉和深圳已设有四家分支机构。亿翰股份每月发布发的《中国典型房企销售业绩排行榜》曝光量超千万,深受行业及资本界关注;有意者请将简历投递至邮箱:yihanzhiku@ehconsulting.com.cn公司人事部电话:021-61552731公司地址:上海市静安区江场路1313号金谷中环大厦6楼公司官网:www.ehconsulting.com.cn集团公众号:ehresearch简历发送格式:姓名+学校+专业+意向职位+信息来源+工作地点如果你足够的自信、优秀,积极上进。把你的简历投进亿翰股份,亿翰股份欢迎你的加入!北京公司地址:北京市朝阳区京信大厦2141室深圳公司地址:深圳市南山区粤兴六路中科纳能大厦409武汉公司地址:武汉市武昌区万达汉街总部国际B座2605成都公司地址:成都市武侯区成都蜀锦路88号“楚峰国际中心” 楼 903 11 | -------------------------------------------------------------------------------- /scrapy/51job-scrapy/2020-09-27_python算法工程师岗位招聘信息.csv: -------------------------------------------------------------------------------- 1 | Python算法工程师,1-1.5万/月,厦门,1年经验,本科,招若干人,09-27发布,,,岗位职责:1、负责睡眠分期相关数据智能算法的研发和改进2、负责gsensor六轴传感器数据的分析3、有参与过手环数据分析相关项目的优先。能力要求:1、熟练掌握Pathon、Java、C++等至少一门编程语言;2、熟悉Tensorflow、Pytorch、Keras等至少一个深度学习开发框架;3、熟悉经典数据挖掘方法,数据分析方法;4、具备良好的沟通表达能力、严谨的工作态度及较强的执行力。任职要求:1、学历:本科(含)以上学历,多年工作经验优先;应届毕业生学历为硕士;2、专业:计算机技术、智能技术、软件工程等相关专业毕业。Python算法工程师,上班地址:厦门,厦门中翎易优创科技有限公司,民营公司,,计算机软件,厦门中翎易优创科技有限公司诚聘,https://jobs.51job.com/xiamen/124622088.html?s=01&t=0 2 | 
python算法工程师,1-1.5万/月,上海-浦东新区,3-4年经验,大专,招若干人,09-27发布,,"五险一金,员工旅游,定期体检,年终奖金,免费班车,专业培训,带薪年假,过节福利,人才推荐奖",1.本科3年以上工作经验,近1年从事python 算法实现开发。2.python语言基础扎实,清晰掌握python中核心库包,能够独立设计并实现正则式,掌握多种代码技巧,熟悉形式语言,熟练使用jupyter、pycharm等python工作环境。3.掌握分布式原理,能够解读从逻辑设计,到框架机制、到代码实现,再到物理支撑的全链路流转。4.数据结构、算法理论基础扎实,掌握python主流数据处理库包,如pandas、numpy,能够独立完成代码调优,发现已有代码问题并制定修改方案。5.有较好的软件工程理论实践经验,熟悉面向对象分析设计、面向过程分析设计、面向接口分析设计中的一种,能够将分析设计表达为UML相应图表,掌握数据流分析,熟悉设计模式。6.熟悉大数据,MySQL数据库,SQL基础扎实。7.具有至少1个中大型python工程研发实现经验(6个月以上研发周期,5人以上python研发团队)。8.有Java、C++研发经验者优先。,上班地址:上海市浦东新区乐昌路399号上海期货所。,上海速强信息技术股份有限公司,民营公司,150-500人,计算机软件,        上海速强信息技术股份有限公司成立于2005年,是一家以全新软件开发理念为主导,致力于建立软件生产新模式的新兴IT公司。        目前公司的主要客户涵盖金融保险、电子商务等行业。专业的研发团队        公司核心技术、管理团队是由具有多年IT行业背景和管理经验的职业经理人组成,具有多年行业应用实施经验。凭借自身多年的软件实施经验,速强信息已形成完善的技术应用体系,包括:Java、. Net等主流开发平台、基于SQL Server、ORACLE、MYSQL等数据库平台,UML面向对象的设计等。        有形的专业技术,无限的创新空间,秉承“真诚、合作、创新”的理念,不断吸引IT人才加盟,创造工作的乐趣。快速的服务响应        客户的需求和满意是速强服务评价的***标准,我们是客户需求的倾听者,是问题的诊断者,也是问题的解决者。作为一家以客户至上为宗旨的软件企业,速强信息对客户的需求做出最及时、最快速的反应,公司对客户承诺48小时内的到场服务;质量至上,多层面系列化的产品服务,建立以客户需求为中心,以市场需求为导向的团队服务体系。        我们的服务口号:只有客户的成功、才是我们的成功!和谐的人文环境        速强信息顺应网络社会、知识经济、人性文化、数字生存的时代潮流,满足人类追求成功、渴求友谊、享受自由、勇于探索、彰显个性的内在需求,以灵活的软件产品和人性的服务,使客户实现工作上的‘智能化、网络化、个性化’;同时,速强更注重人才对软件发展的作用,秉承‘以人为本’的思想,为员工营造和谐、愉快的工作环境,努力为软件产业培养人才。                我们不是最成功的企业,但珍惜与您的每次合作;我们不是***的公司,但一定是您最忠实的倾听者!,https://jobs.51job.com/shanghai-pdxq/124472415.html?s=01&t=0 3 | Python算法工程师,1.5-2万/月,杭州-江干区,2年经验,本科,招若干人,09-27发布,,"五险一金,员工旅游,餐饮补贴,专业培训,绩效奖金,年终奖金,定期体检",岗位要求:1.参与基于摄像机传输的图片和视频流,结合业务场景的图像分类,进行多种行业安防类算法、模型研发及优化。2.针对项目方向进行算法调研、评估,研发相应算法并进行优化,对现有算法进行优化。3.进行深度学习算法的应用型研发,特别是在计算机视觉领域的应用研究,以及模型加速、模型调优、模型量化等研发任职要求:1.计算机相关专业;2.具备良好的动手能力,熟练掌握C/C++、Python等语言。3.掌握一种以上深度学习框架,掌握OpenCV库的使用。4.掌握计算机视觉基础知识、深度学习、经典机器学习等,有一定的实践经验。5.具备一定科研能力,能快速理解paper,具备算法创新能力着优先。6.具备对现实问题抽象出数学模型的逻辑分析能力,并能够求解数学问题。7.具备良好的职业素养、优良的品行、善于团队合作、有严谨的工作作风。,上班地址:杭州经济技术开发区海达南路555号金沙大厦C幢11层,浙江正洁环境科技有限公司,民营公司,150-500人,环保,    浙江正洁环境科技有限公司是一家专业提供工业废水和城市污水处理运营服务的国家高新技术企业。正洁环境作为一家知名的环保水处理第三方综合运营服务商,聚焦食品、印染等行业的源头污水处理,以提升客户自身生产专注度为目标,向客户提供包括工艺设计、设备采购与集成、工程实施、后续运维等一体化专业服务。正洁环境将努力打造出一个全生态链的水处理技术与资源再生集成工艺,循环资源,为改善人类的生态居住环境作出贡献!    
正洁环境先后获得了杭州市“雏鹰计划”培育企业、杭州市高新技术企业、国家高新技术企业、杭州市企业高新技术研发中心等荣誉称号,2016年,获第五届中国创新创业大赛节能环保行业优胜奖,浙江省排名第二。    正洁环境目前已有员工近200人,其中学士及以上学位人员占比70%,具有近90%的专业相关度。企业创新能力卓越,具有全国最完善的自有药剂研发、生产和应用体系以及最具应用能力的自有设备研发创新体系。正洁环境拥有的水处理特种药剂和水处理特种设备,在水处理技术上已覆盖了食品、印染、电镀、农村生活污水、实验室废液等多个行业,其中多项技术填补了目前国内环保行业的空白。    公司现已申请专利22项,申请保护16项软件著作权,先后获得建设部环保工程专项资质、环境污染治理设施运行服务资质、环境污染治理工程总承包资质、环境污染治理工程设计资质,荣获浙江省工商企业AA级守合同重信用单位,先后取得ISO9001质量管理体系、ISO14001环境管理体系、HHSAS18001职业健康安全管理体系、GB/T50430工程建设施工企业质量管理规范。,https://jobs.51job.com/hangzhou-jgq/118119462.html?s=01&t=0 4 | python算法工程师,1.2-2万/月,西安-国家民用航天产业基地,3-4年经验,本科,招1人,09-25发布,,"五险一金,补充医疗保险,通讯补贴,定期体检",岗位职责: 1、参与项目需求分析,业务逻辑流程与设计,文档编写; 2、能独立解决实际开发过程碰到的各类问题; 3、负责数据分析云平台和数据存储系统的搭建; 4、搭建系统开发环境,完成系统框架和核心代码的实现,负责解决开发过程中的技术问题; 任职要求: 一、教育水平:本科以上学历,计算机、应用数学或模式识别相关专业;二、工作资历:2年及以上工作经验;三、专业技能:1、有机器学习或推荐系统研究背景,有深度学习相关科研经历。2、对电商风控、广告预测、推荐系统等任何一个领域有实践经验;3、扎实的计算机基础,熟悉Python或C/C++,有较好的算法实现能力。3、精通使用一种开源的深度学习框架解决实际项目问题,如caffe、tensorflow、Mxnet、pytorch;4、精通SQL开发,精通Mysql、Oracle等关系型数据库中的一种;四、其他要求:1、较强的逻辑思维能力、学习能力;2、良好的英文听说读写能力;,上班地址:飞天路588号北航科技园,上海今日信息科技有限公司,民营公司,50-150人,计算机软件,上海今日信息科技有限公司成立于2014年8月1日成立,注册资金1000万元,是其上级今日集团为其新业务方向而成立的以IT增值业务为核心的IT服务机构。今日信息专注于IT咨询、IT基础构架建设、信息系统集成与虚拟化云计算、行业应用系统开发、IT服务运维等业务。今日信息与Oracle、CISCO、EMC、HP、Vmware、IBM、Microsoft、IMPERVA、APC 等多家国际知名厂商开展多方位的合作。主要增值业务:        基于Oracle IDM身份管理套件,结合国内企业单位信息系统的使用和管理现状,实现企业单位的身份管理、访问控制管理、用户的全生命周期管理等,为企业单位信息系统安全集成、风险管理控制等方面提供咨询与解决方案        基于全球领先的Oracle Cloud平台,我们提供企业级应用系统解决方案的咨询、项目实施、软件开发外包、IT领域的技术研发外包以及人力资源服务。 今日信息为客户提供端到端业务解决方案和服务,涵盖企业资源管理,人力资本管理、商务智能、客户关系管理、供应链管理等领域。        Oracle Primavera系列软件专注于项目密集型企业,其整个项目生命周期内所有项目的组合管理。,https://jobs.51job.com/xian-gjmyht/124565026.html?s=01&t=0 5 | Python算法工程师(高级工程师),1-1.5万/月,昆山,3-4年经验,本科,招1人,09-24发布,,"五险一金,员工旅游,出国机会,专业培训,股票期权,包住宿,年终奖金,餐饮补贴","岗位职责:1.参与产品需求、研发设计的相关讨论;2.负责数据产品业务数据的挖掘和分析工作;3.负责机器视觉产品算法研究工作;任职要求:1、具有python开发数据处理软件/机器视觉软件的经验;2、精通Python,掌握opencv, numpy,scipy,matplotlib,pandas等图像和数据处理方面常用的第三方python库;3、熟悉至少一种Sql数据库 (mysql/postgresql/sqlserver/oracle);4. 
掌握Tensorflow/Pytorch一种或多种深度学习框架;4、热爱编程、具有良好的代码风格;5、做事具有条理性,具有良好的自学能力、分析问题以及解决问题的能力。",上班地址:昆山开发区春旭路18号(联彩商务中心)2201、2202室,昆山润石智能科技有限公司,外资(非欧美),少于50人,"电子技术/半导体/集成电路,计算机软件",昆山润石智能科技坐落于昆山经济开发区,为泛半导体行业客户提供智能制造一站式解决方案。公司集智能制造需求硬件&软件的研发、销售、服务于一体。团队成员多为两岸半导体与面板产业人才,具备高端智能制造、制程研发、整合及半导体设备知识,在研发、管理、营销方面经验丰富。公司在智能制造方案中产品线齐全,将IoT、Bigdata、AI三环节打通形成闭环,主要产品有:数据采集与边缘计算系列产品(IOT)、工业流程自动化虚拟机器人(RPA)、智能缺陷自动分类系统(AiDC),为高端制造业客户提升生产良率及设备稼动率,提高生产力,助力客户达成智能制造迈向工业4.0。公司希望通过三到五年的努力,将国外昂贵的智能制造方案国产化,为中国的制造业升级贡献一份力量。,https://jobs.51job.com/kunshan/122280284.html?s=01&t=0 6 | Python算法工程师,1.2-2万/月,天津-西青区,2年经验,本科,招2人,09-23发布,,"五险一金,餐饮补贴,通讯补贴,专业培训,绩效奖金,年终奖金,员工旅游,节假日福利,带薪年假,员工聚餐",岗位职责:1、开展数据分析、机器学习算法的研发与应用;2、参与公司相关软件产品的研发;3、参与软件产品相关文档的撰写。岗位要求:1、计算机科学与技术等相关专业本科及以上学历;2、2年以上Python程序开发经验;3、有较强的算法分析和实现能力;4、善于分析和解决问题,学习能力强,良好的团队合作精神。优先考虑:1、精通机器学习及数据处理工具(pandas、numpy、sklearn等);2、熟悉Linux环境下编程,了解docker、git等工具。,上班地址:华苑产业区兰苑路5号留学生创业园D座1002,深析智能科技有限公司,民营公司,50-150人,"计算机软件,医疗设备/器械",深析智能科技有限公司(DeepCyto)成立于2018年,是一家将人工智能与深度学习技术引入智能医学诊断和医学数据分析领域的医疗科技公司。深析人工智能基于真实、准确、海量的临床数据,融合机器视觉、深度学习及大数据挖掘技术,对血液病理检测的细胞形态学、流式细胞学、细胞遗传学、分子生物学等的数据进行智能识别和分析,为广大医疗机构以及第三方医学实验室提供定制MICM全流程AI辅助诊断产品和技术服务。深析核心团队包含国际人工智能专家和国内血液病理学专家,拥有技术研发、血液病理诊断、临床诊疗和医学管理等复合型人才团队。目前,深析已经获得***VC(软银,元生)天使轮投资。深析的AI系统在血液病理人工智能诊断领域,尤其在***细胞的自动分割和分类计数、流式细胞学的自动设门和分群定量等方面已取得令人瞩目的成果。目前,深析智能已经联合中国医学科学院血液病医院、北京大学***医院、上海长海医院、广州中山一附院等***三甲医院,开展了多中心临床验证试验。经临床数据比对,流式细胞学的Deepflow软件对急性白血病的诊断准确率高达95%,比人工诊断速度提高约100倍, CellCounter***细胞形态学AI自动扫描分析形态的准确率达到97%,诊断速度提高约10倍。未来,深析将进一步开发染色体核型智能分析等产品,实现血液病理的智能化综合诊断,提高诊断的准确和效率,促进优质医疗资源下沉,以AI技术重塑中国未来医疗体系。,https://jobs.51job.com/tianjin-xqq/123554406.html?s=01&t=0 7 | python算法工程师,5-9千/月,无锡-无锡新区,1年经验,本科,招1人,09-23发布,,"五险一金,补充医疗保险",1)参与项目设计相关工作、程序的开发工作;2)根据公司开发规范及流程,能独立完成子功能的设计、编码、测试及相关文档编写;3)代码编写;4)完成单元测试,并配合进行其他测试和交付工作;5)部门新技术调研和实践。任职资格及素质要求:1、熟悉Python编程、Django或Tornado等web框架;2、熟悉SpringBoot、MongoDB、Linux等技术,掌握面向对象编程思想;3、 有图像处理、人工智能、自然语音处理经历优先;4、熟悉devops思想并有丰富经验者优先;5、自学能力强,愿意接收新技术。,上班地址:无锡市新吴区菱湖大道200号中国传感网国际创新园E1栋,隆正信息科技有限公司,民营公司,150-500人,"计算机软件,计算机服务(系统、数据服务、维修)",隆正互联是一家专门面向大型商业用户及行业云平台(to big B/I)的专业技术公司。公司的基本目标是:面向金融、通讯、能源、交通等行业,定位全国前100家大型行业客户(如工行、中行、国寿、联通、移动、中航信等),通过云平台支持的超级软件工厂,实现高端软件产品的集约化设计、制造、及交付,引进包括人工智能在内的先进技术,从根本上提高国产软件制作水平,打造高端软件制造生态链。 公司的愿景是: 1、 建立知识驱动的、AI导向的、深度共享协同的专业软件交付云平台。 2、 依托云平台支撑的超级软件工厂及渠道、交付、运维集群,实现行业商用软件及服务的一体化支撑与专业交付。 3、 面向重点行业,建立集约化、标准化、一体化的商用软件生态。 公司的总注册资金一亿元,各功能集群及商务中心在北京,超级软件工厂设在无锡,计划占地500亩以上,园区规划建筑积面四十万平米,能够容纳3万名软件从业人员。 如果您和我们拥有共同的理想,希望成为隆正互联的一员,那么您还在等什么?!快快发送您的简历,与我们共创未来!,https://jobs.51job.com/wuxi-wxxq/122456199.html?s=01&t=0 8 | 高级Python算法工程师,2-2.5万/月,上海-静安区,3-4年经验,本科,招2人,09-18发布,,"五险一金,专业培训,弹性工作,年终奖金,绩效奖金",职位信息1、负责自然语言处理,文本挖掘,机器学习等工作;2、从事数学建模及数据挖掘应用方法研究;3、根据业务需求,研究开发相关算法模型岗位要求1、重点全日制本科及以上学历,计算机或数学相关专业;2、4~5年以上相关工作经验,熟悉Python编程语言,具备Python算法设计与程序开发能力;3、熟悉数据结构,具备清晰的编程思路、良好的编码风格,能胜任复杂模块的编码;4、对NLP有较深入的研究;5、熟悉深度学习工具Tensoerflow/Caffe/Torch者优先;6、良好的沟通能力,学习能力。7、必须在(文本分析,语义分析,语义计算)有一个或多个作品。8、具备良好的沟通协调能力和团队合作意识,工作踏实,态度积极,能够承受工作压力;,上班地址:江场路1313号金谷中环大厦6楼,亿翰智库,上市公司,150-500人,"专业服务(咨询、人力资源、财会),房地产",     公司简介亿翰股份(837350)于2016年5月成为首支登陆资本市场、专注中国房地产战略服务的企业。亿翰时刻关注中国房地产企业发展动态,深刻洞察行业发展趋势与先机,业务范围横跨地产、科技和资本三大领域;为超过70%的百强房企提供系统化的战略顾问服务;公司根植上海,雄踞长三角辐射全中国,现北京、成都、武汉和深圳已设有四家分支机构。亿翰股份每月发布发的《中国典型房企销售业绩排行榜》曝光量超千万,深受行业及资本界关注;有意者请将简历投递至邮箱:yihanzhiku@ehconsulting.com.cn公司人事部电话:021-61552731公司地址:上海市静安区江场路1313号金谷中环大厦6楼公司官网:www.ehconsulting.com.cn集团公众号:ehresearch简历发送格式:姓名+学校+专业+意向职位+信息来源+工作地点如果你足够的自信、优秀,积极上进。把你的简历投进亿翰股份,亿翰股份欢迎你的加入!北京公司地址:北京市朝阳区京信大厦2141室深圳公司地址:深圳市南山区粤兴六路中科纳能大厦409武汉公司地址:武汉市武昌区万达汉街总部国际B座2605成都公司地址:成都市武侯区成都蜀锦路88号“楚峰国际中心” 楼 
903,https://jobs.51job.com/shanghai-jaq/117355255.html?s=01&t=0 9 | python算法工程师,0.8-1.2万/月,福州-鼓楼区,2年经验,大专,招3人,08-28发布,,"餐饮补贴,交通补贴,年终奖金,五险","任职要求:计算机相关专业本科以上学历,技术能力优秀者可放宽要求 精通Python 语言,熟悉Linux系统,熟悉基于Python的Web开发技术,熟悉Flask、Django、Tornado等常见架构至少一种;有志从事自动化与智能化开发,对语言学习有热情拥有较好的学习能力、良好的代码习惯和团队合作能力,善于沟通,逻辑清晰,能独立解决问题具备图像识别或机器学习相关知识者,有从事过人工智能(Python)上的相关人士优先岗位职责:1、根据开发进度和任务分配按时按量完成相应模块软件的设计、开发;2、负责现有程序的维护、更新与问题排除",上班地址:软件大道89号福州软件园B区4号楼,福建睿思特科技股份有限公司,民营公司,50-150人,"计算机软件,计算机服务(系统、数据服务、维修)",福建睿思特科技股份有限公司,总部位于福州软件园,专业从事智慧城市生态产业链、城市智能化整体解决方案的服务商。睿思特坚持以自主创新和行业应用为基础,以硬件、软件、服务三位一体构筑核心竞争力,为客户提供优质的产品和服务。睿思特依托物联网、云计算、大数据、人工智能、区块链等新一代信息技术,为环保、水利、交通、城管、电力等智慧城市领域提供软硬件一体化的整体解决方案。睿思特拥有结构化的研发、营销、应用服务和供应链团队,专注于为各行业用户提供领先的技术应用服务和绿色智慧城市解决方案。睿思特致力打造成为国内一流、国际领先的智慧产业互联网龙头企业,助力各地智慧城市建设。睿思特通过开放式创新、卓越运营管理、人力资源发展等战略的实施,全面构造公司的核心竞争力,创造客户和社会的价值,从而实现技术的价值。致力于成为最受社会、客户、股东和员工尊敬的公司,并通过组织与过程的持续改进,领导力与员工竞争力的发展,联盟与开放式创新,使公司成为优秀的城市智能化整体解决方案和服务提供商。,https://jobs.51job.com/fuzhou-glq/117579727.html?s=01&t=0 10 | Python算法工程师,1.5-2万/月,上海-静安区,2年经验,本科,招5人,09-18发布,,"五险一金,专业培训,弹性工作,年终奖金,绩效奖金",职位信息1、负责自然语言处理,文本挖掘,机器学习等工作;  2、从事数学建模及数据挖掘应用方法研究;  3、根据业务需求,研究开发相关算法模型岗位要求1、全日制本科及以上学历,计算机或数学相关专业;  2、2-3年以上相关工作经验,熟悉Python编程语言,具备Python算法设计与程序开发能力;  3、熟悉数据结构,具备清晰的编程思路、良好的编码风格,能胜任复杂模块的编码;  4、对NLP有较深入的研究;  5、熟悉深度学习工具Tensoerflow/Caffe/Torch者优先;  6、良好的沟通能力,学习能力。  7、必须在(文本分析,语义分析,语义计算)有一个或多个作品。  8、具备良好的沟通协调能力和团队合作意识,工作踏实,态度积极,能够承受工作压力;    ,上班地址:江场路1313号金谷中环大厦6楼,亿翰智库,上市公司,150-500人,"专业服务(咨询、人力资源、财会),房地产",     公司简介亿翰股份(837350)于2016年5月成为首支登陆资本市场、专注中国房地产战略服务的企业。亿翰时刻关注中国房地产企业发展动态,深刻洞察行业发展趋势与先机,业务范围横跨地产、科技和资本三大领域;为超过70%的百强房企提供系统化的战略顾问服务;公司根植上海,雄踞长三角辐射全中国,现北京、成都、武汉和深圳已设有四家分支机构。亿翰股份每月发布发的《中国典型房企销售业绩排行榜》曝光量超千万,深受行业及资本界关注;有意者请将简历投递至邮箱:yihanzhiku@ehconsulting.com.cn公司人事部电话:021-61552731公司地址:上海市静安区江场路1313号金谷中环大厦6楼公司官网:www.ehconsulting.com.cn集团公众号:ehresearch简历发送格式:姓名+学校+专业+意向职位+信息来源+工作地点如果你足够的自信、优秀,积极上进。把你的简历投进亿翰股份,亿翰股份欢迎你的加入!北京公司地址:北京市朝阳区京信大厦2141室深圳公司地址:深圳市南山区粤兴六路中科纳能大厦409武汉公司地址:武汉市武昌区万达汉街总部国际B座2605成都公司地址:成都市武侯区成都蜀锦路88号“楚峰国际中心” 楼 903,https://jobs.51job.com/shanghai-jaq/117000837.html?s=01&t=0 11 | -------------------------------------------------------------------------------- /scrapy/51job-scrapy/51jobs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Sep 25 10:48:03 2020 4 | 5 | @author: YINUXY 6 | """ 7 | 8 | 9 | import re 10 | import time 11 | import copy 12 | import random 13 | import requests 14 | import pymysql 15 | import pandas as pd 16 | from lxml import etree 17 | from selenium import webdriver 18 | from selenium.webdriver.chrome.options import Options 19 | 20 | 21 | class JobSpider: 22 | def __init__(self): 23 | self.base_url = 'https://search.51job.com/list/080200,000000,0000,00,9,99,%s,2,%s.html' 24 | self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.13 Safari/537.36'} 25 | self.keyword = 'Java开发工程师' 26 | self.chrome_options=Options() 27 | self.chrome_options.add_argument('--headless') 28 | self.conn=pymysql.connect(host="127.0.0.1", 29 | user="root", 30 | passwd="", 31 | charset='utf8mb4', 32 | cursorclass=pymysql.cursors.DictCursor) 33 | self.cur = self.conn.cursor() 34 | self.cur.execute("CREATE DATABASE IF NOT EXISTS `jobs`") 35 | self.cur.execute("USE jobs") 36 | self.cur.execute("DROP TABLE IF EXISTS `web_51jobs_javadevelopment`") 37 | self.cur.execute("CREATE TABLE IF NOT EXISTS `web_51jobs_javadevelopment` (`id` INT PRIMARY KEY AUTO_INCREMENT,`position` varchar(100),`wages` 
varchar(20),`region` varchar(100),`experience` varchar(100),`education` varchar(100),`need_people` varchar(20),`publish_date` varchar(20),`english` varchar(100),`welfare_tags` varchar(200),`job_information` varchar(4000),`work_address` varchar(200),`company_name` varchar(200),`company_nature` varchar(200),`company_scale` varchar(200),`company_industry` varchar(200),`company_information` varchar(4000),`job_url` varchar(100))") 38 | 39 | 40 | def tatal_url(self): 41 | url = self.base_url % (self.keyword, str(1)) 42 | tree = etree.HTML(self.parse_html(url)) 43 | # 提取一共有多少页 44 | text = tree.xpath("//div[@class='p_in']/span[1]/text()")[0] 45 | number = re.findall('[0-9]', text) 46 | number = int(''.join(number)) 47 | print('%s职位共有%d页' % (self.keyword, number)) 48 | return number 49 | 50 | def parse_html(self,url): 51 | driver=webdriver.Chrome(chrome_options=self.chrome_options) 52 | driver.get(url) 53 | html = driver.page_source 54 | time.sleep(random.randint(5,10)) 55 | driver.close() 56 | return html 57 | 58 | def detail_url(self, number): 59 | 60 | """ 61 | 1、解析每一页职位详情页的 url 62 | 2、特殊情况一:如果有前程无忧自己公司的职位招聘信息掺杂在里面,他的详情页结构和普通的也不一样,页面编码也有差别。 63 | 页面示例:https://51rz.51job.com/job.html?jobid=115980776 64 | 页面真实数据请求地址类似于:https://coapi.51job.com/job_detail.php?jsoncallback=&key=&sign=params={"jobid":""} 65 | 请求地址中的各参数值通过 js 加密:https://js.51jobcdn.com/in/js/2018/coapi/coapi.min.js 66 | 3、特殊情况二:部分公司有自己的专属页面,此类页面的结构也不同于普通页面 67 | 页面示例:http://dali.51ideal.com/jobdetail.html?jobid=121746338 68 | 4、为了规范化,本次爬取将去掉这部分特殊页面,仅爬取 url 带有 jobs.51job.com 的数据 69 | """ 70 | 71 | for num in range(1, number+1): 72 | starts = time.time() 73 | url = self.base_url % (self.keyword, str(num)) 74 | tree = etree.HTML(self.parse_html(url)) 75 | detail_url1 = tree.xpath("//div[@class='j_joblist']/div[@class='e']/a/@href") 76 | 77 | """ 78 | 深拷贝一个 url 列表,如果有连续的不满足要求的链接,若直接在原列表里面删除, 79 | 则会漏掉一些链接,因为每次删除后的索引已改变,因此在原列表中提取不符合元素 80 | 后,在深拷贝的列表里面进行删除。最后深拷贝的列表里面的元素均符合要求。 81 | """ 82 | 83 | detail_url2 = copy.deepcopy(detail_url1) 84 | for url in detail_url1: 85 | if 'jobs.51job.com' not in url: 86 | detail_url2.remove(url) 87 | self.parse_data(detail_url2) 88 | ends = time.time() 89 | print('第 %d页数据爬取完毕,本页共有 %d个 %s岗位, 用时%d秒' % (num, len(detail_url2), self.keyword, int(ends-starts))) 90 | time.sleep(2) 91 | print('所有数据爬取完毕!') 92 | 93 | def parse_data(self, urls): 94 | 95 | """ 96 | position: 职位 97 | wages: 工资 98 | region: 地区 99 | experience: 经验 100 | education: 学历 101 | need_people: 招聘人数 102 | publish_date: 发布时间 103 | english: 英语要求 104 | welfare_tags: 福利标签 105 | job_information: 职位信息 106 | work_address: 上班地址 107 | company_name: 公司名称 108 | company_nature: 公司性质 109 | company_scale: 公司规模 110 | company_industry: 公司行业 111 | company_information: 公司信息 112 | job_url: 招聘链接 113 | """ 114 | 115 | # jobs = [] 116 | 117 | for url in urls: 118 | job = {} 119 | job['链接'] = url 120 | response = requests.get(url=url, headers=self.headers) 121 | try: 122 | text = response.content.decode('gbk') 123 | except UnicodeDecodeError: 124 | return 125 | tree = etree.HTML(text) 126 | 127 | """ 128 | 提取内容时使用 join 方法将列表转为字符串,而不是直接使用索引取值, 129 | 这样做的好处是遇到某些没有的信息直接留空而不会报错 130 | """ 131 | 132 | position = ''.join(tree.xpath("//div[@class='cn']/h1/text()")) 133 | wages = ''.join(tree.xpath("//div[@class='cn']/strong/text()")) 134 | 135 | # 经验、学历、招聘人数、发布时间等信息都在一个标签里面,逐一使用列表解析式提取 136 | content = tree.xpath("//div[@class='cn']/p[2]/text()") 137 | content = [i.strip() for i in content] 138 | if content: 139 | region = content[0] 140 | else: 141 | region = '' 142 | experience = 
''.join([i for i in content if '经验' in i]) 143 | education = ''.join([i for i in content if i in '本科大专应届生在校生硕士']) 144 | need_people = ''.join([i for i in content if '招' in i]) 145 | publish_date = ''.join([i for i in content if '发布' in i]) 146 | english = ''.join([i for i in content if '英语' in i]) 147 | 148 | welfare_tags = ','.join(tree.xpath("//div[@class='jtag']/div//text()")[1:-2]) 149 | job_information = ''.join(tree.xpath("//div[@class='bmsg job_msg inbox']/p//text()")).replace(' ', '') 150 | work_address = ''.join(tree.xpath("//div[@class='bmsg inbox']/p//text()")) 151 | company_name = ''.join(tree.xpath("//div[@class='tCompany_sidebar']/div[1]/div[1]/a/p/text()")) 152 | company_nature = ''.join(tree.xpath("//div[@class='tCompany_sidebar']/div[1]/div[2]/p[1]//text()")) 153 | company_scale = ''.join(tree.xpath("//div[@class='tCompany_sidebar']/div[1]/div[2]/p[2]//text()")) 154 | company_industry = ''.join(tree.xpath("//div[@class='tCompany_sidebar']/div[1]/div[2]/p[3]/@title")) 155 | company_information = ''.join(tree.xpath("//div[@class='tmsg inbox']/text()")) 156 | 157 | job_data = [position, wages, region, experience, education, need_people, publish_date, 158 | english, welfare_tags, job_information, work_address, company_name, 159 | company_nature, company_scale, company_industry, company_information, str(url)] 160 | #追加写入csv文件 161 | df = pd.DataFrame([job_data]) 162 | df.to_csv('./%s_%s岗位招聘信息.csv'%(str(time.strftime("%Y-%m-%d")), self.keyword), mode='a', header=None, index=None, encoding="utf_8_sig") 163 | 164 | job["职位"] = position 165 | job["工资"] = wages 166 | job["地区"] = region 167 | job["经验"] = experience 168 | job["学历"] = education 169 | job["招聘人数"] = need_people 170 | job["发布时间"] = publish_date 171 | job["英语要求"] = english 172 | job["福利标签"] = welfare_tags 173 | job["职位信息"] = job_information 174 | job["上班地址"] = work_address 175 | job["公司名称"] = company_name 176 | job["公司性质"] = company_nature 177 | job["公司规模"] = company_scale 178 | job["公司行业"] = company_industry 179 | job["公司信息"] = company_information 180 | # print(job) 181 | self.process_job(job) 182 | # jobs.append(job) 183 | 184 | def process_job(self,job): 185 | # self.cur = self.conn.cursor() 186 | try: 187 | position = job["职位"] 188 | wages = job["工资"] 189 | region = job["地区"] 190 | experience = job["经验"] 191 | education = job["学历"] 192 | need_people = job["招聘人数"] 193 | publish_date = job["发布时间"] 194 | english = job["英语要求"] 195 | welfare_tags = job["福利标签"] 196 | job_information = job["职位信息"] 197 | work_address = job["上班地址"] 198 | company_name = job["公司名称"] 199 | company_nature = job["公司性质"] 200 | company_scale = job["公司规模"] 201 | company_industry = job["公司行业"] 202 | company_information = job["公司信息"] 203 | job_url = job['链接'] 204 | sql = "INSERT INTO `web_51jobs_javadevelopment` (`position`,`wages`,`region`,`experience`,`education`,`need_people`,`publish_date`,`english`,`welfare_tags`,`job_information`,`work_address`,`company_name`,`company_nature`,`company_scale`,`company_industry`,`company_information`,`job_url`) VALUES ('"+ position +"','"+ wages +"','"+ region +"','"+ experience +"','"+ education +"','"+ need_people +"','"+ publish_date +"','"+ english +"','"+ welfare_tags +"','"+ job_information +"','"+ work_address +"','"+ company_name +"','"+ company_nature +"','"+ company_scale +"','"+ company_industry +"','"+ company_information+"','"+ job_url+"')" 205 | self.cur.execute(sql) 206 | self.conn.commit() 207 | # self.conn.close() 208 | except Exception as err: 209 | print(err) 210 | 211 | 212 | if __name__ == 
'__main__': 213 | starts = time.time() 214 | spider = JobSpider() 215 | page_number = spider.tatal_url() 216 | spider.detail_url(page_number) 217 | ends = time.time() 218 | print("程序运行完毕,总用时 %d分 %d秒" % (int(ends-starts)/60, (ends-starts)%60)) 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | -------------------------------------------------------------------------------- /scrapy/UnsplashCrawler/UnsplashCrawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 3.6.1 2 | # -*- coding:utf-8 -*- 3 | # ____author___='Yinux' 4 | import json 5 | import os 6 | import threading 7 | import urllib 8 | from queue import Queue 9 | import random 10 | import requests 11 | import time 12 | 13 | """ 14 | 使用多线程将 Unsplash 的图片下载到本地 15 | """ 16 | # 使用队列保存存放图片 url 地址, 确保线程同步 17 | url_queue = Queue() 18 | # 线程总数 19 | THREAD_SUM = 5 20 | # 存储图片的位置 21 | IMAGE_SRC = 'E://spiderproject//UnsplashCrawler/' 22 | 23 | 24 | class Unsplash(threading.Thread): 25 | NOT_EXIST = 0 26 | 27 | def __init__(self, thread_id): 28 | threading.Thread.__init__(self) 29 | self.thread_id = thread_id 30 | 31 | def run(self): 32 | while not self.NOT_EXIST: 33 | # 队列为空, 结束线程 34 | if url_queue.empty(): 35 | NOT_EXIST = 1 36 | break 37 | 38 | url = url_queue.get() 39 | self.get_data(url) 40 | time.sleep(random.randint(3, 5)) 41 | 42 | def get_data(self, url): 43 | """ 根据 url 获取 JSON 格式的图片数据""" 44 | headers = { 45 | 'User-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36', 46 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 47 | 'referer': 'https://unsplash.com/', 48 | 'path': url.split('com')[1], 49 | 'authority': 'unsplash.com', 50 | 'viewport-width': '1920', 51 | } 52 | response = requests.get(url, headers=headers) 53 | print('请求第[ ' + url + ' ], 状态码为 ', response.status_code) 54 | self.get_image_url(response.text) 55 | 56 | def get_image_url(self, response): 57 | """ 58 | 使用 json.loads(response) 将其转化为字典类型, 以便采用 key-value 形式获取值 59 | raw:包含Exif信息的全尺寸原图,此类图片的容量很大 60 | full:全尺寸分辨率的图片,去除了Exif信息并且对内容进行了压缩,图片容量适中 61 | normal:普通尺寸的图片,去除了Exif信息,并且对分辨率和内容进行了压缩,图片容量较小; 62 | """ 63 | image_url = json.loads(response)[0]['urls']['full'] 64 | self.save_img(image_url) 65 | 66 | def save_img(self, image_url): 67 | print('线程', self.thread_id, ' | 正在下载', image_url) 68 | try: 69 | if not os.path.exists(IMAGE_SRC): 70 | os.mkdir(IMAGE_SRC) 71 | filename = IMAGE_SRC + image_url.split('com')[1].split('?')[0] + '.jpg' 72 | # 下载图片,并保存到文件夹中 73 | urllib.request.urlretrieve(image_url, filename=filename) 74 | except IOError as e: 75 | print('保存图片出现异常失败', e) 76 | 77 | 78 | def get_all_url(): 79 | """ 循环计算出所有的 url 地址, 存放到队列中 """ 80 | base_url = 'https://unsplash.com/napi/photos?page={}&per_page=1&order_by=latest' 81 | page = 1 82 | max_page = 100 83 | while page <= max_page: 84 | url = base_url.format(page) 85 | url_queue.put(url) 86 | page += 1 87 | print('计划下载', url_queue.qsize(), '张图片') 88 | 89 | 90 | if __name__ == '__main__': 91 | get_all_url() 92 | for i in range(THREAD_SUM): 93 | unsplash = Unsplash(i + 1) 94 | unsplash.start() -------------------------------------------------------------------------------- /scrapy/WeChatArticle/WecArticle.py: -------------------------------------------------------------------------------- 1 | 
import bs4 2 | 3 | 4 | from bs4 import BeautifulSoup 5 | from selenium import webdriver 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | import re 8 | import csv 9 | import time 10 | import os 11 | 12 | browser = webdriver.Chrome() 13 | wait = WebDriverWait(browser, 5) # 设置等待时间 14 | 15 | 16 | # 提取公众号文章信息 17 | def get_info(url): 18 | browser.get(url) 19 | html = browser.page_source 20 | soup = BeautifulSoup(html, 'lxml') 21 | data = [] # 用来储存文章信息 22 | for i in range(0, 10): 23 | titles = soup.select('#sogou_vr_11002601_title_{}'.format(i)) 24 | introductions = soup.select('#sogou_vr_11002601_summary_{}'.format(i)) 25 | dates = soup.select('#sogou_vr_11002601_box_{} div.txt-box div span'.format(i)) 26 | for ti, intr, da in zip(titles, introductions, dates): 27 | info = {} 28 | title = ti.get_text() 29 | info['文章标题'] = title 30 | link = str(re.compile('data-share="(.*?)"').findall(str(titles))).replace('amp;', '')[2:-2] 31 | info['文章链接'] = link 32 | introduction = intr.get_text() 33 | info['文章简介'] = introduction 34 | date = str(da.get_text()).split(')')[-1] 35 | info['发文日期'] = date 36 | data.append(info) 37 | return data 38 | 39 | 40 | def mkdir(): # 创建储存内容的文件夹 41 | isExists = os.path.exists('D:\\Python\\spider\\wecArticle') 42 | if not isExists: 43 | print('创建目录') 44 | os.makedirs('D:\\Python\\spider\\wecArticle') # 创建目录 45 | os.chdir('D:\\Python\\spider\\wecArticle') # 切换到创建的文件夹 46 | return True 47 | else: 48 | print('目录已存在,即将保存!') 49 | os.chdir('D:\\Python\\spider\\wecArticle') # 切换到创建的文件夹 50 | return False 51 | 52 | 53 | def write2csv(url, kw): # 写入文件,以 csv 文件形式储存 54 | mkdir() 55 | print('正在写入文件') 56 | with open('{}.csv'.format(kw), 'a', newline='', encoding='utf-8') as f: 57 | # 追加内容用 a 58 | fieldnames = ['文章标题', '文章链接', '文章简介', '发文日期'] # 控制列的顺序 59 | writer = csv.DictWriter(f, fieldnames=fieldnames) 60 | writer.writeheader() 61 | data = get_info(url) 62 | writer.writerows(data) 63 | print("写入成功") 64 | 65 | 66 | if __name__ == '__main__': 67 | kw = input('请输入你的关键字:\n') 68 | for j in range(1, 11): 69 | url = 'http://weixin.sogou.com/weixin?query={}&type=2&page={}'.format(kw, j) 70 | write2csv(url, kw) 71 | time.sleep(1) 72 | 73 | -------------------------------------------------------------------------------- /scrapy/cf-ipv6/cf_ipv6_scan.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import requests 3 | 4 | # multithread 5 | def multi_check_ip(start, end): 6 | print(threading.current_thread().name, 'start!') 7 | for i in range(start, end): 8 | hex_num = str(hex(i)).split('x')[-1] 9 | ip = base_ip + hex_num + '::' 10 | url = f'http://[{ip}]/cdn-cgi/trace' 11 | try: 12 | r = requests.get(url, timeout=1) 13 | solo = r.text.split()[6].split('=')[-1] 14 | lock.acquire() 15 | valid_ip.write(ip + ' ' + solo + '\n') 16 | lock.release() 17 | print(ip, solo) 18 | except Exception as e: 19 | print(url, e) 20 | 21 | 22 | if __name__ == '__main__': 23 | base_ip = '2606:4700:' 24 | valid_ip = open('cf_valid_ipv6.txt', 'a+') 25 | thread_list = [] 26 | lock = threading.Lock() 27 | thread_num = 64 28 | task_num = int(65536 / thread_num) 29 | for i in range(thread_num): 30 | start = i * task_num 31 | end = (i + 1) * task_num 32 | t = threading.Thread(target=multi_check_ip, args=(start, end)) 33 | thread_list.append(t) 34 | 35 | last_start = thread_num * task_num 36 | last_task = threading.Thread(target=multi_check_ip, args=(last_start, 65536)) 37 | thread_list.append(last_task) 38 | 39 | for t in thread_list: 40 | 
t.start() 41 | for t in thread_list: 42 | t.join() 43 | 44 | valid_ip_num = len(valid_ip.readlines()) 45 | valid_ip.close() 46 | print(f'本次扫描结束,共扫到{valid_ip_num}个有效ip') 47 | -------------------------------------------------------------------------------- /scrapy/cf-ipv6/cf_valid_ipv6_scan_2606_4700_.txt: -------------------------------------------------------------------------------- 1 | 2606:4700:3001:: HKG 2 | 2606:4700:3002:: HKG 3 | 2606:4700:3003:: HKG 4 | 2606:4700:3004:: HKG 5 | 2606:4700:3005:: HKG 6 | 2606:4700:3006:: HKG 7 | 2606:4700:3007:: HKG 8 | 2606:4700:3008:: HKG 9 | 2606:4700:3009:: HKG 10 | 2606:4700:300a:: HKG 11 | 2606:4700:300b:: HKG 12 | 2606:4700:300c:: HKG 13 | 2606:4700:3010:: HKG 14 | 2606:4700:3011:: HKG 15 | 2606:4700:3012:: HKG 16 | 2606:4700:3013:: HKG 17 | 2606:4700:3014:: HKG 18 | 2606:4700:3015:: HKG 19 | 2606:4700:3016:: HKG 20 | 2606:4700:3017:: HKG 21 | 2606:4700:3018:: HKG 22 | 2606:4700:3019:: HKG 23 | 2606:4700:301c:: HKG 24 | 2606:4700:3020:: HKG 25 | 2606:4700:3021:: HKG 26 | 2606:4700:3022:: HKG 27 | 2606:4700:3023:: HKG 28 | 2606:4700:3024:: HKG 29 | 2606:4700:3025:: HKG 30 | 2606:4700:3026:: HKG 31 | 2606:4700:3027:: HKG 32 | 2606:4700:3028:: HKG 33 | 2606:4700:3029:: HKG 34 | 2606:4700:302c:: HKG 35 | 2606:4700:3030:: SIN 36 | 2606:4700:3032:: SIN 37 | 2606:4700:3033:: SIN 38 | 2606:4700:3034:: SIN 39 | 2606:4700:3035:: SIN 40 | 2606:4700:3036:: SIN 41 | 2606:4700:3037:: SIN 42 | 2606:4700:3038:: HKG 43 | 2606:4700:3039:: HKG 44 | 2606:4700:303c:: HKG 45 | 2606:4700:8040:: SEA 46 | 2606:4700:8041:: SEA 47 | 2606:4700:8042:: SEA 48 | 2606:4700:8043:: SEA 49 | 2606:4700:8044:: SJC 50 | 2606:4700:8045:: SJC 51 | 2606:4700:8046:: SJC 52 | 2606:4700:8047:: SJC 53 | 2606:4700:8048:: SJC 54 | 2606:4700:8049:: SJC 55 | 2606:4700:804a:: SJC 56 | 2606:4700:804b:: SJC 57 | 2606:4700:804c:: SJC 58 | 2606:4700:804d:: SJC 59 | 2606:4700:804e:: SJC 60 | 2606:4700:804f:: SJC 61 | 2606:4700:80c0:: SEA 62 | 2606:4700:80c1:: SEA 63 | 2606:4700:80c2:: SEA 64 | 2606:4700:80c3:: SEA 65 | 2606:4700:80c4:: LAX 66 | 2606:4700:80c5:: LAX 67 | 2606:4700:80c6:: LAX 68 | 2606:4700:80c7:: LAX 69 | 2606:4700:80c8:: LAX 70 | 2606:4700:80c9:: LAX 71 | 2606:4700:80ca:: LAX 72 | 2606:4700:80cb:: LAX 73 | 2606:4700:80cc:: LAX 74 | 2606:4700:80cd:: LAX 75 | 2606:4700:80ce:: LAX 76 | 2606:4700:80cf:: LAX 77 | 2606:4700:80f0:: SEA 78 | 2606:4700:80f1:: SEA 79 | 2606:4700:80f2:: SEA 80 | 2606:4700:80f3:: SEA 81 | 2606:4700:80f5:: DFW 82 | 2606:4700:80f6:: DFW 83 | 2606:4700:80f7:: DFW 84 | 2606:4700:80f8:: DFW 85 | 2606:4700:80f9:: DFW 86 | 2606:4700:80fa:: DFW 87 | 2606:4700:80fb:: DFW 88 | 2606:4700:80fc:: SEA 89 | 2606:4700:80fd:: SEA 90 | 2606:4700:80fe:: SEA 91 | 2606:4700:80ff:: SEA 92 | 2606:4700:f1:: HKG 93 | 2606:4700:130:: YVR 94 | 2606:4700:131:: YVR 95 | 2606:4700:132:: YVR 96 | 2606:4700:133:: YVR 97 | 2606:4700:134:: YVR 98 | 2606:4700:135:: YVR 99 | 2606:4700:136:: YVR 100 | 2606:4700:137:: YVR 101 | 2606:4700:138:: YVR 102 | 2606:4700:139:: YVR 103 | 2606:4700:13a:: YVR 104 | 2606:4700:13b:: YVR 105 | 2606:4700:13c:: YVR 106 | 2606:4700:13d:: YVR 107 | 2606:4700:13e:: YVR 108 | 2606:4700:13f:: YVR 109 | 2606:4700:8d70:: SIN 110 | 2606:4700:8d71:: SIN 111 | 2606:4700:8d72:: SIN 112 | 2606:4700:8d73:: SIN 113 | 2606:4700:8d74:: SIN 114 | 2606:4700:8d75:: SIN 115 | 2606:4700:8d76:: SIN 116 | 2606:4700:8d77:: SIN 117 | 2606:4700:8d78:: SIN 118 | 2606:4700:8d79:: SIN 119 | 2606:4700:8d7a:: SIN 120 | 2606:4700:8d7c:: SIN 121 | 2606:4700:8d7d:: SIN 122 | 2606:4700:8d7e:: SIN 
123 | 2606:4700:8d7f:: SIN 124 | 2606:4700:8d90:: SIN 125 | 2606:4700:8d91:: SIN 126 | 2606:4700:8d92:: SIN 127 | 2606:4700:8d93:: SIN 128 | 2606:4700:8d94:: SIN 129 | 2606:4700:8d95:: SIN 130 | 2606:4700:8d96:: SIN 131 | 2606:4700:8d97:: SIN 132 | 2606:4700:8d98:: SIN 133 | 2606:4700:8d99:: SIN 134 | 2606:4700:8d9a:: SIN 135 | 2606:4700:8d9b:: SIN 136 | 2606:4700:8d9c:: SIN 137 | 2606:4700:8d9d:: SIN 138 | 2606:4700:8d9e:: SIN 139 | 2606:4700:8d9f:: SIN 140 | 2606:4700:81c0:: LAX 141 | 2606:4700:81c1:: LAX 142 | 2606:4700:81c2:: LAX 143 | 2606:4700:81c3:: LAX 144 | 2606:4700:81c4:: SEA 145 | 2606:4700:81c5:: SEA 146 | 2606:4700:81c6:: SEA 147 | 2606:4700:81c7:: SEA 148 | 2606:4700:81c8:: SEA 149 | 2606:4700:81c9:: SEA 150 | 2606:4700:81ca:: SEA 151 | 2606:4700:81cb:: SEA 152 | 2606:4700:81cc:: SEA 153 | 2606:4700:81cd:: SEA 154 | 2606:4700:81ce:: SEA 155 | 2606:4700:81cf:: SEA 156 | 2606:4700:8dd0:: SIN 157 | 2606:4700:8dd1:: SIN 158 | 2606:4700:8dd2:: SIN 159 | 2606:4700:8dd3:: SIN 160 | 2606:4700:8dd4:: SIN 161 | 2606:4700:85c0:: SIN 162 | 2606:4700:85c1:: SIN 163 | 2606:4700:8dd5:: SIN 164 | 2606:4700:85c2:: SIN 165 | 2606:4700:8dd6:: SIN 166 | 2606:4700:85c3:: SIN 167 | 2606:4700:8dd7:: SIN 168 | 2606:4700:85c4:: SIN 169 | 2606:4700:8dd8:: SIN 170 | 2606:4700:85c5:: SIN 171 | 2606:4700:8dd9:: SIN 172 | 2606:4700:85c6:: SIN 173 | 2606:4700:8dda:: SIN 174 | 2606:4700:85c7:: SIN 175 | 2606:4700:8ddb:: SIN 176 | 2606:4700:85c8:: SIN 177 | 2606:4700:8ddc:: SIN 178 | 2606:4700:85c9:: SIN 179 | 2606:4700:8ddd:: SIN 180 | 2606:4700:85ca:: SIN 181 | 2606:4700:8dde:: SIN 182 | 2606:4700:85cb:: SIN 183 | 2606:4700:8ddf:: SIN 184 | 2606:4700:85cc:: SIN 185 | 2606:4700:8de0:: SIN 186 | 2606:4700:85cd:: SIN 187 | 2606:4700:8de1:: SIN 188 | 2606:4700:85ce:: SIN 189 | 2606:4700:8de2:: SIN 190 | 2606:4700:85cf:: SIN 191 | 2606:4700:8de3:: SIN 192 | 2606:4700:85d0:: SIN 193 | 2606:4700:8de4:: SIN 194 | 2606:4700:85d1:: SIN 195 | 2606:4700:8de5:: SIN 196 | 2606:4700:85d2:: SIN 197 | 2606:4700:8de6:: SIN 198 | 2606:4700:85d3:: SIN 199 | 2606:4700:8de7:: SIN 200 | 2606:4700:85d4:: SIN 201 | 2606:4700:8de8:: SIN 202 | 2606:4700:85d5:: SIN 203 | 2606:4700:8de9:: SIN 204 | 2606:4700:85d6:: SIN 205 | 2606:4700:8dea:: SIN 206 | 2606:4700:85d7:: SIN 207 | 2606:4700:8deb:: SIN 208 | 2606:4700:85d8:: SIN 209 | 2606:4700:8dec:: SIN 210 | 2606:4700:85d9:: SIN 211 | 2606:4700:8ded:: SIN 212 | 2606:4700:85da:: SIN 213 | 2606:4700:8dee:: SIN 214 | 2606:4700:85db:: SIN 215 | 2606:4700:8def:: SIN 216 | 2606:4700:85dc:: SIN 217 | 2606:4700:85dd:: SIN 218 | 2606:4700:85de:: SIN 219 | 2606:4700:85df:: SIN 220 | 2606:4700:8390:: SIN 221 | 2606:4700:8391:: SIN 222 | 2606:4700:8392:: SIN 223 | 2606:4700:8393:: SIN 224 | 2606:4700:8394:: SIN 225 | 2606:4700:8395:: SIN 226 | 2606:4700:8396:: SIN 227 | 2606:4700:8397:: SIN 228 | 2606:4700:8398:: SIN 229 | 2606:4700:8399:: SIN 230 | 2606:4700:839a:: SIN 231 | 2606:4700:839b:: SIN 232 | 2606:4700:839c:: SIN 233 | 2606:4700:839d:: SIN 234 | 2606:4700:839e:: SIN 235 | 2606:4700:839f:: SIN 236 | 2606:4700:83a0:: SIN 237 | 2606:4700:83a1:: SIN 238 | 2606:4700:83a2:: SIN 239 | 2606:4700:83a3:: SIN 240 | 2606:4700:83a4:: SIN 241 | 2606:4700:83a5:: SIN 242 | 2606:4700:83a6:: SIN 243 | 2606:4700:83a7:: SIN 244 | 2606:4700:83a8:: SIN 245 | 2606:4700:83a9:: SIN 246 | 2606:4700:83aa:: SIN 247 | 2606:4700:83ab:: SIN 248 | 2606:4700:83ac:: SIN 249 | 2606:4700:83ad:: SIN 250 | 2606:4700:83ae:: SIN 251 | 2606:4700:83af:: SIN 252 | 2606:4700:3000:: HKG 
-------------------------------------------------------------------------------- /scrapy/cf-ipv6/ping.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from multiping import MultiPing 3 | 4 | 5 | def multi_ping(ip_list): 6 | # Create a MultiPing object to test three hosts / addresses 7 | mp = MultiPing(ip_list) 8 | 9 | # Send the pings to those addresses 10 | mp.send() 11 | 12 | # With a 1 second timout, wait for responses (may return sooner if all 13 | # results are received). 14 | responses, no_responses = mp.receive(1) 15 | 16 | pprint(sorted(responses.items(), key=lambda obj: obj[1], reverse=True)) 17 | 18 | 19 | if __name__ == '__main__': 20 | ip_list = [] 21 | f = open('cf_valid_ipv6.txt', 'r') 22 | item_list = f.readlines() 23 | for i in item_list: 24 | ip = i.split()[0] 25 | ip_list.append(ip) 26 | 27 | multi_ping(ip_list) 28 | -------------------------------------------------------------------------------- /scrapy/douyin-grils-down/README.md: -------------------------------------------------------------------------------- 1 | # 抖音 2 | 用Python全自动下载抖音小姐姐视频 3 | -------------------------------------------------------------------------------- /scrapy/douyin-grils-down/douyin_appium.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | from appium import webdriver 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support.ui import WebDriverWait 6 | from appium.webdriver.common.touch_action import TouchAction 7 | from selenium.webdriver.support import expected_conditions as EC 8 | 9 | 10 | def main(): 11 | # 设置驱动配置 12 | server = 'http://localhost:4723/wd/hub' 13 | desired_caps = { 14 | 'platformName': 'Android', 15 | 'deviceName': 'STF_AL00', 16 | 'appPackage': 'com.ss.android.ugc.aweme', 17 | 'appActivity': '.main.MainActivity', 18 | # 关闭手机软键盘 19 | 'unicodeKeyboard': True, 20 | 'resetKeyboard': True 21 | } 22 | driver = webdriver.Remote(server, desired_caps) 23 | wait = WebDriverWait(driver, 60) 24 | # 同意用户隐私协议,点击 25 | button_1 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/q6'))) 26 | button_1.click() 27 | # 禁止电话权限,点击 28 | button_2 = wait.until(EC.presence_of_element_located((By.ID, 'com.android.packageinstaller:id/permission_deny_button'))) 29 | button_2.click() 30 | # 禁止位置权限,点击 31 | button_3 = wait.until(EC.presence_of_element_located((By.ID, 'com.android.packageinstaller:id/permission_deny_button'))) 32 | button_3.click() 33 | time.sleep(2) 34 | # 向上滑动,进入抖音视频播放页面 35 | TouchAction(driver).press(x=515, y=1200).move_to(x=515, y=1000).release().perform() 36 | # 这里需要设置一个较长时间的延迟,因为抖音有引导操作和提示,需等待片刻 37 | time.sleep(20) 38 | # 点击抖音"喜欢"处,以此进入登录界面 39 | TouchAction(driver).press(x=950, y=800).release().perform() 40 | # 点击密码登录 41 | button_4 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/afg'))) 42 | button_4.click() 43 | # 输入账号 44 | button_5 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/ab_'))) 45 | button_5.send_keys('你的账号') 46 | # 输入密码 47 | button_6 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/aes'))) 48 | button_6.send_keys('你的密码') 49 | time.sleep(2) 50 | # 因为会跳出软键盘,会遮挡登录按钮,需点击软键盘取消 51 | TouchAction(driver).press(x=980, y=1850).release().perform() 52 | time.sleep(2) 53 | # 点击登录按钮 54 | button_7 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/abb'))) 55 | 
button_7.click() 56 | time.sleep(2) 57 | # 登录成功,进入抖音视频界面,点击下方标题栏 "我" 58 | TouchAction(driver).press(x=990, y=1850).release().perform() 59 | # 进入个人主页,点击关注处 60 | button_8 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/a_7'))) 61 | button_8.click() 62 | # 进入关注栏,点击第二个关注 63 | button_9 = wait.until(EC.presence_of_element_located((By.XPATH, ' /hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.RelativeLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.LinearLayout/android.support.v7.widget.RecyclerView/android.widget.RelativeLayout[2]/android.widget.RelativeLayout[1]'))) 64 | button_9.click() 65 | # 进入UP主主页,点击第一个视频 66 | button_10 = wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/aqm'))) 67 | button_10.click() 68 | # 不断下滑页面,直到底部 69 | while True: 70 | TouchAction(driver).press(x=515, y=1247).move_to(x=515, y=1026).release().perform() 71 | time.sleep(float(random.randint(5, 10))) 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /scrapy/douyin-grils-down/douyin_download.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import requests 3 | import os 4 | 5 | num = 0 6 | dom = [] 7 | folder_path = "F:/video/" 8 | os.makedirs(folder_path) 9 | df = pd.read_csv('douyin.csv', header=None, names=["url"]) 10 | 11 | # 对链接去重及刚进入抖音获取的视频链接 12 | for i in df['url'][2:]: 13 | if i not in dom: 14 | dom.append(i) 15 | 16 | # 下载视频 17 | for j in dom: 18 | url = j 19 | num += 1 20 | response = requests.get(url, stream=True) 21 | filename = str(num) + '.mp4' 22 | with open('F:\\video\\' + filename, 'ab+') as f: 23 | f.write(response.content) 24 | f.flush() 25 | print(filename + '下载完成') 26 | -------------------------------------------------------------------------------- /scrapy/douyin-grils-down/douyin_mitmdump.py: -------------------------------------------------------------------------------- 1 | 2 | def response(flow): 3 | urls = ['http://v1-dy', 'http://v3-dy', 'http://v6-dy', 'http://v9-dy'] 4 | # 对url进行筛选,只选取视频的url 5 | for url in urls: 6 | if url in flow.request.url: 7 | print('\n\n抖音视频\n\n') 8 | with open('douyin.csv', 'a+', encoding='utf-8-sig') as f: 9 | f.write(flow.request.url + '\n') 10 | 11 | -------------------------------------------------------------------------------- /scrapy/ipProxyPool/kuaidaili.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import datetime 4 | import random 5 | import requests 6 | import pandas as pd 7 | from bs4 import BeautifulSoup 8 | from faker import Factory 9 | 10 | 11 | def get_user_agent(num): 12 | """ 13 | 生成不同的 user-agent 14 | :param num: 生成个数 15 | :return: list 16 | """ 17 | factory = Factory.create() 18 | user_agent = [] 19 | for i in range(num): 20 | user_agent.append({'User-Agent': factory.user_agent()}) 21 | return user_agent 22 | 23 | 24 | def get_proxy(pages, ua_num, target_url): 25 | """ 26 | 爬取代理数据,清洗整合 27 | :param pages: 需要爬取页数 28 | :param ua_num: 需要user-agent个数 29 | :param target_url: 爬虫的目标地址,作为验证代理池ip的有效性 30 | :return: list 31 | """ 32 | headers = get_user_agent(ua_num) # 请求头 33 | proxy_list = [] # 最后需入库保存的代理池数据 34 | try: 35 | for num in range(0, pages): 36 | print('Start:第 %d 页请求' % (num + 1)) 37 | # 请求路径 38 | url = 'https://www.kuaidaili.com/free/inha/' + str(num + 1) + '/' 
39 | 40 | # 随机延时(randint生成的随机数n: a <= n <= b ;random产生 0 到 1 之间的随机浮点数) 41 | time.sleep(random.randint(1, 2) + random.random()) 42 | header_i = random.randint(0, len(headers) - 1) # 随机获取1个请求头 43 | 44 | # BeautifulSoup 解析 45 | html = requests.get(url, headers=headers[header_i]) 46 | soup = BeautifulSoup(html.text, 'lxml') 47 | 48 | # CSS 选择器 49 | ip = soup.select("td[data-title='IP']") 50 | port = soup.select("td[data-title='PORT']") 51 | degree = soup.select("td[data-title='匿名度']") 52 | proxy_type = soup.select("td[data-title='类型']") 53 | position = soup.select("td[data-title='位置']") 54 | speed = soup.select("td[data-title='响应速度']") 55 | last_time = soup.select("td[data-title='最后验证时间']") 56 | 57 | # 循环验证是否有效 58 | for i, p, dg, pt, ps, sp, lt in zip(ip, port, degree, proxy_type, position, speed, last_time): 59 | ip_port = str(i.get_text()) + ':' + str(p.get_text()) 60 | # 调用验证的方法 61 | flag = is_useful(ip_port, headers[header_i], target_url) 62 | if flag: 63 | # 拼装字段 64 | p_ip = str(i.get_text()) 65 | p_port = str(p.get_text()) 66 | p_degree = str(dg.get_text()) 67 | p_type = str(pt.get_text()) 68 | p_position = str(ps.get_text()).rsplit(' ', 1)[0] 69 | p_operator = str(ps.get_text()).rsplit(' ')[-1] 70 | p_speed = str(sp.get_text()) 71 | p_last_time = str(lt.get_text()) 72 | 73 | proxy_list.append([p_ip, p_port, p_degree, p_type, p_position, p_operator, p_speed, p_last_time]) 74 | print('End:第 %d 页结束!==========================' % (num + 1)) 75 | 76 | except Exception as e: 77 | print('程序 get_proxy 发生错误,Error:', e) 78 | 79 | finally: 80 | # 调用保存的方法 81 | write_proxy(proxy_list) 82 | 83 | return proxy_list 84 | 85 | 86 | def is_useful(ip_port, headers, target_url): 87 | """ 88 | 判断ip是否可用 89 | :param ip_port: ip+端口号 90 | :param headers: 随机请求头 91 | :param target_url: 爬虫的目标地址,作为验证代理池ip的有效性 92 | :return: bool 93 | """ 94 | url = target_url # 验证ip对目标地址的有效性 95 | proxy_ip = 'http://' + ip_port 96 | proxies = {'http': proxy_ip} 97 | flag = True 98 | try: 99 | requests.get(url=url, headers=headers, proxies=proxies, timeout=2) 100 | print("【可用】:" + ip_port) 101 | except Exception as e: 102 | print('程序 is_useful 发生错误,Error:', e) 103 | flag = False 104 | return flag 105 | 106 | 107 | def write_proxy(proxy_list): 108 | """ 109 | 将清洗好的列表数据,保存到xlsx文件 110 | :param proxy_list: 代理池数据列表 111 | :return: bool 112 | """ 113 | date_now = datetime.datetime.now().strftime('%Y%m%d%H%M%S') # 当前时间 114 | flag = True # 保存成功标志 115 | print('--- 开始保存 ---') 116 | try: 117 | df = pd.DataFrame(proxy_list, 118 | columns=['ip', 'port', 'degree', 'type', 'position', 'operator', 'speed', 'last_time']) 119 | df.to_excel(date_now + '_proxy.xlsx', index=False) 120 | print('--- 保存成功!---') 121 | except Exception as e: 122 | print('--- 保存失败!---:', e) 123 | flag = False 124 | return flag 125 | 126 | 127 | def read_ip(): 128 | """ 129 | 读取代理池,返回ip:port列表 130 | :return: list 131 | """ 132 | # 最新爬虫数据文件名(列表推导式写法) 133 | file_name = [f for f in os.listdir("./") if f.split('.')[-1] == 'xlsx'][-1] 134 | # 读取文件 135 | proxy_list = pd.read_excel('./' + file_name) 136 | proxy_list['port'] = proxy_list['port'].astype('str') # 先将端口号的整型转为字符串 137 | proxy_list['ip_port'] = proxy_list['ip'].str.cat(proxy_list['port'], sep=':') # 组合成ip+port 138 | return list(proxy_list['ip_port']) 139 | 140 | 141 | def main(): 142 | """ 143 | 主方法 144 | """ 145 | pages = 10 # 定义爬取页数 146 | ua_num = 3 # 定义需生成user-agent个数 147 | target_url = 'https://everia.club/' # 爬虫的目标地址,作为验证代理池ip的有效性 148 | proxy_list = get_proxy(pages, ua_num, target_url) 149 | print(proxy_list) 150 | 151 | 152 
| if __name__ == '__main__': 153 | # 1.主方法 154 | # main() 155 | # 2.读取代理池 156 | print(read_ip()) 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /scrapy/jdCellPhone/cellPhone.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import time 4 | import re 5 | import requests 6 | import pymongo 7 | import numpy as np 8 | import pandas as pd 9 | from lxml import etree 10 | from wordcloud import WordCloud 11 | import matplotlib.pyplot as plt 12 | 13 | DB = "cellphone" 14 | 15 | def fix_url(string): 16 | if re.match(r"http://", string): 17 | return string 18 | if re.match(r"//", string): 19 | return "http:" + string 20 | 21 | def get_page_num(): 22 | url = "https://list.jd.com/list.html?cat=9987,653,655" 23 | r = requests.get(url, verify=False) 24 | content = r.content 25 | root = etree.HTML(content) 26 | page_nodes = root.xpath('.//span[@class="p-num"]/a') 27 | for node in page_nodes: 28 | if node.attrib["class"] == "": 29 | page_num = int(node.text) 30 | return page_num 31 | 32 | def get_price(skuid): 33 | url = "https://c0.3.cn/stock?skuId=" + str(skuid) + "&area=1_72_4137_0&venderId=1000004123&cat=9987,653,655&buyNum=1&choseSuitSkuIds=&extraParam={%22originid%22:%221%22}&ch=1&fqsp=0&pduid=15379228074621272760279&pdpin=&detailedAdd=null&callback=jQuery3285040" 34 | r = requests.get(url, verify=False) 35 | content = r.content.decode('GBK') 36 | matched = re.search(r'jQuery\d+\((.*)\)', content, re.M) 37 | if matched: 38 | data = json.loads(matched.group(1)) 39 | price = float(data["stock"]["jdPrice"]["p"]) 40 | return price 41 | return 0 42 | 43 | def get_item(skuid, url): 44 | price = get_price(skuid) 45 | r = requests.get(url, verify=False) 46 | content = r.content 47 | root = etree.HTML(content) 48 | nodes = root.xpath('.//div[@class="Ptable"]/div[@class="Ptable-item"]') 49 | params = {"price": price, "skuid": skuid} 50 | for node in nodes: 51 | text_nodes = node.xpath('./dl')[0] 52 | k = "" 53 | v = "" 54 | for text_node in text_nodes: 55 | if text_node.tag == "dt": 56 | k = text_node.text 57 | elif text_node.tag == "dd" and "class" not in text_node.attrib: 58 | v = text_node.text 59 | params[k] = v 60 | return params 61 | 62 | def get_cellphone(page): 63 | url = "https://list.jd.com/list.html?cat=9987,653,655&page={}&sort=sort_rank_asc&trans=1&JL=6_0_0&ms=4#J_main".format(page) 64 | r = requests.get(url, verify=False) 65 | content = r.content.decode("utf-8") 66 | root = etree.HTML(content) 67 | cell_nodes = root.xpath('.//div[@class="p-img"]/a') 68 | client = pymongo.MongoClient() 69 | db = client[DB] 70 | for node in cell_nodes: 71 | item_url = fix_url(node.attrib["href"]) 72 | matched = re.search('item.jd.com/(\d+)\.html', item_url) 73 | skuid = int(matched.group(1)) 74 | saved = db.items.find({"skuid": skuid}).count() 75 | if saved > 0: 76 | print(saved) 77 | continue 78 | item = get_item(skuid, item_url) 79 | db.items.insert(item) 80 | 81 | def norm_weight(weight_str): 82 | matched = re.search(r'(\d+)', weight_str) 83 | weight = 0 84 | if matched: 85 | weight = matched.group(1) 86 | return weight 87 | 88 | def norm_screen_size(screen_size_str): 89 | matched = re.search(r'(\d+\.\d+)', screen_size_str) 90 | screen_size = 0 91 | if matched: 92 | screen_size = float(matched.group(1)) 93 | return screen_size 94 | 95 | def norm_rom(rom_str): 96 | rom = 0 97 | matched = re.search(r'(\d+)MB', rom_str) 98 | if matched: 99 | rom = float(matched.group(1)) / 
1024 100 | matched = re.search(r'(\d+)TB', rom_str) 101 | if matched: 102 | rom = float(matched.group(1)) * 1024 103 | matched = re.search(r'(\d+)GB', rom_str) 104 | if matched: 105 | rom = float(matched.group(1)) 106 | return rom 107 | 108 | def norm_ram(ram_str): 109 | ram = 0 110 | matched = re.search(r'(\d+)MB', ram_str) 111 | if matched: 112 | ram = float(matched.group(1)) / 1024 113 | matched = re.search(r'(\d+)GB', ram_str) 114 | if matched: 115 | ram = float(matched.group(1)) 116 | return ram 117 | 118 | def norm_screen_res(screen_res_str): 119 | width = 0 120 | height = 0 121 | matched = re.search(r'(\d+)[x*](\d+)', screen_res_str) 122 | if matched: 123 | width = matched.group(2) 124 | height = matched.group(1) 125 | return (width, height) 126 | 127 | def norm_battery_cap(battery_cap_str): 128 | items = re.findall(r'(\d+)', battery_cap_str) 129 | items = list(map(lambda x: int(x), items)) 130 | if len(items) == 0: 131 | return 0 132 | return max(items) 133 | 134 | def norm_front_cam(front_cam_str): 135 | pass 136 | 137 | def norm_back_cam(back_cam_str): 138 | pass 139 | 140 | def norm_dual_sim(dual_sim_str): 141 | if dual_sim_str is None: 142 | return False 143 | 144 | dual_sim = False 145 | matched = re.search(r'双卡双待', dual_sim_str) 146 | if matched: 147 | dual_sim = True 148 | return dual_sim 149 | 150 | def preprocess(items): 151 | result = [] 152 | for item in items: 153 | if '品牌' not in item: 154 | continue 155 | 156 | weight_str = item.get('机身重量(g)', '') 157 | weight = norm_weight(weight_str) 158 | screen_size_str = item.get('主屏幕尺寸(英寸)', '') 159 | screen_size = norm_screen_size(screen_size_str) 160 | rom_str = item.get('ROM', '') 161 | rom = norm_rom(rom_str) 162 | ram_str = item.get('RAM', '') 163 | ram = norm_ram(ram_str) 164 | screen_res_str = item.get('分辨率', '') 165 | screen_res_width, screen_res_height = norm_screen_res(screen_res_str) 166 | battery_cap_str = item.get('电池容量(mAh)', '') 167 | battery_cap = norm_battery_cap(battery_cap_str) 168 | front_cam_str = item.get('前置摄像头', '') 169 | front_cam = norm_front_cam(front_cam_str) 170 | back_cam_str = item.get('后置摄像头') 171 | back_cam = norm_back_cam(back_cam_str) 172 | dual_sim_str = item.get('双卡机类型') 173 | dual_sim = norm_dual_sim(dual_sim_str) 174 | 175 | cellphone = { 176 | "brand": item.get('品牌'), 177 | "model": item.get('型号'), 178 | "color": item.get('机身颜色'), 179 | "weight": weight, 180 | "material": item.get('机身材质分类'), 181 | "cpu_brand": item.get('CPU品牌'), 182 | "cpu_freq": item.get('CPU频率'), 183 | "cpu_core": item.get('CPU核数'), 184 | "cpu_model": item.get('CPU型号'), 185 | "gpu_model": item.get('GPU型号'), 186 | "dual_sim": dual_sim, 187 | "network_4g": item.get('4G网络'), 188 | "rom": rom, 189 | "ram": ram, 190 | "screen_size": screen_size, 191 | "screen_res_width": screen_res_width, 192 | "screen_res_height": screen_res_height, 193 | "screen_mat": item.get('屏幕材质类型'), 194 | "battery_cap": battery_cap, 195 | "front_cam": item.get('前置摄像头'), 196 | "back_cam": item.get('后置摄像头'), 197 | "price": item.get('price'), 198 | } 199 | result.append(cellphone) 200 | return result 201 | 202 | def query(): 203 | client = pymongo.MongoClient() 204 | db = client[DB] 205 | items = db.items.find({}) 206 | result = preprocess(items) 207 | df = pd.DataFrame(result) 208 | #df.drop_duplicates(subset=["brand", "model", "rom", "ram"], inplace=True) 209 | df_res = df[df.cpu_brand=="骁龙(Snapdragon)"][df.battery_cap >= 3000][df.rom >= 64][df.ram >= 6][df.dual_sim == True][df.price<=1500][df.brand=="小米(MI)"] 210 | print(df_res[["brand", "model", 
"color", "cpu_brand", "cpu_freq", "cpu_core", "cpu_model", "rom", "ram", "battery_cap", "price"]].sort_values(by=["price", "battery_cap"], ascending=[True, False]).to_csv("cellPhone.csv", encoding="GBK")) 211 | return df_res 212 | 213 | if __name__ == "__main__": 214 | parser = argparse.ArgumentParser() 215 | parser.add_argument("--save", help="save data from web", action="store_true", dest="save") 216 | parser.add_argument("--query", help="query data from DB", action="store_true", dest="query") 217 | args = parser.parse_args() 218 | 219 | if args.save: 220 | page_num = get_page_num() 221 | for i in range(page_num): 222 | get_cellphone(i) 223 | elif args.query: 224 | query() 225 | -------------------------------------------------------------------------------- /scrapy/postgraduate_colleges/PostgraduateColleges.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinuxy/Python/98e06bb62af035dbf3f9bd234c83b276c343973f/scrapy/postgraduate_colleges/PostgraduateColleges.xlsx -------------------------------------------------------------------------------- /scrapy/postgraduate_colleges/postgraduatecolleges.csv: -------------------------------------------------------------------------------- 1 |  2 | -------------------------------------------------------------------------------- /scrapy/postgraduate_colleges/字段属性.txt: -------------------------------------------------------------------------------- 1 | university: 大学名称 2 | attributes: 院校属性 3 | -------------------------------------------------------------------------------- /scrapy/scrapy163musicComments/scrapyWyycomments.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 3.6.1 2 | # -*- coding:utf-8 -*- 3 | # ____author___='Yinux' 4 | # -*- coding:utf-8 -*- 5 | 6 | import json 7 | import random 8 | 9 | import requests 10 | import time 11 | import csv 12 | import codecs 13 | 14 | 15 | """ 16 | 爬取网易云音乐歌曲的精彩评论 17 | @Author monkey 18 | @Date 2018-6-6 19 | """ 20 | 21 | 22 | def start_spider(song_id): 23 | """ 评论数据采用 AJAX 技术获得, 下面才是获取评论的请求地址 """ 24 | url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_{}?csrf_token='.format(song_id) 25 | 26 | headers = { 27 | 'User-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36', 28 | 'Origin': 'http://music.163.com', 29 | 'Referer': 'http://music.163.com/song?id={}'.format(song_id), 30 | } 31 | 32 | formdata = { 33 | 'params': '57Wh2mgebLOOPQVBc+B2wz4sCCH/nXZFEoTc/XNySiqT0V7ZxUADzDNgTXXhYgAJ5BNMryMgxhdwNzF1GyxDZo3iR9/YYbWgCAQHC5DCDuObqvxNcOcnQDaRqJCrqQcrEABW1SwKitfbD3wMEyB4tJu+rU8goSwg2FP/PBBLs9DVs1iWdWGjV6CdrocA36Rs', 34 | 'encSecKey': '63774137ba4f5cc60d1b6a3bc14985a9563a7bfdec4f3e74297ffc07514adf18f90620933a01c2db4ca989cc4e1dfc49789981424c294a34e48c2cbe7aa51533a5cc5b5776a9e499cd08770bc596655dbe8e001d1ed5fd47a27dd195128480820cc67a799d341f95d447e3522851f2b64ad1cb8350e2015b265b9e684179351c', 35 | } 36 | 37 | response = requests.post(url, headers=headers, data=formdata) 38 | print('请求 [ ' + url + ' ], 状态码为 ') 39 | print(response.status_code) 40 | # get_hot_comments(response.text) 41 | # 将数据写到 CSV 文件中 42 | write_to_file(get_hot_comments(response.text)) 43 | 44 | def get_163music(url): 45 | user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36", 46 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/67.0.3396.99 Safari/537.36", 47 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/61.0", 48 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36", 49 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"] 50 | header={'User-Agent':'Mozilla/5.0'} 51 | header['User-Agent'] = random.choice(user_agent_list) 52 | text = requests.session() 53 | response=text.get(url,headers = header).content 54 | text = BeautifulSoup(response,'lxml') 55 | content = text.find('ul',{'class':'f-hide'}) 56 | playlist = [] 57 | site = 'https://music.163.com/#' 58 | for music in content.find_all('a'): 59 | playlist.append(site + music['href']) 60 | # print('{} : {}'.format(music.text, music['href'])) 61 | return playlist 62 | 63 | def get_hot_comments(response): 64 | """ 获取精彩评论 65 | 请求返回结果是 Json 数据格式, 使用 json.loads(response) 将其转化为字典类型, 就可以使用 key-value 形式获取值 66 | """ 67 | data_list = [] 68 | data = {} 69 | 70 | for comment in json.loads(response)['hotComments']: 71 | data['userId'] = comment['user']['userId'] 72 | data['nickname'] = comment['user']['nickname'] 73 | data['content'] = comment['content'] 74 | data['likedCount'] = comment['likedCount'] 75 | data_list.append(data) 76 | data = {} 77 | # print(data_list) 78 | return data_list 79 | 80 | 81 | def write_to_file(datalist): 82 | print('开始将数据持久化……') 83 | file_name = '网易云音乐精彩评论.csv' 84 | 85 | with codecs.open(file_name, 'a+', 'utf-8_sig') as csvfile: 86 | filednames = ['用户Id', '昵称', '评论内容', '点赞数'] 87 | writer = csv.DictWriter(csvfile, fieldnames=filednames) 88 | 89 | writer.writeheader()  # note: the file is opened in 'a+' mode, so this appends a header row on every call 90 | for data in datalist: 91 | print(data) 92 | try: 93 | writer.writerow({filednames[0]: data['userId'], 94 | filednames[1]: data['nickname'], 95 | filednames[2]: data['content'], 96 | filednames[3]: data['likedCount']}) 97 | except UnicodeEncodeError: 98 | print("编码错误, 该数据无法写到文件中, 直接忽略该数据") 99 | 100 | print('成功将数据写入到 ' + file_name + ' 中!') 101 | 102 | 103 | def get_song_id(url): 104 | """ 从 url 中截取歌曲的 id """ 105 | song_id = url.split('=')[1] 106 | return song_id 107 | 108 | 109 | def main(): 110 | url = 'http://music.163.com/playlist?id=987444580' 111 | songs_url_list = get_163music(url) 112 | 113 | for each in songs_url_list: 114 | start_spider(get_song_id(each)) 115 | time.sleep(random.randint(5, 8)) 116 | 117 | 118 | if __name__ == '__main__': 119 | main() 120 | 121 | -------------------------------------------------------------------------------- /scrapy/vip-item/README.md: -------------------------------------------------------------------------------- 1 | # 使用bs4唯品会(vip.com)抓取唯品会商品信息 # 2 | ## 微信公众号 ## 3 | ![](https://cdn.jsdelivr.net/gh/InfiniteYinux/cloud@master/qrcode.jpg) 4 | 欢迎扫码关注 5 | ## 博客 ## 6 | [YINUXY'S BLOG](https://blog.yinuxy.com/) 7 | 8 | ## 安装& 使用 ## 9 | ### 安装依赖 ### 10 | `pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com` 11 | ### 获取代码 ### 12 | `git clone git@github.com:InfiniteYinux/Python.git` 13 | ### 运行 ### 14 | ``` 15 | cd scrapy\vip-item 16 | python vip.py 17 | ``` 18 | ## 更新 ## 19 | 1. 
2020-05-02 新增数据存储方式:存入数据库 -------------------------------------------------------------------------------- /scrapy/vip-item/requirements.txt: -------------------------------------------------------------------------------- 1 | json 2 | random 3 | requests 4 | BeautifulSoup 5 | selenium 6 | time 7 | pymysql -------------------------------------------------------------------------------- /scrapy/vip-item/vip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Apr 30 21:01:12 2020 4 | 5 | @author: Yinux 6 | """ 7 | 8 | import json 9 | import random 10 | import requests 11 | from bs4 import BeautifulSoup 12 | from selenium import webdriver 13 | from time import sleep 14 | import time 15 | import pymysql.cursors 16 | #FEED_EXPORT_ENCODING = 'utf-8' 17 | 18 | class VipSpider(object): 19 | def __init__(self, url, search, start_page, end_page): 20 | ua = random.choice(self.user_agent_list) 21 | self.url = url 22 | self.search = search 23 | self.start_page = start_page 24 | self.end_page = end_page 25 | self.headers = { 26 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36", 27 | "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 28 | "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 29 | "User-Agent":"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 30 | "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 31 | "User-Agent":"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 32 | "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 33 | "User-Agent":"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 34 | "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 35 | "User-Agent":"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 36 | "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 37 | "User-Agent":"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 38 | "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 39 | "User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 40 | "User-Agent":"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 41 | "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 42 | "User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"} 43 | self.proxies = { 44 | "http:":"123.101.213.98:9999", 45 | "http":"114.101.42.127:65309", 46 | "http":"39.106.194.91:808", 47 | "http":"122.51.231.113:8080", 48 | "http":"36.248.132.250:9999", 49 | "http":"180.118.128.54:9000", 50 | "http":"113.195.224.194:9999", 51 | "http":"39.108.59.34:8118", 52 | 
"http":"47.94.200.124:3128", 53 | "http":"163.204.246.83:9999", 54 | "http":"113.124.94.72:9999" 55 | } 56 | self.driver = webdriver.Chrome() 57 | self.conn=pymysql.connect(host="127.0.0.1", 58 | user="username", 59 | passwd="pasword", 60 | charset='utf8mb4', 61 | cursorclass=pymysql.cursors.DictCursor) 62 | self.cur = self.conn.cursor() 63 | self.cur.execute("CREATE DATABASE IF NOT EXISTS `jobs`") 64 | self.cur.execute("USE jobs") 65 | self.cur.execute("DROP TABLE IF EXISTS `web_51jobs`") 66 | self.cur.execute("CREATE TABLE `web_51jobs` (`id` INT PRIMARY KEY AUTO_INCREMENT,`position` varchar(200) DEFAULT NULL,`wages` varchar(200) DEFAULT NULL,`region` varchar(200) DEFAULT NULL,`experience` varchar(200) DEFAULT NULL,`education` varchar(200) DEFAULT NULL,`need_people` varchar(100) DEFAULT NULL,`publish_date` varchar(200) DEFAULT NULL,`english` varchar(300) DEFAULT NULL,`welfare_tags` varchar(200) DEFAULT NULL,`job_information` varchar(200) DEFAULT NULL,`work_address` varchar(200) DEFAULT NULL,`company_name` varchar(200) DEFAULT NULL,`company_nature` varchar(200) DEFAULT NULL,`company_scale` varchar(200) DEFAULT NULL,`company_industry` varchar(200) DEFAULT NULL,`company_information` varchar(200) DEFAULT NULL,PRIMARY KEY (`id`))") 67 | 68 | 69 | def handle_click(self): 70 | self.driver.get(self.url) 71 | self.driver.find_elements_by_xpath("//*[@id='J_main_nav_link']/li[13]/a")[0].click() 72 | sleep(2) 73 | self.driver.find_elements_by_xpath("//*[@id='J-search']/div[1]/input")[0].send_keys(self.search) 74 | sleep(2) 75 | self.driver.find_elements_by_xpath("//*[@id='J-search']/div[1]/a/span")[0].click() 76 | sleep(3) 77 | 78 | def handle_url(self, page): 79 | Durl = self.driver.current_url # "https://category.vip.com/suggest.php?keyword=%E7%AF%AE%E7%90%83&ff=235|12|1|1" 80 | index = Durl.rfind("&") 81 | Durl = Durl[:index] 82 | data = { 83 | "page": page 84 | } 85 | res = requests.get(url=Durl, params=data, headers=random.choice(self.headers),proxies=random.choice(self.proxies)) 86 | newurl = res.url 87 | print(newurl) 88 | return newurl 89 | 90 | def scroll_page(self, req): 91 | self.driver.get(req) 92 | sleep(3) 93 | for x in range(20): 94 | js = "var q=document.documentElement.scrollTop=10000" 95 | self.driver.execute_script(js) # 执行脚本(滚动) 96 | sleep(5) 97 | html = self.driver.page_source 98 | 99 | return html 100 | 101 | def downloadin(self, url): 102 | req = requests.get(url,headers=self.headers) 103 | soup = BeautifulSoup(req.content,"lxml") 104 | GoodsList = soup.select("div.pi-title-box") 105 | for div in GoodsList: 106 | shopname = div.a.get_text() 107 | try: 108 | desc = div.select("span.goods-description-title")[0].get_text() 109 | except: 110 | desc = '' 111 | return shopname,desc 112 | 113 | def download(self, request): 114 | soup = BeautifulSoup(request, "lxml") 115 | SectionList = soup.select("section#J_searchCatList")[0] 116 | GoodsList = SectionList.select("div.c-goods") 117 | items = [] 118 | for div in GoodsList: 119 | item = {} 120 | itemlink = div.select("h4.goods-info a")[0].get('href') 121 | imageslink = div.img["data-original"] 122 | title = div.select("h4.goods-info a")[0].get_text() 123 | discount = div.select("div.goods-info span")[0].get_text() 124 | pricewra = div.select("div.goods-info em")[0].get_text() 125 | marprice = div.select("div.goods-info del.goods-market-price ")[0].get_text() 126 | item["商品链接"] = 'http:' + itemlink 127 | item["图片链接"] = 'http:' + imageslink 128 | item["商品名称"] = title 129 | item["商品折扣"] = discount 130 | item["特卖价格"] = pricewra 131 | 
item["原始价格"] = marprice 132 | item["商铺名称"], item["商品描述"] = self.downloadin(item["商品链接"]) 133 | self.process_item(item) 134 | items.append(item) 135 | 136 | return items 137 | 138 | def process_item(self,item): 139 | # self.cur = self.conn.cursor() 140 | try: 141 | itemurl = item["商品链接"] 142 | imageurl = item["图片链接"] 143 | title = item["商品名称"] 144 | discount = item["商品折扣"] 145 | saleprice = item["特卖价格"] 146 | oldprice = item["原始价格"] 147 | shopname = item["商铺名称"] 148 | description = item["商品描述"] 149 | sql = "INSERT INTO `Cosmetics` (`title`, `discount`,`saleprice`,`oldprice`,`shopname`,`description`, `imageurl`,`itemurl`) VALUES ('"+title+"','"+discount+"','"+saleprice+"','"+oldprice+"','"+shopname+"','"+description+"','"+imageurl+"','"+itemurl+"')" 150 | self.cur.execute(sql) 151 | self.conn.commit() 152 | # self.conn.close() 153 | except Exception as err: 154 | print(err) 155 | 156 | def startSpider(self): 157 | htmlList = [] 158 | for page in range(int(self.start_page), int(self.end_page) + 1): 159 | print("正在抓取第"+ str(page) +"页的数据") 160 | start = time.time() 161 | if page == 1: 162 | self.handle_click() 163 | req = self.handle_url(page) 164 | newhtml = self.scroll_page(req) 165 | htmlList += self.download(newhtml) 166 | else: 167 | req = self.handle_url(page) 168 | newhtml = self.scroll_page(req) 169 | htmlList += self.download(newhtml) 170 | end = time.time() 171 | print("第"+ str(page) +"页的数据抓取完毕,用时"+ str(end-start) +"s") 172 | # 【数据的存储】写入json数据 173 | # 将列表转化成json字符串 174 | 175 | string = json.dumps(htmlList,ensure_ascii=False) 176 | with open("vip2.json", "w", encoding="utf-8") as fp: 177 | fp.write(string) 178 | self.conn.close() 179 | 180 | 181 | def main(): 182 | starts = time.time() 183 | url = "http://www.vip.com/" 184 | search = '化妆品' 185 | # search = input("请输入你要搜索的商品:") 186 | start_page = 1 187 | # start_page = input("请输入你要爬取的起始页:") 188 | end_page = 40 189 | # end_page = input("请输入你要爬取的结束页:") 190 | spider = VipSpider(url, search, start_page, end_page) 191 | spider.startSpider() 192 | ends = time.time() 193 | print("程序运行完毕,总用时"+ str(int(ends-starts)/60) +"分钟") 194 | 195 | if __name__ == '__main__': 196 | main() -------------------------------------------------------------------------------- /scrapy/ximalaya/ximalaya.py: -------------------------------------------------------------------------------- 1 |  2 | import requests 3 | from bs4 import BeautifulSoup 4 | import re 5 | import os 6 | import random 7 | import time 8 | 9 | ''' 10 | 作者:pk哥 11 | 公众号:brucepk 12 | 日期:2018/10/11 13 | 代码解析详见公众号「brucepk」。 14 | 15 | 如有疑问或需转载,请联系微信号:dyw520520,备注来意,谢谢。 16 | 如需加入python技术交流群,请加我微信,备注「进群」,我拉你进群,一起讨论交流,共同成长。 17 | ''' 18 | 19 | 20 | def gethtml(url): # 获取网站 html 信息 21 | headers = { 22 | 'User-Agent': 23 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'} 24 | # 用的代理 ip,如果被封的,在http://www.xicidaili.com/换一个 25 | proxy_addr = {'http': '221.7.255.168:8080'} 26 | html = requests.get(url, headers=headers, proxies=proxy_addr) # 请求网页信息 27 | return html 28 | 29 | 30 | def getid(): # 获取专辑的 id 和标题信息 31 | keyword = input('请输入你要查找的音频关键字:\n') # 输入需要下载音频的关键字 32 | albumurl = 'https://www.ximalaya.com/search/album/{}/sc/p1'.format(keyword) # 输入关键字,拼接链接 33 | html = gethtml(albumurl) 34 | soup = BeautifulSoup(html.text, 'lxml') 35 | info = soup.select('#searchPage div.search-type div.common-tab-content div.xm-loading ul div ' 36 | 'a.xm-album-title.ellipsis-2') # 提取音频文件的信息 37 | idinfo = re.compile('href="/.*?"').findall(str(info)) # 提取专辑中 id 38 | 
titleinfo = re.compile('title=".*?"').findall(str(info)) # 提取专辑中标题信息 39 | ids = [] 40 | titles = [] 41 | for j in idinfo: 42 | id = str(j).split('/')[2] 43 | ids.append(id) 44 | for t in titleinfo: 45 | # 处理下标题,防止创建文件夹失败 46 | title = str(t).split('"')[1].replace('\\', ' ').replace('/', ' ').replace(':', ' ').replace('*', ' ')\ 47 | .replace('?', ' ').replace('"', ' ').replace('<', ' ').replace('>', ' ').replace('|', ' ') 48 | titles.append(title) 49 | return ids, titles 50 | 51 | 52 | def downm4a(albumId): 53 | # 获取专辑下的音频总数 54 | counturl = 'https://www.ximalaya.com/revision/album/getTracksList?albumId={}&pageNum=1'.format(albumId) 55 | chtml = gethtml(counturl) 56 | cjson = chtml.json() 57 | trackTotalCount = int(cjson['data']['trackTotalCount']) 58 | if trackTotalCount < 30 or trackTotalCount == 30: # 音频数小于等于 30 时,只有一页 59 | pageNum = 1 60 | else: 61 | if trackTotalCount % 30 == 0: # 音频数大于 30 时,且是30的倍数时 62 | pageNum = trackTotalCount // 30 63 | else: 64 | pageNum = (trackTotalCount // 30) + 1 # 音频数大于 30 时,不是30的倍数时 65 | for num in range(1, pageNum+1): 66 | m4aurl = 'https://www.ximalaya.com/revision/play/album?albumId={}&pageNum={}&pageSize=30'.format(albumId, num) # 拼接可下载音频信息的链接 67 | mhtml = gethtml(m4aurl) 68 | mjson = mhtml.json() 69 | for i in range(30): # 一个页面最多30个音频文件 70 | try: 71 | trackName = mjson['data']['tracksAudioPlay'][i]['trackName'] # 提取音频标题 72 | src = mjson['data']['tracksAudioPlay'][i]['src'] # 提取可下载链接 73 | print(trackName) 74 | print(src) 75 | if str(src) in('null', 'None'): # 如果为付费音频,则跳出循环,继续下载下一个专辑 76 | print('此为付费音频,无法下载') 77 | break 78 | data = requests.get(src).content 79 | with open('%s.m4a' % trackName, 'wb') as f: # 下载音频 80 | f.write(data) 81 | except IndexError: 82 | print('当前专辑已爬取完成!') 83 | continue 84 | 85 | 86 | def mkdir(): # 判断目录是否存在,不存在的话则自动创建 87 | ids, titles = getid() 88 | for title, albumId in zip(titles, ids): 89 | print(title) 90 | path = 'E:\\spiderproject\\ximalaya\\{}'.format(title) # 以音频名称命名 91 | isExists = os.path.exists(path) 92 | if not isExists: 93 | print('创建目录{}'.format(title)) # 目录不存在则创建一个 94 | os.makedirs(path) # 创建目录 95 | os.chdir(path) # 切换到创建的文件夹 96 | downm4a(albumId) # 调用函数下载音频到该目录下 97 | else: 98 | print('{}目录已存在,即将保存!'.format(title)) 99 | os.chdir(path) # 切换到创建的文件夹 100 | downm4a(albumId) # 目录已存在时直接保存 101 | time.sleep(int(format(random.randint(2, 6)))) # 随机等待 102 | 103 | 104 | if __name__ == '__main__': 105 | mkdir() 106 | 107 | 108 | -------------------------------------------------------------------------------- /scrapy/yunzhanImgToPdf/README.md: -------------------------------------------------------------------------------- 1 | 目前只能下载:https://book.yunzhan365.com/xxxx/xxxx/mobile/index.html 2 | 或者:http://www.yunzhan365.com/xxxxxxxxx.html 这样的网址,其他的网址会不行! 3 | 如果是在纯文本展示页面获取的链接,到翻书页面获取网址在开始采集!(ps:主要是懒,不想在多写几个判断条件了!) 
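As a quick pre-check before running main.py, a snippet like the one below can reject unsupported links up front. It is an editorial sketch rather than part of this project: the regular expressions are only an approximation of the two supported URL shapes described above, and `is_supported` is a made-up helper name.

```python
import re

# Approximate patterns for the two URL forms this tool accepts (an assumption,
# not taken from main.py itself).
SUPPORTED_PATTERNS = [
    re.compile(r"^https?://book\.yunzhan365\.com/[^/]+/[^/]+/mobile/index\.html$"),
    re.compile(r"^https?://(www\.)?yunzhan365\.com/[^/]+\.html$"),
]

def is_supported(url: str) -> bool:
    """Return True if the URL looks like one of the two supported forms."""
    return any(p.match(url.strip()) for p in SUPPORTED_PATTERNS)

print(is_supported("https://book.yunzhan365.com/abcd/efgh/mobile/index.html"))  # True
print(is_supported("https://example.com/books/12345.html"))                     # False
```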
-------------------------------------------------------------------------------- /scrapy/yunzhanImgToPdf/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | from lxml import etree 4 | import img2pdf 5 | 6 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'} 7 | url = input('请输入云展网图集网址:') 8 | splurl = url.split('/') # split the URL so the link type can be checked below 9 | if 'index.html' not in splurl: # decide which of the two supported link forms this is 10 | res = requests.get(url , headers=headers) # fetch the page source 11 | res.encoding = res.apparent_encoding 12 | xml = etree.HTML(res.text).xpath('//div[@class="show-book-title"]/a/@href')[0].split('/') # grab the book.yunzhan365.com link and split it 13 | purl = xml[2] + '/' + xml[3] + '/' + xml[4] + '/files/' + 'mobile/' # build the image download URL prefix 14 | pathname = etree.HTML(res.text).xpath('//div[@class="show-book-title"]/a/text()') # get the book title 15 | else: 16 | res = requests.get(url , headers=headers) # fetch the page source 17 | res.encoding = res.apparent_encoding 18 | pathname = etree.HTML(res.text).xpath('/html/head/title/text()') # get the book title 19 | purl = splurl[2] + '/' + splurl[3] + '/' + splurl[4] + '/files/' + 'mobile/' # build the image URL prefix 20 | 21 | path = './' # output directory 22 | if not os.path.exists(path): 23 | os.makedirs(path) # create the directory if it does not exist 24 | m = 0 # page counter, also used as the image file name 25 | imgs = [] # list that will hold the downloaded image bytes 26 | with open(path + '/' + str(pathname[0]) + '.pdf' , 'wb') as f: # create and open the output PDF file 27 | while True: # keep fetching pages until one is missing 28 | m += 1 # next page number 29 | surl1 = 'http://' + purl + str(m) +'.jpg' # build the image URL 30 | picurl = requests.get(surl1) # download the image 31 | if picurl.status_code == 200: # if the page exists, keep its content 32 | imgs.append(picurl.content) 33 | else: 34 | f.write(img2pdf.convert(imgs)) # write all collected images into the PDF 35 | print(f'采集完毕!一共采集了{m -1}张,生成的pdf是{path}目录下【{pathname[0]}.pdf】') 36 | break # stop the loop
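# Editor's note: a possible hardening of the loop above, sketched here rather than
# taken from the original script. As written, the PDF is produced only when a page
# request finally fails, so a wrong URL prefix collects nothing before the final
# convert-and-write step. The helper below bounds the loop and checks for that case
# first; the name download_book and the max_pages cap are assumptions of this sketch.
import requests
import img2pdf

def download_book(purl, pdf_path, headers=None, max_pages=500):
    """Fetch 1.jpg, 2.jpg, ... under http://<purl> until one is missing, then write a single PDF."""
    pages = []
    for n in range(1, max_pages + 1):  # bounded, instead of `while True`
        r = requests.get('http://' + purl + str(n) + '.jpg', headers=headers, timeout=10)
        if r.status_code != 200:
            break  # the first missing page marks the end of the book
        pages.append(r.content)
    if not pages:
        raise ValueError('no pages downloaded, check that the URL prefix is correct')
    with open(pdf_path, 'wb') as f:
        f.write(img2pdf.convert(pages))  # same img2pdf call the script itself uses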
-------------------------------------------------------------------------------- /scrapy/zhihu-pretty-girl/READEME.md: -------------------------------------------------------------------------------- 1 | ## 微信公众号 ## 2 | ![](https://cdn.jsdelivr.net/gh/InfiniteYinux/cloud@master/qrcode.jpg) 3 | 欢迎扫码关注 4 | ## 博客 ## 5 | [YINUXY'S BLOG](https://blog.yinuxy.com/) 6 | 7 | ## 安装& 使用 ## 8 | ### 安装依赖 ### 9 | `pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com` 10 | ### 获取代码 ### 11 | `git clone ggit@github.com:InfiniteYinux/Python.git` 12 | ### 运行 ### 13 | ``` 14 | cd pythonScript\zhihu-pretty-girl 15 | python zhihu-pretty-girl.py 16 | ``` -------------------------------------------------------------------------------- /scrapy/zhihu-pretty-girl/requirements.txt: -------------------------------------------------------------------------------- 1 | re 2 | argparse 3 | time 4 | json 5 | requests 6 | pymongo -------------------------------------------------------------------------------- /scrapy/zhihu-pretty-girl/zhihu-pretty-girl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Mar 6 22:48:22 2020 4 | 5 | @author: Yinux 6 | """ 7 | 8 | import re 9 | import argparse 10 | import time 11 | import json 12 | import requests 13 | import pymongo 14 | 15 | def get_answers_by_page(page_no): 16 | offset = page_no * 10 17 | url = "https://www.zhihu.com/api/v4/questions/266808424/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset={}&limit=10&sort_by=default&platform=desktop".format(offset) 18 | headers = { 19 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36", 20 | } 21 | r = requests.get(url, verify=False, headers=headers) 22 | content = r.content.decode("utf-8") 23 | data = json.loads(content) 24 | is_end = data["paging"]["is_end"] 25 | items = data["data"] 26 | client = pymongo.MongoClient() 27 | db = client["beauty"] 28 | if len(items) > 0: 29 | db.answers.insert_many(items) 30 | return is_end 31 | 32 | def get_answers(): 33 | page_no = 0 34 | client = pymongo.MongoClient() 35 | while True: 36 | print(page_no) 37 | is_end = get_answers_by_page(page_no) 38 | page_no += 1 39 | if is_end: 40 | break 41 | 42 | def query(): 43 | client = pymongo.MongoClient() 44 | db = client["beauty"] 45 | items = db.answers.find({"voteup_count": {"$gte": 100}}).sort([("voteup_count", pymongo.DESCENDING)]) 46 | count = 0 47 | 48 | for item in items: 49 | content = item["content"] 50 | vote_num = item["voteup_count"] 51 | author = item["author"]["name"] 52 | matched = re.findall(r'data-original="([^"]+)"', content) 53 | print("> 来自 {}\n".format(item["url"])) 54 | print("> 作者 {}\n".format(author)) 55 | print("> 赞数 {}\n".format(vote_num)) 56 | img_urls = [] 57 | for img_url in matched: 58 | if img_url not in img_urls: 59 | 
print("![]({})".format(img_url)) 60 | img_urls.append(img_url) 61 | count += len(img_urls) 62 | print("\n\n") 63 | print(count) 64 | 65 | if __name__ == "__main__": 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument("--save", help="save data", action="store_true", dest="save") 68 | parser.add_argument("--query", help="query data", action="store_true", dest="query") 69 | args = parser.parse_args() 70 | 71 | if args.save: 72 | get_answers() 73 | elif args.query: 74 | query() --------------------------------------------------------------------------------