├── chahaoba ├── README.md ├── sandbox │ ├── __init__.py │ ├── spiders │ │ ├── spider_subscribe.py │ │ └── __init__.py │ ├── items.py │ ├── utility.py │ ├── models.py │ └── pipelines.py ├── run.py ├── scrapy.cfg ├── .gitignore └── sync_data.py ├── jd ├── jd │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ ├── quotes.py │ │ └── jd_book.py │ ├── items.py │ └── pipelines.py ├── requirements.txt ├── run.py ├── scrapy.cfg └── switch_ip.py ├── kc0011 ├── README.md ├── sandbox │ ├── __init__.py │ ├── spiders │ │ └── __init__.py │ ├── utility.py │ ├── items.py │ └── pipelines.py ├── run.py ├── scrapy.cfg ├── .gitignore └── async_mongo.py ├── tiexue ├── README.md ├── sandbox │ ├── __init__.py │ ├── spiders │ │ └── __init__.py │ ├── items.py │ ├── utility.py │ └── pipelines.py ├── run.py ├── qqq ├── scrapy.cfg └── .gitignore ├── fraud ├── fraud │ ├── __init__.py │ ├── model │ │ ├── __init__.py │ │ ├── db_config.py │ │ └── fraud.py │ ├── spiders │ │ └── __init__.py │ ├── items.py │ ├── match.py │ ├── pipelines.py │ ├── settings.py │ └── middlewares.py ├── run.py └── scrapy.cfg ├── lanrentingshu ├── 懒人听书.txt ├── lrts │ ├── lrts │ │ ├── __init__.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ └── tingshu.py │ │ ├── items.py │ │ ├── pipelines.py │ │ └── settings.py │ ├── run.py │ └── scrapy.cfg ├── header_toolkit.py ├── request_header └── lanrentingshu.py ├── sz_yaohao ├── README.md ├── sandbox │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── website.py │ ├── items.py │ ├── headers.txt │ ├── utility.py │ ├── models.py │ └── pipelines.py ├── run.py ├── scrapy.cfg └── .gitignore ├── weibo ├── weibo │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── wb.py │ ├── pipelines.py │ └── items.py └── scrapy.cfg ├── bbssmth ├── bbssmth │ ├── __init__.py │ ├── spiders │ │ ├── test.html │ │ └── __init__.py │ ├── items.py │ └── pipelines.py ├── cmd_run.py └── scrapy.cfg ├── myubbs ├── sandbox │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── website.py │ ├── items.py │ ├── utility.py │ ├── models.py │ ├── headers │ └── pipelines.py ├── run.py ├── scrapy.cfg └── .gitignore ├── 51jbnet ├── im_sandbox │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── website.py │ ├── items.py │ ├── models.py │ ├── pipelines.py │ └── settings.py ├── README.MD ├── start_task.py └── scrapy.cfg ├── bilibili ├── bilibili │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ ├── bili.log │ │ └── bili.py │ ├── pipelines.py │ ├── items.py │ └── logger.py ├── run.py └── scrapy.cfg ├── poi_gaode ├── sandbox │ ├── __init__.py │ ├── spiders │ │ └── __init__.py │ ├── items.py │ ├── models.py │ └── pipelines.py ├── run.py ├── scrapy.cfg ├── .gitignore ├── .ipynb_checkpoints │ └── Untitled-checkpoint.ipynb └── gaode_map.py ├── tencentjob ├── tencentjob │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── tencent.py │ ├── items.py │ └── pipelines.py ├── run.py └── scrapy.cfg ├── MyLibrary ├── sandbox │ ├── sandbox │ │ ├── __init__.py │ │ ├── spiders │ │ │ └── __init__.py │ │ ├── items.py │ │ ├── models.py │ │ ├── pipelines.py │ │ └── settings.py │ ├── run.py │ ├── scrapy.cfg │ └── .gitignore ├── __init__.py └── login.py ├── cuiqingcai ├── README.MD ├── async_sandbox │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── example.py │ ├── items.py │ ├── utility.py │ └── pipelines.py ├── run_spider.py └── scrapy.cfg ├── async_cuiqingcai ├── async_sandbox │ ├── __init__.py │ ├── monitor │ │ ├── __init__.py │ │ ├── settings.py │ │ ├── app.py │ │ ├── statscol.py │ │ └── templates │ │ │ └── index.html 
│ ├── spiders │ │ ├── __init__.py │ │ └── crawl_all_example.py │ ├── items.py │ ├── commands │ │ └── crawlall.py │ ├── RedisDuplicator.py │ ├── CustomExtension.py │ ├── CustomMiddleware.py │ └── pipelines.py ├── run_spider.py ├── README.MD ├── scrapy.cfg ├── rabbit_send.py └── multi_spider_run.py ├── ximalaya ├── README.MD ├── story.py └── main.py ├── chinaclear └── README.md ├── pornhub ├── _config.yml ├── requirements.txt ├── parseJS.py ├── README.md ├── tampermonkey.js ├── newJs.js ├── .gitignore └── cookies_access.py ├── phone_bomb ├── fuguo.py └── ceoonline │ ├── train │ └── ceconline.h5 │ └── images │ └── ceconline.h5 ├── 51CTOCrawler ├── getkey.txt └── demo.py ├── .gitattributes ├── holdle ├── common │ └── __init__.py └── sync_spider.py ├── stockholder ├── __init__.py └── main.py ├── dfcf ├── push_redis.py └── settings.py ├── github_star ├── .vscode │ └── launch.json └── star.py ├── m3u8_video └── experience.py ├── csdn └── getCSDN_Range.py ├── .gitignore ├── Forbes └── main.py ├── 52sh ├── config_file.py └── aio_spider.py ├── qianfangyiguan └── qianfan_models.py ├── dashiye └── main.py ├── README.MD ├── szhouse ├── database.py └── house.py ├── Ergeduoduo └── main.py ├── fangtianxia └── fangtianxia_proxy_test.py ├── yinyonbao └── yingyongbao.py ├── v2ex_job └── v2ex2.py ├── anjuke ├── test_anjuke.py └── anjuke.py ├── youdao_dictionary └── youdao.py ├── zhihu └── zhihu_book.py ├── stock_pledge └── crawler.py └── baiduwanpan └── baiduwanpan.py /chahaoba/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/jd/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kc0011/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tiexue/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fraud/fraud/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lanrentingshu/懒人听书.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sz_yaohao/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /weibo/weibo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bbssmth/bbssmth/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /chahaoba/sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kc0011/sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /myubbs/sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tiexue/sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /51jbnet/im_sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bilibili/bilibili/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fraud/fraud/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /poi_gaode/sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sz_yaohao/sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tencentjob/tencentjob/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /51jbnet/README.MD: -------------------------------------------------------------------------------- 1 | # Improve Sandbox -------------------------------------------------------------------------------- /MyLibrary/sandbox/sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cuiqingcai/README.MD: -------------------------------------------------------------------------------- 1 | # 异步爬虫框架 - 写入数据库异步 -------------------------------------------------------------------------------- /cuiqingcai/async_sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lanrentingshu/lrts/lrts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ximalaya/README.MD: -------------------------------------------------------------------------------- 1 | ## 喜马拉雅爬虫 杨继东的投资之道 2 | -------------------------------------------------------------------------------- /chinaclear/README.md: -------------------------------------------------------------------------------- 1 | ## 中登新增投资者数目抓取 2 | ## 每周定时抓取 -------------------------------------------------------------------------------- /pornhub/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-modernist -------------------------------------------------------------------------------- 
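cuiqingcai/README.MD above describes that project as an asynchronous crawler framework whose database writes do not block the crawl ("异步爬虫框架 - 写入数据库异步", i.e. an async crawler framework with asynchronous database writes); async_cuiqingcai later in this listing advertises the same idea plus custom dedup and a RabbitMQ queue. The real implementation lives in those projects' own pipelines.py files. The snippet below is only a minimal sketch of the usual way such a non-blocking write pipeline is built on Twisted's adbapi connection pool: the MYSQL_* settings keys, the articles table and its columns are illustrative assumptions, not values taken from this repository (only the title/article_url field names appear in async_sandbox/items.py).

# Minimal sketch of an asynchronous MySQL write pipeline (illustrative, not the
# project's actual pipelines.py). Connection settings, table and columns are
# placeholder assumptions.
import pymysql
from twisted.enterprise import adbapi


class AsyncMysqlPipeline(object):

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_crawler(cls, crawler):
        # Build a thread-pooled connection pool from (hypothetical) settings keys.
        dbpool = adbapi.ConnectionPool(
            'pymysql',
            host=crawler.settings.get('MYSQL_HOST', 'localhost'),
            db=crawler.settings.get('MYSQL_DB', 'spider'),
            user=crawler.settings.get('MYSQL_USER', 'root'),
            passwd=crawler.settings.get('MYSQL_PASSWORD', ''),
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
            cp_reconnect=True,
        )
        return cls(dbpool)

    def process_item(self, item, spider):
        # runInteraction runs the blocking INSERT in Twisted's thread pool and
        # returns a Deferred, so the reactor keeps downloading in the meantime.
        d = self.dbpool.runInteraction(self.do_insert, item)
        d.addErrback(lambda failure: spider.logger.error(failure))
        return item

    def do_insert(self, cursor, item):
        # Placeholder table/column names; the real schema lives in the project.
        sql = 'INSERT INTO articles (title, url) VALUES (%s, %s)'
        cursor.execute(sql, (item.get('title'), item.get('article_url')))

    def close_spider(self, spider):
        self.dbpool.close()

The key design point is that process_item never waits on MySQL: the Deferred returned by runInteraction resolves in the background, which is what "写入数据库异步" amounts to in a Scrapy/Twisted project.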
/chahaoba/sandbox/spiders/spider_subscribe.py: -------------------------------------------------------------------------------- 1 | # 使用订阅者模式爬虫 -------------------------------------------------------------------------------- /jd/requirements.txt: -------------------------------------------------------------------------------- 1 | selenium 2 | pandas 3 | scrapy 4 | requests 5 | -------------------------------------------------------------------------------- /pornhub/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | pysocks 3 | lxml 4 | js2py 5 | clint 6 | fire 7 | loguru 8 | -------------------------------------------------------------------------------- /phone_bomb/fuguo.py: -------------------------------------------------------------------------------- 1 | # 富国基金 2 | url='https://etrading.fullgoal.com.cn/etrading/account/openacco/quickinit' -------------------------------------------------------------------------------- /MyLibrary/__init__.py: -------------------------------------------------------------------------------- 1 | #-*-coding=utf-8-*- 2 | 3 | def main(): 4 | pass 5 | 6 | if __name__ == '__main__': 7 | main() -------------------------------------------------------------------------------- /bbssmth/bbssmth/spiders/test.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rockyzsu/CrawlMan/master/bbssmth/bbssmth/spiders/test.html -------------------------------------------------------------------------------- /51CTOCrawler/getkey.txt: -------------------------------------------------------------------------------- 1 | TiThyXMlilmeFso5akV6ZDg0NlpEWXhOak0zTmNUaG1PRFprTUdNY3lPV1l4YVllekpsTzBEMlU1MzU2NTQ4ZDM4M005ZDI1OGM5VGM3eg== -------------------------------------------------------------------------------- /phone_bomb/ceoonline/train/ceconline.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rockyzsu/CrawlMan/master/phone_bomb/ceoonline/train/ceconline.h5 -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/monitor/__init__.py: -------------------------------------------------------------------------------- 1 | # *-* coding:utf-8 *-* 2 | ''' 3 | @author: ioiogoo 4 | @date: 2016/12/25 15:05 5 | ''' -------------------------------------------------------------------------------- /phone_bomb/ceoonline/images/ceconline.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rockyzsu/CrawlMan/master/phone_bomb/ceoonline/images/ceconline.h5 -------------------------------------------------------------------------------- /tencentjob/run.py: -------------------------------------------------------------------------------- 1 | 2 | # -*-coding=utf-8-*- 3 | from scrapy import cmdline 4 | cmd = 'scrapy crawl tencent' 5 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.js linguist-language=python 2 | *.css linguist-language=python 3 | *.ipynb linguist-language=python 4 | *.html linguist-language=python -------------------------------------------------------------------------------- /holdle/common/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/11/16 11:39 3 | # @File : __init__.py 4 | # @Author : Rocky C@www.30daydo.com 5 | -------------------------------------------------------------------------------- /cuiqingcai/run_spider.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | name = 'example' 3 | cmdline.execute('scrapy crawl {} -s LOG_FILE=cuiqingcai.log'.format(name).split()) -------------------------------------------------------------------------------- /fraud/run.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | name = 'fraud_info' # fraud_info 4 | cmd = 'scrapy crawl {0}'.format(name) 5 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /async_cuiqingcai/run_spider.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | name = 'example' 3 | cmdline.execute('scrapy crawl {} -s LOG_FILE=scrapy.log'.format(name).split()) -------------------------------------------------------------------------------- /async_cuiqingcai/README.MD: -------------------------------------------------------------------------------- 1 | # 异步爬虫框架 - 写入数据库异步 2 | # 自定义去重 3 | # 自定义pipeline 设置open_spider,close_spider 4 | 5 | # 自定义去重 6 | # 爬虫中间件 7 | # rabbitmq消息队列 8 | # 下载中间件 9 | -------------------------------------------------------------------------------- /jd/run.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | # @Time : 2018/8/8 22:23 3 | # @File : run.py 4 | from scrapy import cmdline 5 | cmd ='scrapy crawl jd_book' 6 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /lanrentingshu/lrts/run.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | from scrapy import cmdline 3 | 4 | name = 'tingshu' # 5 | cmd = 'scrapy crawl {0}'.format(name) 6 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /51jbnet/start_task.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2019/5/16 17:44 4 | # @File : start_task.py 5 | from scrapy import cmdline 6 | cmdline.execute('scrapy crawl website'.split()) -------------------------------------------------------------------------------- /pornhub/parseJS.py: -------------------------------------------------------------------------------- 1 | import js2py 2 | filename = '20201229js.js' 3 | with open(filename,'r') as f: 4 | content = f.read() 5 | 6 | js_object = js2py.eval_js(content) 7 | print(js_object) -------------------------------------------------------------------------------- /jd/jd/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /bbssmth/cmd_run.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2019/4/18 23:19 4 | # @File : cmd.py 5 | from scrapy import cmdline 6 | cmdline.execute('scrapy crawl bbssm'.split()) -------------------------------------------------------------------------------- /fraud/fraud/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /weibo/weibo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /bbssmth/bbssmth/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /bilibili/run.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | # @Time : 2018/8/15 14:52 3 | # @File : run.py 4 | from scrapy import cmdline 5 | name = 'ordinary' 6 | cmd = 'scrapy crawl {}'.format(name) 7 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /chahaoba/sandbox/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /kc0011/sandbox/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /myubbs/sandbox/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /tiexue/sandbox/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /51jbnet/im_sandbox/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /bilibili/bilibili/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /poi_gaode/sandbox/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /sz_yaohao/sandbox/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /tencentjob/tencentjob/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /MyLibrary/sandbox/sandbox/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /chahaoba/run.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/9/26 9:58 4 | # @File : run.py 5 | 6 | from scrapy import cmdline 7 | name = 'chahaoba' 8 | cmd = 'scrapy crawl {}'.format(name) 9 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /cuiqingcai/async_sandbox/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /lanrentingshu/lrts/lrts/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /poi_gaode/run.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/9/26 9:58 4 | # @File : run.py 5 | 6 | from scrapy import cmdline 7 | name = 'gaode1' 8 | cmd = 'scrapy crawl {}'.format(name) 9 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /sz_yaohao/run.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/9/26 9:58 4 | # @File : run.py 5 | 6 | from scrapy import cmdline 7 | name = 'website' 8 | cmd = 'scrapy crawl {}'.format(name) 9 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /tiexue/run.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/9/26 9:58 4 | # @File : run.py 5 | 6 | from scrapy import cmdline 7 | name = 'example' 8 | cmd = 'scrapy crawl {}'.format(name) 9 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /MyLibrary/sandbox/run.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/9/26 9:58 4 | # @File : run.py 5 | 6 | from scrapy import cmdline 7 | name = 'website2' 8 | cmd = 'scrapy crawl {}'.format(name) 9 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /kc0011/run.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/9/26 9:58 4 | # @File : run.py 5 | 6 | from scrapy import cmdline 7 | name = 'spider' 8 | cmd = 'scrapy crawl {} -s JOBDIR=jobs'.format(name) 9 | cmdline.execute(cmd.split()) 10 | -------------------------------------------------------------------------------- /stockholder/__init__.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | __author__ = 'Rocky' 4 | ''' 5 | http://30daydo.com 6 | Email: weigesysu@qq.com 7 | ''' 8 | 9 | 10 | def main(): 11 | pass 12 | 13 | 14 | if __name__ == '__main__': 15 | main() -------------------------------------------------------------------------------- /tiexue/qqq: -------------------------------------------------------------------------------- 1 |
2 |
旌旗漫卷 -------------------------------------------------------------------------------- /myubbs/run.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/9/26 9:58 4 | # @File : run.py 5 | import datetime 6 | 7 | from scrapy import cmdline 8 | name = 'myubbs' 9 | current = datetime.date.today() 10 | cmd = 'scrapy crawl {} -s LOG_FILE={}.log'.format(name,current) 11 | cmdline.execute(cmd.split()) -------------------------------------------------------------------------------- /jd/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jd.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jd 12 | -------------------------------------------------------------------------------- /fraud/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = fraud.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = fraud 12 | -------------------------------------------------------------------------------- /weibo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = weibo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = weibo 12 | -------------------------------------------------------------------------------- /bbssmth/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = bbssmth.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = bbssmth 12 | -------------------------------------------------------------------------------- /chahaoba/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sandbox.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sandbox 12 | -------------------------------------------------------------------------------- /kc0011/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sandbox.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sandbox 12 | -------------------------------------------------------------------------------- /myubbs/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sandbox.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sandbox 12 | -------------------------------------------------------------------------------- /poi_gaode/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sandbox.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sandbox 12 | -------------------------------------------------------------------------------- /sz_yaohao/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sandbox.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sandbox 12 | -------------------------------------------------------------------------------- /tiexue/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sandbox.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sandbox 12 | -------------------------------------------------------------------------------- /51jbnet/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = im_sandbox.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = im_sandbox 12 | -------------------------------------------------------------------------------- /bilibili/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = bilibili.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = bilibili 12 | -------------------------------------------------------------------------------- /lanrentingshu/lrts/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = lrts.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = lrts 12 | -------------------------------------------------------------------------------- /MyLibrary/sandbox/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically 
created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sandbox.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sandbox 12 | -------------------------------------------------------------------------------- /tencentjob/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tencentjob.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tencentjob 12 | -------------------------------------------------------------------------------- /cuiqingcai/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = async_sandbox.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = async_sandbox 12 | -------------------------------------------------------------------------------- /async_cuiqingcai/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = async_sandbox.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = async_sandbox 12 | -------------------------------------------------------------------------------- /weibo/weibo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class WeiboPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /bilibili/bilibili/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BilibiliPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /weibo/weibo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WeiboItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /bilibili/bilibili/items.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BilibiliItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /lanrentingshu/lrts/lrts/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class LrtsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /lanrentingshu/header_toolkit.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | def getheader(): 4 | with open('request_header') as fp: 5 | data=fp.readlines() 6 | dictionary=dict() 7 | for line in data: 8 | line=line.strip() 9 | dictionary[line.split(":")[0]]=':'.join(line.split(":")[1:]) 10 | return dictionary 11 | if __name__=="__main__": 12 | print(getheader()) -------------------------------------------------------------------------------- /bilibili/bilibili/spiders/bili.log: -------------------------------------------------------------------------------- 1 | 2018-08-15 15:12:59 - E:\git\CrawlMan\bilibili\bilibili\spiders\bili.py - INFO: - ==================================================== 2 | 2018-08-15 15:12:59 - E:\git\CrawlMan\bilibili\bilibili\spiders\bili.py - INFO: -
--
3 | 2018-08-15 15:12:59 - E:\git\CrawlMan\bilibili\bilibili\spiders\bili.py - INFO: - ==================================================== 4 | -------------------------------------------------------------------------------- /MyLibrary/sandbox/sandbox/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item,Field 9 | 10 | 11 | class SpiderItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | username=Field() 15 | password = Field() 16 | 17 | -------------------------------------------------------------------------------- /chahaoba/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | data.cfg 4 | *.mp3 5 | *.pkl 6 | *.xls 7 | *.xml 8 | *.csv 9 | *.pkl 10 | ~$d.xlsx 11 | d.xlsx 12 | data/ 13 | temp 14 | request_header 15 | header_toolkit.txt 16 | *.xlsx 17 | *.log 18 | __pycache__/ 19 | wikizhword.text 20 | news_tensite_xml.dat 21 | news_tensite_xml.smarty.dat 22 | *.jpg 23 | Download/ 24 | Download_IMG/ 25 | *.zip 26 | cookies 27 | httpcache 28 | config.py 29 | *.png 30 | full_name.dat 31 | -------------------------------------------------------------------------------- /kc0011/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | data.cfg 4 | *.mp3 5 | *.pkl 6 | *.xls 7 | *.xml 8 | *.csv 9 | *.pkl 10 | ~$d.xlsx 11 | d.xlsx 12 | data/ 13 | temp 14 | request_header 15 | header_toolkit.txt 16 | *.xlsx 17 | *.log 18 | __pycache__/ 19 | wikizhword.text 20 | news_tensite_xml.dat 21 | news_tensite_xml.smarty.dat 22 | *.jpg 23 | Download/ 24 | Download_IMG/ 25 | *.zip 26 | cookies 27 | httpcache 28 | config.py 29 | *.png 30 | full_name.dat 31 | -------------------------------------------------------------------------------- /myubbs/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | data.cfg 4 | *.mp3 5 | *.pkl 6 | *.xls 7 | *.xml 8 | *.csv 9 | *.pkl 10 | ~$d.xlsx 11 | d.xlsx 12 | data/ 13 | temp 14 | request_header 15 | header_toolkit.txt 16 | *.xlsx 17 | *.log 18 | __pycache__/ 19 | wikizhword.text 20 | news_tensite_xml.dat 21 | news_tensite_xml.smarty.dat 22 | *.jpg 23 | Download/ 24 | Download_IMG/ 25 | *.zip 26 | cookies 27 | httpcache 28 | config.py 29 | *.png 30 | full_name.dat 31 | -------------------------------------------------------------------------------- /tiexue/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | data.cfg 4 | *.mp3 5 | *.pkl 6 | *.xls 7 | *.xml 8 | *.csv 9 | *.pkl 10 | ~$d.xlsx 11 | d.xlsx 12 | data/ 13 | temp 14 | request_header 15 | header_toolkit.txt 16 | *.xlsx 17 | *.log 18 | __pycache__/ 19 | wikizhword.text 20 | news_tensite_xml.dat 21 | news_tensite_xml.smarty.dat 22 | *.jpg 23 | Download/ 24 | Download_IMG/ 25 | *.zip 26 | cookies 27 | httpcache 28 | config.py 29 | *.png 30 | full_name.dat 31 | -------------------------------------------------------------------------------- /chahaoba/sync_data.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2019/8/22 16:56 4 | # @File : sync_data.py 5 | import redis 6 | 
r=redis.StrictRedis('10.18.6.46',db=8,decode_responses=True) 7 | import pymysql 8 | con = pymysql.connect(host='',port=,db='spider',user='',password='') 9 | cursor = con.cursor() 10 | cmd = 'select number from chahaoba' 11 | cursor.execute(cmd) 12 | ret = cursor.fetchall() 13 | for i in ret: 14 | r.sadd('chahaoba',i[0]) 15 | -------------------------------------------------------------------------------- /poi_gaode/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | data.cfg 4 | *.mp3 5 | *.pkl 6 | *.xls 7 | *.xml 8 | *.csv 9 | *.pkl 10 | ~$d.xlsx 11 | d.xlsx 12 | data/ 13 | temp 14 | request_header 15 | header_toolkit.txt 16 | *.xlsx 17 | *.log 18 | __pycache__/ 19 | wikizhword.text 20 | news_tensite_xml.dat 21 | news_tensite_xml.smarty.dat 22 | *.jpg 23 | Download/ 24 | Download_IMG/ 25 | *.zip 26 | cookies 27 | httpcache 28 | config.py 29 | *.png 30 | full_name.dat 31 | -------------------------------------------------------------------------------- /sz_yaohao/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | data.cfg 4 | *.mp3 5 | *.pkl 6 | *.xls 7 | *.xml 8 | *.csv 9 | *.pkl 10 | ~$d.xlsx 11 | d.xlsx 12 | data/ 13 | temp 14 | request_header 15 | header_toolkit.txt 16 | *.xlsx 17 | *.log 18 | __pycache__/ 19 | wikizhword.text 20 | news_tensite_xml.dat 21 | news_tensite_xml.smarty.dat 22 | *.jpg 23 | Download/ 24 | Download_IMG/ 25 | *.zip 26 | cookies 27 | httpcache 28 | config.py 29 | *.png 30 | full_name.dat 31 | -------------------------------------------------------------------------------- /MyLibrary/sandbox/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | data.cfg 4 | *.mp3 5 | *.pkl 6 | *.xls 7 | *.xml 8 | *.csv 9 | *.pkl 10 | ~$d.xlsx 11 | d.xlsx 12 | data/ 13 | temp 14 | request_header 15 | header_toolkit.txt 16 | *.xlsx 17 | *.log 18 | __pycache__/ 19 | wikizhword.text 20 | news_tensite_xml.dat 21 | news_tensite_xml.smarty.dat 22 | *.jpg 23 | Download/ 24 | Download_IMG/ 25 | *.zip 26 | cookies 27 | httpcache 28 | config.py 29 | *.png 30 | full_name.dat 31 | -------------------------------------------------------------------------------- /51jbnet/im_sandbox/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class SandboxItem(scrapy.Item): 11 | # define the fields for your item here like: 12 | # name = scrapy.Field() 13 | title=scrapy.Field() 14 | url=scrapy.Field() 15 | pubdate=scrapy.Field() 16 | category=scrapy.Field() 17 | -------------------------------------------------------------------------------- /myubbs/sandbox/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | 11 | class SpiderItem(Item): 12 | # define the fields for your item here like: 13 | 14 | title = Field() 15 | pubdate = Field() 16 | content = Field() 17 | author = Field() 18 | url = Field() 19 | crawltime = Field() 20 | -------------------------------------------------------------------------------- 
/jd/jd/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JdItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | name=scrapy.Field() 16 | price=scrapy.Field() 17 | remark=scrapy.Field() 18 | publish=scrapy.Field() 19 | # shop=scrapy.Field() -------------------------------------------------------------------------------- /dfcf/push_redis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import redis 3 | r=redis.StrictRedis('192.168.10.48',db=5,decode_responses=True) 4 | 5 | name='todo.xlsx' 6 | df=pd.read_excel(name,dtype={'symbol':str}) 7 | # print(df.head()) 8 | new_list=df.loc[df.industry.str.contains('汽车'), :]['symbol'].tolist() 9 | # for i in df['代码'].values: 10 | # r.lpush('code_list',i) 11 | old_file = '要爬取的个股列表.xlsx' 12 | df2=pd.read_excel(old_file,dtype={'代码':str}) 13 | old_list = df2['代码'].tolist() 14 | for item in new_list: 15 | if item not in old_list: 16 | r.set(item,0) 17 | -------------------------------------------------------------------------------- /bbssmth/bbssmth/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item,Field 9 | 10 | 11 | class BbssmthItem(Item): 12 | # define the fields for your item here like: 13 | # name = Field() 14 | title = Field() 15 | content = Field() 16 | create_time = Field() 17 | url = Field() 18 | crawltime = Field() 19 | category = Field() 20 | author = Field() 21 | reply = Field() 22 | -------------------------------------------------------------------------------- /jd/jd/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from jd.items import JdItem 8 | import pymongo 9 | class JDPipeline(object): 10 | def __init__(self): 11 | self.mongo=pymongo.MongoClient('10.18.6.46',27001) 12 | self.doc=self.mongo['spider']['jd_book'] 13 | def process_item(self, item, spider): 14 | self.doc.insert(dict(item)) 15 | return item 16 | -------------------------------------------------------------------------------- /chahaoba/sandbox/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item,Field 9 | import scrapy 10 | 11 | class SpiderItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | _number = scrapy.Field() 15 | _city = scrapy.Field() 16 | _province = scrapy.Field() 17 | _card_type = scrapy.Field() 18 | _op = scrapy.Field() 19 | _card_detail= scrapy.Field() 20 | -------------------------------------------------------------------------------- /fraud/fraud/items.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # from scrapy.item import Item, Field 3 | import scrapy 4 | 5 | class FraudItem(scrapy.Item): 6 | executed_name = scrapy.Field() 7 | gender = scrapy.Field() 8 | age = scrapy.Field() 9 | identity_number = scrapy.Field() 10 | court = scrapy.Field() 11 | province = scrapy.Field() 12 | case_number = scrapy.Field() 13 | performance = scrapy.Field() # 被执行人的履行情况 14 | disrupt_type_name = scrapy.Field() # 失信被执行人行为具体情形 15 | duty = scrapy.Field() # 生效法律文书确定的义务 16 | release_time = scrapy.Field() 17 | -------------------------------------------------------------------------------- /sz_yaohao/sandbox/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item,Field 9 | 10 | 11 | class SpiderItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | card=Field() 15 | accountLength = Field() 16 | cardName = Field() 17 | cardType = Field() 18 | mainAccount = Field() 19 | mainValue = Field() 20 | orgName = Field() 21 | crawltime = Field() 22 | -------------------------------------------------------------------------------- /github_star/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | 8 | { 9 | "name": "Python: Current File", 10 | "type": "python", 11 | "request": "launch", 12 | "program": "${file}", 13 | "console": "integratedTerminal", 14 | "args": ["rockyzsu"] 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /tencentjob/tencentjob/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | # import scrapy 9 | from scrapy import Field,Item 10 | 11 | class TencentjobItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = Field() 15 | catalog = Field() 16 | workLocation = Field() 17 | recruitNumber = Field() 18 | duty = Field() 19 | Job_requirement= Field() 20 | url = Field() 21 | publishTime = Field() 22 | -------------------------------------------------------------------------------- /tencentjob/tencentjob/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from collections import OrderedDict 9 | class TencentjobPipeline(object): 10 | def __init__(self): 11 | self.db = pymongo.MongoClient('localhost') 12 | self.collection = self.db['tencent']['job'] 13 | 14 | def process_item(self, item, spider): 15 | self.collection.insert(OrderedDict(item)) 16 | return item 17 | 
-------------------------------------------------------------------------------- /m3u8_video/experience.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2019/12/2 9:17 4 | # @File : experience.py 5 | import requests 6 | url='https://jh0p4t0rh9rs9610ryc.exp.bcevod.com/mda-jjkxjt57fdsith87/mda-jjkxjt57fdsith87.m3u8.{}.ts' 7 | total = 253 8 | headers={'User-Agent':'Xiaomi'} 9 | data = 'data' 10 | for i in range(total+1): 11 | try: 12 | r = requests.get(url.format(i),headers=headers) 13 | except Exception as e: 14 | print(e) 15 | else: 16 | with open('data/{}.ts'.format(i),'wb') as f: 17 | f.write(r.content) 18 | print('done {}.ts'.format(i)) 19 | 20 | -------------------------------------------------------------------------------- /pornhub/README.md: -------------------------------------------------------------------------------- 1 | 2 | - ```运行在Python环境``` 3 | - ```git clone https://github.com/formateddd/Pornhub ``` 4 | - ```cd Pornhub && pip install -r requirements.txt``` 5 | - ```python crawler.py webm``` 6 | - 待程序运行完毕, 会在webm文件夹下download两页的webm缩略图,对应名称为详细页面的URL后缀 7 | - 运行```python crawler.py mp4```, 在MP4文件夹可看到下载好的MP4文件 8 | 9 | - ```运行在浏览器``` 10 | 11 | - [安装油猴](http://tampermonkey.net/) 12 | - Create a new script, copy and paste the [code](https://raw.githubusercontent.com/formateddd/pornhub/master/tampermonkey.js). 13 | 14 | 15 | 16 | ## 加群分析共享爬虫项目代码: 17 | 18 | ## 759746505 19 | 20 | -------------------------------------------------------------------------------- /sz_yaohao/sandbox/headers.txt: -------------------------------------------------------------------------------- 1 | Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3 2 | Accept-Encoding: gzip, deflate, br 3 | Accept-Language: zh-CN,zh;q=0.9 4 | Cache-Control: no-cache 5 | Connection: keep-alive 6 | Content-Type: application/x-www-form-urlencoded 7 | Host: apply.jtys.sz.gov.cn 8 | Origin: http://xqctk.jtys.sz.gov.cn 9 | Pragma: no-cache 10 | Referer: http://xqctk.jtys.sz.gov.cn/? 11 | Upgrade-Insecure-Requests: 1 12 | User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36 -------------------------------------------------------------------------------- /csdn/getCSDN_Range.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | # Get your range of csdn 3 | __author__ = 'rocky' 4 | import requests 5 | import re 6 | import time 7 | 8 | link = 'http://blog.csdn.net/yagamil/article/details/52858314' 9 | user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)" 10 | header = {"User-Agent": user_agent} 11 | req = requests.get(link, headers=header) 12 | content =req.text 13 | p = re.search(r'
',content).group(1) 14 | today = time.strftime("%Y-%m-%d") 15 | f = open(r"D:\OneDrive\Stock_Data\csdn_range.txt", 'a') 16 | contents = today + '\t' + p + '\n' 17 | f.write(contents) 18 | f.close() 19 | -------------------------------------------------------------------------------- /cuiqingcai/async_sandbox/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class AsyncSandboxItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | article_url = scrapy.Field() 16 | content = scrapy.Field() 17 | created_at = scrapy.Field() 18 | category = scrapy.Field() 19 | visited = scrapy.Field() 20 | comment = scrapy.Field() 21 | liked = scrapy.Field() 22 | author = scrapy.Field() 23 | crawltime = scrapy.Field() 24 | -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class AsyncSandboxItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | article_url = scrapy.Field() 16 | # content = scrapy.Field() 17 | created_at = scrapy.Field() 18 | category = scrapy.Field() 19 | visited = scrapy.Field() 20 | comment = scrapy.Field() 21 | liked = scrapy.Field() 22 | author = scrapy.Field() 23 | crawltime = scrapy.Field() 24 | -------------------------------------------------------------------------------- /lanrentingshu/lrts/lrts/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.pipelines.files import FilesPipeline 9 | from urllib.parse import urlparse 10 | from os.path import basename,dirname,join 11 | class LrtsPipeline(object): 12 | def process_item(self, item, spider): 13 | return item 14 | 15 | class MyFilesPipeline(FilesPipeline): 16 | 17 | def file_path(self, request, response=None, info=None): 18 | path = urlparse(request.url).path 19 | return join(basename(dirname(path)),basename(path)) 20 | -------------------------------------------------------------------------------- /tiexue/sandbox/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item,Field 9 | import scrapy 10 | 11 | class SpiderItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | article_url = scrapy.Field() 16 | content = scrapy.Field() 17 | created_at = scrapy.Field() 18 | # category = scrapy.Field() 19 | # visited = scrapy.Field() 20 | # comment = scrapy.Field() 21 | # liked = scrapy.Field() 22 | author = 
scrapy.Field() 23 | crawltime = scrapy.Field() 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.pyc 3 | *.mp3 4 | cookies 5 | .idea 6 | *.pyc 7 | data.cfg 8 | *.mp3 9 | *.pkl 10 | *.xls 11 | *.xml 12 | *.csv 13 | *.pkl 14 | ~$d.xlsx 15 | d.xlsx 16 | data/ 17 | temp 18 | request_header 19 | header_toolkit.txt 20 | *.xlsx 21 | *.log 22 | __pycache__/ 23 | wikizhword.text 24 | news_tensite_xml.dat 25 | news_tensite_xml.smarty.dat 26 | *.jpg 27 | Download/ 28 | Download_IMG/ 29 | *.zip 30 | cookies 31 | config.json 32 | config.py 33 | data.cfg 34 | setting.py 35 | setttings.py 36 | *.ts 37 | kc0011/jobs/requests.queue/p1 38 | kc0011/jobs/requests.queue/p0 39 | kc0011/jobs/requests.queue/active.json 40 | kc0011/jobs/spider.state 41 | kc0011/jobs/requests.seen 42 | *.jpg 43 | *.png 44 | *.jpeg 45 | configure/ 46 | -------------------------------------------------------------------------------- /lanrentingshu/request_header: -------------------------------------------------------------------------------- 1 | Accept:*/* 2 | Accept-Encoding:gzip, deflate 3 | Accept-Language:zh-CN,zh;q=0.8 4 | Cache-Control:no-cache 5 | Connection:keep-alive 6 | Content-Length:0 7 | Cookie:aliyungf_tc=AQAAADCDiwwT/gEAv7APt2maQ56C3T1o; uid=15052187062975665e8ceaad34eb9911f2a90ee5b66ad; CNZZDATA1254668430=2046036592-1505217321-null%7C1505217321; Hm_lvt_ada61571fd48bb3f905f5fd1d6ef0ec4=1505218688; Hm_lpvt_ada61571fd48bb3f905f5fd1d6ef0ec4=1505219620; JSESSIONID=EE57EEBB708D1DF15621C6949A4FBE48 8 | Host:www.lrts.me 9 | Origin:http://www.lrts.me 10 | Pragma:no-cache 11 | Referer:http://www.lrts.me/book/32551 12 | User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36 13 | X-Requested-With:XMLHttpRequest -------------------------------------------------------------------------------- /fraud/fraud/model/db_config.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | from sqlalchemy.orm import sessionmaker 3 | import redis 4 | 5 | 6 | engine = create_engine('mysql+pymysql://root:{}@localhost:3306/spider?charset=utf8') 7 | DBSession = sessionmaker(bind=engine) 8 | 9 | 10 | class RedisPool: 11 | def __init__(self, client_host="localhost", client_port=6379, client_db=0): 12 | self.client_host = client_host 13 | self.client_port = client_port 14 | self.client_db = client_db 15 | 16 | def redis_pool(self): 17 | pool = redis.ConnectionPool( 18 | host=self.client_host, 19 | port=self.client_port, 20 | db=self.client_db, 21 | decode_responses=True) 22 | return redis.StrictRedis(connection_pool=pool) -------------------------------------------------------------------------------- /kc0011/sandbox/utility.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/13 13:47 4 | # @File : utility.py 5 | 6 | import os 7 | 8 | # 获取headers 9 | 10 | def get_header(header_file='headers.txt'): 11 | path = os.path.dirname(__file__) 12 | header_path = os.path.join(path,'headers',header_file) 13 | if not os.path.exists(header_path): 14 | return None 15 | 16 | with open(header_path) as fp: 17 | data = fp.readlines() 18 | dictionary = dict() 19 | 20 | for line in data: 21 | line = line.strip() 22 | line = line.replace(' ', '') 23 | dictionary[line.split(":")[0].strip()] = ':'.join( 
24 | line.split(":")[1:]) 25 | 26 | if 'Content-Length' in dictionary: 27 | del dictionary['Content-Length'] 28 | 29 | return dictionary -------------------------------------------------------------------------------- /myubbs/sandbox/utility.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/13 13:47 4 | # @File : utility.py 5 | 6 | import os 7 | 8 | # 获取headers 9 | 10 | def get_header(header_file='headers.txt'): 11 | path = os.path.dirname(__file__) 12 | header_path = os.path.join(path,'headers',header_file) 13 | if not os.path.exists(header_path): 14 | return None 15 | 16 | with open(header_path) as fp: 17 | data = fp.readlines() 18 | dictionary = dict() 19 | 20 | for line in data: 21 | line = line.strip() 22 | line = line.replace(' ', '') 23 | dictionary[line.split(":")[0].strip()] = ':'.join( 24 | line.split(":")[1:]) 25 | 26 | if 'Content-Length' in dictionary: 27 | del dictionary['Content-Length'] 28 | 29 | return dictionary -------------------------------------------------------------------------------- /tiexue/sandbox/utility.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/13 13:47 4 | # @File : utility.py 5 | 6 | import os 7 | 8 | # 获取headers 9 | 10 | def get_header(header_file='headers.txt'): 11 | path = os.path.dirname(__file__) 12 | header_path = os.path.join(path,'headers',header_file) 13 | if not os.path.exists(header_path): 14 | return None 15 | 16 | with open(header_path) as fp: 17 | data = fp.readlines() 18 | dictionary = dict() 19 | 20 | for line in data: 21 | line = line.strip() 22 | line = line.replace(' ', '') 23 | dictionary[line.split(":")[0].strip()] = ':'.join( 24 | line.split(":")[1:]) 25 | 26 | if 'Content-Length' in dictionary: 27 | del dictionary['Content-Length'] 28 | 29 | return dictionary -------------------------------------------------------------------------------- /chahaoba/sandbox/utility.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/13 13:47 4 | # @File : utility.py 5 | 6 | import os 7 | 8 | # 获取headers 9 | 10 | def get_header(header_file='headers.txt'): 11 | path = os.path.dirname(__file__) 12 | header_path = os.path.join(path,'headers',header_file) 13 | if not os.path.exists(header_path): 14 | return None 15 | 16 | with open(header_path) as fp: 17 | data = fp.readlines() 18 | dictionary = dict() 19 | 20 | for line in data: 21 | line = line.strip() 22 | line = line.replace(' ', '') 23 | dictionary[line.split(":")[0].strip()] = ':'.join( 24 | line.split(":")[1:]) 25 | 26 | if 'Content-Length' in dictionary: 27 | del dictionary['Content-Length'] 28 | 29 | return dictionary -------------------------------------------------------------------------------- /sz_yaohao/sandbox/utility.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/13 13:47 4 | # @File : utility.py 5 | 6 | import os 7 | 8 | # 获取headers 9 | 10 | def get_header(header_file='headers.txt'): 11 | path = os.path.dirname(__file__) 12 | header_path = os.path.join(path,'headers',header_file) 13 | if not os.path.exists(header_path): 14 | return None 15 | 16 | with open(header_path) as fp: 17 | data = fp.readlines() 18 | dictionary = dict() 19 | 20 | for line in data: 21 | line = line.strip() 22 | line = line.replace(' ', '') 23 | 
dictionary[line.split(":")[0].strip()] = ':'.join( 24 | line.split(":")[1:]) 25 | 26 | if 'Content-Length' in dictionary: 27 | del dictionary['Content-Length'] 28 | 29 | return dictionary -------------------------------------------------------------------------------- /cuiqingcai/async_sandbox/utility.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/13 13:47 4 | # @File : utility.py 5 | 6 | import os 7 | 8 | # 获取headers 9 | 10 | def get_header(header_file='headers.txt'): 11 | path = os.path.dirname(__file__) 12 | header_path = os.path.join(path,'headers',header_file) 13 | if not os.path.exists(header_path): 14 | return None 15 | 16 | with open(header_path) as fp: 17 | data = fp.readlines() 18 | dictionary = dict() 19 | 20 | for line in data: 21 | line = line.strip() 22 | line = line.replace(' ', '') 23 | dictionary[line.split(":")[0].strip()] = ':'.join( 24 | line.split(":")[1:]) 25 | 26 | if 'Content-Length' in dictionary: 27 | del dictionary['Content-Length'] 28 | 29 | return dictionary -------------------------------------------------------------------------------- /tiexue/sandbox/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | # from sandbox.models import SpiderModels, DBSession 8 | import logging 9 | import pymongo 10 | from sandbox import config 11 | from sandbox import settings 12 | 13 | 14 | 15 | class MongoPipeline(object): 16 | def __init__(self): 17 | DOCUMENT = settings.MONGODB_DOC 18 | self.db = pymongo.MongoClient(settings.MONGO_HOST, port=settings.MONGO_PORT) 19 | self.doc = self.db['spider'][DOCUMENT] 20 | 21 | def process_item(self, item, spider): 22 | print('on process') 23 | insert_item = dict(item) 24 | self.doc.insert(insert_item) 25 | 26 | return item 27 | -------------------------------------------------------------------------------- /poi_gaode/sandbox/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | 11 | class SpiderItem(Item): 12 | # define the fields for your item here like: 13 | 14 | id = Field() 15 | parent = Field() 16 | name = Field() 17 | type = Field() 18 | typecode = Field() 19 | biz_type = Field() 20 | address = Field() 21 | location = Field() 22 | tel = Field() 23 | distance = Field() 24 | biz_ext = Field() 25 | pname = Field() 26 | cityname = Field() 27 | adname = Field() 28 | importance = Field() 29 | shopid = Field() 30 | shopinfo = Field() 31 | poiweight = Field() 32 | photos = Field() 33 | crawltime = Field() 34 | -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/monitor/settings.py: -------------------------------------------------------------------------------- 1 | # *-* coding:utf-8 *-* 2 | ''' 3 | @author: ioiogoo 4 | @date: 2016/12/26 11:48 5 | ''' 6 | 7 | ''' 8 | TIMEINTERVAL 刷新时间间隔,单位毫秒 9 | POINTINTERVAL 图上各点之间间隔,越小则表示点越密集 10 | POINTLENGTH 图上点的数量,越大则表示图上时间跨度越长 11 | STATS_KEYS 图上显示的stats_key 12 | REDIS_HOST redis地址 13 | REDIS_PORT redis端口 14 | REDIS_DB redis数据库,默认0 15 | 
APP_HOST app运行地址,默认127.0.0.1 16 | APP_PORT app运行端口,默认5000 17 | ''' 18 | 19 | TIMEINTERVAL = 30000 20 | POINTINTERVAL = 30 21 | POINTLENGTH = 2000 22 | STATS_KEYS = ['downloader/request_count', 'downloader/response_count','downloader/response_status_count/200', 'item_scraped_count'] 23 | REDIS_HOST = '10.18.6.46' 24 | REDIS_PORT = 6379 25 | REDIS_DB = 0 26 | APP_HOST = '127.0.0.1' 27 | APP_PORT = 5000 28 | -------------------------------------------------------------------------------- /async_cuiqingcai/rabbit_send.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2019-08-30 17:25:46 4 | # @Author : Rocky Chen (weigesysu@qq.com) 5 | # @Link : http://30daydo.com 6 | # @Version : $Id$ 7 | 8 | import pika 9 | # import settings 10 | 11 | credentials = pika.PlainCredentials('admin','admin') 12 | connection = pika.BlockingConnection(pika.ConnectionParameters('192.168.1.101',5672,'/',credentials)) 13 | 14 | channel = connection.channel() 15 | channel.exchange_declare(exchange='direct_log',exchange_type='direct') # fanout 就是组播 16 | 17 | routing_key = 'info' 18 | message='https://36kr.com/pp/api/aggregation-entity?type=web_latest_article&b_id=59499&per_page=30' 19 | channel.basic_publish( 20 | exchange='direct_log', 21 | routing_key=routing_key, 22 | body=message 23 | ) 24 | 25 | print('sending message {}'.format(message)) 26 | connection.close() 27 | -------------------------------------------------------------------------------- /poi_gaode/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "def get_max():\n", 10 | " with open('sz_poi.txt','r') as f:\n", 11 | " # js = json.load(f)\n", 12 | " while 1:\n", 13 | " " 14 | ] 15 | } 16 | ], 17 | "metadata": { 18 | "kernelspec": { 19 | "display_name": "Python 3", 20 | "language": "python", 21 | "name": "python3" 22 | }, 23 | "language_info": { 24 | "codemirror_mode": { 25 | "name": "ipython", 26 | "version": 3 27 | }, 28 | "file_extension": ".py", 29 | "mimetype": "text/x-python", 30 | "name": "python", 31 | "nbconvert_exporter": "python", 32 | "pygments_lexer": "ipython3", 33 | "version": "3.6.2" 34 | } 35 | }, 36 | "nbformat": 4, 37 | "nbformat_minor": 2 38 | } 39 | -------------------------------------------------------------------------------- /pornhub/tampermonkey.js: -------------------------------------------------------------------------------- 1 | // ==UserScript== 2 | // @name New Userscript 3 | // @namespace http://tampermonkey.net/ 4 | // @version 0.1 5 | // @description try to take over the world! 6 | // @author github.com/formateddd/pornhub 7 | // @include *.pornhub.com/view_video.php?viewkey=* 8 | // @grant none 9 | // ==/UserScript== 10 | 11 | 12 | 13 | (function() { 14 | 'use strict'; 15 | 16 | // Your code here... 
17 | 18 | 19 | var qualites = [ 20 | "quality_1080p", 21 | "quality_720p", 22 | "quality_480p", 23 | "quality_240p", 24 | ]; 25 | 26 | for (var i in qualites) { 27 | if (window[qualites[i]]){ 28 | document.querySelector("h1").innerHTML += '' + qualites[i] + '' 29 | console.info(window.qualites[i]); 30 | break 31 | } 32 | } 33 | 34 | 35 | })(); 36 | -------------------------------------------------------------------------------- /jd/jd/spiders/quotes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy_splash import SplashRequest 4 | 5 | 6 | class QuotesSpider(scrapy.Spider): 7 | name = "quotes" 8 | allowed_domains = ["quotes.toscrape.com"] 9 | start_urls = ['http://quotes.toscrape.com/js/'] 10 | 11 | def start_requests(self): 12 | for url in self.start_urls: 13 | yield SplashRequest(url, args={'images': 0, 'timeout': 3}) 14 | 15 | def parse(self, response): 16 | for sel in response.css('div.quote'): 17 | quote = sel.css('span.text::text').extract_first() 18 | author = sel.css('small.author::text').extract_first() 19 | yield {'quote': quote, 'author': author} 20 | href = response.css('li.next > a::attr(href)').extract_first() 21 | if href: 22 | url = response.urljoin(href) 23 | yield SplashRequest(url, args={'images': 0, 'timeout': 3}) -------------------------------------------------------------------------------- /kc0011/sandbox/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item,Field 9 | import scrapy 10 | 11 | class SpiderItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | nick_name = scrapy.Field() 15 | level = scrapy.Field() 16 | credit = scrapy.Field() 17 | score_count = scrapy.Field() 18 | tie_count = scrapy.Field() 19 | jifeng = scrapy.Field() 20 | register = scrapy.Field() 21 | alipay=scrapy.Field() 22 | email=scrapy.Field() 23 | person_info_html = scrapy.Field() 24 | crawltime = scrapy.Field() 25 | 26 | class ContentItem(Item): 27 | url = scrapy.Field() 28 | publishTime = scrapy.Field() 29 | author = scrapy.Field() 30 | content = scrapy.Field() 31 | crawltime=scrapy.Field() 32 | 33 | -------------------------------------------------------------------------------- /bilibili/bilibili/logger.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | import logging 3 | import datetime 4 | import os 5 | # from setting import llogger 6 | def llogger(filename): 7 | 8 | logger = logging.getLogger(filename) # 不加名称设置root logger 9 | logger.setLevel(logging.DEBUG) 10 | formatter = logging.Formatter( 11 | '%(asctime)s - %(name)s - %(levelname)s: - %(message)s', 12 | datefmt='%Y-%m-%d %H:%M:%S') 13 | # 使用FileHandler输出到文件 14 | prefix = os.path.splitext(filename)[0] 15 | fh = logging.FileHandler(prefix+'.log') 16 | fh.setLevel(logging.DEBUG) 17 | fh.setFormatter(formatter) 18 | # 使用StreamHandler输出到屏幕 19 | ch = logging.StreamHandler() 20 | ch.setLevel(logging.DEBUG) 21 | ch.setFormatter(formatter) 22 | # 添加两个Handler 23 | logger.addHandler(ch) 24 | logger.addHandler(fh) 25 | # logger.info('this is info message') 26 | # logger.warning('this is warn message') 27 | return logger 28 | 29 | -------------------------------------------------------------------------------- 
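A minimal usage sketch for the llogger helper defined in bilibili/bilibili/logger.py above, assuming it is imported the same way as the commented-out import in bili.py; the names and messages here are illustrative only:

from bilibili.logger import llogger

logger = llogger(__file__)   # FileHandler is named after the caller, e.g. bili.py -> bili.log, plus console output
logger.info('spider started')
logger.warning('rate limited, backing off')
# note: llogger adds new handlers on every call, so create the logger once and reuse it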
/async_cuiqingcai/multi_spider_run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2019-08-30 16:20:47 4 | # @Author : Rocky Chen (weigesysu@qq.com) 5 | # @Link : http://30daydo.com 6 | # @Version : $Id$ 7 | 8 | from crochet import setup 9 | from importlib import import_module 10 | from scrapy.crawler import CrawlerRunner 11 | from scrapy.utils.project import get_project_settings 12 | setup() 13 | 14 | # not work 15 | def run_spider(spiderName): 16 | module_name="async_sandbox.spiders.{}".format(spiderName) 17 | scrapy_var = import_module(module_name) #do some dynamic import of selected spider 18 | print(scrapy_var) 19 | print(dir(scrapy_var)) 20 | spiderObj=scrapy_var.ExampleSpider #get mySpider-object from spider module 21 | print(spiderObj) 22 | 23 | crawler = CrawlerRunner(get_project_settings()) #from Scrapy docs 24 | crawler.crawl(spiderObj) 25 | print('start') 26 | 27 | run_spider('example') -------------------------------------------------------------------------------- /bilibili/bilibili/spiders/bili.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy_splash import SplashRequest 4 | import logging 5 | # from bilibili.logger import llogger 6 | # from scrapy import log 7 | # loggers = llogger(__file__) 8 | 9 | class BiliSpider(scrapy.Spider): 10 | name = 'ordinary' # 这个名字就是上面连接中那个启动应用的名字 11 | allowed_domain = ["bilibili.com"] 12 | start_urls = [ 13 | "https://www.bilibili.com/" 14 | ] 15 | 16 | def start_requests(self): 17 | splash_args = { 18 | 'wait': '5', 19 | } 20 | for url in self.start_urls: 21 | yield SplashRequest(url, self.parse_result, args=splash_args, endpoint='render.html') 22 | 23 | def parse_result(self, response): 24 | logging.info('====================================================') 25 | content = response.xpath("//div[@class='num-wrap']").extract_first() 26 | logging.info(content) 27 | logging.info('====================================================') 28 | 29 | -------------------------------------------------------------------------------- /fraud/fraud/model/fraud.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sqlalchemy import Column, String , DateTime, Integer, Text 3 | from sqlalchemy.ext.declarative import declarative_base 4 | from fraud.model import db_config 5 | import datetime 6 | 7 | Base = declarative_base() 8 | 9 | class Fraud(Base): 10 | __tablename__ = 'tb_frauds2' 11 | 12 | id = Column(Integer, primary_key=True) 13 | executed_name = Column(String(300)) 14 | gender = Column(String(10)) 15 | age = Column(String(10)) 16 | identity_number = Column(String(50)) 17 | court = Column(String(200)) 18 | province = Column(String(50)) 19 | case_number = Column(String(100)) 20 | performance = Column(String(100)) # 被执行人的履行情况 21 | disrupt_type_name = Column(Text) # 失信被执行人行为具体情形 22 | duty = Column(Text) # 生效法律文书确定的义务 23 | release_time = Column(String(50)) 24 | crawl_time = Column(DateTime, default=datetime.datetime.now()) 25 | data_resource = Column(String(50), default='baidu_api') 26 | 27 | Base.metadata.create_all(db_config.engine) 28 | -------------------------------------------------------------------------------- /sz_yaohao/sandbox/models.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/9/26 9:25 4 | # 
@File : models.py 5 | 6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE 7 | from sqlalchemy.ext.declarative import declarative_base 8 | import datetime 9 | from sqlalchemy.orm import sessionmaker 10 | from sandbox import config 11 | 12 | Base = declarative_base() 13 | engine = create_engine('mysql+pymysql://{}:{}@{}:3367/spider?charset=utf8'.format(config.username,config.password,config.mysql_ip)) 14 | DBSession = sessionmaker(bind=engine) 15 | 16 | TABLE_NAME = '' 17 | 18 | # ORM 模型,根据项目需求修改 19 | class SpiderModels(Base): 20 | __tablename__ = TABLE_NAME 21 | 22 | # 根据项目修改字段 23 | id = Column(Integer, primary_key=True, autoincrement=True) 24 | card=Column(Text, comment='卡号') 25 | accountLength = Column(Text, comment='长度') 26 | origin = Column(String(30), comment='来源') 27 | crawltime = Column(DateTime, default=datetime.datetime.now(), comment='抓取时间') 28 | 29 | 30 | Base.metadata.create_all(engine) -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/commands/crawlall.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2019-08-29 16:56:28 4 | # @Author : Rocky Chen (weigesysu@qq.com) 5 | # @Link : http://30daydo.com 6 | # @Version : $1.0$ 7 | 8 | 9 | from scrapy.commands import ScrapyCommand 10 | from scrapy.crawler import CrawlerProcess 11 | class Command(ScrapyCommand): 12 | 13 | requires_project = True 14 | 15 | def syntax(self): 16 | return '[options]' 17 | 18 | def short_desc(self): 19 | return 'Runs all of the spiders - My Defined' 20 | 21 | def run(self,args,opts): 22 | print('==================') 23 | print(type(self.crawler_process)) 24 | spider_list = self.crawler_process.spiders.list() 25 | # 可以在这里 定义 spider_list = ['example','chouti'] 26 | for name in spider_list: 27 | print('=================') 28 | print(name) 29 | self.crawler_process.crawl(name,**opts.__dict__) 30 | 31 | self.crawler_process.start() 32 | 33 | 34 | -------------------------------------------------------------------------------- /MyLibrary/login.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import hashlib 3 | 4 | def login_session(username,password): 5 | s = bytes(password, encoding='utf8') 6 | m = hashlib.md5() 7 | m.update(s) 8 | first_md5 = m.hexdigest() 9 | headers = {'Referer': 'https://www.szlib.org.cn/MyLibrary/Reader-Access.jsp?infomistake=0&eventsite=WWW-044005', 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', 11 | 'X-Requested-With': 'XMLHttpRequest'} 12 | 13 | url = 'https://www.szlib.org.cn/MyLibrary/readerLoginM.jsp' 14 | data = {'rand': '', 15 | 'username': username, 16 | 'password': first_md5, 17 | 18 | } 19 | session=None 20 | session = requests.Session() 21 | 22 | r = session.post(url=url, headers=headers, data=data, timeout=15) 23 | print(r.text) 24 | if 'OK' in r.text: 25 | print('Crash !!!') 26 | print(username) 27 | print(password) 28 | 29 | return session 30 | -------------------------------------------------------------------------------- /myubbs/sandbox/models.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/9/26 9:25 4 | # @File : models.py 5 | 6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE 7 | from 
sqlalchemy.ext.declarative import declarative_base 8 | import datetime 9 | from sqlalchemy.orm import sessionmaker 10 | from sandbox import config 11 | 12 | Base = declarative_base() 13 | engine = create_engine('mysql+pymysql://{}:{}@{}:3306/db_rocky?charset=utf8'.format(config.username,config.password,config.mysql_ip)) 14 | DBSession = sessionmaker(bind=engine) 15 | 16 | TABLE_NAME = 'tb_myubbs' 17 | 18 | # ORM 模型,根据项目需求修改 19 | class SpiderModels(Base): 20 | __tablename__ = TABLE_NAME 21 | 22 | 23 | # 根据项目修改字段 24 | id = Column(Integer, primary_key=True, autoincrement=True) 25 | title = Column(String(400)) 26 | pubdate = Column(DateTime) 27 | content = Column(Text) 28 | author = Column(String(100)) 29 | url = Column(String(200)) 30 | crawltime = Column(DateTime, default=datetime.datetime.now(), comment='抓取时间') 31 | 32 | 33 | Base.metadata.create_all(engine) -------------------------------------------------------------------------------- /myubbs/sandbox/headers: -------------------------------------------------------------------------------- 1 | Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3 2 | Accept-Encoding: gzip, deflate 3 | Accept-Language: zh-CN,zh;q=0.9 4 | Cache-Control: no-cache 5 | Connection: keep-alive 6 | Cookie: MKG1_2132_saltkey=LnNTIT1F; MKG1_2132_lastvisit=1555164586; UM_distinctid=16a173f60854de-0f1c29779936eb-39395704-144000-16a173f60863cb; CNZZDATA3065925=cnzz_eid%3D1943346629-1555168187-http%253A%252F%252Fwww.myzsu.com%252F%26ntime%3D1555168187; MKG1_2132_seccode=103.e48171c76ce30999a4; MKG1_2132_visitedfid=97; MKG1_2132_st_p=0%7C1555169196%7C31ebb51b6faa73e0deaa417d1878522f; MKG1_2132_viewid=tid_140374; MKG1_2132_st_t=0%7C1555169280%7C31d0f95d5b85fe7f3c5028e0928583bb; MKG1_2132_forum_lastvisit=D_97_1555169280; MKG1_2132_lastact=1555169281%09home.php%09misc; MKG1_2132_sendmail=1 7 | Host: zsu.myubbs.com 8 | Pragma: no-cache 9 | Referer: http://zsu.myubbs.com/forum-97-1.html 10 | Upgrade-Insecure-Requests: 1 11 | User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36 -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/spiders/crawl_all_example.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import time 3 | class QuotesSpider(scrapy.Spider): 4 | name = "quotes" 5 | start_urls = ['http://quotes.toscrape.com/tag/humor/'] 6 | 7 | 8 | def parse(self, response): 9 | time.sleep(15) 10 | print(f'in spider {self.name}') 11 | for quote in response.css('div.quote'): 12 | print(quote.css('span.text::text').extract_first()) 13 | 14 | def close(self,reason): 15 | print('===================== spider close ================') 16 | 17 | class QuotesSpider1(scrapy.Spider): 18 | name = "quotes_1" 19 | start_urls = ['http://quotes.toscrape.com/tag/humor/'] 20 | 21 | def parse(self, response): 22 | print('meta content ==============') 23 | print(response.meta) 24 | print('meta content ==============') 25 | 26 | print(f'in spider {self.name}') 27 | for quote in response.css('div.quote'): 28 | print(quote.css('span.text::text').extract_first()) 29 | 30 | def close(self,reason): 31 | print('===================== spider close ================') 32 | -------------------------------------------------------------------------------- /chahaoba/sandbox/models.py: 
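A hedged read-back sketch for the ORM model defined in this models.py; it assumes the same DBSession and SpiderModels names from the file below, and the province value is purely illustrative. It only shows how the table filled by chahaoba's SQLPipeline can be queried afterwards:

from sandbox.models import DBSession, SpiderModels

session = DBSession()
try:
    # print a few scraped number-segment rows for one example province as a sanity check
    for row in session.query(SpiderModels).filter(SpiderModels.province == '广东').limit(10):
        print(row.number, row.city, row.op, row.card_type)
finally:
    session.close()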
-------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/9/26 9:25 4 | # @File : models.py 5 | 6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE 7 | from sqlalchemy.ext.declarative import declarative_base 8 | import datetime 9 | from sqlalchemy.orm import sessionmaker 10 | from sandbox import config 11 | 12 | 13 | Base = declarative_base() 14 | engine = create_engine( 15 | 'mysql+pymysql://{}:{}@{}:3367/spider?charset=utf8'.format(config.username, config.password, config.mysql_ip)) 16 | DBSession = sessionmaker(bind=engine) 17 | 18 | TABLE_NAME = 'chahaoba' 19 | # 20 | # 21 | # # ORM 模型,根据项目需求修改 22 | class SpiderModels(Base): 23 | __tablename__ = TABLE_NAME 24 | 25 | # 根据项目修改字段 26 | id = Column(Integer, primary_key=True, autoincrement=True) 27 | 28 | number = Column(String(11), comment='手机号段') 29 | city = Column(String(10), comment='城市') 30 | province = Column(String(10), comment='省份') 31 | card_type = Column(String(10), comment='手机卡类型') 32 | op = Column(String(10), comment='运营商') 33 | card_detail = Column(String(80), comment='卡详细') 34 | 35 | 36 | Base.metadata.create_all(engine) 37 | -------------------------------------------------------------------------------- /kc0011/sandbox/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | # from sandbox.models import SpiderModels, DBSession 8 | 9 | import logging 10 | import pymongo 11 | from sandbox import settings 12 | from sandbox.items import SpiderItem 13 | 14 | class MongoPipeline(object): 15 | def __init__(self): 16 | self.db = pymongo.MongoClient(settings.MONGO_HOST, port=settings.MONGO_PORT) 17 | self.doc1 = self.db[settings.MONGODB_DB][settings.MONGODB_DOC] 18 | self.doc2 = self.db[settings.MONGODB_DB][settings.MONGODB_DOC2] 19 | try: 20 | self.doc2.ensure_index('url',unique=True) 21 | except Exception as e: 22 | print(e) 23 | 24 | def process_item(self, item, spider): 25 | if isinstance(item,SpiderItem): 26 | 27 | insert_item = dict(item) 28 | self.doc1.insert(insert_item) 29 | 30 | else: 31 | 32 | insert_item = dict(item) 33 | self.doc2.insert(insert_item) 34 | 35 | return item 36 | -------------------------------------------------------------------------------- /chahaoba/sandbox/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from sandbox.models import SpiderModels, DBSession 8 | import logging 9 | import pymongo 10 | import pymysql 11 | from sandbox import config 12 | from sandbox import settings 13 | from scrapy.exceptions import DropItem 14 | 15 | class SQLPipeline(object): 16 | def __init__(self): 17 | self.session = DBSession() 18 | 19 | def process_item(self, item, spider): 20 | 21 | obj = SpiderModels( 22 | number=item['_number'], 23 | city = item['_city'], 24 | province = item['_province'], 25 | card_type = item['_card_type'], 26 | op = item['_op'], 27 | card_detail = item['_card_detail'], 28 | ) 29 | self.session.add(obj) 30 | 31 | try: 32 | self.session.commit() 33 | 34 | except Exception as e: 35 
| print(e) 36 | logging.error('>>>> 重复数据') 37 | self.session.rollback() 38 | raise DropItem(item) 39 | else: 40 | return item 41 | 42 | -------------------------------------------------------------------------------- /Forbes/main.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | __author__ = 'Rocky' 4 | ''' 5 | http://30daydo.com 6 | Email: weigesysu@qq.com 7 | ''' 8 | import requests 9 | from lxml import etree 10 | import pymongo 11 | 12 | db = pymongo.MongoClient('127.0.0.1') 13 | collection = db['forbes']['2017'] 14 | 15 | def getContent(url, retry =5): 16 | headers = {'User-Agent':'User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'} 17 | for _ in range(retry): 18 | try: 19 | r = requests.get(url,headers=headers,timeout=20) 20 | if r: 21 | return r 22 | except Exception as e: 23 | print(e) 24 | continue 25 | return None 26 | 27 | def getItem(): 28 | colums = ['number','name','money','enterprise','living'] 29 | r = getContent('http://www.forbeschina.com/review/list/002399.shtml') 30 | # print(r.text) 31 | tree = etree.HTML(r.text) 32 | items = tree.xpath('//tbody/tr') 33 | for item in items: 34 | d = dict(zip(colums,item.xpath('.//td/text()'))) 35 | print(d) 36 | collection.insert(d) 37 | 38 | def main(): 39 | getItem() 40 | 41 | if __name__ == '__main__': 42 | main() -------------------------------------------------------------------------------- /bbssmth/bbssmth/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import logging 8 | 9 | from elasticsearch import Elasticsearch 10 | from bbssmth.settings import ES_HOST 11 | 12 | 13 | class BbssmthPipeline(object): 14 | def __init__(self): 15 | self.index = 'newsmth' 16 | self.doc = 'doc' 17 | self.es = Elasticsearch(ES_HOST) 18 | 19 | def process_item(self, item, spider): 20 | body = { 21 | 'title': item.get('title'), 22 | 'url': item.get('url'), 23 | 'content': item.get('content'), 24 | 'author': item.get('author'), 25 | 'crawltime': item.get('crawltime'), 26 | 'reply': item.get('reply'), 27 | 'category': item.get('category'), 28 | 'create_time':item.get('create_time'), 29 | 30 | } 31 | 32 | try: 33 | self.es.index(index=self.index, doc_type=self.doc, body=body) 34 | except Exception as e: 35 | logging.error('错误 >>>>>') 36 | logging.error(e) 37 | return item 38 | -------------------------------------------------------------------------------- /52sh/config_file.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2020/9/24 12:12 4 | # @File : config_file.py 5 | 6 | START_URL = 'http://www.52sh.com.tw/index.php/main/knowledge/65/page/{page}' 7 | HEADERS = { 8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 9 | "Accept-Encoding": "gzip, deflate", 10 | "Accept-Language": "zh,en;q=0.9,en-US;q=0.8,zh-CN;q=0.7", 11 | "Cache-Control": "no-cache", 12 | "Cookie": "PHPSESSID=a3oqieou2ik4a987ksq2bm3354; _ga=GA1.3.1399498082.1600914935; _gid=GA1.3.1565426161.1600914935", 13 | "Host": "www.52sh.com.tw", 14 | "Pragma": "no-cache", 15 |
"Proxy-Connection": "keep-alive", 16 | "Referer": "http://www.52sh.com.tw/index.php/main/knowledge/65/page/105", 17 | "Upgrade-Insecure-Requests": "1", 18 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36", 19 | } 20 | PROXY = {'http': 'http://127.0.0.1:58083'} 21 | PROXY_STR = 'http://127.0.0.1:58083' 22 | SIMPLE_HEADERS = { 23 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36", 24 | } -------------------------------------------------------------------------------- /51CTOCrawler/demo.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import subprocess 3 | def demo_validate(): 4 | url='http://v22.51cto.com/2018/12/19/338483/e899/high/loco_video_323000_{}.ts' 5 | for i in range(112): 6 | r=requests.get(url.format(i)) 7 | with open('loco_video_323000_{}.ts'.format(i),'wb') as f: 8 | f.write(r.content) 9 | 10 | def write_confile(ts_len): 11 | txt = '' 12 | for i in range(ts_len): 13 | txt += "file 'C:\\git\\CrawlMan\\51CTOCrawler\\loco_video_323000_{}.ts'\n".format(i) 14 | with open('confile.txt', 'w') as fout: 15 | fout.write(txt) 16 | 17 | def merge_ts_video(title, v_type='.mp4'): 18 | cmd = 'ffmpeg -f concat -safe 0 -i confile.txt -c copy %s%s' %(title, v_type) 19 | print(cmd) 20 | p = subprocess.Popen(cmd, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 21 | out, err = p.communicate() 22 | print(str(out, 'utf-8')) 23 | print(str(err, 'utf-8')) 24 | 25 | def run_cmd(): 26 | import os 27 | name = 'loco_video_323000_{}.ts' 28 | args = '+'.join([name.format(i) for i in range(112)]) 29 | cmd = 'copy /b '+args + ' test.ts' 30 | print(cmd) 31 | os.system(cmd) 32 | 33 | # demo_validate() 34 | write_confile(112) 35 | merge_ts_video('wanttoplay') 36 | #run_cmd() -------------------------------------------------------------------------------- /poi_gaode/sandbox/models.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/9/26 9:25 4 | # @File : models.py 5 | 6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE 7 | from sqlalchemy.ext.declarative import declarative_base 8 | import datetime 9 | from sqlalchemy.orm import sessionmaker 10 | from sandbox import config 11 | 12 | Base = declarative_base() 13 | engine = create_engine('mysql+pymysql://{}:{}@{}:3367/spider?charset=utf8'.format(config.username,config.password,config.mysql_ip)) 14 | DBSession = sessionmaker(bind=engine) 15 | 16 | TABLE_NAME = 'card_bin_scrapy' 17 | 18 | # ORM 模型,根据项目需求修改 19 | class SpiderModels(Base): 20 | __tablename__ = TABLE_NAME 21 | 22 | 23 | # 根据项目修改字段 24 | id = Column(Integer, primary_key=True, autoincrement=True) 25 | card=Column(Text, comment='卡号') 26 | accountLength = Column(Text, comment='长度') 27 | cardName = Column(Text, comment='卡名') 28 | cardType = Column(Text, comment='卡类型') 29 | mainAccount = Column(Text, comment='主账号') 30 | mainValue = Column(Text, comment='主账号值') 31 | orgName = Column(Text, comment='发卡行') 32 | 33 | origin = Column(String(30), comment='来源') 34 | crawltime = Column(DateTime, default=datetime.datetime.now(), comment='抓取时间') 35 | 36 | 37 | Base.metadata.create_all(engine) -------------------------------------------------------------------------------- /fraud/fraud/match.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from model.fraud import Fraud 4 | from model.db_config import DBSession, RedisPool 5 | import sys 6 | 7 | reload(sys) 8 | sys.setdefaultencoding('utf8') 9 | f = open("id_name.txt") 10 | line = f.readline() 11 | total_num, match_num, name_match_num = [0, 0, 0] 12 | 13 | session = DBSession() 14 | r_pool = RedisPool(client_db=1) 15 | r = r_pool.redis_pool() 16 | while line: 17 | id_num = line[0:18] 18 | formatted_id_num = id_num[0:11] + '*' * 4 + id_num[14:] 19 | # print line 20 | name = line[19:-1].strip() 21 | try: 22 | fraud_info = session.query(Fraud).filter_by(identity_number=formatted_id_num).first() 23 | except: 24 | session.rollback() 25 | if fraud_info: 26 | match_num += 1 27 | if name.encode('gb2312') == fraud_info.executed_name.encode('gb2312'): 28 | name_match_num += 1 29 | else: 30 | r.set(fraud_info.identity_number, 1) 31 | total_num += 1 32 | line = f.readline() 33 | 34 | f.close() 35 | session.close() 36 | print('样本总量:%s' % total_num) 37 | print('匹配成功数量:%s' % match_num) 38 | print('匹配率:%s' % ((match_num/total_num) * 100), '%') 39 | print('姓名身份证号匹配成功个数:%s' % name_match_num) 40 | print('姓名身份证号匹配率:%s' % ((name_match_num/match_num) * 100), '%') 41 | 42 | -------------------------------------------------------------------------------- /MyLibrary/sandbox/sandbox/models.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/9/26 9:25 4 | # @File : models.py 5 | 6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE 7 | from sqlalchemy.ext.declarative import declarative_base 8 | import datetime 9 | from sqlalchemy.orm import sessionmaker 10 | from sandbox import config 11 | 12 | # Base = declarative_base() 13 | # engine = create_engine('mysql+pymysql://{}:{}@{}:3367/spider?charset=utf8'.format(config.username,config.password,config.mysql_ip)) 14 | # DBSession = sessionmaker(bind=engine) 15 | # 16 | # TABLE_NAME = 'card_bin_scrapy' 17 | # 18 | # # ORM 模型,根据项目需求修改 19 | # class SpiderModels(Base): 20 | # __tablename__ = TABLE_NAME 21 | # 22 | # 23 | # # 根据项目修改字段 24 | # id = Column(Integer, primary_key=True, autoincrement=True) 25 | # card=Column(Text, comment='卡号') 26 | # accountLength = Column(Text, comment='长度') 27 | # cardName = Column(Text, comment='卡名') 28 | # cardType = Column(Text, comment='卡类型') 29 | # mainAccount = Column(Text, comment='主账号') 30 | # mainValue = Column(Text, comment='主账号值') 31 | # orgName = Column(Text, comment='发卡行') 32 | # # origin = Column(String(30), comment='来源') 33 | # crawtime = Column(DateTime, default=datetime.datetime.now(), comment='抓取时间') 34 | # 35 | # 36 | # Base.metadata.create_all(engine) -------------------------------------------------------------------------------- /github_star/star.py: -------------------------------------------------------------------------------- 1 | import sys, json, os, requests 2 | 3 | if len(sys.argv) < 2 or len(sys.argv[1]) == 0: 4 | print('Check your GitHub ID ...\n demo :\n python github_counter.py rockyzsu') 5 | exit() 6 | 7 | print('Search...') 8 | github_id = sys.argv[1] 9 | url = 'https://api.github.com/users/{github_id}/repos?page={page_id}' 10 | repo_list = [] 11 | page_id = 1 12 | while True: 13 | r = requests.get(url.format(github_id=github_id, page_id=page_id)) 14 | if r.status_code != 200: 15 | print('check your network connections') 16 | exit() 17 | 18 
| repo_array = json.loads(r.content.decode('utf-8')) 19 | if len(repo_array) == 0: 20 | break 21 | 22 | for repo in repo_array: 23 | if not repo['fork']: 24 | repo_list.append([repo['name'], repo['stargazers_count'], repo['forks_count'],'' if repo['description'] is None else repo['description']]) 25 | page_id += 1 26 | 27 | # sort by number of stars 28 | repo_list = sorted(repo_list, key=lambda x: x[1], reverse=True) 29 | 30 | print('=' * 55) 31 | print('\n'.join(['{: <30}★{: <10}\tfork {:<10}\t{:<30} '.format(*repo) for repo in repo_list])) 32 | print('=' * 55) 33 | print('{: <30}★{: <10}\tfork {} '.format('total', sum([i[1] for i in repo_list]), sum([i[2] for i in repo_list]))) 34 | print('='*55) 35 | print('{:<30}\t{:<30}'.format('total_repo_count',len(repo_list))) 36 | -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/monitor/app.py: -------------------------------------------------------------------------------- 1 | # *-* coding:utf-8 *-* 2 | ''' 3 | @author: ioiogoo 4 | @date: 2016/12/25 15:00 5 | ''' 6 | import json 7 | from flask import Flask, render_template, jsonify, request, current_app 8 | import redis 9 | from settings import * 10 | 11 | app = Flask(__name__) 12 | 13 | 14 | @app.route('/') 15 | def index(): 16 | return render_template('index.html', timeinterval=TIMEINTERVAL, stats_keys=STATS_KEYS) 17 | 18 | 19 | @app.route('/ajax') 20 | def ajax(): 21 | key = request.args.get('key') 22 | result = current_app.r.lrange(key, -POINTLENGTH, -1)[::POINTINTERVAL] 23 | if not current_app.spider_is_run: 24 | # spider is closed 25 | return json.dumps(result), 404 26 | return json.dumps(result) 27 | 28 | 29 | @app.route('/signal') 30 | def signal(): 31 | signal = request.args.get('sign') 32 | if signal == 'closed': 33 | current_app.spider_is_run = False 34 | elif signal == 'running': 35 | current_app.spider_is_run = True 36 | return jsonify('') 37 | 38 | 39 | @app.before_first_request 40 | def init(): 41 | current_app.r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB,decode_responses=True) 42 | current_app.spider_is_run = True if current_app.r.get('spider_is_run') == '1' else False 43 | 44 | 45 | if __name__ == '__main__': 46 | app.run(debug=True, host=APP_HOST, port=APP_PORT) 47 | -------------------------------------------------------------------------------- /cuiqingcai/async_sandbox/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymysql 8 | from twisted.enterprise import adbapi 9 | import logging 10 | class AsyncSQLPipeline(object): 11 | def __init__(self): 12 | self.dbpool = adbapi.ConnectionPool('pymysql',host='',port='',user='',password='',db='spider') 13 | # self.cursor = self.conn.cursor() 14 | 15 | def process_item(self, item, spider): 16 | update_=self.dbpool.runInteraction(self.update,item) 17 | update_.addErrback(self.handle_error,item,spider) 18 | 19 | return item 20 | 21 | def update(self,cursor,item): 22 | insert_sql = 'insert into tb_cuiqingcai (category,title,article_url,content,author,created_at,liked,visited,comment,crawltime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' 23 | 
data=(item['category'],item['title'],item['article_url'],item['content'],item['author'],item['created_at'],item['liked'],item['visited'],item['comment'],item['crawltime'] 24 | ) 25 | cursor.execute(insert_sql,data) 26 | 27 | def handle_error(self,failure,item,spider): 28 | logging.error('写入数据库异常--->') 29 | logging.error(failure) 30 | logging.error('error item') 31 | logging.error(item) -------------------------------------------------------------------------------- /jd/switch_ip.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2020/3/30 21:50 4 | # @File : switch_ip.py 5 | 6 | import os 7 | import time 8 | from config import AD_PASSWORD, AD_USER 9 | 10 | g_adsl_account = {"name": "adsl", # 这个可以随意写 下面user和pwd 账号密码 11 | "username": AD_USER, 12 | "password": AD_PASSWORD} 13 | 14 | 15 | class ADSL(object): 16 | 17 | def __init__(self): 18 | self.name = g_adsl_account["name"] 19 | self.username = g_adsl_account["username"] 20 | self.password = g_adsl_account["password"] 21 | 22 | # set_adsl : 修改adsl设置 23 | 24 | def set_adsl(self, account): 25 | self.name = account["name"] 26 | self.username = account["username"] 27 | self.password = account["password"] 28 | 29 | # connect : 宽带拨号 30 | 31 | def connect(self): 32 | cmd_str = "rasdial %s %s %s" % (self.name, self.username, self.password) 33 | os.system(cmd_str) 34 | time.sleep(5) 35 | 36 | # disconnect : 断开宽带连接 37 | 38 | def disconnect(self): 39 | cmd_str = "rasdial %s /disconnect" % self.name 40 | os.system(cmd_str) 41 | time.sleep(5) 42 | 43 | # reconnect : 重新进行拨号 44 | 45 | def reconnect(self): 46 | print('自动拨号') 47 | self.disconnect() 48 | self.connect() 49 | 50 | 51 | if __name__ == '__main__': 52 | a = ADSL() 53 | a.reconnect() 54 | -------------------------------------------------------------------------------- /qianfangyiguan/qianfan_models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from sqlalchemy import create_engine 4 | from sqlalchemy.orm import sessionmaker, relationship 5 | from sqlalchemy.ext.declarative import declarative_base 6 | from sqlalchemy import Column, String, DateTime, Integer, Text, ForeignKey, Float 7 | from sqlalchemy import event 8 | from sqlalchemy import DDL 9 | 10 | engine = create_engine('mysql+pymysql://root:@localhost:3306/db_parker?charset=utf8') 11 | DBSession = sessionmaker(bind=engine) 12 | Base = declarative_base() 13 | 14 | 15 | class Apps(Base): 16 | __tablename__ = 'tb_apps3' 17 | id = Column(Integer, primary_key=True) 18 | app_rank = Column(Integer, index=True) 19 | appName = Column(String(150), index=True) 20 | developCompanyFullName = Column(String(180),index=True) 21 | second_cateName = Column(String(150)) 22 | first_cateName = Column(String(150)) 23 | appId = Column(String(150)) 24 | activeNums = Column(Float) 25 | activeAvgDay = Column(Float) 26 | runtimeAvgDay = Column(Float) 27 | runtimeAvgPersonRatio = Column(Float) 28 | activeAvgDayRatio = Column(Float) 29 | runtimeNums = Column(Float) 30 | launchNums = Column(Float) 31 | runtimeNumsRatio = Column(Float) 32 | launchAvgDayRatio = Column(Float) 33 | statDate = Column(DateTime) 34 | developCompanyAbbr = Column(String(180)) 35 | 36 | 37 | Base.metadata.create_all(engine) 38 | -------------------------------------------------------------------------------- /dashiye/main.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # 
@Time : 2020/4/26 20:20 4 | # @File : main.py 5 | 6 | import requests 7 | import numpy as np 8 | 9 | 10 | code = input('请输入股票代码:') 11 | 12 | cookies = { 13 | 'PHPSESSID': 'jqb0q4h60h4bmtj5bkd9bjuv00', 14 | 'Hm_lvt_210e7fd46c913658d1ca5581797c34e3': '1587903421', 15 | 'Hm_lpvt_210e7fd46c913658d1ca5581797c34e3': '1587903461', 16 | } 17 | 18 | headers = { 19 | 'Origin': 'http://www.dashiyetouzi.com', 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 21 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 22 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 23 | 'X-Requested-With': 'XMLHttpRequest', 24 | 'Referer': 'http://www.dashiyetouzi.com/tools/compare/historical_valuation.php', 25 | } 26 | 27 | data = { 28 | 'report_type': 'totalValue', 29 | 'report_stock_id': code, 30 | 'from_date': '2015-04-26', 31 | 'to_date': '2020-04-26' 32 | } 33 | 34 | response = requests.post('http://www.dashiyetouzi.com/tools/compare/historical_valuation_data.php', headers=headers, cookies=cookies, data=data, verify=False) 35 | js=response.json() 36 | data=js.get('list') 37 | all_point=[] 38 | for item in data: 39 | all_point.append(item[1]) 40 | 41 | 42 | np_data = np.array(all_point) 43 | print(f'中值:{np.median(np_data)}') 44 | print(f'最小值:{np.min(np_data)}') 45 | -------------------------------------------------------------------------------- /myubbs/sandbox/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from sandbox.models import SpiderModels, DBSession 8 | import logging 9 | import pymongo 10 | from sandbox import config 11 | from sandbox import settings 12 | 13 | class SQLPipeline(object): 14 | def __init__(self): 15 | self.session = DBSession() 16 | 17 | def process_item(self, item, spider): 18 | 19 | obj = SpiderModels( 20 | title=item['title'], 21 | pubdate = item['pubdate'], 22 | content = item['content'], 23 | author = item['author'], 24 | url = item['url'], 25 | crawltime=item['crawltime'], 26 | ) 27 | self.session.add(obj) 28 | 29 | try: 30 | self.session.commit() 31 | 32 | except Exception as e: 33 | self.session.rollback() 34 | logging.error('>>>> 插入数据库失败{}'.format(e)) 35 | return item 36 | 37 | 38 | class MongoPipeline(object): 39 | def __init__(self): 40 | DOCUMENT = settings.MONGODB_DOC 41 | self.db = pymongo.MongoClient(config.mongo_ip, port=27018) 42 | self.doc = self.db['spider'][DOCUMENT] 43 | 44 | def process_item(self, item, spider): 45 | insert_item = dict(item) 46 | self.doc.insert(insert_item) 47 | 48 | return item 49 | -------------------------------------------------------------------------------- /MyLibrary/sandbox/sandbox/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | # from sandbox.models import SpiderModels, DBSession 8 | import logging 9 | import pymongo 10 | from sandbox import config 11 | 12 | 13 | # class SQLPipeline(object): 14 | # def __init__(self): 15 | # self.session = DBSession() 16 | # 17 | # def process_item(self, item, spider):
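# NOTE: in this MyLibrary project the SQL pipeline is checked in commented out, so only the
# MongoPipeline below is live code. To bring the SQL path back, uncomment the class and register
# both pipelines in the project's settings.py; a hypothetical registration (module path and the
# priority numbers are assumptions, adjust them to the real project layout) would be:
#   ITEM_PIPELINES = {
#       'sandbox.pipelines.SQLPipeline': 300,
#       'sandbox.pipelines.MongoPipeline': 400,
#   }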
18 | # 19 | # obj = SpiderModels( 20 | # card=item['card'], 21 | # accountLength=item['accountLength'], 22 | # cardName=item['cardName'], 23 | # cardType=item['cardType'], 24 | # mainAccount=item['mainAccount'], 25 | # mainValue=item['mainValue'], 26 | # orgName=item['orgName'], 27 | # ) 28 | # self.session.add(obj) 29 | # 30 | # try: 31 | # self.session.commit() 32 | # 33 | # except Exception as e: 34 | # logging.error('>>>> 插入数据库失败{}'.format(e)) 35 | # return item 36 | 37 | 38 | class MongoPipeline(object): 39 | def __init__(self): 40 | DOCUMENT = 'szlib' 41 | self.db = pymongo.MongoClient(config.mongo_ip, port=config.mongo_port) 42 | self.doc = self.db['spider'][DOCUMENT] 43 | 44 | def process_item(self, item, spider): 45 | self.doc.insert(dict(item)) 46 | return item 47 | -------------------------------------------------------------------------------- /lanrentingshu/lrts/lrts/spiders/tingshu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy import Request 4 | 5 | class TingshuSpider(scrapy.Spider): 6 | name = 'tingshu' 7 | 8 | # allowed_domains = ['www.lrts.me'] 9 | # start_urls = ['http://www.lrts.me/'] 10 | 11 | def start_requests(self): 12 | headers = {'Host': 'www.lrts.me', 'Proxy-Connection': 'keep-alive', 'Accept': '*/*', 13 | 'X-Requested-With': 'XMLHttpRequest', 14 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/65.0.3325.162Safari/537.36', 15 | 'Referer': 'http://www.lrts.me/playlist', 'Accept-Encoding': 'gzip,deflate', 16 | 'Accept-Language': 'zh-CN,zh;q=0.9', 17 | 'Cookie': 'aliyungf_tc=AQAAAF1znybVVQsAByAmG3Fs/DLq2DNK;CNZZDATA1254668430=264272103-1533047311-null%7C1533047311;Hm_lvt_ada61571fd48bb3f905f5fd1d6ef0ec4=1533051241;uid=1533051247919aea3a93a713a48c4a8d2221a0db33cc5;JSESSIONID=472B70BC34B8D0027B3B20AAE935E662;Hm_lpvt_ada61571fd48bb3f905f5fd1d6ef0ec4=1533051318'} 18 | 19 | url = 'http://www.lrts.me/ajax/playlist/2/6458' 20 | yield Request(url=url,headers=headers) 21 | 22 | def parse(self, response): 23 | download_list = response.xpath('//input[@name="source"]/@value').extract() 24 | item={} 25 | item['file_urls']=[] 26 | for each in download_list: 27 | item['file_urls'].append(each) 28 | yield item 29 | -------------------------------------------------------------------------------- /lanrentingshu/lanrentingshu.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import urllib 3 | 4 | import os 5 | import requests 6 | import time 7 | from lxml import etree 8 | from header_toolkit import getheader 9 | 10 | 11 | def spider(): 12 | curr=os.getcwd() 13 | target_dir=os.path.join(curr,'data') 14 | if not os.path.exists(target_dir): 15 | os.mkdir(target_dir) 16 | for i in range(1, 100, 10): 17 | url = 'http://www.lrts.me/ajax/playlist/2/32551/%d' % i 18 | headers = { 19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'} 20 | s = requests.get(url=url, headers=headers) 21 | tree = etree.HTML(s.text) 22 | nodes = tree.xpath('//*[starts-with(@class,"clearfix section-item section")]') 23 | print len(nodes) 24 | for node in nodes: 25 | filename = node.xpath('.//div[@class="column1 nowrap"]/span/text()')[0] 26 | link = node.xpath('.//input[@name="source" and @type="hidden"]/@value')[0] 27 | 28 | print link 29 | post_fix=link.split('.')[-1] 30 | full_path= filename+'.'+post_fix 31 | filename = 
os.path.join(target_dir, full_path) 32 | # 修改这一段,多线程下载 33 | if not os.path.isfile(filename): 34 | urllib.urlretrieve(link, filename) 35 | time.sleep(1) 36 | else: 37 | continue 38 | 39 | 40 | if __name__ == '__main__': 41 | spider() 42 | -------------------------------------------------------------------------------- /pornhub/newJs.js: -------------------------------------------------------------------------------- 1 | var quality_1080p =/* + radra27radra27 + */rahttpsra83rahttpsra83 + /* + rancomvira35rancomvira35 + */raevphncdra57raevphncdra57 + /* + radra27radra27 + */rancomvira35rancomvira35 + /* + ra006163ra73ra006163ra73 + */radeos202ra16radeos202ra16 + /* + ra09ratera79ra09ratera79 + */ra006163ra73ra006163ra73 + /* + ra1080p4ra73ra1080p4ra73 + */ra24075351ra94ra24075351ra94 + /* + raroiu6qra26raroiu6qra26 + */ra1080p4ra73ra1080p4ra73 + /* + ra000k324ra70ra000k324ra70 + */ra000k324ra70ra000k324ra70 + /* + rancomvira35rancomvira35 + */ra075351mra26ra075351mra26 + /* + ravalidtora49ravalidtora49 + */rap4validra25rap4validra25 + /* + ra209hashra72ra209hashra72 + */rafrom160ra56rafrom160ra56 + /* + ra1080p4ra73ra1080p4ra73 + */ra6708909ra29ra6708909ra29 + /* + ra209hashra72ra209hashra72 + */ravalidtora49ravalidtora49 + /* + ramgdmctbvra11ramgdmctbvra11 + */ra16067161ra17ra16067161ra17 + /* + ra24075351ra94ra24075351ra94 + */ra09ratera79ra09ratera79 + /* + ra50000kbra49ra50000kbra49 + */ra50000kbra49ra50000kbra49 + /* + ramgdmctbvra11ramgdmctbvra11 + */raurst500ra63raurst500ra63 + /* + ra209hashra72ra209hashra72 + */ra00kip4ra41ra00kip4ra41 + /* + raroiu6qra26raroiu6qra26 + */ra72419ra91ra72419ra91 + /* + ra09ratera79ra09ratera79 + */ra209hashra72ra209hashra72 + /* + raro7upu3ra66raro7upu3ra66 + */raroiu6qra26raroiu6qra26 + /* + ra075351mra26ra075351mra26 + */ra2bmkdz7nra36ra2bmkdz7nra36 + /* + ra50000kbra49ra50000kbra49 + */ramgdmctbvra11ramgdmctbvra11 + /* + radeos202ra16radeos202ra16 + */raro7upu3ra66raro7upu3ra66 + /* + ra075351mra26ra075351mra26 + */radra27radra27; 2 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | ## 爬虫合集 2 | * 51CTOCrawler: 爬取51CTO的视频,并通过ffmpeg合并 3 | * 51jbnet: 51脚本内容爬取 4 | * 52sh: 台湾52社区网站 妹子图片爬取 5 | * anjuke:安居客爬虫 6 | * async_cuiqingcai: 异步爬取崔庆才博客内容 7 | * baiduwanpan: 暴力破解百度网盘密码 8 | * bbssmth:水木清华爬虫 9 | * bilibili:bilibili视频抓取 10 | * chahaoba: 查号吧 遍历所有手机号码归属地 11 | * chinaclear: 中登网开户人数爬取 12 | * cnbeta: cnbeta爬虫 13 | * csdn:csdn博客排名抓取 14 | * cuiqingcai:崔庆才博客爬取 15 | * dfcf: 东方财富股吧爬取,爬取所有个股的股吧帖子,可通过参数控制爬取指定日期 16 | * enterprise: 爬取工商企业数据 17 | * Ergeduoduo:儿歌多多 [http://30daydo.com/article/236](http://30daydo.com/article/236) 18 | * Forbes:福布斯排名爬虫 19 | * fraud: 失信被执行人爬取 20 | * github_star: 获取github某个人的所有仓库,星星总数 21 | * htqyy: 好听轻音乐 爬取轻音乐mp3 22 | * jd:京东图书爬取 23 | * kc0011:投资咨询网 24 | * lanrentingshu:每天心理学 (懒人听书)[http://30daydo.com/article/231](http://30daydo.com/article/231) 25 | * MyLibrary:图书馆抓取个人的阅读记录 26 | * pornhub: p站视频下载 27 | * poi_gaode:根据经纬度范围,在高德地图上遍历数据 28 | * qianfangyiguan:千帆易观数据抓取 29 | * szhouse: 深圳房价官网爬取 30 | * tiexue: 军事网站 铁血网内容爬取 31 | * stockholder:股东数据抓取 32 | * tencentjob:腾讯工作岗位爬取 33 | * ximalaya:喜马拉雅音频爬取 [http://30daydo.com/article/503](http://30daydo.com/article/503) 34 | * yinyonbao:应用宝app排名数据抓取 35 | * youdao_dictionary:有道词典js加密破解 [http://30daydo.com/article/416](http://30daydo.com/article/416) 36 | * zhihu:知乎分布式爬取 37 | ### QA疑问 38 | 代码库代码均已通过本人爬取测试可行,如果你使用本代码遇到问题,可邮件咨询。 39 | 
**上述只是本人代码库中部分展示代码,还有大量的非公开爬虫代码如:国家工商系统爬虫,淘宝网等,可以联系本人提供** 40 | **同时本人也承接各类爬虫业务** 41 | 42 | 或者爬虫的朋友可以加QQ群,一起分享项目。 43 | 44 | 45 | 46 | ###### 做爬虫的朋友经常会遇到别人已经爬过的网站,然后自己刚好需要,互补有无,提高效率。 47 | 48 | ###### QQ群:759746506 49 | 50 | 51 | 52 | 公众号: 53 | 54 | ![appMarket](https://s3.ax1x.com/2021/01/05/skCVPJ.jpg) -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/monitor/statscol.py: -------------------------------------------------------------------------------- 1 | # *-* coding:utf-8 *-* 2 | ''' 3 | @author: ioiogoo 4 | @date: 2016/12/25 16:50 5 | ''' 6 | 7 | import redis 8 | from .settings import STATS_KEYS 9 | import time 10 | import requests 11 | import json 12 | r = redis.Redis(host='10.18.6.46', port=6379, db=0,decode_responses=True) 13 | Time = lambda: time.strftime('%Y-%m-%d %H:%M:%S') 14 | 15 | 16 | class StatcollectorMiddleware(object): 17 | def __init__(self): 18 | self.r = redis.Redis(host='10.18.6.46', port=6379, db=0,decode_responses=True) 19 | self.stats_keys = STATS_KEYS 20 | 21 | def process_request(self, request, spider): 22 | self.formatStats(spider.crawler.stats.get_stats()) 23 | 24 | def formatStats(self, stats): 25 | for key in self.stats_keys: 26 | key_value = stats.get(key, None) 27 | if not key_value: continue 28 | value = {"value": [Time(), key_value]} 29 | content = json.dumps(value) 30 | print(f'key content {key}') 31 | print(f'value -->{content}') 32 | self.insert2redis(key, content) 33 | 34 | def insert2redis(self, key, value): 35 | self.r.rpush(key, value) 36 | 37 | 38 | class SpiderRunStatspipeline(object): 39 | def open_spider(self, spider): 40 | print('open SpiderRunStatspipeline') 41 | r.set('spider_is_run', 1) 42 | requests.get('http://127.0.0.1:5000/signal?sign=running') 43 | 44 | def close_spider(self, spider): 45 | r.set('spider_is_run', 0) 46 | requests.get('http://127.0.0.1:5000/signal?sign=closed') -------------------------------------------------------------------------------- /weibo/weibo/spiders/wb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider, FormRequest, Request 3 | 4 | 5 | class WbSpider(Spider): 6 | name = 'wb' 7 | 8 | headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 9 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Cache-Control': 'no-cache', 10 | 'Connection': 'keep-alive', 11 | # 'Cookie': 'ALF=1539744188;SCF=Arejsw06Aa86L7rLsj3RRh8YiCul1z1Yapy6v1kQNGNbjcNLV3LPZbziAEtRKYVOAL_s5JKT2rck3tB7VAtepd4.;SUB=_2A252m2dXDeRhGedH7lcT8y7Fwj-IHXVSZAkfrDV6PUJbktAKLRejkW1NUKTAOGny8CQfH8IlGwCeP72gG_Pf_dFi;SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWIFwD6xpqyuh9_mA2jr6on5JpX5K-hUgL.Fo24SK-Ee0541Ke2dJLoI7LCdcSuwHvAMN-t;SUHB=0Ryruv0xgZvGM5;SSOLoginState=1537152775;_T_WM=ae5298708cece22521d281346fac7744', 12 | 'Host': 'weibo.cn', 'Pragma': 'no-cache', 13 | 'Referer': 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword=000001&page=2', 14 | 'Upgrade-Insecure-Requests': '1', 15 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36'} 16 | 17 | def start_requests(self): 18 | keyword = '000001' 19 | for page in range(1, 2): 20 | url = 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword=000001&page=1' 21 | yield Request(url=url, headers=self.headers) 22 | 23 | def parse(self, response): 24 | # print(response.text) 25 | response.xpath('//div[@class="c" and 
contains(@id,"M_")]') -------------------------------------------------------------------------------- /sz_yaohao/sandbox/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from sandbox.models import SpiderModels, DBSession 8 | import logging 9 | import pymongo 10 | from sandbox import config 11 | from sandbox import settings 12 | 13 | class SQLPipeline(object): 14 | def __init__(self): 15 | self.session = DBSession() 16 | 17 | def process_item(self, item, spider): 18 | 19 | obj = SpiderModels( 20 | card=item['card'], 21 | accountLength=item['accountLength'], 22 | cardName=item['cardName'], 23 | cardType=item['cardType'], 24 | mainAccount=item['mainAccount'], 25 | mainValue=item['mainValue'], 26 | orgName=item['orgName'], 27 | origin=item['origin'], 28 | crawltime=item['crawltime'], 29 | ) 30 | self.session.add(obj) 31 | 32 | try: 33 | self.session.commit() 34 | 35 | except Exception as e: 36 | logging.error('>>>> 插入数据库失败{}'.format(e)) 37 | return item 38 | 39 | 40 | class MongoPipeline(object): 41 | def __init__(self): 42 | DOCUMENT = settings.MONGODB_DOC 43 | self.db = pymongo.MongoClient(settings.MONGO_HOST, port=settings.MONGO_PORT) 44 | self.doc = self.db['spider'][DOCUMENT] 45 | 46 | def process_item(self, item, spider): 47 | insert_item = dict(item) 48 | self.doc.insert(insert_item) 49 | 50 | return item 51 | -------------------------------------------------------------------------------- /poi_gaode/gaode_map.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2018/12/6 10:39 4 | # @File : gaode_map.py 5 | import requests 6 | from math import radians, cos, sin, asin, sqrt 7 | import config 8 | import json 9 | 10 | def demo(): 11 | key=config.key 12 | url =f'https://restapi.amap.com/v3/place/polygon?polygon=116.460988,40.006919|116.48231,40.007381|116.47516,39.99713|116.472596,39.985227|116.45669,39.984989|116.460988,40.006919&keywords=kfc&output=json&key={key}' 13 | r = requests.get(url) 14 | print(r.json()) 15 | 16 | def haversine(lon1, lat1, lon2, lat2): # 经度1,纬度1,经度2,纬度2 (十进制度数) 17 | """ 18 | Calculate the great circle distance between two points 19 | on the earth (specified in decimal degrees) 20 | """ 21 | # 将十进制度数转化为弧度 22 | lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) 23 | 24 | # haversine公式 25 | dlon = lon2 - lon1 26 | dlat = lat2 - lat1 27 | a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2 28 | c = 2 * asin(sqrt(a)) 29 | r = 6371 # 地球平均半径,单位为公里 30 | 31 | return c * r * 1000 32 | 33 | 34 | def long_lati_change(): 35 | lbs = [(22.7100061372,113.7915802002), 36 | (22.7866273171,114.3717956543), 37 | (22.5404642212,113.9189529419), 38 | (22.5487084710,114.2375564575), 39 | (22.6586902908,114.2598724365), 40 | ] 41 | for i in lbs: 42 | print(f'{i[1]},{i[0]}|',end='') 43 | # demo() 44 | # 114.04308499999999,22.527853|114.04808499999999,22.522853 45 | lati1,long1=22.527853,114.04308499999999 46 | lati2,long2=22.522853,114.04808499999999 47 | print(haversine(long1,lati1,long2,lati2)) 48 | # long_lati_change() 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /tencentjob/tencentjob/spiders/tencent.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | import scrapy 5 | from scrapy.linkextractors import LinkExtractor 6 | from scrapy.spiders import CrawlSpider, Rule 7 | from tencentjob.items import TencentjobItem 8 | 9 | 10 | class TencentSpider(CrawlSpider): 11 | name = 'tencent' 12 | allowed_domains = ['tencent.com'] 13 | start_urls = ['https://hr.tencent.com/position.php'] 14 | rules = [ 15 | # 多个条件 16 | Rule(LinkExtractor(allow=("start=\d+"))), 17 | Rule(LinkExtractor(allow=("position_detail\.php")), follow=True, callback='parse_item') 18 | ] 19 | 20 | def parse_item(self, response): 21 | item = TencentjobItem() 22 | 23 | title = response.xpath('//*[(@id = "sharetitle")]/text()').extract_first() 24 | workLocation = response.xpath('//*[@class="lightblue l2"]/../text()').extract_first() 25 | catalog = response.xpath('//*[@class="lightblue"]/../text()').extract_first() 26 | recruitNumber = response.xpath('//*[@class="lightblue"]/../text()').re('(\d+)')[0] 27 | duty_pre = response.xpath('//*[@class="squareli"]').extract_first() 28 | duty = re.sub('<.*?>', '', duty_pre) 29 | 30 | Job_requirement_pre = response.xpath('//*[@class="squareli"]').extract_first() 31 | Job_requirement = re.sub('<.*?>', '', Job_requirement_pre) 32 | 33 | item['title'] = title 34 | item['url'] = response.url 35 | item['workLocation'] = workLocation 36 | item['catalog'] = catalog 37 | item['recruitNumber'] = recruitNumber 38 | item['duty'] = duty 39 | item['Job_requirement'] = Job_requirement 40 | 41 | yield item 42 | -------------------------------------------------------------------------------- /szhouse/database.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | __author__ = 'Rocky' 3 | import sqlite3 4 | 5 | def create_table(): 6 | conn = sqlite3.connect('shenzhen_house.db') 7 | try: 8 | create_tb_cmd=''' 9 | CREATE TABLE IF NOT EXISTS HOUSE 10 | ('日期' TEXT, 11 | '一手房套数' TEXT, 12 | '一手房面积' TEXT, 13 | '二手房套数' TEXT, 14 | '二手房面积' TEXT); 15 | ''' 16 | #主要就是上面的语句 17 | conn.execute(create_tb_cmd) 18 | except: 19 | print("Create table failed") 20 | return False 21 | 22 | 23 | conn.execute(create_tb_cmd) 24 | conn.commit() 25 | conn.close() 26 | 27 | def insert(date,one_hand,one_area,second_hand,second_area): 28 | conn = sqlite3.connect('shenzhen_house.db') 29 | print("open database passed") 30 | 31 | cmd="INSERT INTO HOUSE ('日期','一手房套数','一手房面积','二手房套数','二手房面积') VALUES('%s','%s','%s','%s','%s');" %(date,one_hand,one_area,second_hand,second_area) 32 | #works 要么加\" 33 | #paul_su="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(5,'%s',32,'CALIFORNIA',2000.00);" %temp2 34 | #works 要么加 ’‘ 35 | 36 | #allen="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(2,'ALLEN',72,'CALIFORNIA',20500.00);" 37 | #teddy="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(3,'TEDDY',732,'CALIFORNIA',52000.00);" 38 | #mark="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(4,'MARK',327,'CALIFORNIA',3000.00);" 39 | #sun="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(?,?,?,?,?);" 40 | #conn.execute("INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(?,?,32,'CALIFORNIA',2000.00)",temp) 41 | 42 | conn.execute(cmd) 43 | 44 | conn.commit() 45 | conn.close() -------------------------------------------------------------------------------- /fraud/fraud/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 
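# Annotation: the two pipelines defined below cooperate — DuplicatesPipeline drops
# any item whose case_number has already been recorded in Redis, and FraudPipeline
# persists the remaining items to the database through the SQLAlchemy session from
# fraud.model.db_config. A minimal registration sketch, assuming the package is
# importable as `fraud` (priority numbers are illustrative only; the project's own
# settings.py may already register these with different values):
#
#   ITEM_PIPELINES = {
#       'fraud.pipelines.DuplicatesPipeline': 200,  # lower number runs first: de-dup in Redis
#       'fraud.pipelines.FraudPipeline': 300,       # then write the surviving items to the DB
#   }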
2 | from fraud.model.fraud import Fraud 3 | from fraud.model.db_config import DBSession, RedisPool 4 | from scrapy.exceptions import DropItem 5 | import datetime 6 | import json 7 | class FraudPipeline(object): 8 | 9 | def open_spider(self, spider): 10 | self.session = DBSession() 11 | 12 | def process_item(self, item, spider): 13 | # item = json.dumps(dict(item)).decode('unicode-escape') 14 | f = Fraud(executed_name=item['executed_name'], 15 | gender=item['gender'], 16 | age=item['age'], 17 | identity_number=item['identity_number'], 18 | court=item['court'], 19 | province=item['province'], 20 | case_number=item['case_number'], 21 | performance=item['performance'], 22 | disrupt_type_name=item['disrupt_type_name'], 23 | duty=item['duty'], 24 | release_time=item['release_time'], 25 | crawl_time=datetime.datetime.now()) 26 | self.session.add(f) 27 | try: 28 | self.session.commit() 29 | except Exception as e: 30 | print(e) 31 | self.session.rollback() 32 | 33 | return item 34 | 35 | def close_spider(self, spider): 36 | self.session.close() 37 | 38 | class DuplicatesPipeline(object): 39 | def process_item(self, item, spider): 40 | pool = RedisPool() 41 | r = pool.redis_pool() 42 | if r.exists('id_num: %s' % item['case_number']): 43 | raise DropItem("Duplicate item found: %s" % item['case_number']) 44 | else: 45 | r.set('id_num: %s' % item['case_number'], 1) 46 | return item 47 | -------------------------------------------------------------------------------- /szhouse/house.py: -------------------------------------------------------------------------------- 1 | #-*-coding=utf-8-*- 2 | __author__ = 'rocky' 3 | # 网页源码修改 废弃使用 4 | #获取每天深圳一手房,二手房的成交套数与面积,并且写入数据库 5 | #主要就是正则表达抓取几个数字 6 | import re 7 | import database 8 | import requests 9 | 10 | def getContent(): 11 | url="http://ris.szpl.gov.cn/" 12 | one_hand="credit/showcjgs/ysfcjgs.aspx" 13 | second_hand="credit/showcjgs/esfcjgs.aspx" 14 | # req=urllib2.Request(url+one_hand) 15 | # content=urllib2.urlopen(req).read() 16 | #返回的就是网页的源码,没有做任何防爬虫的处理,zf网站,呵呵 17 | #print content 18 | headers={'User-Agent':'Mozilla/5.0 (WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'} 19 | content = requests.get(url=url+one_hand,headers=headers).text 20 | 21 | date=re.compile(r'(.*)') 22 | reg=re.compile(r'(\d+)') 23 | result=reg.findall(content) 24 | current_date=date.findall(content) 25 | 26 | reg2=re.compile(r'(.*?)') 27 | yishou_area=reg2.findall(content) 28 | 29 | 30 | print(current_date[0]) 31 | print('一手商品房成交套数:%s' % result[0]) 32 | print('一手商品房成交面积: %s' % yishou_area[0]) 33 | 34 | 35 | # sec_req=urllib2.Request(url+second_hand) 36 | # sec_content=urllib2.urlopen(sec_req).read() 37 | 38 | sec_content = requests.get(url+second_hand).text 39 | 40 | sec_quantity=re.compile(r'(\d+)') 41 | sec_result=sec_quantity.findall(sec_content) 42 | second_area=re.findall(r'(.*?)',sec_content) 43 | 44 | print('二手商品房成交套数:%s' % sec_result[1]) 45 | print('二手商品房成交面积: %s' % second_area[2]) 46 | database.create_table() 47 | database.insert(current_date[0],result[0],yishou_area[0],sec_result[1],second_area[2]) 48 | 49 | getContent() -------------------------------------------------------------------------------- /ximalaya/story.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2019/10/18 18:04 4 | # @File : story.py 5 | 6 | # 睡前故事 7 | import os 8 | 9 | import requests,datetime,re 10 | 11 | 
url='http://mobwsa.ximalaya.com/mobile-album/album/page/ts-1571392955128?ac=WIFI&albumId=260744&device=android&isAsc=false&isQueryInvitationBrand=true&isVideoAsc=true&pageId={}&pageSize=100&pre_page=0&source=5&supportWebp=true' 12 | headers = {'User-Agent': 'Xiaomi'} 13 | 14 | def download(): 15 | 16 | for i in range(1, 2): # 只下载一页 17 | 18 | r = requests.get(url=url.format(i), headers=headers) 19 | js_data = r.json() 20 | data_list = js_data.get('data', {}).get('tracks',{}).get('list',[]) 21 | 22 | for item in data_list: 23 | trackName = item.get('title') 24 | trackName = re.sub('[\/\\\:\*\?\"\<\>\|]', '_', trackName) 25 | # trackName=re.sub(':','',trackName) 26 | src_url = item.get('playUrl64') 27 | orderNo = item.get('orderNo') 28 | 29 | filename = '{}-{}.mp3'.format(orderNo,trackName) 30 | if not os.path.exists(filename): 31 | 32 | try: 33 | r0 = requests.get(src_url, headers=headers,timeout=3600) 34 | except Exception as e: 35 | print(e) 36 | print(trackName) 37 | r0 = requests.get(src_url, headers=headers,timeout=3600) 38 | 39 | 40 | 41 | with open(filename, 'wb') as f: 42 | f.write(r0.content) 43 | print('{}下载完成'.format(filename)) 44 | 45 | else: 46 | print(f'{filename}已经下载过了') 47 | 48 | if __name__=='__main__': 49 | print(f'start at {datetime.datetime.now()}') 50 | download() 51 | print(f'end at {datetime.datetime.now()}') 52 | -------------------------------------------------------------------------------- /pornhub/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | __pycache__ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask instance folder 57 | instance/ 58 | 59 | # Scrapy stuff: 60 | .scrapy 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # IPython Notebook 69 | .ipynb_checkpoints 70 | *.ipynb 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | 92 | .idea/* 93 | .DS_Store 94 | .vscode 95 | settings.yaml 96 | tmp/* 97 | test/ 98 | *.sqlite 99 | result/* 100 | logs/* 101 | tasks/result/* 102 | *.swp 103 | web/upload/* 104 | *.png 105 | *.yaml 106 | 107 | download* 108 | *.zip 109 | mp4/ 110 | webm/ 111 | nohup.out 112 | -------------------------------------------------------------------------------- /stockholder/main.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | import requests 3 | from lxml import etree 4 | import pymongo 5 | import tushare as ts 6 | client = pymongo.MongoClient('10.18.6.102') 7 | doc = client['secutiry']['shareholder'] 8 | 9 | __author__ = 'Rocky' 10 | 11 | ''' 12 | http://30daydo.com 13 | Email: weigesysu@qq.com 14 | ''' 15 | def getContent(code): 16 | url = 'http://quotes.money.163.com/f10/gdfx_{}.html'.format(code) 17 | 18 | headers = {'User-Agent':'Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/65.0.3325.162Safari/537.36'} 19 | for i in range(5): 20 | try: 21 | r = requests.get(url, headers=headers) 22 | if r.status_code==200: 23 | return r.text 24 | except Exception,e: 25 | print e 26 | continue 27 | 28 | return None 29 | 30 | def parser(code): 31 | text = getContent(code,) 32 | document={} 33 | if text is not None: 34 | tree = etree.HTML(text) 35 | name = tree.xpath('//div[@id="dateTable"]/table/tr/td[1]/text()') 36 | percent = tree.xpath('//div[@id="dateTable"]/table/tr/td[2]/text()') 37 | number = tree.xpath('//div[@id="dateTable"]/table/tr/td[3]/text()') 38 | # print name 39 | # print percent 40 | # print number 41 | d = {} 42 | for index,value in enumerate(name): 43 | # print index 44 | k = name[index] 45 | p=percent[index] 46 | n=number[index] 47 | if '.' 
in k: 48 | k=k.replace('.','_') 49 | d[k]=(p,n) 50 | document[code]=d 51 | doc.insert(document) 52 | 53 | def all_stocks(): 54 | df = ts.get_stock_basics() 55 | for i in df.index: 56 | parser(i) 57 | 58 | def main(): 59 | # parser('000011') 60 | all_stocks() 61 | 62 | if __name__ == '__main__': 63 | main() -------------------------------------------------------------------------------- /kc0011/async_mongo.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2019/11/26 8:55 4 | # @File : async_mongo.py 5 | import asyncio 6 | from urllib.parse import urlparse 7 | import pymongo 8 | import threading 9 | from motor.motor_asyncio import AsyncIOMotorClient 10 | import motor 11 | from pymongo.errors import DuplicateKeyError 12 | 13 | #异步更新mongo数据库 14 | 15 | db_host = '192.168.10.48' 16 | db_port = 17001 17 | uri = 'mongodb://{0}:{1}'.format( 18 | db_host, db_port) # db_name 认证数据库 19 | db = motor.motor_tornado.MotorClient(uri)['spider'] # 认证完成后需要连接要用的数据库 20 | 21 | # client = AsyncIOMotorClient(MONGO_HOST, port=MONGO_PORT) 22 | # db = client['hedgehog_spider'] 23 | # db.authenticate(name='Zane', password='*#06#', source='admin') 24 | 25 | doc = db['KC0011_content'] 26 | block = 500 27 | total = 124684 28 | 29 | iter_number = total // block 30 | 31 | remain_part = total % block 32 | import re 33 | 34 | re_pattern = re.compile('&page=\d+') 35 | 36 | 37 | async def run(): 38 | for i in range(iter_number + 1): 39 | 40 | small_part = doc.find({}, {'_id': 1, 'url': 1}).limit(block).skip(i * block) 41 | 42 | async for item in small_part: 43 | url = item.get('url') 44 | idx = item.get('_id') 45 | if re.search(re_pattern,url): 46 | # print(url) 47 | 48 | url_ = re.sub(re_pattern, '', url) 49 | 50 | try: 51 | await doc.update_one( 52 | {'_id': idx}, 53 | {'$set': {'url': url_}} 54 | ) 55 | 56 | except DuplicateKeyError as e: 57 | print(e) 58 | print('删除此doc {}'.format(url)) 59 | await doc.delete_one({'_id':idx}) 60 | 61 | except Exception as e: 62 | print(e) 63 | 64 | 65 | asyncio.get_event_loop().run_until_complete(run()) 66 | -------------------------------------------------------------------------------- /Ergeduoduo/main.py: -------------------------------------------------------------------------------- 1 | #-*-coding=utf-8-*- 2 | import sys,os 3 | import requests 4 | from lxml import etree 5 | import subprocess 6 | session = requests.Session() 7 | def getContent(url): 8 | # url='http://www.iqiyi.com/v_19rrkwcx6w.html' 9 | try: 10 | ret = requests.get(url) 11 | ret.encoding='utf-8' 12 | # except Exception,e: 13 | except: 14 | # print e 15 | return None 16 | if ret.status_code==200: 17 | return ret.text 18 | else: 19 | return None 20 | 21 | def getUrl(): 22 | url='http://www.iqiyi.com/v_19rrkwcx6w.html' 23 | url2='http://www.iqiyi.com/v_19rrl2td7g.html' # 31-61 24 | content = getContent(url) 25 | if not content: 26 | print "network issue, retry" 27 | exit(0) 28 | root = etree.HTML(content,parser=etree.HTMLParser(encoding='utf-8')) 29 | elements=root.xpath('//div[@data-current-count="1"]//li') 30 | for items in elements: 31 | url_item=items.xpath('.//a/@href')[0] 32 | song_url = url_item.replace('//','') 33 | song_url=song_url.strip() 34 | print(song_url) 35 | # name=items.xpath('.//span[@class="item-num"]/text()')[0] 36 | name=items.xpath('.//span[@class="item-num"]/text()')[0].encode('utf-8').strip()+\ 37 | ' '+items.xpath('.//span[@class="item-txt"]/text()')[0].encode('utf-8').strip()+'.mp4' 38 | name= '儿歌多多 '+name 39 | 
name=name.decode('utf-8') 40 | filename=os.path.join(os.getcwd(),name) 41 | print filename 42 | if os.path.exists(filename): 43 | continue 44 | p=subprocess.Popen('python you-get -d --format=HD {}'.format(song_url),stderr=subprocess.PIPE,stdout=subprocess.PIPE,shell=True) 45 | output,error = p.communicate() 46 | print(output) 47 | print(error) 48 | p.wait() 49 | 50 | 51 | def main(): 52 | getUrl() 53 | 54 | if __name__ == '__main__': 55 | main() -------------------------------------------------------------------------------- /poi_gaode/sandbox/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import datetime 8 | 9 | from sandbox.models import SpiderModels, DBSession 10 | import logging 11 | import pymongo 12 | from sandbox import config 13 | from sandbox import settings 14 | from pymongo.errors import DuplicateKeyError 15 | from scrapy.exceptions import DropItem 16 | # class SQLPipeline(object): 17 | # def __init__(self): 18 | # self.session = DBSession() 19 | # 20 | # def process_item(self, item, spider): 21 | # 22 | # obj = SpiderModels( 23 | # card=item['card'], 24 | # accountLength=item['accountLength'], 25 | # cardName=item['cardName'], 26 | # cardType=item['cardType'], 27 | # mainAccount=item['mainAccount'], 28 | # mainValue=item['mainValue'], 29 | # orgName=item['orgName'], 30 | # origin=item['origin'], 31 | # crawltime=item['crawltime'], 32 | # ) 33 | # self.session.add(obj) 34 | # 35 | # try: 36 | # self.session.commit() 37 | # 38 | # except Exception as e: 39 | # logging.error('>>>> 插入数据库失败{}'.format(e)) 40 | # return item 41 | 42 | 43 | class MongoPipeline(object): 44 | def __init__(self): 45 | DOCUMENT = settings.MONGODB_DOC 46 | self.db = pymongo.MongoClient(config.mongo_ip, port=27018) 47 | self.doc = self.db['spider'][DOCUMENT] 48 | 49 | def process_item(self, item, spider): 50 | insert_item = dict(item) 51 | insert_item['crawltime']=datetime.datetime.now() 52 | try: 53 | self.doc.insert(insert_item) 54 | except DuplicateKeyError: 55 | raise DropItem('drop item {}'.format(insert_item['id'])) 56 | 57 | return item 58 | -------------------------------------------------------------------------------- /pornhub/cookies_access.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | headers = { 4 | 'authority': 'cn.pornhub.com', 5 | 'pragma': 'no-cache', 6 | 'cache-control': 'no-cache', 7 | 'upgrade-insecure-requests': '1', 8 | 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36', 9 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 10 | 'sec-fetch-site': 'none', 11 | 'sec-fetch-mode': 'navigate', 12 | 'sec-fetch-dest': 'document', 13 | 'accept-language': 'zh,en;q=0.9,en-US;q=0.8,zh-CN;q=0.7', 14 | 'cookie': 'FastPopSessionRequestNumber=11; bs=0hwo170h8b27c5b55tt3ux7b8xkukol0; ss=630427593672619545; bitmovin_analytics_uuid=48eeeda8-bcfe-47f6-84fb-dd172921281a; platform_cookie_reset=pc; fg_9d12f2b2865de2f8c67706feaa332230=56077.100000; fg_7133c455c2e877ecb0adfd7a6ec6d6fe=32682.100000; ats_jp_vkey=ph5f29d906ac970; 
il=v1yKrZvlyVIqstonKh7Cf8kS4JOEHaOX5I0jleVOp8p6sxNjE0NjQ3MTgwaExRdXp5LXY2QVV4dnhhZmV1NncydDhpam15N1NMamk2dFc5bENEXw..; expiredEnterModalShown=1; platform=pc; fg_a197b3a83beb75c5f0255dc465e9f2de=3629.100000; ua=dcc77110dea38e3cff8b12436648706c; fanClubInfoPop=1; FastPopSessionRequestNumber=9', 15 | } 16 | 17 | params = ( 18 | ('s', 'eyJrIjoiMDgxOTU1NjU4MGNjZjQyOTQ1ODVkZTdhNjM5NjkyMjQzNWE1NzdjYSIsInQiOjE2MDkyMTYwNDJ9'), 19 | ('v', 'ph5fe22b22c2a32'), 20 | ('e', '0'), 21 | ) 22 | 23 | response = requests.get('https://cn.pornhub.com/video/get_media', headers=headers, params=params) 24 | 25 | #NB. Original query string below. It seems impossible to parse and 26 | #reproduce query strings 100% accurately so the one below is given 27 | #in case the reproduced version is not "correct". 28 | # response = requests.get('https://cn.pornhub.com/video/get_media?s=eyJrIjoiM2JkNzk3OTc3MDYxNjdhN2NiZjg3ZjAxN2YxMDI3YTY3MjNkOWNmMyIsInQiOjE2MDkyMTE5MzJ9&v=ph5c7a39b625845&e=0', headers=headers) 29 | print(response.json()) -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/RedisDuplicator.py: -------------------------------------------------------------------------------- 1 | import redis 2 | from scrapy.dupefilters import BaseDupeFilter 3 | # 自定义dupefilter 4 | class DupeFilter(BaseDupeFilter): 5 | 6 | def __init__(self,host,port,db,key,reset): 7 | print('='*20) 8 | print('using my dupefilter ') 9 | print('='*20) 10 | self.r = redis.StrictRedis(host=host,port=port,db=db) 11 | self.key = key 12 | self.reset = reset 13 | 14 | 15 | @classmethod 16 | def from_settings(cls, settings): 17 | # result=(dict(settings)) 18 | 19 | # name=settings.get('BOT_NAME') 20 | # print(f'name is {name}') 21 | host=settings.get('REDIS_HOST','127.0.0.1') 22 | port=settings.get('REDIS_PORT',6379) 23 | 24 | print(f'host:{host},port {port}') 25 | db=settings.get('REDIS_DB',0) 26 | redis_key=settings.get('REDIS_KEY') 27 | 28 | 29 | print(f'redis key{redis_key}') 30 | user=settings.get('USER_AGENT') 31 | print(user) 32 | if redis_key is None: 33 | raise ValueError('No value assign to redis_key') 34 | 35 | reset=settings.getbool('REDIS_REST',False) 36 | 37 | 38 | 39 | return cls(host,port,db,redis_key,reset) 40 | 41 | def request_seen(self, request): 42 | 43 | if self.r.sismember(self.key,request.url): 44 | print(f'url ---{request.url}---has been seen 重复URL') 45 | 46 | return True 47 | 48 | else: 49 | # print('add an url in redis') 50 | self.r.sadd(self.key,request.url) 51 | 52 | return False 53 | 54 | def open(self): # can return deferred 55 | pass 56 | 57 | def close(self, reason): # can return a deferred 58 | print('dup closed') 59 | 60 | if self.reset: 61 | print(f'delete redis key {self.key}') 62 | self.r.delete(self.key) 63 | 64 | def log(self, request, spider): # log that a request has been filtered 65 | pass -------------------------------------------------------------------------------- /sz_yaohao/sandbox/spiders/website.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import re 4 | 5 | import requests 6 | import scrapy 7 | from scrapy import Request, FormRequest 8 | import logging 9 | import redis 10 | from sandbox.items import SpiderItem 11 | from sandbox.utility import get_header 12 | from sandbox.config import code_url 13 | 14 | # post 15 | class WebPostSpider(scrapy.Spider): 16 | name = 'website' 17 | headers = { 18 | 19 | } 20 | post_url = 
'https://apply.jtys.sz.gov.cn/apply/app/increment/person/login' 21 | img_url = 'http://apply.jtys.sz.gov.cn/apply/app/validCodeImage' 22 | 23 | def __init__(self, *args, **kwargs): 24 | super(WebPostSpider, self).__init__(*args, **kwargs) 25 | self.headers = get_header() 26 | 27 | self.data = { 28 | 'loginType': 'MOBILE', 29 | 'loginCode': '', 30 | 'password': '', 31 | 'validCode': '', 32 | } 33 | 34 | def start_requests(self): 35 | 36 | yield Request( 37 | url=self.img_url, 38 | headers=self.headers 39 | ) 40 | def parse(self,response): 41 | # TO DO 42 | img = response.body 43 | 44 | # with open('test.jpg','wb') as f: 45 | # f.write(img) 46 | r=requests.post(code_url,data=img) 47 | js_data = r.json() 48 | if js_data.get('success'): 49 | code = js_data.get('message') 50 | post_data=self.data.copy() 51 | post_data['validCode']=code 52 | # input('input code') 53 | yield FormRequest(url=self.post_url, 54 | headers=self.headers, 55 | formdata=post_data, 56 | callback=self.check_login, 57 | ) 58 | 59 | def check_login(self,response): 60 | content=response.text 61 | if '忘记密码' in content: 62 | print('密码错误') 63 | else: 64 | print('找到密码') 65 | 66 | -------------------------------------------------------------------------------- /fangtianxia/fangtianxia_proxy_test.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import hashlib 3 | import time 4 | import requests 5 | 6 | # 找群主购买 my_app_key, myappsecret, 以及蚂蚁代理服务器的 mayi_url 地址和 mayi_port 端口 7 | my_app_key = "" 8 | app_secret = "" 9 | mayi_url = 's3.proxy.mayidaili.com' 10 | mayi_port = '8123' 11 | 12 | # 蚂蚁代理服务器地址 13 | mayi_proxy = {'http': 'http://{}:{}'.format(mayi_url, mayi_port)} 14 | 15 | # 准备去爬的 URL 链接 16 | #url = 'http://1212.ip138.com/ic.asp' 17 | testUrl='http://members.3322.org/dyndns/getip' 18 | # 计算签名 19 | timesp = '{}'.format(time.strftime("%Y-%m-%d %H:%M:%S")) 20 | codes = app_secret + 'app_key' + my_app_key + 'timestamp' + timesp + app_secret 21 | sign = hashlib.md5(codes.encode('utf-8')).hexdigest().upper() 22 | 23 | # 拼接一个用来获得蚂蚁代理服务器的「准入」的 header (Python 的 concatenate '+' 比 join 效率高) 24 | authHeader = 'MYH-AUTH-MD5 sign=' + sign + '&app_key=' + my_app_key + '×tamp=' + timesp 25 | 26 | # 用 Python 的 Requests 模块。先订立 Session(),再更新 headers 和 proxies 27 | 28 | user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0" 29 | # cookie_read=open('cookie').read().strip() 30 | headers = {"User-agent": user_agent, 'upgrade-insecure-requests': '1', 31 | 'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6', 32 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 33 | 'accept-encoding': 'gzip, deflate', 'Cache-Control': 'no-cache'} 34 | ''' 35 | s = requests.Session() 36 | s.headers.update({'Proxy-Authorization': authHeader}) 37 | s.proxies.update(mayi_proxy) 38 | s.headers.update(headers) 39 | s.headers.update({'Proxy-Authorization': authHeader}) 40 | pg = s.get(testUrl) # tuple: 300 代表 connect timeout, 270 代表 read timeout 41 | print(pg.text) 42 | print(pg.status_code) 43 | ''' 44 | headers['Proxy-Authorization']=authHeader 45 | while 1: 46 | r=requests.get(url=testUrl,headers=headers,proxies=mayi_proxy) 47 | print(r.status_code) 48 | #r.encoding='gb2312' 49 | print(r.text) 50 | time.sleep(10) 51 | #pg.encoding = 'GB18030' 52 | -------------------------------------------------------------------------------- /dfcf/settings.py: 
-------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2020/3/31 23:36 4 | # @File : settings.py 5 | import time 6 | 7 | import config 8 | import requests 9 | 10 | headers = { 11 | 'Connection': 'keep-alive', 12 | # 'Upgrade-Insecure-Requests': '1', 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 15 | 'Referer': 'http://guba.eastmoney.com/list,300750_2.html', 16 | 'Accept-Encoding': 'gzip, deflate', 17 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8', 18 | } 19 | 20 | cookies = { 21 | 'qgqp_b_id': '4d112e2089d3c5855c8ca2d1f2947ecd', 22 | 'em_hq_fls': 'js', 23 | 'st_si': '98016728708487', 24 | 'HAList': 'a-sh-601799-%u661F%u5B87%u80A1%u4EFD%2Ca-sh-600729-%u91CD%u5E86%u767E%u8D27%2Ca-sz-000063-%u4E2D%u5174%u901A%u8BAF%2Cf-0-399300-%u6CAA%u6DF1300', 25 | 'emshistory': '%5B%22%E6%98%9F%E5%AE%87%E8%82%A1%E4%BB%BD%22%2C%22601799%22%2C%22300496%22%2C%22dfcf%22%5D', 26 | 'st_asi': 'delete', 27 | 'st_pvi': '04745525503534', 28 | 'st_sp': '2019-10-28%2011%3A48%3A22', 29 | 'st_inirUrl': 'https%3A%2F%2Fwww.baidu.com%2Flink', 30 | 'st_sn': '132', 31 | 'st_psi': '20200401002426450-117001301474-3984682985', 32 | } 33 | 34 | def get_proxy(retry=10): 35 | count = 0 36 | proxyurl = 'http://{}:8101/dynamicIp/common/getDynamicIp.do'.format( 37 | config.PROXIES_OLD) 38 | for i in range(retry): 39 | try: 40 | r = requests.get(proxyurl, timeout=10) 41 | # print('获取的代理ip ' + r.text) 42 | except Exception as e: 43 | print(e) 44 | count += 1 45 | print('代理获取失败,重试' + str(count)) 46 | time.sleep(1) 47 | 48 | else: 49 | js = r.json() 50 | proxyServer = 'http://{0}:{1}'.format(js.get('ip'), js.get('port')) 51 | proxies_random = { 52 | 'http': proxyServer 53 | } 54 | return proxies_random 55 | -------------------------------------------------------------------------------- /holdle/sync_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020/11/24 21:42 3 | # @File : sync_spider.py 4 | # @Author : Rocky C@www.30daydo.com 5 | import requests 6 | import sys 7 | sys.path.append('..') 8 | import asyncio 9 | import datetime 10 | import aiohttp 11 | import re 12 | import time 13 | from parsel import Selector 14 | from configure.settings import DBSelector 15 | from common.BaseService import BaseService 16 | 17 | SLEEP = 2 18 | 19 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0', 20 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'} 21 | 22 | URL_MAP = {'home_page': 'https://holdle.com/stocks/industry', 'base': 'https://holdle.com'} 23 | 24 | 25 | class Holdle(BaseService): 26 | 27 | def __init__(self): 28 | super(Holdle, self).__init__() 29 | 30 | self.DB = DBSelector() 31 | self.client = self.DB.mongo(location_type='qq', async_type=True) 32 | self.session = requests.Session() 33 | 34 | def run(self): 35 | start = time.time() 36 | 37 | response = self.session.get(url=URL_MAP['home_page'], headers=headers) 38 | html = response.text # 这个阻塞 39 | resp = Selector(text=html) 40 | industries = resp.xpath('//ul[@class="list-unstyled"]/a') 41 | for industry in industries: 42 | json_data = {} 43 | industry_url = industry.xpath('.//@href').extract_first() 44 | industry_name = 
industry.xpath('.//li/text()').extract_first() 45 | json_data['industry_url'] = industry_url 46 | json_data['industry_name'] = industry_name 47 | self.detail_list(industry_url, json_data) 48 | 49 | end = time.time() 50 | print(f'time used {end-start}') 51 | 52 | def detail_list(self, url, json_data): 53 | 54 | response = self.session.get(URL_MAP['base']+url, headers=headers) 55 | response =response.text 56 | self.parse_detail(response, json_data) 57 | 58 | def parse_detail(self, html, json_data=None): 59 | resp = Selector(text=html) 60 | title =resp.xpath('//title/text()').extract_first() 61 | print(title) 62 | 63 | 64 | app = Holdle() 65 | app.run() 66 | -------------------------------------------------------------------------------- /51jbnet/im_sandbox/models.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2019/5/16 17:34 4 | # @File : models.py 5 | from contextlib import contextmanager 6 | from datetime import datetime 7 | 8 | from sqlalchemy import create_engine 9 | from sqlalchemy import Column, Integer, String, Date, DateTime, Text 10 | from sqlalchemy.orm import sessionmaker, scoped_session 11 | from sqlalchemy.ext.declarative import declarative_base 12 | 13 | from im_sandbox.settings import MYSQL_DB_URI 14 | 15 | # declare a Mapping,this is the class describe map to table column 16 | Base = declarative_base() 17 | engine = create_engine(MYSQL_DB_URI) 18 | session_factory = sessionmaker(bind=engine) 19 | Session = scoped_session(session_factory) 20 | 21 | 22 | @contextmanager 23 | def scoped_session(): 24 | session = Session() 25 | try: 26 | yield session 27 | session.commit() 28 | except: 29 | session.rollback() 30 | raise 31 | finally: 32 | session.close() 33 | 34 | 35 | class SpiderModel(Base): 36 | __tablename__ = 'testdb' 37 | id = Column(Integer, primary_key=True, autoincrement=True) 38 | score = Column(Integer, nullable=False, default=0) 39 | catid = Column(Integer, nullable=False, default=0) 40 | score_story = Column(String(512), nullable=False, default='') 41 | hometext = Column(String(1024), nullable=False, default='') 42 | counter = Column(Integer, nullable=False, default=0) 43 | inputtime = Column(DateTime, nullable=False, default=datetime.now()) 44 | topic = Column(Integer, nullable=False, default=0) 45 | source = Column(String(128), nullable=False, default='') 46 | mview = Column(Integer, nullable=False, default=0) 47 | comments = Column(Integer, nullable=False, default=0) 48 | crawled_datetime = Column(DateTime, nullable=False, default=datetime.now()) 49 | rate_sum = Column(Integer, nullable=False, default=0) 50 | title = Column(String(512), nullable=False, default='') 51 | url_show = Column(String(512), nullable=False, default='') 52 | thumb = Column(String(256), nullable=False, default='') 53 | 54 | # 建表的时候去掉这一行注释 55 | # Base.metadata.create_all(engine) 56 | 57 | def map_orm_item(scrapy_item, sql_item): 58 | for k, v in scrapy_item.items(): 59 | sql_item.__setattr__(k, v) 60 | return sql_item 61 | 62 | 63 | -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/CustomExtension.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2019-08-27 11:31:19 4 | # @Author : Rocky Chen (weigesysu@qq.com) 5 | # @Link : http://30daydo.com 6 | # @Version : $1.0$ 7 | from scrapy import signals 8 | import pika 9 | import json 10 | import datetime 
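# Annotation: AdvancedExtension below publishes a spider-run summary to RabbitMQ when
# the spider closes. It reads MQ_HOST / MQ_PORT / MQ_USER / MQ_PASSWORD / MQ_QUEUE_NAME
# from the crawler settings and sends a JSON message containing spiderName, status,
# start_time, end_time and time_used(s). A hedged configuration sketch — the dotted
# path assumes the package is importable as async_sandbox, and the values are
# placeholders, not taken from the original project:
#
#   EXTENSIONS = {
#       'async_sandbox.CustomExtension.AdvancedExtension': 500,
#   }
#   MQ_HOST = 'localhost'     # placeholder
#   MQ_PORT = 5672            # RabbitMQ default port, placeholder
#   MQ_USER = 'guest'         # placeholder
#   MQ_PASSWORD = 'guest'     # placeholder
#   MQ_QUEUE_NAME = 'spider'  # matches the queue name used later in this file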
11 | from scrapy.exceptions import NotConfigured 12 | 13 | # 自定义扩展 推送到 rabbitmq 14 | class AdvancedExtension(object): 15 | 16 | def __init__(self,crawler): 17 | self.crawler = crawler 18 | self.crawler.signals.connect(self.spider_close,signals.spider_closed) 19 | self.mq_host=crawler.settings.get('MQ_HOST') 20 | self.mq_port=crawler.settings.getint('MQ_PORT') 21 | self.mq_user=crawler.settings.get('MQ_USER') 22 | self.mq_password=crawler.settings.get('MQ_PASSWORD') 23 | self.queue_name = crawler.settings.get('MQ_QUEUE_NAME') 24 | if not self.queue_name: 25 | raise NotConfigured # 有这个是让这个模块失效而不报错 26 | self.start_time = datetime.datetime.now() 27 | 28 | @classmethod 29 | def from_crawler(cls,crawler): 30 | 31 | return cls(crawler) 32 | 33 | def spider_close(self,spider): 34 | 35 | print('in extension module, spider close') 36 | print(f'spider name {spider.name}') 37 | # print(dir(spider)) 38 | credentials = pika.PlainCredentials(self.mq_user,self.mq_password) 39 | 40 | connection = pika.BlockingConnection(pika.ConnectionParameters(self.mq_host,self.mq_port,'/',credentials)) 41 | 42 | channel = connection.channel() 43 | 44 | queue_name = 'spider' 45 | channel.queue_declare(queue=self.queue_name,durable=True) 46 | now = datetime.datetime.now() 47 | 48 | content = {'spiderName':spider.name,'status':'closed','start_time':self.start_time.strftime('%Y-%m-%d %H:%M:%S'),'end_time':now.strftime('%Y-%m-%d %H:%M:%S'),'time_used(s)':(now-self.start_time).seconds} 49 | 50 | send_content = json.dumps(content) 51 | 52 | channel.basic_publish( 53 | exchange='', 54 | routing_key=self.queue_name, 55 | body=send_content, 56 | properties=pika.BasicProperties( 57 | delivery_mode=2) # 这个是用来做消息持久化,数据会保存在队列,直到被消费 58 | ) 59 | 60 | print('[x] send {}'.format(send_content)) 61 | connection.close() 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /yinyonbao/yingyongbao.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | 4 | import requests 5 | from lxml import etree 6 | import pandas as pd 7 | 8 | class Yinyongbao(): 9 | def __init__(self): 10 | self.user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)" 11 | self.headers = {"User-Agent": self.user_agent} 12 | 13 | 14 | def getData(self): 15 | base_url='http://sj.qq.com/myapp/category.htm' 16 | parent_url='http://sj.qq.com/myapp/category.htm?orgame=1' 17 | s=requests.get(url=parent_url,headers=self.headers) 18 | print(s.status_code) 19 | #print(s.text) 20 | tree=etree.HTML(s.text) 21 | menu=tree.xpath('//ul[@class="menu-junior"]')[0] 22 | print(type(menu)) 23 | 24 | link= menu.xpath('.//li[@id]/a/@href') 25 | catelog=[] 26 | for i in link: 27 | print(i) 28 | p=re.compile('categoryId=(-?\d+)') 29 | #x=base_url+i 30 | x=p.findall(i)[0] 31 | #print(x) 32 | catelog.append(x) 33 | return catelog 34 | 35 | def testcase(self): 36 | catelog=self.getData() 37 | print(catelog) 38 | for i in catelog: 39 | print("Catelog : ", i) 40 | self.each_page(int(i),0) 41 | 42 | #抓取某一个分类的 43 | def each_page(self,categoryId,pageContext): 44 | 45 | url='http://sj.qq.com/myapp/cate/appList.htm?orgame=1&categoryId=%d&pageSize=20&pageContext=%d' %(categoryId,pageContext) 46 | para={'orgame':1,'categoryId':categoryId,'pageSize':20,'pageContext':pageContext} 47 | s=requests.get(url=url,params=para,headers=self.headers) 48 | js= s.json() 49 | name=[] 50 | df=pd.DataFrame(js['obj']) 51 | print(df) 52 | for i in js['obj']: 53 | #需要的数据都在这里面 54 | x= 
i['appName'] 55 | print(x,' ---download count: ', i['appDownCount']) 56 | 57 | name.append(x) 58 | print(len(name)) 59 | try: 60 | pageContext=int(js['pageContext']) 61 | self.each_page(categoryId,pageContext) 62 | except Exception as e: 63 | return 64 | 65 | def main(): 66 | obj=Yinyongbao() 67 | #obj.getData() 68 | #obj.each_page('',0) 69 | obj.testcase() 70 | ''' 71 | for i in range(0,200,38): 72 | obj.each_page('',i) 73 | ''' 74 | main() 75 | -------------------------------------------------------------------------------- /ximalaya/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2019/6/30 12:03 4 | # @File : main.py 5 | 6 | import requests 7 | import re 8 | import os 9 | 10 | url = 'http://180.153.255.6/mobile/v1/album/track/ts-1571294887744?albumId=23057324&device=android&isAsc=true&isQueryInvitationBrand=true&pageId={}&pageSize=20&pre_page=0' 11 | headers = {'User-Agent': 'Xiaomi'} 12 | 13 | def download(): 14 | for i in range(1, 3): 15 | r = requests.get(url=url.format(i), headers=headers) 16 | js_data = r.json() 17 | data_list = js_data.get('data', {}).get('list', []) 18 | for item in data_list: 19 | trackName = item.get('title') 20 | trackName = re.sub('[\/\\\:\*\?\"\<\>\|]', '_', trackName) 21 | # trackName=re.sub(':','',trackName) 22 | src_url = item.get('playUrl64') 23 | filename = '{}.mp3'.format(trackName) 24 | if not os.path.exists(filename): 25 | 26 | try: 27 | r0 = requests.get(src_url, headers=headers) 28 | except Exception as e: 29 | print(e) 30 | print(trackName) 31 | r0 = requests.get(src_url, headers=headers) 32 | 33 | 34 | else: 35 | with open(filename, 'wb') as f: 36 | f.write(r0.content) 37 | 38 | print('{} downloaded'.format(trackName)) 39 | 40 | else: 41 | print(f'{filename}已经下载过了') 42 | 43 | import shutil 44 | 45 | def rename_(): 46 | for i in range(1, 3): 47 | r = requests.get(url=url.format(i), headers=headers) 48 | js_data = r.json() 49 | data_list = js_data.get('data', {}).get('list', []) 50 | for item in data_list: 51 | trackName = item.get('title') 52 | trackName = re.sub('[\/\\\:\*\?\"\<\>\|]', '_', trackName) 53 | src_url = item.get('playUrl64') 54 | 55 | orderNo=item.get('orderNo') 56 | 57 | filename = '{}.mp3'.format(trackName) 58 | try: 59 | 60 | if os.path.exists(filename): 61 | new_file='{}_{}.mp3'.format(orderNo,trackName) 62 | shutil.move(filename,new_file) 63 | except Exception as e: 64 | print(e) 65 | 66 | 67 | 68 | 69 | 70 | if __name__=='__main__': 71 | rename_() 72 | -------------------------------------------------------------------------------- /myubbs/sandbox/spiders/website.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import json 4 | import re 5 | import scrapy 6 | from scrapy import Request, FormRequest 7 | import logging 8 | import redis 9 | from sandbox.items import SpiderItem 10 | from sandbox.utility import get_header 11 | 12 | # get 13 | class WebGetSpider(scrapy.Spider): 14 | name = 'myubbs' 15 | URL = 'http://zsu.myubbs.com/forum-97-{}.html' 16 | 17 | def __init__(self): 18 | 19 | super(WebGetSpider,self).__init__() 20 | self.headers=get_header() 21 | self.page=10 22 | 23 | def start_requests(self): 24 | # TO DO 25 | for p in range(1,self.page+1): 26 | yield Request(url=self.URL.format(p), 27 | headers=self.headers 28 | ) 29 | 30 | def parse(self, response): 31 | root=response.xpath('//*[@id="threadlisttableid"]/tbody') 32 | 
for node in root[1:]: 33 | url = node.xpath('.//th//a[@class="s xst"]/@href').extract_first() 34 | # print(url) 35 | if url: 36 | yield Request(url,headers=self.headers,callback=self.parse_item) 37 | 38 | def parse_item(self,response): 39 | 40 | title = response.xpath('//span[@id="thread_subject"]/text()').extract_first() 41 | url = response.url 42 | pubdate = response.xpath('//div[@id="postlist"]/div[1]/table//div[@class="authi"]/em/text()').re_first('\d+-\d+-\d+ \d+:\d+:\d{2}') 43 | if pubdate is None: 44 | try: 45 | pubdate = response.xpath('//div[@id="postlist"]/div[1]/table//div[@class="authi"]/em/span/@title').extract_first() 46 | except Exception as e: 47 | print(e) 48 | pubdate='' 49 | # pubdate = response.xpath('//div[@id="postlist"]/').extract_first() 50 | author=response.xpath('//div[@class="authi"]/a/text()').extract_first() 51 | content = response.xpath('//td[@class="t_f"]')[0].xpath('string(.)').extract()[0] 52 | crawltime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 53 | 54 | spiderItem= SpiderItem() 55 | 56 | for field in spiderItem.fields: 57 | try: 58 | spiderItem[field]=eval(field) 59 | except Exception as e: 60 | logging.warning('can not find define of {}'.format(field)) 61 | logging.warning(e) 62 | 63 | # print(spiderItem) 64 | yield spiderItem 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /v2ex_job/v2ex2.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | from scrapy import Selector 4 | from twisted.internet import defer 5 | from twisted.internet import reactor 6 | from twisted.web.client import getPage 7 | 8 | 9 | class V2exJob: 10 | def __init__(self): 11 | pass 12 | 13 | def get_page(self): 14 | """ 15 | 总共页码的获取 16 | :return: 17 | """ 18 | index_url = 'https://www.v2ex.com/go/jobs' 19 | index_headers = { 20 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36' 21 | } 22 | response = requests.get(url=index_url, headers=index_headers) 23 | selector = Selector(text=response.text) 24 | all_page = selector.xpath('//a[@class="page_normal"]/text()').extract() 25 | all_page = all_page[-1] 26 | return all_page 27 | 28 | @defer.inlineCallbacks 29 | def get_html(self, each_page): 30 | """ 31 | 进行网站信息的获取,并进行返回。 32 | :param each_page: 33 | :return: 34 | """ 35 | each_urls = 'https://www.v2ex.com/go/jobs?p=%s' % str(each_page) 36 | res = getPage(bytes(each_urls, encoding="utf-8")) # 获取页面,发送http请求,是使用select池将所有socket请求保存,依据此进行计数。 37 | # print( type(res)) # 38 | res.addCallback(self.parse_infos) # 对每一个请求都添加一个回调方法 39 | yield res # 返回他 40 | 41 | def parse_infos(self, parse_infos): 42 | parse_infos = parse_infos.decode('utf-8') 43 | parse_infos = etree.HTML(parse_infos) 44 | infos = parse_infos.xpath('//span[@class="item_title"]/a/text()') 45 | print(infos) 46 | 47 | def run(self): 48 | """ 49 | 程序的启动开始采集数据 50 | :return: 51 | """ 52 | all_page = self.get_page() 53 | defer_list = [] 54 | for each_page in range(1, 10): # 禁忌务要一次性访问过多的请求。不然别人会禁掉你的。 55 | v = self.get_html(each_page) # 发送请求后立即返回,不等待返回,v是一个特殊对象,标志你发送到那个请求 56 | defer_list.append(v) 57 | d = defer.DeferredList(defer_list) # 将上面的特殊对象列表一起放入DeferredList 58 | d.addBoth(self.all_done) # 为所有对象添加回调 59 | reactor.run() # 会一直循环,我们需要在任务执行完毕后关闭。含有计数器,执行一个任务,会执行一次get_html,计数减一。单任务执行完毕,计数为0,执行all_done 60 | 61 | def all_done(self, arg): 62 | print("all done") 63 | reactor.stop() 64 | 65 | 66 | if __name__ == 
'__main__': 67 | v2ex_job = V2exJob() 68 | v2ex_job.run() 69 | 70 | -------------------------------------------------------------------------------- /anjuke/test_anjuke.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | 4 | import requests 5 | from lxml import etree 6 | headers = { 7 | 'accept': 'text/html', 8 | 'accept-encoding': 'gzip, deflate, sdch', 9 | 'accept-language': 'zh-CN,zh;q=0.8', 10 | 'cache-control': 'no-cache', 11 | 'pragma': 'no-cache', 12 | 'User-Agent': 'UCWEB/2.0 (Linux; U; Adr 2.3; zh-CN; MI-ONEPlus)U2/1.0.0 UCBrowser/8.6.0.199 U2/1.0.0 Mobile', 13 | 'x-requested-with': 'XMLHttpRequest', 14 | 'cookie': 'als=0; isp=true; Hm_lvt_c5899c8768ebee272710c9c5f365a6d8=1502856226; sessid=1551E6AF-1AA9-2526-E4E9-D494551F4A2F; search_words361=%E9%98%B3%E5%85%89%E5%B0%8F%E5%8C%BA; search_words24=%E9%9D%96%E6%B1%9F%E9%9B%85%E5%9B%AD11%E5%8F%B7%E6%A5%BC%7C%E6%9C%88%E6%A1%82%E8%A5%BF%E5%9B%AD; search_words14=%E8%B6%85%E6%98%8E%E5%9B%AD; search_words25=%E6%96%B0%E6%83%A0%E5%AE%B6%E5%9B%AD; browse_comm_ids13=95393; seo_source_type=0; search_words13=%E6%AC%A7%E9%99%86%E7%BB%8F%E5%85%B8%7C%E5%8D%97%E6%96%B9%E6%98%8E%E7%8F%A0%E8%8A%B1%E5%9B%AD%7C%E5%8D%97%E6%96%B9%E6%98%8E%E7%8F%A0%E8%8A%B1%E5%9B%AD%E4%BA%8C%E6%9C%9F1%E6%A0%8B; twe=2; __xsptplus8=8.43.1504789824.1504790391.8%233%7C123.sogou.com%7C%7C%7C%7C%23%23hvhL5eg3_ejnK-ngxJE-qwbIXXbQIk81%23%3B%20aQQ_a; _ga=GA1.2.1188068084.1502419352; _gid=GA1.2.1082371756.1504696715; lps="/cityList/|"; aQQ_ajkguid=B97BFB26-048C-2797-947E-7543B95A2D8A; ctid=13; 58tj_uuid=a4461385-7d0d-4e1a-9e94-85fa7b69f6aa; new_session=0; init_refer=; new_uv=61' 15 | } 16 | 17 | start_url = 'https://m.anjuke.com/gu/community/?from=anjuke_home&p=1' 18 | r = requests.get(url=start_url, headers=headers) 19 | if r.json()['data']: 20 | print('not empty') 21 | else: 22 | print('empty') 23 | 24 | 25 | price_case='https://m.anjuke.com/gz/community/112952/' 26 | content=requests.get(url=price_case,headers=headers).text 27 | tree=etree.HTML(content) 28 | price=tree.xpath('//a[@data-soj="community_topprice"]/div[@class="txt-c"]/p[@class="price"]/text()')[0] 29 | print(price) 30 | name=tree.xpath('//div[@class="comm-tit"]/h1/text()')[0] 31 | print(name) 32 | address=tree.xpath('//div[@class="comm-tit"]/div[@class="comm-ad"]/p/text()')[0] 33 | print(address) 34 | building_type=tree.xpath('//div[@class="header-field"]/span')[0].xpath('./text()')[0] 35 | building_date=tree.xpath('//div[@class="header-field"]/span')[2].xpath('./text()')[0] 36 | print(building_date) 37 | print(building_type) 38 | pattern = 'data-center="(.*?)"' 39 | data = re.findall(pattern, content) 40 | t= data[0].split(',') 41 | print(t[0]) 42 | print(t[1]) 43 | #longitude = data[0] 44 | #latitude = data[1] -------------------------------------------------------------------------------- /youdao_dictionary/youdao.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2019/2/23 19:34 4 | # @File : youdao.py 5 | # 解密有道词典的JS 6 | 7 | 8 | import hashlib 9 | import random 10 | import requests 11 | import time 12 | 13 | 14 | def md5_(word): 15 | s = bytes(word, encoding='utf8') 16 | m = hashlib.md5() 17 | m.update(s) 18 | ret = m.hexdigest() 19 | return ret 20 | 21 | def get_sign(word, salt): 22 | ret = md5_('fanyideskweb' + word + salt + 'p09@Bn{h02_BIEe]$P^nG') 23 | return ret 24 | 25 | def youdao(word): 26 | url = 
'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule' 27 | headers = { 28 | 'Host': 'fanyi.youdao.com', 29 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0', 30 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 31 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 32 | 'Accept-Encoding': 'gzip, deflate', 33 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 34 | 'X-Requested-With': 'XMLHttpRequest', 35 | 'Referer': 'http://fanyi.youdao.com/', 36 | 'Cookie': 'YOUDAO_MOBILE_ACCESS_TYPE=1; OUTFOX_SEARCH_USER_ID=1672542763@10.169.0.83; JSESSIONID=aaaWzxpjeDu1gbhopLzKw; ___rl__test__cookies=1550913722828; OUTFOX_SEARCH_USER_ID_NCOO=372126049.6326876', 37 | 'Connection': 'keep-alive', 38 | 'Pragma': 'no-cache', 39 | 'Cache-Control': 'no-cache', 40 | } 41 | 42 | ts = str(int(time.time()*1000)) 43 | salt=ts+str(random.randint(0,10)) 44 | bv = md5_("5.0 (Windows)") 45 | sign= get_sign(word,salt) 46 | 47 | post_data = { 48 | 'i': word, 49 | 'from': 'AUTO', 'to': 'AUTO', 'smartresult': 'dict', 'client': 'fanyideskweb', 'salt': salt, 50 | 'sign': sign, 'ts': ts, 'bv': bv, 'doctype': 'json', 'version': '2.1', 51 | 'keyfrom': 'fanyi.web', 'action': 'FY_BY_REALTIME', 'typoResult': 'false' 52 | } 53 | 54 | r = requests.post( 55 | url=url, 56 | headers=headers, 57 | data=post_data 58 | ) 59 | 60 | js_data = r.json() 61 | smart_result= js_data.get('smartResult', {}) 62 | 63 | if smart_result: 64 | for item in smart_result.get('entries'): 65 | print(item) 66 | 67 | translate_result = js_data.get('translateResult',[]) 68 | if translate_result: 69 | for items in translate_result: 70 | for item in items: 71 | print(item.get('tgt')) 72 | 73 | word='我喜欢吃鸡腿' 74 | youdao(word) 75 | -------------------------------------------------------------------------------- /zhihu/zhihu_book.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import requests 3 | import json 4 | import pymongo 5 | # 下载知乎书籍的数据 6 | def get_books_by_url(url): 7 | headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"} 8 | r = requests.get(url, headers=headers) 9 | data = json.loads(r.content.decode("utf-8")) 10 | return data 11 | 12 | def get_books_by_category(category_id): 13 | url_patt = "https://www.zhihu.com/api/v3/books/categories/{}?limit={}&offset={}&version=v2" 14 | limit = 10 15 | offset = 0 16 | client = pymongo.MongoClient('10.18.6.26',27001) 17 | db = client.zhihu_book 18 | while True: 19 | url = url_patt.format(category_id, limit, offset) 20 | print(url) 21 | data = get_books_by_url(url) 22 | books = data["data"] 23 | db.books.insert_many(books) 24 | if data["paging"]["is_end"]: 25 | break 26 | offset = offset + limit 27 | 28 | def get_all_books(): 29 | categories = [147, 254, 232, 209, 245, 175, 219, 189, 205, 161, 143, 284, 265, 214, 155, 241] 30 | for category in categories: 31 | get_books_by_category(category) 32 | 33 | def query_books(): 34 | client = pymongo.MongoClient('10.18.6.26',27001) 35 | db = client.zhihu_book 36 | 37 | books = db.books.find().sort("score") 38 | book_ids = [] 39 | for book in books: 40 | if book["id"] in book_ids: 41 | continue 42 | price = 0 43 | if book["promotion"]["is_promotion"]: 44 | price = book["promotion"]["promotion_price"]/100 45 | else: 46 | price = book["promotion"]["price"]/100 47 | print("{},{},{},{},{}".format(book["title"], book["url"], 
book["score"], price, book["promotion"]["origin_price"]/100)) 48 | book_ids.append(book["id"]) 49 | 50 | # books = db.books.find({"promotion.price": 0.0}).sort("score") 51 | # book_ids = [] 52 | # for book in books: 53 | # if book["id"] in book_ids: 54 | # continue 55 | # print("{},{},{}".format(book["title"], book["url"], book["score"])) 56 | # book_ids.append(book["id"]) 57 | 58 | if __name__ == "__main__": 59 | # parser = argparse.ArgumentParser() 60 | # parser.add_argument("--download", help="", action="store_true") 61 | # parser.add_argument("--query", help="", action="store_true") 62 | # args = parser.parse_args() 63 | # if args.download: 64 | # get_all_books() 65 | # elif args.query: 66 | # query_books() 67 | get_all_books() -------------------------------------------------------------------------------- /51jbnet/im_sandbox/spiders/website.py: -------------------------------------------------------------------------------- 1 | # -*-coding=utf-8-*- 2 | 3 | # @Time : 2019/5/16 17:30 4 | # @File : website.py 5 | 6 | # -*- coding: utf-8 -*- 7 | import re 8 | import requests 9 | import scrapy 10 | from scrapy import Request 11 | from im_sandbox import settings 12 | from scrapy.log import logger 13 | import json 14 | from im_sandbox.items import SandboxItem 15 | import datetime 16 | from scrapy.selector import Selector 17 | 18 | 19 | class Website(scrapy.Spider): 20 | name = "website" 21 | category='linux_shell' 22 | idx=235 23 | total=1403 24 | page = int(total/40)+1 25 | default_headers = { 26 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 27 | "Accept-Encoding": "gzip, deflate, br", 28 | "Accept-Language": "zh,en;q=0.9,en-US;q=0.8,zh-CN;q=0.7", 29 | "Cache-Control": "no-cache", 30 | "Connection": "keep-alive", 31 | "Host": "www.jb51.net", 32 | "Pragma": "no-cache", 33 | "Referer": "https://www.jb51.net/list/list_97_1.htm", 34 | "Upgrade-Insecure-Requests": "1", 35 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36", 36 | } 37 | 38 | def start_requests(self): 39 | page = 400 40 | base_url = 'https://www.jb51.net/list/list_{idx}_{page}.htm' 41 | for i in range(1, self.page + 1): 42 | yield Request(url=base_url.format(page=i,idx=self.idx), headers=self.default_headers, callback=self.parse) 43 | 44 | def parse(self, response): 45 | 46 | if not response.body: 47 | logger.error(msg='there is no response body ,please go and check it ') 48 | return 49 | 50 | nodes = response.xpath('//div[@class="artlist clearfix"]/DL/DT') 51 | if nodes: 52 | pass 53 | else: 54 | nodes = response.xpath('//div[@class="artlist clearfix"]/dl/dt') 55 | 56 | for node in nodes: 57 | pubdate = node.xpath('.//span/text()').extract_first() 58 | pubdate = re.sub('日期:', '', pubdate) 59 | title=node.xpath('.//a/text()').extract_first() 60 | url=node.xpath('.//a/@href').extract_first() 61 | full_url = 'https://www.jb51.net{}'.format(url) 62 | item = SandboxItem() 63 | item['pubdate']=pubdate 64 | item['url']=full_url 65 | item['title']=title 66 | item['category']=self.category 67 | yield item 68 | -------------------------------------------------------------------------------- /jd/jd/spiders/jd_book.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from scrapy import Request 5 | from scrapy_splash import SplashRequest 6 | import re 7 | from jd.items import JdItem 8 | 
lua_script = """ 9 | function main(splash) 10 | splash:go(splash.args.url) 11 | splash:wait(5) 12 | splash:runjs("document.getElementsByClassName('page')[0].scrollIntoView(true)") 13 | splash:wait(5) 14 | return splash:html() 15 | end 16 | """ 17 | 18 | 19 | class JDBookSpider(scrapy.Spider): 20 | name = "jd_book" 21 | allowed_domains = ["search.jd.com"] 22 | kw='股票' 23 | base_url = 'https://search.jd.com/Search?keyword={}&enc=utf-8&wq={}'.format(kw,kw) 24 | 25 | def start_requests(self): 26 | # 请求第一页,无需 js 渲染 27 | yield Request(self.base_url, callback=self.parse_urls, dont_filter=True) 28 | 29 | def parse_urls(self, response): 30 | # 获取商品总数,计算出总页数 31 | total = response.css('span#J_resCount::text').extract_first().strip('+') 32 | try: 33 | total=re.sub('万','',total) 34 | total=float(total)*10000 35 | except: 36 | return 37 | pageNum = total // 60 + (1 if total % 60 else 0) 38 | 39 | # 构造每页的 url,向 Splash 的 execute 端点发送请求 40 | for i in range(int(pageNum)): 41 | url = '%s&page=%s' % (self.base_url, 2*i+1) 42 | yield SplashRequest(url, endpoint='execute', args={'lua_source': lua_script},\ 43 | cache_args=['lua_source']) 44 | 45 | def parse(self, response): 46 | # 获取一个页面中每本书的名字和价格 47 | for sel in response.css('ul.gl-warp.clearfix > li.gl-item'): 48 | item = JdItem() 49 | name= sel.css('div.p-name').xpath('string(.//em)').extract_first() 50 | price= sel.css('div.p-price i::text').extract_first() 51 | try: 52 | remark=sel.xpath('.//div[(@class="p-commit" or @class="p-comm")]').xpath('string(.)').extract_first() 53 | if remark: 54 | remark=remark.strip() 55 | except: 56 | remark=None 57 | try: 58 | price=float(price) 59 | except: 60 | price=price 61 | 62 | # 自营 63 | # shop=sel.css('div.p-shopnum span::text').extract_first() 64 | 65 | # 出版社 66 | 67 | publish=sel.css('div.p-shopnum a::text').extract_first() 68 | if publish is None: 69 | publish=sel.css('div.p-shop a::text').extract_first() 70 | # if shop is None: 71 | # shop=sel.css('div.p-shopnum a::text').extract_first() 72 | # publish=None 73 | 74 | item['name']=name 75 | item['price']=price 76 | item['remark']=remark 77 | item['publish']=publish 78 | # item['shop']=shop 79 | yield item -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/CustomMiddleware.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2019-08-28 19:35:51 4 | # @Author : Rocky Chen (weigesysu@qq.com) 5 | # @Link : http://30daydo.com 6 | # @Version : 1.0 7 | 8 | # 自定义middleware 9 | from scrapy.exceptions import IgnoreRequest 10 | # from scrapy import log 11 | import logging 12 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 13 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 14 | 15 | class CustomMiddleware(object): 16 | 17 | def process_request(self,request,spider): 18 | # print('before download v1') 19 | # print(f'name -->{spider.name}') 20 | 21 | request.meta['vvv']='kkk' # 可以这样携带一些参数 22 | 23 | 24 | # print('主动提交错误') # 去执行process_exception 25 | # raise IgnoreRequest 26 | 27 | def process_response(self,request,response,spider): 28 | # print('after download v1') 29 | # print(f'name -->{spider.name}') 30 | # print(request.meta['vvv']) 31 | # print(dir(response)) 32 | # print(response.status) 33 | 34 | if response.status==404: 35 | print('重新调度') 36 | return request 37 | else: 38 | return response # 需要返回response 39 | 40 | def process_exception(self,request, exception, spider): 41 
| print('遇到错误了!!!!!!!!') 42 | return request 43 | 44 | class CustomMiddleware2(object): 45 | 46 | def process_request(self,request,spider): 47 | # logging.info('before download v2') 48 | # print(f'name -->{spider.name}') 49 | request.meta['vvv']='kkk' # 可以这样携带一些参数 50 | 51 | def process_response(self,request,response,spider): 52 | # print('after download v2') 53 | # print(f'name -->{spider.name}') 54 | # print(request.meta['vvv']) 55 | v = request.meta['vvv'] 56 | return response 57 | 58 | from scrapy.utils.response import response_status_message  # used by ModifiedRetryMiddleware.process_response 59 | class ModifiedRetryMiddleware(RetryMiddleware): 60 | 61 | 62 | def process_response(self, request, response, spider): 63 | 64 | logging.info('这个我定义的继承retrymiddleware') 65 | 66 | if request.meta.get('dont_retry', False): 67 | return response 68 | 69 | if response.status in self.retry_http_codes: 70 | reason = response_status_message(response.status) 71 | return self._retry(request, reason, spider) or response 72 | 73 | return response 74 | 75 | class ModifiedUserAgentMiddleware(UserAgentMiddleware): 76 | 77 | def process_request(self, request, spider): 78 | 79 | if self.user_agent: 80 | 81 | logging.info('这是自定义UA中间件') 82 | 83 | request.headers.setdefault(b'User-Agent', self.user_agent) 84 | 85 | def process_response(self,request,response,spider): 86 | logging.info(f'请求的request header ====== {request.headers}') 87 | return response -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/monitor/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 爬虫动态监控系统 6 | 7 | 8 | 9 | 10 |
11 |
12 | 110 | 111 | -------------------------------------------------------------------------------- /stock_pledge/crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2019/3/9 17:17 4 | # @File : crawler.py 5 | import datetime 6 | import requests 7 | 8 | # import grequests 9 | import pandas as pd 10 | import numpy as np 11 | from setting import get_engine 12 | import tushare as ts 13 | 14 | # 2018.03.05 后才有数据 15 | 16 | url = 'http://www.chinaclear.cn/cms-rank/downloadFile?queryDate={}&type=proportion' 17 | 18 | headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 19 | 'Accept-Encoding': 'gzip,deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Cache-Control': 'no-cache', 20 | 'Pragma': 'no-cache', 'Proxy-Connection': 'keep-alive', 21 | # 'Referer': 'http://www.chinaclear.cn/cms-rank/queryPledgeProportion?action=query&queryDate=2019.03.09&secCde=&page=3', 22 | 'Upgrade-Insecure-Requests': '1', 23 | 'User-Agent': 'Mozilla/5.0(Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36'} 24 | 25 | engine = get_engine('db_pledge', 'local') 26 | 27 | 28 | class PledgeSpider(): 29 | 30 | def __init__(self): 31 | self.start = datetime.datetime.now() 32 | self.delta= 400 33 | 34 | 35 | def start_task(self): 36 | pass 37 | 38 | def handle_exception(self,request,exception): 39 | print('process error') 40 | 41 | def crawl(self): 42 | # tasks=[] 43 | # date_list =[] 44 | for i in range(self.delta): 45 | fetch_day = self.start+datetime.timedelta(days=-1*i) 46 | if fetch_day < datetime.datetime(year=2018,month=3,day=4): 47 | break 48 | 49 | if not ts.is_holiday(fetch_day.strftime('%Y-%m-%d')): 50 | name=fetch_day.strftime('%Y-%m-%d') 51 | try: 52 | day=url.format(fetch_day.strftime('%Y.%m.%d')) 53 | print(day) 54 | r=requests.get(url=day,headers=headers,timeout=20) 55 | except Exception as e: 56 | print(e) 57 | else: 58 | print(r.status_code) 59 | with open('{}.xls'.format(name), 'wb') as f: 60 | f.write(r.content) 61 | # tasks.append(grequests.get(url=url.format(fetch_day.strftime('%Y.%m.%d')))) 62 | 63 | # date_list.append(fetch_day.strftime('%Y-%m-%d')) 64 | 65 | # resp = grequests.map(tasks,size=8,exception_handler=self.handle_exception) 66 | # for index,r in enumerate(resp): 67 | # with open('{}.xls'.format(date_list[index]),'wb') as f: 68 | # f.write(r.content) 69 | 70 | 71 | def data_transfer(self): 72 | df = pd.read_excel('pledge.xls', header=2, dtype={'证券代码': np.str}) 73 | df = df.reset_index(drop=True) 74 | return df 75 | 76 | 77 | pledge = PledgeSpider() 78 | pledge.crawl() 79 | # df = pledge.data_transfer() 80 | -------------------------------------------------------------------------------- /cuiqingcai/async_sandbox/spiders/example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import re 4 | 5 | import scrapy 6 | from scrapy import Request 7 | import logging 8 | from async_sandbox.items import AsyncSandboxItem 9 | 10 | 11 | class ExampleSpider(scrapy.Spider): 12 | name = 'example' 13 | # 技术 14 | # BASE_URL = 'https://cuiqingcai.com/category/technique/page/{}' 15 | # 生活 16 | BASE_URL = 'https://cuiqingcai.com/category/life/page/{}' 17 | 18 | def start_requests(self): 19 | start_page = 1 20 | 21 | yield Request( 22 | url=self.BASE_URL.format(start_page), 23 | meta={'page': start_page} 24 | ) 25 | 26 | def parse(self, 
response): 27 | page = response.meta['page'] 28 | next_page = page + 1 29 | 30 | articles = response.xpath('//article[@class="excerpt"]') 31 | for article in articles: 32 | item = AsyncSandboxItem() 33 | category = article.xpath('./header/a[1]/text()').extract_first() 34 | title = article.xpath('./header/h2/a[1]/text()').extract_first() 35 | article_url = article.xpath('./header/h2/a[1]/@href').extract_first() 36 | item['title'] = title 37 | item['category'] = category 38 | item['article_url'] = article_url 39 | 40 | yield Request( 41 | url=article_url, 42 | callback=self.parse_item, 43 | meta={'item': item} 44 | ) 45 | 46 | if next_page < 900: 47 | yield Request( 48 | url=self.BASE_URL.format(next_page), 49 | meta={'page': next_page} 50 | ) 51 | 52 | def parse_item(self, response): 53 | item = response.meta['item'] 54 | author = response.xpath( 55 | '//header[@class="article-header"]//i[@class="fa fa-user"]/following::*[1]/text()').extract_first() 56 | visited = response.xpath( 57 | '//header[@class="article-header"]//i[@class="fa fa-eye"]/parent::*[1]/text()').extract_first() 58 | comment = response.xpath( 59 | '//header[@class="article-header"]//i[@class="fa fa-comments-o"]/following-sibling::*[1]/text()').extract_first() 60 | liked = response.xpath('//span[@class="count"]/text()').extract_first() 61 | created_at = response.xpath( 62 | '//header[@class="article-header"]//i[@class="fa fa-clock-o"]/parent::*[1]/text()').extract_first() 63 | content = response.xpath('//article[@class="article-content"]')[0].xpath('string(.)').extract()[0] 64 | 65 | item['author'] = author 66 | item['created_at'] = created_at 67 | item['content'] = content 68 | visited=re.sub('浏览','',visited) 69 | item['visited'] = visited 70 | comment=re.sub('评论','',comment) 71 | item['comment'] = comment 72 | item['liked'] = liked 73 | item['crawltime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 74 | yield item 75 | -------------------------------------------------------------------------------- /52sh/aio_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # website: http://30daydo.com 3 | # @Time : 2020/9/24 12:09 4 | # @File : aio_spider.py 5 | import asyncio 6 | import aiohttp 7 | import aiofiles 8 | import os 9 | 10 | import re 11 | 12 | from config_file import START_URL, HEADERS, PROXY_STR,SIMPLE_HEADERS 13 | from parsel import Selector 14 | 15 | 16 | async def fetch(url): 17 | async with aiohttp.ClientSession() as session: 18 | async with session.get(url=url, 19 | headers=HEADERS, 20 | proxy=PROXY_STR, 21 | ) as response: 22 | text = await response.text() 23 | resp = Selector(text=text) 24 | nodes = resp.xpath('//div[@class="kl1-2"]') 25 | for node in nodes: 26 | next_url = node.xpath('.//div[@class="kl1-2a2"]/a/@href').extract_first() 27 | title = node.xpath('.//div[@class="kl1-2a2"]/a/@title').extract_first() 28 | await detail(session=session, next_url=next_url, title=title) 29 | print('next page') 30 | 31 | 32 | async def detail(**kwargs): 33 | session = kwargs['session'] 34 | next_url = kwargs['next_url'] 35 | title = kwargs['title'] 36 | print(next_url) 37 | print(title) 38 | async with session.get( 39 | url=next_url, 40 | headers=HEADERS, 41 | proxy=PROXY_STR, 42 | ) as response: 43 | text = await response.text() 44 | resp = Selector(text=text) 45 | nodes = resp.xpath('//div[@class="kl2-1"]//img/@src').extract() 46 | nodes = list(set(nodes)) 47 | for img in nodes: 48 | # print(img) 49 | await 
download_img(session=session,url=img,title=title) 50 | print('next image') 51 | 52 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) 53 | 54 | async def download_img(**kwargs): 55 | url= kwargs['url'] 56 | title= kwargs['title'] 57 | 58 | title = title.replace(' ','_') 59 | title = re.sub('[\/:*?"<>|]', '-', title) 60 | if not os.path.exists(title): 61 | os.mkdir(title) 62 | 63 | filename = url.split('/')[-1] 64 | if not filename.endswith(('png','jpg','jpeg')): 65 | return 66 | save_file = os.path.join(title,filename) 67 | 68 | if os.path.exists(save_file): 69 | return 70 | print('saving image - ') 71 | try: 72 | conn = aiohttp.TCPConnector(ssl=False) # 防止ssl报错 73 | async with aiohttp.ClientSession(connector=conn, trust_env=True) as session: 74 | async with session.get(url=url, headers=SIMPLE_HEADERS, proxy=PROXY_STR) as response: 75 | 76 | if response.status>=200 and response.status<300: 77 | f=await aiofiles.open(save_file,'wb') 78 | await f.write(await response.read()) 79 | await f.close() 80 | 81 | except Exception as e: 82 | print(e) 83 | print(url) 84 | return 85 | 86 | async def main(): 87 | total_page = 3640 88 | for page in range(0,total_page,35): 89 | 90 | url = START_URL.format(page=page) 91 | await fetch(url) 92 | await asyncio.sleep(0) 93 | print(f'downing page {page}-') 94 | loop = asyncio.get_event_loop() 95 | loop.run_until_complete(main()) 96 | -------------------------------------------------------------------------------- /51jbnet/im_sandbox/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from scrapy.exceptions import DropItem 8 | 9 | class ImSandboxPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | 13 | 14 | import datetime 15 | 16 | import pymongo 17 | 18 | from im_sandbox.settings import MONGODB, ES_HOST 19 | from im_sandbox import models 20 | from im_sandbox.models import scoped_session 21 | from elasticsearch import Elasticsearch 22 | from scrapy import log 23 | 24 | 25 | class im_sandboxMongoPipeline(object): 26 | 27 | def __init__(self): 28 | self._db = MONGODB.get('db') 29 | self._collection = MONGODB.get('collection') 30 | self._host = MONGODB.get('host') 31 | self._port = MONGODB.get('port') 32 | self._client = pymongo \ 33 | .MongoClient(host=self._host, port=self._port) \ 34 | .get_database(self._db) \ 35 | .get_collection(self._collection) 36 | 37 | def process_item(self, item, spider): 38 | self._client.create_index([('title', pymongo.DESCENDING)], background=True) 39 | self._client.update_one(filter={'title': item['title']}, update={'$set': dict(item)}, upsert=True) 40 | return item 41 | 42 | 43 | class im_sandboxMysqlPipeline(object): 44 | 45 | def process_item(self, item, spider): 46 | sql_im_sandbox = models.SpiderModel() 47 | sql_im_sandbox = models.map_orm_item(scrapy_item=item, sql_item=sql_im_sandbox) 48 | with scoped_session() as session: 49 | session.add(sql_im_sandbox) 50 | 51 | return item 52 | 53 | 54 | class ESPipeline(object): 55 | def __init__(self): 56 | self.index = '51jbnet' 57 | self.doc = 'doc' 58 | self.es = Elasticsearch(ES_HOST) 59 | 60 | def process_item(self, item, spider): 61 | crawltime = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S') 62 | url = item.get('url', None) 63 | if not url: 64 | raise 
FileNotFoundError('url is empty') 65 | 66 | query_body = { 67 | "query": 68 | { 69 | "term": { 70 | "url": url 71 | } 72 | } 73 | } 74 | 75 | # 去重 76 | try: 77 | query_result = self.es.search(index=self.index, body=query_body) 78 | 79 | except Exception as e: 80 | log.msg(e) 81 | raise ConnectionError('查询ES报错') 82 | 83 | hits=query_result.get('hits',{}).get('hits',[]) 84 | 85 | if hits: 86 | 87 | raise DropItem('Duplication item') 88 | 89 | body = { 90 | "pubdate": item["pubdate"], 91 | "title": item["title"], 92 | "url": item["url"], 93 | "crawled_datetime": crawltime, 94 | "category": item['category'], 95 | } 96 | 97 | try: 98 | self.es.index(index=self.index, doc_type=self.doc, body=body) 99 | except Exception as e: 100 | log.msg('错误 >>>>>') 101 | log.msg(e) 102 | return item 103 | -------------------------------------------------------------------------------- /async_cuiqingcai/async_sandbox/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymysql 8 | from twisted.enterprise import adbapi 9 | import logging 10 | import pymongo 11 | from scrapy.exceptions import DropItem 12 | 13 | class AsyncSQLPipeline(object): 14 | def __init__(self): 15 | self.dbpool = adbapi.ConnectionPool('pymysql',host='192.168.1.100',port=3306,user='root',password='*',db='spider_test') 16 | # self.cursor = self.conn.cursor() 17 | 18 | def process_item(self, item, spider): 19 | update_=self.dbpool.runInteraction(self.update,item) 20 | update_.addErrback(self.handle_error,item,spider) 21 | 22 | return item 23 | 24 | def update(self,cursor,item): 25 | insert_sql = 'insert into tb_cuiqingcai (category,title,article_url,content,author,created_at,liked,visited,comment,crawltime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' 26 | data=(item['category'],item['title'],item['article_url'],item['content'],item['author'],item['created_at'],item['liked'],item['visited'],item['comment'],item['crawltime'] 27 | ) 28 | cursor.execute(insert_sql,data) 29 | 30 | def handle_error(self,failure,item,spider): 31 | logging.error('写入数据库异常--->') 32 | logging.error(failure) 33 | logging.error('error item') 34 | logging.error(item) 35 | 36 | class MongoPipeline(object): 37 | 38 | def __init__(self,host,port,db,doc): 39 | client = pymongo.MongoClient(host,port) 40 | self.doc=client[db][doc] 41 | 42 | @classmethod 43 | def from_crawler(cls,crawler): 44 | print('in from crawler') 45 | host = crawler.settings.get('MONGO_HOST') 46 | port = crawler.settings.getint('MONGO_PORT') 47 | db = crawler.settings.get('MONGO_DB') 48 | doc = crawler.settings.get('MONGO_DOC') 49 | 50 | 51 | print(f'host {host}') 52 | return cls(host,port,db,doc) 53 | 54 | def open_spider(self,spider): 55 | print('spider open') 56 | 57 | def process_item(self,item,spider): 58 | print('in mongopipeline') 59 | 60 | if item is None: 61 | print('item is None') 62 | else: 63 | print('item is not None') 64 | print(f'receive item -> len is {len(item)}') 65 | # self.doc.insert(dict(item)) 66 | return item 67 | 68 | def close_spider(self,spider): 69 | print('closing in pipeline') 70 | 71 | class JSONPipeline(object): 72 | 73 | def __init__(self,host,port,db,doc): 74 | pass 75 | 76 | @classmethod 77 | def from_crawler(cls,crawler): 78 | print('in from crawler') 79 | host = crawler.settings.get('MONGO_HOST') 80 | port = 
crawler.settings.getint('MONGO_PORT') 81 | db = crawler.settings.get('MONGO_DB') 82 | doc = crawler.settings.get('MONGO_DOC') 83 | 84 | 85 | print(f'host {host}') 86 | return cls(host,port,db,doc) 87 | 88 | def open_spider(self,spider): 89 | print('spider open') 90 | 91 | def process_item(self,item,spider): 92 | print('in JSON pipeline') 93 | print(f'receive item -> len is {len(item)}') 94 | 95 | # return item 96 | raise DropItem(item) 97 | 98 | def close_spider(self,spider): 99 | print('closing in pipeline') -------------------------------------------------------------------------------- /fraud/fraud/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for fraud project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'fraud' 13 | 14 | SPIDER_MODULES = ['fraud.spiders'] 15 | NEWSPIDER_MODULE = 'fraud.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'fraud (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | # CONCURRENT_REQUESTS_PER_DOMAIN = 1 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = True 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'fraud.middlewares.FraudSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | # DOWNLOADER_MIDDLEWARES = { 56 | # 'fraud.middlewares.DynamicProxyMiddleware': 543, 57 | # } 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'fraud.pipelines.FraudPipeline': 300, 69 | # 'fraud.pipelines.DuplicatesPipeline': 200, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case 
of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /MyLibrary/sandbox/sandbox/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for sandbox project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'sandbox' 13 | 14 | SPIDER_MODULES = ['sandbox.spiders'] 15 | NEWSPIDER_MODULE = 'sandbox.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'sandbox (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'sandbox.middlewares.SandboxSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | # DOWNLOADER_MIDDLEWARES = { 56 | # 'sandbox.middlewares.RandomUserAgent': 543, 57 | # 'sandbox.middlewares.ProxyMiddleware': 553, 58 | # } 59 | 60 | # Enable or disable extensions 61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | # 'sandbox.pipelines.SQLPipeline': 300, 70 | 'sandbox.pipelines.MongoPipeline': 100, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | 
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | -------------------------------------------------------------------------------- /anjuke/anjuke.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import codecs 3 | import json 4 | import re 5 | import urllib 6 | from lxml import etree 7 | import requests 8 | 9 | 10 | def query(kw): 11 | for i in range(1, 10): 12 | encode_kw = urllib.quote(kw) 13 | print(i) 14 | url = 'https://m.anjuke.com/ajax/autocomplete/?city_id=13&kw=%s&from=1&callback=jsonp%d' % (encode_kw, i) 15 | s = requests.Session() 16 | headers = { 17 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'} 18 | js = s.get(url, headers=headers) 19 | print(js.status_code) 20 | # print(js.text) 21 | try: 22 | result = re.findall('jsonp7\((.*?)\);', js.text)[0] 23 | dic = json.loads(result) 24 | print('*' * 20) 25 | print(dic['data']['match'][0]['comm_id']) 26 | except Exception as e: 27 | print(e) 28 | 29 | 30 | # 获取安居客的城市列表 31 | def getcitylist(): 32 | headers = {'Accept-Language': ' zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Accept-Encoding': ' gzip, deflate', 33 | 'Connection': ' keep-alive', 34 | 'Accept': ' text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 35 | 'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0', 36 | 'Host': ' m.anjuke.com', 'Referer': ' https://m.anjuke.com/bj/', 37 | 'Cookie': ' aQQ_ajkguid=145D8A4E-6387-1752-E32C-D4EFB4EBFE09; lps="/|"; ctid=14; 58tj_uuid=fdb54be9-84d6-4511-ad1e-3227c1eac9ae; new_session=0; init_refer=; new_uv=1; sessid=AD7C8189-AB56-4CAF-1BAC-FF0CCD27668C'} 38 | url = 'https://m.anjuke.com/cityList/' 39 | r = requests.get(url=url, headers=headers) 40 | print(r.status_code) 41 | tree = etree.HTML(r.text) 42 | word=u'其他' 43 | node = tree.xpath('//div[@class="cl-c-l-h" and @id !="letter-%s"]/following-sibling::*[1]' %word) 44 | dicts ={} 45 | for i in node: 46 | name = i.xpath('.//li/a/text()') 47 | link= i.xpath('.//li/a/@href') 48 | if len(name) != len(link): 49 | for j in name: 50 | print(j) 51 | for k in link: 52 | print(k) 53 | 54 | for index in range(len(name)): 55 | short_cut=link[index].split('/')[3] 56 | dicts[short_cut]=name[index] 57 | 58 | return dicts 59 | 60 | def debug_page(): 61 | 62 | headers = {'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 
Firefox/31.0', 'Host': 'm.anjuke.com', 'Cookie': 'aQQ_ajkguid=0B0A627A-FCF1-2B6A-2ADF-56DD166B0EBC; ctid=13; lps="/|"; sessid=804075FD-7FE8-E9C0-FA60-2FCB76C5B6B3; 58tj_uuid=02402201-d0d6-48de-8e58-6432612af29d; new_session=0; init_refer=; new_uv=1', 'Upgrade-Insecure-Requests': '1'} 63 | 64 | url='https://m.anjuke.com/dg/community/279422/' 65 | r=requests.get(url=url,headers=headers) 66 | print(r.status_code) 67 | tree = etree.HTML(r.text) 68 | return tree 69 | 70 | #if __name__=="__main__": 71 | #debug_page() 72 | # query('南方明珠花园二期1栋') 73 | #d = getcitylist() 74 | #f=codecs.open('anjuke_city','w',encoding='utf-8') 75 | #json.dump(d,f,ensure_ascii=False) 76 | #for k,v in d.items(): 77 | #print(k,v) 78 | 79 | tree=debug_page() -------------------------------------------------------------------------------- /lanrentingshu/lrts/lrts/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for lrts project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'lrts' 13 | 14 | SPIDER_MODULES = ['lrts.spiders'] 15 | NEWSPIDER_MODULE = 'lrts.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'lrts (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'lrts.middlewares.LrtsSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'lrts.middlewares.LrtsDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | # MyFilesPipeline 69 | # 'scrapy.pipelines.files.FilesPipeline':1 70 | 'lrts.pipelines.MyFilesPipeline': 300, 71 | } 72 | FILES_STORE='C:\\git\\CrawlMan\\lanrentingshu\\lrts\\lrts\\data' 73 | # Enable and configure the 
AutoThrottle extension (disabled by default) 74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | -------------------------------------------------------------------------------- /fraud/fraud/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import time 10 | import hashlib 11 | 12 | 13 | class FraudSpiderMiddleware(object): 14 | # Not all methods need to be defined. If a method is not defined, 15 | # scrapy acts as if the spider middleware does not modify the 16 | # passed objects. 17 | 18 | @classmethod 19 | def from_crawler(cls, crawler): 20 | # This method is used by Scrapy to create your spiders. 21 | s = cls() 22 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 23 | return s 24 | 25 | def process_spider_input(self, response, spider): 26 | # Called for each response that goes through the spider 27 | # middleware and into the spider. 28 | 29 | # Should return None or raise an exception. 30 | return None 31 | 32 | def process_spider_output(self, response, result, spider): 33 | # Called with the results returned from the Spider, after 34 | # it has processed the response. 35 | 36 | # Must return an iterable of Request, dict or Item objects. 37 | for i in result: 38 | yield i 39 | 40 | def process_spider_exception(self, response, exception, spider): 41 | # Called when a spider or process_spider_input() method 42 | # (from other spider middleware) raises an exception. 43 | 44 | # Should return either None or an iterable of Response, dict 45 | # or Item objects. 46 | pass 47 | 48 | def process_start_requests(self, start_requests, spider): 49 | # Called with the start requests of the spider, and works 50 | # similarly to the process_spider_output() method, except 51 | # that it doesn’t have a response associated. 52 | 53 | # Must return only requests (not items). 
54 | for r in start_requests: 55 | yield r 56 | 57 | def spider_opened(self, spider): 58 | spider.logger.info('Spider opened: %s' % spider.name) 59 | ''' 60 | class DynamicProxyMiddleware(object): 61 | def process_request(self, request, spider): 62 | # time.sleep(1) 63 | auth_header = self.get_auth_header() 64 | request.meta['proxy'] = "http://s3.proxy.mayidaili.com:8123" 65 | request.headers['Proxy-Authorization'] = auth_header 66 | 67 | def get_auth_header(self): 68 | # 请替换app_key和secret 69 | app_key = "67783764" 70 | secret = "6151eb360668ca10ad772ca9e46d306b" 71 | 72 | param_map = { 73 | "app_key": app_key, 74 | "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), # 如果你的程序在国外,请进行时区处理 75 | "enable-simulate": 'true', 76 | "random-useragent": 'pc', 77 | "clear-cookies": 'true' 78 | } 79 | # 排序 80 | keys = param_map.keys() 81 | keys.sort() 82 | 83 | codes = "%s%s%s" % (secret, str().join('%s%s' % (key, param_map[key]) for key in keys), secret) 84 | 85 | # 计算签名 86 | sign = hashlib.md5(codes).hexdigest().upper() 87 | 88 | param_map["sign"] = sign 89 | 90 | # 拼装请求头Proxy-Authorization的值 91 | keys = param_map.keys() 92 | auth_header = "MYH-AUTH-MD5 " + str('&').join('%s=%s' % (key, param_map[key]) for key in keys) 93 | 94 | # print time.strftime("%Y-%m-%d %H:%M:%S") 95 | # print authHeader 96 | 97 | return auth_header 98 | ''' -------------------------------------------------------------------------------- /51jbnet/im_sandbox/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for im_sandbox project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'im_sandbox' 13 | 14 | SPIDER_MODULES = ['im_sandbox.spiders'] 15 | NEWSPIDER_MODULE = 'im_sandbox.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'im_sandbox (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | # DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'im_sandbox.middlewares.ImSandboxSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | 
#DOWNLOADER_MIDDLEWARES = { 56 | # 'im_sandbox.middlewares.ImSandboxDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'im_sandbox.pipelines.ESPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | MYSQL_DB_URI='mysql+pymysql://root:*@127.0.0.1:3306/spider?charset=utf8' 92 | MONGODB='' 93 | ES_HOST='10.18.6.102' -------------------------------------------------------------------------------- /baiduwanpan/baiduwanpan.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import time 3 | import sys 4 | header = {'Origin': 'https://pan.baidu.com', 'Content-Length': '26', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'X-Requested-With': 'XMLHttpRequest', 'Host': 'pan.baidu.com', 'Accept': '*/*', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', 'Connection': 'keep-alive', 'Cookie': 'BAIDUID=11BC8C5D223E048DDCCF45DA68C96329:FG=1; BIDUPSID=11BC8C5D223E048DDCCF45DA68C96329; PSTM=1502071949; __cfduid=dbc4d8c8a8ff8f8f56693bf9911a78f9a1502257445; PANWEB=1; bdshare_firstime=1502276137037; BDSFRCVID=4g8sJeC62lrjCp3ZxSq0MencMmK52YjTH6aotvr5NjaXcbr6amOqEG0PqM8g0Ku-aG3kogKK3gOTH4nP; 
H_BDCLCKID_SF=JJkH_CIMJCvbfP0k5bo0M-FSMMrX5C62aJ3DW45bWJ5TMC_w5l6KWbDl2-O0Qfr-aD7uWx022bubShPC-tnGM4IzWfon363D-a6U-xDE3l02V-j9e-t2ynQDDljRq4RMW20e0h7mWIb_VKFCjTKhejO0epJf-K6Jb6Q3BROS2RrHKROkeUOlyJtpbt-qJjcqyjrvQfcy3nTZ8J5k-UcV3T0fhGJnBT5Kaa6BBqQw5xbNM-jR0qJl0DukQN3TbRkO5bRiL6C-bq-BDn3oyTbJXp0njMTTqj_efnCDoD8QKbRofJ-k-4QEbbQH-UnLq-LqX57Z0l8Ktt3_ohjSyl6W0pLHXfoX5MrLWbTPbI3mWIQHSRQLLx7m5-KyjMne3JcpLa74KKJx-xKWeIJo5Dc6D6kzhUJiB5JMBan7_nrxfDD5bKDlD6-3-PAe5f8X5to05TIX3b7Ef-5ZM-O_bf--DR-HW-Q7BqTOL5RL2R58Kh6VOI5a05Jxy5K_3xjz3fvTbIce_n7b0tT4VUOHQT3mKqQbbN3i-CrgtJblWb3cWKOJ8UbSj-Tme6jXeautJ6F8f5vfL5rDa-n5HJjRq4bohjPjMPQeBtQmJJrtahRCMl7AJMO3Mxcqh4tIhtnCtp5BQg-q3R71MqvZMbrHBUQPbj8AWa5w0x-jLT6PVn0MW-5D8h6nLPnJyUnybPnnBT3XLnLHoDPXJCDBbDv65nt_b44bKUQKbK62aKDs5lRc-hcqEIL45fRaDq47Wl7gLtcu5Co22R6cJRuK8UbSj4QoXbIUWHOX0lRC3DTu3toufp5nhMJl3j7JDMP0-4vu5MJy523iob3vQpPMDxtuj68WejcXjNRjtnOe5C6H3bP8tCLWb5rnhPF3j-bbKP6-35KHaTrB5-tbytn6qDJEbtTjXtuUjH5kaq37JD6yLPQ-Jlr8Hfnn-RK--tugKtoxJpODBRbMopvaHRjnhnvvbURvDP-g3-AJ2q8EK5r2SC-ytI_-3J; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a02553875233; MCITY=-257%3A; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=7; H_PS_PSSID=1455_21114_17001_19897; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; STOKEN=98916c84333e810c2b1d715bb7f7cf805ae2faf839dc1e7b2ffea14af9a43422; SCRC=e189858affb6c034f51facb687ba42a3; BDCLND=Z12FNBCnoSTSfwubbu7R1dmuJgAkUv%2FVXMPFC%2FhXqtw%3D; PANPSC=8159382662928957333%3A0tGXwXye%2FVgybgBxVCVQs9wxnZzNwr1w%2Fi1kePBHTIGypp29WjDdFHgXofrWESI4GPVIaAX1Mx4yLJx7kL47ECcTFj%2FtuMrTJEGGcevXkUatUq%2FdzxBw4vvqPIbe4OQ9iyFns5yFArUpANCmD7pcJX5IlZf3%2F0X8eJFOG%2FXb%2FW8u%2BjscPFpwMA%3D%3D; Hm_lvt_7a3960b6f067eb0085b7f96ff5e660b0=1504793178,1504793213,1504793250,1504793289; Hm_lpvt_7a3960b6f067eb0085b7f96ff5e660b0=1505901469', 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', 'Referer': 'https://pan.baidu.com/share/init?surl=o8zEuJC', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'} 5 | import requests 6 | import re 7 | 8 | for _ in range(100): 9 | # re.sub('\d',) 10 | if sys.version_info.major <3: 11 | t = str(long(time.time() * 1000)) 12 | else: 13 | t = str(int(time.time() * 1000)) 14 | #print(t) 15 | url='https://pan.baidu.com/share/verify?surl=o8zEuJC&t=%s&bdstoken=null&channel=chunlei&clienttype=0&web=1&app_id=250528&logid=MTUwNTkwMTQ3NzYzNjAuNTQwMjcwOTYwMTg0MTkyOA==' %t 16 | #url = 'https://pan.baidu.com/share/verify?surl=mhPHC7Y&t=%s&bdstoken=c5232d2c47ec22f6fb2de6a151828c91&channel=chunlei&clienttype=0&web=1&app_id=250528&logid=MTUwNTkwMDQyNDI2MzAuNDQyNTQxMzMyNDU0MTQ4NQ==' % t 17 | data = {'pwd': '2222', 'vcode': '', 'vcode_str': ''} 18 | r = requests.post(url=url, data=data, headers=header) 19 | js = r.json() 20 | print(js) 21 | 22 | pw='gxrr' 23 | data = {'pwd': pw, 'vcode': '', 'vcode_str': ''} 24 | r = requests.post(url=url, data=data, headers=header) 25 | js = r.json() 26 | print(js) --------------------------------------------------------------------------------
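The ESPipeline in 51jbnet/im_sandbox/pipelines.py imports ES_HOST directly from im_sandbox.settings, while the MongoPipeline in async_cuiqingcai/async_sandbox/pipelines.py reads its MONGO_* values through from_crawler(). The sketch below is not part of the repository; it is a minimal, hedged illustration of the second pattern applied to the first pipeline. The class name SettingsAwareESPipeline is hypothetical, the ES_HOST setting name is the one defined in im_sandbox/settings.py, and the calls follow the elasticsearch-py 7.x style used above.

import datetime

from elasticsearch import Elasticsearch
from scrapy.exceptions import DropItem


class SettingsAwareESPipeline:
    """Hypothetical variant of ESPipeline: host comes from crawler.settings, not a module import."""

    def __init__(self, es_host, index='51jbnet'):
        self.index = index
        self.es = Elasticsearch(es_host)

    @classmethod
    def from_crawler(cls, crawler):
        # ES_HOST is defined in settings.py, e.g. ES_HOST = '10.18.6.102'
        return cls(crawler.settings.get('ES_HOST'))

    def process_item(self, item, spider):
        # Same duplicate check as the original pipeline: drop the item if its url is already indexed.
        query = {'query': {'term': {'url': item['url']}}}
        if self.es.search(index=self.index, body=query)['hits']['hits']:
            raise DropItem('Duplicate item: {}'.format(item['url']))
        body = dict(item)
        body['crawled_datetime'] = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        self.es.index(index=self.index, body=body)  # doc_type omitted; Elasticsearch 7+ ignores it
        return item

Reading configuration through crawler.settings lets command-line -s overrides and per-spider custom_settings take effect, which a module-level import of settings.py bypasses; otherwise the behaviour matches the dedup-then-index flow shown in the original pipeline.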