├── chahaoba
    ├── README.md
    ├── sandbox
    │   ├── __init__.py
    │   ├── spiders
    │   │   ├── spider_subscribe.py
    │   │   └── __init__.py
    │   ├── items.py
    │   ├── utility.py
    │   ├── models.py
    │   └── pipelines.py
    ├── run.py
    ├── scrapy.cfg
    ├── .gitignore
    └── sync_data.py
├── jd
    ├── jd
    │   ├── __init__.py
    │   ├── spiders
    │   │   ├── __init__.py
    │   │   ├── quotes.py
    │   │   └── jd_book.py
    │   ├── items.py
    │   └── pipelines.py
    ├── requirements.txt
    ├── run.py
    ├── scrapy.cfg
    └── switch_ip.py
├── kc0011
    ├── README.md
    ├── sandbox
    │   ├── __init__.py
    │   ├── spiders
    │   │   └── __init__.py
    │   ├── utility.py
    │   ├── items.py
    │   └── pipelines.py
    ├── run.py
    ├── scrapy.cfg
    ├── .gitignore
    └── async_mongo.py
├── tiexue
    ├── README.md
    ├── sandbox
    │   ├── __init__.py
    │   ├── spiders
    │   │   └── __init__.py
    │   ├── items.py
    │   ├── utility.py
    │   └── pipelines.py
    ├── run.py
    ├── qqq
    ├── scrapy.cfg
    └── .gitignore
├── fraud
    ├── fraud
    │   ├── __init__.py
    │   ├── model
    │   │   ├── __init__.py
    │   │   ├── db_config.py
    │   │   └── fraud.py
    │   ├── spiders
    │   │   └── __init__.py
    │   ├── items.py
    │   ├── match.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── middlewares.py
    ├── run.py
    └── scrapy.cfg
├── lanrentingshu
    ├── 懒人听书.txt
    ├── lrts
    │   ├── lrts
    │   │   ├── __init__.py
    │   │   ├── spiders
    │   │   │   ├── __init__.py
    │   │   │   └── tingshu.py
    │   │   ├── items.py
    │   │   ├── pipelines.py
    │   │   └── settings.py
    │   ├── run.py
    │   └── scrapy.cfg
    ├── header_toolkit.py
    ├── request_header
    └── lanrentingshu.py
├── sz_yaohao
    ├── README.md
    ├── sandbox
    │   ├── __init__.py
    │   ├── spiders
    │   │   ├── __init__.py
    │   │   └── website.py
    │   ├── items.py
    │   ├── headers.txt
    │   ├── utility.py
    │   ├── models.py
    │   └── pipelines.py
    ├── run.py
    ├── scrapy.cfg
    └── .gitignore
├── weibo
    ├── weibo
    │   ├── __init__.py
    │   ├── spiders
    │   │   ├── __init__.py
    │   │   └── wb.py
    │   ├── pipelines.py
    │   └── items.py
    └── scrapy.cfg
├── bbssmth
    ├── bbssmth
    │   ├── __init__.py
    │   ├── spiders
    │   │   ├── test.html
    │   │   └── __init__.py
    │   ├── items.py
    │   └── pipelines.py
    ├── cmd_run.py
    └── scrapy.cfg
├── myubbs
    ├── sandbox
    │   ├── __init__.py
    │   ├── spiders
    │   │   ├── __init__.py
    │   │   └── website.py
    │   ├── items.py
    │   ├── utility.py
    │   ├── models.py
    │   ├── headers
    │   └── pipelines.py
    ├── run.py
    ├── scrapy.cfg
    └── .gitignore
├── 51jbnet
    ├── im_sandbox
    │   ├── __init__.py
    │   ├── spiders
    │   │   ├── __init__.py
    │   │   └── website.py
    │   ├── items.py
    │   ├── models.py
    │   ├── pipelines.py
    │   └── settings.py
    ├── README.MD
    ├── start_task.py
    └── scrapy.cfg
├── bilibili
    ├── bilibili
    │   ├── __init__.py
    │   ├── spiders
    │   │   ├── __init__.py
    │   │   ├── bili.log
    │   │   └── bili.py
    │   ├── pipelines.py
    │   ├── items.py
    │   └── logger.py
    ├── run.py
    └── scrapy.cfg
├── poi_gaode
    ├── sandbox
    │   ├── __init__.py
    │   ├── spiders
    │   │   └── __init__.py
    │   ├── items.py
    │   ├── models.py
    │   └── pipelines.py
    ├── run.py
    ├── scrapy.cfg
    ├── .gitignore
    ├── .ipynb_checkpoints
    │   └── Untitled-checkpoint.ipynb
    └── gaode_map.py
├── tencentjob
    ├── tencentjob
    │   ├── __init__.py
    │   ├── spiders
    │   │   ├── __init__.py
    │   │   └── tencent.py
    │   ├── items.py
    │   └── pipelines.py
    ├── run.py
    └── scrapy.cfg
├── MyLibrary
    ├── sandbox
    │   ├── sandbox
    │   │   ├── __init__.py
    │   │   ├── spiders
    │   │   │   └── __init__.py
    │   │   ├── items.py
    │   │   ├── models.py
    │   │   ├── pipelines.py
    │   │   └── settings.py
    │   ├── run.py
    │   ├── scrapy.cfg
    │   └── .gitignore
    ├── __init__.py
    └── login.py
├── cuiqingcai
    ├── README.MD
    ├── async_sandbox
    │   ├── __init__.py
    │   ├── spiders
    │   │   ├── __init__.py
    │   │   └── example.py
    │   ├── items.py
    │   ├── utility.py
    │   └── pipelines.py
    ├── run_spider.py
    └── scrapy.cfg
├── async_cuiqingcai
    ├── async_sandbox
    │   ├── __init__.py
    │   ├── monitor
    │   │   ├── __init__.py
    │   │   ├── settings.py
    │   │   ├── app.py
    │   │   ├── statscol.py
    │   │   └── templates
    │   │   │   └── index.html
    │   ├── spiders
    │   │   ├── __init__.py
    │   │   └── crawl_all_example.py
    │   ├── items.py
    │   ├── commands
    │   │   └── crawlall.py
    │   ├── RedisDuplicator.py
    │   ├── CustomExtension.py
    │   ├── CustomMiddleware.py
    │   └── pipelines.py
    ├── run_spider.py
    ├── README.MD
    ├── scrapy.cfg
    ├── rabbit_send.py
    └── multi_spider_run.py
├── ximalaya
    ├── README.MD
    ├── story.py
    └── main.py
├── chinaclear
    └── README.md
├── pornhub
    ├── _config.yml
    ├── requirements.txt
    ├── parseJS.py
    ├── README.md
    ├── tampermonkey.js
    ├── newJs.js
    ├── .gitignore
    └── cookies_access.py
├── phone_bomb
    ├── fuguo.py
    └── ceoonline
    │   ├── train
    │       └── ceconline.h5
    │   └── images
    │       └── ceconline.h5
├── 51CTOCrawler
    ├── getkey.txt
    └── demo.py
├── .gitattributes
├── holdle
    ├── common
    │   └── __init__.py
    └── sync_spider.py
├── stockholder
    ├── __init__.py
    └── main.py
├── dfcf
    ├── push_redis.py
    └── settings.py
├── github_star
    ├── .vscode
    │   └── launch.json
    └── star.py
├── m3u8_video
    └── experience.py
├── csdn
    └── getCSDN_Range.py
├── .gitignore
├── Forbes
    └── main.py
├── 52sh
    ├── config_file.py
    └── aio_spider.py
├── qianfangyiguan
    └── qianfan_models.py
├── dashiye
    └── main.py
├── README.MD
├── szhouse
    ├── database.py
    └── house.py
├── Ergeduoduo
    └── main.py
├── fangtianxia
    └── fangtianxia_proxy_test.py
├── yinyonbao
    └── yingyongbao.py
├── v2ex_job
    └── v2ex2.py
├── anjuke
    ├── test_anjuke.py
    └── anjuke.py
├── youdao_dictionary
    └── youdao.py
├── zhihu
    └── zhihu_book.py
├── stock_pledge
    └── crawler.py
└── baiduwanpan
    └── baiduwanpan.py


/chahaoba/README.md:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/jd/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/kc0011/README.md:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tiexue/README.md:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/fraud/fraud/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/lanrentingshu/懒人听书.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/sz_yaohao/README.md:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/weibo/weibo/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/bbssmth/bbssmth/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/chahaoba/sandbox/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/kc0011/sandbox/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/myubbs/sandbox/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tiexue/sandbox/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/51jbnet/im_sandbox/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/bilibili/bilibili/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/fraud/fraud/model/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/poi_gaode/sandbox/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/sz_yaohao/sandbox/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tencentjob/tencentjob/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/51jbnet/README.MD:
--------------------------------------------------------------------------------
1 | # Improve Sandbox


--------------------------------------------------------------------------------
/MyLibrary/sandbox/sandbox/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/cuiqingcai/README.MD:
--------------------------------------------------------------------------------
1 | # 异步爬虫框架 - 写入数据库异步


--------------------------------------------------------------------------------
/cuiqingcai/async_sandbox/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/lanrentingshu/lrts/lrts/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/ximalaya/README.MD:
--------------------------------------------------------------------------------
1 | ## 喜马拉雅爬虫 杨继东的投资之道
2 | 


--------------------------------------------------------------------------------
/chinaclear/README.md:
--------------------------------------------------------------------------------
1 | ## 中登新增投资者数目抓取
2 | ## 每周定时抓取


--------------------------------------------------------------------------------
/pornhub/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-modernist


--------------------------------------------------------------------------------
/chahaoba/sandbox/spiders/spider_subscribe.py:
--------------------------------------------------------------------------------
1 | # 使用订阅者模式爬虫


--------------------------------------------------------------------------------
/jd/requirements.txt:
--------------------------------------------------------------------------------
1 | selenium
2 | pandas
3 | scrapy
4 | requests
5 | 


--------------------------------------------------------------------------------
/pornhub/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | pysocks
3 | lxml
4 | js2py
5 | clint
6 | fire
7 | loguru
8 | 


--------------------------------------------------------------------------------
/phone_bomb/fuguo.py:
--------------------------------------------------------------------------------
1 | # 富国基金
2 | url='https://etrading.fullgoal.com.cn/etrading/account/openacco/quickinit'


--------------------------------------------------------------------------------
/MyLibrary/__init__.py:
--------------------------------------------------------------------------------
1 | #-*-coding=utf-8-*-
2 | 
3 | def main():
4 | 
5 | 
6 | if __name__ == '__main__':
7 |     main()


--------------------------------------------------------------------------------
/bbssmth/bbssmth/spiders/test.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rockyzsu/CrawlMan/master/bbssmth/bbssmth/spiders/test.html


--------------------------------------------------------------------------------
/51CTOCrawler/getkey.txt:
--------------------------------------------------------------------------------
1 | TiThyXMlilmeFso5akV6ZDg0NlpEWXhOak0zTmNUaG1PRFprTUdNY3lPV1l4YVllekpsTzBEMlU1MzU2NTQ4ZDM4M005ZDI1OGM5VGM3eg==


--------------------------------------------------------------------------------
/phone_bomb/ceoonline/train/ceconline.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rockyzsu/CrawlMan/master/phone_bomb/ceoonline/train/ceconline.h5


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/monitor/__init__.py:
--------------------------------------------------------------------------------
1 | # *-* coding:utf-8 *-*
2 | '''
3 | @author: ioiogoo
4 | @date: 2016/12/25 15:05
5 | '''


--------------------------------------------------------------------------------
/phone_bomb/ceoonline/images/ceconline.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rockyzsu/CrawlMan/master/phone_bomb/ceoonline/images/ceconline.h5


--------------------------------------------------------------------------------
/tencentjob/run.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*-coding=utf-8-*-
3 | from scrapy import cmdline
4 | cmd = 'scrapy crawl tencent'
5 | cmdline.execute(cmd.split())


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=python
2 | *.css linguist-language=python
3 | *.ipynb linguist-language=python
4 | *.html linguist-language=python


--------------------------------------------------------------------------------
/holdle/common/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/11/16 11:39
3 | # @File : __init__.py
4 | # @Author : Rocky C@www.30daydo.com
5 | 


--------------------------------------------------------------------------------
/cuiqingcai/run_spider.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | name = 'example'
3 | cmdline.execute('scrapy crawl {} -s LOG_FILE=cuiqingcai.log'.format(name).split())


--------------------------------------------------------------------------------
/fraud/run.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | 
3 | name = 'fraud_info'  # fraud_info
4 | cmd = 'scrapy crawl {0}'.format(name)
5 | cmdline.execute(cmd.split())


--------------------------------------------------------------------------------
/async_cuiqingcai/run_spider.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | name = 'example'
3 | cmdline.execute('scrapy crawl {} -s LOG_FILE=scrapy.log'.format(name).split())


--------------------------------------------------------------------------------
/async_cuiqingcai/README.MD:
--------------------------------------------------------------------------------
1 | # 异步爬虫框架 - 写入数据库异步
2 | # 自定义去重
3 | # 自定义pipeline 设置open_spider,close_spider
4 | 
5 | # 自定义去重
6 | # 爬虫中间件
7 | # rabbitmq消息队列
8 | # 下载中间件
9 | 


--------------------------------------------------------------------------------
/jd/run.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | # @Time : 2018/8/8 22:23
3 | # @File : run.py
4 | from scrapy import cmdline
5 | cmd ='scrapy crawl jd_book'
6 | cmdline.execute(cmd.split())


--------------------------------------------------------------------------------
/lanrentingshu/lrts/run.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | from scrapy import cmdline
3 | 
4 | name = 'tingshu'  #
5 | cmd = 'scrapy crawl {0}'.format(name)
6 | cmdline.execute(cmd.split())


--------------------------------------------------------------------------------
/51jbnet/start_task.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | 
3 | # @Time : 2019/5/16 17:44
4 | # @File : start_task.py
5 | from scrapy import cmdline
6 | cmdline.execute('scrapy crawl website'.split())


--------------------------------------------------------------------------------
/pornhub/parseJS.py:
--------------------------------------------------------------------------------
1 | import js2py
2 | filename = '20201229js.js'
3 | with open(filename,'r') as f:
4 |     content = f.read()
5 | 
6 | js_object = js2py.eval_js(content)
7 | print(js_object)


--------------------------------------------------------------------------------
/jd/jd/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/bbssmth/cmd_run.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # website: http://30daydo.com
3 | # @Time : 2019/4/18 23:19
4 | # @File : cmd.py
5 | from scrapy import cmdline
6 | cmdline.execute('scrapy crawl bbssm'.split())


--------------------------------------------------------------------------------
/fraud/fraud/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/weibo/weibo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/bbssmth/bbssmth/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/bilibili/run.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | # @Time : 2018/8/15 14:52
3 | # @File : run.py
4 | from scrapy import cmdline
5 | name = 'ordinary'
6 | cmd = 'scrapy crawl {}'.format(name)
7 | cmdline.execute(cmd.split())


--------------------------------------------------------------------------------
/chahaoba/sandbox/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/kc0011/sandbox/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/myubbs/sandbox/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/tiexue/sandbox/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/51jbnet/im_sandbox/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/bilibili/bilibili/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/poi_gaode/sandbox/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/sz_yaohao/sandbox/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/tencentjob/tencentjob/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/MyLibrary/sandbox/sandbox/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/chahaoba/run.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | 
3 | # @Time : 2018/9/26 9:58
4 | # @File : run.py
5 | 
6 | from scrapy import cmdline
7 | name = 'chahaoba'
8 | cmd = 'scrapy crawl {}'.format(name)
9 | cmdline.execute(cmd.split())


--------------------------------------------------------------------------------
/cuiqingcai/async_sandbox/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/lanrentingshu/lrts/lrts/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/poi_gaode/run.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | 
3 | # @Time : 2018/9/26 9:58
4 | # @File : run.py
5 | 
6 | from scrapy import cmdline
7 | name = 'gaode1'
8 | cmd = 'scrapy crawl {}'.format(name)
9 | cmdline.execute(cmd.split())


--------------------------------------------------------------------------------
/sz_yaohao/run.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | 
3 | # @Time : 2018/9/26 9:58
4 | # @File : run.py
5 | 
6 | from scrapy import cmdline
7 | name = 'website'
8 | cmd = 'scrapy crawl {}'.format(name)
9 | cmdline.execute(cmd.split())


--------------------------------------------------------------------------------
/tiexue/run.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | 
3 | # @Time : 2018/9/26 9:58
4 | # @File : run.py
5 | 
6 | from scrapy import cmdline
7 | name = 'example'
8 | cmd = 'scrapy crawl {}'.format(name)
9 | cmdline.execute(cmd.split())


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/MyLibrary/sandbox/run.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | 
3 | # @Time : 2018/9/26 9:58
4 | # @File : run.py
5 | 
6 | from scrapy import cmdline
7 | name = 'website2'
8 | cmd = 'scrapy crawl {}'.format(name)
9 | cmdline.execute(cmd.split())


--------------------------------------------------------------------------------
/kc0011/run.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/9/26 9:58
 4 | # @File : run.py
 5 | 
 6 | from scrapy import cmdline
 7 | name = 'spider'
 8 | cmd = 'scrapy crawl {} -s JOBDIR=jobs'.format(name)
 9 | cmdline.execute(cmd.split())
10 | 


--------------------------------------------------------------------------------
/stockholder/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | __author__ = 'Rocky'
 4 | '''
 5 | http://30daydo.com
 6 | Email: weigesysu@qq.com
 7 | '''
 8 | 
 9 | 
10 | def main():
11 |     pass
12 | 
13 | 
14 | if __name__ == '__main__':
15 |     main()


--------------------------------------------------------------------------------
/tiexue/qqq:
--------------------------------------------------------------------------------
1 | <div class="auteurInfo">
2 |                         <div class="bar_1"> <span class="fatieren float_L"><img src="http://r.itiexue.net/bbs/other/images/user.gif" width="30" height="30" class="float_L" /><a href="http://i.tiexue.net/13081352" class="coloBlue" target="_blank">旌旗漫卷</a></span>


--------------------------------------------------------------------------------
/myubbs/run.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/9/26 9:58
 4 | # @File : run.py
 5 | import datetime
 6 | 
 7 | from scrapy import cmdline
 8 | name = 'myubbs'
 9 | current = datetime.date.today()
10 | cmd = 'scrapy crawl {} -s LOG_FILE={}.log'.format(name,current)
11 | cmdline.execute(cmd.split())


--------------------------------------------------------------------------------
/jd/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = jd.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = jd
12 | 


--------------------------------------------------------------------------------
/fraud/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = fraud.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = fraud
12 | 


--------------------------------------------------------------------------------
/weibo/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = weibo.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = weibo
12 | 


--------------------------------------------------------------------------------
/bbssmth/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = bbssmth.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = bbssmth
12 | 


--------------------------------------------------------------------------------
/chahaoba/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = sandbox.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 | 


--------------------------------------------------------------------------------
/kc0011/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = sandbox.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 | 


--------------------------------------------------------------------------------
/myubbs/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = sandbox.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 | 


--------------------------------------------------------------------------------
/poi_gaode/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = sandbox.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 | 


--------------------------------------------------------------------------------
/sz_yaohao/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = sandbox.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 | 


--------------------------------------------------------------------------------
/tiexue/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = sandbox.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 | 


--------------------------------------------------------------------------------
/51jbnet/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = im_sandbox.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = im_sandbox
12 | 


--------------------------------------------------------------------------------
/bilibili/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = bilibili.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = bilibili
12 | 


--------------------------------------------------------------------------------
/lanrentingshu/lrts/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = lrts.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = lrts
12 | 


--------------------------------------------------------------------------------
/MyLibrary/sandbox/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = sandbox.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 | 


--------------------------------------------------------------------------------
/tencentjob/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = tencentjob.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = tencentjob
12 | 


--------------------------------------------------------------------------------
/cuiqingcai/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = async_sandbox.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = async_sandbox
12 | 


--------------------------------------------------------------------------------
/async_cuiqingcai/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = async_sandbox.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = async_sandbox
12 | 


--------------------------------------------------------------------------------
/weibo/weibo/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | class WeiboPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/bilibili/bilibili/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | class BilibiliPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/weibo/weibo/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class WeiboItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 


--------------------------------------------------------------------------------
/bilibili/bilibili/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class BilibiliItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 


--------------------------------------------------------------------------------
/lanrentingshu/lrts/lrts/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class LrtsItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 


--------------------------------------------------------------------------------
/lanrentingshu/header_toolkit.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | def getheader():
 4 |     with open('request_header') as fp:
 5 |         data=fp.readlines()
 6 |     dictionary=dict()
 7 |     for line in data:
 8 |         line=line.strip()
 9 |         dictionary[line.split(":")[0]]=':'.join(line.split(":")[1:])
10 |     return dictionary
11 | if __name__=="__main__":
12 |     print getheader()


--------------------------------------------------------------------------------
/bilibili/bilibili/spiders/bili.log:
--------------------------------------------------------------------------------
1 | 2018-08-15 15:12:59 - E:\git\CrawlMan\bilibili\bilibili\spiders\bili.py - INFO: - ====================================================
2 | 2018-08-15 15:12:59 - E:\git\CrawlMan\bilibili\bilibili\spiders\bili.py - INFO: - <div class="num-wrap"><span>--</span></div>
3 | 2018-08-15 15:12:59 - E:\git\CrawlMan\bilibili\bilibili\spiders\bili.py - INFO: - ====================================================
4 | 


--------------------------------------------------------------------------------
/MyLibrary/sandbox/sandbox/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | from scrapy import Item,Field
 9 | 
10 | 
11 | class SpiderItem(Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     username=Field()
15 |     password = Field()
16 | 
17 | 


--------------------------------------------------------------------------------
/chahaoba/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | *.pyc
 3 | data.cfg
 4 | *.mp3
 5 | *.pkl
 6 | *.xls
 7 | *.xml
 8 | *.csv
 9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 | 


--------------------------------------------------------------------------------
/kc0011/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | *.pyc
 3 | data.cfg
 4 | *.mp3
 5 | *.pkl
 6 | *.xls
 7 | *.xml
 8 | *.csv
 9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 | 


--------------------------------------------------------------------------------
/myubbs/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | *.pyc
 3 | data.cfg
 4 | *.mp3
 5 | *.pkl
 6 | *.xls
 7 | *.xml
 8 | *.csv
 9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 | 


--------------------------------------------------------------------------------
/tiexue/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | *.pyc
 3 | data.cfg
 4 | *.mp3
 5 | *.pkl
 6 | *.xls
 7 | *.xml
 8 | *.csv
 9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 | 


--------------------------------------------------------------------------------
/chahaoba/sync_data.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2019/8/22 16:56
 4 | # @File : sync_data.py
 5 | import redis
 6 | r=redis.StrictRedis('10.18.6.46',db=8,decode_responses=True)
 7 | import pymysql
 8 | con = pymysql.connect(host='',port=,db='spider',user='',password='')
 9 | cursor = con.cursor()
10 | cmd = 'select number from chahaoba'
11 | cursor.execute(cmd)
12 | ret = cursor.fetchall()
13 | for i in ret:
14 |     r.sadd('chahaoba',i[0])
15 | 


--------------------------------------------------------------------------------
/poi_gaode/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | *.pyc
 3 | data.cfg
 4 | *.mp3
 5 | *.pkl
 6 | *.xls
 7 | *.xml
 8 | *.csv
 9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 | 


--------------------------------------------------------------------------------
/sz_yaohao/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | *.pyc
 3 | data.cfg
 4 | *.mp3
 5 | *.pkl
 6 | *.xls
 7 | *.xml
 8 | *.csv
 9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 | 


--------------------------------------------------------------------------------
/MyLibrary/sandbox/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | *.pyc
 3 | data.cfg
 4 | *.mp3
 5 | *.pkl
 6 | *.xls
 7 | *.xml
 8 | *.csv
 9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 | 


--------------------------------------------------------------------------------
/51jbnet/im_sandbox/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | class SandboxItem(scrapy.Item):
11 |     # define the fields for your item here like:
12 |     # name = scrapy.Field()
13 |     title=scrapy.Field()
14 |     url=scrapy.Field()
15 |     pubdate=scrapy.Field()
16 |     category=scrapy.Field()
17 | 


--------------------------------------------------------------------------------
/myubbs/sandbox/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | from scrapy import Item, Field
 9 | 
10 | 
11 | class SpiderItem(Item):
12 |     # define the fields for your item here like:
13 | 
14 |     title = Field()
15 |     pubdate = Field()
16 |     content = Field()
17 |     author = Field()
18 |     url = Field()
19 |     crawltime = Field()
20 | 


--------------------------------------------------------------------------------
/jd/jd/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class JdItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     # pass
15 |     name=scrapy.Field()
16 |     price=scrapy.Field()
17 |     remark=scrapy.Field()
18 |     publish=scrapy.Field()
19 |     # shop=scrapy.Field()


--------------------------------------------------------------------------------
/dfcf/push_redis.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import redis
 3 | r=redis.StrictRedis('192.168.10.48',db=5,decode_responses=True)
 4 | 
 5 | name='todo.xlsx'
 6 | df=pd.read_excel(name,dtype={'symbol':str})
 7 | # print(df.head())
 8 | new_list=df.loc[df.industry.str.contains('汽车'), :]['symbol'].tolist()
 9 | # for i in df['代码'].values:
10 | #     r.lpush('code_list',i)
11 | old_file = '要爬取的个股列表.xlsx'
12 | df2=pd.read_excel(old_file,dtype={'代码':str})
13 | old_list = df2['代码'].tolist()
14 | for item in new_list:
15 |     if item not in old_list:
16 |         r.set(item,0)
17 | 


--------------------------------------------------------------------------------
/bbssmth/bbssmth/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | from scrapy import Item,Field
 9 | 
10 | 
11 | class BbssmthItem(Item):
12 |     # define the fields for your item here like:
13 |     # name = Field()
14 |     title = Field()
15 |     content = Field()
16 |     create_time = Field()
17 |     url = Field()
18 |     crawltime = Field()
19 |     category = Field()
20 |     author = Field()
21 |     reply = Field()
22 | 


--------------------------------------------------------------------------------
/jd/jd/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | from jd.items import JdItem
 8 | import pymongo
 9 | class JDPipeline(object):
10 |     def __init__(self):
11 |         self.mongo=pymongo.MongoClient('10.18.6.46',27001)
12 |         self.doc=self.mongo['spider']['jd_book']
13 |     def process_item(self, item, spider):
14 |         self.doc.insert(dict(item))
15 |         return item
16 | 


--------------------------------------------------------------------------------
/chahaoba/sandbox/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | from scrapy import Item,Field
 9 | import scrapy
10 | 
11 | class SpiderItem(Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     _number = scrapy.Field()
15 |     _city = scrapy.Field()
16 |     _province = scrapy.Field()
17 |     _card_type = scrapy.Field()
18 |     _op = scrapy.Field()
19 |     _card_detail= scrapy.Field()
20 | 


--------------------------------------------------------------------------------
/fraud/fraud/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # from scrapy.item import Item, Field
 3 | import scrapy
 4 | 
 5 | class FraudItem(scrapy.Item):
 6 |     executed_name = scrapy.Field()
 7 |     gender = scrapy.Field()
 8 |     age = scrapy.Field()
 9 |     identity_number = scrapy.Field()
10 |     court = scrapy.Field()
11 |     province = scrapy.Field()
12 |     case_number = scrapy.Field()
13 |     performance = scrapy.Field()  # 被执行人的履行情况
14 |     disrupt_type_name = scrapy.Field()  # 失信被执行人行为具体情形
15 |     duty = scrapy.Field()  # 生效法律文书确定的义务
16 |     release_time = scrapy.Field()
17 | 


--------------------------------------------------------------------------------
/sz_yaohao/sandbox/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | from scrapy import Item,Field
 9 | 
10 | 
11 | class SpiderItem(Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     card=Field()
15 |     accountLength = Field()
16 |     cardName = Field()
17 |     cardType = Field()
18 |     mainAccount = Field()
19 |     mainValue = Field()
20 |     orgName = Field()
21 |     crawltime = Field()
22 | 


--------------------------------------------------------------------------------
/github_star/.vscode/launch.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     // Use IntelliSense to learn about possible attributes.
 3 |     // Hover to view descriptions of existing attributes.
 4 |     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
 5 |     "version": "0.2.0",
 6 |     "configurations": [
 7 |         
 8 |         {
 9 |             "name": "Python: Current File",
10 |             "type": "python",
11 |             "request": "launch",
12 |             "program": "${file}",
13 |             "console": "integratedTerminal",
14 |             "args": ["rockyzsu"]
15 |         }
16 |     ]
17 | }


--------------------------------------------------------------------------------
/tencentjob/tencentjob/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | # import scrapy
 9 | from scrapy import Field,Item
10 | 
11 | class TencentjobItem(Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     title = Field()
15 |     catalog = Field()
16 |     workLocation = Field()
17 |     recruitNumber = Field()
18 |     duty = Field()
19 |     Job_requirement= Field()
20 |     url = Field()
21 |     publishTime = Field()
22 | 


--------------------------------------------------------------------------------
/tencentjob/tencentjob/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | import pymongo
 8 | from collections import OrderedDict
 9 | class TencentjobPipeline(object):
10 |     def __init__(self):
11 |         self.db = pymongo.MongoClient('localhost')
12 |         self.collection = self.db['tencent']['job']
13 | 
14 |     def process_item(self, item, spider):
15 |         self.collection.insert(OrderedDict(item))
16 |         return item
17 | 


--------------------------------------------------------------------------------
/m3u8_video/experience.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2019/12/2 9:17
 4 | # @File : experience.py
 5 | import requests
 6 | url='https://jh0p4t0rh9rs9610ryc.exp.bcevod.com/mda-jjkxjt57fdsith87/mda-jjkxjt57fdsith87.m3u8.{}.ts'
 7 | total = 253
 8 | headers={'User-Agent':'Xiaomi'}
 9 | data = 'data'
10 | for i in range(total+1):
11 |     try:
12 |         r = requests.get(url.format(i),headers=headers)
13 |     except Exception as e:
14 |         print(e)
15 |     else:
16 |         with open('data/{}.ts'.format(i),'wb') as f:
17 |             f.write(r.content)
18 |         print('done {}.ts'.format(i))
19 | 
20 | 


--------------------------------------------------------------------------------
/pornhub/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | - ```运行在Python环境```
 3 |     - ```git clone https://github.com/formateddd/Pornhub ```
 4 |     - ```cd Pornhub && pip install -r requirements.txt```
 5 |     - ```python crawler.py webm```
 6 |     - 待程序运行完毕， 会在webm文件夹下download两页的webm缩略图，对应名称为详细页面的URL后缀
 7 |     - 运行```python crawler.py mp4```, 在MP4文件夹可看到下载好的MP4文件
 8 | 
 9 | - ```运行在浏览器```
10 | 
11 |     - [安装油猴](http://tampermonkey.net/)
12 |     - Create a new script, copy and paste the [code](https://raw.githubusercontent.com/formateddd/pornhub/master/tampermonkey.js).
13 | 
14 |     
15 | 
16 |     ## 加群分析共享爬虫项目代码：
17 | 
18 |     ## 759746505
19 | 
20 | 


--------------------------------------------------------------------------------
/sz_yaohao/sandbox/headers.txt:
--------------------------------------------------------------------------------
 1 | Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
 2 | Accept-Encoding: gzip, deflate, br
 3 | Accept-Language: zh-CN,zh;q=0.9
 4 | Cache-Control: no-cache
 5 | Connection: keep-alive
 6 | Content-Type: application/x-www-form-urlencoded
 7 | Host: apply.jtys.sz.gov.cn
 8 | Origin: http://xqctk.jtys.sz.gov.cn
 9 | Pragma: no-cache
10 | Referer: http://xqctk.jtys.sz.gov.cn/?
11 | Upgrade-Insecure-Requests: 1
12 | User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36


--------------------------------------------------------------------------------
/csdn/getCSDN_Range.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | # Get your range of csdn
 3 | __author__ = 'rocky'
 4 | import requests
 5 | import re
 6 | import time
 7 | 
 8 | link = 'http://blog.csdn.net/yagamil/article/details/52858314'
 9 | user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
10 | header = {"User-Agent": user_agent}
11 | req = requests.get(link, headers=header)
12 | content =req.text
13 | p = re.search(r' <dl title="(\d+)">',content).group(1)
14 | today = time.strftime("%Y-%m-%d")
15 | f = open(r"D:\OneDrive\Stock_Data\csdn_range.txt", 'a')
16 | contents = today + '\t' + p + '\n'
17 | f.write(contents)
18 | f.close()
19 | 


--------------------------------------------------------------------------------
/cuiqingcai/async_sandbox/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class AsyncSandboxItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     title = scrapy.Field()
15 |     article_url = scrapy.Field()
16 |     content = scrapy.Field()
17 |     created_at = scrapy.Field()
18 |     category = scrapy.Field()
19 |     visited = scrapy.Field()
20 |     comment = scrapy.Field()
21 |     liked = scrapy.Field()
22 |     author = scrapy.Field()
23 |     crawltime = scrapy.Field()
24 | 


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class AsyncSandboxItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     title = scrapy.Field()
15 |     article_url = scrapy.Field()
16 |     # content = scrapy.Field()
17 |     created_at = scrapy.Field()
18 |     category = scrapy.Field()
19 |     visited = scrapy.Field()
20 |     comment = scrapy.Field()
21 |     liked = scrapy.Field()
22 |     author = scrapy.Field()
23 |     crawltime = scrapy.Field()
24 | 


--------------------------------------------------------------------------------
/lanrentingshu/lrts/lrts/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | from scrapy.pipelines.files import FilesPipeline
 9 | from urllib.parse import urlparse
10 | from os.path import basename,dirname,join
11 | class LrtsPipeline(object):
12 |     def process_item(self, item, spider):
13 |         return item
14 | 
15 | class MyFilesPipeline(FilesPipeline):
16 | 
17 |     def file_path(self, request, response=None, info=None):
18 |         path = urlparse(request.url).path
19 |         return join(basename(dirname(path)),basename(path))
20 | 


--------------------------------------------------------------------------------
/tiexue/sandbox/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | from scrapy import Item,Field
 9 | import scrapy
10 | 
11 | class SpiderItem(Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     title = scrapy.Field()
15 |     article_url = scrapy.Field()
16 |     content = scrapy.Field()
17 |     created_at = scrapy.Field()
18 |     # category = scrapy.Field()
19 |     # visited = scrapy.Field()
20 |     # comment = scrapy.Field()
21 |     # liked = scrapy.Field()
22 |     author = scrapy.Field()
23 |     crawltime = scrapy.Field()
24 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea/
 2 | *.pyc
 3 | *.mp3
 4 | cookies
 5 | .idea
 6 | *.pyc
 7 | data.cfg
 8 | *.mp3
 9 | *.pkl
10 | *.xls
11 | *.xml
12 | *.csv
13 | *.pkl
14 | ~$d.xlsx
15 | d.xlsx
16 | data/
17 | temp
18 | request_header
19 | header_toolkit.txt
20 | *.xlsx
21 | *.log
22 | __pycache__/
23 | wikizhword.text
24 | news_tensite_xml.dat
25 | news_tensite_xml.smarty.dat
26 | *.jpg
27 | Download/
28 | Download_IMG/
29 | *.zip
30 | cookies
31 | config.json
32 | config.py
33 | data.cfg
34 | setting.py
35 | setttings.py
36 | *.ts
37 | kc0011/jobs/requests.queue/p1
38 | kc0011/jobs/requests.queue/p0
39 | kc0011/jobs/requests.queue/active.json
40 | kc0011/jobs/spider.state
41 | kc0011/jobs/requests.seen
42 | *.jpg
43 | *.png
44 | *.jpeg
45 | configure/
46 | 


--------------------------------------------------------------------------------
/lanrentingshu/request_header:
--------------------------------------------------------------------------------
 1 | Accept:*/*
 2 | Accept-Encoding:gzip, deflate
 3 | Accept-Language:zh-CN,zh;q=0.8
 4 | Cache-Control:no-cache
 5 | Connection:keep-alive
 6 | Content-Length:0
 7 | Cookie:aliyungf_tc=AQAAADCDiwwT/gEAv7APt2maQ56C3T1o; uid=15052187062975665e8ceaad34eb9911f2a90ee5b66ad; CNZZDATA1254668430=2046036592-1505217321-null%7C1505217321; Hm_lvt_ada61571fd48bb3f905f5fd1d6ef0ec4=1505218688; Hm_lpvt_ada61571fd48bb3f905f5fd1d6ef0ec4=1505219620; JSESSIONID=EE57EEBB708D1DF15621C6949A4FBE48
 8 | Host:www.lrts.me
 9 | Origin:http://www.lrts.me
10 | Pragma:no-cache
11 | Referer:http://www.lrts.me/book/32551
12 | User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36
13 | X-Requested-With:XMLHttpRequest


--------------------------------------------------------------------------------
/fraud/fraud/model/db_config.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import create_engine
 2 | from sqlalchemy.orm import sessionmaker
 3 | import redis
 4 | 
 5 | 
 6 | engine = create_engine('mysql+pymysql://root:{}@localhost:3306/spider?charset=utf8')
 7 | DBSession = sessionmaker(bind=engine)
 8 | 
 9 | 
10 | class RedisPool:
11 |     def __init__(self, client_host="localhost", client_port=6379, client_db=0):
12 |         self.client_host = client_host
13 |         self.client_port = client_port
14 |         self.client_db = client_db
15 | 
16 |     def redis_pool(self):
17 |         pool = redis.ConnectionPool(
18 |             host=self.client_host,
19 |             port=self.client_port,
20 |             db=self.client_db,
21 |             decode_responses=True)
22 |         return redis.StrictRedis(connection_pool=pool)


--------------------------------------------------------------------------------
/kc0011/sandbox/utility.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/12/13 13:47
 4 | # @File : utility.py
 5 | 
 6 | import os
 7 | 
 8 | # 获取headers
 9 | 
10 | def get_header(header_file='headers.txt'):
11 |     path = os.path.dirname(__file__)
12 |     header_path = os.path.join(path,'headers',header_file)
13 |     if not os.path.exists(header_path):
14 |         return None
15 | 
16 |     with open(header_path) as fp:
17 |         data = fp.readlines()
18 |     dictionary = dict()
19 | 
20 |     for line in data:
21 |         line = line.strip()
22 |         line = line.replace(' ', '')
23 |         dictionary[line.split(":")[0].strip()] = ':'.join(
24 |             line.split(":")[1:])
25 | 
26 |     if 'Content-Length' in dictionary:
27 |         del dictionary['Content-Length']
28 | 
29 |     return dictionary


--------------------------------------------------------------------------------
/myubbs/sandbox/utility.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/12/13 13:47
 4 | # @File : utility.py
 5 | 
 6 | import os
 7 | 
 8 | # 获取headers
 9 | 
10 | def get_header(header_file='headers.txt'):
11 |     path = os.path.dirname(__file__)
12 |     header_path = os.path.join(path,'headers',header_file)
13 |     if not os.path.exists(header_path):
14 |         return None
15 | 
16 |     with open(header_path) as fp:
17 |         data = fp.readlines()
18 |     dictionary = dict()
19 | 
20 |     for line in data:
21 |         line = line.strip()
22 |         line = line.replace(' ', '')
23 |         dictionary[line.split(":")[0].strip()] = ':'.join(
24 |             line.split(":")[1:])
25 | 
26 |     if 'Content-Length' in dictionary:
27 |         del dictionary['Content-Length']
28 | 
29 |     return dictionary


--------------------------------------------------------------------------------
/tiexue/sandbox/utility.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/12/13 13:47
 4 | # @File : utility.py
 5 | 
 6 | import os
 7 | 
 8 | # 获取headers
 9 | 
10 | def get_header(header_file='headers.txt'):
11 |     path = os.path.dirname(__file__)
12 |     header_path = os.path.join(path,'headers',header_file)
13 |     if not os.path.exists(header_path):
14 |         return None
15 | 
16 |     with open(header_path) as fp:
17 |         data = fp.readlines()
18 |     dictionary = dict()
19 | 
20 |     for line in data:
21 |         line = line.strip()
22 |         line = line.replace(' ', '')
23 |         dictionary[line.split(":")[0].strip()] = ':'.join(
24 |             line.split(":")[1:])
25 | 
26 |     if 'Content-Length' in dictionary:
27 |         del dictionary['Content-Length']
28 | 
29 |     return dictionary


--------------------------------------------------------------------------------
/chahaoba/sandbox/utility.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/12/13 13:47
 4 | # @File : utility.py
 5 | 
 6 | import os
 7 | 
 8 | # 获取headers
 9 | 
10 | def get_header(header_file='headers.txt'):
11 |     path = os.path.dirname(__file__)
12 |     header_path = os.path.join(path,'headers',header_file)
13 |     if not os.path.exists(header_path):
14 |         return None
15 | 
16 |     with open(header_path) as fp:
17 |         data = fp.readlines()
18 |     dictionary = dict()
19 | 
20 |     for line in data:
21 |         line = line.strip()
22 |         line = line.replace(' ', '')
23 |         dictionary[line.split(":")[0].strip()] = ':'.join(
24 |             line.split(":")[1:])
25 | 
26 |     if 'Content-Length' in dictionary:
27 |         del dictionary['Content-Length']
28 | 
29 |     return dictionary


--------------------------------------------------------------------------------
/sz_yaohao/sandbox/utility.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/12/13 13:47
 4 | # @File : utility.py
 5 | 
 6 | import os
 7 | 
 8 | # 获取headers
 9 | 
10 | def get_header(header_file='headers.txt'):
11 |     path = os.path.dirname(__file__)
12 |     header_path = os.path.join(path,'headers',header_file)
13 |     if not os.path.exists(header_path):
14 |         return None
15 | 
16 |     with open(header_path) as fp:
17 |         data = fp.readlines()
18 |     dictionary = dict()
19 | 
20 |     for line in data:
21 |         line = line.strip()
22 |         line = line.replace(' ', '')
23 |         dictionary[line.split(":")[0].strip()] = ':'.join(
24 |             line.split(":")[1:])
25 | 
26 |     if 'Content-Length' in dictionary:
27 |         del dictionary['Content-Length']
28 | 
29 |     return dictionary


--------------------------------------------------------------------------------
/cuiqingcai/async_sandbox/utility.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/12/13 13:47
 4 | # @File : utility.py
 5 | 
 6 | import os
 7 | 
 8 | # 获取headers
 9 | 
10 | def get_header(header_file='headers.txt'):
11 |     path = os.path.dirname(__file__)
12 |     header_path = os.path.join(path,'headers',header_file)
13 |     if not os.path.exists(header_path):
14 |         return None
15 | 
16 |     with open(header_path) as fp:
17 |         data = fp.readlines()
18 |     dictionary = dict()
19 | 
20 |     for line in data:
21 |         line = line.strip()
22 |         line = line.replace(' ', '')
23 |         dictionary[line.split(":")[0].strip()] = ':'.join(
24 |             line.split(":")[1:])
25 | 
26 |     if 'Content-Length' in dictionary:
27 |         del dictionary['Content-Length']
28 | 
29 |     return dictionary


--------------------------------------------------------------------------------
/tiexue/sandbox/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | # from sandbox.models import SpiderModels, DBSession
 8 | import logging
 9 | import pymongo
10 | from sandbox import config
11 | from sandbox import settings
12 | 
13 | 
14 | 
15 | class MongoPipeline(object):
16 |     def __init__(self):
17 |         DOCUMENT = settings.MONGODB_DOC
18 |         self.db = pymongo.MongoClient(settings.MONGO_HOST, port=settings.MONGO_PORT)
19 |         self.doc = self.db['spider'][DOCUMENT]
20 | 
21 |     def process_item(self, item, spider):
22 |         print('on process')
23 |         insert_item = dict(item)
24 |         self.doc.insert(insert_item)
25 | 
26 |         return item
27 | 


--------------------------------------------------------------------------------
/poi_gaode/sandbox/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | from scrapy import Item, Field
 9 | 
10 | 
11 | class SpiderItem(Item):
12 |     # define the fields for your item here like:
13 | 
14 |     id = Field()
15 |     parent = Field()
16 |     name = Field()
17 |     type = Field()
18 |     typecode = Field()
19 |     biz_type = Field()
20 |     address = Field()
21 |     location = Field()
22 |     tel = Field()
23 |     distance = Field()
24 |     biz_ext = Field()
25 |     pname = Field()
26 |     cityname = Field()
27 |     adname = Field()
28 |     importance = Field()
29 |     shopid = Field()
30 |     shopinfo = Field()
31 |     poiweight = Field()
32 |     photos = Field()
33 |     crawltime = Field()
34 | 


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/monitor/settings.py:
--------------------------------------------------------------------------------
 1 | # *-* coding:utf-8 *-*
 2 | '''
 3 | @author: ioiogoo
 4 | @date: 2016/12/26 11:48
 5 | '''
 6 | 
 7 | '''
 8 |     TIMEINTERVAL        刷新时间间隔，单位毫秒
 9 |     POINTINTERVAL       图上各点之间间隔，越小则表示点越密集
10 |     POINTLENGTH         图上点的数量，越大则表示图上时间跨度越长
11 |     STATS_KEYS          图上显示的stats_key
12 |     REDIS_HOST          redis地址
13 |     REDIS_PORT          redis端口
14 |     REDIS_DB            redis数据库，默认0
15 |     APP_HOST            app运行地址，默认127.0.0.1
16 |     APP_PORT            app运行端口，默认5000
17 | '''
18 | 
19 | TIMEINTERVAL = 30000
20 | POINTINTERVAL = 30
21 | POINTLENGTH = 2000
22 | STATS_KEYS = ['downloader/request_count', 'downloader/response_count','downloader/response_status_count/200', 'item_scraped_count']
23 | REDIS_HOST = '10.18.6.46'
24 | REDIS_PORT = 6379
25 | REDIS_DB = 0
26 | APP_HOST = '127.0.0.1'
27 | APP_PORT = 5000
28 | 


--------------------------------------------------------------------------------
/async_cuiqingcai/rabbit_send.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # @Date    : 2019-08-30 17:25:46
 4 | # @Author  : Rocky Chen (weigesysu@qq.com)
 5 | # @Link    : http://30daydo.com
 6 | # @Version : $Id$
 7 | 
 8 | import pika
 9 | # import settings
10 | 
11 | credentials = pika.PlainCredentials('admin','admin')
12 | connection = pika.BlockingConnection(pika.ConnectionParameters('192.168.1.101',5672,'/',credentials))
13 | 
14 | channel = connection.channel()
15 | channel.exchange_declare(exchange='direct_log',exchange_type='direct') # fanout 就是组播
16 | 
17 | routing_key = 'info'
18 | message='https://36kr.com/pp/api/aggregation-entity?type=web_latest_article&b_id=59499&per_page=30'
19 | channel.basic_publish(
20 |     exchange='direct_log',
21 |     routing_key=routing_key,
22 |     body=message
23 |     )
24 | 
25 | print('sending message {}'.format(message))
26 | connection.close()
27 | 


--------------------------------------------------------------------------------
/poi_gaode/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "def get_max():\n",
10 |     "    with open('sz_poi.txt','r') as f:\n",
11 |     "        # js = json.load(f)\n",
12 |     "        while 1:\n",
13 |     "            "
14 |    ]
15 |   }
16 |  ],
17 |  "metadata": {
18 |   "kernelspec": {
19 |    "display_name": "Python 3",
20 |    "language": "python",
21 |    "name": "python3"
22 |   },
23 |   "language_info": {
24 |    "codemirror_mode": {
25 |     "name": "ipython",
26 |     "version": 3
27 |    },
28 |    "file_extension": ".py",
29 |    "mimetype": "text/x-python",
30 |    "name": "python",
31 |    "nbconvert_exporter": "python",
32 |    "pygments_lexer": "ipython3",
33 |    "version": "3.6.2"
34 |   }
35 |  },
36 |  "nbformat": 4,
37 |  "nbformat_minor": 2
38 | }
39 | 


--------------------------------------------------------------------------------
/pornhub/tampermonkey.js:
--------------------------------------------------------------------------------
 1 | // ==UserScript==
 2 | // @name         New Userscript
 3 | // @namespace    http://tampermonkey.net/
 4 | // @version      0.1
 5 | // @description  try to take over the world!
 6 | // @author       github.com/formateddd/pornhub
 7 | // @include      *.pornhub.com/view_video.php?viewkey=*
 8 | // @grant        none
 9 | // ==/UserScript==
10 | 
11 | 
12 | 
13 | (function() {
14 |     'use strict';
15 | 
16 |     // Your code here...
17 | 
18 | 
19 |     var qualites = [
20 |         "quality_1080p",
21 |         "quality_720p",
22 |         "quality_480p",
23 |         "quality_240p",
24 |     ];
25 | 
26 |     for (var i in qualites) {
27 |         if (window[qualites[i]]){
28 |             document.querySelector("h1").innerHTML += '<a href=' + window[qualites[i]] + '>' + qualites[i] + '</a>'
29 |             console.info(window.qualites[i]);
30 |             break
31 |         }
32 |     }
33 | 
34 | 
35 | })();
36 | 


--------------------------------------------------------------------------------
/jd/jd/spiders/quotes.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from scrapy_splash import SplashRequest
 4 | 
 5 | 
 6 | class QuotesSpider(scrapy.Spider):
 7 |     name = "quotes"
 8 |     allowed_domains = ["quotes.toscrape.com"]
 9 |     start_urls = ['http://quotes.toscrape.com/js/']
10 | 
11 |     def start_requests(self):
12 |         for url in self.start_urls:
13 |             yield SplashRequest(url, args={'images': 0, 'timeout': 3})
14 | 
15 |     def parse(self, response):
16 |         for sel in response.css('div.quote'):
17 |             quote = sel.css('span.text::text').extract_first()
18 |             author = sel.css('small.author::text').extract_first()
19 |             yield {'quote': quote, 'author': author}
20 |         href = response.css('li.next > a::attr(href)').extract_first()
21 |         if href:
22 |             url = response.urljoin(href)
23 |             yield SplashRequest(url, args={'images': 0, 'timeout': 3})


--------------------------------------------------------------------------------
/kc0011/sandbox/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | from scrapy import Item,Field
 9 | import scrapy
10 | 
11 | class SpiderItem(Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     nick_name = scrapy.Field()
15 |     level = scrapy.Field()
16 |     credit = scrapy.Field()
17 |     score_count = scrapy.Field()
18 |     tie_count = scrapy.Field()
19 |     jifeng = scrapy.Field()
20 |     register = scrapy.Field()
21 |     alipay=scrapy.Field()
22 |     email=scrapy.Field()
23 |     person_info_html = scrapy.Field()
24 |     crawltime = scrapy.Field()
25 | 
26 | class ContentItem(Item):
27 |     url = scrapy.Field()
28 |     publishTime = scrapy.Field()
29 |     author = scrapy.Field()
30 |     content = scrapy.Field()
31 |     crawltime=scrapy.Field()
32 | 
33 | 


--------------------------------------------------------------------------------
/bilibili/bilibili/logger.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | import logging
 3 | import datetime
 4 | import os
 5 | # from setting import llogger
 6 | def llogger(filename):
 7 | 
 8 |     logger = logging.getLogger(filename)  # 不加名称设置root logger
 9 |     logger.setLevel(logging.DEBUG)
10 |     formatter = logging.Formatter(
11 |         '%(asctime)s - %(name)s - %(levelname)s: - %(message)s',
12 |         datefmt='%Y-%m-%d %H:%M:%S')
13 |     # 使用FileHandler输出到文件
14 |     prefix = os.path.splitext(filename)[0]
15 |     fh = logging.FileHandler(prefix+'.log')
16 |     fh.setLevel(logging.DEBUG)
17 |     fh.setFormatter(formatter)
18 |     # 使用StreamHandler输出到屏幕
19 |     ch = logging.StreamHandler()
20 |     ch.setLevel(logging.DEBUG)
21 |     ch.setFormatter(formatter)
22 |     # 添加两个Handler
23 |     logger.addHandler(ch)
24 |     logger.addHandler(fh)
25 |     # logger.info('this is info message')
26 |     # logger.warning('this is warn message')
27 |     return logger
28 | 
29 | 


--------------------------------------------------------------------------------
/async_cuiqingcai/multi_spider_run.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # @Date    : 2019-08-30 16:20:47
 4 | # @Author  : Rocky Chen (weigesysu@qq.com)
 5 | # @Link    : http://30daydo.com
 6 | # @Version : $Id$
 7 | 
 8 | from crochet import setup
 9 | from importlib import import_module
10 | from scrapy.crawler import CrawlerRunner
11 | from scrapy.utils.project import get_project_settings
12 | setup()
13 | 
14 | # not work
15 | def run_spider(spiderName):
16 |     module_name="async_sandbox.spiders.{}".format(spiderName)
17 |     scrapy_var = import_module(module_name)   #do some dynamic import of selected spider   
18 |     print(scrapy_var)
19 |     print(dir(scrapy_var))
20 |     spiderObj=scrapy_var.ExampleSpider           #get mySpider-object from spider module
21 |     print(spiderObj)
22 | 
23 |     crawler = CrawlerRunner(get_project_settings())   #from Scrapy docs
24 |     crawler.crawl(spiderObj) 
25 |     print('start')
26 | 
27 | run_spider('example')


--------------------------------------------------------------------------------
/bilibili/bilibili/spiders/bili.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from scrapy_splash import SplashRequest
 4 | import logging
 5 | # from bilibili.logger import llogger
 6 | # from scrapy import log
 7 | # loggers = llogger(__file__)
 8 | 
 9 | class BiliSpider(scrapy.Spider):
10 |     name = 'ordinary'  # 这个名字就是上面连接中那个启动应用的名字
11 |     allowed_domain = ["bilibili.com"]
12 |     start_urls = [
13 |         "https://www.bilibili.com/"
14 |     ]
15 | 
16 |     def start_requests(self):
17 |         splash_args = {
18 |             'wait': '5',
19 |         }
20 |         for url in self.start_urls:
21 |             yield SplashRequest(url, self.parse_result, args=splash_args, endpoint='render.html')
22 | 
23 |     def parse_result(self, response):
24 |         logging.info('====================================================')
25 |         content = response.xpath("//div[@class='num-wrap']").extract_first()
26 |         logging.info(content)
27 |         logging.info('====================================================')
28 | 
29 | 


--------------------------------------------------------------------------------
/fraud/fraud/model/fraud.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from sqlalchemy import Column, String , DateTime, Integer, Text
 3 | from sqlalchemy.ext.declarative import declarative_base
 4 | from fraud.model import db_config
 5 | import datetime
 6 | 
 7 | Base = declarative_base()
 8 | 
 9 | class Fraud(Base):
10 |     __tablename__ = 'tb_frauds2'
11 | 
12 |     id = Column(Integer, primary_key=True)
13 |     executed_name = Column(String(300))
14 |     gender = Column(String(10))
15 |     age = Column(String(10))
16 |     identity_number = Column(String(50))
17 |     court = Column(String(200))
18 |     province = Column(String(50))
19 |     case_number = Column(String(100))
20 |     performance = Column(String(100))  # 被执行人的履行情况
21 |     disrupt_type_name = Column(Text)  # 失信被执行人行为具体情形
22 |     duty = Column(Text)  # 生效法律文书确定的义务
23 |     release_time = Column(String(50))
24 |     crawl_time = Column(DateTime, default=datetime.datetime.now())
25 |     data_resource = Column(String(50), default='baidu_api')
26 | 
27 | Base.metadata.create_all(db_config.engine)
28 | 


--------------------------------------------------------------------------------
/sz_yaohao/sandbox/models.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/9/26 9:25
 4 | # @File : models.py
 5 | 
 6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE
 7 | from sqlalchemy.ext.declarative import declarative_base
 8 | import datetime
 9 | from sqlalchemy.orm import sessionmaker
10 | from sandbox import config
11 | 
12 | Base = declarative_base()
13 | engine = create_engine('mysql+pymysql://{}:{}@{}:3367/spider?charset=utf8'.format(config.username,config.password,config.mysql_ip))
14 | DBSession = sessionmaker(bind=engine)
15 | 
16 | TABLE_NAME = ''
17 | 
18 | # ORM 模型，根据项目需求修改
19 | class SpiderModels(Base):
20 |     __tablename__ = TABLE_NAME
21 | 
22 |     # 根据项目修改字段
23 |     id = Column(Integer, primary_key=True, autoincrement=True)
24 |     card=Column(Text, comment='卡号')
25 |     accountLength = Column(Text, comment='长度')
26 |     origin = Column(String(30), comment='来源')
27 |     crawltime = Column(DateTime, default=datetime.datetime.now(), comment='抓取时间')
28 | 
29 | 
30 | Base.metadata.create_all(engine)


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/commands/crawlall.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # @Date    : 2019-08-29 16:56:28
 4 | # @Author  : Rocky Chen (weigesysu@qq.com)
 5 | # @Link    : http://30daydo.com
 6 | # @Version : $1.0$
 7 | 
 8 | 
 9 | from scrapy.commands import ScrapyCommand
10 | from scrapy.crawler import CrawlerProcess
11 | class Command(ScrapyCommand):
12 | 
13 |     requires_project = True
14 | 
15 |     def syntax(self):
16 |         return '[options]'
17 | 
18 |     def short_desc(self):
19 |         return 'Runs all of the spiders - My Defined'
20 | 
21 |     def run(self,args,opts):
22 |         print('==================')
23 |         print(type(self.crawler_process))
24 |         spider_list = self.crawler_process.spiders.list()
25 |         # 可以在这里 定义 spider_list = ['example','chouti']
26 |         for name in spider_list:
27 |             print('=================')
28 |             print(name)
29 |             self.crawler_process.crawl(name,**opts.__dict__)
30 | 
31 |         self.crawler_process.start()
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/MyLibrary/login.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import hashlib
 3 | 
 4 | def login_session(username,password):
 5 |     s = bytes(password, encoding='utf8')
 6 |     m = hashlib.md5()
 7 |     m.update(s)
 8 |     first_md5 = m.hexdigest()
 9 |     headers = {'Referer': 'https://www.szlib.org.cn/MyLibrary/Reader-Access.jsp?infomistake=0&eventsite=WWW-044005',
10 |                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
11 |                'X-Requested-With': 'XMLHttpRequest'}
12 | 
13 |     url = 'https://www.szlib.org.cn/MyLibrary/readerLoginM.jsp'
14 |     data = {'rand': '',
15 |             'username': username,
16 |             'password': first_md5,
17 | 
18 |             }
19 |     session=None
20 |     session = requests.Session()
21 | 
22 |     r = session.post(url=url, headers=headers, data=data, timeout=15)
23 |     print(r.text)
24 |     if '<message>OK</message>' in r.text:
25 |         print('Crash !!!')
26 |         print(username)
27 |         print(password)
28 | 
29 |     return session
30 | 


--------------------------------------------------------------------------------
/myubbs/sandbox/models.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/9/26 9:25
 4 | # @File : models.py
 5 | 
 6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE
 7 | from sqlalchemy.ext.declarative import declarative_base
 8 | import datetime
 9 | from sqlalchemy.orm import sessionmaker
10 | from sandbox import config
11 | 
12 | Base = declarative_base()
13 | engine = create_engine('mysql+pymysql://{}:{}@{}:3306/db_rocky?charset=utf8'.format(config.username,config.password,config.mysql_ip))
14 | DBSession = sessionmaker(bind=engine)
15 | 
16 | TABLE_NAME = 'tb_myubbs'
17 | 
18 | # ORM 模型，根据项目需求修改
19 | class SpiderModels(Base):
20 |     __tablename__ = TABLE_NAME
21 | 
22 | 
23 |     # 根据项目修改字段
24 |     id = Column(Integer, primary_key=True, autoincrement=True)
25 |     title = Column(String(400))
26 |     pubdate = Column(DateTime)
27 |     content = Column(Text)
28 |     author = Column(String(100))
29 |     url = Column(String(200))
30 |     crawltime = Column(DateTime, default=datetime.datetime.now(), comment='抓取时间')
31 | 
32 | 
33 | Base.metadata.create_all(engine)


--------------------------------------------------------------------------------
/myubbs/sandbox/headers:
--------------------------------------------------------------------------------
 1 | Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
 2 | Accept-Encoding: gzip, deflate
 3 | Accept-Language: zh-CN,zh;q=0.9
 4 | Cache-Control: no-cache
 5 | Connection: keep-alive
 6 | Cookie: MKG1_2132_saltkey=LnNTIT1F; MKG1_2132_lastvisit=1555164586; UM_distinctid=16a173f60854de-0f1c29779936eb-39395704-144000-16a173f60863cb; CNZZDATA3065925=cnzz_eid%3D1943346629-1555168187-http%253A%252F%252Fwww.myzsu.com%252F%26ntime%3D1555168187; MKG1_2132_seccode=103.e48171c76ce30999a4; MKG1_2132_visitedfid=97; MKG1_2132_st_p=0%7C1555169196%7C31ebb51b6faa73e0deaa417d1878522f; MKG1_2132_viewid=tid_140374; MKG1_2132_st_t=0%7C1555169280%7C31d0f95d5b85fe7f3c5028e0928583bb; MKG1_2132_forum_lastvisit=D_97_1555169280; MKG1_2132_lastact=1555169281%09home.php%09misc; MKG1_2132_sendmail=1
 7 | Host: zsu.myubbs.com
 8 | Pragma: no-cache
 9 | Referer: http://zsu.myubbs.com/forum-97-1.html
10 | Upgrade-Insecure-Requests: 1
11 | User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/spiders/crawl_all_example.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | import time
 3 | class QuotesSpider(scrapy.Spider):
 4 |     name = "quotes"
 5 |     start_urls = ['http://quotes.toscrape.com/tag/humor/']
 6 | 
 7 | 
 8 |     def parse(self, response):
 9 |         time.sleep(15)
10 |         print(f'in spider {self.name}')
11 |         for quote in response.css('div.quote'):
12 |             print(quote.css('span.text::text').extract_first())
13 | 
14 |     def close(self,reason):
15 |         print('===================== spider close ================')
16 | 
17 | class QuotesSpider1(scrapy.Spider):
18 |     name = "quotes_1"
19 |     start_urls = ['http://quotes.toscrape.com/tag/humor/']
20 | 
21 |     def parse(self, response):
22 |         print('meta content ==============')
23 |         print(response.meta)
24 |         print('meta content ==============')
25 | 
26 |         print(f'in spider {self.name}')
27 |         for quote in response.css('div.quote'):
28 |             print(quote.css('span.text::text').extract_first())
29 | 
30 |     def close(self,reason):
31 |         print('===================== spider close ================')
32 | 


--------------------------------------------------------------------------------
/chahaoba/sandbox/models.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/9/26 9:25
 4 | # @File : models.py
 5 | 
 6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE
 7 | from sqlalchemy.ext.declarative import declarative_base
 8 | import datetime
 9 | from sqlalchemy.orm import sessionmaker
10 | from sandbox import config
11 | 
12 | 
13 | Base = declarative_base()
14 | engine = create_engine(
15 |     'mysql+pymysql://{}:{}@{}:3367/spider?charset=utf8'.format(config.username, config.password, config.mysql_ip))
16 | DBSession = sessionmaker(bind=engine)
17 | 
18 | TABLE_NAME = 'chahaoba'
19 | #
20 | #
21 | # # ORM 模型，根据项目需求修改
22 | class SpiderModels(Base):
23 |     __tablename__ = TABLE_NAME
24 | 
25 |     # 根据项目修改字段
26 |     id = Column(Integer, primary_key=True, autoincrement=True)
27 | 
28 |     number = Column(String(11), comment='手机号段')
29 |     city = Column(String(10), comment='城市')
30 |     province = Column(String(10), comment='省份')
31 |     card_type = Column(String(10), comment='手机卡类型')
32 |     op = Column(String(10), comment='运营商')
33 |     card_detail = Column(String(80), comment='卡详细')
34 | 
35 | 
36 | Base.metadata.create_all(engine)
37 | 


--------------------------------------------------------------------------------
/kc0011/sandbox/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | # from sandbox.models import SpiderModels, DBSession
 8 | 
 9 | import logging
10 | import pymongo
11 | from sandbox import settings
12 | from sandbox.items import SpiderItem
13 | 
14 | class MongoPipeline(object):
15 |     def __init__(self):
16 |         self.db = pymongo.MongoClient(settings.MONGO_HOST, port=settings.MONGO_PORT)
17 |         self.doc1 = self.db[settings.MONGODB_DB][settings.MONGODB_DOC]
18 |         self.doc2 = self.db[settings.MONGODB_DB][settings.MONGODB_DOC2]
19 |         try:
20 |             self.doc2.ensure_index('url',unique=True)
21 |         except Exception as e:
22 |             print(e)
23 | 
24 |     def process_item(self, item, spider):
25 |         if isinstance(item,SpiderItem):
26 | 
27 |             insert_item = dict(item)
28 |             self.doc1.insert(insert_item)
29 | 
30 |         else:
31 | 
32 |             insert_item = dict(item)
33 |             self.doc2.insert(insert_item)
34 | 
35 |         return item
36 | 


--------------------------------------------------------------------------------
/chahaoba/sandbox/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | from sandbox.models import SpiderModels, DBSession
 8 | import logging
 9 | import pymongo
10 | import pymysql
11 | from sandbox import config
12 | from sandbox import settings
13 | from scrapy.exceptions import DropItem
14 | 
15 | class SQLPipeline(object):
16 |     def __init__(self):
17 |         self.session = DBSession()
18 | 
19 |     def process_item(self, item, spider):
20 | 
21 |         obj = SpiderModels(
22 |         number=item['_number'],
23 |         city = item['_city'],
24 |         province = item['_province'],
25 |         card_type = item['_card_type'],
26 |         op = item['_op'],
27 |         card_detail = item['_card_detail'],
28 |         )
29 |         self.session.add(obj)
30 | 
31 |         try:
32 |             self.session.commit()
33 | 
34 |         except Exception as e:
35 |             print(e)
36 |             logging.error('>>>> 重复数据')
37 |             self.session.rollback()
38 |             DropItem(item)
39 |         else:
40 |             return item
41 | 
42 | 


--------------------------------------------------------------------------------
/Forbes/main.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | __author__ = 'Rocky'
 4 | '''
 5 | http://30daydo.com
 6 | Email: weigesysu@qq.com
 7 | '''
 8 | import requests
 9 | from lxml import etree
10 | import pymongo
11 | 
12 | db = pymongo.MongoClient('127.0.0.1')
13 | collection = db['forbes']['2017']
14 | 
15 | def getContent(url, retry =5):
16 |     headers = {'User-Agent':'User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}
17 |     for _ in range(retry):
18 |         try:
19 |             r = requests.get(url,headers=headers,timeout=20)
20 |             if r:
21 |                return r
22 |         except Exception,e:
23 |             print e
24 |             continue
25 |     return None
26 | 
27 | def getItem():
28 |     colums = ['number','name','money','enterprise','living']
29 |     r = getContent('http://www.forbeschina.com/review/list/002399.shtml')
30 |     # print r.text
31 |     tree = etree.HTML(r.text)
32 |     items = tree.xpath('//tbody/tr')
33 |     for item in items:
34 |         d = dict(zip(colums,item.xpath('.//td/text()')))
35 |         print d
36 |         collection.insert(d)
37 | 
38 | def main():
39 |     getItem()
40 | 
41 | if __name__ == '__main__':
42 |     main()


--------------------------------------------------------------------------------
/bbssmth/bbssmth/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | import logging
 8 | from logging import log
 9 | from elasticsearch import Elasticsearch
10 | from bbssmth.settings import ES_HOST
11 | 
12 | 
13 | class BbssmthPipeline(object):
14 |     def __init__(self):
15 |         self.index = 'newsmth'
16 |         self.doc = 'doc'
17 |         self.es = Elasticsearch(ES_HOST)
18 | 
19 |     def process_item(self, item, spider):
20 |         body = {
21 |             'title': item.get('title'),
22 |             'url': item.get('url'),
23 |             'content': item.get('content'),
24 |             'author': item.get('author'),
25 |             'crawltime': item.get('crawltime'),
26 |             'reply': item.get('reply'),
27 |             'category': item.get('category'),
28 |             'create_time':item.get('create_time'),
29 | 
30 |         }
31 | 
32 |         try:
33 |             self.es.index(index=self.index, doc_type=self.doc, body=body)
34 |         except Exception as e:
35 |             log.msg('错误 >>>>>')
36 |             log.msg(e)
37 |         return item
38 | 


--------------------------------------------------------------------------------
/52sh/config_file.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # website: http://30daydo.com
 3 | # @Time : 2020/9/24 12:12
 4 | # @File : config_file.py
 5 | 
 6 | START_URL = 'http://www.52sh.com.tw/index.php/main/knowledge/65/page/{page}'
 7 | HEADERS = {
 8 |     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
 9 |     "Accept-Encoding": "gzip, deflate",
10 |     "Accept-Language": "zh,en;q=0.9,en-US;q=0.8,zh-CN;q=0.7",
11 |     "Cache-Control": "no-cache",
12 |     "Cookie": "PHPSESSID=a3oqieou2ik4a987ksq2bm3354; _ga=GA1.3.1399498082.1600914935; _gid=GA1.3.1565426161.1600914935",
13 |     "Host": "www.52sh.com.tw",
14 |     "Pragma": "no-cache",
15 |     "Proxy-Connection": "keep-alive",
16 |     "Referer": "http://www.52sh.com.tw/index.php/main/knowledge/65/page/105",
17 |     "Upgrade-Insecure-Requests": "1",
18 |     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
19 | }
20 | PROXY = {'http': 'http://127.0.0.1:58083'}
21 | PROXY_STR = 'http://127.0.0.1:58083'
22 | SIMPLE_HEADERS = {
23 |     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
24 | }


--------------------------------------------------------------------------------
/51CTOCrawler/demo.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import subprocess
 3 | def demo_validate():
 4 |     url='http://v22.51cto.com/2018/12/19/338483/e899/high/loco_video_323000_{}.ts'
 5 |     for i in range(112):
 6 |         r=requests.get(url.format(i))
 7 |         with open('loco_video_323000_{}.ts'.format(i),'wb') as f:
 8 |             f.write(r.content)
 9 | 
10 | def write_confile(ts_len):
11 |     txt = ''
12 |     for i in range(ts_len):
13 |         txt += "file 'C:\\git\\CrawlMan\\51CTOCrawler\\loco_video_323000_{}.ts'\n".format(i)
14 |     with open('confile.txt', 'w') as fout:
15 |         fout.write(txt)
16 | 
17 | def merge_ts_video(title, v_type='.mp4'):
18 |     cmd = 'ffmpeg -f concat -safe 0 -i confile.txt  -c copy %s%s' %(title, v_type)
19 |     print(cmd)
20 |     p = subprocess.Popen(cmd, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
21 |     out, err = p.communicate()
22 |     print(str(out, 'utf-8'))
23 |     print(str(err, 'utf-8'))
24 | 
25 | def run_cmd():
26 |     import os
27 |     name = 'loco_video_323000_{}.ts'
28 |     args = '+'.join([name.format(i) for i in range(112)])
29 |     cmd = 'copy /b '+args + ' test.ts'
30 |     print(cmd)
31 |     os.system(cmd)
32 | 
33 | # demo_validate()
34 | write_confile(112)
35 | merge_ts_video('wanttoplay')
36 | #run_cmd()


--------------------------------------------------------------------------------
/poi_gaode/sandbox/models.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/9/26 9:25
 4 | # @File : models.py
 5 | 
 6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE
 7 | from sqlalchemy.ext.declarative import declarative_base
 8 | import datetime
 9 | from sqlalchemy.orm import sessionmaker
10 | from sandbox import config
11 | 
12 | Base = declarative_base()
13 | engine = create_engine('mysql+pymysql://{}:{}@{}:3367/spider?charset=utf8'.format(config.username,config.password,config.mysql_ip))
14 | DBSession = sessionmaker(bind=engine)
15 | 
16 | TABLE_NAME = 'card_bin_scrapy'
17 | 
18 | # ORM 模型，根据项目需求修改
19 | class SpiderModels(Base):
20 |     __tablename__ = TABLE_NAME
21 | 
22 | 
23 |     # 根据项目修改字段
24 |     id = Column(Integer, primary_key=True, autoincrement=True)
25 |     card=Column(Text, comment='卡号')
26 |     accountLength = Column(Text, comment='长度')
27 |     cardName = Column(Text, comment='卡名')
28 |     cardType = Column(Text, comment='卡类型')
29 |     mainAccount = Column(Text, comment='主账号')
30 |     mainValue = Column(Text, comment='主账号值')
31 |     orgName = Column(Text, comment='发卡行')
32 | 
33 |     origin = Column(String(30), comment='来源')
34 |     crawltime = Column(DateTime, default=datetime.datetime.now(), comment='抓取时间')
35 | 
36 | 
37 | Base.metadata.create_all(engine)


--------------------------------------------------------------------------------
/fraud/fraud/match.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import division
 3 | from model.fraud import Fraud
 4 | from model.db_config import DBSession, RedisPool
 5 | import sys
 6 | 
 7 | reload(sys)
 8 | sys.setdefaultencoding('utf8')
 9 | f = open("id_name.txt")
10 | line = f.readline()
11 | total_num, match_num, name_match_num = [0, 0, 0]
12 | 
13 | session = DBSession()
14 | r_pool = RedisPool(client_db=1)
15 | r = r_pool.redis_pool()
16 | while line:
17 |     id_num = line[0:18]
18 |     formatted_id_num = id_num[0:11] + '*' * 4 + id_num[14:]
19 |     # print line
20 |     name = line[19:-1].strip()
21 |     try:
22 |         fraud_info = session.query(Fraud).filter_by(identity_number=formatted_id_num).first()
23 |     except:
24 |         session.rollback()
25 |     if fraud_info:
26 |         match_num += 1
27 |         if name.encode('gb2312') == fraud_info.executed_name.encode('gb2312'):
28 |             name_match_num += 1
29 |         else:
30 |             r.set(fraud_info.identity_number, 1)
31 |     total_num += 1
32 |     line = f.readline()
33 | 
34 | f.close()
35 | session.close()
36 | print('样本总量：%s' % total_num)
37 | print('匹配成功数量：%s' % match_num)
38 | print('匹配率：%s' % ((match_num/total_num) * 100), '%')
39 | print('姓名身份证号匹配成功个数：%s' % name_match_num)
40 | print('姓名身份证号匹配率：%s' % ((name_match_num/match_num) * 100), '%')
41 | 
42 | 


--------------------------------------------------------------------------------
/MyLibrary/sandbox/sandbox/models.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/9/26 9:25
 4 | # @File : models.py
 5 | 
 6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE
 7 | from sqlalchemy.ext.declarative import declarative_base
 8 | import datetime
 9 | from sqlalchemy.orm import sessionmaker
10 | from sandbox import config
11 | 
12 | # Base = declarative_base()
13 | # engine = create_engine('mysql+pymysql://{}:{}@{}:3367/spider?charset=utf8'.format(config.username,config.password,config.mysql_ip))
14 | # DBSession = sessionmaker(bind=engine)
15 | #
16 | # TABLE_NAME = 'card_bin_scrapy'
17 | #
18 | # # ORM 模型，根据项目需求修改
19 | # class SpiderModels(Base):
20 | #     __tablename__ = TABLE_NAME
21 | #
22 | #
23 | #     # 根据项目修改字段
24 | #     id = Column(Integer, primary_key=True, autoincrement=True)
25 | #     card=Column(Text, comment='卡号')
26 | #     accountLength = Column(Text, comment='长度')
27 | #     cardName = Column(Text, comment='卡名')
28 | #     cardType = Column(Text, comment='卡类型')
29 | #     mainAccount = Column(Text, comment='主账号')
30 | #     mainValue = Column(Text, comment='主账号值')
31 | #     orgName = Column(Text, comment='发卡行')
32 | #     # origin = Column(String(30), comment='来源')
33 | #     crawtime = Column(DateTime, default=datetime.datetime.now(), comment='抓取时间')
34 | #
35 | #
36 | # Base.metadata.create_all(engine)


--------------------------------------------------------------------------------
/github_star/star.py:
--------------------------------------------------------------------------------
 1 | import sys, json, os, requests
 2 | 
 3 | if len(sys.argv) < 2 or len(sys.argv[1]) == 0:
 4 |     print('Check your GitHub ID ...\n demo :\n python github_counter.py rockyzsu')
 5 |     exit()
 6 | 
 7 | print('Search...')
 8 | github_id = sys.argv[1]
 9 | url = 'https://api.github.com/users/{github_id}/repos?page={page_id}'
10 | repo_list = []
11 | page_id = 1
12 | while True:
13 |     r = requests.get(url.format(github_id=github_id, page_id=page_id))
14 |     if r.status_code != 200:
15 |         print('check your network connections')
16 |         exit()
17 | 
18 |     repo_array = json.loads(r.content.decode('utf-8'))
19 |     if len(repo_array) == 0:
20 |         break
21 | 
22 |     for repo in repo_array:
23 |         if not repo['fork']:
24 |             repo_list.append([repo['name'], repo['stargazers_count'], repo['forks_count'],'' if repo['description'] is None else repo['description']])
25 |     page_id += 1
26 | 
27 | # sort by number of stars
28 | repo_list = sorted(repo_list, key=lambda x: x[1], reverse=True)
29 | 
30 | print('=' * 55)
31 | print('\n'.join(['{: <30}★{: <10}\tfork {:<10}\t{:<30} '.format(*repo) for repo in repo_list]))
32 | print('=' * 55)
33 | print('{: <30}★{: <10}\tfork {} '.format('total', sum([i[1] for i in repo_list]), sum([i[2] for i in repo_list])))
34 | print('='*55)
35 | print('{:<30}\t{:<30}'.format('total_repo_count',len(repo_list)))
36 | 


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/monitor/app.py:
--------------------------------------------------------------------------------
 1 | # *-* coding:utf-8 *-*
 2 | '''
 3 | @author: ioiogoo
 4 | @date: 2016/12/25 15:00
 5 | '''
 6 | import json
 7 | from flask import Flask, render_template, jsonify, request, current_app
 8 | import redis
 9 | from settings import *
10 | 
11 | app = Flask(__name__)
12 | 
13 | 
14 | @app.route('/')
15 | def index():
16 |     return render_template('index.html', timeinterval=TIMEINTERVAL, stats_keys=STATS_KEYS)
17 | 
18 | 
19 | @app.route('/ajax')
20 | def ajax():
21 |     key = request.args.get('key')
22 |     result = current_app.r.lrange(key, -POINTLENGTH, -1)[::POINTINTERVAL]
23 |     if not current_app.spider_is_run:
24 |         # spider is closed
25 |         return json.dumps(result), 404
26 |     return json.dumps(result)
27 | 
28 | 
29 | @app.route('/signal')
30 | def signal():
31 |     signal = request.args.get('sign')
32 |     if signal == 'closed':
33 |         current_app.spider_is_run = False
34 |     elif signal == 'running':
35 |         current_app.spider_is_run = True
36 |     return jsonify('')
37 | 
38 | 
39 | @app.before_first_request
40 | def init():
41 |     current_app.r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB,decode_responses=True)
42 |     current_app.spider_is_run = True if current_app.r.get('spider_is_run') == '1' else False
43 | 
44 | 
45 | if __name__ == '__main__':
46 |     app.run(debug=True, host=APP_HOST, port=APP_PORT)
47 | 


--------------------------------------------------------------------------------
/cuiqingcai/async_sandbox/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | import pymysql
 8 | from twisted.enterprise import adbapi
 9 | import logging
10 | class AsyncSQLPipeline(object):
11 |     def __init__(self):
12 |         self.dbpool = adbapi.ConnectionPool('pymysql',host='',port='',user='',password='',db='spider')
13 |         # self.cursor = self.conn.cursor()
14 | 
15 |     def process_item(self, item, spider):
16 |         update_=self.dbpool.runInteraction(self.update,item)
17 |         update_.addErrback(self.handle_error,item,spider)
18 | 
19 |         return item
20 | 
21 |     def update(self,cursor,item):
22 |         insert_sql = 'insert into tb_cuiqingcai (category,title,article_url,content,author,created_at,liked,visited,comment,crawltime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
23 |         data=(item['category'],item['title'],item['article_url'],item['content'],item['author'],item['created_at'],item['liked'],item['visited'],item['comment'],item['crawltime']
24 |               )
25 |         cursor.execute(insert_sql,data)
26 | 
27 |     def handle_error(self,failure,item,spider):
28 |         logging.error('写入数据库异常--->')
29 |         logging.error(failure)
30 |         logging.error('error item')
31 |         logging.error(item)


--------------------------------------------------------------------------------
/jd/switch_ip.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2020/3/30 21:50
 4 | # @File : switch_ip.py
 5 | 
 6 | import os
 7 | import time
 8 | from config import AD_PASSWORD, AD_USER
 9 | 
10 | g_adsl_account = {"name": "adsl",  # 这个可以随意写 下面user和pwd 账号密码
11 |                   "username": AD_USER,
12 |                   "password": AD_PASSWORD}
13 | 
14 | 
15 | class ADSL(object):
16 | 
17 |     def __init__(self):
18 |         self.name = g_adsl_account["name"]
19 |         self.username = g_adsl_account["username"]
20 |         self.password = g_adsl_account["password"]
21 | 
22 |     # set_adsl : 修改adsl设置
23 | 
24 |     def set_adsl(self, account):
25 |         self.name = account["name"]
26 |         self.username = account["username"]
27 |         self.password = account["password"]
28 | 
29 |         # connect : 宽带拨号
30 | 
31 |     def connect(self):
32 |         cmd_str = "rasdial %s %s %s" % (self.name, self.username, self.password)
33 |         os.system(cmd_str)
34 |         time.sleep(5)
35 | 
36 |         # disconnect : 断开宽带连接
37 | 
38 |     def disconnect(self):
39 |         cmd_str = "rasdial %s /disconnect" % self.name
40 |         os.system(cmd_str)
41 |         time.sleep(5)
42 | 
43 |         # reconnect : 重新进行拨号
44 | 
45 |     def reconnect(self):
46 |         print('自动拨号')
47 |         self.disconnect()
48 |         self.connect()
49 | 
50 | 
51 | if __name__ == '__main__':
52 |     a = ADSL()
53 |     a.reconnect()
54 | 


--------------------------------------------------------------------------------
/qianfangyiguan/qianfan_models.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import re
 3 | from sqlalchemy import create_engine
 4 | from sqlalchemy.orm import sessionmaker, relationship
 5 | from sqlalchemy.ext.declarative import declarative_base
 6 | from sqlalchemy import Column, String, DateTime, Integer, Text, ForeignKey, Float
 7 | from sqlalchemy import event
 8 | from sqlalchemy import DDL
 9 | 
10 | engine = create_engine('mysql+pymysql://root:@localhost:3306/db_parker?charset=utf8')
11 | DBSession = sessionmaker(bind=engine)
12 | Base = declarative_base()
13 | 
14 | 
15 | class Apps(Base):
16 |     __tablename__ = 'tb_apps3'
17 |     id = Column(Integer, primary_key=True)
18 |     app_rank = Column(Integer, index=True)
19 |     appName = Column(String(150), index=True)
20 |     developCompanyFullName = Column(String(180),index=True)
21 |     second_cateName = Column(String(150))
22 |     first_cateName = Column(String(150))
23 |     appId = Column(String(150))
24 |     activeNums = Column(Float)
25 |     activeAvgDay = Column(Float)
26 |     runtimeAvgDay = Column(Float)
27 |     runtimeAvgPersonRatio = Column(Float)
28 |     activeAvgDayRatio = Column(Float)
29 |     runtimeNums = Column(Float)
30 |     launchNums = Column(Float)
31 |     runtimeNumsRatio = Column(Float)
32 |     launchAvgDayRatio = Column(Float)
33 |     statDate = Column(DateTime)
34 |     developCompanyAbbr = Column(String(180))
35 | 
36 | 
37 | Base.metadata.create_all(engine)
38 | 


--------------------------------------------------------------------------------
/dashiye/main.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2020/4/26 20:20
 4 | # @File : main.py
 5 | 
 6 | import requests
 7 | import numpy as np
 8 | 
 9 | 
10 | code = input('请输入股票代码：')
11 | 
12 | cookies = {
13 |     'PHPSESSID': 'jqb0q4h60h4bmtj5bkd9bjuv00',
14 |     'Hm_lvt_210e7fd46c913658d1ca5581797c34e3': '1587903421',
15 |     'Hm_lpvt_210e7fd46c913658d1ca5581797c34e3': '1587903461',
16 | }
17 | 
18 | headers = {
19 |     'Origin': 'http://www.dashiyetouzi.com',
20 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
21 |     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
22 |     'Accept': 'application/json, text/javascript, */*; q=0.01',
23 |     'X-Requested-With': 'XMLHttpRequest',
24 |     'Referer': 'http://www.dashiyetouzi.com/tools/compare/historical_valuation.php',
25 | }
26 | 
27 | data = {
28 |   'report_type': 'totalValue',
29 |   'report_stock_id': code,
30 |   'from_date': '2015-04-26',
31 |   'to_date': '2020-04-26'
32 | }
33 | 
34 | response = requests.post('http://www.dashiyetouzi.com/tools/compare/historical_valuation_data.php', headers=headers, cookies=cookies, data=data, verify=False)
35 | js=response.json()
36 | data=js.get('list')
37 | all_point=[]
38 | for item in data:
39 |     all_point.append(item[1])
40 | 
41 | 
42 | np_data = np.array(all_point)
43 | print(f'中值：{np.median(np_data)}')
44 | print(f'最小值：{np.min(np_data)}')
45 | 


--------------------------------------------------------------------------------
/myubbs/sandbox/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | from sandbox.models import SpiderModels, DBSession
 8 | import logging
 9 | import pymongo
10 | from sandbox import config
11 | from sandbox import settings
12 | 
13 | class SQLPipeline(object):
14 |     def __init__(self):
15 |         self.session = DBSession()
16 | 
17 |     def process_item(self, item, spider):
18 | 
19 |         obj = SpiderModels(
20 |         title=item['title'],
21 |         pubdate = item['pubdate'],
22 |         content = item['content'],
23 |         author = item['author'],
24 |         url = item['url'],
25 |             crawltime=item['crawltime'],
26 |         )
27 |         self.session.add(obj)
28 | 
29 |         try:
30 |             self.session.commit()
31 | 
32 |         except Exception as e:
33 |             self.session.rollback()
34 |             logging.error('>>>> 插入数据库失败{}'.format(e))
35 |         return item
36 | 
37 | 
38 | class MongoPipeline(object):
39 |     def __init__(self):
40 |         DOCUMENT = setting.MONGODB_DOC
41 |         self.db = pymongo.MongoClient(config.mongo_ip, port=27018)
42 |         self.doc = self.db['spider'][DOCUMENT]
43 | 
44 |     def process_item(self, item, spider):
45 |         insert_item = dict(item)
46 |         self.doc.insert(insert_item)
47 | 
48 |         return item
49 | 


--------------------------------------------------------------------------------
/MyLibrary/sandbox/sandbox/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | # from sandbox.models import SpiderModels, DBSession
 8 | import logging
 9 | import pymongo
10 | from sandbox import config
11 | 
12 | 
13 | # class SQLPipeline(object):
14 | #     def __init__(self):
15 | #         self.session = DBSession()
16 | #
17 | #     def process_item(self, item, spider):
18 | #
19 | #         obj = SpiderModels(
20 | #             card=item['card'],
21 | #             accountLength=item['accountLength'],
22 | #             cardName=item['cardName'],
23 | #             cardType=item['cardType'],
24 | #             mainAccount=item['mainAccount'],
25 | #             mainValue=item['mainValue'],
26 | #             orgName=item['orgName'],
27 | #         )
28 | #         self.session.add(obj)
29 | #
30 | #         try:
31 | #             self.session.commit()
32 | #
33 | #         except Exception as e:
34 | #             logging.error('>>>> 插入数据库失败{}'.format(e))
35 | #         return item
36 | 
37 | 
38 | class MongoPipeline(object):
39 |     def __init__(self):
40 |         DOCUMENT = 'szlib'
41 |         self.db = pymongo.MongoClient(config.mongo_ip, port=config.mongo_port)
42 |         self.doc = self.db['spider'][DOCUMENT]
43 | 
44 |     def process_item(self, item, spider):
45 |         self.doc.insert(dict(item))
46 |         return item
47 | 


--------------------------------------------------------------------------------
/lanrentingshu/lrts/lrts/spiders/tingshu.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from scrapy import Request
 4 | 
 5 | class TingshuSpider(scrapy.Spider):
 6 |     name = 'tingshu'
 7 | 
 8 |     # allowed_domains = ['www.lrts.me']
 9 |     # start_urls = ['http://www.lrts.me/']
10 | 
11 |     def start_requests(self):
12 |         headers = {'Host': 'www.lrts.me', 'Proxy-Connection': 'keep-alive', 'Accept': '*/*',
13 |                    'X-Requested-With': 'XMLHttpRequest',
14 |                    'User-Agent': 'Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/65.0.3325.162Safari/537.36',
15 |                    'Referer': 'http://www.lrts.me/playlist', 'Accept-Encoding': 'gzip,deflate',
16 |                    'Accept-Language': 'zh-CN,zh;q=0.9',
17 |                    'Cookie': 'aliyungf_tc=AQAAAF1znybVVQsAByAmG3Fs/DLq2DNK;CNZZDATA1254668430=264272103-1533047311-null%7C1533047311;Hm_lvt_ada61571fd48bb3f905f5fd1d6ef0ec4=1533051241;uid=1533051247919aea3a93a713a48c4a8d2221a0db33cc5;JSESSIONID=472B70BC34B8D0027B3B20AAE935E662;Hm_lpvt_ada61571fd48bb3f905f5fd1d6ef0ec4=1533051318'}
18 | 
19 |         url = 'http://www.lrts.me/ajax/playlist/2/6458'
20 |         yield Request(url=url,headers=headers)
21 | 
22 |     def parse(self, response):
23 |         download_list = response.xpath('//input[@name="source"]/@value').extract()
24 |         item={}
25 |         item['file_urls']=[]
26 |         for each in download_list:
27 |             item['file_urls'].append(each)
28 |         yield item
29 | 


--------------------------------------------------------------------------------
/lanrentingshu/lanrentingshu.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | import urllib
 3 | 
 4 | import os
 5 | import requests
 6 | import time
 7 | from lxml import etree
 8 | from header_toolkit import getheader
 9 | 
10 | 
11 | def spider():
12 |     curr=os.getcwd()
13 |     target_dir=os.path.join(curr,'data')
14 |     if not os.path.exists(target_dir):
15 |         os.mkdir(target_dir)
16 |     for i in range(1, 100, 10):
17 |         url = 'http://www.lrts.me/ajax/playlist/2/32551/%d' % i
18 |         headers = {
19 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
20 |         s = requests.get(url=url, headers=headers)
21 |         tree = etree.HTML(s.text)
22 |         nodes = tree.xpath('//*[starts-with(@class,"clearfix section-item section")]')
23 |         print len(nodes)
24 |         for node in nodes:
25 |             filename = node.xpath('.//div[@class="column1 nowrap"]/span/text()')[0]
26 |             link = node.xpath('.//input[@name="source" and @type="hidden"]/@value')[0]
27 | 
28 |             print link
29 |             post_fix=link.split('.')[-1]
30 |             full_path= filename+'.'+post_fix
31 |             filename = os.path.join(target_dir, full_path)
32 |             # 修改这一段，多线程下载
33 |             if not os.path.isfile(filename):
34 |                 urllib.urlretrieve(link, filename)
35 |                 time.sleep(1)
36 |             else:
37 |                 continue
38 | 
39 | 
40 | if __name__ == '__main__':
41 |     spider()
42 | 


--------------------------------------------------------------------------------
/pornhub/newJs.js:
--------------------------------------------------------------------------------
1 | var quality_1080p =/* + radra27radra27 + */rahttpsra83rahttpsra83 + /* + rancomvira35rancomvira35 + */raevphncdra57raevphncdra57 + /* + radra27radra27 + */rancomvira35rancomvira35 + /* + ra006163ra73ra006163ra73 + */radeos202ra16radeos202ra16 + /* + ra09ratera79ra09ratera79 + */ra006163ra73ra006163ra73 + /* + ra1080p4ra73ra1080p4ra73 + */ra24075351ra94ra24075351ra94 + /* + raroiu6qra26raroiu6qra26 + */ra1080p4ra73ra1080p4ra73 + /* + ra000k324ra70ra000k324ra70 + */ra000k324ra70ra000k324ra70 + /* + rancomvira35rancomvira35 + */ra075351mra26ra075351mra26 + /* + ravalidtora49ravalidtora49 + */rap4validra25rap4validra25 + /* + ra209hashra72ra209hashra72 + */rafrom160ra56rafrom160ra56 + /* + ra1080p4ra73ra1080p4ra73 + */ra6708909ra29ra6708909ra29 + /* + ra209hashra72ra209hashra72 + */ravalidtora49ravalidtora49 + /* + ramgdmctbvra11ramgdmctbvra11 + */ra16067161ra17ra16067161ra17 + /* + ra24075351ra94ra24075351ra94 + */ra09ratera79ra09ratera79 + /* + ra50000kbra49ra50000kbra49 + */ra50000kbra49ra50000kbra49 + /* + ramgdmctbvra11ramgdmctbvra11 + */raurst500ra63raurst500ra63 + /* + ra209hashra72ra209hashra72 + */ra00kip4ra41ra00kip4ra41 + /* + raroiu6qra26raroiu6qra26 + */ra72419ra91ra72419ra91 + /* + ra09ratera79ra09ratera79 + */ra209hashra72ra209hashra72 + /* + raro7upu3ra66raro7upu3ra66 + */raroiu6qra26raroiu6qra26 + /* + ra075351mra26ra075351mra26 + */ra2bmkdz7nra36ra2bmkdz7nra36 + /* + ra50000kbra49ra50000kbra49 + */ramgdmctbvra11ramgdmctbvra11 + /* + radeos202ra16radeos202ra16 + */raro7upu3ra66raro7upu3ra66 + /* + ra075351mra26ra075351mra26 + */radra27radra27;
2 | 


--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
 1 | ## 爬虫合集
 2 | * 51CTOCrawler: 爬取51CTO的视频，并通过ffmpeg合并
 3 | * 51jbnet: 51脚本内容爬取
 4 | * 52sh: 台湾52社区网站 妹子图片爬取
 5 | * anjuke:安居客爬虫
 6 | * async_cuiqingcai: 异步爬取崔庆才博客内容
 7 | * baiduwanpan: 暴力破解百度网盘密码
 8 | * bbssmth：水木清华爬虫
 9 | * bilibili:bilibili视频抓取 
10 | * chahaoba: 查号吧 遍历所有手机号码归属地
11 | * chinaclear: 中登网开户人数爬取
12 | * cnbeta: cnbeta爬虫
13 | * csdn:csdn博客排名抓取
14 | * cuiqingcai：崔庆才博客爬取
15 | * dfcf: 东方财富股吧爬取，爬取所有个股的股吧帖子，可通过参数控制爬取指定日期
16 | * enterprise: 爬取工商企业数据
17 | * Ergeduoduo:儿歌多多 [http://30daydo.com/article/236](http://30daydo.com/article/236)
18 | * Forbes:福布斯排名爬虫
19 | * fraud: 失信被执行人爬取
20 | * github_star: 获取github某个人的所有仓库,星星总数
21 | * htqyy: 好听轻音乐 爬取轻音乐mp3
22 | * jd:京东图书爬取
23 | * kc0011:投资咨询网
24 | * lanrentingshu:每天心理学 (懒人听书)[http://30daydo.com/article/231](http://30daydo.com/article/231)
25 | * MyLibrary:图书馆抓取个人的阅读记录
26 | * pornhub: p站视频下载
27 | * poi_gaode:根据经纬度范围,在高德地图上遍历数据
28 | * qianfangyiguan:千帆易观数据抓取
29 | * szhouse: 深圳房价官网爬取
30 | * tiexue: 军事网站 铁血网内容爬取
31 | * stockholder:股东数据抓取
32 | * tencentjob:腾讯工作岗位爬取
33 | * ximalaya:喜马拉雅音频爬取 [http://30daydo.com/article/503](http://30daydo.com/article/503)
34 | * yinyonbao:应用宝app排名数据抓取
35 | * youdao_dictionary:有道词典js加密破解 [http://30daydo.com/article/416](http://30daydo.com/article/416)
36 | * zhihu:知乎分布式爬取
37 | ### QA疑问 
38 | 代码库代码均已通过本人爬取测试可行，如果你使用本代码遇到问题，可邮件咨询。
39 | **上述只是本人代码库中部分展示代码，还有大量的非公开爬虫代码如：国家工商系统爬虫，淘宝网等，可以联系本人提供**
40 | **同时本人也承接各类爬虫业务** 
41 | 
42 | 或者爬虫的朋友可以加QQ群，一起分享项目。
43 | 
44 | 
45 | 
46 | ###### 做爬虫的朋友经常会遇到别人已经爬过的网站，然后自己刚好需要，互补有无，提高效率。
47 | 
48 | ###### QQ群：759746506
49 | 
50 | 
51 | 
52 | 公众号:
53 | 
54 | ![appMarket](https://s3.ax1x.com/2021/01/05/skCVPJ.jpg)


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/monitor/statscol.py:
--------------------------------------------------------------------------------
 1 | # *-* coding:utf-8 *-*
 2 | '''
 3 | @author: ioiogoo
 4 | @date: 2016/12/25 16:50
 5 | '''
 6 | 
 7 | import redis
 8 | from .settings import STATS_KEYS
 9 | import time
10 | import requests
11 | import json
12 | r = redis.Redis(host='10.18.6.46', port=6379, db=0,decode_responses=True)
13 | Time = lambda: time.strftime('%Y-%m-%d %H:%M:%S')
14 | 
15 | 
16 | class StatcollectorMiddleware(object):
17 |     def __init__(self):
18 |         self.r = redis.Redis(host='10.18.6.46', port=6379, db=0,decode_responses=True)
19 |         self.stats_keys = STATS_KEYS
20 | 
21 |     def process_request(self, request, spider):
22 |         self.formatStats(spider.crawler.stats.get_stats())
23 | 
24 |     def formatStats(self, stats):
25 |         for key in self.stats_keys:
26 |             key_value = stats.get(key, None)
27 |             if not key_value: continue
28 |             value = {"value": [Time(), key_value]}
29 |             content = json.dumps(value)
30 |             print(f'key content {key}')
31 |             print(f'value -->{content}')
32 |             self.insert2redis(key, content)
33 | 
34 |     def insert2redis(self, key, value):
35 |         self.r.rpush(key, value)
36 | 
37 | 
38 | class SpiderRunStatspipeline(object):
39 |     def open_spider(self, spider):
40 |         print('open SpiderRunStatspipeline')
41 |         r.set('spider_is_run', 1)
42 |         requests.get('http://127.0.0.1:5000/signal?sign=running')
43 | 
44 |     def close_spider(self, spider):
45 |         r.set('spider_is_run', 0)
46 |         requests.get('http://127.0.0.1:5000/signal?sign=closed')


--------------------------------------------------------------------------------
/weibo/weibo/spiders/wb.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from scrapy import Spider, FormRequest, Request
 3 | 
 4 | 
 5 | class WbSpider(Spider):
 6 |     name = 'wb'
 7 | 
 8 |     headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
 9 |                'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Cache-Control': 'no-cache',
10 |                'Connection': 'keep-alive',
11 |                # 'Cookie': 'ALF=1539744188;SCF=Arejsw06Aa86L7rLsj3RRh8YiCul1z1Yapy6v1kQNGNbjcNLV3LPZbziAEtRKYVOAL_s5JKT2rck3tB7VAtepd4.;SUB=_2A252m2dXDeRhGedH7lcT8y7Fwj-IHXVSZAkfrDV6PUJbktAKLRejkW1NUKTAOGny8CQfH8IlGwCeP72gG_Pf_dFi;SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWIFwD6xpqyuh9_mA2jr6on5JpX5K-hUgL.Fo24SK-Ee0541Ke2dJLoI7LCdcSuwHvAMN-t;SUHB=0Ryruv0xgZvGM5;SSOLoginState=1537152775;_T_WM=ae5298708cece22521d281346fac7744',
12 |                'Host': 'weibo.cn', 'Pragma': 'no-cache',
13 |                'Referer': 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword=000001&page=2',
14 |                'Upgrade-Insecure-Requests': '1',
15 |                'User-Agent': 'Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36'}
16 | 
17 |     def start_requests(self):
18 |         keyword = '000001'
19 |         for page in range(1, 2):
20 |             url = 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword=000001&page=1'
21 |             yield Request(url=url, headers=self.headers)
22 | 
23 |     def parse(self, response):
24 |         # print(response.text)
25 |         response.xpath('//div[@class="c" and contains(@id,"M_")]')


--------------------------------------------------------------------------------
/sz_yaohao/sandbox/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | from sandbox.models import SpiderModels, DBSession
 8 | import logging
 9 | import pymongo
10 | from sandbox import config
11 | from sandbox import settings
12 | 
13 | class SQLPipeline(object):
14 |     def __init__(self):
15 |         self.session = DBSession()
16 | 
17 |     def process_item(self, item, spider):
18 | 
19 |         obj = SpiderModels(
20 |             card=item['card'],
21 |             accountLength=item['accountLength'],
22 |             cardName=item['cardName'],
23 |             cardType=item['cardType'],
24 |             mainAccount=item['mainAccount'],
25 |             mainValue=item['mainValue'],
26 |             orgName=item['orgName'],
27 |             origin=item['origin'],
28 |             crawltime=item['crawltime'],
29 |         )
30 |         self.session.add(obj)
31 | 
32 |         try:
33 |             self.session.commit()
34 | 
35 |         except Exception as e:
36 |             logging.error('>>>> 插入数据库失败{}'.format(e))
37 |         return item
38 | 
39 | 
40 | class MongoPipeline(object):
41 |     def __init__(self):
42 |         DOCUMENT = settings.MONGODB_DOC
43 |         self.db = pymongo.MongoClient(settings.MONGO_HOST, port=settings.MONGO_PORT)
44 |         self.doc = self.db['spider'][DOCUMENT]
45 | 
46 |     def process_item(self, item, spider):
47 |         insert_item = dict(item)
48 |         self.doc.insert(insert_item)
49 | 
50 |         return item
51 | 


--------------------------------------------------------------------------------
/poi_gaode/gaode_map.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2018/12/6 10:39
 4 | # @File : gaode_map.py
 5 | import requests
 6 | from math import radians, cos, sin, asin, sqrt
 7 | import config
 8 | import json
 9 | 
10 | def demo():
11 |     key=config.key
12 |     url =f'https://restapi.amap.com/v3/place/polygon?polygon=116.460988,40.006919|116.48231,40.007381|116.47516,39.99713|116.472596,39.985227|116.45669,39.984989|116.460988,40.006919&keywords=kfc&output=json&key={key}'
13 |     r = requests.get(url)
14 |     print(r.json())
15 | 
16 | def haversine(lon1, lat1, lon2, lat2):  # 经度1，纬度1，经度2，纬度2 （十进制度数）
17 |     """
18 |     Calculate the great circle distance between two points
19 |     on the earth (specified in decimal degrees)
20 |     """
21 |     # 将十进制度数转化为弧度
22 |     lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
23 | 
24 |     # haversine公式
25 |     dlon = lon2 - lon1
26 |     dlat = lat2 - lat1
27 |     a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
28 |     c = 2 * asin(sqrt(a))
29 |     r = 6371  # 地球平均半径，单位为公里
30 | 
31 |     return c * r * 1000
32 | 
33 | 
34 | def long_lati_change():
35 |     lbs = [(22.7100061372,113.7915802002),
36 |      (22.7866273171,114.3717956543),
37 |      (22.5404642212,113.9189529419),
38 |      (22.5487084710,114.2375564575),
39 |      (22.6586902908,114.2598724365),
40 |            ]
41 |     for i in lbs:
42 |         print(f'{i[1]},{i[0]}|',end='')
43 | # demo()
44 | # 114.04308499999999,22.527853|114.04808499999999,22.522853
45 | lati1,long1=22.527853,114.04308499999999
46 | lati2,long2=22.522853,114.04808499999999
47 | print(haversine(long1,lati1,long2,lati2))
48 | # long_lati_change()
49 | 
50 | 
51 |             
52 | 


--------------------------------------------------------------------------------
/tencentjob/tencentjob/spiders/tencent.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import re
 3 | 
 4 | import scrapy
 5 | from scrapy.linkextractors import LinkExtractor
 6 | from scrapy.spiders import CrawlSpider, Rule
 7 | from tencentjob.items import TencentjobItem
 8 | 
 9 | 
10 | class TencentSpider(CrawlSpider):
11 |     name = 'tencent'
12 |     allowed_domains = ['tencent.com']
13 |     start_urls = ['https://hr.tencent.com/position.php']
14 |     rules = [
15 |         # 多个条件
16 |         Rule(LinkExtractor(allow=("start=\d+"))),
17 |         Rule(LinkExtractor(allow=("position_detail\.php")), follow=True, callback='parse_item')
18 |     ]
19 | 
20 |     def parse_item(self, response):
21 |         item = TencentjobItem()
22 | 
23 |         title = response.xpath('//*[(@id = "sharetitle")]/text()').extract_first()
24 |         workLocation = response.xpath('//*[@class="lightblue l2"]/../text()').extract_first()
25 |         catalog = response.xpath('//*[@class="lightblue"]/../text()').extract_first()
26 |         recruitNumber = response.xpath('//*[@class="lightblue"]/../text()').re('(\d+)')[0]
27 |         duty_pre = response.xpath('//*[@class="squareli"]').extract_first()
28 |         duty = re.sub('<.*?>', '', duty_pre)
29 | 
30 |         Job_requirement_pre = response.xpath('//*[@class="squareli"]').extract_first()
31 |         Job_requirement = re.sub('<.*?>', '', Job_requirement_pre)
32 | 
33 |         item['title'] = title
34 |         item['url'] = response.url
35 |         item['workLocation'] = workLocation
36 |         item['catalog'] = catalog
37 |         item['recruitNumber'] = recruitNumber
38 |         item['duty'] = duty
39 |         item['Job_requirement'] = Job_requirement
40 | 
41 |         yield item
42 | 


--------------------------------------------------------------------------------
/szhouse/database.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | __author__ = 'Rocky'
 3 | import sqlite3
 4 | 
 5 | def create_table():
 6 |     conn = sqlite3.connect('shenzhen_house.db')
 7 |     try:
 8 |         create_tb_cmd='''
 9 |         CREATE TABLE IF NOT EXISTS HOUSE
10 |         ('日期' TEXT,
11 |         '一手房套数' TEXT,
12 |         '一手房面积' TEXT,
13 |         '二手房套数' TEXT,
14 |         '二手房面积' TEXT);
15 |         '''
16 |         #主要就是上面的语句
17 |         conn.execute(create_tb_cmd)
18 |     except:
19 |         print("Create table failed")
20 |         return False
21 | 
22 | 
23 |     conn.execute(create_tb_cmd)
24 |     conn.commit()
25 |     conn.close()
26 | 
27 | def insert(date,one_hand,one_area,second_hand,second_area):
28 |     conn = sqlite3.connect('shenzhen_house.db')
29 |     print("open database passed")
30 | 
31 |     cmd="INSERT INTO HOUSE ('日期','一手房套数','一手房面积','二手房套数','二手房面积') VALUES('%s','%s','%s','%s','%s');" %(date,one_hand,one_area,second_hand,second_area)
32 |     #works 要么加\"
33 |     #paul_su="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(5,'%s',32,'CALIFORNIA',2000.00);" %temp2
34 |     #works 要么加 ’‘
35 | 
36 |     #allen="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(2,'ALLEN',72,'CALIFORNIA',20500.00);"
37 |     #teddy="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(3,'TEDDY',732,'CALIFORNIA',52000.00);"
38 |     #mark="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(4,'MARK',327,'CALIFORNIA',3000.00);"
39 |     #sun="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(?,?,?,?,?);"
40 |     #conn.execute("INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(?,?,32,'CALIFORNIA',2000.00)",temp)
41 | 
42 |     conn.execute(cmd)
43 | 
44 |     conn.commit()
45 |     conn.close()


--------------------------------------------------------------------------------
/fraud/fraud/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from fraud.model.fraud import Fraud
 3 | from fraud.model.db_config import DBSession, RedisPool
 4 | from scrapy.exceptions import DropItem
 5 | import datetime
 6 | import json
 7 | class FraudPipeline(object):
 8 | 
 9 |     def open_spider(self, spider):
10 |         self.session = DBSession()
11 | 
12 |     def process_item(self, item, spider):
13 |         # item = json.dumps(dict(item)).decode('unicode-escape')
14 |         f = Fraud(executed_name=item['executed_name'],
15 |                   gender=item['gender'],
16 |                   age=item['age'],
17 |                   identity_number=item['identity_number'],
18 |                   court=item['court'],
19 |                   province=item['province'],
20 |                   case_number=item['case_number'],
21 |                   performance=item['performance'],
22 |                   disrupt_type_name=item['disrupt_type_name'],
23 |                   duty=item['duty'],
24 |                   release_time=item['release_time'],
25 |                   crawl_time=datetime.datetime.now())
26 |         self.session.add(f)
27 |         try:
28 |             self.session.commit()
29 |         except Exception as e:
30 |             print(e)
31 |             self.session.rollback()
32 | 
33 |         return item
34 | 
35 |     def close_spider(self, spider):
36 |         self.session.close()
37 | 
38 | class DuplicatesPipeline(object):
39 |     def process_item(self, item, spider):
40 |         pool = RedisPool()
41 |         r = pool.redis_pool()
42 |         if r.exists('id_num: %s' % item['case_number']):
43 |             raise DropItem("Duplicate item found: %s" % item['case_number'])
44 |         else:
45 |             r.set('id_num: %s' % item['case_number'], 1)
46 |             return item
47 | 


--------------------------------------------------------------------------------
/szhouse/house.py:
--------------------------------------------------------------------------------
 1 | #-*-coding=utf-8-*-
 2 | __author__ = 'rocky'
 3 | # 网页源码修改 废弃使用
 4 | #获取每天深圳一手房，二手房的成交套数与面积，并且写入数据库
 5 | #主要就是正则表达抓取几个数字
 6 | import re
 7 | import database
 8 | import requests
 9 | 
10 | def getContent():
11 |     url="http://ris.szpl.gov.cn/"
12 |     one_hand="credit/showcjgs/ysfcjgs.aspx"
13 |     second_hand="credit/showcjgs/esfcjgs.aspx"
14 |     # req=urllib2.Request(url+one_hand)
15 |     # content=urllib2.urlopen(req).read()
16 |     #返回的就是网页的源码，没有做任何防爬虫的处理，zf网站，呵呵
17 |     #print content
18 |     headers={'User-Agent':'Mozilla/5.0 (WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
19 |     content = requests.get(url=url+one_hand,headers=headers).text
20 | 
21 |     date=re.compile(r'<SPAN class=titleblue><span id=\"lblCurTime5\">(.*)</span>')
22 |     reg=re.compile(r'<td width="14%"><b>(\d+)</b>')
23 |     result=reg.findall(content)
24 |     current_date=date.findall(content)
25 | 
26 |     reg2=re.compile(r'<td align="right"><b>(.*?)</b>')
27 |     yishou_area=reg2.findall(content)
28 | 
29 | 
30 |     print(current_date[0])
31 |     print('一手商品房成交套数：%s'  % result[0])
32 |     print('一手商品房成交面积： %s'  % yishou_area[0])
33 | 
34 | 
35 |     # sec_req=urllib2.Request(url+second_hand)
36 |     # sec_content=urllib2.urlopen(sec_req).read()
37 | 
38 |     sec_content = requests.get(url+second_hand).text
39 | 
40 |     sec_quantity=re.compile(r'<td width="30%">(\d+)</td>')
41 |     sec_result=sec_quantity.findall(sec_content)
42 |     second_area=re.findall(r'<td align="right">(.*?)</td>',sec_content)
43 | 
44 |     print('二手商品房成交套数：%s'  % sec_result[1])
45 |     print('二手商品房成交面积： %s'  % second_area[2])
46 |     database.create_table()
47 |     database.insert(current_date[0],result[0],yishou_area[0],sec_result[1],second_area[2])
48 | 
49 | getContent()


--------------------------------------------------------------------------------
/ximalaya/story.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2019/10/18 18:04
 4 | # @File : story.py
 5 | 
 6 | # 睡前故事
 7 | import os
 8 | 
 9 | import requests,datetime,re
10 | 
11 | url='http://mobwsa.ximalaya.com/mobile-album/album/page/ts-1571392955128?ac=WIFI&albumId=260744&device=android&isAsc=false&isQueryInvitationBrand=true&isVideoAsc=true&pageId={}&pageSize=100&pre_page=0&source=5&supportWebp=true'
12 | headers = {'User-Agent': 'Xiaomi'}
13 | 
14 | def download():
15 | 
16 |     for i in range(1, 2): # 只下载一页
17 | 
18 |         r = requests.get(url=url.format(i), headers=headers)
19 |         js_data = r.json()
20 |         data_list = js_data.get('data', {}).get('tracks',{}).get('list',[])
21 | 
22 |         for item in data_list:
23 |             trackName = item.get('title')
24 |             trackName = re.sub('[\/\\\:\*\?\"\<\>\|]', '_', trackName)
25 |             # trackName=re.sub(':','',trackName)
26 |             src_url = item.get('playUrl64')
27 |             orderNo = item.get('orderNo')
28 | 
29 |             filename = '{}-{}.mp3'.format(orderNo,trackName)
30 |             if not os.path.exists(filename):
31 | 
32 |                 try:
33 |                     r0 = requests.get(src_url, headers=headers,timeout=3600)
34 |                 except Exception as e:
35 |                     print(e)
36 |                     print(trackName)
37 |                     r0 = requests.get(src_url, headers=headers,timeout=3600)
38 | 
39 | 
40 | 
41 |                 with open(filename, 'wb') as f:
42 |                     f.write(r0.content)
43 |                     print('{}下载完成'.format(filename))
44 | 
45 |             else:
46 |                 print(f'{filename}已经下载过了')
47 | 
48 | if __name__=='__main__':
49 |     print(f'start at {datetime.datetime.now()}')
50 |     download()
51 |     print(f'end at {datetime.datetime.now()}')
52 | 


--------------------------------------------------------------------------------
/pornhub/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | __pycache__
  4 | *.py[cod]
  5 | *$py.class
  6 | 
  7 | # C extensions
  8 | *.so
  9 | 
 10 | # Distribution / packaging
 11 | .Python
 12 | env/
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | 
 27 | # PyInstaller
 28 | #  Usually these files are written by a python script from a template
 29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 30 | *.manifest
 31 | *.spec
 32 | 
 33 | # Installer logs
 34 | pip-log.txt
 35 | pip-delete-this-directory.txt
 36 | 
 37 | # Unit test / coverage reports
 38 | htmlcov/
 39 | .tox/
 40 | .coverage
 41 | .coverage.*
 42 | .cache
 43 | nosetests.xml
 44 | coverage.xml
 45 | *,cover
 46 | .hypothesis/
 47 | 
 48 | # Translations
 49 | *.mo
 50 | *.pot
 51 | 
 52 | # Django stuff:
 53 | *.log
 54 | local_settings.py
 55 | 
 56 | # Flask instance folder
 57 | instance/
 58 | 
 59 | # Scrapy stuff:
 60 | .scrapy
 61 | 
 62 | # Sphinx documentation
 63 | docs/_build/
 64 | 
 65 | # PyBuilder
 66 | target/
 67 | 
 68 | # IPython Notebook
 69 | .ipynb_checkpoints
 70 | *.ipynb
 71 | 
 72 | # pyenv
 73 | .python-version
 74 | 
 75 | # celery beat schedule file
 76 | celerybeat-schedule
 77 | 
 78 | # dotenv
 79 | .env
 80 | 
 81 | # virtualenv
 82 | venv/
 83 | ENV/
 84 | 
 85 | # Spyder project settings
 86 | .spyderproject
 87 | 
 88 | # Rope project settings
 89 | .ropeproject
 90 | 
 91 | 
 92 | .idea/*
 93 | .DS_Store
 94 | .vscode
 95 | settings.yaml
 96 | tmp/*
 97 | test/
 98 | *.sqlite
 99 | result/*
100 | logs/*
101 | tasks/result/*
102 | *.swp
103 | web/upload/*
104 | *.png
105 | *.yaml
106 | 
107 | download*
108 | *.zip
109 | mp4/
110 | webm/
111 | nohup.out
112 | 


--------------------------------------------------------------------------------
/stockholder/main.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | import requests
 3 | from lxml import etree
 4 | import pymongo
 5 | import tushare as ts
 6 | client = pymongo.MongoClient('10.18.6.102')
 7 | doc = client['secutiry']['shareholder']
 8 | 
 9 | __author__ = 'Rocky'
10 | 
11 | '''
12 | http://30daydo.com
13 | Email: weigesysu@qq.com
14 | '''
15 | def getContent(code):
16 |     url = 'http://quotes.money.163.com/f10/gdfx_{}.html'.format(code)
17 | 
18 |     headers = {'User-Agent':'Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/65.0.3325.162Safari/537.36'}
19 |     for i in range(5):
20 |         try:
21 |             r = requests.get(url, headers=headers)
22 |             if r.status_code==200:
23 |                 return r.text
24 |         except Exception,e:
25 |             print e
26 |             continue
27 | 
28 |     return None
29 | 
30 | def parser(code):
31 |     text = getContent(code,)
32 |     document={}
33 |     if text is not None:
34 |         tree = etree.HTML(text)
35 |         name = tree.xpath('//div[@id="dateTable"]/table/tr/td[1]/text()')
36 |         percent = tree.xpath('//div[@id="dateTable"]/table/tr/td[2]/text()')
37 |         number = tree.xpath('//div[@id="dateTable"]/table/tr/td[3]/text()')
38 |         # print name
39 |         # print percent
40 |         # print number
41 |         d = {}
42 |         for index,value in enumerate(name):
43 |             # print index
44 |             k = name[index]
45 |             p=percent[index]
46 |             n=number[index]
47 |             if '.' in k:
48 |                 k=k.replace('.','_')
49 |             d[k]=(p,n)
50 |     document[code]=d
51 |     doc.insert(document)
52 | 
53 | def all_stocks():
54 |     df = ts.get_stock_basics()
55 |     for i in df.index:
56 |         parser(i)
57 | 
58 | def main():
59 |     # parser('000011')
60 |     all_stocks()
61 | 
62 | if __name__ == '__main__':
63 |     main()


--------------------------------------------------------------------------------
/kc0011/async_mongo.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2019/11/26 8:55
 4 | # @File : async_mongo.py
 5 | import asyncio
 6 | from urllib.parse import urlparse
 7 | import pymongo
 8 | import threading
 9 | from motor.motor_asyncio import AsyncIOMotorClient
10 | import motor
11 | from pymongo.errors import DuplicateKeyError
12 | 
13 | #异步更新mongo数据库
14 | 
15 | db_host = '192.168.10.48'
16 | db_port = 17001
17 | uri = 'mongodb://{0}:{1}'.format(
18 |     db_host, db_port)  # db_name 认证数据库
19 | db = motor.motor_tornado.MotorClient(uri)['spider']  # 认证完成后需要连接要用的数据库
20 | 
21 | # client = AsyncIOMotorClient(MONGO_HOST, port=MONGO_PORT)
22 | # db = client['hedgehog_spider']
23 | # db.authenticate(name='Zane', password='*#06#', source='admin')
24 | 
25 | doc = db['KC0011_content']
26 | block = 500
27 | total = 124684
28 | 
29 | iter_number = total // block
30 | 
31 | remain_part = total % block
32 | import re
33 | 
34 | re_pattern = re.compile('&page=\d+')
35 | 
36 | 
37 | async def run():
38 |     for i in range(iter_number + 1):
39 | 
40 |         small_part = doc.find({}, {'_id': 1, 'url': 1}).limit(block).skip(i * block)
41 | 
42 |         async for item in small_part:
43 |             url = item.get('url')
44 |             idx = item.get('_id')
45 |             if re.search(re_pattern,url):
46 |                 # print(url)
47 | 
48 |                 url_ = re.sub(re_pattern, '', url)
49 | 
50 |                 try:
51 |                     await doc.update_one(
52 |                         {'_id': idx},
53 |                         {'$set': {'url': url_}}
54 |                     )
55 | 
56 |                 except DuplicateKeyError as e:
57 |                     print(e)
58 |                     print('删除此doc {}'.format(url))
59 |                     await doc.delete_one({'_id':idx})
60 | 
61 |                 except Exception as e:
62 |                     print(e)
63 | 
64 | 
65 | asyncio.get_event_loop().run_until_complete(run())
66 | 


--------------------------------------------------------------------------------
/Ergeduoduo/main.py:
--------------------------------------------------------------------------------
 1 | #-*-coding=utf-8-*-
 2 | import sys,os
 3 | import requests
 4 | from lxml import etree
 5 | import subprocess
 6 | session = requests.Session()
 7 | def getContent(url):
 8 |     # url='http://www.iqiyi.com/v_19rrkwcx6w.html'
 9 |     try:
10 |         ret = requests.get(url)
11 |         ret.encoding='utf-8'
12 |     # except Exception,e:
13 |     except:
14 |         # print e
15 |         return None
16 |     if ret.status_code==200:
17 |         return ret.text
18 |     else:
19 |         return None
20 | 
21 | def getUrl():
22 |     url='http://www.iqiyi.com/v_19rrkwcx6w.html'
23 |     url2='http://www.iqiyi.com/v_19rrl2td7g.html' # 31-61
24 |     content = getContent(url)
25 |     if not content:
26 |         print "network issue, retry"
27 |         exit(0)
28 |     root = etree.HTML(content,parser=etree.HTMLParser(encoding='utf-8'))
29 |     elements=root.xpath('//div[@data-current-count="1"]//li')
30 |     for items in elements:
31 |         url_item=items.xpath('.//a/@href')[0]
32 |         song_url = url_item.replace('//','')
33 |         song_url=song_url.strip()
34 |         print(song_url)
35 |         # name=items.xpath('.//span[@class="item-num"]/text()')[0]
36 |         name=items.xpath('.//span[@class="item-num"]/text()')[0].encode('utf-8').strip()+\
37 |              ' '+items.xpath('.//span[@class="item-txt"]/text()')[0].encode('utf-8').strip()+'.mp4'
38 |         name= '儿歌多多 '+name
39 |         name=name.decode('utf-8')
40 |         filename=os.path.join(os.getcwd(),name)
41 |         print filename
42 |         if os.path.exists(filename):
43 |             continue
44 |         p=subprocess.Popen('python you-get -d --format=HD {}'.format(song_url),stderr=subprocess.PIPE,stdout=subprocess.PIPE,shell=True)
45 |         output,error = p.communicate()
46 |         print(output)
47 |         print(error)
48 |         p.wait()
49 | 
50 | 
51 | def main():
52 |     getUrl()
53 | 
54 | if __name__ == '__main__':
55 |     main()


--------------------------------------------------------------------------------
/poi_gaode/sandbox/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | import datetime
 8 | 
 9 | from sandbox.models import SpiderModels, DBSession
10 | import logging
11 | import pymongo
12 | from sandbox import config
13 | from sandbox import settings
14 | from pymongo.errors import DuplicateKeyError
15 | from scrapy.exceptions import DropItem
16 | # class SQLPipeline(object):
17 | #     def __init__(self):
18 | #         self.session = DBSession()
19 | #
20 | #     def process_item(self, item, spider):
21 | #
22 | #         obj = SpiderModels(
23 | #             card=item['card'],
24 | #             accountLength=item['accountLength'],
25 | #             cardName=item['cardName'],
26 | #             cardType=item['cardType'],
27 | #             mainAccount=item['mainAccount'],
28 | #             mainValue=item['mainValue'],
29 | #             orgName=item['orgName'],
30 | #             origin=item['origin'],
31 | #             crawltime=item['crawltime'],
32 | #         )
33 | #         self.session.add(obj)
34 | #
35 | #         try:
36 | #             self.session.commit()
37 | #
38 | #         except Exception as e:
39 | #             logging.error('>>>> 插入数据库失败{}'.format(e))
40 | #         return item
41 | 
42 | 
43 | class MongoPipeline(object):
44 |     def __init__(self):
45 |         DOCUMENT = settings.MONGODB_DOC
46 |         self.db = pymongo.MongoClient(config.mongo_ip, port=27018)
47 |         self.doc = self.db['spider'][DOCUMENT]
48 | 
49 |     def process_item(self, item, spider):
50 |         insert_item = dict(item)
51 |         insert_item['crawltime']=datetime.datetime.now()
52 |         try:
53 |             self.doc.insert(insert_item)
54 |         except DuplicateKeyError:
55 |             raise DropItem('drop item {}'.format(insert_item['id']))
56 | 
57 |         return item
58 | 


--------------------------------------------------------------------------------
/pornhub/cookies_access.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | headers = {
 4 |     'authority': 'cn.pornhub.com',
 5 |     'pragma': 'no-cache',
 6 |     'cache-control': 'no-cache',
 7 |     'upgrade-insecure-requests': '1',
 8 |     'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
 9 |     'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
10 |     'sec-fetch-site': 'none',
11 |     'sec-fetch-mode': 'navigate',
12 |     'sec-fetch-dest': 'document',
13 |     'accept-language': 'zh,en;q=0.9,en-US;q=0.8,zh-CN;q=0.7',
14 |     'cookie': 'FastPopSessionRequestNumber=11; bs=0hwo170h8b27c5b55tt3ux7b8xkukol0; ss=630427593672619545; bitmovin_analytics_uuid=48eeeda8-bcfe-47f6-84fb-dd172921281a; platform_cookie_reset=pc; fg_9d12f2b2865de2f8c67706feaa332230=56077.100000; fg_7133c455c2e877ecb0adfd7a6ec6d6fe=32682.100000; ats_jp_vkey=ph5f29d906ac970; il=v1yKrZvlyVIqstonKh7Cf8kS4JOEHaOX5I0jleVOp8p6sxNjE0NjQ3MTgwaExRdXp5LXY2QVV4dnhhZmV1NncydDhpam15N1NMamk2dFc5bENEXw..; expiredEnterModalShown=1; platform=pc; fg_a197b3a83beb75c5f0255dc465e9f2de=3629.100000; ua=dcc77110dea38e3cff8b12436648706c; fanClubInfoPop=1; FastPopSessionRequestNumber=9',
15 | }
16 | 
17 | params = (
18 |     ('s', 'eyJrIjoiMDgxOTU1NjU4MGNjZjQyOTQ1ODVkZTdhNjM5NjkyMjQzNWE1NzdjYSIsInQiOjE2MDkyMTYwNDJ9'),
19 |     ('v', 'ph5fe22b22c2a32'),
20 |     ('e', '0'),
21 | )
22 | 
23 | response = requests.get('https://cn.pornhub.com/video/get_media', headers=headers, params=params)
24 | 
25 | #NB. Original query string below. It seems impossible to parse and
26 | #reproduce query strings 100% accurately so the one below is given
27 | #in case the reproduced version is not "correct".
28 | # response = requests.get('https://cn.pornhub.com/video/get_media?s=eyJrIjoiM2JkNzk3OTc3MDYxNjdhN2NiZjg3ZjAxN2YxMDI3YTY3MjNkOWNmMyIsInQiOjE2MDkyMTE5MzJ9&v=ph5c7a39b625845&e=0', headers=headers)
29 | print(response.json())


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/RedisDuplicator.py:
--------------------------------------------------------------------------------
 1 | import redis
 2 | from scrapy.dupefilters import BaseDupeFilter
 3 | # 自定义dupefilter
 4 | class DupeFilter(BaseDupeFilter):
 5 | 
 6 |     def __init__(self,host,port,db,key,reset):
 7 |         print('='*20)
 8 |         print('using my dupefilter ')
 9 |         print('='*20)
10 |         self.r = redis.StrictRedis(host=host,port=port,db=db)
11 |         self.key = key
12 |         self.reset = reset
13 | 
14 |     
15 |     @classmethod
16 |     def from_settings(cls, settings):
17 |         # result=(dict(settings))
18 | 
19 |         # name=settings.get('BOT_NAME')
20 |         # print(f'name is {name}')
21 |         host=settings.get('REDIS_HOST','127.0.0.1')
22 |         port=settings.get('REDIS_PORT',6379)
23 |         
24 |         print(f'host:{host},port {port}')
25 |         db=settings.get('REDIS_DB',0)
26 |         redis_key=settings.get('REDIS_KEY')
27 | 
28 | 
29 |         print(f'redis key{redis_key}')
30 |         user=settings.get('USER_AGENT')
31 |         print(user)
32 |         if redis_key is None:
33 |             raise ValueError('No value assign to redis_key')
34 | 
35 |         reset=settings.getbool('REDIS_REST',False)
36 |        
37 | 
38 | 
39 |         return cls(host,port,db,redis_key,reset)
40 | 
41 |     def request_seen(self, request):
42 | 
43 |         if self.r.sismember(self.key,request.url):
44 |             print(f'url ---{request.url}---has been seen 重复URL')
45 | 
46 |             return True
47 | 
48 |         else:
49 |             # print('add an url in redis')
50 |             self.r.sadd(self.key,request.url)
51 | 
52 |             return False
53 | 
54 |     def open(self):  # can return deferred
55 |         pass
56 | 
57 |     def close(self, reason):  # can return a deferred
58 |         print('dup closed')
59 | 
60 |         if self.reset:
61 |             print(f'delete redis key {self.key}')
62 |             self.r.delete(self.key)
63 | 
64 |     def log(self, request, spider):  # log that a request has been filtered
65 |         pass


--------------------------------------------------------------------------------
/sz_yaohao/sandbox/spiders/website.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import json
 3 | import re
 4 | 
 5 | import requests
 6 | import scrapy
 7 | from scrapy import Request, FormRequest
 8 | import logging
 9 | import redis
10 | from sandbox.items import SpiderItem
11 | from sandbox.utility import get_header
12 | from sandbox.config import code_url
13 | 
14 | # post
15 | class WebPostSpider(scrapy.Spider):
16 |     name = 'website'
17 |     headers = {
18 | 
19 |     }
20 |     post_url = 'https://apply.jtys.sz.gov.cn/apply/app/increment/person/login'
21 |     img_url = 'http://apply.jtys.sz.gov.cn/apply/app/validCodeImage'
22 | 
23 |     def __init__(self, *args, **kwargs):
24 |         super(WebPostSpider, self).__init__(*args, **kwargs)
25 |         self.headers = get_header()
26 | 
27 |         self.data = {
28 |             'loginType': 'MOBILE',
29 |             'loginCode': '',
30 |             'password': '',
31 |             'validCode': '',
32 |         }
33 | 
34 |     def start_requests(self):
35 | 
36 |         yield Request(
37 |             url=self.img_url,
38 |             headers=self.headers
39 |         )
40 |     def parse(self,response):
41 |         # TO DO
42 |         img = response.body
43 | 
44 |         # with open('test.jpg','wb') as f:
45 |         #     f.write(img)
46 |         r=requests.post(code_url,data=img)
47 |         js_data = r.json()
48 |         if js_data.get('success'):
49 |             code = js_data.get('message')
50 |             post_data=self.data.copy()
51 |             post_data['validCode']=code
52 |             # input('input code')
53 |             yield FormRequest(url=self.post_url,
54 |                               headers=self.headers,
55 |                               formdata=post_data,
56 |                               callback=self.check_login,
57 |                               )
58 | 
59 |     def check_login(self,response):
60 |         content=response.text
61 |         if '忘记密码' in content:
62 |             print('密码错误')
63 |         else:
64 |             print('找到密码')
65 | 
66 | 


--------------------------------------------------------------------------------
/fangtianxia/fangtianxia_proxy_test.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | import hashlib
 3 | import time
 4 | import requests
 5 | 
 6 | # 找群主购买 my_app_key, myappsecret, 以及蚂蚁代理服务器的 mayi_url 地址和 mayi_port 端口
 7 | my_app_key = ""
 8 | app_secret = ""
 9 | mayi_url = 's3.proxy.mayidaili.com'
10 | mayi_port = '8123'
11 | 
12 | # 蚂蚁代理服务器地址
13 | mayi_proxy = {'http': 'http://{}:{}'.format(mayi_url, mayi_port)}
14 | 
15 | # 准备去爬的 URL 链接
16 | #url = 'http://1212.ip138.com/ic.asp'
17 | testUrl='http://members.3322.org/dyndns/getip'
18 | # 计算签名
19 | timesp = '{}'.format(time.strftime("%Y-%m-%d %H:%M:%S"))
20 | codes = app_secret + 'app_key' + my_app_key + 'timestamp' + timesp + app_secret
21 | sign = hashlib.md5(codes.encode('utf-8')).hexdigest().upper()
22 | 
23 | # 拼接一个用来获得蚂蚁代理服务器的「准入」的 header (Python 的 concatenate '+' 比 join 效率高)
24 | authHeader = 'MYH-AUTH-MD5 sign=' + sign + '&app_key=' + my_app_key + '&timestamp=' + timesp
25 | 
26 | # 用 Python 的 Requests 模块。先订立 Session()，再更新 headers 和 proxies
27 | 
28 | user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"
29 | # cookie_read=open('cookie').read().strip()
30 | headers = {"User-agent": user_agent, 'upgrade-insecure-requests': '1',
31 |                 'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
32 |                 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
33 |                 'accept-encoding': 'gzip, deflate', 'Cache-Control': 'no-cache'}
34 | '''
35 | s = requests.Session()
36 | s.headers.update({'Proxy-Authorization': authHeader})
37 | s.proxies.update(mayi_proxy)
38 | s.headers.update(headers)
39 | s.headers.update({'Proxy-Authorization': authHeader})
40 | pg = s.get(testUrl)  # tuple: 300 代表 connect timeout, 270 代表 read timeout
41 | print(pg.text)
42 | print(pg.status_code)
43 | '''
44 | headers['Proxy-Authorization']=authHeader
45 | while 1:
46 |     r=requests.get(url=testUrl,headers=headers,proxies=mayi_proxy)
47 |     print(r.status_code)
48 |     #r.encoding='gb2312'
49 |     print(r.text)
50 |     time.sleep(10)
51 | #pg.encoding = 'GB18030'
52 | 


--------------------------------------------------------------------------------
/dfcf/settings.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2020/3/31 23:36
 4 | # @File : settings.py
 5 | import time
 6 | 
 7 | import config
 8 | import requests
 9 | 
10 | headers = {
11 |     'Connection': 'keep-alive',
12 |     # 'Upgrade-Insecure-Requests': '1',
13 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
14 |     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
15 |     'Referer': 'http://guba.eastmoney.com/list,300750_2.html',
16 |     'Accept-Encoding': 'gzip, deflate',
17 |     'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
18 | }
19 | 
20 | cookies = {
21 |     'qgqp_b_id': '4d112e2089d3c5855c8ca2d1f2947ecd',
22 |     'em_hq_fls': 'js',
23 |     'st_si': '98016728708487',
24 |     'HAList': 'a-sh-601799-%u661F%u5B87%u80A1%u4EFD%2Ca-sh-600729-%u91CD%u5E86%u767E%u8D27%2Ca-sz-000063-%u4E2D%u5174%u901A%u8BAF%2Cf-0-399300-%u6CAA%u6DF1300',
25 |     'emshistory': '%5B%22%E6%98%9F%E5%AE%87%E8%82%A1%E4%BB%BD%22%2C%22601799%22%2C%22300496%22%2C%22dfcf%22%5D',
26 |     'st_asi': 'delete',
27 |     'st_pvi': '04745525503534',
28 |     'st_sp': '2019-10-28%2011%3A48%3A22',
29 |     'st_inirUrl': 'https%3A%2F%2Fwww.baidu.com%2Flink',
30 |     'st_sn': '132',
31 |     'st_psi': '20200401002426450-117001301474-3984682985',
32 | }
33 | 
34 | def get_proxy(retry=10):
35 |     count = 0
36 |     proxyurl = 'http://{}:8101/dynamicIp/common/getDynamicIp.do'.format(
37 |         config.PROXIES_OLD)
38 |     for i in range(retry):
39 |         try:
40 |             r = requests.get(proxyurl, timeout=10)
41 |             # print('获取的代理ip ' + r.text)
42 |         except Exception as e:
43 |             print(e)
44 |             count += 1
45 |             print('代理获取失败,重试' + str(count))
46 |             time.sleep(1)
47 | 
48 |         else:
49 |             js = r.json()
50 |             proxyServer = 'http://{0}:{1}'.format(js.get('ip'), js.get('port'))
51 |             proxies_random = {
52 |                 'http': proxyServer
53 |             }
54 |             return proxies_random
55 | 


--------------------------------------------------------------------------------
/holdle/sync_spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time : 2020/11/24 21:42
 3 | # @File : sync_spider.py
 4 | # @Author : Rocky C@www.30daydo.com
 5 | import requests
 6 | import sys
 7 | sys.path.append('..')
 8 | import asyncio
 9 | import datetime
10 | import aiohttp
11 | import re
12 | import time
13 | from parsel import Selector
14 | from configure.settings import DBSelector
15 | from common.BaseService import BaseService
16 | 
17 | SLEEP = 2
18 | 
19 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
20 |            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'}
21 | 
22 | URL_MAP = {'home_page': 'https://holdle.com/stocks/industry', 'base': 'https://holdle.com'}
23 | 
24 | 
25 | class Holdle(BaseService):
26 | 
27 |     def __init__(self):
28 |         super(Holdle, self).__init__()
29 | 
30 |         self.DB = DBSelector()
31 |         self.client = self.DB.mongo(location_type='qq', async_type=True)
32 |         self.session = requests.Session()
33 | 
34 |     def run(self):
35 |         start = time.time()
36 | 
37 |         response = self.session.get(url=URL_MAP['home_page'], headers=headers)
38 |         html =  response.text  # 这个阻塞
39 |         resp = Selector(text=html)
40 |         industries = resp.xpath('//ul[@class="list-unstyled"]/a')
41 |         for industry in industries:
42 |             json_data = {}
43 |             industry_url = industry.xpath('.//@href').extract_first()
44 |             industry_name = industry.xpath('.//li/text()').extract_first()
45 |             json_data['industry_url'] = industry_url
46 |             json_data['industry_name'] = industry_name
47 |             self.detail_list(industry_url, json_data)
48 | 
49 |         end = time.time()
50 |         print(f'time used {end-start}')
51 | 
52 |     def detail_list(self, url, json_data):
53 | 
54 |         response = self.session.get(URL_MAP['base']+url, headers=headers)
55 |         response =response.text
56 |         self.parse_detail(response, json_data)
57 | 
58 |     def parse_detail(self, html, json_data=None):
59 |         resp = Selector(text=html)
60 |         title =resp.xpath('//title/text()').extract_first()
61 |         print(title)
62 | 
63 | 
64 | app = Holdle()
65 | app.run()
66 | 


--------------------------------------------------------------------------------
/51jbnet/im_sandbox/models.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2019/5/16 17:34
 4 | # @File : models.py
 5 | from contextlib import contextmanager
 6 | from datetime import datetime
 7 | 
 8 | from sqlalchemy import create_engine
 9 | from sqlalchemy import Column, Integer, String, Date, DateTime, Text
10 | from sqlalchemy.orm import sessionmaker, scoped_session
11 | from sqlalchemy.ext.declarative import declarative_base
12 | 
13 | from im_sandbox.settings import MYSQL_DB_URI
14 | 
15 | # declare a Mapping,this is the class describe map to table column
16 | Base = declarative_base()
17 | engine = create_engine(MYSQL_DB_URI)
18 | session_factory = sessionmaker(bind=engine)
19 | Session = scoped_session(session_factory)
20 | 
21 | 
22 | @contextmanager
23 | def scoped_session():
24 |     session = Session()
25 |     try:
26 |         yield session
27 |         session.commit()
28 |     except:
29 |         session.rollback()
30 |         raise
31 |     finally:
32 |         session.close()
33 | 
34 | 
35 | class SpiderModel(Base):
36 |     __tablename__ = 'testdb'
37 |     id = Column(Integer, primary_key=True, autoincrement=True)
38 |     score = Column(Integer, nullable=False, default=0)
39 |     catid = Column(Integer, nullable=False, default=0)
40 |     score_story = Column(String(512), nullable=False, default='')
41 |     hometext = Column(String(1024), nullable=False, default='')
42 |     counter = Column(Integer, nullable=False, default=0)
43 |     inputtime = Column(DateTime, nullable=False, default=datetime.now())
44 |     topic = Column(Integer, nullable=False, default=0)
45 |     source = Column(String(128), nullable=False, default='')
46 |     mview = Column(Integer, nullable=False, default=0)
47 |     comments = Column(Integer, nullable=False, default=0)
48 |     crawled_datetime = Column(DateTime, nullable=False, default=datetime.now())
49 |     rate_sum = Column(Integer, nullable=False, default=0)
50 |     title = Column(String(512), nullable=False, default='')
51 |     url_show = Column(String(512), nullable=False, default='')
52 |     thumb = Column(String(256), nullable=False, default='')
53 | 
54 | # 建表的时候去掉这一行注释
55 | # Base.metadata.create_all(engine)
56 | 
57 | def map_orm_item(scrapy_item, sql_item):
58 |     for k, v in scrapy_item.items():
59 |         sql_item.__setattr__(k, v)
60 |     return sql_item
61 | 
62 | 
63 | 


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/CustomExtension.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # @Date    : 2019-08-27 11:31:19
 4 | # @Author  : Rocky Chen (weigesysu@qq.com)
 5 | # @Link    : http://30daydo.com
 6 | # @Version : $1.0$
 7 | from scrapy import signals
 8 | import pika
 9 | import json
10 | import datetime
11 | from scrapy.exceptions import NotConfigured
12 | 
13 | # 自定义扩展 推送到 rabbitmq
14 | class AdvancedExtension(object):
15 |     
16 |     def __init__(self,crawler):
17 |         self.crawler = crawler
18 |         self.crawler.signals.connect(self.spider_close,signals.spider_closed)
19 |         self.mq_host=crawler.settings.get('MQ_HOST')
20 |         self.mq_port=crawler.settings.getint('MQ_PORT')
21 |         self.mq_user=crawler.settings.get('MQ_USER')
22 |         self.mq_password=crawler.settings.get('MQ_PASSWORD')
23 |         self.queue_name = crawler.settings.get('MQ_QUEUE_NAME')
24 |         if not self.queue_name:
25 |             raise NotConfigured # 有这个是让这个模块失效而不报错
26 |         self.start_time = datetime.datetime.now()
27 | 
28 |     @classmethod
29 |     def from_crawler(cls,crawler):
30 | 
31 |         return cls(crawler)
32 | 
33 |     def spider_close(self,spider):
34 |         
35 |         print('in extension module, spider close')
36 |         print(f'spider name {spider.name}')
37 |         # print(dir(spider))
38 |         credentials = pika.PlainCredentials(self.mq_user,self.mq_password)
39 | 
40 |         connection = pika.BlockingConnection(pika.ConnectionParameters(self.mq_host,self.mq_port,'/',credentials))
41 | 
42 |         channel = connection.channel()
43 | 
44 |         queue_name = 'spider'
45 |         channel.queue_declare(queue=self.queue_name,durable=True)
46 |         now = datetime.datetime.now()
47 | 
48 |         content = {'spiderName':spider.name,'status':'closed','start_time':self.start_time.strftime('%Y-%m-%d %H:%M:%S'),'end_time':now.strftime('%Y-%m-%d %H:%M:%S'),'time_used(s)':(now-self.start_time).seconds}
49 | 
50 |         send_content = json.dumps(content)
51 | 
52 |         channel.basic_publish(
53 |             exchange='',
54 |             routing_key=self.queue_name,
55 |             body=send_content,
56 |             properties=pika.BasicProperties(
57 |                 delivery_mode=2) # 这个是用来做消息持久化，数据会保存在队列，直到被消费
58 |             )
59 | 
60 |         print('[x] send {}'.format(send_content))
61 |         connection.close()
62 | 
63 | 
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/yinyonbao/yingyongbao.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | import re
 3 | 
 4 | import requests
 5 | from lxml import etree
 6 | import pandas as pd
 7 | 
 8 | class Yinyongbao():
 9 |     def __init__(self):
10 |         self.user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
11 |         self.headers = {"User-Agent": self.user_agent}
12 | 
13 | 
14 |     def getData(self):
15 |         base_url='http://sj.qq.com/myapp/category.htm'
16 |         parent_url='http://sj.qq.com/myapp/category.htm?orgame=1'
17 |         s=requests.get(url=parent_url,headers=self.headers)
18 |         print(s.status_code)
19 |         #print(s.text)
20 |         tree=etree.HTML(s.text)
21 |         menu=tree.xpath('//ul[@class="menu-junior"]')[0]
22 |         print(type(menu))
23 | 
24 |         link= menu.xpath('.//li[@id]/a/@href')
25 |         catelog=[]
26 |         for i in link:
27 |             print(i)
28 |             p=re.compile('categoryId=(-?\d+)')
29 |             #x=base_url+i
30 |             x=p.findall(i)[0]
31 |             #print(x)
32 |             catelog.append(x)
33 |         return catelog
34 | 
35 |     def testcase(self):
36 |         catelog=self.getData()
37 |         print(catelog)
38 |         for i in catelog:
39 |             print("Catelog : ", i)
40 |             self.each_page(int(i),0)
41 | 
42 |     #抓取某一个分类的
43 |     def each_page(self,categoryId,pageContext):
44 | 
45 |         url='http://sj.qq.com/myapp/cate/appList.htm?orgame=1&categoryId=%d&pageSize=20&pageContext=%d' %(categoryId,pageContext)
46 |         para={'orgame':1,'categoryId':categoryId,'pageSize':20,'pageContext':pageContext}
47 |         s=requests.get(url=url,params=para,headers=self.headers)
48 |         js= s.json()
49 |         name=[]
50 |         df=pd.DataFrame(js['obj'])
51 |         print(df)
52 |         for i in js['obj']:
53 |             #需要的数据都在这里面
54 |             x= i['appName']
55 |             print(x,' ---download count: ', i['appDownCount'])
56 | 
57 |             name.append(x)
58 |         print(len(name))
59 |         try:
60 |             pageContext=int(js['pageContext'])
61 |             self.each_page(categoryId,pageContext)
62 |         except Exception as e:
63 |             return
64 | 
65 | def main():
66 |     obj=Yinyongbao()
67 |     #obj.getData()
68 |     #obj.each_page('',0)
69 |     obj.testcase()
70 |     '''
71 |     for i in range(0,200,38):
72 |         obj.each_page('',i)
73 |     '''
74 | main()
75 | 


--------------------------------------------------------------------------------
/ximalaya/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # website: http://30daydo.com
 3 | # @Time : 2019/6/30 12:03
 4 | # @File : main.py
 5 | 
 6 | import requests
 7 | import re
 8 | import os
 9 | 
10 | url = 'http://180.153.255.6/mobile/v1/album/track/ts-1571294887744?albumId=23057324&device=android&isAsc=true&isQueryInvitationBrand=true&pageId={}&pageSize=20&pre_page=0'
11 | headers = {'User-Agent': 'Xiaomi'}
12 | 
13 | def download():
14 |     for i in range(1, 3):
15 |         r = requests.get(url=url.format(i), headers=headers)
16 |         js_data = r.json()
17 |         data_list = js_data.get('data', {}).get('list', [])
18 |         for item in data_list:
19 |             trackName = item.get('title')
20 |             trackName = re.sub('[\/\\\:\*\?\"\<\>\|]', '_', trackName)
21 |             # trackName=re.sub(':','',trackName)
22 |             src_url = item.get('playUrl64')
23 |             filename = '{}.mp3'.format(trackName)
24 |             if not os.path.exists(filename):
25 | 
26 |                 try:
27 |                     r0 = requests.get(src_url, headers=headers)
28 |                 except Exception as e:
29 |                     print(e)
30 |                     print(trackName)
31 |                     r0 = requests.get(src_url, headers=headers)
32 | 
33 | 
34 |                 else:
35 |                     with open(filename, 'wb') as f:
36 |                         f.write(r0.content)
37 | 
38 |                     print('{} downloaded'.format(trackName))
39 | 
40 |             else:
41 |                 print(f'{filename}已经下载过了')
42 | 
43 | import shutil
44 | 
45 | def rename_():
46 |     for i in range(1, 3):
47 |         r = requests.get(url=url.format(i), headers=headers)
48 |         js_data = r.json()
49 |         data_list = js_data.get('data', {}).get('list', [])
50 |         for item in data_list:
51 |             trackName = item.get('title')
52 |             trackName = re.sub('[\/\\\:\*\?\"\<\>\|]', '_', trackName)
53 |             src_url = item.get('playUrl64')
54 | 
55 |             orderNo=item.get('orderNo')
56 | 
57 |             filename = '{}.mp3'.format(trackName)
58 |             try:
59 | 
60 |                 if os.path.exists(filename):
61 |                     new_file='{}_{}.mp3'.format(orderNo,trackName)
62 |                     shutil.move(filename,new_file)
63 |             except Exception as e:
64 |                 print(e)
65 | 
66 | 
67 | 
68 | 
69 | 
70 | if __name__=='__main__':
71 |     rename_()
72 | 


--------------------------------------------------------------------------------
/myubbs/sandbox/spiders/website.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import datetime
 3 | import json
 4 | import re
 5 | import scrapy
 6 | from scrapy import Request, FormRequest
 7 | import logging
 8 | import redis
 9 | from sandbox.items import SpiderItem
10 | from sandbox.utility import get_header
11 | 
12 | # get
13 | class WebGetSpider(scrapy.Spider):
14 |     name = 'myubbs'
15 |     URL = 'http://zsu.myubbs.com/forum-97-{}.html'
16 | 
17 |     def __init__(self):
18 | 
19 |         super(WebGetSpider,self).__init__()
20 |         self.headers=get_header()
21 |         self.page=10
22 | 
23 |     def start_requests(self):
24 |         # TO DO
25 |         for p in range(1,self.page+1):
26 |             yield Request(url=self.URL.format(p),
27 |                       headers=self.headers
28 |                       )
29 | 
30 |     def parse(self, response):
31 |         root=response.xpath('//*[@id="threadlisttableid"]/tbody')
32 |         for node in root[1:]:
33 |             url = node.xpath('.//th//a[@class="s xst"]/@href').extract_first()
34 |             # print(url)
35 |             if url:
36 |                 yield Request(url,headers=self.headers,callback=self.parse_item)
37 | 
38 |     def parse_item(self,response):
39 | 
40 |         title = response.xpath('//span[@id="thread_subject"]/text()').extract_first()
41 |         url = response.url
42 |         pubdate = response.xpath('//div[@id="postlist"]/div[1]/table//div[@class="authi"]/em/text()').re_first('\d+-\d+-\d+ \d+:\d+:\d{2}')
43 |         if pubdate is None:
44 |             try:
45 |                 pubdate = response.xpath('//div[@id="postlist"]/div[1]/table//div[@class="authi"]/em/span/@title').extract_first()
46 |             except Exception as e:
47 |                 print(e)
48 |                 pubdate=''
49 |         # pubdate = response.xpath('//div[@id="postlist"]/').extract_first()
50 |         author=response.xpath('//div[@class="authi"]/a/text()').extract_first()
51 |         content = response.xpath('//td[@class="t_f"]')[0].xpath('string(.)').extract()[0]
52 |         crawltime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
53 | 
54 |         spiderItem= SpiderItem()
55 | 
56 |         for field in spiderItem.fields:
57 |             try:
58 |                 spiderItem[field]=eval(field)
59 |             except Exception as e:
60 |                 logging.warning('can not find define of {}'.format(field))
61 |                 logging.warning(e)
62 | 
63 |         # print(spiderItem)
64 |         yield spiderItem
65 | 
66 | 
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/v2ex_job/v2ex2.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from lxml import etree
 3 | from scrapy import Selector
 4 | from twisted.internet import defer
 5 | from twisted.internet import reactor
 6 | from twisted.web.client import getPage
 7 | 
 8 | 
 9 | class V2exJob:
10 |     def __init__(self):
11 |         pass
12 | 
13 |     def get_page(self):
14 |         """
15 |         总共页码的获取
16 |         :return:
17 |         """
18 |         index_url = 'https://www.v2ex.com/go/jobs'
19 |         index_headers = {
20 |             'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
21 |         }
22 |         response = requests.get(url=index_url, headers=index_headers)
23 |         selector = Selector(text=response.text)
24 |         all_page = selector.xpath('//a[@class="page_normal"]/text()').extract()
25 |         all_page = all_page[-1]
26 |         return all_page
27 | 
28 |     @defer.inlineCallbacks
29 |     def get_html(self, each_page):
30 |         """
31 |         进行网站信息的获取，并进行返回。
32 |         :param each_page: 
33 |         :return: 
34 |         """
35 |         each_urls = 'https://www.v2ex.com/go/jobs?p=%s' % str(each_page) 
36 |         res = getPage(bytes(each_urls, encoding="utf-8"))  # 获取页面，发送http请求,是使用select池将所有socket请求保存，依据此进行计数。
37 |         # print( type(res))  # <class 'twisted.internet.defer.Deferred'>
38 |         res.addCallback(self.parse_infos)  # 对每一个请求都添加一个回调方法
39 |         yield res  # 返回他
40 | 
41 |     def parse_infos(self, parse_infos):
42 |         parse_infos = parse_infos.decode('utf-8')
43 |         parse_infos = etree.HTML(parse_infos)
44 |         infos = parse_infos.xpath('//span[@class="item_title"]/a/text()')
45 |         print(infos)
46 | 
47 |     def run(self):
48 |         """
49 |         程序的启动开始采集数据
50 |         :return:
51 |         """
52 |         all_page = self.get_page()
53 |         defer_list = []
54 |         for each_page in range(1, 10):  # 禁忌务要一次性访问过多的请求。不然别人会禁掉你的。
55 |             v = self.get_html(each_page)  # 发送请求后立即返回，不等待返回，v是一个特殊对象，标志你发送到那个请求
56 |             defer_list.append(v)
57 |         d = defer.DeferredList(defer_list)  # 将上面的特殊对象列表一起放入DeferredList
58 |         d.addBoth(self.all_done)  # 为所有对象添加回调
59 |         reactor.run()  # 会一直循环，我们需要在任务执行完毕后关闭。含有计数器，执行一个任务，会执行一次get_html,计数减一。单任务执行完毕，计数为0，执行all_done
60 | 
61 |     def all_done(self, arg):
62 |         print("all done")
63 |         reactor.stop()
64 | 
65 | 
66 | if __name__ == '__main__':
67 |     v2ex_job = V2exJob()
68 |     v2ex_job.run()
69 | 
70 | 


--------------------------------------------------------------------------------
/anjuke/test_anjuke.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | import re
 3 | 
 4 | import requests
 5 | from lxml import etree
 6 | headers = {
 7 |     'accept': 'text/html',
 8 |     'accept-encoding': 'gzip, deflate, sdch',
 9 |     'accept-language': 'zh-CN,zh;q=0.8',
10 |     'cache-control': 'no-cache',
11 |     'pragma': 'no-cache',
12 |     'User-Agent': 'UCWEB/2.0 (Linux; U; Adr 2.3; zh-CN; MI-ONEPlus)U2/1.0.0 UCBrowser/8.6.0.199 U2/1.0.0 Mobile',
13 |     'x-requested-with': 'XMLHttpRequest',
14 |     'cookie': 'als=0; isp=true; Hm_lvt_c5899c8768ebee272710c9c5f365a6d8=1502856226; sessid=1551E6AF-1AA9-2526-E4E9-D494551F4A2F; search_words361=%E9%98%B3%E5%85%89%E5%B0%8F%E5%8C%BA; search_words24=%E9%9D%96%E6%B1%9F%E9%9B%85%E5%9B%AD11%E5%8F%B7%E6%A5%BC%7C%E6%9C%88%E6%A1%82%E8%A5%BF%E5%9B%AD; search_words14=%E8%B6%85%E6%98%8E%E5%9B%AD; search_words25=%E6%96%B0%E6%83%A0%E5%AE%B6%E5%9B%AD; browse_comm_ids13=95393; seo_source_type=0; search_words13=%E6%AC%A7%E9%99%86%E7%BB%8F%E5%85%B8%7C%E5%8D%97%E6%96%B9%E6%98%8E%E7%8F%A0%E8%8A%B1%E5%9B%AD%7C%E5%8D%97%E6%96%B9%E6%98%8E%E7%8F%A0%E8%8A%B1%E5%9B%AD%E4%BA%8C%E6%9C%9F1%E6%A0%8B; twe=2; __xsptplus8=8.43.1504789824.1504790391.8%233%7C123.sogou.com%7C%7C%7C%7C%23%23hvhL5eg3_ejnK-ngxJE-qwbIXXbQIk81%23%3B%20aQQ_a; _ga=GA1.2.1188068084.1502419352; _gid=GA1.2.1082371756.1504696715; lps="/cityList/|"; aQQ_ajkguid=B97BFB26-048C-2797-947E-7543B95A2D8A; ctid=13; 58tj_uuid=a4461385-7d0d-4e1a-9e94-85fa7b69f6aa; new_session=0; init_refer=; new_uv=61'
15 | }
16 | 
17 | start_url = 'https://m.anjuke.com/gu/community/?from=anjuke_home&p=1'
18 | r = requests.get(url=start_url, headers=headers)
19 | if  r.json()['data']:
20 |     print('not empty')
21 | else:
22 |     print('empty')
23 | 
24 | 
25 | price_case='https://m.anjuke.com/gz/community/112952/'
26 | content=requests.get(url=price_case,headers=headers).text
27 | tree=etree.HTML(content)
28 | price=tree.xpath('//a[@data-soj="community_topprice"]/div[@class="txt-c"]/p[@class="price"]/text()')[0]
29 | print(price)
30 | name=tree.xpath('//div[@class="comm-tit"]/h1/text()')[0]
31 | print(name)
32 | address=tree.xpath('//div[@class="comm-tit"]/div[@class="comm-ad"]/p/text()')[0]
33 | print(address)
34 | building_type=tree.xpath('//div[@class="header-field"]/span')[0].xpath('./text()')[0]
35 | building_date=tree.xpath('//div[@class="header-field"]/span')[2].xpath('./text()')[0]
36 | print(building_date)
37 | print(building_type)
38 | pattern = 'data-center="(.*?)"'
39 | data = re.findall(pattern, content)
40 | t= data[0].split(',')
41 | print(t[0])
42 | print(t[1])
43 | #longitude = data[0]
44 | #latitude = data[1]


--------------------------------------------------------------------------------
/youdao_dictionary/youdao.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # website: http://30daydo.com
 3 | # @Time : 2019/2/23 19:34
 4 | # @File : youdao.py
 5 | # 解密有道词典的JS
 6 | 
 7 | 
 8 | import hashlib
 9 | import random
10 | import requests
11 | import time
12 | 
13 | 
14 | def md5_(word):
15 |     s = bytes(word, encoding='utf8')
16 |     m = hashlib.md5()
17 |     m.update(s)
18 |     ret = m.hexdigest()
19 |     return ret
20 | 
21 | def get_sign(word, salt):
22 |     ret = md5_('fanyideskweb' + word + salt + 'p09@Bn{h02_BIEe]$P^nG')
23 |     return ret
24 | 
25 | def youdao(word):
26 |     url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
27 |     headers = {
28 |         'Host': 'fanyi.youdao.com',
29 |         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
30 |         'Accept': 'application/json, text/javascript, */*; q=0.01',
31 |         'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
32 |         'Accept-Encoding': 'gzip, deflate',
33 |         'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
34 |         'X-Requested-With': 'XMLHttpRequest',
35 |         'Referer': 'http://fanyi.youdao.com/',
36 |         'Cookie': 'YOUDAO_MOBILE_ACCESS_TYPE=1; OUTFOX_SEARCH_USER_ID=1672542763@10.169.0.83; JSESSIONID=aaaWzxpjeDu1gbhopLzKw; ___rl__test__cookies=1550913722828; OUTFOX_SEARCH_USER_ID_NCOO=372126049.6326876',
37 |         'Connection': 'keep-alive',
38 |         'Pragma': 'no-cache',
39 |         'Cache-Control': 'no-cache',
40 |     }
41 | 
42 |     ts = str(int(time.time()*1000))
43 |     salt=ts+str(random.randint(0,10))
44 |     bv = md5_("5.0 (Windows)")
45 |     sign= get_sign(word,salt)
46 | 
47 |     post_data = {
48 |         'i': word,
49 |         'from': 'AUTO', 'to': 'AUTO', 'smartresult': 'dict', 'client': 'fanyideskweb', 'salt': salt,
50 |         'sign': sign, 'ts': ts, 'bv': bv, 'doctype': 'json', 'version': '2.1',
51 |         'keyfrom': 'fanyi.web', 'action': 'FY_BY_REALTIME', 'typoResult': 'false'
52 |     }
53 | 
54 |     r = requests.post(
55 |         url=url,
56 |         headers=headers,
57 |         data=post_data
58 |     )
59 | 
60 |     js_data = r.json()
61 |     smart_result= js_data.get('smartResult', {})
62 | 
63 |     if smart_result:
64 |         for item in smart_result.get('entries'):
65 |             print(item)
66 | 
67 |     translate_result = js_data.get('translateResult',[])
68 |     if translate_result:
69 |         for items in translate_result:
70 |             for item in items:
71 |                 print(item.get('tgt'))
72 | 
73 | word='我喜欢吃鸡腿'
74 | youdao(word)
75 | 


--------------------------------------------------------------------------------
/zhihu/zhihu_book.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import requests
 3 | import json
 4 | import pymongo
 5 | # 下载知乎书籍的数据
 6 | def get_books_by_url(url):
 7 |     headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"}
 8 |     r = requests.get(url, headers=headers)
 9 |     data = json.loads(r.content.decode("utf-8"))
10 |     return data
11 | 
12 | def get_books_by_category(category_id):
13 |     url_patt = "https://www.zhihu.com/api/v3/books/categories/{}?limit={}&offset={}&version=v2"
14 |     limit = 10
15 |     offset = 0
16 |     client = pymongo.MongoClient('10.18.6.26',27001)
17 |     db = client.zhihu_book
18 |     while True:
19 |         url = url_patt.format(category_id, limit, offset)
20 |         print(url)
21 |         data = get_books_by_url(url)
22 |         books = data["data"]
23 |         db.books.insert_many(books)
24 |         if data["paging"]["is_end"]:
25 |             break
26 |         offset = offset + limit
27 | 
28 | def get_all_books():
29 |     categories = [147, 254, 232, 209, 245, 175, 219, 189, 205, 161, 143, 284, 265, 214, 155, 241]
30 |     for category in categories:
31 |         get_books_by_category(category)
32 | 
33 | def query_books():
34 |     client = pymongo.MongoClient('10.18.6.26',27001)
35 |     db = client.zhihu_book
36 | 
37 |     books = db.books.find().sort("score")
38 |     book_ids = []
39 |     for book in books:
40 |         if book["id"] in book_ids:
41 |             continue
42 |         price = 0
43 |         if book["promotion"]["is_promotion"]:
44 |             price = book["promotion"]["promotion_price"]/100
45 |         else:
46 |             price = book["promotion"]["price"]/100
47 |         print("{},{},{},{},{}".format(book["title"], book["url"], book["score"], price, book["promotion"]["origin_price"]/100))
48 |         book_ids.append(book["id"])
49 | 
50 | #    books = db.books.find({"promotion.price": 0.0}).sort("score")
51 | #    book_ids = []
52 | #    for book in books:
53 | #        if book["id"] in book_ids:
54 | #            continue
55 | #        print("{},{},{}".format(book["title"], book["url"], book["score"]))
56 | #        book_ids.append(book["id"])
57 | 
58 | if __name__ == "__main__":
59 |     # parser = argparse.ArgumentParser()
60 |     # parser.add_argument("--download", help="", action="store_true")
61 |     # parser.add_argument("--query", help="", action="store_true")
62 |     # args = parser.parse_args()
63 |     # if args.download:
64 |     #     get_all_books()
65 |     # elif args.query:
66 |     #     query_books()
67 |     get_all_books()


--------------------------------------------------------------------------------
/51jbnet/im_sandbox/spiders/website.py:
--------------------------------------------------------------------------------
 1 | # -*-coding=utf-8-*-
 2 | 
 3 | # @Time : 2019/5/16 17:30
 4 | # @File : website.py
 5 | 
 6 | # -*- coding: utf-8 -*-
 7 | import re
 8 | import requests
 9 | import scrapy
10 | from scrapy import Request
11 | from im_sandbox import settings
12 | from scrapy.log import logger
13 | import json
14 | from im_sandbox.items import SandboxItem
15 | import datetime
16 | from scrapy.selector import Selector
17 | 
18 | 
19 | class Website(scrapy.Spider):
20 |     name = "website"
21 |     category='linux_shell'
22 |     idx=235
23 |     total=1403
24 |     page = int(total/40)+1
25 |     default_headers = {
26 |         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
27 |         "Accept-Encoding": "gzip, deflate, br",
28 |         "Accept-Language": "zh,en;q=0.9,en-US;q=0.8,zh-CN;q=0.7",
29 |         "Cache-Control": "no-cache",
30 |         "Connection": "keep-alive",
31 |         "Host": "www.jb51.net",
32 |         "Pragma": "no-cache",
33 |         "Referer": "https://www.jb51.net/list/list_97_1.htm",
34 |         "Upgrade-Insecure-Requests": "1",
35 |         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
36 |     }
37 | 
38 |     def start_requests(self):
39 |         page = 400
40 |         base_url = 'https://www.jb51.net/list/list_{idx}_{page}.htm'
41 |         for i in range(1, self.page + 1):
42 |             yield Request(url=base_url.format(page=i,idx=self.idx), headers=self.default_headers, callback=self.parse)
43 | 
44 |     def parse(self, response):
45 | 
46 |         if not response.body:
47 |             logger.error(msg='there is no response body ,please go and check it ')
48 |             return
49 | 
50 |         nodes = response.xpath('//div[@class="artlist clearfix"]/DL/DT')
51 |         if nodes:
52 |             pass
53 |         else:
54 |             nodes = response.xpath('//div[@class="artlist clearfix"]/dl/dt')
55 | 
56 |         for node in nodes:
57 |             pubdate = node.xpath('.//span/text()').extract_first()
58 |             pubdate = re.sub('日期:', '', pubdate)
59 |             title=node.xpath('.//a/text()').extract_first()
60 |             url=node.xpath('.//a/@href').extract_first()
61 |             full_url = 'https://www.jb51.net{}'.format(url)
62 |             item = SandboxItem()
63 |             item['pubdate']=pubdate
64 |             item['url']=full_url
65 |             item['title']=title
66 |             item['category']=self.category
67 |             yield item
68 | 


--------------------------------------------------------------------------------
/jd/jd/spiders/jd_book.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import scrapy
 4 | from scrapy import Request
 5 | from scrapy_splash import SplashRequest
 6 | import re
 7 | from jd.items import JdItem
 8 | lua_script = """
 9 | function main(splash)
10 |     splash:go(splash.args.url)
11 |     splash:wait(5)
12 |     splash:runjs("document.getElementsByClassName('page')[0].scrollIntoView(true)")
13 |     splash:wait(5)
14 |     return splash:html()
15 | end
16 | """
17 | 
18 | 
19 | class JDBookSpider(scrapy.Spider):
20 |     name = "jd_book"
21 |     allowed_domains = ["search.jd.com"]
22 |     kw='股票'
23 |     base_url = 'https://search.jd.com/Search?keyword={}&enc=utf-8&wq={}'.format(kw,kw)
24 | 
25 |     def start_requests(self):
26 |         # 请求第一页，无需 js 渲染
27 |         yield Request(self.base_url, callback=self.parse_urls, dont_filter=True)
28 | 
29 |     def parse_urls(self, response):
30 |         # 获取商品总数，计算出总页数
31 |         total = response.css('span#J_resCount::text').extract_first().strip('+')
32 |         try:
33 |             total=re.sub('万','',total)
34 |             total=float(total)*10000
35 |         except:
36 |             return
37 |         pageNum = total // 60 + (1 if total % 60 else 0)
38 | 
39 |         # 构造每页的 url，向 Splash 的 execute 端点发送请求
40 |         for i in range(int(pageNum)):
41 |             url = '%s&page=%s' % (self.base_url, 2*i+1)
42 |             yield SplashRequest(url, endpoint='execute', args={'lua_source': lua_script},\
43 |                                 cache_args=['lua_source'])
44 | 
45 |     def parse(self, response):
46 |         # 获取一个页面中每本书的名字和价格
47 |         for sel in response.css('ul.gl-warp.clearfix > li.gl-item'):
48 |             item = JdItem()
49 |             name= sel.css('div.p-name').xpath('string(.//em)').extract_first()
50 |             price= sel.css('div.p-price i::text').extract_first()
51 |             try:
52 |                 remark=sel.xpath('.//div[(@class="p-commit" or @class="p-comm")]').xpath('string(.)').extract_first()
53 |                 if remark:
54 |                     remark=remark.strip()
55 |             except:
56 |                 remark=None
57 |             try:
58 |                 price=float(price)
59 |             except:
60 |                 price=price
61 | 
62 |             # 自营
63 |             # shop=sel.css('div.p-shopnum span::text').extract_first()
64 | 
65 |             # 出版社
66 | 
67 |             publish=sel.css('div.p-shopnum a::text').extract_first()
68 |             if publish is None:
69 |                 publish=sel.css('div.p-shop a::text').extract_first()
70 |             # if shop is None:
71 |             #     shop=sel.css('div.p-shopnum a::text').extract_first()
72 |             #     publish=None
73 | 
74 |             item['name']=name
75 |             item['price']=price
76 |             item['remark']=remark
77 |             item['publish']=publish
78 |             # item['shop']=shop
79 |             yield item


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/CustomMiddleware.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # @Date    : 2019-08-28 19:35:51
 4 | # @Author  : Rocky Chen (weigesysu@qq.com)
 5 | # @Link    : http://30daydo.com
 6 | # @Version : 1.0
 7 | 
 8 | # 自定义middleware
 9 | from scrapy.exceptions import IgnoreRequest
10 | # from scrapy import log
11 | import logging
12 | from scrapy.downloadermiddlewares.retry import RetryMiddleware
13 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
14 | 
15 | class CustomMiddleware(object):
16 | 
17 |     def process_request(self,request,spider):
18 |         # print('before download v1')
19 |         # print(f'name -->{spider.name}')
20 |         
21 |         request.meta['vvv']='kkk' # 可以这样携带一些参数
22 |         
23 | 
24 |         # print('主动提交错误') # 去执行process_exception
25 |         # raise IgnoreRequest
26 | 
27 |     def process_response(self,request,response,spider):
28 |         # print('after download v1')
29 |         # print(f'name -->{spider.name}')
30 |         # print(request.meta['vvv'])
31 |         # print(dir(response))
32 |         # print(response.status)
33 | 
34 |         if response.status==404:
35 |             print('重新调度')
36 |             return request
37 |         else:
38 |             return response # 需要返回response
39 | 
40 |     def process_exception(self,request, exception, spider):
41 |         print('遇到错误了!!!!!!!!')
42 |         return request
43 | 
44 | class CustomMiddleware2(object):
45 | 
46 |     def process_request(self,request,spider):
47 |         # logging.info('before download v2')
48 |         # print(f'name -->{spider.name}')
49 |         request.meta['vvv']='kkk' # 可以这样携带一些参数
50 | 
51 |     def process_response(self,request,response,spider):
52 |         # print('after download v2')
53 |         # print(f'name -->{spider.name}')
54 |         # print(request.meta['vvv'])
55 |         v = request.meta['vvv']
56 |         return response
57 | 
58 | 
59 | class ModifiedRetryMiddleware(RetryMiddleware):
60 | 
61 | 
62 |     def process_response(self, request, response, spider):
63 |         
64 |         logging.info('这个我定义的继承retrymiddleware')
65 | 
66 |         if request.meta.get('dont_retry', False):
67 |             return response
68 |         
69 |         if response.status in self.retry_http_codes:
70 |             reason = response_status_message(response.status)
71 |             return self._retry(request, reason, spider) or response
72 |         
73 |         return response
74 | 
75 | class ModifiedUserAgentMiddleware(UserAgentMiddleware):
76 | 
77 |     def process_request(self, request, spider):
78 |     
79 |         if self.user_agent:
80 |             
81 |             logging.info('这是自定义UA中间件')
82 | 
83 |             request.headers.setdefault(b'User-Agent', self.user_agent)
84 | 
85 |     def process_response(self,request,response,spider):
86 |         logging.info(f'请求的request header ====== {request.headers}')
87 |         return response


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/monitor/templates/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <title>爬虫动态监控系统</title>
  6 |     <script src="//cdn.bootcss.com/echarts/3.3.2/echarts.common.js"></script>
  7 |     <script src="//cdn.bootcss.com/jquery/3.1.1/jquery.min.js"></script>
  8 | </head>
  9 | <body>
 10 |     <div id="main" style="height: 500px; width: 75%; margin: 0 auto">
 11 |     </div>
 12 | <script type="text/javascript">
 13 | 
 14 | var STATS_KEYS = {{stats_keys|safe}}
 15 | var data = [{'value': [Date(), '0']}]
 16 | var myChart = echarts.init(document.getElementById('main'));
 17 |     option = {
 18 |     title: {
 19 |         text: '爬虫监控数据',
 20 |         left:'center'
 21 |     },
 22 |     tooltip: {
 23 |         trigger: 'axis',
 24 |         axisPointer: {
 25 |             animation: true
 26 |         }
 27 |     },
 28 |     toolbox: {
 29 |         left:'left',
 30 |         feature: {
 31 |             dataZoom: {
 32 |                 yAxisIndex: 'none'
 33 |             },
 34 |             saveAsImage: {
 35 |                 show:true
 36 |             }
 37 |         }
 38 |     },
 39 |     xAxis: {
 40 |         type: 'time',
 41 |         splitLine:{
 42 |             show: true
 43 |         }
 44 |     },
 45 |     yAxis: {
 46 |         type: 'value',
 47 |         boundaryGap: [0, '100%'],
 48 |         splitLine: {
 49 |             show: true
 50 |         }
 51 |     },
 52 |     series: [
 53 |     {
 54 |         name: 'zxc',
 55 |         type: 'line',
 56 |         data: data
 57 |     }]
 58 | };
 59 | 
 60 | myChart.setOption(option);
 61 | 
 62 | is_running = true
 63 | function ajax_get(name){
 64 |     $.ajax({url:"/ajax?key="+name, async: false, success: function(data){
 65 |     $data = {
 66 |             showSymbol: false,
 67 |             hoverAnimation: false,
 68 |             name:name,
 69 |             type:'line',
 70 |             data:eval(data)
 71 |         };
 72 |     series.push($data);
 73 | }, error: function(jqXHR, text, errorThrown){
 74 |     is_running = false;
 75 |     data = jqXHR.responseText
 76 |     $data = {
 77 |             showSymbol: false,
 78 |             hoverAnimation: false,
 79 |             name:name,
 80 |             type:'line',
 81 |             data:eval(data)
 82 |         };
 83 |     series.push($data);
 84 | }});
 85 | 
 86 | }
 87 | 
 88 | function get_data(){
 89 |     series = [];
 90 |     for (i=0;i<STATS_KEYS.length;i++){
 91 |         ajax_get(STATS_KEYS[i])
 92 |     }
 93 |     myChart.setOption({
 94 |         series: series,
 95 |         legend: {
 96 |         data:STATS_KEYS,
 97 |         orient:'vertical',
 98 |         left: 'right'
 99 |     },
100 |     })
101 | }
102 | 
103 | var timer = setInterval(function(){
104 |     get_data()
105 |     if (!is_running){
106 |         clearInterval(timer);
107 |     };
108 | },{{ timeinterval }});
109 | </script>
110 | </body>
111 | </html>


--------------------------------------------------------------------------------
/stock_pledge/crawler.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # website: http://30daydo.com
 3 | # @Time : 2019/3/9 17:17
 4 | # @File : crawler.py
 5 | import datetime
 6 | import requests
 7 | 
 8 | # import grequests
 9 | import pandas as pd
10 | import numpy as np
11 | from setting import get_engine
12 | import tushare as ts
13 | 
14 | # 2018.03.05 后才有数据
15 | 
16 | url = 'http://www.chinaclear.cn/cms-rank/downloadFile?queryDate={}&type=proportion'
17 | 
18 | headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
19 |            'Accept-Encoding': 'gzip,deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Cache-Control': 'no-cache',
20 |            'Pragma': 'no-cache', 'Proxy-Connection': 'keep-alive',
21 |            # 'Referer': 'http://www.chinaclear.cn/cms-rank/queryPledgeProportion?action=query&queryDate=2019.03.09&secCde=&page=3',
22 |            'Upgrade-Insecure-Requests': '1',
23 |            'User-Agent': 'Mozilla/5.0(Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36'}
24 | 
25 | engine = get_engine('db_pledge', 'local')
26 | 
27 | 
28 | class PledgeSpider():
29 | 
30 |     def __init__(self):
31 |         self.start = datetime.datetime.now()
32 |         self.delta= 400
33 | 
34 | 
35 |     def start_task(self):
36 |         pass
37 | 
38 |     def handle_exception(self,request,exception):
39 |         print('process error')
40 | 
41 |     def crawl(self):
42 |         # tasks=[]
43 |         # date_list =[]
44 |         for i in range(self.delta):
45 |             fetch_day = self.start+datetime.timedelta(days=-1*i)
46 |             if fetch_day < datetime.datetime(year=2018,month=3,day=4):
47 |                 break
48 | 
49 |             if not ts.is_holiday(fetch_day.strftime('%Y-%m-%d')):
50 |                 name=fetch_day.strftime('%Y-%m-%d')
51 |                 try:
52 |                     day=url.format(fetch_day.strftime('%Y.%m.%d'))
53 |                     print(day)
54 |                     r=requests.get(url=day,headers=headers,timeout=20)
55 |                 except Exception as e:
56 |                     print(e)
57 |                 else:
58 |                     print(r.status_code)
59 |                     with open('{}.xls'.format(name), 'wb') as f:
60 |                         f.write(r.content)
61 |                 # tasks.append(grequests.get(url=url.format(fetch_day.strftime('%Y.%m.%d'))))
62 | 
63 |             # date_list.append(fetch_day.strftime('%Y-%m-%d'))
64 | 
65 |         # resp = grequests.map(tasks,size=8,exception_handler=self.handle_exception)
66 |         # for index,r in enumerate(resp):
67 |         #     with open('{}.xls'.format(date_list[index]),'wb') as f:
68 |         #         f.write(r.content)
69 | 
70 | 
71 |     def data_transfer(self):
72 |         df = pd.read_excel('pledge.xls', header=2, dtype={'证券代码': np.str})
73 |         df = df.reset_index(drop=True)
74 |         return df
75 | 
76 | 
77 | pledge = PledgeSpider()
78 | pledge.crawl()
79 | # df = pledge.data_transfer()
80 | 


--------------------------------------------------------------------------------
/cuiqingcai/async_sandbox/spiders/example.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import datetime
 3 | import re
 4 | 
 5 | import scrapy
 6 | from scrapy import Request
 7 | import logging
 8 | from async_sandbox.items import AsyncSandboxItem
 9 | 
10 | 
11 | class ExampleSpider(scrapy.Spider):
12 |     name = 'example'
13 |     # 技术
14 |     # BASE_URL = 'https://cuiqingcai.com/category/technique/page/{}'
15 |     # 生活
16 |     BASE_URL = 'https://cuiqingcai.com/category/life/page/{}'
17 | 
18 |     def start_requests(self):
19 |         start_page = 1
20 | 
21 |         yield Request(
22 |             url=self.BASE_URL.format(start_page),
23 |             meta={'page': start_page}
24 |         )
25 | 
26 |     def parse(self, response):
27 |         page = response.meta['page']
28 |         next_page = page + 1
29 | 
30 |         articles = response.xpath('//article[@class="excerpt"]')
31 |         for article in articles:
32 |             item = AsyncSandboxItem()
33 |             category = article.xpath('./header/a[1]/text()').extract_first()
34 |             title = article.xpath('./header/h2/a[1]/text()').extract_first()
35 |             article_url = article.xpath('./header/h2/a[1]/@href').extract_first()
36 |             item['title'] = title
37 |             item['category'] = category
38 |             item['article_url'] = article_url
39 | 
40 |             yield Request(
41 |                 url=article_url,
42 |                 callback=self.parse_item,
43 |                 meta={'item': item}
44 |             )
45 | 
46 |         if next_page < 900:
47 |             yield Request(
48 |                 url=self.BASE_URL.format(next_page),
49 |                 meta={'page': next_page}
50 |             )
51 | 
52 |     def parse_item(self, response):
53 |         item = response.meta['item']
54 |         author = response.xpath(
55 |             '//header[@class="article-header"]//i[@class="fa fa-user"]/following::*[1]/text()').extract_first()
56 |         visited = response.xpath(
57 |             '//header[@class="article-header"]//i[@class="fa fa-eye"]/parent::*[1]/text()').extract_first()
58 |         comment = response.xpath(
59 |             '//header[@class="article-header"]//i[@class="fa fa-comments-o"]/following-sibling::*[1]/text()').extract_first()
60 |         liked = response.xpath('//span[@class="count"]/text()').extract_first()
61 |         created_at = response.xpath(
62 |             '//header[@class="article-header"]//i[@class="fa fa-clock-o"]/parent::*[1]/text()').extract_first()
63 |         content = response.xpath('//article[@class="article-content"]')[0].xpath('string(.)').extract()[0]
64 | 
65 |         item['author'] = author
66 |         item['created_at'] = created_at
67 |         item['content'] = content
68 |         visited=re.sub('浏览','',visited)
69 |         item['visited'] = visited
70 |         comment=re.sub('评论','',comment)
71 |         item['comment'] = comment
72 |         item['liked'] = liked
73 |         item['crawltime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
74 |         yield item
75 | 


--------------------------------------------------------------------------------
/52sh/aio_spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # website: http://30daydo.com
 3 | # @Time : 2020/9/24 12:09
 4 | # @File : aio_spider.py
 5 | import asyncio
 6 | import aiohttp
 7 | import aiofiles
 8 | import os
 9 | 
10 | import re
11 | 
12 | from config_file import START_URL, HEADERS, PROXY_STR,SIMPLE_HEADERS
13 | from parsel import Selector
14 | 
15 | 
16 | async def fetch(url):
17 |     async with aiohttp.ClientSession() as session:
18 |         async with session.get(url=url,
19 |                                headers=HEADERS,
20 |                                proxy=PROXY_STR,
21 |                                ) as response:
22 |             text = await response.text()
23 |             resp = Selector(text=text)
24 |             nodes = resp.xpath('//div[@class="kl1-2"]')
25 |             for node in nodes:
26 |                 next_url = node.xpath('.//div[@class="kl1-2a2"]/a/@href').extract_first()
27 |                 title = node.xpath('.//div[@class="kl1-2a2"]/a/@title').extract_first()
28 |                 await detail(session=session, next_url=next_url, title=title)
29 |                 print('next page')
30 | 
31 | 
32 | async def detail(**kwargs):
33 |     session = kwargs['session']
34 |     next_url = kwargs['next_url']
35 |     title = kwargs['title']
36 |     print(next_url)
37 |     print(title)
38 |     async with session.get(
39 |             url=next_url,
40 |             headers=HEADERS,
41 |             proxy=PROXY_STR,
42 |     ) as response:
43 |         text = await response.text()
44 |         resp = Selector(text=text)
45 |         nodes = resp.xpath('//div[@class="kl2-1"]//img/@src').extract()
46 |         nodes = list(set(nodes))
47 |         for img in nodes:
48 |             # print(img)
49 |             await download_img(session=session,url=img,title=title)
50 |             print('next image')
51 | 
52 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
53 | 
54 | async def download_img(**kwargs):
55 |     url= kwargs['url']
56 |     title= kwargs['title']
57 | 
58 |     title = title.replace(' ','_')
59 |     title = re.sub('[\/:*?"<>|]', '-', title)
60 |     if not os.path.exists(title):
61 |         os.mkdir(title)
62 | 
63 |     filename = url.split('/')[-1]
64 |     if not filename.endswith(('png','jpg','jpeg')):
65 |         return
66 |     save_file = os.path.join(title,filename)
67 | 
68 |     if os.path.exists(save_file):
69 |         return
70 |     print('saving image - ')
71 |     try:
72 |         conn = aiohttp.TCPConnector(ssl=False)  # 防止ssl报错
73 |         async with aiohttp.ClientSession(connector=conn, trust_env=True) as session:
74 |             async with session.get(url=url, headers=SIMPLE_HEADERS, proxy=PROXY_STR) as response:
75 | 
76 |                 if response.status>=200 and response.status<300:
77 |                     f=await aiofiles.open(save_file,'wb')
78 |                     await f.write(await response.read())
79 |                     await f.close()
80 | 
81 |     except Exception as e:
82 |         print(e)
83 |         print(url)
84 |         return
85 | 
86 | async def main():
87 |     total_page = 3640
88 |     for page in range(0,total_page,35):
89 | 
90 |         url = START_URL.format(page=page)
91 |         await fetch(url)
92 |         await asyncio.sleep(0)
93 |         print(f'downing page {page}-')
94 | loop = asyncio.get_event_loop()
95 | loop.run_until_complete(main())
96 | 


--------------------------------------------------------------------------------
/51jbnet/im_sandbox/pipelines.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define your item pipelines here
  4 | #
  5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
  6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
  7 | from scrapy.exceptions import DropItem
  8 | 
  9 | class ImSandboxPipeline(object):
 10 |     def process_item(self, item, spider):
 11 |         return item
 12 | 
 13 | 
 14 | import datetime
 15 | 
 16 | import pymongo
 17 | 
 18 | from im_sandbox.settings import MONGODB, ES_HOST
 19 | from im_sandbox import models
 20 | from im_sandbox.models import scoped_session
 21 | from elasticsearch import Elasticsearch
 22 | from scrapy import log
 23 | 
 24 | 
 25 | class im_sandboxMongoPipeline(object):
 26 | 
 27 |     def __init__(self):
 28 |         self._db = MONGODB.get('db')
 29 |         self._collection = MONGODB.get('collection')
 30 |         self._host = MONGODB.get('host')
 31 |         self._port = MONGODB.get('port')
 32 |         self._client = pymongo \
 33 |             .MongoClient(host=self._host, port=self._port) \
 34 |             .get_database(self._db) \
 35 |             .get_collection(self._collection)
 36 | 
 37 |     def process_item(self, item, spider):
 38 |         self._client.create_index([('title', pymongo.DESCENDING)], background=True)
 39 |         self._client.update_one(filter={'title': item['title']}, update={'$set': dict(item)}, upsert=True)
 40 |         return item
 41 | 
 42 | 
 43 | class im_sandboxMysqlPipeline(object):
 44 | 
 45 |     def process_item(self, item, spider):
 46 |         sql_im_sandbox = models.SpiderModel()
 47 |         sql_im_sandbox = models.map_orm_item(scrapy_item=item, sql_item=sql_im_sandbox)
 48 |         with scoped_session() as session:
 49 |             session.add(sql_im_sandbox)
 50 | 
 51 |         return item
 52 | 
 53 | 
 54 | class ESPipeline(object):
 55 |     def __init__(self):
 56 |         self.index = '51jbnet'
 57 |         self.doc = 'doc'
 58 |         self.es = Elasticsearch(ES_HOST)
 59 | 
 60 |     def process_item(self, item, spider):
 61 |         crawltime = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
 62 |         url = item.get('url', None)
 63 |         if not url:
 64 |             raise FileNotFoundError('url is empty')
 65 | 
 66 |         query_body = {
 67 |             "query":
 68 |                 {
 69 |                     "term": {
 70 |                         "url": url
 71 |                     }
 72 |                 }
 73 |         }
 74 | 
 75 |         # 去重
 76 |         try:
 77 |             query_result = self.es.search(index=self.index, body=query_body)
 78 | 
 79 |         except Exception as e:
 80 |             log.msg(e)
 81 |             raise ConnectionError('查询ES报错')
 82 | 
 83 |         hits=query_result.get('hits',{}).get('hits',[])
 84 | 
 85 |         if hits:
 86 | 
 87 |            raise DropItem('Duplication item')
 88 | 
 89 |         body = {
 90 |             "pubdate": item["pubdate"],
 91 |             "title": item["title"],
 92 |             "url": item["url"],
 93 |             "crawled_datetime": crawltime,
 94 |             "category": item['category'],
 95 |         }
 96 | 
 97 |         try:
 98 |             self.es.index(index=self.index, doc_type=self.doc, body=body)
 99 |         except Exception as e:
100 |             log.msg('错误 >>>>>')
101 |             log.msg(e)
102 |         return item
103 | 


--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | import pymysql
 8 | from twisted.enterprise import adbapi
 9 | import logging
10 | import pymongo
11 | from scrapy.exceptions import DropItem
12 | 
13 | class AsyncSQLPipeline(object):
14 |     def __init__(self):
15 |         self.dbpool = adbapi.ConnectionPool('pymysql',host='192.168.1.100',port=3306,user='root',password='*',db='spider_test')
16 |         # self.cursor = self.conn.cursor()
17 | 
18 |     def process_item(self, item, spider):
19 |         update_=self.dbpool.runInteraction(self.update,item)
20 |         update_.addErrback(self.handle_error,item,spider)
21 | 
22 |         return item
23 | 
24 |     def update(self,cursor,item):
25 |         insert_sql = 'insert into tb_cuiqingcai (category,title,article_url,content,author,created_at,liked,visited,comment,crawltime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
26 |         data=(item['category'],item['title'],item['article_url'],item['content'],item['author'],item['created_at'],item['liked'],item['visited'],item['comment'],item['crawltime']
27 |               )
28 |         cursor.execute(insert_sql,data)
29 | 
30 |     def handle_error(self,failure,item,spider):
31 |         logging.error('写入数据库异常--->')
32 |         logging.error(failure)
33 |         logging.error('error item')
34 |         logging.error(item)
35 | 
36 | class MongoPipeline(object):
37 | 
38 |     def __init__(self,host,port,db,doc):
39 |         client = pymongo.MongoClient(host,port)
40 |         self.doc=client[db][doc]
41 | 
42 |     @classmethod
43 |     def from_crawler(cls,crawler):
44 |         print('in from crawler')
45 |         host = crawler.settings.get('MONGO_HOST')
46 |         port = crawler.settings.getint('MONGO_PORT')
47 |         db = crawler.settings.get('MONGO_DB')
48 |         doc = crawler.settings.get('MONGO_DOC')
49 | 
50 | 
51 |         print(f'host {host}')
52 |         return cls(host,port,db,doc)
53 | 
54 |     def open_spider(self,spider):
55 |         print('spider open')
56 | 
57 |     def process_item(self,item,spider):
58 |         print('in mongopipeline')
59 |         
60 |         if item is None:
61 |             print('item is None')
62 |         else:
63 |             print('item is not None')
64 |         print(f'receive item -> len is {len(item)}')
65 |         # self.doc.insert(dict(item))
66 |         return item
67 | 
68 |     def close_spider(self,spider):
69 |         print('closing in pipeline')
70 | 
71 | class JSONPipeline(object):
72 | 
73 |     def __init__(self,host,port,db,doc):
74 |         pass
75 | 
76 |     @classmethod
77 |     def from_crawler(cls,crawler):
78 |         print('in from crawler')
79 |         host = crawler.settings.get('MONGO_HOST')
80 |         port = crawler.settings.getint('MONGO_PORT')
81 |         db = crawler.settings.get('MONGO_DB')
82 |         doc = crawler.settings.get('MONGO_DOC')
83 |         
84 | 
85 |         print(f'host {host}')
86 |         return cls(host,port,db,doc)
87 | 
88 |     def open_spider(self,spider):
89 |         print('spider open')
90 | 
91 |     def process_item(self,item,spider):
92 |         print('in JSON pipeline')
93 |         print(f'receive item -> len is {len(item)}')
94 | 
95 |         # return item
96 |         raise DropItem(item)
97 | 
98 |     def close_spider(self,spider):
99 |         print('closing in pipeline')


--------------------------------------------------------------------------------
/fraud/fraud/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for fraud project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'fraud'
13 | 
14 | SPIDER_MODULES = ['fraud.spiders']
15 | NEWSPIDER_MODULE = 'fraud.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'fraud (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | # CONCURRENT_REQUESTS_PER_DOMAIN = 1
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | COOKIES_ENABLED = True
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'fraud.middlewares.FraudSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | #    'fraud.middlewares.DynamicProxyMiddleware': 543,
57 | # }
58 | 
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 |    'fraud.pipelines.FraudPipeline': 300,
69 |    # 'fraud.pipelines.DuplicatesPipeline': 200,
70 | }
71 | 
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 | 
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 | 


--------------------------------------------------------------------------------
/MyLibrary/sandbox/sandbox/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for sandbox project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'sandbox'
13 | 
14 | SPIDER_MODULES = ['sandbox.spiders']
15 | NEWSPIDER_MODULE = 'sandbox.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'sandbox (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'sandbox.middlewares.SandboxSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | #    'sandbox.middlewares.RandomUserAgent': 543,
57 | #    'sandbox.middlewares.ProxyMiddleware': 553,
58 | # }
59 | 
60 | # Enable or disable extensions
61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | #    'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 | 
66 | # Configure item pipelines
67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 |    # 'sandbox.pipelines.SQLPipeline': 300,
70 |    'sandbox.pipelines.MongoPipeline': 100,
71 | }
72 | 
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 | 
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 | 


--------------------------------------------------------------------------------
/anjuke/anjuke.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | import codecs
 3 | import json
 4 | import re
 5 | import urllib
 6 | from lxml import etree
 7 | import requests
 8 | 
 9 | 
10 | def query(kw):
11 |     for i in range(1, 10):
12 |         encode_kw = urllib.quote(kw)
13 |         print(i)
14 |         url = 'https://m.anjuke.com/ajax/autocomplete/?city_id=13&kw=%s&from=1&callback=jsonp%d' % (encode_kw, i)
15 |         s = requests.Session()
16 |         headers = {
17 |             'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}
18 |         js = s.get(url, headers=headers)
19 |         print(js.status_code)
20 |         # print(js.text)
21 |         try:
22 |             result = re.findall('jsonp7\((.*?)\);', js.text)[0]
23 |             dic = json.loads(result)
24 |             print('*' * 20)
25 |             print(dic['data']['match'][0]['comm_id'])
26 |         except Exception as e:
27 |             print(e)
28 | 
29 | 
30 | # 获取安居客的城市列表
31 | def getcitylist():
32 |     headers = {'Accept-Language': ' zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Accept-Encoding': ' gzip, deflate',
33 |                'Connection': ' keep-alive',
34 |                'Accept': ' text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
35 |                'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
36 |                'Host': ' m.anjuke.com', 'Referer': ' https://m.anjuke.com/bj/',
37 |                'Cookie': ' aQQ_ajkguid=145D8A4E-6387-1752-E32C-D4EFB4EBFE09; lps="/|"; ctid=14; 58tj_uuid=fdb54be9-84d6-4511-ad1e-3227c1eac9ae; new_session=0; init_refer=; new_uv=1; sessid=AD7C8189-AB56-4CAF-1BAC-FF0CCD27668C'}
38 |     url = 'https://m.anjuke.com/cityList/'
39 |     r = requests.get(url=url, headers=headers)
40 |     print(r.status_code)
41 |     tree = etree.HTML(r.text)
42 |     word=u'其他'
43 |     node = tree.xpath('//div[@class="cl-c-l-h" and @id !="letter-%s"]/following-sibling::*[1]' %word)
44 |     dicts ={}
45 |     for i in node:
46 |         name =  i.xpath('.//li/a/text()')
47 |         link= i.xpath('.//li/a/@href')
48 |         if len(name) != len(link):
49 |             for j in name:
50 |                 print(j)
51 |             for k in link:
52 |                 print(k)
53 | 
54 |         for index in range(len(name)):
55 |             short_cut=link[index].split('/')[3]
56 |             dicts[short_cut]=name[index]
57 | 
58 |     return dicts
59 | 
60 | def debug_page():
61 | 
62 |     headers = {'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0', 'Host': 'm.anjuke.com', 'Cookie': 'aQQ_ajkguid=0B0A627A-FCF1-2B6A-2ADF-56DD166B0EBC; ctid=13; lps="/|"; sessid=804075FD-7FE8-E9C0-FA60-2FCB76C5B6B3; 58tj_uuid=02402201-d0d6-48de-8e58-6432612af29d; new_session=0; init_refer=; new_uv=1', 'Upgrade-Insecure-Requests': '1'}
63 | 
64 |     url='https://m.anjuke.com/dg/community/279422/'
65 |     r=requests.get(url=url,headers=headers)
66 |     print(r.status_code)
67 |     tree = etree.HTML(r.text)
68 |     return tree
69 | 
70 | #if __name__=="__main__":
71 |     #debug_page()
72 |     # query('南方明珠花园二期1栋')
73 |     #d = getcitylist()
74 |     #f=codecs.open('anjuke_city','w',encoding='utf-8')
75 |     #json.dump(d,f,ensure_ascii=False)
76 |     #for k,v in d.items():
77 |         #print(k,v)
78 | 
79 | tree=debug_page()


--------------------------------------------------------------------------------
/lanrentingshu/lrts/lrts/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for lrts project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'lrts'
13 | 
14 | SPIDER_MODULES = ['lrts.spiders']
15 | NEWSPIDER_MODULE = 'lrts.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'lrts (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'lrts.middlewares.LrtsSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'lrts.middlewares.LrtsDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 |     # MyFilesPipeline
69 |     # 'scrapy.pipelines.files.FilesPipeline':1
70 |    'lrts.pipelines.MyFilesPipeline': 300,
71 | }
72 | FILES_STORE='C:\\git\\CrawlMan\\lanrentingshu\\lrts\\lrts\\data'
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 | 
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 | 


--------------------------------------------------------------------------------
/fraud/fraud/middlewares.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your spider middleware
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
 7 | 
 8 | from scrapy import signals
 9 | import time
10 | import hashlib
11 | 
12 | 
13 | class FraudSpiderMiddleware(object):
14 |     # Not all methods need to be defined. If a method is not defined,
15 |     # scrapy acts as if the spider middleware does not modify the
16 |     # passed objects.
17 | 
18 |     @classmethod
19 |     def from_crawler(cls, crawler):
20 |         # This method is used by Scrapy to create your spiders.
21 |         s = cls()
22 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
23 |         return s
24 | 
25 |     def process_spider_input(self, response, spider):
26 |         # Called for each response that goes through the spider
27 |         # middleware and into the spider.
28 | 
29 |         # Should return None or raise an exception.
30 |         return None
31 | 
32 |     def process_spider_output(self, response, result, spider):
33 |         # Called with the results returned from the Spider, after
34 |         # it has processed the response.
35 | 
36 |         # Must return an iterable of Request, dict or Item objects.
37 |         for i in result:
38 |             yield i
39 | 
40 |     def process_spider_exception(self, response, exception, spider):
41 |         # Called when a spider or process_spider_input() method
42 |         # (from other spider middleware) raises an exception.
43 | 
44 |         # Should return either None or an iterable of Response, dict
45 |         # or Item objects.
46 |         pass
47 | 
48 |     def process_start_requests(self, start_requests, spider):
49 |         # Called with the start requests of the spider, and works
50 |         # similarly to the process_spider_output() method, except
51 |         # that it doesn’t have a response associated.
52 | 
53 |         # Must return only requests (not items).
54 |         for r in start_requests:
55 |             yield r
56 | 
57 |     def spider_opened(self, spider):
58 |         spider.logger.info('Spider opened: %s' % spider.name)
59 | '''
60 | class DynamicProxyMiddleware(object):
61 |     def process_request(self, request, spider):
62 |         # time.sleep(1)
63 |         auth_header = self.get_auth_header()
64 |         request.meta['proxy'] = "http://s3.proxy.mayidaili.com:8123"
65 |         request.headers['Proxy-Authorization'] = auth_header
66 | 
67 |     def get_auth_header(self):
68 |         # 请替换app_key和secret
69 |         app_key = "67783764"
70 |         secret = "6151eb360668ca10ad772ca9e46d306b"
71 | 
72 |         param_map = {
73 |             "app_key": app_key,
74 |             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),  # 如果你的程序在国外，请进行时区处理
75 |             "enable-simulate": 'true',
76 |             "random-useragent": 'pc',
77 |             "clear-cookies": 'true'
78 |         }
79 |         # 排序
80 |         keys = param_map.keys()
81 |         keys.sort()
82 | 
83 |         codes = "%s%s%s" % (secret, str().join('%s%s' % (key, param_map[key]) for key in keys), secret)
84 | 
85 |         # 计算签名
86 |         sign = hashlib.md5(codes).hexdigest().upper()
87 | 
88 |         param_map["sign"] = sign
89 | 
90 |         # 拼装请求头Proxy-Authorization的值
91 |         keys = param_map.keys()
92 |         auth_header = "MYH-AUTH-MD5 " + str('&').join('%s=%s' % (key, param_map[key]) for key in keys)
93 | 
94 |         # print time.strftime("%Y-%m-%d %H:%M:%S")
95 |         # print authHeader
96 | 
97 |         return auth_header
98 | '''


--------------------------------------------------------------------------------
/51jbnet/im_sandbox/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for im_sandbox project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     https://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'im_sandbox'
13 | 
14 | SPIDER_MODULES = ['im_sandbox.spiders']
15 | NEWSPIDER_MODULE = 'im_sandbox.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'im_sandbox (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | # DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'im_sandbox.middlewares.ImSandboxSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'im_sandbox.middlewares.ImSandboxDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 |    'im_sandbox.pipelines.ESPipeline': 300,
69 | }
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | MYSQL_DB_URI='mysql+pymysql://root:*@127.0.0.1:3306/spider?charset=utf8'
92 | MONGODB=''
93 | ES_HOST='10.18.6.102'


--------------------------------------------------------------------------------
/baiduwanpan/baiduwanpan.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | import time
 3 | import sys
 4 | header = {'Origin': 'https://pan.baidu.com', 'Content-Length': '26', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'X-Requested-With': 'XMLHttpRequest', 'Host': 'pan.baidu.com', 'Accept': '*/*', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', 'Connection': 'keep-alive', 'Cookie': 'BAIDUID=11BC8C5D223E048DDCCF45DA68C96329:FG=1; BIDUPSID=11BC8C5D223E048DDCCF45DA68C96329; PSTM=1502071949; __cfduid=dbc4d8c8a8ff8f8f56693bf9911a78f9a1502257445; PANWEB=1; bdshare_firstime=1502276137037; BDSFRCVID=4g8sJeC62lrjCp3ZxSq0MencMmK52YjTH6aotvr5NjaXcbr6amOqEG0PqM8g0Ku-aG3kogKK3gOTH4nP; H_BDCLCKID_SF=JJkH_CIMJCvbfP0k5bo0M-FSMMrX5C62aJ3DW45bWJ5TMC_w5l6KWbDl2-O0Qfr-aD7uWx022bubShPC-tnGM4IzWfon363D-a6U-xDE3l02V-j9e-t2ynQDDljRq4RMW20e0h7mWIb_VKFCjTKhejO0epJf-K6Jb6Q3BROS2RrHKROkeUOlyJtpbt-qJjcqyjrvQfcy3nTZ8J5k-UcV3T0fhGJnBT5Kaa6BBqQw5xbNM-jR0qJl0DukQN3TbRkO5bRiL6C-bq-BDn3oyTbJXp0njMTTqj_efnCDoD8QKbRofJ-k-4QEbbQH-UnLq-LqX57Z0l8Ktt3_ohjSyl6W0pLHXfoX5MrLWbTPbI3mWIQHSRQLLx7m5-KyjMne3JcpLa74KKJx-xKWeIJo5Dc6D6kzhUJiB5JMBan7_nrxfDD5bKDlD6-3-PAe5f8X5to05TIX3b7Ef-5ZM-O_bf--DR-HW-Q7BqTOL5RL2R58Kh6VOI5a05Jxy5K_3xjz3fvTbIce_n7b0tT4VUOHQT3mKqQbbN3i-CrgtJblWb3cWKOJ8UbSj-Tme6jXeautJ6F8f5vfL5rDa-n5HJjRq4bohjPjMPQeBtQmJJrtahRCMl7AJMO3Mxcqh4tIhtnCtp5BQg-q3R71MqvZMbrHBUQPbj8AWa5w0x-jLT6PVn0MW-5D8h6nLPnJyUnybPnnBT3XLnLHoDPXJCDBbDv65nt_b44bKUQKbK62aKDs5lRc-hcqEIL45fRaDq47Wl7gLtcu5Co22R6cJRuK8UbSj4QoXbIUWHOX0lRC3DTu3toufp5nhMJl3j7JDMP0-4vu5MJy523iob3vQpPMDxtuj68WejcXjNRjtnOe5C6H3bP8tCLWb5rnhPF3j-bbKP6-35KHaTrB5-tbytn6qDJEbtTjXtuUjH5kaq37JD6yLPQ-Jlr8Hfnn-RK--tugKtoxJpODBRbMopvaHRjnhnvvbURvDP-g3-AJ2q8EK5r2SC-ytI_-3J; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a02553875233; MCITY=-257%3A; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=7; H_PS_PSSID=1455_21114_17001_19897; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; STOKEN=98916c84333e810c2b1d715bb7f7cf805ae2faf839dc1e7b2ffea14af9a43422; SCRC=e189858affb6c034f51facb687ba42a3; BDCLND=Z12FNBCnoSTSfwubbu7R1dmuJgAkUv%2FVXMPFC%2FhXqtw%3D; PANPSC=8159382662928957333%3A0tGXwXye%2FVgybgBxVCVQs9wxnZzNwr1w%2Fi1kePBHTIGypp29WjDdFHgXofrWESI4GPVIaAX1Mx4yLJx7kL47ECcTFj%2FtuMrTJEGGcevXkUatUq%2FdzxBw4vvqPIbe4OQ9iyFns5yFArUpANCmD7pcJX5IlZf3%2F0X8eJFOG%2FXb%2FW8u%2BjscPFpwMA%3D%3D; Hm_lvt_7a3960b6f067eb0085b7f96ff5e660b0=1504793178,1504793213,1504793250,1504793289; Hm_lpvt_7a3960b6f067eb0085b7f96ff5e660b0=1505901469', 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', 'Referer': 'https://pan.baidu.com/share/init?surl=o8zEuJC', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
 5 | import requests
 6 | import re
 7 | 
 8 | for _ in range(100):
 9 | # re.sub('\d',)
10 |     if sys.version_info.major <3:
11 |         t = str(long(time.time() * 1000))
12 |     else:
13 |         t = str(int(time.time() * 1000))
14 |     #print(t)
15 |     url='https://pan.baidu.com/share/verify?surl=o8zEuJC&t=%s&bdstoken=null&channel=chunlei&clienttype=0&web=1&app_id=250528&logid=MTUwNTkwMTQ3NzYzNjAuNTQwMjcwOTYwMTg0MTkyOA==' %t
16 |     #url = 'https://pan.baidu.com/share/verify?surl=mhPHC7Y&t=%s&bdstoken=c5232d2c47ec22f6fb2de6a151828c91&channel=chunlei&clienttype=0&web=1&app_id=250528&logid=MTUwNTkwMDQyNDI2MzAuNDQyNTQxMzMyNDU0MTQ4NQ==' % t
17 |     data = {'pwd': '2222', 'vcode': '', 'vcode_str': ''}
18 |     r = requests.post(url=url, data=data, headers=header)
19 |     js = r.json()
20 |     print(js)
21 | 
22 | pw='gxrr'
23 | data = {'pwd': pw, 'vcode': '', 'vcode_str': ''}
24 | r = requests.post(url=url, data=data, headers=header)
25 | js = r.json()
26 | print(js)


--------------------------------------------------------------------------------