旌旗漫卷
--------------------------------------------------------------------------------
/myubbs/run.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/9/26 9:58
4 | # @File : run.py
5 | import datetime
6 |
7 | from scrapy import cmdline
8 | name = 'myubbs'
9 | current = datetime.date.today()
10 | cmd = 'scrapy crawl {} -s LOG_FILE={}.log'.format(name,current)
11 | cmdline.execute(cmd.split())
--------------------------------------------------------------------------------
/jd/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = jd.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = jd
12 |
--------------------------------------------------------------------------------
/fraud/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = fraud.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = fraud
12 |
--------------------------------------------------------------------------------
/weibo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = weibo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = weibo
12 |
--------------------------------------------------------------------------------
/bbssmth/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = bbssmth.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = bbssmth
12 |
--------------------------------------------------------------------------------
/chahaoba/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = sandbox.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 |
--------------------------------------------------------------------------------
/kc0011/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = sandbox.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 |
--------------------------------------------------------------------------------
/myubbs/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = sandbox.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 |
--------------------------------------------------------------------------------
/poi_gaode/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = sandbox.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 |
--------------------------------------------------------------------------------
/sz_yaohao/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = sandbox.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 |
--------------------------------------------------------------------------------
/tiexue/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = sandbox.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 |
--------------------------------------------------------------------------------
/51jbnet/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = im_sandbox.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = im_sandbox
12 |
--------------------------------------------------------------------------------
/bilibili/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = bilibili.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = bilibili
12 |
--------------------------------------------------------------------------------
/lanrentingshu/lrts/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = lrts.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = lrts
12 |
--------------------------------------------------------------------------------
/MyLibrary/sandbox/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = sandbox.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sandbox
12 |
--------------------------------------------------------------------------------
/tencentjob/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = tencentjob.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = tencentjob
12 |
--------------------------------------------------------------------------------
/cuiqingcai/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = async_sandbox.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = async_sandbox
12 |
--------------------------------------------------------------------------------
/async_cuiqingcai/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = async_sandbox.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = async_sandbox
12 |
--------------------------------------------------------------------------------
/weibo/weibo/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class WeiboPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/bilibili/bilibili/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class BilibiliPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/weibo/weibo/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class WeiboItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/bilibili/bilibili/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class BilibiliItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/lanrentingshu/lrts/lrts/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class LrtsItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/lanrentingshu/header_toolkit.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | def getheader():
4 | with open('request_header') as fp:
5 | data=fp.readlines()
6 | dictionary=dict()
7 | for line in data:
8 | line=line.strip()
9 | dictionary[line.split(":")[0]]=':'.join(line.split(":")[1:])
10 | return dictionary
11 | if __name__=="__main__":
12 |     print(getheader())
--------------------------------------------------------------------------------
/bilibili/bilibili/spiders/bili.log:
--------------------------------------------------------------------------------
1 | 2018-08-15 15:12:59 - E:\git\CrawlMan\bilibili\bilibili\spiders\bili.py - INFO: - ====================================================
2 | 2018-08-15 15:12:59 - E:\git\CrawlMan\bilibili\bilibili\spiders\bili.py - INFO: -
--
3 | 2018-08-15 15:12:59 - E:\git\CrawlMan\bilibili\bilibili\spiders\bili.py - INFO: - ====================================================
4 |
--------------------------------------------------------------------------------
/MyLibrary/sandbox/sandbox/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item,Field
9 |
10 |
11 | class SpiderItem(Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | username=Field()
15 | password = Field()
16 |
17 |
--------------------------------------------------------------------------------
/chahaoba/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.pyc
3 | data.cfg
4 | *.mp3
5 | *.pkl
6 | *.xls
7 | *.xml
8 | *.csv
9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 |
--------------------------------------------------------------------------------
/kc0011/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.pyc
3 | data.cfg
4 | *.mp3
5 | *.pkl
6 | *.xls
7 | *.xml
8 | *.csv
9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 |
--------------------------------------------------------------------------------
/myubbs/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.pyc
3 | data.cfg
4 | *.mp3
5 | *.pkl
6 | *.xls
7 | *.xml
8 | *.csv
9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 |
--------------------------------------------------------------------------------
/tiexue/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.pyc
3 | data.cfg
4 | *.mp3
5 | *.pkl
6 | *.xls
7 | *.xml
8 | *.csv
9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 |
--------------------------------------------------------------------------------
/chahaoba/sync_data.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2019/8/22 16:56
4 | # @File : sync_data.py
5 | import redis
6 | r=redis.StrictRedis('10.18.6.46',db=8,decode_responses=True)
7 | import pymysql
8 | con = pymysql.connect(host='', port=3306, db='spider', user='', password='')  # port assumed 3306 (MySQL default); credentials left blank
9 | cursor = con.cursor()
10 | cmd = 'select number from chahaoba'
11 | cursor.execute(cmd)
12 | ret = cursor.fetchall()
13 | for i in ret:
14 | r.sadd('chahaoba',i[0])
15 |
--------------------------------------------------------------------------------
/poi_gaode/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.pyc
3 | data.cfg
4 | *.mp3
5 | *.pkl
6 | *.xls
7 | *.xml
8 | *.csv
9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 |
--------------------------------------------------------------------------------
/sz_yaohao/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.pyc
3 | data.cfg
4 | *.mp3
5 | *.pkl
6 | *.xls
7 | *.xml
8 | *.csv
9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 |
--------------------------------------------------------------------------------
/MyLibrary/sandbox/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.pyc
3 | data.cfg
4 | *.mp3
5 | *.pkl
6 | *.xls
7 | *.xml
8 | *.csv
9 | *.pkl
10 | ~$d.xlsx
11 | d.xlsx
12 | data/
13 | temp
14 | request_header
15 | header_toolkit.txt
16 | *.xlsx
17 | *.log
18 | __pycache__/
19 | wikizhword.text
20 | news_tensite_xml.dat
21 | news_tensite_xml.smarty.dat
22 | *.jpg
23 | Download/
24 | Download_IMG/
25 | *.zip
26 | cookies
27 | httpcache
28 | config.py
29 | *.png
30 | full_name.dat
31 |
--------------------------------------------------------------------------------
/51jbnet/im_sandbox/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 | class SandboxItem(scrapy.Item):
11 | # define the fields for your item here like:
12 | # name = scrapy.Field()
13 | title=scrapy.Field()
14 | url=scrapy.Field()
15 | pubdate=scrapy.Field()
16 | category=scrapy.Field()
17 |
--------------------------------------------------------------------------------
/myubbs/sandbox/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item, Field
9 |
10 |
11 | class SpiderItem(Item):
12 | # define the fields for your item here like:
13 |
14 | title = Field()
15 | pubdate = Field()
16 | content = Field()
17 | author = Field()
18 | url = Field()
19 | crawltime = Field()
20 |
--------------------------------------------------------------------------------
/jd/jd/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class JdItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | # pass
15 | name=scrapy.Field()
16 | price=scrapy.Field()
17 | remark=scrapy.Field()
18 | publish=scrapy.Field()
19 | # shop=scrapy.Field()
--------------------------------------------------------------------------------
/dfcf/push_redis.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import redis
3 | r=redis.StrictRedis('192.168.10.48',db=5,decode_responses=True)
4 |
5 | name='todo.xlsx'
6 | df=pd.read_excel(name,dtype={'symbol':str})
7 | # print(df.head())
8 | new_list=df.loc[df.industry.str.contains('汽车'), :]['symbol'].tolist()
9 | # for i in df['代码'].values:
10 | # r.lpush('code_list',i)
11 | old_file = '要爬取的个股列表.xlsx'
12 | df2=pd.read_excel(old_file,dtype={'代码':str})
13 | old_list = df2['代码'].tolist()
14 | for item in new_list:
15 | if item not in old_list:
16 | r.set(item,0)
17 |
--------------------------------------------------------------------------------
/bbssmth/bbssmth/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item,Field
9 |
10 |
11 | class BbssmthItem(Item):
12 | # define the fields for your item here like:
13 | # name = Field()
14 | title = Field()
15 | content = Field()
16 | create_time = Field()
17 | url = Field()
18 | crawltime = Field()
19 | category = Field()
20 | author = Field()
21 | reply = Field()
22 |
--------------------------------------------------------------------------------
/jd/jd/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from jd.items import JdItem
8 | import pymongo
9 | class JDPipeline(object):
10 | def __init__(self):
11 | self.mongo=pymongo.MongoClient('10.18.6.46',27001)
12 | self.doc=self.mongo['spider']['jd_book']
13 | def process_item(self, item, spider):
14 | self.doc.insert_one(dict(item))
15 | return item
16 |
--------------------------------------------------------------------------------
/chahaoba/sandbox/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item,Field
9 | import scrapy
10 |
11 | class SpiderItem(Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | _number = scrapy.Field()
15 | _city = scrapy.Field()
16 | _province = scrapy.Field()
17 | _card_type = scrapy.Field()
18 | _op = scrapy.Field()
19 | _card_detail= scrapy.Field()
20 |
--------------------------------------------------------------------------------
/fraud/fraud/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # from scrapy.item import Item, Field
3 | import scrapy
4 |
5 | class FraudItem(scrapy.Item):
6 | executed_name = scrapy.Field()
7 | gender = scrapy.Field()
8 | age = scrapy.Field()
9 | identity_number = scrapy.Field()
10 | court = scrapy.Field()
11 | province = scrapy.Field()
12 | case_number = scrapy.Field()
13 | performance = scrapy.Field()  # performance status of the judgment debtor
14 | disrupt_type_name = scrapy.Field()  # specific circumstances of the dishonest debtor's conduct
15 | duty = scrapy.Field()  # obligations determined by the effective legal instrument
16 | release_time = scrapy.Field()
17 |
--------------------------------------------------------------------------------
/sz_yaohao/sandbox/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item,Field
9 |
10 |
11 | class SpiderItem(Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | card=Field()
15 | accountLength = Field()
16 | cardName = Field()
17 | cardType = Field()
18 | mainAccount = Field()
19 | mainValue = Field()
20 | orgName = Field()
21 | crawltime = Field()
22 |
--------------------------------------------------------------------------------
/github_star/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | // Use IntelliSense to learn about possible attributes.
3 | // Hover to view descriptions of existing attributes.
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5 | "version": "0.2.0",
6 | "configurations": [
7 |
8 | {
9 | "name": "Python: Current File",
10 | "type": "python",
11 | "request": "launch",
12 | "program": "${file}",
13 | "console": "integratedTerminal",
14 | "args": ["rockyzsu"]
15 | }
16 | ]
17 | }
--------------------------------------------------------------------------------
/tencentjob/tencentjob/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | # import scrapy
9 | from scrapy import Field,Item
10 |
11 | class TencentjobItem(Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | title = Field()
15 | catalog = Field()
16 | workLocation = Field()
17 | recruitNumber = Field()
18 | duty = Field()
19 | Job_requirement= Field()
20 | url = Field()
21 | publishTime = Field()
22 |
--------------------------------------------------------------------------------
/tencentjob/tencentjob/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 | from collections import OrderedDict
9 | class TencentjobPipeline(object):
10 | def __init__(self):
11 | self.db = pymongo.MongoClient('localhost')
12 | self.collection = self.db['tencent']['job']
13 |
14 | def process_item(self, item, spider):
15 | self.collection.insert_one(OrderedDict(item))
16 | return item
17 |
--------------------------------------------------------------------------------
/m3u8_video/experience.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2019/12/2 9:17
4 | # @File : experience.py
5 | import requests
6 | url='https://jh0p4t0rh9rs9610ryc.exp.bcevod.com/mda-jjkxjt57fdsith87/mda-jjkxjt57fdsith87.m3u8.{}.ts'
7 | total = 253
8 | headers={'User-Agent':'Xiaomi'}
9 | data = 'data'
10 | for i in range(total+1):
11 | try:
12 | r = requests.get(url.format(i),headers=headers)
13 | except Exception as e:
14 | print(e)
15 | else:
16 | with open('data/{}.ts'.format(i),'wb') as f:
17 | f.write(r.content)
18 | print('done {}.ts'.format(i))
19 |
20 |
--------------------------------------------------------------------------------
/pornhub/README.md:
--------------------------------------------------------------------------------
1 |
2 | - ```Run in a Python environment```
3 | - ```git clone https://github.com/formateddd/Pornhub ```
4 | - ```cd Pornhub && pip install -r requirements.txt```
5 | - ```python crawler.py webm```
6 | - When the program finishes, two pages of webm thumbnails are downloaded into the webm folder; each file is named after the URL suffix of its detail page
7 | - Run ```python crawler.py mp4```; the downloaded MP4 files appear in the MP4 folder
8 |
9 | - ```Run in the browser```
10 |
11 | - [Install Tampermonkey](http://tampermonkey.net/)
12 | - Create a new script, copy and paste the [code](https://raw.githubusercontent.com/formateddd/pornhub/master/tampermonkey.js).
13 |
14 |
15 |
16 | ## Join the group to discuss and share the crawler project code:
17 |
18 | ## 759746505
19 |
20 |
--------------------------------------------------------------------------------
/sz_yaohao/sandbox/headers.txt:
--------------------------------------------------------------------------------
1 | Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
2 | Accept-Encoding: gzip, deflate, br
3 | Accept-Language: zh-CN,zh;q=0.9
4 | Cache-Control: no-cache
5 | Connection: keep-alive
6 | Content-Type: application/x-www-form-urlencoded
7 | Host: apply.jtys.sz.gov.cn
8 | Origin: http://xqctk.jtys.sz.gov.cn
9 | Pragma: no-cache
10 | Referer: http://xqctk.jtys.sz.gov.cn/?
11 | Upgrade-Insecure-Requests: 1
12 | User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36
--------------------------------------------------------------------------------
/csdn/getCSDN_Range.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | # Get your range of csdn
3 | __author__ = 'rocky'
4 | import requests
5 | import re
6 | import time
7 |
8 | link = 'http://blog.csdn.net/yagamil/article/details/52858314'
9 | user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
10 | header = {"User-Agent": user_agent}
11 | req = requests.get(link, headers=header)
12 | content =req.text
13 | p = re.search(r'', content).group(1)
14 | today = time.strftime("%Y-%m-%d")
15 | f = open(r"D:\OneDrive\Stock_Data\csdn_range.txt", 'a')
16 | contents = today + '\t' + p + '\n'
17 | f.write(contents)
18 | f.close()
19 |
--------------------------------------------------------------------------------
/cuiqingcai/async_sandbox/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class AsyncSandboxItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | title = scrapy.Field()
15 | article_url = scrapy.Field()
16 | content = scrapy.Field()
17 | created_at = scrapy.Field()
18 | category = scrapy.Field()
19 | visited = scrapy.Field()
20 | comment = scrapy.Field()
21 | liked = scrapy.Field()
22 | author = scrapy.Field()
23 | crawltime = scrapy.Field()
24 |
--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class AsyncSandboxItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | title = scrapy.Field()
15 | article_url = scrapy.Field()
16 | # content = scrapy.Field()
17 | created_at = scrapy.Field()
18 | category = scrapy.Field()
19 | visited = scrapy.Field()
20 | comment = scrapy.Field()
21 | liked = scrapy.Field()
22 | author = scrapy.Field()
23 | crawltime = scrapy.Field()
24 |
--------------------------------------------------------------------------------
/lanrentingshu/lrts/lrts/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | from scrapy.pipelines.files import FilesPipeline
9 | from urllib.parse import urlparse
10 | from os.path import basename,dirname,join
11 | class LrtsPipeline(object):
12 | def process_item(self, item, spider):
13 | return item
14 |
15 | class MyFilesPipeline(FilesPipeline):
16 |
17 | def file_path(self, request, response=None, info=None):
18 | path = urlparse(request.url).path
19 | return join(basename(dirname(path)),basename(path))
20 |
--------------------------------------------------------------------------------
/tiexue/sandbox/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item,Field
9 | import scrapy
10 |
11 | class SpiderItem(Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | title = scrapy.Field()
15 | article_url = scrapy.Field()
16 | content = scrapy.Field()
17 | created_at = scrapy.Field()
18 | # category = scrapy.Field()
19 | # visited = scrapy.Field()
20 | # comment = scrapy.Field()
21 | # liked = scrapy.Field()
22 | author = scrapy.Field()
23 | crawltime = scrapy.Field()
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *.pyc
3 | *.mp3
4 | cookies
5 | .idea
6 | *.pyc
7 | data.cfg
8 | *.mp3
9 | *.pkl
10 | *.xls
11 | *.xml
12 | *.csv
13 | *.pkl
14 | ~$d.xlsx
15 | d.xlsx
16 | data/
17 | temp
18 | request_header
19 | header_toolkit.txt
20 | *.xlsx
21 | *.log
22 | __pycache__/
23 | wikizhword.text
24 | news_tensite_xml.dat
25 | news_tensite_xml.smarty.dat
26 | *.jpg
27 | Download/
28 | Download_IMG/
29 | *.zip
30 | cookies
31 | config.json
32 | config.py
33 | data.cfg
34 | setting.py
35 | setttings.py
36 | *.ts
37 | kc0011/jobs/requests.queue/p1
38 | kc0011/jobs/requests.queue/p0
39 | kc0011/jobs/requests.queue/active.json
40 | kc0011/jobs/spider.state
41 | kc0011/jobs/requests.seen
42 | *.jpg
43 | *.png
44 | *.jpeg
45 | configure/
46 |
--------------------------------------------------------------------------------
/lanrentingshu/request_header:
--------------------------------------------------------------------------------
1 | Accept:*/*
2 | Accept-Encoding:gzip, deflate
3 | Accept-Language:zh-CN,zh;q=0.8
4 | Cache-Control:no-cache
5 | Connection:keep-alive
6 | Content-Length:0
7 | Cookie:aliyungf_tc=AQAAADCDiwwT/gEAv7APt2maQ56C3T1o; uid=15052187062975665e8ceaad34eb9911f2a90ee5b66ad; CNZZDATA1254668430=2046036592-1505217321-null%7C1505217321; Hm_lvt_ada61571fd48bb3f905f5fd1d6ef0ec4=1505218688; Hm_lpvt_ada61571fd48bb3f905f5fd1d6ef0ec4=1505219620; JSESSIONID=EE57EEBB708D1DF15621C6949A4FBE48
8 | Host:www.lrts.me
9 | Origin:http://www.lrts.me
10 | Pragma:no-cache
11 | Referer:http://www.lrts.me/book/32551
12 | User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36
13 | X-Requested-With:XMLHttpRequest
--------------------------------------------------------------------------------
/fraud/fraud/model/db_config.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import create_engine
2 | from sqlalchemy.orm import sessionmaker
3 | import redis
4 |
5 |
6 | engine = create_engine('mysql+pymysql://root:{}@localhost:3306/spider?charset=utf8')
7 | DBSession = sessionmaker(bind=engine)
8 |
9 |
10 | class RedisPool:
11 | def __init__(self, client_host="localhost", client_port=6379, client_db=0):
12 | self.client_host = client_host
13 | self.client_port = client_port
14 | self.client_db = client_db
15 |
16 | def redis_pool(self):
17 | pool = redis.ConnectionPool(
18 | host=self.client_host,
19 | port=self.client_port,
20 | db=self.client_db,
21 | decode_responses=True)
22 | return redis.StrictRedis(connection_pool=pool)
--------------------------------------------------------------------------------
/kc0011/sandbox/utility.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/12/13 13:47
4 | # @File : utility.py
5 |
6 | import os
7 |
8 | # Load request headers into a dict
9 |
10 | def get_header(header_file='headers.txt'):
11 | path = os.path.dirname(__file__)
12 | header_path = os.path.join(path,'headers',header_file)
13 | if not os.path.exists(header_path):
14 | return None
15 |
16 | with open(header_path) as fp:
17 | data = fp.readlines()
18 | dictionary = dict()
19 |
20 | for line in data:
21 | line = line.strip()
22 | line = line.replace(' ', '')
23 | dictionary[line.split(":")[0].strip()] = ':'.join(
24 | line.split(":")[1:])
25 |
26 | if 'Content-Length' in dictionary:
27 | del dictionary['Content-Length']
28 |
29 | return dictionary
--------------------------------------------------------------------------------
/myubbs/sandbox/utility.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/12/13 13:47
4 | # @File : utility.py
5 |
6 | import os
7 |
8 | # Load request headers into a dict
9 |
10 | def get_header(header_file='headers.txt'):
11 | path = os.path.dirname(__file__)
12 | header_path = os.path.join(path,'headers',header_file)
13 | if not os.path.exists(header_path):
14 | return None
15 |
16 | with open(header_path) as fp:
17 | data = fp.readlines()
18 | dictionary = dict()
19 |
20 | for line in data:
21 | line = line.strip()
22 | line = line.replace(' ', '')
23 | dictionary[line.split(":")[0].strip()] = ':'.join(
24 | line.split(":")[1:])
25 |
26 | if 'Content-Length' in dictionary:
27 | del dictionary['Content-Length']
28 |
29 | return dictionary
--------------------------------------------------------------------------------
/tiexue/sandbox/utility.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/12/13 13:47
4 | # @File : utility.py
5 |
6 | import os
7 |
8 | # Load request headers into a dict
9 |
10 | def get_header(header_file='headers.txt'):
11 | path = os.path.dirname(__file__)
12 | header_path = os.path.join(path,'headers',header_file)
13 | if not os.path.exists(header_path):
14 | return None
15 |
16 | with open(header_path) as fp:
17 | data = fp.readlines()
18 | dictionary = dict()
19 |
20 | for line in data:
21 | line = line.strip()
22 | line = line.replace(' ', '')
23 | dictionary[line.split(":")[0].strip()] = ':'.join(
24 | line.split(":")[1:])
25 |
26 | if 'Content-Length' in dictionary:
27 | del dictionary['Content-Length']
28 |
29 | return dictionary
--------------------------------------------------------------------------------
/chahaoba/sandbox/utility.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/12/13 13:47
4 | # @File : utility.py
5 |
6 | import os
7 |
8 | # Load request headers into a dict
9 |
10 | def get_header(header_file='headers.txt'):
11 | path = os.path.dirname(__file__)
12 | header_path = os.path.join(path,'headers',header_file)
13 | if not os.path.exists(header_path):
14 | return None
15 |
16 | with open(header_path) as fp:
17 | data = fp.readlines()
18 | dictionary = dict()
19 |
20 | for line in data:
21 | line = line.strip()
22 | line = line.replace(' ', '')
23 | dictionary[line.split(":")[0].strip()] = ':'.join(
24 | line.split(":")[1:])
25 |
26 | if 'Content-Length' in dictionary:
27 | del dictionary['Content-Length']
28 |
29 | return dictionary
--------------------------------------------------------------------------------
/sz_yaohao/sandbox/utility.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/12/13 13:47
4 | # @File : utility.py
5 |
6 | import os
7 |
8 | # Load request headers into a dict
9 |
10 | def get_header(header_file='headers.txt'):
11 | path = os.path.dirname(__file__)
12 | header_path = os.path.join(path,'headers',header_file)
13 | if not os.path.exists(header_path):
14 | return None
15 |
16 | with open(header_path) as fp:
17 | data = fp.readlines()
18 | dictionary = dict()
19 |
20 | for line in data:
21 | line = line.strip()
22 | line = line.replace(' ', '')
23 | dictionary[line.split(":")[0].strip()] = ':'.join(
24 | line.split(":")[1:])
25 |
26 | if 'Content-Length' in dictionary:
27 | del dictionary['Content-Length']
28 |
29 | return dictionary
--------------------------------------------------------------------------------
/cuiqingcai/async_sandbox/utility.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/12/13 13:47
4 | # @File : utility.py
5 |
6 | import os
7 |
8 | # Load request headers into a dict
9 |
10 | def get_header(header_file='headers.txt'):
11 | path = os.path.dirname(__file__)
12 | header_path = os.path.join(path,'headers',header_file)
13 | if not os.path.exists(header_path):
14 | return None
15 |
16 | with open(header_path) as fp:
17 | data = fp.readlines()
18 | dictionary = dict()
19 |
20 | for line in data:
21 | line = line.strip()
22 | line = line.replace(' ', '')
23 | dictionary[line.split(":")[0].strip()] = ':'.join(
24 | line.split(":")[1:])
25 |
26 | if 'Content-Length' in dictionary:
27 | del dictionary['Content-Length']
28 |
29 | return dictionary
--------------------------------------------------------------------------------
/tiexue/sandbox/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | # from sandbox.models import SpiderModels, DBSession
8 | import logging
9 | import pymongo
10 | from sandbox import config
11 | from sandbox import settings
12 |
13 |
14 |
15 | class MongoPipeline(object):
16 | def __init__(self):
17 | DOCUMENT = settings.MONGODB_DOC
18 | self.db = pymongo.MongoClient(settings.MONGO_HOST, port=settings.MONGO_PORT)
19 | self.doc = self.db['spider'][DOCUMENT]
20 |
21 | def process_item(self, item, spider):
22 | print('on process')
23 | insert_item = dict(item)
24 | self.doc.insert_one(insert_item)
25 |
26 | return item
27 |
--------------------------------------------------------------------------------
/poi_gaode/sandbox/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item, Field
9 |
10 |
11 | class SpiderItem(Item):
12 | # define the fields for your item here like:
13 |
14 | id = Field()
15 | parent = Field()
16 | name = Field()
17 | type = Field()
18 | typecode = Field()
19 | biz_type = Field()
20 | address = Field()
21 | location = Field()
22 | tel = Field()
23 | distance = Field()
24 | biz_ext = Field()
25 | pname = Field()
26 | cityname = Field()
27 | adname = Field()
28 | importance = Field()
29 | shopid = Field()
30 | shopinfo = Field()
31 | poiweight = Field()
32 | photos = Field()
33 | crawltime = Field()
34 |
--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/monitor/settings.py:
--------------------------------------------------------------------------------
1 | # *-* coding:utf-8 *-*
2 | '''
3 | @author: ioiogoo
4 | @date: 2016/12/26 11:48
5 | '''
6 |
7 | '''
8 | TIMEINTERVAL   refresh interval, in milliseconds
9 | POINTINTERVAL  interval between points on the chart; smaller means denser points
10 | POINTLENGTH    number of points on the chart; larger means a longer time span
11 | STATS_KEYS     stats keys displayed on the chart
12 | REDIS_HOST     redis address
13 | REDIS_PORT     redis port
14 | REDIS_DB       redis database, default 0
15 | APP_HOST       app host, default 127.0.0.1
16 | APP_PORT       app port, default 5000
17 | '''
18 |
19 | TIMEINTERVAL = 30000
20 | POINTINTERVAL = 30
21 | POINTLENGTH = 2000
22 | STATS_KEYS = ['downloader/request_count', 'downloader/response_count','downloader/response_status_count/200', 'item_scraped_count']
23 | REDIS_HOST = '10.18.6.46'
24 | REDIS_PORT = 6379
25 | REDIS_DB = 0
26 | APP_HOST = '127.0.0.1'
27 | APP_PORT = 5000
28 |
--------------------------------------------------------------------------------
/async_cuiqingcai/rabbit_send.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Date : 2019-08-30 17:25:46
4 | # @Author : Rocky Chen (weigesysu@qq.com)
5 | # @Link : http://30daydo.com
6 | # @Version : $Id$
7 |
8 | import pika
9 | # import settings
10 |
11 | credentials = pika.PlainCredentials('admin','admin')
12 | connection = pika.BlockingConnection(pika.ConnectionParameters('192.168.1.101',5672,'/',credentials))
13 |
14 | channel = connection.channel()
15 | channel.exchange_declare(exchange='direct_log',exchange_type='direct')  # 'fanout' would broadcast to all bound queues
16 |
17 | routing_key = 'info'
18 | message='https://36kr.com/pp/api/aggregation-entity?type=web_latest_article&b_id=59499&per_page=30'
19 | channel.basic_publish(
20 | exchange='direct_log',
21 | routing_key=routing_key,
22 | body=message
23 | )
24 |
25 | print('sending message {}'.format(message))
26 | connection.close()
27 |
--------------------------------------------------------------------------------
/poi_gaode/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "def get_max():\n",
10 | " with open('sz_poi.txt','r') as f:\n",
11 | " # js = json.load(f)\n",
12 | " while 1:\n",
13 | " "
14 | ]
15 | }
16 | ],
17 | "metadata": {
18 | "kernelspec": {
19 | "display_name": "Python 3",
20 | "language": "python",
21 | "name": "python3"
22 | },
23 | "language_info": {
24 | "codemirror_mode": {
25 | "name": "ipython",
26 | "version": 3
27 | },
28 | "file_extension": ".py",
29 | "mimetype": "text/x-python",
30 | "name": "python",
31 | "nbconvert_exporter": "python",
32 | "pygments_lexer": "ipython3",
33 | "version": "3.6.2"
34 | }
35 | },
36 | "nbformat": 4,
37 | "nbformat_minor": 2
38 | }
39 |
--------------------------------------------------------------------------------
/pornhub/tampermonkey.js:
--------------------------------------------------------------------------------
1 | // ==UserScript==
2 | // @name New Userscript
3 | // @namespace http://tampermonkey.net/
4 | // @version 0.1
5 | // @description try to take over the world!
6 | // @author github.com/formateddd/pornhub
7 | // @include *.pornhub.com/view_video.php?viewkey=*
8 | // @grant none
9 | // ==/UserScript==
10 |
11 |
12 |
13 | (function() {
14 | 'use strict';
15 |
16 | // Your code here...
17 |
18 |
19 | var qualites = [
20 | "quality_1080p",
21 | "quality_720p",
22 | "quality_480p",
23 | "quality_240p",
24 | ];
25 |
26 | for (var i in qualites) {
27 | if (window[qualites[i]]){
28 | document.querySelector("h1").innerHTML += '' + qualites[i] + ''
29 | console.info(window[qualites[i]]);
30 | break
31 | }
32 | }
33 |
34 |
35 | })();
36 |
--------------------------------------------------------------------------------
/jd/jd/spiders/quotes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy_splash import SplashRequest
4 |
5 |
6 | class QuotesSpider(scrapy.Spider):
7 | name = "quotes"
8 | allowed_domains = ["quotes.toscrape.com"]
9 | start_urls = ['http://quotes.toscrape.com/js/']
10 |
11 | def start_requests(self):
12 | for url in self.start_urls:
13 | yield SplashRequest(url, args={'images': 0, 'timeout': 3})
14 |
15 | def parse(self, response):
16 | for sel in response.css('div.quote'):
17 | quote = sel.css('span.text::text').extract_first()
18 | author = sel.css('small.author::text').extract_first()
19 | yield {'quote': quote, 'author': author}
20 | href = response.css('li.next > a::attr(href)').extract_first()
21 | if href:
22 | url = response.urljoin(href)
23 | yield SplashRequest(url, args={'images': 0, 'timeout': 3})
--------------------------------------------------------------------------------
/kc0011/sandbox/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item,Field
9 | import scrapy
10 |
11 | class SpiderItem(Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | nick_name = scrapy.Field()
15 | level = scrapy.Field()
16 | credit = scrapy.Field()
17 | score_count = scrapy.Field()
18 | tie_count = scrapy.Field()
19 | jifeng = scrapy.Field()
20 | register = scrapy.Field()
21 | alipay=scrapy.Field()
22 | email=scrapy.Field()
23 | person_info_html = scrapy.Field()
24 | crawltime = scrapy.Field()
25 |
26 | class ContentItem(Item):
27 | url = scrapy.Field()
28 | publishTime = scrapy.Field()
29 | author = scrapy.Field()
30 | content = scrapy.Field()
31 | crawltime=scrapy.Field()
32 |
33 |
--------------------------------------------------------------------------------
/bilibili/bilibili/logger.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | import logging
3 | import datetime
4 | import os
5 | # from setting import llogger
6 | def llogger(filename):
7 |
8 | logger = logging.getLogger(filename)  # without a name, this would configure the root logger
9 | logger.setLevel(logging.DEBUG)
10 | formatter = logging.Formatter(
11 | '%(asctime)s - %(name)s - %(levelname)s: - %(message)s',
12 | datefmt='%Y-%m-%d %H:%M:%S')
13 | # FileHandler: write to a file
14 | prefix = os.path.splitext(filename)[0]
15 | fh = logging.FileHandler(prefix+'.log')
16 | fh.setLevel(logging.DEBUG)
17 | fh.setFormatter(formatter)
18 | # StreamHandler: write to the console
19 | ch = logging.StreamHandler()
20 | ch.setLevel(logging.DEBUG)
21 | ch.setFormatter(formatter)
22 | # attach both handlers
23 | logger.addHandler(ch)
24 | logger.addHandler(fh)
25 | # logger.info('this is info message')
26 | # logger.warning('this is warn message')
27 | return logger
28 |
29 |
--------------------------------------------------------------------------------
/async_cuiqingcai/multi_spider_run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Date : 2019-08-30 16:20:47
4 | # @Author : Rocky Chen (weigesysu@qq.com)
5 | # @Link : http://30daydo.com
6 | # @Version : $Id$
7 |
8 | from crochet import setup
9 | from importlib import import_module
10 | from scrapy.crawler import CrawlerRunner
11 | from scrapy.utils.project import get_project_settings
12 | setup()
13 |
14 | # NOTE: this approach does not work yet
15 | def run_spider(spiderName):
16 | module_name="async_sandbox.spiders.{}".format(spiderName)
17 | scrapy_var = import_module(module_name) #do some dynamic import of selected spider
18 | print(scrapy_var)
19 | print(dir(scrapy_var))
20 | spiderObj=scrapy_var.ExampleSpider #get mySpider-object from spider module
21 | print(spiderObj)
22 |
23 | crawler = CrawlerRunner(get_project_settings()) #from Scrapy docs
24 | crawler.crawl(spiderObj)
25 | print('start')
26 |
27 | run_spider('example')
--------------------------------------------------------------------------------
/bilibili/bilibili/spiders/bili.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy_splash import SplashRequest
4 | import logging
5 | # from bilibili.logger import llogger
6 | # from scrapy import log
7 | # loggers = llogger(__file__)
8 |
9 | class BiliSpider(scrapy.Spider):
10 | name = 'ordinary' # this is the spider name used to launch the crawl (from the link mentioned above)
11 | allowed_domain = ["bilibili.com"]
12 | start_urls = [
13 | "https://www.bilibili.com/"
14 | ]
15 |
16 | def start_requests(self):
17 | splash_args = {
18 | 'wait': '5',
19 | }
20 | for url in self.start_urls:
21 | yield SplashRequest(url, self.parse_result, args=splash_args, endpoint='render.html')
22 |
23 | def parse_result(self, response):
24 | logging.info('====================================================')
25 | content = response.xpath("//div[@class='num-wrap']").extract_first()
26 | logging.info(content)
27 | logging.info('====================================================')
28 |
29 |
--------------------------------------------------------------------------------
/fraud/fraud/model/fraud.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sqlalchemy import Column, String , DateTime, Integer, Text
3 | from sqlalchemy.ext.declarative import declarative_base
4 | from fraud.model import db_config
5 | import datetime
6 |
7 | Base = declarative_base()
8 |
9 | class Fraud(Base):
10 | __tablename__ = 'tb_frauds2'
11 |
12 | id = Column(Integer, primary_key=True)
13 | executed_name = Column(String(300))
14 | gender = Column(String(10))
15 | age = Column(String(10))
16 | identity_number = Column(String(50))
17 | court = Column(String(200))
18 | province = Column(String(50))
19 | case_number = Column(String(100))
20 | performance = Column(String(100))  # performance status of the judgment debtor
21 | disrupt_type_name = Column(Text)  # specific circumstances of the dishonest debtor's conduct
22 | duty = Column(Text)  # obligations determined by the effective legal instrument
23 | release_time = Column(String(50))
24 | crawl_time = Column(DateTime, default=datetime.datetime.now())
25 | data_resource = Column(String(50), default='baidu_api')
26 |
27 | Base.metadata.create_all(db_config.engine)
28 |
--------------------------------------------------------------------------------
/sz_yaohao/sandbox/models.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/9/26 9:25
4 | # @File : models.py
5 |
6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE
7 | from sqlalchemy.ext.declarative import declarative_base
8 | import datetime
9 | from sqlalchemy.orm import sessionmaker
10 | from sandbox import config
11 |
12 | Base = declarative_base()
13 | engine = create_engine('mysql+pymysql://{}:{}@{}:3367/spider?charset=utf8'.format(config.username,config.password,config.mysql_ip))
14 | DBSession = sessionmaker(bind=engine)
15 |
16 | TABLE_NAME = ''
17 |
18 | # ORM model; adjust to the project's needs
19 | class SpiderModels(Base):
20 | __tablename__ = TABLE_NAME
21 |
22 | # adjust the fields per project
23 | id = Column(Integer, primary_key=True, autoincrement=True)
24 | card=Column(Text, comment='卡号')
25 | accountLength = Column(Text, comment='长度')
26 | origin = Column(String(30), comment='来源')
27 | crawltime = Column(DateTime, default=datetime.datetime.now(), comment='抓取时间')
28 |
29 |
30 | Base.metadata.create_all(engine)
--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/commands/crawlall.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Date : 2019-08-29 16:56:28
4 | # @Author : Rocky Chen (weigesysu@qq.com)
5 | # @Link : http://30daydo.com
6 | # @Version : $1.0$
7 |
8 |
9 | from scrapy.commands import ScrapyCommand
10 | from scrapy.crawler import CrawlerProcess
11 | class Command(ScrapyCommand):
12 |
13 | requires_project = True
14 |
15 | def syntax(self):
16 | return '[options]'
17 |
18 | def short_desc(self):
19 | return 'Runs all of the spiders - My Defined'
20 |
21 | def run(self,args,opts):
22 | print('==================')
23 | print(type(self.crawler_process))
24 | spider_list = self.crawler_process.spiders.list()
25 | # you could hard-code spider_list = ['example', 'chouti'] here
26 | for name in spider_list:
27 | print('=================')
28 | print(name)
29 | self.crawler_process.crawl(name,**opts.__dict__)
30 |
31 | self.crawler_process.start()
32 |
33 |
34 |
--------------------------------------------------------------------------------
/MyLibrary/login.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import hashlib
3 |
4 | def login_session(username,password):
5 | s = bytes(password, encoding='utf8')
6 | m = hashlib.md5()
7 | m.update(s)
8 | first_md5 = m.hexdigest()
9 | headers = {'Referer': 'https://www.szlib.org.cn/MyLibrary/Reader-Access.jsp?infomistake=0&eventsite=WWW-044005',
10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
11 | 'X-Requested-With': 'XMLHttpRequest'}
12 |
13 | url = 'https://www.szlib.org.cn/MyLibrary/readerLoginM.jsp'
14 | data = {'rand': '',
15 | 'username': username,
16 | 'password': first_md5,
17 |
18 | }
19 | session=None
20 | session = requests.Session()
21 |
22 | r = session.post(url=url, headers=headers, data=data, timeout=15)
23 | print(r.text)
24 | if 'OK' in r.text:
25 | print('Crash !!!')
26 | print(username)
27 | print(password)
28 |
29 | return session
30 |
--------------------------------------------------------------------------------
/myubbs/sandbox/models.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/9/26 9:25
4 | # @File : models.py
5 |
6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE
7 | from sqlalchemy.ext.declarative import declarative_base
8 | import datetime
9 | from sqlalchemy.orm import sessionmaker
10 | from sandbox import config
11 |
12 | Base = declarative_base()
13 | engine = create_engine('mysql+pymysql://{}:{}@{}:3306/db_rocky?charset=utf8'.format(config.username,config.password,config.mysql_ip))
14 | DBSession = sessionmaker(bind=engine)
15 |
16 | TABLE_NAME = 'tb_myubbs'
17 |
18 | # ORM 模型,根据项目需求修改
19 | class SpiderModels(Base):
20 | __tablename__ = TABLE_NAME
21 |
22 |
23 | # 根据项目修改字段
24 | id = Column(Integer, primary_key=True, autoincrement=True)
25 | title = Column(String(400))
26 | pubdate = Column(DateTime)
27 | content = Column(Text)
28 | author = Column(String(100))
29 | url = Column(String(200))
30 |     crawltime = Column(DateTime, default=datetime.datetime.now, comment='抓取时间')  # pass the callable, not now(), so each row gets its own timestamp
31 |
32 |
33 | Base.metadata.create_all(engine)
--------------------------------------------------------------------------------
/myubbs/sandbox/headers:
--------------------------------------------------------------------------------
1 | Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
2 | Accept-Encoding: gzip, deflate
3 | Accept-Language: zh-CN,zh;q=0.9
4 | Cache-Control: no-cache
5 | Connection: keep-alive
6 | Cookie: MKG1_2132_saltkey=LnNTIT1F; MKG1_2132_lastvisit=1555164586; UM_distinctid=16a173f60854de-0f1c29779936eb-39395704-144000-16a173f60863cb; CNZZDATA3065925=cnzz_eid%3D1943346629-1555168187-http%253A%252F%252Fwww.myzsu.com%252F%26ntime%3D1555168187; MKG1_2132_seccode=103.e48171c76ce30999a4; MKG1_2132_visitedfid=97; MKG1_2132_st_p=0%7C1555169196%7C31ebb51b6faa73e0deaa417d1878522f; MKG1_2132_viewid=tid_140374; MKG1_2132_st_t=0%7C1555169280%7C31d0f95d5b85fe7f3c5028e0928583bb; MKG1_2132_forum_lastvisit=D_97_1555169280; MKG1_2132_lastact=1555169281%09home.php%09misc; MKG1_2132_sendmail=1
7 | Host: zsu.myubbs.com
8 | Pragma: no-cache
9 | Referer: http://zsu.myubbs.com/forum-97-1.html
10 | Upgrade-Insecure-Requests: 1
11 | User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/spiders/crawl_all_example.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | import time
3 | class QuotesSpider(scrapy.Spider):
4 | name = "quotes"
5 | start_urls = ['http://quotes.toscrape.com/tag/humor/']
6 |
7 |
8 | def parse(self, response):
9 | time.sleep(15)
10 | print(f'in spider {self.name}')
11 | for quote in response.css('div.quote'):
12 | print(quote.css('span.text::text').extract_first())
13 |
14 | def close(self,reason):
15 | print('===================== spider close ================')
16 |
17 | class QuotesSpider1(scrapy.Spider):
18 | name = "quotes_1"
19 | start_urls = ['http://quotes.toscrape.com/tag/humor/']
20 |
21 | def parse(self, response):
22 | print('meta content ==============')
23 | print(response.meta)
24 | print('meta content ==============')
25 |
26 | print(f'in spider {self.name}')
27 | for quote in response.css('div.quote'):
28 | print(quote.css('span.text::text').extract_first())
29 |
30 | def close(self,reason):
31 | print('===================== spider close ================')
32 |
--------------------------------------------------------------------------------
/chahaoba/sandbox/models.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/9/26 9:25
4 | # @File : models.py
5 |
6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE
7 | from sqlalchemy.ext.declarative import declarative_base
8 | import datetime
9 | from sqlalchemy.orm import sessionmaker
10 | from sandbox import config
11 |
12 |
13 | Base = declarative_base()
14 | engine = create_engine(
15 | 'mysql+pymysql://{}:{}@{}:3367/spider?charset=utf8'.format(config.username, config.password, config.mysql_ip))
16 | DBSession = sessionmaker(bind=engine)
17 |
18 | TABLE_NAME = 'chahaoba'
19 | #
20 | #
21 | # # ORM 模型,根据项目需求修改
22 | class SpiderModels(Base):
23 | __tablename__ = TABLE_NAME
24 |
25 | # 根据项目修改字段
26 | id = Column(Integer, primary_key=True, autoincrement=True)
27 |
28 | number = Column(String(11), comment='手机号段')
29 | city = Column(String(10), comment='城市')
30 | province = Column(String(10), comment='省份')
31 | card_type = Column(String(10), comment='手机卡类型')
32 | op = Column(String(10), comment='运营商')
33 | card_detail = Column(String(80), comment='卡详细')
34 |
35 |
36 | Base.metadata.create_all(engine)
37 |
--------------------------------------------------------------------------------
/kc0011/sandbox/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | # from sandbox.models import SpiderModels, DBSession
8 |
9 | import logging
10 | import pymongo
11 | from sandbox import settings
12 | from sandbox.items import SpiderItem
13 |
14 | class MongoPipeline(object):
15 | def __init__(self):
16 | self.db = pymongo.MongoClient(settings.MONGO_HOST, port=settings.MONGO_PORT)
17 | self.doc1 = self.db[settings.MONGODB_DB][settings.MONGODB_DOC]
18 | self.doc2 = self.db[settings.MONGODB_DB][settings.MONGODB_DOC2]
19 | try:
20 | self.doc2.ensure_index('url',unique=True)
21 | except Exception as e:
22 | print(e)
23 |
24 | def process_item(self, item, spider):
25 | if isinstance(item,SpiderItem):
26 |
27 | insert_item = dict(item)
28 | self.doc1.insert(insert_item)
29 |
30 | else:
31 |
32 | insert_item = dict(item)
33 | self.doc2.insert(insert_item)
34 |
35 | return item
36 |
--------------------------------------------------------------------------------
/chahaoba/sandbox/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from sandbox.models import SpiderModels, DBSession
8 | import logging
9 | import pymongo
10 | import pymysql
11 | from sandbox import config
12 | from sandbox import settings
13 | from scrapy.exceptions import DropItem
14 |
15 | class SQLPipeline(object):
16 | def __init__(self):
17 | self.session = DBSession()
18 |
19 | def process_item(self, item, spider):
20 |
21 | obj = SpiderModels(
22 | number=item['_number'],
23 | city = item['_city'],
24 | province = item['_province'],
25 | card_type = item['_card_type'],
26 | op = item['_op'],
27 | card_detail = item['_card_detail'],
28 | )
29 | self.session.add(obj)
30 |
31 | try:
32 | self.session.commit()
33 |
34 | except Exception as e:
35 | print(e)
36 | logging.error('>>>> 重复数据')
37 | self.session.rollback()
38 |             raise DropItem(item)  # DropItem must be raised, not just instantiated, to actually drop the item
39 | else:
40 | return item
41 |
42 |
--------------------------------------------------------------------------------
/Forbes/main.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | __author__ = 'Rocky'
4 | '''
5 | http://30daydo.com
6 | Email: weigesysu@qq.com
7 | '''
8 | import requests
9 | from lxml import etree
10 | import pymongo
11 |
12 | db = pymongo.MongoClient('127.0.0.1')
13 | collection = db['forbes']['2017']
14 |
15 | def getContent(url, retry =5):
16 | headers = {'User-Agent':'User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}
17 | for _ in range(retry):
18 | try:
19 | r = requests.get(url,headers=headers,timeout=20)
20 | if r:
21 | return r
22 |         except Exception as e:
23 |             print(e)
24 | continue
25 | return None
26 |
27 | def getItem():
28 | colums = ['number','name','money','enterprise','living']
29 | r = getContent('http://www.forbeschina.com/review/list/002399.shtml')
30 | # print r.text
31 | tree = etree.HTML(r.text)
32 | items = tree.xpath('//tbody/tr')
33 | for item in items:
34 | d = dict(zip(colums,item.xpath('.//td/text()')))
35 |         print(d)
36 | collection.insert(d)
37 |
38 | def main():
39 | getItem()
40 |
41 | if __name__ == '__main__':
42 | main()
--------------------------------------------------------------------------------
/bbssmth/bbssmth/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import logging
8 |
9 | from elasticsearch import Elasticsearch
10 | from bbssmth.settings import ES_HOST
11 |
12 |
13 | class BbssmthPipeline(object):
14 | def __init__(self):
15 | self.index = 'newsmth'
16 | self.doc = 'doc'
17 | self.es = Elasticsearch(ES_HOST)
18 |
19 | def process_item(self, item, spider):
20 | body = {
21 | 'title': item.get('title'),
22 | 'url': item.get('url'),
23 | 'content': item.get('content'),
24 | 'author': item.get('author'),
25 | 'crawltime': item.get('crawltime'),
26 | 'reply': item.get('reply'),
27 | 'category': item.get('category'),
28 | 'create_time':item.get('create_time'),
29 |
30 | }
31 |
32 | try:
33 | self.es.index(index=self.index, doc_type=self.doc, body=body)
34 | except Exception as e:
35 |             logging.error('错误 >>>>>')
36 |             logging.error(e)
37 | return item
38 |
--------------------------------------------------------------------------------
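A quick way to sanity-check that BbssmthPipeline above is actually writing documents is to query the `newsmth` index directly. A minimal sketch, assuming the same ES_HOST as in bbssmth/settings.py and an elasticsearch-py version that still accepts the `body` argument:

    from elasticsearch import Elasticsearch
    from bbssmth.settings import ES_HOST

    es = Elasticsearch(ES_HOST)
    print(es.count(index='newsmth'))                                                  # document count
    print(es.search(index='newsmth', body={'query': {'match_all': {}}, 'size': 1}))   # one sample doc
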
/52sh/config_file.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # website: http://30daydo.com
3 | # @Time : 2020/9/24 12:12
4 | # @File : config_file.py
5 |
6 | START_URL = 'http://www.52sh.com.tw/index.php/main/knowledge/65/page/{page}'
7 | HEADERS = {
8 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
9 | "Accept-Encoding": "gzip, deflate",
10 | "Accept-Language": "zh,en;q=0.9,en-US;q=0.8,zh-CN;q=0.7",
11 | "Cache-Control": "no-cache",
12 | "Cookie": "PHPSESSID=a3oqieou2ik4a987ksq2bm3354; _ga=GA1.3.1399498082.1600914935; _gid=GA1.3.1565426161.1600914935",
13 | "Host": "www.52sh.com.tw",
14 | "Pragma": "no-cache",
15 | "Proxy-Connection": "keep-alive",
16 | "Referer": "http://www.52sh.com.tw/index.php/main/knowledge/65/page/105",
17 | "Upgrade-Insecure-Requests": "1",
18 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
19 | }
20 | PROXY = {'http': 'http://127.0.0.1:58083'}
21 | PROXY_STR = 'http://127.0.0.1:58083'
22 | SIMPLE_HEADERS = {
23 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
24 | }
--------------------------------------------------------------------------------
/51CTOCrawler/demo.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import subprocess
3 | def demo_validate():
4 | url='http://v22.51cto.com/2018/12/19/338483/e899/high/loco_video_323000_{}.ts'
5 | for i in range(112):
6 | r=requests.get(url.format(i))
7 | with open('loco_video_323000_{}.ts'.format(i),'wb') as f:
8 | f.write(r.content)
9 |
10 | def write_confile(ts_len):
11 | txt = ''
12 | for i in range(ts_len):
13 | txt += "file 'C:\\git\\CrawlMan\\51CTOCrawler\\loco_video_323000_{}.ts'\n".format(i)
14 | with open('confile.txt', 'w') as fout:
15 | fout.write(txt)
16 |
17 | def merge_ts_video(title, v_type='.mp4'):
18 | cmd = 'ffmpeg -f concat -safe 0 -i confile.txt -c copy %s%s' %(title, v_type)
19 | print(cmd)
20 | p = subprocess.Popen(cmd, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
21 | out, err = p.communicate()
22 | print(str(out, 'utf-8'))
23 | print(str(err, 'utf-8'))
24 |
25 | def run_cmd():
26 | import os
27 | name = 'loco_video_323000_{}.ts'
28 | args = '+'.join([name.format(i) for i in range(112)])
29 | cmd = 'copy /b '+args + ' test.ts'
30 | print(cmd)
31 | os.system(cmd)
32 |
33 | # demo_validate()
34 | write_confile(112)
35 | merge_ts_video('wanttoplay')
36 | #run_cmd()
--------------------------------------------------------------------------------
/poi_gaode/sandbox/models.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/9/26 9:25
4 | # @File : models.py
5 |
6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE
7 | from sqlalchemy.ext.declarative import declarative_base
8 | import datetime
9 | from sqlalchemy.orm import sessionmaker
10 | from sandbox import config
11 |
12 | Base = declarative_base()
13 | engine = create_engine('mysql+pymysql://{}:{}@{}:3367/spider?charset=utf8'.format(config.username,config.password,config.mysql_ip))
14 | DBSession = sessionmaker(bind=engine)
15 |
16 | TABLE_NAME = 'card_bin_scrapy'
17 |
18 | # ORM 模型,根据项目需求修改
19 | class SpiderModels(Base):
20 | __tablename__ = TABLE_NAME
21 |
22 |
23 | # 根据项目修改字段
24 | id = Column(Integer, primary_key=True, autoincrement=True)
25 | card=Column(Text, comment='卡号')
26 | accountLength = Column(Text, comment='长度')
27 | cardName = Column(Text, comment='卡名')
28 | cardType = Column(Text, comment='卡类型')
29 | mainAccount = Column(Text, comment='主账号')
30 | mainValue = Column(Text, comment='主账号值')
31 | orgName = Column(Text, comment='发卡行')
32 |
33 | origin = Column(String(30), comment='来源')
34 |     crawltime = Column(DateTime, default=datetime.datetime.now, comment='抓取时间')  # pass the callable, not now(), so each row gets its own timestamp
35 |
36 |
37 | Base.metadata.create_all(engine)
--------------------------------------------------------------------------------
/fraud/fraud/match.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division
3 | from model.fraud import Fraud
4 | from model.db_config import DBSession, RedisPool
5 | import sys
6 |
7 | reload(sys)
8 | sys.setdefaultencoding('utf8')
9 | f = open("id_name.txt")
10 | line = f.readline()
11 | total_num, match_num, name_match_num = [0, 0, 0]
12 |
13 | session = DBSession()
14 | r_pool = RedisPool(client_db=1)
15 | r = r_pool.redis_pool()
16 | while line:
17 | id_num = line[0:18]
18 | formatted_id_num = id_num[0:11] + '*' * 4 + id_num[14:]
19 |     fraud_info = None  # reset each loop so the check below is safe if the query raises
20 | name = line[19:-1].strip()
21 | try:
22 | fraud_info = session.query(Fraud).filter_by(identity_number=formatted_id_num).first()
23 |     except Exception:
24 | session.rollback()
25 | if fraud_info:
26 | match_num += 1
27 | if name.encode('gb2312') == fraud_info.executed_name.encode('gb2312'):
28 | name_match_num += 1
29 | else:
30 | r.set(fraud_info.identity_number, 1)
31 | total_num += 1
32 | line = f.readline()
33 |
34 | f.close()
35 | session.close()
36 | print('样本总量:%s' % total_num)
37 | print('匹配成功数量:%s' % match_num)
38 | print('匹配率:%s' % ((match_num/total_num) * 100), '%')
39 | print('姓名身份证号匹配成功个数:%s' % name_match_num)
40 | print('姓名身份证号匹配率:%s' % ((name_match_num/match_num) * 100), '%')
41 |
42 |
--------------------------------------------------------------------------------
/MyLibrary/sandbox/sandbox/models.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/9/26 9:25
4 | # @File : models.py
5 |
6 | from sqlalchemy import Column, String, DateTime, Integer, Text, create_engine, DATE
7 | from sqlalchemy.ext.declarative import declarative_base
8 | import datetime
9 | from sqlalchemy.orm import sessionmaker
10 | from sandbox import config
11 |
12 | # Base = declarative_base()
13 | # engine = create_engine('mysql+pymysql://{}:{}@{}:3367/spider?charset=utf8'.format(config.username,config.password,config.mysql_ip))
14 | # DBSession = sessionmaker(bind=engine)
15 | #
16 | # TABLE_NAME = 'card_bin_scrapy'
17 | #
18 | # # ORM 模型,根据项目需求修改
19 | # class SpiderModels(Base):
20 | # __tablename__ = TABLE_NAME
21 | #
22 | #
23 | # # 根据项目修改字段
24 | # id = Column(Integer, primary_key=True, autoincrement=True)
25 | # card=Column(Text, comment='卡号')
26 | # accountLength = Column(Text, comment='长度')
27 | # cardName = Column(Text, comment='卡名')
28 | # cardType = Column(Text, comment='卡类型')
29 | # mainAccount = Column(Text, comment='主账号')
30 | # mainValue = Column(Text, comment='主账号值')
31 | # orgName = Column(Text, comment='发卡行')
32 | # # origin = Column(String(30), comment='来源')
33 | # crawtime = Column(DateTime, default=datetime.datetime.now(), comment='抓取时间')
34 | #
35 | #
36 | # Base.metadata.create_all(engine)
--------------------------------------------------------------------------------
/github_star/star.py:
--------------------------------------------------------------------------------
1 | import sys, json, os, requests
2 |
3 | if len(sys.argv) < 2 or len(sys.argv[1]) == 0:
4 |     print('Check your GitHub ID ...\n demo :\n python star.py rockyzsu')
5 | exit()
6 |
7 | print('Search...')
8 | github_id = sys.argv[1]
9 | url = 'https://api.github.com/users/{github_id}/repos?page={page_id}'
10 | repo_list = []
11 | page_id = 1
12 | while True:
13 | r = requests.get(url.format(github_id=github_id, page_id=page_id))
14 | if r.status_code != 200:
15 | print('check your network connections')
16 | exit()
17 |
18 | repo_array = json.loads(r.content.decode('utf-8'))
19 | if len(repo_array) == 0:
20 | break
21 |
22 | for repo in repo_array:
23 | if not repo['fork']:
24 | repo_list.append([repo['name'], repo['stargazers_count'], repo['forks_count'],'' if repo['description'] is None else repo['description']])
25 | page_id += 1
26 |
27 | # sort by number of stars
28 | repo_list = sorted(repo_list, key=lambda x: x[1], reverse=True)
29 |
30 | print('=' * 55)
31 | print('\n'.join(['{: <30}★{: <10}\tfork {:<10}\t{:<30} '.format(*repo) for repo in repo_list]))
32 | print('=' * 55)
33 | print('{: <30}★{: <10}\tfork {} '.format('total', sum([i[1] for i in repo_list]), sum([i[2] for i in repo_list])))
34 | print('='*55)
35 | print('{:<30}\t{:<30}'.format('total_repo_count',len(repo_list)))
36 |
--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/monitor/app.py:
--------------------------------------------------------------------------------
1 | # *-* coding:utf-8 *-*
2 | '''
3 | @author: ioiogoo
4 | @date: 2016/12/25 15:00
5 | '''
6 | import json
7 | from flask import Flask, render_template, jsonify, request, current_app
8 | import redis
9 | from settings import *
10 |
11 | app = Flask(__name__)
12 |
13 |
14 | @app.route('/')
15 | def index():
16 | return render_template('index.html', timeinterval=TIMEINTERVAL, stats_keys=STATS_KEYS)
17 |
18 |
19 | @app.route('/ajax')
20 | def ajax():
21 | key = request.args.get('key')
22 | result = current_app.r.lrange(key, -POINTLENGTH, -1)[::POINTINTERVAL]
23 | if not current_app.spider_is_run:
24 | # spider is closed
25 | return json.dumps(result), 404
26 | return json.dumps(result)
27 |
28 |
29 | @app.route('/signal')
30 | def signal():
31 | signal = request.args.get('sign')
32 | if signal == 'closed':
33 | current_app.spider_is_run = False
34 | elif signal == 'running':
35 | current_app.spider_is_run = True
36 | return jsonify('')
37 |
38 |
39 | @app.before_first_request
40 | def init():
41 | current_app.r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB,decode_responses=True)
42 | current_app.spider_is_run = True if current_app.r.get('spider_is_run') == '1' else False
43 |
44 |
45 | if __name__ == '__main__':
46 | app.run(debug=True, host=APP_HOST, port=APP_PORT)
47 |
--------------------------------------------------------------------------------
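The monitor app above does `from settings import *`, so it expects a settings.py next to it. A sketch of that file; only the names are taken from the imports used by app.py and statscol.py in the same monitor package, every value is a placeholder:

    # settings.py for the monitor (sketch; all values are placeholders)
    REDIS_HOST = '127.0.0.1'
    REDIS_PORT = 6379
    REDIS_DB = 0
    STATS_KEYS = ['item_scraped_count', 'response_received_count']  # scrapy stats keys to chart
    TIMEINTERVAL = 10000   # refresh interval passed to the template (assumed to be milliseconds)
    POINTLENGTH = 300      # how many recent points to read from each redis list
    POINTINTERVAL = 1      # sample every Nth point
    APP_HOST = '0.0.0.0'
    APP_PORT = 5000
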
/cuiqingcai/async_sandbox/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymysql
8 | from twisted.enterprise import adbapi
9 | import logging
10 | class AsyncSQLPipeline(object):
11 | def __init__(self):
12 | self.dbpool = adbapi.ConnectionPool('pymysql',host='',port='',user='',password='',db='spider')
13 | # self.cursor = self.conn.cursor()
14 |
15 | def process_item(self, item, spider):
16 | update_=self.dbpool.runInteraction(self.update,item)
17 | update_.addErrback(self.handle_error,item,spider)
18 |
19 | return item
20 |
21 | def update(self,cursor,item):
22 | insert_sql = 'insert into tb_cuiqingcai (category,title,article_url,content,author,created_at,liked,visited,comment,crawltime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
23 | data=(item['category'],item['title'],item['article_url'],item['content'],item['author'],item['created_at'],item['liked'],item['visited'],item['comment'],item['crawltime']
24 | )
25 | cursor.execute(insert_sql,data)
26 |
27 | def handle_error(self,failure,item,spider):
28 | logging.error('写入数据库异常--->')
29 | logging.error(failure)
30 | logging.error('error item')
31 | logging.error(item)
--------------------------------------------------------------------------------
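The AsyncSQLPipeline above only takes effect once it is enabled in the Scrapy settings, and the adbapi pool needs real connection parameters (in particular, `port` must be an int rather than the empty-string placeholder). A minimal sketch, assuming the project package is `async_sandbox`:

    # settings.py (sketch): the priority number 300 is arbitrary
    ITEM_PIPELINES = {
        'async_sandbox.pipelines.AsyncSQLPipeline': 300,
    }
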
/jd/switch_ip.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2020/3/30 21:50
4 | # @File : switch_ip.py
5 |
6 | import os
7 | import time
8 | from config import AD_PASSWORD, AD_USER
9 |
10 | g_adsl_account = {"name": "adsl", # 这个可以随意写 下面user和pwd 账号密码
11 | "username": AD_USER,
12 | "password": AD_PASSWORD}
13 |
14 |
15 | class ADSL(object):
16 |
17 | def __init__(self):
18 | self.name = g_adsl_account["name"]
19 | self.username = g_adsl_account["username"]
20 | self.password = g_adsl_account["password"]
21 |
22 | # set_adsl : 修改adsl设置
23 |
24 | def set_adsl(self, account):
25 | self.name = account["name"]
26 | self.username = account["username"]
27 | self.password = account["password"]
28 |
29 | # connect : 宽带拨号
30 |
31 | def connect(self):
32 | cmd_str = "rasdial %s %s %s" % (self.name, self.username, self.password)
33 | os.system(cmd_str)
34 | time.sleep(5)
35 |
36 | # disconnect : 断开宽带连接
37 |
38 | def disconnect(self):
39 | cmd_str = "rasdial %s /disconnect" % self.name
40 | os.system(cmd_str)
41 | time.sleep(5)
42 |
43 | # reconnect : 重新进行拨号
44 |
45 | def reconnect(self):
46 | print('自动拨号')
47 | self.disconnect()
48 | self.connect()
49 |
50 |
51 | if __name__ == '__main__':
52 | a = ADSL()
53 | a.reconnect()
54 |
--------------------------------------------------------------------------------
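For context, a sketch of how the ADSL class above is typically driven from crawling code; the URL and the "blocked" check are placeholders, only the rasdial behaviour comes from the class itself:

    import requests
    from switch_ip import ADSL

    adsl = ADSL()
    r = requests.get('https://example.com/page', timeout=10)  # hypothetical request
    if r.status_code in (403, 503):   # treat these codes as "IP blocked" (an assumption)
        adsl.reconnect()              # hang up and redial to obtain a fresh IP
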
/qianfangyiguan/qianfan_models.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 | from sqlalchemy import create_engine
4 | from sqlalchemy.orm import sessionmaker, relationship
5 | from sqlalchemy.ext.declarative import declarative_base
6 | from sqlalchemy import Column, String, DateTime, Integer, Text, ForeignKey, Float
7 | from sqlalchemy import event
8 | from sqlalchemy import DDL
9 |
10 | engine = create_engine('mysql+pymysql://root:@localhost:3306/db_parker?charset=utf8')
11 | DBSession = sessionmaker(bind=engine)
12 | Base = declarative_base()
13 |
14 |
15 | class Apps(Base):
16 | __tablename__ = 'tb_apps3'
17 | id = Column(Integer, primary_key=True)
18 | app_rank = Column(Integer, index=True)
19 | appName = Column(String(150), index=True)
20 | developCompanyFullName = Column(String(180),index=True)
21 | second_cateName = Column(String(150))
22 | first_cateName = Column(String(150))
23 | appId = Column(String(150))
24 | activeNums = Column(Float)
25 | activeAvgDay = Column(Float)
26 | runtimeAvgDay = Column(Float)
27 | runtimeAvgPersonRatio = Column(Float)
28 | activeAvgDayRatio = Column(Float)
29 | runtimeNums = Column(Float)
30 | launchNums = Column(Float)
31 | runtimeNumsRatio = Column(Float)
32 | launchAvgDayRatio = Column(Float)
33 | statDate = Column(DateTime)
34 | developCompanyAbbr = Column(String(180))
35 |
36 |
37 | Base.metadata.create_all(engine)
38 |
--------------------------------------------------------------------------------
/dashiye/main.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2020/4/26 20:20
4 | # @File : main.py
5 |
6 | import requests
7 | import numpy as np
8 |
9 |
10 | code = input('请输入股票代码:')
11 |
12 | cookies = {
13 | 'PHPSESSID': 'jqb0q4h60h4bmtj5bkd9bjuv00',
14 | 'Hm_lvt_210e7fd46c913658d1ca5581797c34e3': '1587903421',
15 | 'Hm_lpvt_210e7fd46c913658d1ca5581797c34e3': '1587903461',
16 | }
17 |
18 | headers = {
19 | 'Origin': 'http://www.dashiyetouzi.com',
20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
21 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
22 | 'Accept': 'application/json, text/javascript, */*; q=0.01',
23 | 'X-Requested-With': 'XMLHttpRequest',
24 | 'Referer': 'http://www.dashiyetouzi.com/tools/compare/historical_valuation.php',
25 | }
26 |
27 | data = {
28 | 'report_type': 'totalValue',
29 | 'report_stock_id': code,
30 | 'from_date': '2015-04-26',
31 | 'to_date': '2020-04-26'
32 | }
33 |
34 | response = requests.post('http://www.dashiyetouzi.com/tools/compare/historical_valuation_data.php', headers=headers, cookies=cookies, data=data, verify=False)
35 | js=response.json()
36 | data=js.get('list')
37 | all_point=[]
38 | for item in data:
39 | all_point.append(item[1])
40 |
41 |
42 | np_data = np.array(all_point)
43 | print(f'中值:{np.median(np_data)}')
44 | print(f'最小值:{np.min(np_data)}')
45 |
--------------------------------------------------------------------------------
/myubbs/sandbox/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from sandbox.models import SpiderModels, DBSession
8 | import logging
9 | import pymongo
10 | from sandbox import config
11 | from sandbox import settings
12 |
13 | class SQLPipeline(object):
14 | def __init__(self):
15 | self.session = DBSession()
16 |
17 | def process_item(self, item, spider):
18 |
19 | obj = SpiderModels(
20 | title=item['title'],
21 | pubdate = item['pubdate'],
22 | content = item['content'],
23 | author = item['author'],
24 | url = item['url'],
25 | crawltime=item['crawltime'],
26 | )
27 | self.session.add(obj)
28 |
29 | try:
30 | self.session.commit()
31 |
32 | except Exception as e:
33 | self.session.rollback()
34 | logging.error('>>>> 插入数据库失败{}'.format(e))
35 | return item
36 |
37 |
38 | class MongoPipeline(object):
39 | def __init__(self):
40 |         DOCUMENT = settings.MONGODB_DOC
41 | self.db = pymongo.MongoClient(config.mongo_ip, port=27018)
42 | self.doc = self.db['spider'][DOCUMENT]
43 |
44 | def process_item(self, item, spider):
45 | insert_item = dict(item)
46 | self.doc.insert(insert_item)
47 |
48 | return item
49 |
--------------------------------------------------------------------------------
/MyLibrary/sandbox/sandbox/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | # from sandbox.models import SpiderModels, DBSession
8 | import logging
9 | import pymongo
10 | from sandbox import config
11 |
12 |
13 | # class SQLPipeline(object):
14 | # def __init__(self):
15 | # self.session = DBSession()
16 | #
17 | # def process_item(self, item, spider):
18 | #
19 | # obj = SpiderModels(
20 | # card=item['card'],
21 | # accountLength=item['accountLength'],
22 | # cardName=item['cardName'],
23 | # cardType=item['cardType'],
24 | # mainAccount=item['mainAccount'],
25 | # mainValue=item['mainValue'],
26 | # orgName=item['orgName'],
27 | # )
28 | # self.session.add(obj)
29 | #
30 | # try:
31 | # self.session.commit()
32 | #
33 | # except Exception as e:
34 | # logging.error('>>>> 插入数据库失败{}'.format(e))
35 | # return item
36 |
37 |
38 | class MongoPipeline(object):
39 | def __init__(self):
40 | DOCUMENT = 'szlib'
41 | self.db = pymongo.MongoClient(config.mongo_ip, port=config.mongo_port)
42 | self.doc = self.db['spider'][DOCUMENT]
43 |
44 | def process_item(self, item, spider):
45 | self.doc.insert(dict(item))
46 | return item
47 |
--------------------------------------------------------------------------------
/lanrentingshu/lrts/lrts/spiders/tingshu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy import Request
4 |
5 | class TingshuSpider(scrapy.Spider):
6 | name = 'tingshu'
7 |
8 | # allowed_domains = ['www.lrts.me']
9 | # start_urls = ['http://www.lrts.me/']
10 |
11 | def start_requests(self):
12 | headers = {'Host': 'www.lrts.me', 'Proxy-Connection': 'keep-alive', 'Accept': '*/*',
13 | 'X-Requested-With': 'XMLHttpRequest',
14 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/65.0.3325.162Safari/537.36',
15 | 'Referer': 'http://www.lrts.me/playlist', 'Accept-Encoding': 'gzip,deflate',
16 | 'Accept-Language': 'zh-CN,zh;q=0.9',
17 | 'Cookie': 'aliyungf_tc=AQAAAF1znybVVQsAByAmG3Fs/DLq2DNK;CNZZDATA1254668430=264272103-1533047311-null%7C1533047311;Hm_lvt_ada61571fd48bb3f905f5fd1d6ef0ec4=1533051241;uid=1533051247919aea3a93a713a48c4a8d2221a0db33cc5;JSESSIONID=472B70BC34B8D0027B3B20AAE935E662;Hm_lpvt_ada61571fd48bb3f905f5fd1d6ef0ec4=1533051318'}
18 |
19 | url = 'http://www.lrts.me/ajax/playlist/2/6458'
20 | yield Request(url=url,headers=headers)
21 |
22 | def parse(self, response):
23 | download_list = response.xpath('//input[@name="source"]/@value').extract()
24 | item={}
25 | item['file_urls']=[]
26 | for each in download_list:
27 | item['file_urls'].append(each)
28 | yield item
29 |
--------------------------------------------------------------------------------
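The spider above yields items with a `file_urls` field, which is the field name Scrapy's built-in FilesPipeline consumes. A minimal sketch of the settings it presumably relies on (the FILES_STORE path is a placeholder):

    # settings.py (sketch)
    ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
    FILES_STORE = './downloads'
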
/lanrentingshu/lanrentingshu.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from urllib.request import urlretrieve
3 |
4 | import os
5 | import requests
6 | import time
7 | from lxml import etree
8 | from header_toolkit import getheader
9 |
10 |
11 | def spider():
12 | curr=os.getcwd()
13 | target_dir=os.path.join(curr,'data')
14 | if not os.path.exists(target_dir):
15 | os.mkdir(target_dir)
16 | for i in range(1, 100, 10):
17 | url = 'http://www.lrts.me/ajax/playlist/2/32551/%d' % i
18 | headers = {
19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
20 | s = requests.get(url=url, headers=headers)
21 | tree = etree.HTML(s.text)
22 | nodes = tree.xpath('//*[starts-with(@class,"clearfix section-item section")]')
23 |         print(len(nodes))
24 | for node in nodes:
25 | filename = node.xpath('.//div[@class="column1 nowrap"]/span/text()')[0]
26 | link = node.xpath('.//input[@name="source" and @type="hidden"]/@value')[0]
27 |
28 |             print(link)
29 | post_fix=link.split('.')[-1]
30 | full_path= filename+'.'+post_fix
31 | filename = os.path.join(target_dir, full_path)
32 | # 修改这一段,多线程下载
33 | if not os.path.isfile(filename):
34 |                 urlretrieve(link, filename)
35 | time.sleep(1)
36 | else:
37 | continue
38 |
39 |
40 | if __name__ == '__main__':
41 | spider()
42 |
--------------------------------------------------------------------------------
/pornhub/newJs.js:
--------------------------------------------------------------------------------
1 | var quality_1080p =/* + radra27radra27 + */rahttpsra83rahttpsra83 + /* + rancomvira35rancomvira35 + */raevphncdra57raevphncdra57 + /* + radra27radra27 + */rancomvira35rancomvira35 + /* + ra006163ra73ra006163ra73 + */radeos202ra16radeos202ra16 + /* + ra09ratera79ra09ratera79 + */ra006163ra73ra006163ra73 + /* + ra1080p4ra73ra1080p4ra73 + */ra24075351ra94ra24075351ra94 + /* + raroiu6qra26raroiu6qra26 + */ra1080p4ra73ra1080p4ra73 + /* + ra000k324ra70ra000k324ra70 + */ra000k324ra70ra000k324ra70 + /* + rancomvira35rancomvira35 + */ra075351mra26ra075351mra26 + /* + ravalidtora49ravalidtora49 + */rap4validra25rap4validra25 + /* + ra209hashra72ra209hashra72 + */rafrom160ra56rafrom160ra56 + /* + ra1080p4ra73ra1080p4ra73 + */ra6708909ra29ra6708909ra29 + /* + ra209hashra72ra209hashra72 + */ravalidtora49ravalidtora49 + /* + ramgdmctbvra11ramgdmctbvra11 + */ra16067161ra17ra16067161ra17 + /* + ra24075351ra94ra24075351ra94 + */ra09ratera79ra09ratera79 + /* + ra50000kbra49ra50000kbra49 + */ra50000kbra49ra50000kbra49 + /* + ramgdmctbvra11ramgdmctbvra11 + */raurst500ra63raurst500ra63 + /* + ra209hashra72ra209hashra72 + */ra00kip4ra41ra00kip4ra41 + /* + raroiu6qra26raroiu6qra26 + */ra72419ra91ra72419ra91 + /* + ra09ratera79ra09ratera79 + */ra209hashra72ra209hashra72 + /* + raro7upu3ra66raro7upu3ra66 + */raroiu6qra26raroiu6qra26 + /* + ra075351mra26ra075351mra26 + */ra2bmkdz7nra36ra2bmkdz7nra36 + /* + ra50000kbra49ra50000kbra49 + */ramgdmctbvra11ramgdmctbvra11 + /* + radeos202ra16radeos202ra16 + */raro7upu3ra66raro7upu3ra66 + /* + ra075351mra26ra075351mra26 + */radra27radra27;
2 |
--------------------------------------------------------------------------------
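The single line above is the obfuscated expression used on the video page: real variable references interleaved with /* ... */ comment noise, whose concatenation appears to build the 1080p video URL. A sketch of the first cleanup step (the variable definitions themselves come from elsewhere on the page, not from this file):

    import re

    with open('newJs.js', encoding='utf-8') as f:
        js = f.read()

    cleaned = re.sub(r'/\*.*?\*/', '', js)  # strip the comment noise, keep the `+` chain of variables
    print(cleaned)
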
/README.MD:
--------------------------------------------------------------------------------
1 | ## Crawler Collection
2 | * 51CTOCrawler: downloads 51CTO course videos and merges the segments with ffmpeg
3 | * 51jbnet: scrapes content from the 51 script site
4 | * 52sh: scrapes girl photos from the Taiwanese 52sh community site
5 | * anjuke: Anjuke real-estate crawler
6 | * async_cuiqingcai: asynchronous crawler for Cui Qingcai's blog
7 | * baiduwanpan: brute-forces Baidu Netdisk share passwords
8 | * bbssmth: SMTH (newsmth) BBS crawler
9 | * bilibili: Bilibili video scraper
10 | * chahaoba: chahaoba.com, enumerates the home location of every mobile number prefix
11 | * chinaclear: scrapes new investor account numbers from China Securities Depository and Clearing (chinaclear)
12 | * cnbeta: cnBeta crawler
13 | * csdn: scrapes CSDN blog rankings
14 | * cuiqingcai: crawler for Cui Qingcai's blog
15 | * dfcf: Eastmoney Guba crawler, fetches the forum posts of every stock; the crawl date can be set via parameters
16 | * enterprise: scrapes business registration data
17 | * Ergeduoduo: Ergeduoduo children's songs [http://30daydo.com/article/236](http://30daydo.com/article/236)
18 | * Forbes: Forbes ranking crawler
19 | * fraud: scrapes the court list of judgment defaulters
20 | * github_star: fetches all repositories of a GitHub user and the total star count
21 | * htqyy: downloads light-music mp3 files from the htqyy site
22 | * jd: JD.com book crawler
23 | * kc0011: investment advisory site crawler
24 | * lanrentingshu: "Psychology Every Day" audio from Lanren Tingshu [http://30daydo.com/article/231](http://30daydo.com/article/231)
25 | * MyLibrary: fetches a personal reading record from the library site
26 | * pornhub: Pornhub video downloader
27 | * poi_gaode: iterates over AMap (Gaode) data within a given longitude/latitude range
28 | * qianfangyiguan: Analysys Qianfan data scraper
29 | * szhouse: scrapes Shenzhen housing transaction data from the official site
30 | * tiexue: content scraper for the military site Tiexue
31 | * stockholder: shareholder data scraper
32 | * tencentjob: Tencent job posting crawler
33 | * ximalaya: Ximalaya audio downloader [http://30daydo.com/article/503](http://30daydo.com/article/503)
34 | * yinyonbao: Yingyongbao (Tencent MyApp) app ranking scraper
35 | * youdao_dictionary: cracks the Youdao Dictionary JS encryption [http://30daydo.com/article/416](http://30daydo.com/article/416)
36 | * zhihu: distributed Zhihu crawler
37 | ### Q&A
38 | All of the code in this repository has been run and tested by the author; if you hit problems using it, feel free to ask by email.
39 | **The projects above are only the publicly displayed part of my code base; there is also a large amount of non-public crawler code (e.g. the national business registration system, Taobao, etc.) which can be provided on request.**
40 | **I also take on crawler projects of all kinds.**
41 |
42 | Crawler developers are also welcome to join the QQ group to share projects.
43 |
44 |
45 |
46 | ###### People who write crawlers often run into sites that someone else has already crawled and happen to need that data; sharing work with each other saves effort and improves efficiency.
47 |
48 | ###### QQ group: 759746506
49 |
50 |
51 |
52 | WeChat official account:
53 |
54 | 
--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/monitor/statscol.py:
--------------------------------------------------------------------------------
1 | # *-* coding:utf-8 *-*
2 | '''
3 | @author: ioiogoo
4 | @date: 2016/12/25 16:50
5 | '''
6 |
7 | import redis
8 | from .settings import STATS_KEYS
9 | import time
10 | import requests
11 | import json
12 | r = redis.Redis(host='10.18.6.46', port=6379, db=0,decode_responses=True)
13 | Time = lambda: time.strftime('%Y-%m-%d %H:%M:%S')
14 |
15 |
16 | class StatcollectorMiddleware(object):
17 | def __init__(self):
18 | self.r = redis.Redis(host='10.18.6.46', port=6379, db=0,decode_responses=True)
19 | self.stats_keys = STATS_KEYS
20 |
21 | def process_request(self, request, spider):
22 | self.formatStats(spider.crawler.stats.get_stats())
23 |
24 | def formatStats(self, stats):
25 | for key in self.stats_keys:
26 | key_value = stats.get(key, None)
27 | if not key_value: continue
28 | value = {"value": [Time(), key_value]}
29 | content = json.dumps(value)
30 | print(f'key content {key}')
31 | print(f'value -->{content}')
32 | self.insert2redis(key, content)
33 |
34 | def insert2redis(self, key, value):
35 | self.r.rpush(key, value)
36 |
37 |
38 | class SpiderRunStatspipeline(object):
39 | def open_spider(self, spider):
40 | print('open SpiderRunStatspipeline')
41 | r.set('spider_is_run', 1)
42 | requests.get('http://127.0.0.1:5000/signal?sign=running')
43 |
44 | def close_spider(self, spider):
45 | r.set('spider_is_run', 0)
46 | requests.get('http://127.0.0.1:5000/signal?sign=closed')
--------------------------------------------------------------------------------
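To actually feed data into the Flask monitor, the two classes above have to be hooked into the crawling project's settings. A minimal sketch, assuming the monitor package lives at `async_sandbox.monitor` as the path suggests (the priority numbers are arbitrary):

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        'async_sandbox.monitor.statscol.StatcollectorMiddleware': 543,
    }
    ITEM_PIPELINES = {
        'async_sandbox.monitor.statscol.SpiderRunStatspipeline': 300,
    }
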
/weibo/weibo/spiders/wb.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy import Spider, FormRequest, Request
3 |
4 |
5 | class WbSpider(Spider):
6 | name = 'wb'
7 |
8 | headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
9 | 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Cache-Control': 'no-cache',
10 | 'Connection': 'keep-alive',
11 | # 'Cookie': 'ALF=1539744188;SCF=Arejsw06Aa86L7rLsj3RRh8YiCul1z1Yapy6v1kQNGNbjcNLV3LPZbziAEtRKYVOAL_s5JKT2rck3tB7VAtepd4.;SUB=_2A252m2dXDeRhGedH7lcT8y7Fwj-IHXVSZAkfrDV6PUJbktAKLRejkW1NUKTAOGny8CQfH8IlGwCeP72gG_Pf_dFi;SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWIFwD6xpqyuh9_mA2jr6on5JpX5K-hUgL.Fo24SK-Ee0541Ke2dJLoI7LCdcSuwHvAMN-t;SUHB=0Ryruv0xgZvGM5;SSOLoginState=1537152775;_T_WM=ae5298708cece22521d281346fac7744',
12 | 'Host': 'weibo.cn', 'Pragma': 'no-cache',
13 | 'Referer': 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword=000001&page=2',
14 | 'Upgrade-Insecure-Requests': '1',
15 | 'User-Agent': 'Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36'}
16 |
17 | def start_requests(self):
18 | keyword = '000001'
19 | for page in range(1, 2):
20 |             url = 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword={}&page={}'.format(keyword, page)
21 | yield Request(url=url, headers=self.headers)
22 |
23 | def parse(self, response):
24 | # print(response.text)
25 |         posts = response.xpath('//div[@class="c" and contains(@id,"M_")]')  # weibo post nodes; field extraction is left unfinished here
--------------------------------------------------------------------------------
/sz_yaohao/sandbox/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from sandbox.models import SpiderModels, DBSession
8 | import logging
9 | import pymongo
10 | from sandbox import config
11 | from sandbox import settings
12 |
13 | class SQLPipeline(object):
14 | def __init__(self):
15 | self.session = DBSession()
16 |
17 | def process_item(self, item, spider):
18 |
19 | obj = SpiderModels(
20 | card=item['card'],
21 | accountLength=item['accountLength'],
22 | cardName=item['cardName'],
23 | cardType=item['cardType'],
24 | mainAccount=item['mainAccount'],
25 | mainValue=item['mainValue'],
26 | orgName=item['orgName'],
27 | origin=item['origin'],
28 | crawltime=item['crawltime'],
29 | )
30 | self.session.add(obj)
31 |
32 | try:
33 | self.session.commit()
34 |
35 | except Exception as e:
36 | logging.error('>>>> 插入数据库失败{}'.format(e))
37 | return item
38 |
39 |
40 | class MongoPipeline(object):
41 | def __init__(self):
42 | DOCUMENT = settings.MONGODB_DOC
43 | self.db = pymongo.MongoClient(settings.MONGO_HOST, port=settings.MONGO_PORT)
44 | self.doc = self.db['spider'][DOCUMENT]
45 |
46 | def process_item(self, item, spider):
47 | insert_item = dict(item)
48 | self.doc.insert(insert_item)
49 |
50 | return item
51 |
--------------------------------------------------------------------------------
/poi_gaode/gaode_map.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2018/12/6 10:39
4 | # @File : gaode_map.py
5 | import requests
6 | from math import radians, cos, sin, asin, sqrt
7 | import config
8 | import json
9 |
10 | def demo():
11 | key=config.key
12 | url =f'https://restapi.amap.com/v3/place/polygon?polygon=116.460988,40.006919|116.48231,40.007381|116.47516,39.99713|116.472596,39.985227|116.45669,39.984989|116.460988,40.006919&keywords=kfc&output=json&key={key}'
13 | r = requests.get(url)
14 | print(r.json())
15 |
16 | def haversine(lon1, lat1, lon2, lat2): # 经度1,纬度1,经度2,纬度2 (十进制度数)
17 | """
18 | Calculate the great circle distance between two points
19 | on the earth (specified in decimal degrees)
20 | """
21 | # 将十进制度数转化为弧度
22 | lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
23 |
24 | # haversine公式
25 | dlon = lon2 - lon1
26 | dlat = lat2 - lat1
27 | a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
28 | c = 2 * asin(sqrt(a))
29 | r = 6371 # 地球平均半径,单位为公里
30 |
31 | return c * r * 1000
32 |
33 |
34 | def long_lati_change():
35 | lbs = [(22.7100061372,113.7915802002),
36 | (22.7866273171,114.3717956543),
37 | (22.5404642212,113.9189529419),
38 | (22.5487084710,114.2375564575),
39 | (22.6586902908,114.2598724365),
40 | ]
41 | for i in lbs:
42 | print(f'{i[1]},{i[0]}|',end='')
43 | # demo()
44 | # 114.04308499999999,22.527853|114.04808499999999,22.522853
45 | lati1,long1=22.527853,114.04308499999999
46 | lati2,long2=22.522853,114.04808499999999
47 | print(haversine(long1,lati1,long2,lati2))
48 | # long_lati_change()
49 |
50 |
51 |
52 |
--------------------------------------------------------------------------------
/tencentjob/tencentjob/spiders/tencent.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 |
4 | import scrapy
5 | from scrapy.linkextractors import LinkExtractor
6 | from scrapy.spiders import CrawlSpider, Rule
7 | from tencentjob.items import TencentjobItem
8 |
9 |
10 | class TencentSpider(CrawlSpider):
11 | name = 'tencent'
12 | allowed_domains = ['tencent.com']
13 | start_urls = ['https://hr.tencent.com/position.php']
14 | rules = [
15 | # 多个条件
16 | Rule(LinkExtractor(allow=("start=\d+"))),
17 | Rule(LinkExtractor(allow=("position_detail\.php")), follow=True, callback='parse_item')
18 | ]
19 |
20 | def parse_item(self, response):
21 | item = TencentjobItem()
22 |
23 | title = response.xpath('//*[(@id = "sharetitle")]/text()').extract_first()
24 | workLocation = response.xpath('//*[@class="lightblue l2"]/../text()').extract_first()
25 | catalog = response.xpath('//*[@class="lightblue"]/../text()').extract_first()
26 | recruitNumber = response.xpath('//*[@class="lightblue"]/../text()').re('(\d+)')[0]
27 | duty_pre = response.xpath('//*[@class="squareli"]').extract_first()
28 | duty = re.sub('<.*?>', '', duty_pre)
29 |
30 | Job_requirement_pre = response.xpath('//*[@class="squareli"]').extract_first()
31 | Job_requirement = re.sub('<.*?>', '', Job_requirement_pre)
32 |
33 | item['title'] = title
34 | item['url'] = response.url
35 | item['workLocation'] = workLocation
36 | item['catalog'] = catalog
37 | item['recruitNumber'] = recruitNumber
38 | item['duty'] = duty
39 | item['Job_requirement'] = Job_requirement
40 |
41 | yield item
42 |
--------------------------------------------------------------------------------
/szhouse/database.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | __author__ = 'Rocky'
3 | import sqlite3
4 |
5 | def create_table():
6 | conn = sqlite3.connect('shenzhen_house.db')
7 | try:
8 | create_tb_cmd='''
9 | CREATE TABLE IF NOT EXISTS HOUSE
10 | ('日期' TEXT,
11 | '一手房套数' TEXT,
12 | '一手房面积' TEXT,
13 | '二手房套数' TEXT,
14 | '二手房面积' TEXT);
15 | '''
16 | #主要就是上面的语句
17 | conn.execute(create_tb_cmd)
18 | except:
19 | print("Create table failed")
20 | return False
21 |
22 |
23 |     # the CREATE TABLE statement was already executed inside the try block above
24 | conn.commit()
25 | conn.close()
26 |
27 | def insert(date,one_hand,one_area,second_hand,second_area):
28 | conn = sqlite3.connect('shenzhen_house.db')
29 | print("open database passed")
30 |
31 | cmd="INSERT INTO HOUSE ('日期','一手房套数','一手房面积','二手房套数','二手房面积') VALUES('%s','%s','%s','%s','%s');" %(date,one_hand,one_area,second_hand,second_area)
32 | #works 要么加\"
33 | #paul_su="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(5,'%s',32,'CALIFORNIA',2000.00);" %temp2
34 | #works 要么加 ’‘
35 |
36 | #allen="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(2,'ALLEN',72,'CALIFORNIA',20500.00);"
37 | #teddy="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(3,'TEDDY',732,'CALIFORNIA',52000.00);"
38 | #mark="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(4,'MARK',327,'CALIFORNIA',3000.00);"
39 | #sun="INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(?,?,?,?,?);"
40 | #conn.execute("INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) VALUES(?,?,32,'CALIFORNIA',2000.00)",temp)
41 |
42 | conn.execute(cmd)
43 |
44 | conn.commit()
45 | conn.close()
--------------------------------------------------------------------------------
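A safer variant of the insert above, using sqlite3 parameter binding instead of string formatting (the commented-out `sun=` line already hints at this form); a sketch with the same table and columns as create_table:

    import sqlite3

    def insert(date, one_hand, one_area, second_hand, second_area):
        conn = sqlite3.connect('shenzhen_house.db')
        cmd = "INSERT INTO HOUSE ('日期','一手房套数','一手房面积','二手房套数','二手房面积') VALUES (?,?,?,?,?);"
        conn.execute(cmd, (date, one_hand, one_area, second_hand, second_area))
        conn.commit()
        conn.close()
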
/fraud/fraud/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from fraud.model.fraud import Fraud
3 | from fraud.model.db_config import DBSession, RedisPool
4 | from scrapy.exceptions import DropItem
5 | import datetime
6 | import json
7 | class FraudPipeline(object):
8 |
9 | def open_spider(self, spider):
10 | self.session = DBSession()
11 |
12 | def process_item(self, item, spider):
13 | # item = json.dumps(dict(item)).decode('unicode-escape')
14 | f = Fraud(executed_name=item['executed_name'],
15 | gender=item['gender'],
16 | age=item['age'],
17 | identity_number=item['identity_number'],
18 | court=item['court'],
19 | province=item['province'],
20 | case_number=item['case_number'],
21 | performance=item['performance'],
22 | disrupt_type_name=item['disrupt_type_name'],
23 | duty=item['duty'],
24 | release_time=item['release_time'],
25 | crawl_time=datetime.datetime.now())
26 | self.session.add(f)
27 | try:
28 | self.session.commit()
29 | except Exception as e:
30 | print(e)
31 | self.session.rollback()
32 |
33 | return item
34 |
35 | def close_spider(self, spider):
36 | self.session.close()
37 |
38 | class DuplicatesPipeline(object):
39 | def process_item(self, item, spider):
40 | pool = RedisPool()
41 | r = pool.redis_pool()
42 | if r.exists('id_num: %s' % item['case_number']):
43 | raise DropItem("Duplicate item found: %s" % item['case_number'])
44 | else:
45 | r.set('id_num: %s' % item['case_number'], 1)
46 | return item
47 |
--------------------------------------------------------------------------------
/szhouse/house.py:
--------------------------------------------------------------------------------
1 | #-*-coding=utf-8-*-
2 | __author__ = 'rocky'
3 | # 网页源码修改 废弃使用
4 | #获取每天深圳一手房,二手房的成交套数与面积,并且写入数据库
5 | #主要就是正则表达抓取几个数字
6 | import re
7 | import database
8 | import requests
9 |
10 | def getContent():
11 | url="http://ris.szpl.gov.cn/"
12 | one_hand="credit/showcjgs/ysfcjgs.aspx"
13 | second_hand="credit/showcjgs/esfcjgs.aspx"
14 | # req=urllib2.Request(url+one_hand)
15 | # content=urllib2.urlopen(req).read()
16 | #返回的就是网页的源码,没有做任何防爬虫的处理,zf网站,呵呵
17 | #print content
18 | headers={'User-Agent':'Mozilla/5.0 (WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
19 | content = requests.get(url=url+one_hand,headers=headers).text
20 |
21 | date=re.compile(r'(.*)')
22 | reg=re.compile(r'| (\d+)')
23 | result=reg.findall(content)
24 | current_date=date.findall(content)
25 |
26 | reg2=re.compile(r' | (.*?)')
27 | yishou_area=reg2.findall(content)
28 |
29 |
30 | print(current_date[0])
31 | print('一手商品房成交套数:%s' % result[0])
32 | print('一手商品房成交面积: %s' % yishou_area[0])
33 |
34 |
35 | # sec_req=urllib2.Request(url+second_hand)
36 | # sec_content=urllib2.urlopen(sec_req).read()
37 |
38 | sec_content = requests.get(url+second_hand).text
39 |
40 | sec_quantity=re.compile(r' | (\d+) | ')
41 | sec_result=sec_quantity.findall(sec_content)
42 | second_area=re.findall(r'(.*?) | ',sec_content)
43 |
44 | print('二手商品房成交套数:%s' % sec_result[1])
45 | print('二手商品房成交面积: %s' % second_area[2])
46 | database.create_table()
47 | database.insert(current_date[0],result[0],yishou_area[0],sec_result[1],second_area[2])
48 |
49 | getContent()
--------------------------------------------------------------------------------
/ximalaya/story.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2019/10/18 18:04
4 | # @File : story.py
5 |
6 | # 睡前故事
7 | import os
8 |
9 | import requests,datetime,re
10 |
11 | url='http://mobwsa.ximalaya.com/mobile-album/album/page/ts-1571392955128?ac=WIFI&albumId=260744&device=android&isAsc=false&isQueryInvitationBrand=true&isVideoAsc=true&pageId={}&pageSize=100&pre_page=0&source=5&supportWebp=true'
12 | headers = {'User-Agent': 'Xiaomi'}
13 |
14 | def download():
15 |
16 | for i in range(1, 2): # 只下载一页
17 |
18 | r = requests.get(url=url.format(i), headers=headers)
19 | js_data = r.json()
20 | data_list = js_data.get('data', {}).get('tracks',{}).get('list',[])
21 |
22 | for item in data_list:
23 | trackName = item.get('title')
24 | trackName = re.sub('[\/\\\:\*\?\"\<\>\|]', '_', trackName)
25 | # trackName=re.sub(':','',trackName)
26 | src_url = item.get('playUrl64')
27 | orderNo = item.get('orderNo')
28 |
29 | filename = '{}-{}.mp3'.format(orderNo,trackName)
30 | if not os.path.exists(filename):
31 |
32 | try:
33 | r0 = requests.get(src_url, headers=headers,timeout=3600)
34 | except Exception as e:
35 | print(e)
36 | print(trackName)
37 | r0 = requests.get(src_url, headers=headers,timeout=3600)
38 |
39 |
40 |
41 | with open(filename, 'wb') as f:
42 | f.write(r0.content)
43 | print('{}下载完成'.format(filename))
44 |
45 | else:
46 | print(f'{filename}已经下载过了')
47 |
48 | if __name__=='__main__':
49 | print(f'start at {datetime.datetime.now()}')
50 | download()
51 | print(f'end at {datetime.datetime.now()}')
52 |
--------------------------------------------------------------------------------
/pornhub/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | __pycache__
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask instance folder
57 | instance/
58 |
59 | # Scrapy stuff:
60 | .scrapy
61 |
62 | # Sphinx documentation
63 | docs/_build/
64 |
65 | # PyBuilder
66 | target/
67 |
68 | # IPython Notebook
69 | .ipynb_checkpoints
70 | *.ipynb
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
91 |
92 | .idea/*
93 | .DS_Store
94 | .vscode
95 | settings.yaml
96 | tmp/*
97 | test/
98 | *.sqlite
99 | result/*
100 | logs/*
101 | tasks/result/*
102 | *.swp
103 | web/upload/*
104 | *.png
105 | *.yaml
106 |
107 | download*
108 | *.zip
109 | mp4/
110 | webm/
111 | nohup.out
112 |
--------------------------------------------------------------------------------
/stockholder/main.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 | import requests
3 | from lxml import etree
4 | import pymongo
5 | import tushare as ts
6 | client = pymongo.MongoClient('10.18.6.102')
7 | doc = client['secutiry']['shareholder']
8 |
9 | __author__ = 'Rocky'
10 |
11 | '''
12 | http://30daydo.com
13 | Email: weigesysu@qq.com
14 | '''
15 | def getContent(code):
16 | url = 'http://quotes.money.163.com/f10/gdfx_{}.html'.format(code)
17 |
18 | headers = {'User-Agent':'Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/65.0.3325.162Safari/537.36'}
19 | for i in range(5):
20 | try:
21 | r = requests.get(url, headers=headers)
22 | if r.status_code==200:
23 | return r.text
24 |         except Exception as e:
25 |             print(e)
26 | continue
27 |
28 | return None
29 |
30 | def parser(code):
31 | text = getContent(code,)
32 | document={}
33 | if text is not None:
34 | tree = etree.HTML(text)
35 | name = tree.xpath('//div[@id="dateTable"]/table/tr/td[1]/text()')
36 | percent = tree.xpath('//div[@id="dateTable"]/table/tr/td[2]/text()')
37 | number = tree.xpath('//div[@id="dateTable"]/table/tr/td[3]/text()')
38 | # print name
39 | # print percent
40 | # print number
41 | d = {}
42 | for index,value in enumerate(name):
43 | # print index
44 | k = name[index]
45 | p=percent[index]
46 | n=number[index]
47 | if '.' in k:
48 | k=k.replace('.','_')
49 | d[k]=(p,n)
50 | document[code]=d
51 | doc.insert(document)
52 |
53 | def all_stocks():
54 | df = ts.get_stock_basics()
55 | for i in df.index:
56 | parser(i)
57 |
58 | def main():
59 | # parser('000011')
60 | all_stocks()
61 |
62 | if __name__ == '__main__':
63 | main()
--------------------------------------------------------------------------------
/kc0011/async_mongo.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2019/11/26 8:55
4 | # @File : async_mongo.py
5 | import asyncio
6 | from urllib.parse import urlparse
7 | import pymongo
8 | import threading
9 | from motor.motor_asyncio import AsyncIOMotorClient
10 | import motor
11 | from pymongo.errors import DuplicateKeyError
12 |
13 | #异步更新mongo数据库
14 |
15 | db_host = '192.168.10.48'
16 | db_port = 17001
17 | uri = 'mongodb://{0}:{1}'.format(
18 | db_host, db_port) # db_name 认证数据库
19 | db = AsyncIOMotorClient(uri)['spider']  # connect to the working database; the asyncio client matches the asyncio loop used in run() below
20 |
21 | # client = AsyncIOMotorClient(MONGO_HOST, port=MONGO_PORT)
22 | # db = client['hedgehog_spider']
23 | # db.authenticate(name='Zane', password='*#06#', source='admin')
24 |
25 | doc = db['KC0011_content']
26 | block = 500
27 | total = 124684
28 |
29 | iter_number = total // block
30 |
31 | remain_part = total % block
32 | import re
33 |
34 | re_pattern = re.compile('&page=\d+')
35 |
36 |
37 | async def run():
38 | for i in range(iter_number + 1):
39 |
40 | small_part = doc.find({}, {'_id': 1, 'url': 1}).limit(block).skip(i * block)
41 |
42 | async for item in small_part:
43 | url = item.get('url')
44 | idx = item.get('_id')
45 | if re.search(re_pattern,url):
46 | # print(url)
47 |
48 | url_ = re.sub(re_pattern, '', url)
49 |
50 | try:
51 | await doc.update_one(
52 | {'_id': idx},
53 | {'$set': {'url': url_}}
54 | )
55 |
56 | except DuplicateKeyError as e:
57 | print(e)
58 | print('删除此doc {}'.format(url))
59 | await doc.delete_one({'_id':idx})
60 |
61 | except Exception as e:
62 | print(e)
63 |
64 |
65 | asyncio.get_event_loop().run_until_complete(run())
66 |
--------------------------------------------------------------------------------
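The DuplicateKeyError branch above only triggers if the `KC0011_content` collection has a unique index on `url`; that index is assumed to already exist. For reference, a minimal sketch of creating it with motor:

    # run once inside an async context, e.g. at the top of run():
    await doc.create_index('url', unique=True)
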
/Ergeduoduo/main.py:
--------------------------------------------------------------------------------
1 | #-*-coding=utf-8-*-
2 | import sys,os
3 | import requests
4 | from lxml import etree
5 | import subprocess
6 | session = requests.Session()
7 | def getContent(url):
8 | # url='http://www.iqiyi.com/v_19rrkwcx6w.html'
9 | try:
10 | ret = requests.get(url)
11 | ret.encoding='utf-8'
12 | # except Exception,e:
13 | except:
14 | # print e
15 | return None
16 | if ret.status_code==200:
17 | return ret.text
18 | else:
19 | return None
20 |
21 | def getUrl():
22 | url='http://www.iqiyi.com/v_19rrkwcx6w.html'
23 | url2='http://www.iqiyi.com/v_19rrl2td7g.html' # 31-61
24 | content = getContent(url)
25 | if not content:
26 |         print("network issue, retry")
27 | exit(0)
28 | root = etree.HTML(content,parser=etree.HTMLParser(encoding='utf-8'))
29 | elements=root.xpath('//div[@data-current-count="1"]//li')
30 | for items in elements:
31 | url_item=items.xpath('.//a/@href')[0]
32 | song_url = url_item.replace('//','')
33 | song_url=song_url.strip()
34 | print(song_url)
35 | # name=items.xpath('.//span[@class="item-num"]/text()')[0]
36 | name=items.xpath('.//span[@class="item-num"]/text()')[0].encode('utf-8').strip()+\
37 | ' '+items.xpath('.//span[@class="item-txt"]/text()')[0].encode('utf-8').strip()+'.mp4'
38 | name= '儿歌多多 '+name
39 | name=name.decode('utf-8')
40 | filename=os.path.join(os.getcwd(),name)
41 |         print(filename)
42 | if os.path.exists(filename):
43 | continue
44 | p=subprocess.Popen('python you-get -d --format=HD {}'.format(song_url),stderr=subprocess.PIPE,stdout=subprocess.PIPE,shell=True)
45 | output,error = p.communicate()
46 | print(output)
47 | print(error)
48 | p.wait()
49 |
50 |
51 | def main():
52 | getUrl()
53 |
54 | if __name__ == '__main__':
55 | main()
--------------------------------------------------------------------------------
/poi_gaode/sandbox/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import datetime
8 |
9 | from sandbox.models import SpiderModels, DBSession
10 | import logging
11 | import pymongo
12 | from sandbox import config
13 | from sandbox import settings
14 | from pymongo.errors import DuplicateKeyError
15 | from scrapy.exceptions import DropItem
16 | # class SQLPipeline(object):
17 | # def __init__(self):
18 | # self.session = DBSession()
19 | #
20 | # def process_item(self, item, spider):
21 | #
22 | # obj = SpiderModels(
23 | # card=item['card'],
24 | # accountLength=item['accountLength'],
25 | # cardName=item['cardName'],
26 | # cardType=item['cardType'],
27 | # mainAccount=item['mainAccount'],
28 | # mainValue=item['mainValue'],
29 | # orgName=item['orgName'],
30 | # origin=item['origin'],
31 | # crawltime=item['crawltime'],
32 | # )
33 | # self.session.add(obj)
34 | #
35 | # try:
36 | # self.session.commit()
37 | #
38 | # except Exception as e:
39 | # logging.error('>>>> 插入数据库失败{}'.format(e))
40 | # return item
41 |
42 |
43 | class MongoPipeline(object):
44 | def __init__(self):
45 | DOCUMENT = settings.MONGODB_DOC
46 | self.db = pymongo.MongoClient(config.mongo_ip, port=27018)
47 | self.doc = self.db['spider'][DOCUMENT]
48 |
49 | def process_item(self, item, spider):
50 | insert_item = dict(item)
51 | insert_item['crawltime']=datetime.datetime.now()
52 | try:
53 |             self.doc.insert_one(insert_item)
54 | except DuplicateKeyError:
55 | raise DropItem('drop item {}'.format(insert_item['id']))
56 |
57 | return item
58 |
--------------------------------------------------------------------------------
/pornhub/cookies_access.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | headers = {
4 | 'authority': 'cn.pornhub.com',
5 | 'pragma': 'no-cache',
6 | 'cache-control': 'no-cache',
7 | 'upgrade-insecure-requests': '1',
8 | 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
9 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
10 | 'sec-fetch-site': 'none',
11 | 'sec-fetch-mode': 'navigate',
12 | 'sec-fetch-dest': 'document',
13 | 'accept-language': 'zh,en;q=0.9,en-US;q=0.8,zh-CN;q=0.7',
14 | 'cookie': 'FastPopSessionRequestNumber=11; bs=0hwo170h8b27c5b55tt3ux7b8xkukol0; ss=630427593672619545; bitmovin_analytics_uuid=48eeeda8-bcfe-47f6-84fb-dd172921281a; platform_cookie_reset=pc; fg_9d12f2b2865de2f8c67706feaa332230=56077.100000; fg_7133c455c2e877ecb0adfd7a6ec6d6fe=32682.100000; ats_jp_vkey=ph5f29d906ac970; il=v1yKrZvlyVIqstonKh7Cf8kS4JOEHaOX5I0jleVOp8p6sxNjE0NjQ3MTgwaExRdXp5LXY2QVV4dnhhZmV1NncydDhpam15N1NMamk2dFc5bENEXw..; expiredEnterModalShown=1; platform=pc; fg_a197b3a83beb75c5f0255dc465e9f2de=3629.100000; ua=dcc77110dea38e3cff8b12436648706c; fanClubInfoPop=1; FastPopSessionRequestNumber=9',
15 | }
16 |
17 | params = (
18 | ('s', 'eyJrIjoiMDgxOTU1NjU4MGNjZjQyOTQ1ODVkZTdhNjM5NjkyMjQzNWE1NzdjYSIsInQiOjE2MDkyMTYwNDJ9'),
19 | ('v', 'ph5fe22b22c2a32'),
20 | ('e', '0'),
21 | )
22 |
23 | response = requests.get('https://cn.pornhub.com/video/get_media', headers=headers, params=params)
24 |
25 | #NB. Original query string below. It seems impossible to parse and
26 | #reproduce query strings 100% accurately so the one below is given
27 | #in case the reproduced version is not "correct".
28 | # response = requests.get('https://cn.pornhub.com/video/get_media?s=eyJrIjoiM2JkNzk3OTc3MDYxNjdhN2NiZjg3ZjAxN2YxMDI3YTY3MjNkOWNmMyIsInQiOjE2MDkyMTE5MzJ9&v=ph5c7a39b625845&e=0', headers=headers)
29 | print(response.json())
--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/RedisDuplicator.py:
--------------------------------------------------------------------------------
1 | import redis
2 | from scrapy.dupefilters import BaseDupeFilter
3 | # Custom dupefilter backed by a Redis set
4 | class DupeFilter(BaseDupeFilter):
5 |
6 | def __init__(self,host,port,db,key,reset):
7 | print('='*20)
8 | print('using my dupefilter ')
9 | print('='*20)
10 | self.r = redis.StrictRedis(host=host,port=port,db=db)
11 | self.key = key
12 | self.reset = reset
13 |
14 |
15 | @classmethod
16 | def from_settings(cls, settings):
17 | # result=(dict(settings))
18 |
19 | # name=settings.get('BOT_NAME')
20 | # print(f'name is {name}')
21 | host=settings.get('REDIS_HOST','127.0.0.1')
22 | port=settings.get('REDIS_PORT',6379)
23 |
24 | print(f'host:{host},port {port}')
25 | db=settings.get('REDIS_DB',0)
26 | redis_key=settings.get('REDIS_KEY')
27 |
28 |
29 |         print(f'redis key {redis_key}')
30 | user=settings.get('USER_AGENT')
31 | print(user)
32 | if redis_key is None:
33 | raise ValueError('No value assign to redis_key')
34 |
35 | reset=settings.getbool('REDIS_REST',False)
36 |
37 |
38 |
39 | return cls(host,port,db,redis_key,reset)
40 |
41 | def request_seen(self, request):
42 |
43 | if self.r.sismember(self.key,request.url):
44 |             print(f'url ---{request.url}--- has been seen (duplicate URL)')
45 |
46 | return True
47 |
48 | else:
49 | # print('add an url in redis')
50 | self.r.sadd(self.key,request.url)
51 |
52 | return False
53 |
54 | def open(self): # can return deferred
55 | pass
56 |
57 | def close(self, reason): # can return a deferred
58 | print('dup closed')
59 |
60 | if self.reset:
61 | print(f'delete redis key {self.key}')
62 | self.r.delete(self.key)
63 |
64 | def log(self, request, spider): # log that a request has been filtered
65 | pass
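66 |
67 | # Usage sketch: a minimal settings.py fragment, assuming the package is importable
68 | # as "async_sandbox" (adjust the dotted path to wherever this module actually lives).
69 | # DUPEFILTER_CLASS = 'async_sandbox.RedisDuplicator.DupeFilter'
70 | # REDIS_HOST = '127.0.0.1'   # defaults used by from_settings() above
71 | # REDIS_PORT = 6379
72 | # REDIS_DB = 0
73 | # REDIS_KEY = 'seen_urls'    # required; from_settings() raises ValueError without it
74 | # REDIS_REST = True          # delete the key on close (spelling follows the code above)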
--------------------------------------------------------------------------------
/sz_yaohao/sandbox/spiders/website.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import re
4 |
5 | import requests
6 | import scrapy
7 | from scrapy import Request, FormRequest
8 | import logging
9 | import redis
10 | from sandbox.items import SpiderItem
11 | from sandbox.utility import get_header
12 | from sandbox.config import code_url
13 |
14 | # post
15 | class WebPostSpider(scrapy.Spider):
16 | name = 'website'
17 | headers = {
18 |
19 | }
20 | post_url = 'https://apply.jtys.sz.gov.cn/apply/app/increment/person/login'
21 | img_url = 'http://apply.jtys.sz.gov.cn/apply/app/validCodeImage'
22 |
23 | def __init__(self, *args, **kwargs):
24 | super(WebPostSpider, self).__init__(*args, **kwargs)
25 | self.headers = get_header()
26 |
27 | self.data = {
28 | 'loginType': 'MOBILE',
29 | 'loginCode': '',
30 | 'password': '',
31 | 'validCode': '',
32 | }
33 |
34 | def start_requests(self):
35 |
36 | yield Request(
37 | url=self.img_url,
38 | headers=self.headers
39 | )
40 | def parse(self,response):
41 | # TO DO
42 | img = response.body
43 |
44 | # with open('test.jpg','wb') as f:
45 | # f.write(img)
46 | r=requests.post(code_url,data=img)
47 | js_data = r.json()
48 | if js_data.get('success'):
49 | code = js_data.get('message')
50 | post_data=self.data.copy()
51 | post_data['validCode']=code
52 | # input('input code')
53 | yield FormRequest(url=self.post_url,
54 | headers=self.headers,
55 | formdata=post_data,
56 | callback=self.check_login,
57 | )
58 |
59 | def check_login(self,response):
60 | content=response.text
61 |         if '忘记密码' in content:  # still on the login page (the "forgot password" link is present)
62 |             print('wrong password')
63 |         else:
64 |             print('password found')
65 |
66 |
--------------------------------------------------------------------------------
/fangtianxia/fangtianxia_proxy_test.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import hashlib
3 | import time
4 | import requests
5 |
6 | # Purchase my_app_key, app_secret, plus the Mayi proxy server address mayi_url and port mayi_port from the group owner
7 | my_app_key = ""
8 | app_secret = ""
9 | mayi_url = 's3.proxy.mayidaili.com'
10 | mayi_port = '8123'
11 |
12 | # Mayi proxy server address
13 | mayi_proxy = {'http': 'http://{}:{}'.format(mayi_url, mayi_port)}
14 |
15 | # URL to crawl
16 | #url = 'http://1212.ip138.com/ic.asp'
17 | testUrl='http://members.3322.org/dyndns/getip'
18 | # Compute the signature
19 | timesp = '{}'.format(time.strftime("%Y-%m-%d %H:%M:%S"))
20 | codes = app_secret + 'app_key' + my_app_key + 'timestamp' + timesp + app_secret
21 | sign = hashlib.md5(codes.encode('utf-8')).hexdigest().upper()
22 |
23 | # Build the authorization header that grants access to the Mayi proxy server (plain '+' concatenation is fine for a few fragments)
24 | authHeader = 'MYH-AUTH-MD5 sign=' + sign + '&app_key=' + my_app_key + '&timestamp=' + timesp
25 |
26 | # Use the requests module: create a Session(), then update its headers and proxies
27 |
28 | user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"
29 | # cookie_read=open('cookie').read().strip()
30 | headers = {"User-agent": user_agent, 'upgrade-insecure-requests': '1',
31 | 'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
32 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
33 | 'accept-encoding': 'gzip, deflate', 'Cache-Control': 'no-cache'}
34 | '''
35 | s = requests.Session()
36 | s.headers.update({'Proxy-Authorization': authHeader})
37 | s.proxies.update(mayi_proxy)
38 | s.headers.update(headers)
39 | s.headers.update({'Proxy-Authorization': authHeader})
40 | pg = s.get(testUrl) # tuple: 300 代表 connect timeout, 270 代表 read timeout
41 | print(pg.text)
42 | print(pg.status_code)
43 | '''
44 | headers['Proxy-Authorization']=authHeader
45 | while 1:
46 | r=requests.get(url=testUrl,headers=headers,proxies=mayi_proxy)
47 | print(r.status_code)
48 | #r.encoding='gb2312'
49 | print(r.text)
50 | time.sleep(10)
51 | #pg.encoding = 'GB18030'
52 |
--------------------------------------------------------------------------------
/dfcf/settings.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2020/3/31 23:36
4 | # @File : settings.py
5 | import time
6 |
7 | import config
8 | import requests
9 |
10 | headers = {
11 | 'Connection': 'keep-alive',
12 | # 'Upgrade-Insecure-Requests': '1',
13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
15 | 'Referer': 'http://guba.eastmoney.com/list,300750_2.html',
16 | 'Accept-Encoding': 'gzip, deflate',
17 | 'Accept-Language': 'zh,en;q=0.9,en-US;q=0.8',
18 | }
19 |
20 | cookies = {
21 | 'qgqp_b_id': '4d112e2089d3c5855c8ca2d1f2947ecd',
22 | 'em_hq_fls': 'js',
23 | 'st_si': '98016728708487',
24 | 'HAList': 'a-sh-601799-%u661F%u5B87%u80A1%u4EFD%2Ca-sh-600729-%u91CD%u5E86%u767E%u8D27%2Ca-sz-000063-%u4E2D%u5174%u901A%u8BAF%2Cf-0-399300-%u6CAA%u6DF1300',
25 | 'emshistory': '%5B%22%E6%98%9F%E5%AE%87%E8%82%A1%E4%BB%BD%22%2C%22601799%22%2C%22300496%22%2C%22dfcf%22%5D',
26 | 'st_asi': 'delete',
27 | 'st_pvi': '04745525503534',
28 | 'st_sp': '2019-10-28%2011%3A48%3A22',
29 | 'st_inirUrl': 'https%3A%2F%2Fwww.baidu.com%2Flink',
30 | 'st_sn': '132',
31 | 'st_psi': '20200401002426450-117001301474-3984682985',
32 | }
33 |
34 | def get_proxy(retry=10):
35 | count = 0
36 | proxyurl = 'http://{}:8101/dynamicIp/common/getDynamicIp.do'.format(
37 | config.PROXIES_OLD)
38 | for i in range(retry):
39 | try:
40 | r = requests.get(proxyurl, timeout=10)
41 |             # print('fetched proxy ip: ' + r.text)
42 | except Exception as e:
43 | print(e)
44 | count += 1
45 |             print('failed to fetch a proxy, retry ' + str(count))
46 | time.sleep(1)
47 |
48 | else:
49 | js = r.json()
50 | proxyServer = 'http://{0}:{1}'.format(js.get('ip'), js.get('port'))
51 | proxies_random = {
52 | 'http': proxyServer
53 | }
54 | return proxies_random
55 |
--------------------------------------------------------------------------------
/holdle/sync_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time : 2020/11/24 21:42
3 | # @File : sync_spider.py
4 | # @Author : Rocky C@www.30daydo.com
5 | import requests
6 | import sys
7 | sys.path.append('..')
8 | import asyncio
9 | import datetime
10 | import aiohttp
11 | import re
12 | import time
13 | from parsel import Selector
14 | from configure.settings import DBSelector
15 | from common.BaseService import BaseService
16 |
17 | SLEEP = 2
18 |
19 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
20 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'}
21 |
22 | URL_MAP = {'home_page': 'https://holdle.com/stocks/industry', 'base': 'https://holdle.com'}
23 |
24 |
25 | class Holdle(BaseService):
26 |
27 | def __init__(self):
28 | super(Holdle, self).__init__()
29 |
30 | self.DB = DBSelector()
31 | self.client = self.DB.mongo(location_type='qq', async_type=True)
32 | self.session = requests.Session()
33 |
34 | def run(self):
35 | start = time.time()
36 |
37 | response = self.session.get(url=URL_MAP['home_page'], headers=headers)
38 |         html = response.text # this call blocks
39 | resp = Selector(text=html)
40 | industries = resp.xpath('//ul[@class="list-unstyled"]/a')
41 | for industry in industries:
42 | json_data = {}
43 | industry_url = industry.xpath('.//@href').extract_first()
44 | industry_name = industry.xpath('.//li/text()').extract_first()
45 | json_data['industry_url'] = industry_url
46 | json_data['industry_name'] = industry_name
47 | self.detail_list(industry_url, json_data)
48 |
49 | end = time.time()
50 | print(f'time used {end-start}')
51 |
52 | def detail_list(self, url, json_data):
53 |
54 | response = self.session.get(URL_MAP['base']+url, headers=headers)
55 | response =response.text
56 | self.parse_detail(response, json_data)
57 |
58 | def parse_detail(self, html, json_data=None):
59 | resp = Selector(text=html)
60 | title =resp.xpath('//title/text()').extract_first()
61 | print(title)
62 |
63 |
64 | app = Holdle()
65 | app.run()
66 |
--------------------------------------------------------------------------------
/51jbnet/im_sandbox/models.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2019/5/16 17:34
4 | # @File : models.py
5 | from contextlib import contextmanager
6 | from datetime import datetime
7 |
8 | from sqlalchemy import create_engine
9 | from sqlalchemy import Column, Integer, String, Date, DateTime, Text
10 | from sqlalchemy.orm import sessionmaker, scoped_session
11 | from sqlalchemy.ext.declarative import declarative_base
12 |
13 | from im_sandbox.settings import MYSQL_DB_URI
14 |
15 | # Declarative mapping: the class below describes how the model maps to table columns
16 | Base = declarative_base()
17 | engine = create_engine(MYSQL_DB_URI)
18 | session_factory = sessionmaker(bind=engine)
19 | Session = scoped_session(session_factory)
20 |
21 |
22 | @contextmanager
23 | def scoped_session():
24 | session = Session()
25 | try:
26 | yield session
27 | session.commit()
28 | except:
29 | session.rollback()
30 | raise
31 | finally:
32 | session.close()
33 |
34 |
35 | class SpiderModel(Base):
36 | __tablename__ = 'testdb'
37 | id = Column(Integer, primary_key=True, autoincrement=True)
38 | score = Column(Integer, nullable=False, default=0)
39 | catid = Column(Integer, nullable=False, default=0)
40 | score_story = Column(String(512), nullable=False, default='')
41 | hometext = Column(String(1024), nullable=False, default='')
42 | counter = Column(Integer, nullable=False, default=0)
43 |     inputtime = Column(DateTime, nullable=False, default=datetime.now)
44 | topic = Column(Integer, nullable=False, default=0)
45 | source = Column(String(128), nullable=False, default='')
46 | mview = Column(Integer, nullable=False, default=0)
47 | comments = Column(Integer, nullable=False, default=0)
48 |     crawled_datetime = Column(DateTime, nullable=False, default=datetime.now)
49 | rate_sum = Column(Integer, nullable=False, default=0)
50 | title = Column(String(512), nullable=False, default='')
51 | url_show = Column(String(512), nullable=False, default='')
52 | thumb = Column(String(256), nullable=False, default='')
53 |
54 | # Uncomment the line below when creating the table
55 | # Base.metadata.create_all(engine)
56 |
57 | def map_orm_item(scrapy_item, sql_item):
58 | for k, v in scrapy_item.items():
59 | sql_item.__setattr__(k, v)
60 | return sql_item
61 |
62 |
63 |
--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/CustomExtension.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Date : 2019-08-27 11:31:19
4 | # @Author : Rocky Chen (weigesysu@qq.com)
5 | # @Link : http://30daydo.com
6 | # @Version : $1.0$
7 | from scrapy import signals
8 | import pika
9 | import json
10 | import datetime
11 | from scrapy.exceptions import NotConfigured
12 |
13 | # Custom extension: push spider-close stats to RabbitMQ
14 | class AdvancedExtension(object):
15 |
16 | def __init__(self,crawler):
17 | self.crawler = crawler
18 | self.crawler.signals.connect(self.spider_close,signals.spider_closed)
19 | self.mq_host=crawler.settings.get('MQ_HOST')
20 | self.mq_port=crawler.settings.getint('MQ_PORT')
21 | self.mq_user=crawler.settings.get('MQ_USER')
22 | self.mq_password=crawler.settings.get('MQ_PASSWORD')
23 | self.queue_name = crawler.settings.get('MQ_QUEUE_NAME')
24 | if not self.queue_name:
25 |             raise NotConfigured # disables this extension without raising an error
26 | self.start_time = datetime.datetime.now()
27 |
28 | @classmethod
29 | def from_crawler(cls,crawler):
30 |
31 | return cls(crawler)
32 |
33 | def spider_close(self,spider):
34 |
35 | print('in extension module, spider close')
36 | print(f'spider name {spider.name}')
37 | # print(dir(spider))
38 | credentials = pika.PlainCredentials(self.mq_user,self.mq_password)
39 |
40 | connection = pika.BlockingConnection(pika.ConnectionParameters(self.mq_host,self.mq_port,'/',credentials))
41 |
42 | channel = connection.channel()
43 |
44 | queue_name = 'spider'
45 | channel.queue_declare(queue=self.queue_name,durable=True)
46 | now = datetime.datetime.now()
47 |
48 | content = {'spiderName':spider.name,'status':'closed','start_time':self.start_time.strftime('%Y-%m-%d %H:%M:%S'),'end_time':now.strftime('%Y-%m-%d %H:%M:%S'),'time_used(s)':(now-self.start_time).seconds}
49 |
50 | send_content = json.dumps(content)
51 |
52 | channel.basic_publish(
53 | exchange='',
54 | routing_key=self.queue_name,
55 | body=send_content,
56 | properties=pika.BasicProperties(
57 |                 delivery_mode=2) # make the message persistent: it stays in the queue until consumed
58 | )
59 |
60 | print('[x] send {}'.format(send_content))
61 | connection.close()
62 |
63 |
64 |
65 |
66 |
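67 | # Usage sketch: a minimal settings.py fragment, assuming the package is importable
68 | # as "async_sandbox"; the MQ_* values are exactly the settings read in __init__ above.
69 | # EXTENSIONS = {
70 | #     'async_sandbox.CustomExtension.AdvancedExtension': 500,
71 | # }
72 | # MQ_HOST = '127.0.0.1'
73 | # MQ_PORT = 5672            # RabbitMQ default port
74 | # MQ_USER = 'guest'
75 | # MQ_PASSWORD = 'guest'
76 | # MQ_QUEUE_NAME = 'spider'  # without it, NotConfigured disables the extension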
--------------------------------------------------------------------------------
/yinyonbao/yingyongbao.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import re
3 |
4 | import requests
5 | from lxml import etree
6 | import pandas as pd
7 |
8 | class Yinyongbao():
9 | def __init__(self):
10 | self.user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
11 | self.headers = {"User-Agent": self.user_agent}
12 |
13 |
14 | def getData(self):
15 | base_url='http://sj.qq.com/myapp/category.htm'
16 | parent_url='http://sj.qq.com/myapp/category.htm?orgame=1'
17 | s=requests.get(url=parent_url,headers=self.headers)
18 | print(s.status_code)
19 | #print(s.text)
20 | tree=etree.HTML(s.text)
21 | menu=tree.xpath('//ul[@class="menu-junior"]')[0]
22 | print(type(menu))
23 |
24 | link= menu.xpath('.//li[@id]/a/@href')
25 | catelog=[]
26 | for i in link:
27 | print(i)
28 |             p=re.compile(r'categoryId=(-?\d+)')
29 | #x=base_url+i
30 | x=p.findall(i)[0]
31 | #print(x)
32 | catelog.append(x)
33 | return catelog
34 |
35 | def testcase(self):
36 | catelog=self.getData()
37 | print(catelog)
38 | for i in catelog:
39 | print("Catelog : ", i)
40 | self.each_page(int(i),0)
41 |
42 |     # Crawl a single category
43 | def each_page(self,categoryId,pageContext):
44 |
45 | url='http://sj.qq.com/myapp/cate/appList.htm?orgame=1&categoryId=%d&pageSize=20&pageContext=%d' %(categoryId,pageContext)
46 | para={'orgame':1,'categoryId':categoryId,'pageSize':20,'pageContext':pageContext}
47 | s=requests.get(url=url,params=para,headers=self.headers)
48 | js= s.json()
49 | name=[]
50 | df=pd.DataFrame(js['obj'])
51 | print(df)
52 | for i in js['obj']:
53 |             # all the fields we need are in this dict
54 | x= i['appName']
55 | print(x,' ---download count: ', i['appDownCount'])
56 |
57 | name.append(x)
58 | print(len(name))
59 | try:
60 | pageContext=int(js['pageContext'])
61 | self.each_page(categoryId,pageContext)
62 | except Exception as e:
63 | return
64 |
65 | def main():
66 | obj=Yinyongbao()
67 | #obj.getData()
68 | #obj.each_page('',0)
69 | obj.testcase()
70 | '''
71 | for i in range(0,200,38):
72 | obj.each_page('',i)
73 | '''
74 | main()
75 |
--------------------------------------------------------------------------------
/ximalaya/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # website: http://30daydo.com
3 | # @Time : 2019/6/30 12:03
4 | # @File : main.py
5 |
6 | import requests
7 | import re
8 | import os
9 |
10 | url = 'http://180.153.255.6/mobile/v1/album/track/ts-1571294887744?albumId=23057324&device=android&isAsc=true&isQueryInvitationBrand=true&pageId={}&pageSize=20&pre_page=0'
11 | headers = {'User-Agent': 'Xiaomi'}
12 |
13 | def download():
14 | for i in range(1, 3):
15 | r = requests.get(url=url.format(i), headers=headers)
16 | js_data = r.json()
17 | data_list = js_data.get('data', {}).get('list', [])
18 | for item in data_list:
19 | trackName = item.get('title')
20 | trackName = re.sub('[\/\\\:\*\?\"\<\>\|]', '_', trackName)
21 | # trackName=re.sub(':','',trackName)
22 | src_url = item.get('playUrl64')
23 | filename = '{}.mp3'.format(trackName)
24 | if not os.path.exists(filename):
25 |
26 | try:
27 | r0 = requests.get(src_url, headers=headers)
28 | except Exception as e:
29 | print(e)
30 | print(trackName)
31 | r0 = requests.get(src_url, headers=headers)
32 |
33 |
34 | else:
35 | with open(filename, 'wb') as f:
36 | f.write(r0.content)
37 |
38 | print('{} downloaded'.format(trackName))
39 |
40 | else:
41 |                 print(f'{filename} already downloaded')
42 |
43 | import shutil
44 |
45 | def rename_():
46 | for i in range(1, 3):
47 | r = requests.get(url=url.format(i), headers=headers)
48 | js_data = r.json()
49 | data_list = js_data.get('data', {}).get('list', [])
50 | for item in data_list:
51 | trackName = item.get('title')
52 | trackName = re.sub('[\/\\\:\*\?\"\<\>\|]', '_', trackName)
53 | src_url = item.get('playUrl64')
54 |
55 | orderNo=item.get('orderNo')
56 |
57 | filename = '{}.mp3'.format(trackName)
58 | try:
59 |
60 | if os.path.exists(filename):
61 | new_file='{}_{}.mp3'.format(orderNo,trackName)
62 | shutil.move(filename,new_file)
63 | except Exception as e:
64 | print(e)
65 |
66 |
67 |
68 |
69 |
70 | if __name__=='__main__':
71 | rename_()
72 |
--------------------------------------------------------------------------------
/myubbs/sandbox/spiders/website.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import json
4 | import re
5 | import scrapy
6 | from scrapy import Request, FormRequest
7 | import logging
8 | import redis
9 | from sandbox.items import SpiderItem
10 | from sandbox.utility import get_header
11 |
12 | # get
13 | class WebGetSpider(scrapy.Spider):
14 | name = 'myubbs'
15 | URL = 'http://zsu.myubbs.com/forum-97-{}.html'
16 |
17 | def __init__(self):
18 |
19 | super(WebGetSpider,self).__init__()
20 | self.headers=get_header()
21 | self.page=10
22 |
23 | def start_requests(self):
24 | # TO DO
25 | for p in range(1,self.page+1):
26 | yield Request(url=self.URL.format(p),
27 | headers=self.headers
28 | )
29 |
30 | def parse(self, response):
31 | root=response.xpath('//*[@id="threadlisttableid"]/tbody')
32 | for node in root[1:]:
33 | url = node.xpath('.//th//a[@class="s xst"]/@href').extract_first()
34 | # print(url)
35 | if url:
36 | yield Request(url,headers=self.headers,callback=self.parse_item)
37 |
38 | def parse_item(self,response):
39 |
40 | title = response.xpath('//span[@id="thread_subject"]/text()').extract_first()
41 | url = response.url
42 | pubdate = response.xpath('//div[@id="postlist"]/div[1]/table//div[@class="authi"]/em/text()').re_first('\d+-\d+-\d+ \d+:\d+:\d{2}')
43 | if pubdate is None:
44 | try:
45 | pubdate = response.xpath('//div[@id="postlist"]/div[1]/table//div[@class="authi"]/em/span/@title').extract_first()
46 | except Exception as e:
47 | print(e)
48 | pubdate=''
49 | # pubdate = response.xpath('//div[@id="postlist"]/').extract_first()
50 | author=response.xpath('//div[@class="authi"]/a/text()').extract_first()
51 | content = response.xpath('//td[@class="t_f"]')[0].xpath('string(.)').extract()[0]
52 | crawltime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
53 |
54 | spiderItem= SpiderItem()
55 |
56 | for field in spiderItem.fields:
57 | try:
58 | spiderItem[field]=eval(field)
59 | except Exception as e:
60 | logging.warning('can not find define of {}'.format(field))
61 | logging.warning(e)
62 |
63 | # print(spiderItem)
64 | yield spiderItem
65 |
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/v2ex_job/v2ex2.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import etree
3 | from scrapy import Selector
4 | from twisted.internet import defer
5 | from twisted.internet import reactor
6 | from twisted.web.client import getPage
7 |
8 |
9 | class V2exJob:
10 | def __init__(self):
11 | pass
12 |
13 | def get_page(self):
14 | """
15 |         Get the total number of pages.
16 | :return:
17 | """
18 | index_url = 'https://www.v2ex.com/go/jobs'
19 | index_headers = {
20 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
21 | }
22 | response = requests.get(url=index_url, headers=index_headers)
23 | selector = Selector(text=response.text)
24 | all_page = selector.xpath('//a[@class="page_normal"]/text()').extract()
25 | all_page = all_page[-1]
26 | return all_page
27 |
28 | @defer.inlineCallbacks
29 | def get_html(self, each_page):
30 | """
31 |         Fetch the page content and return it.
32 | :param each_page:
33 | :return:
34 | """
35 | each_urls = 'https://www.v2ex.com/go/jobs?p=%s' % str(each_page)
36 |         res = getPage(bytes(each_urls, encoding="utf-8")) # fetch the page: the request's socket is kept in the reactor's select pool, which tracks outstanding requests
37 |         # print( type(res)) #
38 |         res.addCallback(self.parse_infos) # attach a callback to every request
39 |         yield res # yield the deferred
40 |
41 | def parse_infos(self, parse_infos):
42 | parse_infos = parse_infos.decode('utf-8')
43 | parse_infos = etree.HTML(parse_infos)
44 | infos = parse_infos.xpath('//span[@class="item_title"]/a/text()')
45 | print(infos)
46 |
47 | def run(self):
48 | """
49 | 程序的启动开始采集数据
50 | :return:
51 | """
52 | all_page = self.get_page()
53 | defer_list = []
54 |         for each_page in range(1, 10): # don't fire too many requests at once, or the site will ban you
55 |             v = self.get_html(each_page) # returns immediately without waiting; v is a Deferred representing that request
56 |             defer_list.append(v)
57 |         d = defer.DeferredList(defer_list) # gather all the Deferreds into a DeferredList
58 |         d.addBoth(self.all_done) # add a callback for when every Deferred has fired
59 |         reactor.run() # the reactor loops forever, so we stop it once all tasks finish: each completed get_html decrements the outstanding count, and when it reaches 0 all_done runs
60 |
61 | def all_done(self, arg):
62 | print("all done")
63 | reactor.stop()
64 |
65 |
66 | if __name__ == '__main__':
67 | v2ex_job = V2exJob()
68 | v2ex_job.run()
69 |
70 |
--------------------------------------------------------------------------------
/anjuke/test_anjuke.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import re
3 |
4 | import requests
5 | from lxml import etree
6 | headers = {
7 | 'accept': 'text/html',
8 | 'accept-encoding': 'gzip, deflate, sdch',
9 | 'accept-language': 'zh-CN,zh;q=0.8',
10 | 'cache-control': 'no-cache',
11 | 'pragma': 'no-cache',
12 | 'User-Agent': 'UCWEB/2.0 (Linux; U; Adr 2.3; zh-CN; MI-ONEPlus)U2/1.0.0 UCBrowser/8.6.0.199 U2/1.0.0 Mobile',
13 | 'x-requested-with': 'XMLHttpRequest',
14 | 'cookie': 'als=0; isp=true; Hm_lvt_c5899c8768ebee272710c9c5f365a6d8=1502856226; sessid=1551E6AF-1AA9-2526-E4E9-D494551F4A2F; search_words361=%E9%98%B3%E5%85%89%E5%B0%8F%E5%8C%BA; search_words24=%E9%9D%96%E6%B1%9F%E9%9B%85%E5%9B%AD11%E5%8F%B7%E6%A5%BC%7C%E6%9C%88%E6%A1%82%E8%A5%BF%E5%9B%AD; search_words14=%E8%B6%85%E6%98%8E%E5%9B%AD; search_words25=%E6%96%B0%E6%83%A0%E5%AE%B6%E5%9B%AD; browse_comm_ids13=95393; seo_source_type=0; search_words13=%E6%AC%A7%E9%99%86%E7%BB%8F%E5%85%B8%7C%E5%8D%97%E6%96%B9%E6%98%8E%E7%8F%A0%E8%8A%B1%E5%9B%AD%7C%E5%8D%97%E6%96%B9%E6%98%8E%E7%8F%A0%E8%8A%B1%E5%9B%AD%E4%BA%8C%E6%9C%9F1%E6%A0%8B; twe=2; __xsptplus8=8.43.1504789824.1504790391.8%233%7C123.sogou.com%7C%7C%7C%7C%23%23hvhL5eg3_ejnK-ngxJE-qwbIXXbQIk81%23%3B%20aQQ_a; _ga=GA1.2.1188068084.1502419352; _gid=GA1.2.1082371756.1504696715; lps="/cityList/|"; aQQ_ajkguid=B97BFB26-048C-2797-947E-7543B95A2D8A; ctid=13; 58tj_uuid=a4461385-7d0d-4e1a-9e94-85fa7b69f6aa; new_session=0; init_refer=; new_uv=61'
15 | }
16 |
17 | start_url = 'https://m.anjuke.com/gu/community/?from=anjuke_home&p=1'
18 | r = requests.get(url=start_url, headers=headers)
19 | if r.json()['data']:
20 | print('not empty')
21 | else:
22 | print('empty')
23 |
24 |
25 | price_case='https://m.anjuke.com/gz/community/112952/'
26 | content=requests.get(url=price_case,headers=headers).text
27 | tree=etree.HTML(content)
28 | price=tree.xpath('//a[@data-soj="community_topprice"]/div[@class="txt-c"]/p[@class="price"]/text()')[0]
29 | print(price)
30 | name=tree.xpath('//div[@class="comm-tit"]/h1/text()')[0]
31 | print(name)
32 | address=tree.xpath('//div[@class="comm-tit"]/div[@class="comm-ad"]/p/text()')[0]
33 | print(address)
34 | building_type=tree.xpath('//div[@class="header-field"]/span')[0].xpath('./text()')[0]
35 | building_date=tree.xpath('//div[@class="header-field"]/span')[2].xpath('./text()')[0]
36 | print(building_date)
37 | print(building_type)
38 | pattern = 'data-center="(.*?)"'
39 | data = re.findall(pattern, content)
40 | t= data[0].split(',')
41 | print(t[0])
42 | print(t[1])
43 | #longitude = data[0]
44 | #latitude = data[1]
--------------------------------------------------------------------------------
/youdao_dictionary/youdao.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # website: http://30daydo.com
3 | # @Time : 2019/2/23 19:34
4 | # @File : youdao.py
5 | # Reverse-engineering the Youdao dictionary's JS signing
6 |
7 |
8 | import hashlib
9 | import random
10 | import requests
11 | import time
12 |
13 |
14 | def md5_(word):
15 | s = bytes(word, encoding='utf8')
16 | m = hashlib.md5()
17 | m.update(s)
18 | ret = m.hexdigest()
19 | return ret
20 |
21 | def get_sign(word, salt):
22 | ret = md5_('fanyideskweb' + word + salt + 'p09@Bn{h02_BIEe]$P^nG')
23 | return ret
24 |
25 | def youdao(word):
26 | url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
27 | headers = {
28 | 'Host': 'fanyi.youdao.com',
29 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
30 | 'Accept': 'application/json, text/javascript, */*; q=0.01',
31 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
32 | 'Accept-Encoding': 'gzip, deflate',
33 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
34 | 'X-Requested-With': 'XMLHttpRequest',
35 | 'Referer': 'http://fanyi.youdao.com/',
36 | 'Cookie': 'YOUDAO_MOBILE_ACCESS_TYPE=1; OUTFOX_SEARCH_USER_ID=1672542763@10.169.0.83; JSESSIONID=aaaWzxpjeDu1gbhopLzKw; ___rl__test__cookies=1550913722828; OUTFOX_SEARCH_USER_ID_NCOO=372126049.6326876',
37 | 'Connection': 'keep-alive',
38 | 'Pragma': 'no-cache',
39 | 'Cache-Control': 'no-cache',
40 | }
41 |
42 | ts = str(int(time.time()*1000))
43 | salt=ts+str(random.randint(0,10))
44 | bv = md5_("5.0 (Windows)")
45 | sign= get_sign(word,salt)
46 |
47 | post_data = {
48 | 'i': word,
49 | 'from': 'AUTO', 'to': 'AUTO', 'smartresult': 'dict', 'client': 'fanyideskweb', 'salt': salt,
50 | 'sign': sign, 'ts': ts, 'bv': bv, 'doctype': 'json', 'version': '2.1',
51 | 'keyfrom': 'fanyi.web', 'action': 'FY_BY_REALTIME', 'typoResult': 'false'
52 | }
53 |
54 | r = requests.post(
55 | url=url,
56 | headers=headers,
57 | data=post_data
58 | )
59 |
60 | js_data = r.json()
61 | smart_result= js_data.get('smartResult', {})
62 |
63 | if smart_result:
64 | for item in smart_result.get('entries'):
65 | print(item)
66 |
67 | translate_result = js_data.get('translateResult',[])
68 | if translate_result:
69 | for items in translate_result:
70 | for item in items:
71 | print(item.get('tgt'))
72 |
73 | word='我喜欢吃鸡腿'
74 | youdao(word)
75 |
--------------------------------------------------------------------------------
/zhihu/zhihu_book.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import requests
3 | import json
4 | import pymongo
5 | # Download Zhihu ebook data
6 | def get_books_by_url(url):
7 | headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"}
8 | r = requests.get(url, headers=headers)
9 | data = json.loads(r.content.decode("utf-8"))
10 | return data
11 |
12 | def get_books_by_category(category_id):
13 | url_patt = "https://www.zhihu.com/api/v3/books/categories/{}?limit={}&offset={}&version=v2"
14 | limit = 10
15 | offset = 0
16 | client = pymongo.MongoClient('10.18.6.26',27001)
17 | db = client.zhihu_book
18 | while True:
19 | url = url_patt.format(category_id, limit, offset)
20 | print(url)
21 | data = get_books_by_url(url)
22 | books = data["data"]
23 | db.books.insert_many(books)
24 | if data["paging"]["is_end"]:
25 | break
26 | offset = offset + limit
27 |
28 | def get_all_books():
29 | categories = [147, 254, 232, 209, 245, 175, 219, 189, 205, 161, 143, 284, 265, 214, 155, 241]
30 | for category in categories:
31 | get_books_by_category(category)
32 |
33 | def query_books():
34 | client = pymongo.MongoClient('10.18.6.26',27001)
35 | db = client.zhihu_book
36 |
37 | books = db.books.find().sort("score")
38 | book_ids = []
39 | for book in books:
40 | if book["id"] in book_ids:
41 | continue
42 | price = 0
43 | if book["promotion"]["is_promotion"]:
44 | price = book["promotion"]["promotion_price"]/100
45 | else:
46 | price = book["promotion"]["price"]/100
47 | print("{},{},{},{},{}".format(book["title"], book["url"], book["score"], price, book["promotion"]["origin_price"]/100))
48 | book_ids.append(book["id"])
49 |
50 | # books = db.books.find({"promotion.price": 0.0}).sort("score")
51 | # book_ids = []
52 | # for book in books:
53 | # if book["id"] in book_ids:
54 | # continue
55 | # print("{},{},{}".format(book["title"], book["url"], book["score"]))
56 | # book_ids.append(book["id"])
57 |
58 | if __name__ == "__main__":
59 | # parser = argparse.ArgumentParser()
60 | # parser.add_argument("--download", help="", action="store_true")
61 | # parser.add_argument("--query", help="", action="store_true")
62 | # args = parser.parse_args()
63 | # if args.download:
64 | # get_all_books()
65 | # elif args.query:
66 | # query_books()
67 | get_all_books()
--------------------------------------------------------------------------------
/51jbnet/im_sandbox/spiders/website.py:
--------------------------------------------------------------------------------
1 | # -*-coding=utf-8-*-
2 |
3 | # @Time : 2019/5/16 17:30
4 | # @File : website.py
5 |
6 | # -*- coding: utf-8 -*-
7 | import re
8 | import requests
9 | import scrapy
10 | from scrapy import Request
11 | from im_sandbox import settings
12 | from scrapy.log import logger
13 | import json
14 | from im_sandbox.items import SandboxItem
15 | import datetime
16 | from scrapy.selector import Selector
17 |
18 |
19 | class Website(scrapy.Spider):
20 | name = "website"
21 | category='linux_shell'
22 | idx=235
23 | total=1403
24 | page = int(total/40)+1
25 | default_headers = {
26 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
27 | "Accept-Encoding": "gzip, deflate, br",
28 | "Accept-Language": "zh,en;q=0.9,en-US;q=0.8,zh-CN;q=0.7",
29 | "Cache-Control": "no-cache",
30 | "Connection": "keep-alive",
31 | "Host": "www.jb51.net",
32 | "Pragma": "no-cache",
33 | "Referer": "https://www.jb51.net/list/list_97_1.htm",
34 | "Upgrade-Insecure-Requests": "1",
35 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
36 | }
37 |
38 | def start_requests(self):
39 | page = 400
40 | base_url = 'https://www.jb51.net/list/list_{idx}_{page}.htm'
41 | for i in range(1, self.page + 1):
42 | yield Request(url=base_url.format(page=i,idx=self.idx), headers=self.default_headers, callback=self.parse)
43 |
44 | def parse(self, response):
45 |
46 | if not response.body:
47 | logger.error(msg='there is no response body ,please go and check it ')
48 | return
49 |
50 | nodes = response.xpath('//div[@class="artlist clearfix"]/DL/DT')
51 | if nodes:
52 | pass
53 | else:
54 | nodes = response.xpath('//div[@class="artlist clearfix"]/dl/dt')
55 |
56 | for node in nodes:
57 | pubdate = node.xpath('.//span/text()').extract_first()
58 | pubdate = re.sub('日期:', '', pubdate)
59 | title=node.xpath('.//a/text()').extract_first()
60 | url=node.xpath('.//a/@href').extract_first()
61 | full_url = 'https://www.jb51.net{}'.format(url)
62 | item = SandboxItem()
63 | item['pubdate']=pubdate
64 | item['url']=full_url
65 | item['title']=title
66 | item['category']=self.category
67 | yield item
68 |
--------------------------------------------------------------------------------
/jd/jd/spiders/jd_book.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import scrapy
4 | from scrapy import Request
5 | from scrapy_splash import SplashRequest
6 | import re
7 | from jd.items import JdItem
8 | lua_script = """
9 | function main(splash)
10 | splash:go(splash.args.url)
11 | splash:wait(5)
12 | splash:runjs("document.getElementsByClassName('page')[0].scrollIntoView(true)")
13 | splash:wait(5)
14 | return splash:html()
15 | end
16 | """
17 |
18 |
19 | class JDBookSpider(scrapy.Spider):
20 | name = "jd_book"
21 | allowed_domains = ["search.jd.com"]
22 | kw='股票'
23 | base_url = 'https://search.jd.com/Search?keyword={}&enc=utf-8&wq={}'.format(kw,kw)
24 |
25 | def start_requests(self):
26 |         # request the first page; no JS rendering needed
27 | yield Request(self.base_url, callback=self.parse_urls, dont_filter=True)
28 |
29 | def parse_urls(self, response):
30 |         # get the total number of products and work out the page count
31 | total = response.css('span#J_resCount::text').extract_first().strip('+')
32 | try:
33 | total=re.sub('万','',total)
34 | total=float(total)*10000
35 | except:
36 | return
37 | pageNum = total // 60 + (1 if total % 60 else 0)
38 |
39 |         # build each page's url and send it to Splash's execute endpoint
40 | for i in range(int(pageNum)):
41 | url = '%s&page=%s' % (self.base_url, 2*i+1)
42 | yield SplashRequest(url, endpoint='execute', args={'lua_source': lua_script},\
43 | cache_args=['lua_source'])
44 |
45 | def parse(self, response):
46 |         # extract each book's name and price from the page
47 | for sel in response.css('ul.gl-warp.clearfix > li.gl-item'):
48 | item = JdItem()
49 | name= sel.css('div.p-name').xpath('string(.//em)').extract_first()
50 | price= sel.css('div.p-price i::text').extract_first()
51 | try:
52 | remark=sel.xpath('.//div[(@class="p-commit" or @class="p-comm")]').xpath('string(.)').extract_first()
53 | if remark:
54 | remark=remark.strip()
55 | except:
56 | remark=None
57 | try:
58 | price=float(price)
59 | except:
60 | price=price
61 |
62 |             # JD self-operated shop
63 | # shop=sel.css('div.p-shopnum span::text').extract_first()
64 |
65 |             # publisher
66 |
67 | publish=sel.css('div.p-shopnum a::text').extract_first()
68 | if publish is None:
69 | publish=sel.css('div.p-shop a::text').extract_first()
70 | # if shop is None:
71 | # shop=sel.css('div.p-shopnum a::text').extract_first()
72 | # publish=None
73 |
74 | item['name']=name
75 | item['price']=price
76 | item['remark']=remark
77 | item['publish']=publish
78 | # item['shop']=shop
79 | yield item
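80 |
81 | # Usage sketch: the scrapy-splash settings this spider would typically need, taken from
82 | # the scrapy-splash documentation and assuming a Splash instance on localhost:8050.
83 | # SPLASH_URL = 'http://localhost:8050'
84 | # DOWNLOADER_MIDDLEWARES = {
85 | #     'scrapy_splash.SplashCookiesMiddleware': 723,
86 | #     'scrapy_splash.SplashMiddleware': 725,
87 | #     'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
88 | # }
89 | # SPIDER_MIDDLEWARES = {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100}
90 | # DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'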
--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/CustomMiddleware.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Date : 2019-08-28 19:35:51
4 | # @Author : Rocky Chen (weigesysu@qq.com)
5 | # @Link : http://30daydo.com
6 | # @Version : 1.0
7 |
8 | # Custom downloader middlewares
9 | from scrapy.exceptions import IgnoreRequest
10 | # from scrapy import log
11 | import logging
12 | from scrapy.downloadermiddlewares.retry import RetryMiddleware
13 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
14 | from scrapy.utils.response import response_status_message  # needed by ModifiedRetryMiddleware below
15 | class CustomMiddleware(object):
16 |
17 | def process_request(self,request,spider):
18 | # print('before download v1')
19 | # print(f'name -->{spider.name}')
20 |
21 |         request.meta['vvv']='kkk' # you can pass values along to later stages via meta like this
22 |
23 |
24 |         # print('raise an error deliberately') # jumps to process_exception
25 | # raise IgnoreRequest
26 |
27 | def process_response(self,request,response,spider):
28 | # print('after download v1')
29 | # print(f'name -->{spider.name}')
30 | # print(request.meta['vvv'])
31 | # print(dir(response))
32 | # print(response.status)
33 |
34 | if response.status==404:
35 |             print('rescheduling the request')
36 | return request
37 | else:
38 |             return response # must return the response
39 |
40 | def process_exception(self,request, exception, spider):
41 |         print('hit an error!!!')
42 | return request
43 |
44 | class CustomMiddleware2(object):
45 |
46 | def process_request(self,request,spider):
47 | # logging.info('before download v2')
48 | # print(f'name -->{spider.name}')
49 |         request.meta['vvv']='kkk' # pass values along via meta like this
50 |
51 | def process_response(self,request,response,spider):
52 | # print('after download v2')
53 | # print(f'name -->{spider.name}')
54 | # print(request.meta['vvv'])
55 | v = request.meta['vvv']
56 | return response
57 |
58 |
59 | class ModifiedRetryMiddleware(RetryMiddleware):
60 |
61 |
62 | def process_response(self, request, response, spider):
63 |
64 |         logging.info('custom middleware inheriting from RetryMiddleware')
65 |
66 | if request.meta.get('dont_retry', False):
67 | return response
68 |
69 | if response.status in self.retry_http_codes:
70 | reason = response_status_message(response.status)
71 | return self._retry(request, reason, spider) or response
72 |
73 | return response
74 |
75 | class ModifiedUserAgentMiddleware(UserAgentMiddleware):
76 |
77 | def process_request(self, request, spider):
78 |
79 | if self.user_agent:
80 |
81 |             logging.info('custom User-Agent middleware')
82 |
83 | request.headers.setdefault(b'User-Agent', self.user_agent)
84 |
85 | def process_response(self,request,response,spider):
86 |         logging.info(f'request headers ====== {request.headers}')
87 | return response
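88 |
89 | # Usage sketch: a minimal settings.py fragment, assuming the package is importable as
90 | # "async_sandbox"; the built-in middlewares are disabled so the subclasses above replace them.
91 | # DOWNLOADER_MIDDLEWARES = {
92 | #     'async_sandbox.CustomMiddleware.CustomMiddleware': 543,
93 | #     'async_sandbox.CustomMiddleware.CustomMiddleware2': 544,
94 | #     'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
95 | #     'async_sandbox.CustomMiddleware.ModifiedRetryMiddleware': 550,
96 | #     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
97 | #     'async_sandbox.CustomMiddleware.ModifiedUserAgentMiddleware': 500,
98 | # }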
--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/monitor/templates/index.html:
--------------------------------------------------------------------------------
5 | 爬虫动态监控系统 (page title: "Spider Real-time Monitoring Dashboard"; the remaining HTML markup is not preserved in this dump)
--------------------------------------------------------------------------------
/stock_pledge/crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # website: http://30daydo.com
3 | # @Time : 2019/3/9 17:17
4 | # @File : crawler.py
5 | import datetime
6 | import requests
7 |
8 | # import grequests
9 | import pandas as pd
10 | import numpy as np
11 | from setting import get_engine
12 | import tushare as ts
13 |
14 | # data is only available from 2018-03-05 onward
15 |
16 | url = 'http://www.chinaclear.cn/cms-rank/downloadFile?queryDate={}&type=proportion'
17 |
18 | headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
19 | 'Accept-Encoding': 'gzip,deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Cache-Control': 'no-cache',
20 | 'Pragma': 'no-cache', 'Proxy-Connection': 'keep-alive',
21 | # 'Referer': 'http://www.chinaclear.cn/cms-rank/queryPledgeProportion?action=query&queryDate=2019.03.09&secCde=&page=3',
22 | 'Upgrade-Insecure-Requests': '1',
23 |            'User-Agent': 'Mozilla/5.0 (Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
24 |
25 | engine = get_engine('db_pledge', 'local')
26 |
27 |
28 | class PledgeSpider():
29 |
30 | def __init__(self):
31 | self.start = datetime.datetime.now()
32 | self.delta= 400
33 |
34 |
35 | def start_task(self):
36 | pass
37 |
38 | def handle_exception(self,request,exception):
39 | print('process error')
40 |
41 | def crawl(self):
42 | # tasks=[]
43 | # date_list =[]
44 | for i in range(self.delta):
45 | fetch_day = self.start+datetime.timedelta(days=-1*i)
46 | if fetch_day < datetime.datetime(year=2018,month=3,day=4):
47 | break
48 |
49 | if not ts.is_holiday(fetch_day.strftime('%Y-%m-%d')):
50 | name=fetch_day.strftime('%Y-%m-%d')
51 | try:
52 | day=url.format(fetch_day.strftime('%Y.%m.%d'))
53 | print(day)
54 | r=requests.get(url=day,headers=headers,timeout=20)
55 | except Exception as e:
56 | print(e)
57 | else:
58 | print(r.status_code)
59 | with open('{}.xls'.format(name), 'wb') as f:
60 | f.write(r.content)
61 | # tasks.append(grequests.get(url=url.format(fetch_day.strftime('%Y.%m.%d'))))
62 |
63 | # date_list.append(fetch_day.strftime('%Y-%m-%d'))
64 |
65 | # resp = grequests.map(tasks,size=8,exception_handler=self.handle_exception)
66 | # for index,r in enumerate(resp):
67 | # with open('{}.xls'.format(date_list[index]),'wb') as f:
68 | # f.write(r.content)
69 |
70 |
71 | def data_transfer(self):
72 | df = pd.read_excel('pledge.xls', header=2, dtype={'证券代码': np.str})
73 | df = df.reset_index(drop=True)
74 | return df
75 |
76 |
77 | pledge = PledgeSpider()
78 | pledge.crawl()
79 | # df = pledge.data_transfer()
80 |
--------------------------------------------------------------------------------
/cuiqingcai/async_sandbox/spiders/example.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import re
4 |
5 | import scrapy
6 | from scrapy import Request
7 | import logging
8 | from async_sandbox.items import AsyncSandboxItem
9 |
10 |
11 | class ExampleSpider(scrapy.Spider):
12 | name = 'example'
13 |     # technique category
14 |     # BASE_URL = 'https://cuiqingcai.com/category/technique/page/{}'
15 |     # life category
16 | BASE_URL = 'https://cuiqingcai.com/category/life/page/{}'
17 |
18 | def start_requests(self):
19 | start_page = 1
20 |
21 | yield Request(
22 | url=self.BASE_URL.format(start_page),
23 | meta={'page': start_page}
24 | )
25 |
26 | def parse(self, response):
27 | page = response.meta['page']
28 | next_page = page + 1
29 |
30 | articles = response.xpath('//article[@class="excerpt"]')
31 | for article in articles:
32 | item = AsyncSandboxItem()
33 | category = article.xpath('./header/a[1]/text()').extract_first()
34 | title = article.xpath('./header/h2/a[1]/text()').extract_first()
35 | article_url = article.xpath('./header/h2/a[1]/@href').extract_first()
36 | item['title'] = title
37 | item['category'] = category
38 | item['article_url'] = article_url
39 |
40 | yield Request(
41 | url=article_url,
42 | callback=self.parse_item,
43 | meta={'item': item}
44 | )
45 |
46 | if next_page < 900:
47 | yield Request(
48 | url=self.BASE_URL.format(next_page),
49 | meta={'page': next_page}
50 | )
51 |
52 | def parse_item(self, response):
53 | item = response.meta['item']
54 | author = response.xpath(
55 | '//header[@class="article-header"]//i[@class="fa fa-user"]/following::*[1]/text()').extract_first()
56 | visited = response.xpath(
57 | '//header[@class="article-header"]//i[@class="fa fa-eye"]/parent::*[1]/text()').extract_first()
58 | comment = response.xpath(
59 | '//header[@class="article-header"]//i[@class="fa fa-comments-o"]/following-sibling::*[1]/text()').extract_first()
60 | liked = response.xpath('//span[@class="count"]/text()').extract_first()
61 | created_at = response.xpath(
62 | '//header[@class="article-header"]//i[@class="fa fa-clock-o"]/parent::*[1]/text()').extract_first()
63 | content = response.xpath('//article[@class="article-content"]')[0].xpath('string(.)').extract()[0]
64 |
65 | item['author'] = author
66 | item['created_at'] = created_at
67 | item['content'] = content
68 | visited=re.sub('浏览','',visited)
69 | item['visited'] = visited
70 | comment=re.sub('评论','',comment)
71 | item['comment'] = comment
72 | item['liked'] = liked
73 | item['crawltime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
74 | yield item
75 |
--------------------------------------------------------------------------------
/52sh/aio_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # website: http://30daydo.com
3 | # @Time : 2020/9/24 12:09
4 | # @File : aio_spider.py
5 | import asyncio
6 | import aiohttp
7 | import aiofiles
8 | import os
9 |
10 | import re
11 |
12 | from config_file import START_URL, HEADERS, PROXY_STR,SIMPLE_HEADERS
13 | from parsel import Selector
14 |
15 |
16 | async def fetch(url):
17 | async with aiohttp.ClientSession() as session:
18 | async with session.get(url=url,
19 | headers=HEADERS,
20 | proxy=PROXY_STR,
21 | ) as response:
22 | text = await response.text()
23 | resp = Selector(text=text)
24 | nodes = resp.xpath('//div[@class="kl1-2"]')
25 | for node in nodes:
26 | next_url = node.xpath('.//div[@class="kl1-2a2"]/a/@href').extract_first()
27 | title = node.xpath('.//div[@class="kl1-2a2"]/a/@title').extract_first()
28 | await detail(session=session, next_url=next_url, title=title)
29 | print('next page')
30 |
31 |
32 | async def detail(**kwargs):
33 | session = kwargs['session']
34 | next_url = kwargs['next_url']
35 | title = kwargs['title']
36 | print(next_url)
37 | print(title)
38 | async with session.get(
39 | url=next_url,
40 | headers=HEADERS,
41 | proxy=PROXY_STR,
42 | ) as response:
43 | text = await response.text()
44 | resp = Selector(text=text)
45 | nodes = resp.xpath('//div[@class="kl2-1"]//img/@src').extract()
46 | nodes = list(set(nodes))
47 | for img in nodes:
48 | # print(img)
49 | await download_img(session=session,url=img,title=title)
50 | print('next image')
51 |
52 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
53 |
54 | async def download_img(**kwargs):
55 | url= kwargs['url']
56 | title= kwargs['title']
57 |
58 | title = title.replace(' ','_')
59 | title = re.sub('[\/:*?"<>|]', '-', title)
60 | if not os.path.exists(title):
61 | os.mkdir(title)
62 |
63 | filename = url.split('/')[-1]
64 | if not filename.endswith(('png','jpg','jpeg')):
65 | return
66 | save_file = os.path.join(title,filename)
67 |
68 | if os.path.exists(save_file):
69 | return
70 | print('saving image - ')
71 | try:
72 |         conn = aiohttp.TCPConnector(ssl=False) # avoid SSL certificate errors
73 | async with aiohttp.ClientSession(connector=conn, trust_env=True) as session:
74 | async with session.get(url=url, headers=SIMPLE_HEADERS, proxy=PROXY_STR) as response:
75 |
76 | if response.status>=200 and response.status<300:
77 | f=await aiofiles.open(save_file,'wb')
78 | await f.write(await response.read())
79 | await f.close()
80 |
81 | except Exception as e:
82 | print(e)
83 | print(url)
84 | return
85 |
86 | async def main():
87 | total_page = 3640
88 | for page in range(0,total_page,35):
89 |
90 | url = START_URL.format(page=page)
91 | await fetch(url)
92 | await asyncio.sleep(0)
93 | print(f'downing page {page}-')
94 | loop = asyncio.get_event_loop()
95 | loop.run_until_complete(main())
96 |
--------------------------------------------------------------------------------
/51jbnet/im_sandbox/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from scrapy.exceptions import DropItem
8 |
9 | class ImSandboxPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
13 |
14 | import datetime
15 |
16 | import pymongo
17 |
18 | from im_sandbox.settings import MONGODB, ES_HOST
19 | from im_sandbox import models
20 | from im_sandbox.models import scoped_session
21 | from elasticsearch import Elasticsearch
22 | from scrapy import log
23 |
24 |
25 | class im_sandboxMongoPipeline(object):
26 |
27 | def __init__(self):
28 | self._db = MONGODB.get('db')
29 | self._collection = MONGODB.get('collection')
30 | self._host = MONGODB.get('host')
31 | self._port = MONGODB.get('port')
32 | self._client = pymongo \
33 | .MongoClient(host=self._host, port=self._port) \
34 | .get_database(self._db) \
35 | .get_collection(self._collection)
36 |
37 | def process_item(self, item, spider):
38 | self._client.create_index([('title', pymongo.DESCENDING)], background=True)
39 | self._client.update_one(filter={'title': item['title']}, update={'$set': dict(item)}, upsert=True)
40 | return item
41 |
42 |
43 | class im_sandboxMysqlPipeline(object):
44 |
45 | def process_item(self, item, spider):
46 | sql_im_sandbox = models.SpiderModel()
47 | sql_im_sandbox = models.map_orm_item(scrapy_item=item, sql_item=sql_im_sandbox)
48 | with scoped_session() as session:
49 | session.add(sql_im_sandbox)
50 |
51 | return item
52 |
53 |
54 | class ESPipeline(object):
55 | def __init__(self):
56 | self.index = '51jbnet'
57 | self.doc = 'doc'
58 | self.es = Elasticsearch(ES_HOST)
59 |
60 | def process_item(self, item, spider):
61 | crawltime = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
62 | url = item.get('url', None)
63 | if not url:
64 | raise FileNotFoundError('url is empty')
65 |
66 | query_body = {
67 | "query":
68 | {
69 | "term": {
70 | "url": url
71 | }
72 | }
73 | }
74 |
75 |         # deduplicate by url
76 | try:
77 | query_result = self.es.search(index=self.index, body=query_body)
78 |
79 | except Exception as e:
80 | log.msg(e)
81 |             raise ConnectionError('Elasticsearch query failed')
82 |
83 | hits=query_result.get('hits',{}).get('hits',[])
84 |
85 | if hits:
86 |
87 | raise DropItem('Duplication item')
88 |
89 | body = {
90 | "pubdate": item["pubdate"],
91 | "title": item["title"],
92 | "url": item["url"],
93 | "crawled_datetime": crawltime,
94 | "category": item['category'],
95 | }
96 |
97 | try:
98 | self.es.index(index=self.index, doc_type=self.doc, body=body)
99 | except Exception as e:
100 |             log.msg('error >>>>>')
101 | log.msg(e)
102 | return item
103 |
--------------------------------------------------------------------------------
/async_cuiqingcai/async_sandbox/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymysql
8 | from twisted.enterprise import adbapi
9 | import logging
10 | import pymongo
11 | from scrapy.exceptions import DropItem
12 |
13 | class AsyncSQLPipeline(object):
14 | def __init__(self):
15 | self.dbpool = adbapi.ConnectionPool('pymysql',host='192.168.1.100',port=3306,user='root',password='*',db='spider_test')
16 | # self.cursor = self.conn.cursor()
17 |
18 | def process_item(self, item, spider):
19 |         update_ = self.dbpool.runInteraction(self.update, item)
20 |         update_.addErrback(self.handle_error, item, spider)
21 |
22 | return item
23 |
24 | def update(self,cursor,item):
25 | insert_sql = 'insert into tb_cuiqingcai (category,title,article_url,content,author,created_at,liked,visited,comment,crawltime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
26 | data=(item['category'],item['title'],item['article_url'],item['content'],item['author'],item['created_at'],item['liked'],item['visited'],item['comment'],item['crawltime']
27 | )
28 | cursor.execute(insert_sql,data)
29 |
30 | def handle_error(self,failure,item,spider):
31 |         logging.error('Error while writing to the database --->')
32 | logging.error(failure)
33 | logging.error('error item')
34 | logging.error(item)
35 |
36 | class MongoPipeline(object):
37 |
38 | def __init__(self,host,port,db,doc):
39 | client = pymongo.MongoClient(host,port)
40 | self.doc=client[db][doc]
41 |
42 | @classmethod
43 | def from_crawler(cls,crawler):
44 | print('in from crawler')
45 | host = crawler.settings.get('MONGO_HOST')
46 | port = crawler.settings.getint('MONGO_PORT')
47 | db = crawler.settings.get('MONGO_DB')
48 | doc = crawler.settings.get('MONGO_DOC')
49 |
50 |
51 | print(f'host {host}')
52 | return cls(host,port,db,doc)
53 |
54 | def open_spider(self,spider):
55 | print('spider open')
56 |
57 | def process_item(self,item,spider):
58 | print('in mongopipeline')
59 |
60 | if item is None:
61 | print('item is None')
62 | else:
63 | print('item is not None')
64 | print(f'receive item -> len is {len(item)}')
65 | # self.doc.insert(dict(item))
66 | return item
67 |
68 | def close_spider(self,spider):
69 | print('closing in pipeline')
70 |
71 | class JSONPipeline(object):
72 |
73 | def __init__(self,host,port,db,doc):
74 | pass
75 |
76 | @classmethod
77 | def from_crawler(cls,crawler):
78 | print('in from crawler')
79 | host = crawler.settings.get('MONGO_HOST')
80 | port = crawler.settings.getint('MONGO_PORT')
81 | db = crawler.settings.get('MONGO_DB')
82 | doc = crawler.settings.get('MONGO_DOC')
83 |
84 |
85 | print(f'host {host}')
86 | return cls(host,port,db,doc)
87 |
88 | def open_spider(self,spider):
89 | print('spider open')
90 |
91 | def process_item(self,item,spider):
92 | print('in JSON pipeline')
93 | print(f'receive item -> len is {len(item)}')
94 |
95 | # return item
96 | raise DropItem(item)
97 |
98 | def close_spider(self,spider):
99 | print('closing in pipeline')
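A note on AsyncSQLPipeline above: the MySQL host and credentials are hard-coded in __init__. A minimal sketch of the same twisted adbapi pattern reading them from the crawler settings instead (the MYSQL_* setting names are assumptions, not settings defined in this project):

from twisted.enterprise import adbapi

class ConfiguredSQLPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_crawler(cls, crawler):
        # Hypothetical MYSQL_* keys; add them to settings.py if this pattern is used.
        settings = crawler.settings
        dbpool = adbapi.ConnectionPool(
            'pymysql',
            host=settings.get('MYSQL_HOST', '127.0.0.1'),
            port=settings.getint('MYSQL_PORT', 3306),
            user=settings.get('MYSQL_USER', 'root'),
            password=settings.get('MYSQL_PASSWORD', ''),
            db=settings.get('MYSQL_DB', 'spider_test'),
        )
        return cls(dbpool)

    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._insert, item)
        d.addErrback(self._handle_error, item, spider)
        return item

    def _insert(self, cursor, item):
        # Same idea as AsyncSQLPipeline.update, shortened to two columns here.
        cursor.execute(
            'insert into tb_cuiqingcai (title, article_url) values (%s, %s)',
            (item['title'], item['article_url']),
        )

    def _handle_error(self, failure, item, spider):
        spider.logger.error('Database write failed: %s (item: %r)', failure, item)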
--------------------------------------------------------------------------------
/fraud/fraud/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for fraud project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'fraud'
13 |
14 | SPIDER_MODULES = ['fraud.spiders']
15 | NEWSPIDER_MODULE = 'fraud.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'fraud (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | # CONCURRENT_REQUESTS_PER_DOMAIN = 1
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | COOKIES_ENABLED = True
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'fraud.middlewares.FraudSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | # 'fraud.middlewares.DynamicProxyMiddleware': 543,
57 | # }
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'fraud.pipelines.FraudPipeline': 300,
69 | # 'fraud.pipelines.DuplicatesPipeline': 200,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
--------------------------------------------------------------------------------
/MyLibrary/sandbox/sandbox/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for sandbox project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'sandbox'
13 |
14 | SPIDER_MODULES = ['sandbox.spiders']
15 | NEWSPIDER_MODULE = 'sandbox.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'sandbox (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'sandbox.middlewares.SandboxSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | # 'sandbox.middlewares.RandomUserAgent': 543,
57 | # 'sandbox.middlewares.ProxyMiddleware': 553,
58 | # }
59 |
60 | # Enable or disable extensions
61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 |
66 | # Configure item pipelines
67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | # 'sandbox.pipelines.SQLPipeline': 300,
70 | 'sandbox.pipelines.MongoPipeline': 100,
71 | }
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
--------------------------------------------------------------------------------
/anjuke/anjuke.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import codecs
3 | import json
4 | import re
5 | import urllib.parse
6 | from lxml import etree
7 | import requests
8 |
9 |
10 | def query(kw):
11 | for i in range(1, 10):
12 |         encode_kw = urllib.parse.quote(kw)  # quote moved to urllib.parse in Python 3
13 | print(i)
14 | url = 'https://m.anjuke.com/ajax/autocomplete/?city_id=13&kw=%s&from=1&callback=jsonp%d' % (encode_kw, i)
15 | s = requests.Session()
16 | headers = {
17 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}
18 | js = s.get(url, headers=headers)
19 | print(js.status_code)
20 | # print(js.text)
21 | try:
22 |             result = re.findall(r'jsonp%d\((.*?)\);' % i, js.text)[0]  # match the jsonp%d callback used in the URL
23 | dic = json.loads(result)
24 | print('*' * 20)
25 | print(dic['data']['match'][0]['comm_id'])
26 | except Exception as e:
27 | print(e)
28 |
29 |
30 | # Fetch the list of Anjuke cities
31 | def getcitylist():
32 | headers = {'Accept-Language': ' zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Accept-Encoding': ' gzip, deflate',
33 | 'Connection': ' keep-alive',
34 | 'Accept': ' text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
35 | 'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
36 | 'Host': ' m.anjuke.com', 'Referer': ' https://m.anjuke.com/bj/',
37 | 'Cookie': ' aQQ_ajkguid=145D8A4E-6387-1752-E32C-D4EFB4EBFE09; lps="/|"; ctid=14; 58tj_uuid=fdb54be9-84d6-4511-ad1e-3227c1eac9ae; new_session=0; init_refer=; new_uv=1; sessid=AD7C8189-AB56-4CAF-1BAC-FF0CCD27668C'}
38 | url = 'https://m.anjuke.com/cityList/'
39 | r = requests.get(url=url, headers=headers)
40 | print(r.status_code)
41 | tree = etree.HTML(r.text)
42 |     word = u'其他'  # "Other": the letter group excluded by the XPath below
43 | node = tree.xpath('//div[@class="cl-c-l-h" and @id !="letter-%s"]/following-sibling::*[1]' %word)
44 | dicts ={}
45 | for i in node:
46 | name = i.xpath('.//li/a/text()')
47 | link= i.xpath('.//li/a/@href')
48 | if len(name) != len(link):
49 | for j in name:
50 | print(j)
51 | for k in link:
52 | print(k)
53 |
54 | for index in range(len(name)):
55 | short_cut=link[index].split('/')[3]
56 | dicts[short_cut]=name[index]
57 |
58 | return dicts
59 |
60 | def debug_page():
61 |
62 | headers = {'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0', 'Host': 'm.anjuke.com', 'Cookie': 'aQQ_ajkguid=0B0A627A-FCF1-2B6A-2ADF-56DD166B0EBC; ctid=13; lps="/|"; sessid=804075FD-7FE8-E9C0-FA60-2FCB76C5B6B3; 58tj_uuid=02402201-d0d6-48de-8e58-6432612af29d; new_session=0; init_refer=; new_uv=1', 'Upgrade-Insecure-Requests': '1'}
63 |
64 | url='https://m.anjuke.com/dg/community/279422/'
65 |     r = requests.get(url=url, headers=headers)
66 | print(r.status_code)
67 | tree = etree.HTML(r.text)
68 | return tree
69 |
70 | #if __name__=="__main__":
71 | #debug_page()
72 | # query('南方明珠花园二期1栋')
73 | #d = getcitylist()
74 | #f=codecs.open('anjuke_city','w',encoding='utf-8')
75 | #json.dump(d,f,ensure_ascii=False)
76 | #for k,v in d.items():
77 | #print(k,v)
78 |
79 | tree = debug_page()  # runs at import time; kept here for debugging
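Related to the regex fix in query() above: because the callback name cycles through jsonp1 … jsonp9, a small helper that strips any JSONP wrapper keeps the parsing independent of the counter (a minimal sketch):

import json
import re

def parse_jsonp(text):
    # Accept payloads like "jsonp3({...});" regardless of the callback name.
    match = re.search(r'^[\w$]+\((.*)\)\s*;?\s*$', text, re.S)
    if match is None:
        raise ValueError('not a JSONP payload')
    return json.loads(match.group(1))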
--------------------------------------------------------------------------------
/lanrentingshu/lrts/lrts/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for lrts project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'lrts'
13 |
14 | SPIDER_MODULES = ['lrts.spiders']
15 | NEWSPIDER_MODULE = 'lrts.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'lrts (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'lrts.middlewares.LrtsSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'lrts.middlewares.LrtsDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | # MyFilesPipeline
69 | # 'scrapy.pipelines.files.FilesPipeline':1
70 | 'lrts.pipelines.MyFilesPipeline': 300,
71 | }
72 | FILES_STORE='C:\\git\\CrawlMan\\lanrentingshu\\lrts\\lrts\\data'
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
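ITEM_PIPELINES above enables lrts.pipelines.MyFilesPipeline, which is not included in this section. For reference, a minimal sketch of what such a FilesPipeline subclass typically looks like (the file_urls and title field names are assumptions about the spider's items):

import scrapy
from scrapy.pipelines.files import FilesPipeline

class MyFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # Schedule one download per collected URL; FILES_STORE decides where they land.
        for url in item.get('file_urls', []):
            yield scrapy.Request(url, meta={'title': item.get('title', '')})

    def file_path(self, request, response=None, info=None, *args, **kwargs):
        # Name the stored file after the (assumed) title field.
        return '%s.mp3' % request.meta.get('title', 'unnamed')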
--------------------------------------------------------------------------------
/fraud/fraud/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 | import time
10 | import hashlib
11 |
12 |
13 | class FraudSpiderMiddleware(object):
14 | # Not all methods need to be defined. If a method is not defined,
15 | # scrapy acts as if the spider middleware does not modify the
16 | # passed objects.
17 |
18 | @classmethod
19 | def from_crawler(cls, crawler):
20 | # This method is used by Scrapy to create your spiders.
21 | s = cls()
22 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
23 | return s
24 |
25 | def process_spider_input(self, response, spider):
26 | # Called for each response that goes through the spider
27 | # middleware and into the spider.
28 |
29 | # Should return None or raise an exception.
30 | return None
31 |
32 | def process_spider_output(self, response, result, spider):
33 | # Called with the results returned from the Spider, after
34 | # it has processed the response.
35 |
36 | # Must return an iterable of Request, dict or Item objects.
37 | for i in result:
38 | yield i
39 |
40 | def process_spider_exception(self, response, exception, spider):
41 | # Called when a spider or process_spider_input() method
42 | # (from other spider middleware) raises an exception.
43 |
44 | # Should return either None or an iterable of Response, dict
45 | # or Item objects.
46 | pass
47 |
48 | def process_start_requests(self, start_requests, spider):
49 | # Called with the start requests of the spider, and works
50 | # similarly to the process_spider_output() method, except
51 | # that it doesn’t have a response associated.
52 |
53 | # Must return only requests (not items).
54 | for r in start_requests:
55 | yield r
56 |
57 | def spider_opened(self, spider):
58 | spider.logger.info('Spider opened: %s' % spider.name)
59 | '''
60 | class DynamicProxyMiddleware(object):
61 | def process_request(self, request, spider):
62 | # time.sleep(1)
63 | auth_header = self.get_auth_header()
64 | request.meta['proxy'] = "http://s3.proxy.mayidaili.com:8123"
65 | request.headers['Proxy-Authorization'] = auth_header
66 |
67 | def get_auth_header(self):
68 |         # Replace app_key and secret with your own credentials
69 | app_key = "67783764"
70 | secret = "6151eb360668ca10ad772ca9e46d306b"
71 |
72 | param_map = {
73 | "app_key": app_key,
74 |             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),  # adjust for the timezone if the program runs outside China
75 | "enable-simulate": 'true',
76 | "random-useragent": 'pc',
77 | "clear-cookies": 'true'
78 | }
79 |         # Sort the parameters
80 |         keys = list(param_map.keys())  # dict.keys() has no .sort() in Python 3
81 |         keys.sort()
82 |
83 | codes = "%s%s%s" % (secret, str().join('%s%s' % (key, param_map[key]) for key in keys), secret)
84 |
85 |         # Compute the signature
86 |         sign = hashlib.md5(codes.encode('utf-8')).hexdigest().upper()  # md5 requires bytes in Python 3
87 |
88 | param_map["sign"] = sign
89 |
90 |         # Assemble the value of the Proxy-Authorization request header
91 | keys = param_map.keys()
92 | auth_header = "MYH-AUTH-MD5 " + str('&').join('%s=%s' % (key, param_map[key]) for key in keys)
93 |
94 | # print time.strftime("%Y-%m-%d %H:%M:%S")
95 | # print authHeader
96 |
97 | return auth_header
98 | '''
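For reference, the signing logic from the commented-out DynamicProxyMiddleware above, written as a standalone Python 3 helper (a minimal sketch; credentials should come from configuration rather than source code):

import hashlib
import time

def build_auth_header(app_key, secret):
    params = {
        'app_key': app_key,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'enable-simulate': 'true',
        'random-useragent': 'pc',
        'clear-cookies': 'true',
    }
    # Signature: MD5 over secret + sorted key/value pairs + secret, upper-cased.
    payload = secret + ''.join('%s%s' % (k, params[k]) for k in sorted(params)) + secret
    params['sign'] = hashlib.md5(payload.encode('utf-8')).hexdigest().upper()
    return 'MYH-AUTH-MD5 ' + '&'.join('%s=%s' % (k, v) for k, v in params.items())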
--------------------------------------------------------------------------------
/51jbnet/im_sandbox/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for im_sandbox project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'im_sandbox'
13 |
14 | SPIDER_MODULES = ['im_sandbox.spiders']
15 | NEWSPIDER_MODULE = 'im_sandbox.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'im_sandbox (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | # DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'im_sandbox.middlewares.ImSandboxSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'im_sandbox.middlewares.ImSandboxDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'im_sandbox.pipelines.ESPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | MYSQL_DB_URI = 'mysql+pymysql://root:*@127.0.0.1:3306/spider?charset=utf8'
92 | MONGODB = {}  # im_sandboxMongoPipeline expects a mapping; see the sketch below
93 | ES_HOST = '10.18.6.102'
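As noted above, im_sandboxMongoPipeline in pipelines.py reads 'host', 'port', 'db' and 'collection' from MONGODB, so the setting has to be a mapping if that pipeline is ever enabled. A minimal sketch with placeholder values (the host and names are assumptions):

MONGODB = {
    'host': 'localhost',         # placeholder: the real MongoDB host
    'port': 27017,
    'db': 'spider',              # assumed database name
    'collection': 'im_sandbox',  # assumed collection name
}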
--------------------------------------------------------------------------------
/baiduwanpan/baiduwanpan.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import time
3 | import sys
4 | header = {'Origin': 'https://pan.baidu.com', 'Content-Length': '26', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'X-Requested-With': 'XMLHttpRequest', 'Host': 'pan.baidu.com', 'Accept': '*/*', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36', 'Connection': 'keep-alive', 'Cookie': 'BAIDUID=11BC8C5D223E048DDCCF45DA68C96329:FG=1; BIDUPSID=11BC8C5D223E048DDCCF45DA68C96329; PSTM=1502071949; __cfduid=dbc4d8c8a8ff8f8f56693bf9911a78f9a1502257445; PANWEB=1; bdshare_firstime=1502276137037; BDSFRCVID=4g8sJeC62lrjCp3ZxSq0MencMmK52YjTH6aotvr5NjaXcbr6amOqEG0PqM8g0Ku-aG3kogKK3gOTH4nP; H_BDCLCKID_SF=JJkH_CIMJCvbfP0k5bo0M-FSMMrX5C62aJ3DW45bWJ5TMC_w5l6KWbDl2-O0Qfr-aD7uWx022bubShPC-tnGM4IzWfon363D-a6U-xDE3l02V-j9e-t2ynQDDljRq4RMW20e0h7mWIb_VKFCjTKhejO0epJf-K6Jb6Q3BROS2RrHKROkeUOlyJtpbt-qJjcqyjrvQfcy3nTZ8J5k-UcV3T0fhGJnBT5Kaa6BBqQw5xbNM-jR0qJl0DukQN3TbRkO5bRiL6C-bq-BDn3oyTbJXp0njMTTqj_efnCDoD8QKbRofJ-k-4QEbbQH-UnLq-LqX57Z0l8Ktt3_ohjSyl6W0pLHXfoX5MrLWbTPbI3mWIQHSRQLLx7m5-KyjMne3JcpLa74KKJx-xKWeIJo5Dc6D6kzhUJiB5JMBan7_nrxfDD5bKDlD6-3-PAe5f8X5to05TIX3b7Ef-5ZM-O_bf--DR-HW-Q7BqTOL5RL2R58Kh6VOI5a05Jxy5K_3xjz3fvTbIce_n7b0tT4VUOHQT3mKqQbbN3i-CrgtJblWb3cWKOJ8UbSj-Tme6jXeautJ6F8f5vfL5rDa-n5HJjRq4bohjPjMPQeBtQmJJrtahRCMl7AJMO3Mxcqh4tIhtnCtp5BQg-q3R71MqvZMbrHBUQPbj8AWa5w0x-jLT6PVn0MW-5D8h6nLPnJyUnybPnnBT3XLnLHoDPXJCDBbDv65nt_b44bKUQKbK62aKDs5lRc-hcqEIL45fRaDq47Wl7gLtcu5Co22R6cJRuK8UbSj4QoXbIUWHOX0lRC3DTu3toufp5nhMJl3j7JDMP0-4vu5MJy523iob3vQpPMDxtuj68WejcXjNRjtnOe5C6H3bP8tCLWb5rnhPF3j-bbKP6-35KHaTrB5-tbytn6qDJEbtTjXtuUjH5kaq37JD6yLPQ-Jlr8Hfnn-RK--tugKtoxJpODBRbMopvaHRjnhnvvbURvDP-g3-AJ2q8EK5r2SC-ytI_-3J; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a02553875233; MCITY=-257%3A; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=7; H_PS_PSSID=1455_21114_17001_19897; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; STOKEN=98916c84333e810c2b1d715bb7f7cf805ae2faf839dc1e7b2ffea14af9a43422; SCRC=e189858affb6c034f51facb687ba42a3; BDCLND=Z12FNBCnoSTSfwubbu7R1dmuJgAkUv%2FVXMPFC%2FhXqtw%3D; PANPSC=8159382662928957333%3A0tGXwXye%2FVgybgBxVCVQs9wxnZzNwr1w%2Fi1kePBHTIGypp29WjDdFHgXofrWESI4GPVIaAX1Mx4yLJx7kL47ECcTFj%2FtuMrTJEGGcevXkUatUq%2FdzxBw4vvqPIbe4OQ9iyFns5yFArUpANCmD7pcJX5IlZf3%2F0X8eJFOG%2FXb%2FW8u%2BjscPFpwMA%3D%3D; Hm_lvt_7a3960b6f067eb0085b7f96ff5e660b0=1504793178,1504793213,1504793250,1504793289; Hm_lpvt_7a3960b6f067eb0085b7f96ff5e660b0=1505901469', 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', 'Referer': 'https://pan.baidu.com/share/init?surl=o8zEuJC', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
5 | import requests
6 | import re
7 |
8 | for _ in range(100):
9 | # re.sub('\d',)
10 |     if sys.version_info.major < 3:
11 | t = str(long(time.time() * 1000))
12 | else:
13 | t = str(int(time.time() * 1000))
14 | #print(t)
15 | url='https://pan.baidu.com/share/verify?surl=o8zEuJC&t=%s&bdstoken=null&channel=chunlei&clienttype=0&web=1&app_id=250528&logid=MTUwNTkwMTQ3NzYzNjAuNTQwMjcwOTYwMTg0MTkyOA==' %t
16 | #url = 'https://pan.baidu.com/share/verify?surl=mhPHC7Y&t=%s&bdstoken=c5232d2c47ec22f6fb2de6a151828c91&channel=chunlei&clienttype=0&web=1&app_id=250528&logid=MTUwNTkwMDQyNDI2MzAuNDQyNTQxMzMyNDU0MTQ4NQ==' % t
17 | data = {'pwd': '2222', 'vcode': '', 'vcode_str': ''}
18 | r = requests.post(url=url, data=data, headers=header)
19 | js = r.json()
20 | print(js)
21 |
22 | pw='gxrr'
23 | data = {'pwd': pw, 'vcode': '', 'vcode_str': ''}
24 | r = requests.post(url=url, data=data, headers=header)
25 | js = r.json()
26 | print(js)
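A small note on the timestamp branch above: int() already promotes to long on Python 2, so the same one-liner works on both versions (a minimal sketch):

import time

def millis():
    # Milliseconds since the epoch as a string, on Python 2 and 3 alike.
    return str(int(time.time() * 1000))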
--------------------------------------------------------------------------------