├── .gitignore ├── MANIFEST.in ├── README.md ├── README.rst ├── __init__.py ├── requirements.txt ├── scrapy.cfg ├── setup.py ├── test ├── __init__.py └── myapp │ ├── __init__.py │ ├── myapp │ ├── __init__.py │ ├── proxy.list │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── item_field.py │ │ ├── item_xpath.py │ │ ├── page_xpath.py │ │ └── spiders.py │ └── scrapy.cfg └── walker ├── __init__.py ├── check_status.py ├── downloadermiddlewares.py ├── pipelines.py ├── redis_feed.py ├── scheduler.py ├── spiders ├── __init__.py ├── exception_process.py ├── helper.py └── utils.py ├── start_project.py ├── stats_collectors.py ├── templates └── project │ ├── module │ ├── __init__.py │ ├── proxy.list │ ├── settings.py.tmpl │ └── spiders │ │ ├── __init__.py │ │ ├── item_field.py │ │ ├── item_xpath.py │ │ ├── page_xpath.py │ │ └── spiders.py │ └── scrapy.cfg └── user_agents.list /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.pyc 3 | log/ 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/MANIFEST.in -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/README.md -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/README.rst -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy>=1.0.5 2 | log-to-kafka 3 | custom-redis 4 | openpyxl 5 | psutil 6 | pdb -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/scrapy.cfg -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/setup.py -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/myapp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/myapp/myapp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/myapp/myapp/proxy.list: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/test/myapp/myapp/proxy.list -------------------------------------------------------------------------------- /test/myapp/myapp/settings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/test/myapp/myapp/settings.py -------------------------------------------------------------------------------- /test/myapp/myapp/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/test/myapp/myapp/spiders/__init__.py -------------------------------------------------------------------------------- /test/myapp/myapp/spiders/item_field.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/test/myapp/myapp/spiders/item_field.py -------------------------------------------------------------------------------- /test/myapp/myapp/spiders/item_xpath.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/test/myapp/myapp/spiders/item_xpath.py -------------------------------------------------------------------------------- /test/myapp/myapp/spiders/page_xpath.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/test/myapp/myapp/spiders/page_xpath.py -------------------------------------------------------------------------------- /test/myapp/myapp/spiders/spiders.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/test/myapp/myapp/spiders/spiders.py -------------------------------------------------------------------------------- /test/myapp/scrapy.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/test/myapp/scrapy.cfg -------------------------------------------------------------------------------- /walker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/__init__.py -------------------------------------------------------------------------------- /walker/check_status.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/check_status.py -------------------------------------------------------------------------------- /walker/downloadermiddlewares.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/downloadermiddlewares.py -------------------------------------------------------------------------------- /walker/pipelines.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/pipelines.py -------------------------------------------------------------------------------- /walker/redis_feed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/redis_feed.py -------------------------------------------------------------------------------- /walker/scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/scheduler.py -------------------------------------------------------------------------------- /walker/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/spiders/__init__.py -------------------------------------------------------------------------------- /walker/spiders/exception_process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/spiders/exception_process.py -------------------------------------------------------------------------------- /walker/spiders/helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/spiders/helper.py -------------------------------------------------------------------------------- /walker/spiders/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/spiders/utils.py -------------------------------------------------------------------------------- /walker/start_project.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/start_project.py -------------------------------------------------------------------------------- /walker/stats_collectors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/stats_collectors.py -------------------------------------------------------------------------------- /walker/templates/project/module/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /walker/templates/project/module/proxy.list: -------------------------------------------------------------------------------- 1 | # 配置你的代理ip:port,换行符分隔, 如: 2 | #192.168.101.160:8888 -------------------------------------------------------------------------------- /walker/templates/project/module/settings.py.tmpl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/templates/project/module/settings.py.tmpl -------------------------------------------------------------------------------- /walker/templates/project/module/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/templates/project/module/spiders/__init__.py -------------------------------------------------------------------------------- /walker/templates/project/module/spiders/item_field.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/templates/project/module/spiders/item_field.py -------------------------------------------------------------------------------- /walker/templates/project/module/spiders/item_xpath.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/templates/project/module/spiders/item_xpath.py -------------------------------------------------------------------------------- /walker/templates/project/module/spiders/page_xpath.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/templates/project/module/spiders/page_xpath.py -------------------------------------------------------------------------------- /walker/templates/project/module/spiders/spiders.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/templates/project/module/spiders/spiders.py -------------------------------------------------------------------------------- /walker/templates/project/scrapy.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/templates/project/scrapy.cfg -------------------------------------------------------------------------------- /walker/user_agents.list: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShichaoMa/webWalker/HEAD/walker/user_agents.list --------------------------------------------------------------------------------