├── s0vkaq ├── LcvSearch │ ├── LcvSearch │ │ ├── __init__.py │ │ ├── wsgi.py │ │ ├── urls.py │ │ └── settings.py │ ├── search │ │ ├── __init__.py │ │ ├── migrations │ │ │ └── __init__.py │ │ ├── admin.py │ │ ├── tests.py │ │ ├── apps.py │ │ ├── models.py │ │ └── views.py │ ├── db.sqlite3 │ ├── static │ │ ├── img │ │ │ ├── ll.png │ │ │ ├── lr.png │ │ │ ├── Thumbs.db │ │ │ ├── btnbg.png │ │ │ ├── down.png │ │ │ ├── line.png │ │ │ ├── logo.png │ │ │ ├── logo1.png │ │ │ ├── more.png │ │ │ ├── btn_min.png │ │ │ ├── inputbg.png │ │ │ ├── logo-bak.png │ │ │ ├── seachbtn.png │ │ │ ├── logo-bak2.png │ │ │ └── logo1-bak.png │ │ ├── js │ │ │ ├── global.js │ │ │ ├── common.js │ │ │ └── pagination.js │ │ └── css │ │ │ ├── index.css │ │ │ ├── style.css │ │ │ ├── advanced.css │ │ │ └── result.css │ ├── .idea │ │ ├── markdown-navigator │ │ │ └── profiles_settings.xml │ │ ├── encodings.xml │ │ ├── modules.xml │ │ ├── LcvSearch.iml │ │ └── misc.xml │ ├── manage.py │ └── templates │ │ └── index.html ├── ArticleSpider │ ├── articleexport.json │ ├── ArticleSpider │ │ ├── __init__.py │ │ ├── utils │ │ │ ├── cookies.txt │ │ │ ├── __init__.py │ │ │ ├── captcha.jpg │ │ │ ├── common.py │ │ │ ├── bloomfilter.py │ │ │ └── zhihu_login_requests.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── es_types.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ ├── lagou.py │ │ │ └── jobbole.py │ │ ├── images │ │ │ └── full │ │ │ │ ├── 055507fb28ac7ac8228b811c71f9ffdec4eb1748.jpg │ │ │ │ └── 0680fd15f05a124d6ac8e95b032713d8839b6c92.jpg │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ └── settings.py │ ├── build │ │ └── lib │ │ │ ├── ArticleSpider │ │ │ ├── __init__.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ └── es_types.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── common.py │ │ │ │ ├── bloomfilter.py │ │ │ │ └── zhihu_login_requests.py │ │ │ ├── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── lagou.py │ │ │ │ └── jobbole.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ └── settings.py │ │ │ └── tools │ │ │ ├── __init__.py │ │ │ ├── selenium_spider.py │ │ │ ├── crawl_xici_ip.py │ │ │ └── yundama_requests.py │ ├── job_info │ │ ├── 001 │ │ │ ├── requests.queue │ │ │ │ ├── active.json │ │ │ │ └── p0 │ │ │ └── spider.state │ │ └── 002 │ │ │ └── requests.queue │ │ │ └── p0 │ ├── project.egg-info │ │ ├── dependency_links.txt │ │ ├── top_level.txt │ │ ├── entry_points.txt │ │ ├── PKG-INFO │ │ └── SOURCES.txt │ ├── tools │ │ ├── __init__.py │ │ ├── image │ │ │ ├── 1.jpg │ │ │ ├── 2.jpg │ │ │ ├── 3.png │ │ │ ├── 4.png │ │ │ ├── 5.png │ │ │ └── captcha.jpg │ │ ├── ghostdriver.log │ │ ├── selenium_spider.py │ │ ├── crawl_xici_ip.py │ │ └── yundama_requests.py │ ├── captcha.jpg │ ├── .idea │ │ ├── markdown-navigator │ │ │ └── profiles_settings.xml │ │ ├── inspectionProfiles │ │ │ └── profiles_settings.xml │ │ ├── modules.xml │ │ ├── ArticleSpider.iml │ │ └── misc.xml │ ├── test.py │ ├── setup.py │ ├── scrapy.cfg │ └── main.py ├── ScrapyRedisTest │ ├── ScrapyRedisTest │ │ ├── __init__.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── bloomfilter.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ └── jobbole.py │ │ ├── pipelines.py │ │ ├── items.py │ │ ├── middlewares.py │ │ └── settings.py │ ├── .idea │ │ ├── markdown-navigator │ │ │ └── profiles_settings.xml │ │ ├── inspectionProfiles │ │ │ └── profiles_settings.xml │ │ ├── modules.xml │ │ ├── ScrapyRedisTest.iml │ │ └── misc.xml │ ├── scrapy_redis │ │ ├── utils.py │ │ ├── __init__.py │ │ ├── picklecompat.py │ │ ├── defaults.py │ │ ├── pipelines.py │ │ ├── connection.py │ │ ├── 
dupefilter.py │ │ ├── queue.py │ │ └── scheduler.py │ ├── main.py │ └── scrapy.cfg └── LcvSearch-Front │ ├── img │ ├── ll.png │ ├── lr.png │ ├── down.png │ ├── line.png │ ├── logo.png │ ├── more.png │ ├── Thumbs.db │ ├── btn_min.png │ ├── btnbg.png │ ├── inputbg.png │ ├── logo1.png │ ├── logo-bak.png │ ├── logo-bak2.png │ ├── logo1-bak.png │ └── seachbtn.png │ ├── js │ ├── global.js │ ├── common.js │ └── pagination.js │ ├── css │ ├── index.css │ ├── style.css │ ├── advanced.css │ └── result.css │ └── index.html └── README.md /s0vkaq/LcvSearch/LcvSearch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/articleexport.json: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/utils/cookies.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/job_info/001/requests.queue/active.json: -------------------------------------------------------------------------------- 1 | [0] -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/project.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/project.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | ArticleSpider 2 | tools 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/project.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [scrapy] 2 | settings = ArticleSpider.settings 3 | 4 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/db.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/db.sqlite3 -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/captcha.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/captcha.jpg -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/ll.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/ll.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/lr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/lr.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/down.png 
-------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/line.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/logo.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/more.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/more.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/ll.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/ll.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/lr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/lr.png -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/image/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/tools/image/1.jpg -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/image/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/tools/image/2.jpg -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/image/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/tools/image/3.png -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/image/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/tools/image/4.png -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/image/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/tools/image/5.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/Thumbs.db -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/btn_min.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/btn_min.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/btnbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/btnbg.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/inputbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/inputbg.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/logo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/logo1.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/Thumbs.db -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/btnbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/btnbg.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/down.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/line.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/logo.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/logo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/logo1.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/more.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/more.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/logo-bak.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/logo-bak.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/logo-bak2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/logo-bak2.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/logo1-bak.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/logo1-bak.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/seachbtn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/seachbtn.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class SearchConfig(AppConfig): 5 | name = 'search' 6 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/btn_min.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/btn_min.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/inputbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/inputbg.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/logo-bak.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/logo-bak.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/seachbtn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/seachbtn.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/logo-bak2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/logo-bak2.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/logo1-bak.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/logo1-bak.png -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/image/captcha.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/tools/image/captcha.jpg -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/job_info/001/spider.state: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/job_info/001/spider.state
--------------------------------------------------------------------------------
/s0vkaq/ArticleSpider/ArticleSpider/utils/captcha.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/ArticleSpider/utils/captcha.jpg
--------------------------------------------------------------------------------
/s0vkaq/ArticleSpider/job_info/001/requests.queue/p0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/job_info/001/requests.queue/p0
--------------------------------------------------------------------------------
/s0vkaq/ArticleSpider/job_info/002/requests.queue/p0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/job_info/002/requests.queue/p0
--------------------------------------------------------------------------------
/s0vkaq/LcvSearch/.idea/markdown-navigator/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/s0vkaq/ArticleSpider/.idea/markdown-navigator/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/s0vkaq/ScrapyRedisTest/.idea/markdown-navigator/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/s0vkaq/ArticleSpider/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'bobby'
3 |
4 | import redis
5 | redis_cli = redis.StrictRedis()
6 | redis_cli.incr("jobbole_count")
7 |
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Building a search engine with a distributed Python crawler
2 |
3 | All of the code is collected here.
4 |
5 | For a walkthrough of the code, head over to my blog: [Building a search engine with a distributed Python crawler -- implemented with Scrapy](http://www.cnblogs.com/jinxiao-pu/p/6706319.html)
6 |
7 | If you find it useful, please give it a star! Thanks a lot!
8 |
--------------------------------------------------------------------------------
/s0vkaq/LcvSearch/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/s0vkaq/ArticleSpider/ArticleSpider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/s0vkaq/ScrapyRedisTest/ScrapyRedisTest/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/images/full/055507fb28ac7ac8228b811c71f9ffdec4eb1748.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/ArticleSpider/images/full/055507fb28ac7ac8228b811c71f9ffdec4eb1748.jpg -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/images/full/0680fd15f05a124d6ac8e95b032713d8839b6c92.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/ArticleSpider/images/full/0680fd15f05a124d6ac8e95b032713d8839b6c92.jpg -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/utils.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | 4 | def bytes_to_str(s, encoding='utf-8'): 5 | """Returns a str if a bytes object is given.""" 6 | if six.PY3 and isinstance(s, bytes): 7 | return s.decode(encoding) 8 | return s 9 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/project.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: project 3 | Version: 1.0 4 | Summary: UNKNOWN 5 | Home-page: UNKNOWN 6 | Author: UNKNOWN 7 | Author-email: UNKNOWN 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | from scrapy.cmdline import execute 5 | 6 | import sys 7 | import os 8 | 9 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 10 | execute(["scrapy", "crawl", "jobbole"]) -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .connection import ( # NOQA 3 | get_redis, 4 | get_redis_from_settings, 5 | ) 6 | 7 | 8 | __author__ = 'Rolando Espinoza' 9 | __email__ = 'rolando at rmax.io' 10 | __version__ = '0.7.0-dev' 11 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- 
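A note on running the ScrapyRedisTest project collected above: its main.py only launches "scrapy crawl jobbole", while the spider itself (ScrapyRedisTest/ScrapyRedisTest/spiders/jobbole.py further down) is a scrapy_redis RedisSpider that sits idle until a start URL appears under the Redis key "jobbole:start_urls". A minimal sketch of seeding that key with redis-py is shown here; the helper name and the listing URL are placeholders, not files or values taken from the repository.

# seed_start_urls.py -- hypothetical helper, not part of the repository
import redis

# Same assumptions as scrapy_redis/defaults.py: a local Redis on the default port.
redis_cli = redis.StrictRedis(host="localhost", port=6379)

# jobbole.py sets redis_key = "jobbole:start_urls"; the RedisSpider pops URLs from this list.
redis_cli.lpush("jobbole:start_urls", "http://blog.jobbole.com/all-posts/")  # placeholder start page

Once a URL has been pushed, running main.py (or "scrapy crawl jobbole") lets the waiting spider pick it up and begin crawling.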
/s0vkaq/LcvSearch/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapyd-deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name = 'project', 7 | version = '1.0', 8 | packages = find_packages(), 9 | entry_points = {'scrapy': ['settings = ArticleSpider.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = ArticleSpider.settings 8 | 9 | [deploy:bobby] 10 | url = http://localhost:6800/ 11 | project = ArticleSpider 12 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = ScrapyRedisTest.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = ScrapyRedisTest 12 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | from scrapy.cmdline import execute 5 | 6 | import sys 7 | import os 8 | 9 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 10 | execute(["scrapy", "crawl", "jobbole"]) 11 | # execute(["scrapy", "crawl", "zhihu"]) 12 | # execute(["scrapy", "crawl", "lagou"]) -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/picklecompat.py: -------------------------------------------------------------------------------- 1 | """A pickle wrapper module with protocol=-1 by default.""" 2 | 3 | try: 4 | import cPickle as pickle # PY2 5 | except ImportError: 6 | import pickle 7 | 8 | 9 | def loads(s): 10 | return pickle.loads(s) 11 | 12 | 13 | def dumps(obj): 14 | return pickle.dumps(obj, protocol=-1) 15 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: 
http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ScrapyredistestPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ScrapyredistestItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/LcvSearch/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for LcvSearch project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.11/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "LcvSearch.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/.idea/ScrapyRedisTest.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/utils/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | import hashlib 4 | import re 5 | 6 | 7 | def get_md5(url): 8 | if isinstance(url, str): 9 | url = url.encode("utf-8") 10 | m = hashlib.md5() 11 | m.update(url) 12 | return m.hexdigest() 13 | 14 | 15 | def extract_num(text): 16 | #从字符串中提取出数字 17 | match_re = re.match(".*?(\d+).*", text) 18 | if match_re: 19 | nums = int(match_re.group(1)) 20 | else: 21 | nums = 0 22 | 23 | return nums 24 | 25 | if __name__ == "__main__": 26 | print (get_md5("http://jobbole.com".encode("utf-8"))) -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/.idea/ArticleSpider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/utils/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | import hashlib 4 | import re 5 | 6 | 7 | def get_md5(url): 8 | if isinstance(url, str): 9 | url = url.encode("utf-8") 10 | m = hashlib.md5() 11 | m.update(url) 12 | return m.hexdigest() 13 | 14 | 15 | def extract_num(text): 16 | #从字符串中提取出数字 17 | match_re = re.match(".*?(\d+).*", text) 18 | if match_re: 19 | nums = int(match_re.group(1)) 20 | else: 21 | nums = 0 22 | 23 | return nums 24 | 25 | if __name__ == "__main__": 26 | print (get_md5("http://jobbole.com".encode("utf-8"))) -------------------------------------------------------------------------------- 
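The two helpers in ArticleSpider/utils/common.py above are used throughout the spiders: get_md5() turns a URL into a fixed-length hex digest that the project stores as url_object_id, and extract_num() pulls the first run of digits out of text such as vote or comment counts. A short usage sketch follows; it assumes the ArticleSpider package is importable (for example, run from the project root), and the sample inputs are illustrative only.

# usage sketch -- assumes ArticleSpider is on the Python path
from ArticleSpider.utils.common import get_md5, extract_num

url_object_id = get_md5("http://blog.jobbole.com/some-post/")  # placeholder URL
print(len(url_object_id))         # 32 -- an MD5 hex digest, stable for the same URL

print(extract_num("8 comments"))  # 8 -- first integer found in the text
print(extract_num("no digits"))   # 0 -- falls back to 0 when nothing matches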
/s0vkaq/ScrapyRedisTest/scrapy_redis/defaults.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | 4 | # For standalone use. 5 | DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' 6 | 7 | PIPELINE_KEY = '%(spider)s:items' 8 | 9 | REDIS_CLS = redis.StrictRedis 10 | REDIS_ENCODING = 'utf-8' 11 | # Sane connection defaults. 12 | REDIS_PARAMS = { 13 | 'socket_timeout': 30, 14 | 'socket_connect_timeout': 30, 15 | 'retry_on_timeout': True, 16 | 'encoding': REDIS_ENCODING, 17 | } 18 | 19 | SCHEDULER_QUEUE_KEY = '%(spider)s:requests' 20 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' 21 | SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter' 22 | SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter' 23 | 24 | START_URLS_KEY = '%(name)s:start_urls' 25 | START_URLS_AS_SET = False 26 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/js/global.js: -------------------------------------------------------------------------------- 1 | 2 | $(document).ready(function(){ 3 | 4 | // 去除虚线框(会影响效率) 5 | $("a,input:checkbox,input:radio,button,input:button").live('focus',function(){$(this).blur();}); 6 | 7 | }); 8 | 9 | 10 | function hideElement(currentElement, targetElement) { 11 | if (!$.isArray(targetElement)) { 12 | targetElement = [ targetElement ]; 13 | } 14 | $(document).on("click.hideElement", function(e) { 15 | var len = 0, $target = $(e.target); 16 | for (var i = 0, length = targetElement.length; i < length; i++) { 17 | $.each(targetElement[i], function(j, n) { 18 | if ($target.is($(n)) || $.contains($(n)[0], $target[0])) { 19 | len++; 20 | } 21 | }); 22 | } 23 | if ($.contains(currentElement[0], $target[0])) { 24 | len = 1; 25 | } 26 | if (len == 0) { 27 | currentElement.hide(); 28 | } 29 | }); 30 | }; -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/js/global.js: -------------------------------------------------------------------------------- 1 | 2 | $(document).ready(function(){ 3 | 4 | // 去除虚线框(会影响效率) 5 | $("a,input:checkbox,input:radio,button,input:button").live('focus',function(){$(this).blur();}); 6 | 7 | }); 8 | 9 | 10 | function hideElement(currentElement, targetElement) { 11 | if (!$.isArray(targetElement)) { 12 | targetElement = [ targetElement ]; 13 | } 14 | $(document).on("click.hideElement", function(e) { 15 | var len = 0, $target = $(e.target); 16 | for (var i = 0, length = targetElement.length; i < length; i++) { 17 | $.each(targetElement[i], function(j, n) { 18 | if ($target.is($(n)) || $.contains($(n)[0], $target[0])) { 19 | len++; 20 | } 21 | }); 22 | } 23 | if ($.contains(currentElement[0], $target[0])) { 24 | len = 1; 25 | } 26 | if (len == 0) { 27 | currentElement.hide(); 28 | } 29 | }); 30 | }; -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/project.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | ArticleSpider/__init__.py 3 | ArticleSpider/items.py 4 | ArticleSpider/middlewares.py 5 | ArticleSpider/pipelines.py 6 | ArticleSpider/settings.py 7 | ArticleSpider/models/__init__.py 8 | ArticleSpider/models/es_types.py 9 | ArticleSpider/spiders/__init__.py 10 | ArticleSpider/spiders/jobbole.py 11 | ArticleSpider/spiders/lagou.py 12 | ArticleSpider/spiders/zhihu.py 13 | ArticleSpider/utils/__init__.py 14 | ArticleSpider/utils/bloomfilter.py 15 | ArticleSpider/utils/common.py 16 | 
ArticleSpider/utils/zhihu_login_requests.py 17 | project.egg-info/PKG-INFO 18 | project.egg-info/SOURCES.txt 19 | project.egg-info/dependency_links.txt 20 | project.egg-info/entry_points.txt 21 | project.egg-info/top_level.txt 22 | tools/__init__.py 23 | tools/crawl_xici_ip.py 24 | tools/selenium_spider.py 25 | tools/yundama_requests.py -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "LcvSearch.settings") 7 | try: 8 | from django.core.management import execute_from_command_line 9 | except ImportError: 10 | # The above import may fail for some other reason. Ensure that the 11 | # issue is really that Django is missing to avoid masking other 12 | # exceptions on Python 2. 13 | try: 14 | import django 15 | except ImportError: 16 | raise ImportError( 17 | "Couldn't import Django. Are you sure it's installed and " 18 | "available on your PYTHONPATH environment variable? Did you " 19 | "forget to activate a virtual environment?" 20 | ) 21 | raise 22 | execute_from_command_line(sys.argv) 23 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/js/common.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by yli on 2017/4/21. 3 | */ 4 | 5 | var searchArr; 6 | //定义一个search的,判断浏览器有无数据存储(搜索历史) 7 | if(localStorage.search){ 8 | //如果有,转换成 数组的形式存放到searchArr的数组里(localStorage以字符串的形式存储,所以要把它转换成数组的形式) 9 | searchArr= localStorage.search.split(",") 10 | }else{ 11 | //如果没有,则定义searchArr为一个空的数组 12 | searchArr = []; 13 | } 14 | //把存储的数据显示出来作为搜索历史 15 | MapSearchArr(); 16 | 17 | 18 | $("#btn").on("click", function(){ 19 | var val = $("#inp").val(); 20 | //点击搜索按钮时,去重 21 | KillRepeat(val); 22 | //去重后把数组存储到浏览器localStorage 23 | localStorage.search = searchArr; 24 | //然后再把搜索内容显示出来 25 | MapSearchArr(); 26 | }); 27 | 28 | 29 | function MapSearchArr(){ 30 | var tmpHtml = ""; 31 | for (var i=0;i " 33 | } 34 | $("#keyname").html(tmpHtml); 35 | } 36 | //去重 37 | function KillRepeat(val){ 38 | var kill = 0; 39 | for (var i=0;i " 33 | } 34 | $("#keyname").html(tmpHtml); 35 | } 36 | //去重 37 | function KillRepeat(val){ 38 | var kill = 0; 39 | for (var i=0;i 2 | 3 | 4 | 5 | 6 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 25 | 26 | 27 | 29 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | from datetime import datetime 5 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 6 | analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 7 | 8 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer 9 | 10 | from elasticsearch_dsl.connections import connections 11 | connections.create_connection(hosts=["localhost"]) 12 | 13 | class CustomAnalyzer(_CustomAnalyzer): 14 | def get_analysis_definition(self): 15 | return {} 16 | 17 | 18 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"]) 19 | 20 | class ArticleType(DocType): 21 | #伯乐在线文章类型 22 | suggest = Completion(analyzer=ik_analyzer) 23 | title = Text(analyzer="ik_max_word") 24 | create_date = Date() 25 | url = Keyword() 26 | url_object_id = 
Keyword() 27 | front_image_url = Keyword() 28 | front_image_path = Keyword() 29 | praise_nums = Integer() 30 | comment_nums = Integer() 31 | fav_nums = Integer() 32 | tags = Text(analyzer="ik_max_word") 33 | content = Text(analyzer="ik_max_word") 34 | 35 | class Meta: 36 | index = "jobbole" 37 | doc_type = "article" 38 | 39 | if __name__ == "__main__": 40 | ArticleType.init() 41 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/models/es_types.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | from datetime import datetime 5 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 6 | analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 7 | 8 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer 9 | 10 | from elasticsearch_dsl.connections import connections 11 | connections.create_connection(hosts=["localhost"]) 12 | 13 | class CustomAnalyzer(_CustomAnalyzer): 14 | def get_analysis_definition(self): 15 | return {} 16 | 17 | 18 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"]) 19 | 20 | class ArticleType(DocType): 21 | #伯乐在线文章类型 22 | suggest = Completion(analyzer=ik_analyzer) 23 | title = Text(analyzer="ik_max_word") 24 | create_date = Date() 25 | url = Keyword() 26 | url_object_id = Keyword() 27 | front_image_url = Keyword() 28 | front_image_path = Keyword() 29 | praise_nums = Integer() 30 | comment_nums = Integer() 31 | fav_nums = Integer() 32 | tags = Text(analyzer="ik_max_word") 33 | content = Text(analyzer="ik_max_word") 34 | 35 | class Meta: 36 | index = "jobbole" 37 | doc_type = "article" 38 | 39 | if __name__ == "__main__": 40 | ArticleType.init() 41 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/models/es_types.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | from datetime import datetime 5 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 6 | analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 7 | 8 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer 9 | 10 | from elasticsearch_dsl.connections import connections 11 | connections.create_connection(hosts=["localhost"]) 12 | 13 | class CustomAnalyzer(_CustomAnalyzer): 14 | def get_analysis_definition(self): 15 | return {} 16 | 17 | 18 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"]) 19 | 20 | class ArticleType(DocType): 21 | #伯乐在线文章类型 22 | suggest = Completion(analyzer=ik_analyzer) 23 | title = Text(analyzer="ik_max_word") 24 | create_date = Date() 25 | url = Keyword() 26 | url_object_id = Keyword() 27 | front_image_url = Keyword() 28 | front_image_path = Keyword() 29 | praise_nums = Integer() 30 | comment_nums = Integer() 31 | fav_nums = Integer() 32 | tags = Text(analyzer="ik_max_word") 33 | content = Text(analyzer="ik_max_word") 34 | 35 | class Meta: 36 | index = "jobbole" 37 | doc_type = "article" 38 | 39 | if __name__ == "__main__": 40 | ArticleType.init() 41 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/ghostdriver.log: -------------------------------------------------------------------------------- 1 | [INFO - 2017-04-04T03:05:37.523Z] GhostDriver - Main - running on port 53940 2 | [INFO - 
2017-04-04T03:05:40.868Z] Session [95d6e1e0-18e3-11e7-883b-4735e6270c64] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true}
3 | [INFO - 2017-04-04T03:05:40.868Z] Session [95d6e1e0-18e3-11e7-883b-4735e6270c64] - page.customHeaders: - {}
4 | [INFO - 2017-04-04T03:05:40.869Z] Session [95d6e1e0-18e3-11e7-883b-4735e6270c64] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}}
5 | [INFO - 2017-04-04T03:05:40.869Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 95d6e1e0-18e3-11e7-883b-4735e6270c64
6 |
--------------------------------------------------------------------------------
/s0vkaq/ScrapyRedisTest/ScrapyRedisTest/spiders/jobbole.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'bobby'
3 |
4 | from scrapy.http import Request
5 | from urllib import parse
6 |
7 | from scrapy_redis.spiders import RedisSpider
8 |
9 | class JobboleSpider(RedisSpider):
10 |     name = 'jobbole'
11 |     allowed_domains = ["blog.jobbole.com"]
12 |     redis_key = 'jobbole:start_urls'
13 |
14 |     # collect all of Jobbole's 404 URLs as well as the number of 404 pages
15 |     handle_httpstatus_list = [404]
16 |
17 |     fail_urls = []  # URLs that came back as 404
18 |     def parse(self, response):
19 |         """
20 |         1. Extract the article URLs from the list page and hand them to Scrapy to download and parse
21 |         2. Extract the URL of the next list page and hand it to Scrapy to download; the response comes back to parse
22 |         """
23 |         # parse every article URL on the list page and hand it to Scrapy to download and parse
24 |         if response.status == 404:
25 |             self.fail_urls.append(response.url)
26 |             self.crawler.stats.inc_value("failed_url")
27 |
28 |         post_nodes = response.css("#archive .floated-thumb .post-thumb a")
29 |         for post_node in post_nodes:
30 |             image_url = post_node.css("img::attr(src)").extract_first("")
31 |             post_url = post_node.css("::attr(href)").extract_first("")
32 |             yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url": image_url},
33 |                           callback=self.parse_detail)
34 |
35 |         # extract the next page and hand it to Scrapy to download
36 |         next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
37 |         if next_url:
38 |             yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
39 |
40 |     def parse_detail(self, response):
41 |         pass
--------------------------------------------------------------------------------
/s0vkaq/LcvSearch-Front/css/index.css:
--------------------------------------------------------------------------------
1 | @charset "utf-8";
2 | html{*overflow:auto;}
3 | #main{width:730px;margin:75px auto 0;}
4 | #main h1.title{width:600px;}
5 | #bd{margin-bottom:20px;}
6 | .logo.large{margin:0px auto 10px auto;width:342px;height:144px;background: url(../img/logo.png) no-repeat center center;}
7 |
8 | /*nav样式*/
9 | .nav{margin-bottom:10px;}
10 | .searchList{float:left;padding-left:5px;}
11 |
.searchList .searchItem{float:left;margin-right:15px;font-size:14px;padding:0 0 2px 2px;cursor:pointer;height:18px;} 12 | .searchList .searchItem.current{color:#0080cc;border-bottom:2px solid #9cc813;font-weight:bold;} 13 | 14 | /*input搜索区域*/ 15 | .inputArea{position:relative;margin-bottom:65px;} 16 | .inputArea .searchInput{border:1px solid #bfbfbf;padding:0 15px;outline:none;height:40px;*height:39px;*line-height:40px;width:520px; background:url(../img/inputbg.png);font-size:14px;} 17 | .inputArea .searchButton{position:absolute;left:550px;*left:552px;*top:1px;width:106px;height:42px;*height:41px;background:url(../img/seachbtn.png) no-repeat;border:none;cursor:pointer;} 18 | /*高级搜索*/ 19 | .inputArea .advanced{position:absolute;font-size:14px;left:674px;top:12px;text-decoration:underline;} 20 | /*联想下拉区域*/ 21 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 22 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 23 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 24 | 25 | /*搜索历史区域*/ 26 | .historyArea{margin:0 auto;width:485px;} 27 | .historyArea .history {margin-bottom:5px;} 28 | .historyArea .history label{font-weight:bold;} 29 | .historyArea .history a{margin-right:12px;} 30 | 31 | /*版权信息*/ 32 | .foot{position:absolute;bottom:0px;width:100%;} 33 | .foot .wrap{margin:0 auto;} 34 | .foot .copyright{position:relative;top:-35px;color:#ababab;text-align:center;} -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/tools/selenium_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | from selenium import webdriver 5 | from scrapy.selector import Selector 6 | 7 | # browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe") 8 | 9 | # browser.get("https://www.zhihu.com/#signin") 10 | # 11 | # browser.find_element_by_css_selector(".view-signin input[name='account']").send_keys("18782902568") 12 | # browser.find_element_by_css_selector(".view-signin input[name='password']").send_keys("admin125") 13 | # 14 | # browser.find_element_by_css_selector(".view-signin button.sign-button").click() 15 | #selenium 完成微博模拟登录 16 | 17 | # browser.get("https://www.oschina.net/blog") 18 | # import time 19 | # time.sleep(5) 20 | # browser.find_element_by_css_selector("#loginname").send_keys("liyao198705@sina.com") 21 | # browser.find_element_by_css_selector(".info_list.password input[node-type='password']").send_keys("da_ge_da") 22 | # browser.find_element_by_css_selector(".info_list.login_btn a[node-type='submitBtn']").click() 23 | 24 | # for i in range(3): 25 | # browser.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") 26 | # time.sleep(3) 27 | # t_selector = Selector(text=browser.page_source) 28 | # print (t_selector.css(".tm-promo-price .tm-price::text").extract()) 29 | 30 | 31 | #设置chromedriver不加载图片 32 | # chrome_opt = webdriver.ChromeOptions() 33 | # prefs = {"profile.managed_default_content_settings.images":2} 34 | # chrome_opt.add_experimental_option("prefs", prefs) 35 | 36 | 37 | #phantomjs, 无界面的浏览器, 多进程情况下phantomjs性能会下降很严重 38 | 39 | browser = webdriver.PhantomJS(executable_path="E:/home/phantomjs-2.1.1-windows/bin/phantomjs.exe") 40 | 
browser.get("https://detail.tmall.com/item.htm?spm=a230r.1.14.3.yYBVG6&id=538286972599&cm_id=140105335569ed55e27b&abbucket=15&sku_properties=10004:709990523;5919063:6536025") 41 | 42 | print (browser.page_source) 43 | browser.quit() -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ScrapyredistestSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/spiders/lagou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | import scrapy 4 | from scrapy.linkextractors import LinkExtractor 5 | from scrapy.spiders import CrawlSpider, Rule 6 | 7 | from items import LagouJobItemLoader, LagouJobItem 8 | from ArticleSpider.utils.common import get_md5 9 | 10 | class LagouSpider(CrawlSpider): 11 | name = 'lagou' 12 | allowed_domains = ['www.lagou.com'] 13 | start_urls = ['https://www.lagou.com'] 14 | 15 | rules = ( 16 | Rule(LinkExtractor(allow=("zhaopin/.*",)), follow=True), 17 | Rule(LinkExtractor(allow=("gongsi/j\d+.html",)), follow=True), 18 | Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True), 19 | ) 20 | # 21 | # def parse_start_url(self, response): 22 | # return [] 23 | # 24 | # def process_results(self, response, results): 25 | # return results 26 | 27 | def parse_job(self, response): 28 | #解析拉勾网的职位 29 | item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response) 30 | item_loader.add_css("title", ".job-name::attr(title)") 31 | item_loader.add_value("url", response.url) 32 | item_loader.add_value("url_object_id", get_md5(response.url)) 33 | item_loader.add_css("salary", ".job_request .salary::text") 34 | item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()") 35 | item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()") 36 | item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()") 37 | item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()") 38 | 39 | item_loader.add_css("tags", '.position-label li::text') 40 | item_loader.add_css("publish_time", ".publish_time::text") 41 | item_loader.add_css("job_advantage", ".job-advantage p::text") 42 | item_loader.add_css("job_desc", ".job_bt div") 43 | item_loader.add_css("job_addr", ".work_addr") 44 | item_loader.add_css("company_name", "#job_company dt a img::attr(alt)") 45 | item_loader.add_css("company_url", "#job_company dt a::attr(href)") 46 | item_loader.add_value("crawl_time", datetime.now()) 47 | 48 | job_item = item_loader.load_item() 49 | 50 | return job_item 51 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/spiders/lagou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | import scrapy 4 | from scrapy.linkextractors import LinkExtractor 5 | from scrapy.spiders import CrawlSpider, Rule 6 | 7 | from items import LagouJobItemLoader, LagouJobItem 8 | from ArticleSpider.utils.common import get_md5 9 | 10 | class LagouSpider(CrawlSpider): 11 | name = 'lagou' 12 | allowed_domains = ['www.lagou.com'] 13 | start_urls = ['https://www.lagou.com'] 14 | 15 | rules = ( 16 | Rule(LinkExtractor(allow=("zhaopin/.*",)), follow=True), 17 | Rule(LinkExtractor(allow=("gongsi/j\d+.html",)), follow=True), 18 | Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True), 19 | ) 20 | # 21 | # def parse_start_url(self, response): 22 | # return [] 23 | # 24 | # def process_results(self, response, results): 25 | # return results 
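    # For reference, a rough sketch of the item definitions this spider imports (the real
    # ones live in the project's items.py, which is not included in this dump; the field
    # names are inferred from the add_css/add_xpath/add_value calls in parse_job below,
    # and TakeFirst() as the default output processor is only an assumption):
    #
    #   from scrapy import Field, Item
    #   from scrapy.loader import ItemLoader
    #   from scrapy.loader.processors import TakeFirst
    #
    #   class LagouJobItemLoader(ItemLoader):
    #       default_output_processor = TakeFirst()
    #
    #   class LagouJobItem(Item):
    #       title = Field()
    #       url = Field()
    #       url_object_id = Field()
    #       salary = Field()
    #       job_city = Field()
    #       work_years = Field()
    #       degree_need = Field()
    #       job_type = Field()
    #       tags = Field()
    #       publish_time = Field()
    #       job_advantage = Field()
    #       job_desc = Field()
    #       job_addr = Field()
    #       company_name = Field()
    #       company_url = Field()
    #       crawl_time = Field()
    #
    # Note also that the bare `from items import ...` at the top of this file only resolves
    # if the package directory itself is on sys.path; the usual project-relative form would
    # be `from ArticleSpider.items import LagouJobItemLoader, LagouJobItem`.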
26 | 27 | def parse_job(self, response): 28 | #解析拉勾网的职位 29 | item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response) 30 | item_loader.add_css("title", ".job-name::attr(title)") 31 | item_loader.add_value("url", response.url) 32 | item_loader.add_value("url_object_id", get_md5(response.url)) 33 | item_loader.add_css("salary", ".job_request .salary::text") 34 | item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()") 35 | item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()") 36 | item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()") 37 | item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()") 38 | 39 | item_loader.add_css("tags", '.position-label li::text') 40 | item_loader.add_css("publish_time", ".publish_time::text") 41 | item_loader.add_css("job_advantage", ".job-advantage p::text") 42 | item_loader.add_css("job_desc", ".job_bt div") 43 | item_loader.add_css("job_addr", ".work_addr") 44 | item_loader.add_css("company_name", "#job_company dt a img::attr(alt)") 45 | item_loader.add_css("company_url", "#job_company dt a::attr(href)") 46 | item_loader.add_value("crawl_time", datetime.now()) 47 | 48 | job_item = item_loader.load_item() 49 | 50 | return job_item 51 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.misc import load_object 2 | from scrapy.utils.serialize import ScrapyJSONEncoder 3 | from twisted.internet.threads import deferToThread 4 | 5 | from . import connection, defaults 6 | 7 | 8 | default_serialize = ScrapyJSONEncoder().encode 9 | 10 | 11 | class RedisPipeline(object): 12 | """Pushes serialized item into a redis list/queue 13 | 14 | Settings 15 | -------- 16 | REDIS_ITEMS_KEY : str 17 | Redis key where to store items. 18 | REDIS_ITEMS_SERIALIZER : str 19 | Object path to serializer function. 20 | 21 | """ 22 | 23 | def __init__(self, server, 24 | key=defaults.PIPELINE_KEY, 25 | serialize_func=default_serialize): 26 | """Initialize pipeline. 27 | 28 | Parameters 29 | ---------- 30 | server : StrictRedis 31 | Redis client instance. 32 | key : str 33 | Redis key where to store items. 34 | serialize_func : callable 35 | Items serializer function. 36 | 37 | """ 38 | self.server = server 39 | self.key = key 40 | self.serialize = serialize_func 41 | 42 | @classmethod 43 | def from_settings(cls, settings): 44 | params = { 45 | 'server': connection.from_settings(settings), 46 | } 47 | if settings.get('REDIS_ITEMS_KEY'): 48 | params['key'] = settings['REDIS_ITEMS_KEY'] 49 | if settings.get('REDIS_ITEMS_SERIALIZER'): 50 | params['serialize_func'] = load_object( 51 | settings['REDIS_ITEMS_SERIALIZER'] 52 | ) 53 | 54 | return cls(**params) 55 | 56 | @classmethod 57 | def from_crawler(cls, crawler): 58 | return cls.from_settings(crawler.settings) 59 | 60 | def process_item(self, item, spider): 61 | return deferToThread(self._process_item, item, spider) 62 | 63 | def _process_item(self, item, spider): 64 | key = self.item_key(item, spider) 65 | data = self.serialize(item) 66 | self.server.rpush(key, data) 67 | return item 68 | 69 | def item_key(self, item, spider): 70 | """Returns redis key based on given spider. 71 | 72 | Override this function to use a different key depending on the item 73 | and/or spider. 
74 | 75 | """ 76 | return self.key % {'spider': spider.name} 77 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/connection.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | from scrapy.utils.misc import load_object 4 | 5 | from . import defaults 6 | 7 | 8 | # Shortcut maps 'setting name' -> 'parmater name'. 9 | SETTINGS_PARAMS_MAP = { 10 | 'REDIS_URL': 'url', 11 | 'REDIS_HOST': 'host', 12 | 'REDIS_PORT': 'port', 13 | 'REDIS_ENCODING': 'encoding', 14 | } 15 | 16 | 17 | def get_redis_from_settings(settings): 18 | """Returns a redis client instance from given Scrapy settings object. 19 | 20 | This function uses ``get_client`` to instantiate the client and uses 21 | ``defaults.REDIS_PARAMS`` global as defaults values for the parameters. You 22 | can override them using the ``REDIS_PARAMS`` setting. 23 | 24 | Parameters 25 | ---------- 26 | settings : Settings 27 | A scrapy settings object. See the supported settings below. 28 | 29 | Returns 30 | ------- 31 | server 32 | Redis client instance. 33 | 34 | Other Parameters 35 | ---------------- 36 | REDIS_URL : str, optional 37 | Server connection URL. 38 | REDIS_HOST : str, optional 39 | Server host. 40 | REDIS_PORT : str, optional 41 | Server port. 42 | REDIS_ENCODING : str, optional 43 | Data encoding. 44 | REDIS_PARAMS : dict, optional 45 | Additional client parameters. 46 | 47 | """ 48 | params = defaults.REDIS_PARAMS.copy() 49 | params.update(settings.getdict('REDIS_PARAMS')) 50 | # XXX: Deprecate REDIS_* settings. 51 | for source, dest in SETTINGS_PARAMS_MAP.items(): 52 | val = settings.get(source) 53 | if val: 54 | params[dest] = val 55 | 56 | # Allow ``redis_cls`` to be a path to a class. 57 | if isinstance(params.get('redis_cls'), six.string_types): 58 | params['redis_cls'] = load_object(params['redis_cls']) 59 | 60 | return get_redis(**params) 61 | 62 | 63 | # Backwards compatible alias. 64 | from_settings = get_redis_from_settings 65 | 66 | 67 | def get_redis(**kwargs): 68 | """Returns a redis client instance. 69 | 70 | Parameters 71 | ---------- 72 | redis_cls : class, optional 73 | Defaults to ``redis.StrictRedis``. 74 | url : str, optional 75 | If given, ``redis_cls.from_url`` is used to instantiate the class. 76 | **kwargs 77 | Extra parameters to be passed to the ``redis_cls`` class. 78 | 79 | Returns 80 | ------- 81 | server 82 | Redis client instance. 
83 | 84 | """ 85 | redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS) 86 | url = kwargs.pop('url', None) 87 | if url: 88 | return redis_cls.from_url(url, **kwargs) 89 | else: 90 | return redis_cls(**kwargs) 91 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/utils/bloomfilter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import mmh3 3 | import redis 4 | import math 5 | import time 6 | 7 | 8 | class PyBloomFilter(): 9 | #内置100个随机种子 10 | SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372, 11 | 344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338, 12 | 465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53, 13 | 481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371, 14 | 63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518] 15 | 16 | #capacity是预先估计要去重的数量 17 | #error_rate表示错误率 18 | #conn表示redis的连接客户端 19 | #key表示在redis中的键的名字前缀 20 | def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'): 21 | self.m = math.ceil(capacity*math.log2(math.e)*math.log2(1/error_rate)) #需要的总bit位数 22 | self.k = math.ceil(math.log1p(2)*self.m/capacity) #需要最少的hash次数 23 | self.mem = math.ceil(self.m/8/1024/1024) #需要的多少M内存 24 | self.blocknum = math.ceil(self.mem/512) #需要多少个512M的内存块,value的第一个字符必须是ascii码,所有最多有256个内存块 25 | self.seeds = self.SEEDS[0:self.k] 26 | self.key = key 27 | self.N = 2**31-1 28 | self.redis = conn 29 | # print(self.mem) 30 | # print(self.k) 31 | 32 | def add(self, value): 33 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 34 | hashs = self.get_hashs(value) 35 | for hash in hashs: 36 | self.redis.setbit(name, hash, 1) 37 | 38 | def is_exist(self, value): 39 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 40 | hashs = self.get_hashs(value) 41 | exist = True 42 | for hash in hashs: 43 | exist = exist & self.redis.getbit(name, hash) 44 | return exist 45 | 46 | def get_hashs(self, value): 47 | hashs = list() 48 | for seed in self.seeds: 49 | hash = mmh3.hash(value, seed) 50 | if hash >= 0: 51 | hashs.append(hash) 52 | else: 53 | hashs.append(self.N - hash) 54 | return hashs 55 | 56 | 57 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0) 58 | conn = redis.StrictRedis(connection_pool=pool) 59 | 60 | if __name__ == "__main__": 61 | bf = PyBloomFilter(conn=conn) 62 | bf.add('www.jobbole.com') 63 | bf.add('www.zhihu.com') 64 | print(bf.is_exist('www.zhihu.com')) 65 | print(bf.is_exist('www.lagou.com')) 66 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/selenium_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from selenium import webdriver 4 | from scrapy.selector import Selector 5 | 6 | 7 | #知乎的模拟登录 8 | # browser = webdriver.Chrome(executable_path="E:/chromedriver.exe") #路径是chromedriver.exe的存放的位置 9 | # browser.get("https://www.zhihu.com/#signin") 10 | # browser.find_element_by_css_selector(".view-signin input[name='account']").send_keys("********") #帐号 11 | # browser.find_element_by_css_selector(".view-signin input[name='password']").send_keys("********") #密码 12 | # browser.find_element_by_id("captcha").send_keys(input('请输入验证码:')) 13 
| # browser.find_element_by_css_selector(".view-signin button.sign-button").click() #登录 14 | # browser.quit() 15 | # 16 | # 17 | # #可以用selenium得到js加载后的html,比如这样的话可以抓取到本来抓取的不到的一些字段(淘宝的交易量等等) 18 | # browser = webdriver.Chrome(executable_path="E:/chromedriver.exe") 19 | # browser.get("https://detail.tmall.com/item.htm?spm=a230r.1.14.3.yYBVG6&id=538286972599&cm_id=140105335569ed55e27b&abbucket=15&sku_properties=10004:709990523;5919063:6536025") 20 | # print(browser.page_source) #page_source就是js加载完的源代码 21 | # #browser.quit() 22 | # ''' 23 | # 如果是用selenium本身的选择器(python写的,比较慢),会很慢 24 | # 所以现在转换成scrapy中的selector(他是用c语言写的,很快) 25 | # 模版,也可以嵌入scrapy中 26 | # ''' 27 | # t_selector=Selector(text=browser.page_source) 28 | # print(t_selector.xpath('//*[@id="J_StrPriceModBox"]/dd/span/text()').extract()) 29 | 30 | 31 | # #selenium 完成微博模拟登录 32 | # browser = webdriver.Chrome(executable_path="E:/chromedriver.exe") 33 | # browser.get("http://weibo.com/") 34 | # import time 35 | # time.sleep(5) 36 | # browser.find_element_by_css_selector("#loginname").send_keys("******") 37 | # browser.find_element_by_css_selector(".info_list.password input[node-type='password']").send_keys("******") 38 | # browser.find_element_by_css_selector(".info_list.login_btn a[node-type='submitBtn']").click() 39 | # #下拉 40 | # for i in range(3): 41 | # '''三次下拉操作,这是javascript的知识''' 42 | # browser.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") 43 | # time.sleep(3) 44 | 45 | 46 | #设置chromedriver不加载图片 47 | #是固定的模版 48 | # chrome_opt = webdriver.ChromeOptions() 49 | # prefs = {"profile.managed_default_content_settings.images":2} 50 | # chrome_opt.add_experimental_option("prefs", prefs) 51 | # browser = webdriver.Chrome(executable_path="E:/chromedriver.exe",chrome_options=chrome_opt) 52 | # browser.get("http://weibo.com/") 53 | 54 | 55 | #phantomjs, 无界面的浏览器, 多进程情况下phantomjs性能会下降很严重 56 | browser = webdriver.PhantomJS(executable_path="F:/迅雷下载/phantomjs-2.1.1-windows/bin/phantomjs.exe") 57 | browser.get("https://detail.tmall.com/item.htm?spm=a230r.1.14.3.yYBVG6&id=538286972599&cm_id=140105335569ed55e27b&abbucket=15&sku_properties=10004:709990523;5919063:6536025") 58 | print (browser.page_source) 59 | browser.quit() -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/css/style.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | /*css reset*/ 3 | html, body, div, span, object, iframe, h1, h2, h3, h4, h5, h6, p, blockquote, pre, a, abbr, acronym, address, code, del, dfn, em, img, q, dl, dt, dd, ol, ul, li, fieldset, form, label, legend, table, caption, tbody, tfoot, thead, tr, th, td , i{ 4 | margin:0; 5 | padding:0; 6 | border:0; 7 | font-weight:inherit; 8 | font-style:inherit; 9 | font-size:100%; 10 | font-family:inherit; 11 | vertical-align:baseline; 12 | } 13 | body {line-height:1.5;} 14 | table {border-collapse: collapse;border-spacing:0;} 15 | caption, th, td ,b,strong{text-align:left;font-weight:normal;} 16 | table, td, th {vertical-align:middle;} 17 | blockquote:before, blockquote:after, q:before, q:after {content:"";} 18 | blockquote, q {quotes:"" "";} 19 | a img {border:none;} 20 | em,cite{font-style:normal;} 21 | 22 | 23 | body { background:#fff; font: 12px/1.5 Tahoma,'宋体';color:#000;} 24 | h1, h2, h3, h4, h5, h6 {font-weight:normal;color:#111;} 25 | a {text-decoration:none;cursor:pointer;} 26 | dl, dt, dd, ol, ul, li{ list-style:none;} 27 | 28 | 
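/* Note on the starred rules in this stylesheet: the "*html .ue-clear" selector below only
   matches in IE6, and property names prefixed with "*" (e.g. *padding-top, *line-height in
   the pagination rules) are only parsed by IE7 and earlier; modern browsers ignore both hacks. */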
/*some common class*/ 29 | .left{float:left;} 30 | .right{float:right;} 31 | 32 | /*clear*/ 33 | .ue-clear:after{content: ".";display:block;height:0;clear:both;visibility:hidden;} 34 | .ue-clear{display:inline-block;} 35 | *html .ue-clear{height:1%;} 36 | .ue-clear{display:block;} 37 | 38 | a{color:#0080cc;} 39 | a:hover{color:#267A01;text-decoration:underline;} 40 | /*logo样式*/ 41 | .logo{width:160px;height:47px;padding:0 5px;background: url(../img/logo1.png) no-repeat center center #fff;} 42 | 43 | /*choose样式*/ 44 | .choose{float:left;margin-right:15px;white-space:nowrap;} 45 | .choose .text{float:left;padding-left:20px;*padding-left:16px;white-space:nowrap; vertical-align:text-bottom;} 46 | .choose input[type=radio],.choose input[type=checkbox]{position:relative;*top:-3px;float:left;margin-right:-16px;} 47 | 48 | /*==================================== 49 | 分页信息(表格依赖样式) 50 | ===================================*/ 51 | .pagination{font-size:14px;} 52 | .pagination a {text-decoration: none;border: solid 1px; } 53 | .pagination .pxofy{float:left;margin-left: 5px;height:25px;*padding-top:1px;} 54 | .pagination a, .pagination span {display: block;float: left;height:18px;line-height:18px;padding:0 6px;margin-right: 5px;font-family:Arial, Helvetica, sans-serif !important;} 55 | .pagination .current {cursor:default;border: solid 1px ;} 56 | .pagination .prev, .pagination .next{*line-height:22px;} 57 | 58 | /*分页样式*/ 59 | .pagination a{color: #032F54;border-color:#8EB2D2;} 60 | .pagination a:hover{color:#023054;border-color:#8EB2D2;background:#B8DFFB;} 61 | .pagination .current{color:#fff;border-color:#5c9bc4;background:#89B8D8;} 62 | .pagination .current.prev, .pagination .current.next{color:#B9B9B9;border-color:#D3D3D3;background:#fff;} 63 | .pagination .pxofy{color: #023054;} 64 | 65 | #foot{height:32px;line-height:32px; text-align:center;background:#f9f9f9;border-top:1px solid #e0e0e0;color:#ababab;} 66 | 67 | 68 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/css/style.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | /*css reset*/ 3 | html, body, div, span, object, iframe, h1, h2, h3, h4, h5, h6, p, blockquote, pre, a, abbr, acronym, address, code, del, dfn, em, img, q, dl, dt, dd, ol, ul, li, fieldset, form, label, legend, table, caption, tbody, tfoot, thead, tr, th, td , i{ 4 | margin:0; 5 | padding:0; 6 | border:0; 7 | font-weight:inherit; 8 | font-style:inherit; 9 | font-size:100%; 10 | font-family:inherit; 11 | vertical-align:baseline; 12 | } 13 | body {line-height:1.5;} 14 | table {border-collapse: collapse;border-spacing:0;} 15 | caption, th, td ,b,strong{text-align:left;font-weight:normal;} 16 | table, td, th {vertical-align:middle;} 17 | blockquote:before, blockquote:after, q:before, q:after {content:"";} 18 | blockquote, q {quotes:"" "";} 19 | a img {border:none;} 20 | em,cite{font-style:normal;} 21 | 22 | 23 | body { background:#fff; font: 12px/1.5 Tahoma,'宋体';color:#000;} 24 | h1, h2, h3, h4, h5, h6 {font-weight:normal;color:#111;} 25 | a {text-decoration:none;cursor:pointer;} 26 | dl, dt, dd, ol, ul, li{ list-style:none;} 27 | 28 | /*some common class*/ 29 | .left{float:left;} 30 | .right{float:right;} 31 | 32 | /*clear*/ 33 | .ue-clear:after{content: ".";display:block;height:0;clear:both;visibility:hidden;} 34 | .ue-clear{display:inline-block;} 35 | *html .ue-clear{height:1%;} 36 | .ue-clear{display:block;} 37 | 38 | a{color:#0080cc;} 39 | 
a:hover{color:#267A01;text-decoration:underline;} 40 | /*logo样式*/ 41 | .logo{width:160px;height:47px;padding:0 5px;background: url(../img/logo1.png) no-repeat center center #fff;} 42 | 43 | /*choose样式*/ 44 | .choose{float:left;margin-right:15px;white-space:nowrap;} 45 | .choose .text{float:left;padding-left:20px;*padding-left:16px;white-space:nowrap; vertical-align:text-bottom;} 46 | .choose input[type=radio],.choose input[type=checkbox]{position:relative;*top:-3px;float:left;margin-right:-16px;} 47 | 48 | /*==================================== 49 | 分页信息(表格依赖样式) 50 | ===================================*/ 51 | .pagination{font-size:14px;} 52 | .pagination a {text-decoration: none;border: solid 1px; } 53 | .pagination .pxofy{float:left;margin-left: 5px;height:25px;*padding-top:1px;} 54 | .pagination a, .pagination span {display: block;float: left;height:18px;line-height:18px;padding:0 6px;margin-right: 5px;font-family:Arial, Helvetica, sans-serif !important;} 55 | .pagination .current {cursor:default;border: solid 1px ;} 56 | .pagination .prev, .pagination .next{*line-height:22px;} 57 | 58 | /*分页样式*/ 59 | .pagination a{color: #032F54;border-color:#8EB2D2;} 60 | .pagination a:hover{color:#023054;border-color:#8EB2D2;background:#B8DFFB;} 61 | .pagination .current{color:#fff;border-color:#5c9bc4;background:#89B8D8;} 62 | .pagination .current.prev, .pagination .current.next{color:#B9B9B9;border-color:#D3D3D3;background:#fff;} 63 | .pagination .pxofy{color: #023054;} 64 | 65 | #foot{height:32px;line-height:32px; text-align:center;background:#f9f9f9;border-top:1px solid #e0e0e0;color:#ababab;} 66 | 67 | 68 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/css/advanced.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #hd{padding:20px 10px;} 4 | .logo{float:left;margin-right:30px; height:33px;} 5 | /*input搜索区域*/ 6 | .inputArea{float:left;position:relative;} 7 | .inputArea .searchInput{border:1px solid #bfbfbf;padding:0 15px;outline:none;height:35px;*line-height:38px;width:350px; background:url(../img/inputbg.png);font-size:14px;} 8 | .inputArea .searchButton{position:absolute;left:382px;top:0;*top:1px;*left:381px;width:106px;height:38px;background:url(../img/btn_min.png) no-repeat;border:none; cursor:pointer;} 9 | 10 | /*返回搜索*/ 11 | .inputArea .back{position:absolute;font-size:14px;left:500px;top:12px;width:60px;text-decoration:underline;} 12 | 13 | /*分界区域*/ 14 | .divsion{margin-bottom:24px;height:36px;background:#f9f9f9;border-bottom:1px solid #e0e0e0;} 15 | 16 | /*高级搜索区域*/ 17 | .subfield{border-left:4px solid #9cc813;font-size:14px;font-weight:bold;padding:2px 0 2px 20px;} 18 | .subfieldContent{padding-left:140px;padding-bottom:40px;} 19 | .subfieldContent .advanceItem{padding-left:350px;margin-bottom:15px;padding-top:8px;padding-bottom:3px;} 20 | .subfieldContent .advanceItem.keyWords{background:#f4f4f4;padding-top:18px;padding-bottom:3px;} 21 | .subfieldContent .advanceItem dd{float:left;margin-left:-320px;} 22 | .subfieldContent .advanceItem dd label{float:left;margin-right:40px;width:75px;font-weight:bold;} 23 | .subfieldContent .advanceItem dd .impInfo{ font-weight:bold;} 24 | .subfieldContent .advanceItem dd .tips{float:left;} 25 | .subfieldContent .advanceItem dd p, .subfieldContent .advanceItem dt p{margin-bottom:10px;height:26px;} 26 | .subfieldContent .advanceItem dt p input[type=text]{position:relative;top:-5px;line-height:26px;} 27 | 28 | 
.subfieldContent .advanceItem dt{float:left;width:100%;} 29 | .subfieldContent .advanceItem.keyWords dt input[type=text]{width:290px;height:26px;border:1px solid #bfbfbf;outline:none;} 30 | /*自定义*/ 31 | .subfieldContent .advanceItem.time{height:30px;} 32 | .subfieldContent .advanceItem .define{display:none;position:relative;*top:-3px;} 33 | .subfieldContent .advanceItem.time input[type=text]{width:80px;height:18px;line-height:18px;border:1px solid #bfbfbf;outline:none;} 34 | 35 | 36 | 37 | 38 | 39 | /*更多按钮*/ 40 | .more {float:left;} 41 | .more:hover{text-decoration:none;} 42 | .more .moreIcon{display:inline-block;position:relative;top:-1px;*top:-3px;left:2px;*left:-1px;width:9px;height:5px;background:url(../img/more.png);} 43 | .more.show .moreIcon{background:url(../img/down.png);top:-2px;} 44 | 45 | /*立即搜索样式*/ 46 | .subfieldContent .search{margin:45px 0 0 145px;width:130px;height:36px;background:url(../img/btnbg.png); font-weight:bold;border:none;border:1px solid #bfbfbf;line-height:36px;cursor:pointer;font-size:14px;} 47 | /*联想下拉区域*/ 48 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 49 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 50 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 51 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/css/advanced.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #hd{padding:20px 10px;} 4 | .logo{float:left;margin-right:30px; height:33px;} 5 | /*input搜索区域*/ 6 | .inputArea{float:left;position:relative;} 7 | .inputArea .searchInput{border:1px solid #bfbfbf;padding:0 15px;outline:none;height:35px;*line-height:38px;width:350px; background:url(../img/inputbg.png);font-size:14px;} 8 | .inputArea .searchButton{position:absolute;left:382px;top:0;*top:1px;*left:381px;width:106px;height:38px;background:url(../img/btn_min.png) no-repeat;border:none; cursor:pointer;} 9 | 10 | /*返回搜索*/ 11 | .inputArea .back{position:absolute;font-size:14px;left:500px;top:12px;width:60px;text-decoration:underline;} 12 | 13 | /*分界区域*/ 14 | .divsion{margin-bottom:24px;height:36px;background:#f9f9f9;border-bottom:1px solid #e0e0e0;} 15 | 16 | /*高级搜索区域*/ 17 | .subfield{border-left:4px solid #9cc813;font-size:14px;font-weight:bold;padding:2px 0 2px 20px;} 18 | .subfieldContent{padding-left:140px;padding-bottom:40px;} 19 | .subfieldContent .advanceItem{padding-left:350px;margin-bottom:15px;padding-top:8px;padding-bottom:3px;} 20 | .subfieldContent .advanceItem.keyWords{background:#f4f4f4;padding-top:18px;padding-bottom:3px;} 21 | .subfieldContent .advanceItem dd{float:left;margin-left:-320px;} 22 | .subfieldContent .advanceItem dd label{float:left;margin-right:40px;width:75px;font-weight:bold;} 23 | .subfieldContent .advanceItem dd .impInfo{ font-weight:bold;} 24 | .subfieldContent .advanceItem dd .tips{float:left;} 25 | .subfieldContent .advanceItem dd p, .subfieldContent .advanceItem dt p{margin-bottom:10px;height:26px;} 26 | .subfieldContent .advanceItem dt p input[type=text]{position:relative;top:-5px;line-height:26px;} 27 | 28 | .subfieldContent .advanceItem dt{float:left;width:100%;} 29 | .subfieldContent .advanceItem.keyWords dt input[type=text]{width:290px;height:26px;border:1px solid #bfbfbf;outline:none;} 30 | /*自定义*/ 31 | .subfieldContent .advanceItem.time{height:30px;} 32 
| .subfieldContent .advanceItem .define{display:none;position:relative;*top:-3px;} 33 | .subfieldContent .advanceItem.time input[type=text]{width:80px;height:18px;line-height:18px;border:1px solid #bfbfbf;outline:none;} 34 | 35 | 36 | 37 | 38 | 39 | /*更多按钮*/ 40 | .more {float:left;} 41 | .more:hover{text-decoration:none;} 42 | .more .moreIcon{display:inline-block;position:relative;top:-1px;*top:-3px;left:2px;*left:-1px;width:9px;height:5px;background:url(../img/more.png);} 43 | .more.show .moreIcon{background:url(../img/down.png);top:-2px;} 44 | 45 | /*立即搜索样式*/ 46 | .subfieldContent .search{margin:45px 0 0 145px;width:130px;height:36px;background:url(../img/btnbg.png); font-weight:bold;border:none;border:1px solid #bfbfbf;line-height:36px;cursor:pointer;font-size:14px;} 47 | /*联想下拉区域*/ 48 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 49 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 50 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 51 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/utils/bloomfilter.py: -------------------------------------------------------------------------------- 1 | import mmh3 2 | import BitVector 3 | import redis 4 | import math 5 | import time 6 | 7 | 8 | class BloomFilter(): 9 | #内置100个随机种子 10 | SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372, 11 | 344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338, 12 | 465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53, 13 | 481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371, 14 | 63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518] 15 | 16 | #capacity是预先估计要去重的数量 17 | #error_rate表示错误率 18 | #conn表示redis的连接客户端 19 | #key表示在redis中的键的名字前缀 20 | def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'): 21 | self.m = math.ceil(capacity*math.log2(math.e)*math.log2(1/error_rate)) #需要的总bit位数 22 | self.k = math.ceil(math.log1p(2)*self.m/capacity) #需要最少的hash次数 23 | self.mem = math.ceil(self.m/8/1024/1024) #需要的多少M内存 24 | self.blocknum = math.ceil(self.mem/512) #需要多少个512M的内存块,value的第一个字符必须是ascii码,所有最多有256个内存块 25 | self.seeds = self.SEEDS[0:self.k] 26 | self.key = key 27 | self.N = 2**31-1 28 | self.redis = conn 29 | if not self.redis: 30 | #默认如果没有redis连接,在内存中使用512M的内存块去重 31 | self.bitset = BitVector.BitVector(size=1<<32) 32 | print(self.mem) 33 | print(self.k) 34 | 35 | def add(self, value): 36 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 37 | hashs = self.get_hashs(value) 38 | for hash in hashs: 39 | if self.redis: 40 | self.redis.setbit(name, hash, 1) 41 | else: 42 | self.bitset[hash] = 1 43 | 44 | def is_exist(self, value): 45 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 46 | hashs = self.get_hashs(value) 47 | exist = True 48 | for hash in hashs: 49 | if self.redis: 50 | exist = exist & self.redis.getbit(name, hash) 51 | else: 52 | exist = exist & self.bitset[hash] 53 | return exist 54 | 55 | def get_hashs(self, value): 56 | hashs = list() 57 | for seed in self.seeds: 58 | hash = mmh3.hash(value, seed) 59 | if hash >= 0: 60 | hashs.append(hash) 61 | else: 62 | hashs.append(self.N - hash) 63 | return hashs 64 | 
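# Sizing notes (added for clarity): with the defaults capacity=1e9 and error_rate=1e-8,
# m = capacity * log2(e) * log2(1/error_rate) works out to roughly 3.8e10 bits (about 4.5 GB),
# which is why the filter is split across 9 Redis bitmaps: a Redis string (and therefore
# SETBIT) tops out at 512 MB. Also note that math.log1p(2) is ln(3) ≈ 1.10, not ln(2) ≈ 0.69;
# the textbook optimum is k = (m / capacity) * ln(2) ≈ 27 hashes here, whereas this code ends
# up with about 43. The filter still works (no false negatives), it just issues more
# SETBIT/GETBIT calls per value than strictly necessary.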
65 | 66 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0) 67 | conn = redis.StrictRedis(connection_pool=pool) 68 | 69 | start = time.time() 70 | bf = BloomFilter(conn=conn) 71 | bf.add('test') 72 | bf.add('fsest1') 73 | print(bf.is_exist('qest')) 74 | print(bf.is_exist('testdsad')) 75 | end = time.time() 76 | print(end-start) -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/utils/zhihu_login_requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | import requests 5 | try: 6 | import cookielib 7 | except: 8 | import http.cookiejar as cookielib 9 | 10 | import re 11 | 12 | session = requests.session() 13 | session.cookies = cookielib.LWPCookieJar(filename="cookies.txt") 14 | try: 15 | session.cookies.load(ignore_discard=True) 16 | except: 17 | print ("cookie未能加载") 18 | 19 | agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" 20 | header = { 21 | "HOST":"www.zhihu.com", 22 | "Referer": "https://www.zhizhu.com", 23 | 'User-Agent': agent 24 | } 25 | 26 | def is_login(): 27 | #通过个人中心页面返回状态码来判断是否为登录状态 28 | inbox_url = "https://www.zhihu.com/question/56250357/answer/148534773" 29 | response = session.get(inbox_url, headers=header, allow_redirects=False) 30 | if response.status_code != 200: 31 | return False 32 | else: 33 | return True 34 | 35 | def get_xsrf(): 36 | #获取xsrf code 37 | response = session.get("https://www.zhihu.com", headers=header) 38 | match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text) 39 | if match_obj: 40 | return (match_obj.group(1)) 41 | else: 42 | return "" 43 | 44 | 45 | def get_index(): 46 | response = session.get("https://www.zhihu.com", headers=header) 47 | with open("index_page.html", "wb") as f: 48 | f.write(response.text.encode("utf-8")) 49 | print ("ok") 50 | 51 | def get_captcha(): 52 | import time 53 | t = str(int(time.time()*1000)) 54 | captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t) 55 | t = session.get(captcha_url, headers=header) 56 | with open("captcha.jpg","wb") as f: 57 | f.write(t.content) 58 | f.close() 59 | 60 | from PIL import Image 61 | try: 62 | im = Image.open('captcha.jpg') 63 | im.show() 64 | im.close() 65 | except: 66 | pass 67 | 68 | captcha = input("输入验证码\n>") 69 | return captcha 70 | 71 | def zhihu_login(account, password): 72 | #知乎登录 73 | if re.match("^1\d{10}",account): 74 | print ("手机号码登录") 75 | post_url = "https://www.zhihu.com/login/phone_num" 76 | post_data = { 77 | "_xsrf": get_xsrf(), 78 | "phone_num": account, 79 | "password": password, 80 | "captcha":get_captcha() 81 | } 82 | else: 83 | if "@" in account: 84 | #判断用户名是否为邮箱 85 | print("邮箱方式登录") 86 | post_url = "https://www.zhihu.com/login/email" 87 | post_data = { 88 | "_xsrf": get_xsrf(), 89 | "email": account, 90 | "password": password 91 | } 92 | 93 | response_text = session.post(post_url, data=post_data, headers=header) 94 | session.cookies.save() 95 | 96 | zhihu_login("18782902568", "admin123") 97 | # get_index() 98 | is_login() 99 | 100 | # get_captcha() -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/utils/bloomfilter.py: -------------------------------------------------------------------------------- 1 | import mmh3 2 | import BitVector 3 | import redis 4 | import math 5 | import time 6 | 7 | 8 | class BloomFilter(): 9 | #内置100个随机种子 10 | SEEDS = [543, 
460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372, 11 | 344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338, 12 | 465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53, 13 | 481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371, 14 | 63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518] 15 | 16 | #capacity是预先估计要去重的数量 17 | #error_rate表示错误率 18 | #conn表示redis的连接客户端 19 | #key表示在redis中的键的名字前缀 20 | def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'): 21 | self.m = math.ceil(capacity*math.log2(math.e)*math.log2(1/error_rate)) #需要的总bit位数 22 | self.k = math.ceil(math.log1p(2)*self.m/capacity) #需要最少的hash次数 23 | self.mem = math.ceil(self.m/8/1024/1024) #需要的多少M内存 24 | self.blocknum = math.ceil(self.mem/512) #需要多少个512M的内存块,value的第一个字符必须是ascii码,所有最多有256个内存块 25 | self.seeds = self.SEEDS[0:self.k] 26 | self.key = key 27 | self.N = 2**31-1 28 | self.redis = conn 29 | if not self.redis: 30 | #默认如果没有redis连接,在内存中使用512M的内存块去重 31 | self.bitset = BitVector.BitVector(size=1<<32) 32 | print(self.mem) 33 | print(self.k) 34 | 35 | def add(self, value): 36 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 37 | hashs = self.get_hashs(value) 38 | for hash in hashs: 39 | if self.redis: 40 | self.redis.setbit(name, hash, 1) 41 | else: 42 | self.bitset[hash] = 1 43 | 44 | def is_exist(self, value): 45 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 46 | hashs = self.get_hashs(value) 47 | exist = True 48 | for hash in hashs: 49 | if self.redis: 50 | exist = exist & self.redis.getbit(name, hash) 51 | else: 52 | exist = exist & self.bitset[hash] 53 | return exist 54 | 55 | def get_hashs(self, value): 56 | hashs = list() 57 | for seed in self.seeds: 58 | hash = mmh3.hash(value, seed) 59 | if hash >= 0: 60 | hashs.append(hash) 61 | else: 62 | hashs.append(self.N - hash) 63 | return hashs 64 | 65 | 66 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0) 67 | conn = redis.StrictRedis(connection_pool=pool) 68 | 69 | start = time.time() 70 | bf = BloomFilter(conn=conn) 71 | bf.add('test') 72 | bf.add('fsest1') 73 | print(bf.is_exist('qest')) 74 | print(bf.is_exist('testdsad')) 75 | end = time.time() 76 | print(end-start) -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/utils/zhihu_login_requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | import requests 5 | try: 6 | import cookielib 7 | except: 8 | import http.cookiejar as cookielib 9 | 10 | import re 11 | 12 | session = requests.session() 13 | session.cookies = cookielib.LWPCookieJar(filename="cookies.txt") 14 | try: 15 | session.cookies.load(ignore_discard=True) 16 | except: 17 | print ("cookie未能加载") 18 | 19 | agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" 20 | header = { 21 | "HOST":"www.zhihu.com", 22 | "Referer": "https://www.zhizhu.com", 23 | 'User-Agent': agent 24 | } 25 | 26 | def is_login(): 27 | #通过个人中心页面返回状态码来判断是否为登录状态 28 | inbox_url = "https://www.zhihu.com/question/56250357/answer/148534773" 29 | response = session.get(inbox_url, headers=header, allow_redirects=False) 30 | if response.status_code != 200: 31 | return False 32 | else: 33 | return True 34 | 35 | def 
get_xsrf(): 36 | #获取xsrf code 37 | response = session.get("https://www.zhihu.com", headers=header) 38 | match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text) 39 | if match_obj: 40 | return (match_obj.group(1)) 41 | else: 42 | return "" 43 | 44 | 45 | def get_index(): 46 | response = session.get("https://www.zhihu.com", headers=header) 47 | with open("index_page.html", "wb") as f: 48 | f.write(response.text.encode("utf-8")) 49 | print ("ok") 50 | 51 | def get_captcha(): 52 | import time 53 | t = str(int(time.time()*1000)) 54 | captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t) 55 | t = session.get(captcha_url, headers=header) 56 | with open("captcha.jpg","wb") as f: 57 | f.write(t.content) 58 | f.close() 59 | 60 | from PIL import Image 61 | try: 62 | im = Image.open('captcha.jpg') 63 | im.show() 64 | im.close() 65 | except: 66 | pass 67 | 68 | captcha = input("输入验证码\n>") 69 | return captcha 70 | 71 | def zhihu_login(account, password): 72 | #知乎登录 73 | if re.match("^1\d{10}",account): 74 | print ("手机号码登录") 75 | post_url = "https://www.zhihu.com/login/phone_num" 76 | post_data = { 77 | "_xsrf": get_xsrf(), 78 | "phone_num": account, 79 | "password": password, 80 | "captcha":get_captcha() 81 | } 82 | else: 83 | if "@" in account: 84 | #判断用户名是否为邮箱 85 | print("邮箱方式登录") 86 | post_url = "https://www.zhihu.com/login/email" 87 | post_data = { 88 | "_xsrf": get_xsrf(), 89 | "email": account, 90 | "password": password 91 | } 92 | 93 | response_text = session.post(post_url, data=post_data, headers=header) 94 | session.cookies.save() 95 | 96 | zhihu_login("18782902568", "admin123") 97 | # get_index() 98 | is_login() 99 | 100 | # get_captcha() -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/crawl_xici_ip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | import requests 4 | from scrapy.selector import Selector 5 | import MySQLdb 6 | 7 | conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="root", db="article_spider", charset="utf8") 8 | cursor = conn.cursor() 9 | 10 | 11 | def crawl_ips(): 12 | #爬取西刺的免费ip代理 13 | headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"} 14 | for i in range(1568): 15 | re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers) 16 | 17 | selector = Selector(text=re.text) 18 | all_trs = selector.css("#ip_list tr") 19 | 20 | 21 | ip_list = [] 22 | for tr in all_trs[1:]: 23 | speed_str = tr.css(".bar::attr(title)").extract()[0] 24 | if speed_str: 25 | speed = float(speed_str.split("秒")[0]) 26 | all_texts = tr.css("td::text").extract() 27 | 28 | ip = all_texts[0] 29 | port = all_texts[1] 30 | proxy_type = all_texts[5] 31 | 32 | ip_list.append((ip, port, proxy_type, speed)) 33 | 34 | for ip_info in ip_list: 35 | cursor.execute( 36 | "insert proxy_ip(ip, port, speed, proxy_type) VALUES('{0}', '{1}', {2}, 'HTTP')".format( 37 | ip_info[0], ip_info[1], ip_info[3] 38 | ) 39 | ) 40 | 41 | conn.commit() 42 | 43 | 44 | class GetIP(object): 45 | def delete_ip(self, ip): 46 | #从数据库中删除无效的ip 47 | delete_sql = """ 48 | delete from proxy_ip where ip='{0}' 49 | """.format(ip) 50 | cursor.execute(delete_sql) 51 | conn.commit() 52 | return True 53 | 54 | def judge_ip(self, ip, port): 55 | #判断ip是否可用 56 | http_url = "http://www.baidu.com" 57 | proxy_url = "http://{0}:{1}".format(ip, port) 58 | try: 59 | proxy_dict = { 60 | 
"http":proxy_url, 61 | } 62 | response = requests.get(http_url, proxies=proxy_dict) 63 | except Exception as e: 64 | print ("invalid ip and port") 65 | self.delete_ip(ip) 66 | return False 67 | else: 68 | code = response.status_code 69 | if code >= 200 and code < 300: 70 | print ("effective ip") 71 | return True 72 | else: 73 | print ("invalid ip and port") 74 | self.delete_ip(ip) 75 | return False 76 | 77 | 78 | def get_random_ip(self): 79 | #从数据库中随机获取一个可用的ip 80 | random_sql = """ 81 | SELECT ip, port FROM proxy_ip 82 | ORDER BY RAND() 83 | LIMIT 1 84 | """ 85 | result = cursor.execute(random_sql) 86 | for ip_info in cursor.fetchall(): 87 | ip = ip_info[0] 88 | port = ip_info[1] 89 | 90 | judge_re = self.judge_ip(ip, port) 91 | if judge_re: 92 | return "http://{0}:{1}".format(ip, port) 93 | else: 94 | return self.get_random_ip() 95 | 96 | 97 | 98 | # print (crawl_ips()) 99 | if __name__ == "__main__": 100 | get_ip = GetIP() 101 | get_ip.get_random_ip() -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/tools/crawl_xici_ip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | import requests 4 | from scrapy.selector import Selector 5 | import MySQLdb 6 | 7 | conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="root", db="article_spider", charset="utf8") 8 | cursor = conn.cursor() 9 | 10 | 11 | def crawl_ips(): 12 | #爬取西刺的免费ip代理 13 | headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"} 14 | for i in range(1568): 15 | re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers) 16 | 17 | selector = Selector(text=re.text) 18 | all_trs = selector.css("#ip_list tr") 19 | 20 | 21 | ip_list = [] 22 | for tr in all_trs[1:]: 23 | speed_str = tr.css(".bar::attr(title)").extract()[0] 24 | if speed_str: 25 | speed = float(speed_str.split("秒")[0]) 26 | all_texts = tr.css("td::text").extract() 27 | 28 | ip = all_texts[0] 29 | port = all_texts[1] 30 | proxy_type = all_texts[5] 31 | 32 | ip_list.append((ip, port, proxy_type, speed)) 33 | 34 | for ip_info in ip_list: 35 | cursor.execute( 36 | "insert proxy_ip(ip, port, speed, proxy_type) VALUES('{0}', '{1}', {2}, 'HTTP')".format( 37 | ip_info[0], ip_info[1], ip_info[3] 38 | ) 39 | ) 40 | 41 | conn.commit() 42 | 43 | 44 | class GetIP(object): 45 | def delete_ip(self, ip): 46 | #从数据库中删除无效的ip 47 | delete_sql = """ 48 | delete from proxy_ip where ip='{0}' 49 | """.format(ip) 50 | cursor.execute(delete_sql) 51 | conn.commit() 52 | return True 53 | 54 | def judge_ip(self, ip, port): 55 | #判断ip是否可用 56 | http_url = "http://www.baidu.com" 57 | proxy_url = "http://{0}:{1}".format(ip, port) 58 | try: 59 | proxy_dict = { 60 | "http":proxy_url, 61 | } 62 | response = requests.get(http_url, proxies=proxy_dict) 63 | except Exception as e: 64 | print ("invalid ip and port") 65 | self.delete_ip(ip) 66 | return False 67 | else: 68 | code = response.status_code 69 | if code >= 200 and code < 300: 70 | print ("effective ip") 71 | return True 72 | else: 73 | print ("invalid ip and port") 74 | self.delete_ip(ip) 75 | return False 76 | 77 | 78 | def get_random_ip(self): 79 | #从数据库中随机获取一个可用的ip 80 | random_sql = """ 81 | SELECT ip, port FROM proxy_ip 82 | ORDER BY RAND() 83 | LIMIT 1 84 | """ 85 | result = cursor.execute(random_sql) 86 | for ip_info in cursor.fetchall(): 87 | ip = ip_info[0] 88 | port = ip_info[1] 89 | 90 | judge_re = self.judge_ip(ip, port) 
91 | if judge_re: 92 | return "http://{0}:{1}".format(ip, port) 93 | else: 94 | return self.get_random_ip() 95 | 96 | 97 | 98 | # print (crawl_ips()) 99 | if __name__ == "__main__": 100 | get_ip = GetIP() 101 | get_ip.get_random_ip() -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for ScrapyRedisTest project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'ScrapyRedisTest' 13 | 14 | SPIDER_MODULES = ['ScrapyRedisTest.spiders'] 15 | NEWSPIDER_MODULE = 'ScrapyRedisTest.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'ScrapyRedisTest (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'ScrapyRedisTest.middlewares.ScrapyredistestSpiderMiddleware': 543, 51 | #} 52 | 53 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 54 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 55 | ITEM_PIPELINES = { 56 | 'scrapy_redis.pipelines.RedisPipeline': 300 57 | } 58 | # Enable or disable downloader middlewares 59 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 60 | #DOWNLOADER_MIDDLEWARES = { 61 | # 'ScrapyRedisTest.middlewares.MyCustomDownloaderMiddleware': 543, 62 | #} 63 | 64 | # Enable or disable extensions 65 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 72 | #ITEM_PIPELINES = { 73 | # 'ScrapyRedisTest.pipelines.ScrapyredistestPipeline': 300, 74 | #} 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 78 | #AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in 
case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | #HTTPCACHE_ENABLED = True 92 | #HTTPCACHE_EXPIRATION_SECS = 0 93 | #HTTPCACHE_DIR = 'httpcache' 94 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/LcvSearch/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for LcvSearch project. 3 | 4 | Generated by 'django-admin startproject' using Django 1.11. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.11/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.11/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = '5)$op9fxf2b#%(*_-qcr7sf)*c@gr!v=d851(*3f*2gef0f!#d' 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 
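# (DEBUG is left on below for local development only; a production deployment would set
# DEBUG = False and list the served hostnames in ALLOWED_HOSTS.)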
26 | DEBUG = True 27 | 28 | ALLOWED_HOSTS = [] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | 'django.contrib.admin', 35 | 'django.contrib.auth', 36 | 'django.contrib.contenttypes', 37 | 'django.contrib.sessions', 38 | 'django.contrib.messages', 39 | 'django.contrib.staticfiles', 40 | 'search', 41 | ] 42 | 43 | MIDDLEWARE = [ 44 | 'django.middleware.security.SecurityMiddleware', 45 | 'django.contrib.sessions.middleware.SessionMiddleware', 46 | 'django.middleware.common.CommonMiddleware', 47 | 'django.middleware.csrf.CsrfViewMiddleware', 48 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 49 | 'django.contrib.messages.middleware.MessageMiddleware', 50 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 51 | ] 52 | 53 | ROOT_URLCONF = 'LcvSearch.urls' 54 | 55 | TEMPLATES = [ 56 | { 57 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 58 | 'DIRS': [os.path.join(BASE_DIR, 'templates')] 59 | , 60 | 'APP_DIRS': True, 61 | 'OPTIONS': { 62 | 'context_processors': [ 63 | 'django.template.context_processors.debug', 64 | 'django.template.context_processors.request', 65 | 'django.contrib.auth.context_processors.auth', 66 | 'django.contrib.messages.context_processors.messages', 67 | ], 68 | }, 69 | }, 70 | ] 71 | 72 | WSGI_APPLICATION = 'LcvSearch.wsgi.application' 73 | 74 | 75 | # Database 76 | # https://docs.djangoproject.com/en/1.11/ref/settings/#databases 77 | 78 | DATABASES = { 79 | 'default': { 80 | 'ENGINE': 'django.db.backends.sqlite3', 81 | 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), 82 | } 83 | } 84 | 85 | 86 | # Password validation 87 | # https://docs.djangoproject.com/en/1.11/ref/settings/#auth-password-validators 88 | 89 | AUTH_PASSWORD_VALIDATORS = [ 90 | { 91 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 92 | }, 93 | { 94 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 95 | }, 96 | { 97 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 98 | }, 99 | { 100 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 101 | }, 102 | ] 103 | 104 | 105 | # Internationalization 106 | # https://docs.djangoproject.com/en/1.11/topics/i18n/ 107 | 108 | LANGUAGE_CODE = 'en-us' 109 | 110 | TIME_ZONE = 'UTC' 111 | 112 | USE_I18N = True 113 | 114 | USE_L10N = True 115 | 116 | USE_TZ = True 117 | 118 | 119 | # Static files (CSS, JavaScript, Images) 120 | # https://docs.djangoproject.com/en/1.11/howto/static-files/ 121 | 122 | STATIC_URL = '/static/' 123 | 124 | STATICFILES_DIRS = [ 125 | os.path.join(BASE_DIR, "static") 126 | ] -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from fake_useragent import UserAgent 10 | 11 | from tools.crawl_xici_ip import GetIP 12 | 13 | 14 | 15 | class ArticlespiderSpiderMiddleware(object): 16 | # Not all methods need to be defined. If a method is not defined, 17 | # scrapy acts as if the spider middleware does not modify the 18 | # passed objects. 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | # This method is used by Scrapy to create your spiders. 
23 | s = cls() 24 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 25 | return s 26 | 27 | def process_spider_input(response, spider): 28 | # Called for each response that goes through the spider 29 | # middleware and into the spider. 30 | 31 | # Should return None or raise an exception. 32 | return None 33 | 34 | def process_spider_output(response, result, spider): 35 | # Called with the results returned from the Spider, after 36 | # it has processed the response. 37 | 38 | # Must return an iterable of Request, dict or Item objects. 39 | for i in result: 40 | yield i 41 | 42 | def process_spider_exception(response, exception, spider): 43 | # Called when a spider or process_spider_input() method 44 | # (from other spider middleware) raises an exception. 45 | 46 | # Should return either None or an iterable of Response, dict 47 | # or Item objects. 48 | pass 49 | 50 | def process_start_requests(start_requests, spider): 51 | # Called with the start requests of the spider, and works 52 | # similarly to the process_spider_output() method, except 53 | # that it doesn’t have a response associated. 54 | 55 | # Must return only requests (not items). 56 | for r in start_requests: 57 | yield r 58 | 59 | def spider_opened(self, spider): 60 | spider.logger.info('Spider opened: %s' % spider.name) 61 | 62 | 63 | class RandomUserAgentMiddlware(object): 64 | #随机更换user-agent 65 | def __init__(self, crawler): 66 | super(RandomUserAgentMiddlware, self).__init__() 67 | self.ua = UserAgent() 68 | self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random") 69 | 70 | @classmethod 71 | def from_crawler(cls, crawler): 72 | return cls(crawler) 73 | 74 | def process_request(self, request, spider): 75 | def get_ua(): 76 | return getattr(self.ua, self.ua_type) 77 | 78 | request.headers.setdefault('User-Agent', get_ua()) 79 | 80 | class RandomProxyMiddleware(object): 81 | #动态设置ip代理 82 | def process_request(self, request, spider): 83 | get_ip = GetIP() 84 | request.meta["proxy"] = get_ip.get_random_ip() 85 | 86 | 87 | from selenium import webdriver 88 | from scrapy.http import HtmlResponse 89 | class JSPageMiddleware(object): 90 | 91 | #通过chrome请求动态网页 92 | def process_request(self, request, spider): 93 | if spider.name == "jobbole": 94 | # browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe") 95 | spider.browser.get(request.url) 96 | import time 97 | time.sleep(3) 98 | print ("访问:{0}".format(request.url)) 99 | 100 | return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request) 101 | 102 | # from pyvirtualdisplay import Display 103 | # display = Display(visible=0, size=(800, 600)) 104 | # display.start() 105 | # 106 | # browser = webdriver.Chrome() 107 | # browser.get() 108 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from fake_useragent import UserAgent 10 | 11 | from tools.crawl_xici_ip import GetIP 12 | 13 | 14 | 15 | class ArticlespiderSpiderMiddleware(object): 16 | # Not all methods need to be defined. 
If a method is not defined, 17 | # scrapy acts as if the spider middleware does not modify the 18 | # passed objects. 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | # This method is used by Scrapy to create your spiders. 23 | s = cls() 24 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 25 | return s 26 | 27 | def process_spider_input(response, spider): 28 | # Called for each response that goes through the spider 29 | # middleware and into the spider. 30 | 31 | # Should return None or raise an exception. 32 | return None 33 | 34 | def process_spider_output(response, result, spider): 35 | # Called with the results returned from the Spider, after 36 | # it has processed the response. 37 | 38 | # Must return an iterable of Request, dict or Item objects. 39 | for i in result: 40 | yield i 41 | 42 | def process_spider_exception(response, exception, spider): 43 | # Called when a spider or process_spider_input() method 44 | # (from other spider middleware) raises an exception. 45 | 46 | # Should return either None or an iterable of Response, dict 47 | # or Item objects. 48 | pass 49 | 50 | def process_start_requests(start_requests, spider): 51 | # Called with the start requests of the spider, and works 52 | # similarly to the process_spider_output() method, except 53 | # that it doesn’t have a response associated. 54 | 55 | # Must return only requests (not items). 56 | for r in start_requests: 57 | yield r 58 | 59 | def spider_opened(self, spider): 60 | spider.logger.info('Spider opened: %s' % spider.name) 61 | 62 | 63 | class RandomUserAgentMiddlware(object): 64 | #随机更换user-agent 65 | def __init__(self, crawler): 66 | super(RandomUserAgentMiddlware, self).__init__() 67 | self.ua = UserAgent() 68 | self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random") 69 | 70 | @classmethod 71 | def from_crawler(cls, crawler): 72 | return cls(crawler) 73 | 74 | def process_request(self, request, spider): 75 | def get_ua(): 76 | return getattr(self.ua, self.ua_type) 77 | 78 | request.headers.setdefault('User-Agent', get_ua()) 79 | 80 | class RandomProxyMiddleware(object): 81 | #动态设置ip代理 82 | def process_request(self, request, spider): 83 | get_ip = GetIP() 84 | request.meta["proxy"] = get_ip.get_random_ip() 85 | 86 | 87 | from selenium import webdriver 88 | from scrapy.http import HtmlResponse 89 | class JSPageMiddleware(object): 90 | 91 | #通过chrome请求动态网页 92 | def process_request(self, request, spider): 93 | if spider.name == "jobbole": 94 | # browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe") 95 | spider.browser.get(request.url) 96 | import time 97 | time.sleep(3) 98 | print ("访问:{0}".format(request.url)) 99 | 100 | return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request) 101 | 102 | # from pyvirtualdisplay import Display 103 | # display = Display(visible=0, size=(800, 600)) 104 | # display.start() 105 | # 106 | # browser = webdriver.Chrome() 107 | # browser.get() 108 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/yundama_requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | import json 5 | import requests 6 | 7 | class YDMHttp(object): 8 | apiurl = 'http://api.yundama.com/api.php' 9 | username = '' 10 | password = '' 11 | appid = '' 12 | appkey = '' 13 | 14 | def __init__(self, username, password, appid, appkey): 15 | 
        self.username = username
16 |         self.password = password
17 |         self.appid = str(appid)
18 |         self.appkey = appkey
19 | 
20 |     def balance(self):
21 |         data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
22 |         response_data = requests.post(self.apiurl, data=data)
23 |         ret_data = json.loads(response_data.text)
24 |         if ret_data["ret"] == 0:
25 |             print("Remaining credits:", ret_data["balance"])
26 |             return ret_data["balance"]
27 |         else:
28 |             return None
29 | 
30 |     def login(self):
31 |         data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
32 |         response_data = requests.post(self.apiurl, data=data)
33 |         ret_data = json.loads(response_data.text)
34 |         if ret_data["ret"] == 0:
35 |             print("Login successful, uid:", ret_data["uid"])
36 |             return ret_data["uid"]
37 |         else:
38 |             return None
39 | 
40 |     def decode(self, filename, codetype, timeout):
41 |         data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
42 |         files = {'file': open(filename, 'rb')}
43 |         response_data = requests.post(self.apiurl, files=files, data=data)
44 |         ret_data = json.loads(response_data.text)
45 |         if ret_data["ret"] == 0:
46 |             print("Captcha recognised:", ret_data["text"])
47 |             return ret_data["text"]
48 |         else:
49 |             return None
50 | 
51 | def ydm(file_path):
52 |     username = 'da_ge_da1'
53 |     # Password
54 |     password = 'da_ge_da'
55 |     # Software ID, a required developer revenue-share parameter; obtain it under "My Software" in the developer console!
56 |     appid = 3129
57 |     # Software key, a required developer revenue-share parameter; obtain it under "My Software" in the developer console!
58 |     appkey = '40d5ad41c047179fc797631e3b9c3025'
59 |     # Image file
60 |     filename = 'image/captcha.jpg'
61 |     # Captcha type, e.g. 1004 means 4 alphanumeric characters. Types are billed differently; fill this in accurately or recognition accuracy suffers. All types: http://www.yundama.com/price.html
62 |     codetype = 5000
63 |     # Timeout, in seconds
64 |     timeout = 60
65 |     # Check
66 | 
67 |     yundama = YDMHttp(username, password, appid, appkey)
68 |     if (username == 'username'):
69 |         print('Please set the parameters above before testing')
70 |     else:
71 |         # Start recognition: image path, captcha type ID, timeout (seconds); returns the recognised text
72 |         return yundama.decode(file_path, codetype, timeout);
73 | 
74 | if __name__ == "__main__":
75 |     # Username
76 |     username = 'da_ge_da1'
77 |     # Password
78 |     password = 'da_ge_da'
79 |     # Software ID, a required developer revenue-share parameter; obtain it under "My Software" in the developer console!
80 |     appid = 3129
81 |     # Software key, a required developer revenue-share parameter; obtain it under "My Software" in the developer console!
82 | appkey = '40d5ad41c047179fc797631e3b9c3025' 83 | # 图片文件 84 | filename = 'image/captcha.jpg' 85 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 86 | codetype = 5000 87 | # 超时时间,秒 88 | timeout = 60 89 | # 检查 90 | if (username == 'username'): 91 | print ('请设置好相关参数再测试') 92 | else: 93 | # 初始化 94 | yundama = YDMHttp(username, password, appid, appkey) 95 | 96 | # 登陆云打码 97 | uid = yundama.login(); 98 | print('uid: %s' % uid) 99 | 100 | # 登陆云打码 101 | uid = yundama.login(); 102 | print ('uid: %s' % uid) 103 | 104 | # 查询余额 105 | balance = yundama.balance(); 106 | print ('balance: %s' % balance) 107 | 108 | # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果 109 | text = yundama.decode(filename, codetype, timeout); 110 | 111 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/tools/yundama_requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | import json 5 | import requests 6 | 7 | class YDMHttp(object): 8 | apiurl = 'http://api.yundama.com/api.php' 9 | username = '' 10 | password = '' 11 | appid = '' 12 | appkey = '' 13 | 14 | def __init__(self, username, password, appid, appkey): 15 | self.username = username 16 | self.password = password 17 | self.appid = str(appid) 18 | self.appkey = appkey 19 | 20 | def balance(self): 21 | data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey} 22 | response_data = requests.post(self.apiurl, data=data) 23 | ret_data = json.loads(response_data.text) 24 | if ret_data["ret"] == 0: 25 | print ("获取剩余积分", ret_data["balance"]) 26 | return ret_data["balance"] 27 | else: 28 | return None 29 | 30 | def login(self): 31 | data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey} 32 | response_data = requests.post(self.apiurl, data=data) 33 | ret_data = json.loads(response_data.text) 34 | if ret_data["ret"] == 0: 35 | print ("登录成功", ret_data["uid"]) 36 | return ret_data["uid"] 37 | else: 38 | return None 39 | 40 | def decode(self, filename, codetype, timeout): 41 | data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)} 42 | files = {'file': open(filename, 'rb')} 43 | response_data = requests.post(self.apiurl, files=files, data=data) 44 | ret_data = json.loads(response_data.text) 45 | if ret_data["ret"] == 0: 46 | print ("识别成功", ret_data["text"]) 47 | return ret_data["text"] 48 | else: 49 | return None 50 | 51 | def ydm(file_path): 52 | username = 'da_ge_da1' 53 | # 密码 54 | password = 'da_ge_da' 55 | # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 56 | appid = 3129 57 | # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 58 | appkey = '40d5ad41c047179fc797631e3b9c3025' 59 | # 图片文件 60 | filename = 'image/captcha.jpg' 61 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 62 | codetype = 5000 63 | # 超时时间,秒 64 | timeout = 60 65 | # 检查 66 | 67 | yundama = YDMHttp(username, password, appid, appkey) 68 | if (username == 'username'): 69 | print('请设置好相关参数再测试') 70 | else: 71 | # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果 72 | return yundama.decode(file_path, codetype, timeout); 73 | 74 | if __name__ == "__main__": 75 | # 用户名 76 | username = 'da_ge_da1' 77 | # 密码 78 | password = 'da_ge_da' 79 | # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 
80 | appid = 3129 81 | # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 82 | appkey = '40d5ad41c047179fc797631e3b9c3025' 83 | # 图片文件 84 | filename = 'image/captcha.jpg' 85 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 86 | codetype = 5000 87 | # 超时时间,秒 88 | timeout = 60 89 | # 检查 90 | if (username == 'username'): 91 | print ('请设置好相关参数再测试') 92 | else: 93 | # 初始化 94 | yundama = YDMHttp(username, password, appid, appkey) 95 | 96 | # 登陆云打码 97 | uid = yundama.login(); 98 | print('uid: %s' % uid) 99 | 100 | # 登陆云打码 101 | uid = yundama.login(); 102 | print ('uid: %s' % uid) 103 | 104 | # 查询余额 105 | balance = yundama.balance(); 106 | print ('balance: %s' % balance) 107 | 108 | # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果 109 | text = yundama.decode(filename, codetype, timeout); 110 | 111 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import codecs 8 | import json 9 | 10 | from scrapy.pipelines.images import ImagesPipeline 11 | from scrapy.exporters import JsonItemExporter 12 | from twisted.enterprise import adbapi 13 | from models.es_types import ArticleType 14 | from w3lib.html import remove_tags 15 | 16 | import MySQLdb 17 | import MySQLdb.cursors 18 | 19 | class ArticlespiderPipeline(object): 20 | def process_item(self, item, spider): 21 | return item 22 | 23 | 24 | class JsonWithEncodingPipeline(object): 25 | #自定义json文件的导出 26 | def __init__(self): 27 | self.file = codecs.open('article.json', 'w', encoding="utf-8") 28 | def process_item(self, item, spider): 29 | lines = json.dumps(dict(item), ensure_ascii=False) + "\n" 30 | self.file.write(lines) 31 | return item 32 | def spider_closed(self, spider): 33 | self.file.close() 34 | 35 | 36 | class MysqlPipeline(object): 37 | #采用同步的机制写入mysql 38 | def __init__(self): 39 | self.conn = MySQLdb.connect('192.168.0.106', 'root', 'root', 'article_spider', charset="utf8", use_unicode=True) 40 | self.cursor = self.conn.cursor() 41 | 42 | def process_item(self, item, spider): 43 | insert_sql = """ 44 | insert into jobbole_article(title, url, create_date, fav_nums) 45 | VALUES (%s, %s, %s, %s) 46 | """ 47 | self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"])) 48 | self.conn.commit() 49 | 50 | 51 | class MysqlTwistedPipline(object): 52 | def __init__(self, dbpool): 53 | self.dbpool = dbpool 54 | 55 | @classmethod 56 | def from_settings(cls, settings): 57 | dbparms = dict( 58 | host = settings["MYSQL_HOST"], 59 | db = settings["MYSQL_DBNAME"], 60 | user = settings["MYSQL_USER"], 61 | passwd = settings["MYSQL_PASSWORD"], 62 | charset='utf8', 63 | cursorclass=MySQLdb.cursors.DictCursor, 64 | use_unicode=True, 65 | ) 66 | dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms) 67 | 68 | return cls(dbpool) 69 | 70 | def process_item(self, item, spider): 71 | #使用twisted将mysql插入变成异步执行 72 | query = self.dbpool.runInteraction(self.do_insert, item) 73 | query.addErrback(self.handle_error, item, spider) #处理异常 74 | 75 | def handle_error(self, failure, item, spider): 76 | #处理异步插入的异常 77 | print (failure) 78 | 79 | def do_insert(self, cursor, item): 80 | #执行具体的插入 81 | #根据不同的item 构建不同的sql语句并插入到mysql中 82 | insert_sql, params = 
item.get_insert_sql() 83 | cursor.execute(insert_sql, params) 84 | 85 | 86 | class JsonExporterPipleline(object): 87 | #调用scrapy提供的json export导出json文件 88 | def __init__(self): 89 | self.file = open('articleexport.json', 'wb') 90 | self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False) 91 | self.exporter.start_exporting() 92 | 93 | def close_spider(self, spider): 94 | self.exporter.finish_exporting() 95 | self.file.close() 96 | 97 | def process_item(self, item, spider): 98 | self.exporter.export_item(item) 99 | return item 100 | 101 | 102 | class ArticleImagePipeline(ImagesPipeline): 103 | def item_completed(self, results, item, info): 104 | if "front_image_url" in item: 105 | for ok, value in results: 106 | image_file_path = value["path"] 107 | item["front_image_path"] = image_file_path 108 | 109 | return item 110 | 111 | 112 | class ElasticsearchPipeline(object): 113 | #将数据写入到es中 114 | 115 | def process_item(self, item, spider): 116 | #将item转换为es的数据 117 | item.save_to_es() 118 | 119 | return item -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import codecs 8 | import json 9 | 10 | from scrapy.pipelines.images import ImagesPipeline 11 | from scrapy.exporters import JsonItemExporter 12 | from twisted.enterprise import adbapi 13 | from models.es_types import ArticleType 14 | from w3lib.html import remove_tags 15 | 16 | import MySQLdb 17 | import MySQLdb.cursors 18 | 19 | class ArticlespiderPipeline(object): 20 | def process_item(self, item, spider): 21 | return item 22 | 23 | 24 | class JsonWithEncodingPipeline(object): 25 | #自定义json文件的导出 26 | def __init__(self): 27 | self.file = codecs.open('article.json', 'w', encoding="utf-8") 28 | def process_item(self, item, spider): 29 | lines = json.dumps(dict(item), ensure_ascii=False) + "\n" 30 | self.file.write(lines) 31 | return item 32 | def spider_closed(self, spider): 33 | self.file.close() 34 | 35 | 36 | class MysqlPipeline(object): 37 | #采用同步的机制写入mysql 38 | def __init__(self): 39 | self.conn = MySQLdb.connect('192.168.0.106', 'root', 'root', 'article_spider', charset="utf8", use_unicode=True) 40 | self.cursor = self.conn.cursor() 41 | 42 | def process_item(self, item, spider): 43 | insert_sql = """ 44 | insert into jobbole_article(title, url, create_date, fav_nums) 45 | VALUES (%s, %s, %s, %s) 46 | """ 47 | self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"])) 48 | self.conn.commit() 49 | 50 | 51 | class MysqlTwistedPipline(object): 52 | def __init__(self, dbpool): 53 | self.dbpool = dbpool 54 | 55 | @classmethod 56 | def from_settings(cls, settings): 57 | dbparms = dict( 58 | host = settings["MYSQL_HOST"], 59 | db = settings["MYSQL_DBNAME"], 60 | user = settings["MYSQL_USER"], 61 | passwd = settings["MYSQL_PASSWORD"], 62 | charset='utf8', 63 | cursorclass=MySQLdb.cursors.DictCursor, 64 | use_unicode=True, 65 | ) 66 | dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms) 67 | 68 | return cls(dbpool) 69 | 70 | def process_item(self, item, spider): 71 | #使用twisted将mysql插入变成异步执行 72 | query = self.dbpool.runInteraction(self.do_insert, item) 73 | query.addErrback(self.handle_error, item, 
spider) #处理异常 74 | 75 | def handle_error(self, failure, item, spider): 76 | #处理异步插入的异常 77 | print (failure) 78 | 79 | def do_insert(self, cursor, item): 80 | #执行具体的插入 81 | #根据不同的item 构建不同的sql语句并插入到mysql中 82 | insert_sql, params = item.get_insert_sql() 83 | cursor.execute(insert_sql, params) 84 | 85 | 86 | class JsonExporterPipleline(object): 87 | #调用scrapy提供的json export导出json文件 88 | def __init__(self): 89 | self.file = open('articleexport.json', 'wb') 90 | self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False) 91 | self.exporter.start_exporting() 92 | 93 | def close_spider(self, spider): 94 | self.exporter.finish_exporting() 95 | self.file.close() 96 | 97 | def process_item(self, item, spider): 98 | self.exporter.export_item(item) 99 | return item 100 | 101 | 102 | class ArticleImagePipeline(ImagesPipeline): 103 | def item_completed(self, results, item, info): 104 | if "front_image_url" in item: 105 | for ok, value in results: 106 | image_file_path = value["path"] 107 | item["front_image_path"] = image_file_path 108 | 109 | return item 110 | 111 | 112 | class ElasticsearchPipeline(object): 113 | #将数据写入到es中 114 | 115 | def process_item(self, item, spider): 116 | #将item转换为es的数据 117 | item.save_to_es() 118 | 119 | return item -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 33 | 34 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 33 | 34 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 33 | 34 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/views.py: -------------------------------------------------------------------------------- 1 | import json 2 | from django.shortcuts import render 3 | from django.views.generic.base import View 4 | from search.models import ArticleType 5 | from django.http import HttpResponse 6 | from elasticsearch import Elasticsearch 7 | from datetime import datetime 8 | import redis 9 | 10 | client = Elasticsearch(hosts=["127.0.0.1"]) 11 | redis_cli = redis.StrictRedis() 12 | 13 | 14 | class IndexView(View): 15 | #首页 16 | def get(self, request): 17 | topn_search = redis_cli.zrevrangebyscore("search_keywords_set", "+inf", "-inf", start=0, num=5) 18 | return render(request, "index.html", {"topn_search":topn_search}) 19 | 20 | # Create your views here. 
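# NOTE: the SearchSuggest view below queries a `suggest` completion field, so the
# Elasticsearch mapping must declare one. That mapping lives in search/models.py
# (imported above), which is not reproduced in this excerpt; a minimal
# elasticsearch-dsl 5.x sketch of such a mapping -- field and index names taken
# from the queries in this file, the "ik_max_word" analyzer and everything else an
# assumption -- could look like this:
#
#     from elasticsearch_dsl import DocType, Completion, Text, Keyword, Date
#     from elasticsearch_dsl.connections import connections
#
#     connections.create_connection(hosts=["127.0.0.1"])
#
#     class ArticleType(DocType):
#         suggest = Completion()                  # feeds the completion suggester
#         title = Text(analyzer="ik_max_word")
#         content = Text(analyzer="ik_max_word")
#         tags = Text(analyzer="ik_max_word")
#         create_date = Date()
#         url = Keyword()
#
#         class Meta:
#             index = "jobbole"
#             doc_type = "article"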
21 | class SearchSuggest(View):
22 |     def get(self, request):
23 |         key_words = request.GET.get('s','')
24 |         re_datas = []
25 |         if key_words:
26 |             s = ArticleType.search()
27 |             s = s.suggest('my_suggest', key_words, completion={
28 |                 "field":"suggest", "fuzzy":{
29 |                     "fuzziness":2
30 |                 },
31 |                 "size": 10
32 |             })
33 |             suggestions = s.execute_suggest()
34 |             for match in suggestions.my_suggest[0].options:
35 |                 source = match._source
36 |                 re_datas.append(source["title"])
37 |         return HttpResponse(json.dumps(re_datas), content_type="application/json")
38 | 
39 | 
40 | class SearchView(View):
41 |     def get(self, request):
42 |         key_words = request.GET.get("q","")
43 |         s_type = request.GET.get("s_type", "article")
44 | 
45 |         redis_cli.zincrby("search_keywords_set", key_words)
46 | 
47 |         topn_search = redis_cli.zrevrangebyscore("search_keywords_set", "+inf", "-inf", start=0, num=5)
48 |         page = request.GET.get("p", "1")
49 |         try:
50 |             page = int(page)
51 |         except ValueError:
52 |             page = 1
53 | 
54 |         jobbole_count = redis_cli.get("jobbole_count")
55 |         start_time = datetime.now()
56 |         response = client.search(
57 |             index= "jobbole",
58 |             body={
59 |                 "query":{
60 |                     "multi_match":{
61 |                         "query":key_words,
62 |                         "fields":["tags", "title", "content"]
63 |                     }
64 |                 },
65 |                 "from":(page-1)*10,
66 |                 "size":10,
67 |                 "highlight": {
68 |                     "pre_tags": ['<span class="keyWord">'],
69 |                     "post_tags": ['</span>'],
70 |                     "fields": {
71 |                         "title": {},
72 |                         "content": {},
73 |                     }
74 |                 }
75 |             }
76 |         )
77 | 
78 |         end_time = datetime.now()
79 |         last_seconds = (end_time-start_time).total_seconds()
80 |         total_nums = response["hits"]["total"]
81 |         if (total_nums % 10) > 0:
82 |             page_nums = int(total_nums/10) +1
83 |         else:
84 |             page_nums = int(total_nums/10)
85 |         hit_list = []
86 |         for hit in response["hits"]["hits"]:
87 |             hit_dict = {}
88 |             if "title" in hit["highlight"]:
89 |                 hit_dict["title"] = "".join(hit["highlight"]["title"])
90 |             else:
91 |                 hit_dict["title"] = hit["_source"]["title"]
92 |             if "content" in hit["highlight"]:
93 |                 hit_dict["content"] = "".join(hit["highlight"]["content"])[:500]
94 |             else:
95 |                 hit_dict["content"] = hit["_source"]["content"][:500]
96 | 
97 |             hit_dict["create_date"] = hit["_source"]["create_date"]
98 |             hit_dict["url"] = hit["_source"]["url"]
99 |             hit_dict["score"] = hit["_score"]
100 | 
101 |             hit_list.append(hit_dict)
102 | 
103 |         return render(request, "result.html", {"page":page,
104 |                                                "all_hits":hit_list,
105 |                                                "key_words":key_words,
106 |                                                "total_nums":total_nums,
107 |                                                "page_nums":page_nums,
108 |                                                "last_seconds":last_seconds,
109 |                                                "jobbole_count":jobbole_count,
110 |                                                "topn_search":topn_search})
111 | 
-------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/settings.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import os
4 | 
5 | # Scrapy settings for ArticleSpider project
6 | #
7 | # For simplicity, this file contains only settings considered important or
8 | # commonly used.
You can find more settings consulting the documentation: 9 | # 10 | # http://doc.scrapy.org/en/latest/topics/settings.html 11 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 12 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 13 | 14 | BOT_NAME = 'ArticleSpider' 15 | 16 | SPIDER_MODULES = ['ArticleSpider.spiders'] 17 | NEWSPIDER_MODULE = 'ArticleSpider.spiders' 18 | 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | #USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)' 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | #CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | DOWNLOAD_DELAY = 10 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 | # SPIDER_MIDDLEWARES = { 52 | # 'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543, 53 | # } 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | # DOWNLOADER_MIDDLEWARES = { 58 | # 'ArticleSpider.middlewares.JSPageMiddleware': 1, 59 | # # 'ArticleSpider.middlewares.RandomUserAgentMiddlware': 543, 60 | # # 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 61 | # } 62 | 63 | # Enable or disable extensions 64 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 65 | #EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | #} 68 | 69 | # Configure item pipelines 70 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 71 | ITEM_PIPELINES = { 72 | # 'ArticleSpider.pipelines.JsonExporterPipleline': 2, 73 | # # 'scrapy.pipelines.images.ImagesPipeline': 1, 74 | # 'ArticleSpider.pipelines.ArticleImagePipeline': 1, 75 | # 'ArticleSpider.pipelines.MysqlTwistedPipline': 1, 76 | 'ArticleSpider.pipelines.ElasticsearchPipeline': 1 77 | } 78 | IMAGES_URLS_FIELD = "front_image_url" 79 | project_dir = os.path.abspath(os.path.dirname(__file__)) 80 | IMAGES_STORE = os.path.join(project_dir, 'images') 81 | 82 | import sys 83 | BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) 84 | sys.path.insert(0, os.path.join(BASE_DIR, 'ArticleSpider')) 85 | 86 | USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" 87 | 88 | RANDOM_UA_TYPE = "random" 89 | # 90 | # IMAGES_MIN_HEIGHT = 100 91 | # IMAGES_MIN_WIDTH = 100 92 | 93 | # Enable and configure the AutoThrottle extension (disabled by default) 94 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 95 | AUTOTHROTTLE_ENABLED = True 96 | # The initial download delay 97 | #AUTOTHROTTLE_START_DELAY = 5 98 | # The maximum 
download delay to be set in case of high latencies 99 | #AUTOTHROTTLE_MAX_DELAY = 60 100 | # The average number of requests Scrapy should be sending in parallel to 101 | # each remote server 102 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 103 | # Enable showing throttling stats for every response received: 104 | #AUTOTHROTTLE_DEBUG = False 105 | 106 | # Enable and configure HTTP caching (disabled by default) 107 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 108 | #HTTPCACHE_ENABLED = True 109 | #HTTPCACHE_EXPIRATION_SECS = 0 110 | #HTTPCACHE_DIR = 'httpcache' 111 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 112 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 113 | 114 | MYSQL_HOST = "127.0.0.1" 115 | MYSQL_DBNAME = "article_spider" 116 | MYSQL_USER = "root" 117 | MYSQL_PASSWORD = "root" 118 | 119 | 120 | SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" 121 | SQL_DATE_FORMAT = "%Y-%m-%d" 122 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | 5 | # Scrapy settings for ArticleSpider project 6 | # 7 | # For simplicity, this file contains only settings considered important or 8 | # commonly used. You can find more settings consulting the documentation: 9 | # 10 | # http://doc.scrapy.org/en/latest/topics/settings.html 11 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 12 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 13 | 14 | BOT_NAME = 'ArticleSpider' 15 | 16 | SPIDER_MODULES = ['ArticleSpider.spiders'] 17 | NEWSPIDER_MODULE = 'ArticleSpider.spiders' 18 | 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | #USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)' 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | #CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | DOWNLOAD_DELAY = 10 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 | # SPIDER_MIDDLEWARES = { 52 | # 'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543, 53 | # } 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | # DOWNLOADER_MIDDLEWARES = { 58 | # 'ArticleSpider.middlewares.JSPageMiddleware': 1, 59 | # # 'ArticleSpider.middlewares.RandomUserAgentMiddlware': 543, 60 | # # 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 61 | # } 62 | 63 | # Enable 
or disable extensions 64 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 65 | #EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | #} 68 | 69 | # Configure item pipelines 70 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 71 | ITEM_PIPELINES = { 72 | # 'ArticleSpider.pipelines.JsonExporterPipleline': 2, 73 | # # 'scrapy.pipelines.images.ImagesPipeline': 1, 74 | # 'ArticleSpider.pipelines.ArticleImagePipeline': 1, 75 | # 'ArticleSpider.pipelines.MysqlTwistedPipline': 1, 76 | 'ArticleSpider.pipelines.ElasticsearchPipeline': 1 77 | } 78 | IMAGES_URLS_FIELD = "front_image_url" 79 | project_dir = os.path.abspath(os.path.dirname(__file__)) 80 | IMAGES_STORE = os.path.join(project_dir, 'images') 81 | 82 | import sys 83 | BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) 84 | sys.path.insert(0, os.path.join(BASE_DIR, 'ArticleSpider')) 85 | 86 | USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" 87 | 88 | RANDOM_UA_TYPE = "random" 89 | # 90 | # IMAGES_MIN_HEIGHT = 100 91 | # IMAGES_MIN_WIDTH = 100 92 | 93 | # Enable and configure the AutoThrottle extension (disabled by default) 94 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 95 | AUTOTHROTTLE_ENABLED = True 96 | # The initial download delay 97 | #AUTOTHROTTLE_START_DELAY = 5 98 | # The maximum download delay to be set in case of high latencies 99 | #AUTOTHROTTLE_MAX_DELAY = 60 100 | # The average number of requests Scrapy should be sending in parallel to 101 | # each remote server 102 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 103 | # Enable showing throttling stats for every response received: 104 | #AUTOTHROTTLE_DEBUG = False 105 | 106 | # Enable and configure HTTP caching (disabled by default) 107 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 108 | #HTTPCACHE_ENABLED = True 109 | #HTTPCACHE_EXPIRATION_SECS = 0 110 | #HTTPCACHE_DIR = 'httpcache' 111 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 112 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 113 | 114 | MYSQL_HOST = "127.0.0.1" 115 | MYSQL_DBNAME = "article_spider" 116 | MYSQL_USER = "root" 117 | MYSQL_PASSWORD = "root" 118 | 119 | 120 | SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" 121 | SQL_DATE_FORMAT = "%Y-%m-%d" 122 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/dupefilter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from scrapy.dupefilters import BaseDupeFilter 5 | from scrapy.utils.request import request_fingerprint 6 | 7 | from . import defaults 8 | from .connection import get_redis_from_settings 9 | from ScrapyRedisTest.utils.bloomfilter import conn, PyBloomFilter 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | # TODO: Rename class to RedisDupeFilter. 16 | class RFPDupeFilter(BaseDupeFilter): 17 | """Redis-based request duplicates filter. 18 | 19 | This class can also be used with default Scrapy's scheduler. 20 | 21 | """ 22 | 23 | logger = logger 24 | 25 | def __init__(self, server, key, debug=False): 26 | """Initialize the duplicates filter. 27 | 28 | Parameters 29 | ---------- 30 | server : redis.StrictRedis 31 | The redis server instance. 32 | key : str 33 | Redis key Where to store fingerprints. 34 | debug : bool, optional 35 | Whether to log filtered requests. 
36 | 37 | """ 38 | self.server = server 39 | self.key = key 40 | self.debug = debug 41 | self.logdupes = True 42 | 43 | self.bf = PyBloomFilter(conn=conn, key=key) 44 | 45 | @classmethod 46 | def from_settings(cls, settings): 47 | """Returns an instance from given settings. 48 | 49 | This uses by default the key ``dupefilter:``. When using the 50 | ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as 51 | it needs to pass the spider name in the key. 52 | 53 | Parameters 54 | ---------- 55 | settings : scrapy.settings.Settings 56 | 57 | Returns 58 | ------- 59 | RFPDupeFilter 60 | A RFPDupeFilter instance. 61 | 62 | 63 | """ 64 | server = get_redis_from_settings(settings) 65 | # XXX: This creates one-time key. needed to support to use this 66 | # class as standalone dupefilter with scrapy's default scheduler 67 | # if scrapy passes spider on open() method this wouldn't be needed 68 | # TODO: Use SCRAPY_JOB env as default and fallback to timestamp. 69 | key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())} 70 | debug = settings.getbool('DUPEFILTER_DEBUG') 71 | return cls(server, key=key, debug=debug) 72 | 73 | @classmethod 74 | def from_crawler(cls, crawler): 75 | """Returns instance from crawler. 76 | 77 | Parameters 78 | ---------- 79 | crawler : scrapy.crawler.Crawler 80 | 81 | Returns 82 | ------- 83 | RFPDupeFilter 84 | Instance of RFPDupeFilter. 85 | 86 | """ 87 | return cls.from_settings(crawler.settings) 88 | 89 | def request_seen(self, request): 90 | """Returns True if request was already seen. 91 | 92 | Parameters 93 | ---------- 94 | request : scrapy.http.Request 95 | 96 | Returns 97 | ------- 98 | bool 99 | 100 | """ 101 | fp = self.request_fingerprint(request) 102 | 103 | if self.bf.is_exist(fp): 104 | return True 105 | else: 106 | self.bf.add(fp) 107 | return False 108 | # This returns the number of values added, zero if already exists. 109 | # added = self.server.sadd(self.key, fp) 110 | # return added == 0 111 | 112 | def request_fingerprint(self, request): 113 | """Returns a fingerprint for a given request. 114 | 115 | Parameters 116 | ---------- 117 | request : scrapy.http.Request 118 | 119 | Returns 120 | ------- 121 | str 122 | 123 | """ 124 | return request_fingerprint(request) 125 | 126 | def close(self, reason=''): 127 | """Delete data on close. Called by Scrapy's scheduler. 128 | 129 | Parameters 130 | ---------- 131 | reason : str, optional 132 | 133 | """ 134 | self.clear() 135 | 136 | def clear(self): 137 | """Clears fingerprints data.""" 138 | self.server.delete(self.key) 139 | 140 | def log(self, request, spider): 141 | """Logs given request. 142 | 143 | Parameters 144 | ---------- 145 | request : scrapy.http.Request 146 | spider : scrapy.spiders.Spider 147 | 148 | """ 149 | if self.debug: 150 | msg = "Filtered duplicate request: %(request)s" 151 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 152 | elif self.logdupes: 153 | msg = ("Filtered duplicate request %(request)s" 154 | " - no more duplicates will be shown" 155 | " (see DUPEFILTER_DEBUG to show all duplicates)") 156 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 157 | self.logdupes = False 158 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/queue.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.reqser import request_to_dict, request_from_dict 2 | 3 | from . 
import picklecompat 4 | 5 | 6 | class Base(object): 7 | """Per-spider base queue class""" 8 | 9 | def __init__(self, server, spider, key, serializer=None): 10 | """Initialize per-spider redis queue. 11 | 12 | Parameters 13 | ---------- 14 | server : StrictRedis 15 | Redis client instance. 16 | spider : Spider 17 | Scrapy spider instance. 18 | key: str 19 | Redis key where to put and get messages. 20 | serializer : object 21 | Serializer object with ``loads`` and ``dumps`` methods. 22 | 23 | """ 24 | if serializer is None: 25 | # Backward compatibility. 26 | # TODO: deprecate pickle. 27 | serializer = picklecompat 28 | if not hasattr(serializer, 'loads'): 29 | raise TypeError("serializer does not implement 'loads' function: %r" 30 | % serializer) 31 | if not hasattr(serializer, 'dumps'): 32 | raise TypeError("serializer '%s' does not implement 'dumps' function: %r" 33 | % serializer) 34 | 35 | self.server = server 36 | self.spider = spider 37 | self.key = key % {'spider': spider.name} 38 | self.serializer = serializer 39 | 40 | def _encode_request(self, request): 41 | """Encode a request object""" 42 | obj = request_to_dict(request, self.spider) 43 | return self.serializer.dumps(obj) 44 | 45 | def _decode_request(self, encoded_request): 46 | """Decode an request previously encoded""" 47 | obj = self.serializer.loads(encoded_request) 48 | return request_from_dict(obj, self.spider) 49 | 50 | def __len__(self): 51 | """Return the length of the queue""" 52 | raise NotImplementedError 53 | 54 | def push(self, request): 55 | """Push a request""" 56 | raise NotImplementedError 57 | 58 | def pop(self, timeout=0): 59 | """Pop a request""" 60 | raise NotImplementedError 61 | 62 | def clear(self): 63 | """Clear queue/stack""" 64 | self.server.delete(self.key) 65 | 66 | 67 | class FifoQueue(Base): 68 | """Per-spider FIFO queue""" 69 | 70 | def __len__(self): 71 | """Return the length of the queue""" 72 | return self.server.llen(self.key) 73 | 74 | def push(self, request): 75 | """Push a request""" 76 | self.server.lpush(self.key, self._encode_request(request)) 77 | 78 | def pop(self, timeout=0): 79 | """Pop a request""" 80 | if timeout > 0: 81 | data = self.server.brpop(self.key, timeout) 82 | if isinstance(data, tuple): 83 | data = data[1] 84 | else: 85 | data = self.server.rpop(self.key) 86 | if data: 87 | return self._decode_request(data) 88 | 89 | 90 | class PriorityQueue(Base): 91 | """Per-spider priority queue abstraction using redis' sorted set""" 92 | 93 | def __len__(self): 94 | """Return the length of the queue""" 95 | return self.server.zcard(self.key) 96 | 97 | def push(self, request): 98 | """Push a request""" 99 | data = self._encode_request(request) 100 | score = -request.priority 101 | # We don't use zadd method as the order of arguments change depending on 102 | # whether the class is Redis or StrictRedis, and the option of using 103 | # kwargs only accepts strings, not bytes. 
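# Illustration only (not part of scrapy_redis): the signature differences the
# comment above refers to, assuming the redis-py 2.x clients this code targets --
#
#     StrictRedis().zadd("key", 1.0, b"member")    # score first, then member
#     Redis().zadd("key", b"member", 1.0)          # legacy class: member first
#     StrictRedis().zadd("key", member=1.0)        # kwargs form, str members only
#
# redis-py 3.x changed the API again to a mapping, zadd("key", {b"member": 1.0}),
# so issuing the raw ZADD command below works the same way with any client.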
104 | self.server.execute_command('ZADD', self.key, score, data) 105 | 106 | def pop(self, timeout=0): 107 | """ 108 | Pop a request 109 | timeout not support in this queue class 110 | """ 111 | # use atomic range/remove using multi/exec 112 | pipe = self.server.pipeline() 113 | pipe.multi() 114 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 115 | results, count = pipe.execute() 116 | if results: 117 | return self._decode_request(results[0]) 118 | 119 | 120 | class LifoQueue(Base): 121 | """Per-spider LIFO queue.""" 122 | 123 | def __len__(self): 124 | """Return the length of the stack""" 125 | return self.server.llen(self.key) 126 | 127 | def push(self, request): 128 | """Push a request""" 129 | self.server.lpush(self.key, self._encode_request(request)) 130 | 131 | def pop(self, timeout=0): 132 | """Pop a request""" 133 | if timeout > 0: 134 | data = self.server.blpop(self.key, timeout) 135 | if isinstance(data, tuple): 136 | data = data[1] 137 | else: 138 | data = self.server.lpop(self.key) 139 | 140 | if data: 141 | return self._decode_request(data) 142 | 143 | 144 | # TODO: Deprecate the use of these names. 145 | SpiderQueue = FifoQueue 146 | SpiderStack = LifoQueue 147 | SpiderPriorityQueue = PriorityQueue 148 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/css/result.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #hd{padding:20px 10px;} 4 | #bd{margin-bottom:40px;} 5 | .logo{float:left;margin-right:30px; height:33px;} 6 | /*input搜索区域*/ 7 | .inputArea{float:left;position:relative;} 8 | .inputArea .searchInput{border:1px solid #bfbfbf;padding:0 15px;outline:none;height:35px;*line-height:35px;width:350px; background:url(../img/inputbg.png);font-size:14px;} 9 | .inputArea .searchButton{position:absolute;left:382px;top:0;*top:1px;*left:381px;width:106px;height:38px;background:url(../img/btn_min.png) no-repeat;border:none;cursor:pointer;} 10 | 11 | /*返回高级搜索*/ 12 | .inputArea .advanced{position:absolute;font-size:14px;left:500px;top:12px;width:60px;text-decoration:underline;} 13 | 14 | /*分界区域,导航*/ 15 | .nav{margin-bottom:24px;height:31px;background:#f9f9f9;border-bottom:1px solid #e0e0e0;padding:5px 0 0 210px;} 16 | .searchList{float:left;padding-left:5px;} 17 | .searchList .searchItem{float:left;margin-right:15px;font-size:14px;padding:0 3px 2px 3px;cursor:pointer;height:26px; line-height:26px;} 18 | .searchList .searchItem.current{color:#0080cc;border-bottom:3px solid #9cc813;font-weight:bold;} 19 | .nav .tips{color:#969696;font-size:12px;line-height:24px;*line-height:26px;} 20 | #container.sideBarHide .nav{padding-left:35px;} 21 | 22 | /*#main区域样式*/ 23 | #main{padding:0 215px 0 182px;} 24 | #main.sideBarHide{padding-left:10px;} 25 | /*侧边栏搜索条件*/ 26 | .sideBar{position:relative;float:left;margin-left:-182px;width:182px;} 27 | .sideBar .subfieldContext{margin-bottom:20px;padding-left:25px;} 28 | .sideBar .subfieldContext li{margin-bottom:5px;cursor:pointer;} 29 | .sideBar .subfieldContext input[type=text]{width:75px;} 30 | .sideBar .unit{color:#787878;} 31 | 32 | /*更多按钮*/ 33 | .sideBar .more a:hover{text-decoration:none;} 34 | .sideBar .more .moreIcon{display:inline-block;position:relative;top:-1px;*top:-3px;left:2px;*left:-1px;width:9px;height:5px;background:url(../img/more.png);} 35 | .sideBar .more.show .moreIcon{background:url(../img/down.png);top:-2px;} 36 | 37 | .sideBar .reset{padding-left:25px;} 38 | /*siderBar区域显隐控制*/ 
39 | .sideBar .sideBarShowHide{position:absolute;right:0px;top:20px;height:177px;width:1px; background:url(../img/line.png) right;} 40 | .sideBar .sideBarShowHide a{position:absolute;top:70px;left:-11px;display:inline-block;width:12px;height:31px;background:url(../img/lr.png);} 41 | 42 | .sideBar .sideBarShowHide a:hover{background-position:0 -31px;} 43 | 44 | /*左侧收起样式*/ 45 | #main.sideBarHide .sideBar{margin-left:-191px;*margin-left:-182px;} 46 | #main.sideBarHide .sideBar .sideBarShowHide{-moz-transform:rotate(180deg); -o-transform:rotate(180deg); -webkit-transform:rotate(180deg); transform:rotate(180deg);} 47 | #main.sideBarHide .sideBar .sideBarShowHide a{*background:url(../img/ll.png);} 48 | #main.sideBarHide .sideBar .sideBarShowHide a:hover{*background-position:0 -31px;} 49 | #main.sideBarHide .sideBar .sideBarShowHide{background:none;} 50 | 51 | .resultArea{float:left;width:100%;} 52 | .resultArea .resultTotal{position:relative;padding-left:30px;margin-bottom:20px;} 53 | .resultArea .resultTotal .info{color:#9a9a9a;} 54 | .resultArea .resultTotal .orderOpt{position:absolute;right:50px;} 55 | .resultArea .resultTotal .orderOpt a{margin-right:10px;color:#0080cc;} 56 | 57 | /*搜索结果列表区域*/ 58 | .resultArea .resultList{padding-left:30px;} 59 | .resultArea .resultList .resultItem{margin-bottom:20px;} 60 | .resultArea .resultList .resultItem{margin-bottom:30px;} 61 | .resultArea .resultList .itemHead{margin-bottom:5px;color:#767676;} 62 | .resultArea .resultList .itemHead .keyWord{color:#d90909;} 63 | .resultArea .resultList .itemBody .keyWord{color:#d90909;} 64 | .resultArea .resultList .itemHead a.title{font-size:16px;color:#0080cc;text-decoration:underline;} 65 | .resultArea .resultList .itemHead .value{color:#008000;} 66 | .resultArea .resultList .itemHead .divsion{margin:0 5px;} 67 | .resultArea .resultList .itemHead .fileType{margin-right:10px;} 68 | 69 | /*搜索内容主体*/ 70 | .resultArea .resultList .itemBody{margin-bottom:5px;line-height:18px;width:90%;} 71 | .resultArea .resultList .itemFoot{color:#008000;} 72 | .resultArea .resultList .itemFoot .info{margin-right:10px;} 73 | 74 | .resultArea .pagination{margin-bottom:25px;padding-left:32px;} 75 | /*相关搜索*/ 76 | .resultArea .dependSearch{margin-bottom:30px;padding-left:32px;font-size:14px;} 77 | .resultArea .dependSearch h6{float:left;margin-right:15px;font-weight:bold;} 78 | .resultArea .dependSearch p{margin-bottom:5px;} 79 | .resultArea .dependSearch a{display:inline-block;margin-right:15px;text-decoration:underline;width:90px; white-space:nowrap; overflow:hidden;text-overflow:ellipsis;} 80 | .resultArea .searchInResult{padding-left:35px;} 81 | .resultArea .searchInResult .inResult{position:absolute;right:-190px;top:8px;font-size:14px;text-decoration:underline;} 82 | .resultArea .searchInResult .searchButton{left:417px;} 83 | /*历史搜索区域*/ 84 | .historyArea{float:right;margin-right:-212px;width:212px;} 85 | .historyArea h6{margin-bottom:10px;font-weight:bold;} 86 | .historyArea .historyList{margin-bottom:20px;} 87 | .historyArea .historyList li{margin-bottom:5px;} 88 | 89 | 90 | 91 | /*左侧分栏区域*/ 92 | .subfield{margin-bottom:5px;font-size:14px;font-weight:bold;padding:2px 0 2px 24px;} 93 | .subfield:first-child{border-left:4px solid #9cc813;padding-left:20px;} 94 | 95 | 96 | 97 | /*立即搜索样式*/ 98 | .subfieldContent .search{margin:45px 0 0 135px;width:130px;height:36px;background:url(../img/btnbg.png); font-weight:bold;border:none;border:1px solid #bfbfbf;line-height:36px;} 99 | /*联想下拉区域*/ 100 | .inputArea 
.dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 101 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 102 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 103 | 104 | 105 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/css/result.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #hd{padding:20px 10px;} 4 | #bd{margin-bottom:40px;} 5 | .logo{float:left;margin-right:30px; height:33px;} 6 | /*input搜索区域*/ 7 | .inputArea{float:left;position:relative;} 8 | .inputArea .searchInput{border:1px solid #bfbfbf;padding:0 15px;outline:none;height:35px;*line-height:35px;width:350px; background:url(../img/inputbg.png);font-size:14px;} 9 | .inputArea .searchButton{position:absolute;left:382px;top:0;*top:1px;*left:381px;width:106px;height:38px;background:url(../img/btn_min.png) no-repeat;border:none;cursor:pointer;} 10 | 11 | /*返回高级搜索*/ 12 | .inputArea .advanced{position:absolute;font-size:14px;left:500px;top:12px;width:60px;text-decoration:underline;} 13 | 14 | /*分界区域,导航*/ 15 | .nav{margin-bottom:24px;height:31px;background:#f9f9f9;border-bottom:1px solid #e0e0e0;padding:5px 0 0 210px;} 16 | .searchList{float:left;padding-left:5px;} 17 | .searchList .searchItem{float:left;margin-right:15px;font-size:14px;padding:0 3px 2px 3px;cursor:pointer;height:26px; line-height:26px;} 18 | .searchList .searchItem.current{color:#0080cc;border-bottom:3px solid #9cc813;font-weight:bold;} 19 | .nav .tips{color:#969696;font-size:12px;line-height:24px;*line-height:26px;} 20 | #container.sideBarHide .nav{padding-left:35px;} 21 | 22 | /*#main区域样式*/ 23 | #main{padding:0 215px 0 182px;} 24 | #main.sideBarHide{padding-left:10px;} 25 | /*侧边栏搜索条件*/ 26 | .sideBar{position:relative;float:left;margin-left:-182px;width:182px;} 27 | .sideBar .subfieldContext{margin-bottom:20px;padding-left:25px;} 28 | .sideBar .subfieldContext li{margin-bottom:5px;cursor:pointer;} 29 | .sideBar .subfieldContext input[type=text]{width:75px;} 30 | .sideBar .unit{color:#787878;} 31 | 32 | /*更多按钮*/ 33 | .sideBar .more a:hover{text-decoration:none;} 34 | .sideBar .more .moreIcon{display:inline-block;position:relative;top:-1px;*top:-3px;left:2px;*left:-1px;width:9px;height:5px;background:url(../img/more.png);} 35 | .sideBar .more.show .moreIcon{background:url(../img/down.png);top:-2px;} 36 | 37 | .sideBar .reset{padding-left:25px;} 38 | /*siderBar区域显隐控制*/ 39 | .sideBar .sideBarShowHide{position:absolute;right:0px;top:20px;height:177px;width:1px; background:url(../img/line.png) right;} 40 | .sideBar .sideBarShowHide a{position:absolute;top:70px;left:-11px;display:inline-block;width:12px;height:31px;background:url(../img/lr.png);} 41 | 42 | .sideBar .sideBarShowHide a:hover{background-position:0 -31px;} 43 | 44 | /*左侧收起样式*/ 45 | #main.sideBarHide .sideBar{margin-left:-191px;*margin-left:-182px;} 46 | #main.sideBarHide .sideBar .sideBarShowHide{-moz-transform:rotate(180deg); -o-transform:rotate(180deg); -webkit-transform:rotate(180deg); transform:rotate(180deg);} 47 | #main.sideBarHide .sideBar .sideBarShowHide a{*background:url(../img/ll.png);} 48 | #main.sideBarHide .sideBar .sideBarShowHide a:hover{*background-position:0 -31px;} 49 | #main.sideBarHide .sideBar .sideBarShowHide{background:none;} 50 | 51 | .resultArea{float:left;width:100%;} 52 | .resultArea 
.resultTotal{position:relative;padding-left:30px;margin-bottom:20px;} 53 | .resultArea .resultTotal .info{color:#9a9a9a;} 54 | .resultArea .resultTotal .orderOpt{position:absolute;right:50px;} 55 | .resultArea .resultTotal .orderOpt a{margin-right:10px;color:#0080cc;} 56 | 57 | /*搜索结果列表区域*/ 58 | .resultArea .resultList{padding-left:30px;} 59 | .resultArea .resultList .resultItem{margin-bottom:20px;} 60 | .resultArea .resultList .resultItem{margin-bottom:30px;} 61 | .resultArea .resultList .itemHead{margin-bottom:5px;color:#767676;} 62 | .resultArea .resultList .itemHead .keyWord{color:#d90909;} 63 | .resultArea .resultList .itemBody .keyWord{color:#d90909;} 64 | .resultArea .resultList .itemHead a.title{font-size:16px;color:#0080cc;text-decoration:underline;} 65 | .resultArea .resultList .itemHead .value{color:#008000;} 66 | .resultArea .resultList .itemHead .divsion{margin:0 5px;} 67 | .resultArea .resultList .itemHead .fileType{margin-right:10px;} 68 | 69 | /*搜索内容主体*/ 70 | .resultArea .resultList .itemBody{margin-bottom:5px;line-height:18px;width:90%;} 71 | .resultArea .resultList .itemFoot{color:#008000;} 72 | .resultArea .resultList .itemFoot .info{margin-right:10px;} 73 | 74 | .resultArea .pagination{margin-bottom:25px;padding-left:32px;} 75 | /*相关搜索*/ 76 | .resultArea .dependSearch{margin-bottom:30px;padding-left:32px;font-size:14px;} 77 | .resultArea .dependSearch h6{float:left;margin-right:15px;font-weight:bold;} 78 | .resultArea .dependSearch p{margin-bottom:5px;} 79 | .resultArea .dependSearch a{display:inline-block;margin-right:15px;text-decoration:underline;width:90px; white-space:nowrap; overflow:hidden;text-overflow:ellipsis;} 80 | .resultArea .searchInResult{padding-left:35px;} 81 | .resultArea .searchInResult .inResult{position:absolute;right:-190px;top:8px;font-size:14px;text-decoration:underline;} 82 | .resultArea .searchInResult .searchButton{left:417px;} 83 | /*历史搜索区域*/ 84 | .historyArea{float:right;margin-right:-212px;width:212px;} 85 | .historyArea h6{margin-bottom:10px;font-weight:bold;} 86 | .historyArea .historyList{margin-bottom:20px;} 87 | .historyArea .historyList li{margin-bottom:5px;} 88 | 89 | 90 | 91 | /*左侧分栏区域*/ 92 | .subfield{margin-bottom:5px;font-size:14px;font-weight:bold;padding:2px 0 2px 24px;} 93 | .subfield:first-child{border-left:4px solid #9cc813;padding-left:20px;} 94 | 95 | 96 | 97 | /*立即搜索样式*/ 98 | .subfieldContent .search{margin:45px 0 0 135px;width:130px;height:36px;background:url(../img/btnbg.png); font-weight:bold;border:none;border:1px solid #bfbfbf;line-height:36px;} 99 | /*联想下拉区域*/ 100 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 101 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 102 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 103 | 104 | 105 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | lcv-search 搜索引擎 9 | 10 | 11 | 12 | 13 |
[The remaining markup of this front-end prototype page was not preserved in the export. What survives indicates a search form with a suggestion dropdown, a "popular searches" block linking the sample queries 如何学好设计, 界面设计, UI设计培训要多少钱, 设计师学习 and 哪里有好的网站, a row of site tabs (专注界面设计网站, 用户体验, 互联网, 资费套餐), and two large trailing blocks (original lines ~66-175, most likely inline scripts) whose content is missing.]
-------------------------------------------------------------------------------- /s0vkaq/LcvSearch/templates/index.html: --------------------------------------------------------------------------------
[Markup of the Django search template was likewise not preserved. The recoverable fragments show that it mirrors the static prototype above -- it uses {% load staticfiles %}, keeps the page title "lcv-search 搜索引擎", the hard-coded sample queries and the site tabs -- and that the popular-searches block is additionally rendered from the topn_search context variable supplied by IndexView via {% for search_words in topn_search %} {{ search_words }} {% endfor %}. Its trailing blocks (original lines ~68-177) are missing as well.]
65 | 66 | 67 | 68 | 118 | 177 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/js/pagination.js: -------------------------------------------------------------------------------- 1 | jQuery.fn.pagination = function(maxentries, opts) { 2 | opts = jQuery.extend({ 3 | items_per_page : 10, // 每页显示多少条记录 4 | current_page : 0, //当前页码 5 | num_display_entries : 4, // 中间显示页码的个数 6 | num_edge_entries : 2, // 末尾显示页码的个数 7 | link_to : "javascript:;", //页码点击后的链接 8 | prev_text : "上一页", //上一页的文字 9 | next_text : "下一页", //下一页的文字 10 | ellipse_text : "...", //页码之间的省略号 11 | display_msg : true, // 是否显示记录信息 12 | prev_show_always : true, //是否总是显示最前页 13 | next_show_always : true,//是否总是显示最后页 14 | setPageNo:false,//是否显示跳转第几页 15 | callback : function() { 16 | return false; 17 | } // 回调函数 18 | }, opts || {}); 19 | 20 | return this.each(function() { 21 | // 总页数 22 | function numPages() { 23 | return Math.ceil(maxentries / opts.items_per_page); 24 | } 25 | /** 26 | * 计算页码 27 | */ 28 | function getInterval() { 29 | var ne_half = Math.ceil(opts.num_display_entries / 2); 30 | var np = numPages(); 31 | var upper_limit = np - opts.num_display_entries; 32 | var start = current_page > ne_half ? Math.max(Math.min(current_page 33 | - ne_half, upper_limit), 0) : 0; 34 | var end = current_page > ne_half ? Math.min(current_page + ne_half, 35 | np) : Math.min(opts.num_display_entries, np); 36 | return [start, end]; 37 | } 38 | 39 | /** 40 | * 点击事件 41 | */ 42 | function pageSelected(page_id, evt) { 43 | var page_id = parseInt(page_id); 44 | current_page = page_id; 45 | drawLinks(); 46 | var continuePropagation = opts.callback(page_id, panel); 47 | if (!continuePropagation) { 48 | if (evt.stopPropagation) { 49 | evt.stopPropagation(); 50 | } else { 51 | evt.cancelBubble = true; 52 | } 53 | } 54 | return continuePropagation; 55 | } 56 | 57 | /** 58 | * 链接 59 | */ 60 | function drawLinks() { 61 | panel.empty(); 62 | var interval = getInterval(); 63 | var np = numPages(); 64 | var getClickHandler = function(page_id) { 65 | return function(evt) { 66 | return pageSelected(page_id, evt); 67 | } 68 | } 69 | var appendItem = function(page_id, appendopts) { 70 | page_id = page_id < 0 ? 0 : (page_id < np ? 
page_id : np-1); 71 | appendopts = jQuery.extend({ 72 | text : page_id+1, 73 | classes : "" 74 | }, appendopts || {}); 75 | if (page_id == current_page) { 76 | var lnk = $("" + (appendopts.text) 77 | + ""); 78 | } else { 79 | var lnk = $("" + (appendopts.text) + "").bind( 80 | "click", getClickHandler(page_id)).attr('href', 81 | opts.link_to.replace(/__id__/, page_id)); 82 | 83 | } 84 | if (appendopts.classes) { 85 | lnk.addClass(appendopts.classes); 86 | } 87 | panel.append(lnk); 88 | } 89 | // 上一页 90 | if (opts.prev_text && (current_page > 0 || opts.prev_show_always)) { 91 | appendItem(current_page - 1, { 92 | text : opts.prev_text, 93 | classes : "prev" 94 | }); 95 | } 96 | // 点点点 97 | if (interval[0] > 0 && opts.num_edge_entries > 0) { 98 | var end = Math.min(opts.num_edge_entries, interval[0]); 99 | for (var i = 0; i < end; i++) { 100 | appendItem(i); 101 | } 102 | if (opts.num_edge_entries < interval[0] && opts.ellipse_text) { 103 | jQuery("" + opts.ellipse_text + "") 104 | .appendTo(panel); 105 | } 106 | } 107 | // 中间的页码 108 | for (var i = interval[0]; i < interval[1]; i++) { 109 | appendItem(i); 110 | } 111 | // 最后的页码 112 | if (interval[1] < np && opts.num_edge_entries > 0) { 113 | if (np - opts.num_edge_entries > interval[1] 114 | && opts.ellipse_text) { 115 | jQuery("" + opts.ellipse_text + "") 116 | .appendTo(panel); 117 | } 118 | var begin = Math.max(np - opts.num_edge_entries, interval[1]); 119 | for (var i = begin; i < np; i++) { 120 | appendItem(i); 121 | } 122 | 123 | } 124 | // 下一页 125 | if (opts.next_text 126 | && (current_page < np - 1 || opts.next_show_always)) { 127 | appendItem(current_page + 1, { 128 | text : opts.next_text, 129 | classes : "next" 130 | }); 131 | } 132 | // 记录显示 133 | if (opts.display_msg) { 134 | if(!maxentries){ 135 | panel 136 | .append('
暂时无数据可以显示
'); 137 | }else{ 138 | panel 139 | .append('
显示第 ' 140 | + ((current_page * opts.items_per_page) + 1) 141 | + ' 条到 ' 142 | + (((current_page + 1) * opts.items_per_page) > maxentries 143 | ? maxentries 144 | : ((current_page + 1) * opts.items_per_page)) 145 | + ' 条记录,总共 ' + maxentries + ' 条
'); 146 | } 147 | } 148 | //设置跳到第几页 149 | if(opts.setPageNo){ 150 | panel.append("
跳转到
"); 151 | } 152 | } 153 | 154 | // 当前页 155 | var current_page = opts.current_page; 156 | maxentries = ( maxentries < 0) ? 0 : maxentries; 157 | opts.items_per_page = (!opts.items_per_page || opts.items_per_page < 0) 158 | ? 1 159 | : opts.items_per_page; 160 | var panel = jQuery(this); 161 | this.selectPage = function(page_id) { 162 | pageSelected(page_id); 163 | } 164 | this.prevPage = function() { 165 | if (current_page > 0) { 166 | pageSelected(current_page - 1); 167 | return true; 168 | } else { 169 | return false; 170 | } 171 | } 172 | this.nextPage = function() { 173 | if (current_page < numPages() - 1) { 174 | pageSelected(current_page + 1); 175 | return true; 176 | } else { 177 | return false; 178 | } 179 | } 180 | 181 | if(maxentries==0){ 182 | panel.append(''+opts.prev_text+''+opts.next_text+'
暂时无数据可以显示
'); 183 | }else{ 184 | drawLinks(); 185 | } 186 | $(this).find(".goto button").live("click",function(evt){ 187 | var setPageNo = $(this).parent().find("input").val(); 188 | if(setPageNo!=null && setPageNo!=""&&setPageNo>0&&setPageNo<=numPages()){ 189 | pageSelected(setPageNo-1, evt); 190 | } 191 | }); 192 | }); 193 | } 194 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/js/pagination.js: -------------------------------------------------------------------------------- 1 | jQuery.fn.pagination = function(maxentries, opts) { 2 | opts = jQuery.extend({ 3 | items_per_page : 10, // 每页显示多少条记录 4 | current_page : 0, //当前页码 5 | num_display_entries : 4, // 中间显示页码的个数 6 | num_edge_entries : 2, // 末尾显示页码的个数 7 | link_to : "javascript:;", //页码点击后的链接 8 | prev_text : "上一页", //上一页的文字 9 | next_text : "下一页", //下一页的文字 10 | ellipse_text : "...", //页码之间的省略号 11 | display_msg : true, // 是否显示记录信息 12 | prev_show_always : true, //是否总是显示最前页 13 | next_show_always : true,//是否总是显示最后页 14 | setPageNo:false,//是否显示跳转第几页 15 | callback : function() { 16 | return false; 17 | } // 回调函数 18 | }, opts || {}); 19 | 20 | return this.each(function() { 21 | // 总页数 22 | function numPages() { 23 | return Math.ceil(maxentries / opts.items_per_page); 24 | } 25 | /** 26 | * 计算页码 27 | */ 28 | function getInterval() { 29 | var ne_half = Math.ceil(opts.num_display_entries / 2); 30 | var np = numPages(); 31 | var upper_limit = np - opts.num_display_entries; 32 | var start = current_page > ne_half ? Math.max(Math.min(current_page 33 | - ne_half, upper_limit), 0) : 0; 34 | var end = current_page > ne_half ? Math.min(current_page + ne_half, 35 | np) : Math.min(opts.num_display_entries, np); 36 | return [start, end]; 37 | } 38 | 39 | /** 40 | * 点击事件 41 | */ 42 | function pageSelected(page_id, evt) { 43 | var page_id = parseInt(page_id); 44 | current_page = page_id; 45 | drawLinks(); 46 | var continuePropagation = opts.callback(page_id, panel); 47 | if (!continuePropagation) { 48 | if (evt.stopPropagation) { 49 | evt.stopPropagation(); 50 | } else { 51 | evt.cancelBubble = true; 52 | } 53 | } 54 | return continuePropagation; 55 | } 56 | 57 | /** 58 | * 链接 59 | */ 60 | function drawLinks() { 61 | panel.empty(); 62 | var interval = getInterval(); 63 | var np = numPages(); 64 | var getClickHandler = function(page_id) { 65 | return function(evt) { 66 | return pageSelected(page_id, evt); 67 | } 68 | } 69 | var appendItem = function(page_id, appendopts) { 70 | page_id = page_id < 0 ? 0 : (page_id < np ? 
page_id : np-1); 71 | appendopts = jQuery.extend({ 72 | text : page_id+1, 73 | classes : "" 74 | }, appendopts || {}); 75 | if (page_id == current_page) { 76 | var lnk = $("" + (appendopts.text) 77 | + ""); 78 | } else { 79 | var lnk = $("" + (appendopts.text) + "").bind( 80 | "click", getClickHandler(page_id)).attr('href', 81 | opts.link_to.replace(/__id__/, page_id)); 82 | 83 | } 84 | if (appendopts.classes) { 85 | lnk.addClass(appendopts.classes); 86 | } 87 | panel.append(lnk); 88 | } 89 | // 上一页 90 | if (opts.prev_text && (current_page > 0 || opts.prev_show_always)) { 91 | appendItem(current_page - 1, { 92 | text : opts.prev_text, 93 | classes : "prev" 94 | }); 95 | } 96 | // 点点点 97 | if (interval[0] > 0 && opts.num_edge_entries > 0) { 98 | var end = Math.min(opts.num_edge_entries, interval[0]); 99 | for (var i = 0; i < end; i++) { 100 | appendItem(i); 101 | } 102 | if (opts.num_edge_entries < interval[0] && opts.ellipse_text) { 103 | jQuery("" + opts.ellipse_text + "") 104 | .appendTo(panel); 105 | } 106 | } 107 | // 中间的页码 108 | for (var i = interval[0]; i < interval[1]; i++) { 109 | appendItem(i); 110 | } 111 | // 最后的页码 112 | if (interval[1] < np && opts.num_edge_entries > 0) { 113 | if (np - opts.num_edge_entries > interval[1] 114 | && opts.ellipse_text) { 115 | jQuery("" + opts.ellipse_text + "") 116 | .appendTo(panel); 117 | } 118 | var begin = Math.max(np - opts.num_edge_entries, interval[1]); 119 | for (var i = begin; i < np; i++) { 120 | appendItem(i); 121 | } 122 | 123 | } 124 | // 下一页 125 | if (opts.next_text 126 | && (current_page < np - 1 || opts.next_show_always)) { 127 | appendItem(current_page + 1, { 128 | text : opts.next_text, 129 | classes : "next" 130 | }); 131 | } 132 | // 记录显示 133 | if (opts.display_msg) { 134 | if(!maxentries){ 135 | panel 136 | .append('
暂时无数据可以显示
'); 137 | }else{ 138 | panel 139 | .append('
显示第 ' 140 | + ((current_page * opts.items_per_page) + 1) 141 | + ' 条到 ' 142 | + (((current_page + 1) * opts.items_per_page) > maxentries 143 | ? maxentries 144 | : ((current_page + 1) * opts.items_per_page)) 145 | + ' 条记录,总共 ' + maxentries + ' 条
'); 146 | } 147 | } 148 | //设置跳到第几页 149 | if(opts.setPageNo){ 150 | panel.append("
跳转到
"); 151 | } 152 | } 153 | 154 | // 当前页 155 | var current_page = opts.current_page; 156 | maxentries = ( maxentries < 0) ? 0 : maxentries; 157 | opts.items_per_page = (!opts.items_per_page || opts.items_per_page < 0) 158 | ? 1 159 | : opts.items_per_page; 160 | var panel = jQuery(this); 161 | this.selectPage = function(page_id) { 162 | pageSelected(page_id); 163 | } 164 | this.prevPage = function() { 165 | if (current_page > 0) { 166 | pageSelected(current_page - 1); 167 | return true; 168 | } else { 169 | return false; 170 | } 171 | } 172 | this.nextPage = function() { 173 | if (current_page < numPages() - 1) { 174 | pageSelected(current_page + 1); 175 | return true; 176 | } else { 177 | return false; 178 | } 179 | } 180 | 181 | if(maxentries==0){ 182 | panel.append(''+opts.prev_text+''+opts.next_text+'
暂时无数据可以显示
'); 183 | }else{ 184 | drawLinks(); 185 | } 186 | $(this).find(".goto button").live("click",function(evt){ 187 | var setPageNo = $(this).parent().find("input").val(); 188 | if(setPageNo!=null && setPageNo!=""&&setPageNo>0&&setPageNo<=numPages()){ 189 | pageSelected(setPageNo-1, evt); 190 | } 191 | }); 192 | }); 193 | } 194 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/spiders/jobbole.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | import datetime 5 | from scrapy.http import Request 6 | from urllib import parse 7 | from scrapy.loader import ItemLoader 8 | 9 | from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader 10 | 11 | from ArticleSpider.utils.common import get_md5 12 | from selenium import webdriver 13 | from scrapy.xlib.pydispatch import dispatcher 14 | from scrapy import signals 15 | 16 | class JobboleSpider(scrapy.Spider): 17 | name = "jobbole" 18 | allowed_domains = ["blog.jobbole.com"] 19 | start_urls = ['http://blog.jobbole.com/all-posts/'] 20 | 21 | 22 | # def __init__(self): 23 | # self.browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe") 24 | # super(JobboleSpider, self).__init__() 25 | # dispatcher.connect(self.spider_closed, signals.spider_closed) 26 | # 27 | # def spider_closed(self, spider): 28 | # #当爬虫退出的时候关闭chrome 29 | # print ("spider closed") 30 | # self.browser.quit() 31 | 32 | #收集伯乐在线所有404的url以及404页面数 33 | handle_httpstatus_list = [404] 34 | 35 | def __init__(self, **kwargs): 36 | self.fail_urls = [] 37 | dispatcher.connect(self.handle_spider_closed, signals.spider_closed) 38 | 39 | def handle_spider_closed(self, spider, reason): 40 | self.crawler.stats.set_value("failed_urls", ",".join(self.fail_urls)) 41 | 42 | def parse(self, response): 43 | """ 44 | 1. 获取文章列表页中的文章url并交给scrapy下载后并进行解析 45 | 2. 
获取下一页的url并交给scrapy进行下载, 下载完成后交给parse
46 |         """
47 |         #解析列表页中的所有文章url并交给scrapy下载后并进行解析
48 |         if response.status == 404:
49 |             self.fail_urls.append(response.url)
50 |             self.crawler.stats.inc_value("failed_url")
51 | 
52 |         post_nodes = response.css("#archive .floated-thumb .post-thumb a")
53 |         for post_node in post_nodes:
54 |             image_url = post_node.css("img::attr(src)").extract_first("")
55 |             post_url = post_node.css("::attr(href)").extract_first("")
56 |             yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url":image_url}, callback=self.parse_detail)
57 | 
58 |         #提取下一页并交给scrapy进行下载
59 |         next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
60 |         if next_url:
61 |             yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
62 | 
63 |     def parse_detail(self, response):
64 |         article_item = JobBoleArticleItem()
65 | 
66 |         #提取文章的具体字段
67 |         # title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first("")
68 |         # create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].strip().replace("·","").strip()
69 |         # praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0]
70 |         # fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0]
71 |         # match_re = re.match(".*?(\d+).*", fav_nums)
72 |         # if match_re:
73 |         #     fav_nums = match_re.group(1)
74 |         #
75 |         # comment_nums = response.xpath("//a[@href='#article-comment']/span/text()").extract()[0]
76 |         # match_re = re.match(".*?(\d+).*", comment_nums)
77 |         # if match_re:
78 |         #     comment_nums = match_re.group(1)
79 |         #
80 |         # content = response.xpath("//div[@class='entry']").extract()[0]
81 |         #
82 |         # tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
83 |         # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
84 |         # tags = ",".join(tag_list)
85 | 
86 |         #通过css选择器提取字段
87 |         # front_image_url = response.meta.get("front_image_url", "") #文章封面图
88 |         # title = response.css(".entry-header h1::text").extract()[0]
89 |         # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·","").strip()
90 |         # praise_nums = response.css(".vote-post-up h10::text").extract()[0]
91 |         # fav_nums = response.css(".bookmark-btn::text").extract()[0]
92 |         # match_re = re.match(".*?(\d+).*", fav_nums)
93 |         # if match_re:
94 |         #     fav_nums = int(match_re.group(1))
95 |         # else:
96 |         #     fav_nums = 0
97 |         #
98 |         # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
99 |         # match_re = re.match(".*?(\d+).*", comment_nums)
100 |         # if match_re:
101 |         #     comment_nums = int(match_re.group(1))
102 |         # else:
103 |         #     comment_nums = 0
104 |         #
105 |         # content = response.css("div.entry").extract()[0]
106 |         #
107 |         # tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
108 |         # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
109 |         # tags = ",".join(tag_list)
110 |         #
111 |         # article_item["url_object_id"] = get_md5(response.url)
112 |         # article_item["title"] = title
113 |         # article_item["url"] = response.url
114 |         # try:
115 |         #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
116 |         # except Exception as e:
117 |         #     create_date = datetime.datetime.now().date()
118 |         # article_item["create_date"] = create_date
119 |         # article_item["front_image_url"] = [front_image_url]
120 |         # article_item["praise_nums"] = praise_nums
121 |         #
article_item["comment_nums"] = comment_nums 122 | # article_item["fav_nums"] = fav_nums 123 | # article_item["tags"] = tags 124 | # article_item["content"] = content 125 | 126 | 127 | #通过item loader加载item 128 | front_image_url = response.meta.get("front_image_url", "") # 文章封面图 129 | item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response) 130 | item_loader.add_css("title", ".entry-header h1::text") 131 | item_loader.add_value("url", response.url) 132 | item_loader.add_value("url_object_id", get_md5(response.url)) 133 | item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text") 134 | item_loader.add_value("front_image_url", [front_image_url]) 135 | item_loader.add_css("praise_nums", ".vote-post-up h10::text") 136 | item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text") 137 | item_loader.add_css("fav_nums", ".bookmark-btn::text") 138 | item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text") 139 | item_loader.add_css("content", "div.entry") 140 | 141 | article_item = item_loader.load_item() 142 | 143 | 144 | yield article_item 145 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/spiders/jobbole.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | import datetime 5 | from scrapy.http import Request 6 | from urllib import parse 7 | from scrapy.loader import ItemLoader 8 | 9 | from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader 10 | 11 | from ArticleSpider.utils.common import get_md5 12 | from selenium import webdriver 13 | from scrapy.xlib.pydispatch import dispatcher 14 | from scrapy import signals 15 | 16 | class JobboleSpider(scrapy.Spider): 17 | name = "jobbole" 18 | allowed_domains = ["blog.jobbole.com"] 19 | start_urls = ['http://blog.jobbole.com/all-posts/'] 20 | 21 | 22 | # def __init__(self): 23 | # self.browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe") 24 | # super(JobboleSpider, self).__init__() 25 | # dispatcher.connect(self.spider_closed, signals.spider_closed) 26 | # 27 | # def spider_closed(self, spider): 28 | # #当爬虫退出的时候关闭chrome 29 | # print ("spider closed") 30 | # self.browser.quit() 31 | 32 | #收集伯乐在线所有404的url以及404页面数 33 | handle_httpstatus_list = [404] 34 | 35 | def __init__(self, **kwargs): 36 | self.fail_urls = [] 37 | dispatcher.connect(self.handle_spider_closed, signals.spider_closed) 38 | 39 | def handle_spider_closed(self, spider, reason): 40 | self.crawler.stats.set_value("failed_urls", ",".join(self.fail_urls)) 41 | 42 | def parse(self, response): 43 | """ 44 | 1. 获取文章列表页中的文章url并交给scrapy下载后并进行解析 45 | 2. 
(Lines 46 through 121 of this build/lib copy are a verbatim duplicate of ArticleSpider/spiders/jobbole.py above, including the unfixed next-page request that reuses post_url instead of next_url.)
article_item["comment_nums"] = comment_nums 122 | # article_item["fav_nums"] = fav_nums 123 | # article_item["tags"] = tags 124 | # article_item["content"] = content 125 | 126 | 127 | #通过item loader加载item 128 | front_image_url = response.meta.get("front_image_url", "") # 文章封面图 129 | item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response) 130 | item_loader.add_css("title", ".entry-header h1::text") 131 | item_loader.add_value("url", response.url) 132 | item_loader.add_value("url_object_id", get_md5(response.url)) 133 | item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text") 134 | item_loader.add_value("front_image_url", [front_image_url]) 135 | item_loader.add_css("praise_nums", ".vote-post-up h10::text") 136 | item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text") 137 | item_loader.add_css("fav_nums", ".bookmark-btn::text") 138 | item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text") 139 | item_loader.add_css("content", "div.entry") 140 | 141 | article_item = item_loader.load_item() 142 | 143 | 144 | yield article_item 145 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/scheduler.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import six 3 | 4 | from scrapy.utils.misc import load_object 5 | 6 | from . import connection, defaults 7 | 8 | 9 | # TODO: add SCRAPY_JOB support. 10 | class Scheduler(object): 11 | """Redis-based scheduler 12 | 13 | Settings 14 | -------- 15 | SCHEDULER_PERSIST : bool (default: False) 16 | Whether to persist or clear redis queue. 17 | SCHEDULER_FLUSH_ON_START : bool (default: False) 18 | Whether to flush redis queue on start. 19 | SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0) 20 | How many seconds to wait before closing if no message is received. 21 | SCHEDULER_QUEUE_KEY : str 22 | Scheduler redis key. 23 | SCHEDULER_QUEUE_CLASS : str 24 | Scheduler queue class. 25 | SCHEDULER_DUPEFILTER_KEY : str 26 | Scheduler dupefilter redis key. 27 | SCHEDULER_DUPEFILTER_CLASS : str 28 | Scheduler dupefilter class. 29 | SCHEDULER_SERIALIZER : str 30 | Scheduler serializer. 31 | 32 | """ 33 | 34 | def __init__(self, server, 35 | persist=False, 36 | flush_on_start=False, 37 | queue_key=defaults.SCHEDULER_QUEUE_KEY, 38 | queue_cls=defaults.SCHEDULER_QUEUE_CLASS, 39 | dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, 40 | dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, 41 | idle_before_close=0, 42 | serializer=None): 43 | """Initialize scheduler. 44 | 45 | Parameters 46 | ---------- 47 | server : Redis 48 | The redis server instance. 49 | persist : bool 50 | Whether to flush requests when closing. Default is False. 51 | flush_on_start : bool 52 | Whether to flush requests on start. Default is False. 53 | queue_key : str 54 | Requests queue key. 55 | queue_cls : str 56 | Importable path to the queue class. 57 | dupefilter_key : str 58 | Duplicates filter key. 59 | dupefilter_cls : str 60 | Importable path to the dupefilter class. 61 | idle_before_close : int 62 | Timeout before giving up. 
63 | 64 | """ 65 | if idle_before_close < 0: 66 | raise TypeError("idle_before_close cannot be negative") 67 | 68 | self.server = server 69 | self.persist = persist 70 | self.flush_on_start = flush_on_start 71 | self.queue_key = queue_key 72 | self.queue_cls = queue_cls 73 | self.dupefilter_cls = dupefilter_cls 74 | self.dupefilter_key = dupefilter_key 75 | self.idle_before_close = idle_before_close 76 | self.serializer = serializer 77 | self.stats = None 78 | 79 | def __len__(self): 80 | return len(self.queue) 81 | 82 | @classmethod 83 | def from_settings(cls, settings): 84 | kwargs = { 85 | 'persist': settings.getbool('SCHEDULER_PERSIST'), 86 | 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'), 87 | 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'), 88 | } 89 | 90 | # If these values are missing, it means we want to use the defaults. 91 | optional = { 92 | # TODO: Use custom prefixes for this settings to note that are 93 | # specific to scrapy-redis. 94 | 'queue_key': 'SCHEDULER_QUEUE_KEY', 95 | 'queue_cls': 'SCHEDULER_QUEUE_CLASS', 96 | 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY', 97 | # We use the default setting name to keep compatibility. 98 | 'dupefilter_cls': 'DUPEFILTER_CLASS', 99 | 'serializer': 'SCHEDULER_SERIALIZER', 100 | } 101 | for name, setting_name in optional.items(): 102 | val = settings.get(setting_name) 103 | if val: 104 | kwargs[name] = val 105 | 106 | # Support serializer as a path to a module. 107 | if isinstance(kwargs.get('serializer'), six.string_types): 108 | kwargs['serializer'] = importlib.import_module(kwargs['serializer']) 109 | 110 | server = connection.from_settings(settings) 111 | # Ensure the connection is working. 112 | server.ping() 113 | 114 | return cls(server=server, **kwargs) 115 | 116 | @classmethod 117 | def from_crawler(cls, crawler): 118 | instance = cls.from_settings(crawler.settings) 119 | # FIXME: for now, stats are only supported from this constructor 120 | instance.stats = crawler.stats 121 | return instance 122 | 123 | def open(self, spider): 124 | self.spider = spider 125 | 126 | try: 127 | self.queue = load_object(self.queue_cls)( 128 | server=self.server, 129 | spider=spider, 130 | key=self.queue_key % {'spider': spider.name}, 131 | serializer=self.serializer, 132 | ) 133 | except TypeError as e: 134 | raise ValueError("Failed to instantiate queue class '%s': %s", 135 | self.queue_cls, e) 136 | 137 | try: 138 | self.df = load_object(self.dupefilter_cls)( 139 | server=self.server, 140 | key=self.dupefilter_key % {'spider': spider.name}, 141 | debug=spider.settings.getbool('DUPEFILTER_DEBUG'), 142 | ) 143 | except TypeError as e: 144 | raise ValueError("Failed to instantiate dupefilter class '%s': %s", 145 | self.dupefilter_cls, e) 146 | 147 | if self.flush_on_start: 148 | self.flush() 149 | # notice if there are requests already in the queue to resume the crawl 150 | if len(self.queue): 151 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 152 | 153 | def close(self, reason): 154 | if not self.persist: 155 | self.flush() 156 | 157 | def flush(self): 158 | self.df.clear() 159 | self.queue.clear() 160 | 161 | def enqueue_request(self, request): 162 | if not request.dont_filter and self.df.request_seen(request): 163 | self.df.log(request, self.spider) 164 | return False 165 | if self.stats: 166 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 167 | self.queue.push(request) 168 | return True 169 | 170 | def next_request(self): 171 | block_pop_timeout = 
self.idle_before_close 172 | request = self.queue.pop(block_pop_timeout) 173 | if request and self.stats: 174 | self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) 175 | return request 176 | 177 | def has_pending_requests(self): 178 | return len(self) > 0 179 | --------------------------------------------------------------------------------
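For reference, the settings named in the Scheduler docstring are what a Scrapy project sets to switch over to this redis-backed scheduler. A minimal, illustrative settings.py fragment follows — the host/port values are placeholders and the queue class shown is the scrapy_redis default, not something dictated by this repo.

# Illustrative settings.py fragment for enabling the redis-backed scheduler.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # replaces Scrapy's default scheduler
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # request fingerprints kept in Redis

SCHEDULER_PERSIST = True           # keep the queue/dupefilter after the crawl so it can resume
# SCHEDULER_FLUSH_ON_START = True  # uncomment to start every run from an empty queue instead
SCHEDULER_IDLE_BEFORE_CLOSE = 10   # seconds next_request() blocks waiting for new requests

SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.PriorityQueue"  # ordering of pending requests

# connection.from_settings(settings) reads these to build the Redis client.
REDIS_HOST = "localhost"
REDIS_PORT = 6379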