├── s0vkaq ├── LcvSearch │ ├── LcvSearch │ │ ├── __init__.py │ │ ├── wsgi.py │ │ ├── urls.py │ │ └── settings.py │ ├── search │ │ ├── __init__.py │ │ ├── migrations │ │ │ └── __init__.py │ │ ├── admin.py │ │ ├── tests.py │ │ ├── apps.py │ │ ├── models.py │ │ └── views.py │ ├── db.sqlite3 │ ├── static │ │ ├── img │ │ │ ├── ll.png │ │ │ ├── lr.png │ │ │ ├── Thumbs.db │ │ │ ├── btnbg.png │ │ │ ├── down.png │ │ │ ├── line.png │ │ │ ├── logo.png │ │ │ ├── logo1.png │ │ │ ├── more.png │ │ │ ├── btn_min.png │ │ │ ├── inputbg.png │ │ │ ├── logo-bak.png │ │ │ ├── seachbtn.png │ │ │ ├── logo-bak2.png │ │ │ └── logo1-bak.png │ │ ├── js │ │ │ ├── global.js │ │ │ ├── common.js │ │ │ └── pagination.js │ │ └── css │ │ │ ├── index.css │ │ │ ├── style.css │ │ │ ├── advanced.css │ │ │ └── result.css │ ├── .idea │ │ ├── markdown-navigator │ │ │ └── profiles_settings.xml │ │ ├── encodings.xml │ │ ├── modules.xml │ │ ├── LcvSearch.iml │ │ └── misc.xml │ ├── manage.py │ └── templates │ │ └── index.html ├── ArticleSpider │ ├── articleexport.json │ ├── ArticleSpider │ │ ├── __init__.py │ │ ├── utils │ │ │ ├── cookies.txt │ │ │ ├── __init__.py │ │ │ ├── captcha.jpg │ │ │ ├── common.py │ │ │ ├── bloomfilter.py │ │ │ └── zhihu_login_requests.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── es_types.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ ├── lagou.py │ │ │ └── jobbole.py │ │ ├── images │ │ │ └── full │ │ │ │ ├── 055507fb28ac7ac8228b811c71f9ffdec4eb1748.jpg │ │ │ │ └── 0680fd15f05a124d6ac8e95b032713d8839b6c92.jpg │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ └── settings.py │ ├── build │ │ └── lib │ │ │ ├── ArticleSpider │ │ │ ├── __init__.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ └── es_types.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── common.py │ │ │ │ ├── bloomfilter.py │ │ │ │ └── zhihu_login_requests.py │ │ │ ├── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── lagou.py │ │ │ │ └── jobbole.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ └── settings.py │ │ │ └── tools │ │ │ ├── __init__.py │ │ │ ├── selenium_spider.py │ │ │ ├── crawl_xici_ip.py │ │ │ └── yundama_requests.py │ ├── job_info │ │ ├── 001 │ │ │ ├── requests.queue │ │ │ │ ├── active.json │ │ │ │ └── p0 │ │ │ └── spider.state │ │ └── 002 │ │ │ └── requests.queue │ │ │ └── p0 │ ├── project.egg-info │ │ ├── dependency_links.txt │ │ ├── top_level.txt │ │ ├── entry_points.txt │ │ ├── PKG-INFO │ │ └── SOURCES.txt │ ├── tools │ │ ├── __init__.py │ │ ├── image │ │ │ ├── 1.jpg │ │ │ ├── 2.jpg │ │ │ ├── 3.png │ │ │ ├── 4.png │ │ │ ├── 5.png │ │ │ └── captcha.jpg │ │ ├── ghostdriver.log │ │ ├── selenium_spider.py │ │ ├── crawl_xici_ip.py │ │ └── yundama_requests.py │ ├── captcha.jpg │ ├── .idea │ │ ├── markdown-navigator │ │ │ └── profiles_settings.xml │ │ ├── inspectionProfiles │ │ │ └── profiles_settings.xml │ │ ├── modules.xml │ │ ├── ArticleSpider.iml │ │ └── misc.xml │ ├── test.py │ ├── setup.py │ ├── scrapy.cfg │ └── main.py ├── ScrapyRedisTest │ ├── ScrapyRedisTest │ │ ├── __init__.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── bloomfilter.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ └── jobbole.py │ │ ├── pipelines.py │ │ ├── items.py │ │ ├── middlewares.py │ │ └── settings.py │ ├── .idea │ │ ├── markdown-navigator │ │ │ └── profiles_settings.xml │ │ ├── inspectionProfiles │ │ │ └── profiles_settings.xml │ │ ├── modules.xml │ │ ├── ScrapyRedisTest.iml │ │ └── misc.xml │ ├── scrapy_redis │ │ ├── utils.py │ │ ├── __init__.py │ │ ├── picklecompat.py │ │ ├── defaults.py │ │ ├── pipelines.py │ │ ├── connection.py │ │ ├── 
dupefilter.py │ │ ├── queue.py │ │ └── scheduler.py │ ├── main.py │ └── scrapy.cfg └── LcvSearch-Front │ ├── img │ ├── ll.png │ ├── lr.png │ ├── down.png │ ├── line.png │ ├── logo.png │ ├── more.png │ ├── Thumbs.db │ ├── btn_min.png │ ├── btnbg.png │ ├── inputbg.png │ ├── logo1.png │ ├── logo-bak.png │ ├── logo-bak2.png │ ├── logo1-bak.png │ └── seachbtn.png │ ├── js │ ├── global.js │ ├── common.js │ └── pagination.js │ ├── css │ ├── index.css │ ├── style.css │ ├── advanced.css │ └── result.css │ └── index.html └── README.md /s0vkaq/LcvSearch/LcvSearch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/articleexport.json: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/utils/cookies.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/job_info/001/requests.queue/active.json: -------------------------------------------------------------------------------- 1 | [0] -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/project.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/project.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | ArticleSpider 2 | tools 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/project.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [scrapy] 2 | settings = ArticleSpider.settings 3 | 4 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/db.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/db.sqlite3 -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/captcha.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/captcha.jpg -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/ll.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/ll.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/lr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/lr.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/down.png 
-------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/line.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/logo.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/more.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/more.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/ll.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/ll.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/lr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/lr.png -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/image/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/tools/image/1.jpg -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/image/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/tools/image/2.jpg -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/image/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/tools/image/3.png -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/image/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/tools/image/4.png -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/image/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/tools/image/5.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/Thumbs.db -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/btn_min.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/btn_min.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/btnbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/btnbg.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/inputbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/inputbg.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/logo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/logo1.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/Thumbs.db -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/btnbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/btnbg.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/down.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/line.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/logo.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/logo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/logo1.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/more.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/more.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/logo-bak.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/logo-bak.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/logo-bak2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/logo-bak2.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/logo1-bak.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/logo1-bak.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/img/seachbtn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch-Front/img/seachbtn.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class SearchConfig(AppConfig): 5 | name = 'search' 6 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/btn_min.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/btn_min.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/inputbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/inputbg.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/logo-bak.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/logo-bak.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/seachbtn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/seachbtn.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/logo-bak2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/logo-bak2.png -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/img/logo1-bak.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/LcvSearch/static/img/logo1-bak.png -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/image/captcha.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/tools/image/captcha.jpg -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/job_info/001/spider.state: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/job_info/001/spider.state
--------------------------------------------------------------------------------
/s0vkaq/ArticleSpider/ArticleSpider/utils/captcha.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/ArticleSpider/utils/captcha.jpg
--------------------------------------------------------------------------------
/s0vkaq/ArticleSpider/job_info/001/requests.queue/p0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/job_info/001/requests.queue/p0
--------------------------------------------------------------------------------
/s0vkaq/ArticleSpider/job_info/002/requests.queue/p0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/job_info/002/requests.queue/p0
--------------------------------------------------------------------------------
/s0vkaq/LcvSearch/.idea/markdown-navigator/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/s0vkaq/ArticleSpider/.idea/markdown-navigator/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/s0vkaq/ScrapyRedisTest/.idea/markdown-navigator/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/s0vkaq/ArticleSpider/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'bobby'
3 |
4 | import redis
5 | redis_cli = redis.StrictRedis()
6 | redis_cli.incr("jobbole_count")
7 |
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Building a search engine with a distributed Python crawler
2 |
3 | All of the code is collected here.
4 |
5 | For a walkthrough of the code, head over to my blog: [Building a search engine with a distributed Python crawler -- implemented with Scrapy](http://www.cnblogs.com/jinxiao-pu/p/6706319.html)
6 |
7 | If you find it useful, please give it a star! Thanks a lot!
8 |
--------------------------------------------------------------------------------
/s0vkaq/LcvSearch/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/s0vkaq/ArticleSpider/ArticleSpider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/s0vkaq/ScrapyRedisTest/ScrapyRedisTest/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/images/full/055507fb28ac7ac8228b811c71f9ffdec4eb1748.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/ArticleSpider/images/full/055507fb28ac7ac8228b811c71f9ffdec4eb1748.jpg -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/images/full/0680fd15f05a124d6ac8e95b032713d8839b6c92.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pujinxiao/project_pjx/HEAD/s0vkaq/ArticleSpider/ArticleSpider/images/full/0680fd15f05a124d6ac8e95b032713d8839b6c92.jpg -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/utils.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | 4 | def bytes_to_str(s, encoding='utf-8'): 5 | """Returns a str if a bytes object is given.""" 6 | if six.PY3 and isinstance(s, bytes): 7 | return s.decode(encoding) 8 | return s 9 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/project.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: project 3 | Version: 1.0 4 | Summary: UNKNOWN 5 | Home-page: UNKNOWN 6 | Author: UNKNOWN 7 | Author-email: UNKNOWN 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | from scrapy.cmdline import execute 5 | 6 | import sys 7 | import os 8 | 9 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 10 | execute(["scrapy", "crawl", "jobbole"]) -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .connection import ( # NOQA 3 | get_redis, 4 | get_redis_from_settings, 5 | ) 6 | 7 | 8 | __author__ = 'Rolando Espinoza' 9 | __email__ = 'rolando at rmax.io' 10 | __version__ = '0.7.0-dev' 11 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- 
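A note on running the ScrapyRedisTest project collected above: its main.py only launches "scrapy crawl jobbole", while the spider itself (ScrapyRedisTest/ScrapyRedisTest/spiders/jobbole.py further down) is a scrapy_redis RedisSpider that sits idle until a start URL appears under the Redis key "jobbole:start_urls". A minimal sketch of seeding that key with redis-py is shown here; the helper name and the listing URL are placeholders, not files or values taken from the repository.

# seed_start_urls.py -- hypothetical helper, not part of the repository
import redis

# Same assumptions as scrapy_redis/defaults.py: a local Redis on the default port.
redis_cli = redis.StrictRedis(host="localhost", port=6379)

# jobbole.py sets redis_key = "jobbole:start_urls"; the RedisSpider pops URLs from this list.
redis_cli.lpush("jobbole:start_urls", "http://blog.jobbole.com/all-posts/")  # placeholder start page

Once a URL has been pushed, running main.py (or "scrapy crawl jobbole") lets the waiting spider pick it up and begin crawling.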
/s0vkaq/LcvSearch/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapyd-deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name = 'project', 7 | version = '1.0', 8 | packages = find_packages(), 9 | entry_points = {'scrapy': ['settings = ArticleSpider.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = ArticleSpider.settings 8 | 9 | [deploy:bobby] 10 | url = http://localhost:6800/ 11 | project = ArticleSpider 12 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = ScrapyRedisTest.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = ScrapyRedisTest 12 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | from scrapy.cmdline import execute 5 | 6 | import sys 7 | import os 8 | 9 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 10 | execute(["scrapy", "crawl", "jobbole"]) 11 | # execute(["scrapy", "crawl", "zhihu"]) 12 | # execute(["scrapy", "crawl", "lagou"]) -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/picklecompat.py: -------------------------------------------------------------------------------- 1 | """A pickle wrapper module with protocol=-1 by default.""" 2 | 3 | try: 4 | import cPickle as pickle # PY2 5 | except ImportError: 6 | import pickle 7 | 8 | 9 | def loads(s): 10 | return pickle.loads(s) 11 | 12 | 13 | def dumps(obj): 14 | return pickle.dumps(obj, protocol=-1) 15 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: 
http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ScrapyredistestPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ScrapyredistestItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/LcvSearch/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for LcvSearch project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.11/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "LcvSearch.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/.idea/ScrapyRedisTest.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/utils/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | import hashlib 4 | import re 5 | 6 | 7 | def get_md5(url): 8 | if isinstance(url, str): 9 | url = url.encode("utf-8") 10 | m = hashlib.md5() 11 | m.update(url) 12 | return m.hexdigest() 13 | 14 | 15 | def extract_num(text): 16 | #从字符串中提取出数字 17 | match_re = re.match(".*?(\d+).*", text) 18 | if match_re: 19 | nums = int(match_re.group(1)) 20 | else: 21 | nums = 0 22 | 23 | return nums 24 | 25 | if __name__ == "__main__": 26 | print (get_md5("http://jobbole.com".encode("utf-8"))) -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/.idea/ArticleSpider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/utils/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | import hashlib 4 | import re 5 | 6 | 7 | def get_md5(url): 8 | if isinstance(url, str): 9 | url = url.encode("utf-8") 10 | m = hashlib.md5() 11 | m.update(url) 12 | return m.hexdigest() 13 | 14 | 15 | def extract_num(text): 16 | #从字符串中提取出数字 17 | match_re = re.match(".*?(\d+).*", text) 18 | if match_re: 19 | nums = int(match_re.group(1)) 20 | else: 21 | nums = 0 22 | 23 | return nums 24 | 25 | if __name__ == "__main__": 26 | print (get_md5("http://jobbole.com".encode("utf-8"))) -------------------------------------------------------------------------------- 
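The two helpers in ArticleSpider/utils/common.py above are used throughout the spiders: get_md5() turns a URL into a fixed-length hex digest that the project stores as url_object_id, and extract_num() pulls the first run of digits out of text such as vote or comment counts. A short usage sketch follows; it assumes the ArticleSpider package is importable (for example, run from the project root), and the sample inputs are illustrative only.

# usage sketch -- assumes ArticleSpider is on the Python path
from ArticleSpider.utils.common import get_md5, extract_num

url_object_id = get_md5("http://blog.jobbole.com/some-post/")  # placeholder URL
print(len(url_object_id))         # 32 -- an MD5 hex digest, stable for the same URL

print(extract_num("8 comments"))  # 8 -- first integer found in the text
print(extract_num("no digits"))   # 0 -- falls back to 0 when nothing matches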
/s0vkaq/ScrapyRedisTest/scrapy_redis/defaults.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | 4 | # For standalone use. 5 | DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' 6 | 7 | PIPELINE_KEY = '%(spider)s:items' 8 | 9 | REDIS_CLS = redis.StrictRedis 10 | REDIS_ENCODING = 'utf-8' 11 | # Sane connection defaults. 12 | REDIS_PARAMS = { 13 | 'socket_timeout': 30, 14 | 'socket_connect_timeout': 30, 15 | 'retry_on_timeout': True, 16 | 'encoding': REDIS_ENCODING, 17 | } 18 | 19 | SCHEDULER_QUEUE_KEY = '%(spider)s:requests' 20 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' 21 | SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter' 22 | SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter' 23 | 24 | START_URLS_KEY = '%(name)s:start_urls' 25 | START_URLS_AS_SET = False 26 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/js/global.js: -------------------------------------------------------------------------------- 1 | 2 | $(document).ready(function(){ 3 | 4 | // 去除虚线框(会影响效率) 5 | $("a,input:checkbox,input:radio,button,input:button").live('focus',function(){$(this).blur();}); 6 | 7 | }); 8 | 9 | 10 | function hideElement(currentElement, targetElement) { 11 | if (!$.isArray(targetElement)) { 12 | targetElement = [ targetElement ]; 13 | } 14 | $(document).on("click.hideElement", function(e) { 15 | var len = 0, $target = $(e.target); 16 | for (var i = 0, length = targetElement.length; i < length; i++) { 17 | $.each(targetElement[i], function(j, n) { 18 | if ($target.is($(n)) || $.contains($(n)[0], $target[0])) { 19 | len++; 20 | } 21 | }); 22 | } 23 | if ($.contains(currentElement[0], $target[0])) { 24 | len = 1; 25 | } 26 | if (len == 0) { 27 | currentElement.hide(); 28 | } 29 | }); 30 | }; -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/js/global.js: -------------------------------------------------------------------------------- 1 | 2 | $(document).ready(function(){ 3 | 4 | // 去除虚线框(会影响效率) 5 | $("a,input:checkbox,input:radio,button,input:button").live('focus',function(){$(this).blur();}); 6 | 7 | }); 8 | 9 | 10 | function hideElement(currentElement, targetElement) { 11 | if (!$.isArray(targetElement)) { 12 | targetElement = [ targetElement ]; 13 | } 14 | $(document).on("click.hideElement", function(e) { 15 | var len = 0, $target = $(e.target); 16 | for (var i = 0, length = targetElement.length; i < length; i++) { 17 | $.each(targetElement[i], function(j, n) { 18 | if ($target.is($(n)) || $.contains($(n)[0], $target[0])) { 19 | len++; 20 | } 21 | }); 22 | } 23 | if ($.contains(currentElement[0], $target[0])) { 24 | len = 1; 25 | } 26 | if (len == 0) { 27 | currentElement.hide(); 28 | } 29 | }); 30 | }; -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/project.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | ArticleSpider/__init__.py 3 | ArticleSpider/items.py 4 | ArticleSpider/middlewares.py 5 | ArticleSpider/pipelines.py 6 | ArticleSpider/settings.py 7 | ArticleSpider/models/__init__.py 8 | ArticleSpider/models/es_types.py 9 | ArticleSpider/spiders/__init__.py 10 | ArticleSpider/spiders/jobbole.py 11 | ArticleSpider/spiders/lagou.py 12 | ArticleSpider/spiders/zhihu.py 13 | ArticleSpider/utils/__init__.py 14 | ArticleSpider/utils/bloomfilter.py 15 | ArticleSpider/utils/common.py 16 | 
ArticleSpider/utils/zhihu_login_requests.py 17 | project.egg-info/PKG-INFO 18 | project.egg-info/SOURCES.txt 19 | project.egg-info/dependency_links.txt 20 | project.egg-info/entry_points.txt 21 | project.egg-info/top_level.txt 22 | tools/__init__.py 23 | tools/crawl_xici_ip.py 24 | tools/selenium_spider.py 25 | tools/yundama_requests.py -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "LcvSearch.settings") 7 | try: 8 | from django.core.management import execute_from_command_line 9 | except ImportError: 10 | # The above import may fail for some other reason. Ensure that the 11 | # issue is really that Django is missing to avoid masking other 12 | # exceptions on Python 2. 13 | try: 14 | import django 15 | except ImportError: 16 | raise ImportError( 17 | "Couldn't import Django. Are you sure it's installed and " 18 | "available on your PYTHONPATH environment variable? Did you " 19 | "forget to activate a virtual environment?" 20 | ) 21 | raise 22 | execute_from_command_line(sys.argv) 23 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/js/common.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by yli on 2017/4/21. 3 | */ 4 | 5 | var searchArr; 6 | //定义一个search的,判断浏览器有无数据存储(搜索历史) 7 | if(localStorage.search){ 8 | //如果有,转换成 数组的形式存放到searchArr的数组里(localStorage以字符串的形式存储,所以要把它转换成数组的形式) 9 | searchArr= localStorage.search.split(",") 10 | }else{ 11 | //如果没有,则定义searchArr为一个空的数组 12 | searchArr = []; 13 | } 14 | //把存储的数据显示出来作为搜索历史 15 | MapSearchArr(); 16 | 17 | 18 | $("#btn").on("click", function(){ 19 | var val = $("#inp").val(); 20 | //点击搜索按钮时,去重 21 | KillRepeat(val); 22 | //去重后把数组存储到浏览器localStorage 23 | localStorage.search = searchArr; 24 | //然后再把搜索内容显示出来 25 | MapSearchArr(); 26 | }); 27 | 28 | 29 | function MapSearchArr(){ 30 | var tmpHtml = ""; 31 | for (var i=0;i " 33 | } 34 | $("#keyname").html(tmpHtml); 35 | } 36 | //去重 37 | function KillRepeat(val){ 38 | var kill = 0; 39 | for (var i=0;i " 33 | } 34 | $("#keyname").html(tmpHtml); 35 | } 36 | //去重 37 | function KillRepeat(val){ 38 | var kill = 0; 39 | for (var i=0;i 2 | 3 | 4 | 5 | 6 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 25 | 26 | 27 | 29 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | from datetime import datetime 5 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 6 | analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 7 | 8 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer 9 | 10 | from elasticsearch_dsl.connections import connections 11 | connections.create_connection(hosts=["localhost"]) 12 | 13 | class CustomAnalyzer(_CustomAnalyzer): 14 | def get_analysis_definition(self): 15 | return {} 16 | 17 | 18 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"]) 19 | 20 | class ArticleType(DocType): 21 | #伯乐在线文章类型 22 | suggest = Completion(analyzer=ik_analyzer) 23 | title = Text(analyzer="ik_max_word") 24 | create_date = Date() 25 | url = Keyword() 26 | url_object_id = 
Keyword() 27 | front_image_url = Keyword() 28 | front_image_path = Keyword() 29 | praise_nums = Integer() 30 | comment_nums = Integer() 31 | fav_nums = Integer() 32 | tags = Text(analyzer="ik_max_word") 33 | content = Text(analyzer="ik_max_word") 34 | 35 | class Meta: 36 | index = "jobbole" 37 | doc_type = "article" 38 | 39 | if __name__ == "__main__": 40 | ArticleType.init() 41 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/models/es_types.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | from datetime import datetime 5 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 6 | analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 7 | 8 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer 9 | 10 | from elasticsearch_dsl.connections import connections 11 | connections.create_connection(hosts=["localhost"]) 12 | 13 | class CustomAnalyzer(_CustomAnalyzer): 14 | def get_analysis_definition(self): 15 | return {} 16 | 17 | 18 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"]) 19 | 20 | class ArticleType(DocType): 21 | #伯乐在线文章类型 22 | suggest = Completion(analyzer=ik_analyzer) 23 | title = Text(analyzer="ik_max_word") 24 | create_date = Date() 25 | url = Keyword() 26 | url_object_id = Keyword() 27 | front_image_url = Keyword() 28 | front_image_path = Keyword() 29 | praise_nums = Integer() 30 | comment_nums = Integer() 31 | fav_nums = Integer() 32 | tags = Text(analyzer="ik_max_word") 33 | content = Text(analyzer="ik_max_word") 34 | 35 | class Meta: 36 | index = "jobbole" 37 | doc_type = "article" 38 | 39 | if __name__ == "__main__": 40 | ArticleType.init() 41 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/models/es_types.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | from datetime import datetime 5 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 6 | analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer 7 | 8 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer 9 | 10 | from elasticsearch_dsl.connections import connections 11 | connections.create_connection(hosts=["localhost"]) 12 | 13 | class CustomAnalyzer(_CustomAnalyzer): 14 | def get_analysis_definition(self): 15 | return {} 16 | 17 | 18 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"]) 19 | 20 | class ArticleType(DocType): 21 | #伯乐在线文章类型 22 | suggest = Completion(analyzer=ik_analyzer) 23 | title = Text(analyzer="ik_max_word") 24 | create_date = Date() 25 | url = Keyword() 26 | url_object_id = Keyword() 27 | front_image_url = Keyword() 28 | front_image_path = Keyword() 29 | praise_nums = Integer() 30 | comment_nums = Integer() 31 | fav_nums = Integer() 32 | tags = Text(analyzer="ik_max_word") 33 | content = Text(analyzer="ik_max_word") 34 | 35 | class Meta: 36 | index = "jobbole" 37 | doc_type = "article" 38 | 39 | if __name__ == "__main__": 40 | ArticleType.init() 41 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/ghostdriver.log: -------------------------------------------------------------------------------- 1 | [INFO - 2017-04-04T03:05:37.523Z] GhostDriver - Main - running on port 53940 2 | [INFO - 
2017-04-04T03:05:40.868Z] Session [95d6e1e0-18e3-11e7-883b-4735e6270c64] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true}
3 | [INFO - 2017-04-04T03:05:40.868Z] Session [95d6e1e0-18e3-11e7-883b-4735e6270c64] - page.customHeaders: - {}
4 | [INFO - 2017-04-04T03:05:40.869Z] Session [95d6e1e0-18e3-11e7-883b-4735e6270c64] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}}
5 | [INFO - 2017-04-04T03:05:40.869Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 95d6e1e0-18e3-11e7-883b-4735e6270c64
6 |
--------------------------------------------------------------------------------
/s0vkaq/ScrapyRedisTest/ScrapyRedisTest/spiders/jobbole.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'bobby'
3 |
4 | from scrapy.http import Request
5 | from urllib import parse
6 |
7 | from scrapy_redis.spiders import RedisSpider
8 |
9 | class JobboleSpider(RedisSpider):
10 |     name = 'jobbole'
11 |     allowed_domains = ["blog.jobbole.com"]
12 |     redis_key = 'jobbole:start_urls'
13 |
14 |     # collect all of Jobbole's 404 URLs as well as the number of 404 pages
15 |     handle_httpstatus_list = [404]
16 |
17 |     fail_urls = []  # URLs that came back as 404
18 |     def parse(self, response):
19 |         """
20 |         1. Extract the article URLs from the list page and hand them to Scrapy to download and parse
21 |         2. Extract the URL of the next list page and hand it to Scrapy to download; the response comes back to parse
22 |         """
23 |         # parse every article URL on the list page and hand it to Scrapy to download and parse
24 |         if response.status == 404:
25 |             self.fail_urls.append(response.url)
26 |             self.crawler.stats.inc_value("failed_url")
27 |
28 |         post_nodes = response.css("#archive .floated-thumb .post-thumb a")
29 |         for post_node in post_nodes:
30 |             image_url = post_node.css("img::attr(src)").extract_first("")
31 |             post_url = post_node.css("::attr(href)").extract_first("")
32 |             yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url": image_url},
33 |                           callback=self.parse_detail)
34 |
35 |         # extract the next page and hand it to Scrapy to download
36 |         next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
37 |         if next_url:
38 |             yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
39 |
40 |     def parse_detail(self, response):
41 |         pass
--------------------------------------------------------------------------------
/s0vkaq/LcvSearch-Front/css/index.css:
--------------------------------------------------------------------------------
1 | @charset "utf-8";
2 | html{*overflow:auto;}
3 | #main{width:730px;margin:75px auto 0;}
4 | #main h1.title{width:600px;}
5 | #bd{margin-bottom:20px;}
6 | .logo.large{margin:0px auto 10px auto;width:342px;height:144px;background: url(../img/logo.png) no-repeat center center;}
7 |
8 | /*nav样式*/
9 | .nav{margin-bottom:10px;}
10 | .searchList{float:left;padding-left:5px;}
11 |
.searchList .searchItem{float:left;margin-right:15px;font-size:14px;padding:0 0 2px 2px;cursor:pointer;height:18px;} 12 | .searchList .searchItem.current{color:#0080cc;border-bottom:2px solid #9cc813;font-weight:bold;} 13 | 14 | /*input搜索区域*/ 15 | .inputArea{position:relative;margin-bottom:65px;} 16 | .inputArea .searchInput{border:1px solid #bfbfbf;padding:0 15px;outline:none;height:40px;*height:39px;*line-height:40px;width:520px; background:url(../img/inputbg.png);font-size:14px;} 17 | .inputArea .searchButton{position:absolute;left:550px;*left:552px;*top:1px;width:106px;height:42px;*height:41px;background:url(../img/seachbtn.png) no-repeat;border:none;cursor:pointer;} 18 | /*高级搜索*/ 19 | .inputArea .advanced{position:absolute;font-size:14px;left:674px;top:12px;text-decoration:underline;} 20 | /*联想下拉区域*/ 21 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 22 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 23 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 24 | 25 | /*搜索历史区域*/ 26 | .historyArea{margin:0 auto;width:485px;} 27 | .historyArea .history {margin-bottom:5px;} 28 | .historyArea .history label{font-weight:bold;} 29 | .historyArea .history a{margin-right:12px;} 30 | 31 | /*版权信息*/ 32 | .foot{position:absolute;bottom:0px;width:100%;} 33 | .foot .wrap{margin:0 auto;} 34 | .foot .copyright{position:relative;top:-35px;color:#ababab;text-align:center;} -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/tools/selenium_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | from selenium import webdriver 5 | from scrapy.selector import Selector 6 | 7 | # browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe") 8 | 9 | # browser.get("https://www.zhihu.com/#signin") 10 | # 11 | # browser.find_element_by_css_selector(".view-signin input[name='account']").send_keys("18782902568") 12 | # browser.find_element_by_css_selector(".view-signin input[name='password']").send_keys("admin125") 13 | # 14 | # browser.find_element_by_css_selector(".view-signin button.sign-button").click() 15 | #selenium 完成微博模拟登录 16 | 17 | # browser.get("https://www.oschina.net/blog") 18 | # import time 19 | # time.sleep(5) 20 | # browser.find_element_by_css_selector("#loginname").send_keys("liyao198705@sina.com") 21 | # browser.find_element_by_css_selector(".info_list.password input[node-type='password']").send_keys("da_ge_da") 22 | # browser.find_element_by_css_selector(".info_list.login_btn a[node-type='submitBtn']").click() 23 | 24 | # for i in range(3): 25 | # browser.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") 26 | # time.sleep(3) 27 | # t_selector = Selector(text=browser.page_source) 28 | # print (t_selector.css(".tm-promo-price .tm-price::text").extract()) 29 | 30 | 31 | #设置chromedriver不加载图片 32 | # chrome_opt = webdriver.ChromeOptions() 33 | # prefs = {"profile.managed_default_content_settings.images":2} 34 | # chrome_opt.add_experimental_option("prefs", prefs) 35 | 36 | 37 | #phantomjs, 无界面的浏览器, 多进程情况下phantomjs性能会下降很严重 38 | 39 | browser = webdriver.PhantomJS(executable_path="E:/home/phantomjs-2.1.1-windows/bin/phantomjs.exe") 40 | 
browser.get("https://detail.tmall.com/item.htm?spm=a230r.1.14.3.yYBVG6&id=538286972599&cm_id=140105335569ed55e27b&abbucket=15&sku_properties=10004:709990523;5919063:6536025") 41 | 42 | print (browser.page_source) 43 | browser.quit() -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ScrapyredistestSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/spiders/lagou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | import scrapy 4 | from scrapy.linkextractors import LinkExtractor 5 | from scrapy.spiders import CrawlSpider, Rule 6 | 7 | from items import LagouJobItemLoader, LagouJobItem 8 | from ArticleSpider.utils.common import get_md5 9 | 10 | class LagouSpider(CrawlSpider): 11 | name = 'lagou' 12 | allowed_domains = ['www.lagou.com'] 13 | start_urls = ['https://www.lagou.com'] 14 | 15 | rules = ( 16 | Rule(LinkExtractor(allow=("zhaopin/.*",)), follow=True), 17 | Rule(LinkExtractor(allow=("gongsi/j\d+.html",)), follow=True), 18 | Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True), 19 | ) 20 | # 21 | # def parse_start_url(self, response): 22 | # return [] 23 | # 24 | # def process_results(self, response, results): 25 | # return results 26 | 27 | def parse_job(self, response): 28 | #解析拉勾网的职位 29 | item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response) 30 | item_loader.add_css("title", ".job-name::attr(title)") 31 | item_loader.add_value("url", response.url) 32 | item_loader.add_value("url_object_id", get_md5(response.url)) 33 | item_loader.add_css("salary", ".job_request .salary::text") 34 | item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()") 35 | item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()") 36 | item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()") 37 | item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()") 38 | 39 | item_loader.add_css("tags", '.position-label li::text') 40 | item_loader.add_css("publish_time", ".publish_time::text") 41 | item_loader.add_css("job_advantage", ".job-advantage p::text") 42 | item_loader.add_css("job_desc", ".job_bt div") 43 | item_loader.add_css("job_addr", ".work_addr") 44 | item_loader.add_css("company_name", "#job_company dt a img::attr(alt)") 45 | item_loader.add_css("company_url", "#job_company dt a::attr(href)") 46 | item_loader.add_value("crawl_time", datetime.now()) 47 | 48 | job_item = item_loader.load_item() 49 | 50 | return job_item 51 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/spiders/lagou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | import scrapy 4 | from scrapy.linkextractors import LinkExtractor 5 | from scrapy.spiders import CrawlSpider, Rule 6 | 7 | from items import LagouJobItemLoader, LagouJobItem 8 | from ArticleSpider.utils.common import get_md5 9 | 10 | class LagouSpider(CrawlSpider): 11 | name = 'lagou' 12 | allowed_domains = ['www.lagou.com'] 13 | start_urls = ['https://www.lagou.com'] 14 | 15 | rules = ( 16 | Rule(LinkExtractor(allow=("zhaopin/.*",)), follow=True), 17 | Rule(LinkExtractor(allow=("gongsi/j\d+.html",)), follow=True), 18 | Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True), 19 | ) 20 | # 21 | # def parse_start_url(self, response): 22 | # return [] 23 | # 24 | # def process_results(self, response, results): 25 | # return results 
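    # For reference, a rough sketch of the item definitions this spider imports (the real
    # ones live in the project's items.py, which is not included in this dump; the field
    # names are inferred from the add_css/add_xpath/add_value calls in parse_job below,
    # and TakeFirst() as the default output processor is only an assumption):
    #
    #   from scrapy import Field, Item
    #   from scrapy.loader import ItemLoader
    #   from scrapy.loader.processors import TakeFirst
    #
    #   class LagouJobItemLoader(ItemLoader):
    #       default_output_processor = TakeFirst()
    #
    #   class LagouJobItem(Item):
    #       title = Field()
    #       url = Field()
    #       url_object_id = Field()
    #       salary = Field()
    #       job_city = Field()
    #       work_years = Field()
    #       degree_need = Field()
    #       job_type = Field()
    #       tags = Field()
    #       publish_time = Field()
    #       job_advantage = Field()
    #       job_desc = Field()
    #       job_addr = Field()
    #       company_name = Field()
    #       company_url = Field()
    #       crawl_time = Field()
    #
    # Note also that the bare `from items import ...` at the top of this file only resolves
    # if the package directory itself is on sys.path; the usual project-relative form would
    # be `from ArticleSpider.items import LagouJobItemLoader, LagouJobItem`.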
26 | 27 | def parse_job(self, response): 28 | #解析拉勾网的职位 29 | item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response) 30 | item_loader.add_css("title", ".job-name::attr(title)") 31 | item_loader.add_value("url", response.url) 32 | item_loader.add_value("url_object_id", get_md5(response.url)) 33 | item_loader.add_css("salary", ".job_request .salary::text") 34 | item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()") 35 | item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()") 36 | item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()") 37 | item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()") 38 | 39 | item_loader.add_css("tags", '.position-label li::text') 40 | item_loader.add_css("publish_time", ".publish_time::text") 41 | item_loader.add_css("job_advantage", ".job-advantage p::text") 42 | item_loader.add_css("job_desc", ".job_bt div") 43 | item_loader.add_css("job_addr", ".work_addr") 44 | item_loader.add_css("company_name", "#job_company dt a img::attr(alt)") 45 | item_loader.add_css("company_url", "#job_company dt a::attr(href)") 46 | item_loader.add_value("crawl_time", datetime.now()) 47 | 48 | job_item = item_loader.load_item() 49 | 50 | return job_item 51 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.misc import load_object 2 | from scrapy.utils.serialize import ScrapyJSONEncoder 3 | from twisted.internet.threads import deferToThread 4 | 5 | from . import connection, defaults 6 | 7 | 8 | default_serialize = ScrapyJSONEncoder().encode 9 | 10 | 11 | class RedisPipeline(object): 12 | """Pushes serialized item into a redis list/queue 13 | 14 | Settings 15 | -------- 16 | REDIS_ITEMS_KEY : str 17 | Redis key where to store items. 18 | REDIS_ITEMS_SERIALIZER : str 19 | Object path to serializer function. 20 | 21 | """ 22 | 23 | def __init__(self, server, 24 | key=defaults.PIPELINE_KEY, 25 | serialize_func=default_serialize): 26 | """Initialize pipeline. 27 | 28 | Parameters 29 | ---------- 30 | server : StrictRedis 31 | Redis client instance. 32 | key : str 33 | Redis key where to store items. 34 | serialize_func : callable 35 | Items serializer function. 36 | 37 | """ 38 | self.server = server 39 | self.key = key 40 | self.serialize = serialize_func 41 | 42 | @classmethod 43 | def from_settings(cls, settings): 44 | params = { 45 | 'server': connection.from_settings(settings), 46 | } 47 | if settings.get('REDIS_ITEMS_KEY'): 48 | params['key'] = settings['REDIS_ITEMS_KEY'] 49 | if settings.get('REDIS_ITEMS_SERIALIZER'): 50 | params['serialize_func'] = load_object( 51 | settings['REDIS_ITEMS_SERIALIZER'] 52 | ) 53 | 54 | return cls(**params) 55 | 56 | @classmethod 57 | def from_crawler(cls, crawler): 58 | return cls.from_settings(crawler.settings) 59 | 60 | def process_item(self, item, spider): 61 | return deferToThread(self._process_item, item, spider) 62 | 63 | def _process_item(self, item, spider): 64 | key = self.item_key(item, spider) 65 | data = self.serialize(item) 66 | self.server.rpush(key, data) 67 | return item 68 | 69 | def item_key(self, item, spider): 70 | """Returns redis key based on given spider. 71 | 72 | Override this function to use a different key depending on the item 73 | and/or spider. 
74 | 75 | """ 76 | return self.key % {'spider': spider.name} 77 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/connection.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | from scrapy.utils.misc import load_object 4 | 5 | from . import defaults 6 | 7 | 8 | # Shortcut maps 'setting name' -> 'parmater name'. 9 | SETTINGS_PARAMS_MAP = { 10 | 'REDIS_URL': 'url', 11 | 'REDIS_HOST': 'host', 12 | 'REDIS_PORT': 'port', 13 | 'REDIS_ENCODING': 'encoding', 14 | } 15 | 16 | 17 | def get_redis_from_settings(settings): 18 | """Returns a redis client instance from given Scrapy settings object. 19 | 20 | This function uses ``get_client`` to instantiate the client and uses 21 | ``defaults.REDIS_PARAMS`` global as defaults values for the parameters. You 22 | can override them using the ``REDIS_PARAMS`` setting. 23 | 24 | Parameters 25 | ---------- 26 | settings : Settings 27 | A scrapy settings object. See the supported settings below. 28 | 29 | Returns 30 | ------- 31 | server 32 | Redis client instance. 33 | 34 | Other Parameters 35 | ---------------- 36 | REDIS_URL : str, optional 37 | Server connection URL. 38 | REDIS_HOST : str, optional 39 | Server host. 40 | REDIS_PORT : str, optional 41 | Server port. 42 | REDIS_ENCODING : str, optional 43 | Data encoding. 44 | REDIS_PARAMS : dict, optional 45 | Additional client parameters. 46 | 47 | """ 48 | params = defaults.REDIS_PARAMS.copy() 49 | params.update(settings.getdict('REDIS_PARAMS')) 50 | # XXX: Deprecate REDIS_* settings. 51 | for source, dest in SETTINGS_PARAMS_MAP.items(): 52 | val = settings.get(source) 53 | if val: 54 | params[dest] = val 55 | 56 | # Allow ``redis_cls`` to be a path to a class. 57 | if isinstance(params.get('redis_cls'), six.string_types): 58 | params['redis_cls'] = load_object(params['redis_cls']) 59 | 60 | return get_redis(**params) 61 | 62 | 63 | # Backwards compatible alias. 64 | from_settings = get_redis_from_settings 65 | 66 | 67 | def get_redis(**kwargs): 68 | """Returns a redis client instance. 69 | 70 | Parameters 71 | ---------- 72 | redis_cls : class, optional 73 | Defaults to ``redis.StrictRedis``. 74 | url : str, optional 75 | If given, ``redis_cls.from_url`` is used to instantiate the class. 76 | **kwargs 77 | Extra parameters to be passed to the ``redis_cls`` class. 78 | 79 | Returns 80 | ------- 81 | server 82 | Redis client instance. 
83 | 84 | """ 85 | redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS) 86 | url = kwargs.pop('url', None) 87 | if url: 88 | return redis_cls.from_url(url, **kwargs) 89 | else: 90 | return redis_cls(**kwargs) 91 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/utils/bloomfilter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import mmh3 3 | import redis 4 | import math 5 | import time 6 | 7 | 8 | class PyBloomFilter(): 9 | #内置100个随机种子 10 | SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372, 11 | 344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338, 12 | 465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53, 13 | 481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371, 14 | 63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518] 15 | 16 | #capacity是预先估计要去重的数量 17 | #error_rate表示错误率 18 | #conn表示redis的连接客户端 19 | #key表示在redis中的键的名字前缀 20 | def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'): 21 | self.m = math.ceil(capacity*math.log2(math.e)*math.log2(1/error_rate)) #需要的总bit位数 22 | self.k = math.ceil(math.log1p(2)*self.m/capacity) #需要最少的hash次数 23 | self.mem = math.ceil(self.m/8/1024/1024) #需要的多少M内存 24 | self.blocknum = math.ceil(self.mem/512) #需要多少个512M的内存块,value的第一个字符必须是ascii码,所有最多有256个内存块 25 | self.seeds = self.SEEDS[0:self.k] 26 | self.key = key 27 | self.N = 2**31-1 28 | self.redis = conn 29 | # print(self.mem) 30 | # print(self.k) 31 | 32 | def add(self, value): 33 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 34 | hashs = self.get_hashs(value) 35 | for hash in hashs: 36 | self.redis.setbit(name, hash, 1) 37 | 38 | def is_exist(self, value): 39 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 40 | hashs = self.get_hashs(value) 41 | exist = True 42 | for hash in hashs: 43 | exist = exist & self.redis.getbit(name, hash) 44 | return exist 45 | 46 | def get_hashs(self, value): 47 | hashs = list() 48 | for seed in self.seeds: 49 | hash = mmh3.hash(value, seed) 50 | if hash >= 0: 51 | hashs.append(hash) 52 | else: 53 | hashs.append(self.N - hash) 54 | return hashs 55 | 56 | 57 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0) 58 | conn = redis.StrictRedis(connection_pool=pool) 59 | 60 | if __name__ == "__main__": 61 | bf = PyBloomFilter(conn=conn) 62 | bf.add('www.jobbole.com') 63 | bf.add('www.zhihu.com') 64 | print(bf.is_exist('www.zhihu.com')) 65 | print(bf.is_exist('www.lagou.com')) 66 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/selenium_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from selenium import webdriver 4 | from scrapy.selector import Selector 5 | 6 | 7 | #知乎的模拟登录 8 | # browser = webdriver.Chrome(executable_path="E:/chromedriver.exe") #路径是chromedriver.exe的存放的位置 9 | # browser.get("https://www.zhihu.com/#signin") 10 | # browser.find_element_by_css_selector(".view-signin input[name='account']").send_keys("********") #帐号 11 | # browser.find_element_by_css_selector(".view-signin input[name='password']").send_keys("********") #密码 12 | # browser.find_element_by_id("captcha").send_keys(input('请输入验证码:')) 13 
| # browser.find_element_by_css_selector(".view-signin button.sign-button").click() #登录 14 | # browser.quit() 15 | # 16 | # 17 | # #可以用selenium得到js加载后的html,比如这样的话可以抓取到本来抓取的不到的一些字段(淘宝的交易量等等) 18 | # browser = webdriver.Chrome(executable_path="E:/chromedriver.exe") 19 | # browser.get("https://detail.tmall.com/item.htm?spm=a230r.1.14.3.yYBVG6&id=538286972599&cm_id=140105335569ed55e27b&abbucket=15&sku_properties=10004:709990523;5919063:6536025") 20 | # print(browser.page_source) #page_source就是js加载完的源代码 21 | # #browser.quit() 22 | # ''' 23 | # 如果是用selenium本身的选择器(python写的,比较慢),会很慢 24 | # 所以现在转换成scrapy中的selector(他是用c语言写的,很快) 25 | # 模版,也可以嵌入scrapy中 26 | # ''' 27 | # t_selector=Selector(text=browser.page_source) 28 | # print(t_selector.xpath('//*[@id="J_StrPriceModBox"]/dd/span/text()').extract()) 29 | 30 | 31 | # #selenium 完成微博模拟登录 32 | # browser = webdriver.Chrome(executable_path="E:/chromedriver.exe") 33 | # browser.get("http://weibo.com/") 34 | # import time 35 | # time.sleep(5) 36 | # browser.find_element_by_css_selector("#loginname").send_keys("******") 37 | # browser.find_element_by_css_selector(".info_list.password input[node-type='password']").send_keys("******") 38 | # browser.find_element_by_css_selector(".info_list.login_btn a[node-type='submitBtn']").click() 39 | # #下拉 40 | # for i in range(3): 41 | # '''三次下拉操作,这是javascript的知识''' 42 | # browser.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;") 43 | # time.sleep(3) 44 | 45 | 46 | #设置chromedriver不加载图片 47 | #是固定的模版 48 | # chrome_opt = webdriver.ChromeOptions() 49 | # prefs = {"profile.managed_default_content_settings.images":2} 50 | # chrome_opt.add_experimental_option("prefs", prefs) 51 | # browser = webdriver.Chrome(executable_path="E:/chromedriver.exe",chrome_options=chrome_opt) 52 | # browser.get("http://weibo.com/") 53 | 54 | 55 | #phantomjs, 无界面的浏览器, 多进程情况下phantomjs性能会下降很严重 56 | browser = webdriver.PhantomJS(executable_path="F:/迅雷下载/phantomjs-2.1.1-windows/bin/phantomjs.exe") 57 | browser.get("https://detail.tmall.com/item.htm?spm=a230r.1.14.3.yYBVG6&id=538286972599&cm_id=140105335569ed55e27b&abbucket=15&sku_properties=10004:709990523;5919063:6536025") 58 | print (browser.page_source) 59 | browser.quit() -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/css/style.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | /*css reset*/ 3 | html, body, div, span, object, iframe, h1, h2, h3, h4, h5, h6, p, blockquote, pre, a, abbr, acronym, address, code, del, dfn, em, img, q, dl, dt, dd, ol, ul, li, fieldset, form, label, legend, table, caption, tbody, tfoot, thead, tr, th, td , i{ 4 | margin:0; 5 | padding:0; 6 | border:0; 7 | font-weight:inherit; 8 | font-style:inherit; 9 | font-size:100%; 10 | font-family:inherit; 11 | vertical-align:baseline; 12 | } 13 | body {line-height:1.5;} 14 | table {border-collapse: collapse;border-spacing:0;} 15 | caption, th, td ,b,strong{text-align:left;font-weight:normal;} 16 | table, td, th {vertical-align:middle;} 17 | blockquote:before, blockquote:after, q:before, q:after {content:"";} 18 | blockquote, q {quotes:"" "";} 19 | a img {border:none;} 20 | em,cite{font-style:normal;} 21 | 22 | 23 | body { background:#fff; font: 12px/1.5 Tahoma,'宋体';color:#000;} 24 | h1, h2, h3, h4, h5, h6 {font-weight:normal;color:#111;} 25 | a {text-decoration:none;cursor:pointer;} 26 | dl, dt, dd, ol, ul, li{ list-style:none;} 27 | 28 | 
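/* Note on the starred rules in this stylesheet: the "*html .ue-clear" selector below only
   matches in IE6, and property names prefixed with "*" (e.g. *padding-top, *line-height in
   the pagination rules) are only parsed by IE7 and earlier; modern browsers ignore both hacks. */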
/*some common class*/ 29 | .left{float:left;} 30 | .right{float:right;} 31 | 32 | /*clear*/ 33 | .ue-clear:after{content: ".";display:block;height:0;clear:both;visibility:hidden;} 34 | .ue-clear{display:inline-block;} 35 | *html .ue-clear{height:1%;} 36 | .ue-clear{display:block;} 37 | 38 | a{color:#0080cc;} 39 | a:hover{color:#267A01;text-decoration:underline;} 40 | /*logo样式*/ 41 | .logo{width:160px;height:47px;padding:0 5px;background: url(../img/logo1.png) no-repeat center center #fff;} 42 | 43 | /*choose样式*/ 44 | .choose{float:left;margin-right:15px;white-space:nowrap;} 45 | .choose .text{float:left;padding-left:20px;*padding-left:16px;white-space:nowrap; vertical-align:text-bottom;} 46 | .choose input[type=radio],.choose input[type=checkbox]{position:relative;*top:-3px;float:left;margin-right:-16px;} 47 | 48 | /*==================================== 49 | 分页信息(表格依赖样式) 50 | ===================================*/ 51 | .pagination{font-size:14px;} 52 | .pagination a {text-decoration: none;border: solid 1px; } 53 | .pagination .pxofy{float:left;margin-left: 5px;height:25px;*padding-top:1px;} 54 | .pagination a, .pagination span {display: block;float: left;height:18px;line-height:18px;padding:0 6px;margin-right: 5px;font-family:Arial, Helvetica, sans-serif !important;} 55 | .pagination .current {cursor:default;border: solid 1px ;} 56 | .pagination .prev, .pagination .next{*line-height:22px;} 57 | 58 | /*分页样式*/ 59 | .pagination a{color: #032F54;border-color:#8EB2D2;} 60 | .pagination a:hover{color:#023054;border-color:#8EB2D2;background:#B8DFFB;} 61 | .pagination .current{color:#fff;border-color:#5c9bc4;background:#89B8D8;} 62 | .pagination .current.prev, .pagination .current.next{color:#B9B9B9;border-color:#D3D3D3;background:#fff;} 63 | .pagination .pxofy{color: #023054;} 64 | 65 | #foot{height:32px;line-height:32px; text-align:center;background:#f9f9f9;border-top:1px solid #e0e0e0;color:#ababab;} 66 | 67 | 68 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/css/style.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | /*css reset*/ 3 | html, body, div, span, object, iframe, h1, h2, h3, h4, h5, h6, p, blockquote, pre, a, abbr, acronym, address, code, del, dfn, em, img, q, dl, dt, dd, ol, ul, li, fieldset, form, label, legend, table, caption, tbody, tfoot, thead, tr, th, td , i{ 4 | margin:0; 5 | padding:0; 6 | border:0; 7 | font-weight:inherit; 8 | font-style:inherit; 9 | font-size:100%; 10 | font-family:inherit; 11 | vertical-align:baseline; 12 | } 13 | body {line-height:1.5;} 14 | table {border-collapse: collapse;border-spacing:0;} 15 | caption, th, td ,b,strong{text-align:left;font-weight:normal;} 16 | table, td, th {vertical-align:middle;} 17 | blockquote:before, blockquote:after, q:before, q:after {content:"";} 18 | blockquote, q {quotes:"" "";} 19 | a img {border:none;} 20 | em,cite{font-style:normal;} 21 | 22 | 23 | body { background:#fff; font: 12px/1.5 Tahoma,'宋体';color:#000;} 24 | h1, h2, h3, h4, h5, h6 {font-weight:normal;color:#111;} 25 | a {text-decoration:none;cursor:pointer;} 26 | dl, dt, dd, ol, ul, li{ list-style:none;} 27 | 28 | /*some common class*/ 29 | .left{float:left;} 30 | .right{float:right;} 31 | 32 | /*clear*/ 33 | .ue-clear:after{content: ".";display:block;height:0;clear:both;visibility:hidden;} 34 | .ue-clear{display:inline-block;} 35 | *html .ue-clear{height:1%;} 36 | .ue-clear{display:block;} 37 | 38 | a{color:#0080cc;} 39 | 
a:hover{color:#267A01;text-decoration:underline;} 40 | /*logo样式*/ 41 | .logo{width:160px;height:47px;padding:0 5px;background: url(../img/logo1.png) no-repeat center center #fff;} 42 | 43 | /*choose样式*/ 44 | .choose{float:left;margin-right:15px;white-space:nowrap;} 45 | .choose .text{float:left;padding-left:20px;*padding-left:16px;white-space:nowrap; vertical-align:text-bottom;} 46 | .choose input[type=radio],.choose input[type=checkbox]{position:relative;*top:-3px;float:left;margin-right:-16px;} 47 | 48 | /*==================================== 49 | 分页信息(表格依赖样式) 50 | ===================================*/ 51 | .pagination{font-size:14px;} 52 | .pagination a {text-decoration: none;border: solid 1px; } 53 | .pagination .pxofy{float:left;margin-left: 5px;height:25px;*padding-top:1px;} 54 | .pagination a, .pagination span {display: block;float: left;height:18px;line-height:18px;padding:0 6px;margin-right: 5px;font-family:Arial, Helvetica, sans-serif !important;} 55 | .pagination .current {cursor:default;border: solid 1px ;} 56 | .pagination .prev, .pagination .next{*line-height:22px;} 57 | 58 | /*分页样式*/ 59 | .pagination a{color: #032F54;border-color:#8EB2D2;} 60 | .pagination a:hover{color:#023054;border-color:#8EB2D2;background:#B8DFFB;} 61 | .pagination .current{color:#fff;border-color:#5c9bc4;background:#89B8D8;} 62 | .pagination .current.prev, .pagination .current.next{color:#B9B9B9;border-color:#D3D3D3;background:#fff;} 63 | .pagination .pxofy{color: #023054;} 64 | 65 | #foot{height:32px;line-height:32px; text-align:center;background:#f9f9f9;border-top:1px solid #e0e0e0;color:#ababab;} 66 | 67 | 68 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/css/advanced.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #hd{padding:20px 10px;} 4 | .logo{float:left;margin-right:30px; height:33px;} 5 | /*input搜索区域*/ 6 | .inputArea{float:left;position:relative;} 7 | .inputArea .searchInput{border:1px solid #bfbfbf;padding:0 15px;outline:none;height:35px;*line-height:38px;width:350px; background:url(../img/inputbg.png);font-size:14px;} 8 | .inputArea .searchButton{position:absolute;left:382px;top:0;*top:1px;*left:381px;width:106px;height:38px;background:url(../img/btn_min.png) no-repeat;border:none; cursor:pointer;} 9 | 10 | /*返回搜索*/ 11 | .inputArea .back{position:absolute;font-size:14px;left:500px;top:12px;width:60px;text-decoration:underline;} 12 | 13 | /*分界区域*/ 14 | .divsion{margin-bottom:24px;height:36px;background:#f9f9f9;border-bottom:1px solid #e0e0e0;} 15 | 16 | /*高级搜索区域*/ 17 | .subfield{border-left:4px solid #9cc813;font-size:14px;font-weight:bold;padding:2px 0 2px 20px;} 18 | .subfieldContent{padding-left:140px;padding-bottom:40px;} 19 | .subfieldContent .advanceItem{padding-left:350px;margin-bottom:15px;padding-top:8px;padding-bottom:3px;} 20 | .subfieldContent .advanceItem.keyWords{background:#f4f4f4;padding-top:18px;padding-bottom:3px;} 21 | .subfieldContent .advanceItem dd{float:left;margin-left:-320px;} 22 | .subfieldContent .advanceItem dd label{float:left;margin-right:40px;width:75px;font-weight:bold;} 23 | .subfieldContent .advanceItem dd .impInfo{ font-weight:bold;} 24 | .subfieldContent .advanceItem dd .tips{float:left;} 25 | .subfieldContent .advanceItem dd p, .subfieldContent .advanceItem dt p{margin-bottom:10px;height:26px;} 26 | .subfieldContent .advanceItem dt p input[type=text]{position:relative;top:-5px;line-height:26px;} 27 | 28 | 
.subfieldContent .advanceItem dt{float:left;width:100%;} 29 | .subfieldContent .advanceItem.keyWords dt input[type=text]{width:290px;height:26px;border:1px solid #bfbfbf;outline:none;} 30 | /*自定义*/ 31 | .subfieldContent .advanceItem.time{height:30px;} 32 | .subfieldContent .advanceItem .define{display:none;position:relative;*top:-3px;} 33 | .subfieldContent .advanceItem.time input[type=text]{width:80px;height:18px;line-height:18px;border:1px solid #bfbfbf;outline:none;} 34 | 35 | 36 | 37 | 38 | 39 | /*更多按钮*/ 40 | .more {float:left;} 41 | .more:hover{text-decoration:none;} 42 | .more .moreIcon{display:inline-block;position:relative;top:-1px;*top:-3px;left:2px;*left:-1px;width:9px;height:5px;background:url(../img/more.png);} 43 | .more.show .moreIcon{background:url(../img/down.png);top:-2px;} 44 | 45 | /*立即搜索样式*/ 46 | .subfieldContent .search{margin:45px 0 0 145px;width:130px;height:36px;background:url(../img/btnbg.png); font-weight:bold;border:none;border:1px solid #bfbfbf;line-height:36px;cursor:pointer;font-size:14px;} 47 | /*联想下拉区域*/ 48 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 49 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 50 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 51 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/css/advanced.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #hd{padding:20px 10px;} 4 | .logo{float:left;margin-right:30px; height:33px;} 5 | /*input搜索区域*/ 6 | .inputArea{float:left;position:relative;} 7 | .inputArea .searchInput{border:1px solid #bfbfbf;padding:0 15px;outline:none;height:35px;*line-height:38px;width:350px; background:url(../img/inputbg.png);font-size:14px;} 8 | .inputArea .searchButton{position:absolute;left:382px;top:0;*top:1px;*left:381px;width:106px;height:38px;background:url(../img/btn_min.png) no-repeat;border:none; cursor:pointer;} 9 | 10 | /*返回搜索*/ 11 | .inputArea .back{position:absolute;font-size:14px;left:500px;top:12px;width:60px;text-decoration:underline;} 12 | 13 | /*分界区域*/ 14 | .divsion{margin-bottom:24px;height:36px;background:#f9f9f9;border-bottom:1px solid #e0e0e0;} 15 | 16 | /*高级搜索区域*/ 17 | .subfield{border-left:4px solid #9cc813;font-size:14px;font-weight:bold;padding:2px 0 2px 20px;} 18 | .subfieldContent{padding-left:140px;padding-bottom:40px;} 19 | .subfieldContent .advanceItem{padding-left:350px;margin-bottom:15px;padding-top:8px;padding-bottom:3px;} 20 | .subfieldContent .advanceItem.keyWords{background:#f4f4f4;padding-top:18px;padding-bottom:3px;} 21 | .subfieldContent .advanceItem dd{float:left;margin-left:-320px;} 22 | .subfieldContent .advanceItem dd label{float:left;margin-right:40px;width:75px;font-weight:bold;} 23 | .subfieldContent .advanceItem dd .impInfo{ font-weight:bold;} 24 | .subfieldContent .advanceItem dd .tips{float:left;} 25 | .subfieldContent .advanceItem dd p, .subfieldContent .advanceItem dt p{margin-bottom:10px;height:26px;} 26 | .subfieldContent .advanceItem dt p input[type=text]{position:relative;top:-5px;line-height:26px;} 27 | 28 | .subfieldContent .advanceItem dt{float:left;width:100%;} 29 | .subfieldContent .advanceItem.keyWords dt input[type=text]{width:290px;height:26px;border:1px solid #bfbfbf;outline:none;} 30 | /*自定义*/ 31 | .subfieldContent .advanceItem.time{height:30px;} 32 
| .subfieldContent .advanceItem .define{display:none;position:relative;*top:-3px;} 33 | .subfieldContent .advanceItem.time input[type=text]{width:80px;height:18px;line-height:18px;border:1px solid #bfbfbf;outline:none;} 34 | 35 | 36 | 37 | 38 | 39 | /*更多按钮*/ 40 | .more {float:left;} 41 | .more:hover{text-decoration:none;} 42 | .more .moreIcon{display:inline-block;position:relative;top:-1px;*top:-3px;left:2px;*left:-1px;width:9px;height:5px;background:url(../img/more.png);} 43 | .more.show .moreIcon{background:url(../img/down.png);top:-2px;} 44 | 45 | /*立即搜索样式*/ 46 | .subfieldContent .search{margin:45px 0 0 145px;width:130px;height:36px;background:url(../img/btnbg.png); font-weight:bold;border:none;border:1px solid #bfbfbf;line-height:36px;cursor:pointer;font-size:14px;} 47 | /*联想下拉区域*/ 48 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 49 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 50 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 51 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/utils/bloomfilter.py: -------------------------------------------------------------------------------- 1 | import mmh3 2 | import BitVector 3 | import redis 4 | import math 5 | import time 6 | 7 | 8 | class BloomFilter(): 9 | #内置100个随机种子 10 | SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372, 11 | 344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338, 12 | 465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53, 13 | 481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371, 14 | 63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518] 15 | 16 | #capacity是预先估计要去重的数量 17 | #error_rate表示错误率 18 | #conn表示redis的连接客户端 19 | #key表示在redis中的键的名字前缀 20 | def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'): 21 | self.m = math.ceil(capacity*math.log2(math.e)*math.log2(1/error_rate)) #需要的总bit位数 22 | self.k = math.ceil(math.log1p(2)*self.m/capacity) #需要最少的hash次数 23 | self.mem = math.ceil(self.m/8/1024/1024) #需要的多少M内存 24 | self.blocknum = math.ceil(self.mem/512) #需要多少个512M的内存块,value的第一个字符必须是ascii码,所有最多有256个内存块 25 | self.seeds = self.SEEDS[0:self.k] 26 | self.key = key 27 | self.N = 2**31-1 28 | self.redis = conn 29 | if not self.redis: 30 | #默认如果没有redis连接,在内存中使用512M的内存块去重 31 | self.bitset = BitVector.BitVector(size=1<<32) 32 | print(self.mem) 33 | print(self.k) 34 | 35 | def add(self, value): 36 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 37 | hashs = self.get_hashs(value) 38 | for hash in hashs: 39 | if self.redis: 40 | self.redis.setbit(name, hash, 1) 41 | else: 42 | self.bitset[hash] = 1 43 | 44 | def is_exist(self, value): 45 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 46 | hashs = self.get_hashs(value) 47 | exist = True 48 | for hash in hashs: 49 | if self.redis: 50 | exist = exist & self.redis.getbit(name, hash) 51 | else: 52 | exist = exist & self.bitset[hash] 53 | return exist 54 | 55 | def get_hashs(self, value): 56 | hashs = list() 57 | for seed in self.seeds: 58 | hash = mmh3.hash(value, seed) 59 | if hash >= 0: 60 | hashs.append(hash) 61 | else: 62 | hashs.append(self.N - hash) 63 | return hashs 64 | 
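# Sizing notes (added for clarity): with the defaults capacity=1e9 and error_rate=1e-8,
# m = capacity * log2(e) * log2(1/error_rate) works out to roughly 3.8e10 bits (about 4.5 GB),
# which is why the filter is split across 9 Redis bitmaps: a Redis string (and therefore
# SETBIT) tops out at 512 MB. Also note that math.log1p(2) is ln(3) ≈ 1.10, not ln(2) ≈ 0.69;
# the textbook optimum is k = (m / capacity) * ln(2) ≈ 27 hashes here, whereas this code ends
# up with about 43. The filter still works (no false negatives), it just issues more
# SETBIT/GETBIT calls per value than strictly necessary.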
65 | 66 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0) 67 | conn = redis.StrictRedis(connection_pool=pool) 68 | 69 | start = time.time() 70 | bf = BloomFilter(conn=conn) 71 | bf.add('test') 72 | bf.add('fsest1') 73 | print(bf.is_exist('qest')) 74 | print(bf.is_exist('testdsad')) 75 | end = time.time() 76 | print(end-start) -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/utils/zhihu_login_requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | import requests 5 | try: 6 | import cookielib 7 | except: 8 | import http.cookiejar as cookielib 9 | 10 | import re 11 | 12 | session = requests.session() 13 | session.cookies = cookielib.LWPCookieJar(filename="cookies.txt") 14 | try: 15 | session.cookies.load(ignore_discard=True) 16 | except: 17 | print ("cookie未能加载") 18 | 19 | agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" 20 | header = { 21 | "HOST":"www.zhihu.com", 22 | "Referer": "https://www.zhizhu.com", 23 | 'User-Agent': agent 24 | } 25 | 26 | def is_login(): 27 | #通过个人中心页面返回状态码来判断是否为登录状态 28 | inbox_url = "https://www.zhihu.com/question/56250357/answer/148534773" 29 | response = session.get(inbox_url, headers=header, allow_redirects=False) 30 | if response.status_code != 200: 31 | return False 32 | else: 33 | return True 34 | 35 | def get_xsrf(): 36 | #获取xsrf code 37 | response = session.get("https://www.zhihu.com", headers=header) 38 | match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text) 39 | if match_obj: 40 | return (match_obj.group(1)) 41 | else: 42 | return "" 43 | 44 | 45 | def get_index(): 46 | response = session.get("https://www.zhihu.com", headers=header) 47 | with open("index_page.html", "wb") as f: 48 | f.write(response.text.encode("utf-8")) 49 | print ("ok") 50 | 51 | def get_captcha(): 52 | import time 53 | t = str(int(time.time()*1000)) 54 | captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t) 55 | t = session.get(captcha_url, headers=header) 56 | with open("captcha.jpg","wb") as f: 57 | f.write(t.content) 58 | f.close() 59 | 60 | from PIL import Image 61 | try: 62 | im = Image.open('captcha.jpg') 63 | im.show() 64 | im.close() 65 | except: 66 | pass 67 | 68 | captcha = input("输入验证码\n>") 69 | return captcha 70 | 71 | def zhihu_login(account, password): 72 | #知乎登录 73 | if re.match("^1\d{10}",account): 74 | print ("手机号码登录") 75 | post_url = "https://www.zhihu.com/login/phone_num" 76 | post_data = { 77 | "_xsrf": get_xsrf(), 78 | "phone_num": account, 79 | "password": password, 80 | "captcha":get_captcha() 81 | } 82 | else: 83 | if "@" in account: 84 | #判断用户名是否为邮箱 85 | print("邮箱方式登录") 86 | post_url = "https://www.zhihu.com/login/email" 87 | post_data = { 88 | "_xsrf": get_xsrf(), 89 | "email": account, 90 | "password": password 91 | } 92 | 93 | response_text = session.post(post_url, data=post_data, headers=header) 94 | session.cookies.save() 95 | 96 | zhihu_login("18782902568", "admin123") 97 | # get_index() 98 | is_login() 99 | 100 | # get_captcha() -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/utils/bloomfilter.py: -------------------------------------------------------------------------------- 1 | import mmh3 2 | import BitVector 3 | import redis 4 | import math 5 | import time 6 | 7 | 8 | class BloomFilter(): 9 | #内置100个随机种子 10 | SEEDS = [543, 
460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372, 11 | 344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338, 12 | 465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53, 13 | 481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371, 14 | 63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518] 15 | 16 | #capacity是预先估计要去重的数量 17 | #error_rate表示错误率 18 | #conn表示redis的连接客户端 19 | #key表示在redis中的键的名字前缀 20 | def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'): 21 | self.m = math.ceil(capacity*math.log2(math.e)*math.log2(1/error_rate)) #需要的总bit位数 22 | self.k = math.ceil(math.log1p(2)*self.m/capacity) #需要最少的hash次数 23 | self.mem = math.ceil(self.m/8/1024/1024) #需要的多少M内存 24 | self.blocknum = math.ceil(self.mem/512) #需要多少个512M的内存块,value的第一个字符必须是ascii码,所有最多有256个内存块 25 | self.seeds = self.SEEDS[0:self.k] 26 | self.key = key 27 | self.N = 2**31-1 28 | self.redis = conn 29 | if not self.redis: 30 | #默认如果没有redis连接,在内存中使用512M的内存块去重 31 | self.bitset = BitVector.BitVector(size=1<<32) 32 | print(self.mem) 33 | print(self.k) 34 | 35 | def add(self, value): 36 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 37 | hashs = self.get_hashs(value) 38 | for hash in hashs: 39 | if self.redis: 40 | self.redis.setbit(name, hash, 1) 41 | else: 42 | self.bitset[hash] = 1 43 | 44 | def is_exist(self, value): 45 | name = self.key + "_" + str(ord(value[0])%self.blocknum) 46 | hashs = self.get_hashs(value) 47 | exist = True 48 | for hash in hashs: 49 | if self.redis: 50 | exist = exist & self.redis.getbit(name, hash) 51 | else: 52 | exist = exist & self.bitset[hash] 53 | return exist 54 | 55 | def get_hashs(self, value): 56 | hashs = list() 57 | for seed in self.seeds: 58 | hash = mmh3.hash(value, seed) 59 | if hash >= 0: 60 | hashs.append(hash) 61 | else: 62 | hashs.append(self.N - hash) 63 | return hashs 64 | 65 | 66 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0) 67 | conn = redis.StrictRedis(connection_pool=pool) 68 | 69 | start = time.time() 70 | bf = BloomFilter(conn=conn) 71 | bf.add('test') 72 | bf.add('fsest1') 73 | print(bf.is_exist('qest')) 74 | print(bf.is_exist('testdsad')) 75 | end = time.time() 76 | print(end-start) -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/utils/zhihu_login_requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | import requests 5 | try: 6 | import cookielib 7 | except: 8 | import http.cookiejar as cookielib 9 | 10 | import re 11 | 12 | session = requests.session() 13 | session.cookies = cookielib.LWPCookieJar(filename="cookies.txt") 14 | try: 15 | session.cookies.load(ignore_discard=True) 16 | except: 17 | print ("cookie未能加载") 18 | 19 | agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" 20 | header = { 21 | "HOST":"www.zhihu.com", 22 | "Referer": "https://www.zhizhu.com", 23 | 'User-Agent': agent 24 | } 25 | 26 | def is_login(): 27 | #通过个人中心页面返回状态码来判断是否为登录状态 28 | inbox_url = "https://www.zhihu.com/question/56250357/answer/148534773" 29 | response = session.get(inbox_url, headers=header, allow_redirects=False) 30 | if response.status_code != 200: 31 | return False 32 | else: 33 | return True 34 | 35 | def 
get_xsrf(): 36 | #获取xsrf code 37 | response = session.get("https://www.zhihu.com", headers=header) 38 | match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text) 39 | if match_obj: 40 | return (match_obj.group(1)) 41 | else: 42 | return "" 43 | 44 | 45 | def get_index(): 46 | response = session.get("https://www.zhihu.com", headers=header) 47 | with open("index_page.html", "wb") as f: 48 | f.write(response.text.encode("utf-8")) 49 | print ("ok") 50 | 51 | def get_captcha(): 52 | import time 53 | t = str(int(time.time()*1000)) 54 | captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t) 55 | t = session.get(captcha_url, headers=header) 56 | with open("captcha.jpg","wb") as f: 57 | f.write(t.content) 58 | f.close() 59 | 60 | from PIL import Image 61 | try: 62 | im = Image.open('captcha.jpg') 63 | im.show() 64 | im.close() 65 | except: 66 | pass 67 | 68 | captcha = input("输入验证码\n>") 69 | return captcha 70 | 71 | def zhihu_login(account, password): 72 | #知乎登录 73 | if re.match("^1\d{10}",account): 74 | print ("手机号码登录") 75 | post_url = "https://www.zhihu.com/login/phone_num" 76 | post_data = { 77 | "_xsrf": get_xsrf(), 78 | "phone_num": account, 79 | "password": password, 80 | "captcha":get_captcha() 81 | } 82 | else: 83 | if "@" in account: 84 | #判断用户名是否为邮箱 85 | print("邮箱方式登录") 86 | post_url = "https://www.zhihu.com/login/email" 87 | post_data = { 88 | "_xsrf": get_xsrf(), 89 | "email": account, 90 | "password": password 91 | } 92 | 93 | response_text = session.post(post_url, data=post_data, headers=header) 94 | session.cookies.save() 95 | 96 | zhihu_login("18782902568", "admin123") 97 | # get_index() 98 | is_login() 99 | 100 | # get_captcha() -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/crawl_xici_ip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | import requests 4 | from scrapy.selector import Selector 5 | import MySQLdb 6 | 7 | conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="root", db="article_spider", charset="utf8") 8 | cursor = conn.cursor() 9 | 10 | 11 | def crawl_ips(): 12 | #爬取西刺的免费ip代理 13 | headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"} 14 | for i in range(1568): 15 | re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers) 16 | 17 | selector = Selector(text=re.text) 18 | all_trs = selector.css("#ip_list tr") 19 | 20 | 21 | ip_list = [] 22 | for tr in all_trs[1:]: 23 | speed_str = tr.css(".bar::attr(title)").extract()[0] 24 | if speed_str: 25 | speed = float(speed_str.split("秒")[0]) 26 | all_texts = tr.css("td::text").extract() 27 | 28 | ip = all_texts[0] 29 | port = all_texts[1] 30 | proxy_type = all_texts[5] 31 | 32 | ip_list.append((ip, port, proxy_type, speed)) 33 | 34 | for ip_info in ip_list: 35 | cursor.execute( 36 | "insert proxy_ip(ip, port, speed, proxy_type) VALUES('{0}', '{1}', {2}, 'HTTP')".format( 37 | ip_info[0], ip_info[1], ip_info[3] 38 | ) 39 | ) 40 | 41 | conn.commit() 42 | 43 | 44 | class GetIP(object): 45 | def delete_ip(self, ip): 46 | #从数据库中删除无效的ip 47 | delete_sql = """ 48 | delete from proxy_ip where ip='{0}' 49 | """.format(ip) 50 | cursor.execute(delete_sql) 51 | conn.commit() 52 | return True 53 | 54 | def judge_ip(self, ip, port): 55 | #判断ip是否可用 56 | http_url = "http://www.baidu.com" 57 | proxy_url = "http://{0}:{1}".format(ip, port) 58 | try: 59 | proxy_dict = { 60 | 
"http":proxy_url, 61 | } 62 | response = requests.get(http_url, proxies=proxy_dict) 63 | except Exception as e: 64 | print ("invalid ip and port") 65 | self.delete_ip(ip) 66 | return False 67 | else: 68 | code = response.status_code 69 | if code >= 200 and code < 300: 70 | print ("effective ip") 71 | return True 72 | else: 73 | print ("invalid ip and port") 74 | self.delete_ip(ip) 75 | return False 76 | 77 | 78 | def get_random_ip(self): 79 | #从数据库中随机获取一个可用的ip 80 | random_sql = """ 81 | SELECT ip, port FROM proxy_ip 82 | ORDER BY RAND() 83 | LIMIT 1 84 | """ 85 | result = cursor.execute(random_sql) 86 | for ip_info in cursor.fetchall(): 87 | ip = ip_info[0] 88 | port = ip_info[1] 89 | 90 | judge_re = self.judge_ip(ip, port) 91 | if judge_re: 92 | return "http://{0}:{1}".format(ip, port) 93 | else: 94 | return self.get_random_ip() 95 | 96 | 97 | 98 | # print (crawl_ips()) 99 | if __name__ == "__main__": 100 | get_ip = GetIP() 101 | get_ip.get_random_ip() -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/tools/crawl_xici_ip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | import requests 4 | from scrapy.selector import Selector 5 | import MySQLdb 6 | 7 | conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="root", db="article_spider", charset="utf8") 8 | cursor = conn.cursor() 9 | 10 | 11 | def crawl_ips(): 12 | #爬取西刺的免费ip代理 13 | headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"} 14 | for i in range(1568): 15 | re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers) 16 | 17 | selector = Selector(text=re.text) 18 | all_trs = selector.css("#ip_list tr") 19 | 20 | 21 | ip_list = [] 22 | for tr in all_trs[1:]: 23 | speed_str = tr.css(".bar::attr(title)").extract()[0] 24 | if speed_str: 25 | speed = float(speed_str.split("秒")[0]) 26 | all_texts = tr.css("td::text").extract() 27 | 28 | ip = all_texts[0] 29 | port = all_texts[1] 30 | proxy_type = all_texts[5] 31 | 32 | ip_list.append((ip, port, proxy_type, speed)) 33 | 34 | for ip_info in ip_list: 35 | cursor.execute( 36 | "insert proxy_ip(ip, port, speed, proxy_type) VALUES('{0}', '{1}', {2}, 'HTTP')".format( 37 | ip_info[0], ip_info[1], ip_info[3] 38 | ) 39 | ) 40 | 41 | conn.commit() 42 | 43 | 44 | class GetIP(object): 45 | def delete_ip(self, ip): 46 | #从数据库中删除无效的ip 47 | delete_sql = """ 48 | delete from proxy_ip where ip='{0}' 49 | """.format(ip) 50 | cursor.execute(delete_sql) 51 | conn.commit() 52 | return True 53 | 54 | def judge_ip(self, ip, port): 55 | #判断ip是否可用 56 | http_url = "http://www.baidu.com" 57 | proxy_url = "http://{0}:{1}".format(ip, port) 58 | try: 59 | proxy_dict = { 60 | "http":proxy_url, 61 | } 62 | response = requests.get(http_url, proxies=proxy_dict) 63 | except Exception as e: 64 | print ("invalid ip and port") 65 | self.delete_ip(ip) 66 | return False 67 | else: 68 | code = response.status_code 69 | if code >= 200 and code < 300: 70 | print ("effective ip") 71 | return True 72 | else: 73 | print ("invalid ip and port") 74 | self.delete_ip(ip) 75 | return False 76 | 77 | 78 | def get_random_ip(self): 79 | #从数据库中随机获取一个可用的ip 80 | random_sql = """ 81 | SELECT ip, port FROM proxy_ip 82 | ORDER BY RAND() 83 | LIMIT 1 84 | """ 85 | result = cursor.execute(random_sql) 86 | for ip_info in cursor.fetchall(): 87 | ip = ip_info[0] 88 | port = ip_info[1] 89 | 90 | judge_re = self.judge_ip(ip, port) 
91 | if judge_re: 92 | return "http://{0}:{1}".format(ip, port) 93 | else: 94 | return self.get_random_ip() 95 | 96 | 97 | 98 | # print (crawl_ips()) 99 | if __name__ == "__main__": 100 | get_ip = GetIP() 101 | get_ip.get_random_ip() -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/ScrapyRedisTest/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for ScrapyRedisTest project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'ScrapyRedisTest' 13 | 14 | SPIDER_MODULES = ['ScrapyRedisTest.spiders'] 15 | NEWSPIDER_MODULE = 'ScrapyRedisTest.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'ScrapyRedisTest (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'ScrapyRedisTest.middlewares.ScrapyredistestSpiderMiddleware': 543, 51 | #} 52 | 53 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 54 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 55 | ITEM_PIPELINES = { 56 | 'scrapy_redis.pipelines.RedisPipeline': 300 57 | } 58 | # Enable or disable downloader middlewares 59 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 60 | #DOWNLOADER_MIDDLEWARES = { 61 | # 'ScrapyRedisTest.middlewares.MyCustomDownloaderMiddleware': 543, 62 | #} 63 | 64 | # Enable or disable extensions 65 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 72 | #ITEM_PIPELINES = { 73 | # 'ScrapyRedisTest.pipelines.ScrapyredistestPipeline': 300, 74 | #} 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 78 | #AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in 
case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | #HTTPCACHE_ENABLED = True 92 | #HTTPCACHE_EXPIRATION_SECS = 0 93 | #HTTPCACHE_DIR = 'httpcache' 94 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/LcvSearch/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for LcvSearch project. 3 | 4 | Generated by 'django-admin startproject' using Django 1.11. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.11/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.11/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = '5)$op9fxf2b#%(*_-qcr7sf)*c@gr!v=d851(*3f*2gef0f!#d' 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 
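# (DEBUG is left on below for local development only; a production deployment would set
# DEBUG = False and list the served hostnames in ALLOWED_HOSTS.)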
26 | DEBUG = True 27 | 28 | ALLOWED_HOSTS = [] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | 'django.contrib.admin', 35 | 'django.contrib.auth', 36 | 'django.contrib.contenttypes', 37 | 'django.contrib.sessions', 38 | 'django.contrib.messages', 39 | 'django.contrib.staticfiles', 40 | 'search', 41 | ] 42 | 43 | MIDDLEWARE = [ 44 | 'django.middleware.security.SecurityMiddleware', 45 | 'django.contrib.sessions.middleware.SessionMiddleware', 46 | 'django.middleware.common.CommonMiddleware', 47 | 'django.middleware.csrf.CsrfViewMiddleware', 48 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 49 | 'django.contrib.messages.middleware.MessageMiddleware', 50 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 51 | ] 52 | 53 | ROOT_URLCONF = 'LcvSearch.urls' 54 | 55 | TEMPLATES = [ 56 | { 57 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 58 | 'DIRS': [os.path.join(BASE_DIR, 'templates')] 59 | , 60 | 'APP_DIRS': True, 61 | 'OPTIONS': { 62 | 'context_processors': [ 63 | 'django.template.context_processors.debug', 64 | 'django.template.context_processors.request', 65 | 'django.contrib.auth.context_processors.auth', 66 | 'django.contrib.messages.context_processors.messages', 67 | ], 68 | }, 69 | }, 70 | ] 71 | 72 | WSGI_APPLICATION = 'LcvSearch.wsgi.application' 73 | 74 | 75 | # Database 76 | # https://docs.djangoproject.com/en/1.11/ref/settings/#databases 77 | 78 | DATABASES = { 79 | 'default': { 80 | 'ENGINE': 'django.db.backends.sqlite3', 81 | 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), 82 | } 83 | } 84 | 85 | 86 | # Password validation 87 | # https://docs.djangoproject.com/en/1.11/ref/settings/#auth-password-validators 88 | 89 | AUTH_PASSWORD_VALIDATORS = [ 90 | { 91 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 92 | }, 93 | { 94 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 95 | }, 96 | { 97 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 98 | }, 99 | { 100 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 101 | }, 102 | ] 103 | 104 | 105 | # Internationalization 106 | # https://docs.djangoproject.com/en/1.11/topics/i18n/ 107 | 108 | LANGUAGE_CODE = 'en-us' 109 | 110 | TIME_ZONE = 'UTC' 111 | 112 | USE_I18N = True 113 | 114 | USE_L10N = True 115 | 116 | USE_TZ = True 117 | 118 | 119 | # Static files (CSS, JavaScript, Images) 120 | # https://docs.djangoproject.com/en/1.11/howto/static-files/ 121 | 122 | STATIC_URL = '/static/' 123 | 124 | STATICFILES_DIRS = [ 125 | os.path.join(BASE_DIR, "static") 126 | ] -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from fake_useragent import UserAgent 10 | 11 | from tools.crawl_xici_ip import GetIP 12 | 13 | 14 | 15 | class ArticlespiderSpiderMiddleware(object): 16 | # Not all methods need to be defined. If a method is not defined, 17 | # scrapy acts as if the spider middleware does not modify the 18 | # passed objects. 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | # This method is used by Scrapy to create your spiders. 
23 | s = cls() 24 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 25 | return s 26 | 27 | def process_spider_input(response, spider): 28 | # Called for each response that goes through the spider 29 | # middleware and into the spider. 30 | 31 | # Should return None or raise an exception. 32 | return None 33 | 34 | def process_spider_output(response, result, spider): 35 | # Called with the results returned from the Spider, after 36 | # it has processed the response. 37 | 38 | # Must return an iterable of Request, dict or Item objects. 39 | for i in result: 40 | yield i 41 | 42 | def process_spider_exception(response, exception, spider): 43 | # Called when a spider or process_spider_input() method 44 | # (from other spider middleware) raises an exception. 45 | 46 | # Should return either None or an iterable of Response, dict 47 | # or Item objects. 48 | pass 49 | 50 | def process_start_requests(start_requests, spider): 51 | # Called with the start requests of the spider, and works 52 | # similarly to the process_spider_output() method, except 53 | # that it doesn’t have a response associated. 54 | 55 | # Must return only requests (not items). 56 | for r in start_requests: 57 | yield r 58 | 59 | def spider_opened(self, spider): 60 | spider.logger.info('Spider opened: %s' % spider.name) 61 | 62 | 63 | class RandomUserAgentMiddlware(object): 64 | #随机更换user-agent 65 | def __init__(self, crawler): 66 | super(RandomUserAgentMiddlware, self).__init__() 67 | self.ua = UserAgent() 68 | self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random") 69 | 70 | @classmethod 71 | def from_crawler(cls, crawler): 72 | return cls(crawler) 73 | 74 | def process_request(self, request, spider): 75 | def get_ua(): 76 | return getattr(self.ua, self.ua_type) 77 | 78 | request.headers.setdefault('User-Agent', get_ua()) 79 | 80 | class RandomProxyMiddleware(object): 81 | #动态设置ip代理 82 | def process_request(self, request, spider): 83 | get_ip = GetIP() 84 | request.meta["proxy"] = get_ip.get_random_ip() 85 | 86 | 87 | from selenium import webdriver 88 | from scrapy.http import HtmlResponse 89 | class JSPageMiddleware(object): 90 | 91 | #通过chrome请求动态网页 92 | def process_request(self, request, spider): 93 | if spider.name == "jobbole": 94 | # browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe") 95 | spider.browser.get(request.url) 96 | import time 97 | time.sleep(3) 98 | print ("访问:{0}".format(request.url)) 99 | 100 | return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request) 101 | 102 | # from pyvirtualdisplay import Display 103 | # display = Display(visible=0, size=(800, 600)) 104 | # display.start() 105 | # 106 | # browser = webdriver.Chrome() 107 | # browser.get() 108 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from fake_useragent import UserAgent 10 | 11 | from tools.crawl_xici_ip import GetIP 12 | 13 | 14 | 15 | class ArticlespiderSpiderMiddleware(object): 16 | # Not all methods need to be defined. 
If a method is not defined, 17 | # scrapy acts as if the spider middleware does not modify the 18 | # passed objects. 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | # This method is used by Scrapy to create your spiders. 23 | s = cls() 24 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 25 | return s 26 | 27 | def process_spider_input(response, spider): 28 | # Called for each response that goes through the spider 29 | # middleware and into the spider. 30 | 31 | # Should return None or raise an exception. 32 | return None 33 | 34 | def process_spider_output(response, result, spider): 35 | # Called with the results returned from the Spider, after 36 | # it has processed the response. 37 | 38 | # Must return an iterable of Request, dict or Item objects. 39 | for i in result: 40 | yield i 41 | 42 | def process_spider_exception(response, exception, spider): 43 | # Called when a spider or process_spider_input() method 44 | # (from other spider middleware) raises an exception. 45 | 46 | # Should return either None or an iterable of Response, dict 47 | # or Item objects. 48 | pass 49 | 50 | def process_start_requests(start_requests, spider): 51 | # Called with the start requests of the spider, and works 52 | # similarly to the process_spider_output() method, except 53 | # that it doesn’t have a response associated. 54 | 55 | # Must return only requests (not items). 56 | for r in start_requests: 57 | yield r 58 | 59 | def spider_opened(self, spider): 60 | spider.logger.info('Spider opened: %s' % spider.name) 61 | 62 | 63 | class RandomUserAgentMiddlware(object): 64 | #随机更换user-agent 65 | def __init__(self, crawler): 66 | super(RandomUserAgentMiddlware, self).__init__() 67 | self.ua = UserAgent() 68 | self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random") 69 | 70 | @classmethod 71 | def from_crawler(cls, crawler): 72 | return cls(crawler) 73 | 74 | def process_request(self, request, spider): 75 | def get_ua(): 76 | return getattr(self.ua, self.ua_type) 77 | 78 | request.headers.setdefault('User-Agent', get_ua()) 79 | 80 | class RandomProxyMiddleware(object): 81 | #动态设置ip代理 82 | def process_request(self, request, spider): 83 | get_ip = GetIP() 84 | request.meta["proxy"] = get_ip.get_random_ip() 85 | 86 | 87 | from selenium import webdriver 88 | from scrapy.http import HtmlResponse 89 | class JSPageMiddleware(object): 90 | 91 | #通过chrome请求动态网页 92 | def process_request(self, request, spider): 93 | if spider.name == "jobbole": 94 | # browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe") 95 | spider.browser.get(request.url) 96 | import time 97 | time.sleep(3) 98 | print ("访问:{0}".format(request.url)) 99 | 100 | return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request) 101 | 102 | # from pyvirtualdisplay import Display 103 | # display = Display(visible=0, size=(800, 600)) 104 | # display.start() 105 | # 106 | # browser = webdriver.Chrome() 107 | # browser.get() 108 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/tools/yundama_requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | import json 5 | import requests 6 | 7 | class YDMHttp(object): 8 | apiurl = 'http://api.yundama.com/api.php' 9 | username = '' 10 | password = '' 11 | appid = '' 12 | appkey = '' 13 | 14 | def __init__(self, username, password, appid, appkey): 15 | 
        self.username = username
16 |         self.password = password
17 |         self.appid = str(appid)
18 |         self.appkey = appkey
19 | 
20 |     def balance(self):
21 |         data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
22 |         response_data = requests.post(self.apiurl, data=data)
23 |         ret_data = json.loads(response_data.text)
24 |         if ret_data["ret"] == 0:
25 |             print("Remaining credits:", ret_data["balance"])
26 |             return ret_data["balance"]
27 |         else:
28 |             return None
29 | 
30 |     def login(self):
31 |         data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
32 |         response_data = requests.post(self.apiurl, data=data)
33 |         ret_data = json.loads(response_data.text)
34 |         if ret_data["ret"] == 0:
35 |             print("Login successful, uid:", ret_data["uid"])
36 |             return ret_data["uid"]
37 |         else:
38 |             return None
39 | 
40 |     def decode(self, filename, codetype, timeout):
41 |         data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
42 |         files = {'file': open(filename, 'rb')}
43 |         response_data = requests.post(self.apiurl, files=files, data=data)
44 |         ret_data = json.loads(response_data.text)
45 |         if ret_data["ret"] == 0:
46 |             print("Captcha recognised:", ret_data["text"])
47 |             return ret_data["text"]
48 |         else:
49 |             return None
50 | 
51 | def ydm(file_path):
52 |     username = 'da_ge_da1'
53 |     # Password
54 |     password = 'da_ge_da'
55 |     # Software ID, a required developer revenue-share parameter; obtain it under "My Software" in the developer console!
56 |     appid = 3129
57 |     # Software key, a required developer revenue-share parameter; obtain it under "My Software" in the developer console!
58 |     appkey = '40d5ad41c047179fc797631e3b9c3025'
59 |     # Image file
60 |     filename = 'image/captcha.jpg'
61 |     # Captcha type, e.g. 1004 means 4 alphanumeric characters. Types are billed differently; fill this in accurately or recognition accuracy suffers. All types: http://www.yundama.com/price.html
62 |     codetype = 5000
63 |     # Timeout, in seconds
64 |     timeout = 60
65 |     # Check
66 | 
67 |     yundama = YDMHttp(username, password, appid, appkey)
68 |     if (username == 'username'):
69 |         print('Please set the parameters above before testing')
70 |     else:
71 |         # Start recognition: image path, captcha type ID, timeout (seconds); returns the recognised text
72 |         return yundama.decode(file_path, codetype, timeout);
73 | 
74 | if __name__ == "__main__":
75 |     # Username
76 |     username = 'da_ge_da1'
77 |     # Password
78 |     password = 'da_ge_da'
79 |     # Software ID, a required developer revenue-share parameter; obtain it under "My Software" in the developer console!
80 |     appid = 3129
81 |     # Software key, a required developer revenue-share parameter; obtain it under "My Software" in the developer console!
82 | appkey = '40d5ad41c047179fc797631e3b9c3025' 83 | # 图片文件 84 | filename = 'image/captcha.jpg' 85 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 86 | codetype = 5000 87 | # 超时时间,秒 88 | timeout = 60 89 | # 检查 90 | if (username == 'username'): 91 | print ('请设置好相关参数再测试') 92 | else: 93 | # 初始化 94 | yundama = YDMHttp(username, password, appid, appkey) 95 | 96 | # 登陆云打码 97 | uid = yundama.login(); 98 | print('uid: %s' % uid) 99 | 100 | # 登陆云打码 101 | uid = yundama.login(); 102 | print ('uid: %s' % uid) 103 | 104 | # 查询余额 105 | balance = yundama.balance(); 106 | print ('balance: %s' % balance) 107 | 108 | # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果 109 | text = yundama.decode(filename, codetype, timeout); 110 | 111 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/tools/yundama_requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'bobby' 3 | 4 | import json 5 | import requests 6 | 7 | class YDMHttp(object): 8 | apiurl = 'http://api.yundama.com/api.php' 9 | username = '' 10 | password = '' 11 | appid = '' 12 | appkey = '' 13 | 14 | def __init__(self, username, password, appid, appkey): 15 | self.username = username 16 | self.password = password 17 | self.appid = str(appid) 18 | self.appkey = appkey 19 | 20 | def balance(self): 21 | data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey} 22 | response_data = requests.post(self.apiurl, data=data) 23 | ret_data = json.loads(response_data.text) 24 | if ret_data["ret"] == 0: 25 | print ("获取剩余积分", ret_data["balance"]) 26 | return ret_data["balance"] 27 | else: 28 | return None 29 | 30 | def login(self): 31 | data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey} 32 | response_data = requests.post(self.apiurl, data=data) 33 | ret_data = json.loads(response_data.text) 34 | if ret_data["ret"] == 0: 35 | print ("登录成功", ret_data["uid"]) 36 | return ret_data["uid"] 37 | else: 38 | return None 39 | 40 | def decode(self, filename, codetype, timeout): 41 | data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)} 42 | files = {'file': open(filename, 'rb')} 43 | response_data = requests.post(self.apiurl, files=files, data=data) 44 | ret_data = json.loads(response_data.text) 45 | if ret_data["ret"] == 0: 46 | print ("识别成功", ret_data["text"]) 47 | return ret_data["text"] 48 | else: 49 | return None 50 | 51 | def ydm(file_path): 52 | username = 'da_ge_da1' 53 | # 密码 54 | password = 'da_ge_da' 55 | # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 56 | appid = 3129 57 | # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 58 | appkey = '40d5ad41c047179fc797631e3b9c3025' 59 | # 图片文件 60 | filename = 'image/captcha.jpg' 61 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 62 | codetype = 5000 63 | # 超时时间,秒 64 | timeout = 60 65 | # 检查 66 | 67 | yundama = YDMHttp(username, password, appid, appkey) 68 | if (username == 'username'): 69 | print('请设置好相关参数再测试') 70 | else: 71 | # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果 72 | return yundama.decode(file_path, codetype, timeout); 73 | 74 | if __name__ == "__main__": 75 | # 用户名 76 | username = 'da_ge_da1' 77 | # 密码 78 | password = 'da_ge_da' 79 | # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 
80 | appid = 3129 81 | # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 82 | appkey = '40d5ad41c047179fc797631e3b9c3025' 83 | # 图片文件 84 | filename = 'image/captcha.jpg' 85 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 86 | codetype = 5000 87 | # 超时时间,秒 88 | timeout = 60 89 | # 检查 90 | if (username == 'username'): 91 | print ('请设置好相关参数再测试') 92 | else: 93 | # 初始化 94 | yundama = YDMHttp(username, password, appid, appkey) 95 | 96 | # 登陆云打码 97 | uid = yundama.login(); 98 | print('uid: %s' % uid) 99 | 100 | # 登陆云打码 101 | uid = yundama.login(); 102 | print ('uid: %s' % uid) 103 | 104 | # 查询余额 105 | balance = yundama.balance(); 106 | print ('balance: %s' % balance) 107 | 108 | # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果 109 | text = yundama.decode(filename, codetype, timeout); 110 | 111 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import codecs 8 | import json 9 | 10 | from scrapy.pipelines.images import ImagesPipeline 11 | from scrapy.exporters import JsonItemExporter 12 | from twisted.enterprise import adbapi 13 | from models.es_types import ArticleType 14 | from w3lib.html import remove_tags 15 | 16 | import MySQLdb 17 | import MySQLdb.cursors 18 | 19 | class ArticlespiderPipeline(object): 20 | def process_item(self, item, spider): 21 | return item 22 | 23 | 24 | class JsonWithEncodingPipeline(object): 25 | #自定义json文件的导出 26 | def __init__(self): 27 | self.file = codecs.open('article.json', 'w', encoding="utf-8") 28 | def process_item(self, item, spider): 29 | lines = json.dumps(dict(item), ensure_ascii=False) + "\n" 30 | self.file.write(lines) 31 | return item 32 | def spider_closed(self, spider): 33 | self.file.close() 34 | 35 | 36 | class MysqlPipeline(object): 37 | #采用同步的机制写入mysql 38 | def __init__(self): 39 | self.conn = MySQLdb.connect('192.168.0.106', 'root', 'root', 'article_spider', charset="utf8", use_unicode=True) 40 | self.cursor = self.conn.cursor() 41 | 42 | def process_item(self, item, spider): 43 | insert_sql = """ 44 | insert into jobbole_article(title, url, create_date, fav_nums) 45 | VALUES (%s, %s, %s, %s) 46 | """ 47 | self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"])) 48 | self.conn.commit() 49 | 50 | 51 | class MysqlTwistedPipline(object): 52 | def __init__(self, dbpool): 53 | self.dbpool = dbpool 54 | 55 | @classmethod 56 | def from_settings(cls, settings): 57 | dbparms = dict( 58 | host = settings["MYSQL_HOST"], 59 | db = settings["MYSQL_DBNAME"], 60 | user = settings["MYSQL_USER"], 61 | passwd = settings["MYSQL_PASSWORD"], 62 | charset='utf8', 63 | cursorclass=MySQLdb.cursors.DictCursor, 64 | use_unicode=True, 65 | ) 66 | dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms) 67 | 68 | return cls(dbpool) 69 | 70 | def process_item(self, item, spider): 71 | #使用twisted将mysql插入变成异步执行 72 | query = self.dbpool.runInteraction(self.do_insert, item) 73 | query.addErrback(self.handle_error, item, spider) #处理异常 74 | 75 | def handle_error(self, failure, item, spider): 76 | #处理异步插入的异常 77 | print (failure) 78 | 79 | def do_insert(self, cursor, item): 80 | #执行具体的插入 81 | #根据不同的item 构建不同的sql语句并插入到mysql中 82 | insert_sql, params = 
item.get_insert_sql() 83 | cursor.execute(insert_sql, params) 84 | 85 | 86 | class JsonExporterPipleline(object): 87 | #调用scrapy提供的json export导出json文件 88 | def __init__(self): 89 | self.file = open('articleexport.json', 'wb') 90 | self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False) 91 | self.exporter.start_exporting() 92 | 93 | def close_spider(self, spider): 94 | self.exporter.finish_exporting() 95 | self.file.close() 96 | 97 | def process_item(self, item, spider): 98 | self.exporter.export_item(item) 99 | return item 100 | 101 | 102 | class ArticleImagePipeline(ImagesPipeline): 103 | def item_completed(self, results, item, info): 104 | if "front_image_url" in item: 105 | for ok, value in results: 106 | image_file_path = value["path"] 107 | item["front_image_path"] = image_file_path 108 | 109 | return item 110 | 111 | 112 | class ElasticsearchPipeline(object): 113 | #将数据写入到es中 114 | 115 | def process_item(self, item, spider): 116 | #将item转换为es的数据 117 | item.save_to_es() 118 | 119 | return item -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import codecs 8 | import json 9 | 10 | from scrapy.pipelines.images import ImagesPipeline 11 | from scrapy.exporters import JsonItemExporter 12 | from twisted.enterprise import adbapi 13 | from models.es_types import ArticleType 14 | from w3lib.html import remove_tags 15 | 16 | import MySQLdb 17 | import MySQLdb.cursors 18 | 19 | class ArticlespiderPipeline(object): 20 | def process_item(self, item, spider): 21 | return item 22 | 23 | 24 | class JsonWithEncodingPipeline(object): 25 | #自定义json文件的导出 26 | def __init__(self): 27 | self.file = codecs.open('article.json', 'w', encoding="utf-8") 28 | def process_item(self, item, spider): 29 | lines = json.dumps(dict(item), ensure_ascii=False) + "\n" 30 | self.file.write(lines) 31 | return item 32 | def spider_closed(self, spider): 33 | self.file.close() 34 | 35 | 36 | class MysqlPipeline(object): 37 | #采用同步的机制写入mysql 38 | def __init__(self): 39 | self.conn = MySQLdb.connect('192.168.0.106', 'root', 'root', 'article_spider', charset="utf8", use_unicode=True) 40 | self.cursor = self.conn.cursor() 41 | 42 | def process_item(self, item, spider): 43 | insert_sql = """ 44 | insert into jobbole_article(title, url, create_date, fav_nums) 45 | VALUES (%s, %s, %s, %s) 46 | """ 47 | self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"])) 48 | self.conn.commit() 49 | 50 | 51 | class MysqlTwistedPipline(object): 52 | def __init__(self, dbpool): 53 | self.dbpool = dbpool 54 | 55 | @classmethod 56 | def from_settings(cls, settings): 57 | dbparms = dict( 58 | host = settings["MYSQL_HOST"], 59 | db = settings["MYSQL_DBNAME"], 60 | user = settings["MYSQL_USER"], 61 | passwd = settings["MYSQL_PASSWORD"], 62 | charset='utf8', 63 | cursorclass=MySQLdb.cursors.DictCursor, 64 | use_unicode=True, 65 | ) 66 | dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms) 67 | 68 | return cls(dbpool) 69 | 70 | def process_item(self, item, spider): 71 | #使用twisted将mysql插入变成异步执行 72 | query = self.dbpool.runInteraction(self.do_insert, item) 73 | query.addErrback(self.handle_error, item, 
spider) #处理异常 74 | 75 | def handle_error(self, failure, item, spider): 76 | #处理异步插入的异常 77 | print (failure) 78 | 79 | def do_insert(self, cursor, item): 80 | #执行具体的插入 81 | #根据不同的item 构建不同的sql语句并插入到mysql中 82 | insert_sql, params = item.get_insert_sql() 83 | cursor.execute(insert_sql, params) 84 | 85 | 86 | class JsonExporterPipleline(object): 87 | #调用scrapy提供的json export导出json文件 88 | def __init__(self): 89 | self.file = open('articleexport.json', 'wb') 90 | self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False) 91 | self.exporter.start_exporting() 92 | 93 | def close_spider(self, spider): 94 | self.exporter.finish_exporting() 95 | self.file.close() 96 | 97 | def process_item(self, item, spider): 98 | self.exporter.export_item(item) 99 | return item 100 | 101 | 102 | class ArticleImagePipeline(ImagesPipeline): 103 | def item_completed(self, results, item, info): 104 | if "front_image_url" in item: 105 | for ok, value in results: 106 | image_file_path = value["path"] 107 | item["front_image_path"] = image_file_path 108 | 109 | return item 110 | 111 | 112 | class ElasticsearchPipeline(object): 113 | #将数据写入到es中 114 | 115 | def process_item(self, item, spider): 116 | #将item转换为es的数据 117 | item.save_to_es() 118 | 119 | return item -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 33 | 34 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 33 | 34 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 33 | 34 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/search/views.py: -------------------------------------------------------------------------------- 1 | import json 2 | from django.shortcuts import render 3 | from django.views.generic.base import View 4 | from search.models import ArticleType 5 | from django.http import HttpResponse 6 | from elasticsearch import Elasticsearch 7 | from datetime import datetime 8 | import redis 9 | 10 | client = Elasticsearch(hosts=["127.0.0.1"]) 11 | redis_cli = redis.StrictRedis() 12 | 13 | 14 | class IndexView(View): 15 | #首页 16 | def get(self, request): 17 | topn_search = redis_cli.zrevrangebyscore("search_keywords_set", "+inf", "-inf", start=0, num=5) 18 | return render(request, "index.html", {"topn_search":topn_search}) 19 | 20 | # Create your views here. 
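# NOTE: the SearchSuggest view below queries a `suggest` completion field, so the
# Elasticsearch mapping must declare one. That mapping lives in search/models.py
# (imported above), which is not reproduced in this excerpt; a minimal
# elasticsearch-dsl 5.x sketch of such a mapping -- field and index names taken
# from the queries in this file, the "ik_max_word" analyzer and everything else an
# assumption -- could look like this:
#
#     from elasticsearch_dsl import DocType, Completion, Text, Keyword, Date
#     from elasticsearch_dsl.connections import connections
#
#     connections.create_connection(hosts=["127.0.0.1"])
#
#     class ArticleType(DocType):
#         suggest = Completion()                  # feeds the completion suggester
#         title = Text(analyzer="ik_max_word")
#         content = Text(analyzer="ik_max_word")
#         tags = Text(analyzer="ik_max_word")
#         create_date = Date()
#         url = Keyword()
#
#         class Meta:
#             index = "jobbole"
#             doc_type = "article"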
21 | class SearchSuggest(View):
22 |     def get(self, request):
23 |         key_words = request.GET.get('s','')
24 |         re_datas = []
25 |         if key_words:
26 |             s = ArticleType.search()
27 |             s = s.suggest('my_suggest', key_words, completion={
28 |                 "field":"suggest", "fuzzy":{
29 |                     "fuzziness":2
30 |                 },
31 |                 "size": 10
32 |             })
33 |             suggestions = s.execute_suggest()
34 |             for match in suggestions.my_suggest[0].options:
35 |                 source = match._source
36 |                 re_datas.append(source["title"])
37 |         return HttpResponse(json.dumps(re_datas), content_type="application/json")
38 | 
39 | 
40 | class SearchView(View):
41 |     def get(self, request):
42 |         key_words = request.GET.get("q","")
43 |         s_type = request.GET.get("s_type", "article")
44 | 
45 |         redis_cli.zincrby("search_keywords_set", key_words)
46 | 
47 |         topn_search = redis_cli.zrevrangebyscore("search_keywords_set", "+inf", "-inf", start=0, num=5)
48 |         page = request.GET.get("p", "1")
49 |         try:
50 |             page = int(page)
51 |         except ValueError:
52 |             page = 1
53 | 
54 |         jobbole_count = redis_cli.get("jobbole_count")
55 |         start_time = datetime.now()
56 |         response = client.search(
57 |             index= "jobbole",
58 |             body={
59 |                 "query":{
60 |                     "multi_match":{
61 |                         "query":key_words,
62 |                         "fields":["tags", "title", "content"]
63 |                     }
64 |                 },
65 |                 "from":(page-1)*10,
66 |                 "size":10,
67 |                 "highlight": {
68 |                     "pre_tags": ['<span class="keyWord">'],
69 |                     "post_tags": ['</span>'],
70 |                     "fields": {
71 |                         "title": {},
72 |                         "content": {},
73 |                     }
74 |                 }
75 |             }
76 |         )
77 | 
78 |         end_time = datetime.now()
79 |         last_seconds = (end_time-start_time).total_seconds()
80 |         total_nums = response["hits"]["total"]
81 |         if (total_nums % 10) > 0:
82 |             page_nums = int(total_nums/10) +1
83 |         else:
84 |             page_nums = int(total_nums/10)
85 |         hit_list = []
86 |         for hit in response["hits"]["hits"]:
87 |             hit_dict = {}
88 |             if "title" in hit["highlight"]:
89 |                 hit_dict["title"] = "".join(hit["highlight"]["title"])
90 |             else:
91 |                 hit_dict["title"] = hit["_source"]["title"]
92 |             if "content" in hit["highlight"]:
93 |                 hit_dict["content"] = "".join(hit["highlight"]["content"])[:500]
94 |             else:
95 |                 hit_dict["content"] = hit["_source"]["content"][:500]
96 | 
97 |             hit_dict["create_date"] = hit["_source"]["create_date"]
98 |             hit_dict["url"] = hit["_source"]["url"]
99 |             hit_dict["score"] = hit["_score"]
100 | 
101 |             hit_list.append(hit_dict)
102 | 
103 |         return render(request, "result.html", {"page":page,
104 |                                                "all_hits":hit_list,
105 |                                                "key_words":key_words,
106 |                                                "total_nums":total_nums,
107 |                                                "page_nums":page_nums,
108 |                                                "last_seconds":last_seconds,
109 |                                                "jobbole_count":jobbole_count,
110 |                                                "topn_search":topn_search})
111 | 
-------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/settings.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import os
4 | 
5 | # Scrapy settings for ArticleSpider project
6 | #
7 | # For simplicity, this file contains only settings considered important or
8 | # commonly used.
You can find more settings consulting the documentation: 9 | # 10 | # http://doc.scrapy.org/en/latest/topics/settings.html 11 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 12 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 13 | 14 | BOT_NAME = 'ArticleSpider' 15 | 16 | SPIDER_MODULES = ['ArticleSpider.spiders'] 17 | NEWSPIDER_MODULE = 'ArticleSpider.spiders' 18 | 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | #USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)' 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | #CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | DOWNLOAD_DELAY = 10 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 | # SPIDER_MIDDLEWARES = { 52 | # 'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543, 53 | # } 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | # DOWNLOADER_MIDDLEWARES = { 58 | # 'ArticleSpider.middlewares.JSPageMiddleware': 1, 59 | # # 'ArticleSpider.middlewares.RandomUserAgentMiddlware': 543, 60 | # # 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 61 | # } 62 | 63 | # Enable or disable extensions 64 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 65 | #EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | #} 68 | 69 | # Configure item pipelines 70 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 71 | ITEM_PIPELINES = { 72 | # 'ArticleSpider.pipelines.JsonExporterPipleline': 2, 73 | # # 'scrapy.pipelines.images.ImagesPipeline': 1, 74 | # 'ArticleSpider.pipelines.ArticleImagePipeline': 1, 75 | # 'ArticleSpider.pipelines.MysqlTwistedPipline': 1, 76 | 'ArticleSpider.pipelines.ElasticsearchPipeline': 1 77 | } 78 | IMAGES_URLS_FIELD = "front_image_url" 79 | project_dir = os.path.abspath(os.path.dirname(__file__)) 80 | IMAGES_STORE = os.path.join(project_dir, 'images') 81 | 82 | import sys 83 | BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) 84 | sys.path.insert(0, os.path.join(BASE_DIR, 'ArticleSpider')) 85 | 86 | USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" 87 | 88 | RANDOM_UA_TYPE = "random" 89 | # 90 | # IMAGES_MIN_HEIGHT = 100 91 | # IMAGES_MIN_WIDTH = 100 92 | 93 | # Enable and configure the AutoThrottle extension (disabled by default) 94 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 95 | AUTOTHROTTLE_ENABLED = True 96 | # The initial download delay 97 | #AUTOTHROTTLE_START_DELAY = 5 98 | # The maximum 
download delay to be set in case of high latencies 99 | #AUTOTHROTTLE_MAX_DELAY = 60 100 | # The average number of requests Scrapy should be sending in parallel to 101 | # each remote server 102 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 103 | # Enable showing throttling stats for every response received: 104 | #AUTOTHROTTLE_DEBUG = False 105 | 106 | # Enable and configure HTTP caching (disabled by default) 107 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 108 | #HTTPCACHE_ENABLED = True 109 | #HTTPCACHE_EXPIRATION_SECS = 0 110 | #HTTPCACHE_DIR = 'httpcache' 111 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 112 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 113 | 114 | MYSQL_HOST = "127.0.0.1" 115 | MYSQL_DBNAME = "article_spider" 116 | MYSQL_USER = "root" 117 | MYSQL_PASSWORD = "root" 118 | 119 | 120 | SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" 121 | SQL_DATE_FORMAT = "%Y-%m-%d" 122 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | 5 | # Scrapy settings for ArticleSpider project 6 | # 7 | # For simplicity, this file contains only settings considered important or 8 | # commonly used. You can find more settings consulting the documentation: 9 | # 10 | # http://doc.scrapy.org/en/latest/topics/settings.html 11 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 12 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 13 | 14 | BOT_NAME = 'ArticleSpider' 15 | 16 | SPIDER_MODULES = ['ArticleSpider.spiders'] 17 | NEWSPIDER_MODULE = 'ArticleSpider.spiders' 18 | 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | #USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)' 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | #CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | DOWNLOAD_DELAY = 10 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 | # SPIDER_MIDDLEWARES = { 52 | # 'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543, 53 | # } 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | # DOWNLOADER_MIDDLEWARES = { 58 | # 'ArticleSpider.middlewares.JSPageMiddleware': 1, 59 | # # 'ArticleSpider.middlewares.RandomUserAgentMiddlware': 543, 60 | # # 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 61 | # } 62 | 63 | # Enable 
or disable extensions 64 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 65 | #EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | #} 68 | 69 | # Configure item pipelines 70 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 71 | ITEM_PIPELINES = { 72 | # 'ArticleSpider.pipelines.JsonExporterPipleline': 2, 73 | # # 'scrapy.pipelines.images.ImagesPipeline': 1, 74 | # 'ArticleSpider.pipelines.ArticleImagePipeline': 1, 75 | # 'ArticleSpider.pipelines.MysqlTwistedPipline': 1, 76 | 'ArticleSpider.pipelines.ElasticsearchPipeline': 1 77 | } 78 | IMAGES_URLS_FIELD = "front_image_url" 79 | project_dir = os.path.abspath(os.path.dirname(__file__)) 80 | IMAGES_STORE = os.path.join(project_dir, 'images') 81 | 82 | import sys 83 | BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) 84 | sys.path.insert(0, os.path.join(BASE_DIR, 'ArticleSpider')) 85 | 86 | USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" 87 | 88 | RANDOM_UA_TYPE = "random" 89 | # 90 | # IMAGES_MIN_HEIGHT = 100 91 | # IMAGES_MIN_WIDTH = 100 92 | 93 | # Enable and configure the AutoThrottle extension (disabled by default) 94 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 95 | AUTOTHROTTLE_ENABLED = True 96 | # The initial download delay 97 | #AUTOTHROTTLE_START_DELAY = 5 98 | # The maximum download delay to be set in case of high latencies 99 | #AUTOTHROTTLE_MAX_DELAY = 60 100 | # The average number of requests Scrapy should be sending in parallel to 101 | # each remote server 102 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 103 | # Enable showing throttling stats for every response received: 104 | #AUTOTHROTTLE_DEBUG = False 105 | 106 | # Enable and configure HTTP caching (disabled by default) 107 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 108 | #HTTPCACHE_ENABLED = True 109 | #HTTPCACHE_EXPIRATION_SECS = 0 110 | #HTTPCACHE_DIR = 'httpcache' 111 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 112 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 113 | 114 | MYSQL_HOST = "127.0.0.1" 115 | MYSQL_DBNAME = "article_spider" 116 | MYSQL_USER = "root" 117 | MYSQL_PASSWORD = "root" 118 | 119 | 120 | SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" 121 | SQL_DATE_FORMAT = "%Y-%m-%d" 122 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/dupefilter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from scrapy.dupefilters import BaseDupeFilter 5 | from scrapy.utils.request import request_fingerprint 6 | 7 | from . import defaults 8 | from .connection import get_redis_from_settings 9 | from ScrapyRedisTest.utils.bloomfilter import conn, PyBloomFilter 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | # TODO: Rename class to RedisDupeFilter. 16 | class RFPDupeFilter(BaseDupeFilter): 17 | """Redis-based request duplicates filter. 18 | 19 | This class can also be used with default Scrapy's scheduler. 20 | 21 | """ 22 | 23 | logger = logger 24 | 25 | def __init__(self, server, key, debug=False): 26 | """Initialize the duplicates filter. 27 | 28 | Parameters 29 | ---------- 30 | server : redis.StrictRedis 31 | The redis server instance. 32 | key : str 33 | Redis key Where to store fingerprints. 34 | debug : bool, optional 35 | Whether to log filtered requests. 
36 | 37 | """ 38 | self.server = server 39 | self.key = key 40 | self.debug = debug 41 | self.logdupes = True 42 | 43 | self.bf = PyBloomFilter(conn=conn, key=key) 44 | 45 | @classmethod 46 | def from_settings(cls, settings): 47 | """Returns an instance from given settings. 48 | 49 | This uses by default the key ``dupefilter:``. When using the 50 | ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as 51 | it needs to pass the spider name in the key. 52 | 53 | Parameters 54 | ---------- 55 | settings : scrapy.settings.Settings 56 | 57 | Returns 58 | ------- 59 | RFPDupeFilter 60 | A RFPDupeFilter instance. 61 | 62 | 63 | """ 64 | server = get_redis_from_settings(settings) 65 | # XXX: This creates one-time key. needed to support to use this 66 | # class as standalone dupefilter with scrapy's default scheduler 67 | # if scrapy passes spider on open() method this wouldn't be needed 68 | # TODO: Use SCRAPY_JOB env as default and fallback to timestamp. 69 | key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())} 70 | debug = settings.getbool('DUPEFILTER_DEBUG') 71 | return cls(server, key=key, debug=debug) 72 | 73 | @classmethod 74 | def from_crawler(cls, crawler): 75 | """Returns instance from crawler. 76 | 77 | Parameters 78 | ---------- 79 | crawler : scrapy.crawler.Crawler 80 | 81 | Returns 82 | ------- 83 | RFPDupeFilter 84 | Instance of RFPDupeFilter. 85 | 86 | """ 87 | return cls.from_settings(crawler.settings) 88 | 89 | def request_seen(self, request): 90 | """Returns True if request was already seen. 91 | 92 | Parameters 93 | ---------- 94 | request : scrapy.http.Request 95 | 96 | Returns 97 | ------- 98 | bool 99 | 100 | """ 101 | fp = self.request_fingerprint(request) 102 | 103 | if self.bf.is_exist(fp): 104 | return True 105 | else: 106 | self.bf.add(fp) 107 | return False 108 | # This returns the number of values added, zero if already exists. 109 | # added = self.server.sadd(self.key, fp) 110 | # return added == 0 111 | 112 | def request_fingerprint(self, request): 113 | """Returns a fingerprint for a given request. 114 | 115 | Parameters 116 | ---------- 117 | request : scrapy.http.Request 118 | 119 | Returns 120 | ------- 121 | str 122 | 123 | """ 124 | return request_fingerprint(request) 125 | 126 | def close(self, reason=''): 127 | """Delete data on close. Called by Scrapy's scheduler. 128 | 129 | Parameters 130 | ---------- 131 | reason : str, optional 132 | 133 | """ 134 | self.clear() 135 | 136 | def clear(self): 137 | """Clears fingerprints data.""" 138 | self.server.delete(self.key) 139 | 140 | def log(self, request, spider): 141 | """Logs given request. 142 | 143 | Parameters 144 | ---------- 145 | request : scrapy.http.Request 146 | spider : scrapy.spiders.Spider 147 | 148 | """ 149 | if self.debug: 150 | msg = "Filtered duplicate request: %(request)s" 151 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 152 | elif self.logdupes: 153 | msg = ("Filtered duplicate request %(request)s" 154 | " - no more duplicates will be shown" 155 | " (see DUPEFILTER_DEBUG to show all duplicates)") 156 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 157 | self.logdupes = False 158 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/queue.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.reqser import request_to_dict, request_from_dict 2 | 3 | from . 
import picklecompat 4 | 5 | 6 | class Base(object): 7 | """Per-spider base queue class""" 8 | 9 | def __init__(self, server, spider, key, serializer=None): 10 | """Initialize per-spider redis queue. 11 | 12 | Parameters 13 | ---------- 14 | server : StrictRedis 15 | Redis client instance. 16 | spider : Spider 17 | Scrapy spider instance. 18 | key: str 19 | Redis key where to put and get messages. 20 | serializer : object 21 | Serializer object with ``loads`` and ``dumps`` methods. 22 | 23 | """ 24 | if serializer is None: 25 | # Backward compatibility. 26 | # TODO: deprecate pickle. 27 | serializer = picklecompat 28 | if not hasattr(serializer, 'loads'): 29 | raise TypeError("serializer does not implement 'loads' function: %r" 30 | % serializer) 31 | if not hasattr(serializer, 'dumps'): 32 | raise TypeError("serializer '%s' does not implement 'dumps' function: %r" 33 | % serializer) 34 | 35 | self.server = server 36 | self.spider = spider 37 | self.key = key % {'spider': spider.name} 38 | self.serializer = serializer 39 | 40 | def _encode_request(self, request): 41 | """Encode a request object""" 42 | obj = request_to_dict(request, self.spider) 43 | return self.serializer.dumps(obj) 44 | 45 | def _decode_request(self, encoded_request): 46 | """Decode an request previously encoded""" 47 | obj = self.serializer.loads(encoded_request) 48 | return request_from_dict(obj, self.spider) 49 | 50 | def __len__(self): 51 | """Return the length of the queue""" 52 | raise NotImplementedError 53 | 54 | def push(self, request): 55 | """Push a request""" 56 | raise NotImplementedError 57 | 58 | def pop(self, timeout=0): 59 | """Pop a request""" 60 | raise NotImplementedError 61 | 62 | def clear(self): 63 | """Clear queue/stack""" 64 | self.server.delete(self.key) 65 | 66 | 67 | class FifoQueue(Base): 68 | """Per-spider FIFO queue""" 69 | 70 | def __len__(self): 71 | """Return the length of the queue""" 72 | return self.server.llen(self.key) 73 | 74 | def push(self, request): 75 | """Push a request""" 76 | self.server.lpush(self.key, self._encode_request(request)) 77 | 78 | def pop(self, timeout=0): 79 | """Pop a request""" 80 | if timeout > 0: 81 | data = self.server.brpop(self.key, timeout) 82 | if isinstance(data, tuple): 83 | data = data[1] 84 | else: 85 | data = self.server.rpop(self.key) 86 | if data: 87 | return self._decode_request(data) 88 | 89 | 90 | class PriorityQueue(Base): 91 | """Per-spider priority queue abstraction using redis' sorted set""" 92 | 93 | def __len__(self): 94 | """Return the length of the queue""" 95 | return self.server.zcard(self.key) 96 | 97 | def push(self, request): 98 | """Push a request""" 99 | data = self._encode_request(request) 100 | score = -request.priority 101 | # We don't use zadd method as the order of arguments change depending on 102 | # whether the class is Redis or StrictRedis, and the option of using 103 | # kwargs only accepts strings, not bytes. 
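# Illustration only (not part of scrapy_redis): the signature differences the
# comment above refers to, assuming the redis-py 2.x clients this code targets --
#
#     StrictRedis().zadd("key", 1.0, b"member")    # score first, then member
#     Redis().zadd("key", b"member", 1.0)          # legacy class: member first
#     StrictRedis().zadd("key", member=1.0)        # kwargs form, str members only
#
# redis-py 3.x changed the API again to a mapping, zadd("key", {b"member": 1.0}),
# so issuing the raw ZADD command below works the same way with any client.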
104 | self.server.execute_command('ZADD', self.key, score, data) 105 | 106 | def pop(self, timeout=0): 107 | """ 108 | Pop a request 109 | timeout not support in this queue class 110 | """ 111 | # use atomic range/remove using multi/exec 112 | pipe = self.server.pipeline() 113 | pipe.multi() 114 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 115 | results, count = pipe.execute() 116 | if results: 117 | return self._decode_request(results[0]) 118 | 119 | 120 | class LifoQueue(Base): 121 | """Per-spider LIFO queue.""" 122 | 123 | def __len__(self): 124 | """Return the length of the stack""" 125 | return self.server.llen(self.key) 126 | 127 | def push(self, request): 128 | """Push a request""" 129 | self.server.lpush(self.key, self._encode_request(request)) 130 | 131 | def pop(self, timeout=0): 132 | """Pop a request""" 133 | if timeout > 0: 134 | data = self.server.blpop(self.key, timeout) 135 | if isinstance(data, tuple): 136 | data = data[1] 137 | else: 138 | data = self.server.lpop(self.key) 139 | 140 | if data: 141 | return self._decode_request(data) 142 | 143 | 144 | # TODO: Deprecate the use of these names. 145 | SpiderQueue = FifoQueue 146 | SpiderStack = LifoQueue 147 | SpiderPriorityQueue = PriorityQueue 148 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/css/result.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #hd{padding:20px 10px;} 4 | #bd{margin-bottom:40px;} 5 | .logo{float:left;margin-right:30px; height:33px;} 6 | /*input搜索区域*/ 7 | .inputArea{float:left;position:relative;} 8 | .inputArea .searchInput{border:1px solid #bfbfbf;padding:0 15px;outline:none;height:35px;*line-height:35px;width:350px; background:url(../img/inputbg.png);font-size:14px;} 9 | .inputArea .searchButton{position:absolute;left:382px;top:0;*top:1px;*left:381px;width:106px;height:38px;background:url(../img/btn_min.png) no-repeat;border:none;cursor:pointer;} 10 | 11 | /*返回高级搜索*/ 12 | .inputArea .advanced{position:absolute;font-size:14px;left:500px;top:12px;width:60px;text-decoration:underline;} 13 | 14 | /*分界区域,导航*/ 15 | .nav{margin-bottom:24px;height:31px;background:#f9f9f9;border-bottom:1px solid #e0e0e0;padding:5px 0 0 210px;} 16 | .searchList{float:left;padding-left:5px;} 17 | .searchList .searchItem{float:left;margin-right:15px;font-size:14px;padding:0 3px 2px 3px;cursor:pointer;height:26px; line-height:26px;} 18 | .searchList .searchItem.current{color:#0080cc;border-bottom:3px solid #9cc813;font-weight:bold;} 19 | .nav .tips{color:#969696;font-size:12px;line-height:24px;*line-height:26px;} 20 | #container.sideBarHide .nav{padding-left:35px;} 21 | 22 | /*#main区域样式*/ 23 | #main{padding:0 215px 0 182px;} 24 | #main.sideBarHide{padding-left:10px;} 25 | /*侧边栏搜索条件*/ 26 | .sideBar{position:relative;float:left;margin-left:-182px;width:182px;} 27 | .sideBar .subfieldContext{margin-bottom:20px;padding-left:25px;} 28 | .sideBar .subfieldContext li{margin-bottom:5px;cursor:pointer;} 29 | .sideBar .subfieldContext input[type=text]{width:75px;} 30 | .sideBar .unit{color:#787878;} 31 | 32 | /*更多按钮*/ 33 | .sideBar .more a:hover{text-decoration:none;} 34 | .sideBar .more .moreIcon{display:inline-block;position:relative;top:-1px;*top:-3px;left:2px;*left:-1px;width:9px;height:5px;background:url(../img/more.png);} 35 | .sideBar .more.show .moreIcon{background:url(../img/down.png);top:-2px;} 36 | 37 | .sideBar .reset{padding-left:25px;} 38 | /*siderBar区域显隐控制*/ 
39 | .sideBar .sideBarShowHide{position:absolute;right:0px;top:20px;height:177px;width:1px; background:url(../img/line.png) right;} 40 | .sideBar .sideBarShowHide a{position:absolute;top:70px;left:-11px;display:inline-block;width:12px;height:31px;background:url(../img/lr.png);} 41 | 42 | .sideBar .sideBarShowHide a:hover{background-position:0 -31px;} 43 | 44 | /*左侧收起样式*/ 45 | #main.sideBarHide .sideBar{margin-left:-191px;*margin-left:-182px;} 46 | #main.sideBarHide .sideBar .sideBarShowHide{-moz-transform:rotate(180deg); -o-transform:rotate(180deg); -webkit-transform:rotate(180deg); transform:rotate(180deg);} 47 | #main.sideBarHide .sideBar .sideBarShowHide a{*background:url(../img/ll.png);} 48 | #main.sideBarHide .sideBar .sideBarShowHide a:hover{*background-position:0 -31px;} 49 | #main.sideBarHide .sideBar .sideBarShowHide{background:none;} 50 | 51 | .resultArea{float:left;width:100%;} 52 | .resultArea .resultTotal{position:relative;padding-left:30px;margin-bottom:20px;} 53 | .resultArea .resultTotal .info{color:#9a9a9a;} 54 | .resultArea .resultTotal .orderOpt{position:absolute;right:50px;} 55 | .resultArea .resultTotal .orderOpt a{margin-right:10px;color:#0080cc;} 56 | 57 | /*搜索结果列表区域*/ 58 | .resultArea .resultList{padding-left:30px;} 59 | .resultArea .resultList .resultItem{margin-bottom:20px;} 60 | .resultArea .resultList .resultItem{margin-bottom:30px;} 61 | .resultArea .resultList .itemHead{margin-bottom:5px;color:#767676;} 62 | .resultArea .resultList .itemHead .keyWord{color:#d90909;} 63 | .resultArea .resultList .itemBody .keyWord{color:#d90909;} 64 | .resultArea .resultList .itemHead a.title{font-size:16px;color:#0080cc;text-decoration:underline;} 65 | .resultArea .resultList .itemHead .value{color:#008000;} 66 | .resultArea .resultList .itemHead .divsion{margin:0 5px;} 67 | .resultArea .resultList .itemHead .fileType{margin-right:10px;} 68 | 69 | /*搜索内容主体*/ 70 | .resultArea .resultList .itemBody{margin-bottom:5px;line-height:18px;width:90%;} 71 | .resultArea .resultList .itemFoot{color:#008000;} 72 | .resultArea .resultList .itemFoot .info{margin-right:10px;} 73 | 74 | .resultArea .pagination{margin-bottom:25px;padding-left:32px;} 75 | /*相关搜索*/ 76 | .resultArea .dependSearch{margin-bottom:30px;padding-left:32px;font-size:14px;} 77 | .resultArea .dependSearch h6{float:left;margin-right:15px;font-weight:bold;} 78 | .resultArea .dependSearch p{margin-bottom:5px;} 79 | .resultArea .dependSearch a{display:inline-block;margin-right:15px;text-decoration:underline;width:90px; white-space:nowrap; overflow:hidden;text-overflow:ellipsis;} 80 | .resultArea .searchInResult{padding-left:35px;} 81 | .resultArea .searchInResult .inResult{position:absolute;right:-190px;top:8px;font-size:14px;text-decoration:underline;} 82 | .resultArea .searchInResult .searchButton{left:417px;} 83 | /*历史搜索区域*/ 84 | .historyArea{float:right;margin-right:-212px;width:212px;} 85 | .historyArea h6{margin-bottom:10px;font-weight:bold;} 86 | .historyArea .historyList{margin-bottom:20px;} 87 | .historyArea .historyList li{margin-bottom:5px;} 88 | 89 | 90 | 91 | /*左侧分栏区域*/ 92 | .subfield{margin-bottom:5px;font-size:14px;font-weight:bold;padding:2px 0 2px 24px;} 93 | .subfield:first-child{border-left:4px solid #9cc813;padding-left:20px;} 94 | 95 | 96 | 97 | /*立即搜索样式*/ 98 | .subfieldContent .search{margin:45px 0 0 135px;width:130px;height:36px;background:url(../img/btnbg.png); font-weight:bold;border:none;border:1px solid #bfbfbf;line-height:36px;} 99 | /*联想下拉区域*/ 100 | .inputArea 
.dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 101 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 102 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 103 | 104 | 105 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/css/result.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #hd{padding:20px 10px;} 4 | #bd{margin-bottom:40px;} 5 | .logo{float:left;margin-right:30px; height:33px;} 6 | /*input搜索区域*/ 7 | .inputArea{float:left;position:relative;} 8 | .inputArea .searchInput{border:1px solid #bfbfbf;padding:0 15px;outline:none;height:35px;*line-height:35px;width:350px; background:url(../img/inputbg.png);font-size:14px;} 9 | .inputArea .searchButton{position:absolute;left:382px;top:0;*top:1px;*left:381px;width:106px;height:38px;background:url(../img/btn_min.png) no-repeat;border:none;cursor:pointer;} 10 | 11 | /*返回高级搜索*/ 12 | .inputArea .advanced{position:absolute;font-size:14px;left:500px;top:12px;width:60px;text-decoration:underline;} 13 | 14 | /*分界区域,导航*/ 15 | .nav{margin-bottom:24px;height:31px;background:#f9f9f9;border-bottom:1px solid #e0e0e0;padding:5px 0 0 210px;} 16 | .searchList{float:left;padding-left:5px;} 17 | .searchList .searchItem{float:left;margin-right:15px;font-size:14px;padding:0 3px 2px 3px;cursor:pointer;height:26px; line-height:26px;} 18 | .searchList .searchItem.current{color:#0080cc;border-bottom:3px solid #9cc813;font-weight:bold;} 19 | .nav .tips{color:#969696;font-size:12px;line-height:24px;*line-height:26px;} 20 | #container.sideBarHide .nav{padding-left:35px;} 21 | 22 | /*#main区域样式*/ 23 | #main{padding:0 215px 0 182px;} 24 | #main.sideBarHide{padding-left:10px;} 25 | /*侧边栏搜索条件*/ 26 | .sideBar{position:relative;float:left;margin-left:-182px;width:182px;} 27 | .sideBar .subfieldContext{margin-bottom:20px;padding-left:25px;} 28 | .sideBar .subfieldContext li{margin-bottom:5px;cursor:pointer;} 29 | .sideBar .subfieldContext input[type=text]{width:75px;} 30 | .sideBar .unit{color:#787878;} 31 | 32 | /*更多按钮*/ 33 | .sideBar .more a:hover{text-decoration:none;} 34 | .sideBar .more .moreIcon{display:inline-block;position:relative;top:-1px;*top:-3px;left:2px;*left:-1px;width:9px;height:5px;background:url(../img/more.png);} 35 | .sideBar .more.show .moreIcon{background:url(../img/down.png);top:-2px;} 36 | 37 | .sideBar .reset{padding-left:25px;} 38 | /*siderBar区域显隐控制*/ 39 | .sideBar .sideBarShowHide{position:absolute;right:0px;top:20px;height:177px;width:1px; background:url(../img/line.png) right;} 40 | .sideBar .sideBarShowHide a{position:absolute;top:70px;left:-11px;display:inline-block;width:12px;height:31px;background:url(../img/lr.png);} 41 | 42 | .sideBar .sideBarShowHide a:hover{background-position:0 -31px;} 43 | 44 | /*左侧收起样式*/ 45 | #main.sideBarHide .sideBar{margin-left:-191px;*margin-left:-182px;} 46 | #main.sideBarHide .sideBar .sideBarShowHide{-moz-transform:rotate(180deg); -o-transform:rotate(180deg); -webkit-transform:rotate(180deg); transform:rotate(180deg);} 47 | #main.sideBarHide .sideBar .sideBarShowHide a{*background:url(../img/ll.png);} 48 | #main.sideBarHide .sideBar .sideBarShowHide a:hover{*background-position:0 -31px;} 49 | #main.sideBarHide .sideBar .sideBarShowHide{background:none;} 50 | 51 | .resultArea{float:left;width:100%;} 52 | .resultArea 
.resultTotal{position:relative;padding-left:30px;margin-bottom:20px;} 53 | .resultArea .resultTotal .info{color:#9a9a9a;} 54 | .resultArea .resultTotal .orderOpt{position:absolute;right:50px;} 55 | .resultArea .resultTotal .orderOpt a{margin-right:10px;color:#0080cc;} 56 | 57 | /*搜索结果列表区域*/ 58 | .resultArea .resultList{padding-left:30px;} 59 | .resultArea .resultList .resultItem{margin-bottom:20px;} 60 | .resultArea .resultList .resultItem{margin-bottom:30px;} 61 | .resultArea .resultList .itemHead{margin-bottom:5px;color:#767676;} 62 | .resultArea .resultList .itemHead .keyWord{color:#d90909;} 63 | .resultArea .resultList .itemBody .keyWord{color:#d90909;} 64 | .resultArea .resultList .itemHead a.title{font-size:16px;color:#0080cc;text-decoration:underline;} 65 | .resultArea .resultList .itemHead .value{color:#008000;} 66 | .resultArea .resultList .itemHead .divsion{margin:0 5px;} 67 | .resultArea .resultList .itemHead .fileType{margin-right:10px;} 68 | 69 | /*搜索内容主体*/ 70 | .resultArea .resultList .itemBody{margin-bottom:5px;line-height:18px;width:90%;} 71 | .resultArea .resultList .itemFoot{color:#008000;} 72 | .resultArea .resultList .itemFoot .info{margin-right:10px;} 73 | 74 | .resultArea .pagination{margin-bottom:25px;padding-left:32px;} 75 | /*相关搜索*/ 76 | .resultArea .dependSearch{margin-bottom:30px;padding-left:32px;font-size:14px;} 77 | .resultArea .dependSearch h6{float:left;margin-right:15px;font-weight:bold;} 78 | .resultArea .dependSearch p{margin-bottom:5px;} 79 | .resultArea .dependSearch a{display:inline-block;margin-right:15px;text-decoration:underline;width:90px; white-space:nowrap; overflow:hidden;text-overflow:ellipsis;} 80 | .resultArea .searchInResult{padding-left:35px;} 81 | .resultArea .searchInResult .inResult{position:absolute;right:-190px;top:8px;font-size:14px;text-decoration:underline;} 82 | .resultArea .searchInResult .searchButton{left:417px;} 83 | /*历史搜索区域*/ 84 | .historyArea{float:right;margin-right:-212px;width:212px;} 85 | .historyArea h6{margin-bottom:10px;font-weight:bold;} 86 | .historyArea .historyList{margin-bottom:20px;} 87 | .historyArea .historyList li{margin-bottom:5px;} 88 | 89 | 90 | 91 | /*左侧分栏区域*/ 92 | .subfield{margin-bottom:5px;font-size:14px;font-weight:bold;padding:2px 0 2px 24px;} 93 | .subfield:first-child{border-left:4px solid #9cc813;padding-left:20px;} 94 | 95 | 96 | 97 | /*立即搜索样式*/ 98 | .subfieldContent .search{margin:45px 0 0 135px;width:130px;height:36px;background:url(../img/btnbg.png); font-weight:bold;border:none;border:1px solid #bfbfbf;line-height:36px;} 99 | /*联想下拉区域*/ 100 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 101 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 102 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 103 | 104 | 105 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | lcv-search 搜索引擎 9 | 10 | 11 | 12 | 13 |
[The remaining markup of this front-end prototype page was not preserved in the export. What survives indicates a search form with a suggestion dropdown, a "popular searches" block linking the sample queries 如何学好设计, 界面设计, UI设计培训要多少钱, 设计师学习 and 哪里有好的网站, a row of site tabs (专注界面设计网站, 用户体验, 互联网, 资费套餐), and two large trailing blocks (original lines ~66-175, most likely inline scripts) whose content is missing.]
-------------------------------------------------------------------------------- /s0vkaq/LcvSearch/templates/index.html: --------------------------------------------------------------------------------
[Markup of the Django search template was likewise not preserved. The recoverable fragments show that it mirrors the static prototype above -- it uses {% load staticfiles %}, keeps the page title "lcv-search 搜索引擎", the hard-coded sample queries and the site tabs -- and that the popular-searches block is additionally rendered from the topn_search context variable supplied by IndexView via {% for search_words in topn_search %} {{ search_words }} {% endfor %}. Its trailing blocks (original lines ~68-177) are missing as well.]
65 | 66 | 67 | 68 | 118 | 177 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch-Front/js/pagination.js: -------------------------------------------------------------------------------- 1 | jQuery.fn.pagination = function(maxentries, opts) { 2 | opts = jQuery.extend({ 3 | items_per_page : 10, // 每页显示多少条记录 4 | current_page : 0, //当前页码 5 | num_display_entries : 4, // 中间显示页码的个数 6 | num_edge_entries : 2, // 末尾显示页码的个数 7 | link_to : "javascript:;", //页码点击后的链接 8 | prev_text : "上一页", //上一页的文字 9 | next_text : "下一页", //下一页的文字 10 | ellipse_text : "...", //页码之间的省略号 11 | display_msg : true, // 是否显示记录信息 12 | prev_show_always : true, //是否总是显示最前页 13 | next_show_always : true,//是否总是显示最后页 14 | setPageNo:false,//是否显示跳转第几页 15 | callback : function() { 16 | return false; 17 | } // 回调函数 18 | }, opts || {}); 19 | 20 | return this.each(function() { 21 | // 总页数 22 | function numPages() { 23 | return Math.ceil(maxentries / opts.items_per_page); 24 | } 25 | /** 26 | * 计算页码 27 | */ 28 | function getInterval() { 29 | var ne_half = Math.ceil(opts.num_display_entries / 2); 30 | var np = numPages(); 31 | var upper_limit = np - opts.num_display_entries; 32 | var start = current_page > ne_half ? Math.max(Math.min(current_page 33 | - ne_half, upper_limit), 0) : 0; 34 | var end = current_page > ne_half ? Math.min(current_page + ne_half, 35 | np) : Math.min(opts.num_display_entries, np); 36 | return [start, end]; 37 | } 38 | 39 | /** 40 | * 点击事件 41 | */ 42 | function pageSelected(page_id, evt) { 43 | var page_id = parseInt(page_id); 44 | current_page = page_id; 45 | drawLinks(); 46 | var continuePropagation = opts.callback(page_id, panel); 47 | if (!continuePropagation) { 48 | if (evt.stopPropagation) { 49 | evt.stopPropagation(); 50 | } else { 51 | evt.cancelBubble = true; 52 | } 53 | } 54 | return continuePropagation; 55 | } 56 | 57 | /** 58 | * 链接 59 | */ 60 | function drawLinks() { 61 | panel.empty(); 62 | var interval = getInterval(); 63 | var np = numPages(); 64 | var getClickHandler = function(page_id) { 65 | return function(evt) { 66 | return pageSelected(page_id, evt); 67 | } 68 | } 69 | var appendItem = function(page_id, appendopts) { 70 | page_id = page_id < 0 ? 0 : (page_id < np ? 
page_id : np-1); 71 | appendopts = jQuery.extend({ 72 | text : page_id+1, 73 | classes : "" 74 | }, appendopts || {}); 75 | if (page_id == current_page) { 76 | var lnk = $("" + (appendopts.text) 77 | + ""); 78 | } else { 79 | var lnk = $("" + (appendopts.text) + "").bind( 80 | "click", getClickHandler(page_id)).attr('href', 81 | opts.link_to.replace(/__id__/, page_id)); 82 | 83 | } 84 | if (appendopts.classes) { 85 | lnk.addClass(appendopts.classes); 86 | } 87 | panel.append(lnk); 88 | } 89 | // 上一页 90 | if (opts.prev_text && (current_page > 0 || opts.prev_show_always)) { 91 | appendItem(current_page - 1, { 92 | text : opts.prev_text, 93 | classes : "prev" 94 | }); 95 | } 96 | // 点点点 97 | if (interval[0] > 0 && opts.num_edge_entries > 0) { 98 | var end = Math.min(opts.num_edge_entries, interval[0]); 99 | for (var i = 0; i < end; i++) { 100 | appendItem(i); 101 | } 102 | if (opts.num_edge_entries < interval[0] && opts.ellipse_text) { 103 | jQuery("" + opts.ellipse_text + "") 104 | .appendTo(panel); 105 | } 106 | } 107 | // 中间的页码 108 | for (var i = interval[0]; i < interval[1]; i++) { 109 | appendItem(i); 110 | } 111 | // 最后的页码 112 | if (interval[1] < np && opts.num_edge_entries > 0) { 113 | if (np - opts.num_edge_entries > interval[1] 114 | && opts.ellipse_text) { 115 | jQuery("" + opts.ellipse_text + "") 116 | .appendTo(panel); 117 | } 118 | var begin = Math.max(np - opts.num_edge_entries, interval[1]); 119 | for (var i = begin; i < np; i++) { 120 | appendItem(i); 121 | } 122 | 123 | } 124 | // 下一页 125 | if (opts.next_text 126 | && (current_page < np - 1 || opts.next_show_always)) { 127 | appendItem(current_page + 1, { 128 | text : opts.next_text, 129 | classes : "next" 130 | }); 131 | } 132 | // 记录显示 133 | if (opts.display_msg) { 134 | if(!maxentries){ 135 | panel 136 | .append('
暂时无数据可以显示
'); 137 | }else{ 138 | panel 139 | .append('
显示第 ' 140 | + ((current_page * opts.items_per_page) + 1) 141 | + ' 条到 ' 142 | + (((current_page + 1) * opts.items_per_page) > maxentries 143 | ? maxentries 144 | : ((current_page + 1) * opts.items_per_page)) 145 | + ' 条记录,总共 ' + maxentries + ' 条
'); 146 | } 147 | } 148 | //设置跳到第几页 149 | if(opts.setPageNo){ 150 | panel.append("
跳转到
"); 151 | } 152 | } 153 | 154 | // 当前页 155 | var current_page = opts.current_page; 156 | maxentries = ( maxentries < 0) ? 0 : maxentries; 157 | opts.items_per_page = (!opts.items_per_page || opts.items_per_page < 0) 158 | ? 1 159 | : opts.items_per_page; 160 | var panel = jQuery(this); 161 | this.selectPage = function(page_id) { 162 | pageSelected(page_id); 163 | } 164 | this.prevPage = function() { 165 | if (current_page > 0) { 166 | pageSelected(current_page - 1); 167 | return true; 168 | } else { 169 | return false; 170 | } 171 | } 172 | this.nextPage = function() { 173 | if (current_page < numPages() - 1) { 174 | pageSelected(current_page + 1); 175 | return true; 176 | } else { 177 | return false; 178 | } 179 | } 180 | 181 | if(maxentries==0){ 182 | panel.append(''+opts.prev_text+''+opts.next_text+'
暂时无数据可以显示
'); 183 | }else{ 184 | drawLinks(); 185 | } 186 | $(this).find(".goto button").live("click",function(evt){ 187 | var setPageNo = $(this).parent().find("input").val(); 188 | if(setPageNo!=null && setPageNo!=""&&setPageNo>0&&setPageNo<=numPages()){ 189 | pageSelected(setPageNo-1, evt); 190 | } 191 | }); 192 | }); 193 | } 194 | -------------------------------------------------------------------------------- /s0vkaq/LcvSearch/static/js/pagination.js: -------------------------------------------------------------------------------- 1 | jQuery.fn.pagination = function(maxentries, opts) { 2 | opts = jQuery.extend({ 3 | items_per_page : 10, // 每页显示多少条记录 4 | current_page : 0, //当前页码 5 | num_display_entries : 4, // 中间显示页码的个数 6 | num_edge_entries : 2, // 末尾显示页码的个数 7 | link_to : "javascript:;", //页码点击后的链接 8 | prev_text : "上一页", //上一页的文字 9 | next_text : "下一页", //下一页的文字 10 | ellipse_text : "...", //页码之间的省略号 11 | display_msg : true, // 是否显示记录信息 12 | prev_show_always : true, //是否总是显示最前页 13 | next_show_always : true,//是否总是显示最后页 14 | setPageNo:false,//是否显示跳转第几页 15 | callback : function() { 16 | return false; 17 | } // 回调函数 18 | }, opts || {}); 19 | 20 | return this.each(function() { 21 | // 总页数 22 | function numPages() { 23 | return Math.ceil(maxentries / opts.items_per_page); 24 | } 25 | /** 26 | * 计算页码 27 | */ 28 | function getInterval() { 29 | var ne_half = Math.ceil(opts.num_display_entries / 2); 30 | var np = numPages(); 31 | var upper_limit = np - opts.num_display_entries; 32 | var start = current_page > ne_half ? Math.max(Math.min(current_page 33 | - ne_half, upper_limit), 0) : 0; 34 | var end = current_page > ne_half ? Math.min(current_page + ne_half, 35 | np) : Math.min(opts.num_display_entries, np); 36 | return [start, end]; 37 | } 38 | 39 | /** 40 | * 点击事件 41 | */ 42 | function pageSelected(page_id, evt) { 43 | var page_id = parseInt(page_id); 44 | current_page = page_id; 45 | drawLinks(); 46 | var continuePropagation = opts.callback(page_id, panel); 47 | if (!continuePropagation) { 48 | if (evt.stopPropagation) { 49 | evt.stopPropagation(); 50 | } else { 51 | evt.cancelBubble = true; 52 | } 53 | } 54 | return continuePropagation; 55 | } 56 | 57 | /** 58 | * 链接 59 | */ 60 | function drawLinks() { 61 | panel.empty(); 62 | var interval = getInterval(); 63 | var np = numPages(); 64 | var getClickHandler = function(page_id) { 65 | return function(evt) { 66 | return pageSelected(page_id, evt); 67 | } 68 | } 69 | var appendItem = function(page_id, appendopts) { 70 | page_id = page_id < 0 ? 0 : (page_id < np ? 
page_id : np-1); 71 | appendopts = jQuery.extend({ 72 | text : page_id+1, 73 | classes : "" 74 | }, appendopts || {}); 75 | if (page_id == current_page) { 76 | var lnk = $("" + (appendopts.text) 77 | + ""); 78 | } else { 79 | var lnk = $("" + (appendopts.text) + "").bind( 80 | "click", getClickHandler(page_id)).attr('href', 81 | opts.link_to.replace(/__id__/, page_id)); 82 | 83 | } 84 | if (appendopts.classes) { 85 | lnk.addClass(appendopts.classes); 86 | } 87 | panel.append(lnk); 88 | } 89 | // 上一页 90 | if (opts.prev_text && (current_page > 0 || opts.prev_show_always)) { 91 | appendItem(current_page - 1, { 92 | text : opts.prev_text, 93 | classes : "prev" 94 | }); 95 | } 96 | // 点点点 97 | if (interval[0] > 0 && opts.num_edge_entries > 0) { 98 | var end = Math.min(opts.num_edge_entries, interval[0]); 99 | for (var i = 0; i < end; i++) { 100 | appendItem(i); 101 | } 102 | if (opts.num_edge_entries < interval[0] && opts.ellipse_text) { 103 | jQuery("" + opts.ellipse_text + "") 104 | .appendTo(panel); 105 | } 106 | } 107 | // 中间的页码 108 | for (var i = interval[0]; i < interval[1]; i++) { 109 | appendItem(i); 110 | } 111 | // 最后的页码 112 | if (interval[1] < np && opts.num_edge_entries > 0) { 113 | if (np - opts.num_edge_entries > interval[1] 114 | && opts.ellipse_text) { 115 | jQuery("" + opts.ellipse_text + "") 116 | .appendTo(panel); 117 | } 118 | var begin = Math.max(np - opts.num_edge_entries, interval[1]); 119 | for (var i = begin; i < np; i++) { 120 | appendItem(i); 121 | } 122 | 123 | } 124 | // 下一页 125 | if (opts.next_text 126 | && (current_page < np - 1 || opts.next_show_always)) { 127 | appendItem(current_page + 1, { 128 | text : opts.next_text, 129 | classes : "next" 130 | }); 131 | } 132 | // 记录显示 133 | if (opts.display_msg) { 134 | if(!maxentries){ 135 | panel 136 | .append('
暂时无数据可以显示
'); 137 | }else{ 138 | panel 139 | .append('
显示第 ' 140 | + ((current_page * opts.items_per_page) + 1) 141 | + ' 条到 ' 142 | + (((current_page + 1) * opts.items_per_page) > maxentries 143 | ? maxentries 144 | : ((current_page + 1) * opts.items_per_page)) 145 | + ' 条记录,总共 ' + maxentries + ' 条
'); 146 | } 147 | } 148 | //设置跳到第几页 149 | if(opts.setPageNo){ 150 | panel.append("
跳转到
"); 151 | } 152 | } 153 | 154 | // 当前页 155 | var current_page = opts.current_page; 156 | maxentries = ( maxentries < 0) ? 0 : maxentries; 157 | opts.items_per_page = (!opts.items_per_page || opts.items_per_page < 0) 158 | ? 1 159 | : opts.items_per_page; 160 | var panel = jQuery(this); 161 | this.selectPage = function(page_id) { 162 | pageSelected(page_id); 163 | } 164 | this.prevPage = function() { 165 | if (current_page > 0) { 166 | pageSelected(current_page - 1); 167 | return true; 168 | } else { 169 | return false; 170 | } 171 | } 172 | this.nextPage = function() { 173 | if (current_page < numPages() - 1) { 174 | pageSelected(current_page + 1); 175 | return true; 176 | } else { 177 | return false; 178 | } 179 | } 180 | 181 | if(maxentries==0){ 182 | panel.append(''+opts.prev_text+''+opts.next_text+'
暂时无数据可以显示
'); 183 | }else{ 184 | drawLinks(); 185 | } 186 | $(this).find(".goto button").live("click",function(evt){ 187 | var setPageNo = $(this).parent().find("input").val(); 188 | if(setPageNo!=null && setPageNo!=""&&setPageNo>0&&setPageNo<=numPages()){ 189 | pageSelected(setPageNo-1, evt); 190 | } 191 | }); 192 | }); 193 | } 194 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/ArticleSpider/spiders/jobbole.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | import datetime 5 | from scrapy.http import Request 6 | from urllib import parse 7 | from scrapy.loader import ItemLoader 8 | 9 | from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader 10 | 11 | from ArticleSpider.utils.common import get_md5 12 | from selenium import webdriver 13 | from scrapy.xlib.pydispatch import dispatcher 14 | from scrapy import signals 15 | 16 | class JobboleSpider(scrapy.Spider): 17 | name = "jobbole" 18 | allowed_domains = ["blog.jobbole.com"] 19 | start_urls = ['http://blog.jobbole.com/all-posts/'] 20 | 21 | 22 | # def __init__(self): 23 | # self.browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe") 24 | # super(JobboleSpider, self).__init__() 25 | # dispatcher.connect(self.spider_closed, signals.spider_closed) 26 | # 27 | # def spider_closed(self, spider): 28 | # #当爬虫退出的时候关闭chrome 29 | # print ("spider closed") 30 | # self.browser.quit() 31 | 32 | #收集伯乐在线所有404的url以及404页面数 33 | handle_httpstatus_list = [404] 34 | 35 | def __init__(self, **kwargs): 36 | self.fail_urls = [] 37 | dispatcher.connect(self.handle_spider_closed, signals.spider_closed) 38 | 39 | def handle_spider_closed(self, spider, reason): 40 | self.crawler.stats.set_value("failed_urls", ",".join(self.fail_urls)) 41 | 42 | def parse(self, response): 43 | """ 44 | 1. 获取文章列表页中的文章url并交给scrapy下载后并进行解析 45 | 2. 
获取下一页的url并交给scrapy进行下载, 下载完成后交给parse
46 |         """
47 |         #解析列表页中的所有文章url并交给scrapy下载后并进行解析
48 |         if response.status == 404:
49 |             self.fail_urls.append(response.url)
50 |             self.crawler.stats.inc_value("failed_url")
51 | 
52 |         post_nodes = response.css("#archive .floated-thumb .post-thumb a")
53 |         for post_node in post_nodes:
54 |             image_url = post_node.css("img::attr(src)").extract_first("")
55 |             post_url = post_node.css("::attr(href)").extract_first("")
56 |             yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url":image_url}, callback=self.parse_detail)
57 | 
58 |         #提取下一页并交给scrapy进行下载
59 |         next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
60 |         if next_url:
61 |             yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
62 | 
63 |     def parse_detail(self, response):
64 |         article_item = JobBoleArticleItem()
65 | 
66 |         #提取文章的具体字段
67 |         # title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first("")
68 |         # create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].strip().replace("·","").strip()
69 |         # praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0]
70 |         # fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0]
71 |         # match_re = re.match(".*?(\d+).*", fav_nums)
72 |         # if match_re:
73 |         #     fav_nums = match_re.group(1)
74 |         #
75 |         # comment_nums = response.xpath("//a[@href='#article-comment']/span/text()").extract()[0]
76 |         # match_re = re.match(".*?(\d+).*", comment_nums)
77 |         # if match_re:
78 |         #     comment_nums = match_re.group(1)
79 |         #
80 |         # content = response.xpath("//div[@class='entry']").extract()[0]
81 |         #
82 |         # tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
83 |         # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
84 |         # tags = ",".join(tag_list)
85 | 
86 |         #通过css选择器提取字段
87 |         # front_image_url = response.meta.get("front_image_url", "") #文章封面图
88 |         # title = response.css(".entry-header h1::text").extract()[0]
89 |         # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·","").strip()
90 |         # praise_nums = response.css(".vote-post-up h10::text").extract()[0]
91 |         # fav_nums = response.css(".bookmark-btn::text").extract()[0]
92 |         # match_re = re.match(".*?(\d+).*", fav_nums)
93 |         # if match_re:
94 |         #     fav_nums = int(match_re.group(1))
95 |         # else:
96 |         #     fav_nums = 0
97 |         #
98 |         # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
99 |         # match_re = re.match(".*?(\d+).*", comment_nums)
100 |         # if match_re:
101 |         #     comment_nums = int(match_re.group(1))
102 |         # else:
103 |         #     comment_nums = 0
104 |         #
105 |         # content = response.css("div.entry").extract()[0]
106 |         #
107 |         # tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
108 |         # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
109 |         # tags = ",".join(tag_list)
110 |         #
111 |         # article_item["url_object_id"] = get_md5(response.url)
112 |         # article_item["title"] = title
113 |         # article_item["url"] = response.url
114 |         # try:
115 |         #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
116 |         # except Exception as e:
117 |         #     create_date = datetime.datetime.now().date()
118 |         # article_item["create_date"] = create_date
119 |         # article_item["front_image_url"] = [front_image_url]
120 |         # article_item["praise_nums"] = praise_nums
121 |         #
article_item["comment_nums"] = comment_nums 122 | # article_item["fav_nums"] = fav_nums 123 | # article_item["tags"] = tags 124 | # article_item["content"] = content 125 | 126 | 127 | #通过item loader加载item 128 | front_image_url = response.meta.get("front_image_url", "") # 文章封面图 129 | item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response) 130 | item_loader.add_css("title", ".entry-header h1::text") 131 | item_loader.add_value("url", response.url) 132 | item_loader.add_value("url_object_id", get_md5(response.url)) 133 | item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text") 134 | item_loader.add_value("front_image_url", [front_image_url]) 135 | item_loader.add_css("praise_nums", ".vote-post-up h10::text") 136 | item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text") 137 | item_loader.add_css("fav_nums", ".bookmark-btn::text") 138 | item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text") 139 | item_loader.add_css("content", "div.entry") 140 | 141 | article_item = item_loader.load_item() 142 | 143 | 144 | yield article_item 145 | -------------------------------------------------------------------------------- /s0vkaq/ArticleSpider/build/lib/ArticleSpider/spiders/jobbole.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | import datetime 5 | from scrapy.http import Request 6 | from urllib import parse 7 | from scrapy.loader import ItemLoader 8 | 9 | from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader 10 | 11 | from ArticleSpider.utils.common import get_md5 12 | from selenium import webdriver 13 | from scrapy.xlib.pydispatch import dispatcher 14 | from scrapy import signals 15 | 16 | class JobboleSpider(scrapy.Spider): 17 | name = "jobbole" 18 | allowed_domains = ["blog.jobbole.com"] 19 | start_urls = ['http://blog.jobbole.com/all-posts/'] 20 | 21 | 22 | # def __init__(self): 23 | # self.browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe") 24 | # super(JobboleSpider, self).__init__() 25 | # dispatcher.connect(self.spider_closed, signals.spider_closed) 26 | # 27 | # def spider_closed(self, spider): 28 | # #当爬虫退出的时候关闭chrome 29 | # print ("spider closed") 30 | # self.browser.quit() 31 | 32 | #收集伯乐在线所有404的url以及404页面数 33 | handle_httpstatus_list = [404] 34 | 35 | def __init__(self, **kwargs): 36 | self.fail_urls = [] 37 | dispatcher.connect(self.handle_spider_closed, signals.spider_closed) 38 | 39 | def handle_spider_closed(self, spider, reason): 40 | self.crawler.stats.set_value("failed_urls", ",".join(self.fail_urls)) 41 | 42 | def parse(self, response): 43 | """ 44 | 1. 获取文章列表页中的文章url并交给scrapy下载后并进行解析 45 | 2. 
(Lines 46 through 121 of this build/lib copy are a verbatim duplicate of ArticleSpider/spiders/jobbole.py above, including the unfixed next-page request that reuses post_url instead of next_url.)
article_item["comment_nums"] = comment_nums 122 | # article_item["fav_nums"] = fav_nums 123 | # article_item["tags"] = tags 124 | # article_item["content"] = content 125 | 126 | 127 | #通过item loader加载item 128 | front_image_url = response.meta.get("front_image_url", "") # 文章封面图 129 | item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response) 130 | item_loader.add_css("title", ".entry-header h1::text") 131 | item_loader.add_value("url", response.url) 132 | item_loader.add_value("url_object_id", get_md5(response.url)) 133 | item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text") 134 | item_loader.add_value("front_image_url", [front_image_url]) 135 | item_loader.add_css("praise_nums", ".vote-post-up h10::text") 136 | item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text") 137 | item_loader.add_css("fav_nums", ".bookmark-btn::text") 138 | item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text") 139 | item_loader.add_css("content", "div.entry") 140 | 141 | article_item = item_loader.load_item() 142 | 143 | 144 | yield article_item 145 | -------------------------------------------------------------------------------- /s0vkaq/ScrapyRedisTest/scrapy_redis/scheduler.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import six 3 | 4 | from scrapy.utils.misc import load_object 5 | 6 | from . import connection, defaults 7 | 8 | 9 | # TODO: add SCRAPY_JOB support. 10 | class Scheduler(object): 11 | """Redis-based scheduler 12 | 13 | Settings 14 | -------- 15 | SCHEDULER_PERSIST : bool (default: False) 16 | Whether to persist or clear redis queue. 17 | SCHEDULER_FLUSH_ON_START : bool (default: False) 18 | Whether to flush redis queue on start. 19 | SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0) 20 | How many seconds to wait before closing if no message is received. 21 | SCHEDULER_QUEUE_KEY : str 22 | Scheduler redis key. 23 | SCHEDULER_QUEUE_CLASS : str 24 | Scheduler queue class. 25 | SCHEDULER_DUPEFILTER_KEY : str 26 | Scheduler dupefilter redis key. 27 | SCHEDULER_DUPEFILTER_CLASS : str 28 | Scheduler dupefilter class. 29 | SCHEDULER_SERIALIZER : str 30 | Scheduler serializer. 31 | 32 | """ 33 | 34 | def __init__(self, server, 35 | persist=False, 36 | flush_on_start=False, 37 | queue_key=defaults.SCHEDULER_QUEUE_KEY, 38 | queue_cls=defaults.SCHEDULER_QUEUE_CLASS, 39 | dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, 40 | dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, 41 | idle_before_close=0, 42 | serializer=None): 43 | """Initialize scheduler. 44 | 45 | Parameters 46 | ---------- 47 | server : Redis 48 | The redis server instance. 49 | persist : bool 50 | Whether to flush requests when closing. Default is False. 51 | flush_on_start : bool 52 | Whether to flush requests on start. Default is False. 53 | queue_key : str 54 | Requests queue key. 55 | queue_cls : str 56 | Importable path to the queue class. 57 | dupefilter_key : str 58 | Duplicates filter key. 59 | dupefilter_cls : str 60 | Importable path to the dupefilter class. 61 | idle_before_close : int 62 | Timeout before giving up. 
63 | 64 | """ 65 | if idle_before_close < 0: 66 | raise TypeError("idle_before_close cannot be negative") 67 | 68 | self.server = server 69 | self.persist = persist 70 | self.flush_on_start = flush_on_start 71 | self.queue_key = queue_key 72 | self.queue_cls = queue_cls 73 | self.dupefilter_cls = dupefilter_cls 74 | self.dupefilter_key = dupefilter_key 75 | self.idle_before_close = idle_before_close 76 | self.serializer = serializer 77 | self.stats = None 78 | 79 | def __len__(self): 80 | return len(self.queue) 81 | 82 | @classmethod 83 | def from_settings(cls, settings): 84 | kwargs = { 85 | 'persist': settings.getbool('SCHEDULER_PERSIST'), 86 | 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'), 87 | 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'), 88 | } 89 | 90 | # If these values are missing, it means we want to use the defaults. 91 | optional = { 92 | # TODO: Use custom prefixes for this settings to note that are 93 | # specific to scrapy-redis. 94 | 'queue_key': 'SCHEDULER_QUEUE_KEY', 95 | 'queue_cls': 'SCHEDULER_QUEUE_CLASS', 96 | 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY', 97 | # We use the default setting name to keep compatibility. 98 | 'dupefilter_cls': 'DUPEFILTER_CLASS', 99 | 'serializer': 'SCHEDULER_SERIALIZER', 100 | } 101 | for name, setting_name in optional.items(): 102 | val = settings.get(setting_name) 103 | if val: 104 | kwargs[name] = val 105 | 106 | # Support serializer as a path to a module. 107 | if isinstance(kwargs.get('serializer'), six.string_types): 108 | kwargs['serializer'] = importlib.import_module(kwargs['serializer']) 109 | 110 | server = connection.from_settings(settings) 111 | # Ensure the connection is working. 112 | server.ping() 113 | 114 | return cls(server=server, **kwargs) 115 | 116 | @classmethod 117 | def from_crawler(cls, crawler): 118 | instance = cls.from_settings(crawler.settings) 119 | # FIXME: for now, stats are only supported from this constructor 120 | instance.stats = crawler.stats 121 | return instance 122 | 123 | def open(self, spider): 124 | self.spider = spider 125 | 126 | try: 127 | self.queue = load_object(self.queue_cls)( 128 | server=self.server, 129 | spider=spider, 130 | key=self.queue_key % {'spider': spider.name}, 131 | serializer=self.serializer, 132 | ) 133 | except TypeError as e: 134 | raise ValueError("Failed to instantiate queue class '%s': %s", 135 | self.queue_cls, e) 136 | 137 | try: 138 | self.df = load_object(self.dupefilter_cls)( 139 | server=self.server, 140 | key=self.dupefilter_key % {'spider': spider.name}, 141 | debug=spider.settings.getbool('DUPEFILTER_DEBUG'), 142 | ) 143 | except TypeError as e: 144 | raise ValueError("Failed to instantiate dupefilter class '%s': %s", 145 | self.dupefilter_cls, e) 146 | 147 | if self.flush_on_start: 148 | self.flush() 149 | # notice if there are requests already in the queue to resume the crawl 150 | if len(self.queue): 151 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 152 | 153 | def close(self, reason): 154 | if not self.persist: 155 | self.flush() 156 | 157 | def flush(self): 158 | self.df.clear() 159 | self.queue.clear() 160 | 161 | def enqueue_request(self, request): 162 | if not request.dont_filter and self.df.request_seen(request): 163 | self.df.log(request, self.spider) 164 | return False 165 | if self.stats: 166 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 167 | self.queue.push(request) 168 | return True 169 | 170 | def next_request(self): 171 | block_pop_timeout = 
self.idle_before_close 172 | request = self.queue.pop(block_pop_timeout) 173 | if request and self.stats: 174 | self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) 175 | return request 176 | 177 | def has_pending_requests(self): 178 | return len(self) > 0 179 | --------------------------------------------------------------------------------
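For reference, the settings named in the Scheduler docstring are what a Scrapy project sets to switch over to this redis-backed scheduler. A minimal, illustrative settings.py fragment follows — the host/port values are placeholders and the queue class shown is the scrapy_redis default, not something dictated by this repo.

# Illustrative settings.py fragment for enabling the redis-backed scheduler.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # replaces Scrapy's default scheduler
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # request fingerprints kept in Redis

SCHEDULER_PERSIST = True           # keep the queue/dupefilter after the crawl so it can resume
# SCHEDULER_FLUSH_ON_START = True  # uncomment to start every run from an empty queue instead
SCHEDULER_IDLE_BEFORE_CLOSE = 10   # seconds next_request() blocks waiting for new requests

SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.PriorityQueue"  # ordering of pending requests

# connection.from_settings(settings) reads these to build the Redis client.
REDIS_HOST = "localhost"
REDIS_PORT = 6379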