├── .gitignore ├── .idea ├── baidu_paper_spider.iml ├── misc.xml ├── modules.xml └── vcs.xml ├── README.md ├── README_zh.MD ├── django_search ├── .idea │ ├── django_search.iml │ ├── misc.xml │ ├── modules.xml │ └── vcs.xml ├── django_search │ ├── __init__.py │ ├── asgi.py │ ├── settings.py │ ├── urls.py │ └── wsgi.py ├── manage.py ├── search │ ├── __init__.py │ ├── admin.py │ ├── apps.py │ ├── migrations │ │ └── __init__.py │ ├── models.py │ ├── tests.py │ └── views.py ├── static │ ├── css │ │ ├── advanced.css │ │ ├── index.css │ │ ├── result.css │ │ └── style.css │ ├── img │ │ ├── Thumbs.db │ │ ├── btn_min.png │ │ ├── btnbg.png │ │ ├── down.png │ │ ├── inputbg.png │ │ ├── line.png │ │ ├── ll.png │ │ ├── logo.png │ │ ├── logo1.png │ │ ├── lr.png │ │ ├── more.png │ │ ├── result_icon.png │ │ └── searchbtn.png │ └── js │ │ ├── common.js │ │ ├── global.js │ │ ├── jquery.js │ │ └── pagination.js └── templates │ ├── index.html │ └── result.html └── paperSpider ├── .idea └── paperSpider.iml ├── main.py ├── paperSpider ├── __init__.py ├── items.py ├── middlewares.py ├── models │ ├── __init__.py │ └── es_types.py ├── pipelines.py ├── settings.py ├── spiders │ ├── __init__.py │ └── baidu.py ├── tools │ ├── __init__.py │ └── add_urls.py └── utils │ ├── __init__.py │ ├── bloomfilter.py │ └── common.py ├── scrapy.cfg ├── scrapy_redis ├── __init__.py ├── connection.py ├── defaults.py ├── dupefilter.py ├── picklecompat.py ├── pipelines.py ├── queue.py ├── scheduler.py ├── spiders.py └── utils.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.idea/baidu_paper_spider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed Document Search Engine 2 | This is an open-source project for a paper search engine, which includes a Scrapy-Redis distributed crawler, an Elasticsearch search engine, and a Django frontend. The project was designed to provide a platform for users to easily search and access research papers. 3 | 4 | ## Features 5 | - Scrapy-Redis distributed crawler using CSS Selectors. 6 | - Centralized deduplication with Redis for distribution. 7 | - Text search engine implemented with ElasticSearch. 8 | - Full-stack web application built using Django. 9 | 10 | ## Technology Stack 11 | The main technology stack used in this project includes: 12 | - Scrapy-Redis 13 | - Elasticsearch 14 | - Django 15 |
16 |
17 | 
18 | **👉👉👉 More technical details that help to understand the project follow below.**
19 | [中文版本](https://github.com/Beking0912/Distributed-Document-Search-Engine/blob/master/README_zh.MD)
20 | 
21 | ## Technology choice: Scrapy vs requests + BeautifulSoup
22 | 1. requests and BeautifulSoup are libraries, while Scrapy is a framework;
23 | 2. requests and BeautifulSoup can still be used inside a Scrapy project;
24 | 3. Scrapy is built on Twisted, and performance is its biggest advantage;
25 | 4. Scrapy is easy to extend and provides many built-in features;
26 | 5. Scrapy's built-in CSS and XPath selectors are very convenient; BeautifulSoup's biggest drawback is that it is slow.
27 | 
28 | ## Depth-first and breadth-first traversal
29 | Depth-first (recursive implementation)
30 | ```python
31 | def depth_tree(tree_node):
32 |     if tree_node is not None:
33 |         print(tree_node._data)
34 |         if tree_node._left is not None:
35 |             depth_tree(tree_node._left)
36 |         if tree_node._right is not None:
37 |             depth_tree(tree_node._right)
38 | ```
39 | 
40 | Breadth-first (queue implementation)
41 | ```python
42 | def level_queue(root):
43 |     if root is None:
44 |         return
45 |     my_queue = []
46 |     node = root
47 |     my_queue.append(node)
48 |     while my_queue:
49 |         node = my_queue.pop(0)
50 |         print(node.elem)
51 |         if node.lchild is not None:
52 |             my_queue.append(node.lchild)
53 |         if node.rchild is not None:
54 |             my_queue.append(node.rchild)
55 | ```
56 | 
57 | ## URL deduplication strategies
58 | 1. Store visited URLs in a database;
59 | 2. Store visited URLs in a set, so a membership check costs only O(1);
60 | 3. Hash each URL (with MD5 or similar) before storing it in the set;
61 | 4. Use a bitmap: map each visited URL to a single bit through a hash function;
62 | 5. Use a Bloom filter, which improves on the bitmap by using multiple hash functions to reduce collisions.
63 | 
64 | ## String encoding: encode / decode
65 | 1. Computers can only process numbers, so text must be converted to numbers first. Eight bits make one byte, so the largest number a byte can represent is 255;
66 | 2. ASCII (one byte per character) became the standard encoding in the United States;
67 | 3. ASCII cannot handle Chinese, so China created the GB2312 encoding, which uses two bytes per Chinese character;
68 | 4. Unicode unifies all languages into a single encoding;
69 | 5. That solves the garbled-text problem, but for pure English content Unicode needs twice the storage of ASCII, and twice the bandwidth when transmitted;
70 | 6. The variable-length encoding UTF-8 stores English characters in one byte and Chinese characters in three (especially rare characters take 4-6 bytes), so for content that is mostly English the benefit of UTF-8 is obvious.
71 | 
72 | ## Scrapy
73 | Scrapy is a fast, high-level screen-scraping and web-crawling framework written in Python for crawling web sites and extracting structured data from pages. Its main advantage is high concurrency (the underlying layer is an asynchronous I/O framework built on an event loop plus callbacks).
74 | [Official documentation](https://docs.scrapy.org/en/latest/)
75 | 
76 | 1. Install: `pip install Scrapy`
77 | 2. Create a project: `scrapy startproject namexxx`
78 | 
79 | ## XPath syntax: res.xpath('').extract_first('')
80 | 1. XPath uses path expressions to navigate XML and HTML documents;
81 | 2. XPath includes a standard function library;
82 | 3.
XPath is a W3C standard.
83 | 
84 | ## Advantages of distributed crawlers
85 | 1. Make full use of the bandwidth of multiple machines to speed up crawling;
86 | 2. Make full use of the IP addresses of multiple machines to speed up crawling.
87 | 
88 | ## Problems to solve when going from a single-machine crawler to a distributed one
89 | 1. Centralized management of the request queue: the scheduler keeps its queue in memory, and other servers cannot read the memory of the current server;
90 | 2. Centralized management of deduplication. Solution: move the request queue and the dedup set into a third-party component, namely Redis (an in-memory database, so reads are fast).
91 | 
92 | ## Redis
93 | Redis is a key-value storage system whose data lives in memory.
94 | 
95 | ## Redis data types
96 | String, hash, list, set, sorted set
97 | 
98 | ## Points to note when writing crawlers with Scrapy-Redis
99 | 1. Inherit from RedisSpider;
100 | 2. Requests are no longer handled by the local scheduler but by the Scrapy-Redis scheduler;
101 | 3. The start URLs have to be pushed into Redis.
102 | 
103 | ## The difference between session and cookie
104 | 1. Cookies are stored as key-value pairs
105 | 
106 | ## When installing packages fails
107 | 1. `pip install wheel`
108 | 2. `pip install -r requirements.txt`
109 | 
110 | ## Integrating Redis
111 | ## Integrating a BloomFilter
112 | 
113 | ## Incremental crawling
114 | 1. How to discover new data quickly
115 |     1. While the full crawl is still running
116 |         1. Start another crawler: one is responsible for the full crawl, the other for incremental crawling
117 |         2. Use a priority queue (easier to maintain)
118 |     2. After the crawl has finished
119 |         1. The crawler is shut down
120 |             1. How to detect that there are new URLs waiting to be crawled; as soon as there is one, a script has to start the crawler
121 |         2. The crawler keeps waiting: keep pushing URLs
122 | 2. How to handle data that has already been crawled (Scrapy ships with a deduplication mechanism)
123 |     1. Keep crawling list pages even after their data has been fetched
124 |     2. Whether to re-crawl items that were already fetched (this is an update problem)
125 | Best solution: modify the scrapy-redis source code to achieve this.
126 | 
127 | ## Implementing incremental crawling by modifying scrapy-redis
128 | 
129 | ## Updating crawled data
130 | Field that gets updated: citation count
131 | 
132 | ## Search engine requirements
133 | 1. Efficient
134 | 2. Zero configuration and completely free
135 | 3. Simple interaction with the search engine through JSON and HTTP
136 | 4. A stable search server
137 | 5. Easy to scale from one server to hundreds
138 | 
139 | ## Introduction to Elasticsearch
140 | 1. A search server based on Lucene
141 | 2. Provides a distributed, multi-tenant full-text search engine
142 | 3. Exposed through a RESTful web interface
143 | 4. Developed in Java and released as open source under the Apache license
144 | 
145 | ## Drawbacks of searching in a relational database
146 | 1. No relevance scoring -> no ranking
147 | 2. Not distributed
148 | 3. Cannot parse search requests
149 | 4. Low efficiency
150 | 5. No word segmentation (tokenization)
151 | 
152 | ## Elasticsearch installation
153 | 1. Install elasticsearch-rtf
154 | 2.
Installation of the head plugin and Kibana
155 | 
156 | ## Cross-origin (CORS) configuration
157 | ```
158 | http.cors.enabled: true
159 | http.cors.allow-origin: "*"
160 | http.cors.allow-methods: OPTIONS, HEAD, GET, POST, PUT, DELETE
161 | http.cors.allow-headers: "X-Requested-With, Content-Type, Content-Length, X-User"
162 | ```
163 | 
164 | ## Elasticsearch concepts
165 | 1. Cluster: one or more nodes organized together
166 | 2. Node: a single server in the cluster, identified by a name (by default a random comic-character name)
167 | 3. Shard: the ability to split an index into multiple pieces, which allows horizontal partitioning and scaling; multiple shards answer requests in parallel, improving performance and throughput
168 | 4. Replica: the ability to keep one or more copies of a shard, so that other nodes can take over when a node fails
169 | 
170 | 
171 | ## Elasticsearch vs MySQL
172 | 1. index => database
173 | 2. type => table
174 | 3. document => row
175 | 4. fields => columns
176 | 
177 | ## Inverted index
178 | The inverted index comes from the practical need to look up records by the value of an attribute. Each entry in the index contains an attribute value and the addresses of all records that have that value. Because records are located from attribute values rather than the other way around, it is called an inverted index. A file indexed this way is called an inverted file.
179 | 
180 | ## TF-IDF
181 | 
182 | ## Problems an inverted index has to deal with
183 | 1. Case folding: python and PYTHON should count as the same term
184 | 2. Stemming: looking and look should be treated as one term
185 | 3. Word segmentation (tokenization)
186 | 4. The inverted index file can get very large and needs compression encoding
187 | Elasticsearch handles all of the above.
188 | 
189 | ## Basic Elasticsearch indexing
190 | 
191 | ## Mapping
192 | Mapping: when creating an index, you can predefine the field types and related attributes.
193 | 
194 | ES guesses the field mapping you want from the basic types of the JSON source data and turns the input into searchable index entries. A mapping is the set of field data types we define ourselves; it also tells ES how to index the data and whether a field is searchable.
195 | 
196 | Purpose: it makes the index definition more detailed and complete.
197 | 
198 | ## ES queries
199 | 1. Basic query: query with ES's built-in query conditions
200 | 2. Compound query: combine several queries into a compound query
201 | 3. Filtering: the query applies filter conditions to narrow the data without affecting scoring
202 | 
203 | ## Edit distance
204 | Edit distance is a way of measuring how similar two strings are: the edit distance between two strings is the minimum number of insert / delete / replace / adjacent-swap operations needed to turn one string into the other.
205 | 
206 | Edit distance is usually computed with dynamic programming.
207 | 
208 | ## Environment migration
209 | 1. pip freeze > requirements.txt
210 | 2.
pip install -r requirements.txt
211 | 
212 | ## References
213 | [Elasticsearch中ik_max_word和 ik_smart的区别](https://blog.csdn.net/weixin_44062339/article/details/85006948)
214 | 
215 | [相关度评分背后的理论](https://www.elastic.co/guide/cn/elasticsearch/guide/current/scoring-theory.html)
216 | 
217 | [Elasticsearch搜索中文分词优化](https://www.jianshu.com/p/914f102bc174)
218 | 
219 | ## Problems encountered with Chinese search in Elasticsearch
220 | 1. When searching for 葡萄糖 (glucose), the results should contain only 葡萄糖 and not 葡萄 (grape); when searching for 葡萄, the results should also include 葡萄糖.
221 | 2. Searching for "RMB" only matches content that literally contains the keyword "RMB". In fact "RMB" and "人民币" are synonyms, and a search for either should match the other. How are synonyms configured in ES?
222 | 3. Pinyin search: if the user types "baidu", or the pinyin initials "bd", it should match the keyword "百度"; if the user types the homophone "摆渡", it should also match "百度". How is Chinese pinyin matching done?
223 | 4. How do we make sure search keywords are segmented correctly? Usually a custom dictionary is used, so how do we build that custom dictionary?
224 | 
225 | ## ik tokenizer
226 | 1. ik_max_word: splits the text at the finest granularity; for example, "中华人民共和国人民大会堂" (the Great Hall of the People of the People's Republic of China) is split into 中华人民共和国, 中华人民, 中华, 华人, 人民共和国, 人民, 共和国, 大会堂, 大会, 会堂, and so on.
227 | 2. ik_smart: does the coarsest-grained split; the same text is split into 中华人民共和国 and 人民大会堂.
228 | 
229 | ## Best practices
230 | The best practice for the two tokenizers is: use ik_max_word when indexing and ik_smart when searching.
231 | 
232 | That is, document content is segmented as finely as possible at index time, and searches are kept more precise. At index time the ik_max_word analyzer is usually used so that the finest-grained segmentation maximizes index coverage; at search time the ik_smart analyzer is used for coarse-grained segmentation to improve precision.
233 | 
234 | ## The ES analysis pipeline and analyzers
235 | 1. character filter: pre-processes the string before tokenization, e.g. stripping HTML tags;
236 | 2. tokenizer: English text can be split on whitespace; Chinese segmentation is harder and may rely on machine-learning algorithms;
237 | 3. token filters: lowercase tokens, remove stop words, add synonyms, add terms, etc.;
238 | 4. ES analysis pipeline: character filter -> tokenizer -> token filters
239 | 5. Custom analyzers
240 | 6. Mapping settings for segmentation
241 | ```
242 | "content": {
243 |     "type": "string",
244 |     "analyzer": "ik_max_word",
245 |     "search_analyzer": "ik_smart"
246 | }
247 | ```
248 | 
249 | ## Synonyms
250 | ## Suggest segmentation
251 | Suggest terms need prefix matching against pinyin initials, full pinyin, and Chinese characters. For example, for "百度", typing "baidu", "bd", or "百" must all match, so the field has to be indexed with several analyzers at once: Chinese is indexed character by character, while pinyin initials and full pinyin require custom analyzers.
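
As a rough illustration of those custom analyzers, the sketch below defines an index whose analysis settings add pinyin-initial and full-pinyin token filters on top of a keyword tokenizer. This is not the project's actual configuration: the index name `baidu_suggest` and the filter/analyzer names are made up for the example, and it assumes the elasticsearch-analysis-pinyin plugin is installed (the `keep_first_letter` / `keep_full_pinyin` options come from that plugin and should be verified against the installed version).

```python
from elasticsearch import Elasticsearch

client = Elasticsearch(hosts=["127.0.0.1"])

# Hypothetical analysis settings: one filter aimed at pinyin initials
# (so that 百度 can be found via "bd") and one at full pinyin.
suggest_index_body = {
    "settings": {
        "analysis": {
            "filter": {
                "py_first_letter": {
                    "type": "pinyin",
                    "keep_first_letter": True,
                    "keep_full_pinyin": False,
                },
                "py_full": {
                    "type": "pinyin",
                    "keep_first_letter": False,
                    "keep_full_pinyin": True,
                },
            },
            "analyzer": {
                "suggest_py_first_letter": {
                    "tokenizer": "keyword",
                    "filter": ["lowercase", "py_first_letter"],
                },
                "suggest_py_full": {
                    "tokenizer": "keyword",
                    "filter": ["lowercase", "py_full"],
                },
            },
        }
    }
}

# client.indices.create(index="baidu_suggest", body=suggest_index_body)
```

A suggest field would then reference these analyzers (together with a single-character analyzer for the Chinese text itself) in its multi-field mapping, so that one stored title can be prefix-matched by "百", "bd", or "baidu".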
252 | 253 | -------------------------------------------------------------------------------- /README_zh.MD: -------------------------------------------------------------------------------- 1 | # Distributed Document Search Engine 2 | 3 | [English version](https://github.com/Beking0912/Distributed-Document-Search-Engine/blob/master/README.md) 4 | 5 | ## 技术选型 scrapy vs requests+beautifulsoup 6 | 1. requests 和 beautifulsoup 都是库,scrapy 是框架; 7 | 2. scrapy 框架中可以加入requests 和 beautifulsoup; 8 | 3. scrapy 基于 twisted,性能是最大优势; 9 | 4. scrapy 方便扩展,提供了很多内置的功能; 10 | 5. scrapy 内置的 css 和 xpath selector 非常方便,beautifulsoup 最大的缺点就是慢。 11 | 12 | ## 深度优先和广度优先 13 | 深度优先(递归实现) 14 | ```python 15 | def depth_tree(tree_node): 16 | if tree_node is not None: 17 | print (tree_node._data) 18 | if tree_node._left is not None: 19 | return depth_tree(tree_node._left) 20 | if tree_node._right is not None: 21 | return depth_tree(tree_node._right) 22 | ``` 23 | 24 | 广度优先(队列实现) 25 | ```python 26 | def level_queue(root): 27 | if root is None: 28 | return 29 | my_queue = [] 30 | node = root 31 | my_queue.append(node) 32 | while my_queue: 33 | node = my_queue.pop(0) 34 | print (node.elem) 35 | if node.lchild is not None: 36 | my_queue.append(node.lchild) 37 | if node.rchild is not None: 38 | my_queue.append(node.rchild) 39 | ``` 40 | 41 | ## URL去重策略 42 | 1. 将访问过的URL保存到数据库中; 43 | 2. 将访问过的URL保存到set中,只需要O(1)的代价就可以查询URL; 44 | 3. URL经过md5等方法哈希后保存到set中; 45 | 4. 用bitmap方法将访问过的URL通过hash函数映射到某一位; 46 | 5. bloomfilter方法对bitmap进行改进,多重hash函数降低冲突。 47 | 48 | ## 字符串编码 encode decode 49 | 1. 计算机只能处理数字,文本转换为数字才能处理。计算机中8个bit作为一个字节,所以一个字节能表示最大的数字就是255; 50 | 2. ASCII(一个字节)编码就成为美国人的标准编码; 51 | 3. ASCII处理中文是不够的,中国制定了GB2312编码,用两个字节表示一个汉字; 52 | 4. unicode的出现将所有语言统一到一套编码里; 53 | 5. 乱码问题解决了,但是如果内容全是英文,unicode编码比ASCII需要多一倍的存储空间,同时如果传输需要多一倍的传输; 54 | 6. 可变长的编码utf-8的出现,把英文变长一个字节,汉字三个字节。特别生僻的变成4-6字节,如果传输大量的英文,utf-8作用就很明显了。 55 | 56 | ## scrapy 57 | scrapy 是 Python 开发的一个快速高层次的屏幕抓取和web抓取框架,用于抓取web站点并从页面中提取结构化的数据。优点:高并发(底层是异步IO框架 时间循环+回调)。 58 | 59 | [官方文档](https://docs.scrapy.org/en/latest/) 60 | 61 | 1. 下载:`pip install Scrapy` 62 | 2. 新建:`scrapy startproject namexxx` 63 | 64 | ## xpath 语法 res.xpath('').extract_first('') 65 | 1. xpath 使用路径表达式在xml和html中进行导航; 66 | 2. xpath 包含标准函数库; 67 | 3. xpath 是一个w3c的标准。 68 | 69 | ## 分布式爬虫的优点 70 | 1. 充分利用多机器的带宽加速爬取; 71 | 2. 充分利用多机的IP加速爬取速度。 72 | 73 | ## 单机爬虫 => 分布式爬虫 需要解决的问题 74 | 1. request 队列集中管理:scheduler 以队列形式存储在内存中,而其他服务器无法拿到当前服务器内存中的内容; 75 | 2. 去重集中管理。 76 | 解决方法:将 request 队列和去重 放到第三方组件中,采用 Redis(内存数据库,读取速度更快)。 77 | 78 | ## Redis 79 | Redis 是 key-value 存储系统,数据存在内存中。 80 | 81 | ## Redis 数据类型 82 | 字符串 散列/哈希 列表 集合 可排序集合 83 | 84 | ## Scrapy-Redis 编写爬虫需要注意的点 85 | 1. 继承 RedisSpider; 86 | 2. 所有 request 不再由本地 schedule 来完成,而是 Scrapy-Redis 的 schedule; 87 | 3. 需要 push 起始 url。 88 | 89 | ## session 和 cookie 的区别 90 | 1. cookie 以 key-value 形式存储 91 | 92 | ## 下载包失败时 93 | 1. `pip install wheel` 94 | 2. `pip install -r requirements.txt` 95 | 96 | ## 集成 Redis 97 | ## 集成 BloomFilter 98 | 99 | ## 爬虫的增量爬取 100 | 1. 如何快速发现新的数据 101 | 1. 全量的爬虫仍然在继续 102 | 1. 重新启动一个爬虫:一个负责全量抓取,一个负责增量抓取 103 | 2. 采用优先级队列(利于维护) 104 | 2. 爬虫已结束 105 | 1. 爬虫已关闭 106 | 1. 如何发现已经有新的URL待抓取,一旦有URL则需要脚本启动爬虫 107 | 2. 爬虫等待:继续push URL 108 | 2. 如何解决已经抓取过的数据(scrapy 自带去重机制) 109 | 1. 列表数据已经抓取过之后还要继续抓取 110 | 2. 已经抓取过的条目是否还要继续抓取(涉及更新问题) 111 | 112 | 最优方案:修改 scrapy-redis 源码可以达到目的。 113 | 114 | ## 通过修改 scrapy-redis 完成增量爬取 115 | 116 | ## 爬虫的数据更新 117 | 会更新的字段:被引用量 118 | 119 | ## 搜索引擎需求 120 | 1. 高效 121 | 2. 零配置 完全免费 122 | 3. 能够简单通过json和http与搜索引擎交互 123 | 4. 搜索服务器稳定 124 | 5. 
能够简单的将一台服务器扩展到上百台 125 | 126 | ## elasticsearch 介绍 127 | 1. 基于 Lucene 的搜索服务器 128 | 2. 提供了一个分布式多用户能力的全文搜索引擎 129 | 3. 基于 RESTful web 接口 130 | 4. 是用 Java 开发的,并作为 Apache 许可条款下的开放源码发布 131 | 132 | ## 关系数据搜索缺点 133 | 1. 无法打分 -> 无法排序 134 | 2. 无分布式 135 | 3. 无法解析搜索请求 136 | 4. 效率低 137 | 5. 分词 138 | 139 | ## elasticsearch 安装 140 | 1. 安装 elasticsearch-rtf 141 | 2. head 插件和 kibana 的安装 142 | 143 | ## 跨域配置 144 | ``` 145 | http.cors.enabled: true 146 | http:cors.allow-origin: "*" 147 | http.cors.allow-methods: OPTIONS, HEAD, GET, POST, PUT, DELETE 148 | http.cors.allow-headers: "X-Requested-With, Content-Type, Content-Type, Content-Length, X-User" 149 | ``` 150 | 151 | ## elasticsearch 概念 152 | 1. 集群:一个或多个节点组织在一起 153 | 2. 节点:一个节点是集群中的一个服务器,由一个名字来标识,默认是一个随机的漫画角色的名字 154 | 3. 分片:将索引划分为多份的能力,允许水平分割和扩展容量,多个分片响应请求,提高性能和吞吐量 155 | 4. 副本:创建分片的一份或多份的能力,在一个节点失败时其余节点可以顶上 156 | 157 | ## elasticsearch vs mysql 158 | 1. index(索引) => 数据库 159 | 2. type(类型) => 表 160 | 3. document(文档) => 行 161 | 4. fields => 列 162 | 163 | ## 倒排索引 164 | 倒排索引源于实际应用中需要根据属性的值来查找记录。这种索引表中的每一项都包括一个属性值和具有该属性值的各记录的地址。由于不是由记录来确定属性值,而是由属性值来确定记录的位置,因而称为倒排索引(inverted index)。带有倒排索引的文简称倒排文件(inverted file)。 165 | 166 | ## TF-IDF 167 | 168 | ## 倒排索引待解决问题 169 | 1. 大小写转换问题,如 python 和 PYTHON 应该为一个词 170 | 2. 词干抽取,looking 和 look 应该处理为一个词 171 | 3. 分词 172 | 4. 倒排索引文件过大,压缩编码 173 | 174 | elasticsearch 可以全部完成以上问题。 175 | 176 | ## elasticsearch 基本的索引 177 | 178 | ## 映射(mapping) 179 | 映射:创建索引时,可以预先定义字段的类型以及相关属性。 180 | 181 | es会根据 JSON 源数据的基础类型猜测你想要的字段映射。将输入的数据转变为可搜索的索引项。mapping 就是我妈自己定义的字段的数据类型,同时告诉 es 如何索引数据以及是否可以被搜索。 182 | 183 | 作用:会让索引建立的更加细致和完善。 184 | 185 | ## es 查询 186 | 1. 基本查询:使用 es 内置查询条件进行查询 187 | 2. 组合查询:把多个查询组合在一起进行复合查询 188 | 3. 过滤:查询同时通过 filter 条件在不影响打分的情况下筛选数据 189 | 190 | ## 编辑距离 191 | 编辑距离是一种字符串之间相似程度的计算方法。即两个字符串之间的编辑距离等于使一个字符串变成另一个字符串而进行的 插入/删除/替换/相邻字符串交换位置 进行操作的最少次数。 192 | 193 | 关于编辑距离的求法,普遍采用的是动态规划。 194 | 195 | ## 环境迁移 196 | 1. pip freeze > requirements.text 197 | 2. pip install -r requirement.txt 198 | 199 | ## 资料 200 | [Elasticsearch中ik_max_word和 ik_smart的区别](https://blog.csdn.net/weixin_44062339/article/details/85006948) 201 | 202 | [相关度评分背后的理论](https://www.elastic.co/guide/cn/elasticsearch/guide/current/scoring-theory.html) 203 | 204 | [Elasticsearch搜索中文分词优化](https://www.jianshu.com/p/914f102bc174) 205 | 206 | ## Elasticsearch 中文搜索时遇到几个问题 207 | 1. 检索葡萄糖关键字,希望结果仅包含葡萄糖,不包含葡萄;检索葡萄,希望结果包含葡萄糖。 208 | 2. 搜索“RMB”时只会匹配到包含“RMB”关键词的内容,实际上,“RMB”和“人民币”是同义词,我们希望用户搜索“RMB”和“人民币”可以相互匹配,ES同义词怎么配置? 209 | 3. 用户搜索拼音: 如"baidu",或者拼音首字母"bd",怎么匹配到"百度"这个关键词,又如用户输入"摆渡"这个词也能匹配到"百度"关键词,中文拼音匹配怎么做到? 210 | 4. 怎么保证搜索关键词被正确分词,通常我们会采用自定义词典来做,那么怎么获取自定义词典? 211 | 212 | ## ik 分词器 213 | 1. ik_max_word:将文本做最细粒度的拆分,比如会将“中华人民共和国人民大会堂”拆分为“中华人民共和国、中华人民、中华、华人、人民共和国、人民、共和国、大会堂、大会、会堂等词语。 214 | 2. ik_smart:会做最粗粒度的拆分,比如会将“中华人民共和国人民大会堂”拆分为中华人民共和国、人民大会堂。 215 | 216 | ## 最佳实践 217 | 两种分词器使用的最佳实践是:索引时用ik_max_word,在搜索时用ik_smart。 218 | 219 | 即:索引时最大化的将文章内容分词,搜索时更精确的搜索到想要的结果。索引时,为了提供索引的覆盖范围,通常会采用ik_max_word分析器,会以最细粒度分词索引,搜索时为了提高搜索准确度,会采用ik_smart分析器,会以粗粒度分词。 220 | 221 | ## ES 分词流程之分析(analysis)和分析器(analyzer) 222 | 1. character filter 字符过滤器:在分词前处理字符串,去除HTML标记; 223 | 2. tokenizer 分词器:英文分词可以根据空格将单词分开,中文分词比较复杂,可以采用机器学习算法来分词; 224 | 3. token filters 表征过滤器:修改大小写,停用词,增加同义词,增加词等; 225 | 4. ES分词流程:character filter-->>tokenizer-->>token filters 226 | 5. 自定义analyzer 227 | 6. 
分词mapping设置 228 | ``` 229 | "content": { 230 | "type": "string", 231 | "analyzer": "ik_max_word", 232 | "search_analyzer": "ik_smart" 233 | } 234 | ``` 235 | 236 | ## 同义词 237 | ## Suggest分词 238 | suggest词需要对拼音前缀,全拼,中文进行前缀匹配,例如:“百度”一词,键入"baidu","bd","百"都必须匹配到,因此在索引的时候需要一词分多个分词器来索引保存,中文采用单字分词,拼音首字母和全拼需要自定义analyzer来索引。 239 | 240 | 241 | -------------------------------------------------------------------------------- /django_search/.idea/django_search.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 27 | 28 | 29 | 32 | -------------------------------------------------------------------------------- /django_search/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /django_search/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /django_search/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /django_search/django_search/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/django_search/__init__.py -------------------------------------------------------------------------------- /django_search/django_search/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for django_search project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.0/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'django_search.settings') 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /django_search/django_search/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for django_search project. 3 | 4 | Generated by 'django-admin startproject' using Django 3.0.4. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.0/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/3.0/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | # Quick-start development settings - unsuitable for production 19 | # See https://docs.djangoproject.com/en/3.0/howto/deployment/checklist/ 20 | 21 | # SECURITY WARNING: keep the secret key used in production secret! 22 | SECRET_KEY = '5h7388ran&ufu53jgqtzp)2*vupdb3&wmb*zz21mx%c00(8j9&' 23 | 24 | # SECURITY WARNING: don't run with debug turned on in production! 
25 | DEBUG = True 26 | 27 | ALLOWED_HOSTS = [] 28 | 29 | # Application definition 30 | 31 | INSTALLED_APPS = [ 32 | 'django.contrib.admin', 33 | 'django.contrib.auth', 34 | 'django.contrib.contenttypes', 35 | 'django.contrib.sessions', 36 | 'django.contrib.messages', 37 | 'django.contrib.staticfiles', 38 | 'search.apps.SearchConfig', 39 | ] 40 | 41 | MIDDLEWARE = [ 42 | 'django.middleware.security.SecurityMiddleware', 43 | 'django.contrib.sessions.middleware.SessionMiddleware', 44 | 'django.middleware.common.CommonMiddleware', 45 | 'django.middleware.csrf.CsrfViewMiddleware', 46 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 47 | 'django.contrib.messages.middleware.MessageMiddleware', 48 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 49 | ] 50 | 51 | ROOT_URLCONF = 'django_search.urls' 52 | 53 | TEMPLATES = [ 54 | { 55 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 56 | 'DIRS': [os.path.join(BASE_DIR, 'templates')] 57 | , 58 | 'APP_DIRS': True, 59 | 'OPTIONS': { 60 | 'context_processors': [ 61 | 'django.template.context_processors.debug', 62 | 'django.template.context_processors.request', 63 | 'django.contrib.auth.context_processors.auth', 64 | 'django.contrib.messages.context_processors.messages', 65 | ], 66 | 'libraries': { # Adding this section should work around the issue. 67 | 'staticfiles': 'django.templatetags.static', 68 | }, 69 | }, 70 | }, 71 | ] 72 | 73 | WSGI_APPLICATION = 'django_search.wsgi.application' 74 | 75 | # Database 76 | # https://docs.djangoproject.com/en/3.0/ref/settings/#databases 77 | 78 | DATABASES = { 79 | 'default': { 80 | 'ENGINE': 'django.db.backends.sqlite3', 81 | 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), 82 | } 83 | } 84 | 85 | # Password validation 86 | # https://docs.djangoproject.com/en/3.0/ref/settings/#auth-password-validators 87 | 88 | AUTH_PASSWORD_VALIDATORS = [ 89 | { 90 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 91 | }, 92 | { 93 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 94 | }, 95 | { 96 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 97 | }, 98 | { 99 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 100 | }, 101 | ] 102 | 103 | # Internationalization 104 | # https://docs.djangoproject.com/en/3.0/topics/i18n/ 105 | 106 | LANGUAGE_CODE = 'en-us' 107 | 108 | TIME_ZONE = 'UTC' 109 | 110 | USE_I18N = True 111 | 112 | USE_L10N = True 113 | 114 | USE_TZ = True 115 | 116 | # Static files (CSS, JavaScript, Images) 117 | # https://docs.djangoproject.com/en/3.0/howto/static-files/ 118 | 119 | STATIC_URL = '/static/' 120 | 121 | STATICFILES_DIRS = [ 122 | os.path.join(BASE_DIR, "static") 123 | ] 124 | -------------------------------------------------------------------------------- /django_search/django_search/urls.py: -------------------------------------------------------------------------------- 1 | """django_search URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/3.0/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: path('', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. 
Import the include() function: from django.urls import include, path 14 | 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 15 | """ 16 | from django.contrib import admin 17 | # from django.urls import path 18 | # from django.views.generic import TemplateView 19 | from django.conf.urls import url 20 | from search.views import SearchSuggest, SearchView, IndexView 21 | 22 | urlpatterns = [ 23 | # url(r'^admin/', admin.site.urls), 24 | url(r'^$', IndexView.as_view(), name="index"), 25 | url(r'^suggest/$', SearchSuggest.as_view(), name="suggest"), 26 | url(r'^search/$', SearchView.as_view(), name="search"), 27 | ] 28 | -------------------------------------------------------------------------------- /django_search/django_search/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for django_search project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.0/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'django_search.settings') 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /django_search/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | 7 | def main(): 8 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'django_search.settings') 9 | try: 10 | from django.core.management import execute_from_command_line 11 | except ImportError as exc: 12 | raise ImportError( 13 | "Couldn't import Django. Are you sure it's installed and " 14 | "available on your PYTHONPATH environment variable? Did you " 15 | "forget to activate a virtual environment?" 16 | ) from exc 17 | execute_from_command_line(sys.argv) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /django_search/search/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/search/__init__.py -------------------------------------------------------------------------------- /django_search/search/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 
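# (Nothing is registered here: the project defines no Django ORM models; its
# search data lives in Elasticsearch through the elasticsearch_dsl DocType in
# search/models.py, so the admin site is left empty.)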
4 | -------------------------------------------------------------------------------- /django_search/search/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class SearchConfig(AppConfig): 5 | name = 'search' 6 | -------------------------------------------------------------------------------- /django_search/search/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/search/migrations/__init__.py -------------------------------------------------------------------------------- /django_search/search/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 4 | analyzer, Completion, Keyword, Text, Integer 5 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer 6 | from elasticsearch_dsl.connections import connections 7 | 8 | connections.create_connection(hosts=["localhost"]) 9 | 10 | 11 | class CustomAnalyzer(_CustomAnalyzer): 12 | def get_analysis_definition(self): 13 | return {} 14 | 15 | 16 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"]) 17 | 18 | 19 | class PaperType(DocType): 20 | suggest = Completion(analyzer=ik_analyzer) # 用于自动补全 21 | 22 | paper_title = Text(analyzer="ik_max_word") 23 | paper_writer = Text() 24 | paper_time = Text() 25 | paper_cite_count = Integer() 26 | paper_source = Keyword() 27 | paper_abstract = Text(analyzer="ik_max_word") 28 | paper_keywords = Text(analyzer="ik_max_word") 29 | paper_DOI = Text() 30 | paper_download_link = Text() 31 | 32 | class Meta: 33 | index = "baidu" 34 | doc_type = "paper" 35 | 36 | 37 | if __name__ == "__main__": 38 | PaperType.init() 39 | -------------------------------------------------------------------------------- /django_search/search/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 
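# A minimal smoke-test sketch (not part of the original project): it only checks
# that the URL names wired up in django_search/urls.py reverse to the expected
# paths, so it runs without Elasticsearch or Redis being reachable.
from django.urls import reverse


class UrlConfTests(TestCase):
    def test_named_urls_reverse(self):
        self.assertEqual(reverse("index"), "/")
        self.assertEqual(reverse("suggest"), "/suggest/")
        self.assertEqual(reverse("search"), "/search/")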
4 | -------------------------------------------------------------------------------- /django_search/search/views.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from django.shortcuts import render 4 | from django.http import HttpResponse 5 | from django.views.generic.base import View 6 | from elasticsearch import Elasticsearch 7 | import redis 8 | 9 | from search.models import PaperType 10 | 11 | client = Elasticsearch(hosts=["127.0.0.1"]) 12 | 13 | redis_cli = redis.StrictRedis(charset='utf-8', decode_responses=True) 14 | 15 | 16 | class IndexView(View): 17 | def get(self, request): 18 | topn_search = redis_cli.zrevrangebyscore("search_keywords_set", "+inf", "-inf", start=0, num=5) 19 | return render(request, "index.html", {"topn_search": topn_search}) 20 | 21 | 22 | class SearchSuggest(View): 23 | def get(self, request): 24 | key_words = request.GET.get('s', '') 25 | re_datas = [] 26 | if key_words: 27 | s = PaperType.search() 28 | s = s.suggest('suggest', key_words, completion={ 29 | "field": "suggest", "fuzzy": { 30 | "fuzziness": 2 # 编辑距离 31 | }, "size": 10 32 | }) 33 | suggestions = s.execute_suggest() 34 | for match in suggestions.suggest[0].options: 35 | source = match._source 36 | re_datas.append(source["paper_title"]) 37 | return HttpResponse(json.dumps(re_datas), content_type="application/json") 38 | 39 | 40 | class SearchView(View): 41 | def get(self, request): 42 | key_words = request.GET.get('q', '') 43 | 44 | redis_cli.zincrby("search_keywords_set", 1, key_words) 45 | topn_search = redis_cli.zrevrangebyscore("search_keywords_set", "+inf", "-inf", start=0, num=5) 46 | 47 | page = request.GET.get("p", "1") 48 | try: 49 | page = int(page) 50 | except: 51 | page = 1 52 | 53 | baidu_count = redis_cli.get("baidu_count") 54 | start_time = datetime.now() 55 | 56 | choice = request.GET.get("option", "") 57 | if choice == 'cite': 58 | response = client.search( 59 | index="baidu", 60 | body={ 61 | "sort": {"paper_cite_count": {"order": "desc"}}, 62 | "query": { 63 | "multi_match": { 64 | "query": key_words, 65 | "fields": ["paper_title", "paper_keywords", "paper_abstract"] 66 | } 67 | }, 68 | "from": (page - 1) * 10, 69 | "size": 10, 70 | "highlight": { 71 | "pre_tags": [''], 72 | "post_tags": [''], 73 | "fields": { 74 | "paper_title": {}, 75 | "paper_abstract": {}, 76 | "paper_keywords": {} 77 | } 78 | } 79 | } 80 | ) 81 | elif choice == 'date': 82 | response = client.search( 83 | index="baidu", 84 | body={ 85 | "sort": {"paper_time": {"order": "desc"}}, 86 | "query": { 87 | "multi_match": { 88 | "query": key_words, 89 | "fields": ["paper_title", "paper_keywords", "paper_abstract"] 90 | } 91 | }, 92 | "from": (page - 1) * 10, 93 | "size": 10, 94 | "highlight": { 95 | "pre_tags": [''], 96 | "post_tags": [''], 97 | "fields": { 98 | "paper_title": {}, 99 | "paper_abstract": {}, 100 | "paper_keywords": {} 101 | } 102 | } 103 | } 104 | ) 105 | else: 106 | response = client.search( 107 | index="baidu", 108 | body={ 109 | "query": { 110 | "multi_match": { 111 | "query": key_words, 112 | "fields": ["paper_title", "paper_keywords", "paper_abstract"] 113 | } 114 | }, 115 | "from": (page-1)*10, 116 | "size": 10, 117 | "highlight": { 118 | "pre_tags": [''], 119 | "post_tags": [''], 120 | "fields": { 121 | "paper_title": {}, 122 | "paper_abstract": {}, 123 | "paper_keywords": {} 124 | } 125 | } 126 | } 127 | ) 128 | 129 | end_time = datetime.now() 130 | last_seconds = (end_time - start_time).total_seconds() 
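        # Note on the paging arithmetic below: total_nums is taken from the ES
        # response ("hits" -> "total"), while the remainder check tests
        # page % 10; total_nums % 10 is almost certainly the intended condition
        # when computing page_nums.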
131 | 132 | total_nums = response["hits"]["total"] 133 | if(page % 10) > 0: 134 | page_nums = int(total_nums/10+1) 135 | else: 136 | page_nums = int(total_nums / 10) 137 | 138 | hit_list = [] 139 | for hit in response["hits"]["hits"]: 140 | hit_dict = {} 141 | if "highlight" in hit: 142 | if "paper_title" in hit["highlight"]: 143 | hit_dict["paper_title"] = "".join(hit["highlight"]["paper_title"]) 144 | else: 145 | hit_dict["paper_title"] = hit["_source"]["paper_title"] 146 | 147 | if "paper_abstract" in hit["highlight"]: 148 | hit_dict["paper_abstract"] = "".join(hit["highlight"]["paper_abstract"]) 149 | else: 150 | hit_dict["paper_abstract"] = hit["_source"]["paper_abstract"] 151 | 152 | if "paper_keywords" in hit["highlight"]: 153 | hit_dict["paper_keywords"] = hit["highlight"]["paper_keywords"] 154 | else: 155 | if "paper_keywords" in hit["_source"]: 156 | hit_dict["paper_keywords"] = hit["_source"]["paper_keywords"] 157 | 158 | if "paper_writer" in hit["_source"]: 159 | hit_dict["paper_writer"] = hit["_source"]["paper_writer"] 160 | 161 | if "paper_time" in hit["_source"]: 162 | hit_dict["paper_time"] = hit["_source"]["paper_time"] 163 | 164 | if "paper_cite_count" in hit["_source"]: 165 | hit_dict["paper_cite_count"] = hit["_source"]["paper_cite_count"] 166 | hit_dict["paper_source"] = hit["_source"]["paper_source"] 167 | 168 | if "paper_DOI" in hit["_source"]: 169 | hit_dict["paper_DOI"] = hit["_source"]["paper_DOI"] 170 | 171 | if "paper_download_link" in hit["_source"]: 172 | hit_dict["paper_download_link"] = hit["_source"]["paper_download_link"][:5] 173 | 174 | hit_dict["score"] = hit["_score"] 175 | 176 | hit_list.append(hit_dict) 177 | 178 | return render(request, "result.html", {"page": page, 179 | "all_hits": hit_list, 180 | "key_words": key_words, 181 | "total_nums": total_nums, 182 | "page_nums": page_nums, 183 | "last_seconds": last_seconds, 184 | "baidu_count": baidu_count, 185 | "topn_search": topn_search}) 186 | 187 | -------------------------------------------------------------------------------- /django_search/static/css/advanced.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #hd{padding:20px 10px;} 4 | .logo{float:left;margin-right:30px; height:33px;} 5 | /*input搜索区域*/ 6 | .inputArea{float:left;position:relative;} 7 | .inputArea .searchInput{border:1px solid #bfbfbf;padding:0 15px;outline:none;height:35px;*line-height:38px;width:350px; background:url(../img/inputbg.png);font-size:14px;} 8 | .inputArea .searchButton{position:absolute;left:382px;top:0;*top:1px;*left:381px;width:106px;height:38px;background:url(../img/btn_min.png) no-repeat;border:none; cursor:pointer;} 9 | 10 | /*返回搜索*/ 11 | .inputArea .back{position:absolute;font-size:14px;left:500px;top:12px;width:60px;text-decoration:underline;} 12 | 13 | /*分界区域*/ 14 | .divsion{margin-bottom:24px;height:36px;background:#f9f9f9;border-bottom:1px solid #e0e0e0;} 15 | 16 | /*高级搜索区域*/ 17 | .subfield{border-left:4px solid #9cc813;font-size:14px;font-weight:bold;padding:2px 0 2px 20px;} 18 | .subfieldContent{padding-left:140px;padding-bottom:40px;} 19 | .subfieldContent .advanceItem{padding-left:350px;margin-bottom:15px;padding-top:8px;padding-bottom:3px;} 20 | .subfieldContent .advanceItem.keyWords{background:#f4f4f4;padding-top:18px;padding-bottom:3px;} 21 | .subfieldContent .advanceItem dd{float:left;margin-left:-320px;} 22 | .subfieldContent .advanceItem dd label{float:left;margin-right:40px;width:75px;font-weight:bold;} 23 | 
.subfieldContent .advanceItem dd .impInfo{ font-weight:bold;} 24 | .subfieldContent .advanceItem dd .tips{float:left;} 25 | .subfieldContent .advanceItem dd p, .subfieldContent .advanceItem dt p{margin-bottom:10px;height:26px;} 26 | .subfieldContent .advanceItem dt p input[type=text]{position:relative;top:-5px;line-height:26px;} 27 | 28 | .subfieldContent .advanceItem dt{float:left;width:100%;} 29 | .subfieldContent .advanceItem.keyWords dt input[type=text]{width:290px;height:26px;border:1px solid #bfbfbf;outline:none;} 30 | /*自定义*/ 31 | .subfieldContent .advanceItem.time{height:30px;} 32 | .subfieldContent .advanceItem .define{display:none;position:relative;*top:-3px;} 33 | .subfieldContent .advanceItem.time input[type=text]{width:80px;height:18px;line-height:18px;border:1px solid #bfbfbf;outline:none;} 34 | 35 | 36 | 37 | 38 | 39 | /*更多按钮*/ 40 | .more {float:left;} 41 | .more:hover{text-decoration:none;} 42 | .more .moreIcon{display:inline-block;position:relative;top:-1px;*top:-3px;left:2px;*left:-1px;width:9px;height:5px;background:url(../img/more.png);} 43 | .more.show .moreIcon{background:url(../img/down.png);top:-2px;} 44 | 45 | /*立即搜索样式*/ 46 | .subfieldContent .search{margin:45px 0 0 145px;width:130px;height:36px;background:url(../img/btnbg.png); font-weight:bold;border:none;border:1px solid #bfbfbf;line-height:36px;cursor:pointer;font-size:14px;} 47 | /*联想下拉区域*/ 48 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 49 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 50 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 51 | -------------------------------------------------------------------------------- /django_search/static/css/index.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #main{width:730px;margin:75px auto 0;} 4 | #main h1.title{width:600px;} 5 | #bd{margin-bottom:20px;} 6 | .logo.large{margin:0px auto 10px auto;width:400px;height:190px;background: url(../img/logo.png) no-repeat center center;} 7 | 8 | /*nav样式*/ 9 | .nav{margin-bottom:10px;} 10 | .searchList{float:left;padding-left:5px;} 11 | .searchList .searchItem{float:left;margin-right:15px;font-size:14px;padding:0 0 2px 2px;cursor:pointer;height:18px;} 12 | .searchList .searchItem.current{color:#0080cc;border-bottom:2px solid #9cc813;font-weight:bold;} 13 | 14 | /*input搜索区域*/ 15 | .inputArea{position:relative;margin-bottom:65px;} 16 | .inputArea .searchInput{border-radius: 5rem;border:1px solid #bfbfbf;padding:0 15px;outline:none;height:40px;*height:39px;*line-height:40px;width:520px; background:url(../img/inputbg.png);font-size:14px;} 17 | .inputArea .searchButton{position:absolute;left:560px;*left:562px;*top:3px;top:4px;width:40px;height:40px;*height:40px;background:url(../img/btn_min.png) no-repeat;border:none;cursor:pointer;} 18 | /*高级搜索*/ 19 | .inputArea .advanced{position:absolute;font-size:14px;left:674px;top:12px;text-decoration:underline;} 20 | /*联想下拉区域*/ 21 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 22 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 23 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 24 | 25 | /*搜索历史区域*/ 26 | .historyArea{width:600px;} 27 | .historyArea .history 
{margin-bottom:5px;} 28 | .historyArea .history label{font-weight:bold;} 29 | .historyArea .history a{margin-right:12px;} 30 | -------------------------------------------------------------------------------- /django_search/static/css/result.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #hd{padding:20px 10px;} 4 | #bd{margin-bottom:40px;} 5 | .logo{float:left;margin-right:30px; height:70px;} 6 | /*input搜索区域*/ 7 | .inputArea{float:left;position:relative;} 8 | .inputArea .searchInput{border-radius: 5rem;border:1px solid #bfbfbf;padding:0 15px;outline:none;height:35px;*line-height:35px;width:520px; background:url(../img/inputbg.png);font-size:14px;} 9 | .inputArea .searchButton{position:absolute;left:560px;top:3px;*top:2px;*left:560px;width:40px;height:40px;background:url(../img/btn_min.png) no-repeat;border:none;cursor:pointer;} 10 | 11 | /*返回高级搜索*/ 12 | .inputArea .advanced{position:absolute;font-size:14px;left:500px;top:12px;width:60px;text-decoration:underline;} 13 | 14 | /*分界区域,导航*/ 15 | .nav{margin-bottom:24px;height:31px;background:#f9f9f9;border-bottom:1px solid #e0e0e0;padding:5px 0 0 210px;} 16 | .searchList{float:left;padding-left:5px;} 17 | .searchList .searchItem{float:left;margin-right:15px;font-size:14px;padding:0 3px 2px 3px;cursor:pointer;height:26px; line-height:26px;} 18 | .searchList .searchItem.current{color:#0080cc;border-bottom:3px solid #9cc813;font-weight:bold;} 19 | .nav .tips{color:#969696;font-size:12px;line-height:24px;*line-height:26px;} 20 | #container.sideBarHide .nav{padding-left:35px;} 21 | 22 | /*#main区域样式*/ 23 | #main{padding:0 215px 0 182px;} 24 | #main.sideBarHide{padding-left:10px;} 25 | /*侧边栏搜索条件*/ 26 | .sideBar{position:relative;float:left;margin-left:-182px;width:182px;} 27 | .sideBar .subfieldContext{margin-bottom:20px;padding-left:25px;} 28 | .sideBar .subfieldContext li{margin-bottom:5px;cursor:pointer;} 29 | .sideBar .subfieldContext input[type=text]{width:75px;} 30 | .sideBar .unit{color:#787878;} 31 | 32 | /*更多按钮*/ 33 | .sideBar .more a:hover{text-decoration:none;} 34 | .sideBar .more .moreIcon{display:inline-block;position:relative;top:-1px;*top:-3px;left:2px;*left:-1px;width:9px;height:5px;background:url(../img/more.png);} 35 | .sideBar .more.show .moreIcon{background:url(../img/down.png);top:-2px;} 36 | 37 | .sideBar .reset{padding-left:25px;} 38 | /*siderBar区域显隐控制*/ 39 | .sideBar .sideBarShowHide{position:absolute;right:0px;top:20px;height:177px;width:1px; background:url(../img/line.png) right;} 40 | .sideBar .sideBarShowHide a{position:absolute;top:70px;left:-11px;display:inline-block;width:12px;height:31px;background:url(../img/lr.png);} 41 | 42 | .sideBar .sideBarShowHide a:hover{background-position:0 -31px;} 43 | 44 | /*左侧收起样式*/ 45 | #main.sideBarHide .sideBar{margin-left:-191px;*margin-left:-182px;} 46 | #main.sideBarHide .sideBar .sideBarShowHide{-moz-transform:rotate(180deg); -o-transform:rotate(180deg); -webkit-transform:rotate(180deg); transform:rotate(180deg);} 47 | #main.sideBarHide .sideBar .sideBarShowHide a{*background:url(../img/ll.png);} 48 | #main.sideBarHide .sideBar .sideBarShowHide a:hover{*background-position:0 -31px;} 49 | #main.sideBarHide .sideBar .sideBarShowHide{background:none;} 50 | 51 | .resultArea{float:left;width:100%;} 52 | .resultArea .resultTotal{position:relative;padding-left:30px;margin-bottom:20px;} 53 | .resultArea .resultTotal .info{color:#9a9a9a;} 54 | .resultArea .resultTotal 
.orderOpt{position:absolute;right:50px;} 55 | .resultArea .resultTotal .orderOpt a{margin-right:10px;color:#0080cc;} 56 | 57 | /*搜索结果列表区域*/ 58 | .resultArea .resultList{padding-left:30px;} 59 | /*.resultArea .resultList .resultItem{margin-bottom:20px;}*/ 60 | .resultArea .resultList .resultItem{margin-bottom:20px;border-bottom: 1px solid #eee;} 61 | .resultArea .resultList .itemHead{margin-bottom:5px;color:#767676;} 62 | .resultArea .resultList .itemHead .keyWord{color:#c00;} 63 | .resultArea .resultList .itemBody .keyWord{color:#c00;} 64 | .resultArea .resultList .itemHead a.title{font-size:18px;color:#001ba0;text-decoration:none;} 65 | .resultArea .resultList .itemHead .value{color:#008000;} 66 | .resultArea .resultList .itemHead .divsion{margin:0 5px;} 67 | .resultArea .resultList .itemHead .fileType{margin-right:10px;} 68 | 69 | /*搜索内容主体*/ 70 | .resultArea .resultList .itemBody{font-size:12px;margin-bottom:5px;line-height:18px;width:90%;} 71 | .resultArea .resultList .itemFoot{color:#008000; margin-bottom: 20px;} 72 | .resultArea .resultList .itemFoot .info{margin-right:10px;} 73 | 74 | .resultArea .pagination{margin-bottom:25px;padding-left:32px;} 75 | /*相关搜索*/ 76 | .resultArea .dependSearch{margin-bottom:30px;padding-left:32px;font-size:14px;} 77 | .resultArea .dependSearch h6{float:left;margin-right:15px;font-weight:bold;} 78 | .resultArea .dependSearch p{margin-bottom:5px;} 79 | .resultArea .dependSearch a{display:inline-block;margin-right:15px;text-decoration:underline;width:90px; white-space:nowrap; overflow:hidden;text-overflow:ellipsis;} 80 | .resultArea .searchInResult{padding-left:35px;} 81 | .resultArea .searchInResult .inResult{position:absolute;right:-190px;top:8px;font-size:14px;text-decoration:underline;} 82 | .resultArea .searchInResult .searchButton{left:417px;} 83 | /*历史搜索区域*/ 84 | .historyArea{float:right;margin-right:-216px;width:212px;} 85 | .historyArea h6{margin-bottom:10px;font-weight:bold;} 86 | .historyArea .historyList{margin-bottom:20px;} 87 | .historyArea .historyList li{margin-bottom:5px;} 88 | 89 | 90 | 91 | /*左侧分栏区域*/ 92 | .subfield{margin-bottom:5px;font-size:14px;font-weight:bold;padding:2px 0 2px 24px;} 93 | .subfield:first-child{border-left:4px solid #9cc813;padding-left:20px;} 94 | 95 | 96 | 97 | /*立即搜索样式*/ 98 | .subfieldContent .search{margin:45px 0 0 135px;width:130px;height:36px;background:url(../img/btnbg.png); font-weight:bold;border:none;border:1px solid #bfbfbf;line-height:36px;} 99 | /*联想下拉区域*/ 100 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 101 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 102 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 103 | 104 | 105 | 106 | .writers{color: #888;font-size: 12px;margin-right: 4px; } 107 | .resultArea .resultList .itemBody_writer{font-size: 12px; margin-bottom: 2px} 108 | .resultArea .resultList .itemBody_keywords{ margin-top: 10px;margin-bottom: 10px;} 109 | .keywords{font-size: 12px; color: #333; background-color: #f7f7f7; margin: 4px; padding: 4px} 110 | .download{color: #001ba0;} 111 | 112 | .showDownloead{ visibility:hidden; } 113 | .clickLink{margin-left: 4px;} -------------------------------------------------------------------------------- /django_search/static/css/style.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | /*css reset*/ 3 | html, body, div, span, object, 
iframe, h1, h2, h3, h4, h5, h6, p, blockquote, pre, a, abbr, acronym, address, code, del, dfn, em, img, q, dl, dt, dd, ol, ul, li, fieldset, form, label, legend, table, caption, tbody, tfoot, thead, tr, th, td , i{ 4 | margin:0; 5 | padding:0; 6 | border:0; 7 | font-weight:inherit; 8 | font-style:inherit; 9 | font-size:100%; 10 | font-family:inherit; 11 | vertical-align:baseline; 12 | } 13 | body {line-height:1.5;} 14 | table {border-collapse: collapse;border-spacing:0;} 15 | caption, th, td ,b,strong{text-align:left;font-weight:normal;} 16 | table, td, th {vertical-align:middle;} 17 | blockquote:before, blockquote:after, q:before, q:after {content:"";} 18 | blockquote, q {quotes:"" "";} 19 | a img {border:none;} 20 | em,cite{font-style:normal;} 21 | 22 | 23 | body { background:#fff; font: 12px/1.5 Tahoma,'宋体';color:#000;} 24 | h1, h2, h3, h4, h5, h6 {font-weight:normal;color:#111;} 25 | a {text-decoration:none;cursor:pointer;} 26 | dl, dt, dd, ol, ul, li{ list-style:none;} 27 | 28 | /*some common class*/ 29 | .left{float:left;} 30 | .right{float:right;} 31 | 32 | /*clear*/ 33 | .ue-clear:after{content: ".";display:block;height:0;clear:both;visibility:hidden;} 34 | .ue-clear{display:inline-block;} 35 | *html .ue-clear{height:1%;} 36 | .ue-clear{display:block;} 37 | 38 | a{color:#0080cc;} 39 | a:hover{color:#267A01;text-decoration:underline;} 40 | /*logo样式*/ 41 | .logo{width:160px;height:47px;padding:0 5px;background: url(../img/logo1.png) no-repeat center center #fff;} 42 | 43 | /*choose样式*/ 44 | .choose{float:left;margin-right:15px;white-space:nowrap;} 45 | .choose .text{float:left;padding-left:20px;*padding-left:16px;white-space:nowrap; vertical-align:text-bottom;} 46 | .choose input[type=radio],.choose input[type=checkbox]{position:relative;*top:-3px;float:left;margin-right:-16px;} 47 | 48 | /*==================================== 49 | 分页信息(表格依赖样式) 50 | ===================================*/ 51 | .pagination{font-size:14px;} 52 | .pagination a {text-decoration: none;border: solid 1px; } 53 | .pagination .pxofy{float:left;margin-left: 5px;height:25px;*padding-top:1px;} 54 | .pagination a, .pagination span {display: block;float: left;height:18px;line-height:18px;padding:0 6px;margin-right: 5px;font-family:Arial, Helvetica, sans-serif !important;} 55 | .pagination .current {cursor:default;border: solid 1px ;} 56 | .pagination .prev, .pagination .next{*line-height:22px;} 57 | 58 | /*分页样式*/ 59 | .pagination a{color: #032F54;border-color:#8EB2D2;} 60 | .pagination a:hover{color:#023054;border-color:#8EB2D2;background:#B8DFFB;} 61 | .pagination .current{color:#fff;border-color:#5c9bc4;background:#89B8D8;} 62 | .pagination .current.prev, .pagination .current.next{color:#B9B9B9;border-color:#D3D3D3;background:#fff;} 63 | .pagination .pxofy{color: #023054;} 64 | 65 | #foot{height:32px;line-height:32px; text-align:center;background:#f9f9f9;border-top:1px solid #e0e0e0;color:#ababab;} 66 | 67 | 68 | -------------------------------------------------------------------------------- /django_search/static/img/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/Thumbs.db -------------------------------------------------------------------------------- /django_search/static/img/btn_min.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/btn_min.png -------------------------------------------------------------------------------- /django_search/static/img/btnbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/btnbg.png -------------------------------------------------------------------------------- /django_search/static/img/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/down.png -------------------------------------------------------------------------------- /django_search/static/img/inputbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/inputbg.png -------------------------------------------------------------------------------- /django_search/static/img/line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/line.png -------------------------------------------------------------------------------- /django_search/static/img/ll.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/ll.png -------------------------------------------------------------------------------- /django_search/static/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/logo.png -------------------------------------------------------------------------------- /django_search/static/img/logo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/logo1.png -------------------------------------------------------------------------------- /django_search/static/img/lr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/lr.png -------------------------------------------------------------------------------- /django_search/static/img/more.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/more.png -------------------------------------------------------------------------------- /django_search/static/img/result_icon.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/result_icon.png -------------------------------------------------------------------------------- /django_search/static/img/searchbtn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/searchbtn.png -------------------------------------------------------------------------------- /django_search/static/js/common.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by yli on 2017/4/21. 3 | */ 4 | 5 | var searchArr; 6 | //定义一个search的,判断浏览器有无数据存储(搜索历史) 7 | if(localStorage.search){ 8 | //如果有,转换成 数组的形式存放到searchArr的数组里(localStorage以字符串的形式存储,所以要把它转换成数组的形式) 9 | searchArr= localStorage.search.split(",") 10 | }else{ 11 | //如果没有,则定义searchArr为一个空的数组 12 | searchArr = []; 13 | } 14 | //把存储的数据显示出来作为搜索历史 15 | MapSearchArr(); 16 | 17 | 18 | $("#btn").on("click", function(){ 19 | var val = $("#inp").val(); 20 | //点击搜索按钮时,去重 21 | KillRepeat(val); 22 | //去重后把数组存储到浏览器localStorage 23 | localStorage.search = searchArr; 24 | //然后再把搜索内容显示出来 25 | MapSearchArr(); 26 | }); 27 | 28 | 29 | function MapSearchArr(){ 30 | var tmpHtml = ""; 31 | for (var i=0;i " 33 | } 34 | $("#keyname").html(tmpHtml); 35 | } 36 | //去重 37 | function KillRepeat(val){ 38 | var kill = 0; 39 | for (var i=0;i ne_half ? Math.max(Math.min(current_page 33 | - ne_half, upper_limit), 0) : 0; 34 | var end = current_page > ne_half ? Math.min(current_page + ne_half, 35 | np) : Math.min(opts.num_display_entries, np); 36 | return [start, end]; 37 | } 38 | 39 | /** 40 | * 点击事件 41 | */ 42 | function pageSelected(page_id, evt) { 43 | var page_id = parseInt(page_id); 44 | current_page = page_id; 45 | drawLinks(); 46 | var continuePropagation = opts.callback(page_id, panel); 47 | if (!continuePropagation) { 48 | if (evt.stopPropagation) { 49 | evt.stopPropagation(); 50 | } else { 51 | evt.cancelBubble = true; 52 | } 53 | } 54 | return continuePropagation; 55 | } 56 | 57 | /** 58 | * 链接 59 | */ 60 | function drawLinks() { 61 | panel.empty(); 62 | var interval = getInterval(); 63 | var np = numPages(); 64 | var getClickHandler = function(page_id) { 65 | return function(evt) { 66 | return pageSelected(page_id, evt); 67 | } 68 | } 69 | var appendItem = function(page_id, appendopts) { 70 | page_id = page_id < 0 ? 0 : (page_id < np ? 
page_id : np-1); 71 | appendopts = jQuery.extend({ 72 | text : page_id+1, 73 | classes : "" 74 | }, appendopts || {}); 75 | if (page_id == current_page) { 76 | var lnk = $("" + (appendopts.text) 77 | + ""); 78 | } else { 79 | var lnk = $("" + (appendopts.text) + "").bind( 80 | "click", getClickHandler(page_id)).attr('href', 81 | opts.link_to.replace(/__id__/, page_id)); 82 | 83 | } 84 | if (appendopts.classes) { 85 | lnk.addClass(appendopts.classes); 86 | } 87 | panel.append(lnk); 88 | } 89 | // 上一页 90 | if (opts.prev_text && (current_page > 0 || opts.prev_show_always)) { 91 | appendItem(current_page - 1, { 92 | text : opts.prev_text, 93 | classes : "prev" 94 | }); 95 | } 96 | // 点点点 97 | if (interval[0] > 0 && opts.num_edge_entries > 0) { 98 | var end = Math.min(opts.num_edge_entries, interval[0]); 99 | for (var i = 0; i < end; i++) { 100 | appendItem(i); 101 | } 102 | if (opts.num_edge_entries < interval[0] && opts.ellipse_text) { 103 | jQuery("" + opts.ellipse_text + "") 104 | .appendTo(panel); 105 | } 106 | } 107 | // 中间的页码 108 | for (var i = interval[0]; i < interval[1]; i++) { 109 | appendItem(i); 110 | } 111 | // 最后的页码 112 | if (interval[1] < np && opts.num_edge_entries > 0) { 113 | if (np - opts.num_edge_entries > interval[1] 114 | && opts.ellipse_text) { 115 | jQuery("" + opts.ellipse_text + "") 116 | .appendTo(panel); 117 | } 118 | var begin = Math.max(np - opts.num_edge_entries, interval[1]); 119 | for (var i = begin; i < np; i++) { 120 | appendItem(i); 121 | } 122 | 123 | } 124 | // 下一页 125 | if (opts.next_text 126 | && (current_page < np - 1 || opts.next_show_always)) { 127 | appendItem(current_page + 1, { 128 | text : opts.next_text, 129 | classes : "next" 130 | }); 131 | } 132 | // 记录显示 133 | if (opts.display_msg) { 134 | if(!maxentries){ 135 | panel 136 | .append('
No data to display yet
'); 137 | }else{ 138 | panel 139 | .append('
Showing record ' 140 | + ((current_page * opts.items_per_page) + 1) 141 | + ' to ' 142 | + (((current_page + 1) * opts.items_per_page) > maxentries 143 | ? maxentries 144 | : ((current_page + 1) * opts.items_per_page)) 145 | + ' of ' + maxentries + ' records in total
'); 146 | } 147 | } 148 | // optional jump-to-page control 149 | if(opts.setPageNo){ 150 | panel.append("
Jump to
"); 151 | } 152 | } 153 | 154 | // 当前页 155 | var current_page = opts.current_page; 156 | maxentries = ( maxentries < 0) ? 0 : maxentries; 157 | opts.items_per_page = (!opts.items_per_page || opts.items_per_page < 0) 158 | ? 1 159 | : opts.items_per_page; 160 | var panel = jQuery(this); 161 | this.selectPage = function(page_id) { 162 | pageSelected(page_id); 163 | } 164 | this.prevPage = function() { 165 | if (current_page > 0) { 166 | pageSelected(current_page - 1); 167 | return true; 168 | } else { 169 | return false; 170 | } 171 | } 172 | this.nextPage = function() { 173 | if (current_page < numPages() - 1) { 174 | pageSelected(current_page + 1); 175 | return true; 176 | } else { 177 | return false; 178 | } 179 | } 180 | 181 | if(maxentries==0){ 182 | panel.append(''+opts.prev_text+''+opts.next_text+'
No data to display yet
'); 183 | }else{ 184 | drawLinks(); 185 | } 186 | $(this).find(".goto button").live("click",function(evt){ 187 | var setPageNo = $(this).parent().find("input").val(); 188 | if(setPageNo!=null && setPageNo!=""&&setPageNo>0&&setPageNo<=numPages()){ 189 | pageSelected(setPageNo-1, evt); 190 | } 191 | }); 192 | }); 193 | } 194 | -------------------------------------------------------------------------------- /django_search/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% load staticfiles %} 4 | 5 | 6 | 7 | 搜索引擎 8 | 9 | 10 | 11 | 12 |
13 |
14 |
15 |

16 | 17 |

18 |
19 | 20 | 21 |
    22 |
    23 | 24 |
    25 |

    26 | 27 | {% for search_word in topn_search %} 28 | {{ search_word }} 29 | {% endfor %} 30 |

    31 |

    32 | 33 | 34 | 35 | 36 | 37 |

    38 |
    39 |
    40 |
    41 | 42 |
    43 | 44 | 45 | 46 | 96 | 156 | -------------------------------------------------------------------------------- /django_search/templates/result.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% load staticfiles %} 4 | 5 | 6 | 7 | 搜索引擎 8 | 9 | 10 | 11 | 12 |
    13 |
    14 | 15 | 16 | 17 |
    18 | 19 | 20 |
      21 |
      22 |
      23 |
      24 |
      25 | 54 |
      55 |
      56 |
      热门搜索
      57 |
        58 | {% for search_word in topn_search %} 59 |
      • {{ search_word }}
      • 60 | {% endfor %} 61 |
      62 |
      63 |
      64 |
      我的搜索
      65 |
        66 |
        67 |
        68 |
        69 |

        70 | 找到约 {{ total_nums }} 条结果(用时{{ last_seconds }}秒),共约{{ page_nums }} 73 |

        74 |
        75 | 76 | {% for hit in all_hits %} 77 |
        78 |
        79 | 80 | {% autoescape off %} 81 | {{ hit.paper_title }} 82 | {% endautoescape %} 83 | 84 | - 85 | 86 | [来源] 87 | 88 | 89 | 得分 90 | {{ hit.score }} 91 | 92 |
        93 |
        94 | {% for writer in hit.paper_writer %} 95 | {{ writer }} 96 | {% endfor %} 97 |
        98 |
        99 | {% autoescape off %} 100 | {{ hit.paper_abstract }} 101 | {% endautoescape %} 102 |
        103 |
        104 | {% for word in hit.paper_keywords %} 105 | 106 | {% autoescape off %} 107 | {{ word }} 108 | {% endautoescape %} 109 | 110 | {% endfor %} 111 |
        112 |
        113 | 114 | {{ hit.paper_time }}年 115 | 116 | 117 | 118 | {{ hit.paper_cite_count }} 119 | 120 | 121 | 122 | {{ hit.paper_DOI }} 123 | 124 | 125 | 下载地址 > 126 | 127 | {% for link in hit.paper_download_link %} 128 | 点击下载 129 | {% endfor %} 130 | 131 | 132 |
        133 |
        134 | {% endfor %} 135 | 136 |
        137 | 138 | 139 | 140 |
        141 |
        142 |
        143 |
        144 | 145 | 146 | 147 | 148 | 202 | 268 | 318 | -------------------------------------------------------------------------------- /paperSpider/.idea/paperSpider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /paperSpider/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-02-18 15:52 3 | # @Author : beking 4 | from scrapy.cmdline import execute 5 | 6 | import sys 7 | import os 8 | 9 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 10 | 11 | execute(["scrapy", "crawl", "baidu"]) 12 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/paperSpider/paperSpider/__init__.py -------------------------------------------------------------------------------- /paperSpider/paperSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class PaperspiderItem(scrapy.Item): 12 | paper_title = scrapy.Field() # 论文题目 13 | paper_writer = scrapy.Field() # 作者 14 | paper_time = scrapy.Field() # 发表年代 15 | paper_cite_count = scrapy.Field() # 被引用量 16 | paper_source = scrapy.Field() # 来源 17 | paper_abstract = scrapy.Field() # 摘要 18 | paper_keywords = scrapy.Field() # 关键词 19 | paper_DOI = scrapy.Field() # DOI 20 | paper_download_link = scrapy.Field() # 下载地址 21 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class PaperspiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 
41 | 42 | # Should return either None or an iterable of Request, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class PaperspiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-22 13:34 3 | # @Author : beking 4 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/models/es_types.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-22 13:35 3 | # @Author : beking 4 | 5 | from datetime import datetime 6 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 7 | analyzer, Completion, Keyword, Text, Integer 8 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer 9 | from elasticsearch_dsl.connections import connections 10 | 11 | connections.create_connection(hosts=["localhost"]) 12 | 13 | 14 | class CustomAnalyzer(_CustomAnalyzer): 15 | def get_analysis_definition(self): 16 | return {} 17 | 18 | 19 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"]) 20 | 21 | 22 | class PaperType(DocType): 23 | suggest = Completion(analyzer=ik_analyzer) # 用于自动补全 24 | 25 | paper_title = Text(analyzer="ik_max_word") 26 | paper_writer = Keyword() 27 | paper_time = Integer() 28 | paper_cite_count = Integer() 29 | paper_source = Keyword() 30 | paper_abstract = Text(analyzer="ik_max_word") 31 | paper_keywords = Text(analyzer="ik_max_word") 32 | paper_DOI = Text() 33 | paper_download_link = Text() 34 | 35 | class Meta: 36 | index = "baidu" 37 | doc_type = "paper" 38 | 39 | 40 | if __name__ == "__main__": 41 | PaperType.init() 42 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | # import codecs 8 | # from scrapy.exporters import JsonItemExporter 9 | import redis 10 | from paperSpider.models.es_types import PaperType 11 | 12 | from elasticsearch_dsl.connections import connections 13 | 14 | es = connections.create_connection(PaperType._doc_type.using) 15 | 16 | redis_cli = redis.StrictRedis() 17 | 18 | # import MySQLdb 19 | 20 | 21 | def gen_suggests(index, info_tuple): 22 | # 根据字符串生成搜索建议数组 23 | used_words = set() 24 | suggests = [] 25 | for text, weight in info_tuple: 26 | if text: 27 | # 调用 es 的 analyze 接口分析字符串 28 | words = es.indices.analyze(index=index, analyzer="ik_smart", params={'filter': ["lowercase"]}, body=text) 29 | analyzed_words = set([r["token"] for r in words["tokens"] if len(r["token"]) > 1]) 30 | new_words = analyzed_words - used_words 31 | else: 32 | new_words = set() 33 | 34 | if new_words: 35 | suggests.append({"input": list(new_words), "weight": weight}) 36 | used_words = used_words.union(new_words) 37 | 38 | return suggests 39 | 40 | 41 | class PaperspiderPipeline(object): 42 | def process_item(self, item, spider): 43 | return item 44 | 45 | 
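# gen_suggests() above feeds the Completion field declared as `suggest` in
# models/es_types.py. For a (title, keywords, abstract) triple it returns a
# weighted list shaped roughly like
#     [{"input": ["deep", "learning"], "weight": 10},
#      {"input": ["neural", "network"], "weight": 5}]
# (illustrative tokens only), which Elasticsearch's completion suggester
# consumes for search-as-you-type suggestions.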
46 | # class MysqlPipeline(object): 47 | # def __init__(self): 48 | # self.conn = MySQLdb.connect('127.0.0.1', 'root', '123456', 'paper_spider_data', charset='utf8', 49 | # use_unicode=True) 50 | # self.cursor = self.conn.cursor() 51 | # 52 | # def process_item(self, item, spider): 53 | # insert_sql = """ 54 | # insert into paper_info(paper_title,paper_writer,paper_time,paper_cite_count,paper_source,paper_abstract,paper_keywords,paper_DOI,paper_download_link) 55 | # values (%s, %s, %s, %s, %s, %s, %s, %s, %s) 56 | # """ 57 | # params = list() 58 | # params.append(item.get('paper_title', '')) 59 | # params.append(item.get('paper_writer', '')) 60 | # params.append(item.get('paper_time', '')) 61 | # params.append(item.get('paper_cite_count', '')) 62 | # params.append(item.get('paper_source', '')) 63 | # params.append(item.get('paper_abstract', '')) 64 | # params.append(item.get('paper_keywords', '')) 65 | # params.append(item.get('paper_DOI', '')) 66 | # params.append(item.get('paper_download_link', '')) 67 | # self.cursor.execute(insert_sql, tuple(params)) 68 | # self.conn.commit() 69 | # 70 | # return item 71 | # 72 | # 73 | # class JsonExporterPipeline(object): 74 | # def __init__(self): 75 | # self.file = open('paper_export.json', 'wb') 76 | # self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False) 77 | # self.exporter.start_exporting() 78 | # 79 | # def process_item(self, item, spider): 80 | # self.exporter.export_item(item) 81 | # return item 82 | # 83 | # def spider_closed(self): 84 | # self.exporter.finish_exporting() 85 | # self.file.close() 86 | 87 | 88 | class ElasticsearchPipeline(object): 89 | def process_item(self, item, spider): 90 | # 将 item 转化为 es 的数据 91 | paper = PaperType() 92 | paper.paper_title = item['paper_title'] 93 | paper.paper_writer = item['paper_writer'] 94 | paper.paper_abstract = item['paper_abstract'] 95 | paper.paper_keywords = item['paper_keywords'] 96 | paper.paper_DOI = item['paper_DOI'] 97 | paper.paper_time = item['paper_time'] 98 | paper.paper_cite_count = item['paper_cite_count'] 99 | paper.paper_source = item['paper_source'] 100 | paper.paper_download_link = item['paper_download_link'] 101 | paper.meta.id = item['paper_source'] 102 | 103 | paper.suggest = gen_suggests(PaperType._doc_type.index, 104 | ((paper.paper_title, 10), (paper.paper_keywords, 5), (paper.paper_abstract, 2))) 105 | 106 | paper.save() 107 | redis_cli.incr("baidu_count") 108 | 109 | return item 110 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for paperSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://docs.scrapy.org/en/latest/topics/settings.html 9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'paperSpider' 13 | 14 | SPIDER_MODULES = ['paperSpider.spiders'] 15 | NEWSPIDER_MODULE = 'paperSpider.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'paperSpider (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Enables scheduling storing requests queue in redis. 
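# Replacing the default scheduler with scrapy_redis.scheduler.Scheduler moves the
# request queue into Redis, so every worker process pointed at the same Redis
# instance pulls from one shared crawl frontier instead of keeping its own.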
24 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 25 | 26 | # Schedule requests using a priority queue. (default) 27 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' 28 | 29 | # Ensure all spiders share same duplicates filter through redis. 30 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | # CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | # DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | # CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | # COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | # TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | # DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | # } 54 | 55 | # Enable or disable spider middlewares 56 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 57 | # SPIDER_MIDDLEWARES = { 58 | # 'paperSpider.middlewares.PaperspiderSpiderMiddleware': 543, 59 | # } 60 | 61 | # Enable or disable downloader middlewares 62 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 63 | # DOWNLOADER_MIDDLEWARES = { 64 | # 'paperSpider.middlewares.PaperspiderDownloaderMiddleware': 543, 65 | # } 66 | 67 | # Enable or disable extensions 68 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 69 | # EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | # } 72 | 73 | # Configure item pipelines 74 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 75 | # ITEM_PIPELINES = { 76 | # 'paperSpider.pipelines.JsonExporterPipeline': 2, 77 | # 'paperSpider.pipelines.MysqlPipeline': 3, 78 | # 'paperSpider.pipelines.PaperspiderPipeline': 300, 79 | # } 80 | # Store scraped item in redis for post-processing. 
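# Pipeline priorities run lowest-first: ElasticsearchPipeline (1) indexes the item
# and bumps the Redis "baidu_count" counter before RedisPipeline (300) serialises
# it into the <spider>:items list for any later post-processing.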
81 | ITEM_PIPELINES = { 82 | 'scrapy_redis.pipelines.RedisPipeline': 300, 83 | 'paperSpider.pipelines.ElasticsearchPipeline': 1 84 | } 85 | # Enable and configure the AutoThrottle extension (disabled by default) 86 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 87 | # AUTOTHROTTLE_ENABLED = True 88 | # The initial download delay 89 | # AUTOTHROTTLE_START_DELAY = 5 90 | # The maximum download delay to be set in case of high latencies 91 | # AUTOTHROTTLE_MAX_DELAY = 60 92 | # The average number of requests Scrapy should be sending in parallel to 93 | # each remote server 94 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 95 | # Enable showing throttling stats for every response received: 96 | # AUTOTHROTTLE_DEBUG = False 97 | 98 | # Enable and configure HTTP caching (disabled by default) 99 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 100 | # HTTPCACHE_ENABLED = True 101 | # HTTPCACHE_EXPIRATION_SECS = 0 102 | # HTTPCACHE_DIR = 'httpcache' 103 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 104 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 105 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/spiders/baidu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from urllib import parse 4 | from paperSpider.utils.common import format_word 5 | import scrapy 6 | from scrapy import Request 7 | from paperSpider.items import PaperspiderItem 8 | from scrapy_redis.spiders import RedisSpider 9 | 10 | 11 | class BaiduSpider(scrapy.Spider): 12 | name = 'baidu' 13 | # redis_key = 'baidu:start_urls' 14 | allowed_domains = ['xueshu.baidu.com'] 15 | input_keyword = 'machine%20learning' 16 | start_urls = [ 17 | # 'http://xueshu.baidu.com/s?wd=machine%20learning&pn=0&tn=SE_baiduxueshu_c1gjeupa&ie=utf-8&sc_f_para=sc_tasktype%3D%7BfirstAdvancedSearch%7D&sc_hit=1' 18 | # 'http://xueshu.baidu.com/s?wd=%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0&tn=SE_baiduxueshu_c1gjeupa&cl=3&ie=utf-8&bs=machine+learning&f=8&rsv_bp=1&rsv_sug2=0&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D&rsv_spt=3' 19 | # 'http://xueshu.baidu.com/s?wd=%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98&rsv_bp=0&tn=SE_baiduxueshu_c1gjeupa&rsv_spt=3&ie=utf-8&f=8&rsv_sug2=0&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D' 20 | 'http://xueshu.baidu.com/s?wd=%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0&tn=SE_baiduxueshu_c1gjeupa&cl=3&ie=utf-8&bs=%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98&f=8&rsv_bp=1&rsv_sug2=0&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D&rsv_spt=3' 21 | ] 22 | 23 | def parse(self, response): 24 | paper_nodes = response.xpath('//*[@class="sc_content"]') 25 | for paper_node in paper_nodes: 26 | paper_url = paper_node.css('h3 a::attr(href)').extract_first('') 27 | yield Request(url=parse.urljoin(response.url, paper_url), callback=self.parse_detail) 28 | 29 | next_url = response.css('#page a:last-child::attr(href)').extract_first('') 30 | if next_url: 31 | # next_url = 'http://xueshu.baidu.com' + next_url 32 | yield Request(url=parse.urljoin(response.url, 
next_url), callback=self.parse) 33 | 34 | def parse_detail(self, response): 35 | paper_item = PaperspiderItem() 36 | 37 | paper_title = response.css('.main-info h3 a::text').extract_first('') 38 | paper_title = format_word(paper_title) 39 | 40 | paper_writer = response.css('.author_wr .author_text span a::text').extract() 41 | paper_abstract = response.css('.abstract::text').extract_first('') 42 | paper_keywords = response.css('.kw_wr .kw_main span a::text').extract() 43 | 44 | paper_DOI = response.css('.doi_wr .kw_main::text').extract_first('') 45 | paper_DOI = format_word(paper_DOI) 46 | 47 | paper_cite_count = response.css('.sc_cite_cont::text').extract_first(0) 48 | paper_cite_count = format_word(paper_cite_count) 49 | 50 | paper_source = response.css('.love_wr .label-ll a::attr(href)').extract_first('') 51 | 52 | paper_time = response.css('.year_wr .kw_main::text').extract_first('暂无') 53 | paper_time = format_word(paper_time) 54 | 55 | paper_download_link = response.css('#savelink_wr .dl_item_span a::attr(href)').extract() 56 | 57 | paper_item['paper_title'] = paper_title 58 | paper_item['paper_writer'] = paper_writer 59 | paper_item['paper_abstract'] = paper_abstract 60 | paper_item['paper_keywords'] = paper_keywords 61 | paper_item['paper_DOI'] = paper_DOI 62 | paper_item['paper_time'] = paper_time 63 | paper_item['paper_cite_count'] = paper_cite_count 64 | paper_item['paper_source'] = paper_source 65 | paper_item['paper_download_link'] = paper_download_link[:5] 66 | 67 | yield paper_item 68 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-21 13:37 3 | # @Author : beking 4 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/tools/add_urls.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-21 13:37 3 | # @Author : beking 4 | 5 | import redis 6 | import json 7 | 8 | rd = redis.Redis("127.0.0.1", decode_responses=True) 9 | rd.lpush('baidu:start_urls', 10 | 'http://xueshu.baidu.com/s?wd=machine+learning&rsv_bp=0&tn=SE_baiduxueshu_c1gjeupa&rsv_spt=3&ie=utf-8&f=8&rsv_sug2=1&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D') 11 | 12 | urls = [('', 3, 'parse_detail'), ('', 4, 'parse_detail')] 13 | 14 | for url in urls: 15 | rd.rpush("baidu:new_urls", json.dumps(url)) 16 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-19 12:29 3 | # @Author : beking 4 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/utils/bloomfilter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-20 13:25 3 | # @Author : beking 4 | 5 | import mmh3 6 | import redis 7 | import math 8 | import time 9 | 10 | 11 | class PyBloomFilter(): 12 | # 内置100个随机种子 13 | SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372, 14 | 344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338, 15 | 465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 
574, 480, 307, 580, 71, 535, 300, 53, 16 | 481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371, 17 | 63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518] 18 | 19 | # capacity 预先估计要去重的数量 20 | # error_rate 错误率 21 | # conn redis的连接客户端 22 | # key 在redis中的键的名字前缀 23 | def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'): 24 | self.m = math.ceil(capacity * math.log2(math.e) * math.log2(1 / error_rate)) # 需要的总bit位数 25 | self.k = math.ceil(math.log1p(2) * self.m / capacity) # 需要最少的hash次数 26 | self.mem = math.ceil(self.m / 8 / 1024 / 1024) # 需要的多少M内存 27 | self.blocknum = math.ceil(self.mem / 512) # 需要多少个512M的内存块,value的第一个字符必须是ascii码,所有最多有256个内存块 28 | self.seeds = self.SEEDS[0:self.k] 29 | self.key = key 30 | self.N = 2 ** 31 - 1 31 | self.redis = conn 32 | print(self.mem) 33 | print(self.k) 34 | 35 | def add(self, value): 36 | name = self.key + "_" + str(ord(value[0]) % self.blocknum) 37 | hashs = self.get_hashs(value) 38 | for hash in hashs: 39 | self.redis.setbit(name, hash, 1) 40 | 41 | def is_exist(self, value): 42 | name = self.key + "_" + str(ord(value[0]) % self.blocknum) 43 | hashs = self.get_hashs(value) 44 | exist = True 45 | for hash in hashs: 46 | exist = exist & self.redis.getbit(name, hash) 47 | return exist 48 | 49 | def get_hashs(self, value): 50 | hashs = list() 51 | for seed in self.seeds: 52 | hash = mmh3.hash(value, seed) 53 | if hash >= 0: 54 | hashs.append(hash) 55 | else: 56 | hashs.append(self.N - hash) 57 | return hashs 58 | 59 | 60 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0) 61 | conn = redis.StrictRedis(connection_pool=pool) 62 | 63 | start = time.time() 64 | bf = PyBloomFilter(conn=conn) 65 | bf.add('www.jobbole.com') 66 | bf.add('www.zhihu.com') 67 | print(bf.is_exist('www.zhihu.com')) 68 | print(bf.is_exist('www.lagou.com')) 69 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/utils/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-19 12:29 3 | # @Author : beking 4 | import re 5 | 6 | 7 | def format_word(word): 8 | if word: 9 | re.sub(r'\s+', '', word) 10 | word = str(word).strip() 11 | return word 12 | -------------------------------------------------------------------------------- /paperSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = paperSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = paperSpider 12 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .connection import ( # NOQA 3 | get_redis, 4 | get_redis_from_settings, 5 | ) 6 | 7 | __author__ = 'Rolando Espinoza' 8 | __email__ = 'rolando at rmax.io' 9 | __version__ = '0.7.0-dev' 10 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/connection.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | from scrapy.utils.misc import load_object 4 | 5 | 
from . import defaults 6 | 7 | # Shortcut maps 'setting name' -> 'parmater name'. 8 | SETTINGS_PARAMS_MAP = { 9 | 'REDIS_URL': 'url', 10 | 'REDIS_HOST': 'host', 11 | 'REDIS_PORT': 'port', 12 | 'REDIS_ENCODING': 'encoding', 13 | } 14 | 15 | 16 | def get_redis_from_settings(settings): 17 | """Returns a redis client instance from given Scrapy settings object. 18 | 19 | This function uses ``get_client`` to instantiate the client and uses 20 | ``defaults.REDIS_PARAMS`` global as defaults values for the parameters. You 21 | can override them using the ``REDIS_PARAMS`` setting. 22 | 23 | Parameters 24 | ---------- 25 | settings : Settings 26 | A scrapy settings object. See the supported settings below. 27 | 28 | Returns 29 | ------- 30 | server 31 | Redis client instance. 32 | 33 | Other Parameters 34 | ---------------- 35 | REDIS_URL : str, optional 36 | Server connection URL. 37 | REDIS_HOST : str, optional 38 | Server host. 39 | REDIS_PORT : str, optional 40 | Server port. 41 | REDIS_ENCODING : str, optional 42 | Data encoding. 43 | REDIS_PARAMS : dict, optional 44 | Additional client parameters. 45 | 46 | """ 47 | params = defaults.REDIS_PARAMS.copy() 48 | params.update(settings.getdict('REDIS_PARAMS')) 49 | # XXX: Deprecate REDIS_* settings. 50 | for source, dest in SETTINGS_PARAMS_MAP.items(): 51 | val = settings.get(source) 52 | if val: 53 | params[dest] = val 54 | 55 | # Allow ``redis_cls`` to be a path to a class. 56 | if isinstance(params.get('redis_cls'), six.string_types): 57 | params['redis_cls'] = load_object(params['redis_cls']) 58 | 59 | return get_redis(**params) 60 | 61 | 62 | # Backwards compatible alias. 63 | from_settings = get_redis_from_settings 64 | 65 | 66 | def get_redis(**kwargs): 67 | """Returns a redis client instance. 68 | 69 | Parameters 70 | ---------- 71 | redis_cls : class, optional 72 | Defaults to ``redis.StrictRedis``. 73 | url : str, optional 74 | If given, ``redis_cls.from_url`` is used to instantiate the class. 75 | **kwargs 76 | Extra parameters to be passed to the ``redis_cls`` class. 77 | 78 | Returns 79 | ------- 80 | server 81 | Redis client instance. 82 | 83 | """ 84 | redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS) 85 | url = kwargs.pop('url', None) 86 | if url: 87 | return redis_cls.from_url(url, **kwargs) 88 | else: 89 | return redis_cls(**kwargs) 90 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/defaults.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | # For standalone use. 4 | DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' 5 | 6 | PIPELINE_KEY = '%(spider)s:items' 7 | 8 | REDIS_CLS = redis.StrictRedis 9 | REDIS_ENCODING = 'utf-8' 10 | # Sane connection defaults. 
11 | REDIS_PARAMS = { 12 | 'socket_timeout': 30, 13 | 'socket_connect_timeout': 30, 14 | 'retry_on_timeout': True, 15 | 'encoding': REDIS_ENCODING, 16 | } 17 | 18 | SCHEDULER_QUEUE_KEY = '%(spider)s:requests' 19 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' 20 | SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter' 21 | SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter' 22 | 23 | START_URLS_KEY = '%(name)s:start_urls' 24 | START_URLS_AS_SET = False 25 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/dupefilter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from scrapy.dupefilters import BaseDupeFilter 5 | from scrapy.utils.request import request_fingerprint 6 | 7 | from . import defaults 8 | from .connection import get_redis_from_settings 9 | from paperSpider.utils.bloomfilter import conn, PyBloomFilter 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | # TODO: Rename class to RedisDupeFilter. 15 | class RFPDupeFilter(BaseDupeFilter): 16 | """Redis-based request duplicates filter. 17 | 18 | This class can also be used with default Scrapy's scheduler. 19 | 20 | """ 21 | 22 | logger = logger 23 | 24 | def __init__(self, server, key, debug=False): 25 | """Initialize the duplicates filter. 26 | 27 | Parameters 28 | ---------- 29 | server : redis.StrictRedis 30 | The redis server instance. 31 | key : str 32 | Redis key Where to store fingerprints. 33 | debug : bool, optional 34 | Whether to log filtered requests. 35 | 36 | """ 37 | self.server = server 38 | self.key = key 39 | self.debug = debug 40 | self.logdupes = True 41 | 42 | self.bf = PyBloomFilter(conn=conn, key=key) 43 | 44 | @classmethod 45 | def from_settings(cls, settings): 46 | """Returns an instance from given settings. 47 | 48 | This uses by default the key ``dupefilter:``. When using the 49 | ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as 50 | it needs to pass the spider name in the key. 51 | 52 | Parameters 53 | ---------- 54 | settings : scrapy.settings.Settings 55 | 56 | Returns 57 | ------- 58 | RFPDupeFilter 59 | A RFPDupeFilter instance. 60 | 61 | 62 | """ 63 | server = get_redis_from_settings(settings) 64 | # XXX: This creates one-time key. needed to support to use this 65 | # class as standalone dupefilter with scrapy's default scheduler 66 | # if scrapy passes spider on open() method this wouldn't be needed 67 | # TODO: Use SCRAPY_JOB env as default and fallback to timestamp. 68 | key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())} 69 | debug = settings.getbool('DUPEFILTER_DEBUG') 70 | return cls(server, key=key, debug=debug) 71 | 72 | @classmethod 73 | def from_crawler(cls, crawler): 74 | """Returns instance from crawler. 75 | 76 | Parameters 77 | ---------- 78 | crawler : scrapy.crawler.Crawler 79 | 80 | Returns 81 | ------- 82 | RFPDupeFilter 83 | Instance of RFPDupeFilter. 84 | 85 | """ 86 | return cls.from_settings(crawler.settings) 87 | 88 | def request_seen(self, request): 89 | """Returns True if request was already seen. 90 | 91 | Parameters 92 | ---------- 93 | request : scrapy.http.Request 94 | 95 | Returns 96 | ------- 97 | bool 98 | 99 | """ 100 | fp = self.request_fingerprint(request) 101 | 102 | if self.bf.is_exist(fp): 103 | return True 104 | else: 105 | self.bf.add(fp) 106 | return False 107 | # This returns the number of values added, zero if already exists. 
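# The comment above and the two commented-out lines below are the stock
# scrapy-redis implementation: fingerprints were SADDed into a Redis set and a
# request counted as seen when sadd() returned 0. This fork swaps that
# exact-membership set for the Redis-bitmap Bloom filter imported from
# paperSpider.utils.bloomfilter, keeping memory bounded for very large crawls at
# the cost of a tiny false-positive rate.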
108 | # added = self.server.sadd(self.key, fp) 109 | # return added == 0 110 | 111 | def request_fingerprint(self, request): 112 | """Returns a fingerprint for a given request. 113 | 114 | Parameters 115 | ---------- 116 | request : scrapy.http.Request 117 | 118 | Returns 119 | ------- 120 | str 121 | 122 | """ 123 | return request_fingerprint(request) 124 | 125 | @classmethod 126 | def from_spider(cls, spider): 127 | settings = spider.settings 128 | server = get_redis_from_settings(settings) 129 | dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY) 130 | key = dupefilter_key % {'spider': spider.name} 131 | debug = settings.getbool('DUPEFILTER_DEBUG') 132 | return cls(server, key=key, debug=debug) 133 | 134 | def close(self, reason=''): 135 | """Delete data on close. Called by Scrapy's scheduler. 136 | 137 | Parameters 138 | ---------- 139 | reason : str, optional 140 | 141 | """ 142 | self.clear() 143 | 144 | def clear(self): 145 | """Clears fingerprints data.""" 146 | self.server.delete(self.key) 147 | 148 | def log(self, request, spider): 149 | """Logs given request. 150 | 151 | Parameters 152 | ---------- 153 | request : scrapy.http.Request 154 | spider : scrapy.spiders.Spider 155 | 156 | """ 157 | if self.debug: 158 | msg = "Filtered duplicate request: %(request)s" 159 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 160 | elif self.logdupes: 161 | msg = ("Filtered duplicate request %(request)s" 162 | " - no more duplicates will be shown" 163 | " (see DUPEFILTER_DEBUG to show all duplicates)") 164 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 165 | self.logdupes = False 166 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/picklecompat.py: -------------------------------------------------------------------------------- 1 | """A pickle wrapper module with protocol=-1 by default.""" 2 | 3 | try: 4 | import cPickle as pickle # PY2 5 | except ImportError: 6 | import pickle 7 | 8 | 9 | def loads(s): 10 | return pickle.loads(s) 11 | 12 | 13 | def dumps(obj): 14 | return pickle.dumps(obj, protocol=-1) 15 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.misc import load_object 2 | from scrapy.utils.serialize import ScrapyJSONEncoder 3 | from twisted.internet.threads import deferToThread 4 | 5 | from . import connection, defaults 6 | 7 | default_serialize = ScrapyJSONEncoder().encode 8 | 9 | 10 | class RedisPipeline(object): 11 | """Pushes serialized item into a redis list/queue 12 | 13 | Settings 14 | -------- 15 | REDIS_ITEMS_KEY : str 16 | Redis key where to store items. 17 | REDIS_ITEMS_SERIALIZER : str 18 | Object path to serializer function. 19 | 20 | """ 21 | 22 | def __init__(self, server, 23 | key=defaults.PIPELINE_KEY, 24 | serialize_func=default_serialize): 25 | """Initialize pipeline. 26 | 27 | Parameters 28 | ---------- 29 | server : StrictRedis 30 | Redis client instance. 31 | key : str 32 | Redis key where to store items. 33 | serialize_func : callable 34 | Items serializer function. 
35 | 36 | """ 37 | self.server = server 38 | self.key = key 39 | self.serialize = serialize_func 40 | 41 | @classmethod 42 | def from_settings(cls, settings): 43 | params = { 44 | 'server': connection.from_settings(settings), 45 | } 46 | if settings.get('REDIS_ITEMS_KEY'): 47 | params['key'] = settings['REDIS_ITEMS_KEY'] 48 | if settings.get('REDIS_ITEMS_SERIALIZER'): 49 | params['serialize_func'] = load_object( 50 | settings['REDIS_ITEMS_SERIALIZER'] 51 | ) 52 | 53 | return cls(**params) 54 | 55 | @classmethod 56 | def from_crawler(cls, crawler): 57 | return cls.from_settings(crawler.settings) 58 | 59 | def process_item(self, item, spider): 60 | return deferToThread(self._process_item, item, spider) 61 | 62 | def _process_item(self, item, spider): 63 | key = self.item_key(item, spider) 64 | data = self.serialize(item) 65 | self.server.rpush(key, data) 66 | return item 67 | 68 | def item_key(self, item, spider): 69 | """Returns redis key based on given spider. 70 | 71 | Override this function to use a different key depending on the item 72 | and/or spider. 73 | 74 | """ 75 | return self.key % {'spider': spider.name} 76 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/queue.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.reqser import request_to_dict, request_from_dict 2 | 3 | from . import picklecompat 4 | 5 | 6 | class Base(object): 7 | """Per-spider base queue class""" 8 | 9 | def __init__(self, server, spider, key, serializer=None): 10 | """Initialize per-spider redis queue. 11 | 12 | Parameters 13 | ---------- 14 | server : StrictRedis 15 | Redis client instance. 16 | spider : Spider 17 | Scrapy spider instance. 18 | key: str 19 | Redis key where to put and get messages. 20 | serializer : object 21 | Serializer object with ``loads`` and ``dumps`` methods. 22 | 23 | """ 24 | if serializer is None: 25 | # Backward compatibility. 26 | # TODO: deprecate pickle. 
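# picklecompat (see picklecompat.py above) is a thin pickle wrapper using
# protocol=-1; any custom SCHEDULER_SERIALIZER only has to expose loads()/dumps(),
# which the checks below enforce.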
27 | serializer = picklecompat 28 | if not hasattr(serializer, 'loads'): 29 | raise TypeError("serializer does not implement 'loads' function: %r" 30 | % serializer) 31 | if not hasattr(serializer, 'dumps'): 32 | raise TypeError("serializer '%s' does not implement 'dumps' function: %r" 33 | % serializer) 34 | 35 | self.server = server 36 | self.spider = spider 37 | self.key = key % {'spider': spider.name} 38 | self.serializer = serializer 39 | 40 | def _encode_request(self, request): 41 | """Encode a request object""" 42 | obj = request_to_dict(request, self.spider) 43 | return self.serializer.dumps(obj) 44 | 45 | def _decode_request(self, encoded_request): 46 | """Decode an request previously encoded""" 47 | obj = self.serializer.loads(encoded_request) 48 | return request_from_dict(obj, self.spider) 49 | 50 | def __len__(self): 51 | """Return the length of the queue""" 52 | raise NotImplementedError 53 | 54 | def push(self, request): 55 | """Push a request""" 56 | raise NotImplementedError 57 | 58 | def pop(self, timeout=0): 59 | """Pop a request""" 60 | raise NotImplementedError 61 | 62 | def clear(self): 63 | """Clear queue/stack""" 64 | self.server.delete(self.key) 65 | 66 | 67 | class FifoQueue(Base): 68 | """Per-spider FIFO queue""" 69 | 70 | def __len__(self): 71 | """Return the length of the queue""" 72 | return self.server.llen(self.key) 73 | 74 | def push(self, request): 75 | """Push a request""" 76 | self.server.lpush(self.key, self._encode_request(request)) 77 | 78 | def pop(self, timeout=0): 79 | """Pop a request""" 80 | if timeout > 0: 81 | data = self.server.brpop(self.key, timeout) 82 | if isinstance(data, tuple): 83 | data = data[1] 84 | else: 85 | data = self.server.rpop(self.key) 86 | if data: 87 | return self._decode_request(data) 88 | 89 | 90 | class PriorityQueue(Base): 91 | """Per-spider priority queue abstraction using redis' sorted set""" 92 | 93 | def __len__(self): 94 | """Return the length of the queue""" 95 | return self.server.zcard(self.key) 96 | 97 | def push(self, request): 98 | """Push a request""" 99 | data = self._encode_request(request) 100 | score = -request.priority 101 | # We don't use zadd method as the order of arguments change depending on 102 | # whether the class is Redis or StrictRedis, and the option of using 103 | # kwargs only accepts strings, not bytes. 104 | self.server.execute_command('ZADD', self.key, score, data) 105 | 106 | def pop(self, timeout=0): 107 | """ 108 | Pop a request 109 | timeout not support in this queue class 110 | """ 111 | # use atomic range/remove using multi/exec 112 | pipe = self.server.pipeline() 113 | pipe.multi() 114 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 115 | results, count = pipe.execute() 116 | if results: 117 | return self._decode_request(results[0]) 118 | 119 | 120 | class LifoQueue(Base): 121 | """Per-spider LIFO queue.""" 122 | 123 | def __len__(self): 124 | """Return the length of the stack""" 125 | return self.server.llen(self.key) 126 | 127 | def push(self, request): 128 | """Push a request""" 129 | self.server.lpush(self.key, self._encode_request(request)) 130 | 131 | def pop(self, timeout=0): 132 | """Pop a request""" 133 | if timeout > 0: 134 | data = self.server.blpop(self.key, timeout) 135 | if isinstance(data, tuple): 136 | data = data[1] 137 | else: 138 | data = self.server.lpop(self.key) 139 | 140 | if data: 141 | return self._decode_request(data) 142 | 143 | 144 | # TODO: Deprecate the use of these names. 
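# The aliases below are kept only for backwards compatibility; this project's
# settings.py selects the queue through its canonical path
# 'scrapy_redis.queue.PriorityQueue'.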
145 | SpiderQueue = FifoQueue 146 | SpiderStack = LifoQueue 147 | SpiderPriorityQueue = PriorityQueue 148 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/scheduler.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import six 3 | 4 | from scrapy.utils.misc import load_object 5 | 6 | from . import connection, defaults 7 | 8 | 9 | # TODO: add SCRAPY_JOB support. 10 | class Scheduler(object): 11 | """Redis-based scheduler 12 | 13 | Settings 14 | -------- 15 | SCHEDULER_PERSIST : bool (default: False) 16 | Whether to persist or clear redis queue. 17 | SCHEDULER_FLUSH_ON_START : bool (default: False) 18 | Whether to flush redis queue on start. 19 | SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0) 20 | How many seconds to wait before closing if no message is received. 21 | SCHEDULER_QUEUE_KEY : str 22 | Scheduler redis key. 23 | SCHEDULER_QUEUE_CLASS : str 24 | Scheduler queue class. 25 | SCHEDULER_DUPEFILTER_KEY : str 26 | Scheduler dupefilter redis key. 27 | SCHEDULER_DUPEFILTER_CLASS : str 28 | Scheduler dupefilter class. 29 | SCHEDULER_SERIALIZER : str 30 | Scheduler serializer. 31 | 32 | """ 33 | 34 | def __init__(self, server, 35 | persist=False, 36 | flush_on_start=False, 37 | queue_key=defaults.SCHEDULER_QUEUE_KEY, 38 | queue_cls=defaults.SCHEDULER_QUEUE_CLASS, 39 | dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, 40 | dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, 41 | idle_before_close=0, 42 | serializer=None): 43 | """Initialize scheduler. 44 | 45 | Parameters 46 | ---------- 47 | server : Redis 48 | The redis server instance. 49 | persist : bool 50 | Whether to flush requests when closing. Default is False. 51 | flush_on_start : bool 52 | Whether to flush requests on start. Default is False. 53 | queue_key : str 54 | Requests queue key. 55 | queue_cls : str 56 | Importable path to the queue class. 57 | dupefilter_key : str 58 | Duplicates filter key. 59 | dupefilter_cls : str 60 | Importable path to the dupefilter class. 61 | idle_before_close : int 62 | Timeout before giving up. 63 | 64 | """ 65 | if idle_before_close < 0: 66 | raise TypeError("idle_before_close cannot be negative") 67 | 68 | self.server = server 69 | self.persist = persist 70 | self.flush_on_start = flush_on_start 71 | self.queue_key = queue_key 72 | self.queue_cls = queue_cls 73 | self.dupefilter_cls = dupefilter_cls 74 | self.dupefilter_key = dupefilter_key 75 | self.idle_before_close = idle_before_close 76 | self.serializer = serializer 77 | self.stats = None 78 | 79 | def __len__(self): 80 | return len(self.queue) 81 | 82 | @classmethod 83 | def from_settings(cls, settings): 84 | kwargs = { 85 | 'persist': settings.getbool('SCHEDULER_PERSIST'), 86 | 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'), 87 | 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'), 88 | } 89 | 90 | # If these values are missing, it means we want to use the defaults. 91 | optional = { 92 | # TODO: Use custom prefixes for this settings to note that are 93 | # specific to scrapy-redis. 94 | 'queue_key': 'SCHEDULER_QUEUE_KEY', 95 | 'queue_cls': 'SCHEDULER_QUEUE_CLASS', 96 | 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY', 97 | # We use the default setting name to keep compatibility. 
98 | 'dupefilter_cls': 'DUPEFILTER_CLASS', 99 | 'serializer': 'SCHEDULER_SERIALIZER', 100 | } 101 | for name, setting_name in optional.items(): 102 | val = settings.get(setting_name) 103 | if val: 104 | kwargs[name] = val 105 | 106 | # Support serializer as a path to a module. 107 | if isinstance(kwargs.get('serializer'), six.string_types): 108 | kwargs['serializer'] = importlib.import_module(kwargs['serializer']) 109 | 110 | server = connection.from_settings(settings) 111 | # Ensure the connection is working. 112 | server.ping() 113 | 114 | return cls(server=server, **kwargs) 115 | 116 | @classmethod 117 | def from_crawler(cls, crawler): 118 | instance = cls.from_settings(crawler.settings) 119 | # FIXME: for now, stats are only supported from this constructor 120 | instance.stats = crawler.stats 121 | return instance 122 | 123 | def open(self, spider): 124 | self.spider = spider 125 | 126 | try: 127 | self.queue = load_object(self.queue_cls)( 128 | server=self.server, 129 | spider=spider, 130 | key=self.queue_key % {'spider': spider.name}, 131 | serializer=self.serializer, 132 | ) 133 | except TypeError as e: 134 | raise ValueError("Failed to instantiate queue class '%s': %s", 135 | self.queue_cls, e) 136 | 137 | self.df = load_object(self.dupefilter_cls).from_spider(spider) 138 | 139 | if self.flush_on_start: 140 | self.flush() 141 | # notice if there are requests already in the queue to resume the crawl 142 | if len(self.queue): 143 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 144 | 145 | def close(self, reason): 146 | if not self.persist: 147 | self.flush() 148 | 149 | def flush(self): 150 | self.df.clear() 151 | self.queue.clear() 152 | 153 | def enqueue_request(self, request): 154 | # 通信 从 redis 获取 url 并放入到队列中 155 | import redis 156 | import json 157 | import scrapy 158 | 159 | rd = redis.Redis("127.0.0.1", decode_responses=True) 160 | # 先检查指定的 redis 队列中是否有 url 161 | list_name = "baidu:new_urls" 162 | while rd.llen(list_name): 163 | data = json.loads(rd.lpop(list_name)) 164 | callback_func = getattr(self.spider, data[2]) 165 | req = scrapy.Request(url=data[0], dont_filter=False, callback=callback_func, priority=data[1]) 166 | self.queue.push(req) 167 | 168 | if not request.dont_filter and self.df.request_seen(request): 169 | self.df.log(request, self.spider) 170 | return False 171 | if self.stats: 172 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 173 | self.queue.push(request) 174 | return True 175 | 176 | def next_request(self): 177 | block_pop_timeout = self.idle_before_close 178 | request = self.queue.pop(block_pop_timeout) 179 | if request and self.stats: 180 | self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) 181 | return request 182 | 183 | def has_pending_requests(self): 184 | return len(self) > 0 185 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/spiders.py: -------------------------------------------------------------------------------- 1 | from scrapy import signals 2 | from scrapy.exceptions import DontCloseSpider 3 | from scrapy.spiders import Spider, CrawlSpider 4 | 5 | from . import connection, defaults 6 | from .utils import bytes_to_str 7 | 8 | 9 | class RedisMixin(object): 10 | """Mixin class to implement reading urls from a redis queue.""" 11 | redis_key = None 12 | redis_batch_size = None 13 | redis_encoding = None 14 | 15 | # Redis client placeholder. 
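# Populated lazily by setup_redis() below from the crawler settings; until that
# runs, start_requests()/next_requests() have no Redis connection to read from.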
16 |     server = None
17 | 
18 |     def start_requests(self):
19 |         """Returns a batch of start requests from redis."""
20 |         return self.next_requests()
21 | 
22 |     def setup_redis(self, crawler=None):
23 |         """Set up the redis connection and the idle signal.
24 | 
25 |         This should be called after the spider has set its crawler object.
26 |         """
27 |         if self.server is not None:
28 |             return
29 | 
30 |         if crawler is None:
31 |             # We allow optional crawler argument to keep backwards
32 |             # compatibility.
33 |             # XXX: Raise a deprecation warning.
34 |             crawler = getattr(self, 'crawler', None)
35 | 
36 |         if crawler is None:
37 |             raise ValueError("crawler is required")
38 | 
39 |         settings = crawler.settings
40 | 
41 |         if self.redis_key is None:
42 |             self.redis_key = settings.get(
43 |                 'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
44 |             )
45 | 
46 |         self.redis_key = self.redis_key % {'name': self.name}
47 | 
48 |         if not self.redis_key.strip():
49 |             raise ValueError("redis_key must not be empty")
50 | 
51 |         if self.redis_batch_size is None:
52 |             # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
53 |             self.redis_batch_size = settings.getint(
54 |                 'REDIS_START_URLS_BATCH_SIZE',
55 |                 settings.getint('CONCURRENT_REQUESTS'),
56 |             )
57 | 
58 |         try:
59 |             self.redis_batch_size = int(self.redis_batch_size)
60 |         except (TypeError, ValueError):
61 |             raise ValueError("redis_batch_size must be an integer")
62 | 
63 |         if self.redis_encoding is None:
64 |             self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)
65 | 
66 |         self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
67 |                          "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
68 |                          self.__dict__)
69 | 
70 |         self.server = connection.from_settings(crawler.settings)
71 |         # The idle signal is called when the spider has no requests left,
72 |         # that's when we will schedule new requests from redis queue
73 |         crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
74 | 
75 |     def next_requests(self):
76 |         """Yields requests to be scheduled, if any are available in redis."""
77 |         use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
78 |         fetch_one = self.server.spop if use_set else self.server.lpop
79 |         # XXX: Do we need to use a timeout here?
80 |         found = 0
81 |         # TODO: Use redis pipeline execution.
82 |         while found < self.redis_batch_size:
83 |             data = fetch_one(self.redis_key)
84 |             if not data:
85 |                 # Queue empty.
86 |                 break
87 |             req = self.make_request_from_data(data)
88 |             if req:
89 |                 yield req
90 |                 found += 1
91 |             else:
92 |                 self.logger.debug("Request not made from data: %r", data)
93 | 
94 |         if found:
95 |             self.logger.debug("Read %s requests from '%s'", found, self.redis_key)
96 | 
97 |     def make_request_from_data(self, data):
98 |         """Returns a Request instance from data coming from Redis.
99 | 
100 |         By default, ``data`` is an encoded URL. You can override this method to
101 |         provide your own message decoding.
102 | 
103 |         Parameters
104 |         ----------
105 |         data : bytes
106 |             Message from redis.
107 | 
108 |         """
109 |         url = bytes_to_str(data, self.redis_encoding)
110 |         return self.make_requests_from_url(url)
111 | 
112 |     def schedule_next_requests(self):
113 |         """Schedules a request if available."""
114 |         # TODO: While there is capacity, schedule a batch of redis requests.
115 |         for req in self.next_requests():
116 |             self.crawler.engine.crawl(req, spider=self)
117 | 
118 |     def spider_idle(self):
119 |         """Schedules a request if available, otherwise waits."""
120 |         # XXX: Handle a sentinel to close the spider.
121 |         self.schedule_next_requests()
122 |         raise DontCloseSpider
123 | 
124 | 
125 | class RedisSpider(RedisMixin, Spider):
126 |     """Spider that reads urls from redis queue when idle.
127 | 
128 |     Attributes
129 |     ----------
130 |     redis_key : str (default: REDIS_START_URLS_KEY)
131 |         Redis key where to fetch start URLs from.
132 |     redis_batch_size : int (default: CONCURRENT_REQUESTS)
133 |         Number of messages to fetch from redis on each attempt.
134 |     redis_encoding : str (default: REDIS_ENCODING)
135 |         Encoding to use when decoding messages from redis queue.
136 | 
137 |     Settings
138 |     --------
139 |     REDIS_START_URLS_KEY : str (default: "<spider_name>:start_urls")
140 |         Default Redis key where to fetch start URLs from.
141 |     REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
142 |         Default number of messages to fetch from redis on each attempt.
143 |     REDIS_START_URLS_AS_SET : bool (default: False)
144 |         Use SET operations to retrieve messages from the redis queue. If False,
145 |         the messages are retrieved using the LPOP command.
146 |     REDIS_ENCODING : str (default: "utf-8")
147 |         Default encoding to use when decoding messages from redis queue.
148 | 
149 |     """
150 | 
151 |     @classmethod
152 |     def from_crawler(cls, crawler, *args, **kwargs):
153 |         obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs)
154 |         obj.setup_redis(crawler)
155 |         return obj
156 | 
157 | 
158 | class RedisCrawlSpider(RedisMixin, CrawlSpider):
159 |     """Spider that reads urls from redis queue when idle.
160 | 
161 |     Attributes
162 |     ----------
163 |     redis_key : str (default: REDIS_START_URLS_KEY)
164 |         Redis key where to fetch start URLs from.
165 |     redis_batch_size : int (default: CONCURRENT_REQUESTS)
166 |         Number of messages to fetch from redis on each attempt.
167 |     redis_encoding : str (default: REDIS_ENCODING)
168 |         Encoding to use when decoding messages from redis queue.
169 | 
170 |     Settings
171 |     --------
172 |     REDIS_START_URLS_KEY : str (default: "<spider_name>:start_urls")
173 |         Default Redis key where to fetch start URLs from.
174 |     REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
175 |         Default number of messages to fetch from redis on each attempt.
176 |     REDIS_START_URLS_AS_SET : bool (default: True)
177 |         Use SET operations to retrieve messages from the redis queue.
178 |     REDIS_ENCODING : str (default: "utf-8")
179 |         Default encoding to use when decoding messages from redis queue.
180 | 
181 |     """
182 | 
183 |     @classmethod
184 |     def from_crawler(cls, crawler, *args, **kwargs):
185 |         obj = super(RedisCrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
186 |         obj.setup_redis(crawler)
187 |         return obj
188 | 
--------------------------------------------------------------------------------
/paperSpider/scrapy_redis/utils.py:
--------------------------------------------------------------------------------
1 | import six
2 | 
3 | 
4 | def bytes_to_str(s, encoding='utf-8'):
5 |     """Returns a str if a bytes object is given."""
6 |     if six.PY3 and isinstance(s, bytes):
7 |         return s.decode(encoding)
8 |     return s
9 | 
--------------------------------------------------------------------------------
/paperSpider/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time    : 2020-03-23 12:18
3 | # @Author  : beking
4 | 
5 | import redis
6 | redis_cli = redis.StrictRedis()
7 | redis_cli.incr("baidu_count")
8 | 
--------------------------------------------------------------------------------
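The listings above show the two integration points of the bundled `scrapy_redis` package: the scheduler reads its behaviour from Scrapy settings in `from_settings`, and its customized `enqueue_request` drains the redis list `baidu:new_urls`, where every entry is a JSON array of `[url, priority, callback_name]`. The two sketches below illustrate that wiring; any name not present in the code above (the dupefilter class path, the example URL, the `parse` callback, the spider name) is an assumption, not a value taken from the repository.

A hedged sketch of the Scrapy settings that plug the bundled scheduler into the project. Only the `SCHEDULER_*` and `DUPEFILTER_CLASS` setting names come from the scheduler code above; the dotted class paths assume the bundled `scrapy_redis` package is importable under that name.

```python
# settings.py (sketch, assumptions noted inline)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # assumed import path of the bundled scheduler
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # assumed class name inside the bundled dupefilter.py
SCHEDULER_PERSIST = True            # keep the redis queue and dupefilter between runs
SCHEDULER_FLUSH_ON_START = False    # do not wipe pending requests on start
SCHEDULER_IDLE_BEFORE_CLOSE = 0     # seconds to block when popping from the queue
```

A minimal sketch of pushing work into the list drained by `Scheduler.enqueue_request`, mirroring the format it parses (`data[0]` url, `data[1]` priority, `data[2]` callback name). The URL, priority, callback name, and spider name below are placeholders.

```python
import json
import redis

# Same connection style as the scheduler itself: local redis, decoded responses.
rd = redis.Redis("127.0.0.1", decode_responses=True)

# [url, priority, callback_name] -- the callback must be a method on the running spider.
task = ["https://example.org/paper-listing", 0, "parse"]
rd.lpush("baidu:new_urls", json.dumps(task))

# Seeding a RedisSpider instead goes through its start-urls key
# (default "<spider_name>:start_urls"); assuming the spider is named "baidu":
rd.lpush("baidu:start_urls", "https://example.org/seed-page")
```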