├── .gitignore ├── .idea ├── baidu_paper_spider.iml ├── misc.xml ├── modules.xml └── vcs.xml ├── README.md ├── README_zh.MD ├── django_search ├── .idea │ ├── django_search.iml │ ├── misc.xml │ ├── modules.xml │ └── vcs.xml ├── django_search │ ├── __init__.py │ ├── asgi.py │ ├── settings.py │ ├── urls.py │ └── wsgi.py ├── manage.py ├── search │ ├── __init__.py │ ├── admin.py │ ├── apps.py │ ├── migrations │ │ └── __init__.py │ ├── models.py │ ├── tests.py │ └── views.py ├── static │ ├── css │ │ ├── advanced.css │ │ ├── index.css │ │ ├── result.css │ │ └── style.css │ ├── img │ │ ├── Thumbs.db │ │ ├── btn_min.png │ │ ├── btnbg.png │ │ ├── down.png │ │ ├── inputbg.png │ │ ├── line.png │ │ ├── ll.png │ │ ├── logo.png │ │ ├── logo1.png │ │ ├── lr.png │ │ ├── more.png │ │ ├── result_icon.png │ │ └── searchbtn.png │ └── js │ │ ├── common.js │ │ ├── global.js │ │ ├── jquery.js │ │ └── pagination.js └── templates │ ├── index.html │ └── result.html └── paperSpider ├── .idea └── paperSpider.iml ├── main.py ├── paperSpider ├── __init__.py ├── items.py ├── middlewares.py ├── models │ ├── __init__.py │ └── es_types.py ├── pipelines.py ├── settings.py ├── spiders │ ├── __init__.py │ └── baidu.py ├── tools │ ├── __init__.py │ └── add_urls.py └── utils │ ├── __init__.py │ ├── bloomfilter.py │ └── common.py ├── scrapy.cfg ├── scrapy_redis ├── __init__.py ├── connection.py ├── defaults.py ├── dupefilter.py ├── picklecompat.py ├── pipelines.py ├── queue.py ├── scheduler.py ├── spiders.py └── utils.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.idea/baidu_paper_spider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed Document Search Engine 2 | This is an open-source project for a paper search engine, which includes a Scrapy-Redis distributed crawler, an Elasticsearch search engine, and a Django frontend. The project was designed to provide a platform for users to easily search and access research papers. 3 | 4 | ## Features 5 | - Scrapy-Redis distributed crawler using CSS Selectors. 6 | - Centralized deduplication with Redis for distribution. 7 | - Text search engine implemented with ElasticSearch. 8 | - Full-stack web application built using Django. 9 | 10 | ## Technology Stack 11 | The main technology stack used in this project includes: 12 | - Scrapy-Redis 13 | - Elasticsearch 14 | - Django 15 |
16 |
17 | 
18 | **👉👉👉 More technical details that help to understand the project follow below.**
19 | [中文版本](https://github.com/Beking0912/Distributed-Document-Search-Engine/blob/master/README_zh.MD)
20 | 
21 | ## Technology choice: Scrapy vs requests + BeautifulSoup
22 | 1. requests and BeautifulSoup are libraries, while Scrapy is a framework;
23 | 2. requests and BeautifulSoup can still be used inside a Scrapy project;
24 | 3. Scrapy is built on Twisted, and performance is its biggest advantage;
25 | 4. Scrapy is easy to extend and provides many built-in features;
26 | 5. Scrapy's built-in CSS and XPath selectors are very convenient; BeautifulSoup's biggest drawback is that it is slow.
27 | 
28 | ## Depth-first and breadth-first traversal
29 | Depth-first (recursive implementation)
30 | ```python
31 | def depth_tree(tree_node):
32 |     if tree_node is not None:
33 |         print(tree_node._data)
34 |         if tree_node._left is not None:
35 |             depth_tree(tree_node._left)
36 |         if tree_node._right is not None:
37 |             depth_tree(tree_node._right)
38 | ```
39 | 
40 | Breadth-first (queue implementation)
41 | ```python
42 | def level_queue(root):
43 |     if root is None:
44 |         return
45 |     my_queue = []
46 |     node = root
47 |     my_queue.append(node)
48 |     while my_queue:
49 |         node = my_queue.pop(0)
50 |         print(node.elem)
51 |         if node.lchild is not None:
52 |             my_queue.append(node.lchild)
53 |         if node.rchild is not None:
54 |             my_queue.append(node.rchild)
55 | ```
56 | 
57 | ## URL deduplication strategies
58 | 1. Store visited URLs in a database;
59 | 2. Store visited URLs in a set, so a membership check costs only O(1);
60 | 3. Hash each URL (with MD5 or similar) before storing it in the set;
61 | 4. Use a bitmap: map each visited URL to a single bit through a hash function;
62 | 5. Use a Bloom filter, which improves on the bitmap by using multiple hash functions to reduce collisions.
63 | 
64 | ## String encoding: encode / decode
65 | 1. Computers can only process numbers, so text must be converted to numbers first. Eight bits make one byte, so the largest number a byte can represent is 255;
66 | 2. ASCII (one byte per character) became the standard encoding in the United States;
67 | 3. ASCII cannot handle Chinese, so China created the GB2312 encoding, which uses two bytes per Chinese character;
68 | 4. Unicode unifies all languages into a single encoding;
69 | 5. That solves the garbled-text problem, but for pure English content Unicode needs twice the storage of ASCII, and twice the bandwidth when transmitted;
70 | 6. The variable-length encoding UTF-8 stores English characters in one byte and Chinese characters in three (especially rare characters take 4-6 bytes), so for content that is mostly English the benefit of UTF-8 is obvious.
71 | 
72 | ## Scrapy
73 | Scrapy is a fast, high-level screen-scraping and web-crawling framework written in Python for crawling web sites and extracting structured data from pages. Its main advantage is high concurrency (the underlying layer is an asynchronous I/O framework built on an event loop plus callbacks).
74 | [Official documentation](https://docs.scrapy.org/en/latest/)
75 | 
76 | 1. Install: `pip install Scrapy`
77 | 2. Create a project: `scrapy startproject namexxx`
78 | 
79 | ## XPath syntax: res.xpath('').extract_first('')
80 | 1. XPath uses path expressions to navigate XML and HTML documents;
81 | 2. XPath includes a standard function library;
82 | 3.
XPath is a W3C standard.
83 | 
84 | ## Advantages of distributed crawlers
85 | 1. Make full use of the bandwidth of multiple machines to speed up crawling;
86 | 2. Make full use of the IP addresses of multiple machines to speed up crawling.
87 | 
88 | ## Problems to solve when going from a single-machine crawler to a distributed one
89 | 1. Centralized management of the request queue: the scheduler keeps its queue in memory, and other servers cannot read the memory of the current server;
90 | 2. Centralized management of deduplication. Solution: move the request queue and the dedup set into a third-party component, namely Redis (an in-memory database, so reads are fast).
91 | 
92 | ## Redis
93 | Redis is a key-value storage system whose data lives in memory.
94 | 
95 | ## Redis data types
96 | String, hash, list, set, sorted set
97 | 
98 | ## Points to note when writing crawlers with Scrapy-Redis
99 | 1. Inherit from RedisSpider;
100 | 2. Requests are no longer handled by the local scheduler but by the Scrapy-Redis scheduler;
101 | 3. The start URLs have to be pushed into Redis.
102 | 
103 | ## The difference between session and cookie
104 | 1. Cookies are stored as key-value pairs
105 | 
106 | ## When installing packages fails
107 | 1. `pip install wheel`
108 | 2. `pip install -r requirements.txt`
109 | 
110 | ## Integrating Redis
111 | ## Integrating a BloomFilter
112 | 
113 | ## Incremental crawling
114 | 1. How to discover new data quickly
115 |     1. While the full crawl is still running
116 |         1. Start another crawler: one is responsible for the full crawl, the other for incremental crawling
117 |         2. Use a priority queue (easier to maintain)
118 |     2. After the crawl has finished
119 |         1. The crawler is shut down
120 |             1. How to detect that there are new URLs waiting to be crawled; as soon as there is one, a script has to start the crawler
121 |         2. The crawler keeps waiting: keep pushing URLs
122 | 2. How to handle data that has already been crawled (Scrapy ships with a deduplication mechanism)
123 |     1. Keep crawling list pages even after their data has been fetched
124 |     2. Whether to re-crawl items that were already fetched (this is an update problem)
125 | Best solution: modify the scrapy-redis source code to achieve this.
126 | 
127 | ## Implementing incremental crawling by modifying scrapy-redis
128 | 
129 | ## Updating crawled data
130 | Field that gets updated: citation count
131 | 
132 | ## Search engine requirements
133 | 1. Efficient
134 | 2. Zero configuration and completely free
135 | 3. Simple interaction with the search engine through JSON and HTTP
136 | 4. A stable search server
137 | 5. Easy to scale from one server to hundreds
138 | 
139 | ## Introduction to Elasticsearch
140 | 1. A search server based on Lucene
141 | 2. Provides a distributed, multi-tenant full-text search engine
142 | 3. Exposed through a RESTful web interface
143 | 4. Developed in Java and released as open source under the Apache license
144 | 
145 | ## Drawbacks of searching in a relational database
146 | 1. No relevance scoring -> no ranking
147 | 2. Not distributed
148 | 3. Cannot parse search requests
149 | 4. Low efficiency
150 | 5. No word segmentation (tokenization)
151 | 
152 | ## Elasticsearch installation
153 | 1. Install elasticsearch-rtf
154 | 2.
Installation of the head plugin and Kibana
155 | 
156 | ## Cross-origin (CORS) configuration
157 | ```
158 | http.cors.enabled: true
159 | http.cors.allow-origin: "*"
160 | http.cors.allow-methods: OPTIONS, HEAD, GET, POST, PUT, DELETE
161 | http.cors.allow-headers: "X-Requested-With, Content-Type, Content-Length, X-User"
162 | ```
163 | 
164 | ## Elasticsearch concepts
165 | 1. Cluster: one or more nodes organized together
166 | 2. Node: a single server in the cluster, identified by a name (by default a random comic-character name)
167 | 3. Shard: the ability to split an index into multiple pieces, which allows horizontal partitioning and scaling; multiple shards answer requests in parallel, improving performance and throughput
168 | 4. Replica: the ability to keep one or more copies of a shard, so that other nodes can take over when a node fails
169 | 
170 | 
171 | ## Elasticsearch vs MySQL
172 | 1. index => database
173 | 2. type => table
174 | 3. document => row
175 | 4. fields => columns
176 | 
177 | ## Inverted index
178 | The inverted index comes from the practical need to look up records by the value of an attribute. Each entry in the index contains an attribute value and the addresses of all records that have that value. Because records are located from attribute values rather than the other way around, it is called an inverted index. A file indexed this way is called an inverted file.
179 | 
180 | ## TF-IDF
181 | 
182 | ## Problems an inverted index has to deal with
183 | 1. Case folding: python and PYTHON should count as the same term
184 | 2. Stemming: looking and look should be treated as one term
185 | 3. Word segmentation (tokenization)
186 | 4. The inverted index file can get very large and needs compression encoding
187 | Elasticsearch handles all of the above.
188 | 
189 | ## Basic Elasticsearch indexing
190 | 
191 | ## Mapping
192 | Mapping: when creating an index, you can predefine the field types and related attributes.
193 | 
194 | ES guesses the field mapping you want from the basic types of the JSON source data and turns the input into searchable index entries. A mapping is the set of field data types we define ourselves; it also tells ES how to index the data and whether a field is searchable.
195 | 
196 | Purpose: it makes the index definition more detailed and complete.
197 | 
198 | ## ES queries
199 | 1. Basic query: query with ES's built-in query conditions
200 | 2. Compound query: combine several queries into a compound query
201 | 3. Filtering: the query applies filter conditions to narrow the data without affecting scoring
202 | 
203 | ## Edit distance
204 | Edit distance is a way of measuring how similar two strings are: the edit distance between two strings is the minimum number of insert / delete / replace / adjacent-swap operations needed to turn one string into the other.
205 | 
206 | Edit distance is usually computed with dynamic programming.
207 | 
208 | ## Environment migration
209 | 1. pip freeze > requirements.txt
210 | 2.
pip install -r requirements.txt
211 | 
212 | ## References
213 | [Elasticsearch中ik_max_word和 ik_smart的区别](https://blog.csdn.net/weixin_44062339/article/details/85006948)
214 | 
215 | [相关度评分背后的理论](https://www.elastic.co/guide/cn/elasticsearch/guide/current/scoring-theory.html)
216 | 
217 | [Elasticsearch搜索中文分词优化](https://www.jianshu.com/p/914f102bc174)
218 | 
219 | ## Problems encountered with Chinese search in Elasticsearch
220 | 1. When searching for 葡萄糖 (glucose), the results should contain only 葡萄糖 and not 葡萄 (grape); when searching for 葡萄, the results should also include 葡萄糖.
221 | 2. Searching for "RMB" only matches content that literally contains the keyword "RMB". In fact "RMB" and "人民币" are synonyms, and a search for either should match the other. How are synonyms configured in ES?
222 | 3. Pinyin search: if the user types "baidu", or the pinyin initials "bd", it should match the keyword "百度"; if the user types the homophone "摆渡", it should also match "百度". How is Chinese pinyin matching done?
223 | 4. How do we make sure search keywords are segmented correctly? Usually a custom dictionary is used, so how do we build that custom dictionary?
224 | 
225 | ## ik tokenizer
226 | 1. ik_max_word: splits the text at the finest granularity; for example, "中华人民共和国人民大会堂" (the Great Hall of the People of the People's Republic of China) is split into 中华人民共和国, 中华人民, 中华, 华人, 人民共和国, 人民, 共和国, 大会堂, 大会, 会堂, and so on.
227 | 2. ik_smart: does the coarsest-grained split; the same text is split into 中华人民共和国 and 人民大会堂.
228 | 
229 | ## Best practices
230 | The best practice for the two tokenizers is: use ik_max_word when indexing and ik_smart when searching.
231 | 
232 | That is, document content is segmented as finely as possible at index time, and searches are kept more precise. At index time the ik_max_word analyzer is usually used so that the finest-grained segmentation maximizes index coverage; at search time the ik_smart analyzer is used for coarse-grained segmentation to improve precision.
233 | 
234 | ## The ES analysis pipeline and analyzers
235 | 1. character filter: pre-processes the string before tokenization, e.g. stripping HTML tags;
236 | 2. tokenizer: English text can be split on whitespace; Chinese segmentation is harder and may rely on machine-learning algorithms;
237 | 3. token filters: lowercase tokens, remove stop words, add synonyms, add terms, etc.;
238 | 4. ES analysis pipeline: character filter -> tokenizer -> token filters
239 | 5. Custom analyzers
240 | 6. Mapping settings for segmentation
241 | ```
242 | "content": {
243 |     "type": "string",
244 |     "analyzer": "ik_max_word",
245 |     "search_analyzer": "ik_smart"
246 | }
247 | ```
248 | 
249 | ## Synonyms
250 | ## Suggest segmentation
251 | Suggest terms need prefix matching against pinyin initials, full pinyin, and Chinese characters. For example, for "百度", typing "baidu", "bd", or "百" must all match, so the field has to be indexed with several analyzers at once: Chinese is indexed character by character, while pinyin initials and full pinyin require custom analyzers.
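
As a rough illustration of those custom analyzers, the sketch below defines an index whose analysis settings add pinyin-initial and full-pinyin token filters on top of a keyword tokenizer. This is not the project's actual configuration: the index name `baidu_suggest` and the filter/analyzer names are made up for the example, and it assumes the elasticsearch-analysis-pinyin plugin is installed (the `keep_first_letter` / `keep_full_pinyin` options come from that plugin and should be verified against the installed version).

```python
from elasticsearch import Elasticsearch

client = Elasticsearch(hosts=["127.0.0.1"])

# Hypothetical analysis settings: one filter aimed at pinyin initials
# (so that 百度 can be found via "bd") and one at full pinyin.
suggest_index_body = {
    "settings": {
        "analysis": {
            "filter": {
                "py_first_letter": {
                    "type": "pinyin",
                    "keep_first_letter": True,
                    "keep_full_pinyin": False,
                },
                "py_full": {
                    "type": "pinyin",
                    "keep_first_letter": False,
                    "keep_full_pinyin": True,
                },
            },
            "analyzer": {
                "suggest_py_first_letter": {
                    "tokenizer": "keyword",
                    "filter": ["lowercase", "py_first_letter"],
                },
                "suggest_py_full": {
                    "tokenizer": "keyword",
                    "filter": ["lowercase", "py_full"],
                },
            },
        }
    }
}

# client.indices.create(index="baidu_suggest", body=suggest_index_body)
```

A suggest field would then reference these analyzers (together with a single-character analyzer for the Chinese text itself) in its multi-field mapping, so that one stored title can be prefix-matched by "百", "bd", or "baidu".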
252 | 253 | -------------------------------------------------------------------------------- /README_zh.MD: -------------------------------------------------------------------------------- 1 | # Distributed Document Search Engine 2 | 3 | [English version](https://github.com/Beking0912/Distributed-Document-Search-Engine/blob/master/README.md) 4 | 5 | ## 技术选型 scrapy vs requests+beautifulsoup 6 | 1. requests 和 beautifulsoup 都是库,scrapy 是框架; 7 | 2. scrapy 框架中可以加入requests 和 beautifulsoup; 8 | 3. scrapy 基于 twisted,性能是最大优势; 9 | 4. scrapy 方便扩展,提供了很多内置的功能; 10 | 5. scrapy 内置的 css 和 xpath selector 非常方便,beautifulsoup 最大的缺点就是慢。 11 | 12 | ## 深度优先和广度优先 13 | 深度优先(递归实现) 14 | ```python 15 | def depth_tree(tree_node): 16 | if tree_node is not None: 17 | print (tree_node._data) 18 | if tree_node._left is not None: 19 | return depth_tree(tree_node._left) 20 | if tree_node._right is not None: 21 | return depth_tree(tree_node._right) 22 | ``` 23 | 24 | 广度优先(队列实现) 25 | ```python 26 | def level_queue(root): 27 | if root is None: 28 | return 29 | my_queue = [] 30 | node = root 31 | my_queue.append(node) 32 | while my_queue: 33 | node = my_queue.pop(0) 34 | print (node.elem) 35 | if node.lchild is not None: 36 | my_queue.append(node.lchild) 37 | if node.rchild is not None: 38 | my_queue.append(node.rchild) 39 | ``` 40 | 41 | ## URL去重策略 42 | 1. 将访问过的URL保存到数据库中; 43 | 2. 将访问过的URL保存到set中,只需要O(1)的代价就可以查询URL; 44 | 3. URL经过md5等方法哈希后保存到set中; 45 | 4. 用bitmap方法将访问过的URL通过hash函数映射到某一位; 46 | 5. bloomfilter方法对bitmap进行改进,多重hash函数降低冲突。 47 | 48 | ## 字符串编码 encode decode 49 | 1. 计算机只能处理数字,文本转换为数字才能处理。计算机中8个bit作为一个字节,所以一个字节能表示最大的数字就是255; 50 | 2. ASCII(一个字节)编码就成为美国人的标准编码; 51 | 3. ASCII处理中文是不够的,中国制定了GB2312编码,用两个字节表示一个汉字; 52 | 4. unicode的出现将所有语言统一到一套编码里; 53 | 5. 乱码问题解决了,但是如果内容全是英文,unicode编码比ASCII需要多一倍的存储空间,同时如果传输需要多一倍的传输; 54 | 6. 可变长的编码utf-8的出现,把英文变长一个字节,汉字三个字节。特别生僻的变成4-6字节,如果传输大量的英文,utf-8作用就很明显了。 55 | 56 | ## scrapy 57 | scrapy 是 Python 开发的一个快速高层次的屏幕抓取和web抓取框架,用于抓取web站点并从页面中提取结构化的数据。优点:高并发(底层是异步IO框架 时间循环+回调)。 58 | 59 | [官方文档](https://docs.scrapy.org/en/latest/) 60 | 61 | 1. 下载:`pip install Scrapy` 62 | 2. 新建:`scrapy startproject namexxx` 63 | 64 | ## xpath 语法 res.xpath('').extract_first('') 65 | 1. xpath 使用路径表达式在xml和html中进行导航; 66 | 2. xpath 包含标准函数库; 67 | 3. xpath 是一个w3c的标准。 68 | 69 | ## 分布式爬虫的优点 70 | 1. 充分利用多机器的带宽加速爬取; 71 | 2. 充分利用多机的IP加速爬取速度。 72 | 73 | ## 单机爬虫 => 分布式爬虫 需要解决的问题 74 | 1. request 队列集中管理:scheduler 以队列形式存储在内存中,而其他服务器无法拿到当前服务器内存中的内容; 75 | 2. 去重集中管理。 76 | 解决方法:将 request 队列和去重 放到第三方组件中,采用 Redis(内存数据库,读取速度更快)。 77 | 78 | ## Redis 79 | Redis 是 key-value 存储系统,数据存在内存中。 80 | 81 | ## Redis 数据类型 82 | 字符串 散列/哈希 列表 集合 可排序集合 83 | 84 | ## Scrapy-Redis 编写爬虫需要注意的点 85 | 1. 继承 RedisSpider; 86 | 2. 所有 request 不再由本地 schedule 来完成,而是 Scrapy-Redis 的 schedule; 87 | 3. 需要 push 起始 url。 88 | 89 | ## session 和 cookie 的区别 90 | 1. cookie 以 key-value 形式存储 91 | 92 | ## 下载包失败时 93 | 1. `pip install wheel` 94 | 2. `pip install -r requirements.txt` 95 | 96 | ## 集成 Redis 97 | ## 集成 BloomFilter 98 | 99 | ## 爬虫的增量爬取 100 | 1. 如何快速发现新的数据 101 | 1. 全量的爬虫仍然在继续 102 | 1. 重新启动一个爬虫:一个负责全量抓取,一个负责增量抓取 103 | 2. 采用优先级队列(利于维护) 104 | 2. 爬虫已结束 105 | 1. 爬虫已关闭 106 | 1. 如何发现已经有新的URL待抓取,一旦有URL则需要脚本启动爬虫 107 | 2. 爬虫等待:继续push URL 108 | 2. 如何解决已经抓取过的数据(scrapy 自带去重机制) 109 | 1. 列表数据已经抓取过之后还要继续抓取 110 | 2. 已经抓取过的条目是否还要继续抓取(涉及更新问题) 111 | 112 | 最优方案:修改 scrapy-redis 源码可以达到目的。 113 | 114 | ## 通过修改 scrapy-redis 完成增量爬取 115 | 116 | ## 爬虫的数据更新 117 | 会更新的字段:被引用量 118 | 119 | ## 搜索引擎需求 120 | 1. 高效 121 | 2. 零配置 完全免费 122 | 3. 能够简单通过json和http与搜索引擎交互 123 | 4. 搜索服务器稳定 124 | 5. 
能够简单的将一台服务器扩展到上百台 125 | 126 | ## elasticsearch 介绍 127 | 1. 基于 Lucene 的搜索服务器 128 | 2. 提供了一个分布式多用户能力的全文搜索引擎 129 | 3. 基于 RESTful web 接口 130 | 4. 是用 Java 开发的,并作为 Apache 许可条款下的开放源码发布 131 | 132 | ## 关系数据搜索缺点 133 | 1. 无法打分 -> 无法排序 134 | 2. 无分布式 135 | 3. 无法解析搜索请求 136 | 4. 效率低 137 | 5. 分词 138 | 139 | ## elasticsearch 安装 140 | 1. 安装 elasticsearch-rtf 141 | 2. head 插件和 kibana 的安装 142 | 143 | ## 跨域配置 144 | ``` 145 | http.cors.enabled: true 146 | http:cors.allow-origin: "*" 147 | http.cors.allow-methods: OPTIONS, HEAD, GET, POST, PUT, DELETE 148 | http.cors.allow-headers: "X-Requested-With, Content-Type, Content-Type, Content-Length, X-User" 149 | ``` 150 | 151 | ## elasticsearch 概念 152 | 1. 集群:一个或多个节点组织在一起 153 | 2. 节点:一个节点是集群中的一个服务器,由一个名字来标识,默认是一个随机的漫画角色的名字 154 | 3. 分片:将索引划分为多份的能力,允许水平分割和扩展容量,多个分片响应请求,提高性能和吞吐量 155 | 4. 副本:创建分片的一份或多份的能力,在一个节点失败时其余节点可以顶上 156 | 157 | ## elasticsearch vs mysql 158 | 1. index(索引) => 数据库 159 | 2. type(类型) => 表 160 | 3. document(文档) => 行 161 | 4. fields => 列 162 | 163 | ## 倒排索引 164 | 倒排索引源于实际应用中需要根据属性的值来查找记录。这种索引表中的每一项都包括一个属性值和具有该属性值的各记录的地址。由于不是由记录来确定属性值,而是由属性值来确定记录的位置,因而称为倒排索引(inverted index)。带有倒排索引的文简称倒排文件(inverted file)。 165 | 166 | ## TF-IDF 167 | 168 | ## 倒排索引待解决问题 169 | 1. 大小写转换问题,如 python 和 PYTHON 应该为一个词 170 | 2. 词干抽取,looking 和 look 应该处理为一个词 171 | 3. 分词 172 | 4. 倒排索引文件过大,压缩编码 173 | 174 | elasticsearch 可以全部完成以上问题。 175 | 176 | ## elasticsearch 基本的索引 177 | 178 | ## 映射(mapping) 179 | 映射:创建索引时,可以预先定义字段的类型以及相关属性。 180 | 181 | es会根据 JSON 源数据的基础类型猜测你想要的字段映射。将输入的数据转变为可搜索的索引项。mapping 就是我妈自己定义的字段的数据类型,同时告诉 es 如何索引数据以及是否可以被搜索。 182 | 183 | 作用:会让索引建立的更加细致和完善。 184 | 185 | ## es 查询 186 | 1. 基本查询:使用 es 内置查询条件进行查询 187 | 2. 组合查询:把多个查询组合在一起进行复合查询 188 | 3. 过滤:查询同时通过 filter 条件在不影响打分的情况下筛选数据 189 | 190 | ## 编辑距离 191 | 编辑距离是一种字符串之间相似程度的计算方法。即两个字符串之间的编辑距离等于使一个字符串变成另一个字符串而进行的 插入/删除/替换/相邻字符串交换位置 进行操作的最少次数。 192 | 193 | 关于编辑距离的求法,普遍采用的是动态规划。 194 | 195 | ## 环境迁移 196 | 1. pip freeze > requirements.text 197 | 2. pip install -r requirement.txt 198 | 199 | ## 资料 200 | [Elasticsearch中ik_max_word和 ik_smart的区别](https://blog.csdn.net/weixin_44062339/article/details/85006948) 201 | 202 | [相关度评分背后的理论](https://www.elastic.co/guide/cn/elasticsearch/guide/current/scoring-theory.html) 203 | 204 | [Elasticsearch搜索中文分词优化](https://www.jianshu.com/p/914f102bc174) 205 | 206 | ## Elasticsearch 中文搜索时遇到几个问题 207 | 1. 检索葡萄糖关键字,希望结果仅包含葡萄糖,不包含葡萄;检索葡萄,希望结果包含葡萄糖。 208 | 2. 搜索“RMB”时只会匹配到包含“RMB”关键词的内容,实际上,“RMB”和“人民币”是同义词,我们希望用户搜索“RMB”和“人民币”可以相互匹配,ES同义词怎么配置? 209 | 3. 用户搜索拼音: 如"baidu",或者拼音首字母"bd",怎么匹配到"百度"这个关键词,又如用户输入"摆渡"这个词也能匹配到"百度"关键词,中文拼音匹配怎么做到? 210 | 4. 怎么保证搜索关键词被正确分词,通常我们会采用自定义词典来做,那么怎么获取自定义词典? 211 | 212 | ## ik 分词器 213 | 1. ik_max_word:将文本做最细粒度的拆分,比如会将“中华人民共和国人民大会堂”拆分为“中华人民共和国、中华人民、中华、华人、人民共和国、人民、共和国、大会堂、大会、会堂等词语。 214 | 2. ik_smart:会做最粗粒度的拆分,比如会将“中华人民共和国人民大会堂”拆分为中华人民共和国、人民大会堂。 215 | 216 | ## 最佳实践 217 | 两种分词器使用的最佳实践是:索引时用ik_max_word,在搜索时用ik_smart。 218 | 219 | 即:索引时最大化的将文章内容分词,搜索时更精确的搜索到想要的结果。索引时,为了提供索引的覆盖范围,通常会采用ik_max_word分析器,会以最细粒度分词索引,搜索时为了提高搜索准确度,会采用ik_smart分析器,会以粗粒度分词。 220 | 221 | ## ES 分词流程之分析(analysis)和分析器(analyzer) 222 | 1. character filter 字符过滤器:在分词前处理字符串,去除HTML标记; 223 | 2. tokenizer 分词器:英文分词可以根据空格将单词分开,中文分词比较复杂,可以采用机器学习算法来分词; 224 | 3. token filters 表征过滤器:修改大小写,停用词,增加同义词,增加词等; 225 | 4. ES分词流程:character filter-->>tokenizer-->>token filters 226 | 5. 自定义analyzer 227 | 6. 
分词mapping设置 228 | ``` 229 | "content": { 230 | "type": "string", 231 | "analyzer": "ik_max_word", 232 | "search_analyzer": "ik_smart" 233 | } 234 | ``` 235 | 236 | ## 同义词 237 | ## Suggest分词 238 | suggest词需要对拼音前缀,全拼,中文进行前缀匹配,例如:“百度”一词,键入"baidu","bd","百"都必须匹配到,因此在索引的时候需要一词分多个分词器来索引保存,中文采用单字分词,拼音首字母和全拼需要自定义analyzer来索引。 239 | 240 | 241 | -------------------------------------------------------------------------------- /django_search/.idea/django_search.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 27 | 28 | 29 | 32 | -------------------------------------------------------------------------------- /django_search/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /django_search/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /django_search/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /django_search/django_search/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/django_search/__init__.py -------------------------------------------------------------------------------- /django_search/django_search/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for django_search project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.0/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'django_search.settings') 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /django_search/django_search/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for django_search project. 3 | 4 | Generated by 'django-admin startproject' using Django 3.0.4. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.0/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/3.0/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | # Quick-start development settings - unsuitable for production 19 | # See https://docs.djangoproject.com/en/3.0/howto/deployment/checklist/ 20 | 21 | # SECURITY WARNING: keep the secret key used in production secret! 22 | SECRET_KEY = '5h7388ran&ufu53jgqtzp)2*vupdb3&wmb*zz21mx%c00(8j9&' 23 | 24 | # SECURITY WARNING: don't run with debug turned on in production! 
25 | DEBUG = True 26 | 27 | ALLOWED_HOSTS = [] 28 | 29 | # Application definition 30 | 31 | INSTALLED_APPS = [ 32 | 'django.contrib.admin', 33 | 'django.contrib.auth', 34 | 'django.contrib.contenttypes', 35 | 'django.contrib.sessions', 36 | 'django.contrib.messages', 37 | 'django.contrib.staticfiles', 38 | 'search.apps.SearchConfig', 39 | ] 40 | 41 | MIDDLEWARE = [ 42 | 'django.middleware.security.SecurityMiddleware', 43 | 'django.contrib.sessions.middleware.SessionMiddleware', 44 | 'django.middleware.common.CommonMiddleware', 45 | 'django.middleware.csrf.CsrfViewMiddleware', 46 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 47 | 'django.contrib.messages.middleware.MessageMiddleware', 48 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 49 | ] 50 | 51 | ROOT_URLCONF = 'django_search.urls' 52 | 53 | TEMPLATES = [ 54 | { 55 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 56 | 'DIRS': [os.path.join(BASE_DIR, 'templates')] 57 | , 58 | 'APP_DIRS': True, 59 | 'OPTIONS': { 60 | 'context_processors': [ 61 | 'django.template.context_processors.debug', 62 | 'django.template.context_processors.request', 63 | 'django.contrib.auth.context_processors.auth', 64 | 'django.contrib.messages.context_processors.messages', 65 | ], 66 | 'libraries': { # Adding this section should work around the issue. 67 | 'staticfiles': 'django.templatetags.static', 68 | }, 69 | }, 70 | }, 71 | ] 72 | 73 | WSGI_APPLICATION = 'django_search.wsgi.application' 74 | 75 | # Database 76 | # https://docs.djangoproject.com/en/3.0/ref/settings/#databases 77 | 78 | DATABASES = { 79 | 'default': { 80 | 'ENGINE': 'django.db.backends.sqlite3', 81 | 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), 82 | } 83 | } 84 | 85 | # Password validation 86 | # https://docs.djangoproject.com/en/3.0/ref/settings/#auth-password-validators 87 | 88 | AUTH_PASSWORD_VALIDATORS = [ 89 | { 90 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 91 | }, 92 | { 93 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 94 | }, 95 | { 96 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 97 | }, 98 | { 99 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 100 | }, 101 | ] 102 | 103 | # Internationalization 104 | # https://docs.djangoproject.com/en/3.0/topics/i18n/ 105 | 106 | LANGUAGE_CODE = 'en-us' 107 | 108 | TIME_ZONE = 'UTC' 109 | 110 | USE_I18N = True 111 | 112 | USE_L10N = True 113 | 114 | USE_TZ = True 115 | 116 | # Static files (CSS, JavaScript, Images) 117 | # https://docs.djangoproject.com/en/3.0/howto/static-files/ 118 | 119 | STATIC_URL = '/static/' 120 | 121 | STATICFILES_DIRS = [ 122 | os.path.join(BASE_DIR, "static") 123 | ] 124 | -------------------------------------------------------------------------------- /django_search/django_search/urls.py: -------------------------------------------------------------------------------- 1 | """django_search URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/3.0/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: path('', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. 
Import the include() function: from django.urls import include, path 14 | 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 15 | """ 16 | from django.contrib import admin 17 | # from django.urls import path 18 | # from django.views.generic import TemplateView 19 | from django.conf.urls import url 20 | from search.views import SearchSuggest, SearchView, IndexView 21 | 22 | urlpatterns = [ 23 | # url(r'^admin/', admin.site.urls), 24 | url(r'^$', IndexView.as_view(), name="index"), 25 | url(r'^suggest/$', SearchSuggest.as_view(), name="suggest"), 26 | url(r'^search/$', SearchView.as_view(), name="search"), 27 | ] 28 | -------------------------------------------------------------------------------- /django_search/django_search/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for django_search project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.0/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'django_search.settings') 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /django_search/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | 7 | def main(): 8 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'django_search.settings') 9 | try: 10 | from django.core.management import execute_from_command_line 11 | except ImportError as exc: 12 | raise ImportError( 13 | "Couldn't import Django. Are you sure it's installed and " 14 | "available on your PYTHONPATH environment variable? Did you " 15 | "forget to activate a virtual environment?" 16 | ) from exc 17 | execute_from_command_line(sys.argv) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /django_search/search/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/search/__init__.py -------------------------------------------------------------------------------- /django_search/search/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 
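# (Nothing is registered here: the project defines no Django ORM models; its
# search data lives in Elasticsearch through the elasticsearch_dsl DocType in
# search/models.py, so the admin site is left empty.)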
4 | -------------------------------------------------------------------------------- /django_search/search/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class SearchConfig(AppConfig): 5 | name = 'search' 6 | -------------------------------------------------------------------------------- /django_search/search/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/search/migrations/__init__.py -------------------------------------------------------------------------------- /django_search/search/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 4 | analyzer, Completion, Keyword, Text, Integer 5 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer 6 | from elasticsearch_dsl.connections import connections 7 | 8 | connections.create_connection(hosts=["localhost"]) 9 | 10 | 11 | class CustomAnalyzer(_CustomAnalyzer): 12 | def get_analysis_definition(self): 13 | return {} 14 | 15 | 16 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"]) 17 | 18 | 19 | class PaperType(DocType): 20 | suggest = Completion(analyzer=ik_analyzer) # 用于自动补全 21 | 22 | paper_title = Text(analyzer="ik_max_word") 23 | paper_writer = Text() 24 | paper_time = Text() 25 | paper_cite_count = Integer() 26 | paper_source = Keyword() 27 | paper_abstract = Text(analyzer="ik_max_word") 28 | paper_keywords = Text(analyzer="ik_max_word") 29 | paper_DOI = Text() 30 | paper_download_link = Text() 31 | 32 | class Meta: 33 | index = "baidu" 34 | doc_type = "paper" 35 | 36 | 37 | if __name__ == "__main__": 38 | PaperType.init() 39 | -------------------------------------------------------------------------------- /django_search/search/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 
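# A minimal smoke-test sketch (not part of the original project): it only checks
# that the URL names wired up in django_search/urls.py reverse to the expected
# paths, so it runs without Elasticsearch or Redis being reachable.
from django.urls import reverse


class UrlConfTests(TestCase):
    def test_named_urls_reverse(self):
        self.assertEqual(reverse("index"), "/")
        self.assertEqual(reverse("suggest"), "/suggest/")
        self.assertEqual(reverse("search"), "/search/")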
4 | -------------------------------------------------------------------------------- /django_search/search/views.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from django.shortcuts import render 4 | from django.http import HttpResponse 5 | from django.views.generic.base import View 6 | from elasticsearch import Elasticsearch 7 | import redis 8 | 9 | from search.models import PaperType 10 | 11 | client = Elasticsearch(hosts=["127.0.0.1"]) 12 | 13 | redis_cli = redis.StrictRedis(charset='utf-8', decode_responses=True) 14 | 15 | 16 | class IndexView(View): 17 | def get(self, request): 18 | topn_search = redis_cli.zrevrangebyscore("search_keywords_set", "+inf", "-inf", start=0, num=5) 19 | return render(request, "index.html", {"topn_search": topn_search}) 20 | 21 | 22 | class SearchSuggest(View): 23 | def get(self, request): 24 | key_words = request.GET.get('s', '') 25 | re_datas = [] 26 | if key_words: 27 | s = PaperType.search() 28 | s = s.suggest('suggest', key_words, completion={ 29 | "field": "suggest", "fuzzy": { 30 | "fuzziness": 2 # 编辑距离 31 | }, "size": 10 32 | }) 33 | suggestions = s.execute_suggest() 34 | for match in suggestions.suggest[0].options: 35 | source = match._source 36 | re_datas.append(source["paper_title"]) 37 | return HttpResponse(json.dumps(re_datas), content_type="application/json") 38 | 39 | 40 | class SearchView(View): 41 | def get(self, request): 42 | key_words = request.GET.get('q', '') 43 | 44 | redis_cli.zincrby("search_keywords_set", 1, key_words) 45 | topn_search = redis_cli.zrevrangebyscore("search_keywords_set", "+inf", "-inf", start=0, num=5) 46 | 47 | page = request.GET.get("p", "1") 48 | try: 49 | page = int(page) 50 | except: 51 | page = 1 52 | 53 | baidu_count = redis_cli.get("baidu_count") 54 | start_time = datetime.now() 55 | 56 | choice = request.GET.get("option", "") 57 | if choice == 'cite': 58 | response = client.search( 59 | index="baidu", 60 | body={ 61 | "sort": {"paper_cite_count": {"order": "desc"}}, 62 | "query": { 63 | "multi_match": { 64 | "query": key_words, 65 | "fields": ["paper_title", "paper_keywords", "paper_abstract"] 66 | } 67 | }, 68 | "from": (page - 1) * 10, 69 | "size": 10, 70 | "highlight": { 71 | "pre_tags": [''], 72 | "post_tags": [''], 73 | "fields": { 74 | "paper_title": {}, 75 | "paper_abstract": {}, 76 | "paper_keywords": {} 77 | } 78 | } 79 | } 80 | ) 81 | elif choice == 'date': 82 | response = client.search( 83 | index="baidu", 84 | body={ 85 | "sort": {"paper_time": {"order": "desc"}}, 86 | "query": { 87 | "multi_match": { 88 | "query": key_words, 89 | "fields": ["paper_title", "paper_keywords", "paper_abstract"] 90 | } 91 | }, 92 | "from": (page - 1) * 10, 93 | "size": 10, 94 | "highlight": { 95 | "pre_tags": [''], 96 | "post_tags": [''], 97 | "fields": { 98 | "paper_title": {}, 99 | "paper_abstract": {}, 100 | "paper_keywords": {} 101 | } 102 | } 103 | } 104 | ) 105 | else: 106 | response = client.search( 107 | index="baidu", 108 | body={ 109 | "query": { 110 | "multi_match": { 111 | "query": key_words, 112 | "fields": ["paper_title", "paper_keywords", "paper_abstract"] 113 | } 114 | }, 115 | "from": (page-1)*10, 116 | "size": 10, 117 | "highlight": { 118 | "pre_tags": [''], 119 | "post_tags": [''], 120 | "fields": { 121 | "paper_title": {}, 122 | "paper_abstract": {}, 123 | "paper_keywords": {} 124 | } 125 | } 126 | } 127 | ) 128 | 129 | end_time = datetime.now() 130 | last_seconds = (end_time - start_time).total_seconds() 
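        # Note on the paging arithmetic below: total_nums is taken from the ES
        # response ("hits" -> "total"), while the remainder check tests
        # page % 10; total_nums % 10 is almost certainly the intended condition
        # when computing page_nums.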
131 | 132 | total_nums = response["hits"]["total"] 133 | if(page % 10) > 0: 134 | page_nums = int(total_nums/10+1) 135 | else: 136 | page_nums = int(total_nums / 10) 137 | 138 | hit_list = [] 139 | for hit in response["hits"]["hits"]: 140 | hit_dict = {} 141 | if "highlight" in hit: 142 | if "paper_title" in hit["highlight"]: 143 | hit_dict["paper_title"] = "".join(hit["highlight"]["paper_title"]) 144 | else: 145 | hit_dict["paper_title"] = hit["_source"]["paper_title"] 146 | 147 | if "paper_abstract" in hit["highlight"]: 148 | hit_dict["paper_abstract"] = "".join(hit["highlight"]["paper_abstract"]) 149 | else: 150 | hit_dict["paper_abstract"] = hit["_source"]["paper_abstract"] 151 | 152 | if "paper_keywords" in hit["highlight"]: 153 | hit_dict["paper_keywords"] = hit["highlight"]["paper_keywords"] 154 | else: 155 | if "paper_keywords" in hit["_source"]: 156 | hit_dict["paper_keywords"] = hit["_source"]["paper_keywords"] 157 | 158 | if "paper_writer" in hit["_source"]: 159 | hit_dict["paper_writer"] = hit["_source"]["paper_writer"] 160 | 161 | if "paper_time" in hit["_source"]: 162 | hit_dict["paper_time"] = hit["_source"]["paper_time"] 163 | 164 | if "paper_cite_count" in hit["_source"]: 165 | hit_dict["paper_cite_count"] = hit["_source"]["paper_cite_count"] 166 | hit_dict["paper_source"] = hit["_source"]["paper_source"] 167 | 168 | if "paper_DOI" in hit["_source"]: 169 | hit_dict["paper_DOI"] = hit["_source"]["paper_DOI"] 170 | 171 | if "paper_download_link" in hit["_source"]: 172 | hit_dict["paper_download_link"] = hit["_source"]["paper_download_link"][:5] 173 | 174 | hit_dict["score"] = hit["_score"] 175 | 176 | hit_list.append(hit_dict) 177 | 178 | return render(request, "result.html", {"page": page, 179 | "all_hits": hit_list, 180 | "key_words": key_words, 181 | "total_nums": total_nums, 182 | "page_nums": page_nums, 183 | "last_seconds": last_seconds, 184 | "baidu_count": baidu_count, 185 | "topn_search": topn_search}) 186 | 187 | -------------------------------------------------------------------------------- /django_search/static/css/advanced.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #hd{padding:20px 10px;} 4 | .logo{float:left;margin-right:30px; height:33px;} 5 | /*input搜索区域*/ 6 | .inputArea{float:left;position:relative;} 7 | .inputArea .searchInput{border:1px solid #bfbfbf;padding:0 15px;outline:none;height:35px;*line-height:38px;width:350px; background:url(../img/inputbg.png);font-size:14px;} 8 | .inputArea .searchButton{position:absolute;left:382px;top:0;*top:1px;*left:381px;width:106px;height:38px;background:url(../img/btn_min.png) no-repeat;border:none; cursor:pointer;} 9 | 10 | /*返回搜索*/ 11 | .inputArea .back{position:absolute;font-size:14px;left:500px;top:12px;width:60px;text-decoration:underline;} 12 | 13 | /*分界区域*/ 14 | .divsion{margin-bottom:24px;height:36px;background:#f9f9f9;border-bottom:1px solid #e0e0e0;} 15 | 16 | /*高级搜索区域*/ 17 | .subfield{border-left:4px solid #9cc813;font-size:14px;font-weight:bold;padding:2px 0 2px 20px;} 18 | .subfieldContent{padding-left:140px;padding-bottom:40px;} 19 | .subfieldContent .advanceItem{padding-left:350px;margin-bottom:15px;padding-top:8px;padding-bottom:3px;} 20 | .subfieldContent .advanceItem.keyWords{background:#f4f4f4;padding-top:18px;padding-bottom:3px;} 21 | .subfieldContent .advanceItem dd{float:left;margin-left:-320px;} 22 | .subfieldContent .advanceItem dd label{float:left;margin-right:40px;width:75px;font-weight:bold;} 23 | 
.subfieldContent .advanceItem dd .impInfo{ font-weight:bold;} 24 | .subfieldContent .advanceItem dd .tips{float:left;} 25 | .subfieldContent .advanceItem dd p, .subfieldContent .advanceItem dt p{margin-bottom:10px;height:26px;} 26 | .subfieldContent .advanceItem dt p input[type=text]{position:relative;top:-5px;line-height:26px;} 27 | 28 | .subfieldContent .advanceItem dt{float:left;width:100%;} 29 | .subfieldContent .advanceItem.keyWords dt input[type=text]{width:290px;height:26px;border:1px solid #bfbfbf;outline:none;} 30 | /*自定义*/ 31 | .subfieldContent .advanceItem.time{height:30px;} 32 | .subfieldContent .advanceItem .define{display:none;position:relative;*top:-3px;} 33 | .subfieldContent .advanceItem.time input[type=text]{width:80px;height:18px;line-height:18px;border:1px solid #bfbfbf;outline:none;} 34 | 35 | 36 | 37 | 38 | 39 | /*更多按钮*/ 40 | .more {float:left;} 41 | .more:hover{text-decoration:none;} 42 | .more .moreIcon{display:inline-block;position:relative;top:-1px;*top:-3px;left:2px;*left:-1px;width:9px;height:5px;background:url(../img/more.png);} 43 | .more.show .moreIcon{background:url(../img/down.png);top:-2px;} 44 | 45 | /*立即搜索样式*/ 46 | .subfieldContent .search{margin:45px 0 0 145px;width:130px;height:36px;background:url(../img/btnbg.png); font-weight:bold;border:none;border:1px solid #bfbfbf;line-height:36px;cursor:pointer;font-size:14px;} 47 | /*联想下拉区域*/ 48 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 49 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 50 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 51 | -------------------------------------------------------------------------------- /django_search/static/css/index.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #main{width:730px;margin:75px auto 0;} 4 | #main h1.title{width:600px;} 5 | #bd{margin-bottom:20px;} 6 | .logo.large{margin:0px auto 10px auto;width:400px;height:190px;background: url(../img/logo.png) no-repeat center center;} 7 | 8 | /*nav样式*/ 9 | .nav{margin-bottom:10px;} 10 | .searchList{float:left;padding-left:5px;} 11 | .searchList .searchItem{float:left;margin-right:15px;font-size:14px;padding:0 0 2px 2px;cursor:pointer;height:18px;} 12 | .searchList .searchItem.current{color:#0080cc;border-bottom:2px solid #9cc813;font-weight:bold;} 13 | 14 | /*input搜索区域*/ 15 | .inputArea{position:relative;margin-bottom:65px;} 16 | .inputArea .searchInput{border-radius: 5rem;border:1px solid #bfbfbf;padding:0 15px;outline:none;height:40px;*height:39px;*line-height:40px;width:520px; background:url(../img/inputbg.png);font-size:14px;} 17 | .inputArea .searchButton{position:absolute;left:560px;*left:562px;*top:3px;top:4px;width:40px;height:40px;*height:40px;background:url(../img/btn_min.png) no-repeat;border:none;cursor:pointer;} 18 | /*高级搜索*/ 19 | .inputArea .advanced{position:absolute;font-size:14px;left:674px;top:12px;text-decoration:underline;} 20 | /*联想下拉区域*/ 21 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 22 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 23 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 24 | 25 | /*搜索历史区域*/ 26 | .historyArea{width:600px;} 27 | .historyArea .history 
{margin-bottom:5px;} 28 | .historyArea .history label{font-weight:bold;} 29 | .historyArea .history a{margin-right:12px;} 30 | -------------------------------------------------------------------------------- /django_search/static/css/result.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | html{*overflow:auto;} 3 | #hd{padding:20px 10px;} 4 | #bd{margin-bottom:40px;} 5 | .logo{float:left;margin-right:30px; height:70px;} 6 | /*input搜索区域*/ 7 | .inputArea{float:left;position:relative;} 8 | .inputArea .searchInput{border-radius: 5rem;border:1px solid #bfbfbf;padding:0 15px;outline:none;height:35px;*line-height:35px;width:520px; background:url(../img/inputbg.png);font-size:14px;} 9 | .inputArea .searchButton{position:absolute;left:560px;top:3px;*top:2px;*left:560px;width:40px;height:40px;background:url(../img/btn_min.png) no-repeat;border:none;cursor:pointer;} 10 | 11 | /*返回高级搜索*/ 12 | .inputArea .advanced{position:absolute;font-size:14px;left:500px;top:12px;width:60px;text-decoration:underline;} 13 | 14 | /*分界区域,导航*/ 15 | .nav{margin-bottom:24px;height:31px;background:#f9f9f9;border-bottom:1px solid #e0e0e0;padding:5px 0 0 210px;} 16 | .searchList{float:left;padding-left:5px;} 17 | .searchList .searchItem{float:left;margin-right:15px;font-size:14px;padding:0 3px 2px 3px;cursor:pointer;height:26px; line-height:26px;} 18 | .searchList .searchItem.current{color:#0080cc;border-bottom:3px solid #9cc813;font-weight:bold;} 19 | .nav .tips{color:#969696;font-size:12px;line-height:24px;*line-height:26px;} 20 | #container.sideBarHide .nav{padding-left:35px;} 21 | 22 | /*#main区域样式*/ 23 | #main{padding:0 215px 0 182px;} 24 | #main.sideBarHide{padding-left:10px;} 25 | /*侧边栏搜索条件*/ 26 | .sideBar{position:relative;float:left;margin-left:-182px;width:182px;} 27 | .sideBar .subfieldContext{margin-bottom:20px;padding-left:25px;} 28 | .sideBar .subfieldContext li{margin-bottom:5px;cursor:pointer;} 29 | .sideBar .subfieldContext input[type=text]{width:75px;} 30 | .sideBar .unit{color:#787878;} 31 | 32 | /*更多按钮*/ 33 | .sideBar .more a:hover{text-decoration:none;} 34 | .sideBar .more .moreIcon{display:inline-block;position:relative;top:-1px;*top:-3px;left:2px;*left:-1px;width:9px;height:5px;background:url(../img/more.png);} 35 | .sideBar .more.show .moreIcon{background:url(../img/down.png);top:-2px;} 36 | 37 | .sideBar .reset{padding-left:25px;} 38 | /*siderBar区域显隐控制*/ 39 | .sideBar .sideBarShowHide{position:absolute;right:0px;top:20px;height:177px;width:1px; background:url(../img/line.png) right;} 40 | .sideBar .sideBarShowHide a{position:absolute;top:70px;left:-11px;display:inline-block;width:12px;height:31px;background:url(../img/lr.png);} 41 | 42 | .sideBar .sideBarShowHide a:hover{background-position:0 -31px;} 43 | 44 | /*左侧收起样式*/ 45 | #main.sideBarHide .sideBar{margin-left:-191px;*margin-left:-182px;} 46 | #main.sideBarHide .sideBar .sideBarShowHide{-moz-transform:rotate(180deg); -o-transform:rotate(180deg); -webkit-transform:rotate(180deg); transform:rotate(180deg);} 47 | #main.sideBarHide .sideBar .sideBarShowHide a{*background:url(../img/ll.png);} 48 | #main.sideBarHide .sideBar .sideBarShowHide a:hover{*background-position:0 -31px;} 49 | #main.sideBarHide .sideBar .sideBarShowHide{background:none;} 50 | 51 | .resultArea{float:left;width:100%;} 52 | .resultArea .resultTotal{position:relative;padding-left:30px;margin-bottom:20px;} 53 | .resultArea .resultTotal .info{color:#9a9a9a;} 54 | .resultArea .resultTotal 
.orderOpt{position:absolute;right:50px;} 55 | .resultArea .resultTotal .orderOpt a{margin-right:10px;color:#0080cc;} 56 | 57 | /*搜索结果列表区域*/ 58 | .resultArea .resultList{padding-left:30px;} 59 | /*.resultArea .resultList .resultItem{margin-bottom:20px;}*/ 60 | .resultArea .resultList .resultItem{margin-bottom:20px;border-bottom: 1px solid #eee;} 61 | .resultArea .resultList .itemHead{margin-bottom:5px;color:#767676;} 62 | .resultArea .resultList .itemHead .keyWord{color:#c00;} 63 | .resultArea .resultList .itemBody .keyWord{color:#c00;} 64 | .resultArea .resultList .itemHead a.title{font-size:18px;color:#001ba0;text-decoration:none;} 65 | .resultArea .resultList .itemHead .value{color:#008000;} 66 | .resultArea .resultList .itemHead .divsion{margin:0 5px;} 67 | .resultArea .resultList .itemHead .fileType{margin-right:10px;} 68 | 69 | /*搜索内容主体*/ 70 | .resultArea .resultList .itemBody{font-size:12px;margin-bottom:5px;line-height:18px;width:90%;} 71 | .resultArea .resultList .itemFoot{color:#008000; margin-bottom: 20px;} 72 | .resultArea .resultList .itemFoot .info{margin-right:10px;} 73 | 74 | .resultArea .pagination{margin-bottom:25px;padding-left:32px;} 75 | /*相关搜索*/ 76 | .resultArea .dependSearch{margin-bottom:30px;padding-left:32px;font-size:14px;} 77 | .resultArea .dependSearch h6{float:left;margin-right:15px;font-weight:bold;} 78 | .resultArea .dependSearch p{margin-bottom:5px;} 79 | .resultArea .dependSearch a{display:inline-block;margin-right:15px;text-decoration:underline;width:90px; white-space:nowrap; overflow:hidden;text-overflow:ellipsis;} 80 | .resultArea .searchInResult{padding-left:35px;} 81 | .resultArea .searchInResult .inResult{position:absolute;right:-190px;top:8px;font-size:14px;text-decoration:underline;} 82 | .resultArea .searchInResult .searchButton{left:417px;} 83 | /*历史搜索区域*/ 84 | .historyArea{float:right;margin-right:-216px;width:212px;} 85 | .historyArea h6{margin-bottom:10px;font-weight:bold;} 86 | .historyArea .historyList{margin-bottom:20px;} 87 | .historyArea .historyList li{margin-bottom:5px;} 88 | 89 | 90 | 91 | /*左侧分栏区域*/ 92 | .subfield{margin-bottom:5px;font-size:14px;font-weight:bold;padding:2px 0 2px 24px;} 93 | .subfield:first-child{border-left:4px solid #9cc813;padding-left:20px;} 94 | 95 | 96 | 97 | /*立即搜索样式*/ 98 | .subfieldContent .search{margin:45px 0 0 135px;width:130px;height:36px;background:url(../img/btnbg.png); font-weight:bold;border:none;border:1px solid #bfbfbf;line-height:36px;} 99 | /*联想下拉区域*/ 100 | .inputArea .dataList{display:none;position:absolute;left:0;top:42px;*top:43px;width:550px;padding:5px 0;background:#fff;border:1px solid #bfbfbf;border-top:none;} 101 | .inputArea .dataList li{padding:2px 15px;font-size:14px;} 102 | .inputArea .dataList li:hover{background:#f0f0f0;color:#0080cc;font-weight:bold;} 103 | 104 | 105 | 106 | .writers{color: #888;font-size: 12px;margin-right: 4px; } 107 | .resultArea .resultList .itemBody_writer{font-size: 12px; margin-bottom: 2px} 108 | .resultArea .resultList .itemBody_keywords{ margin-top: 10px;margin-bottom: 10px;} 109 | .keywords{font-size: 12px; color: #333; background-color: #f7f7f7; margin: 4px; padding: 4px} 110 | .download{color: #001ba0;} 111 | 112 | .showDownloead{ visibility:hidden; } 113 | .clickLink{margin-left: 4px;} -------------------------------------------------------------------------------- /django_search/static/css/style.css: -------------------------------------------------------------------------------- 1 | @charset "utf-8"; 2 | /*css reset*/ 3 | html, body, div, span, object, 
iframe, h1, h2, h3, h4, h5, h6, p, blockquote, pre, a, abbr, acronym, address, code, del, dfn, em, img, q, dl, dt, dd, ol, ul, li, fieldset, form, label, legend, table, caption, tbody, tfoot, thead, tr, th, td , i{ 4 | margin:0; 5 | padding:0; 6 | border:0; 7 | font-weight:inherit; 8 | font-style:inherit; 9 | font-size:100%; 10 | font-family:inherit; 11 | vertical-align:baseline; 12 | } 13 | body {line-height:1.5;} 14 | table {border-collapse: collapse;border-spacing:0;} 15 | caption, th, td ,b,strong{text-align:left;font-weight:normal;} 16 | table, td, th {vertical-align:middle;} 17 | blockquote:before, blockquote:after, q:before, q:after {content:"";} 18 | blockquote, q {quotes:"" "";} 19 | a img {border:none;} 20 | em,cite{font-style:normal;} 21 | 22 | 23 | body { background:#fff; font: 12px/1.5 Tahoma,'宋体';color:#000;} 24 | h1, h2, h3, h4, h5, h6 {font-weight:normal;color:#111;} 25 | a {text-decoration:none;cursor:pointer;} 26 | dl, dt, dd, ol, ul, li{ list-style:none;} 27 | 28 | /*some common class*/ 29 | .left{float:left;} 30 | .right{float:right;} 31 | 32 | /*clear*/ 33 | .ue-clear:after{content: ".";display:block;height:0;clear:both;visibility:hidden;} 34 | .ue-clear{display:inline-block;} 35 | *html .ue-clear{height:1%;} 36 | .ue-clear{display:block;} 37 | 38 | a{color:#0080cc;} 39 | a:hover{color:#267A01;text-decoration:underline;} 40 | /*logo样式*/ 41 | .logo{width:160px;height:47px;padding:0 5px;background: url(../img/logo1.png) no-repeat center center #fff;} 42 | 43 | /*choose样式*/ 44 | .choose{float:left;margin-right:15px;white-space:nowrap;} 45 | .choose .text{float:left;padding-left:20px;*padding-left:16px;white-space:nowrap; vertical-align:text-bottom;} 46 | .choose input[type=radio],.choose input[type=checkbox]{position:relative;*top:-3px;float:left;margin-right:-16px;} 47 | 48 | /*==================================== 49 | 分页信息(表格依赖样式) 50 | ===================================*/ 51 | .pagination{font-size:14px;} 52 | .pagination a {text-decoration: none;border: solid 1px; } 53 | .pagination .pxofy{float:left;margin-left: 5px;height:25px;*padding-top:1px;} 54 | .pagination a, .pagination span {display: block;float: left;height:18px;line-height:18px;padding:0 6px;margin-right: 5px;font-family:Arial, Helvetica, sans-serif !important;} 55 | .pagination .current {cursor:default;border: solid 1px ;} 56 | .pagination .prev, .pagination .next{*line-height:22px;} 57 | 58 | /*分页样式*/ 59 | .pagination a{color: #032F54;border-color:#8EB2D2;} 60 | .pagination a:hover{color:#023054;border-color:#8EB2D2;background:#B8DFFB;} 61 | .pagination .current{color:#fff;border-color:#5c9bc4;background:#89B8D8;} 62 | .pagination .current.prev, .pagination .current.next{color:#B9B9B9;border-color:#D3D3D3;background:#fff;} 63 | .pagination .pxofy{color: #023054;} 64 | 65 | #foot{height:32px;line-height:32px; text-align:center;background:#f9f9f9;border-top:1px solid #e0e0e0;color:#ababab;} 66 | 67 | 68 | -------------------------------------------------------------------------------- /django_search/static/img/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/Thumbs.db -------------------------------------------------------------------------------- /django_search/static/img/btn_min.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/btn_min.png -------------------------------------------------------------------------------- /django_search/static/img/btnbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/btnbg.png -------------------------------------------------------------------------------- /django_search/static/img/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/down.png -------------------------------------------------------------------------------- /django_search/static/img/inputbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/inputbg.png -------------------------------------------------------------------------------- /django_search/static/img/line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/line.png -------------------------------------------------------------------------------- /django_search/static/img/ll.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/ll.png -------------------------------------------------------------------------------- /django_search/static/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/logo.png -------------------------------------------------------------------------------- /django_search/static/img/logo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/logo1.png -------------------------------------------------------------------------------- /django_search/static/img/lr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/lr.png -------------------------------------------------------------------------------- /django_search/static/img/more.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/more.png -------------------------------------------------------------------------------- /django_search/static/img/result_icon.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/result_icon.png -------------------------------------------------------------------------------- /django_search/static/img/searchbtn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/django_search/static/img/searchbtn.png -------------------------------------------------------------------------------- /django_search/static/js/common.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by yli on 2017/4/21. 3 | */ 4 | 5 | var searchArr; 6 | //定义一个search的,判断浏览器有无数据存储(搜索历史) 7 | if(localStorage.search){ 8 | //如果有,转换成 数组的形式存放到searchArr的数组里(localStorage以字符串的形式存储,所以要把它转换成数组的形式) 9 | searchArr= localStorage.search.split(",") 10 | }else{ 11 | //如果没有,则定义searchArr为一个空的数组 12 | searchArr = []; 13 | } 14 | //把存储的数据显示出来作为搜索历史 15 | MapSearchArr(); 16 | 17 | 18 | $("#btn").on("click", function(){ 19 | var val = $("#inp").val(); 20 | //点击搜索按钮时,去重 21 | KillRepeat(val); 22 | //去重后把数组存储到浏览器localStorage 23 | localStorage.search = searchArr; 24 | //然后再把搜索内容显示出来 25 | MapSearchArr(); 26 | }); 27 | 28 | 29 | function MapSearchArr(){ 30 | var tmpHtml = ""; 31 | for (var i=0;i " 33 | } 34 | $("#keyname").html(tmpHtml); 35 | } 36 | //去重 37 | function KillRepeat(val){ 38 | var kill = 0; 39 | for (var i=0;i ne_half ? Math.max(Math.min(current_page 33 | - ne_half, upper_limit), 0) : 0; 34 | var end = current_page > ne_half ? Math.min(current_page + ne_half, 35 | np) : Math.min(opts.num_display_entries, np); 36 | return [start, end]; 37 | } 38 | 39 | /** 40 | * 点击事件 41 | */ 42 | function pageSelected(page_id, evt) { 43 | var page_id = parseInt(page_id); 44 | current_page = page_id; 45 | drawLinks(); 46 | var continuePropagation = opts.callback(page_id, panel); 47 | if (!continuePropagation) { 48 | if (evt.stopPropagation) { 49 | evt.stopPropagation(); 50 | } else { 51 | evt.cancelBubble = true; 52 | } 53 | } 54 | return continuePropagation; 55 | } 56 | 57 | /** 58 | * 链接 59 | */ 60 | function drawLinks() { 61 | panel.empty(); 62 | var interval = getInterval(); 63 | var np = numPages(); 64 | var getClickHandler = function(page_id) { 65 | return function(evt) { 66 | return pageSelected(page_id, evt); 67 | } 68 | } 69 | var appendItem = function(page_id, appendopts) { 70 | page_id = page_id < 0 ? 0 : (page_id < np ? 
page_id : np-1); 71 | appendopts = jQuery.extend({ 72 | text : page_id+1, 73 | classes : "" 74 | }, appendopts || {}); 75 | if (page_id == current_page) { 76 | var lnk = $("" + (appendopts.text) 77 | + ""); 78 | } else { 79 | var lnk = $("" + (appendopts.text) + "").bind( 80 | "click", getClickHandler(page_id)).attr('href', 81 | opts.link_to.replace(/__id__/, page_id)); 82 | 83 | } 84 | if (appendopts.classes) { 85 | lnk.addClass(appendopts.classes); 86 | } 87 | panel.append(lnk); 88 | } 89 | // 上一页 90 | if (opts.prev_text && (current_page > 0 || opts.prev_show_always)) { 91 | appendItem(current_page - 1, { 92 | text : opts.prev_text, 93 | classes : "prev" 94 | }); 95 | } 96 | // 点点点 97 | if (interval[0] > 0 && opts.num_edge_entries > 0) { 98 | var end = Math.min(opts.num_edge_entries, interval[0]); 99 | for (var i = 0; i < end; i++) { 100 | appendItem(i); 101 | } 102 | if (opts.num_edge_entries < interval[0] && opts.ellipse_text) { 103 | jQuery("" + opts.ellipse_text + "") 104 | .appendTo(panel); 105 | } 106 | } 107 | // 中间的页码 108 | for (var i = interval[0]; i < interval[1]; i++) { 109 | appendItem(i); 110 | } 111 | // 最后的页码 112 | if (interval[1] < np && opts.num_edge_entries > 0) { 113 | if (np - opts.num_edge_entries > interval[1] 114 | && opts.ellipse_text) { 115 | jQuery("" + opts.ellipse_text + "") 116 | .appendTo(panel); 117 | } 118 | var begin = Math.max(np - opts.num_edge_entries, interval[1]); 119 | for (var i = begin; i < np; i++) { 120 | appendItem(i); 121 | } 122 | 123 | } 124 | // 下一页 125 | if (opts.next_text 126 | && (current_page < np - 1 || opts.next_show_always)) { 127 | appendItem(current_page + 1, { 128 | text : opts.next_text, 129 | classes : "next" 130 | }); 131 | } 132 | // 记录显示 133 | if (opts.display_msg) { 134 | if(!maxentries){ 135 | panel 136 | .append('
No data to display yet
'); 137 | }else{ 138 | panel 139 | .append('
Showing record ' 140 | + ((current_page * opts.items_per_page) + 1) 141 | + ' to ' 142 | + (((current_page + 1) * opts.items_per_page) > maxentries 143 | ? maxentries 144 | : ((current_page + 1) * opts.items_per_page)) 145 | + ' of ' + maxentries + ' records in total
'); 146 | } 147 | } 148 | // optional jump-to-page control 149 | if(opts.setPageNo){ 150 | panel.append("
Jump to
"); 151 | } 152 | } 153 | 154 | // 当前页 155 | var current_page = opts.current_page; 156 | maxentries = ( maxentries < 0) ? 0 : maxentries; 157 | opts.items_per_page = (!opts.items_per_page || opts.items_per_page < 0) 158 | ? 1 159 | : opts.items_per_page; 160 | var panel = jQuery(this); 161 | this.selectPage = function(page_id) { 162 | pageSelected(page_id); 163 | } 164 | this.prevPage = function() { 165 | if (current_page > 0) { 166 | pageSelected(current_page - 1); 167 | return true; 168 | } else { 169 | return false; 170 | } 171 | } 172 | this.nextPage = function() { 173 | if (current_page < numPages() - 1) { 174 | pageSelected(current_page + 1); 175 | return true; 176 | } else { 177 | return false; 178 | } 179 | } 180 | 181 | if(maxentries==0){ 182 | panel.append(''+opts.prev_text+''+opts.next_text+'
No data to display yet
'); 183 | }else{ 184 | drawLinks(); 185 | } 186 | $(this).find(".goto button").live("click",function(evt){ 187 | var setPageNo = $(this).parent().find("input").val(); 188 | if(setPageNo!=null && setPageNo!=""&&setPageNo>0&&setPageNo<=numPages()){ 189 | pageSelected(setPageNo-1, evt); 190 | } 191 | }); 192 | }); 193 | } 194 | -------------------------------------------------------------------------------- /django_search/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% load staticfiles %} 4 | 5 | 6 | 7 | 搜索引擎 8 | 9 | 10 | 11 | 12 |
13 |
14 |
15 |

16 | 17 |

18 |
19 | 20 | 21 |
    22 |
    23 | 24 |
    25 |

    26 | 27 | {% for search_word in topn_search %} 28 | {{ search_word }} 29 | {% endfor %} 30 |

    31 |

    32 | 33 | 34 | 35 | 36 | 37 |

    38 |
    39 |
    40 |
    41 | 42 |
    43 | 44 | 45 | 46 | 96 | 156 | -------------------------------------------------------------------------------- /django_search/templates/result.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% load staticfiles %} 4 | 5 | 6 | 7 | 搜索引擎 8 | 9 | 10 | 11 | 12 |
    13 |
    14 | 15 | 16 | 17 |
    18 | 19 | 20 |
      21 |
      22 |
      23 |
      24 |
      25 | 54 |
      55 |
      56 |
      热门搜索
      57 |
        58 | {% for search_word in topn_search %} 59 |
      • {{ search_word }}
      • 60 | {% endfor %} 61 |
      62 |
      63 |
      64 |
      我的搜索
      65 |
        66 |
        67 |
        68 |
        69 |

        70 | 找到约 {{ total_nums }} 条结果(用时{{ last_seconds }}秒),共约{{ page_nums }} 73 |

        74 |
        75 | 76 | {% for hit in all_hits %} 77 |
        78 |
        79 | 80 | {% autoescape off %} 81 | {{ hit.paper_title }} 82 | {% endautoescape %} 83 | 84 | - 85 | 86 | [来源] 87 | 88 | 89 | 得分 90 | {{ hit.score }} 91 | 92 |
        93 |
        94 | {% for writer in hit.paper_writer %} 95 | {{ writer }} 96 | {% endfor %} 97 |
        98 |
        99 | {% autoescape off %} 100 | {{ hit.paper_abstract }} 101 | {% endautoescape %} 102 |
        103 |
        104 | {% for word in hit.paper_keywords %} 105 | 106 | {% autoescape off %} 107 | {{ word }} 108 | {% endautoescape %} 109 | 110 | {% endfor %} 111 |
        112 |
        113 | 114 | {{ hit.paper_time }}年 115 | 116 | 117 | 118 | {{ hit.paper_cite_count }} 119 | 120 | 121 | 122 | {{ hit.paper_DOI }} 123 | 124 | 125 | 下载地址 > 126 | 127 | {% for link in hit.paper_download_link %} 128 | 点击下载 129 | {% endfor %} 130 | 131 | 132 |
        133 |
        134 | {% endfor %} 135 | 136 |
        137 | 138 | 139 | 140 |
        141 |
        142 |
        143 |
        144 | 145 | 146 | 147 | 148 | 202 | 268 | 318 | -------------------------------------------------------------------------------- /paperSpider/.idea/paperSpider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /paperSpider/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-02-18 15:52 3 | # @Author : beking 4 | from scrapy.cmdline import execute 5 | 6 | import sys 7 | import os 8 | 9 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 10 | 11 | execute(["scrapy", "crawl", "baidu"]) 12 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Beking0912/distributed-paper-search-engine/23fdb408a88cae581bed2b7721c011a903736d06/paperSpider/paperSpider/__init__.py -------------------------------------------------------------------------------- /paperSpider/paperSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class PaperspiderItem(scrapy.Item): 12 | paper_title = scrapy.Field() # 论文题目 13 | paper_writer = scrapy.Field() # 作者 14 | paper_time = scrapy.Field() # 发表年代 15 | paper_cite_count = scrapy.Field() # 被引用量 16 | paper_source = scrapy.Field() # 来源 17 | paper_abstract = scrapy.Field() # 摘要 18 | paper_keywords = scrapy.Field() # 关键词 19 | paper_DOI = scrapy.Field() # DOI 20 | paper_download_link = scrapy.Field() # 下载地址 21 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class PaperspiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 
41 | 42 | # Should return either None or an iterable of Request, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class PaperspiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-22 13:34 3 | # @Author : beking 4 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/models/es_types.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-22 13:35 3 | # @Author : beking 4 | 5 | from datetime import datetime 6 | from elasticsearch_dsl import DocType, Date, Nested, Boolean, \ 7 | analyzer, Completion, Keyword, Text, Integer 8 | from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer 9 | from elasticsearch_dsl.connections import connections 10 | 11 | connections.create_connection(hosts=["localhost"]) 12 | 13 | 14 | class CustomAnalyzer(_CustomAnalyzer): 15 | def get_analysis_definition(self): 16 | return {} 17 | 18 | 19 | ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"]) 20 | 21 | 22 | class PaperType(DocType): 23 | suggest = Completion(analyzer=ik_analyzer) # 用于自动补全 24 | 25 | paper_title = Text(analyzer="ik_max_word") 26 | paper_writer = Keyword() 27 | paper_time = Integer() 28 | paper_cite_count = Integer() 29 | paper_source = Keyword() 30 | paper_abstract = Text(analyzer="ik_max_word") 31 | paper_keywords = Text(analyzer="ik_max_word") 32 | paper_DOI = Text() 33 | paper_download_link = Text() 34 | 35 | class Meta: 36 | index = "baidu" 37 | doc_type = "paper" 38 | 39 | 40 | if __name__ == "__main__": 41 | PaperType.init() 42 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | # import codecs 8 | # from scrapy.exporters import JsonItemExporter 9 | import redis 10 | from paperSpider.models.es_types import PaperType 11 | 12 | from elasticsearch_dsl.connections import connections 13 | 14 | es = connections.create_connection(PaperType._doc_type.using) 15 | 16 | redis_cli = redis.StrictRedis() 17 | 18 | # import MySQLdb 19 | 20 | 21 | def gen_suggests(index, info_tuple): 22 | # 根据字符串生成搜索建议数组 23 | used_words = set() 24 | suggests = [] 25 | for text, weight in info_tuple: 26 | if text: 27 | # 调用 es 的 analyze 接口分析字符串 28 | words = es.indices.analyze(index=index, analyzer="ik_smart", params={'filter': ["lowercase"]}, body=text) 29 | analyzed_words = set([r["token"] for r in words["tokens"] if len(r["token"]) > 1]) 30 | new_words = analyzed_words - used_words 31 | else: 32 | new_words = set() 33 | 34 | if new_words: 35 | suggests.append({"input": list(new_words), "weight": weight}) 36 | used_words = used_words.union(new_words) 37 | 38 | return suggests 39 | 40 | 41 | class PaperspiderPipeline(object): 42 | def process_item(self, item, spider): 43 | return item 44 | 45 | 
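# gen_suggests() above feeds the Completion field declared as `suggest` in
# models/es_types.py. For a (title, keywords, abstract) triple it returns a
# weighted list shaped roughly like
#     [{"input": ["deep", "learning"], "weight": 10},
#      {"input": ["neural", "network"], "weight": 5}]
# (illustrative tokens only), which Elasticsearch's completion suggester
# consumes for search-as-you-type suggestions.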
46 | # class MysqlPipeline(object): 47 | # def __init__(self): 48 | # self.conn = MySQLdb.connect('127.0.0.1', 'root', '123456', 'paper_spider_data', charset='utf8', 49 | # use_unicode=True) 50 | # self.cursor = self.conn.cursor() 51 | # 52 | # def process_item(self, item, spider): 53 | # insert_sql = """ 54 | # insert into paper_info(paper_title,paper_writer,paper_time,paper_cite_count,paper_source,paper_abstract,paper_keywords,paper_DOI,paper_download_link) 55 | # values (%s, %s, %s, %s, %s, %s, %s, %s, %s) 56 | # """ 57 | # params = list() 58 | # params.append(item.get('paper_title', '')) 59 | # params.append(item.get('paper_writer', '')) 60 | # params.append(item.get('paper_time', '')) 61 | # params.append(item.get('paper_cite_count', '')) 62 | # params.append(item.get('paper_source', '')) 63 | # params.append(item.get('paper_abstract', '')) 64 | # params.append(item.get('paper_keywords', '')) 65 | # params.append(item.get('paper_DOI', '')) 66 | # params.append(item.get('paper_download_link', '')) 67 | # self.cursor.execute(insert_sql, tuple(params)) 68 | # self.conn.commit() 69 | # 70 | # return item 71 | # 72 | # 73 | # class JsonExporterPipeline(object): 74 | # def __init__(self): 75 | # self.file = open('paper_export.json', 'wb') 76 | # self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False) 77 | # self.exporter.start_exporting() 78 | # 79 | # def process_item(self, item, spider): 80 | # self.exporter.export_item(item) 81 | # return item 82 | # 83 | # def spider_closed(self): 84 | # self.exporter.finish_exporting() 85 | # self.file.close() 86 | 87 | 88 | class ElasticsearchPipeline(object): 89 | def process_item(self, item, spider): 90 | # 将 item 转化为 es 的数据 91 | paper = PaperType() 92 | paper.paper_title = item['paper_title'] 93 | paper.paper_writer = item['paper_writer'] 94 | paper.paper_abstract = item['paper_abstract'] 95 | paper.paper_keywords = item['paper_keywords'] 96 | paper.paper_DOI = item['paper_DOI'] 97 | paper.paper_time = item['paper_time'] 98 | paper.paper_cite_count = item['paper_cite_count'] 99 | paper.paper_source = item['paper_source'] 100 | paper.paper_download_link = item['paper_download_link'] 101 | paper.meta.id = item['paper_source'] 102 | 103 | paper.suggest = gen_suggests(PaperType._doc_type.index, 104 | ((paper.paper_title, 10), (paper.paper_keywords, 5), (paper.paper_abstract, 2))) 105 | 106 | paper.save() 107 | redis_cli.incr("baidu_count") 108 | 109 | return item 110 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for paperSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://docs.scrapy.org/en/latest/topics/settings.html 9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'paperSpider' 13 | 14 | SPIDER_MODULES = ['paperSpider.spiders'] 15 | NEWSPIDER_MODULE = 'paperSpider.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'paperSpider (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Enables scheduling storing requests queue in redis. 
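# Replacing the default scheduler with scrapy_redis.scheduler.Scheduler moves the
# request queue into Redis, so every worker process pointed at the same Redis
# instance pulls from one shared crawl frontier instead of keeping its own.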
24 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 25 | 26 | # Schedule requests using a priority queue. (default) 27 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' 28 | 29 | # Ensure all spiders share same duplicates filter through redis. 30 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | # CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | # DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | # CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | # COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | # TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | # DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | # } 54 | 55 | # Enable or disable spider middlewares 56 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 57 | # SPIDER_MIDDLEWARES = { 58 | # 'paperSpider.middlewares.PaperspiderSpiderMiddleware': 543, 59 | # } 60 | 61 | # Enable or disable downloader middlewares 62 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 63 | # DOWNLOADER_MIDDLEWARES = { 64 | # 'paperSpider.middlewares.PaperspiderDownloaderMiddleware': 543, 65 | # } 66 | 67 | # Enable or disable extensions 68 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 69 | # EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | # } 72 | 73 | # Configure item pipelines 74 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 75 | # ITEM_PIPELINES = { 76 | # 'paperSpider.pipelines.JsonExporterPipeline': 2, 77 | # 'paperSpider.pipelines.MysqlPipeline': 3, 78 | # 'paperSpider.pipelines.PaperspiderPipeline': 300, 79 | # } 80 | # Store scraped item in redis for post-processing. 
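# Pipeline priorities run lowest-first: ElasticsearchPipeline (1) indexes the item
# and bumps the Redis "baidu_count" counter before RedisPipeline (300) serialises
# it into the <spider>:items list for any later post-processing.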
81 | ITEM_PIPELINES = { 82 | 'scrapy_redis.pipelines.RedisPipeline': 300, 83 | 'paperSpider.pipelines.ElasticsearchPipeline': 1 84 | } 85 | # Enable and configure the AutoThrottle extension (disabled by default) 86 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 87 | # AUTOTHROTTLE_ENABLED = True 88 | # The initial download delay 89 | # AUTOTHROTTLE_START_DELAY = 5 90 | # The maximum download delay to be set in case of high latencies 91 | # AUTOTHROTTLE_MAX_DELAY = 60 92 | # The average number of requests Scrapy should be sending in parallel to 93 | # each remote server 94 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 95 | # Enable showing throttling stats for every response received: 96 | # AUTOTHROTTLE_DEBUG = False 97 | 98 | # Enable and configure HTTP caching (disabled by default) 99 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 100 | # HTTPCACHE_ENABLED = True 101 | # HTTPCACHE_EXPIRATION_SECS = 0 102 | # HTTPCACHE_DIR = 'httpcache' 103 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 104 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 105 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/spiders/baidu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from urllib import parse 4 | from paperSpider.utils.common import format_word 5 | import scrapy 6 | from scrapy import Request 7 | from paperSpider.items import PaperspiderItem 8 | from scrapy_redis.spiders import RedisSpider 9 | 10 | 11 | class BaiduSpider(scrapy.Spider): 12 | name = 'baidu' 13 | # redis_key = 'baidu:start_urls' 14 | allowed_domains = ['xueshu.baidu.com'] 15 | input_keyword = 'machine%20learning' 16 | start_urls = [ 17 | # 'http://xueshu.baidu.com/s?wd=machine%20learning&pn=0&tn=SE_baiduxueshu_c1gjeupa&ie=utf-8&sc_f_para=sc_tasktype%3D%7BfirstAdvancedSearch%7D&sc_hit=1' 18 | # 'http://xueshu.baidu.com/s?wd=%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0&tn=SE_baiduxueshu_c1gjeupa&cl=3&ie=utf-8&bs=machine+learning&f=8&rsv_bp=1&rsv_sug2=0&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D&rsv_spt=3' 19 | # 'http://xueshu.baidu.com/s?wd=%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98&rsv_bp=0&tn=SE_baiduxueshu_c1gjeupa&rsv_spt=3&ie=utf-8&f=8&rsv_sug2=0&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D' 20 | 'http://xueshu.baidu.com/s?wd=%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0&tn=SE_baiduxueshu_c1gjeupa&cl=3&ie=utf-8&bs=%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98&f=8&rsv_bp=1&rsv_sug2=0&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D&rsv_spt=3' 21 | ] 22 | 23 | def parse(self, response): 24 | paper_nodes = response.xpath('//*[@class="sc_content"]') 25 | for paper_node in paper_nodes: 26 | paper_url = paper_node.css('h3 a::attr(href)').extract_first('') 27 | yield Request(url=parse.urljoin(response.url, paper_url), callback=self.parse_detail) 28 | 29 | next_url = response.css('#page a:last-child::attr(href)').extract_first('') 30 | if next_url: 31 | # next_url = 'http://xueshu.baidu.com' + next_url 32 | yield Request(url=parse.urljoin(response.url, 
next_url), callback=self.parse) 33 | 34 | def parse_detail(self, response): 35 | paper_item = PaperspiderItem() 36 | 37 | paper_title = response.css('.main-info h3 a::text').extract_first('') 38 | paper_title = format_word(paper_title) 39 | 40 | paper_writer = response.css('.author_wr .author_text span a::text').extract() 41 | paper_abstract = response.css('.abstract::text').extract_first('') 42 | paper_keywords = response.css('.kw_wr .kw_main span a::text').extract() 43 | 44 | paper_DOI = response.css('.doi_wr .kw_main::text').extract_first('') 45 | paper_DOI = format_word(paper_DOI) 46 | 47 | paper_cite_count = response.css('.sc_cite_cont::text').extract_first(0) 48 | paper_cite_count = format_word(paper_cite_count) 49 | 50 | paper_source = response.css('.love_wr .label-ll a::attr(href)').extract_first('') 51 | 52 | paper_time = response.css('.year_wr .kw_main::text').extract_first('暂无') 53 | paper_time = format_word(paper_time) 54 | 55 | paper_download_link = response.css('#savelink_wr .dl_item_span a::attr(href)').extract() 56 | 57 | paper_item['paper_title'] = paper_title 58 | paper_item['paper_writer'] = paper_writer 59 | paper_item['paper_abstract'] = paper_abstract 60 | paper_item['paper_keywords'] = paper_keywords 61 | paper_item['paper_DOI'] = paper_DOI 62 | paper_item['paper_time'] = paper_time 63 | paper_item['paper_cite_count'] = paper_cite_count 64 | paper_item['paper_source'] = paper_source 65 | paper_item['paper_download_link'] = paper_download_link[:5] 66 | 67 | yield paper_item 68 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-21 13:37 3 | # @Author : beking 4 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/tools/add_urls.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-21 13:37 3 | # @Author : beking 4 | 5 | import redis 6 | import json 7 | 8 | rd = redis.Redis("127.0.0.1", decode_responses=True) 9 | rd.lpush('baidu:start_urls', 10 | 'http://xueshu.baidu.com/s?wd=machine+learning&rsv_bp=0&tn=SE_baiduxueshu_c1gjeupa&rsv_spt=3&ie=utf-8&f=8&rsv_sug2=1&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D') 11 | 12 | urls = [('', 3, 'parse_detail'), ('', 4, 'parse_detail')] 13 | 14 | for url in urls: 15 | rd.rpush("baidu:new_urls", json.dumps(url)) 16 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-19 12:29 3 | # @Author : beking 4 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/utils/bloomfilter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-20 13:25 3 | # @Author : beking 4 | 5 | import mmh3 6 | import redis 7 | import math 8 | import time 9 | 10 | 11 | class PyBloomFilter(): 12 | # 内置100个随机种子 13 | SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372, 14 | 344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338, 15 | 465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 
574, 480, 307, 580, 71, 535, 300, 53, 16 | 481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371, 17 | 63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518] 18 | 19 | # capacity 预先估计要去重的数量 20 | # error_rate 错误率 21 | # conn redis的连接客户端 22 | # key 在redis中的键的名字前缀 23 | def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'): 24 | self.m = math.ceil(capacity * math.log2(math.e) * math.log2(1 / error_rate)) # 需要的总bit位数 25 | self.k = math.ceil(math.log1p(2) * self.m / capacity) # 需要最少的hash次数 26 | self.mem = math.ceil(self.m / 8 / 1024 / 1024) # 需要的多少M内存 27 | self.blocknum = math.ceil(self.mem / 512) # 需要多少个512M的内存块,value的第一个字符必须是ascii码,所有最多有256个内存块 28 | self.seeds = self.SEEDS[0:self.k] 29 | self.key = key 30 | self.N = 2 ** 31 - 1 31 | self.redis = conn 32 | print(self.mem) 33 | print(self.k) 34 | 35 | def add(self, value): 36 | name = self.key + "_" + str(ord(value[0]) % self.blocknum) 37 | hashs = self.get_hashs(value) 38 | for hash in hashs: 39 | self.redis.setbit(name, hash, 1) 40 | 41 | def is_exist(self, value): 42 | name = self.key + "_" + str(ord(value[0]) % self.blocknum) 43 | hashs = self.get_hashs(value) 44 | exist = True 45 | for hash in hashs: 46 | exist = exist & self.redis.getbit(name, hash) 47 | return exist 48 | 49 | def get_hashs(self, value): 50 | hashs = list() 51 | for seed in self.seeds: 52 | hash = mmh3.hash(value, seed) 53 | if hash >= 0: 54 | hashs.append(hash) 55 | else: 56 | hashs.append(self.N - hash) 57 | return hashs 58 | 59 | 60 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0) 61 | conn = redis.StrictRedis(connection_pool=pool) 62 | 63 | start = time.time() 64 | bf = PyBloomFilter(conn=conn) 65 | bf.add('www.jobbole.com') 66 | bf.add('www.zhihu.com') 67 | print(bf.is_exist('www.zhihu.com')) 68 | print(bf.is_exist('www.lagou.com')) 69 | -------------------------------------------------------------------------------- /paperSpider/paperSpider/utils/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-03-19 12:29 3 | # @Author : beking 4 | import re 5 | 6 | 7 | def format_word(word): 8 | if word: 9 | re.sub(r'\s+', '', word) 10 | word = str(word).strip() 11 | return word 12 | -------------------------------------------------------------------------------- /paperSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = paperSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = paperSpider 12 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .connection import ( # NOQA 3 | get_redis, 4 | get_redis_from_settings, 5 | ) 6 | 7 | __author__ = 'Rolando Espinoza' 8 | __email__ = 'rolando at rmax.io' 9 | __version__ = '0.7.0-dev' 10 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/connection.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | from scrapy.utils.misc import load_object 4 | 5 | 
from . import defaults 6 | 7 | # Shortcut maps 'setting name' -> 'parmater name'. 8 | SETTINGS_PARAMS_MAP = { 9 | 'REDIS_URL': 'url', 10 | 'REDIS_HOST': 'host', 11 | 'REDIS_PORT': 'port', 12 | 'REDIS_ENCODING': 'encoding', 13 | } 14 | 15 | 16 | def get_redis_from_settings(settings): 17 | """Returns a redis client instance from given Scrapy settings object. 18 | 19 | This function uses ``get_client`` to instantiate the client and uses 20 | ``defaults.REDIS_PARAMS`` global as defaults values for the parameters. You 21 | can override them using the ``REDIS_PARAMS`` setting. 22 | 23 | Parameters 24 | ---------- 25 | settings : Settings 26 | A scrapy settings object. See the supported settings below. 27 | 28 | Returns 29 | ------- 30 | server 31 | Redis client instance. 32 | 33 | Other Parameters 34 | ---------------- 35 | REDIS_URL : str, optional 36 | Server connection URL. 37 | REDIS_HOST : str, optional 38 | Server host. 39 | REDIS_PORT : str, optional 40 | Server port. 41 | REDIS_ENCODING : str, optional 42 | Data encoding. 43 | REDIS_PARAMS : dict, optional 44 | Additional client parameters. 45 | 46 | """ 47 | params = defaults.REDIS_PARAMS.copy() 48 | params.update(settings.getdict('REDIS_PARAMS')) 49 | # XXX: Deprecate REDIS_* settings. 50 | for source, dest in SETTINGS_PARAMS_MAP.items(): 51 | val = settings.get(source) 52 | if val: 53 | params[dest] = val 54 | 55 | # Allow ``redis_cls`` to be a path to a class. 56 | if isinstance(params.get('redis_cls'), six.string_types): 57 | params['redis_cls'] = load_object(params['redis_cls']) 58 | 59 | return get_redis(**params) 60 | 61 | 62 | # Backwards compatible alias. 63 | from_settings = get_redis_from_settings 64 | 65 | 66 | def get_redis(**kwargs): 67 | """Returns a redis client instance. 68 | 69 | Parameters 70 | ---------- 71 | redis_cls : class, optional 72 | Defaults to ``redis.StrictRedis``. 73 | url : str, optional 74 | If given, ``redis_cls.from_url`` is used to instantiate the class. 75 | **kwargs 76 | Extra parameters to be passed to the ``redis_cls`` class. 77 | 78 | Returns 79 | ------- 80 | server 81 | Redis client instance. 82 | 83 | """ 84 | redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS) 85 | url = kwargs.pop('url', None) 86 | if url: 87 | return redis_cls.from_url(url, **kwargs) 88 | else: 89 | return redis_cls(**kwargs) 90 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/defaults.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | # For standalone use. 4 | DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' 5 | 6 | PIPELINE_KEY = '%(spider)s:items' 7 | 8 | REDIS_CLS = redis.StrictRedis 9 | REDIS_ENCODING = 'utf-8' 10 | # Sane connection defaults. 
11 | REDIS_PARAMS = { 12 | 'socket_timeout': 30, 13 | 'socket_connect_timeout': 30, 14 | 'retry_on_timeout': True, 15 | 'encoding': REDIS_ENCODING, 16 | } 17 | 18 | SCHEDULER_QUEUE_KEY = '%(spider)s:requests' 19 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' 20 | SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter' 21 | SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter' 22 | 23 | START_URLS_KEY = '%(name)s:start_urls' 24 | START_URLS_AS_SET = False 25 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/dupefilter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from scrapy.dupefilters import BaseDupeFilter 5 | from scrapy.utils.request import request_fingerprint 6 | 7 | from . import defaults 8 | from .connection import get_redis_from_settings 9 | from paperSpider.utils.bloomfilter import conn, PyBloomFilter 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | # TODO: Rename class to RedisDupeFilter. 15 | class RFPDupeFilter(BaseDupeFilter): 16 | """Redis-based request duplicates filter. 17 | 18 | This class can also be used with default Scrapy's scheduler. 19 | 20 | """ 21 | 22 | logger = logger 23 | 24 | def __init__(self, server, key, debug=False): 25 | """Initialize the duplicates filter. 26 | 27 | Parameters 28 | ---------- 29 | server : redis.StrictRedis 30 | The redis server instance. 31 | key : str 32 | Redis key Where to store fingerprints. 33 | debug : bool, optional 34 | Whether to log filtered requests. 35 | 36 | """ 37 | self.server = server 38 | self.key = key 39 | self.debug = debug 40 | self.logdupes = True 41 | 42 | self.bf = PyBloomFilter(conn=conn, key=key) 43 | 44 | @classmethod 45 | def from_settings(cls, settings): 46 | """Returns an instance from given settings. 47 | 48 | This uses by default the key ``dupefilter:``. When using the 49 | ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as 50 | it needs to pass the spider name in the key. 51 | 52 | Parameters 53 | ---------- 54 | settings : scrapy.settings.Settings 55 | 56 | Returns 57 | ------- 58 | RFPDupeFilter 59 | A RFPDupeFilter instance. 60 | 61 | 62 | """ 63 | server = get_redis_from_settings(settings) 64 | # XXX: This creates one-time key. needed to support to use this 65 | # class as standalone dupefilter with scrapy's default scheduler 66 | # if scrapy passes spider on open() method this wouldn't be needed 67 | # TODO: Use SCRAPY_JOB env as default and fallback to timestamp. 68 | key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())} 69 | debug = settings.getbool('DUPEFILTER_DEBUG') 70 | return cls(server, key=key, debug=debug) 71 | 72 | @classmethod 73 | def from_crawler(cls, crawler): 74 | """Returns instance from crawler. 75 | 76 | Parameters 77 | ---------- 78 | crawler : scrapy.crawler.Crawler 79 | 80 | Returns 81 | ------- 82 | RFPDupeFilter 83 | Instance of RFPDupeFilter. 84 | 85 | """ 86 | return cls.from_settings(crawler.settings) 87 | 88 | def request_seen(self, request): 89 | """Returns True if request was already seen. 90 | 91 | Parameters 92 | ---------- 93 | request : scrapy.http.Request 94 | 95 | Returns 96 | ------- 97 | bool 98 | 99 | """ 100 | fp = self.request_fingerprint(request) 101 | 102 | if self.bf.is_exist(fp): 103 | return True 104 | else: 105 | self.bf.add(fp) 106 | return False 107 | # This returns the number of values added, zero if already exists. 
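# The comment above and the two commented-out lines below are the stock
# scrapy-redis implementation: fingerprints were SADDed into a Redis set and a
# request counted as seen when sadd() returned 0. This fork swaps that
# exact-membership set for the Redis-bitmap Bloom filter imported from
# paperSpider.utils.bloomfilter, keeping memory bounded for very large crawls at
# the cost of a tiny false-positive rate.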
108 | # added = self.server.sadd(self.key, fp) 109 | # return added == 0 110 | 111 | def request_fingerprint(self, request): 112 | """Returns a fingerprint for a given request. 113 | 114 | Parameters 115 | ---------- 116 | request : scrapy.http.Request 117 | 118 | Returns 119 | ------- 120 | str 121 | 122 | """ 123 | return request_fingerprint(request) 124 | 125 | @classmethod 126 | def from_spider(cls, spider): 127 | settings = spider.settings 128 | server = get_redis_from_settings(settings) 129 | dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY) 130 | key = dupefilter_key % {'spider': spider.name} 131 | debug = settings.getbool('DUPEFILTER_DEBUG') 132 | return cls(server, key=key, debug=debug) 133 | 134 | def close(self, reason=''): 135 | """Delete data on close. Called by Scrapy's scheduler. 136 | 137 | Parameters 138 | ---------- 139 | reason : str, optional 140 | 141 | """ 142 | self.clear() 143 | 144 | def clear(self): 145 | """Clears fingerprints data.""" 146 | self.server.delete(self.key) 147 | 148 | def log(self, request, spider): 149 | """Logs given request. 150 | 151 | Parameters 152 | ---------- 153 | request : scrapy.http.Request 154 | spider : scrapy.spiders.Spider 155 | 156 | """ 157 | if self.debug: 158 | msg = "Filtered duplicate request: %(request)s" 159 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 160 | elif self.logdupes: 161 | msg = ("Filtered duplicate request %(request)s" 162 | " - no more duplicates will be shown" 163 | " (see DUPEFILTER_DEBUG to show all duplicates)") 164 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 165 | self.logdupes = False 166 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/picklecompat.py: -------------------------------------------------------------------------------- 1 | """A pickle wrapper module with protocol=-1 by default.""" 2 | 3 | try: 4 | import cPickle as pickle # PY2 5 | except ImportError: 6 | import pickle 7 | 8 | 9 | def loads(s): 10 | return pickle.loads(s) 11 | 12 | 13 | def dumps(obj): 14 | return pickle.dumps(obj, protocol=-1) 15 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.misc import load_object 2 | from scrapy.utils.serialize import ScrapyJSONEncoder 3 | from twisted.internet.threads import deferToThread 4 | 5 | from . import connection, defaults 6 | 7 | default_serialize = ScrapyJSONEncoder().encode 8 | 9 | 10 | class RedisPipeline(object): 11 | """Pushes serialized item into a redis list/queue 12 | 13 | Settings 14 | -------- 15 | REDIS_ITEMS_KEY : str 16 | Redis key where to store items. 17 | REDIS_ITEMS_SERIALIZER : str 18 | Object path to serializer function. 19 | 20 | """ 21 | 22 | def __init__(self, server, 23 | key=defaults.PIPELINE_KEY, 24 | serialize_func=default_serialize): 25 | """Initialize pipeline. 26 | 27 | Parameters 28 | ---------- 29 | server : StrictRedis 30 | Redis client instance. 31 | key : str 32 | Redis key where to store items. 33 | serialize_func : callable 34 | Items serializer function. 
35 | 36 | """ 37 | self.server = server 38 | self.key = key 39 | self.serialize = serialize_func 40 | 41 | @classmethod 42 | def from_settings(cls, settings): 43 | params = { 44 | 'server': connection.from_settings(settings), 45 | } 46 | if settings.get('REDIS_ITEMS_KEY'): 47 | params['key'] = settings['REDIS_ITEMS_KEY'] 48 | if settings.get('REDIS_ITEMS_SERIALIZER'): 49 | params['serialize_func'] = load_object( 50 | settings['REDIS_ITEMS_SERIALIZER'] 51 | ) 52 | 53 | return cls(**params) 54 | 55 | @classmethod 56 | def from_crawler(cls, crawler): 57 | return cls.from_settings(crawler.settings) 58 | 59 | def process_item(self, item, spider): 60 | return deferToThread(self._process_item, item, spider) 61 | 62 | def _process_item(self, item, spider): 63 | key = self.item_key(item, spider) 64 | data = self.serialize(item) 65 | self.server.rpush(key, data) 66 | return item 67 | 68 | def item_key(self, item, spider): 69 | """Returns redis key based on given spider. 70 | 71 | Override this function to use a different key depending on the item 72 | and/or spider. 73 | 74 | """ 75 | return self.key % {'spider': spider.name} 76 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/queue.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.reqser import request_to_dict, request_from_dict 2 | 3 | from . import picklecompat 4 | 5 | 6 | class Base(object): 7 | """Per-spider base queue class""" 8 | 9 | def __init__(self, server, spider, key, serializer=None): 10 | """Initialize per-spider redis queue. 11 | 12 | Parameters 13 | ---------- 14 | server : StrictRedis 15 | Redis client instance. 16 | spider : Spider 17 | Scrapy spider instance. 18 | key: str 19 | Redis key where to put and get messages. 20 | serializer : object 21 | Serializer object with ``loads`` and ``dumps`` methods. 22 | 23 | """ 24 | if serializer is None: 25 | # Backward compatibility. 26 | # TODO: deprecate pickle. 
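# picklecompat (see picklecompat.py above) is a thin pickle wrapper using
# protocol=-1; any custom SCHEDULER_SERIALIZER only has to expose loads()/dumps(),
# which the checks below enforce.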
27 | serializer = picklecompat 28 | if not hasattr(serializer, 'loads'): 29 | raise TypeError("serializer does not implement 'loads' function: %r" 30 | % serializer) 31 | if not hasattr(serializer, 'dumps'): 32 | raise TypeError("serializer '%s' does not implement 'dumps' function: %r" 33 | % serializer) 34 | 35 | self.server = server 36 | self.spider = spider 37 | self.key = key % {'spider': spider.name} 38 | self.serializer = serializer 39 | 40 | def _encode_request(self, request): 41 | """Encode a request object""" 42 | obj = request_to_dict(request, self.spider) 43 | return self.serializer.dumps(obj) 44 | 45 | def _decode_request(self, encoded_request): 46 | """Decode an request previously encoded""" 47 | obj = self.serializer.loads(encoded_request) 48 | return request_from_dict(obj, self.spider) 49 | 50 | def __len__(self): 51 | """Return the length of the queue""" 52 | raise NotImplementedError 53 | 54 | def push(self, request): 55 | """Push a request""" 56 | raise NotImplementedError 57 | 58 | def pop(self, timeout=0): 59 | """Pop a request""" 60 | raise NotImplementedError 61 | 62 | def clear(self): 63 | """Clear queue/stack""" 64 | self.server.delete(self.key) 65 | 66 | 67 | class FifoQueue(Base): 68 | """Per-spider FIFO queue""" 69 | 70 | def __len__(self): 71 | """Return the length of the queue""" 72 | return self.server.llen(self.key) 73 | 74 | def push(self, request): 75 | """Push a request""" 76 | self.server.lpush(self.key, self._encode_request(request)) 77 | 78 | def pop(self, timeout=0): 79 | """Pop a request""" 80 | if timeout > 0: 81 | data = self.server.brpop(self.key, timeout) 82 | if isinstance(data, tuple): 83 | data = data[1] 84 | else: 85 | data = self.server.rpop(self.key) 86 | if data: 87 | return self._decode_request(data) 88 | 89 | 90 | class PriorityQueue(Base): 91 | """Per-spider priority queue abstraction using redis' sorted set""" 92 | 93 | def __len__(self): 94 | """Return the length of the queue""" 95 | return self.server.zcard(self.key) 96 | 97 | def push(self, request): 98 | """Push a request""" 99 | data = self._encode_request(request) 100 | score = -request.priority 101 | # We don't use zadd method as the order of arguments change depending on 102 | # whether the class is Redis or StrictRedis, and the option of using 103 | # kwargs only accepts strings, not bytes. 104 | self.server.execute_command('ZADD', self.key, score, data) 105 | 106 | def pop(self, timeout=0): 107 | """ 108 | Pop a request 109 | timeout not support in this queue class 110 | """ 111 | # use atomic range/remove using multi/exec 112 | pipe = self.server.pipeline() 113 | pipe.multi() 114 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 115 | results, count = pipe.execute() 116 | if results: 117 | return self._decode_request(results[0]) 118 | 119 | 120 | class LifoQueue(Base): 121 | """Per-spider LIFO queue.""" 122 | 123 | def __len__(self): 124 | """Return the length of the stack""" 125 | return self.server.llen(self.key) 126 | 127 | def push(self, request): 128 | """Push a request""" 129 | self.server.lpush(self.key, self._encode_request(request)) 130 | 131 | def pop(self, timeout=0): 132 | """Pop a request""" 133 | if timeout > 0: 134 | data = self.server.blpop(self.key, timeout) 135 | if isinstance(data, tuple): 136 | data = data[1] 137 | else: 138 | data = self.server.lpop(self.key) 139 | 140 | if data: 141 | return self._decode_request(data) 142 | 143 | 144 | # TODO: Deprecate the use of these names. 
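# The aliases below are kept only for backwards compatibility; this project's
# settings.py selects the queue through its canonical path
# 'scrapy_redis.queue.PriorityQueue'.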
145 | SpiderQueue = FifoQueue 146 | SpiderStack = LifoQueue 147 | SpiderPriorityQueue = PriorityQueue 148 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/scheduler.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import six 3 | 4 | from scrapy.utils.misc import load_object 5 | 6 | from . import connection, defaults 7 | 8 | 9 | # TODO: add SCRAPY_JOB support. 10 | class Scheduler(object): 11 | """Redis-based scheduler 12 | 13 | Settings 14 | -------- 15 | SCHEDULER_PERSIST : bool (default: False) 16 | Whether to persist or clear redis queue. 17 | SCHEDULER_FLUSH_ON_START : bool (default: False) 18 | Whether to flush redis queue on start. 19 | SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0) 20 | How many seconds to wait before closing if no message is received. 21 | SCHEDULER_QUEUE_KEY : str 22 | Scheduler redis key. 23 | SCHEDULER_QUEUE_CLASS : str 24 | Scheduler queue class. 25 | SCHEDULER_DUPEFILTER_KEY : str 26 | Scheduler dupefilter redis key. 27 | SCHEDULER_DUPEFILTER_CLASS : str 28 | Scheduler dupefilter class. 29 | SCHEDULER_SERIALIZER : str 30 | Scheduler serializer. 31 | 32 | """ 33 | 34 | def __init__(self, server, 35 | persist=False, 36 | flush_on_start=False, 37 | queue_key=defaults.SCHEDULER_QUEUE_KEY, 38 | queue_cls=defaults.SCHEDULER_QUEUE_CLASS, 39 | dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, 40 | dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, 41 | idle_before_close=0, 42 | serializer=None): 43 | """Initialize scheduler. 44 | 45 | Parameters 46 | ---------- 47 | server : Redis 48 | The redis server instance. 49 | persist : bool 50 | Whether to flush requests when closing. Default is False. 51 | flush_on_start : bool 52 | Whether to flush requests on start. Default is False. 53 | queue_key : str 54 | Requests queue key. 55 | queue_cls : str 56 | Importable path to the queue class. 57 | dupefilter_key : str 58 | Duplicates filter key. 59 | dupefilter_cls : str 60 | Importable path to the dupefilter class. 61 | idle_before_close : int 62 | Timeout before giving up. 63 | 64 | """ 65 | if idle_before_close < 0: 66 | raise TypeError("idle_before_close cannot be negative") 67 | 68 | self.server = server 69 | self.persist = persist 70 | self.flush_on_start = flush_on_start 71 | self.queue_key = queue_key 72 | self.queue_cls = queue_cls 73 | self.dupefilter_cls = dupefilter_cls 74 | self.dupefilter_key = dupefilter_key 75 | self.idle_before_close = idle_before_close 76 | self.serializer = serializer 77 | self.stats = None 78 | 79 | def __len__(self): 80 | return len(self.queue) 81 | 82 | @classmethod 83 | def from_settings(cls, settings): 84 | kwargs = { 85 | 'persist': settings.getbool('SCHEDULER_PERSIST'), 86 | 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'), 87 | 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'), 88 | } 89 | 90 | # If these values are missing, it means we want to use the defaults. 91 | optional = { 92 | # TODO: Use custom prefixes for this settings to note that are 93 | # specific to scrapy-redis. 94 | 'queue_key': 'SCHEDULER_QUEUE_KEY', 95 | 'queue_cls': 'SCHEDULER_QUEUE_CLASS', 96 | 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY', 97 | # We use the default setting name to keep compatibility. 
98 | 'dupefilter_cls': 'DUPEFILTER_CLASS', 99 | 'serializer': 'SCHEDULER_SERIALIZER', 100 | } 101 | for name, setting_name in optional.items(): 102 | val = settings.get(setting_name) 103 | if val: 104 | kwargs[name] = val 105 | 106 | # Support serializer as a path to a module. 107 | if isinstance(kwargs.get('serializer'), six.string_types): 108 | kwargs['serializer'] = importlib.import_module(kwargs['serializer']) 109 | 110 | server = connection.from_settings(settings) 111 | # Ensure the connection is working. 112 | server.ping() 113 | 114 | return cls(server=server, **kwargs) 115 | 116 | @classmethod 117 | def from_crawler(cls, crawler): 118 | instance = cls.from_settings(crawler.settings) 119 | # FIXME: for now, stats are only supported from this constructor 120 | instance.stats = crawler.stats 121 | return instance 122 | 123 | def open(self, spider): 124 | self.spider = spider 125 | 126 | try: 127 | self.queue = load_object(self.queue_cls)( 128 | server=self.server, 129 | spider=spider, 130 | key=self.queue_key % {'spider': spider.name}, 131 | serializer=self.serializer, 132 | ) 133 | except TypeError as e: 134 | raise ValueError("Failed to instantiate queue class '%s': %s", 135 | self.queue_cls, e) 136 | 137 | self.df = load_object(self.dupefilter_cls).from_spider(spider) 138 | 139 | if self.flush_on_start: 140 | self.flush() 141 | # notice if there are requests already in the queue to resume the crawl 142 | if len(self.queue): 143 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 144 | 145 | def close(self, reason): 146 | if not self.persist: 147 | self.flush() 148 | 149 | def flush(self): 150 | self.df.clear() 151 | self.queue.clear() 152 | 153 | def enqueue_request(self, request): 154 | # 通信 从 redis 获取 url 并放入到队列中 155 | import redis 156 | import json 157 | import scrapy 158 | 159 | rd = redis.Redis("127.0.0.1", decode_responses=True) 160 | # 先检查指定的 redis 队列中是否有 url 161 | list_name = "baidu:new_urls" 162 | while rd.llen(list_name): 163 | data = json.loads(rd.lpop(list_name)) 164 | callback_func = getattr(self.spider, data[2]) 165 | req = scrapy.Request(url=data[0], dont_filter=False, callback=callback_func, priority=data[1]) 166 | self.queue.push(req) 167 | 168 | if not request.dont_filter and self.df.request_seen(request): 169 | self.df.log(request, self.spider) 170 | return False 171 | if self.stats: 172 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 173 | self.queue.push(request) 174 | return True 175 | 176 | def next_request(self): 177 | block_pop_timeout = self.idle_before_close 178 | request = self.queue.pop(block_pop_timeout) 179 | if request and self.stats: 180 | self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) 181 | return request 182 | 183 | def has_pending_requests(self): 184 | return len(self) > 0 185 | -------------------------------------------------------------------------------- /paperSpider/scrapy_redis/spiders.py: -------------------------------------------------------------------------------- 1 | from scrapy import signals 2 | from scrapy.exceptions import DontCloseSpider 3 | from scrapy.spiders import Spider, CrawlSpider 4 | 5 | from . import connection, defaults 6 | from .utils import bytes_to_str 7 | 8 | 9 | class RedisMixin(object): 10 | """Mixin class to implement reading urls from a redis queue.""" 11 | redis_key = None 12 | redis_batch_size = None 13 | redis_encoding = None 14 | 15 | # Redis client placeholder. 
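# Populated lazily by setup_redis() below from the crawler settings; until that
# runs, start_requests()/next_requests() have no Redis connection to read from.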
16 |     server = None
17 | 
18 |     def start_requests(self):
19 |         """Returns a batch of start requests from redis."""
20 |         return self.next_requests()
21 | 
22 |     def setup_redis(self, crawler=None):
23 |         """Set up the redis connection and the idle signal.
24 | 
25 |         This should be called after the spider has set its crawler object.
26 |         """
27 |         if self.server is not None:
28 |             return
29 | 
30 |         if crawler is None:
31 |             # We allow optional crawler argument to keep backwards
32 |             # compatibility.
33 |             # XXX: Raise a deprecation warning.
34 |             crawler = getattr(self, 'crawler', None)
35 | 
36 |         if crawler is None:
37 |             raise ValueError("crawler is required")
38 | 
39 |         settings = crawler.settings
40 | 
41 |         if self.redis_key is None:
42 |             self.redis_key = settings.get(
43 |                 'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
44 |             )
45 | 
46 |         self.redis_key = self.redis_key % {'name': self.name}
47 | 
48 |         if not self.redis_key.strip():
49 |             raise ValueError("redis_key must not be empty")
50 | 
51 |         if self.redis_batch_size is None:
52 |             # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
53 |             self.redis_batch_size = settings.getint(
54 |                 'REDIS_START_URLS_BATCH_SIZE',
55 |                 settings.getint('CONCURRENT_REQUESTS'),
56 |             )
57 | 
58 |         try:
59 |             self.redis_batch_size = int(self.redis_batch_size)
60 |         except (TypeError, ValueError):
61 |             raise ValueError("redis_batch_size must be an integer")
62 | 
63 |         if self.redis_encoding is None:
64 |             self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)
65 | 
66 |         self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
67 |                          "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
68 |                          self.__dict__)
69 | 
70 |         self.server = connection.from_settings(crawler.settings)
71 |         # The idle signal is called when the spider has no requests left,
72 |         # that's when we will schedule new requests from redis queue
73 |         crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
74 | 
75 |     def next_requests(self):
76 |         """Yields requests to be scheduled, if any are available in redis."""
77 |         use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
78 |         fetch_one = self.server.spop if use_set else self.server.lpop
79 |         # XXX: Do we need to use a timeout here?
80 |         found = 0
81 |         # TODO: Use redis pipeline execution.
82 |         while found < self.redis_batch_size:
83 |             data = fetch_one(self.redis_key)
84 |             if not data:
85 |                 # Queue empty.
86 |                 break
87 |             req = self.make_request_from_data(data)
88 |             if req:
89 |                 yield req
90 |                 found += 1
91 |             else:
92 |                 self.logger.debug("Request not made from data: %r", data)
93 | 
94 |         if found:
95 |             self.logger.debug("Read %s requests from '%s'", found, self.redis_key)
96 | 
97 |     def make_request_from_data(self, data):
98 |         """Returns a Request instance from data coming from Redis.
99 | 
100 |         By default, ``data`` is an encoded URL. You can override this method to
101 |         provide your own message decoding.
102 | 
103 |         Parameters
104 |         ----------
105 |         data : bytes
106 |             Message from redis.
107 | 
108 |         """
109 |         url = bytes_to_str(data, self.redis_encoding)
110 |         return self.make_requests_from_url(url)
111 | 
112 |     def schedule_next_requests(self):
113 |         """Schedules a request if available."""
114 |         # TODO: While there is capacity, schedule a batch of redis requests.
115 |         for req in self.next_requests():
116 |             self.crawler.engine.crawl(req, spider=self)
117 | 
118 |     def spider_idle(self):
119 |         """Schedules a request if available, otherwise waits."""
120 |         # XXX: Handle a sentinel to close the spider.
121 |         self.schedule_next_requests()
122 |         raise DontCloseSpider
123 | 
124 | 
125 | class RedisSpider(RedisMixin, Spider):
126 |     """Spider that reads urls from redis queue when idle.
127 | 
128 |     Attributes
129 |     ----------
130 |     redis_key : str (default: REDIS_START_URLS_KEY)
131 |         Redis key where to fetch start URLs from.
132 |     redis_batch_size : int (default: CONCURRENT_REQUESTS)
133 |         Number of messages to fetch from redis on each attempt.
134 |     redis_encoding : str (default: REDIS_ENCODING)
135 |         Encoding to use when decoding messages from redis queue.
136 | 
137 |     Settings
138 |     --------
139 |     REDIS_START_URLS_KEY : str (default: "<spider_name>:start_urls")
140 |         Default Redis key where to fetch start URLs from.
141 |     REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
142 |         Default number of messages to fetch from redis on each attempt.
143 |     REDIS_START_URLS_AS_SET : bool (default: False)
144 |         Use SET operations to retrieve messages from the redis queue. If False,
145 |         the messages are retrieved using the LPOP command.
146 |     REDIS_ENCODING : str (default: "utf-8")
147 |         Default encoding to use when decoding messages from redis queue.
148 | 
149 |     """
150 | 
151 |     @classmethod
152 |     def from_crawler(cls, crawler, *args, **kwargs):
153 |         obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs)
154 |         obj.setup_redis(crawler)
155 |         return obj
156 | 
157 | 
158 | class RedisCrawlSpider(RedisMixin, CrawlSpider):
159 |     """Spider that reads urls from redis queue when idle.
160 | 
161 |     Attributes
162 |     ----------
163 |     redis_key : str (default: REDIS_START_URLS_KEY)
164 |         Redis key where to fetch start URLs from.
165 |     redis_batch_size : int (default: CONCURRENT_REQUESTS)
166 |         Number of messages to fetch from redis on each attempt.
167 |     redis_encoding : str (default: REDIS_ENCODING)
168 |         Encoding to use when decoding messages from redis queue.
169 | 
170 |     Settings
171 |     --------
172 |     REDIS_START_URLS_KEY : str (default: "<spider_name>:start_urls")
173 |         Default Redis key where to fetch start URLs from.
174 |     REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
175 |         Default number of messages to fetch from redis on each attempt.
176 |     REDIS_START_URLS_AS_SET : bool (default: True)
177 |         Use SET operations to retrieve messages from the redis queue.
178 |     REDIS_ENCODING : str (default: "utf-8")
179 |         Default encoding to use when decoding messages from redis queue.
180 | 
181 |     """
182 | 
183 |     @classmethod
184 |     def from_crawler(cls, crawler, *args, **kwargs):
185 |         obj = super(RedisCrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
186 |         obj.setup_redis(crawler)
187 |         return obj
188 | 
--------------------------------------------------------------------------------
/paperSpider/scrapy_redis/utils.py:
--------------------------------------------------------------------------------
1 | import six
2 | 
3 | 
4 | def bytes_to_str(s, encoding='utf-8'):
5 |     """Returns a str if a bytes object is given."""
6 |     if six.PY3 and isinstance(s, bytes):
7 |         return s.decode(encoding)
8 |     return s
9 | 
--------------------------------------------------------------------------------
/paperSpider/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time    : 2020-03-23 12:18
3 | # @Author  : beking
4 | 
5 | import redis
6 | redis_cli = redis.StrictRedis()
7 | redis_cli.incr("baidu_count")
8 | 
--------------------------------------------------------------------------------
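The listings above show the two integration points of the bundled `scrapy_redis` package: the scheduler reads its behaviour from Scrapy settings in `from_settings`, and its customized `enqueue_request` drains the redis list `baidu:new_urls`, where every entry is a JSON array of `[url, priority, callback_name]`. The two sketches below illustrate that wiring; any name not present in the code above (the dupefilter class path, the example URL, the `parse` callback, the spider name) is an assumption, not a value taken from the repository.

A hedged sketch of the Scrapy settings that plug the bundled scheduler into the project. Only the `SCHEDULER_*` and `DUPEFILTER_CLASS` setting names come from the scheduler code above; the dotted class paths assume the bundled `scrapy_redis` package is importable under that name.

```python
# settings.py (sketch, assumptions noted inline)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # assumed import path of the bundled scheduler
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # assumed class name inside the bundled dupefilter.py
SCHEDULER_PERSIST = True            # keep the redis queue and dupefilter between runs
SCHEDULER_FLUSH_ON_START = False    # do not wipe pending requests on start
SCHEDULER_IDLE_BEFORE_CLOSE = 0     # seconds to block when popping from the queue
```

A minimal sketch of pushing work into the list drained by `Scheduler.enqueue_request`, mirroring the format it parses (`data[0]` url, `data[1]` priority, `data[2]` callback name). The URL, priority, callback name, and spider name below are placeholders.

```python
import json
import redis

# Same connection style as the scheduler itself: local redis, decoded responses.
rd = redis.Redis("127.0.0.1", decode_responses=True)

# [url, priority, callback_name] -- the callback must be a method on the running spider.
task = ["https://example.org/paper-listing", 0, "parse"]
rd.lpush("baidu:new_urls", json.dumps(task))

# Seeding a RedisSpider instead goes through its start-urls key
# (default "<spider_name>:start_urls"); assuming the spider is named "baidu":
rd.lpush("baidu:start_urls", "https://example.org/seed-page")
```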