├── LICENSE ├── README.md ├── procedure.png ├── spider ├── __init__.py ├── concurrent │ ├── __init__.py │ ├── threads_inst │ │ ├── __init__.py │ │ ├── base.py │ │ ├── fetch.py │ │ ├── parse.py │ │ ├── proxies.py │ │ └── save.py │ └── threads_pool.py ├── instances │ ├── __init__.py │ ├── inst_fetch.py │ ├── inst_parse.py │ ├── inst_proxies.py │ └── inst_save.py └── utilities │ ├── __init__.py │ ├── cfilter.py │ ├── cresult.py │ ├── ctask.py │ └── functions.py └── test.py /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2017 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PSpider 2 | 3 | A simple web spider framework written in Python, requiring Python 3.8+ 4 | 5 | ### Features of PSpider 6 | 7 | 1. Supports multi-threaded crawling (using threading) 8 | 2. Supports crawling through proxies (using threading and queue) 9 | 3. Defines some utility functions and classes, for example: UrlFilter, get_string_num, etc 10 | 4. Fewer lines of code, easier to read, understand and extend 11 | 12 | ### Modules of PSpider 13 | 14 | 1. utilities module: defines utility functions and classes for the multi-threaded spider 15 | 2. instances module: defines the Fetcher, Parser, Saver and Proxieser classes for the multi-threaded spider 16 | 3. concurrent module: defines the WebSpider frame (thread pool) of the multi-threaded spider 17 | 18 | ### Procedure of PSpider 19 | 20 |  21 | ①: Fetchers get a TaskFetch from QueueFetch and make a request based on this task 22 | ②: The result (a TaskParse) of ① is put into QueueParse, so the Parser can get tasks from it 23 | ③: The Parser gets a task from QueueParse and parses the content to get new TaskFetchs and a TaskSave 24 | ④: The new TaskFetchs are put into QueueFetch, so the Fetchers can get tasks from it again 25 | ⑤: The TaskSave is put into QueueSave, so the Saver can get tasks from it 26 | ⑥: The Saver gets a TaskSave from QueueSave and saves the item to a filesystem or database 27 | ⑦: The Proxieser gets proxies from the web or a database and puts them into QueueProxies 28 | ⑧: Fetchers get proxies from QueueProxies if needed and make requests through these proxies 29 | 30 | ### Tutorials of PSpider 31 | 32 | **Installation: the first method is recommended** 33 | (1) Copy the "spider" directory to your project directory, then `import spider` 34 | (2) Install spider into your Python environment using `python3 setup.py install` 35 | 36 | **See test.py** 37 |
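**Quick start (a sketch, not the real test.py):** test.py is referenced above but is not included in this dump, so below is a minimal usage sketch assembled only from the classes shown in this repository (spider.WebSpider, spider.Fetcher, spider.Parser, spider.Saver, spider.UrlFilter and the task/result helpers). It assumes the third-party `requests` library is available, and it assumes the TaskFetch/TaskParse/TaskSave constructors and attributes use the field names shown in the queue comments of spider/concurrent/threads_pool.py (priority, keys, deep, url, content/item/repeat); spider/utilities/ctask.py is truncated in this dump, so check the real signatures (and the real test.py) before relying on this sketch.

```python
# quick_start.py -- illustrative only, not the project's test.py
import re
import requests  # assumption: requests is installed and used for the HTTP calls
import spider


class MyFetcher(spider.Fetcher):
    def url_fetch(self, task_fetch, proxies=None):
        # fetch one page; exceptions are caught by Fetcher.working(), which
        # decides between "repeat" (state_code=0) and "fail" (state_code=-1)
        resp = requests.get(task_fetch.url, proxies=proxies, timeout=10)
        resp.raise_for_status()
        # assumption: TaskParse(priority, keys, deep, url, content), as in the queue comments
        task_parse = spider.TaskParse(priority=task_fetch.priority, keys=task_fetch.keys, deep=task_fetch.deep, url=task_fetch.url, content=resp.text)
        return spider.ResultFetch(state_code=1, state_proxies=1, task_parse=task_parse)


class MyParser(spider.Parser):
    def htm_parse(self, task_parse):
        # keep the page title as the item to save
        match = re.search(r"<title>(.*?)</title>", task_parse.content, flags=re.IGNORECASE | re.DOTALL)
        title = match.group(1).strip() if match else ""
        # assumption: TaskSave(priority, keys, deep, url, item), as in the queue comments
        task_save = spider.TaskSave(priority=task_parse.priority, keys=task_parse.keys, deep=task_parse.deep, url=task_parse.url, item=title)

        # follow absolute links while deep > 0; the UrlFilter below decides what is actually queued
        task_fetch_list = []
        if task_parse.deep > 0:
            for url in re.findall(r'href="(https?://[^"]+)"', task_parse.content):
                task_fetch_list.append(spider.TaskFetch(priority=task_parse.priority + 1, keys=task_parse.keys, deep=task_parse.deep - 1, url=url))
        return spider.ResultParse(state_code=1, task_fetch_list=task_fetch_list, task_save=task_save)


class MySaver(spider.Saver):
    def item_save(self, task_save):
        # print instead of writing to a file or database, to keep the sketch short
        print(task_save.url, "->", task_save.item)
        return spider.ResultSave(state_code=1)


if __name__ == "__main__":
    # only crawl example.com; white_patterns takes compiled regexes (see spider/utilities/cfilter.py)
    url_filter = spider.UrlFilter(white_patterns=(re.compile(r"^https?://(www\.)?example\.com"),))
    web_spider = spider.WebSpider(MyFetcher(sleep_time=0, max_repeat=3), MyParser(), MySaver(), url_filter=url_filter)
    web_spider.set_start_task(spider.TaskFetch(priority=0, keys={}, deep=1, url="https://www.example.com/"))
    web_spider.start_working(fetchers_num=5)
    web_spider.wait_for_finished()
```

To crawl through proxies, the same pattern applies: subclass spider.Proxieser, override proxies_get() to return a ResultProxies whose proxies_list holds dicts of the form {"http": "http://auth@ip:port", "https": "https://auth@ip:port"} (the format noted in threads_pool.py), and pass the instance as the proxieser argument of spider.WebSpider.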
38 | ### TodoList 39 | 40 | 1. More Demos 41 | 2. Distributed Spider 42 | 3. Execute JavaScript 43 | 44 | ### If you have any questions or advice, please submit "Issues" or "Pull requests" 45 | -------------------------------------------------------------------------------- /procedure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xianhu/PSpider/19c3af7a3ec340137c22b8c27b66673f42feccbd/procedure.png -------------------------------------------------------------------------------- /spider/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | import WebSpider, instances and utilities for web_spider 5 | """ 6 | 7 | __version__ = "4.0.1" 8 | 9 | from .utilities import * 10 | from .concurrent import TPEnum, WebSpider 11 | from .instances import Fetcher, Parser, Saver, Proxieser 12 | -------------------------------------------------------------------------------- /spider/concurrent/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | define ThreadPool as WebSpider 5 | """ 6 | 7 | from .threads_inst import TPEnum 8 | from .threads_pool import ThreadPool as WebSpider 9 | -------------------------------------------------------------------------------- /spider/concurrent/threads_inst/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | define thread instances of url_fetch, htm_parse, item_save and proxies for threads_pool 5 | """ 6 | 7 | from .base import TPEnum, MonitorThread 8 | from .fetch import FetchThread 9 | from .parse import ParseThread 10 | from .save import SaveThread 11 | from .proxies import ProxiesThread 12 | -------------------------------------------------------------------------------- /spider/concurrent/threads_inst/base.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | base.py by xianhu 5 | """ 6 | 7 | import enum 8 | import time 9 | import queue 10 | import logging 11 | import threading 12 | 13 | 14 | class TPEnum(enum.Enum): 15 | """ 16 | enum of TPEnum, define status of threads_pool 17 | """ 18 | URL_FETCH = "url_fetch" 19 | URL_FETCH_RUN = "url_fetch_run" 20 | URL_FETCH_NOT = "url_fetch_not" 21
| URL_FETCH_SUCC = "url_fetch_succ" 22 | URL_FETCH_FAIL = "url_fetch_fail" 23 | 24 | HTM_PARSE = "htm_parse" 25 | HTM_PARSE_RUN = "htm_parse_run" 26 | HTM_PARSE_NOT = "htm_parse_not" 27 | HTM_PARSE_SUCC = "htm_parse_succ" 28 | HTM_PARSE_FAIL = "htm_parse_fail" 29 | 30 | ITEM_SAVE = "item_save" 31 | ITEM_SAVE_RUN = "item_save_run" 32 | ITEM_SAVE_NOT = "item_save_not" 33 | ITEM_SAVE_SUCC = "item_save_succ" 34 | ITEM_SAVE_FAIL = "item_save_fail" 35 | 36 | PROXIES = "proxies" 37 | PROXIES_LEFT = "proxies_left" 38 | PROXIES_FAIL = "proxies_fail" 39 | 40 | 41 | class BaseThread(threading.Thread): 42 | """ 43 | class of BaseThread, as base class of each thread 44 | """ 45 | 46 | def __init__(self, name, worker, pool): 47 | """ 48 | constructor 49 | """ 50 | threading.Thread.__init__(self, name=name) 51 | self._worker = worker 52 | self._pool = pool 53 | return 54 | 55 | def run(self): 56 | """ 57 | rewrite auto-running function 58 | """ 59 | while True: 60 | try: 61 | if not self.working(): 62 | break 63 | except queue.Empty: 64 | if self._pool.is_ready_to_finish(): 65 | break 66 | return 67 | 68 | def working(self): 69 | """ 70 | procedure of each thread, return True to continue, False to stop 71 | """ 72 | raise NotImplementedError 73 | 74 | 75 | # =============================================================================================================================== 76 | def init_monitor(self, name, pool): 77 | """ 78 | constructor of MonitorThread 79 | """ 80 | BaseThread.__init__(self, name, None, pool) 81 | self._init_time = time.time() 82 | 83 | self._last_fetch_num = 0 84 | self._last_parse_num = 0 85 | self._last_save_num = 0 86 | return 87 | 88 | 89 | def work_monitor(self): 90 | """ 91 | procedure of MonitorThread, auto running, return False if you need stop thread 92 | """ 93 | time.sleep(5) 94 | info_list = [] 95 | 96 | fetch_run = self._pool.get_number_dict(TPEnum.URL_FETCH_RUN) 97 | fetch_not = self._pool.get_number_dict(TPEnum.URL_FETCH_NOT) 98 | fetch_succ = self._pool.get_number_dict(TPEnum.URL_FETCH_SUCC) 99 | fetch_fail = self._pool.get_number_dict(TPEnum.URL_FETCH_FAIL) 100 | fetch_temp = (fetch_succ + fetch_fail) - self._last_fetch_num 101 | self._last_fetch_num = fetch_succ + fetch_fail 102 | info_list.append(f"fetch: [RUN={fetch_run}, NOT={fetch_not}, SUCC={fetch_succ}, FAIL={fetch_fail}, {fetch_temp}/5s];") 103 | 104 | parse_run = self._pool.get_number_dict(TPEnum.HTM_PARSE_RUN) 105 | parse_not = self._pool.get_number_dict(TPEnum.HTM_PARSE_NOT) 106 | parse_succ = self._pool.get_number_dict(TPEnum.HTM_PARSE_SUCC) 107 | parse_fail = self._pool.get_number_dict(TPEnum.HTM_PARSE_FAIL) 108 | parse_temp = (parse_succ + parse_fail) - self._last_parse_num 109 | self._last_parse_num = parse_succ + parse_fail 110 | info_list.append(f"parse: [RUN={parse_run}, NOT={parse_not}, SUCC={parse_succ}, FAIL={parse_fail}, {parse_temp}/5s];") 111 | 112 | save_run = self._pool.get_number_dict(TPEnum.ITEM_SAVE_RUN) 113 | save_not = self._pool.get_number_dict(TPEnum.ITEM_SAVE_NOT) 114 | save_succ = self._pool.get_number_dict(TPEnum.ITEM_SAVE_SUCC) 115 | save_fail = self._pool.get_number_dict(TPEnum.ITEM_SAVE_FAIL) 116 | save_temp = (save_succ + save_fail) - self._last_save_num 117 | self._last_save_num = save_succ + save_fail 118 | info_list.append(f"save: [RUN={save_run}, NOT={save_not}, SUCC={save_succ}, FAIL={save_fail}, {save_temp}/5s];") 119 | 120 | if self._pool.get_proxies_flag(): 121 | proxies_left = self._pool.get_number_dict(TPEnum.PROXIES_LEFT) 122 | proxies_fail = 
self._pool.get_number_dict(TPEnum.PROXIES_FAIL) 123 | info_list.append(f"proxies: [LEFT={proxies_left}, FAIL={proxies_fail}];") 124 | info_list.append(f"total_seconds={int(time.time() - self._init_time)}") 125 | 126 | logging.warning(" ".join(info_list)) 127 | return not self._pool.is_ready_to_finish() 128 | 129 | 130 | # define class of MonitorThread 131 | MonitorThread = type("MonitorThread", (BaseThread,), dict(__init__=init_monitor, working=work_monitor)) 132 | -------------------------------------------------------------------------------- /spider/concurrent/threads_inst/fetch.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | fetch.py by xianhu 5 | """ 6 | 7 | import logging 8 | 9 | from .base import TPEnum, BaseThread 10 | from ...utilities import TaskFetch, ResultFetch 11 | 12 | 13 | class FetchThread(BaseThread): 14 | """ 15 | class of FetchThread, as the subclass of BaseThread 16 | """ 17 | 18 | def __init__(self, name, worker, pool): 19 | """ 20 | constructor, add proxies to this thread 21 | """ 22 | super().__init__(name, worker, pool) 23 | self._proxies = None 24 | return 25 | 26 | def working(self): 27 | """ 28 | procedure of fetching, auto running and return True 29 | """ 30 | # ----*---- 31 | if self._pool.get_proxies_flag() and (not self._proxies): 32 | self._proxies = self._pool.get_a_task(TPEnum.PROXIES) 33 | 34 | # ----1---- 35 | task: TaskFetch = self._pool.get_a_task(TPEnum.URL_FETCH) 36 | 37 | # ----2---- 38 | result: ResultFetch = self._worker.working(task, proxies=self._proxies) 39 | 40 | # ----3---- 41 | if result.state_code > 0: 42 | self._pool.update_number_dict(TPEnum.URL_FETCH_SUCC, +1) 43 | self._pool.add_a_task(TPEnum.HTM_PARSE, result.task_parse) 44 | elif result.state_code == 0: 45 | self._pool.add_a_task(TPEnum.URL_FETCH, TaskFetch.from_task_fetch(task)) 46 | logging.warning("%s repeat: %s, %s", result.excep_class, result.excep_string, str(task)) 47 | else: 48 | self._pool.update_number_dict(TPEnum.URL_FETCH_FAIL, +1) 49 | logging.error("%s error: %s, %s", result.excep_class, result.excep_string, str(task)) 50 | 51 | # ----4---- 52 | self._pool.finish_a_task(TPEnum.URL_FETCH) 53 | 54 | # ----*---- 55 | if self._pool.get_proxies_flag() and self._proxies and (result.state_proxies <= 0): 56 | if result.state_proxies == 0: 57 | self._pool.add_a_task(TPEnum.PROXIES, self._proxies) 58 | else: 59 | self._pool.update_number_dict(TPEnum.PROXIES_FAIL, +1) 60 | self._pool.finish_a_task(TPEnum.PROXIES) 61 | self._proxies = None 62 | 63 | # return 64 | return True 65 | -------------------------------------------------------------------------------- /spider/concurrent/threads_inst/parse.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | parse.py by xianhu 5 | """ 6 | 7 | import logging 8 | 9 | from .base import TPEnum, BaseThread 10 | from ...utilities import TaskParse, ResultParse 11 | 12 | 13 | class ParseThread(BaseThread): 14 | """ 15 | class of ParseThread, as the subclass of BaseThread 16 | """ 17 | 18 | def working(self): 19 | """ 20 | procedure of parsing, auto running and return True 21 | """ 22 | # ----1---- 23 | task: TaskParse = self._pool.get_a_task(TPEnum.HTM_PARSE) 24 | 25 | # ----2---- 26 | result: ResultParse = self._worker.working(task) 27 | 28 | # ----3---- 29 | if result.state_code > 0: 30 | self._pool.update_number_dict(TPEnum.HTM_PARSE_SUCC, +1) 31 | for task_fetch in result.task_fetch_list: 32 
| self._pool.add_a_task(TPEnum.URL_FETCH, task_fetch) 33 | if result.task_save is not None: 34 | self._pool.add_a_task(TPEnum.ITEM_SAVE, result.task_save) 35 | else: 36 | self._pool.update_number_dict(TPEnum.HTM_PARSE_FAIL, +1) 37 | logging.error("%s error: %s, %s", result.excep_class, result.excep_string, str(task)) 38 | 39 | # ----4---- 40 | self._pool.finish_a_task(TPEnum.HTM_PARSE) 41 | 42 | # return 43 | return True 44 | -------------------------------------------------------------------------------- /spider/concurrent/threads_inst/proxies.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | proxies.py by xianhu 5 | """ 6 | 7 | import logging 8 | 9 | from .base import TPEnum, BaseThread 10 | from ...utilities import ResultProxies 11 | 12 | 13 | class ProxiesThread(BaseThread): 14 | """ 15 | class of ProxiesThread, as the subclass of BaseThread 16 | """ 17 | 18 | def working(self): 19 | """ 20 | procedure of proxies, auto running, return False if you need stop thread 21 | """ 22 | # ----2---- 23 | result: ResultProxies = self._worker.working() 24 | 25 | # ----3---- 26 | if result.state_code > 0: 27 | for proxies in result.proxies_list: 28 | self._pool.add_a_task(TPEnum.PROXIES, proxies) 29 | else: 30 | logging.warning("%s warning: %s", result.excep_class, result.excep_string) 31 | 32 | # return 33 | return not self._pool.is_ready_to_finish() 34 | -------------------------------------------------------------------------------- /spider/concurrent/threads_inst/save.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | save.py by xianhu 5 | """ 6 | 7 | import logging 8 | 9 | from .base import TPEnum, BaseThread 10 | from ...utilities import TaskSave, ResultSave 11 | 12 | 13 | class SaveThread(BaseThread): 14 | """ 15 | class of SaveThread, as the subclass of BaseThread 16 | """ 17 | 18 | def working(self): 19 | """ 20 | procedure of saving, auto running and return True 21 | """ 22 | # ----1---- 23 | task: TaskSave = self._pool.get_a_task(TPEnum.ITEM_SAVE) 24 | 25 | # ----2---- 26 | result: ResultSave = self._worker.working(task) 27 | 28 | # ----3---- 29 | if result.state_code > 0: 30 | self._pool.update_number_dict(TPEnum.ITEM_SAVE_SUCC, +1) 31 | else: 32 | self._pool.update_number_dict(TPEnum.ITEM_SAVE_FAIL, +1) 33 | logging.error("%s error: %s, %s", result.excep_class, result.excep_string, str(task)) 34 | 35 | # ----4---- 36 | self._pool.finish_a_task(TPEnum.ITEM_SAVE) 37 | 38 | # return 39 | return True 40 | -------------------------------------------------------------------------------- /spider/concurrent/threads_pool.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | threads_pool.py by xianhu 5 | """ 6 | 7 | import copy 8 | import queue 9 | import logging 10 | import threading 11 | 12 | from .threads_inst import * 13 | from ..instances import Fetcher, Parser, Saver, Proxieser 14 | from ..utilities import UrlFilter, TaskFetch 15 | 16 | 17 | class ThreadPool(object): 18 | """ 19 | class of ThreadPool 20 | """ 21 | 22 | def __init__( 23 | self, fetcher: Fetcher, parser: Parser = None, saver: Saver = None, proxieser: Proxieser = None, 24 | url_filter: UrlFilter = None, queue_parse_size=-1, queue_save_size=-1, queue_proxies_size=-1, 25 | ): 26 | """ 27 | constructor, queue_parse_size/queue_save_size/queue_proxies_size are the maximum size of each queue, -1 to no 
limition 28 | """ 29 | self._inst_fetcher = fetcher # fetcher instance, subclass of Fetcher 30 | self._inst_parser = parser # parser instance, subclass of Parser or None 31 | self._inst_saver = saver # saver instance, subclass of Saver or None 32 | self._inst_proxieser = proxieser # proxieser instance, subclass of Proxieser or None 33 | self._url_filter: UrlFilter = url_filter # default: None, also can be UrlFilter() 34 | 35 | self._thread_fetcher_list = [] # fetcher threads list, define length in start_working() 36 | self._thread_parser = None # parser thread, be None if no instance of parser 37 | self._thread_saver = None # saver thread, be None if no instance of saver 38 | self._thread_proxieser = None # proxieser thread, be None if no instance of proxieser 39 | self._thread_stop_flag = False # default: False, stop flag of threads 40 | 41 | self._queue_fetch = queue.PriorityQueue(-1) # TaskFetch(priority, keys, deep, url, repeat) 42 | self._queue_parse = queue.PriorityQueue(queue_parse_size) # TaskParse(priority, keys, deep, url, content) 43 | self._queue_save = queue.PriorityQueue(queue_save_size) # TaskSave(priority, keys, deep, url, item) 44 | self._queue_proxies = queue.Queue(queue_proxies_size) # {"http": "http://auth@ip:port", "https": "https://auth@ip:port"} 45 | 46 | self._number_dict = { 47 | TPEnum.URL_FETCH_RUN: 0, TPEnum.URL_FETCH_NOT: 0, TPEnum.URL_FETCH_SUCC: 0, TPEnum.URL_FETCH_FAIL: 0, 48 | TPEnum.HTM_PARSE_RUN: 0, TPEnum.HTM_PARSE_NOT: 0, TPEnum.HTM_PARSE_SUCC: 0, TPEnum.HTM_PARSE_FAIL: 0, 49 | TPEnum.ITEM_SAVE_RUN: 0, TPEnum.ITEM_SAVE_NOT: 0, TPEnum.ITEM_SAVE_SUCC: 0, TPEnum.ITEM_SAVE_FAIL: 0, 50 | TPEnum.PROXIES_LEFT: 0, TPEnum.PROXIES_FAIL: 0, 51 | } 52 | self._lock = threading.Lock() 53 | 54 | self._thread_monitor = MonitorThread("monitor", self) 55 | self._thread_monitor.setDaemon(True) 56 | self._thread_monitor.start() 57 | logging.warning("ThreadPool has been initialized") 58 | return 59 | 60 | def set_start_task(self, task_fetch: TaskFetch): 61 | """ 62 | set start task, according to a TaskFetch() 63 | """ 64 | self.add_a_task(TPEnum.URL_FETCH, task_fetch) 65 | return 66 | 67 | def start_working(self, fetchers_num=10): 68 | """ 69 | start this thread pool based on fetchers_num 70 | """ 71 | logging.warning("ThreadPool starts working: urls_num=%s, fetchers_num=%s", self.get_number_dict(TPEnum.URL_FETCH_NOT), fetchers_num) 72 | self._thread_stop_flag = False 73 | 74 | self._thread_fetcher_list = [FetchThread("fetcher-%d" % (i + 1), copy.deepcopy(self._inst_fetcher), self) for i in range(fetchers_num)] 75 | self._thread_parser = ParseThread("parser", self._inst_parser, self) if self._inst_parser else None 76 | self._thread_saver = SaveThread("saver", self._inst_saver, self) if self._inst_saver else None 77 | self._thread_proxieser = ProxiesThread("proxieser", self._inst_proxieser, self) if self._inst_proxieser else None 78 | 79 | for thread_fetcher in self._thread_fetcher_list: 80 | thread_fetcher.setDaemon(True) 81 | thread_fetcher.start() 82 | 83 | if self._thread_parser: 84 | self._thread_parser.setDaemon(True) 85 | self._thread_parser.start() 86 | 87 | if self._thread_saver: 88 | self._thread_saver.setDaemon(True) 89 | self._thread_saver.start() 90 | 91 | if self._thread_proxieser: 92 | self._thread_proxieser.setDaemon(True) 93 | self._thread_proxieser.start() 94 | 95 | logging.warning("ThreadPool starts working successfully") 96 | return 97 | 98 | def wait_for_finished(self): 99 | """ 100 | wait for finishing this thread pool 101 | """ 102 | 
logging.warning("ThreadPool waits for finishing") 103 | self._thread_stop_flag = True 104 | 105 | for _thread_fetcher in filter(lambda x: x.is_alive(), self._thread_fetcher_list): 106 | _thread_fetcher.join() 107 | 108 | if self._thread_parser and self._thread_parser.is_alive(): 109 | self._thread_parser.join() 110 | 111 | if self._thread_saver and self._thread_saver.is_alive(): 112 | self._thread_saver.join() 113 | 114 | if self._thread_monitor and self._thread_monitor.is_alive(): 115 | self._thread_monitor.join() 116 | 117 | logging.warning("ThreadPool has finished...") 118 | return self._number_dict 119 | 120 | def get_proxies_flag(self): 121 | """ 122 | get proxies flag of this thread pool 123 | """ 124 | return True if self._inst_proxieser else False 125 | 126 | def get_number_dict(self, key=None): 127 | """ 128 | get value of self._number_dict based on key 129 | """ 130 | return self._number_dict[key] if key else self._number_dict 131 | 132 | def update_number_dict(self, key, value): 133 | """ 134 | update value of self._number_dict based on key 135 | """ 136 | self._lock.acquire() 137 | self._number_dict[key] += value 138 | self._lock.release() 139 | return 140 | 141 | def is_ready_to_finish(self): 142 | """ 143 | check state of this thread pool, return True if all tasks finished and self._thread_stop_flag is True 144 | """ 145 | return False if self._number_dict[TPEnum.URL_FETCH_RUN] or self._number_dict[TPEnum.URL_FETCH_NOT] or \ 146 | self._number_dict[TPEnum.HTM_PARSE_RUN] or self._number_dict[TPEnum.HTM_PARSE_NOT] or \ 147 | self._number_dict[TPEnum.ITEM_SAVE_RUN] or self._number_dict[TPEnum.ITEM_SAVE_NOT] or \ 148 | (not self._thread_stop_flag) else True 149 | 150 | def add_a_task(self, task_type, task): 151 | """ 152 | add a task based on task_type, also for proxies 153 | """ 154 | if (task_type == TPEnum.URL_FETCH) and ((task.repeat > 0) or (not self._url_filter) or self._url_filter.check_and_add(task.url)): 155 | self._queue_fetch.put(task, block=False, timeout=None) 156 | self.update_number_dict(TPEnum.URL_FETCH_NOT, +1) 157 | elif (task_type == TPEnum.HTM_PARSE) and self._thread_parser: 158 | self._queue_parse.put(task, block=True, timeout=None) 159 | self.update_number_dict(TPEnum.HTM_PARSE_NOT, +1) 160 | elif (task_type == TPEnum.ITEM_SAVE) and self._thread_saver: 161 | self._queue_save.put(task, block=True, timeout=None) 162 | self.update_number_dict(TPEnum.ITEM_SAVE_NOT, +1) 163 | elif (task_type == TPEnum.PROXIES) and self._thread_proxieser: 164 | self._queue_proxies.put(task, block=True, timeout=None) 165 | self.update_number_dict(TPEnum.PROXIES_LEFT, +1) 166 | return 167 | 168 | def get_a_task(self, task_type): 169 | """ 170 | get a task based on task_type, also for proxies 171 | """ 172 | task = None 173 | if task_type == TPEnum.URL_FETCH: 174 | task = self._queue_fetch.get(block=True, timeout=5) 175 | self.update_number_dict(TPEnum.URL_FETCH_NOT, -1) 176 | self.update_number_dict(TPEnum.URL_FETCH_RUN, +1) 177 | elif task_type == TPEnum.HTM_PARSE: 178 | task = self._queue_parse.get(block=True, timeout=5) 179 | self.update_number_dict(TPEnum.HTM_PARSE_NOT, -1) 180 | self.update_number_dict(TPEnum.HTM_PARSE_RUN, +1) 181 | elif task_type == TPEnum.ITEM_SAVE: 182 | task = self._queue_save.get(block=True, timeout=5) 183 | self.update_number_dict(TPEnum.ITEM_SAVE_NOT, -1) 184 | self.update_number_dict(TPEnum.ITEM_SAVE_RUN, +1) 185 | elif task_type == TPEnum.PROXIES: 186 | task = self._queue_proxies.get(block=True, timeout=5) 187 | 
self.update_number_dict(TPEnum.PROXIES_LEFT, -1) 188 | return task 189 | 190 | def finish_a_task(self, task_type): 191 | """ 192 | finish a task based on task_type, also for proxies 193 | """ 194 | if task_type == TPEnum.URL_FETCH: 195 | self._queue_fetch.task_done() 196 | self.update_number_dict(TPEnum.URL_FETCH_RUN, -1) 197 | elif task_type == TPEnum.HTM_PARSE: 198 | self._queue_parse.task_done() 199 | self.update_number_dict(TPEnum.HTM_PARSE_RUN, -1) 200 | elif task_type == TPEnum.ITEM_SAVE: 201 | self._queue_save.task_done() 202 | self.update_number_dict(TPEnum.ITEM_SAVE_RUN, -1) 203 | elif task_type == TPEnum.PROXIES: 204 | self._queue_proxies.task_done() 205 | return 206 | -------------------------------------------------------------------------------- /spider/instances/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | define instances of url_fetch, htm_parse, item_save and proxies for web_spider 5 | """ 6 | 7 | from .inst_fetch import Fetcher 8 | from .inst_parse import Parser 9 | from .inst_save import Saver 10 | from .inst_proxies import Proxieser 11 | -------------------------------------------------------------------------------- /spider/instances/inst_fetch.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | inst_fetch.py by xianhu 5 | """ 6 | 7 | import time 8 | 9 | from ..utilities import TaskFetch, ResultFetch 10 | 11 | 12 | class Fetcher(object): 13 | """ 14 | class of Fetcher, must include function working() 15 | """ 16 | 17 | def __init__(self, sleep_time=0, max_repeat=3): 18 | """ 19 | constructor 20 | :param sleep_time: default 0, sleeping time before fetching 21 | :param max_repeat: default 3, maximum repeat count of fetching 22 | """ 23 | self._sleep_time = sleep_time 24 | self._max_repeat = max_repeat 25 | return 26 | 27 | def working(self, task_fetch: TaskFetch, proxies=None) -> ResultFetch: 28 | """ 29 | working function, must "try-except" and return ResultFetch() 30 | """ 31 | time.sleep(self._sleep_time) 32 | 33 | try: 34 | result_fetch = self.url_fetch(task_fetch, proxies=proxies) 35 | except Exception as excep: 36 | state_code = -1 if task_fetch.repeat >= self._max_repeat else 0 37 | kwargs = dict(excep_class=self.__class__.__name__, excep_string=str(excep)) 38 | result_fetch = ResultFetch(state_code=state_code, state_proxies=-1, task_parse=None, **kwargs) 39 | 40 | return result_fetch 41 | 42 | def url_fetch(self, task_fetch: TaskFetch, proxies=None) -> ResultFetch: 43 | """ 44 | fetch the content of an url. 
Parameters and returns refer to self.working() 45 | """ 46 | raise NotImplementedError 47 | -------------------------------------------------------------------------------- /spider/instances/inst_parse.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | inst_parse.py by xianhu 5 | """ 6 | 7 | from ..utilities import TaskParse, ResultParse 8 | 9 | 10 | class Parser(object): 11 | """ 12 | class of Parser, must include function working() 13 | """ 14 | 15 | def working(self, task_parse: TaskParse) -> ResultParse: 16 | """ 17 | working function, must "try-except" and return ResultParse() 18 | """ 19 | try: 20 | result_parse = self.htm_parse(task_parse) 21 | except Exception as excep: 22 | kwargs = dict(excep_class=self.__class__.__name__, excep_string=str(excep)) 23 | result_parse = ResultParse(state_code=-1, task_fetch_list=None, task_save=None, **kwargs) 24 | 25 | return result_parse 26 | 27 | def htm_parse(self, task_parse: TaskParse) -> ResultParse: 28 | """ 29 | parse the content of an url. Parameters and returns refer to self.working() 30 | """ 31 | raise NotImplementedError 32 | -------------------------------------------------------------------------------- /spider/instances/inst_proxies.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | inst_proxies.py by xianhu 5 | """ 6 | 7 | from ..utilities import ResultProxies 8 | 9 | 10 | class Proxieser(object): 11 | """ 12 | class of Proxieser, must include function working() 13 | """ 14 | 15 | def working(self) -> ResultProxies: 16 | """ 17 | working function, must "try-except" and return ResultProxies() 18 | """ 19 | try: 20 | result_proxies = self.proxies_get() 21 | except Exception as excep: 22 | kwargs = dict(excep_class=self.__class__.__name__, excep_string=str(excep)) 23 | result_proxies = ResultProxies(state_code=-1, proxies_list=None, **kwargs) 24 | 25 | return result_proxies 26 | 27 | def proxies_get(self) -> ResultProxies: 28 | """ 29 | get proxies from web or database. Parameters and returns refer to self.working() 30 | """ 31 | raise NotImplementedError 32 | -------------------------------------------------------------------------------- /spider/instances/inst_save.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | inst_save.py by xianhu 5 | """ 6 | 7 | from ..utilities import TaskSave, ResultSave 8 | 9 | 10 | class Saver(object): 11 | """ 12 | class of Saver, must include function working() 13 | """ 14 | 15 | def working(self, task_save: TaskSave) -> ResultSave: 16 | """ 17 | working function, must "try-except" and return ResultSave() 18 | """ 19 | try: 20 | result_save = self.item_save(task_save) 21 | except Exception as excep: 22 | kwargs = dict(excep_class=self.__class__.__name__, excep_string=str(excep)) 23 | result_save = ResultSave(state_code=-1, **kwargs) 24 | 25 | return result_save 26 | 27 | def item_save(self, task_save: TaskSave) -> ResultSave: 28 | """ 29 | save the content of an url. 
Parameters and returns refer to self.working() 30 | """ 31 | raise NotImplementedError 32 | -------------------------------------------------------------------------------- /spider/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | define utilities for web_spider 5 | """ 6 | 7 | from .functions import * 8 | from .cfilter import UrlFilter 9 | from .ctask import Task, TaskFetch, TaskParse, TaskSave 10 | from .cresult import ResultFetch, ResultParse, Result as ResultSave, ResultProxies 11 | -------------------------------------------------------------------------------- /spider/utilities/cfilter.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | cfilter.py by xianhu 5 | """ 6 | 7 | 8 | class UrlFilter(object): 9 | """ 10 | class of UrlFilter, to filter urls by regexs and set 11 | """ 12 | 13 | def __init__(self, black_patterns=tuple(), white_patterns=tuple()): 14 | """ 15 | constructor 16 | """ 17 | self._url_set = set() 18 | self._re_black_list = black_patterns 19 | self._re_white_list = white_patterns 20 | return 21 | 22 | def check(self, url): 23 | """ 24 | check the url based on re_black_list and re_white_list 25 | """ 26 | for re_black in self._re_black_list: 27 | if re_black.search(url): 28 | return False 29 | 30 | for re_white in self._re_white_list: 31 | if re_white.search(url): 32 | return True 33 | 34 | return False if self._re_white_list else True 35 | 36 | def check_and_add(self, url): 37 | """ 38 | check whether url is in set, and add url to it 39 | """ 40 | result = False 41 | if self.check(url): 42 | result = (url not in self._url_set) 43 | self._url_set.add(url) 44 | return result 45 | -------------------------------------------------------------------------------- /spider/utilities/cresult.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | cresult.py by xianhu 5 | """ 6 | 7 | 8 | class Result(object): 9 | """ 10 | class of Result, to define result of fetcher, parser and saver 11 | """ 12 | 13 | def __init__(self, state_code: int, excep_class: str = None, excep_string: str = None): 14 | """ 15 | constructor 16 | :param excep_class: name of class which raised exception 17 | :param excep_string: string of exception, error message 18 | """ 19 | self.state_code = state_code 20 | self.excep_class = excep_class or "" 21 | self.excep_string = excep_string or "" 22 | return 23 | 24 | 25 | class ResultFetch(Result): 26 | """ 27 | class of ResultFetch, to define result of fetcher 28 | """ 29 | 30 | def __init__(self, state_code, state_proxies=1, task_parse=None, excep_class=None, excep_string=None): 31 | """ 32 | constructor 33 | :param state_code: can be -1(fetch failed), 0(need repeat), 1(fetch success) 34 | :param state_proxies: can be -1(unavaiable), 0(return to queue), 1(avaiable) 35 | """ 36 | super().__init__(state_code, excep_class, excep_string) 37 | self.state_proxies = state_proxies 38 | self.task_parse = task_parse 39 | return 40 | 41 | 42 | class ResultParse(Result): 43 | """ 44 | class of ResultParse, to define result of parser 45 | """ 46 | 47 | def __init__(self, state_code, task_fetch_list=None, task_save=None, excep_class=None, excep_string=None): 48 | """ 49 | constructor 50 | :param state_code: can be -1(parse failed), 1(parse success) 51 | """ 52 | super().__init__(state_code, excep_class, excep_string) 53 | 
self.task_fetch_list = task_fetch_list or [] 54 | self.task_save = task_save 55 | return 56 | 57 | 58 | class ResultProxies(Result): 59 | """ 60 | class of ResultProxies, to define result of proxieser 61 | """ 62 | 63 | def __init__(self, state_code, proxies_list=None, excep_class=None, excep_string=None): 64 | """ 65 | constructor 66 | :param state_code: can be -1(save failed), 1(save success) 67 | """ 68 | super().__init__(state_code, excep_class, excep_string) 69 | self.proxies_list = proxies_list or [] 70 | return 71 | -------------------------------------------------------------------------------- /spider/utilities/ctask.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | 3 | """ 4 | ctask.py by xianhu 5 | """ 6 | 7 | import re 8 | from typing import TypeVar 9 | 10 | 11 | class Task(object): 12 | """ 13 | class of Task, to define task of fetcher, parser and saver 14 | """ 15 | # class variable, which to define type of parameters 16 | TypeContent = TypeVar("TypeContent", str, tuple, list, dict) 17 | TypeItem = TypeVar("TypeItem", str, tuple, list, dict) 18 | 19 | # class variable, which to parse error message to get a TaskFetch() 20 | re_obj = re.compile(r"priority=(?P
\d+),\s*keys=(?P