├── CrawlersTools ├── extractors │ ├── utils │ │ ├── __init__.py │ │ ├── cluster.py │ │ ├── similarity.py │ │ ├── preprocess.py │ │ ├── settings.py │ │ └── element.py │ ├── schemas │ │ ├── __init__.py │ │ └── element.py │ ├── base.py │ ├── __init__.py │ ├── attachment_extractor.py │ ├── time_extractor.py │ ├── title_extractor.py │ ├── content_extractor.py │ └── list_extractor.py ├── projects │ ├── __init__.py │ ├── filters.py │ └── upload_oss.py ├── schedules │ ├── __init__.py │ └── auto_thread.py ├── requirements.txt ├── logs │ ├── __init__.py │ ├── handlers.py │ ├── log.py │ ├── logger.py │ └── formatters.py ├── preprocess │ ├── __init__.py │ ├── time_process.py │ └── bloom_filter.py ├── js_crawler │ ├── __init__.py │ ├── transfer_js.py │ └── font_decrypt.py ├── pipelines │ ├── __init__.py │ ├── redis_pipeline.py │ ├── kafka_operate.py │ ├── mongo_pipeline.py │ └── mysql_pipeline.py ├── requests │ ├── __init__.py │ ├── proxy.py │ ├── base_requests.py │ └── random_ua.py ├── __init__.py └── utils │ └── str_compare.py ├── .github └── workflows │ └── python-publish.yml ├── setup.py ├── .gitignore ├── LICENSE └── README.md /CrawlersTools/extractors/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CrawlersTools/extractors/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CrawlersTools/projects/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/15 11:29 4 | # @Author : MuggleK 5 | # @File : __init__.py 6 | -------------------------------------------------------------------------------- /CrawlersTools/schedules/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/15 11:29 4 | # @Author : MuggleK 5 | # @File : __init__.py 6 | 7 | from CrawlersTools.schedules.auto_thread import AutoThread 8 | -------------------------------------------------------------------------------- /CrawlersTools/requirements.txt: -------------------------------------------------------------------------------- 1 | auto_mix_prep 2 | DBUtils==1.3 3 | fontTools 4 | httpx 5 | httpx[http2] 6 | loguru 7 | Pillow 8 | PyExecJS==1.5.1 9 | pymongo 10 | PyMySQL 11 | redis 12 | tqdm 13 | PyYAML 14 | lxml 15 | numpy 16 | Distance 17 | chardet 18 | sinan 19 | kafka-python 20 | -------------------------------------------------------------------------------- /CrawlersTools/logs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/11 21:05 4 | # @Author : MuggleK 5 | # @File : logs.py 6 | 7 | from .logger import init_logger 8 | from .handlers import default_handler, logstash_handler 9 | 10 | from CrawlersTools.logs.log import Logging 11 | -------------------------------------------------------------------------------- /CrawlersTools/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/15 11:29 4 | # @Author : MuggleK 5 | # @File : __init__.py 6 | 7 | from 
CrawlersTools.preprocess.bloom_filter import BloomFilter

from CrawlersTools.preprocess.time_process import TimeProcessor

--------------------------------------------------------------------------------
/CrawlersTools/js_crawler/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Project : CrawlersTools
# @Time : 2022/8/15 11:29
# @Author : MuggleK
# @File : __init__.py

from CrawlersTools.js_crawler.font_decrypt import FontDecrypt
from CrawlersTools.js_crawler.transfer_js import int_overflow, right_shift, string_similar

--------------------------------------------------------------------------------
/CrawlersTools/pipelines/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Project : CrawlersTools
# @Time : 2022/8/15 11:29
# @Author : MuggleK
# @File : __init__.py

from CrawlersTools.pipelines.mongo_pipeline import MongoPipeline
from CrawlersTools.pipelines.mysql_pipeline import MysqlPipeline
from CrawlersTools.pipelines.redis_pipeline import RedisPipeline

--------------------------------------------------------------------------------
/CrawlersTools/requests/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Project : CrawlersTools
# @Time : 2022/6/21 17:08
# @Author : MuggleK
# @File : __init__.py

import os

from CrawlersTools.requests.base_requests import BaseRequests
from CrawlersTools.requests.proxy import get_proxies
from CrawlersTools.requests.random_ua import UserAgent

PROJECT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

base_requests = BaseRequests().base_requests

--------------------------------------------------------------------------------
/CrawlersTools/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Project : CrawlersTools
# @Time : 2022/8/12 20:48
# @Author : MuggleK
# @File : __init__.py

from CrawlersTools.extractors import PolicyExtractor, ListExtractor
from CrawlersTools.logs.logger import init_logger
from CrawlersTools.logs import Logging
from CrawlersTools.pipelines import MysqlPipeline, MongoPipeline, RedisPipeline
from CrawlersTools.preprocess import TimeProcessor
from CrawlersTools.requests import base_requests, get_proxies, UserAgent

--------------------------------------------------------------------------------
/CrawlersTools/requests/proxy.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Project : CrawlersTools
# @Time : 2022/6/21 17:06
# @Author : MuggleK
# @File : proxy.py

import httpx

from loguru import logger


def get_proxies(proxy_url=None, http2=False):
    """
    Fetch a proxy from a proxy pool and build a proxy mapping (httpx is the default client).
    @param proxy_url: URL of the proxy-pool endpoint
    @param http2: defaults to HTTP/1.1-style keys; set True to get httpx-style "http://"/"https://" keys
    @return:
    """
    if not proxy_url:
        return

    protocol = 'http://'
    try:
        proxy = httpx.get(proxy_url).text.strip()
        proxy = protocol + proxy
        if http2:
            # httpx-style scheme keys
            return {protocol: proxy, 'https://': proxy}
        # requests-style keys
        return {"http": proxy, "https": proxy}
    except Exception as err:
        logger.error(f'Failed to fetch proxy: {err}')

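A minimal sketch of wiring get_proxies into a request. The proxy-pool URL below is a placeholder, not part of this repository, and the `proxies=` argument applies to the 2022-era httpx releases this project was written against (newer httpx versions use `proxy`/`mounts` instead):

import httpx

from CrawlersTools.requests.proxy import get_proxies

PROXY_POOL_URL = "http://127.0.0.1:5010/get"  # hypothetical proxy-pool endpoint

proxies = get_proxies(PROXY_POOL_URL, http2=True)  # {'http://': ..., 'https://': ...}
if proxies:
    response = httpx.get("https://example.com", proxies=proxies, timeout=10)
    print(response.status_code)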
-------------------------------------------------------------------------------- /CrawlersTools/pipelines/redis_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/12 9:12 4 | # @Author : MuggleK 5 | # @File : redis_pipeline.py 6 | 7 | import redis 8 | 9 | 10 | class RedisPipeline(object): 11 | def __init__(self, name, namespace, **redis_kwargs): 12 | self.__db = redis.Redis(**redis_kwargs) 13 | self.key = '%s:%s' % (namespace, name) 14 | 15 | def qsize(self): 16 | return self.__db.llen(self.key) 17 | 18 | def put(self, item): 19 | self.__db.rpush(self.key, item) 20 | 21 | def get_wait(self, timeout=None): 22 | item = self.__db.blpop(self.key, timeout=timeout) 23 | return item 24 | 25 | def get_nowait(self): 26 | item = self.__db.lpop(self.key) 27 | return item 28 | -------------------------------------------------------------------------------- /CrawlersTools/js_crawler/transfer_js.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/12 16:59 4 | # @Author : MuggleK 5 | # @File : transfer_js.py 6 | 7 | import ctypes 8 | import difflib 9 | 10 | 11 | def int_overflow(val: int): 12 | """ 13 | Process JavaScript nums Overflow 14 | :param val: 15 | :return: 16 | """ 17 | maxint = 2147483647 18 | if not -maxint - 1 <= val <= maxint: 19 | val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1 20 | return val 21 | 22 | 23 | def right_shift(n, i): 24 | """ 25 | Python Operator ">>" 26 | :param n: 27 | :param i: 28 | :return: 29 | """ 30 | if n < 0: 31 | n = ctypes.c_uint32(n).value 32 | if i < 0: 33 | return -int_overflow(n << abs(i)) 34 | if i != 0: 35 | return int_overflow(n >> i) 36 | else: 37 | return n 38 | 39 | 40 | def string_similar(s1: str, s2: str): 41 | """ 42 | Compare Strings Similar Percentage 43 | :param s1: 44 | :param s2: 45 | :return: :float: percentage 46 | """ 47 | return difflib.SequenceMatcher(None, s1, s2).quick_ratio() 48 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: push 12 | 13 | permissions: 14 | contents: read 15 | 16 | jobs: 17 | deploy: 18 | 19 | runs-on: ubuntu-latest 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: '3.x' 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install build 31 | - name: Build package 32 | run: python -m build 33 | - name: Publish package 34 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 35 | with: 36 | user: __token__ 37 | password: ${{ secrets.PYPI_API_TOKEN }} 38 | -------------------------------------------------------------------------------- /CrawlersTools/pipelines/kafka_operate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2022/9/5 15:32 4 | # @Author : xc 5 | # @File : kafka_operate 6 | # @Software: PyCharm 7 | 8 | 9 | import json 10 | from kafka import KafkaProducer 11 | from loguru import logger 12 | 13 | 14 | class KProducer: 15 | def __init__(self, bootstrap_servers, topic): 16 | """ 17 | kafka 生产者 18 | :param bootstrap_servers: 地址 19 | :param topic: topic 20 | """ 21 | self.producer = KafkaProducer( 22 | bootstrap_servers=bootstrap_servers, 23 | value_serializer=lambda m: json.dumps(m).encode('ascii'), ) # json 格式化发送的内容 24 | self.topic = topic 25 | 26 | def sync_producer(self, data_li: list): 27 | """ 28 | 同步发送 数据 29 | :param data_li: 发送数据 30 | :return: 31 | """ 32 | for data in data_li: 33 | future = self.producer.send(self.topic, data) 34 | record_metadata = future.get(timeout=10) # 同步确认消费 35 | partition = record_metadata.partition # 数据所在的分区 36 | offset = record_metadata.offset # 数据所在分区的位置 37 | logger.success('save success, partition: {}, offset: {}'.format(partition, offset)) 38 | -------------------------------------------------------------------------------- /CrawlersTools/utils/str_compare.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | def str_compare(str1: str, str2: str, half_compare=False) -> bool: 6 | """ 7 | 比较两个字符串是否相等,当half_compare为True时会把字符串转为半角字符串之后在比较 8 | 适用:比较两个公司名,原则上 特斯拉(上海)有限公司 和 特斯拉(上海)有限公司 这两个公司是同一个,但是因为一个是全角括号,一个是半角括号, 9 | 直接比较会导致两个公司名不相等,这时候转换为半角在进行比较则不会出现这个问题. 
10 | """ 11 | str1 = full_str_to_half_str(str1) if half_compare else str1 12 | str2 = full_str_to_half_str(str2) if half_compare else str2 13 | if str1 == str2: 14 | return True 15 | return False 16 | 17 | 18 | def full_str_to_half_str(full_str: str) -> str: 19 | # 全角字符串转半角字符串 20 | half_str = "" 21 | for _str in full_str: 22 | _str_code = ord(_str) 23 | if _str_code == 12288: # 全角空格转半角空格 24 | _str_code = 32 25 | elif 65281 <= _str_code <= 65374: 26 | _str_code -= 65248 27 | half_str += chr(_str_code) 28 | return half_str 29 | 30 | 31 | if __name__ == '__main__': 32 | print(str_compare('特斯拉(上海)有限公司', '特斯拉(上海)有限公司')) 33 | print(str_compare('特斯拉(上海)有限公司', '特斯拉(上海)有限公司', half_compare=True)) # 全角括号转半角括号 34 | print(str_compare(' 特斯拉(上海)有限公司', ' 特斯拉(上海)有限公司')) 35 | print(str_compare(' 特斯拉(上海)有限公司', ' 特斯拉(上海)有限公司', half_compare=True)) # 全角空格转半角空格 36 | -------------------------------------------------------------------------------- /CrawlersTools/logs/handlers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | from .formatters import JsonFormatter 5 | 6 | DEFAULT_HANDLER_FORMAT = "{time:YYYY-MM-DD HH:mm:ss.SSS} | {level:8} | " \ 7 | "{name}:{function}:{line} - " \ 8 | "{message}" 9 | LOGSTASH_HANDLER_FORMAT = "{message}" 10 | 11 | 12 | def default_handler(level="DEBUG", format=DEFAULT_HANDLER_FORMAT, **kwargs) -> dict: 13 | return dict(sink=sys.stderr, level=level, format=format, **kwargs) 14 | 15 | 16 | class LogstashHandler(logging.StreamHandler): 17 | def __init__(self, formatter=None): 18 | super().__init__() 19 | self.formatter = formatter 20 | 21 | 22 | def logstash_handler( 23 | level="INFO", 24 | format=LOGSTASH_HANDLER_FORMAT, 25 | extra=None, 26 | **kwargs 27 | ) -> dict: 28 | if extra is None: 29 | extra = {} 30 | elif not isinstance(extra, dict): 31 | raise TypeError( 32 | "The 'extra' parameter should be a dict (or None), not: '%s'" 33 | % type(extra).__name__ 34 | ) 35 | 36 | return dict( 37 | sink=LogstashHandler(JsonFormatter(**extra)), 38 | level=level, 39 | format=format, 40 | **kwargs 41 | ) 42 | -------------------------------------------------------------------------------- /CrawlersTools/extractors/base.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from lxml.html import etree 3 | from lxml.html import fromstring 4 | 5 | from CrawlersTools.extractors.schemas.element import Element 6 | 7 | 8 | class BaseExtractor(object): 9 | """ 10 | Base Extractor which provide common methods 11 | """ 12 | 13 | kwargs = None 14 | 15 | @staticmethod 16 | def to_string(element: Element, limit: int = None): 17 | """ 18 | convert element to string 19 | :param element: 20 | :param limit: 21 | :return: 22 | """ 23 | result = etree.tostring(element, pretty_print=True, encoding="utf-8", method='html').decode('utf-8') 24 | if limit: 25 | return result[:limit] 26 | return result 27 | 28 | def process(self, element: Element): 29 | """ 30 | process method that you should implement 31 | :param element: 32 | :return: 33 | """ 34 | logger.error('You must implement process method in your extractor.') 35 | raise NotImplementedError 36 | 37 | def extract(self, html, **kwargs): 38 | """ 39 | base extract method, firstly, it will convert html to WebElement, then it call 40 | process method that child class implements 41 | :param html: 42 | :return: 43 | """ 44 | self.kwargs = kwargs 45 | element = fromstring(html=html) 46 | element.__class__ = Element 47 | return 
self.process(element) 48 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/12 21:00 4 | # @Author : MuggleK 5 | # @File : setup.py 6 | 7 | from setuptools import setup, find_packages 8 | 9 | with open("README.md", "r", encoding='utf-8') as f: 10 | long_description = f.read() 11 | 12 | setup( 13 | name='CrawlersTools', # 包名 14 | version='1.4.81', # 版本号 15 | description='Tools for Crawlers', 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | author='MuggleK', 19 | author_email='peichangchuan@gmail.com', 20 | url='https://github.com/MuggleK/CrawlersTools', 21 | install_requires=[ 22 | "auto_mix_prep", 23 | "DBUtils==1.3", 24 | "fontTools", 25 | "httpx", 26 | "httpx[http2]", 27 | "loguru", 28 | "Pillow", 29 | "PyExecJS==1.5.1", 30 | "pymongo", 31 | "PyMySQL", 32 | "redis", 33 | "tqdm", 34 | "PyYAML", 35 | "lxml", 36 | "numpy", 37 | "Distance", 38 | "chardet", 39 | "sinan", 40 | "kafka-python" 41 | ], 42 | license='BSD License', 43 | packages=find_packages(where='.', exclude=(), include=('*',)), 44 | platforms=["all"], 45 | classifiers=[ 46 | 'Intended Audience :: Developers', 47 | 'Operating System :: OS Independent', 48 | 'Natural Language :: Chinese (Simplified)', 49 | 'Programming Language :: Python :: 3.7', 50 | 'Topic :: Software Development :: Libraries' 51 | ], 52 | ) 53 | -------------------------------------------------------------------------------- /CrawlersTools/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/15 11:29 4 | # @Author : MuggleK 5 | # @File : __init__.py 6 | 7 | from CrawlersTools.extractors.attachment_extractor import AttachmentExtractor 8 | from CrawlersTools.extractors.content_extractor import ContentExtractor 9 | from CrawlersTools.extractors.list_extractor import ListExtractor 10 | from CrawlersTools.extractors.time_extractor import TimeExtractor 11 | from CrawlersTools.extractors.title_extractor import TitleExtractor 12 | 13 | 14 | class PolicyExtractor(object): 15 | 16 | @staticmethod 17 | def extract( 18 | html, 19 | title_xpath: str = "", 20 | publish_time_xpath: str = "", 21 | content_xpath: str = "", 22 | attachment_xpath: str = "", 23 | attachment_regx: str = "" 24 | ) -> dict: 25 | title = TitleExtractor().extract(html, title_xpath=title_xpath) 26 | publish_time = TimeExtractor().extract(html, publish_time_xpath=publish_time_xpath) 27 | content, content_with_tag, images = ContentExtractor().extract(html, content_xpath=content_xpath) 28 | attachments = AttachmentExtractor().extract(html, attachment_xpath=attachment_xpath, attachment_regx=attachment_regx) 29 | 30 | return { 31 | "title": title, 32 | "publish_time": publish_time, 33 | "content": content, 34 | "content_with_tag": content_with_tag, 35 | "images": images, 36 | "attachment": attachments 37 | } 38 | -------------------------------------------------------------------------------- /CrawlersTools/logs/log.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/6/21 17:08 4 | # @Author : MuggleK 5 | # @File : logs.py 6 | 7 | import time 8 | 9 | from loguru import logger 10 | 11 | time_format = 
time.strftime("%Y_%m_%d") 12 | log_format = "{time:YYYY-MM-DD HH:mm:ss}|{level}| {name}:{function}:{line}| {message}" 13 | 14 | 15 | class Logging(object): 16 | """ 17 | Usage:: 18 | 19 | # >>> 20 | # >>> logger = Logging('logs') 21 | # >>> logger.info('Logging Example') 22 | # 2022-01-20 17:27:32.194 | INFO | __main__:info:149 - Logging Example 23 | # >>> 24 | """ 25 | 26 | __instance = None 27 | 28 | def __new__(cls, log_path, *args, **kwargs): 29 | if not cls.__instance: 30 | cls.__instance = super(Logging, cls).__new__(cls, *args, **kwargs) 31 | 32 | return cls.__instance 33 | 34 | def __init__(self, log_path, expire_date="10 days"): 35 | logger.add(f"{log_path}/log_{time_format}_info.log", encoding="utf-8", enqueue=True, retention="1 months", level="INFO", format=log_format) 36 | logger.add(f"{log_path}/log_{time_format}_error.log", encoding="utf-8", enqueue=True, retention=expire_date, level="ERROR", format=log_format) 37 | logger.add(f"{log_path}/log_{time_format}_debug.log", encoding="utf-8", enqueue=True, retention=expire_date, level="DEBUG", format=log_format) 38 | 39 | @staticmethod 40 | def info(msg): 41 | return logger.info(msg) 42 | 43 | @staticmethod 44 | def debug(msg): 45 | return logger.debug(msg) 46 | 47 | @staticmethod 48 | def warning(msg): 49 | return logger.warning(msg) 50 | 51 | @staticmethod 52 | def error(msg): 53 | return logger.error(msg) 54 | 55 | @staticmethod 56 | def success(msg): 57 | return logger.success(msg) 58 | -------------------------------------------------------------------------------- /CrawlersTools/projects/filters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/12 14:48 4 | # @Author : MuggleK 5 | # @File : filters.py 6 | 7 | import re 8 | from functools import reduce 9 | from urllib.parse import urlparse 10 | 11 | from loguru import logger 12 | 13 | 14 | def empty_text(lis): 15 | word = "" 16 | for i in lis: 17 | word += i.strip() 18 | return word 19 | 20 | 21 | def filter_title(title: str, remove_list: list): 22 | """ 23 | 24 | :param title: 文章标题 25 | :param remove_list: 过滤关键词列表 26 | :return: 27 | """ 28 | if not title: 29 | return False 30 | for r in remove_list: 31 | and_lists = r.split("and") 32 | if len(and_lists) == 1: 33 | if and_lists[0] in title: 34 | logger.debug(f"过滤标题: {title} 过滤词: {r}") 35 | return True 36 | else: 37 | total = [1 for a in and_lists if a in title] 38 | result = reduce(lambda x, y: x + y, total) 39 | if len(and_lists) != result: 40 | continue 41 | return True 42 | 43 | 44 | def filter_text(text, removes: list): 45 | """ 46 | :param text: 正文字段 47 | :param removes: 需要去掉的特殊字段:扫一扫,【关闭】,【打印】 48 | :return: 49 | """ 50 | if removes: 51 | for remove in removes: 52 | text = text.replace(remove, '') 53 | return text 54 | 55 | 56 | def filter_allowed_url(url, main_url, other_domains): 57 | other_domains = other_domains if other_domains else [] 58 | main_url = main_url[0] if isinstance(main_url, list) else main_url 59 | allowed_domains = [urlparse(main_url).netloc] + other_domains if urlparse(main_url).netloc else other_domains + [main_url] 60 | for domain in allowed_domains: 61 | if (not domain) or re.search(domain, url): 62 | return True 63 | -------------------------------------------------------------------------------- /CrawlersTools/extractors/utils/cluster.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | from 
CrawlersTools.extractors.utils.similarity import similarity 4 | 5 | 6 | def cluster(items, threshold=0.9): 7 | """ 8 | cluster names 9 | :param items: 10 | :param threshold: 11 | :return: cluster map, for example {"foo": 0, "bar": 1} 12 | """ 13 | number = -1 14 | clusters_map = {} 15 | clusters = [] 16 | for name in items: 17 | for c in clusters: 18 | if all(similarity(name, w) > threshold for w in c): 19 | c.append(name) 20 | clusters_map[name] = number 21 | break 22 | else: 23 | number += 1 24 | clusters.append([name]) 25 | clusters_map[name] = number 26 | return clusters_map 27 | 28 | 29 | def cluster_dict(data: dict, threshold=0.8): 30 | """ 31 | cluster dict, convert id key to cluster id key 32 | :param threshold: 33 | :param data: 34 | :return: 35 | """ 36 | ids = data.keys() 37 | clusters_map = cluster(ids, threshold) 38 | result = defaultdict(list) 39 | for k, v in data.items(): 40 | if isinstance(v, list): 41 | for i in v: 42 | result[clusters_map[k]].append(i) 43 | else: 44 | result[clusters_map[k]].append(v) 45 | return dict(result) 46 | 47 | 48 | if __name__ == '__main__': 49 | data = { 50 | '/html/body/div[@class="main"]/div[1]/ul': ['child1', 'child2', 'child3'], 51 | '/html/body/div[@class="main"]/div[2]/ul': ['child4', 'child5', 'child6'], 52 | '/html/body/div[@class="main"]/div[3]/ul': ['child7', 'child8', 'child9'], 53 | '/html/body/header/div[1]': ['child10', 'child11', 'child12'], 54 | '/html/body/header/div[2]': ['child13', 'child14', 'child15'], 55 | } 56 | print(cluster_dict(data, threshold=0.7)) 57 | -------------------------------------------------------------------------------- /CrawlersTools/extractors/attachment_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/12/21 10:54 4 | # @Author : MuggleK 5 | # @File : attachment_extractor.py 6 | 7 | import re 8 | 9 | from CrawlersTools.extractors.base import BaseExtractor 10 | from CrawlersTools.extractors.schemas.element import Element 11 | from CrawlersTools.extractors.utils.settings import ATTACHMENT_REGX 12 | 13 | 14 | class AttachmentExtractor(BaseExtractor): 15 | """ 16 | extract content from detail page 17 | """ 18 | 19 | def process(self, element: Element): 20 | """ 21 | extract content from html 22 | :param element: 23 | :return: 24 | """ 25 | attachment_list = list() 26 | attachment_xpath = self.kwargs.get("attachment_xpath") or "//a" 27 | for attachment_element in element.xpath(attachment_xpath): 28 | url = [i.strip() for i in attachment_element.xpath("@href") or attachment_element.xpath("@src")] 29 | name = [i.strip() for i in attachment_element.xpath(".//text()")] 30 | if not (''.join(url).strip() and ''.join(name).strip()): 31 | continue 32 | suffix = self.filter_suffix(url[0], name[0]) 33 | if not suffix: continue 34 | attachment_list.append({ 35 | "file_url": url[0], 36 | "file_name": name[0] 37 | }) 38 | return attachment_list 39 | 40 | def filter_suffix(self, url, name): 41 | """ 42 | 附件.xls.doc 可上传, 接口会默认取最后一个 43 | 优先取 file_url 后缀 44 | """ 45 | regx = self.kwargs.get("attachment_regx") or ATTACHMENT_REGX 46 | is_name_suffix = re.search(regx, name, re.I) 47 | is_url_suffix = re.search(regx, url, re.I) 48 | name_suffix = is_name_suffix.group(1) if is_name_suffix else "" 49 | url_suffix = is_url_suffix.group(1) if is_url_suffix else "" 50 | 51 | return name_suffix or url_suffix 52 | -------------------------------------------------------------------------------- 
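The extractors above (title, time, content, attachment) are combined by PolicyExtractor in extractors/__init__.py. A minimal usage sketch; the input file name and the attachment XPath override are placeholders for a real detail page:

from CrawlersTools.extractors import PolicyExtractor

html = open("policy_detail.html", encoding="utf-8").read()  # placeholder: a fetched article page
result = PolicyExtractor.extract(html, attachment_xpath="//div[@class='attach']//a")  # override is optional
print(result["title"], result["publish_time"])
for attachment in result["attachment"]:
    print(attachment["file_name"], attachment["file_url"])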
/CrawlersTools/logs/logger.py:
--------------------------------------------------------------------------------
"""
- Intercept stdlib logging records and forward them to loguru
- Emit logs as JSON with trimmed & customizable fields
- Combined & standalone helpers for log interception and output
"""
import time
from typing import List, Optional, Dict

from loguru import logger

from .handlers import default_handler, DEFAULT_HANDLER_FORMAT

TIME_FORMAT = time.strftime("%Y_%m_%d")


def init_logger(
    handlers: Optional[List[Dict]] = None,
    add_file_handler: bool = False,
    log_path: str = "./",
    file_handler_level: str = "DEBUG",
    file_handler_format: str = DEFAULT_HANDLER_FORMAT,
    **kwargs
):
    """
    One-call loguru setup; afterwards the host program can simply `from loguru import logger` and log as usual.

    :param handlers: handler dicts passed to loguru.configure; defaults to [default_handler()], other presets can be imported from .logger.handlers
    :param add_file_handler: when enabled, a default file-output handler is added
    :param log_path: directory for log files, defaults to the current directory
    :param file_handler_level: log level of the file handler, defaults to DEBUG
    :param file_handler_format: log format of the file handler, defaults to DEFAULT_HANDLER_FORMAT
    :param kwargs: extra arguments forwarded to logger.configure
    """
    if handlers is None:
        handlers = [default_handler()]
    elif not isinstance(handlers, list):
        raise TypeError(
            "The 'handlers' parameter should be a list (or None), not: '%s'"
            % type(handlers).__name__
        )

    extra = kwargs.pop("extra", {})
    if not isinstance(extra, dict):
        raise TypeError(
            "The 'extra' parameter should be a dict (or None), not: '%s'"
            % type(extra).__name__
        )

    # pop expire_date before configure() so it is not forwarded as an unknown keyword
    expire_date = kwargs.pop("expire_date", "1 days")

    logger.configure(handlers=handlers, extra=extra, **kwargs)

    if add_file_handler:
        logger.add(
            f"{log_path}/log_{TIME_FORMAT}_{file_handler_level.lower()}.log",
            encoding="UTF-8", enqueue=True, retention=expire_date,
            level=file_handler_level, format=file_handler_format
        )

--------------------------------------------------------------------------------
/CrawlersTools/extractors/time_extractor.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Project : CrawlersTools
# @Time : 2022/11/3 9:55
# @Author : MuggleK
# @File : time_extractor.py

import re

from lxml.html import etree

from CrawlersTools.extractors.base import BaseExtractor
from CrawlersTools.extractors.schemas.element import Element
from CrawlersTools.extractors.utils.settings import DATETIME_PATTERN, PUBLISH_TIME_META, TITLE_EXTRACTOR_USELESS_TAGS
from CrawlersTools.preprocess import TimeProcessor

format_time = TimeProcessor().format


class TimeExtractor(BaseExtractor):

    @staticmethod
    def extract_from_xpath(element: Element, publish_time_xpath: str) -> str:
        if publish_time_xpath:
            publish_time = ''.join(element.xpath(publish_time_xpath))
            return format_time(publish_time)
        return ''

    @staticmethod
    def extract_from_text(element: Element) -> str:
        text = ''.join(element.xpath('.//text()'))
        for dt in DATETIME_PATTERN:
            dt_obj = re.search(dt, text)
            if dt_obj:
                return format_time(dt_obj.group(1))
        else:
            return ''

    @staticmethod
    def extract_from_meta(element: Element) -> str:
        """
        Prefer the publish time declared in META tags
        :param element: DOM tree of the page source
        :return: str
        """
        for xpath in PUBLISH_TIME_META:
            publish_time = element.xpath(xpath)
            if 
publish_time: 48 | return format_time(''.join(publish_time)) 49 | return '' 50 | 51 | def process(self, element: Element): 52 | # remove tag and its content 53 | etree.strip_elements(element, *TITLE_EXTRACTOR_USELESS_TAGS) 54 | 55 | publish_time = (self.extract_from_xpath(element, publish_time_xpath=self.kwargs.get("publish_time_xpath")) 56 | or self.extract_from_meta(element) 57 | or self.extract_from_text(element)) 58 | 59 | return publish_time 60 | -------------------------------------------------------------------------------- /CrawlersTools/extractors/utils/similarity.py: -------------------------------------------------------------------------------- 1 | import distance 2 | 3 | 4 | def similarity1(s1, s2): 5 | """ 6 | get similarity of two strings 7 | :param s1: 8 | :param s2: 9 | :return: 10 | """ 11 | if not s1 or not s2: 12 | return 0 13 | edit_distance = distance.levenshtein(s1, s2) 14 | similarity_score = 1 - edit_distance / (len(s1) + len(s2)) 15 | return similarity_score 16 | 17 | 18 | def similarity2(s1, s2): 19 | """ 20 | get similarity of two strings 21 | :param s1: 22 | :param s2: 23 | :return: 24 | """ 25 | if not s1 or not s2: 26 | return 0 27 | s1_set = set(list(s1)) 28 | s2_set = set(list(s2)) 29 | intersection = s1_set.intersection(s2_set) 30 | union = s2_set.union(s1_set) 31 | return len(intersection) / len(union) 32 | 33 | 34 | def similarity(s1, s2): 35 | """ 36 | get similarity of two strings 37 | :param s1: 38 | :param s2: 39 | :return: 40 | """ 41 | return similarity2(s1, s2) 42 | 43 | 44 | def get_longest_common_sub_string(str1: str, str2: str) -> str: 45 | """ 46 | get longest common string 47 | :param str1: 48 | :param str2: 49 | :return: 50 | """ 51 | if not all([str1, str2]): 52 | return '' 53 | matrix = [[0] * (len(str2) + 1) for _ in range(len(str1) + 1)] 54 | max_length = 0 55 | start_position = 0 56 | for index_of_str1 in range(1, len(str1) + 1): 57 | for index_of_str2 in range(1, len(str2) + 1): 58 | if str1[index_of_str1 - 1] == str2[index_of_str2 - 1]: 59 | matrix[index_of_str1][index_of_str2] = matrix[index_of_str1 - 1][index_of_str2 - 1] + 1 60 | if matrix[index_of_str1][index_of_str2] > max_length: 61 | max_length = matrix[index_of_str1][index_of_str2] 62 | start_position = index_of_str1 - max_length 63 | else: 64 | matrix[index_of_str1][index_of_str2] = 0 65 | return str1[start_position: start_position + max_length] 66 | 67 | 68 | if __name__ == '__main__': 69 | s1 = 'hello' 70 | s2 = 'world' 71 | print(similarity(s1, s2)) 72 | -------------------------------------------------------------------------------- /CrawlersTools/pipelines/mongo_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/12 9:12 4 | # @Author : MuggleK 5 | # @File : mongo_pipeline.py 6 | 7 | from pymongo import MongoClient 8 | 9 | 10 | class MongoPipeline: 11 | """ 12 | A Mongo Pipeline to Create or Insert or Update or Delete Collection 13 | 14 | Usage: 15 | 16 | ```python 17 | >>> mongo_client = MongoPipeline() 18 | >>> record = mongo_client.find_one("test_collection", '{"company_name": "qzd"}') 19 | ``` 20 | """ 21 | 22 | collection = None 23 | conn = None 24 | 25 | def __init__(self, host="127.0.0.1", port="27017", username="root", password="root", database="crawl_data"): 26 | 27 | self.server = '''mongodb://%s:%s@%s:%s/%s''' % (username, password, host, port, database) 28 | self.client = MongoClient(host=self.server, readPreference="secondaryPreferred") 
29 | self.db = self.client.get_database(database) 30 | 31 | def close(self): 32 | return self.client.close() 33 | 34 | def set_collection(self, name): 35 | self.collection = self.db.get_collection(name) 36 | 37 | def find(self, collection_name, query=None, ref_query=None): 38 | """ 39 | from query phrase to find docs 40 | 41 | :param collection_name: 42 | :param query: query phrase 43 | :param ref_query: reserve phrase 44 | :return: 45 | """ 46 | records = self.db.get_collection(collection_name).find(query, ref_query) 47 | return records 48 | 49 | def find_one(self, collection_name, query=None, ref_query=None): 50 | records = self.db.get_collection(collection_name).find_one(query, ref_query) 51 | return records 52 | 53 | def update(self, collection_name, query, update, many=False): 54 | if many: 55 | self.db.get_collection(collection_name).update_many(query, update, upsert=True) 56 | return 57 | self.db.get_collection(collection_name).update_one(query, update, upsert=True) 58 | 59 | def aggregate(self, collection_name, query): 60 | records = self.db.get_collection(collection_name).aggregate(query) 61 | for record in records: 62 | yield record 63 | -------------------------------------------------------------------------------- /CrawlersTools/logs/formatters.py: -------------------------------------------------------------------------------- 1 | import json 2 | from logging import Formatter 3 | from typing import Tuple, List, Optional, Union 4 | 5 | EXTRA_IGNORE_FIELDS_DEFAULT = ( 6 | "name", 7 | "msg", 8 | "args", 9 | "levelno", 10 | "pathname", 11 | "filename", 12 | "module", 13 | "exc_info", 14 | "exc_text", 15 | "stack_info", 16 | "lineno", 17 | "funcName", 18 | "created", 19 | "msecs", 20 | "relativeCreated", 21 | "thread", 22 | "threadName", 23 | "processName", 24 | "process", 25 | ) 26 | 27 | 28 | class JsonFormatter(Formatter): 29 | """格式化日志到Json,并删除某些字段""" 30 | 31 | def __init__( 32 | self, 33 | extra_ignore_keys: Optional[Union[List[str], Tuple[str]]] = EXTRA_IGNORE_FIELDS_DEFAULT, 34 | with_timestamp: bool = True, 35 | **kwargs 36 | ): 37 | """ 38 | :param ignore_fields: 需要从 record[extra] 里忽略(排除)的字段 39 | :param kwargs: 这里的 key:val 会添加到格式化后的消息中 eg: app=explore 40 | """ 41 | super(JsonFormatter, self).__init__() 42 | self.extra_ignore_keys = extra_ignore_keys 43 | self.with_timestamp = with_timestamp 44 | self.kwargs = kwargs 45 | 46 | def formatException(self, exc_info): 47 | exc_text = super(JsonFormatter, self).formatException(exc_info) 48 | return repr(exc_text) 49 | 50 | def format(self, record): 51 | message = { 52 | **self.kwargs, 53 | **self.get_extra_info(record), 54 | } 55 | if self.with_timestamp: 56 | message.update({"timestamp": self.format_timestamp(record.created)}) 57 | 58 | if record.exc_info: 59 | message["message"] = self.formatException(record.exc_info) 60 | message["stack_trace"] = "".join(record.getMessage().split("\n")) 61 | else: 62 | message["message"] = record.getMessage() 63 | 64 | return json.dumps(message) 65 | 66 | @classmethod 67 | def format_timestamp(cls, time): 68 | return int(time * 1000) 69 | 70 | def get_extra_info(self, record): 71 | return { 72 | attr_name: record.__dict__[attr_name] 73 | for attr_name in record.__dict__ 74 | if attr_name not in self.extra_ignore_keys 75 | } 76 | -------------------------------------------------------------------------------- /CrawlersTools/preprocess/time_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | 
# @Time : 2022/8/12 14:48 4 | # @Author : MuggleK 5 | # @File : time_process.py 6 | 7 | import re 8 | from datetime import datetime, timedelta 9 | 10 | from sinan import Sinan 11 | 12 | from CrawlersTools.projects.filters import empty_text 13 | 14 | 15 | class TimeProcessor: 16 | 17 | datetime_pattern = r"([0-9]{4}).*?([0-1]{0,1}[0-9]).*?([0-3]{0,1}[0-9])" 18 | 19 | def __init__(self): 20 | self.fmt = "%Y-%m-%d" # 暂时只处理年月日 21 | 22 | def format(self, string, struct=False): 23 | string = empty_text(string) 24 | try: 25 | return self.process_timestamp(string, struct) 26 | except ValueError: 27 | # print(f"非时间戳格式:{string}") 28 | pass 29 | 30 | date = Sinan(string).parse(display_status=False).get("datetime", [""])[0].split(' ')[0] # 错误的时分秒 31 | if not date: 32 | re_res = re.search(self.datetime_pattern, string) 33 | if re_res is not None: 34 | date = f"{re_res.group(1)}-{re_res.group(2)}-{re_res.group(3)}" 35 | else: 36 | # 提取不出时间或者格式不满足 datetime_pattern的直接返回 37 | return 38 | 39 | if struct: 40 | return datetime.strptime(date, self.fmt) 41 | return date 42 | 43 | def process_timestamp(self, timestamp, struct): 44 | timestamp = int(str(timestamp)[:10]) 45 | source_time = datetime(1970, 1, 1) 46 | struct_time = ( 47 | datetime.fromtimestamp(timestamp) if timestamp >= 0 else source_time + timedelta(seconds=timestamp) 48 | ) 49 | if struct: 50 | return struct_time 51 | return struct_time.strftime(self.fmt) 52 | 53 | def compare_date(self, time_min, time_max) -> bool: 54 | if not (time_min and time_max): 55 | return False 56 | 57 | time_min_format = time_min if isinstance(time_min, datetime) else self.format(time_min, struct=True) 58 | time_max_format = time_max if isinstance(time_max, datetime) else self.format(time_max, struct=True) 59 | if not (time_min_format and time_max_format): 60 | return False 61 | 62 | if time_min_format.date() <= time_max_format.date(): 63 | return True 64 | return False 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # xml 132 | .xml 133 | 134 | /.idea 135 | test/ -------------------------------------------------------------------------------- /CrawlersTools/preprocess/bloom_filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/12 14:35 4 | # @Author : MuggleK 5 | # @File : bloom_filter.py 6 | 7 | import hashlib 8 | 9 | 10 | def sha1(data): 11 | """ 12 | BloomFilter fingerprint Function 13 | """ 14 | hash_object = hashlib.sha1(data.encode('utf-8')) 15 | hex_dig = hash_object.hexdigest() 16 | return hex_dig 17 | 18 | 19 | class SimpleHash(object): 20 | """ 21 | BloomFilter Hash Function 22 | """ 23 | def __init__(self, cap, seed): 24 | self.cap = cap 25 | self.seed = seed 26 | 27 | def hash(self, value): 28 | ret = 0 29 | for i in range(len(value)): 30 | ret += self.seed * ret + ord(value[i]) 31 | return (self.cap - 1) & ret 32 | 33 | 34 | class BloomFilter(object): 35 | """ 36 | Usage:: 37 | 38 | # >>> bf = BloomFilter(server, key, block_num=1) # you can increase block_num if you are filtering too many urls 39 | # ... if is_contains(fp): 40 | # ... print(f"{fp} 已存在") 41 | # ... else: 42 | # ... bf.insert(fp) 43 | # >>> 44 | 45 | """ 46 | def __init__(self, server, key, block_num=1, filter_level=0): 47 | """ 48 | 49 | :param server: Redis Server 50 | :param key: Redis Key 51 | :param block_num: 52 | :param filter_level: Filter data Magnitude 0:total data less than 100W. 
1: Exceed 100W 53 | """ 54 | self.bit_size = 1 << 31 if filter_level else 1 << 29 55 | self.seeds = [5, 7, 11, 13, 31] if filter_level else [5, 7, 11, 13, 31, 37, 61] 56 | self.server = server 57 | self.key = key 58 | self.block_num = block_num 59 | self.hash_func = [] 60 | for seed in self.seeds: 61 | self.hash_func.append(SimpleHash(self.bit_size, seed)) 62 | 63 | def is_contains(self, str_input) -> bool: 64 | """ 65 | param str_input: source string 66 | :return: 67 | """ 68 | if not str_input: 69 | return False 70 | ret = True 71 | 72 | fp = sha1(str_input) 73 | name = f"{self.key}{str(int(fp[0:2], 16) % self.block_num)}" 74 | for f in self.hash_func: 75 | loc = f.hash(str_input) 76 | ret = ret & self.server.getbit(name, loc) 77 | return bool(ret) 78 | 79 | def insert(self, str_input): 80 | """ 81 | param str_input: source string 82 | :return: 83 | """ 84 | fp = sha1(str_input) 85 | name = f"{self.key}{str(int(fp[0:2], 16) % self.block_num)}" 86 | for f in self.hash_func: 87 | loc = f.hash(str_input) 88 | self.server.setbit(name, loc, 1) 89 | -------------------------------------------------------------------------------- /CrawlersTools/extractors/title_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/19 20:23 4 | # @Author : MuggleK 5 | # @File : title_extractor.py 6 | 7 | import re 8 | from itertools import combinations 9 | 10 | from lxml.html import etree 11 | 12 | from CrawlersTools.extractors.base import BaseExtractor 13 | from CrawlersTools.extractors.schemas.element import Element 14 | from CrawlersTools.extractors.utils.settings import ( 15 | TITLE_HTAG_XPATH, TITLE_META_XPATH, TITLE_META_XPATH_BAK, TITLE_EXTRACTOR_USELESS_TAGS, PUNCTUATION_ALPHA_PATTERN 16 | ) 17 | from CrawlersTools.extractors.utils.similarity import get_longest_common_sub_string 18 | 19 | 20 | class TitleExtractor(BaseExtractor): 21 | 22 | @staticmethod 23 | def extract_by_xpath(element, title_xpath): 24 | if title_xpath: 25 | title_list = element.xpath(title_xpath) 26 | if title_list: 27 | return title_list[0] 28 | return '' 29 | 30 | @staticmethod 31 | def extract_by_title(element): 32 | title_list = element.xpath(TITLE_META_XPATH) or element.xpath(TITLE_META_XPATH_BAK) 33 | if title_list: 34 | return max(title_list, key=len) 35 | else: 36 | return '' 37 | 38 | @staticmethod 39 | def extract_by_htag(element): 40 | title_list = element.xpath(TITLE_HTAG_XPATH) 41 | title_list = [re.sub(PUNCTUATION_ALPHA_PATTERN, "", phrase) for phrase in title_list] 42 | if not title_list: 43 | return '' 44 | index_string = [(index, ''.join(filter(str.isalnum, string))) for index, string in enumerate(title_list)] 45 | string_list = [i[1] for i in index_string] 46 | max_string = max(string_list, key=len) 47 | return title_list[string_list.index(max_string)] 48 | 49 | @staticmethod 50 | def extract_common_str(element: Element) -> str: 51 | h_tag_texts_list = element.xpath(TITLE_HTAG_XPATH) 52 | new_title_list = list(combinations(h_tag_texts_list, 2)) 53 | if len(new_title_list) == 1: 54 | new_title = str(max(list(new_title_list[0]), key=len)) 55 | return new_title 56 | 57 | common_title_list = [get_longest_common_sub_string(i[0], i[1]).strip() for i in new_title_list] 58 | if common_title_list: 59 | new_title = max(common_title_list, key=len) 60 | sub_string = re.sub(r'\d+', '', ''.join(filter(str.isalnum, new_title))) 61 | return new_title if len(new_title) > 4 and sub_string else '' 62 | return '' 63 | 
    def process(self, element: Element):
        # remove tag and its content
        etree.strip_elements(element, *TITLE_EXTRACTOR_USELESS_TAGS)

        title = (self.extract_by_xpath(element, title_xpath=self.kwargs.get("title_xpath"))
                 or self.extract_by_title(element)
                 or self.extract_common_str(element)
                 or self.extract_by_htag(element)
                 )
        return title.strip()

--------------------------------------------------------------------------------
/CrawlersTools/schedules/auto_thread.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Project : CrawlersTools
# @Time : 2022/6/21 17:19
# @Author : MuggleK
# @File : auto_thread.py

import time
from threading import Lock, Thread, active_count
from traceback import format_exc

from loguru import logger
from tqdm import tqdm

thread_lock = Lock()


class ExcThread(Thread):
    """
    Thread subclass that catches and logs exceptions raised inside the child thread
    """

    def __init__(self, target, args=(), kwargs=None):
        super(ExcThread, self).__init__()
        self._target = target
        self._args = args
        self._kwargs = kwargs or dict()

    def run(self):
        try:
            if self._target:
                self._target(*self._args, **self._kwargs)
        except Exception:
            logger.error(f'self._target:{self._target} args:{self._args} kwargs:{self._kwargs},{format_exc()}')


class AutoThread(object):
    """
    Dynamic thread scheduler. The task queue may be passed as a list/tuple (converted on init) or a generator.
    usage:
        a_thread = AutoThread(20, fun, arg_list)
        a_thread.main_thread()

    ps: three concurrency modes are supported: 1. one function over many arguments 2. many functions over one argument 3. functions paired with arguments
    """

    def __init__(self, thread_num: int, fun, arg_list=None):
        self.thread_num = thread_num
        if isinstance(fun, tuple): fun = list(fun)
        if isinstance(arg_list, tuple): arg_list = list(arg_list)
        self.fun_list = fun if callable(fun) else list(fun)  # the callable must be a function, or a list/tuple of functions
        self.arg_list = arg_list
        self.os_threads = active_count()

    def process_task(self):
        if callable(self.fun_list):
            # 1. one function, many arguments
            tasks = [{'fun': self.fun_list, 'args': arg} for arg in self.arg_list]
        elif isinstance(self.fun_list, list) and not isinstance(self.arg_list, list):
            # 2. many functions, one shared argument
            tasks = [{'fun': fun, 'args': self.arg_list} for fun in self.fun_list]
        else:
            assert len(self.fun_list) == len(self.arg_list), 'functions and arguments must have the same length'
            # 3. functions paired with arguments
            tasks = [{'fun': fun, 'args': arg} for fun, arg in zip(self.fun_list, self.arg_list)]
        return tasks

    def wait(self):
        """
        Wait for all worker threads to finish: compare the live thread count with (main thread + tqdm thread)
        """
        while active_count() > self.os_threads + 1:
            time.sleep(.25)

    def main_thread(self):
        loop_flag = True
        tasks = self.process_task()
        with tqdm(total=len(tasks)) as pbar:
            while loop_flag:
                active_thread = active_count()
                if active_thread >= self.thread_num:
                    time.sleep(.25)
                    continue
                for _ in range(self.thread_num - active_thread + self.os_threads):
                    thread_lock.acquire()
                    task = tasks.pop() if tasks else None
                    thread_lock.release()
                    if task is None:
                        loop_flag = False
                        break
                    child_thread = ExcThread(target=task["fun"]) if task["args"] is None else ExcThread(
                        target=task["fun"], args=(task["args"],))

                    child_thread.start()
                    pbar.update(1)

--------------------------------------------------------------------------------
/CrawlersTools/extractors/content_extractor.py:
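The content extractor below scores every candidate node by text density and punctuation density and keeps the highest-scoring subtree. A minimal sketch of calling it directly; the input file name is a placeholder for a real detail page:

from CrawlersTools.extractors import ContentExtractor

html = open("detail_page.html", encoding="utf-8").read()  # placeholder: a fetched article page
text, content_with_tag, images = ContentExtractor().extract(html)
print(text[:200])
print(images)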
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/11/15 9:18 4 | # @Author : MuggleK 5 | # @File : content_extractor.py 6 | 7 | from copy import deepcopy 8 | 9 | import numpy as np 10 | from lxml.html import fromstring, HtmlElement 11 | 12 | from CrawlersTools.extractors.base import BaseExtractor 13 | from CrawlersTools.extractors.schemas.element import Element 14 | from CrawlersTools.extractors.utils.element import descendants_of_body 15 | from CrawlersTools.extractors.utils.preprocess import preprocess4content_extractor 16 | from CrawlersTools.extractors.utils.settings import SPECIAL_SYMBOL_MAP, ERROR_NAV_LIST 17 | 18 | 19 | class ContentExtractor(BaseExtractor): 20 | """ 21 | extract content from detail page 22 | """ 23 | 24 | def process(self, element: Element): 25 | """ 26 | extract content from html 27 | :param element: 28 | :return: 29 | """ 30 | source_element = deepcopy(element) 31 | source_element.__class__ = Element 32 | 33 | # preprocess 34 | preprocess4content_extractor(element) 35 | 36 | # start to evaluate every child element 37 | descendants = descendants_of_body(element) 38 | 39 | # get std of density_of_text among all elements 40 | density_of_text = [descendant.density_of_text for descendant in descendants] 41 | density_of_text_std = np.std(density_of_text, ddof=1) 42 | 43 | # get density_score of every element 44 | for descendant in descendants: 45 | score = np.log(density_of_text_std) * \ 46 | descendant.density_of_text * \ 47 | np.log10(descendant.number_of_p_descendants + 2) * \ 48 | np.log(descendant.density_of_punctuation) 49 | descendant.density_score = score 50 | 51 | # sort element info by density_score 52 | descendants = sorted(descendants, key=lambda x: x.density_score, reverse=True) 53 | descendant_first = descendants[0] if descendants else None 54 | if descendant_first is None: 55 | return None 56 | 57 | paragraphs = descendant_first.xpath(".//text()") 58 | paragraphs = [paragraph.strip() if paragraph else '' for paragraph in paragraphs] 59 | paragraphs = list(filter(lambda x: x, paragraphs)) 60 | text = '\n'.join(paragraphs) 61 | text = text.strip() 62 | 63 | # save content with tag 64 | content_with_tag = self.process_content_tag(descendant_first, source_element) 65 | 66 | # extract images 67 | img_list = [img.attrib["src"] for img in content_with_tag.img_descendants if img.attrib] 68 | 69 | return text, content_with_tag.string, img_list 70 | 71 | @staticmethod 72 | def process_content_tag(descendant_first, source_element): 73 | content_xpath = f"//{descendant_first.tag}" 74 | if descendant_first.attrib: 75 | for k, v in descendant_first.attrib.items(): 76 | if k and v: content_xpath += f"[@{k}='{v}']" 77 | preprocess4content_extractor(source_element, is_content=False) 78 | content_with_tag = source_element.xpath(content_xpath)[0] 79 | if isinstance(content_with_tag, HtmlElement): 80 | content_with_tag.__class__ = Element 81 | return content_with_tag 82 | 83 | def extract(self, html, **kwargs): 84 | """ 85 | base extract method, firstly, it will convert html to WebElement, then it call 86 | process method that child class implements 87 | :param html: 88 | :return: 89 | """ 90 | self.kwargs = kwargs 91 | for key, value in SPECIAL_SYMBOL_MAP.items(): 92 | html = html.replace(key, value) 93 | 94 | element = fromstring(html=html) # html有多个,fromstring默认取第一个 TODO 解析不了非规范html 95 | if self.kwargs.get("content_xpath"): 96 | return 
''.join(element.xpath(self.kwargs.get("content_xpath"))) 97 | 98 | descendants_list = list(element.iterdescendants()) 99 | 100 | # remove error navigate tags 101 | remove_index_list = list() 102 | for index, descendant in enumerate(descendants_list): 103 | if descendant.text is None: 104 | continue 105 | nav_error_list = [i for i in ERROR_NAV_LIST if i in descendant.text] 106 | if nav_error_list: remove_index_list.append(index) 107 | 108 | for i in remove_index_list: 109 | parent_element = descendants_list[i].getparent() 110 | if parent_element is not None: parent_element.remove(descendants_list[i]) 111 | 112 | element.__class__ = Element 113 | return self.process(element) 114 | -------------------------------------------------------------------------------- /CrawlersTools/pipelines/mysql_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/12 9:12 4 | # @Author : MuggleK 5 | # @File : mysql_pipeline.py 6 | 7 | import time 8 | from typing import Tuple, Optional 9 | 10 | import pymysql 11 | from DBUtils.PooledDB import PooledDB 12 | from loguru import logger 13 | from pymysql import ProgrammingError 14 | from pymysql.converters import escape_string 15 | 16 | 17 | def cursor_handler(func): 18 | def wrapper(self, *args, **kwargs): 19 | sql_conn, cursor = self.ping() 20 | if not (sql_conn and cursor): 21 | logger.warning(f"Mysql Connection occur Error,args:{args}, kwargs: {kwargs}") 22 | return 23 | 24 | try: 25 | kwargs.update({'cursor': cursor}) 26 | result = func(self, *args, **kwargs) 27 | sql_conn.commit() 28 | return result 29 | finally: 30 | cursor.close() 31 | sql_conn.close() 32 | 33 | return wrapper 34 | 35 | 36 | class MysqlPipeline(object): 37 | """ 38 | A Mysql Pipeline to Create or Insert or Update or Delete Table 39 | 40 | Usage:: 41 | 42 | # >>> 43 | # >>> mysql_pool = MysqlPipeline(host='127.0.0.1', username='root', password='mysql', db='test') 44 | # >>> mysql_pool.insert(item, 'test_table') 45 | # >>> 46 | 47 | """ 48 | 49 | table_columns_map = dict() # 缓存每个table的结构,避免每次都要查询数据库 50 | 51 | def __init__(self, host: str = '127.0.0.1', username: str = 'root', 52 | password: str = '', db: str = 'test', port: int = 3306, 53 | drop_column: Optional[Tuple] = ('id', 'crawl_time'), 54 | pool_num: int = 10 55 | ): 56 | """ 57 | :param host: 58 | :param username: 59 | :param password: 60 | :param db: 61 | :param port: 62 | :param drop_column: type:list 插入数据中不需要手动添加的字段,例如自增主键id,自增时间戳等 63 | :param pool_num: 64 | """ 65 | self.host = host 66 | self.username = username 67 | self.password = password 68 | self.db = db 69 | self.port = port 70 | self.drop_column = drop_column 71 | self.pool_num = pool_num 72 | 73 | self.sql_pool = PooledDB( 74 | pymysql, self.pool_num, host=self.host, 75 | user=self.username, passwd=self.password, db=self.db, 76 | port=self.port, charset='utf8', use_unicode=True 77 | ) 78 | 79 | def ping(self): 80 | """ 81 | 重写pymysql中的ping函数并新增重试机制,以保持conn和cursor 82 | 83 | :return: 84 | """ 85 | for _ in range(5): 86 | try: 87 | sql_conn = self.sql_pool.connection() 88 | cursor = sql_conn.cursor() 89 | return sql_conn, cursor 90 | except Exception as e: 91 | logger.debug(f"Mysql Lost Connection for Host : {self.host} Retrying, Error: {e}") 92 | 93 | try: 94 | self.sql_pool = PooledDB( 95 | pymysql, self.pool_num, host=self.host, user=self.username, 96 | passwd=self.password, db=self.db, port=self.port, 97 | charset='utf8', use_unicode=True 98 | ) 99 | 
sql_conn = self.sql_pool.connection() 100 | cursor = sql_conn.cursor() 101 | return sql_conn, cursor 102 | except Exception as err: 103 | logger.debug(f"Waiting for 5s to Connect, Error: {err}") 104 | time.sleep(5) 105 | continue 106 | 107 | logger.error(f"Mysql Connects for Host : {self.host} Over Max Retries") 108 | return None, None 109 | 110 | def add_columns_map(self, table_name): 111 | sql = f"select column_name from information_schema.columns " \ 112 | f"where table_name='{table_name}' and table_schema='{self.db}'" 113 | column_list = self.execute_sql(sql) 114 | columns = [i[0] for i in column_list if i[0] not in self.drop_column] 115 | self.table_columns_map[table_name] = columns 116 | return columns 117 | 118 | @cursor_handler 119 | def execute_sql(self, sql, mode='fetch', cursor=None): 120 | if mode == 'fetch': 121 | cursor.execute(sql) 122 | result = cursor.fetchall() 123 | return result 124 | cursor.execute(sql) 125 | 126 | def insert(self, item, table_name): 127 | 128 | if not item: 129 | logger.error("item is Empty") 130 | return 131 | 132 | table_columns = self.table_columns_map.get(table_name) or self.add_columns_map(table_name) 133 | if not table_columns: 134 | raise ProgrammingError(f"Table '{self.db}.{table_name}' doesn't exist") 135 | 136 | # 格式化sql语句 处理"None" -> NULL 137 | format_str = ','.join(["%s" for _ in table_columns]) 138 | insert_sql = 'INSERT IGNORE INTO %s (%s) VALUES (%s)' % (table_name, ','.join(table_columns), format_str) 139 | item_values = [None if item.get(key) == "None" else item.get(key) for key in table_columns] 140 | execute_data = tuple([escape_string('%r') % str(i) if i else "NULL" for i in item_values]) 141 | 142 | self.execute_sql(insert_sql % execute_data) 143 | logger.info(f"Insert Ignore Successfully:{table_name} -> {item}") 144 | -------------------------------------------------------------------------------- /CrawlersTools/projects/upload_oss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/12 20:20 4 | # @Author : MuggleK 5 | # @File : upload_oss.py 6 | 7 | import base64 8 | import hashlib 9 | import re 10 | 11 | import httpx 12 | from loguru import logger 13 | 14 | from CrawlersTools import base_requests 15 | from CrawlersTools.requests.proxy import get_proxies 16 | 17 | 18 | class UploadOss(object): 19 | """ 20 | A Class for QZD Upload file to oss 21 | 22 | Usage: 23 | 24 | ```python 25 | >>> upload = UploadOss('(pdf|txt|doc|docx|xlsx|xls|csv|wps|hlp|rtf|ppt|pptx|zip|rar|jar|gz|jpg|jpeg|png|tif|gif|bmp)', "https://***") 26 | >>> oss_url, oss_uuid = upload.download("http://xxgk.haiyan.gov.cn/gov/jcms_files/jcms1/web7/site/zfxxgk/download/downfile.jsp?classid=0&filename=140901165845693.xls", '附件') 27 | ``` 28 | """ 29 | 30 | def __init__(self, oss_url, suffix_reg, oss_code=None, client_code=None): 31 | self.suffix_reg = suffix_reg 32 | self.oss_url = oss_url 33 | self.oss_code = oss_code 34 | self.client_code = client_code 35 | 36 | def download(self, file_url, file_name, headers=None, verify=True): 37 | """ 38 | 39 | :param file_url: 40 | :param file_name: 41 | :param headers: 42 | :param verify: 43 | :return: 44 | """ 45 | location = global_uuid = "" 46 | proxy = None 47 | headers = headers if headers else { 48 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"} 49 | for _ in range(3): 50 | try: 51 | if ";base64," in file_url: 52 | suffix = "png" 
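                    # Descriptive note (added): a data-URI image carries no usable file name, so the
                    # MD5 hex digest of the whole URI becomes the name and the base64 payload is
                    # padded to a multiple of 4 before decoding. post_file() reads `.content` from
                    # its second argument, so the raw decoded bytes may need a response-like wrapper.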
53 | logger.debug(f"正在上传base64图片: {file_name}: {file_url}") 54 | hl = hashlib.md5() 55 | hl.update(file_url.encode(encoding='utf-8')) 56 | file_name = hl.hexdigest() + f".{suffix}" 57 | a = file_url.split(";base64,")[-1] 58 | a = a + '=' * (4 - len(a) % 4) if len(a) % 4 != 0 else a 59 | base64str = base64.b64decode(a) 60 | upload_result = self.post_file(file_name, base64str) 61 | location = upload_result.get("downloadLocation") 62 | global_uuid = upload_result.get("globalUuid") 63 | logger.debug(f"文件上传成功: {file_name}: {file_url}") 64 | else: 65 | suffix = self.complete_name(file_url, file_name, self.suffix_reg) 66 | if not file_url.startswith("http") and not suffix: 67 | return location, global_uuid 68 | file_name = f"{file_name}.{suffix}" 69 | logger.debug(f"正在上传文件: {file_name}: {file_url}") 70 | res = base_requests(file_url, timeout=60, headers=headers, verify=verify, proxies=proxy) 71 | if 200 <= res.status_code < 400: 72 | upload_result = self.post_file(file_name, res) 73 | location = upload_result.get("downloadLocation") 74 | global_uuid = upload_result.get("globalUuid") 75 | logger.debug(f"文件上传成功: {file_name}: {file_url}") 76 | break 77 | elif res.status_code == 404 or res.status_code == 500: 78 | logger.debug(f"文件地址无效: {file_name}: {file_url}") 79 | break 80 | except Exception as e: 81 | logger.warning(f"文件上传异常: {file_name}: {e}") 82 | proxy = get_proxies(http2=True) 83 | continue 84 | else: 85 | logger.error(f"文件上传失败: {file_name}: {file_url}") 86 | 87 | return location, global_uuid 88 | 89 | def post_file(self, name, resp): 90 | params_json = { 91 | "name": name, 92 | "appCode": self.oss_code, 93 | "appClientCode": self.client_code, 94 | "appOrgCode": "", 95 | "appUserId": "", 96 | "ownCatalogUuid": "" 97 | } 98 | json_data = httpx.post(self.oss_url, json=params_json).json() 99 | if json_data.get("msg") == "SUCCESS": 100 | token_data = json_data.get("data", {}) 101 | 102 | str_dic = { 103 | "key": token_data.get("dir") + token_data.get("name"), 104 | "policy": token_data.get("policy"), 105 | "OSSAccessKeyId": token_data.get("accessid"), 106 | "success_action_status": 200, 107 | "callback": token_data.get("callback"), 108 | "signature": token_data.get("signature"), 109 | } 110 | 111 | files = {'file': resp.content} 112 | response = httpx.post(token_data.get("host"), data=str_dic, files=files) 113 | if response.status_code == 200: 114 | res_data = response.json() 115 | if res_data.get("msg") == "SUCCESS": 116 | return res_data["data"] 117 | 118 | raise ValueError(f"文件上传oss失败:{name}") 119 | 120 | @staticmethod 121 | def complete_name(url, name, suffix_reg): 122 | """ 123 | 附件.xls.doc 可上传, 接口会默认取最后一个 124 | 优先取 file_url 后缀 125 | """ 126 | is_name_suffix = re.search(suffix_reg, name, re.I) 127 | is_url_suffix = re.search(suffix_reg, url, re.I) 128 | name_suffix = is_name_suffix.group(1) if is_name_suffix else "" 129 | url_suffix = is_url_suffix.group(1) if is_url_suffix else "" 130 | if url_suffix: 131 | suffix = url_suffix 132 | elif name_suffix: 133 | suffix = name_suffix 134 | else: 135 | suffix = "" 136 | 137 | return suffix 138 | -------------------------------------------------------------------------------- /CrawlersTools/extractors/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | from lxml.html import etree 2 | 3 | from CrawlersTools.extractors.schemas.element import Element 4 | from CrawlersTools.extractors.utils.element import children, remove_element, remove_children 5 | 6 | # fmt:off 7 | CONTENT_EXTRACTOR_USELESS_TAGS = 
['audio', 'colgroup', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'iframe', 8 | 'input', 'link', 'option', 'path', 'script', 'select', 'source', 'style', 'svg', 9 | 'symbol', 'video'] 10 | 11 | CONTENT_EXTRACTOR_STRIP_TAGS = ['b', 'blockquote', 'br', 'font', 'p', 'section', 'span', 'spanlang', 'spanstyle', 12 | 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'th', 'tr', 'u'] # 'img' 13 | 14 | KEYWORD_FEATURES = 'and not (contains(@class,"main")) and not (contains(@class,"content"))and not (contains(@class,"con"))and not (contains(@class,"container")) and not (contains(@class,"list")) and not (contains(@class,"box")) and not (contains(@class,"right"))and not (contains(@class,"body")) and not (contains(@class,"lanmu")) ' 15 | CONTENT_EXTRACTOR_NOISE_XPATH = [ 16 | # '//div[contains(@class, "comment")]', 17 | '//div[contains(@class, "advertisement")]', 18 | '//div[contains(@class, "advert")]', 19 | '//a[contains(@style, "display: none")]', 20 | '//a[contains(@style, "display:none")]', # TODO css不展示数据是否要去除,可能会影响正文重复 21 | f'//div[contains(@class, "foot") {KEYWORD_FEATURES}]', 22 | f'//div[contains(@class, "footer") {KEYWORD_FEATURES}]', 23 | # f'//div[contains(@class, "location") {KEYWORD_FEATURES}]', 24 | f'//div[contains(@class, "navigation") {KEYWORD_FEATURES}]', 25 | f'//div[contains(@class, "barrier") {KEYWORD_FEATURES}]', 26 | '//div[contains(@id, "foot")]', 27 | # '//div[contains(@class, "head")]', # 误删 28 | # '//div[contains(@id, "head")]', 29 | # '//div[contains(@class, "nav")]', # 误删 30 | '//div[contains(@id, "nav")]', 31 | '//div[contains(@class, "siderbar")]', 32 | '//div[contains(@class, "breadcrumb")]', 33 | '//div[contains(@id, "siderbar")]', 34 | '//div[contains(@id, "页脚")]', 35 | '//div[contains(@class, "页脚")]', 36 | '//div[contains(@id, "页眉")]', 37 | '//div[contains(@id, "页头")]', 38 | '//div[contains(@class, "页眉")]', 39 | '//div[contains(@class, "页头")]', 40 | '//*[contains(@class, "hidden")]', 41 | ] 42 | 43 | 44 | def preprocess4content_extractor(element: Element, is_content: bool = True): 45 | """ 46 | preprocess element for content extraction 47 | :param element: 48 | :param is_content: save content without tag 49 | :return: 50 | """ 51 | remove_children(element, CONTENT_EXTRACTOR_NOISE_XPATH) 52 | 53 | # remove tag and its content 54 | etree.strip_elements(element, *CONTENT_EXTRACTOR_USELESS_TAGS) 55 | 56 | if not is_content: return 57 | # only move tag pair 58 | etree.strip_tags(element, *CONTENT_EXTRACTOR_STRIP_TAGS) 59 | 60 | for child in children(element): 61 | 62 | # merge text in span or strong to parent p tag 63 | if child.tag.lower() == 'p' or child.tag.lower() == 'table': 64 | etree.strip_tags(child, 'span') 65 | etree.strip_tags(child, 'strong') 66 | etree.strip_tags(child, 'tr') 67 | etree.strip_tags(child, 'td') 68 | 69 | if not (child.text and child.text.strip()): 70 | remove_element(child) 71 | 72 | # if a div tag does not contain any sub node, it could be converted to p node. 
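        # (added note, presumably the intent: retagging leaf <div> wrappers as <p> lets them
        # count toward number_of_p_descendants when ContentExtractor scores text density later)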
73 | if child.tag.lower() == 'div' and not child.getchildren(): 74 | child.tag = 'p' 75 | 76 | 77 | LIST_EXTRACTOR_USELESS_TAGS = CONTENT_EXTRACTOR_USELESS_TAGS 78 | LIST_EXTRACTOR_STRIP_TAGS = CONTENT_EXTRACTOR_STRIP_TAGS 79 | LIST_EXTRACTOR_NOISE_XPATH = CONTENT_EXTRACTOR_NOISE_XPATH 80 | 81 | 82 | def preprocess4list_extractor(element: Element): 83 | """ 84 | preprocess element for list extraction 85 | :param element: 86 | :return: 87 | """ 88 | # remove tag and its content 89 | etree.strip_elements(element, *CONTENT_EXTRACTOR_USELESS_TAGS) 90 | # only move tag pair 91 | etree.strip_tags(element, *CONTENT_EXTRACTOR_STRIP_TAGS) 92 | 93 | remove_children(element, CONTENT_EXTRACTOR_NOISE_XPATH) 94 | 95 | for child in children(element): 96 | 97 | # merge text in span or strong to parent p tag 98 | if child.tag.lower() == 'p': 99 | etree.strip_tags(child, 'span') 100 | etree.strip_tags(child, 'strong') 101 | 102 | if not (child.text and child.text.strip()): 103 | remove_element(child) 104 | 105 | # if a div tag does not contain any sub node, it could be converted to p node. 106 | if child.tag.lower() == 'div' and not child.getchildren(): 107 | child.tag = 'p' 108 | 109 | 110 | LIST_CLASSIFIER_USELESS_TAGS = ['style', 'script', 'link', 'video', 'audio', 'iframe', 'source', 'svg', 'path', 111 | 'symbol', 'footer', 'header'] 112 | LIST_CLASSIFIER_STRIP_TAGS = ['span', 'blockquote'] 113 | LIST_CLASSIFIER_NOISE_XPATHS = [ 114 | '//div[contains(@class, "comment")]', 115 | '//div[contains(@class, "advertisement")]', 116 | '//div[contains(@class, "advert")]', 117 | '//div[contains(@style, "display: none")]', 118 | ] 119 | 120 | 121 | def preprocess4list_classifier(element: Element): 122 | """ 123 | preprocess element for list classifier 124 | :param element: 125 | :return: 126 | """ 127 | # remove tag and its content 128 | etree.strip_elements(element, *LIST_CLASSIFIER_USELESS_TAGS) 129 | # only move tag pair 130 | etree.strip_tags(element, *LIST_CLASSIFIER_STRIP_TAGS) 131 | 132 | remove_children(element, LIST_CLASSIFIER_NOISE_XPATHS) 133 | 134 | for child in children(element): 135 | 136 | # merge text in span or strong to parent p tag 137 | if child.tag.lower() == 'p': 138 | etree.strip_tags(child, 'span') 139 | etree.strip_tags(child, 'strong') 140 | 141 | if not (child.text and child.text.strip()): 142 | remove_element(child) 143 | 144 | # if a div tag does not contain any sub node, it could be converted to p node. 145 | if child.tag.lower() == 'div' and not child.getchildren(): 146 | child.tag = 'p' 147 | -------------------------------------------------------------------------------- /CrawlersTools/extractors/utils/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/8/19 20:00 4 | # @Author : MuggleK 5 | # @File : settings.py 6 | 7 | # list settings 8 | LIST_MIN_NUMBER = 5 9 | LIST_MIN_LENGTH = 8 10 | LIST_MAX_LENGTH = 50 11 | SIMILARITY_THRESHOLD = 0.8 12 | 13 | LIST_AVG_LENGTH = 9 14 | ADDTION_RIGHT_NUM = 10000 15 | 16 | HIGH_WEIGHT_ERROR_KEYWORD = ["ICP备", "公网安备", "网公安备", "备案序号:", "网站地图"] 17 | DIRECTORY_ERROR_TITLE = ["首页", "下一页", "解读", "图解", "详细", "阅读全文", "标题", "[详细]"] 18 | 19 | 20 | # common settings 21 | SPECIAL_SYMBOL_MAP = { 22 | """: '"', 23 | "&": "&", 24 | "<": "<", 25 | ">": ">", 26 | " ": " ", 27 | """: '"', 28 | "&": "&", 29 | "<": "<", 30 | ">": ">", 31 | " ": " ", 32 | '