├── CrawlersTools
│   ├── extractors
│   │   ├── utils
│   │   │   ├── __init__.py
│   │   │   ├── cluster.py
│   │   │   ├── similarity.py
│   │   │   ├── preprocess.py
│   │   │   ├── settings.py
│   │   │   └── element.py
│   │   ├── schemas
│   │   │   ├── __init__.py
│   │   │   └── element.py
│   │   ├── base.py
│   │   ├── __init__.py
│   │   ├── attachment_extractor.py
│   │   ├── time_extractor.py
│   │   ├── title_extractor.py
│   │   ├── content_extractor.py
│   │   └── list_extractor.py
│   ├── projects
│   │   ├── __init__.py
│   │   ├── filters.py
│   │   └── upload_oss.py
│   ├── schedules
│   │   ├── __init__.py
│   │   └── auto_thread.py
│   ├── requirements.txt
│   ├── logs
│   │   ├── __init__.py
│   │   ├── handlers.py
│   │   ├── log.py
│   │   ├── logger.py
│   │   └── formatters.py
│   ├── preprocess
│   │   ├── __init__.py
│   │   ├── time_process.py
│   │   └── bloom_filter.py
│   ├── js_crawler
│   │   ├── __init__.py
│   │   ├── transfer_js.py
│   │   └── font_decrypt.py
│   ├── pipelines
│   │   ├── __init__.py
│   │   ├── redis_pipeline.py
│   │   ├── kafka_operate.py
│   │   ├── mongo_pipeline.py
│   │   └── mysql_pipeline.py
│   ├── requests
│   │   ├── __init__.py
│   │   ├── proxy.py
│   │   ├── base_requests.py
│   │   └── random_ua.py
│   ├── __init__.py
│   └── utils
│       └── str_compare.py
├── .github
│   └── workflows
│       └── python-publish.yml
├── setup.py
├── .gitignore
├── LICENSE
└── README.md
/CrawlersTools/extractors/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/CrawlersTools/extractors/schemas/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/CrawlersTools/projects/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/15 11:29
4 | # @Author : MuggleK
5 | # @File : __init__.py
6 |
--------------------------------------------------------------------------------
/CrawlersTools/schedules/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/15 11:29
4 | # @Author : MuggleK
5 | # @File : __init__.py
6 |
7 | from CrawlersTools.schedules.auto_thread import AutoThread
8 |
--------------------------------------------------------------------------------
/CrawlersTools/requirements.txt:
--------------------------------------------------------------------------------
1 | auto_mix_prep
2 | DBUtils==1.3
3 | fontTools
4 | httpx
5 | httpx[http2]
6 | loguru
7 | Pillow
8 | PyExecJS==1.5.1
9 | pymongo
10 | PyMySQL
11 | redis
12 | tqdm
13 | PyYAML
14 | lxml
15 | numpy
16 | Distance
17 | chardet
18 | sinan
19 | kafka-python
20 |
--------------------------------------------------------------------------------
/CrawlersTools/logs/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/11 21:05
4 | # @Author : MuggleK
5 | # @File : __init__.py
6 |
7 | from .logger import init_logger
8 | from .handlers import default_handler, logstash_handler
9 |
10 | from CrawlersTools.logs.log import Logging
11 |
--------------------------------------------------------------------------------
/CrawlersTools/preprocess/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/15 11:29
4 | # @Author : MuggleK
5 | # @File : __init__.py
6 |
7 | from CrawlersTools.preprocess.bloom_filter import BloomFilter
8 |
9 | from CrawlersTools.preprocess.time_process import TimeProcessor
10 |
--------------------------------------------------------------------------------
/CrawlersTools/js_crawler/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/15 11:29
4 | # @Author : MuggleK
5 | # @File : __init__.py
6 |
7 | from CrawlersTools.js_crawler.font_decrypt import FontDecrypt
8 | from CrawlersTools.js_crawler.transfer_js import int_overflow, right_shift, string_similar
9 |
--------------------------------------------------------------------------------
/CrawlersTools/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/15 11:29
4 | # @Author : MuggleK
5 | # @File : __init__.py
6 |
7 | from CrawlersTools.pipelines.mongo_pipeline import MongoPipeline
8 |
9 | from CrawlersTools.pipelines.mysql_pipeline import MysqlPipeline
10 |
11 | from CrawlersTools.pipelines.redis_pipeline import RedisPipeline
12 |
--------------------------------------------------------------------------------
/CrawlersTools/requests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/6/21 17:08
4 | # @Author : MuggleK
5 | # @File : __init__.py
6 |
7 | import os
8 |
9 | from CrawlersTools.requests.base_requests import BaseRequests
10 | from CrawlersTools.requests.proxy import get_proxies
11 | from CrawlersTools.requests.random_ua import UserAgent
12 |
13 | PROJECT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
14 |
15 |
16 | base_requests = BaseRequests().base_requests
17 |
--------------------------------------------------------------------------------
/CrawlersTools/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/12 20:48
4 | # @Author : MuggleK
5 | # @File : __init__.py
6 |
7 | from CrawlersTools.extractors import PolicyExtractor, ListExtractor
8 | from CrawlersTools.logs.logger import init_logger
9 | from CrawlersTools.logs import Logging
10 | from CrawlersTools.pipelines import MysqlPipeline, MongoPipeline, RedisPipeline
11 | from CrawlersTools.preprocess import TimeProcessor
12 | from CrawlersTools.requests import base_requests, get_proxies, UserAgent
13 |
--------------------------------------------------------------------------------
/CrawlersTools/requests/proxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/6/21 17:06
4 | # @Author : MuggleK
5 | # @File : proxy.py
6 |
7 | import httpx
8 |
9 | from loguru import logger
10 |
11 |
12 | def get_proxies(proxy_url=None, http2=False):
13 | """
14 | 默认httpx代理模式
15 | @param proxy_url: 代理请求链接
16 | @param http2: 默认http1.1规则
17 | @return:
18 | """
19 | if not proxy_url: return
20 |
21 | protocol = 'http://'
22 | try:
23 | proxy = httpx.get(proxy_url).text.strip()
24 | proxy = protocol + proxy
25 | if http2:
26 | return {protocol: proxy, 'https://': proxy}
27 | return {"http": proxy, "https": proxy}
28 | except Exception as err:
29 |         logger.error(f'Failed to fetch proxy: {err}')
30 |
--------------------------------------------------------------------------------
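Usage sketch for `get_proxies` above, assuming a hypothetical proxy API at `http://proxy.example.com/get` that returns a bare `ip:port` body; with `http2=True` the result uses httpx-style mount keys, which older httpx versions accept as a `proxies` mapping:

```python
import httpx

from CrawlersTools.requests.proxy import get_proxies

# hypothetical proxy API endpoint returning a plain "ip:port" string
proxies = get_proxies(proxy_url="http://proxy.example.com/get", http2=True)
if proxies:
    # {"http://": "http://ip:port", "https://": "http://ip:port"}
    with httpx.Client(proxies=proxies, timeout=10) as client:
        print(client.get("https://httpbin.org/ip").text)
```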
/CrawlersTools/pipelines/redis_pipeline.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/12 9:12
4 | # @Author : MuggleK
5 | # @File : redis_pipeline.py
6 |
7 | import redis
8 |
9 |
10 | class RedisPipeline(object):
11 | def __init__(self, name, namespace, **redis_kwargs):
12 | self.__db = redis.Redis(**redis_kwargs)
13 | self.key = '%s:%s' % (namespace, name)
14 |
15 | def qsize(self):
16 | return self.__db.llen(self.key)
17 |
18 | def put(self, item):
19 | self.__db.rpush(self.key, item)
20 |
21 | def get_wait(self, timeout=None):
22 | item = self.__db.blpop(self.key, timeout=timeout)
23 | return item
24 |
25 | def get_nowait(self):
26 | item = self.__db.lpop(self.key)
27 | return item
28 |
--------------------------------------------------------------------------------
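`RedisPipeline` above wraps a Redis list as a simple FIFO queue (RPUSH to put, BLPOP/LPOP to get). A minimal sketch, assuming a local Redis instance on the default port:

```python
from CrawlersTools.pipelines import RedisPipeline

# "crawler:tasks" becomes the underlying Redis list key
queue = RedisPipeline("tasks", "crawler", host="127.0.0.1", port=6379, db=0)

queue.put('{"url": "https://example.com"}')   # RPUSH
print(queue.qsize())                          # LLEN  -> 1
print(queue.get_wait(timeout=5))              # BLPOP -> (b'crawler:tasks', b'{"url": ...}') or None
```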
/CrawlersTools/js_crawler/transfer_js.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/12 16:59
4 | # @Author : MuggleK
5 | # @File : transfer_js.py
6 |
7 | import ctypes
8 | import difflib
9 |
10 |
11 | def int_overflow(val: int):
12 | """
13 | Process JavaScript nums Overflow
14 | :param val:
15 | :return:
16 | """
17 | maxint = 2147483647
18 | if not -maxint - 1 <= val <= maxint:
19 | val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
20 | return val
21 |
22 |
23 | def right_shift(n, i):
24 | """
25 | Python Operator ">>"
26 | :param n:
27 | :param i:
28 | :return:
29 | """
30 | if n < 0:
31 | n = ctypes.c_uint32(n).value
32 | if i < 0:
33 | return -int_overflow(n << abs(i))
34 | if i != 0:
35 | return int_overflow(n >> i)
36 | else:
37 | return n
38 |
39 |
40 | def string_similar(s1: str, s2: str):
41 | """
42 | Compare Strings Similar Percentage
43 | :param s1:
44 | :param s2:
45 | :return: :float: percentage
46 | """
47 | return difflib.SequenceMatcher(None, s1, s2).quick_ratio()
48 |
--------------------------------------------------------------------------------
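The helpers above emulate JavaScript's 32-bit integer behaviour when porting obfuscation routines to Python; a quick sketch of the expected values:

```python
from CrawlersTools.js_crawler.transfer_js import int_overflow, right_shift, string_similar

# JS: 2147483647 + 1 wraps around to -2147483648 in signed 32-bit arithmetic
print(int_overflow(2147483648))             # -2147483648

# JS zero-fill right shift: -1 >>> 0 == 4294967295, -8 >>> 2 == 1073741822
print(right_shift(-1, 0))                   # 4294967295
print(right_shift(-8, 2))                   # 1073741822

print(string_similar("muggle", "muggles"))  # ~0.92
```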
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on: push
12 |
13 | permissions:
14 | contents: read
15 |
16 | jobs:
17 | deploy:
18 |
19 | runs-on: ubuntu-latest
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Set up Python
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: '3.x'
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | pip install build
31 | - name: Build package
32 | run: python -m build
33 | - name: Publish package
34 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
35 | with:
36 | user: __token__
37 | password: ${{ secrets.PYPI_API_TOKEN }}
38 |
--------------------------------------------------------------------------------
/CrawlersTools/pipelines/kafka_operate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # -*- coding: utf-8 -*-
3 | # @Time : 2022/9/5 15:32
4 | # @Author : xc
5 | # @File : kafka_operate
6 | # @Software: PyCharm
7 |
8 |
9 | import json
10 | from kafka import KafkaProducer
11 | from loguru import logger
12 |
13 |
14 | class KProducer:
15 | def __init__(self, bootstrap_servers, topic):
16 | """
17 | kafka 生产者
18 | :param bootstrap_servers: 地址
19 | :param topic: topic
20 | """
21 | self.producer = KafkaProducer(
22 | bootstrap_servers=bootstrap_servers,
23 |             value_serializer=lambda m: json.dumps(m).encode('ascii'), )  # serialize payloads as JSON
24 | self.topic = topic
25 |
26 | def sync_producer(self, data_li: list):
27 | """
28 | 同步发送 数据
29 | :param data_li: 发送数据
30 | :return:
31 | """
32 | for data in data_li:
33 | future = self.producer.send(self.topic, data)
34 |             record_metadata = future.get(timeout=10)  # block until the send is acknowledged
35 |             partition = record_metadata.partition  # partition the record landed in
36 |             offset = record_metadata.offset  # offset of the record within that partition
37 | logger.success('save success, partition: {}, offset: {}'.format(partition, offset))
38 |
--------------------------------------------------------------------------------
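`KProducer` above is a thin synchronous wrapper around kafka-python's `KafkaProducer`. A minimal sketch, assuming a reachable broker at `localhost:9092` and an existing `crawl_items` topic:

```python
from CrawlersTools.pipelines.kafka_operate import KProducer

producer = KProducer(bootstrap_servers="localhost:9092", topic="crawl_items")
# each dict is JSON-serialized by the producer's value_serializer
producer.sync_producer([
    {"title": "demo", "url": "https://example.com"},
    {"title": "demo2", "url": "https://example.org"},
])
```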
/CrawlersTools/utils/str_compare.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | def str_compare(str1: str, str2: str, half_compare=False) -> bool:
6 | """
7 | 比较两个字符串是否相等,当half_compare为True时会把字符串转为半角字符串之后在比较
8 | 适用:比较两个公司名,原则上 特斯拉(上海)有限公司 和 特斯拉(上海)有限公司 这两个公司是同一个,但是因为一个是全角括号,一个是半角括号,
9 | 直接比较会导致两个公司名不相等,这时候转换为半角在进行比较则不会出现这个问题.
10 | """
11 | str1 = full_str_to_half_str(str1) if half_compare else str1
12 | str2 = full_str_to_half_str(str2) if half_compare else str2
13 | if str1 == str2:
14 | return True
15 | return False
16 |
17 |
18 | def full_str_to_half_str(full_str: str) -> str:
19 |     # convert a full-width string to its half-width equivalent
20 | half_str = ""
21 | for _str in full_str:
22 | _str_code = ord(_str)
23 |         if _str_code == 12288:  # full-width space -> half-width space
24 | _str_code = 32
25 | elif 65281 <= _str_code <= 65374:
26 | _str_code -= 65248
27 | half_str += chr(_str_code)
28 | return half_str
29 |
30 |
31 | if __name__ == '__main__':
32 | print(str_compare('特斯拉(上海)有限公司', '特斯拉(上海)有限公司'))
33 |     print(str_compare('特斯拉(上海)有限公司', '特斯拉(上海)有限公司', half_compare=True))  # full-width parentheses -> half-width
34 | print(str_compare(' 特斯拉(上海)有限公司', ' 特斯拉(上海)有限公司'))
35 |     print(str_compare(' 特斯拉(上海)有限公司', ' 特斯拉(上海)有限公司', half_compare=True))  # full-width space -> half-width space
36 |
--------------------------------------------------------------------------------
/CrawlersTools/logs/handlers.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 |
4 | from .formatters import JsonFormatter
5 |
6 | DEFAULT_HANDLER_FORMAT = "{time:YYYY-MM-DD HH:mm:ss.SSS} | {level:8} | " \
7 | "{name}:{function}:{line} - " \
8 | "{message}"
9 | LOGSTASH_HANDLER_FORMAT = "{message}"
10 |
11 |
12 | def default_handler(level="DEBUG", format=DEFAULT_HANDLER_FORMAT, **kwargs) -> dict:
13 | return dict(sink=sys.stderr, level=level, format=format, **kwargs)
14 |
15 |
16 | class LogstashHandler(logging.StreamHandler):
17 | def __init__(self, formatter=None):
18 | super().__init__()
19 | self.formatter = formatter
20 |
21 |
22 | def logstash_handler(
23 | level="INFO",
24 | format=LOGSTASH_HANDLER_FORMAT,
25 | extra=None,
26 | **kwargs
27 | ) -> dict:
28 | if extra is None:
29 | extra = {}
30 | elif not isinstance(extra, dict):
31 | raise TypeError(
32 | "The 'extra' parameter should be a dict (or None), not: '%s'"
33 | % type(extra).__name__
34 | )
35 |
36 | return dict(
37 | sink=LogstashHandler(JsonFormatter(**extra)),
38 | level=level,
39 | format=format,
40 | **kwargs
41 | )
42 |
--------------------------------------------------------------------------------
/CrawlersTools/extractors/base.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 | from lxml.html import etree
3 | from lxml.html import fromstring
4 |
5 | from CrawlersTools.extractors.schemas.element import Element
6 |
7 |
8 | class BaseExtractor(object):
9 | """
10 | Base Extractor which provide common methods
11 | """
12 |
13 | kwargs = None
14 |
15 | @staticmethod
16 | def to_string(element: Element, limit: int = None):
17 | """
18 | convert element to string
19 | :param element:
20 | :param limit:
21 | :return:
22 | """
23 | result = etree.tostring(element, pretty_print=True, encoding="utf-8", method='html').decode('utf-8')
24 | if limit:
25 | return result[:limit]
26 | return result
27 |
28 | def process(self, element: Element):
29 | """
30 | process method that you should implement
31 | :param element:
32 | :return:
33 | """
34 | logger.error('You must implement process method in your extractor.')
35 | raise NotImplementedError
36 |
37 | def extract(self, html, **kwargs):
38 | """
39 | base extract method, firstly, it will convert html to WebElement, then it call
40 | process method that child class implements
41 | :param html:
42 | :return:
43 | """
44 | self.kwargs = kwargs
45 | element = fromstring(html=html)
46 | element.__class__ = Element
47 | return self.process(element)
48 |
--------------------------------------------------------------------------------
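`BaseExtractor` defines the contract used by every extractor in this package: `extract()` parses the HTML into an `Element` and delegates to the subclass's `process()`. A minimal sketch of a custom extractor following that contract (the `LinkExtractor` name is hypothetical, not part of the library):

```python
from CrawlersTools.extractors.base import BaseExtractor
from CrawlersTools.extractors.schemas.element import Element


class LinkExtractor(BaseExtractor):
    """Hypothetical example: collect absolute links from a page."""

    def process(self, element: Element):
        hrefs = element.xpath("//a/@href")
        return [href.strip() for href in hrefs if href.strip().startswith("http")]


html = "<html><body><a href='https://example.com'>home</a><a href='/about'>about</a></body></html>"
print(LinkExtractor().extract(html))   # ['https://example.com']
```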
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/12 21:00
4 | # @Author : MuggleK
5 | # @File : setup.py
6 |
7 | from setuptools import setup, find_packages
8 |
9 | with open("README.md", "r", encoding='utf-8') as f:
10 | long_description = f.read()
11 |
12 | setup(
13 |     name='CrawlersTools',  # package name
14 |     version='1.4.81',  # version number
15 | description='Tools for Crawlers',
16 | long_description=long_description,
17 | long_description_content_type="text/markdown",
18 | author='MuggleK',
19 | author_email='peichangchuan@gmail.com',
20 | url='https://github.com/MuggleK/CrawlersTools',
21 | install_requires=[
22 | "auto_mix_prep",
23 | "DBUtils==1.3",
24 | "fontTools",
25 | "httpx",
26 | "httpx[http2]",
27 | "loguru",
28 | "Pillow",
29 | "PyExecJS==1.5.1",
30 | "pymongo",
31 | "PyMySQL",
32 | "redis",
33 | "tqdm",
34 | "PyYAML",
35 | "lxml",
36 | "numpy",
37 | "Distance",
38 | "chardet",
39 | "sinan",
40 | "kafka-python"
41 | ],
42 | license='BSD License',
43 | packages=find_packages(where='.', exclude=(), include=('*',)),
44 | platforms=["all"],
45 | classifiers=[
46 | 'Intended Audience :: Developers',
47 | 'Operating System :: OS Independent',
48 | 'Natural Language :: Chinese (Simplified)',
49 | 'Programming Language :: Python :: 3.7',
50 | 'Topic :: Software Development :: Libraries'
51 | ],
52 | )
53 |
--------------------------------------------------------------------------------
/CrawlersTools/extractors/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/15 11:29
4 | # @Author : MuggleK
5 | # @File : __init__.py
6 |
7 | from CrawlersTools.extractors.attachment_extractor import AttachmentExtractor
8 | from CrawlersTools.extractors.content_extractor import ContentExtractor
9 | from CrawlersTools.extractors.list_extractor import ListExtractor
10 | from CrawlersTools.extractors.time_extractor import TimeExtractor
11 | from CrawlersTools.extractors.title_extractor import TitleExtractor
12 |
13 |
14 | class PolicyExtractor(object):
15 |
16 | @staticmethod
17 | def extract(
18 | html,
19 | title_xpath: str = "",
20 | publish_time_xpath: str = "",
21 | content_xpath: str = "",
22 | attachment_xpath: str = "",
23 | attachment_regx: str = ""
24 | ) -> dict:
25 | title = TitleExtractor().extract(html, title_xpath=title_xpath)
26 | publish_time = TimeExtractor().extract(html, publish_time_xpath=publish_time_xpath)
27 | content, content_with_tag, images = ContentExtractor().extract(html, content_xpath=content_xpath)
28 | attachments = AttachmentExtractor().extract(html, attachment_xpath=attachment_xpath, attachment_regx=attachment_regx)
29 |
30 | return {
31 | "title": title,
32 | "publish_time": publish_time,
33 | "content": content,
34 | "content_with_tag": content_with_tag,
35 | "images": images,
36 | "attachment": attachments
37 | }
38 |
--------------------------------------------------------------------------------
/CrawlersTools/logs/log.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/6/21 17:08
4 | # @Author : MuggleK
5 | # @File : log.py
6 |
7 | import time
8 |
9 | from loguru import logger
10 |
11 | time_format = time.strftime("%Y_%m_%d")
12 | log_format = "{time:YYYY-MM-DD HH:mm:ss}|{level}| {name}:{function}:{line}| {message}"
13 |
14 |
15 | class Logging(object):
16 | """
17 | Usage::
18 |
19 | # >>>
20 | # >>> logger = Logging('logs')
21 | # >>> logger.info('Logging Example')
22 | # 2022-01-20 17:27:32.194 | INFO | __main__:info:149 - Logging Example
23 | # >>>
24 | """
25 |
26 | __instance = None
27 |
28 | def __new__(cls, log_path, *args, **kwargs):
29 | if not cls.__instance:
30 | cls.__instance = super(Logging, cls).__new__(cls, *args, **kwargs)
31 |
32 | return cls.__instance
33 |
34 | def __init__(self, log_path, expire_date="10 days"):
35 | logger.add(f"{log_path}/log_{time_format}_info.log", encoding="utf-8", enqueue=True, retention="1 months", level="INFO", format=log_format)
36 | logger.add(f"{log_path}/log_{time_format}_error.log", encoding="utf-8", enqueue=True, retention=expire_date, level="ERROR", format=log_format)
37 | logger.add(f"{log_path}/log_{time_format}_debug.log", encoding="utf-8", enqueue=True, retention=expire_date, level="DEBUG", format=log_format)
38 |
39 | @staticmethod
40 | def info(msg):
41 | return logger.info(msg)
42 |
43 | @staticmethod
44 | def debug(msg):
45 | return logger.debug(msg)
46 |
47 | @staticmethod
48 | def warning(msg):
49 | return logger.warning(msg)
50 |
51 | @staticmethod
52 | def error(msg):
53 | return logger.error(msg)
54 |
55 | @staticmethod
56 | def success(msg):
57 | return logger.success(msg)
58 |
--------------------------------------------------------------------------------
/CrawlersTools/projects/filters.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/12 14:48
4 | # @Author : MuggleK
5 | # @File : filters.py
6 |
7 | import re
8 | from functools import reduce
9 | from urllib.parse import urlparse
10 |
11 | from loguru import logger
12 |
13 |
14 | def empty_text(lis):
15 | word = ""
16 | for i in lis:
17 | word += i.strip()
18 | return word
19 |
20 |
21 | def filter_title(title: str, remove_list: list):
22 | """
23 |     Decide whether a title should be filtered out by the configured keywords.
24 |     :param title: article title
25 |     :param remove_list: list of filter keywords; "and" inside an entry joins keywords that must all appear
26 |     :return: True if the title hits a filter rule, False if the title is empty
27 | """
28 | if not title:
29 | return False
30 | for r in remove_list:
31 | and_lists = r.split("and")
32 | if len(and_lists) == 1:
33 | if and_lists[0] in title:
34 | logger.debug(f"过滤标题: {title} 过滤词: {r}")
35 | return True
36 | else:
37 | total = [1 for a in and_lists if a in title]
38 |             result = reduce(lambda x, y: x + y, total, 0)  # initial value 0 so an empty match list does not raise
39 | if len(and_lists) != result:
40 | continue
41 | return True
42 |
43 |
44 | def filter_text(text, removes: list):
45 | """
46 |     :param text: body text
47 |     :param removes: special fragments to strip, e.g. 扫一扫, 【关闭】, 【打印】
48 | :return:
49 | """
50 | if removes:
51 | for remove in removes:
52 | text = text.replace(remove, '')
53 | return text
54 |
55 |
56 | def filter_allowed_url(url, main_url, other_domains):
57 | other_domains = other_domains if other_domains else []
58 | main_url = main_url[0] if isinstance(main_url, list) else main_url
59 | allowed_domains = [urlparse(main_url).netloc] + other_domains if urlparse(main_url).netloc else other_domains + [main_url]
60 | for domain in allowed_domains:
61 | if (not domain) or re.search(domain, url):
62 | return True
63 |
--------------------------------------------------------------------------------
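A short sketch of how the filter helpers above behave; note that `"and"` inside a `remove_list` entry joins keywords that must all appear in the title (the rules below are hypothetical):

```python
from CrawlersTools.projects.filters import filter_title, filter_text

remove_words = ["招聘", "中标and公示"]

print(filter_title("2022年度公开招聘公告", remove_words))    # True -> filtered (contains 招聘)
print(filter_title("项目中标结果公示", remove_words))        # True -> filtered (中标 and 公示 both present)
print(filter_title("关于印发管理办法的通知", remove_words))  # None -> kept

print(filter_text("正文内容 扫一扫 【打印】", ["扫一扫", "【打印】"]))   # "正文内容  "
```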
/CrawlersTools/extractors/utils/cluster.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | from CrawlersTools.extractors.utils.similarity import similarity
4 |
5 |
6 | def cluster(items, threshold=0.9):
7 | """
8 | cluster names
9 | :param items:
10 | :param threshold:
11 | :return: cluster map, for example {"foo": 0, "bar": 1}
12 | """
13 | number = -1
14 | clusters_map = {}
15 | clusters = []
16 | for name in items:
17 | for c in clusters:
18 | if all(similarity(name, w) > threshold for w in c):
19 | c.append(name)
20 | clusters_map[name] = number
21 | break
22 | else:
23 | number += 1
24 | clusters.append([name])
25 | clusters_map[name] = number
26 | return clusters_map
27 |
28 |
29 | def cluster_dict(data: dict, threshold=0.8):
30 | """
31 | cluster dict, convert id key to cluster id key
32 | :param threshold:
33 | :param data:
34 | :return:
35 | """
36 | ids = data.keys()
37 | clusters_map = cluster(ids, threshold)
38 | result = defaultdict(list)
39 | for k, v in data.items():
40 | if isinstance(v, list):
41 | for i in v:
42 | result[clusters_map[k]].append(i)
43 | else:
44 | result[clusters_map[k]].append(v)
45 | return dict(result)
46 |
47 |
48 | if __name__ == '__main__':
49 | data = {
50 | '/html/body/div[@class="main"]/div[1]/ul': ['child1', 'child2', 'child3'],
51 | '/html/body/div[@class="main"]/div[2]/ul': ['child4', 'child5', 'child6'],
52 | '/html/body/div[@class="main"]/div[3]/ul': ['child7', 'child8', 'child9'],
53 | '/html/body/header/div[1]': ['child10', 'child11', 'child12'],
54 | '/html/body/header/div[2]': ['child13', 'child14', 'child15'],
55 | }
56 | print(cluster_dict(data, threshold=0.7))
57 |
--------------------------------------------------------------------------------
/CrawlersTools/extractors/attachment_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/12/21 10:54
4 | # @Author : MuggleK
5 | # @File : attachment_extractor.py
6 |
7 | import re
8 |
9 | from CrawlersTools.extractors.base import BaseExtractor
10 | from CrawlersTools.extractors.schemas.element import Element
11 | from CrawlersTools.extractors.utils.settings import ATTACHMENT_REGX
12 |
13 |
14 | class AttachmentExtractor(BaseExtractor):
15 | """
16 |     extract attachments from a detail page
17 | """
18 |
19 | def process(self, element: Element):
20 | """
21 | extract content from html
22 | :param element:
23 | :return:
24 | """
25 | attachment_list = list()
26 | attachment_xpath = self.kwargs.get("attachment_xpath") or "//a"
27 | for attachment_element in element.xpath(attachment_xpath):
28 | url = [i.strip() for i in attachment_element.xpath("@href") or attachment_element.xpath("@src")]
29 | name = [i.strip() for i in attachment_element.xpath(".//text()")]
30 | if not (''.join(url).strip() and ''.join(name).strip()):
31 | continue
32 | suffix = self.filter_suffix(url[0], name[0])
33 | if not suffix: continue
34 | attachment_list.append({
35 | "file_url": url[0],
36 | "file_name": name[0]
37 | })
38 | return attachment_list
39 |
40 | def filter_suffix(self, url, name):
41 | """
42 |         Names like "附件.xls.doc" can be uploaded; the upload API takes the last suffix by default.
43 |         The suffix found in the file name wins, with the file_url suffix as a fallback.
44 | """
45 | regx = self.kwargs.get("attachment_regx") or ATTACHMENT_REGX
46 | is_name_suffix = re.search(regx, name, re.I)
47 | is_url_suffix = re.search(regx, url, re.I)
48 | name_suffix = is_name_suffix.group(1) if is_name_suffix else ""
49 | url_suffix = is_url_suffix.group(1) if is_url_suffix else ""
50 |
51 | return name_suffix or url_suffix
52 |
--------------------------------------------------------------------------------
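A usage sketch for `AttachmentExtractor`; the xpath and suffix pattern below are hypothetical arguments (when omitted, the extractor falls back to `//a` and the package's `ATTACHMENT_REGX` default):

```python
from CrawlersTools.extractors.attachment_extractor import AttachmentExtractor

html = """
<div class="content">
  <a href="/files/report.pdf">年度报告.pdf</a>
  <a href="/news/detail.html">下一篇</a>
</div>
"""

suffix_regx = r"\.(pdf|doc|docx|xls|xlsx|zip|rar)"   # group(1) is the suffix that filter_suffix inspects
attachments = AttachmentExtractor().extract(
    html, attachment_xpath="//div[@class='content']//a", attachment_regx=suffix_regx
)
print(attachments)   # [{'file_url': '/files/report.pdf', 'file_name': '年度报告.pdf'}]
```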
/CrawlersTools/logs/logger.py:
--------------------------------------------------------------------------------
1 | """
2 | - intercept stdlib logging records and forward them to loguru
3 | - emit logs as trimmed JSON with customizable fields
4 | - combined and standalone helpers for interception and output
5 | """
6 | import time
7 | from typing import List, Optional, Dict
8 |
9 | from loguru import logger
10 |
11 | from .handlers import default_handler, DEFAULT_HANDLER_FORMAT
12 |
13 | TIME_FORMAT = time.strftime("%Y_%m_%d")
14 |
15 |
16 | def init_logger(
17 | handlers: Optional[List[Dict]] = None,
18 | add_file_handler: bool = False,
19 | log_path: str = "./",
20 | file_handler_level: str = "DEBUG",
21 | file_handler_format: str = DEFAULT_HANDLER_FORMAT,
22 | **kwargs
23 | ):
24 | """
25 |     One-call loguru setup; application code can keep using `from loguru import logger` as usual.
26 | 
27 |     :param handlers: handlers passed to loguru.configure; defaults to [default_handler()], other presets can be imported from .handlers
28 |     :param add_file_handler: when True, also add a default file-output handler
29 |     :param log_path: directory for the log file, defaults to the current directory
30 |     :param file_handler_level: log level of the file handler, defaults to DEBUG
31 |     :param file_handler_format: log format of the file handler, defaults to DEFAULT_HANDLER_FORMAT
32 |     :param kwargs: extra keyword arguments forwarded to logger.configure
33 | """
34 | if handlers is None:
35 | handlers = [default_handler()]
36 | elif not isinstance(handlers, list):
37 | raise TypeError(
38 | "The 'handlers' parameter should be a list (or None), not: '%s'"
39 | % type(handlers).__name__
40 | )
41 |
42 | extra = kwargs.pop("extra", {})
43 | if not isinstance(extra, dict):
44 | raise TypeError(
45 | "The 'extra' parameter should be a dict (or None), not: '%s'"
46 | % type(extra).__name__
47 | )
48 |
49 | logger.configure(handlers=handlers, extra=extra, **kwargs)
50 |
51 | if add_file_handler:
52 | expire_date = kwargs.pop("expire_date", "1 days")
53 | logger.add(
54 | f"{log_path}/log_{TIME_FORMAT}_{file_handler_level.lower()}.log",
55 | encoding="UTF-8", enqueue=True, retention=expire_date,
56 | level=file_handler_level, format=file_handler_format
57 | )
58 |
--------------------------------------------------------------------------------
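A minimal `init_logger` sketch: keep the default console handler at INFO and add a file handler under `./logs/` (the `logstash_handler` preset from `handlers.py` can be passed the same way to emit JSON for a log collector):

```python
from loguru import logger

from CrawlersTools.logs import init_logger, default_handler

init_logger(
    handlers=[default_handler(level="INFO")],  # console output
    add_file_handler=True,                     # plus ./logs/log_<date>_debug.log
    log_path="./logs",
    file_handler_level="DEBUG",
)

logger.info("configured via init_logger")
logger.debug("written to the file handler only")
```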
/CrawlersTools/extractors/time_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/11/3 9:55
4 | # @Author : MuggleK
5 | # @File : time_extractor.py
6 |
7 | import re
8 |
9 | from lxml.html import etree
10 |
11 | from CrawlersTools.extractors.base import BaseExtractor
12 | from CrawlersTools.extractors.schemas.element import Element
13 | from CrawlersTools.extractors.utils.settings import DATETIME_PATTERN, PUBLISH_TIME_META, TITLE_EXTRACTOR_USELESS_TAGS
14 | from CrawlersTools.preprocess import TimeProcessor
15 |
16 | format_time = TimeProcessor().format
17 |
18 |
19 | class TimeExtractor(BaseExtractor):
20 |
21 | @staticmethod
22 | def extract_from_xpath(element: Element, publish_time_xpath: str) -> str:
23 | if publish_time_xpath:
24 | publish_time = ''.join(element.xpath(publish_time_xpath))
25 | return format_time(publish_time)
26 | return ''
27 |
28 | @staticmethod
29 | def extract_from_text(element: Element) -> str:
30 | text = ''.join(element.xpath('.//text()'))
31 | for dt in DATETIME_PATTERN:
32 | dt_obj = re.search(dt, text)
33 |             if dt_obj:
34 |                 return format_time(dt_obj.group(1))
35 |         # none of the patterns matched
36 |         return ''
37 |
38 | @staticmethod
39 | def extract_from_meta(element: Element) -> str:
40 | """
41 |         Prefer a publish time declared in META tags
42 |         :param element: DOM tree of the page source
43 |         :return: str
44 | """
45 | for xpath in PUBLISH_TIME_META:
46 | publish_time = element.xpath(xpath)
47 | if publish_time:
48 | return format_time(''.join(publish_time))
49 | return ''
50 |
51 | def process(self, element: Element):
52 | # remove tag and its content
53 | etree.strip_elements(element, *TITLE_EXTRACTOR_USELESS_TAGS)
54 |
55 | publish_time = (self.extract_from_xpath(element, publish_time_xpath=self.kwargs.get("publish_time_xpath"))
56 | or self.extract_from_meta(element)
57 | or self.extract_from_text(element))
58 |
59 | return publish_time
60 |
--------------------------------------------------------------------------------
/CrawlersTools/extractors/utils/similarity.py:
--------------------------------------------------------------------------------
1 | import distance
2 |
3 |
4 | def similarity1(s1, s2):
5 | """
6 | get similarity of two strings
7 | :param s1:
8 | :param s2:
9 | :return:
10 | """
11 | if not s1 or not s2:
12 | return 0
13 | edit_distance = distance.levenshtein(s1, s2)
14 | similarity_score = 1 - edit_distance / (len(s1) + len(s2))
15 | return similarity_score
16 |
17 |
18 | def similarity2(s1, s2):
19 | """
20 | get similarity of two strings
21 | :param s1:
22 | :param s2:
23 | :return:
24 | """
25 | if not s1 or not s2:
26 | return 0
27 | s1_set = set(list(s1))
28 | s2_set = set(list(s2))
29 | intersection = s1_set.intersection(s2_set)
30 | union = s2_set.union(s1_set)
31 | return len(intersection) / len(union)
32 |
33 |
34 | def similarity(s1, s2):
35 | """
36 | get similarity of two strings
37 | :param s1:
38 | :param s2:
39 | :return:
40 | """
41 | return similarity2(s1, s2)
42 |
43 |
44 | def get_longest_common_sub_string(str1: str, str2: str) -> str:
45 | """
46 | get longest common string
47 | :param str1:
48 | :param str2:
49 | :return:
50 | """
51 | if not all([str1, str2]):
52 | return ''
53 | matrix = [[0] * (len(str2) + 1) for _ in range(len(str1) + 1)]
54 | max_length = 0
55 | start_position = 0
56 | for index_of_str1 in range(1, len(str1) + 1):
57 | for index_of_str2 in range(1, len(str2) + 1):
58 | if str1[index_of_str1 - 1] == str2[index_of_str2 - 1]:
59 | matrix[index_of_str1][index_of_str2] = matrix[index_of_str1 - 1][index_of_str2 - 1] + 1
60 | if matrix[index_of_str1][index_of_str2] > max_length:
61 | max_length = matrix[index_of_str1][index_of_str2]
62 | start_position = index_of_str1 - max_length
63 | else:
64 | matrix[index_of_str1][index_of_str2] = 0
65 | return str1[start_position: start_position + max_length]
66 |
67 |
68 | if __name__ == '__main__':
69 | s1 = 'hello'
70 | s2 = 'world'
71 | print(similarity(s1, s2))
72 |
--------------------------------------------------------------------------------
/CrawlersTools/pipelines/mongo_pipeline.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/12 9:12
4 | # @Author : MuggleK
5 | # @File : mongo_pipeline.py
6 |
7 | from pymongo import MongoClient
8 |
9 |
10 | class MongoPipeline:
11 | """
12 | A Mongo Pipeline to Create or Insert or Update or Delete Collection
13 |
14 | Usage:
15 |
16 | ```python
17 | >>> mongo_client = MongoPipeline()
18 |     >>> record = mongo_client.find_one("test_collection", {"company_name": "qzd"})
19 | ```
20 | """
21 |
22 | collection = None
23 | conn = None
24 |
25 | def __init__(self, host="127.0.0.1", port="27017", username="root", password="root", database="crawl_data"):
26 |
27 | self.server = '''mongodb://%s:%s@%s:%s/%s''' % (username, password, host, port, database)
28 | self.client = MongoClient(host=self.server, readPreference="secondaryPreferred")
29 | self.db = self.client.get_database(database)
30 |
31 | def close(self):
32 | return self.client.close()
33 |
34 | def set_collection(self, name):
35 | self.collection = self.db.get_collection(name)
36 |
37 | def find(self, collection_name, query=None, ref_query=None):
38 | """
39 | from query phrase to find docs
40 |
41 | :param collection_name:
42 | :param query: query phrase
43 |         :param ref_query: projection (fields to include or exclude)
44 | :return:
45 | """
46 | records = self.db.get_collection(collection_name).find(query, ref_query)
47 | return records
48 |
49 | def find_one(self, collection_name, query=None, ref_query=None):
50 | records = self.db.get_collection(collection_name).find_one(query, ref_query)
51 | return records
52 |
53 | def update(self, collection_name, query, update, many=False):
54 | if many:
55 | self.db.get_collection(collection_name).update_many(query, update, upsert=True)
56 | return
57 | self.db.get_collection(collection_name).update_one(query, update, upsert=True)
58 |
59 | def aggregate(self, collection_name, query):
60 | records = self.db.get_collection(collection_name).aggregate(query)
61 | for record in records:
62 | yield record
63 |
--------------------------------------------------------------------------------
/CrawlersTools/logs/formatters.py:
--------------------------------------------------------------------------------
1 | import json
2 | from logging import Formatter
3 | from typing import Tuple, List, Optional, Union
4 |
5 | EXTRA_IGNORE_FIELDS_DEFAULT = (
6 | "name",
7 | "msg",
8 | "args",
9 | "levelno",
10 | "pathname",
11 | "filename",
12 | "module",
13 | "exc_info",
14 | "exc_text",
15 | "stack_info",
16 | "lineno",
17 | "funcName",
18 | "created",
19 | "msecs",
20 | "relativeCreated",
21 | "thread",
22 | "threadName",
23 | "processName",
24 | "process",
25 | )
26 |
27 |
28 | class JsonFormatter(Formatter):
29 | """格式化日志到Json,并删除某些字段"""
30 |
31 | def __init__(
32 | self,
33 | extra_ignore_keys: Optional[Union[List[str], Tuple[str]]] = EXTRA_IGNORE_FIELDS_DEFAULT,
34 | with_timestamp: bool = True,
35 | **kwargs
36 | ):
37 | """
38 |         :param extra_ignore_keys: fields to exclude from record.__dict__ (the record's extra info)
39 |         :param kwargs: key/value pairs added to every formatted message, e.g. app=explore
40 | """
41 | super(JsonFormatter, self).__init__()
42 | self.extra_ignore_keys = extra_ignore_keys
43 | self.with_timestamp = with_timestamp
44 | self.kwargs = kwargs
45 |
46 | def formatException(self, exc_info):
47 | exc_text = super(JsonFormatter, self).formatException(exc_info)
48 | return repr(exc_text)
49 |
50 | def format(self, record):
51 | message = {
52 | **self.kwargs,
53 | **self.get_extra_info(record),
54 | }
55 | if self.with_timestamp:
56 | message.update({"timestamp": self.format_timestamp(record.created)})
57 |
58 | if record.exc_info:
59 | message["message"] = self.formatException(record.exc_info)
60 | message["stack_trace"] = "".join(record.getMessage().split("\n"))
61 | else:
62 | message["message"] = record.getMessage()
63 |
64 | return json.dumps(message)
65 |
66 | @classmethod
67 | def format_timestamp(cls, time):
68 | return int(time * 1000)
69 |
70 | def get_extra_info(self, record):
71 | return {
72 | attr_name: record.__dict__[attr_name]
73 | for attr_name in record.__dict__
74 | if attr_name not in self.extra_ignore_keys
75 | }
76 |
--------------------------------------------------------------------------------
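`JsonFormatter` also works on a plain standard-library handler; a sketch where the extra keyword becomes a static field on every record (`app` is an arbitrary example name):

```python
import logging

from CrawlersTools.logs.formatters import JsonFormatter

handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter(app="crawler-demo"))

log = logging.getLogger("demo")
log.addHandler(handler)
log.setLevel(logging.INFO)

log.info("hello")
# -> {"app": "crawler-demo", ..., "timestamp": <ms epoch>, "message": "hello"}
```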
/CrawlersTools/preprocess/time_process.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/12 14:48
4 | # @Author : MuggleK
5 | # @File : time_process.py
6 |
7 | import re
8 | from datetime import datetime, timedelta
9 |
10 | from sinan import Sinan
11 |
12 | from CrawlersTools.projects.filters import empty_text
13 |
14 |
15 | class TimeProcessor:
16 |
17 | datetime_pattern = r"([0-9]{4}).*?([0-1]{0,1}[0-9]).*?([0-3]{0,1}[0-9])"
18 |
19 | def __init__(self):
20 | self.fmt = "%Y-%m-%d" # 暂时只处理年月日
21 |
22 | def format(self, string, struct=False):
23 | string = empty_text(string)
24 | try:
25 | return self.process_timestamp(string, struct)
26 | except ValueError:
27 | # print(f"非时间戳格式:{string}")
28 | pass
29 |
30 |         date = Sinan(string).parse(display_status=False).get("datetime", [""])[0].split(' ')[0]  # drop the unreliable time-of-day part
31 | if not date:
32 | re_res = re.search(self.datetime_pattern, string)
33 | if re_res is not None:
34 | date = f"{re_res.group(1)}-{re_res.group(2)}-{re_res.group(3)}"
35 | else:
36 |                 # give up if no date can be extracted or the string does not match datetime_pattern
37 | return
38 |
39 | if struct:
40 | return datetime.strptime(date, self.fmt)
41 | return date
42 |
43 | def process_timestamp(self, timestamp, struct):
44 | timestamp = int(str(timestamp)[:10])
45 | source_time = datetime(1970, 1, 1)
46 | struct_time = (
47 | datetime.fromtimestamp(timestamp) if timestamp >= 0 else source_time + timedelta(seconds=timestamp)
48 | )
49 | if struct:
50 | return struct_time
51 | return struct_time.strftime(self.fmt)
52 |
53 | def compare_date(self, time_min, time_max) -> bool:
54 | if not (time_min and time_max):
55 | return False
56 |
57 | time_min_format = time_min if isinstance(time_min, datetime) else self.format(time_min, struct=True)
58 | time_max_format = time_max if isinstance(time_max, datetime) else self.format(time_max, struct=True)
59 | if not (time_min_format and time_max_format):
60 | return False
61 |
62 | if time_min_format.date() <= time_max_format.date():
63 | return True
64 | return False
65 |
--------------------------------------------------------------------------------
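A quick sketch of `TimeProcessor`: unix timestamps are handled directly, while free-form Chinese date strings go through the `sinan` parser with a regex fallback, so exact output depends on that parser:

```python
from CrawlersTools.preprocess import TimeProcessor

tp = TimeProcessor()

print(tp.format("1660532940"))                       # timestamp string -> "2022-08-15" (local time)
print(tp.format("发布时间:2022年8月15日 11:29"))      # expected "2022-08-15"
print(tp.format("2022/8/15", struct=True))           # datetime(2022, 8, 15, 0, 0)
print(tp.compare_date("2022-08-12", "2022-08-15"))   # True: min date is not later than max date
```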
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # xml
132 | .xml
133 |
134 | /.idea
135 | test/
--------------------------------------------------------------------------------
/CrawlersTools/preprocess/bloom_filter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/12 14:35
4 | # @Author : MuggleK
5 | # @File : bloom_filter.py
6 |
7 | import hashlib
8 |
9 |
10 | def sha1(data):
11 | """
12 | BloomFilter fingerprint Function
13 | """
14 | hash_object = hashlib.sha1(data.encode('utf-8'))
15 | hex_dig = hash_object.hexdigest()
16 | return hex_dig
17 |
18 |
19 | class SimpleHash(object):
20 | """
21 | BloomFilter Hash Function
22 | """
23 | def __init__(self, cap, seed):
24 | self.cap = cap
25 | self.seed = seed
26 |
27 | def hash(self, value):
28 | ret = 0
29 | for i in range(len(value)):
30 | ret += self.seed * ret + ord(value[i])
31 | return (self.cap - 1) & ret
32 |
33 |
34 | class BloomFilter(object):
35 | """
36 | Usage::
37 |
38 | # >>> bf = BloomFilter(server, key, block_num=1) # you can increase block_num if you are filtering too many urls
39 |     # ... if bf.is_contains(fp):
40 |     # ...     print(f"{fp} already exists")
41 | # ... else:
42 | # ... bf.insert(fp)
43 | # >>>
44 |
45 | """
46 | def __init__(self, server, key, block_num=1, filter_level=0):
47 | """
48 |
49 | :param server: Redis Server
50 | :param key: Redis Key
51 | :param block_num:
52 |         :param filter_level: data magnitude. 0: fewer than 1,000,000 items in total; 1: more than 1,000,000
53 | """
54 | self.bit_size = 1 << 31 if filter_level else 1 << 29
55 | self.seeds = [5, 7, 11, 13, 31] if filter_level else [5, 7, 11, 13, 31, 37, 61]
56 | self.server = server
57 | self.key = key
58 | self.block_num = block_num
59 | self.hash_func = []
60 | for seed in self.seeds:
61 | self.hash_func.append(SimpleHash(self.bit_size, seed))
62 |
63 | def is_contains(self, str_input) -> bool:
64 | """
65 |         :param str_input: source string
66 | :return:
67 | """
68 | if not str_input:
69 | return False
70 | ret = True
71 |
72 | fp = sha1(str_input)
73 | name = f"{self.key}{str(int(fp[0:2], 16) % self.block_num)}"
74 | for f in self.hash_func:
75 | loc = f.hash(str_input)
76 | ret = ret & self.server.getbit(name, loc)
77 | return bool(ret)
78 |
79 | def insert(self, str_input):
80 | """
81 |         :param str_input: source string
82 | :return:
83 | """
84 | fp = sha1(str_input)
85 | name = f"{self.key}{str(int(fp[0:2], 16) % self.block_num)}"
86 | for f in self.hash_func:
87 | loc = f.hash(str_input)
88 | self.server.setbit(name, loc, 1)
89 |
--------------------------------------------------------------------------------
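A usage sketch for `BloomFilter`, assuming a local Redis server; `block_num` shards the bitmap across several keys when the fingerprint set grows large:

```python
import redis

from CrawlersTools.preprocess import BloomFilter

server = redis.Redis(host="127.0.0.1", port=6379, db=0)
bf = BloomFilter(server, key="bf:urls", block_num=1)

url = "https://example.com/item/1"
if bf.is_contains(url):
    print("seen before, skip")
else:
    bf.insert(url)
    print("new url, crawl it")
```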
/CrawlersTools/extractors/title_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/19 20:23
4 | # @Author : MuggleK
5 | # @File : title_extractor.py
6 |
7 | import re
8 | from itertools import combinations
9 |
10 | from lxml.html import etree
11 |
12 | from CrawlersTools.extractors.base import BaseExtractor
13 | from CrawlersTools.extractors.schemas.element import Element
14 | from CrawlersTools.extractors.utils.settings import (
15 | TITLE_HTAG_XPATH, TITLE_META_XPATH, TITLE_META_XPATH_BAK, TITLE_EXTRACTOR_USELESS_TAGS, PUNCTUATION_ALPHA_PATTERN
16 | )
17 | from CrawlersTools.extractors.utils.similarity import get_longest_common_sub_string
18 |
19 |
20 | class TitleExtractor(BaseExtractor):
21 |
22 | @staticmethod
23 | def extract_by_xpath(element, title_xpath):
24 | if title_xpath:
25 | title_list = element.xpath(title_xpath)
26 | if title_list:
27 | return title_list[0]
28 | return ''
29 |
30 | @staticmethod
31 | def extract_by_title(element):
32 | title_list = element.xpath(TITLE_META_XPATH) or element.xpath(TITLE_META_XPATH_BAK)
33 | if title_list:
34 | return max(title_list, key=len)
35 | else:
36 | return ''
37 |
38 | @staticmethod
39 | def extract_by_htag(element):
40 | title_list = element.xpath(TITLE_HTAG_XPATH)
41 | title_list = [re.sub(PUNCTUATION_ALPHA_PATTERN, "", phrase) for phrase in title_list]
42 | if not title_list:
43 | return ''
44 | index_string = [(index, ''.join(filter(str.isalnum, string))) for index, string in enumerate(title_list)]
45 | string_list = [i[1] for i in index_string]
46 | max_string = max(string_list, key=len)
47 | return title_list[string_list.index(max_string)]
48 |
49 | @staticmethod
50 | def extract_common_str(element: Element) -> str:
51 | h_tag_texts_list = element.xpath(TITLE_HTAG_XPATH)
52 | new_title_list = list(combinations(h_tag_texts_list, 2))
53 | if len(new_title_list) == 1:
54 | new_title = str(max(list(new_title_list[0]), key=len))
55 | return new_title
56 |
57 | common_title_list = [get_longest_common_sub_string(i[0], i[1]).strip() for i in new_title_list]
58 | if common_title_list:
59 | new_title = max(common_title_list, key=len)
60 | sub_string = re.sub(r'\d+', '', ''.join(filter(str.isalnum, new_title)))
61 | return new_title if len(new_title) > 4 and sub_string else ''
62 | return ''
63 |
64 | def process(self, element: Element):
65 | # remove tag and its content
66 | etree.strip_elements(element, *TITLE_EXTRACTOR_USELESS_TAGS)
67 |
68 | title = (self.extract_by_xpath(element, title_xpath=self.kwargs.get("title_xpath"))
69 | or self.extract_by_title(element)
70 | or self.extract_common_str(element)
71 | or self.extract_by_htag(element)
72 | )
73 | return title.strip()
74 |
--------------------------------------------------------------------------------
/CrawlersTools/schedules/auto_thread.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/6/21 17:19
4 | # @Author : MuggleK
5 | # @File : auto_thread.py
6 |
7 | import time
8 | from threading import Lock, Thread, active_count
9 | from traceback import format_exc
10 |
11 | from loguru import logger
12 | from tqdm import tqdm
13 |
14 | thread_lock = Lock()
15 |
16 |
17 | class ExcThread(Thread):
18 | """
19 |     Catch and log exceptions raised inside child threads
20 | """
21 |
22 | def __init__(self, target, args=(), kwargs=None):
23 | super(ExcThread, self).__init__()
24 | self._target = target
25 | self._args = args
26 | self._kwargs = kwargs or dict()
27 |
28 | def run(self):
29 | try:
30 | if self._target:
31 | self._target(*self._args, **self._kwargs)
32 | except:
33 | logger.error(f'self._target:{self._target} args:{self._args} kwargs:{self._kwargs},{format_exc()}')
34 |
35 |
36 | class AutoThread(object):
37 | """
38 |     Dynamic thread scheduling; the task queue passed in can be a list (converted internally) or a generator
39 | usage:
40 | a_thread = AutoThread(20, fun, arg_list)
41 | a_thread.main_thread()
42 |
43 |     ps: three concurrency modes are supported: 1. one function over many args  2. many functions with one arg  3. functions paired with args
44 | """
45 |
46 | def __init__(self, thread_num: int, fun, arg_list=None):
47 | self.thread_num = thread_num
48 | if isinstance(fun, tuple): fun = list(fun)
49 | if isinstance(arg_list, tuple): arg_list = list(arg_list)
50 |         self.fun_list = fun if callable(fun) else list(fun)  # the callable target must be a function, or a list/tuple of functions
51 | self.arg_list = arg_list
52 | self.os_threads = active_count()
53 |
54 | def process_task(self):
55 | if callable(self.fun_list):
56 | # 1.并发函数
57 | tasks = [{'fun': self.fun_list, 'args': arg} for arg in self.arg_list]
58 | elif isinstance(self.fun_list, list) and not isinstance(self.arg_list, list):
59 | # 2.并发传参
60 | tasks = [{'fun': fun, 'args': self.arg_list} for fun in self.fun_list]
61 | else:
62 |             assert len(self.fun_list) == len(self.arg_list), 'function list and arg list lengths differ'
63 |             # 3. functions paired with args one-to-one
64 | tasks = [{'fun': fun, 'args': arg} for fun, arg in zip(self.fun_list, self.arg_list)]
65 | return tasks
66 |
67 | def wait(self):
68 | """
69 |         Wait for all worker threads to finish by comparing the live thread count with (main thread + tqdm thread)
70 | """
71 | while active_count() > self.os_threads + 1:
72 | time.sleep(.25)
73 |
74 | def main_thread(self):
75 | loop_flag = True
76 | tasks = self.process_task()
77 | with tqdm(total=len(tasks)) as pbar:
78 | while loop_flag:
79 | active_thread = active_count()
80 | if active_thread >= self.thread_num:
81 | time.sleep(.25)
82 | continue
83 | for _ in range(self.thread_num - active_thread + self.os_threads):
84 | thread_lock.acquire()
85 | task = tasks.pop() if tasks else None
86 | thread_lock.release()
87 | if task is None:
88 | loop_flag = False
89 | break
90 | child_thread = ExcThread(target=task["fun"]) if task["args"] is None else ExcThread(
91 | target=task["fun"], args=(task["args"],))
92 |
93 | child_thread.start()
94 | pbar.update(1)
95 |
--------------------------------------------------------------------------------
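A usage sketch for `AutoThread`: one worker function fanned out over a list of arguments, with `wait()` blocking until every child thread has finished:

```python
import time

from CrawlersTools.schedules import AutoThread


def crawl(url):
    time.sleep(0.2)          # stand-in for a real request
    print("done:", url)


urls = [f"https://example.com/page/{i}" for i in range(10)]

worker = AutoThread(5, crawl, urls)   # at most 5 live threads
worker.main_thread()                  # schedules tasks behind a tqdm progress bar
worker.wait()                         # block until all child threads exit
```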
/CrawlersTools/extractors/content_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/11/15 9:18
4 | # @Author : MuggleK
5 | # @File : content_extractor.py
6 |
7 | from copy import deepcopy
8 |
9 | import numpy as np
10 | from lxml.html import fromstring, HtmlElement
11 |
12 | from CrawlersTools.extractors.base import BaseExtractor
13 | from CrawlersTools.extractors.schemas.element import Element
14 | from CrawlersTools.extractors.utils.element import descendants_of_body
15 | from CrawlersTools.extractors.utils.preprocess import preprocess4content_extractor
16 | from CrawlersTools.extractors.utils.settings import SPECIAL_SYMBOL_MAP, ERROR_NAV_LIST
17 |
18 |
19 | class ContentExtractor(BaseExtractor):
20 | """
21 | extract content from detail page
22 | """
23 |
24 | def process(self, element: Element):
25 | """
26 | extract content from html
27 | :param element:
28 | :return:
29 | """
30 | source_element = deepcopy(element)
31 | source_element.__class__ = Element
32 |
33 | # preprocess
34 | preprocess4content_extractor(element)
35 |
36 | # start to evaluate every child element
37 | descendants = descendants_of_body(element)
38 |
39 | # get std of density_of_text among all elements
40 | density_of_text = [descendant.density_of_text for descendant in descendants]
41 | density_of_text_std = np.std(density_of_text, ddof=1)
42 |
43 | # get density_score of every element
44 | for descendant in descendants:
45 | score = np.log(density_of_text_std) * \
46 | descendant.density_of_text * \
47 | np.log10(descendant.number_of_p_descendants + 2) * \
48 | np.log(descendant.density_of_punctuation)
49 | descendant.density_score = score
50 |
51 | # sort element info by density_score
52 | descendants = sorted(descendants, key=lambda x: x.density_score, reverse=True)
53 | descendant_first = descendants[0] if descendants else None
54 | if descendant_first is None:
55 | return None
56 |
57 | paragraphs = descendant_first.xpath(".//text()")
58 | paragraphs = [paragraph.strip() if paragraph else '' for paragraph in paragraphs]
59 | paragraphs = list(filter(lambda x: x, paragraphs))
60 | text = '\n'.join(paragraphs)
61 | text = text.strip()
62 |
63 | # save content with tag
64 | content_with_tag = self.process_content_tag(descendant_first, source_element)
65 |
66 | # extract images
67 | img_list = [img.attrib["src"] for img in content_with_tag.img_descendants if img.attrib]
68 |
69 | return text, content_with_tag.string, img_list
70 |
71 | @staticmethod
72 | def process_content_tag(descendant_first, source_element):
73 | content_xpath = f"//{descendant_first.tag}"
74 | if descendant_first.attrib:
75 | for k, v in descendant_first.attrib.items():
76 | if k and v: content_xpath += f"[@{k}='{v}']"
77 | preprocess4content_extractor(source_element, is_content=False)
78 | content_with_tag = source_element.xpath(content_xpath)[0]
79 | if isinstance(content_with_tag, HtmlElement):
80 | content_with_tag.__class__ = Element
81 | return content_with_tag
82 |
83 | def extract(self, html, **kwargs):
84 | """
85 |         base extract method: first convert the html to an Element, then call the
86 |         process method implemented by the child class
87 | :param html:
88 | :return:
89 | """
90 | self.kwargs = kwargs
91 | for key, value in SPECIAL_SYMBOL_MAP.items():
92 | html = html.replace(key, value)
93 |
94 |         element = fromstring(html=html)  # if the input contains several documents, fromstring keeps the first; TODO: cannot parse malformed html
95 | if self.kwargs.get("content_xpath"):
96 | return ''.join(element.xpath(self.kwargs.get("content_xpath")))
97 |
98 | descendants_list = list(element.iterdescendants())
99 |
100 | # remove error navigate tags
101 | remove_index_list = list()
102 | for index, descendant in enumerate(descendants_list):
103 | if descendant.text is None:
104 | continue
105 | nav_error_list = [i for i in ERROR_NAV_LIST if i in descendant.text]
106 | if nav_error_list: remove_index_list.append(index)
107 |
108 | for i in remove_index_list:
109 | parent_element = descendants_list[i].getparent()
110 | if parent_element is not None: parent_element.remove(descendants_list[i])
111 |
112 | element.__class__ = Element
113 | return self.process(element)
114 |
--------------------------------------------------------------------------------
/CrawlersTools/pipelines/mysql_pipeline.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/12 9:12
4 | # @Author : MuggleK
5 | # @File : mysql_pipeline.py
6 |
7 | import time
8 | from typing import Tuple, Optional
9 |
10 | import pymysql
11 | from DBUtils.PooledDB import PooledDB
12 | from loguru import logger
13 | from pymysql import ProgrammingError
14 | from pymysql.converters import escape_string
15 |
16 |
17 | def cursor_handler(func):
18 | def wrapper(self, *args, **kwargs):
19 | sql_conn, cursor = self.ping()
20 | if not (sql_conn and cursor):
21 | logger.warning(f"Mysql Connection occur Error,args:{args}, kwargs: {kwargs}")
22 | return
23 |
24 | try:
25 | kwargs.update({'cursor': cursor})
26 | result = func(self, *args, **kwargs)
27 | sql_conn.commit()
28 | return result
29 | finally:
30 | cursor.close()
31 | sql_conn.close()
32 |
33 | return wrapper
34 |
35 |
36 | class MysqlPipeline(object):
37 | """
38 | A Mysql Pipeline to Create or Insert or Update or Delete Table
39 |
40 | Usage::
41 |
42 | # >>>
43 | # >>> mysql_pool = MysqlPipeline(host='127.0.0.1', username='root', password='mysql', db='test')
44 | # >>> mysql_pool.insert(item, 'test_table')
45 | # >>>
46 |
47 | """
48 |
49 |     table_columns_map = dict()  # cache each table's columns so the schema is not queried on every insert
50 |
51 | def __init__(self, host: str = '127.0.0.1', username: str = 'root',
52 | password: str = '', db: str = 'test', port: int = 3306,
53 | drop_column: Optional[Tuple] = ('id', 'crawl_time'),
54 | pool_num: int = 10
55 | ):
56 | """
57 | :param host:
58 | :param username:
59 | :param password:
60 | :param db:
61 | :param port:
62 |         :param drop_column: columns that should not be filled manually on insert, e.g. auto-increment id, auto timestamps
63 | :param pool_num:
64 | """
65 | self.host = host
66 | self.username = username
67 | self.password = password
68 | self.db = db
69 | self.port = port
70 | self.drop_column = drop_column
71 | self.pool_num = pool_num
72 |
73 | self.sql_pool = PooledDB(
74 | pymysql, self.pool_num, host=self.host,
75 | user=self.username, passwd=self.password, db=self.db,
76 | port=self.port, charset='utf8', use_unicode=True
77 | )
78 |
79 | def ping(self):
80 | """
81 |         Re-implement pymysql's ping with a retry mechanism to keep a usable connection and cursor
82 |
83 | :return:
84 | """
85 | for _ in range(5):
86 | try:
87 | sql_conn = self.sql_pool.connection()
88 | cursor = sql_conn.cursor()
89 | return sql_conn, cursor
90 | except Exception as e:
91 | logger.debug(f"Mysql Lost Connection for Host : {self.host} Retrying, Error: {e}")
92 |
93 | try:
94 | self.sql_pool = PooledDB(
95 | pymysql, self.pool_num, host=self.host, user=self.username,
96 | passwd=self.password, db=self.db, port=self.port,
97 | charset='utf8', use_unicode=True
98 | )
99 | sql_conn = self.sql_pool.connection()
100 | cursor = sql_conn.cursor()
101 | return sql_conn, cursor
102 | except Exception as err:
103 | logger.debug(f"Waiting for 5s to Connect, Error: {err}")
104 | time.sleep(5)
105 | continue
106 |
107 | logger.error(f"Mysql Connects for Host : {self.host} Over Max Retries")
108 | return None, None
109 |
110 | def add_columns_map(self, table_name):
111 | sql = f"select column_name from information_schema.columns " \
112 | f"where table_name='{table_name}' and table_schema='{self.db}'"
113 | column_list = self.execute_sql(sql)
114 | columns = [i[0] for i in column_list if i[0] not in self.drop_column]
115 | self.table_columns_map[table_name] = columns
116 | return columns
117 |
118 | @cursor_handler
119 | def execute_sql(self, sql, mode='fetch', cursor=None):
120 | if mode == 'fetch':
121 | cursor.execute(sql)
122 | result = cursor.fetchall()
123 | return result
124 | cursor.execute(sql)
125 |
126 | def insert(self, item, table_name):
127 |
128 | if not item:
129 | logger.error("item is Empty")
130 | return
131 |
132 | table_columns = self.table_columns_map.get(table_name) or self.add_columns_map(table_name)
133 | if not table_columns:
134 | raise ProgrammingError(f"Table '{self.db}.{table_name}' doesn't exist")
135 |
136 | # build the insert statement; the string "None" and empty values are written as NULL
137 | format_str = ','.join(["%s" for _ in table_columns])
138 | insert_sql = 'INSERT IGNORE INTO %s (%s) VALUES (%s)' % (table_name, ','.join(table_columns), format_str)
139 | item_values = [None if item.get(key) == "None" else item.get(key) for key in table_columns]
140 | execute_data = tuple(["'%s'" % escape_string(str(i)) if i else "NULL" for i in item_values])  # quote and escape non-null values
141 |
142 | self.execute_sql(insert_sql % execute_data)
143 | logger.info(f"Insert Ignore Successfully:{table_name} -> {item}")
144 |
--------------------------------------------------------------------------------
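A slightly fuller usage sketch for `MysqlPipeline`, extending the docstring example above; the host, credentials, table name and columns below are placeholders, and a reachable MySQL instance is assumed:

```python
# Usage sketch for MysqlPipeline; connection details and table are placeholders.
from CrawlersTools.pipelines.mysql_pipeline import MysqlPipeline

pool = MysqlPipeline(host="127.0.0.1", username="root", password="mysql", db="test")

# insert() resolves the table's columns once (cached in table_columns_map),
# skips the drop_column fields and issues an INSERT IGNORE.
item = {"title": "示例标题", "url": "http://example.com/1.html", "content": "正文"}
pool.insert(item, "test_table")

# execute_sql() defaults to mode='fetch' and returns cursor.fetchall();
# any other mode value simply executes the statement and commits.
rows = pool.execute_sql("SELECT title, url FROM test_table LIMIT 10")
print(rows)
```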
/CrawlersTools/projects/upload_oss.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/12 20:20
4 | # @Author : MuggleK
5 | # @File : upload_oss.py
6 |
7 | import base64
8 | import hashlib
9 | import re
10 |
11 | import httpx
12 | from loguru import logger
13 |
14 | from CrawlersTools import base_requests
15 | from CrawlersTools.requests.proxy import get_proxies
16 |
17 |
18 | class UploadOss(object):
19 | """
20 | A class for uploading files to QZD OSS
21 |
22 | Usage:
23 |
24 | ```python
25 | >>> upload = UploadOss("https://***", '(pdf|txt|doc|docx|xlsx|xls|csv|wps|hlp|rtf|ppt|pptx|zip|rar|jar|gz|jpg|jpeg|png|tif|gif|bmp)')
26 | >>> oss_url, oss_uuid = upload.download("http://xxgk.haiyan.gov.cn/gov/jcms_files/jcms1/web7/site/zfxxgk/download/downfile.jsp?classid=0&filename=140901165845693.xls", '附件')
27 | ```
28 | """
29 |
30 | def __init__(self, oss_url, suffix_reg, oss_code=None, client_code=None):
31 | self.suffix_reg = suffix_reg
32 | self.oss_url = oss_url
33 | self.oss_code = oss_code
34 | self.client_code = client_code
35 |
36 | def download(self, file_url, file_name, headers=None, verify=True):
37 | """
38 | Download file_url (or decode a base64 data URI) and upload the result to oss
39 | :param file_url: attachment url, or a base64 data URI
40 | :param file_name: display name of the attachment
41 | :param headers: optional request headers; a default User-Agent is used when omitted
42 | :param verify: whether to verify the TLS certificate when downloading
43 | :return: (downloadLocation, globalUuid) of the uploaded file; empty strings on failure
44 | """
45 | location = global_uuid = ""
46 | proxy = None
47 | headers = headers if headers else {
48 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"}
49 | for _ in range(3):
50 | try:
51 | if ";base64," in file_url:
52 | suffix = "png"
53 | logger.debug(f"正在上传base64图片: {file_name}: {file_url}")
54 | hl = hashlib.md5()
55 | hl.update(file_url.encode(encoding='utf-8'))
56 | file_name = hl.hexdigest() + f".{suffix}"
57 | a = file_url.split(";base64,")[-1]
58 | a = a + '=' * (4 - len(a) % 4) if len(a) % 4 != 0 else a
59 | base64str = base64.b64decode(a)
60 | upload_result = self.post_file(file_name, base64str)
61 | location = upload_result.get("downloadLocation")
62 | global_uuid = upload_result.get("globalUuid")
63 | logger.debug(f"文件上传成功: {file_name}: {file_url}")
64 | else:
65 | suffix = self.complete_name(file_url, file_name, self.suffix_reg)
66 | if not file_url.startswith("http") and not suffix:
67 | return location, global_uuid
68 | file_name = f"{file_name}.{suffix}"
69 | logger.debug(f"正在上传文件: {file_name}: {file_url}")
70 | res = base_requests(file_url, timeout=60, headers=headers, verify=verify, proxies=proxy)
71 | if 200 <= res.status_code < 400:
72 | upload_result = self.post_file(file_name, res)
73 | location = upload_result.get("downloadLocation")
74 | global_uuid = upload_result.get("globalUuid")
75 | logger.debug(f"文件上传成功: {file_name}: {file_url}")
76 | break
77 | elif res.status_code == 404 or res.status_code == 500:
78 | logger.debug(f"文件地址无效: {file_name}: {file_url}")
79 | break
80 | except Exception as e:
81 | logger.warning(f"文件上传异常: {file_name}: {e}")
82 | proxy = get_proxies(http2=True)
83 | continue
84 | else:
85 | logger.error(f"文件上传失败: {file_name}: {file_url}")
86 |
87 | return location, global_uuid
88 |
89 | def post_file(self, name, resp):
90 | params_json = {
91 | "name": name,
92 | "appCode": self.oss_code,
93 | "appClientCode": self.client_code,
94 | "appOrgCode": "",
95 | "appUserId": "",
96 | "ownCatalogUuid": ""
97 | }
98 | json_data = httpx.post(self.oss_url, json=params_json).json()
99 | if json_data.get("msg") == "SUCCESS":
100 | token_data = json_data.get("data", {})
101 |
102 | str_dic = {
103 | "key": token_data.get("dir") + token_data.get("name"),
104 | "policy": token_data.get("policy"),
105 | "OSSAccessKeyId": token_data.get("accessid"),
106 | "success_action_status": 200,
107 | "callback": token_data.get("callback"),
108 | "signature": token_data.get("signature"),
109 | }
110 |
111 | files = {'file': resp if isinstance(resp, bytes) else resp.content}  # resp is raw bytes in the base64 branch, an httpx response otherwise
112 | response = httpx.post(token_data.get("host"), data=str_dic, files=files)
113 | if response.status_code == 200:
114 | res_data = response.json()
115 | if res_data.get("msg") == "SUCCESS":
116 | return res_data["data"]
117 |
118 | raise ValueError(f"文件上传oss失败:{name}")
119 |
120 | @staticmethod
121 | def complete_name(url, name, suffix_reg):
122 | """
123 | Names such as "附件.xls.doc" are accepted; the upload interface keeps only the last suffix
124 | The suffix found in file_url takes priority over the one in the name
125 | """
126 | is_name_suffix = re.search(suffix_reg, name, re.I)
127 | is_url_suffix = re.search(suffix_reg, url, re.I)
128 | name_suffix = is_name_suffix.group(1) if is_name_suffix else ""
129 | url_suffix = is_url_suffix.group(1) if is_url_suffix else ""
130 | if url_suffix:
131 | suffix = url_suffix
132 | elif name_suffix:
133 | suffix = name_suffix
134 | else:
135 | suffix = ""
136 |
137 | return suffix
138 |
--------------------------------------------------------------------------------
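As the `complete_name` docstring notes, a suffix found in the file URL takes priority over one in the display name. A small sketch of that behaviour; the regex and inputs are made-up examples:

```python
# Illustration of complete_name's suffix priority; regex and inputs are made up.
from CrawlersTools.projects.upload_oss import UploadOss

suffix_reg = r"(pdf|doc|docx|xls|xlsx|zip)"

# The URL carries ".xls" while the name carries ".doc": the URL suffix wins.
print(UploadOss.complete_name("http://example.com/download/report.xls", "年度报告.doc", suffix_reg))  # xls

# With no recognisable suffix in the URL, the name suffix is used instead.
print(UploadOss.complete_name("http://example.com/downfile.jsp?id=1", "年度报告.doc", suffix_reg))  # doc
```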
/CrawlersTools/extractors/utils/preprocess.py:
--------------------------------------------------------------------------------
1 | from lxml.html import etree
2 |
3 | from CrawlersTools.extractors.schemas.element import Element
4 | from CrawlersTools.extractors.utils.element import children, remove_element, remove_children
5 |
6 | # fmt:off
7 | CONTENT_EXTRACTOR_USELESS_TAGS = ['audio', 'colgroup', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'iframe',
8 | 'input', 'link', 'option', 'path', 'script', 'select', 'source', 'style', 'svg',
9 | 'symbol', 'video']
10 |
11 | CONTENT_EXTRACTOR_STRIP_TAGS = ['b', 'blockquote', 'br', 'font', 'p', 'section', 'span', 'spanlang', 'spanstyle',
12 | 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'th', 'tr', 'u'] # 'img'
13 |
14 | KEYWORD_FEATURES = 'and not (contains(@class,"main")) and not (contains(@class,"content")) and not (contains(@class,"con")) and not (contains(@class,"container")) and not (contains(@class,"list")) and not (contains(@class,"box")) and not (contains(@class,"right")) and not (contains(@class,"body")) and not (contains(@class,"lanmu")) '
15 | CONTENT_EXTRACTOR_NOISE_XPATH = [
16 | # '//div[contains(@class, "comment")]',
17 | '//div[contains(@class, "advertisement")]',
18 | '//div[contains(@class, "advert")]',
19 | '//a[contains(@style, "display: none")]',
20 | '//a[contains(@style, "display:none")]', # TODO css不展示数据是否要去除,可能会影响正文重复
21 | f'//div[contains(@class, "foot") {KEYWORD_FEATURES}]',
22 | f'//div[contains(@class, "footer") {KEYWORD_FEATURES}]',
23 | # f'//div[contains(@class, "location") {KEYWORD_FEATURES}]',
24 | f'//div[contains(@class, "navigation") {KEYWORD_FEATURES}]',
25 | f'//div[contains(@class, "barrier") {KEYWORD_FEATURES}]',
26 | '//div[contains(@id, "foot")]',
27 | # '//div[contains(@class, "head")]', # 误删
28 | # '//div[contains(@id, "head")]',
29 | # '//div[contains(@class, "nav")]', # 误删
30 | '//div[contains(@id, "nav")]',
31 | '//div[contains(@class, "siderbar")]',
32 | '//div[contains(@class, "breadcrumb")]',
33 | '//div[contains(@id, "siderbar")]',
34 | '//div[contains(@id, "页脚")]',
35 | '//div[contains(@class, "页脚")]',
36 | '//div[contains(@id, "页眉")]',
37 | '//div[contains(@id, "页头")]',
38 | '//div[contains(@class, "页眉")]',
39 | '//div[contains(@class, "页头")]',
40 | '//*[contains(@class, "hidden")]',
41 | ]
42 |
43 |
44 | def preprocess4content_extractor(element: Element, is_content: bool = True):
45 | """
46 | preprocess element for content extraction
47 | :param element:
48 | :param is_content: if True, additionally unwrap formatting tags and normalise leaf div tags to p
49 | :return:
50 | """
51 | remove_children(element, CONTENT_EXTRACTOR_NOISE_XPATH)
52 |
53 | # remove these tags together with their content
54 | etree.strip_elements(element, *CONTENT_EXTRACTOR_USELESS_TAGS)
55 |
56 | if not is_content: return
57 | # only unwrap the tag pair, keeping its text and children
58 | etree.strip_tags(element, *CONTENT_EXTRACTOR_STRIP_TAGS)
59 |
60 | for child in children(element):
61 |
62 | # merge text from span/strong (and table cells) into the parent p or table tag
63 | if child.tag.lower() == 'p' or child.tag.lower() == 'table':
64 | etree.strip_tags(child, 'span')
65 | etree.strip_tags(child, 'strong')
66 | etree.strip_tags(child, 'tr')
67 | etree.strip_tags(child, 'td')
68 |
69 | if not (child.text and child.text.strip()):
70 | remove_element(child)
71 |
72 | # a div tag without any child elements is converted to a p node
73 | if child.tag.lower() == 'div' and not child.getchildren():
74 | child.tag = 'p'
75 |
76 |
77 | LIST_EXTRACTOR_USELESS_TAGS = CONTENT_EXTRACTOR_USELESS_TAGS
78 | LIST_EXTRACTOR_STRIP_TAGS = CONTENT_EXTRACTOR_STRIP_TAGS
79 | LIST_EXTRACTOR_NOISE_XPATH = CONTENT_EXTRACTOR_NOISE_XPATH
80 |
81 |
82 | def preprocess4list_extractor(element: Element):
83 | """
84 | preprocess element for list extraction
85 | :param element:
86 | :return:
87 | """
88 | # remove these tags together with their content
89 | etree.strip_elements(element, *CONTENT_EXTRACTOR_USELESS_TAGS)
90 | # only unwrap the tag pair, keeping its text and children
91 | etree.strip_tags(element, *CONTENT_EXTRACTOR_STRIP_TAGS)
92 |
93 | remove_children(element, CONTENT_EXTRACTOR_NOISE_XPATH)
94 |
95 | for child in children(element):
96 |
97 | # merge text in span or strong to parent p tag
98 | if child.tag.lower() == 'p':
99 | etree.strip_tags(child, 'span')
100 | etree.strip_tags(child, 'strong')
101 |
102 | if not (child.text and child.text.strip()):
103 | remove_element(child)
104 |
105 | # a div tag without any child elements is converted to a p node
106 | if child.tag.lower() == 'div' and not child.getchildren():
107 | child.tag = 'p'
108 |
109 |
110 | LIST_CLASSIFIER_USELESS_TAGS = ['style', 'script', 'link', 'video', 'audio', 'iframe', 'source', 'svg', 'path',
111 | 'symbol', 'footer', 'header']
112 | LIST_CLASSIFIER_STRIP_TAGS = ['span', 'blockquote']
113 | LIST_CLASSIFIER_NOISE_XPATHS = [
114 | '//div[contains(@class, "comment")]',
115 | '//div[contains(@class, "advertisement")]',
116 | '//div[contains(@class, "advert")]',
117 | '//div[contains(@style, "display: none")]',
118 | ]
119 |
120 |
121 | def preprocess4list_classifier(element: Element):
122 | """
123 | preprocess element for list classifier
124 | :param element:
125 | :return:
126 | """
127 | # remove these tags together with their content
128 | etree.strip_elements(element, *LIST_CLASSIFIER_USELESS_TAGS)
129 | # only unwrap the tag pair, keeping its text and children
130 | etree.strip_tags(element, *LIST_CLASSIFIER_STRIP_TAGS)
131 |
132 | remove_children(element, LIST_CLASSIFIER_NOISE_XPATHS)
133 |
134 | for child in children(element):
135 |
136 | # merge text in span or strong to parent p tag
137 | if child.tag.lower() == 'p':
138 | etree.strip_tags(child, 'span')
139 | etree.strip_tags(child, 'strong')
140 |
141 | if not (child.text and child.text.strip()):
142 | remove_element(child)
143 |
144 | # a div tag without any child elements is converted to a p node
145 | if child.tag.lower() == 'div' and not child.getchildren():
146 | child.tag = 'p'
147 |
--------------------------------------------------------------------------------
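The three preprocess helpers above share one pattern: drop noise blocks matched by the noise XPaths, strip useless tags together with their content, unwrap formatting tag pairs, then normalise leaf `div`s to `p`. A minimal sketch of calling `preprocess4content_extractor` directly on an lxml tree, mirroring the cast used by the base extractor; the sample html is made up:

```python
# Minimal sketch: clean an lxml tree with preprocess4content_extractor.
from lxml.html import fromstring

from CrawlersTools.extractors.schemas.element import Element
from CrawlersTools.extractors.utils.preprocess import preprocess4content_extractor

html = """
<html><body>
  <div id="nav">首页 | 下一页</div>
  <div class="content"><p><span>正文</span>段落</p><script>var a = 1;</script></div>
</body></html>
"""

element = fromstring(html)
element.__class__ = Element  # the same cast the base extractor performs

# Removes noise blocks (e.g. the div whose id contains "nav"), strips tags such
# as <script> with their content, and unwraps formatting tags like <span>.
preprocess4content_extractor(element)
print(element.text_content())
```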
/CrawlersTools/extractors/utils/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/19 20:00
4 | # @Author : MuggleK
5 | # @File : settings.py
6 |
7 | # list settings
8 | LIST_MIN_NUMBER = 5
9 | LIST_MIN_LENGTH = 8
10 | LIST_MAX_LENGTH = 50
11 | SIMILARITY_THRESHOLD = 0.8
12 |
13 | LIST_AVG_LENGTH = 9
14 | ADDTION_RIGHT_NUM = 10000
15 |
16 | HIGH_WEIGHT_ERROR_KEYWORD = ["ICP备", "公网安备", "网公安备", "备案序号:", "网站地图"]
17 | DIRECTORY_ERROR_TITLE = ["首页", "下一页", "解读", "图解", "详细", "阅读全文", "标题", "[详细]"]
18 |
19 |
20 | # common settings
21 | SPECIAL_SYMBOL_MAP = {
22 | """: '"',
23 | "&": "&",
24 | "<": "<",
25 | ">": ">",
26 | " ": " ",
27 | """: '"',
28 | "&": "&",
29 | "<": "<",
30 | ">": ">",
31 | " ": " ",
32 | '