├── .github
│   └── workflows
│       └── python-publish.yml
├── .gitignore
├── CrawlersTools
│   ├── README.md
│   ├── __init__.py
│   ├── extractors
│   │   ├── __init__.py
│   │   ├── attachment_extractor.py
│   │   ├── base.py
│   │   ├── content_extractor.py
│   │   ├── list_extractor.py
│   │   ├── schemas
│   │   │   ├── __init__.py
│   │   │   └── element.py
│   │   ├── time_extractor.py
│   │   ├── title_extractor.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── cluster.py
│   │       ├── element.py
│   │       ├── preprocess.py
│   │       ├── settings.py
│   │       └── similarity.py
│   ├── js_crawler
│   │   ├── __init__.py
│   │   ├── font_decrypt.py
│   │   └── transfer_js.py
│   ├── logs
│   │   ├── __init__.py
│   │   ├── formatters.py
│   │   ├── handlers.py
│   │   ├── log.py
│   │   └── logger.py
│   ├── pipelines
│   │   ├── __init__.py
│   │   ├── kafka_operate.py
│   │   ├── mongo_pipeline.py
│   │   ├── mysql_pipeline.py
│   │   └── redis_pipeline.py
│   ├── preprocess
│   │   ├── __init__.py
│   │   ├── bloom_filter.py
│   │   └── time_process.py
│   ├── projects
│   │   ├── __init__.py
│   │   ├── filters.py
│   │   └── upload_oss.py
│   ├── requests
│   │   ├── __init__.py
│   │   ├── base_requests.py
│   │   ├── proxy.py
│   │   └── random_ua.py
│   ├── requirements.txt
│   ├── schedules
│   │   ├── __init__.py
│   │   └── auto_thread.py
│   └── utils
│       └── str_compare.py
├── LICENSE
├── README.md
└── setup.py

/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine on every push
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 | 
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 | 
9 | name: Upload Python Package
10 | 
11 | on: push
12 | 
13 | permissions:
14 |   contents: read
15 | 
16 | jobs:
17 |   deploy:
18 | 
19 |     runs-on: ubuntu-latest
20 | 
21 |     steps:
22 |     - uses: actions/checkout@v3
23 |     - name: Set up Python
24 |       uses: actions/setup-python@v3
25 |       with:
26 |         python-version: '3.x'
27 |     - name: Install dependencies
28 |       run: |
29 |         python -m pip install --upgrade pip
30 |         pip install build
31 |     - name: Build package
32 |       run: python -m build
33 |     - name: Publish package
34 |       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
35 |       with:
36 |         user: __token__
37 |         password: ${{ secrets.PYPI_API_TOKEN }}
38 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 | 
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 | 
68 | # Scrapy stuff:
69 | .scrapy
70 | 
71 | # Sphinx documentation
72 | docs/_build/
73 | 
74 | # PyBuilder
75 | target/
76 | 
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 | 
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 | 
84 | # pyenv
85 | .python-version
86 | 
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 | 
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 | 
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # xml
132 | *.xml
133 | 
134 | /.idea
135 | test/

--------------------------------------------------------------------------------
/CrawlersTools/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/12 20:48
4 | # @Author : MuggleK
5 | # @File : __init__.py
6 | 
7 | from CrawlersTools.extractors import PolicyExtractor, ListExtractor
8 | from CrawlersTools.logs.logger import init_logger
9 | from CrawlersTools.logs import Logging
10 | from CrawlersTools.pipelines import MysqlPipeline, MongoPipeline, RedisPipeline
11 | from CrawlersTools.preprocess import TimeProcessor
12 | from CrawlersTools.requests import base_requests, get_proxies, UserAgent

--------------------------------------------------------------------------------
/CrawlersTools/extractors/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/15 11:29
4 | # @Author : MuggleK
5 | # @File : __init__.py
6 | 
7 | from CrawlersTools.extractors.attachment_extractor import AttachmentExtractor
8 | from CrawlersTools.extractors.content_extractor import ContentExtractor
9 | from CrawlersTools.extractors.list_extractor import ListExtractor
10 | from CrawlersTools.extractors.time_extractor import TimeExtractor
11 | from CrawlersTools.extractors.title_extractor import TitleExtractor
12 | 
13 | 
14 | class PolicyExtractor(object):
15 | 
16 |     @staticmethod
17 |     def extract(
18 |             html,
19 |             title_xpath: str = "",
20 |             publish_time_xpath: str = "",
21 |             content_xpath: str = "",
22 |             attachment_xpath: str = "",
23 |             attachment_regx: str = ""
24 |     ) -> dict:
25 |         title = TitleExtractor().extract(html, title_xpath=title_xpath)
26 |         publish_time = TimeExtractor().extract(html, publish_time_xpath=publish_time_xpath)
27 |         content, content_with_tag, images = ContentExtractor().extract(html, content_xpath=content_xpath)
28 |         attachments = AttachmentExtractor().extract(html, attachment_xpath=attachment_xpath, attachment_regx=attachment_regx)
29 | 
30 |         return {
31 |             "title": title,
32 |             "publish_time": publish_time,
33 |             "content": content,
34 |             "content_with_tag": content_with_tag,
35 |             "images": images,
36 |             "attachment": attachments
37 |         }
38 | 
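A minimal usage sketch for the PolicyExtractor facade (the html value is a placeholder for a detail page fetched elsewhere; all xpath arguments are optional overrides):

    from CrawlersTools.extractors import PolicyExtractor

    html = "<html>...</html>"  # placeholder: source of a policy detail page
    result = PolicyExtractor.extract(html)
    # result is a dict with keys: title, publish_time, content,
    # content_with_tag, images, attachment
    print(result["title"], result["publish_time"])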
str = "" 24 | ) -> dict: 25 | title = TitleExtractor().extract(html, title_xpath=title_xpath) 26 | publish_time = TimeExtractor().extract(html, publish_time_xpath=publish_time_xpath) 27 | content, content_with_tag, images = ContentExtractor().extract(html, content_xpath=content_xpath) 28 | attachments = AttachmentExtractor().extract(html, attachment_xpath=attachment_xpath, attachment_regx=attachment_regx) 29 | 30 | return { 31 | "title": title, 32 | "publish_time": publish_time, 33 | "content": content, 34 | "content_with_tag": content_with_tag, 35 | "images": images, 36 | "attachment": attachments 37 | } 38 | -------------------------------------------------------------------------------- /CrawlersTools/extractors/attachment_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Project : CrawlersTools 3 | # @Time : 2022/12/21 10:54 4 | # @Author : MuggleK 5 | # @File : attachment_extractor.py 6 | 7 | import re 8 | 9 | from CrawlersTools.extractors.base import BaseExtractor 10 | from CrawlersTools.extractors.schemas.element import Element 11 | from CrawlersTools.extractors.utils.settings import ATTACHMENT_REGX 12 | 13 | 14 | class AttachmentExtractor(BaseExtractor): 15 | """ 16 | extract content from detail page 17 | """ 18 | 19 | def process(self, element: Element): 20 | """ 21 | extract content from html 22 | :param element: 23 | :return: 24 | """ 25 | attachment_list = list() 26 | attachment_xpath = self.kwargs.get("attachment_xpath") or "//a" 27 | for attachment_element in element.xpath(attachment_xpath): 28 | url = [i.strip() for i in attachment_element.xpath("@href") or attachment_element.xpath("@src")] 29 | name = [i.strip() for i in attachment_element.xpath(".//text()")] 30 | if not (''.join(url).strip() and ''.join(name).strip()): 31 | continue 32 | suffix = self.filter_suffix(url[0], name[0]) 33 | if not suffix: continue 34 | attachment_list.append({ 35 | "file_url": url[0], 36 | "file_name": name[0] 37 | }) 38 | return attachment_list 39 | 40 | def filter_suffix(self, url, name): 41 | """ 42 | 附件.xls.doc 可上传, 接口会默认取最后一个 43 | 优先取 file_url 后缀 44 | """ 45 | regx = self.kwargs.get("attachment_regx") or ATTACHMENT_REGX 46 | is_name_suffix = re.search(regx, name, re.I) 47 | is_url_suffix = re.search(regx, url, re.I) 48 | name_suffix = is_name_suffix.group(1) if is_name_suffix else "" 49 | url_suffix = is_url_suffix.group(1) if is_url_suffix else "" 50 | 51 | return name_suffix or url_suffix 52 | -------------------------------------------------------------------------------- /CrawlersTools/extractors/base.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from lxml.html import etree 3 | from lxml.html import fromstring 4 | 5 | from CrawlersTools.extractors.schemas.element import Element 6 | 7 | 8 | class BaseExtractor(object): 9 | """ 10 | Base Extractor which provide common methods 11 | """ 12 | 13 | kwargs = None 14 | 15 | @staticmethod 16 | def to_string(element: Element, limit: int = None): 17 | """ 18 | convert element to string 19 | :param element: 20 | :param limit: 21 | :return: 22 | """ 23 | result = etree.tostring(element, pretty_print=True, encoding="utf-8", method='html').decode('utf-8') 24 | if limit: 25 | return result[:limit] 26 | return result 27 | 28 | def process(self, element: Element): 29 | """ 30 | process method that you should implement 31 | :param element: 32 | :return: 33 | """ 34 | logger.error('You must implement 
--------------------------------------------------------------------------------
/CrawlersTools/extractors/base.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 | from lxml.html import etree
3 | from lxml.html import fromstring
4 | 
5 | from CrawlersTools.extractors.schemas.element import Element
6 | 
7 | 
8 | class BaseExtractor(object):
9 |     """
10 |     Base Extractor which provides common methods
11 |     """
12 | 
13 |     kwargs = None
14 | 
15 |     @staticmethod
16 |     def to_string(element: Element, limit: int = None):
17 |         """
18 |         convert element to string
19 |         :param element:
20 |         :param limit:
21 |         :return:
22 |         """
23 |         result = etree.tostring(element, pretty_print=True, encoding="utf-8", method='html').decode('utf-8')
24 |         if limit:
25 |             return result[:limit]
26 |         return result
27 | 
28 |     def process(self, element: Element):
29 |         """
30 |         process method that subclasses must implement
31 |         :param element:
32 |         :return:
33 |         """
34 |         logger.error('You must implement process method in your extractor.')
35 |         raise NotImplementedError
36 | 
37 |     def extract(self, html, **kwargs):
38 |         """
39 |         base extract method: first converts the html to an Element, then calls the
40 |         process method implemented by the child class
41 |         :param html:
42 |         :return:
43 |         """
44 |         self.kwargs = kwargs
45 |         element = fromstring(html=html)
46 |         element.__class__ = Element
47 |         return self.process(element)
48 | 
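Every concrete extractor in this package follows the same contract: subclass BaseExtractor, implement process, and call extract with raw html. A minimal hypothetical subclass:

    from CrawlersTools.extractors.base import BaseExtractor
    from CrawlersTools.extractors.schemas.element import Element

    class LinkCountExtractor(BaseExtractor):
        def process(self, element: Element):
            # kwargs passed to extract() are available on self.kwargs
            return len(element.xpath("//a"))

    count = LinkCountExtractor().extract("<html><body><a>a</a><a>b</a></body></html>")  # 2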
--------------------------------------------------------------------------------
/CrawlersTools/extractors/content_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/11/15 9:18
4 | # @Author : MuggleK
5 | # @File : content_extractor.py
6 | 
7 | from copy import deepcopy
8 | 
9 | import numpy as np
10 | from lxml.html import fromstring, HtmlElement
11 | 
12 | from CrawlersTools.extractors.base import BaseExtractor
13 | from CrawlersTools.extractors.schemas.element import Element
14 | from CrawlersTools.extractors.utils.element import descendants_of_body
15 | from CrawlersTools.extractors.utils.preprocess import preprocess4content_extractor
16 | from CrawlersTools.extractors.utils.settings import SPECIAL_SYMBOL_MAP, ERROR_NAV_LIST
17 | 
18 | 
19 | class ContentExtractor(BaseExtractor):
20 |     """
21 |     extract content from detail page
22 |     """
23 | 
24 |     def process(self, element: Element):
25 |         """
26 |         extract content from html
27 |         :param element:
28 |         :return:
29 |         """
30 |         source_element = deepcopy(element)
31 |         source_element.__class__ = Element
32 | 
33 |         # preprocess
34 |         preprocess4content_extractor(element)
35 | 
36 |         # start to evaluate every child element
37 |         descendants = descendants_of_body(element)
38 | 
39 |         # get std of density_of_text among all elements
40 |         density_of_text = [descendant.density_of_text for descendant in descendants]
41 |         density_of_text_std = np.std(density_of_text, ddof=1)
42 | 
43 |         # get density_score of every element
44 |         for descendant in descendants:
45 |             score = np.log(density_of_text_std) * \
46 |                     descendant.density_of_text * \
47 |                     np.log10(descendant.number_of_p_descendants + 2) * \
48 |                     np.log(descendant.density_of_punctuation)
49 |             descendant.density_score = score
50 | 
51 |         # sort element info by density_score
52 |         descendants = sorted(descendants, key=lambda x: x.density_score, reverse=True)
53 |         descendant_first = descendants[0] if descendants else None
54 |         if descendant_first is None:
55 |             return None
56 | 
57 |         paragraphs = descendant_first.xpath(".//text()")
58 |         paragraphs = [paragraph.strip() if paragraph else '' for paragraph in paragraphs]
59 |         paragraphs = list(filter(lambda x: x, paragraphs))
60 |         text = '\n'.join(paragraphs)
61 |         text = text.strip()
62 | 
63 |         # save content with tag
64 |         content_with_tag = self.process_content_tag(descendant_first, source_element)
65 | 
66 |         # extract images
67 |         img_list = [img.attrib["src"] for img in content_with_tag.img_descendants if img.attrib]
68 | 
69 |         return text, content_with_tag.string, img_list
70 | 
71 |     @staticmethod
72 |     def process_content_tag(descendant_first, source_element):
73 |         content_xpath = f"//{descendant_first.tag}"
74 |         if descendant_first.attrib:
75 |             for k, v in descendant_first.attrib.items():
76 |                 if k and v: content_xpath += f"[@{k}='{v}']"
77 |         preprocess4content_extractor(source_element, is_content=False)
78 |         content_with_tag = source_element.xpath(content_xpath)[0]
79 |         if isinstance(content_with_tag, HtmlElement):
80 |             content_with_tag.__class__ = Element
81 |         return content_with_tag
82 | 
83 |     def extract(self, html, **kwargs):
84 |         """
85 |         base extract method: first converts the html to an Element, then calls the
86 |         process method implemented by the child class
87 |         :param html:
88 |         :return:
89 |         """
90 |         self.kwargs = kwargs
91 |         for key, value in SPECIAL_SYMBOL_MAP.items():
92 |             html = html.replace(key, value)
93 | 
94 |         element = fromstring(html=html)  # if the markup holds several documents, fromstring keeps only the first one. TODO: cannot parse non-standard html
95 |         if self.kwargs.get("content_xpath"):
96 |             # keep the same (text, content_with_tag, images) return contract as process()
97 |             selected = element.xpath(self.kwargs.get("content_xpath"))
98 |             nodes = [node for node in selected if isinstance(node, HtmlElement)]
99 |             if nodes:
100 |                 node = nodes[0]
101 |                 node.__class__ = Element
102 |                 text = '\n'.join(filter(None, (t.strip() for t in node.xpath('.//text()'))))
103 |                 img_list = [img.attrib["src"] for img in node.xpath(".//img") if img.attrib.get("src")]
104 |                 return text, node.string, img_list
105 |             text = ''.join(str(s) for s in selected).strip()
106 |             return text, text, []
107 | 
108 |         descendants_list = list(element.iterdescendants())
109 | 
110 |         # remove error navigate tags
111 |         remove_index_list = list()
112 |         for index, descendant in enumerate(descendants_list):
113 |             if descendant.text is None:
114 |                 continue
115 |             nav_error_list = [i for i in ERROR_NAV_LIST if i in descendant.text]
116 |             if nav_error_list: remove_index_list.append(index)
117 | 
118 |         for i in remove_index_list:
119 |             parent_element = descendants_list[i].getparent()
120 |             if parent_element is not None: parent_element.remove(descendants_list[i])
121 | 
122 |         element.__class__ = Element
123 |         return self.process(element)
124 | 
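ContentExtractor.extract returns a (text, content_with_tag, images) triple whether the content node is scored automatically or pinned with content_xpath; a sketch with toy markup:

    from CrawlersTools.extractors import ContentExtractor

    html = ("<html><body><div id='content'><p>First paragraph.</p>"
            "<p>Second, with an <img src='/a.png'/>.</p></div></body></html>")
    content, content_with_tag, images = ContentExtractor().extract(
        html, content_xpath="//div[@id='content']"
    )  # images == ['/a.png']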
--------------------------------------------------------------------------------
/CrawlersTools/extractors/list_extractor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """
4 | Author: xiaobin.zhu
5 | since: 2022-11-24 14:24:09
6 | LastAuthor: xiaobin.zhu
7 | LastEditTime: 2022-11-24 14:24:09
8 | Description: extract list from index page
9 | FilePath: list_extractor
10 | """
11 | import math
12 | import operator
13 | from collections import defaultdict
14 | from urllib.parse import urljoin
15 | 
16 | # from loguru import logger
17 | import numpy as np
18 | from lxml.html import fromstring, HtmlElement
19 | 
20 | from CrawlersTools.extractors.base import BaseExtractor
21 | from CrawlersTools.extractors.schemas.element import Element
22 | from CrawlersTools.extractors.utils.cluster import cluster_dict
23 | from CrawlersTools.extractors.utils.element import calc_a_descendants_text_of_avg_length, descendants_of_body
24 | from CrawlersTools.extractors.utils.preprocess import preprocess4list_extractor
25 | from CrawlersTools.extractors.utils.settings import (
26 |     LIST_AVG_LENGTH, LIST_MAX_LENGTH, LIST_MIN_LENGTH, LIST_MIN_NUMBER, ADDTION_RIGHT_NUM, SIMILARITY_THRESHOLD,
27 |     HIGH_WEIGHT_ERROR_KEYWORD, DIRECTORY_ERROR_TITLE, SPECIAL_SYMBOL_MAP,
28 | )
29 | 
30 | AVG_LENGTH = (LIST_MIN_LENGTH + LIST_MAX_LENGTH) / 2
31 | 
32 | 
33 | class ListExtractor(BaseExtractor):
34 |     """
35 |     extract list from index page
36 |     """
37 | 
38 |     @staticmethod
39 |     def _probability_of_title_with_length(length):
40 |         """
41 |         get the probability of title according to length
42 |         import matplotlib.pyplot as plt
43 |         x = np.asarray(range(5, 40))
44 |         y = list_extractor.probability_of_title_with_length(x)
45 |         plt.plot(x, y, 'g', label='m=0, sig=2')
46 |         plt.show()
47 |         :param length:
48 |         :return:
49 |         """
50 |         sigma = 6
51 |         return np.exp(-1 * ((length - AVG_LENGTH) ** 2) / (2 * (sigma**2))) / (
52 |             math.sqrt(2 * np.pi) * sigma
53 |         )
54 | 
55 |     @staticmethod
56 |     def _build_clusters(element):
57 |         """
58 |         build candidate clusters according to element
59 |         :return:
60 |         """
61 |         descendants = descendants_of_body(element)
62 |         descendants_tree = defaultdict(list)
63 |         for descendant in descendants:
64 |             # handle special wrapper descendants (a single container holding many links) so their links are not missed
65 |             if len(descendant.a_descendants) > 5 and descendant.number_of_siblings == 1:
66 |                 if descendant.parent_selector in ["html>body", "html"]:
67 |                     continue
68 |                 if descendant.a_descendants_group_text_min_length > LIST_MAX_LENGTH:
69 |                     continue
70 |                 if descendant.a_descendants_group_text_max_length < LIST_MIN_LENGTH:
71 |                     continue
72 |                 for link in descendant.a_descendants:
73 |                     descendants_tree[descendant.parent_selector].append(link)
74 |                 continue
75 |             # if one element does not have enough siblings, it can not become a child of candidate element
76 |             if descendant.number_of_siblings + 1 < LIST_MIN_NUMBER:
77 |                 continue
78 |             if calc_a_descendants_text_of_avg_length(descendant) < LIST_AVG_LENGTH:
79 |                 continue
80 |             # if min length is larger than specified max length, it can not become a child of candidate element
81 |             if descendant.a_descendants_group_text_min_length > LIST_MAX_LENGTH:
82 |                 continue
83 |             # if max length is smaller than specified min length, it can not become a child of candidate element
84 |             if descendant.a_descendants_group_text_max_length < LIST_MIN_LENGTH:
85 |                 continue
86 |             # if descendant.a_descendants_group_text_avg_length < 10:
87 |             #     continue
88 |             # descendant element must have same siblings which their similarity should not below similarity_threshold
89 |             if descendant.similarity_with_siblings < SIMILARITY_THRESHOLD:
90 |                 continue
91 |             descendants_tree[descendant.parent_selector].append(descendant)
92 |         if len(descendants_tree) == 0:
93 |             return
94 |         descendants_tree = dict(descendants_tree)
95 | 
96 |         # cut tree, remove parent block
97 |         selectors = sorted(list(descendants_tree.keys()))
98 |         last_selector = None
99 |         for selector in selectors[::-1]:
100 |             # if a later (child) selector starts with this selector, drop the enclosing parent block
101 |             if last_selector and selector and last_selector.startswith(selector):
102 |                 del descendants_tree[selector]
103 |             last_selector = selector
104 |         clusters = cluster_dict(descendants_tree)
105 | 
106 |         return clusters
107 | 
108 |     @staticmethod
109 |     def _evaluate_cluster(cluster):
110 |         """
111 |         calculate score of cluster using similarity, numbers, or other info
112 |         :param cluster:
113 |         :return:
114 |         """
115 |         score = dict()
116 | 
117 |         # calculate avg_similarity_with_siblings
118 |         score["avg_similarity_with_siblings"] = np.mean(
119 |             [element.similarity_with_siblings for element in cluster]
120 |         )
121 |         score["avg_text_length"] = np.mean(
122 |             [calc_a_descendants_text_of_avg_length(element) for element in cluster]
123 |         )
124 |         # calculate number of elements
125 |         score["number_of_elements"] = len(cluster)
126 | 
127 |         score["clusters_score"] = (
128 |             score["avg_similarity_with_siblings"]
129 |             * np.log10(score["number_of_elements"] + 1)
130 |             * score["avg_text_length"]
131 |         )
132 |         # * clusters_score[cluster_id]['probability_of_title_with_length']
133 |         return score
134 | 
135 |     @staticmethod
136 |     def _extend_cluster(cluster):
137 |         """
138 |         extend cluster with sibling elements that were missed during clustering
139 |         :param cluster:
140 |         :return:
141 |         """
142 |         result = [element.selector for element in cluster]
143 |         for element in cluster:
144 |             if calc_a_descendants_text_of_avg_length(element) < LIST_AVG_LENGTH:
145 |                 continue
146 |             path_raw = element.path_raw
147 |             siblings = list(element.siblings)
148 |             for sibling in siblings:
149 |                 # skip invalid element
150 |                 if not isinstance(sibling, Element):
151 |                     continue
152 |                 sibling_selector = sibling.selector
153 |                 sibling_path_raw = sibling.path_raw
154 |                 if sibling_path_raw != path_raw:
155 |                     continue
156 |                 # add missed sibling
157 |                 if sibling_selector not in result:
158 |                     cluster.append(sibling)
159 |                     result.append(sibling_selector)
160 | 
161 |         cluster = sorted(cluster, key=lambda x: x.nth)
162 |         # logger.debug(f"cluster after extend {cluster}")
163 |         return cluster
164 | 
165 |     def _best_cluster(self, clusters):
166 |         """
167 |         use clustering algorithm to choose best cluster from candidate clusters
168 |         :param clusters:
169 |         :return:
170 |         """
171 |         if not clusters:
172 |             # logger.debug("there is no cluster, just return empty result")
173 |             return []
174 |         if len(clusters) == 1:
175 |             # logger.debug("there is only one cluster, just return first cluster")
176 |             return clusters[0]
177 |         # choose best cluster using score
178 |         clusters_score = defaultdict(dict)
179 |         clusters_score_arg_max = 0
180 |         clusters_score_max = -1
181 |         for cluster_id, cluster in clusters.items():
182 |             if len(cluster) < 2:
183 |                 continue
184 |             # calculate avg_similarity_with_siblings
185 |             clusters_score[cluster_id] = self._evaluate_cluster(cluster)
186 |             # get max score arg index
187 |             if clusters_score[cluster_id]["clusters_score"] > clusters_score_max:
188 |                 clusters_score_max = clusters_score[cluster_id]["clusters_score"]
189 |                 clusters_score_arg_max = cluster_id
190 |         # logger.debug(f"clusters_score {clusters_score}")
191 |         best_cluster = clusters[clusters_score_arg_max]
192 |         return best_cluster
193 | 
194 |     def _extract_cluster(self, cluster):
195 |         """
196 |         extract title and href from best cluster
197 |         :param cluster:
198 |         :return:
199 |         """
200 |         if not cluster:
201 |             return None
202 |         # get best tag path of title
203 |         probabilities_of_title = defaultdict(list)
204 |         for element in cluster:
205 |             if element.tag == "a":
206 |                 descendants = [element]
207 |             else:
208 |                 descendants = element.a_descendants
209 |             for descendant in descendants:
210 |                 path = descendant.path_raw
211 |                 descendant_text = descendant.text
212 |                 probability_of_title_with_length = (
213 |                     self._probability_of_title_with_length(len(descendant_text))
214 |                 )
215 | 
216 |                 if descendant.attrib.get("title"):
217 |                     probability_of_title_with_length = (
218 |                         probability_of_title_with_length * ADDTION_RIGHT_NUM
219 |                     )
220 | 
221 |                 if len(descendant_text) > LIST_MAX_LENGTH:
222 |                     probability_of_title_with_length = (
223 |                         probability_of_title_with_length * ADDTION_RIGHT_NUM
224 |                     )
225 |                 if (
226 |                     descendant.tag == "a"
227 |                     and descendant.get("parent")
228 |                     and calc_a_descendants_text_of_avg_length(descendant.parent)
229 |                     < LIST_AVG_LENGTH
230 |                 ):
231 |                     probability_of_title_with_length = (
232 |                         probability_of_title_with_length / ADDTION_RIGHT_NUM
233 |                     )
234 |                 else:
235 |                     if (
236 |                         calc_a_descendants_text_of_avg_length(descendant)
237 |                         < LIST_AVG_LENGTH
238 |                     ):
239 |                         probability_of_title_with_length = (
240 |                             probability_of_title_with_length / ADDTION_RIGHT_NUM
241 |                         )
242 |                 for ss in HIGH_WEIGHT_ERROR_KEYWORD:
243 |                     if ss in descendant_text:
244 |                         probability_of_title_with_length = (
245 |                             probability_of_title_with_length / ADDTION_RIGHT_NUM
246 |                         )
247 | 
248 |                 probability_of_title = probability_of_title_with_length
249 |                 probabilities_of_title[path].append(probability_of_title)
250 |         probabilities_of_title_bak = {}
251 |         for key in probabilities_of_title:
252 |             if len(probabilities_of_title[key]) > LIST_MIN_NUMBER - 2:
253 |                 probabilities_of_title_bak[key] = probabilities_of_title[key]
254 |         probabilities_of_title = probabilities_of_title_bak
255 |         # get most probable tag_path
256 |         probabilities_of_title_avg = {
257 |             k: np.mean(v) for k, v in probabilities_of_title.items()
258 |         }
259 |         if not probabilities_of_title_avg:
260 |             return None
261 |         best_path = max(probabilities_of_title_avg.items(), key=operator.itemgetter(1))[
262 |             0
263 |         ]
264 |         # logger.debug(f"best tag path {best_path}")
265 | 
266 |         # extract according to best tag path
267 |         result = []
268 |         # deduplicate links
269 |         cache_url_list = []
270 |         recode_avg_len_of_path = {}
271 |         for element in cluster:
272 |             avg_len = calc_a_descendants_text_of_avg_length(element)
273 |             parent_selector = element.parent_selector
274 |             if recode_avg_len_of_path.get(parent_selector) is not None:
275 |                 if recode_avg_len_of_path[parent_selector] < LIST_AVG_LENGTH:
276 |                     continue
277 |             else:
278 |                 recode_avg_len_of_path[parent_selector] = avg_len
279 |                 if recode_avg_len_of_path[parent_selector] < LIST_AVG_LENGTH:
280 |                     continue
281 | 
282 |             if element.tag == "a":
283 |                 path_raw = element.path_raw
284 |                 if path_raw != best_path:  # and descendant.text == ""
285 |                     continue
286 |                 title = element.attrib.get("title") or element.text
287 |                 if not title or title in DIRECTORY_ERROR_TITLE or len(title) < 2:
288 |                     continue
289 |                 flag = False
290 |                 for ss in HIGH_WEIGHT_ERROR_KEYWORD:
291 |                     if ss in title:
292 |                         flag = True
293 |                         break
294 |                 if flag:
295 |                     continue
296 |                 url = element.attrib.get("href")
297 |                 if url is None:
298 |                     continue
299 |                 if "javascript" in url:
300 |                     ss = element.attrib.get("data-href")
301 |                     if ss is not None:
302 |                         url = ss
303 |                 if "#" in url or "javascript:void" in url:
304 |                     ss = element.attrib.get("onclick")
305 |                     if ss is not None:
306 |                         url = ss
307 | 
308 |                 if url.startswith("//"):
309 |                     url = "http:" + url
310 |                 base_url = self.kwargs.get("base_url")
311 |                 if base_url:
312 |                     url = urljoin(base_url, url)
313 |                 if url in cache_url_list:
314 |                     continue
315 |                 else:
316 |                     cache_url_list.append(url)
317 |                 result.append({"title": title, "url": url})
318 |                 continue
319 |             else:
320 |                 descendants = element.a_descendants
321 |                 for descendant in descendants:
322 |                     path_raw = descendant.path_raw
323 |                     if path_raw != best_path:  # and descendant.text == ""
324 |                         continue
325 |                     title = descendant.attrib.get("title") or descendant.text
326 |                     if not title or title in DIRECTORY_ERROR_TITLE or len(title) < 2:
327 |                         continue
328 |                     flag = False
329 |                     for ss in HIGH_WEIGHT_ERROR_KEYWORD:
330 |                         if ss in title:
331 |                             flag = True
332 |                             break
333 |                     if flag:
334 |                         continue
335 |                     url = descendant.attrib.get("href")
336 |                     if url is None:
337 |                         continue
338 |                     if "javascript" in url:
339 |                         ss = descendant.attrib.get("data-href")
340 |                         if ss is not None:
341 |                             url = ss
342 |                     if "#" in url or "javascript:void" in url:
343 |                         ss = descendant.attrib.get("onclick")
344 |                         if ss is not None:
345 |                             url = ss
346 | 
347 |                     if url.startswith("//"):
348 |                         url = "http:" + url
349 |                     base_url = self.kwargs.get("base_url")
350 |                     if base_url:
351 |                         url = urljoin(base_url, url)
352 |                     if url in cache_url_list:
353 |                         continue
354 |                     else:
355 |                         cache_url_list.append(url)
356 |                     result.append({"title": title, "url": url})
357 |         return result
358 | 
359 |     def process(self, element: Element):
360 |         """
361 |         extract content from html
362 |         :param element:
363 |         :return:
364 |         """
365 |         # preprocess
366 |         preprocess4list_extractor(element)
367 | 
368 |         # build clusters
369 |         clusters = self._build_clusters(element)
370 |         # logger.debug(f"after build clusters {clusters}")
371 | 
372 |         # choose best cluster
373 |         best_cluster = self._best_cluster(clusters)
374 |         # logger.debug(f"best cluster {best_cluster}")
375 | 
376 |         extended_cluster = self._extend_cluster(best_cluster)
377 |         # logger.debug(f"extended cluster {extended_cluster}")
378 | 
379 |         # extract result from the extended cluster
380 |         return self._extract_cluster(extended_cluster)
381 | 
382 |     def extract(self, html, **kwargs):
383 |         self.kwargs = kwargs
384 |         for key, value in SPECIAL_SYMBOL_MAP.items():
385 |             html = html.replace(key, value)
386 | 
387 |         element = fromstring(html=html)  # if the markup holds several documents, fromstring keeps only the first one. TODO: cannot parse non-standard html
388 |         if self.kwargs.get("list_xpath"):
389 |             # assume list_xpath selects the <a> nodes of the index list; keep the same
390 |             # [{"title": ..., "url": ...}] return contract as process()
391 |             base_url = self.kwargs.get("base_url")
392 |             result = []
393 |             for node in element.xpath(self.kwargs.get("list_xpath")):
394 |                 if not isinstance(node, HtmlElement):
395 |                     continue
396 |                 title = (node.attrib.get("title") or ''.join(node.xpath(".//text()"))).strip()
397 |                 url = node.attrib.get("href")
398 |                 if not title or not url:
399 |                     continue
400 |                 result.append({"title": title, "url": urljoin(base_url, url) if base_url else url})
401 |             return result
402 | 
403 |         element.__class__ = Element
404 |         return self.process(element)
405 | 
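ListExtractor yields [{"title": ..., "url": ...}] rows from an index page, and base_url (when given) resolves relative links; index_html below stands in for a fetched page:

    from CrawlersTools.extractors import ListExtractor

    index_html = "<html>...</html>"  # placeholder: source of a news/list index page
    rows = ListExtractor().extract(index_html, base_url="https://example.org/news/")
    for row in rows or []:
        print(row["title"], row["url"])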
--------------------------------------------------------------------------------
/CrawlersTools/extractors/schemas/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MuggleK/CrawlersTools/8f59a1dd884367f2f59e1f6f63b683c9f24ad377/CrawlersTools/extractors/schemas/__init__.py

--------------------------------------------------------------------------------
/CrawlersTools/extractors/schemas/element.py:
--------------------------------------------------------------------------------
1 | from lxml.html import HtmlElement, etree
2 | from numpy import mean
3 | 
4 | 
5 | class Element(HtmlElement):
6 |     _id: int = None
7 |     _selector: str = None
8 |     _parent_selector: str = None
9 |     _alias: str = None
10 |     _tag_name: str = None
11 |     _path: str = None
12 |     _path_raw: str = None
13 |     _children = None
14 |     _parent = None
15 |     _siblings = None
16 |     _descendants = None
17 |     _text = None
18 |     _number_of_char: int = None
19 |     _number_of_a_char: int = None
20 |     _number_of_punctuation: int = None
21 |     _number_of_a_descendants: int = None
22 |     _number_of_p_descendants: int = None
23 |     _number_of_children: int = None
24 |     _number_of_siblings: int = None
25 |     _number_of_descendants: int = None
26 |     _density_of_punctuation: float = None
27 |     _density_of_text: float = None
28 |     _density_score: float = None
29 |     _similarity_with_siblings: float = None
30 |     _a_descendants: list = None
31 |     _img_descendants: list = None
32 |     _a_descendants_group: dict = None
33 |     _a_descendants_group_text_length: dict = None
34 |     _a_descendants_group_text_min_length: float = None
35 |     _a_descendants_group_text_max_length: float = None
36 | 
37 |     density_score: float = None
38 | 
39 |     @property
40 |     def id(self):
41 |         """
42 |         get id by hashed element
43 |         :return:
44 |         """
45 |         if self._id is not None:
46 |             return self._id
47 |         self._id = hash(self)
48 |         return self._id
49 | 
50 |     @property
51 |     def nth(self):
52 |         """
53 |         get nth index of this element in parent element
54 |         :return:
55 |         """
56 |         return len(list(self.itersiblings(preceding=True))) + 1
57 | 
58 | 
59 |     @property
60 |     def alias(self):
61 |         """
62 |         get alias of element, using all attributes to construct it.
63 |         :return: string
64 |         """
65 |         if self._alias is not None:
66 |             return self._alias
67 |         from CrawlersTools.extractors.utils.element import alias
68 |         self._alias = alias(self)
69 |         return self._alias
70 | 
71 |     @property
72 |     def selector(self):
73 |         """
74 |         get css selector of this element
75 |         :return:
76 |         """
77 |         if self._selector is not None:
78 |             return self._selector
79 |         from CrawlersTools.extractors.utils.element import selector
80 |         self._selector = selector(self)
81 |         return self._selector
82 | 
83 |     @property
84 |     def children(self):
85 |         """
86 |         get children of this element
87 |         :return:
88 |         """
89 |         if self._children is not None:
90 |             return self._children
91 |         from CrawlersTools.extractors.utils.element import children
92 |         self._children = list(children(self))
93 |         return self._children
94 | 
95 |     @property
96 |     def siblings(self):
97 |         """
98 |         get siblings of this element
99 |         :return:
100 |         """
101 |         if self._siblings is not None:
102 |             return self._siblings
103 |         from CrawlersTools.extractors.utils.element import siblings
104 |         self._siblings = list(siblings(self))
105 |         return self._siblings
106 | 
107 |     @property
108 |     def descendants(self):
109 |         """
110 |         get descendants of this element
111 |         :return:
112 |         """
113 |         if self._descendants is not None:
114 |             return self._descendants
115 |         from CrawlersTools.extractors.utils.element import descendants
116 |         self._descendants = list(descendants(self))
117 |         return self._descendants
118 | 
119 |     @property
120 |     def parent_selector(self):
121 |         """
122 |         get css selector of the parent element
123 |         :return:
124 |         """
125 |         if self._parent_selector is not None:
126 |             return self._parent_selector
127 |         from CrawlersTools.extractors.utils.element import selector, parent
128 |         # TODO: change parent(self) to self.parent
129 |         p = parent(self)
130 |         if p is not None:
131 |             self._parent_selector = selector(p)
132 |         return self._parent_selector
133 | 
134 |     @property
135 |     def tag_name(self):
136 |         """
137 |         return tag name
138 |         :return:
139 |         """
140 |         if self._tag_name:
141 |             return self._tag_name
142 |         self._tag_name = self.tag
143 |         return self._tag_name
144 | 
145 |     @property
146 |     def text(self):
147 |         """
148 |         get text of element
149 |         :return:
150 |         """
151 |         if self._text is not None:
152 |             return self._text
153 |         from CrawlersTools.extractors.utils.element import text
154 |         self._text = text(self)
155 |         return self._text
156 | 
157 |     @property
158 |     def string(self):
159 |         """
160 |         return string of element
161 |         :return:
162 |         """
163 |         return etree.tostring(self, pretty_print=True, encoding="utf-8", method='html').decode('utf-8')
164 | 
165 |     @property
166 |     def path(self):
167 |         """
168 |         get tag path using external path function
169 |         :return:
170 |         """
171 |         if self._path is not None:
172 |             return self._path
173 |         from CrawlersTools.extractors.utils.element import path
174 |         self._path = path(self)
175 |         return self._path
176 | 
177 |     @property
178 |     def path_raw(self):
179 |         """
180 |         get tag raw path using external path raw function
181 |         :return:
182 |         """
183 |         if self._path_raw is not None:
184 |             return self._path_raw
185 |         from CrawlersTools.extractors.utils.element import path_raw
186 |         self._path_raw = path_raw(self)
187 |         return self._path_raw
188 | 
189 |     @property
190 |     def number_of_char(self):
191 |         """
192 |         get text length
193 |         :return:
194 |         """
195 |         if self._number_of_char is not None:
196 |             return self._number_of_char
197 |         from CrawlersTools.extractors.utils.element import number_of_char
198 |         self._number_of_char = number_of_char(self)
199 |         return self._number_of_char
200 | 
201 |     @property
202 |     def number_of_a_descendants(self):
203 |         """
204 |         get number of a descendants
205 |         :return:
206 |         """
207 |         if self._number_of_a_descendants is not None:
208 |             return self._number_of_a_descendants
209 |         from CrawlersTools.extractors.utils.element import number_of_a_descendants
210 |         self._number_of_a_descendants = number_of_a_descendants(self)
211 |         return self._number_of_a_descendants
212 | 
213 |     @property
214 |     def number_of_a_char(self):
215 |         """
216 |         get a text length
217 |         :return:
218 |         """
219 |         if self._number_of_a_char is not None:
220 |             return self._number_of_a_char
221 |         from CrawlersTools.extractors.utils.element import number_of_a_char
222 |         self._number_of_a_char = number_of_a_char(self)
223 |         return self._number_of_a_char
224 | 
225 |     @property
226 |     def number_of_p_descendants(self):
227 |         """
228 |         return number of paragraph descendants
229 |         :return:
230 |         """
231 |         if self._number_of_p_descendants is not None:
232 |             return self._number_of_p_descendants
233 |         from CrawlersTools.extractors.utils.element import number_of_p_descendants
234 |         self._number_of_p_descendants = number_of_p_descendants(self)
235 |         return self._number_of_p_descendants
236 | 
237 |     @property
238 |     def number_of_punctuation(self):
239 |         """
240 |         get number of punctuation
241 |         :return:
242 |         """
243 |         if self._number_of_punctuation is not None:
244 |             return self._number_of_punctuation
245 |         from CrawlersTools.extractors.utils.element import number_of_punctuation
246 |         self._number_of_punctuation = number_of_punctuation(self)
247 |         return self._number_of_punctuation
248 | 
249 |     @property
250 |     def number_of_children(self):
251 |         """
252 |         get children number
253 |         :return:
254 |         """
255 |         if self._number_of_children is not None:
256 |             return self._number_of_children
257 |         self._number_of_children = len(list(self.children))
258 |         return self._number_of_children
259 | 
260 |     @property
261 |     def number_of_siblings(self):
262 |         """
263 |         get number of siblings
264 |         :return:
265 |         """
266 |         if self._number_of_siblings is not None:
267 |             return self._number_of_siblings
268 |         self._number_of_siblings = len(list(self.siblings))
269 |         return self._number_of_siblings
270 | 
271 |     @property
272 |     def number_of_descendants(self):
273 |         """
274 |         get number of descendants
275 |         :return:
276 |         """
277 |         if self._number_of_descendants is not None:
278 |             return self._number_of_descendants
279 |         from CrawlersTools.extractors.utils.element import number_of_descendants
280 |         self._number_of_descendants = number_of_descendants(self)
281 |         return self._number_of_descendants
282 | 
283 |     @property
284 |     def density_of_punctuation(self):
285 |         """
286 |         get density of punctuation
287 |         :return:
288 |         """
289 |         if self._density_of_punctuation is not None:
290 |             return self._density_of_punctuation
291 |         from CrawlersTools.extractors.utils.element import density_of_punctuation
292 |         self._density_of_punctuation = density_of_punctuation(self)
293 |         return self._density_of_punctuation
294 | 
295 |     @property
296 |     def density_of_text(self):
297 |         """
298 |         get density of text
299 |         :return:
300 |         """
301 |         if self._density_of_text is not None:
302 |             return self._density_of_text
303 |         from CrawlersTools.extractors.utils.element import density_of_text
304 |         self._density_of_text = density_of_text(self)
305 |         return self._density_of_text
306 | 
307 |     @property
308 |     def similarity_with_siblings(self):
309 |         """
310 |         get similarity with siblings
311 |         :return:
| """ 313 | if self._similarity_with_siblings is not None: 314 | return self._similarity_with_siblings 315 | from CrawlersTools.extractors.utils.element import similarity_with_siblings 316 | self._similarity_with_siblings = similarity_with_siblings(self) 317 | return self._similarity_with_siblings 318 | 319 | @property 320 | def a_descendants(self): 321 | """ 322 | get linked descendants 323 | :return: 324 | """ 325 | if self._a_descendants is not None: 326 | return self._a_descendants 327 | from CrawlersTools.extractors.utils.element import a_descendants 328 | self._a_descendants = a_descendants(self) 329 | return self._a_descendants 330 | 331 | @property 332 | def img_descendants(self): 333 | """ 334 | get linked descendants 335 | :return: 336 | """ 337 | if self._img_descendants is not None: 338 | return self._img_descendants 339 | from CrawlersTools.extractors.utils.element import img_descendants 340 | 341 | self._img_descendants = img_descendants(self) 342 | return self._img_descendants 343 | 344 | @property 345 | def a_descendants_group(self): 346 | """ 347 | get linked descendants group 348 | :return: 349 | """ 350 | if self._a_descendants_group is not None: 351 | return self._a_descendants_group 352 | from CrawlersTools.extractors.utils.element import a_descendants_group 353 | self._a_descendants_group = a_descendants_group(self) 354 | return self._a_descendants_group 355 | 356 | @property 357 | def a_descendants_group_text_length(self): 358 | """ 359 | grouped linked text length 360 | :return: 361 | """ 362 | if self._a_descendants_group_text_length is not None: 363 | return self._a_descendants_group_text_length 364 | result = {} 365 | from CrawlersTools.extractors.utils.element import text 366 | for path, elements in self.a_descendants_group.items(): 367 | lengths = [] 368 | for element in elements: 369 | # TODO: convert len(text(element)) to element.number_of_char 370 | lengths.append(len(text(element))) 371 | mean_length = mean(lengths) if len(lengths) else 0 372 | result[path] = mean_length 373 | return result 374 | 375 | @property 376 | def a_descendants_group_text_min_length(self): 377 | """ 378 | get grouped linked text min length 379 | :return: 380 | """ 381 | if self._a_descendants_group_text_min_length is not None: 382 | return self._a_descendants_group_text_min_length 383 | values = self.a_descendants_group_text_length.values() 384 | self._a_descendants_group_text_min_length = min(values) if values else 0 385 | return self._a_descendants_group_text_min_length 386 | 387 | @property 388 | def a_descendants_group_text_max_length(self): 389 | """ 390 | get grouped linked text max length 391 | :return: 392 | """ 393 | if self._a_descendants_group_text_max_length is not None: 394 | return self._a_descendants_group_text_max_length 395 | values = self.a_descendants_group_text_length.values() 396 | self._a_descendants_group_text_max_length = max(values) if values else 0 397 | return self._a_descendants_group_text_max_length 398 | 399 | @property 400 | def a_descendants_group_text_avg_length(self): 401 | """ 402 | get grouped linked text avg length 403 | :return: 404 | """ 405 | if self._a_descendants_group_text_max_length is not None: 406 | return self._a_descendants_group_text_max_length 407 | values = self.a_descendants_group_text_length.values() 408 | self._a_descendants_group_text_max_length = max(values) if values else 0 409 | return self._a_descendants_group_text_max_length 410 | 411 | def __str__(self): 412 | """ 413 | rewrite str 414 | :return: 415 | """ 416 | return f'' 
--------------------------------------------------------------------------------
/CrawlersTools/extractors/time_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/11/3 9:55
4 | # @Author : MuggleK
5 | # @File : time_extractor.py
6 | 
7 | import re
8 | 
9 | from lxml.html import etree
10 | 
11 | from CrawlersTools.extractors.base import BaseExtractor
12 | from CrawlersTools.extractors.schemas.element import Element
13 | from CrawlersTools.extractors.utils.settings import DATETIME_PATTERN, PUBLISH_TIME_META, TITLE_EXTRACTOR_USELESS_TAGS
14 | from CrawlersTools.preprocess import TimeProcessor
15 | 
16 | format_time = TimeProcessor().format
17 | 
18 | 
19 | class TimeExtractor(BaseExtractor):
20 | 
21 |     @staticmethod
22 |     def extract_from_xpath(element: Element, publish_time_xpath: str) -> str:
23 |         if publish_time_xpath:
24 |             publish_time = ''.join(element.xpath(publish_time_xpath))
25 |             return format_time(publish_time)
26 |         return ''
27 | 
28 |     @staticmethod
29 |     def extract_from_text(element: Element) -> str:
30 |         text = ''.join(element.xpath('.//text()'))
31 |         for dt in DATETIME_PATTERN:
32 |             dt_obj = re.search(dt, text)
33 |             if dt_obj:
34 |                 return format_time(dt_obj.group(1))
35 |         return ''
36 | 
37 |     @staticmethod
38 |     def extract_from_meta(element: Element) -> str:
39 |         """
40 |         match META information first
41 |         :param element: DOM tree of the page source
42 |         :return: str
43 |         """
44 |         for xpath in PUBLISH_TIME_META:
45 |             publish_time = element.xpath(xpath)
46 |             if publish_time:
47 |                 return format_time(''.join(publish_time))
48 |         return ''
49 | 
50 |     def process(self, element: Element):
51 |         # remove tag and its content
52 |         etree.strip_elements(element, *TITLE_EXTRACTOR_USELESS_TAGS)
53 | 
54 |         publish_time = (self.extract_from_xpath(element, publish_time_xpath=self.kwargs.get("publish_time_xpath"))
55 |                         or self.extract_from_meta(element)
56 |                         or self.extract_from_text(element))
57 | 
58 |         return publish_time
59 | 
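TimeExtractor falls back from an explicit xpath to META tags to a regex sweep of the page text, so a date buried in the body is still found. A sketch, assuming the default DATETIME_PATTERN settings cover this common format:

    from CrawlersTools.extractors import TimeExtractor

    html = "<html><body><p>发布时间:2022-11-03 09:55</p></body></html>"
    publish_time = TimeExtractor().extract(html)  # no xpath given: falls through to the text patterns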
--------------------------------------------------------------------------------
/CrawlersTools/extractors/title_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/19 20:23
4 | # @Author : MuggleK
5 | # @File : title_extractor.py
6 | 
7 | import re
8 | from itertools import combinations
9 | 
10 | from lxml.html import etree
11 | 
12 | from CrawlersTools.extractors.base import BaseExtractor
13 | from CrawlersTools.extractors.schemas.element import Element
14 | from CrawlersTools.extractors.utils.settings import (
15 |     TITLE_HTAG_XPATH, TITLE_META_XPATH, TITLE_META_XPATH_BAK, TITLE_EXTRACTOR_USELESS_TAGS, PUNCTUATION_ALPHA_PATTERN
16 | )
17 | from CrawlersTools.extractors.utils.similarity import get_longest_common_sub_string
18 | 
19 | 
20 | class TitleExtractor(BaseExtractor):
21 | 
22 |     @staticmethod
23 |     def extract_by_xpath(element, title_xpath):
24 |         if title_xpath:
25 |             title_list = element.xpath(title_xpath)
26 |             if title_list:
27 |                 return title_list[0]
28 |         return ''
29 | 
30 |     @staticmethod
31 |     def extract_by_title(element):
32 |         title_list = element.xpath(TITLE_META_XPATH) or element.xpath(TITLE_META_XPATH_BAK)
33 |         if title_list:
34 |             return max(title_list, key=len)
35 |         else:
36 |             return ''
37 | 
38 |     @staticmethod
39 |     def extract_by_htag(element):
40 |         title_list = element.xpath(TITLE_HTAG_XPATH)
41 |         title_list = [re.sub(PUNCTUATION_ALPHA_PATTERN, "", phrase) for phrase in title_list]
42 |         if not title_list:
43 |             return ''
44 |         string_list = [''.join(filter(str.isalnum, string)) for string in title_list]
45 |         max_string = max(string_list, key=len)
46 |         return title_list[string_list.index(max_string)]
47 | 
48 |     @staticmethod
49 |     def extract_common_str(element: Element) -> str:
50 |         h_tag_texts_list = element.xpath(TITLE_HTAG_XPATH)
51 |         new_title_list = list(combinations(h_tag_texts_list, 2))
52 |         if len(new_title_list) == 1:
53 |             new_title = str(max(list(new_title_list[0]), key=len))
54 |             return new_title
55 | 
56 |         common_title_list = [get_longest_common_sub_string(i[0], i[1]).strip() for i in new_title_list]
57 |         if common_title_list:
58 |             new_title = max(common_title_list, key=len)
59 |             sub_string = re.sub(r'\d+', '', ''.join(filter(str.isalnum, new_title)))
60 |             return new_title if len(new_title) > 4 and sub_string else ''
61 |         return ''
62 | 
63 |     def process(self, element: Element):
64 |         # remove tag and its content
65 |         etree.strip_elements(element, *TITLE_EXTRACTOR_USELESS_TAGS)
66 | 
67 |         title = (self.extract_by_xpath(element, title_xpath=self.kwargs.get("title_xpath"))
68 |                  or self.extract_by_title(element)
69 |                  or self.extract_common_str(element)
70 |                  or self.extract_by_htag(element)
71 |                  )
72 |         return title.strip()
73 | 
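TitleExtractor cascades the same way: explicit xpath first, then <title>/meta candidates, then the common substring of h-tags. A sketch, assuming the default TITLE_META_XPATH settings pick up the <title> tag:

    from CrawlersTools.extractors import TitleExtractor

    html = ("<html><head><title>Policy No. 12 - Gov Site</title></head>"
            "<body><h1>Policy No. 12</h1></body></html>")
    title = TitleExtractor().extract(html)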
--------------------------------------------------------------------------------
/CrawlersTools/extractors/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MuggleK/CrawlersTools/8f59a1dd884367f2f59e1f6f63b683c9f24ad377/CrawlersTools/extractors/utils/__init__.py

--------------------------------------------------------------------------------
/CrawlersTools/extractors/utils/cluster.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
3 | from CrawlersTools.extractors.utils.similarity import similarity
4 | 
5 | 
6 | def cluster(items, threshold=0.9):
7 |     """
8 |     cluster names
9 |     :param items:
10 |     :param threshold:
11 |     :return: cluster map, for example {"foo": 0, "bar": 1}
12 |     """
13 |     number = -1
14 |     clusters_map = {}
15 |     clusters = []
16 |     for name in items:
17 |         for c in clusters:
18 |             if all(similarity(name, w) > threshold for w in c):
19 |                 c.append(name)
20 |                 clusters_map[name] = clusters.index(c)
21 |                 break
22 |         else:
23 |             number += 1
24 |             clusters.append([name])
25 |             clusters_map[name] = number
26 |     return clusters_map
27 | 
28 | 
29 | def cluster_dict(data: dict, threshold=0.8):
30 |     """
31 |     cluster dict, convert id key to cluster id key
32 |     :param threshold:
33 |     :param data:
34 |     :return:
35 |     """
36 |     ids = data.keys()
37 |     clusters_map = cluster(ids, threshold)
38 |     result = defaultdict(list)
39 |     for k, v in data.items():
40 |         if isinstance(v, list):
41 |             for i in v:
42 |                 result[clusters_map[k]].append(i)
43 |         else:
44 |             result[clusters_map[k]].append(v)
45 |     return dict(result)
46 | 
47 | 
48 | if __name__ == '__main__':
49 |     data = {
50 |         '/html/body/div[@class="main"]/div[1]/ul': ['child1', 'child2', 'child3'],
51 |         '/html/body/div[@class="main"]/div[2]/ul': ['child4', 'child5', 'child6'],
52 |         '/html/body/div[@class="main"]/div[3]/ul': ['child7', 'child8', 'child9'],
53 |         '/html/body/header/div[1]': ['child10', 'child11', 'child12'],
54 |         '/html/body/header/div[2]': ['child13', 'child14', 'child15'],
55 |     }
56 |     print(cluster_dict(data, threshold=0.7))
57 | 

--------------------------------------------------------------------------------
/CrawlersTools/extractors/utils/element.py:
--------------------------------------------------------------------------------
1 | import re
2 | from collections import defaultdict
3 | from os.path import exists
4 | from types import ModuleType
5 | 
6 | import numpy as np
7 | from loguru import logger
8 | from lxml.html import fromstring, HtmlElement
9 | 
10 | from CrawlersTools.extractors.schemas.element import Element
11 | from CrawlersTools.extractors.utils.similarity import similarity
12 | 
13 | PUNCTUATION = set('''!,。?、;:“”‘’《》%()<>{}「」【】*~`,.?:;'"!%()''')
14 | 
15 | 
16 | def remove_element(element: Element):
17 |     """
18 |     remove child element from parent
19 |     :param element:
20 |     :return:
21 |     """
22 |     if element is None:
23 |         return
24 |     p = element.getparent()
25 |     if p is not None:
26 |         p.remove(element)
27 | 
28 | 
29 | def remove_children(element: Element, xpaths):
30 |     """
31 |     remove children from element
32 |     :param element:
33 |     :param xpaths:
34 |     :return:
35 |     """
36 |     if element is None:
37 |         return
38 |     if not xpaths:
39 |         return
40 |     for xpath in xpaths:
41 |         nodes = element.xpath(xpath)
42 |         for node in nodes:
43 |             remove_element(node)
44 |     return element
45 | 
46 | 
47 | def html2element(html: str):
48 |     """
49 |     convert html to HtmlElement
50 |     :param html:
51 |     :return:
52 |     """
53 |     if not html:
54 |         return None
55 |     element = fromstring(html)
56 |     element.__class__ = Element
57 |     return element
58 | 
59 | 
60 | def file2element(file_path):
61 |     """
62 |     convert file to element
63 |     :param file_path:
64 |     :return:
65 |     """
66 |     if not exists(file_path):
67 |         return
68 |     with open(file_path, encoding='utf-8') as f:
69 |         return html2element(f.read())
70 | 
71 | 
72 | def selector(element: Element):
73 |     """
74 |     get css selector using recursive function,
75 |     for example result: html>body>div>div>ul>li
76 |     :param element:
77 |     :return:
78 |     """
79 |     if element is None:
80 |         return ''
81 |     p = parent(element)
82 |     if p is not None:
83 |         return selector(p) + '>' + alias(element)
84 |     return element.alias
85 | 
86 | 
87 | def path_raw(element: Element):
88 |     """
89 |     get tag path using recursive function, only contains raw tag,
90 |     for example result: html/body/div/div/ul/li
91 |     :param element:
92 |     :return:
93 |     """
94 |     if element is None:
95 |         return ''
96 |     p = parent(element)
97 |     if p is not None:
98 |         return path_raw(p) + '/' + element.tag
99 |     return element.tag
100 | 
101 | 
102 | def path(element: Element):
103 |     """
104 |     get tag path using recursive function,
105 |     for example result: html/body/div/div/ul/li:nth-child(1)
106 |     :param element:
107 |     :return:
108 |     """
109 |     if element is None:
110 |         return ''
111 |     result = path_raw(element)
112 |     # get nth-child
113 |     nth = len(list(element.itersiblings(preceding=True))) + 1
114 |     result += f':nth-child({nth})'
115 |     return result
116 | 
117 | 
118 | def a_descendants(element: Element):
119 |     """
120 |     get all <a> descendants of element
121 |     :param element:
122 |     :return:
123 |     """
124 |     if element is None:
125 |         return []
126 |     descendants = []
127 |     for descendant in element.xpath('.//a'):
128 |         descendant.__class__ = Element
129 |         descendants.append(descendant)
130 |     return descendants
131 | 
132 | 
133 | def img_descendants(element: Element):
134 |     """
135 |     get all <img> descendants of element
136 |     :param element:
137 |     :return:
138 |     """
139 |     if element is None:
140 |         return []
141 |     descendants = []
142 |     for descendant in element.xpath(".//img"):
143 |         descendant.__class__ = Element
144 |         descendants.append(descendant)
145 |     return descendants
146 | 
147 | 
148 | def a_descendants_group(element: Element):
149 |     """
150 |     get linked descendants group
151 |     :param element:
152 |     :return:
153 |     """
154 |     result = defaultdict(list)
155 |     for linked_descendant in element.a_descendants:
156 |         p = linked_descendant.path_raw
157 |         result[p].append(linked_descendant)
158 |     return result
159 | 
160 | 
161 | def parent(element: Element):
162 |     """
163 |     get parent of element
164 |     :param element:
165 |     :return:
166 |     """
167 |     if element is None:
168 |         return None
169 |     parent = element.getparent()
170 |     if isinstance(parent, HtmlElement):
171 |         parent.__class__ = Element
172 |     return parent
173 | 
174 | 
175 | def children(element: Element, including=False):
176 |     """
177 |     get children
178 |     :param element:
179 |     :param including:
180 |     :return:
181 |     """
182 |     if element is None:
183 |         return []
184 |     if including:
185 |         yield element
186 |     for child in element.iterchildren():
187 |         if isinstance(child, HtmlElement):
188 |             child.__class__ = Element
189 |             yield child
190 | 
191 | 
192 | def siblings(element: Element, including=False):
193 |     """
194 |     get siblings of element
195 |     :param element:
196 |     :param including: include current element or not
197 |     :return:
198 |     """
199 |     if element is None:
200 |         return []
201 |     if including:
202 |         yield element
203 |     for sibling in element.itersiblings(preceding=True):
204 |         if isinstance(sibling, HtmlElement):
205 |             sibling.__class__ = Element
206 |             yield sibling
207 |     for sibling in element.itersiblings(preceding=False):
208 |         if isinstance(sibling, HtmlElement):
209 |             sibling.__class__ = Element
210 |             yield sibling
211 | 
212 | 
213 | def descendants(element: Element, including=False):
214 |     """
215 |     get descendant elements of specific element
216 |     :param element: parent element
217 |     :param including: including current element or not
218 |     :return:
219 |     """
220 |     if element is None:
221 |         return []
222 |     if including:
223 |         yield element
224 |     for descendant in element.iterdescendants():
225 |         if isinstance(descendant, HtmlElement):
226 |             descendant.__class__ = Element
227 |             yield descendant
228 | 
229 | 
230 | def alias(element: Element):
231 |     """
232 |     get alias of element, concat tag and attribs
233 |     :param element:
234 |     :return:
235 |     """
236 |     if element is None:
237 |         return ''
238 |     tag = element.tag
239 |     # skip nth-child
240 |     if tag in ['html', 'body']:
241 |         return tag
242 |     attribs = [tag]
243 |     for k, v in element.attrib.items():
244 |         k, v = re.sub(r'\s*', '', k), re.sub(r'\s*', '', v)
245 |         attribs.append(f'[{k}="{v}"]' if v else f'[{k}]')
246 |     result = ''.join(attribs)
247 |     # get nth-child
248 |     nth = len(list(element.itersiblings(preceding=True))) + 1
249 |     result += f':nth-child({nth})'
250 |     return result
251 | 
252 | 
253 | def children_of_head(element: Element):
254 |     """
255 |     get descendants of the head element
256 |     :param element:
257 |     :return:
258 |     """
259 |     if element is None:
260 |         return []
261 |     head_xpath = '//head'
262 |     head_elements = element.xpath(head_xpath)
263 |     if head_elements:
264 |         head_elements[0].__class__ = Element
265 |         return descendants(head_elements[0], True)
266 |     return []
267 | 
268 | 
269 | def descendants_of_body(element: Element):
270 |     """
271 |     get descendants of the root element (despite the name, it walks the whole tree)
272 |     :param element:
273 |     :return:
274 |     """
275 |     if element is None:
276 |         return []
277 |     body_xpath = '//*'
278 |     elements = element.xpath(body_xpath)
279 |     if elements:
280 |         elements[0].__class__ = Element
281 |         return list(descendants(elements[0], True))
282 |     return []
283 | 
284 | 
285 | def text(element: Element):
286 |     """
287 |     get text of element
288 |     :param element:
289 |     :return:
290 |     """
291 |     if element is None:
292 |         return ''
293 |     text = ''.join(element.xpath('.//text()'))
294 |     text = re.sub(r'\s*', '', text, flags=re.S)
295 |     # text = ''.join(re.findall(r'[\u4e00-\u9fa5]+', text))
296 |     return text
297 | 
298 | 
299 | def calc_a_descendants_text_of_avg_length(element: Element):
300 | 
301 |     if element is None:
302 |         return 0
303 |     if element.tag.lower() == "a":
304 |         if len(element.siblings) > 1:
305 |             lengths = []
306 |             for link in element.siblings:
307 |                 if link.attrib.get("title") is not None:
308 |                     lengths.append(len(link.attrib.get("title")))
309 |                 if len(link.text.strip()) > 0:
310 |                     lengths.append(len(link.text.strip()))
311 |                 if len(text(link)) > 0:
312 |                     lengths.append(len(text(link)))
313 |                 lengths.append(0)
314 |             if len(lengths) == 0:
315 |                 return 0
316 |             avg_length = np.mean(lengths)
317 |             return avg_length
318 |         else:
319 |             return element.number_of_char
320 |     lengths = []
321 |     # other node
322 |     try:
323 |         siblings = element.siblings
324 |     except Exception:
325 |         return 0
326 | 
327 |     for descendant in siblings:
328 |         if descendant.number_of_a_char > 0:
329 |             txt = descendant.text
330 |             regex = r"(2[0-9]{3}.?[0-1]{0,1}[0-9].?[0-3]{0,1}[0-9])"
331 |             regex2 = r"([0-1]{0,1}[0-9]-[0-3]{0,1}[0-9])"
332 |             time_match = len(re.findall(regex, txt)) or len(re.findall(regex2, txt))
333 |             t = descendant.number_of_a_char
334 |             if time_match > 0:
335 |                 t = t + 10
336 |             lengths.append(t / len(descendant.a_descendants))
337 |             continue
338 |         if len(descendant.text.strip()) > 0:
339 |             lengths.append(len(descendant.text.strip()))
340 |             continue
341 |         if len(text(descendant)) > 0:
342 |             lengths.append(len(text(descendant)))
343 |             continue
344 |         lengths.append(0)
345 |     if len(lengths) == 0:
346 |         return 0
347 |     avg_length = np.mean(lengths)
348 |     return avg_length
349 | 
350 | 
351 | def number_of_char(element: Element):
352 |     """
353 |     get number of char, for example, result of `<span>hello</span>world` = 10
354 |     :param element:
355 |     :return: length
356 |     """
357 |     if element is None:
358 |         return 0
359 |     return len(text(element))
360 | 
361 | 
362 | def number_of_a_char(element: Element):
363 |     """
364 |     get number of linked char, for example, result of `<a>hello</a>world` = 5
365 |     :param element:
366 |     :return: length
367 |     """
368 |     if element is None:
369 |         return 0
370 |     text = ''.join(element.xpath('.//a//text()'))
371 |     text = re.sub(r'\s*', '', text, flags=re.S)
372 |     return len(text)
360 | def number_of_a_char(element: Element):
361 |     """
362 |     get number of linked char, for example, result of `<a href="#">hello</a>world` = 5
363 |     :param element:
364 |     :return: length
365 |     """
366 |     if element is None:
367 |         return 0
368 |     text = ''.join(element.xpath('.//a//text()'))
369 |     text = re.sub(r'\s*', '', text, flags=re.S)
370 |     return len(text)
371 | 
372 | 
373 | def number_of_a_char_log10(element: Element):
374 |     """
375 |     get number of linked char, to log10
376 |     :param element:
377 |     :return: length
378 |     """
379 |     if element is None:
380 |         return 0
381 |     return np.log10(number_of_a_char(element) + 1)
382 | 
383 | 
384 | def number_of_p_children(element: Element):
385 |     """
386 |     get number of p tags in children
387 |     :param element:
388 |     :return:
389 |     """
390 |     if element is None:
391 |         return 0
392 |     return len(element.xpath('./p'))
393 | 
394 | 
395 | def number_of_p_descendants(element: Element):
396 |     """
397 |     get number of descendant tags (counts all descendants, no longer only p-like tags)
398 |     :param element:
399 |     :return:
400 |     """
401 |     # element_list = list()
402 |     if element is None:
403 |         return 0
404 |     # for tag in ["p", "span", "tr", "td", "th", "u", "strong", "b", "section", "spanstyle", "spanlang"]:
405 |     #     element_list.extend(element.xpath(f".//{tag}"))
406 |     return len(element.xpath(".//*"))
407 | 
408 | 
409 | def number_of_p_descendants_log10(element: Element):
410 |     """
411 |     get number of descendant tags, to log10
412 |     :param element:
413 |     :return:
414 |     """
415 |     if element is None:
416 |         return 0
417 |     return np.log10(number_of_p_descendants(element) + 1)  # +1 avoids log10(0)
418 | 
419 | 
420 | def number_of_a_descendants(element: Element):
421 |     """
422 |     get number of a tags in this element
423 |     :param element:
424 |     :return:
425 |     """
426 |     if element is None:
427 |         return 0
428 |     return len(element.xpath('.//a'))
429 | 
430 | 
431 | def number_of_punctuation(element: Element):
432 |     """
433 |     get number of punctuation of text in this element
434 |     :param element:
435 |     :return:
436 |     """
437 |     if element is None:
438 |         return 0
439 |     text = ''.join(element.xpath('.//text()'))
440 |     text = re.sub(r'\s*', '', text, flags=re.S)
441 |     punctuations = [c for c in text if c in PUNCTUATION]
442 |     return len(punctuations)
443 | 
444 | 
445 | def number_of_descendants(element: Element):
446 |     """
447 |     get number of descendants
448 |     :param element:
449 |     :return:
450 |     """
451 |     if element is None:
452 |         return 0
453 |     # return len(element.xpath('.//*'))
454 |     return len(list(descendants(element, including=False)))
455 | 
456 | 
457 | def number_of_siblings(element: Element):
458 |     """
459 |     get number of siblings
460 |     :param element:
461 |     :return:
462 |     """
463 |     if element is None:
464 |         return 0
465 |     return len(list(siblings(element, including=False)))
466 | 
467 | 
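# --- Editor's illustrative sketch (not part of the original file) ---
# number_of_clusters() below gates candidates with the thresholds from
# settings.py. The same conditions, written as a standalone predicate;
# `candidate` is assumed to be an Element from schemas.element, whose
# properties are the ones the original filter chain reads:
from CrawlersTools.extractors.utils.settings import (
    LIST_MAX_LENGTH, LIST_MIN_LENGTH, LIST_MIN_NUMBER, SIMILARITY_THRESHOLD)


def looks_like_list_item(candidate) -> bool:
    """Mirror of the filter chain inside number_of_clusters()."""
    return (
        candidate.number_of_siblings + 1 >= LIST_MIN_NUMBER
        and candidate.a_descendants_group_text_min_length <= LIST_MAX_LENGTH
        and candidate.a_descendants_group_text_max_length >= LIST_MIN_LENGTH
        and candidate.similarity_with_siblings >= SIMILARITY_THRESHOLD
    )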
468 | def number_of_clusters(element: Element, tags=None):
469 |     """
470 |     get number of list-like clusters, optionally restricted to the given tags
471 |     :param element:
472 |     :return:
473 |     """
474 |     from CrawlersTools.extractors.utils.settings import LIST_MIN_NUMBER, LIST_MAX_LENGTH, LIST_MIN_LENGTH, SIMILARITY_THRESHOLD
475 |     if element is None:
476 |         return 0
477 |     if tags and not isinstance(tags, (list, tuple)):
478 |         logger.error('you must pass tags arg as list or tuple')
479 |     descendants_tree = defaultdict(list)
480 |     descendants = descendants_of_body(element)
481 |     for descendant in descendants:
482 |         # if one element does not have enough siblings, it can not become a child of candidate element
483 |         if descendant.number_of_siblings + 1 < LIST_MIN_NUMBER:
484 |             continue
485 |         # if min length is larger than specified max length, it can not become a child of candidate element
486 |         if descendant.a_descendants_group_text_min_length > LIST_MAX_LENGTH:
487 |             continue
488 |         # if max length is smaller than specified min length, it can not become a child of candidate element
489 |         if descendant.a_descendants_group_text_max_length < LIST_MIN_LENGTH:
490 |             continue
491 |         # a descendant must resemble its siblings: similarity must not fall below SIMILARITY_THRESHOLD
492 |         if descendant.similarity_with_siblings < SIMILARITY_THRESHOLD:
493 |             continue
494 |         # filter tag
495 |         if tags and descendant.tag not in tags:
496 |             continue
497 |         descendants_tree[descendant.parent_selector].append(descendant)
498 |     return len(descendants_tree)
499 | 
500 | 
501 | def number_of_children(element: Element):
502 |     """
503 |     get number of children
504 |     :param element:
505 |     :return:
506 |     """
507 |     if element is None:
508 |         return 0
509 |     return len(list(children(element)))
510 | 
511 | 
512 | def density_of_text(element: Element):
513 |     """
514 |     get density of text, using:
515 |                   number_of_char - number_of_a_char
516 |     result = -------------------------------------------------
517 |              number_of_descendants - number_of_a_descendants
518 |     :return:
519 |     """
520 |     # if denominator is 0, just return 0
521 |     if element.number_of_descendants - element.number_of_a_descendants == 0:
522 |         return 0
523 |     return (element.number_of_char - element.number_of_a_char) / \
524 |            (element.number_of_descendants - element.number_of_a_descendants)
525 | 
526 | 
527 | def density_of_punctuation(element: Element):
528 |     """
529 |     get density of punctuation, using
530 |              number_of_char - number_of_linked_char
531 |     result = ---------------------------------------
532 |                   number_of_punctuation + 1
533 |     :param element:
534 |     :return:
535 |     """
536 |     result = (element.number_of_char - element.number_of_a_char) / \
537 |              (element.number_of_punctuation + 1)
538 |     # result should not be zero
539 |     return result or 1
540 | 
541 | 
542 | def similarity_with_element(element1: Element, element2: Element):
543 |     """
544 |     get similarity between two elements
545 |     :param element1:
546 |     :param element2:
547 |     :return:
548 |     """
549 |     alias1 = element1.alias
550 |     alias2 = element2.alias
551 |     # TODO: use better metrics to compare the two elements
552 |     return similarity(alias1, alias2)
553 | 
554 | 
555 | def similarity_with_siblings(element: Element):
556 |     """
557 |     get similarity with siblings
558 |     :param element:
559 |     :return:
560 |     """
561 |     scores = []
562 |     for sibling in siblings(element):
563 |         # TODO: maybe compare all children not only alias
564 |         scores.append(similarity_with_element(element, sibling))
565 |     if not scores:
566 |         return 0
567 |     return np.mean(scores)
568 | 
--------------------------------------------------------------------------------
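# --- Editor's illustrative sketch (not part of the original file) ---
# similarity_with_siblings() averages alias-level similarity over all siblings;
# on a uniform list the score approaches 1.0, which is what the list extractor
# keys on. Assumes the Element subclass from schemas.element, as the module does:
from lxml.html import fromstring

from CrawlersTools.extractors.schemas.element import Element
from CrawlersTools.extractors.utils.element import similarity_with_siblings

root = fromstring(
    '<ul>'
    '<li class="row"><a href="/1">first</a></li>'
    '<li class="row"><a href="/2">second</a></li>'
    '<li class="row"><a href="/3">third</a></li>'
    '</ul>')
item = root.xpath('//li')[0]
item.__class__ = Element
print(similarity_with_siblings(item))  # high (near 1.0): same tag, class, structure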
(contains(@class,"body")) and not (contains(@class,"lanmu")) ' 15 | CONTENT_EXTRACTOR_NOISE_XPATH = [ 16 | # '//div[contains(@class, "comment")]', 17 | '//div[contains(@class, "advertisement")]', 18 | '//div[contains(@class, "advert")]', 19 | '//a[contains(@style, "display: none")]', 20 | '//a[contains(@style, "display:none")]', # TODO css不展示数据是否要去除,可能会影响正文重复 21 | f'//div[contains(@class, "foot") {KEYWORD_FEATURES}]', 22 | f'//div[contains(@class, "footer") {KEYWORD_FEATURES}]', 23 | # f'//div[contains(@class, "location") {KEYWORD_FEATURES}]', 24 | f'//div[contains(@class, "navigation") {KEYWORD_FEATURES}]', 25 | f'//div[contains(@class, "barrier") {KEYWORD_FEATURES}]', 26 | '//div[contains(@id, "foot")]', 27 | # '//div[contains(@class, "head")]', # 误删 28 | # '//div[contains(@id, "head")]', 29 | # '//div[contains(@class, "nav")]', # 误删 30 | '//div[contains(@id, "nav")]', 31 | '//div[contains(@class, "siderbar")]', 32 | '//div[contains(@class, "breadcrumb")]', 33 | '//div[contains(@id, "siderbar")]', 34 | '//div[contains(@id, "页脚")]', 35 | '//div[contains(@class, "页脚")]', 36 | '//div[contains(@id, "页眉")]', 37 | '//div[contains(@id, "页头")]', 38 | '//div[contains(@class, "页眉")]', 39 | '//div[contains(@class, "页头")]', 40 | '//*[contains(@class, "hidden")]', 41 | ] 42 | 43 | 44 | def preprocess4content_extractor(element: Element, is_content: bool = True): 45 | """ 46 | preprocess element for content extraction 47 | :param element: 48 | :param is_content: save content without tag 49 | :return: 50 | """ 51 | remove_children(element, CONTENT_EXTRACTOR_NOISE_XPATH) 52 | 53 | # remove tag and its content 54 | etree.strip_elements(element, *CONTENT_EXTRACTOR_USELESS_TAGS) 55 | 56 | if not is_content: return 57 | # only move tag pair 58 | etree.strip_tags(element, *CONTENT_EXTRACTOR_STRIP_TAGS) 59 | 60 | for child in children(element): 61 | 62 | # merge text in span or strong to parent p tag 63 | if child.tag.lower() == 'p' or child.tag.lower() == 'table': 64 | etree.strip_tags(child, 'span') 65 | etree.strip_tags(child, 'strong') 66 | etree.strip_tags(child, 'tr') 67 | etree.strip_tags(child, 'td') 68 | 69 | if not (child.text and child.text.strip()): 70 | remove_element(child) 71 | 72 | # if a div tag does not contain any sub node, it could be converted to p node. 73 | if child.tag.lower() == 'div' and not child.getchildren(): 74 | child.tag = 'p' 75 | 76 | 77 | LIST_EXTRACTOR_USELESS_TAGS = CONTENT_EXTRACTOR_USELESS_TAGS 78 | LIST_EXTRACTOR_STRIP_TAGS = CONTENT_EXTRACTOR_STRIP_TAGS 79 | LIST_EXTRACTOR_NOISE_XPATH = CONTENT_EXTRACTOR_NOISE_XPATH 80 | 81 | 82 | def preprocess4list_extractor(element: Element): 83 | """ 84 | preprocess element for list extraction 85 | :param element: 86 | :return: 87 | """ 88 | # remove tag and its content 89 | etree.strip_elements(element, *CONTENT_EXTRACTOR_USELESS_TAGS) 90 | # only move tag pair 91 | etree.strip_tags(element, *CONTENT_EXTRACTOR_STRIP_TAGS) 92 | 93 | remove_children(element, CONTENT_EXTRACTOR_NOISE_XPATH) 94 | 95 | for child in children(element): 96 | 97 | # merge text in span or strong to parent p tag 98 | if child.tag.lower() == 'p': 99 | etree.strip_tags(child, 'span') 100 | etree.strip_tags(child, 'strong') 101 | 102 | if not (child.text and child.text.strip()): 103 | remove_element(child) 104 | 105 | # if a div tag does not contain any sub node, it could be converted to p node. 
77 | LIST_EXTRACTOR_USELESS_TAGS = CONTENT_EXTRACTOR_USELESS_TAGS
78 | LIST_EXTRACTOR_STRIP_TAGS = CONTENT_EXTRACTOR_STRIP_TAGS
79 | LIST_EXTRACTOR_NOISE_XPATH = CONTENT_EXTRACTOR_NOISE_XPATH
80 | 
81 | 
82 | def preprocess4list_extractor(element: Element):
83 |     """
84 |     preprocess element for list extraction
85 |     :param element:
86 |     :return:
87 |     """
88 |     # remove tag and its content
89 |     etree.strip_elements(element, *LIST_EXTRACTOR_USELESS_TAGS)
90 |     # only remove the tag pair, keep the enclosed content
91 |     etree.strip_tags(element, *LIST_EXTRACTOR_STRIP_TAGS)
92 | 
93 |     remove_children(element, LIST_EXTRACTOR_NOISE_XPATH)
94 | 
95 |     for child in children(element):
96 | 
97 |         # merge text in span or strong to parent p tag
98 |         if child.tag.lower() == 'p':
99 |             etree.strip_tags(child, 'span')
100 |             etree.strip_tags(child, 'strong')
101 | 
102 |         if not (child.text and child.text.strip()):
103 |             remove_element(child)
104 | 
105 |         # if a div tag does not contain any sub node, it could be converted to p node.
106 |         if child.tag.lower() == 'div' and not child.getchildren():
107 |             child.tag = 'p'
108 | 
109 | 
110 | LIST_CLASSIFIER_USELESS_TAGS = ['style', 'script', 'link', 'video', 'audio', 'iframe', 'source', 'svg', 'path',
111 |                                 'symbol', 'footer', 'header']
112 | LIST_CLASSIFIER_STRIP_TAGS = ['span', 'blockquote']
113 | LIST_CLASSIFIER_NOISE_XPATHS = [
114 |     '//div[contains(@class, "comment")]',
115 |     '//div[contains(@class, "advertisement")]',
116 |     '//div[contains(@class, "advert")]',
117 |     '//div[contains(@style, "display: none")]',
118 | ]
119 | 
120 | 
121 | def preprocess4list_classifier(element: Element):
122 |     """
123 |     preprocess element for list classifier
124 |     :param element:
125 |     :return:
126 |     """
127 |     # remove tag and its content
128 |     etree.strip_elements(element, *LIST_CLASSIFIER_USELESS_TAGS)
129 |     # only remove the tag pair, keep the enclosed content
130 |     etree.strip_tags(element, *LIST_CLASSIFIER_STRIP_TAGS)
131 | 
132 |     remove_children(element, LIST_CLASSIFIER_NOISE_XPATHS)
133 | 
134 |     for child in children(element):
135 | 
136 |         # merge text in span or strong to parent p tag
137 |         if child.tag.lower() == 'p':
138 |             etree.strip_tags(child, 'span')
139 |             etree.strip_tags(child, 'strong')
140 | 
141 |         if not (child.text and child.text.strip()):
142 |             remove_element(child)
143 | 
144 |         # if a div tag does not contain any sub node, it could be converted to p node.
145 |         if child.tag.lower() == 'div' and not child.getchildren():
146 |             child.tag = 'p'
147 | 
--------------------------------------------------------------------------------
/CrawlersTools/extractors/utils/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time : 2022/8/19 20:00
4 | # @Author : MuggleK
5 | # @File : settings.py
6 | 
7 | # list settings
8 | LIST_MIN_NUMBER = 5
9 | LIST_MIN_LENGTH = 8
10 | LIST_MAX_LENGTH = 50
11 | SIMILARITY_THRESHOLD = 0.8
12 | 
13 | LIST_AVG_LENGTH = 9
14 | ADDTION_RIGHT_NUM = 10000
15 | 
16 | HIGH_WEIGHT_ERROR_KEYWORD = ["ICP备", "公网安备", "网公安备", "备案序号:", "网站地图"]
17 | DIRECTORY_ERROR_TITLE = ["首页", "下一页", "解读", "图解", "详细", "阅读全文", "标题", "[详细]"]
18 | 
19 | 
20 | # common settings
21 | SPECIAL_SYMBOL_MAP = {
22 |     "&quot;": '"',
23 |     "&amp;": "&",
24 |     "&lt;": "<",
25 |     "&gt;": ">",
26 |     "&nbsp;": " ",
27 |     "&#34;": '"',
28 |     "&#38;": "&",
29 |     "&#60;": "<",
30 |     "&#62;": ">",
31 |     "&#160;": " ",
32 |     '