├── .github
    └── workflows
    │   └── python-publish.yml
├── .gitignore
├── CrawlersTools
    ├── README.md
    ├── __init__.py
    ├── extractors
    │   ├── __init__.py
    │   ├── attachment_extractor.py
    │   ├── base.py
    │   ├── content_extractor.py
    │   ├── list_extractor.py
    │   ├── schemas
    │   │   ├── __init__.py
    │   │   └── element.py
    │   ├── time_extractor.py
    │   ├── title_extractor.py
    │   └── utils
    │   │   ├── __init__.py
    │   │   ├── cluster.py
    │   │   ├── element.py
    │   │   ├── preprocess.py
    │   │   ├── settings.py
    │   │   └── similarity.py
    ├── js_crawler
    │   ├── __init__.py
    │   ├── font_decrypt.py
    │   └── transfer_js.py
    ├── logs
    │   ├── __init__.py
    │   ├── formatters.py
    │   ├── handlers.py
    │   ├── log.py
    │   └── logger.py
    ├── pipelines
    │   ├── __init__.py
    │   ├── kafka_operate.py
    │   ├── mongo_pipeline.py
    │   ├── mysql_pipeline.py
    │   └── redis_pipeline.py
    ├── preprocess
    │   ├── __init__.py
    │   ├── bloom_filter.py
    │   └── time_process.py
    ├── projects
    │   ├── __init__.py
    │   ├── filters.py
    │   └── upload_oss.py
    ├── requests
    │   ├── __init__.py
    │   ├── base_requests.py
    │   ├── proxy.py
    │   └── random_ua.py
    ├── requirements.txt
    ├── schedules
    │   ├── __init__.py
    │   └── auto_thread.py
    └── utils
    │   └── str_compare.py
├── LICENSE
├── README.md
└── setup.py


/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
 3 | 
 4 | # This workflow uses actions that are not certified by GitHub.
 5 | # They are provided by a third-party and are governed by
 6 | # separate terms of service, privacy policy, and support
 7 | # documentation.
 8 | 
 9 | name: Upload Python Package
10 | 
11 | on: push
12 | 
13 | permissions:
14 |   contents: read
15 | 
16 | jobs:
17 |   deploy:
18 | 
19 |     runs-on: ubuntu-latest
20 | 
21 |     steps:
22 |     - uses: actions/checkout@v3
23 |     - name: Set up Python
24 |       uses: actions/setup-python@v3
25 |       with:
26 |         python-version: '3.x'
27 |     - name: Install dependencies
28 |       run: |
29 |         python -m pip install --upgrade pip
30 |         pip install build
31 |     - name: Build package
32 |       run: python -m build
33 |     - name: Publish package
34 |       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
35 |       with:
36 |         user: __token__
37 |         password: ${{ secrets.PYPI_API_TOKEN }}
38 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # xml
132 | .xml
133 | 
134 | /.idea
135 | test/


--------------------------------------------------------------------------------
/CrawlersTools/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/12 20:48
 4 | # @Author  : MuggleK
 5 | # @File    : __init__.py
 6 | 
 7 | from CrawlersTools.extractors import PolicyExtractor, ListExtractor
 8 | from .logs.logger import init_logger
 9 | from CrawlersTools.logs import Logging
10 | from CrawlersTools.pipelines import MysqlPipeline, MongoPipeline, RedisPipeline
11 | from CrawlersTools.preprocess import TimeProcessor
12 | from CrawlersTools.requests import base_requests, get_proxies, UserAgent
13 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/15 11:29
 4 | # @Author  : MuggleK
 5 | # @File    : __init__.py
 6 | 
 7 | from CrawlersTools.extractors.attachment_extractor import AttachmentExtractor
 8 | from CrawlersTools.extractors.content_extractor import ContentExtractor
 9 | from CrawlersTools.extractors.list_extractor import ListExtractor
10 | from CrawlersTools.extractors.time_extractor import TimeExtractor
11 | from CrawlersTools.extractors.title_extractor import TitleExtractor
12 | 
13 | 
14 | class PolicyExtractor(object):
15 | 
16 |     @staticmethod
17 |     def extract(
18 |             html,
19 |             title_xpath: str = "",
20 |             publish_time_xpath: str = "",
21 |             content_xpath: str = "",
22 |             attachment_xpath: str = "",
23 |             attachment_regx: str = ""
24 |     ) -> dict:
25 |         title = TitleExtractor().extract(html, title_xpath=title_xpath)
26 |         publish_time = TimeExtractor().extract(html, publish_time_xpath=publish_time_xpath)
27 |         content, content_with_tag, images = ContentExtractor().extract(html, content_xpath=content_xpath)
28 |         attachments = AttachmentExtractor().extract(html, attachment_xpath=attachment_xpath, attachment_regx=attachment_regx)
29 | 
30 |         return {
31 |             "title": title,
32 |             "publish_time": publish_time,
33 |             "content": content,
34 |             "content_with_tag": content_with_tag,
35 |             "images": images,
36 |             "attachment": attachments
37 |         }
38 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/attachment_extractor.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/12/21 10:54
 4 | # @Author  : MuggleK
 5 | # @File    : attachment_extractor.py
 6 | 
 7 | import re
 8 | 
 9 | from CrawlersTools.extractors.base import BaseExtractor
10 | from CrawlersTools.extractors.schemas.element import Element
11 | from CrawlersTools.extractors.utils.settings import ATTACHMENT_REGX
12 | 
13 | 
14 | class AttachmentExtractor(BaseExtractor):
15 |     """
16 |     extract content from detail page
17 |     """
18 |     
19 |     def process(self, element: Element):
20 |         """
21 |         extract content from html
22 |         :param element:
23 |         :return:
24 |         """
25 |         attachment_list = list()
26 |         attachment_xpath = self.kwargs.get("attachment_xpath") or "//a"
27 |         for attachment_element in element.xpath(attachment_xpath):
28 |             url = [i.strip() for i in attachment_element.xpath("@href") or attachment_element.xpath("@src")]
29 |             name = [i.strip() for i in attachment_element.xpath(".//text()")]
30 |             if not (''.join(url).strip() and ''.join(name).strip()):
31 |                 continue
32 |             suffix = self.filter_suffix(url[0], name[0])
33 |             if not suffix: continue
34 |             attachment_list.append({
35 |                 "file_url": url[0],
36 |                 "file_name": name[0]
37 |             })
38 |         return attachment_list
39 | 
40 |     def filter_suffix(self, url, name):
41 |         """
42 |         附件.xls.doc 可上传, 接口会默认取最后一个
43 |         优先取 file_url 后缀
44 |         """
45 |         regx = self.kwargs.get("attachment_regx") or ATTACHMENT_REGX
46 |         is_name_suffix = re.search(regx, name, re.I)
47 |         is_url_suffix = re.search(regx, url, re.I)
48 |         name_suffix = is_name_suffix.group(1) if is_name_suffix else ""
49 |         url_suffix = is_url_suffix.group(1) if is_url_suffix else ""
50 | 
51 |         return name_suffix or url_suffix
52 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/base.py:
--------------------------------------------------------------------------------
 1 | from loguru import logger
 2 | from lxml.html import etree
 3 | from lxml.html import fromstring
 4 | 
 5 | from CrawlersTools.extractors.schemas.element import Element
 6 | 
 7 | 
 8 | class BaseExtractor(object):
 9 |     """
10 |     Base Extractor which provide common methods
11 |     """
12 | 
13 |     kwargs = None
14 | 
15 |     @staticmethod
16 |     def to_string(element: Element, limit: int = None):
17 |         """
18 |         convert element to string
19 |         :param element:
20 |         :param limit:
21 |         :return:
22 |         """
23 |         result = etree.tostring(element, pretty_print=True, encoding="utf-8", method='html').decode('utf-8')
24 |         if limit:
25 |             return result[:limit]
26 |         return result
27 | 
28 |     def process(self, element: Element):
29 |         """
30 |         process method that you should implement
31 |         :param element:
32 |         :return:
33 |         """
34 |         logger.error('You must implement process method in your extractor.')
35 |         raise NotImplementedError
36 | 
37 |     def extract(self, html, **kwargs):
38 |         """
39 |         base extract method, firstly, it will convert html to WebElement, then it call
40 |         process method that child class implements
41 |         :param html:
42 |         :return:
43 |         """
44 |         self.kwargs = kwargs
45 |         element = fromstring(html=html)
46 |         element.__class__ = Element
47 |         return self.process(element)
48 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/content_extractor.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Project : CrawlersTools
  3 | # @Time    : 2022/11/15 9:18
  4 | # @Author  : MuggleK
  5 | # @File    : content_extractor.py
  6 | 
  7 | from copy import deepcopy
  8 | 
  9 | import numpy as np
 10 | from lxml.html import fromstring, HtmlElement
 11 | 
 12 | from CrawlersTools.extractors.base import BaseExtractor
 13 | from CrawlersTools.extractors.schemas.element import Element
 14 | from CrawlersTools.extractors.utils.element import descendants_of_body
 15 | from CrawlersTools.extractors.utils.preprocess import preprocess4content_extractor
 16 | from CrawlersTools.extractors.utils.settings import SPECIAL_SYMBOL_MAP, ERROR_NAV_LIST
 17 | 
 18 | 
 19 | class ContentExtractor(BaseExtractor):
 20 |     """
 21 |     extract content from detail page
 22 |     """
 23 |     
 24 |     def process(self, element: Element):
 25 |         """
 26 |         extract content from html
 27 |         :param element:
 28 |         :return:
 29 |         """
 30 |         source_element = deepcopy(element)
 31 |         source_element.__class__ = Element
 32 | 
 33 |         # preprocess
 34 |         preprocess4content_extractor(element)
 35 | 
 36 |         # start to evaluate every child element
 37 |         descendants = descendants_of_body(element)
 38 | 
 39 |         # get std of density_of_text among all elements
 40 |         density_of_text = [descendant.density_of_text for descendant in descendants]
 41 |         density_of_text_std = np.std(density_of_text, ddof=1)
 42 | 
 43 |         # get density_score of every element
 44 |         for descendant in descendants:
 45 |             score = np.log(density_of_text_std) * \
 46 |                     descendant.density_of_text * \
 47 |                     np.log10(descendant.number_of_p_descendants + 2) * \
 48 |                     np.log(descendant.density_of_punctuation)
 49 |             descendant.density_score = score
 50 | 
 51 |         # sort element info by density_score
 52 |         descendants = sorted(descendants, key=lambda x: x.density_score, reverse=True)
 53 |         descendant_first = descendants[0] if descendants else None
 54 |         if descendant_first is None:
 55 |             return None
 56 | 
 57 |         paragraphs = descendant_first.xpath(".//text()")
 58 |         paragraphs = [paragraph.strip() if paragraph else '' for paragraph in paragraphs]
 59 |         paragraphs = list(filter(lambda x: x, paragraphs))
 60 |         text = '\n'.join(paragraphs)
 61 |         text = text.strip()
 62 | 
 63 |         # save content with tag
 64 |         content_with_tag = self.process_content_tag(descendant_first, source_element)
 65 | 
 66 |         # extract images
 67 |         img_list = [img.attrib["src"] for img in content_with_tag.img_descendants if img.attrib]
 68 | 
 69 |         return text, content_with_tag.string, img_list
 70 | 
 71 |     @staticmethod
 72 |     def process_content_tag(descendant_first, source_element):
 73 |         content_xpath = f"//{descendant_first.tag}"
 74 |         if descendant_first.attrib:
 75 |             for k, v in descendant_first.attrib.items():
 76 |                 if k and v: content_xpath += f"[@{k}='{v}']"
 77 |         preprocess4content_extractor(source_element, is_content=False)
 78 |         content_with_tag = source_element.xpath(content_xpath)[0]
 79 |         if isinstance(content_with_tag, HtmlElement):
 80 |             content_with_tag.__class__ = Element
 81 |         return content_with_tag
 82 | 
 83 |     def extract(self, html, **kwargs):
 84 |         """
 85 |         base extract method, firstly, it will convert html to WebElement, then it call
 86 |         process method that child class implements
 87 |         :param html:
 88 |         :return:
 89 |         """
 90 |         self.kwargs = kwargs
 91 |         for key, value in SPECIAL_SYMBOL_MAP.items():
 92 |             html = html.replace(key, value)
 93 | 
 94 |         element = fromstring(html=html)  # html有多个，fromstring默认取第一个 TODO 解析不了非规范html
 95 |         if self.kwargs.get("content_xpath"):
 96 |             return ''.join(element.xpath(self.kwargs.get("content_xpath")))
 97 | 
 98 |         descendants_list = list(element.iterdescendants())
 99 | 
100 |         # remove error navigate tags
101 |         remove_index_list = list()
102 |         for index, descendant in enumerate(descendants_list):
103 |             if descendant.text is None:
104 |                 continue
105 |             nav_error_list = [i for i in ERROR_NAV_LIST if i in descendant.text]
106 |             if nav_error_list: remove_index_list.append(index)
107 | 
108 |         for i in remove_index_list:
109 |             parent_element = descendants_list[i].getparent()
110 |             if parent_element is not None: parent_element.remove(descendants_list[i])
111 | 
112 |         element.__class__ = Element
113 |         return self.process(element)
114 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/list_extractor.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | """
  4 | Author: xiaobin.zhu
  5 | since: 2022-11-24 14:24:09
  6 | LastAuthor: xiaobin.zhu
  7 | LastEditTime: 2022-11-24 14:24:09
  8 | Description: extract list from index page
  9 | FilePath: list_extractor
 10 | """
 11 | import math
 12 | import operator
 13 | from collections import defaultdict
 14 | from urllib.parse import urljoin
 15 | 
 16 | # from loguru import logger
 17 | import numpy as np
 18 | from lxml.html import fromstring
 19 | 
 20 | from CrawlersTools.extractors.base import BaseExtractor
 21 | from CrawlersTools.extractors.schemas.element import Element
 22 | from CrawlersTools.extractors.utils.cluster import cluster_dict
 23 | from CrawlersTools.extractors.utils.element import calc_a_descendants_text_of_avg_length, descendants_of_body
 24 | from CrawlersTools.extractors.utils.preprocess import preprocess4list_extractor
 25 | from CrawlersTools.extractors.utils.settings import (
 26 |     LIST_AVG_LENGTH, LIST_MAX_LENGTH, LIST_MIN_LENGTH, LIST_MIN_NUMBER, ADDTION_RIGHT_NUM, SIMILARITY_THRESHOLD,
 27 |     HIGH_WEIGHT_ERROR_KEYWORD, DIRECTORY_ERROR_TITLE, SPECIAL_SYMBOL_MAP,
 28 | )
 29 | 
 30 | AVG_LENGTH = (LIST_MIN_LENGTH + LIST_MAX_LENGTH) / 2
 31 | 
 32 | 
 33 | class ListExtractor(BaseExtractor):
 34 |     """
 35 |     extract list from index page
 36 |     """
 37 | 
 38 |     @staticmethod
 39 |     def _probability_of_title_with_length(length):
 40 |         """
 41 |         get the probability of title according to length
 42 |         import matplotlib.pyplot as plt
 43 |         x = np.asarray(range(5, 40))
 44 |         y = list_extractor.probability_of_title_with_length(x)
 45 |         plt.plot(x, y, 'g', label='m=0, sig=2')
 46 |         plt.show()
 47 |         :param length:
 48 |         :return:
 49 |         """
 50 |         sigma = 6
 51 |         return np.exp(-1 * ((length - AVG_LENGTH) ** 2) / (2 * (sigma**2))) / (
 52 |             math.sqrt(2 * np.pi) * sigma
 53 |         )
 54 | 
 55 |     @staticmethod
 56 |     def _build_clusters(element):
 57 |         """
 58 |         build candidate clusters according to element
 59 |         :return:
 60 |         """
 61 |         descendants = descendants_of_body(element)
 62 |         descendants_tree = defaultdict(list)
 63 |         for descendant in descendants:
 64 |             #  dispose some special descendant for less extract
 65 |             if len(descendant.a_descendants) > 5 and descendant.number_of_siblings == 1:
 66 |                 if descendant.parent_selector in ["html>body", "html"]:
 67 |                     continue
 68 |                 if descendant.a_descendants_group_text_min_length > LIST_MAX_LENGTH:
 69 |                     continue
 70 |                 if descendant.a_descendants_group_text_max_length < LIST_MIN_LENGTH:
 71 |                     continue
 72 |                 for link in descendant.a_descendants:
 73 |                     descendants_tree[descendant.parent_selector].append(link)
 74 |                 continue
 75 |             # if one element does not have enough siblings, it can not become a child of candidate element
 76 |             if descendant.number_of_siblings + 1 < LIST_MIN_NUMBER:
 77 |                 continue
 78 |             if calc_a_descendants_text_of_avg_length(descendant) < LIST_AVG_LENGTH:
 79 |                 continue
 80 |             # if min length is larger than specified max length, it can not become a child of candidate element
 81 |             if descendant.a_descendants_group_text_min_length > LIST_MAX_LENGTH:
 82 |                 continue
 83 |             # if max length is smaller than specified min length, it can not become a child of candidate element
 84 |             if descendant.a_descendants_group_text_max_length < LIST_MIN_LENGTH:
 85 |                 continue
 86 |             # if descendant.a_descendants_group_text_avg_length < 10:
 87 |             #     continue
 88 |             # descendant element must have same siblings which their similarity should not below similarity_threshold
 89 |             if descendant.similarity_with_siblings < SIMILARITY_THRESHOLD:
 90 |                 continue
 91 |             descendants_tree[descendant.parent_selector].append(descendant)
 92 |         if len(descendants_tree) == 0:
 93 |             return
 94 |         descendants_tree = dict(descendants_tree)
 95 | 
 96 |         # cut tree, remove parent block
 97 |         selectors = sorted(list(descendants_tree.keys()))
 98 |         last_selector = None
 99 |         for selector in selectors[::-1]:
100 |             # if later selector
101 |             if last_selector and selector and last_selector.startswith(selector):
102 |                 del descendants_tree[selector]
103 |             last_selector = selector
104 |         clusters = cluster_dict(descendants_tree)
105 | 
106 |         return clusters
107 | 
108 |     @staticmethod
109 |     def _evaluate_cluster(cluster):
110 |         """
111 |         calculate score of cluster using similarity, numbers, or other info
112 |         :param cluster:
113 |         :return:
114 |         """
115 |         score = dict()
116 | 
117 |         # calculate avg_similarity_with_siblings
118 |         score["avg_similarity_with_siblings"] = np.mean(
119 |             [element.similarity_with_siblings for element in cluster]
120 |         )
121 |         score["avg_text_length"] = np.mean(
122 |             [calc_a_descendants_text_of_avg_length(element) for element in cluster]
123 |         )
124 |         # calculate number of elements
125 |         score["number_of_elements"] = len(cluster)
126 | 
127 |         score["clusters_score"] = (
128 |             score["avg_similarity_with_siblings"]
129 |             * np.log10(score["number_of_elements"] + 1)
130 |             * score["avg_text_length"]
131 |         )
132 |         # * clusters_score[cluster_id]['probability_of_title_with_length']
133 |         return score
134 | 
135 |     @staticmethod
136 |     def _extend_cluster(cluster):
137 |         """
138 |         extend cluster's elements except for missed children
139 |         :param cluster:
140 |         :return:
141 |         """
142 |         result = [element.selector for element in cluster]
143 |         for element in cluster:
144 |             if calc_a_descendants_text_of_avg_length(element) < LIST_AVG_LENGTH:
145 |                 continue
146 |             path_raw = element.path_raw
147 |             siblings = list(element.siblings)
148 |             for sibling in siblings:
149 |                 # skip invalid element
150 |                 if not isinstance(sibling, Element):
151 |                     continue
152 |                 sibling_selector = sibling.selector
153 |                 sibling_path_raw = sibling.path_raw
154 |                 if sibling_path_raw != path_raw:
155 |                     continue
156 |                 # add missed sibling
157 |                 if sibling_selector not in result:
158 |                     cluster.append(sibling)
159 |                     result.append(sibling_selector)
160 | 
161 |         cluster = sorted(cluster, key=lambda x: x.nth)
162 |         # logger.debug(f"cluster after extend {cluster}")
163 |         return cluster
164 | 
165 |     def _best_cluster(self, clusters):
166 |         """
167 |         use clustering algorithm to choose best cluster from candidate clusters
168 |         :param clusters:
169 |         :return:
170 |         """
171 |         if not clusters:
172 |             # logger.debug("there is on cluster, just return empty result")
173 |             return []
174 |         if len(clusters) == 1:
175 |             # logger.debug("there is only one cluster, just return first cluster")
176 |             return clusters[0]
177 |         # choose best cluster using score
178 |         clusters_score = defaultdict(dict)
179 |         clusters_score_arg_max = 0
180 |         clusters_score_max = -1
181 |         for cluster_id, cluster in clusters.items():
182 |             if len(cluster) < 2:
183 |                 continue
184 |             # calculate avg_similarity_with_siblings
185 |             clusters_score[cluster_id] = self._evaluate_cluster(cluster)
186 |             # get max score arg index
187 |             if clusters_score[cluster_id]["clusters_score"] > clusters_score_max:
188 |                 clusters_score_max = clusters_score[cluster_id]["clusters_score"]
189 |                 clusters_score_arg_max = cluster_id
190 |         # logger.debug(f"clusters_score {clusters_score}")
191 |         best_cluster = clusters[clusters_score_arg_max]
192 |         return best_cluster
193 | 
194 |     def _extract_cluster(self, cluster):
195 |         """
196 |         extract title and href from best cluster
197 |         :param cluster:
198 |         :return:
199 |         """
200 |         if not cluster:
201 |             return None
202 |         # get best tag path of title
203 |         probabilities_of_title = defaultdict(list)
204 |         for element in cluster:
205 |             if element.tag == "a":
206 |                 descendants = [element]
207 |             else:
208 |                 descendants = element.a_descendants
209 |             for descendant in descendants:
210 |                 path = descendant.path_raw
211 |                 descendant_text = descendant.text
212 |                 probability_of_title_with_length = (
213 |                     self._probability_of_title_with_length(len(descendant_text))
214 |                 )
215 | 
216 |                 if descendant.attrib.get("title"):
217 |                     probability_of_title_with_length = (
218 |                         probability_of_title_with_length * ADDTION_RIGHT_NUM
219 |                     )
220 | 
221 |                 if len(descendant_text) > LIST_MAX_LENGTH:
222 |                     probability_of_title_with_length = (
223 |                         probability_of_title_with_length * ADDTION_RIGHT_NUM
224 |                     )
225 |                 if (
226 |                     descendant.tag == "a"
227 |                     and descendant.get("parent")
228 |                     and calc_a_descendants_text_of_avg_length(descendant.parent)
229 |                     < LIST_AVG_LENGTH
230 |                 ):
231 |                     probability_of_title_with_length = (
232 |                         probability_of_title_with_length / ADDTION_RIGHT_NUM
233 |                     )
234 |                 else:
235 |                     if (
236 |                         calc_a_descendants_text_of_avg_length(descendant)
237 |                         < LIST_AVG_LENGTH
238 |                     ):
239 |                         probability_of_title_with_length = (
240 |                             probability_of_title_with_length / ADDTION_RIGHT_NUM
241 |                         )
242 |                 for ss in HIGH_WEIGHT_ERROR_KEYWORD:
243 |                     if ss in descendant_text:
244 |                         probability_of_title_with_length = (
245 |                             probability_of_title_with_length / ADDTION_RIGHT_NUM
246 |                         )
247 | 
248 |                 probability_of_title = probability_of_title_with_length
249 |                 probabilities_of_title[path].append(probability_of_title)
250 |         probabilities_of_title_bak = {}
251 |         for key in probabilities_of_title:
252 |             if len(probabilities_of_title[key]) > LIST_MIN_NUMBER - 2:
253 |                 probabilities_of_title_bak[key] = probabilities_of_title[key]
254 |         probabilities_of_title = probabilities_of_title_bak
255 |         # get most probable tag_path
256 |         probabilities_of_title_avg = {
257 |             k: np.mean(v) for k, v in probabilities_of_title.items()
258 |         }
259 |         if not probabilities_of_title_avg:
260 |             return None
261 |         best_path = max(probabilities_of_title_avg.items(), key=operator.itemgetter(1))[
262 |             0
263 |         ]
264 |         # logger.debug(f"best tag path {best_path}")
265 | 
266 |         # extract according to best tag path
267 |         result = []
268 |         # 去除重复链接
269 |         cache_url_list = []
270 |         recode_avg_len_of_path = {}
271 |         for element in cluster:
272 |             avg_len = calc_a_descendants_text_of_avg_length(element)
273 |             parent_selector = element.parent_selector
274 |             if recode_avg_len_of_path.get(parent_selector) is not None:
275 |                 if recode_avg_len_of_path[parent_selector] < LIST_AVG_LENGTH:
276 |                     continue
277 |             else:
278 |                 recode_avg_len_of_path[parent_selector] = avg_len
279 |                 if recode_avg_len_of_path[parent_selector] < LIST_AVG_LENGTH:
280 |                     continue
281 | 
282 |             if element.tag == "a":
283 |                 path_raw = element.path_raw
284 |                 if path_raw != best_path:  # and descendant.text == ""
285 |                     continue
286 |                 title = element.attrib.get("title") or element.text
287 |                 if title in DIRECTORY_ERROR_TITLE or len(title) < 2:
288 |                     continue
289 |                 flag = False
290 |                 for ss in HIGH_WEIGHT_ERROR_KEYWORD:
291 |                     if ss in title:
292 |                         flag = True
293 |                         break
294 |                 if flag:
295 |                     continue
296 |                 url = element.attrib.get("href")
297 |                 if url is None:
298 |                     continue
299 |                 if "javascript" in url:
300 |                     ss = element.attrib.get("data-href")
301 |                     if ss is not None:
302 |                         url = ss
303 |                 if "#" in url or "javascript:void" in url:
304 |                     ss = element.attrib.get("onclick")
305 |                     if ss is not None:
306 |                         url = ss
307 | 
308 |                 if url.startswith("//"):
309 |                     url = "http:" + url
310 |                 base_url = self.kwargs.get("base_url")
311 |                 if base_url:
312 |                     url = urljoin(base_url, url)
313 |                 if url in cache_url_list:
314 |                     continue
315 |                 else:
316 |                     cache_url_list.append(url)
317 |                 result.append({"title": title, "url": url})
318 |                 continue
319 |             else:
320 |                 descendants = element.a_descendants
321 |             for descendant in descendants:
322 |                 path_raw = descendant.path_raw
323 |                 if path_raw != best_path:  # and descendant.text == ""
324 |                     continue
325 |                 title = descendant.attrib.get("title") or descendant.text
326 |                 if title in DIRECTORY_ERROR_TITLE or len(title) < 2:
327 |                     continue
328 |                 flag = False
329 |                 for ss in HIGH_WEIGHT_ERROR_KEYWORD:
330 |                     if ss in title:
331 |                         flag = True
332 |                         break
333 |                 if flag:
334 |                     continue
335 |                 url = descendant.attrib.get("href")
336 |                 if url is None:
337 |                     continue
338 |                 if "javascript" in url:
339 |                     ss = descendant.attrib.get("data-href")
340 |                     if ss is not None:
341 |                         url = ss
342 |                 if "#" in url or "javascript:void" in url:
343 |                     ss = descendant.attrib.get("onclick")
344 |                     if ss is not None:
345 |                         url = ss
346 | 
347 |                 if url.startswith("//"):
348 |                     url = "http:" + url
349 |                 base_url = self.kwargs.get("base_url")
350 |                 if base_url:
351 |                     url = urljoin(base_url, url)
352 |                 if url in cache_url_list:
353 |                     continue
354 |                 else:
355 |                     cache_url_list.append(url)
356 |                 result.append({"title": title, "url": url})
357 |         return result
358 | 
359 |     def process(self, element: Element):
360 |         """
361 |         extract content from html
362 |         :param element:
363 |         :return:
364 |         """
365 |         # preprocess
366 |         preprocess4list_extractor(element)
367 | 
368 |         # build clusters
369 |         clusters = self._build_clusters(element)
370 |         # logger.debug(f"after build clusters {clusters}")
371 | 
372 |         # choose best cluster
373 |         best_cluster = self._best_cluster(clusters)
374 |         # logger.debug(f"best cluster {best_cluster}")
375 | 
376 |         extended_cluster = self._extend_cluster(best_cluster)
377 |         # logger.debug(f"extended cluster {extended_cluster}")
378 | 
379 |         # extract result from best cluster
380 |         return self._extract_cluster(best_cluster)
381 | 
382 |     def extract(self, html, **kwargs):
383 |         self.kwargs = kwargs
384 |         for key, value in SPECIAL_SYMBOL_MAP.items():
385 |             html = html.replace(key, value)
386 | 
387 |         element = fromstring(html=html)  # html有多个，fromstring默认取第一个 TODO 解析不了非规范html
388 |         if self.kwargs.get("list_xpath"):
389 |             return ''.join(element.xpath(self.kwargs.get("list_xpath")))
390 | 
391 |         element.__class__ = Element
392 |         return self.process(element)
393 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/schemas/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MuggleK/CrawlersTools/8f59a1dd884367f2f59e1f6f63b683c9f24ad377/CrawlersTools/extractors/schemas/__init__.py


--------------------------------------------------------------------------------
/CrawlersTools/extractors/schemas/element.py:
--------------------------------------------------------------------------------
  1 | from lxml.html import HtmlElement, etree
  2 | from numpy import mean
  3 | 
  4 | 
  5 | class Element(HtmlElement):
  6 |     _id: int = None
  7 |     _selector: str = None
  8 |     _parent_selector: str = None
  9 |     _alias: str = None
 10 |     _tag_name: str = None
 11 |     _path: str = None
 12 |     _path_raw: str = None
 13 |     _children = None
 14 |     _parent = None
 15 |     _siblings = None
 16 |     _descendants = None
 17 |     _text = None
 18 |     _number_of_char: int = None
 19 |     _number_of_a_char: int = None
 20 |     _number_of_punctuation: int = None
 21 |     _number_of_a_descendants: int = None
 22 |     _number_of_p_descendants: int = None
 23 |     _number_of_children: int = None
 24 |     _number_of_siblings: int = None
 25 |     _number_of_descendants: int = None
 26 |     _density_of_punctuation: int = None
 27 |     _density_of_text: float = None
 28 |     _density_score: float = None
 29 |     _similarity_with_siblings: float = None
 30 |     _a_descendants: list = None
 31 |     _img_descendants: list = None
 32 |     _a_descendants_group: dict = None
 33 |     _a_descendants_group_text_length: dict = None
 34 |     _a_descendants_group_text_min_length: float = None
 35 |     _a_descendants_group_text_max_length: float = None
 36 |     
 37 |     density_score: float = None
 38 |     
 39 |     @property
 40 |     def id(self):
 41 |         """
 42 |         get id by hashed element
 43 |         :return:
 44 |         """
 45 |         if self._id is not None:
 46 |             return self._id
 47 |         self._id = hash(self)
 48 |         return self._id
 49 |     
 50 |     @property
 51 |     def nth(self):
 52 |         """
 53 |         get nth index of this element in parent element
 54 |         :return:
 55 |         """
 56 |         return len(list(self.itersiblings(preceding=True))) + 1
 57 |     
 58 |     
 59 |     @property
 60 |     def alias(self):
 61 |         """
 62 |         get alias of element, using all attributes to construct it.
 63 |         :return: string
 64 |         """
 65 |         if self._alias is not None:
 66 |             return self._alias
 67 |         from CrawlersTools.extractors.utils.element import alias
 68 |         self._alias = alias(self)
 69 |         return self._alias
 70 |     
 71 |     @property
 72 |     def selector(self):
 73 |         """
 74 |         get id by hashed element
 75 |         :return:
 76 |         """
 77 |         if self._selector is not None:
 78 |             return self._selector
 79 |         from CrawlersTools.extractors.utils.element import selector
 80 |         self._selector = selector(self)
 81 |         return self._selector
 82 |     
 83 |     @property
 84 |     def children(self):
 85 |         """
 86 |         get children of this element
 87 |         :return: 
 88 |         """
 89 |         if self._children is not None:
 90 |             return self._children
 91 |         from CrawlersTools.extractors.utils.element import children
 92 |         self._children = list(children(self))
 93 |         return self._children
 94 |     
 95 |     @property
 96 |     def siblings(self):
 97 |         """
 98 |         get siblings of this element
 99 |         :return: 
100 |         """
101 |         if self._siblings is not None:
102 |             return self._siblings
103 |         from CrawlersTools.extractors.utils.element import siblings
104 |         self._siblings = list(siblings(self))
105 |         return self._siblings
106 |     
107 |     @property
108 |     def descendants(self):
109 |         """
110 |         get descendants of this element
111 |         :return: 
112 |         """
113 |         if self._descendants is not None:
114 |             return self._descendants
115 |         from CrawlersTools.extractors.utils.element import descendants
116 |         self._descendants = list(descendants(self))
117 |         return self._descendants
118 |     
119 |     @property
120 |     def parent_selector(self):
121 |         """
122 |         get id by hashed element
123 |         :return:
124 |         """
125 |         if self._parent_selector is not None:
126 |             return self._parent_selector
127 |         from CrawlersTools.extractors.utils.element import selector, parent
128 |         # TODO: change parent(self) to self.parent
129 |         p = parent(self)
130 |         if p is not None:
131 |             self._parent_selector = selector(p)
132 |         return self._parent_selector
133 |     
134 |     @property
135 |     def tag_name(self):
136 |         """
137 |         return tag name
138 |         :return:
139 |         """
140 |         if self._tag_name:
141 |             return self._tag_name
142 |         self._tag_name = self.tag
143 |         return self._tag_name
144 |     
145 |     @property
146 |     def text(self):
147 |         """
148 |         get text of element
149 |         :return:
150 |         """
151 |         if self._text is not None:
152 |             return self._text
153 |         from CrawlersTools.extractors.utils.element import text
154 |         self._text = text(self)
155 |         return self._text
156 |     
157 |     @property
158 |     def string(self):
159 |         """
160 |         return string of element
161 |         :return:
162 |         """
163 |         return etree.tostring(self, pretty_print=True, encoding="utf-8", method='html').decode('utf-8')
164 |     
165 |     @property
166 |     def path(self):
167 |         """
168 |         get tag path using external path function
169 |         :return:
170 |         """
171 |         if self._path is not None:
172 |             return self._path
173 |         from CrawlersTools.extractors.utils.element import path
174 |         self._path = path(self)
175 |         return self._path
176 |     
177 |     @property
178 |     def path_raw(self):
179 |         """
180 |         get tag raw path using external path raw function
181 |         :return:
182 |         """
183 |         if self._path_raw is not None:
184 |             return self._path_raw
185 |         from CrawlersTools.extractors.utils.element import path_raw
186 |         self._path_raw = path_raw(self)
187 |         return self._path_raw
188 |     
189 |     @property
190 |     def number_of_char(self):
191 |         """
192 |         get text length
193 |         :return:
194 |         """
195 |         if self._number_of_char is not None:
196 |             return self._number_of_char
197 |         from CrawlersTools.extractors.utils.element import number_of_char
198 |         self._number_of_char = number_of_char(self)
199 |         return self._number_of_char
200 |     
201 |     @property
202 |     def number_of_a_descendants(self):
203 |         """
204 |         get number of a descendants
205 |         :return:
206 |         """
207 |         if self._number_of_a_descendants is not None:
208 |             return self._number_of_a_descendants
209 |         from CrawlersTools.extractors.utils.element import number_of_a_descendants
210 |         self._number_of_a_descendants = number_of_a_descendants(self)
211 |         return self._number_of_a_descendants
212 |     
213 |     @property
214 |     def number_of_a_char(self):
215 |         """
216 |         get a text length
217 |         :return:
218 |         """
219 |         if self._number_of_a_char is not None:
220 |             return self._number_of_a_char
221 |         from CrawlersTools.extractors.utils.element import number_of_a_char
222 |         self._number_of_a_char = number_of_a_char(self)
223 |         return self._number_of_a_char
224 |     
225 |     @property
226 |     def number_of_p_descendants(self):
227 |         """
228 |         return number of paragraph
229 |         :return:
230 |         """
231 |         if self._number_of_p_descendants is not None:
232 |             return self._number_of_p_descendants
233 |         from CrawlersTools.extractors.utils.element import number_of_p_descendants
234 |         self._number_of_p_descendants = number_of_p_descendants(self)
235 |         return self._number_of_p_descendants
236 |     
237 |     @property
238 |     def number_of_punctuation(self):
239 |         """
240 |         get number of punctuation
241 |         :return:
242 |         """
243 |         if self._number_of_punctuation is not None:
244 |             return self._number_of_punctuation
245 |         from CrawlersTools.extractors.utils.element import number_of_punctuation
246 |         self._number_of_punctuation = number_of_punctuation(self)
247 |         return self._number_of_punctuation
248 |     
249 |     @property
250 |     def number_of_children(self):
251 |         """
252 |         get children number
253 |         :return:
254 |         """
255 |         if self._number_of_children is not None:
256 |             return self._number_of_children
257 |         self._number_of_children = len(list(self.children))
258 |         return self._number_of_children
259 |     
260 |     @property
261 |     def number_of_siblings(self):
262 |         """
263 |         get number of siblings
264 |         :return:
265 |         """
266 |         if self._number_of_siblings is not None:
267 |             return self._number_of_siblings
268 |         self._number_of_siblings = len(list(self.siblings))
269 |         return self._number_of_siblings
270 |     
271 |     @property
272 |     def number_of_descendants(self):
273 |         """
274 |         get number of descendants
275 |         :return:
276 |         """
277 |         if self._number_of_descendants is not None:
278 |             return self._number_of_descendants
279 |         from CrawlersTools.extractors.utils.element import number_of_descendants
280 |         self._number_of_descendants = len(list(self.descendants))
281 |         return self._number_of_descendants
282 |     
283 |     @property
284 |     def density_of_punctuation(self):
285 |         """
286 |         get density of punctuation
287 |         :return:
288 |         """
289 |         if self._density_of_punctuation is not None:
290 |             return self._density_of_punctuation
291 |         from CrawlersTools.extractors.utils.element import density_of_punctuation
292 |         self._density_of_punctuation = density_of_punctuation(self)
293 |         return self._density_of_punctuation
294 |     
295 |     @property
296 |     def density_of_text(self):
297 |         """
298 |         get density of text
299 |         :return:
300 |         """
301 |         if self._density_of_text is not None:
302 |             return self._density_of_text
303 |         from CrawlersTools.extractors.utils.element import density_of_text
304 |         self._density_of_text = density_of_text(self)
305 |         return self._density_of_text
306 |     
307 |     @property
308 |     def similarity_with_siblings(self):
309 |         """
310 |         get similarity with siblings
311 |         :return:
312 |         """
313 |         if self._similarity_with_siblings is not None:
314 |             return self._similarity_with_siblings
315 |         from CrawlersTools.extractors.utils.element import similarity_with_siblings
316 |         self._similarity_with_siblings = similarity_with_siblings(self)
317 |         return self._similarity_with_siblings
318 |     
319 |     @property
320 |     def a_descendants(self):
321 |         """
322 |         get linked descendants
323 |         :return:
324 |         """
325 |         if self._a_descendants is not None:
326 |             return self._a_descendants
327 |         from CrawlersTools.extractors.utils.element import a_descendants
328 |         self._a_descendants = a_descendants(self)
329 |         return self._a_descendants
330 |     
331 |     @property
332 |     def img_descendants(self):
333 |         """
334 |         get linked descendants
335 |         :return:
336 |         """
337 |         if self._img_descendants is not None:
338 |             return self._img_descendants
339 |         from CrawlersTools.extractors.utils.element import img_descendants
340 | 
341 |         self._img_descendants = img_descendants(self)
342 |         return self._img_descendants
343 |     
344 |     @property
345 |     def a_descendants_group(self):
346 |         """
347 |         get linked descendants group
348 |         :return:
349 |         """
350 |         if self._a_descendants_group is not None:
351 |             return self._a_descendants_group
352 |         from CrawlersTools.extractors.utils.element import a_descendants_group
353 |         self._a_descendants_group = a_descendants_group(self)
354 |         return self._a_descendants_group
355 |     
356 |     @property
357 |     def a_descendants_group_text_length(self):
358 |         """
359 |         grouped linked text length
360 |         :return:
361 |         """
362 |         if self._a_descendants_group_text_length is not None:
363 |             return self._a_descendants_group_text_length
364 |         result = {}
365 |         from CrawlersTools.extractors.utils.element import text
366 |         for path, elements in self.a_descendants_group.items():
367 |             lengths = []
368 |             for element in elements:
369 |                 # TODO: convert len(text(element)) to element.number_of_char
370 |                 lengths.append(len(text(element)))
371 |             mean_length = mean(lengths) if len(lengths) else 0
372 |             result[path] = mean_length
373 |         return result
374 |     
375 |     @property
376 |     def a_descendants_group_text_min_length(self):
377 |         """
378 |         get grouped linked text min length
379 |         :return:
380 |         """
381 |         if self._a_descendants_group_text_min_length is not None:
382 |             return self._a_descendants_group_text_min_length
383 |         values = self.a_descendants_group_text_length.values()
384 |         self._a_descendants_group_text_min_length = min(values) if values else 0
385 |         return self._a_descendants_group_text_min_length
386 |     
387 |     @property
388 |     def a_descendants_group_text_max_length(self):
389 |         """
390 |         get grouped linked text max length
391 |         :return:
392 |         """
393 |         if self._a_descendants_group_text_max_length is not None:
394 |             return self._a_descendants_group_text_max_length
395 |         values = self.a_descendants_group_text_length.values()
396 |         self._a_descendants_group_text_max_length = max(values) if values else 0
397 |         return self._a_descendants_group_text_max_length
398 |     
399 |     @property
400 |     def a_descendants_group_text_avg_length(self):
401 |         """
402 |         get grouped linked text avg length
403 |         :return:
404 |         """
405 |         if self._a_descendants_group_text_max_length is not None:
406 |             return self._a_descendants_group_text_max_length
407 |         values = self.a_descendants_group_text_length.values()
408 |         self._a_descendants_group_text_max_length = max(values) if values else 0
409 |         return self._a_descendants_group_text_max_length
410 |     
411 |     def __str__(self):
412 |         """
413 |         rewrite str
414 |         :return:
415 |         """
416 |         return f'<Element {self.tag} of {self.path}>'
417 |     
418 |     def __repr__(self):
419 |         """
420 |         rewrite repr
421 |         :return:
422 |         """
423 |         return self.__str__()
424 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/time_extractor.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/11/3 9:55
 4 | # @Author  : MuggleK
 5 | # @File    : time_extractor.py
 6 | 
 7 | import re
 8 | 
 9 | from lxml.html import etree
10 | 
11 | from CrawlersTools.extractors.base import BaseExtractor
12 | from CrawlersTools.extractors.schemas.element import Element
13 | from CrawlersTools.extractors.utils.settings import DATETIME_PATTERN, PUBLISH_TIME_META, TITLE_EXTRACTOR_USELESS_TAGS
14 | from CrawlersTools.preprocess import TimeProcessor
15 | 
16 | format_time = TimeProcessor().format
17 | 
18 | 
19 | class TimeExtractor(BaseExtractor):
20 | 
21 |     @staticmethod
22 |     def extract_from_xpath(element: Element, publish_time_xpath: str) -> str:
23 |         if publish_time_xpath:
24 |             publish_time = ''.join(element.xpath(publish_time_xpath))
25 |             return format_time(publish_time)
26 |         return ''
27 | 
28 |     @staticmethod
29 |     def extract_from_text(element: Element) -> str:
30 |         text = ''.join(element.xpath('.//text()'))
31 |         for dt in DATETIME_PATTERN:
32 |             dt_obj = re.search(dt, text)
33 |             if dt_obj:
34 |                 return format_time(dt_obj.group(1))
35 |         else:
36 |             return ''
37 | 
38 |     @staticmethod
39 |     def extract_from_meta(element: Element) -> str:
40 |         """
41 |         优先匹配 META 数据
42 |         :param element: 网页源代码对应的Dom 树
43 |         :return: str
44 |         """
45 |         for xpath in PUBLISH_TIME_META:
46 |             publish_time = element.xpath(xpath)
47 |             if publish_time:
48 |                 return format_time(''.join(publish_time))
49 |         return ''
50 |     
51 |     def process(self, element: Element):
52 |         # remove tag and its content
53 |         etree.strip_elements(element, *TITLE_EXTRACTOR_USELESS_TAGS)
54 | 
55 |         publish_time = (self.extract_from_xpath(element, publish_time_xpath=self.kwargs.get("publish_time_xpath"))
56 |                         or self.extract_from_meta(element)
57 |                         or self.extract_from_text(element))
58 | 
59 |         return publish_time
60 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/title_extractor.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/19 20:23
 4 | # @Author  : MuggleK
 5 | # @File    : title_extractor.py
 6 | 
 7 | import re
 8 | from itertools import combinations
 9 | 
10 | from lxml.html import etree
11 | 
12 | from CrawlersTools.extractors.base import BaseExtractor
13 | from CrawlersTools.extractors.schemas.element import Element
14 | from CrawlersTools.extractors.utils.settings import (
15 |     TITLE_HTAG_XPATH, TITLE_META_XPATH, TITLE_META_XPATH_BAK, TITLE_EXTRACTOR_USELESS_TAGS, PUNCTUATION_ALPHA_PATTERN
16 | )
17 | from CrawlersTools.extractors.utils.similarity import get_longest_common_sub_string
18 | 
19 | 
20 | class TitleExtractor(BaseExtractor):
21 | 
22 |     @staticmethod
23 |     def extract_by_xpath(element, title_xpath):
24 |         if title_xpath:
25 |             title_list = element.xpath(title_xpath)
26 |             if title_list:
27 |                 return title_list[0]
28 |         return ''
29 | 
30 |     @staticmethod
31 |     def extract_by_title(element):
32 |         title_list = element.xpath(TITLE_META_XPATH) or element.xpath(TITLE_META_XPATH_BAK)
33 |         if title_list:
34 |             return max(title_list, key=len)
35 |         else:
36 |             return ''
37 | 
38 |     @staticmethod
39 |     def extract_by_htag(element):
40 |         title_list = element.xpath(TITLE_HTAG_XPATH)
41 |         title_list = [re.sub(PUNCTUATION_ALPHA_PATTERN, "", phrase) for phrase in title_list]
42 |         if not title_list:
43 |             return ''
44 |         index_string = [(index, ''.join(filter(str.isalnum, string))) for index, string in enumerate(title_list)]
45 |         string_list = [i[1] for i in index_string]
46 |         max_string = max(string_list, key=len)
47 |         return title_list[string_list.index(max_string)]
48 | 
49 |     @staticmethod
50 |     def extract_common_str(element: Element) -> str:
51 |         h_tag_texts_list = element.xpath(TITLE_HTAG_XPATH)
52 |         new_title_list = list(combinations(h_tag_texts_list, 2))
53 |         if len(new_title_list) == 1:
54 |             new_title = str(max(list(new_title_list[0]), key=len))
55 |             return new_title
56 | 
57 |         common_title_list = [get_longest_common_sub_string(i[0], i[1]).strip() for i in new_title_list]
58 |         if common_title_list:
59 |             new_title = max(common_title_list, key=len)
60 |             sub_string = re.sub(r'\d+', '', ''.join(filter(str.isalnum, new_title)))
61 |             return new_title if len(new_title) > 4 and sub_string else ''
62 |         return ''
63 | 
64 |     def process(self, element: Element):
65 |         # remove tag and its content
66 |         etree.strip_elements(element, *TITLE_EXTRACTOR_USELESS_TAGS)
67 | 
68 |         title = (self.extract_by_xpath(element, title_xpath=self.kwargs.get("title_xpath"))
69 |                  or self.extract_by_title(element)
70 |                  or self.extract_common_str(element)
71 |                  or self.extract_by_htag(element)
72 |                  )
73 |         return title.strip()
74 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MuggleK/CrawlersTools/8f59a1dd884367f2f59e1f6f63b683c9f24ad377/CrawlersTools/extractors/utils/__init__.py


--------------------------------------------------------------------------------
/CrawlersTools/extractors/utils/cluster.py:
--------------------------------------------------------------------------------
 1 | from collections import defaultdict
 2 | 
 3 | from CrawlersTools.extractors.utils.similarity import similarity
 4 | 
 5 | 
 6 | def cluster(items, threshold=0.9):
 7 |     """
 8 |     cluster names
 9 |     :param items:
10 |     :param threshold:
11 |     :return: cluster map, for example {"foo": 0, "bar": 1}
12 |     """
13 |     number = -1
14 |     clusters_map = {}
15 |     clusters = []
16 |     for name in items:
17 |         for c in clusters:
18 |             if all(similarity(name, w) > threshold for w in c):
19 |                 c.append(name)
20 |                 clusters_map[name] = number
21 |                 break
22 |         else:
23 |             number += 1
24 |             clusters.append([name])
25 |             clusters_map[name] = number
26 |     return clusters_map
27 | 
28 | 
29 | def cluster_dict(data: dict, threshold=0.8):
30 |     """
31 |     cluster dict, convert id key to cluster id key
32 |     :param threshold:
33 |     :param data:
34 |     :return:
35 |     """
36 |     ids = data.keys()
37 |     clusters_map = cluster(ids, threshold)
38 |     result = defaultdict(list)
39 |     for k, v in data.items():
40 |         if isinstance(v, list):
41 |             for i in v:
42 |                 result[clusters_map[k]].append(i)
43 |         else:
44 |             result[clusters_map[k]].append(v)
45 |     return dict(result)
46 | 
47 | 
48 | if __name__ == '__main__':
49 |     data = {
50 |         '/html/body/div[@class="main"]/div[1]/ul': ['child1', 'child2', 'child3'],
51 |         '/html/body/div[@class="main"]/div[2]/ul': ['child4', 'child5', 'child6'],
52 |         '/html/body/div[@class="main"]/div[3]/ul': ['child7', 'child8', 'child9'],
53 |         '/html/body/header/div[1]': ['child10', 'child11', 'child12'],
54 |         '/html/body/header/div[2]': ['child13', 'child14', 'child15'],
55 |     }
56 |     print(cluster_dict(data, threshold=0.7))
57 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/utils/element.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from collections import defaultdict
  3 | from os.path import exists
  4 | from types import ModuleType
  5 | 
  6 | import numpy as np
  7 | from loguru import logger
  8 | from lxml.html import fromstring, HtmlElement
  9 | 
 10 | from CrawlersTools.extractors.schemas.element import Element
 11 | from CrawlersTools.extractors.utils.similarity import similarity
 12 | 
 13 | PUNCTUATION = set('''！，。？、；：“”‘’《》%（）<>{}「」【】*～`,.?:;'"!%()''')
 14 | 
 15 | 
 16 | def remove_element(element: Element):
 17 |     """
 18 |     remove child element from parent
 19 |     :param element:
 20 |     :return:
 21 |     """
 22 |     if element is None:
 23 |         return
 24 |     p = element.getparent()
 25 |     if p is not None:
 26 |         p.remove(element)
 27 | 
 28 | 
 29 | def remove_children(element: Element, xpaths):
 30 |     """
 31 |     remove children from element
 32 |     :param element:
 33 |     :param xpaths:
 34 |     :return:
 35 |     """
 36 |     if element is None:
 37 |         return
 38 |     if not xpaths:
 39 |         return
 40 |     for xpath in xpaths:
 41 |         nodes = element.xpath(xpath)
 42 |         for node in nodes:
 43 |             remove_element(node)
 44 |     return element
 45 | 
 46 | 
 47 | def html2element(html: str):
 48 |     """
 49 |     convert html to HtmlElement
 50 |     :param html:
 51 |     :return:
 52 |     """
 53 |     if not html:
 54 |         return None
 55 |     element = fromstring(html)
 56 |     element.__class__ = Element
 57 |     return element
 58 | 
 59 | 
 60 | def file2element(file_path):
 61 |     """
 62 |     convert file to element
 63 |     :param file_path:
 64 |     :return:
 65 |     """
 66 |     if not exists(file_path):
 67 |         return
 68 |     with open(file_path, encoding='utf-8') as f:
 69 |         return html2element(f.read())
 70 | 
 71 | 
 72 | def selector(element: Element):
 73 |     """
 74 |     get id using recursive function.
 75 |     for example result: html/body/div/div/ul/li
 76 |     :param element:
 77 |     :return:
 78 |     """
 79 |     if element is None:
 80 |         return ''
 81 |     p = parent(element)
 82 |     if p is not None:
 83 |         return selector(p) + '>' + alias(element)
 84 |     return element.alias
 85 | 
 86 | 
 87 | def path_raw(element: Element):
 88 |     """
 89 |     get tag path using recursive function, only contains raw tag
 90 |     for example result: html/body/div/div/ul/li
 91 |     :param element:
 92 |     :return:
 93 |     """
 94 |     if element is None:
 95 |         return ''
 96 |     p = parent(element)
 97 |     if p is not None:
 98 |         return path_raw(p) + '/' + element.tag
 99 |     return element.tag
100 | 
101 | 
102 | def path(element: Element):
103 |     """
104 |     get tag path using recursive function.
105 |     for example result: html/body/div/div/ul/li
106 |     :param element:
107 |     :return:
108 |     """
109 |     if element is None:
110 |         return ''
111 |     result = path_raw(element)
112 |     # get nth-child
113 |     nth = len(list(element.itersiblings(preceding=True))) + 1
114 |     result += f':nth-child({nth})'
115 |     return result
116 | 
117 | 
118 | def a_descendants(element: Element):
119 |     """
120 |     get
121 |     :param element:
122 |     :return:
123 |     """
124 |     if element is None:
125 |         return []
126 |     descendants = []
127 |     for descendant in element.xpath('.//a'):
128 |         descendant.__class__ = Element
129 |         descendants.append(descendant)
130 |     return descendants
131 | 
132 | def img_descendants(element: Element):
133 |     """
134 |     get
135 |     :param element:
136 |     :return:
137 |     """
138 |     if element is None:
139 |         return []
140 |     descendants = []
141 |     for descendant in element.xpath(".//img"):
142 |         descendant.__class__ = Element
143 |         descendants.append(descendant)
144 |     return descendants
145 | 
146 | def a_descendants_group(element: Element):
147 |     """
148 |     get linked descendants group
149 |     :param element:
150 |     :return:
151 |     """
152 |     result = defaultdict(list)
153 |     for linked_descendant in element.a_descendants:
154 |         p = linked_descendant.path_raw
155 |         result[p].append(linked_descendant)
156 |     return result
157 | 
158 | 
159 | def parent(element: Element):
160 |     """
161 |     get parent of element
162 |     :param element:
163 |     :return:
164 |     """
165 |     if element is None:
166 |         return None
167 |     parent = element.getparent()
168 |     if isinstance(parent, ModuleType):
169 |         parent.__class__ = Element
170 |     return parent
171 | 
172 | 
173 | def children(element: Element, including=False):
174 |     """
175 |     get children
176 |     :param element:
177 |     :param including:
178 |     :return:
179 |     """
180 |     if element is None:
181 |         return []
182 |     if including:
183 |         yield element
184 |     for child in element.iterchildren():
185 |         if isinstance(child, HtmlElement):
186 |             child.__class__ = Element
187 |             yield child
188 | 
189 | 
190 | def siblings(element: Element, including=False):
191 |     """
192 |     get siblings of element
193 |     :param element:
194 |     :param including: include current element or not
195 |     :return:
196 |     """
197 |     if element is None:
198 |         return []
199 |     if including:
200 |         yield element
201 |     for sibling in element.itersiblings(preceding=True):
202 |         if isinstance(sibling, HtmlElement):
203 |             sibling.__class__ = Element
204 |             yield sibling
205 |     for sibling in element.itersiblings(preceding=False):
206 |         if isinstance(sibling, HtmlElement):
207 |             sibling.__class__ = Element
208 |             yield sibling
209 | 
210 | 
211 | def descendants(element: Element, including=False):
212 |     """
213 |     get descendants clement of specific element
214 |     :param element: parent element
215 |     :param including: including current element or not
216 |     :return:
217 |     """
218 |     if element is None:
219 |         return []
220 |     if including:
221 |         yield element
222 |     for descendant in element.iterdescendants():
223 |         if isinstance(descendant, HtmlElement):
224 |             descendant.__class__ = Element
225 |             yield descendant
226 | 
227 | 
228 | def alias(element: Element):
229 |     """
230 |     get alias of element, concat tag and attribs
231 |     :param element:
232 |     :return:
233 |     """
234 |     if element is None:
235 |         return ''
236 |     tag = element.tag
237 |     # skip nth-child
238 |     if tag in ['html', 'body']:
239 |         return tag
240 |     attribs = [tag]
241 |     for k, v in element.attrib.items():
242 |         k, v = re.sub(r'\s*', '', k), re.sub(r'\s*', '', v)
243 |         attribs.append(f'[{k}="{v}"]' if v else f'[{k}]')
244 |     result = ''.join(attribs)
245 |     # get nth-child
246 |     nth = len(list(element.itersiblings(preceding=True))) + 1
247 |     result += f':nth-child({nth})'
248 |     return result
249 | 
250 | 
251 | def children_of_head(element: Element):
252 |     """
253 |     get children element of body element
254 |     :param element:
255 |     :return:
256 |     """
257 |     if element is None:
258 |         return []
259 |     body_xpath = '//head'
260 |     body_element = element.xpath(body_xpath)
261 |     if body_element:
262 |         body_element.__class__ = Element
263 |         return descendants(body_element, True)
264 |     return []
265 | 
266 | 
267 | def descendants_of_body(element: Element):
268 |     """
269 |     get descendants element of body element
270 |     :param element:
271 |     :return:
272 |     """
273 |     if element is None:
274 |         return []
275 |     body_xpath = '//*'
276 |     elements = element.xpath(body_xpath)
277 |     if elements:
278 |         elements[0].__class__ = Element
279 |         return list(descendants(elements[0], True))
280 |     return []
281 | 
282 | 
283 | def text(element: Element):
284 |     """
285 |     get text of element
286 |     :param element:
287 |     :return:
288 |     """
289 |     if element is None:
290 |         return 0
291 |     text = ''.join(element.xpath('.//text()'))
292 |     text = re.sub(r'\s*', '', text, flags=re.S)
293 |     # text = ''.join(re.findall(r'[\u4e00-\u9fa5]+', text))
294 |     return text
295 | 
296 | 
297 | def calc_a_descendants_text_of_avg_length(element: Element):
298 | 
299 |     if element is None:
300 |         return 0
301 |     if element.tag.lower() == "a":
302 |         if len(element.siblings) > 1:
303 |             lengths = []
304 |             for link in element.siblings:
305 |                 if element.attrib.get("title") is not None:
306 |                     lengths.append(len(element.attrib.get("title")))
307 |                 if len(element.text.strip()) > 0:
308 |                     lengths.append(len(element.text.strip()))
309 |                 if len(text(link)) > 0:
310 |                     lengths.append(len(text(link)))
311 |                 lengths.append(0)
312 |             if len(lengths) == 0:
313 |                 return 0
314 |             avg_length = np.mean(lengths)
315 |             return avg_length
316 |         else:
317 |             return element.number_of_char
318 |     lengths = []
319 |     # other node
320 |     try:
321 |         siblings = element.siblings
322 |     except Exception:
323 |         return 0
324 | 
325 |     for descentdant in siblings:
326 |         if descentdant.number_of_a_char > 0:
327 |             txt = descentdant.text
328 |             regex = r"(2[0-9]{3}.?[0-1]{0,1}[0-9].?[0-3]{0,1}[0-9])"
329 |             regex2 = r"([0-1]{0,1}[0-9]-[0-3]{0,1}[0-9])"
330 |             time_match = len(re.findall(regex, txt)) or len(re.findall(regex2, txt))
331 |             t = descentdant.number_of_a_char
332 |             if time_match > 0:
333 |                 t = t + 10
334 |             lengths.append(t / len(descentdant.a_descendants))
335 |             continue
336 |         if len(descentdant.text.strip()) > 0:
337 |             lengths.append(len(descentdant.text.strip()))
338 |             continue
339 |         if len(text(descentdant)) > 0:
340 |             lengths.append(len(text(descentdant)))
341 |             continue
342 |         lengths.append(0)
343 |     if len(lengths) == 0:
344 |         return 0
345 |     avg_length = np.mean(lengths)
346 |     return avg_length
347 | 
348 | 
349 | def number_of_char(element: Element):
350 |     """
351 |     get number of char, for example, result of `<a href="#">hello</a>world` = 10
352 |     :param element:
353 |     :return: length
354 |     """
355 |     if element is None:
356 |         return 0
357 |     return len(text(element))
358 | 
359 | 
360 | def number_of_a_char(element: Element):
361 |     """
362 |     get number of linked char, for example, result of `<a href="#">hello</a>world` = 5
363 |     :param element:
364 |     :return: length
365 |     """
366 |     if element is None:
367 |         return 0
368 |     text = ''.join(element.xpath('.//a//text()'))
369 |     text = re.sub(r'\s*', '', text, flags=re.S)
370 |     return len(text)
371 | 
372 | 
373 | def number_of_a_char_log10(element: Element):
374 |     """
375 |     get number of linked char, to log10
376 |     :param element:
377 |     :return: length
378 |     """
379 |     if element is None:
380 |         return 0
381 |     return np.log10(number_of_a_char(element) + 1)
382 | 
383 | 
384 | def number_of_p_children(element: Element):
385 |     """
386 |     get number of p tags in children
387 |     :param element:
388 |     :return:
389 |     """
390 |     if element is None:
391 |         return 0
392 |     return len(element.xpath('./p'))
393 | 
394 | 
395 | def number_of_p_descendants(element: Element):
396 |     """
397 |     get number of p tags in descendants
398 |     :param element:
399 |     :return:
400 |     """
401 |     # element_list = list()
402 |     if element is None:
403 |         return 0
404 |     # for tag in ["p", "span", "tr", "td", "th", "u", "strong", "b", "section", "spanstyle", "spanlang"]:
405 |     #     element_list.extend(element.xpath(f".//{tag}"))
406 |     return len(element.xpath(".//*"))
407 | 
408 | 
409 | def number_of_p_descendants_log10(element: Element):
410 |     """
411 |     get number of p tags, to log10
412 |     :param element:
413 |     :return:
414 |     """
415 |     if element is None:
416 |         return 0
417 |     return np.log10(number_of_p_descendants(element))
418 | 
419 | 
420 | def number_of_a_descendants(element: Element):
421 |     """
422 |     get number of a tags in this element
423 |     :param element:
424 |     :return:
425 |     """
426 |     if element is None:
427 |         return 0
428 |     return len(element.xpath('.//a'))
429 | 
430 | 
431 | def number_of_punctuation(element: Element):
432 |     """
433 |     get number of punctuation of text in this element
434 |     :param element:
435 |     :return:
436 |     """
437 |     if element is None:
438 |         return 0
439 |     text = ''.join(element.xpath('.//text()'))
440 |     text = re.sub(r'\s*', '', text, flags=re.S)
441 |     punctuations = [c for c in text if c in PUNCTUATION]
442 |     return len(punctuations)
443 | 
444 | 
445 | def number_of_descendants(element: Element):
446 |     """
447 |     get number of descendants
448 |     :param element:
449 |     :return:
450 |     """
451 |     if element is None:
452 |         return 0
453 |     # return len(element.xpath('.//*'))
454 |     return len(list(descendants(element, including=False)))
455 | 
456 | 
457 | def number_of_siblings(element: Element):
458 |     """
459 |     get number of siblings
460 |     :param element:
461 |     :return:
462 |     """
463 |     if element is None:
464 |         return 0
465 |     return len(list(siblings(element, including=False)))
466 | 
467 | 
468 | def number_of_clusters(element: Element, tags=None):
469 |     """
470 |     get number of clusters
471 |     :param element:
472 |     :return:
473 |     """
474 |     from CrawlersTools.extractors.utils.settings import LIST_MIN_NUMBER, LIST_MAX_LENGTH, LIST_MIN_LENGTH, SIMILARITY_THRESHOLD
475 |     if element is None:
476 |         return 0
477 |     if tags and not isinstance(tags, (list, tuple)):
478 |         logger.error('you must pass tags arg as list or tuple')
479 |     descendants_tree = defaultdict(list)
480 |     descendants = descendants_of_body(element)
481 |     for descendant in descendants:
482 |         # if one element does not have enough siblings, it can not become a child of candidate element
483 |         if descendant.number_of_siblings + 1 < LIST_MIN_NUMBER:
484 |             continue
485 |         # if min length is larger than specified max length, it can not become a child of candidate element
486 |         if descendant.a_descendants_group_text_min_length > LIST_MAX_LENGTH:
487 |             continue
488 |         # if max length is smaller than specified min length, it can not become a child of candidate element
489 |         if descendant.a_descendants_group_text_max_length < LIST_MIN_LENGTH:
490 |             continue
491 |         # descendant element must have same siblings which their similarity should not below similarity_threshold
492 |         if descendant.similarity_with_siblings < SIMILARITY_THRESHOLD:
493 |             continue
494 |         # filter tag
495 |         if tags and descendant.tag not in tags:
496 |             continue
497 |         descendants_tree[descendant.parent_selector].append(descendant)
498 |     return len(descendants_tree)
499 | 
500 | 
501 | def number_of_children(element: Element):
502 |     """
503 |     get number of children
504 |     :param element:
505 |     :return:
506 |     """
507 |     if element is None:
508 |         return 0
509 |     return len(list(children(element)))
510 | 
511 | 
512 | def density_of_text(element: Element):
513 |     """
514 |     get density of text, using:
515 |                number_of_char - number_of_a_char
516 |     result = ------------------------------------------
517 |                number_of_descendants - number_of_a_descendants
518 |     :return:
519 |     """
520 |     # if denominator is 0, just return 0
521 |     if element.number_of_descendants - element.number_of_a_descendants == 0:
522 |         return 0
523 |     return (element.number_of_char - element.number_of_a_char) / \
524 |            (element.number_of_descendants - element.number_of_a_descendants)
525 | 
526 | 
527 | def density_of_punctuation(element: Element):
528 |     """
529 |     get density of punctuation, using
530 |                 number_of_char - number_of_linked_char
531 |     result = -----------------------------------------
532 |                  number_of_punctuation + 1
533 |     :param element:
534 |     :return:
535 |     """
536 |     result = (element.number_of_char - element.number_of_a_char) / \
537 |              (element.number_of_punctuation + 1)
538 |     # result should not be zero
539 |     return result or 1
540 | 
541 | 
542 | def similarity_with_element(element1: Element, element2: Element):
543 |     """
544 |     get similarity between two elements
545 |     :param element1:
546 |     :param element2:
547 |     :return:
548 |     """
549 |     alias1 = element1.alias
550 |     alias2 = element2.alias
551 |     # TODO: use better metrics to compare the two elements
552 |     return similarity(alias1, alias2)
553 | 
554 | 
555 | def similarity_with_siblings(element: Element):
556 |     """
557 |     get similarity with siblings
558 |     :param element:
559 |     :return:
560 |     """
561 |     scores = []
562 |     for sibling in siblings(element):
563 |         # TODO: maybe compare all children not only alias
564 |         scores.append(similarity_with_element(element, sibling))
565 |     if not scores:
566 |         return 0
567 |     return np.mean(scores)
568 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/utils/preprocess.py:
--------------------------------------------------------------------------------
  1 | from lxml.html import etree
  2 | 
  3 | from CrawlersTools.extractors.schemas.element import Element
  4 | from CrawlersTools.extractors.utils.element import children, remove_element, remove_children
  5 | 
  6 | # fmt:off
  7 | CONTENT_EXTRACTOR_USELESS_TAGS = ['audio', 'colgroup', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'iframe',
  8 |                                   'input', 'link', 'option', 'path', 'script', 'select', 'source', 'style', 'svg',
  9 |                                   'symbol', 'video']
 10 | 
 11 | CONTENT_EXTRACTOR_STRIP_TAGS = ['b', 'blockquote', 'br', 'font',  'p', 'section', 'span', 'spanlang', 'spanstyle',
 12 |                                 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'th', 'tr', 'u'] # 'img'
 13 | 
 14 | KEYWORD_FEATURES = 'and not (contains(@class,"main")) and not (contains(@class,"content"))and not (contains(@class,"con"))and not (contains(@class,"container")) and not (contains(@class,"list")) and not (contains(@class,"box")) and not (contains(@class,"right"))and not (contains(@class,"body")) and not (contains(@class,"lanmu")) '
 15 | CONTENT_EXTRACTOR_NOISE_XPATH = [
 16 |     # '//div[contains(@class, "comment")]',
 17 |     '//div[contains(@class, "advertisement")]',
 18 |     '//div[contains(@class, "advert")]',
 19 |     '//a[contains(@style, "display: none")]',
 20 |     '//a[contains(@style, "display:none")]',  # TODO css不展示数据是否要去除，可能会影响正文重复
 21 |     f'//div[contains(@class, "foot") {KEYWORD_FEATURES}]',
 22 |     f'//div[contains(@class, "footer") {KEYWORD_FEATURES}]',
 23 |     # f'//div[contains(@class, "location") {KEYWORD_FEATURES}]',
 24 |     f'//div[contains(@class, "navigation") {KEYWORD_FEATURES}]',
 25 |     f'//div[contains(@class, "barrier") {KEYWORD_FEATURES}]',
 26 |     '//div[contains(@id, "foot")]',
 27 |     # '//div[contains(@class, "head")]',    # 误删
 28 |     # '//div[contains(@id, "head")]',
 29 |     # '//div[contains(@class, "nav")]', # 误删
 30 |     '//div[contains(@id, "nav")]',
 31 |     '//div[contains(@class, "siderbar")]',
 32 |     '//div[contains(@class, "breadcrumb")]',
 33 |     '//div[contains(@id, "siderbar")]',
 34 |     '//div[contains(@id, "页脚")]',
 35 |     '//div[contains(@class, "页脚")]',
 36 |     '//div[contains(@id, "页眉")]',
 37 |     '//div[contains(@id, "页头")]',
 38 |     '//div[contains(@class, "页眉")]',
 39 |     '//div[contains(@class, "页头")]',
 40 |     '//*[contains(@class, "hidden")]',
 41 | ]
 42 | 
 43 | 
 44 | def preprocess4content_extractor(element: Element, is_content: bool = True):
 45 |     """
 46 |     preprocess element for content extraction
 47 |     :param element:
 48 |     :param is_content:  save content without tag
 49 |     :return:
 50 |     """
 51 |     remove_children(element, CONTENT_EXTRACTOR_NOISE_XPATH)
 52 | 
 53 |     # remove tag and its content
 54 |     etree.strip_elements(element, *CONTENT_EXTRACTOR_USELESS_TAGS)
 55 | 
 56 |     if not is_content: return
 57 |     # only move tag pair
 58 |     etree.strip_tags(element, *CONTENT_EXTRACTOR_STRIP_TAGS)
 59 | 
 60 |     for child in children(element):
 61 | 
 62 |         # merge text in span or strong to parent p tag
 63 |         if child.tag.lower() == 'p' or child.tag.lower() == 'table':
 64 |             etree.strip_tags(child, 'span')
 65 |             etree.strip_tags(child, 'strong')
 66 |             etree.strip_tags(child, 'tr')
 67 |             etree.strip_tags(child, 'td')
 68 | 
 69 |             if not (child.text and child.text.strip()):
 70 |                 remove_element(child)
 71 | 
 72 |         # if a div tag does not contain any sub node, it could be converted to p node.
 73 |         if child.tag.lower() == 'div' and not child.getchildren():
 74 |             child.tag = 'p'
 75 | 
 76 | 
 77 | LIST_EXTRACTOR_USELESS_TAGS = CONTENT_EXTRACTOR_USELESS_TAGS
 78 | LIST_EXTRACTOR_STRIP_TAGS = CONTENT_EXTRACTOR_STRIP_TAGS
 79 | LIST_EXTRACTOR_NOISE_XPATH = CONTENT_EXTRACTOR_NOISE_XPATH
 80 | 
 81 | 
 82 | def preprocess4list_extractor(element: Element):
 83 |     """
 84 |     preprocess element for list extraction
 85 |     :param element:
 86 |     :return:
 87 |     """
 88 |     # remove tag and its content
 89 |     etree.strip_elements(element, *CONTENT_EXTRACTOR_USELESS_TAGS)
 90 |     # only move tag pair
 91 |     etree.strip_tags(element, *CONTENT_EXTRACTOR_STRIP_TAGS)
 92 | 
 93 |     remove_children(element, CONTENT_EXTRACTOR_NOISE_XPATH)
 94 | 
 95 |     for child in children(element):
 96 | 
 97 |         # merge text in span or strong to parent p tag
 98 |         if child.tag.lower() == 'p':
 99 |             etree.strip_tags(child, 'span')
100 |             etree.strip_tags(child, 'strong')
101 | 
102 |             if not (child.text and child.text.strip()):
103 |                 remove_element(child)
104 | 
105 |         # if a div tag does not contain any sub node, it could be converted to p node.
106 |         if child.tag.lower() == 'div' and not child.getchildren():
107 |             child.tag = 'p'
108 | 
109 | 
110 | LIST_CLASSIFIER_USELESS_TAGS = ['style', 'script', 'link', 'video', 'audio', 'iframe', 'source', 'svg', 'path',
111 |                                 'symbol', 'footer', 'header']
112 | LIST_CLASSIFIER_STRIP_TAGS = ['span', 'blockquote']
113 | LIST_CLASSIFIER_NOISE_XPATHS = [
114 |     '//div[contains(@class, "comment")]',
115 |     '//div[contains(@class, "advertisement")]',
116 |     '//div[contains(@class, "advert")]',
117 |     '//div[contains(@style, "display: none")]',
118 | ]
119 | 
120 | 
121 | def preprocess4list_classifier(element: Element):
122 |     """
123 |     preprocess element for list classifier
124 |     :param element:
125 |     :return:
126 |     """
127 |     # remove tag and its content
128 |     etree.strip_elements(element, *LIST_CLASSIFIER_USELESS_TAGS)
129 |     # only move tag pair
130 |     etree.strip_tags(element, *LIST_CLASSIFIER_STRIP_TAGS)
131 | 
132 |     remove_children(element, LIST_CLASSIFIER_NOISE_XPATHS)
133 | 
134 |     for child in children(element):
135 | 
136 |         # merge text in span or strong to parent p tag
137 |         if child.tag.lower() == 'p':
138 |             etree.strip_tags(child, 'span')
139 |             etree.strip_tags(child, 'strong')
140 | 
141 |             if not (child.text and child.text.strip()):
142 |                 remove_element(child)
143 | 
144 |         # if a div tag does not contain any sub node, it could be converted to p node.
145 |         if child.tag.lower() == 'div' and not child.getchildren():
146 |             child.tag = 'p'
147 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/utils/settings.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Project : CrawlersTools
  3 | # @Time    : 2022/8/19 20:00
  4 | # @Author  : MuggleK
  5 | # @File    : settings.py
  6 | 
  7 | # list settings
  8 | LIST_MIN_NUMBER = 5
  9 | LIST_MIN_LENGTH = 8
 10 | LIST_MAX_LENGTH = 50
 11 | SIMILARITY_THRESHOLD = 0.8
 12 | 
 13 | LIST_AVG_LENGTH = 9
 14 | ADDTION_RIGHT_NUM = 10000
 15 | 
 16 | HIGH_WEIGHT_ERROR_KEYWORD = ["ICP备", "公网安备", "网公安备", "备案序号：", "网站地图"]
 17 | DIRECTORY_ERROR_TITLE = ["首页", "下一页", "解读", "图解", "详细", "阅读全文", "标题", "[详细]"]
 18 | 
 19 | 
 20 | # common settings
 21 | SPECIAL_SYMBOL_MAP = {
 22 |     "&quot;": '"',
 23 |     "&amp;": "&",
 24 |     "&lt;": "<",
 25 |     "&gt;": ">",
 26 |     "&nbsp;": " ",
 27 |     "&#34;": '"',
 28 |     "&#38;": "&",
 29 |     "&#60;": "<",
 30 |     "&#62;": ">",
 31 |     "&#160;": " ",
 32 |     '<script type="text/xml">': "",
 33 |     "<script type='text/xml'>": "",
 34 | }
 35 | 
 36 | ERROR_NAV_LIST = ['Copyright', 'ICP备', 'QQ空间', '上一篇', '下一篇', '专题专栏', '主题分类', '主题词：', '二维码', '互动交流', '人人网', '优质工程评审',
 37 |                   '使用帮助', '依申请公开', '信息公开', '信息公开制度', '信息公开年报', '信息公开指南', '信息发布', '免责声明', '党建专栏', '党建工作', '党组织概况',
 38 |                   '公示公告', '公网安备', '关闭窗口', '分享到', '分类：', '办事服务', '加入收藏', '区政府文件', '协会文件', '协会章程', '协会简介', '协会领导', '发布日期',
 39 |                   '发布日期', '发布时间', '发布机构', '发文日期', '名称：', '基层政务公开标准化规范化', '备案序号', '字体', '安全人员培训', '市场信用评价', '当前位置',
 40 |                   '您当前的位置', '您现在所在的位置', '您的位置', '意见征集', '成文日期', '手机阅读', '打印', '打印本页', '打印此页', '扫一扫', '收藏本站', '政务公开',
 41 |                   '政务服务', '政府信息公开', '政府信息公开制度', '政府信息公开年报+', '政府信息公开指南', '政府信息公开申请表', '政府公报', '政府动态', '政府网站工作报表',
 42 |                   '政策文件', '政策法规', '新浪微博', '新闻公告', '无障碍浏览', '机构概况', '机构职能', '机构设置', '来源:', '来源：', '标题', '法定主动公开内容',
 43 |                   '法定主动公开内容+', '法定主动公开目录', '法规文件', '浏览器版本过低', '浏览次数', '浏览统计', '浏览量', '浏览量', '点击次数', '点击量', '版权所有',
 44 |                   '生成时间', '电子期刊', '科技动态', '科技资讯', '简繁', '索 引 号', '索引号', '组织机构', '网公安备', '网站地图', '网站建设', '网站标识码', '联系我们',
 45 |                   '联系我们', '腾讯微博', '行政规范性文件', '责任编辑', '返回顶部', '进度查询', '通知公告', '部门规章', '重大决策预公开', '门户网站', '预决算公开', '领导介绍',
 46 |                   '领导信箱', '首页']
 47 | 
 48 | # title settings
 49 | TITLE_EXTRACTOR_USELESS_TAGS = ["br", "BR", "script", "style"]
 50 | 
 51 | PUNCTUATION_ALPHA_PATTERN = '[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
 52 | 
 53 | TITLE_HTAG_XPATH = '(//h1//text() | //h2//text() | //h3//text() | //h4//text() | //h5//text() | //title//text() | //*[contains(@class, "title")]/text() | //*[contains(@class, "Title")]/text() | //*[contains(@id, "title")]/text() | //*[contains(@id, "Title")]/text())'
 54 | 
 55 | TITLE_META_XPATH = '//meta[contains(@name, "Title")]/@content'
 56 | 
 57 | TITLE_META_XPATH_BAK = '//meta[contains(@name, "title")]/@content'
 58 | 
 59 | # time settings
 60 | DATETIME_PATTERN = [
 61 |     r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
 62 |     r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
 63 |     r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
 64 |     r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
 65 |     r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
 66 |     r"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
 67 |     r"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
 68 |     r"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
 69 |     r"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
 70 |     r"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
 71 |     r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
 72 |     r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
 73 |     r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
 74 |     r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
 75 |     r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
 76 |     r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
 77 |     r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
 78 |     r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
 79 |     r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
 80 |     r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
 81 |     r"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
 82 |     r"(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
 83 |     r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",  # TODO 年月日与符号优先级
 84 |     r"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
 85 |     r"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
 86 |     r"(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
 87 |     r"(\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
 88 |     r"(\d{4}年\d{1,2}月\d{1,2}日)",
 89 |     r"(\d{2}年\d{1,2}月\d{1,2}日)",
 90 |     r"(\d{1,2}月\d{1,2}日)",
 91 | ]
 92 | 
 93 | PUBLISH_TIME_META = [  # 部分特别规范的网站，可以直接从 HTML 的 meta 数据中获得发布时间
 94 |     '//meta[starts-with(@property, "rnews:datePublished")]/@content',
 95 |     '//meta[starts-with(@property, "article:published_time")]/@content',
 96 |     '//meta[starts-with(@property, "og:published_time")]/@content',
 97 |     '//meta[starts-with(@property, "og:release_date")]/@content',
 98 |     '//meta[starts-with(@itemprop, "datePublished")]/@content',
 99 |     '//meta[starts-with(@itemprop, "dateUpdate")]/@content',
100 |     '//meta[starts-with(@name, "OriginalPublicationDate")]/@content',
101 |     '//meta[starts-with(@name, "article_date_original")]/@content',
102 |     '//meta[starts-with(@name, "og:time")]/@content',
103 |     '//meta[starts-with(@name, "apub:time")]/@content',
104 |     '//meta[starts-with(@name, "publication_date")]/@content',
105 |     '//meta[starts-with(@name, "sailthru.date")]/@content',
106 |     '//meta[starts-with(@name, "PublishDate")]/@content',
107 |     '//meta[starts-with(@name, "publishdate")]/@content',
108 |     '//meta[starts-with(@name, "PubDate")]/@content',
109 |     '//meta[starts-with(@name, "pubDate")]/@content',
110 |     '//meta[starts-with(@name, "pubdate")]/@content',
111 |     '//meta[starts-with(@name, "pubtime")]/@content',
112 |     '//meta[starts-with(@name, "_pubtime")]/@content',
113 |     '//meta[starts-with(@name, "weibo: article:create_at")]/@content',
114 |     '//meta[starts-with(@pubdate, "pubdate")]/@content',
115 |     '//meta[starts-with(@name, "firstpublishedtime")]/@content',
116 |     '//*[contains(text(), "发布日期")]//text()',
117 |     '//*[contains(text(), "发布时间")]//text()',
118 |     '//*[contains(text(), "发布日期")]/..//text()',
119 |     '//*[contains(text(), "发布时间")]/..//text()',
120 |     '//*[contains(text(), "发布日期")]/../..//text()',
121 |     '//*[contains(text(), "发布时间")]/../..//text()',
122 | ]
123 | 
124 | # attachments settings
125 | ATTACHMENT_REGX = "(pdf|txt|doc|docx|xlsx|xls|csv|wps|hlp|rtf|ppt|pptx|zip|rar|jar|gz|jpg|jpeg|png|tif|gif|bmp|pic|mac|avi|mpg|mov|swf|wav|aif|au|mp3|ram|wma|mmf|amr|aac|flac|et|ofd|webp|jfif)(?:&classid=\\-?[0-9]+|\?docid=[0-9]+)?$"
126 | 


--------------------------------------------------------------------------------
/CrawlersTools/extractors/utils/similarity.py:
--------------------------------------------------------------------------------
 1 | import distance
 2 | 
 3 | 
 4 | def similarity1(s1, s2):
 5 |     """
 6 |     get similarity of two strings
 7 |     :param s1:
 8 |     :param s2:
 9 |     :return:
10 |     """
11 |     if not s1 or not s2:
12 |         return 0
13 |     edit_distance = distance.levenshtein(s1, s2)
14 |     similarity_score = 1 - edit_distance / (len(s1) + len(s2))
15 |     return similarity_score
16 | 
17 | 
18 | def similarity2(s1, s2):
19 |     """
20 |     get similarity of two strings
21 |     :param s1:
22 |     :param s2:
23 |     :return:
24 |     """
25 |     if not s1 or not s2:
26 |         return 0
27 |     s1_set = set(list(s1))
28 |     s2_set = set(list(s2))
29 |     intersection = s1_set.intersection(s2_set)
30 |     union = s2_set.union(s1_set)
31 |     return len(intersection) / len(union)
32 | 
33 | 
34 | def similarity(s1, s2):
35 |     """
36 |     get similarity of two strings
37 |     :param s1:
38 |     :param s2:
39 |     :return:
40 |     """
41 |     return similarity2(s1, s2)
42 | 
43 | 
44 | def get_longest_common_sub_string(str1: str, str2: str) -> str:
45 |     """
46 |     get longest common string
47 |     :param str1:
48 |     :param str2:
49 |     :return:
50 |     """
51 |     if not all([str1, str2]):
52 |         return ''
53 |     matrix = [[0] * (len(str2) + 1) for _ in range(len(str1) + 1)]
54 |     max_length = 0
55 |     start_position = 0
56 |     for index_of_str1 in range(1, len(str1) + 1):
57 |         for index_of_str2 in range(1, len(str2) + 1):
58 |             if str1[index_of_str1 - 1] == str2[index_of_str2 - 1]:
59 |                 matrix[index_of_str1][index_of_str2] = matrix[index_of_str1 - 1][index_of_str2 - 1] + 1
60 |                 if matrix[index_of_str1][index_of_str2] > max_length:
61 |                     max_length = matrix[index_of_str1][index_of_str2]
62 |                     start_position = index_of_str1 - max_length
63 |             else:
64 |                 matrix[index_of_str1][index_of_str2] = 0
65 |     return str1[start_position: start_position + max_length]
66 | 
67 | 
68 | if __name__ == '__main__':
69 |     s1 = 'hello'
70 |     s2 = 'world'
71 |     print(similarity(s1, s2))
72 | 


--------------------------------------------------------------------------------
/CrawlersTools/js_crawler/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time    : 2022/8/15 11:29
4 | # @Author  : MuggleK
5 | # @File    : __init__.py
6 | 
7 | from CrawlersTools.js_crawler.font_decrypt import FontDecrypt
8 | from CrawlersTools.js_crawler.transfer_js import int_overflow, right_shift, string_similar
9 | 


--------------------------------------------------------------------------------
/CrawlersTools/js_crawler/font_decrypt.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Project : CrawlersTools
  3 | # @Time    : 2022/8/12 16:11
  4 | # @Author  : MuggleK
  5 | # @File    : font_decrypt.py
  6 | 
  7 | from io import BytesIO
  8 | 
  9 | import httpx
 10 | from PIL import ImageFont, ImageDraw, Image
 11 | from fontTools.ttLib import TTFont
 12 | from loguru import logger
 13 | 
 14 | 
 15 | class FontDecrypt(object):
 16 |     """
 17 |     Usage::
 18 | 
 19 |         # >>>
 20 |         # >>> # 字体base64加密
 21 |         # >>> str_base64_ = 'd09GRgABAAAAAE34AAoAAAAATbAAATMzAAAAAAAAAAAAAAAAAAAAAAAAAABPUy8yAAAA9AAAAGAAAABgZ5ijY2NtYXAAAAFUAAAGogAABqJE+hJPZ2x5ZgAAB/gAAEEUAABBFJTJlvxoZWFkAABJDAAAADYAAAA2ByuHNWhoZWEAAElEAAAAJAAAACQEJgKgaG10eAAASWgAAAD0AAAA9CckDMBsb2NhAABKXAAAAOAAAADgLkc+mG1heHAAAEs8AAAAIAAAACAAeQBhbmFtZQAAS1wAAAJ8AAACfKf0GLVwb3N0AABN2AAAACAAAAAg/58ArwAEA8cBkAAFAAQB9AH0AAAAAAH0AfQAAAH0AEAAyAgBAgsEAAAAAAAAAKAAAv8QAAAAAAAAFgAAAABBUFBMAEAAMZfnA1z/dAGQBCQBVAAEAAEAAAAAAlgDXAAAACAAAwAAAAQAAAADAAAAJAABAAAAAAB8AAMAAQAAACQAAwAKAAABggAEAFgAAAASABAAAwACADEAMgAzADQANQA2ADgAOf//AAAAMQAyADMANAA1ADYANwA5////1P/V/9H/z//N/8v/0f/NAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAEGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABQcEAwIBCAkGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAwAAAAABSAAAAAAAAAAbAAAADEAAAAxAAAABQAAADIAAAAyAAAABwAAADMAAAAzAAAABAAAADQAAAA0AAAAAwAAADUAAAA1AAAAAgAAADYAAAA2AAAAAQAAADcAAAA4AAAACAAAADkAAAA5AAAABgACLk8AAi5PAAAACgACLlcAAi5XAAAAEgACLm8AAi5vAAAALQACLnMAAi5zAAAALwACLnUAAi51AAAAMgACLn4AAi5+AAAANwACLoMAAi6DAAAAOwACLpEAAi6RAAAARAACLpQAAi6UAAAARwACLpgAAi6YAAAASAACLrIAAi6zAAAAUgACLrsAAi67AAAAVgACLtIAAi7SAAAAYwACLvUAAi71AAAAagACLwsAAi8LAAAAbgACTV0AAk1dAAAACwACTWIAAk1iAAAADAACTWkAAk1pAAAADQACTWsAAk1rAAAADgACTXwAAk18AAAADwACTYoAAk2KAAAAEAACTd0AAk3dAAAAEQACTmkAAk5pAAAAEwACTpwAAk6cAAAAFAACTqIAAk6iAAAAFQACTqsAAk6rAAAAFgACTzAAAk8wAAAAFwACTz0AAk89AAAAGAACUJgAAlCYAAAAGQACUMUAAlDFAAAAGgACUVUAAlFVAAAAGwACUXgAAlF4AAAAHAACUYUAAlGFAAAAHQACUfcAAlH3AAAAHgACUmUAAlJlAAAAHwACUp0AAlKdAAAAIAACUqYAAlKmAAAAIQACUqkAAlKpAAAAIgACUxkAAlMZAAAAIwACUyAAAlMgAAAAJAACU0YAAlNGAAAAJQACU1cAAlNXAAAAJgACU2AAAlNgAAAAJwACU6cAAlOnAAAAKAACU9sAAlPbAAAAKQACVkwAAlZMAAAAKgACVncAAlZ3AAAAKwACV0kAAldJAAAALAACWGkAAlhpAAAALgACWKUAAlilAAAAMAACWSMAAlkjAAAAMQACWrUAAlq1AAAAMwACWu0AAlrtAAAANAACWwUAAlsFAAAANQACW0sAAltLAAAANgACXVEAAl1RAAAAOAACXVcAAl1XAAAAOQACXcMAAl3DAAAAOgACXeMAAl3jAAAAPAACX3YAAl92AAAAPQACYV8AAmFfAAAAPgACYc8AAmHPAAAAPwACYlAAAmJQAAAAQAACYtcAAmLXAAAAQQACZIgAAmSIAAAAQgACZKgAAmSoAAAAQwACZOgAAmToAAAARQACZP8AAmT/AAAARgACZm4AAmZuAAAASQACZn4AAmZ+AAAASgACZokAAmaJAAAASwACZp8AAmafAAAATAACZ14AAmdeAAAATQACbCQAAmwkAAAATgACcIUAAnCFAAAATwACcbgAAnG4AAAAUAACc1UAAnNVAAAAUQACdIQAAnSEAAAAVAACddMAAnXTAAAAVQACdlAAAnZQAAAAVwACd2MAAndjAAAAWAACeRwAAnkcAAAAWQACeSAAAnkgAAAAWgACeVoAAnlaAAAAWwACecUAAnnFAAAAXAACepgAAnqYAAAAXQACfEoAAnxKAAAAXgACfh4AAn4eAAAAXwACfi4AAn4uAAAAYAACgAEAAoABAAAAYQACgEwAAoBMAAAAYgACgwYAAoMGAAAAZAACh7cAAoe3AAAAZQACivAAAorwAAAAZgACiwkAAosJAAAAZwACi00AAotNAAAAaAACkRwAApEcAAAAaQAClbEAApWxAAAAawACl8gAApfIAAAAbAACl+cAApfnAAAAbQAAAAAAAAAAAAAAAAAAAAEAbwAAAWkCygAKAAABESMRBgc1Njc2NwFpUkJmLzUxJQLK/TYCZkQeUg0fHioAAAEAPgAAAhoC2AAhAAAAFxYVFAcGBwYHIRUhNDc2NzY3NjU0JyYjIgcGByM0NzYzAZlAQEYpa4gUAXf+JEcubVUbNycnRksmKwFSQ0JtAtg5Ol1cTCtIW0lJY040Sz0cPERBJCQyMV53SEgAAAEAM//yAiYC2AAyAAAAFxYVFAcWFxYVFAcGIyInJiczFhcWMzI3NjU0JyYrATUzMjc2NTQnJiMiBwYHIzY3NjMBmz4+dUAgJERFb2RCTAlTAzUuQkwvKyopSzo3RyYmJSZHSScpCFEJQz5nAtg0NVl5JxQpKj9hPj80P3hULCUsKz5AJCNAICE8OyIjJiZJZzs6AAACABgAAAJBAsoACgAOAAABETMVIxUjNSE1AQcBIREBzXR0Tv6ZAWgE/uEBIgLK/htDoqJOAdpr/oYBegABADP/8gIlAsoAJwAAARUhBzM2NzYzMhcWFRQHBiMiJyYnMxYXFjMyNzY1NCcmIyIHBgcjEwII/qQYBB4vKDNmP0BNSWlfQkoIUQYwK0JLMTEuK08zKCsVTiYCyknvJBIRQENybEVBMzliRCQgMDFKVDAvFhUrAYoAAAAAAgAz//ICJgLYACAAMAAAABcjJiMiBwYdARQXMzY3NjMyFxYVFAcGIyInJjU0NzYzAgcGFRQXFjMyNzY1NCcmIwH8HlEYfFUvLwEEHDEvQGY/PkRGZ4FDPkNGekkvLiwtSkcuLSwsSgLYv3pTTHwKCAg5HR9CQ2dpRkZhWqWsanD+tzAvS0wvMzEzTEsuLwAAAQBCAAACFwLKAAYAABMhFQEjASFCAdX++VcBCv5/AspD/XkCfwAAAAMAKv/yAi4C2AAhADEAQQAAABcWFRQHBgcVFhcWFRQHBiMiJyY1NDc2NzUmJyY1NDc2MwYHBhUUFxYzMjc2NTQnJiMCBwYVFBcWMzI3NjU0JyYjAZ0+OhsfNjgnKkJGenxEQiooNzgdGzo+cU0qJSEoU1EqISUrTFcuLCwuV1YwKysvVwLYOjVOOScrEwIOMDNFXzk7OzlfRTMxDQITKyc5TzQ6QychNTchKCghNzQiJ/7DKyZCQSYpKig+QCgrAAIAM//yAiYC2AAgADAAAAAXFhUUBwYjIiczFjMyNzY9ASY1IwYHBiMiJyY1NDc2MwYHBhUUFxYzMjc2NTQnJiMBpUI/Q0l3xx1RF31SMi8BBBsyMj1nPz1EQ2pHLSwsLEpFMC4sLUoC2GBapqhucL96U1B4CgkHNiAfREBoa0RGRTEwT00sLzAxSUoxMwAAAAEAMgFLA7UBkwADAAATIRUhMgOD/H0Bk0gAAAACADr/pgN4AzUAFQAZAAABByEQBwYjIi8BFjMyNzYRIRMXByEVASEVIQE7EQIsLi2VP0cTdBaBGxv9yy5JDQI1/MICb/2RAmye/lxFPwNCBz0/ASoBqwZ9Rv56RgAAAAABADP/kQO1AzgAKwAAEyE2NyM1ITY3Fw8BBgchFSEGByEVIQYPAiEVBgcWFwcmJzcWFzY3ITY3ITMBHg0S+QEICRBKBgUHBQGv/kIOEAIX/dYJDAsLAdxKmFNaKN7yJmR0iVH+Eicd/vYCADFLQylQBh4dIhZDPj5GHyUhHz5ybR4rP25BOBsoWmRpXgAAAAADADn/xgOvAycACwARABcAAAEzETMRMxEhFSE1IQECByc2EwQTByYnNwFjSY5JASz8igEqAjE8W0FYQP2KPUI2UT0DJ/zlAxv85UZGAlP++sMXsgESzP79EfDeFAAAAAADADD/ngO2AzsAIQAnAC0AABMzNjcXBgchFSEGByE1MxUhFSERFCsBJxYzMjURISc2NyMBBgcnNjcEFwcmJzdO9xwbRxUbAf794khVAQ5JATv+xV9oDzcnL/6mEV1O1gEEY5Euj2ECN181TpIwArg9RhA1PkaNapSUQ/7CXEcFLAEsP2SX/kuIZzxhfnJ5NG6ELgAAAwBp/5UDgAM1AA8AEwAXAAABMxUhESM1IREjESEVIxEhASERIQEhESEBz0oBZ0j+4Ur+4kgBZv7iAR7+4gFoAR/+4QM1tP4qP/6rAVU/Adb+rwEL/vUBCwAAAAABADn/vgOwAzMAGgAAASE1ISYnNxYXByEVIRUhFSERIRUhNSERITUhAdD+jAF6GSVGJBYnAZz+jAFE/rwBl/yJAZf+vAFEAkBHTUwTU0oPR+xD/vRHRwEMQwAAAAEAOv+nA64DCgAWAAATIRUhESEVIREUKwEnFjMyNREhNSERIXkC7/65AY3+c2WcD2YsNf5iAZ7+oQMKRv7wSP6dYkUEMwFRSAEQAAABACP/ogPDAy8ADgAAARQHEgUHJAMCBSckEzY1AhgHVAFeL/7Nalr+siwBnA4CAy9xbf5o1z/JAVP+utdA/QGxiBcAAAMAIf+dA8QDMQAKAA4AJgAAARYXByYnBgUnJDcDIRUhByEVIQYHNjcmJzcWFwcmJwYFJzY3NjcjAg6/9yf2s6L+9SYBCbDaAev+FZ0DJP4pSGb71zw0OIpIPBsc6/6bFBoMYz77AzHVe0WB1MaQRYfK/s1EhUaNaQ4dTjokmm8qKigmFT8HDHJ3AAAABAAk/5oDvwM3AAoAFAAeACQAAAEGBxEjEQYHJzY3FyEmJzcWFyEVIQMhNhMXAgczFSESFwcmJzcBUCc6SDM5F6BJPgENFB1MGBUBBf2mEQFYTDlFNEzj/XvGL0MoQEIDGHZm/V4CNEQ8TbXnp0pIDUVaRf3HxgEyEv7x10cBc/oS4NkUAAAAAAIAJ/+bA8ADNQAKACoAABMGByc2NxcGBxEjEyE1MxUhFSMWFwcmAyMRMxUjFSM1IzUzESMCByc2EyOhLTgVnEVBKDlHlwEJRwER00ezL7hHCpubR5mZC0y5JatLygG3QUBIwPcegGz9cALrrKxG+r873wEV/m9Fzs5FAZH+yMY9rgETAAIAI/+bA7MDNwAWACEAAAEGByc2NxcGByEVIRUhFSEVIRUhFSMRBQYHJzY3FwYHESMB5ERbM4VUSBoWAbD+sAEk/twBKv7WSf6KMDkYo0lEKD1KAl2QbT2e/BBKOUelRqlF6QLCkD85TLHiIHNo/WIAAAAABgAg/5oDwAM6AAoAFQAZAB0AJQApAAATBgcnNjcXBgcRIxMhJic3FhcHIRUhFyEVIRUhFSEFESM1IRUjERchNSGaMDMXkkRDJDRHmwE0Fh5FHhMmATv9dVwB2f4nAdn+JwHrRv6ORkYBcv6OAcxGOU207B91ZP1YAxM8OBNAOQ5EUUFMQlL+pTY2AVvhngAABwAk/5UDuwM0ABcAHgApAC0ANwA9AEcAAAEGByEVBgcWFwcmJwYHJzY3JicGByc2NxYXNjcjDwEnBgcRIxEGByc2NxczESMBBgcGByc2NzY3FwYHJzY3ATY3NjcXBgcGBwKJDBQBHjFlWHIckWJpkRt4XEAkHCcpey0bT1kz+wUF+CUzRTA2FZVDQUFBAcQ4Oz9mI1ZDOTWPd+kd3XD+iqpra0szQ4aLjAMrJCc+Y0gnG0MkNjkkQRwrMTkbHTZXfuYzOUoGB4N0YP1bAjNFO0i06LH9ygEfLyAhITkWHh0ob5VCPTqF/pYtPz5jJmhOTSYAAAMAKv+fA7oDMAAlACsAMQAAATMRIRUhERQ7ATI3NjcXBgcGKwEiNREjFQYHBgcnNjc2NzUhNSEBBgcnNjcEFwcmJzcBz0oBkP7hKnkhDxIGRQofGzycXKkETUqrKaVBPQP+8AGPAY8zU0FRM/3+QkM/TUQDMP6NR/6xLRUbgBaeIh5iAV4Hu3JoO0A5XGKZB0cBI3t0IXZ4cX0he2sgAAAGADT/nwO1AzUAEwAXABsAHwAlACsAABMzNTMVITUzFTMVIxEzFSE1MxEjEyE1ITUhNSE1ITUhEwYHJzY3BBcHJic3RqlJAXhJqam8/H+7qfIBeP6IAXj+iAF4/ohnerUtsncBsogolJokAtlcXFxcRP42Q0MByv42cj5vPm39rVxHOkNUQFQ9YTc2AAMALP+cA7sDKQAJACQAKgAAAQYHBgcnNjc2NwM1IQYHBgcGByMnFjMyNzY3IQYHBgcnNjc2NyQXByYnNwGoRkdJfydpUEs7rwJfBBAPIyNNuhNaUFMbGQn+9SQ/SZUshkM9HwFg3DDlWj4C+YFPU1k/Pl1Vcf5CRbZ6dSssBEUDRkbtpmBpVDlLV1WT5oU5lesiAAADACT/nAOKAy4AHgAiAC0AABMzNQYHJzY3FwYHFTMVIxUWFwcmJxEjEQYHJzY3NSMlMxEjFycWMzI1ETMRFCM6x19PFvaiI1RaublgYCtVQElHdiCaQ8cCHEhIRA9gKixJXQIjiBEFQxRBPSEVl0UuU2VAbEL+NgHCkHlTiK0B9f2WxEYEMQMW/NhhAAMAI/+bA5YDLgAsADAAOwAAEwYHJzY3FwYHMzUzFTMVIxUzFSMVMxEUKwEnFzI9ASMRIxEjESMRMzUjNTM1JTMRIxcnFjMyNREzERQjnBskOksVRAgLdEi7u9jYw1o1EjErfkh/RsXy8gFbRkYkEFA6LkdcAm88JypRjgszJXx8Q4FDaf74WUQDJ7b+nAFk/tsBaGlDgWX9lsVFBTADGfzVXgAAAwAz/5gDowMuAAMAGwA3AAATIRUhByEVIwYHNjcmJzcWFwcmJwYHJzY3NhMjATMVMxAHBiMiLwEWMzI3NhMjFQIHJzYTNSM1M1oBav6WJwG35zsudXcpJD1JPj4NEZ68EQkHLUuLAkxJ2w4PeRpCEEwXQAsKA5QDzzm/A3x8AutFyUbiZBgsVj8egpYgIyY5IUQDAz4BEAGXtv4XeXADQARNUAGxEv5FzjK6AZ0SRQACACf/mgOvAzYACgAoAAABBgcRIxEGByc2NwEiNREGByc2NxEzETY3FwYHERQ7ATI3NjcXBgcGIwGcM1FIQFIX0mABEms6RyBeQ0iceDKLuzSgJxMQBkMLGRtIAxZ7a/1qAj5GREu06fx2agExISJELCcBkf6baIc5lXb+ri8ZF5YWrSQkAAAAAwA2/5sDrgMzAAsAKQA0AAA3ITUzFSEVIRUjNSElIj0BBgcnNjc1MxU2NxcGBxUUOwEyNzY3FwYHBiMlBgcnNjcXBgcRI0QBikoBjv5ySv52AiZaNl0oYFtFl2UxgqsqligQFgdEDCIeQP2sPUQXvFY9L0JG3WJiRvz8t1dOFyE8HyjlxUpcMm5SXyMPFmEWfhwY5DUtSnueHFlI/rUAAAIAN/+dA7EDNwAcADgAABMhNxcGByEVIQYHIREUKwEnFzI1ESERIxEhNjchBBczNjcXBgczFSMVMxUjFSM1IzUzNSM1MyYnNzcBngxJAwgBkv5kCQwBfFliE10q/YBHAUkOBv5sAW0baSIaQhofcsjm5kbl5cVuFx9AAtNkCiI4RDYv/dhfRgIrAdT9twKNRCH/Rz9DGDkxP2lAtbVAaT83LhgACAA4/50DsAM2AAMABwALAA8ALQBAAEwAUgAAARUzNTMVMz0CIxUjNSMdAiMRMzUhNSE1MxUzJic3FhczFSEVMxEjNSMVIzUFITUzFTMVIxUUKwEnFjMyPQEhATM1MxUzFSMRIxEjABcHJic3AcWfQ6CgQ59D4v7/AQFDeQ8ZPhYPUP7840OgQ/7lAaxGdXVRZw81KSP+VP7vaUdlZUdpAe48LTZdKwGzQUFBQTNCQkJCp10BfkI+UVEjKQotKT5C/oJdVFSLOTk+jExDBR97AcX09Eb9qAJY/j49LjxCKgACACz/lgOxAwUAGwAlAAABIzUhFQYHMxUGBxYXByYnBgcnNjcmJwIHJzYTADcjNjchFQcWFwEprAJUGx3KLJF1li+jcYHFKrt6jEMyzzf5BAHCLdkhIv7xATbCAr5HL3tmKs2NVkFASlxnQ0M/W4Wj/rKuMc0Bxf61nWOncxjrpwAAAAADACn/mQOvAzgAIwAoAC4AABsBFwczNjcXBgchFSEGByEVBgcWFwcmJwYHJzY3JicGByc2ExcWFzY3EhcHJic3dE9HOa8eEkkSHQHU/hkVEQGZKXp5ryXAfnm4JqdxdDJtrCXaaTEzeG8qKUozPXMwAf4BFgzFdn8JdnZFRys+gmhIPENHUlVIQz1KWWrsjEK/AU+0ZldXZgGWVTNNXC8AAwAy/6MDtQMNAAMABwAdAAATIREhJTUhFQc1IRUhByEGBwYjIi8BFjMyNzY3ITfPAkr9tgIA/krnA4P9qyIB6QklJGZKOxhQPVETEgn+Ci8DDf7yQYyM3EVFde8wLQZCBiEhh7YAAAAEAB//nQPGAzIACgASABYAGgAAARYXByYnBgUnJDcBESM1IRUjERMhNSEDIRUhAg6/+Sf6saL+8yYBCrEBVkb+FEZGAez+FA4CCP34AzLdfkWH2MqXRYzR/f7+bTw8AZP+7s4BCEUAAAMAbP+eA4QDNgADAAcAGwAAASERISU1IxUTBgchERQrAScXMjURIREjESE2NwE9AXH+jwEr5XwSIgG5W5YUkiv9eEgBEyQSAc3+okTW1gJ5TUD9ZWJJAywCRv1EAwFHUAAAAAAFAET/lAOaAw0ABwALAA8AGQAfAAABESMRIREjERMhFSElNSEVEwYHBgUnNjc2NwAXByYnNwM4Sf4QSTQCFP3sAcz+e/EVSE7+6Rz0T0YMAR+iJqyxJgHH/pQBKv7GAXwBRvdBdHT+5txOXhlBD1JOt/7gUzxdMjkAAAAAAwAt/6MDjgMnAAcACwAqAAABESM1IxUjERMzESMFMzUGByc2NxcGBxUzFSMVFhcHJicRIxEGByc2NzUjA45G80VF8/P98q9LThffliJTTqurV1gqUzJGQGsejTyvAu38109ZAzP9agJTjJAMBkASOTweEZ5DMkxjPnE5/j0BvY90ToSoBAAAAAQAWf+eA48DAwATABsAHwAlAAATIRUjFTMVIxUzFSE1MzUjNTM1IyURIzUhFSMREyERIQAXByYnN/wB79Kzs+/92e6zs9MCk0n9XElJAqT9XAITGzgeJDkCZEGQQJ9CQp9AkOD8mysrA2X9CgKy/ks6GkQxEwAAAAIAI/+ZA7EDOAAPACQAAAEzNTMVMxUjESEVITUhESMnBgcnNjchNSE2NxcGByEVIQYHESMBZ+VJ7+8BHP2NAQ7loTw6Lbtp/v8BIyETSxgXAff96TpSRwFfurpG/uxHRwEUGj4wO5rIR0tECVA2R3Zn/hQABQA1/74DtQMyAB8AIwAnACsAOwAAEzM1MxUhNTMVMxUjETMVIxYXByYnIQYHJzY3IzUzESMTITUhNSE1ITUhNSETMxUzFSMVIRUhNSE1IzUzULFGAVlGsbHA0EiXHbpO/sZOriWUSNDAsfcBWf6nAVn+pwFZ/qeKRsvLAVf9BQFezc0C5kxMTEw9/rVAWjM+Tn19UT05WEABS/61SDlHOUr+UlBAakNDakAAAAEAN//QA7EDMQAPAAATIREzESEVIREhFSE1IREhNwGZSgGX/mkBcfzUAXH+ZwHuAUP+vUX+bUZGAZMAAAIAYP+sA5cDNAAWAC0AAAEGByEVAgUnNjcmJzcWFzY3IQYHJyQ3Ehc2NyEGByckNxcGByEVAgUnNjcmJzcCMBseAR+i/kkfiGovMz85LZxV/tx1oCoBDIMXMKVh/tiGsysBRp0+KSQBEtX9ux3sqDY0QAMSHhs5/uAjQQkfOS8eNjg/fEUkQjOP/VA/R4ZAH0AhpiIpGzn+ng1FAjhHMx4AAAAAAQAp/5kDvgMwABYAABMhNjc1MxUUByEVIRIFByQDAgUnJBMhPQGFCwJKDQGf/nZpATQt/tJwXf7ALQE/Tv6HAipTYlFBYGVG/q23OroBUv6UpzydAXIAAAQALf+dA7gDMAAUAB4AJAA7AAATITY3MxQHIRUhFgUHJCcGBSckNyEBMxEjNQYHJzY3JhcHJic3JQYHMxUGBSc2NyYnNxYXNjcjBgcnNjdFAYsFAUYFAYz+hWMBLR3+tVdc/q4eAUFN/ooBC0pKb4sdoXZ6SCdSVScCShIR8kv+riNzVTUnMi8vVibcN1EsjTMBAyQwKStGoTlDSLOwTkNEmQJv/jOPOy1CMD+cPzxJKDklKxs9410/Hi01HiUnMD9RPDE6UXMAAwA5/5sDqwMqAB4AOgBDAAABFSM1IwYFJzY3ITUhNQYjJyAlFwYHFSEVIRYXByYnASE/ATY3FwchFSMGBxYXByYnBgUnMjcmJzY3IyEGDwIWFzY3AhdIDF7+/SnoWf7lAXCdhRYBiAEDI4KsAXb+32nWIP1r/iUBHQ8PEgpGLQHptTdGcZwsh5qg/vco45eDfjIq6wFADxQQEGh1TTECLbOzfl1BRVVDUQdAMD0bDFZDXThEU4b+8hYWGhAVQUNlQCQ9OzozZQY/RykcPTcTGRUTFyI5VAAAAQA//6MDqQMFABkAABMhNTY3ITUhFQYHFSEVIREUKwEnFjMyNREhPwGlf3f9zgKUeJcBfP6EW6wTS1cv/lsBg45JZ0REcmNpRv7DXUYDMQEmAAAABABH/6ADogM7AA0AKQAvADUAAAEVIzUhFSM1ITY3FwYHBSEVBgcVIRUhFRQrAScWMxYzMj0BITUhNTY3IRIXByYnNwQXByYnNwOaR/1BRgI9QTFJNDn+DgIbZncBff6DVn4RIhYhHSb+awGVfkH+TDAsPjQ2QQEmK0AtM0ECZuCbm+BfchtrS5pAQTozRKlRQgEBKpBEUzckAU1OH1lDIDtXIF1IIAAABAA//5YDpgM7AA0AEwAZADQAAAEVIzUhFSM1ISYnNxYXAhcHJic3NhcHJic3BTMVBgchFSEGBxYXByYnBgUnJD8CITUhNjcDjEn9YkkBfBUWTBIU1VolZGgkpV8kZWIjAS9JBSYBRv6bDBTOsii/xHn+/icBFWwMC/5TAdIsBQLJyIWKzTsrDCtH/tA9OUgmNXA/OUglMwtvg2NDFhtKbD55SXZGPUp4EBBDYoQAAAACAEn/nQOxAzgAOgBIAAAABxYXFAcGKwEnMzY3NjUmJwYFJyQ3JicGByc2NyYnDwEGByc2NzY3ITUhFSMGBxYXNjcXBgcWFwcmJwIXIRUjNSEVIzUhJic3Am4aGQEgJYEiGDBZFhsCAqf+8CQBLKEOD43zJP+GDhoLC2epJZFzQyT+9wJX5TMfNR+MfTFSWkagM6VIgRIBeEf9QkcBjBATSQEYB0tRdi40PwIZIV0gEoNTPliFLB9vRT5DZxcdBQYwMDsgMR4UQ0MjEjY/J1IxNiO8cTuGzQHyO65ra64tJQ0AAAAEADH/mQO2AwoAEgAmACoAMAAAEyE1MxUzFSMVFCsBJxYzMj0BIQEVIRUUMyE2NzY3FwYHBgchIjURFyE1IRIXByYnNzECYUjc3FuAEEYxLP2fAu79wysCFx0MCgZEBw0VP/26WUgB9f4LlUk2OmwzAQc/P0bUVEcFJMICSe5GKwETEzwWQSAsA1QBTqxr/Y9SNkpYMQAAAAEAQv/LA6UC7QALAAABITUhFSERIRUhNSEB0/6kAwP+owGI/J0BkQKnRkb9akZGAAEAOf+eA68DOwAfAAATISYnNxYXIRUhFSERFCsBJxcyNREjESMRIxEjESE1ITkBmRcfThcaAZT+agFFW2QSXyr9SvFHATj+agK+PDUMMUxFg/6VW0UEKgEW/e0CE/54Ac2DAAADAEv/nAO7AywAGQAdACUAAAEhFSMVMxEUKwEnFzI1ESMRIxEjESMRMzUjBTMRIxMzEQIHJzYTAYoCMfXOTEQTQB6JRolFzvb+zUZGsEYEyDa5AwMIRnv+FlNGBCABmP2cAmT9/wJEexn98gKR/iT+1oovegELAAACAC7/mQOsAzoAHAAgAAATBgcnNjcXBgchFSEVIRUhFSEVIRUjNSE1MxEhNQMzNSP8P18wmDRLDhgCbf7GARf+6QFa/qZJ/jKlASnh4eECe2tHPHLDDjI5RqBF2Ebf30YBHaD+Q9gAAAEAKv+UA7EDQAAPAAAAFyEVIRUCByc2ExEhJic3AikcAWz9GwZhO1UDAXQXIE8DDEVF4P63xTWtASwBJTc0DgAEACr/nAOxAzkACQAPABUAJQAANyE2ExcCBzMVIRIXByYnNxYXByYnNzYXIRUhFQIHJzYTESEmJzffAaFdRUdBWtP9PqgxRCo9P/MrRSE2Px4YAW79GwZhO1UDAXMYHk8N1gE5E/7c2EYBh9IRw6kUXNgQw7QU1zhG3/61xTWtAS4BJTAqDgAAAwAm/6ADvAMuAAUALAAwAAATBgcnNjcEFzY3FwYHMzUzFTMVIxUzFSMVMxUhNSE1IzUzNSMGByc2NwcmJzcnMxEjgwMfOyAEARsaNAdBAwmPStray8v0/aQBHuXlnBs0NhENMhciNY9JSQJskowRgZAyYoSTCT85u7tG8Uf0Rkb0R/FuXDAhHg1dUxGp/HIAAQAt/5wDwwM0AEYAAAAXBzMVIRYXFhc2NxcGBxYXFjM2NxcGIyInJicGByc2NyYnJichFTMGBwYHIi8BFjM2NzY3IxUCByc2NxEhJjUzFBczJic3Ay80Lnr+zAocBw1KL0E9XxYXMh4cFkEkQz9IHBZojiiXZxcPIAz+qfUDFBNLFUAWOyghCQgDrQZwN18FAZ0DSgO7LUwuAv85LkW3bx0pdqMcyIYxIkgCmiTIWiUueUA8RII8PHnBpvxLQAMEQQMBKy2rHv70ozGJ9QFNS1BTSDY6LAAAAwAj/5kDxwMwAB4APABBAAAXJxYzMj0BBgcnNjc1IzUzNTMVMxUjFTY3FQYHERQjEzM1MxUzFSMVMxUGBxYXByYnBgcnNjcmJyM1MzUjFxYXNjdKDysgKUE5EkFLeXlGampDKTA8V+XwRvHxyjR6YpAmlmd1qCKgbX0qNd3wiyZuaDFZRAUt7xoTRxAc7EW3t0XPIRdGGxz+4VoC+Y6ORaNApXVMOzxCVV08PDVVeptEo+eCaWWGAAQAJP+kA7oDMQAeAC4AQQBHAAAXJxYzMj0BBgcnNjc1IzUzNTMVMxUjFTY3FQYHERQjATMVMxUjFTMVITUzNSM1MwMhNTMVMxUjERQrAScWMzI9ASEWFwcmJzdLDzYYJ0wvEkhFenpKZ2cwPDY2WAHQSsbG8P3d6cTE8AFmSImJWIcQWCYp/pqwMzkvVzhbRgQp8hoOSBEW4ka9vUbGFB5JGRf+4FkDjGtBeEREeEH+jEtLRf7xWkUEK/2ARilIWyMAAAAHACz/ngPCAzAAGwAgACQAKgAwADYAVgAAARUjFQYHFhcHJicGByc2NyYnIzUjNSE2NxcGBwUWFzY3JSE1ITYXByYnNwYXByYnNyUGIScgNwEnFjMyPQEGByc2NzUjNTM1MxUzFSMVNj8BFQYHERQjA6Y6M3Npkyqma3e2JqhuZiwpMAGFNihDKyz+0C1eYzX+gwG8/kT5GDwaKj6EFjwbKD4B3Nb+oxUBS9v8zA8rIho4LRE3P29vR11dExosKTBIAf++KnhZPipAMkpJMUMoPVd0Lr5XaRlkQ+xcSEddQmWwPBZERhBNOBZEQhFCKUEk/HtFBRn8Fg5HDhfwRbe3RdIJDRhJFhf+1UgAAwA5/5oDxwMyAAYAHwAwAAABBgcWFzY3MwIHFhcHJicGByc2NyYnBgcnNhMXBgchFQE2NxcGByc2NREzNSE1IREjAlQLDTBPVgNFBm5fgymFXmKkLatgTDEfISxtJ0YKEgE//QqEfwukuxUg4P7uAVjgAlggI62Dm9j+/bCESzxLhH9NO0uDe587KziYAQgMPktF/d4sPkJOPT4MJwF07UX+jAADACX/nQPGAzIAGAAfAF8AACQnBgcnNjcmJwYHJzYTFwYHIRUjBgcWFwcBDwEWFzY3JTM1MxUzFSMVMzY3FwYHMxUjBgczFQYHFTY3FQYHFRQrAScWMzI9AQYHJzY3NTY3IwYHJzY3IzUzNjcjNTM1IwMrR1ePKJZRQCIcJytqJkcLEgEBPgdZTHAs/vkKCiQ+QAb9QINFa2s6Tjs4NztKfSsnqjNFXVVOZFVkEj8bK29tCm15ODGbT0wtLC4XajEt+LKDCXSFWUFXinqONzI2mQEDDEZIReyrgV8/ArYcG5p8krt/WVlAZmN2HWhUQzMnNDw8MQ4XQxUOdFdBBS5XDQpFBQ5JKjRBKjMZH0ApMUNmAAACAC3/mAO+AzgAGAAdAAATISYnNxYXIRUjBgcWFwcmJwYHJzY3JicjMxYXNjdAAZ4pO0g2MgF+nEqTnvEn+KKr/yb9oa1EmuFAnIU/AohRRRpEbEjun4ZZPF2LnUo/RJKq6MqWkc8AAAYAJ/+fA8QDLQAFAAsAIgAuADQAOgAAEhcHJic3JQYHJzY3ATMRMxEzFSMVFhcHJicRIxEGByc2NyMBMxE3FwcRIxEFJyUCFwcmJzcSFwcmJzecFjgaITwBTQkmNyAJ/quzSZWVRlErQStJQ2EegTqsAs1Hbgx6R/7fCgEraUUxPGkwSkExPGYwAnxtEHFbEwV5YhJWe/7nAVv+pUYaP1k/Yjb+bAGXkmpLfokBoP3JEUgS/vMBAixILQGxSTFHTy7+yEoySlIuAAAAAAUAIv+YA8QDPQATAB0AQABGAEwAAAEGBxUhFSMRIxEjFQIHJzY3ETY3BTMmJzcWFzMVIQczJic3FhczNjcXBgczFSMVMxUjERQrAScWMzI1ESM1MzUjFwYHJzY3BBcHJic3A6KHngFHbEeUCGszWQezlPy/wRkbShgYvP49CnMWH0AeF24cGEEaGnHBsLBXNg8bEie6usqYJkc8RiEBTxY8Hig8Auc0CbZF/eoCFnf+/Zg2geQBrANATTIlDC02QqE7Nhc6Tj5MFkUvQXBC/ulVQwUoAQZCcPaVcyZrhWFKF2NWEAAAAAABAEH/mQOvAzUAJgAAEyEmJzcWFyEVIQcVIQIHBgciLwEWMzY3NjchBgcGByc2NzY3NjchQQGvIClEKSUBdv3xCQGfAxQedhtYFk40TBIIA/6iEzdJhDSgNDIIBQH+6QKdRzgZPlpFsgL+2VaABARBAwNpN9h3ZYFqMohycpBfMgAAAAADAIv/mQMtAwkAAwAHABkAAAEVIT0CIRUDBgcnNjcRIREUKwEnFjMyPQEBOQGr/lUEEmE3YgMCPVWBFFwiIwHStLRDr6/+xsN/MobQAej88lZHBCzHAAAACAAu/5kDkQMuABMAFwAbAB8AOQA/AEQASAAAEzMVMzUzFTMVIxEzFSE1MxEjNTMTMzUjEyMVMwczNSMSFzY3ESERFCsBJxYzMj0BIwYHJzY3ByYnNwcGByc2NwEVBzM9AiMVfUbDRjw8Qf4nST4+RsPDw8PDw8PD9ShiAwFMWGoUSh0nwhVjNwICNSdEOIk6SjlNNgHbAb69Ay5wbm5D/llERAGnQ/4WYgFFZKJj/kdGitQB1Pz2XEcEM8fFhDMCBCZMWyMbbU8pS2QBX6YMskKvrwACACr/ngO6AzMAFwAdAAATITUzFSEVIRYXByQDIxEjESMCBSc2EyEkFwcmJzdMAYRIAYX+w2P3MP75ZwRIBG/+9Cf0av7EAscwOTVUOAJE7+9F+sI94QEY/Z8CYf7Ix0CuARHVRSlQWCMAAAACACf/mAPCAzAAFgA4AAATMzUzFTMVIxUWFwcmJxEjEQYHJzY3IwA3NjURIREUOwEyNzY3NjUXFAcGBwYrASI1ESMRFAcGByc6lUh8fEhFKjkqSDNWH3QxkgFrGRQBQxciEgoMAwdCBwYYGS8pYLIbGUc7Am3Dw0UzS1U+Xzr98QH8kn5Ok7r+BGpemAF7/Q0cBgkUI4oVYUM1FRFZArb+yaVvc201AAACACb/oAOvAzEAFwAuAAABNSE1MxUzFSMRFCsBJxYzMjURAgcnNhMlMzUzFTMVIxUWFwcmJxEjEQYHJzY3IwHVAR9Ic3NbahBEHSxp3yboZv1moUiFhU9NKkIwSDVfIXw2ngImRcbGRf3lY0YEMwHP/um6QbIBGEfDw0U7Slk+XTn9/gIMoIVSk7wAAAUAIf+gA7oDMAAFAA8AJgAqAC4AAAAXByYnNwUGBzMVITUhNjcBIxEGByc2NyM1MzUzFTMVIxUWFwcmJxMhFSETIRUhAj4uQCw2PwFqLDOI/hEBHDkq/edJNVUfdTCOkkl4eElLKzcycAI3/clKAbD+UALTXiBdUx8NiF5GRnWK/HACBJh7TpC5RcPDRThKXz9WQv54RgFbRgAAAAQANP+ZA7gDMQAjACkALwA1AAABMxUhFSEVIRUhBgc2NyYnNxYXByYnBgUnPwE2EyM1ITUjNTMkFwcmJzcSFwcmJzcTBgcnNjcCRkoBAf7/ARz+uGBIv7IlM0VaREQVFdz+8REIB0dy7QEd7e3+mkUzOmwzQz40NmIztUFPSFo9AzGhRrtG5GAUJk5VF5erHjYwLBo/BAM+AQxGu0ZITDRGUzH+yEwzR1Ux/tC/pR+xrgAAAAkAJ/+WA8IDNAAFACAAJgAsADIAOABAAEcAWAAAABcHJic3JzMVBgczFSMWFwcmJwYHBgcnNjc2NyM1MzY3AQYHJzY3BBcHJic3FhcHJic3BBcHJic3AAcWFzY3Jic2NyMGBxYXNxUCBSc2NyYnBgcnNjcXDwEDcyQ4JD81pkcBBdbCLbUnqjwRF0GSJ4s3GQ2BiwUB/mgkOUc7KQEAD0cLFEP1HUgZJkUBCj1APEc8/Z4iU0EhG01JyhSeEBFLSHVZ/rMieFhCTxsVLKc7RgkJAt05Jz5JIQpzMy1GuWs+brM8LndBPkFqOUFGLzH90o5qGWKPhW4ReW4OYoQRimkPXoAhh28dAWUpIigqLSoXLUolHxgkwDv+kZw/MlYsIRwTOZTXCR0dAAAAAAIAJf+cA5kDMgAmAEUAAAEGByEQBwYjIi8BFjMyNzYTIwYHBgcnNjc2NyMCByc2EyMGByc2NwEGByc2NxcGBzM1MxUzFSMVNjcVBgcRIxEGByc2NxECOhEUAYQPEJIVNhFAElwNCgNEITpGhyx8QDodTjyoL5o5RSgzNWAq/pYSFj84CD8DCUZGVFQpJxo2RlVIEVpUAyBTPf4LdYQDQQRgUAG4/JCeezlwjIvl/rqVNoMBIl8/LXjh/uhRQRKdsQVDQNDQRvATFUQOGv62ASwhF0YVIQEOAAYAJv+6A7oDCgATABcAGwAfACMAOgAAASERIxUzFSMVMxUhNTM1IzUzNSMlNSMVIzUjFTUzNSMzFTM1ATM1IzUhFSMVMxUjFTY3FQYHJzY3NSMBqQHhzNzc/P3A/N7ezQGch0iIiIjQh/z8a3kBO3htbTw8ia0SP0drAwr+MXpDf0VFf0N6QIiIiIjGiYmJ/vXsRETsRuAVG0U8LEUNE/gAAAAAAQAq/74DswMpAB4AABMGByc2NxcGBzM1MxUhFSEVIRUhFSEVITUhNSE1ITXxPF0ujjFJEBfWSQFJ/rcBNf7LAYn8jwGf/toBJgIxiF07lPoNTUSyskbnRv5ISP5G5wAABQAq/6UDdgMJAAMAGQAdACEAJQAAExUzNQUGByc2NxEhERQrAScWMzI9ASERIxElNSEVJTUhFScjFTPV/v7+EV83YAMC6WlzE1AfN/7wSgFa/vABEP7wSv7+AdSvr/HBfTGGzQHg/RNpRwQ/rv7bASVCr6/yrKysrAAAAAUAgf+rA74DLwADAB8AIwAnACsAABMVMzUVIxUjESE1MxUhESEVFDsBMjc2NxcGBwYjISI1NzM1Izc1IxUnIxUzyfb2SAE+SQFE/rxC5CQTDwZECRocRf7/ekn9/f39Sfb2AY2WltlVAkeJif4Oe0oREmUXgRoag8mWQpOTk5MABQBS/6gDmQMzAA0AEQAVABsAMgAAAREjNSMVIxEzNjcXBgcDMzUjNTM1IwAXByYnNyUQBwYjIi8BFjMyNzYTIQYHJzY3FwYHAZRGtkZdHQ5IExxbtra2tgISPDk9aDoBVw8PfQZgET8uRgwJA/7tKzkraCdHFBgChP1WOVYCx1JcCl9F/dLbQc7+5WMoY3sk5/4NdXMEQgNNVAGxYEs5kdcJYUYAAAAABACX/50DUQL6AAcACwAPABMAAAERIzUhFSMREyE1ITUhNSE1ITUhA1FJ/dhJSQIo/dgCKP3YAij92AL6/KNBQQNd/SqtRK9ErAAAAAYAMf+dA7kDMwADABoAHgAiACYALAAAATMRIwcGByckNyYnNxYXByYnBgchESM1IRUjNyE1ITUhNSE1ITUhEwYHJzY3AcRHR+RZOB4BzvQ6OTCabzImO5TfAY5G/itGRgHV/isB1f4rAdX+KzJahCqAVQMz/wC4EwpBS6cuKS5rbzMrMmlJ/estL29WOlA6UQFWb1Q2UWkABAAi/5sDwAL/AAMAGQArAC8AAAEjETMVIwIHJzYTIzUzESM1IRUjETMVIxEjATUhFSMGBzMRIxUjEQYHJzY3EzMRIwMEnJydDKc1mQltblgB21x3d0X9QAFanhUpuaxCHyYmdCYTaWkCu/7rRf7LkTCBARVFARVERP7rRf49AxdGRndr/jtPAaI5Mz6n2/2YAUYAAAQAIf+YA5sDLQAPABMAFwA1AAABMxUzESM1IxEjESMVIxEzAzMRIxMzESMlNjcXBgcVMxUjFRYXByYnESMRBgcnNjcjNTM1BgcCi0nHRoFJfUXCfX19xoGB/WO8ihlEPoGBSE8oQC9GN1gedDKTmjdSAy2w/jsz/rEBTzMBxf6xAQv+9QELrRIyQxUOpUUyRFs8YDz+LwHMlHNLhqxFlwkKAAAABAAj/58DwQMtAAsAEQAXADUAACUHESMRBSclETMRNwAXByYnNxIXByYnNyUzNQYHJzY3FwYHFTMVIxUWFwcmJxEjEQYHJzY3IwPBZUn+yAoBQkla/uZJMUN0M1RHM0F4M/4tpEpLDsCaGTxMiopXVSlTMEg9ZB9+OpzNEP7iARIzRjQCNf3XDwGaTjJLUDH+xk0zSlkxEJQLB0QPLkQRDqBFMUZSPWEv/jcB05V2UICnAAAABAAj/6ADuQMiAAMABwAbADkAAAEhESElNSEVByEVIxUzFSMVMxUhNTM1IzUzNSMlMzUGByc2NxcGBxUzFSMVFhcHJicRIxEGByc2NyMB1wGf/mEBWf7ubgHx2ru78v3P9ba2zf6Kkk02DryQGT1SgYFHSCg4L0U0Vh9yMIsDDP7fQaCgnkKAQYdERIdBgNORCwZEEi1EEhCdRTVFVjxXPv49AcOSd1CGqgAAAAQAO/+eA68DNQAFAAsAGQA+AAAAFwcmJzcHBgcnNjc2FyEVIzUhFSM1ISYnNwMzFQchERQ7ATI3NjUXBgcGKwEiNREjBgcGByc2NzY3IzUhNjcC9YAniZQknXy5H7N2fhIBckj9WkgBeBMXT21JAwECHkolCQZCCQgQVVVaxRc4Vb8nuksoEvsBBgICAhNuPHhJNi52TEFDcKY4yYaFyDAnDP6MSTf+xBwnPEMTgBg/VAEEbkprPD47XjtNRBMkAAQAM/+cA64DPgAiACgAOQBKAAATITUzFSEVIRUhFSMVMxUjFRQrAScWMzI9ASE1ITUhNSE1IRIXByYnNwMGByc2NxcGBzMVIxYXByYnJQYHJzY3FwYHIRUjFhcHJieoAShIAST+3AGWv5SUWZEPUDcq/c0CM/2SAZf+2MMtODBWOk4mLz5bKUUOC/+vKRpBJSgBdxwkPEogRAwMASK1KRg+JSoCBz4+QF4/YUGZU0QFJYhBYT9e/lQ8J0BQJAJASDonbX8PKxo9NC0YRjMCODAlYnEOJCE9NC4YQzcAAAAAAwA6/58DqgMlADQAOgBAAAAFJxYzMj0BBgcnNjc2NwYHJzY3NjcGIycgJRcGBwYHNjc2NxcGBzY3Jic3FhcHJicGBxEUIyQXByYnNwUGByc2NwFfEDwrJa2MFB4he5lpsA8XGkxlf5AVAa8BHSOg0Ftiaok6UUL9vf2mMB42fEE6GShajlUBdWYzXJ4w/rZlly2TY2FHBSX1CwZAAwo2cggLQAMKJm4GPz89JRBqQAUMLkkh32MOFD4jIZFpKSw7Cgz+9FW3bzNqdy40elw6V3IAAAAEAC//vgO9AzIACwAbACEAPgAAJSM1IRUjFTMVITUzAgcnJDchNSEVBgcWFwcmJwEGByc2NwMGBzY/AjY3FwYHNjcVBgcnNjc2NwYHJzY3NjcCc8QB08Xz/dLxJ6ctARht/qoBrCdFgVU1ToL+wIyzCruOWkxLR0QODxIMQoheZ2qYkRMSDj1SakASEQ1PO+FGRt1GRgGXQTtpwUU5WUpSTTVPV/48Ry5FK0kCncJwBgwbHCEYGfpvFiZCNRRCBw0+jA8IQggThrAAAAAFADL/ngO+AzoABQAgAD4ASABaAAAlBgcnNjcDBgc/ATY3FwYHNjcVBgcnNjc2NwYHJzY3NjcTPwE2NyM1MyYnNxYXMxUhBgc2NyYnNxYXByYnBgcXBgcGByc2NzY3ASI1ETMRFBczNjc2NRcUBwYHAX6Oqgm5iFtLTkJCGxxBhFdraJmVFBEQPFBoPBMRD089sggGMlGr/xQZSRcV4f7OQTSTgyMkN2k1OBQatOGXBygsZS5ZJSQGAR5aRRdBIQYHPhQUQTtHKUgpRAKXwHIKDDM4GflsGy5DQBlCCA4/jBMJQgcVhbT+iQMCJZxFNzAMNT5FgT0NFTMuIoZfJyYnHREpvFpdQjk5TE6p/ltXAVf+tR8DAyExURaNIh8DAAAFADT/nAO2AzQAFgAeACIAJgA0AAATISYnNxYXIRUjFhcHJicGBSc2NzY3KQEGByQ3Jic3ARUhPQIhHQIjESERFCsBJxcyPQE0AZ8TF1MSFgGS64hKPCMo5P5eEBIRRFb+9wFjWEgBFNMnJzL+UAHH/jlIAlhgZhNhLwLYKiUNIzlEbFMqKiYTED8DBypJUiwJESUfIP47T08+VVXLpgIE/ltdRgMpOAAGADH/mgOxAzEAAwAHABUALQBFAFkAADcVMz0CIx0CIxEhERQrAScXMj0BEyI1ETMVNjcXBgcVFDsBMjc2NxcGBwYjAxQ7ATI3NjcXBgcGKwEiNREzFTY3FwYHAhcHJicGByc2NzY3FwYHNjcmJzex6+tFAXZUUxJNJv1dR6JsF4ShKoMeDA4GQwocGTS5J4MeDA8GQgocGTWkWkeXdReCoZEvOxAUm+QUEg9fO0RGUoWGIiI4811dPGBg1r8CNP4rWEQDJlEBJmQBToUZHD8hF3gyDxNcFXccF/6RJA8TXhV5HBdWAVV7DyRAJg8B01UqISEZDDwFDmJ2FYdLCBE0MCMAAAQApP+dA0QDNAANABEAFQAZAAABESM1IRUjETM2NxcGBwMhNSE1ITUhNSE1IQNESf3ySdYjDkkRHtgCDv3yAg798gIO/fICuvzjPj4DHUA6Cjw0/WajQplClQAABAAw/50DtgM2ABMAOgBPAFUAABMzNTMVITUzFTMVIxUjNSEVIzUjBQYHFhUUBwYjIi8BFjMyNzY3NCcGByc2NyYnBgcnNjcmJzcWFzY3FzM2NTMUBzMVIxYXByYnBgcnNjcjJBcHJic3POpKAQhK6upK/vhK6gFVODNKJxw4LzAUNC4lEBECBk9yJ4lREBZDWSZaQi4/NUArMCZMzgNIAubgPbswrTk8pzDDG8oBxio2LEc2AuRSUlJSQUlJSUmGSC19tYQzHgJCBRkfYzAqUDs7P1s3JzAsPCU0NyQtKDUvMq9sOyeARPWDNoPLy4c0pNq6OCY/RCEAAAAAAQAo/54DsQMvADYAABMhNTMVIRUhFSEVIRUhFSEWFzY3FwYHFhcHJAMjBgcVNjcXBgcnNj0BBgcnNjchNSE1ITUhNSFoAWdKAWf+mQEt/tMBiv6UKTpVTTJOXmOMJv7ofiM0U2Z5Dp2fGCBTYybvb/6/AYr+0wEt/pkC2lVVQl9CX0FgSydSME4paTU/cQFFSz/VHjFDPyc+DBidMyc+VIJBX0JfAAAAAwAt/5oDwAMyAAsAEQAeAAABIREzESEVIREjESECFwcmJzcDMxE2NxcGByc2NREjAWYBD0oBAf7/Sv7xXUQxPm4yddhGMBJabhwXkwIFAS3+00j93QIjAR1NMklSMf7Y/kM7LUpXSUESGAGPAAAAAAQAK/+tA8oDMQAKACIAKAA1AAABFhcHJicGByc2NwMiNREzFTY3FwYHFRQ7ATI3NjcXBgcGIwAXByYnNwMzETY3FwYHJzY1ESMCo1bRJ8NdcKsnumcqZUiUiSmumDO2LxMbCUMMJyFI/fNEMT5uMnLTM0ASWW4cGI0DMb2MPYO/1nU9gtD8fG4BsbI4ZTpyOKk5FhuBFqAiHgMtTTJJUjH+2P5EKj9IWU1AFBgBkAAABwAq/58DxgMmABsAHwAjACcAKwAxAD4AAAEhESMVIRUjFhcHJicjESMRIwYHJzY3IzUhNSMlNSMVIzUjFTUzNSMzFTM1JBcHJic3AzMRNjcXBgcnNjURIwGOAe3WAQjGQ5wupkQJQwlJqiWaRsUBBtQBq5RDkpKS1ZT9yUcxPGwzedEnNBNQXRwViwMK/oJdQ4NpOYSh/rMBTbN0PV2NQ109Y2NjY55kZGQTTzFHUDH+2f5UIzRKUENAEhgBgwAABQA6/7YDrgMqACYAKgAuADIANgAAEyE1ITUhNQYHJyQlFwYHFSEVIRUhESEVIRUhFSEVITUhNSE1ITUhJTUjFSM1IxU1MzUjIRUzNZoBNf55AYeYpAwBhwEqFoO0AYn+dwE2/soBZv6aAZf8jAGV/p0BY/7LAm3wSPDw8AE48AILQD5ICQY7DCE8DgtMPkD+tEU6Sz8/SzpFOlFRUVGJT09PAAACAC7/tgO5AykAKQAuAAATBgcnJDczFgUHJicVIxUhFSERMzY3FwYHMxUhNTMmJzcWFzMRITUhNSMlJicGB+5MSioBBqQ6jwEYKUtJ4wFk/pxvMydCJi3P/KnPHjJBLShz/psBZeMB/ZJkcooCEy4iPHW1snU8Hys3kEP+9WJwF2lSRUVcVxhRegELQ5BBVnV2VQAABABO/5sDwgM2AA0AEQAzAEwAAAEVIzUhFSM1MyYnNxYXByEVIQchFSMRFDsBMjc2NxcGBwYHIyInJjURIwYHBgcnNjc2NyMBIRUGBxYXBgcGBycWMzY3NjU0JzY3IxEjA6NF/npG4xETShMQ4gF+/oJKAhymHSoSChAFQgkWFTBPJhQVXQwoM3UtZy0mCYv+vAEhKDhVBgIcKGUYIAgxExFcPSOURgLXy4eHyy8kDC0yvkFxRP7xKAoNdhaEHBYCGRolASCgSlo/OzdHQ4cB5jiGlYZoRB4jAkkEAw4PImR/moH81QAAAAQALP+VA8cDBgAOACAAJgAyAAATIzUhFSMRNjcVBgcnNjcTIRUjBgczESMRIREjETM2NyMAFwcmJzcDMxUGBwYHJzY3NjeudwE1dEkyhK4VTDbkAirvCQzURv6/RbAOCPQB41I0S4EwekgGUFCzJ69FQAQCiUZG/jwZFUg3L0cTDwJYRjY1/ioBlP5sAdYvPP1hWDRXYy4BVqyjYFkzPi9OUoIABwAi/5sDwAMKACAAJAAoACwAPgBEAFAAAAQ3ByMgJyYnBgcnNjcXBgcWFzUjNSEVIxUzFSMVFhcWIQEhESElNSMVNTM1IyUhFSMGBzMRIxEjESMRMzY3IwAXByYnNwMzFQYHBgcnNjc2NwNcZAzF/plaTj8bMDRQCEADBycvxgGooomJCQZXATP9jgFV/qsBD8nJyQFDAbC0BQilQ/JAjwgFuwF9PzM6XzBpQgM0MnQnaywoAxAESCchWGBRLoirCDYpPSPzQkJqQG0DAxsDGv6jPFVVjVg7Qiwk/mwBVv6qAZQiLv3RSDJITC4BKo2NV04uPidESG8AAAAABgA6/5wDrQM3AAkAFwAbAB8AIwAnAAAAFyEVITUhJic3EycXMjURIREjESERFCMBIRUhJTUhFRchFSElNSEVAhMOAYz8jQGaDBZPuBFQPv18RQMObv3QAi/90QHp/l0NAYn+dwFJ/vgDCi1CQiIqDvxoQQMwARD+fwHA/qJfArvCOFJS88g2XFwAAAABAAAAATMzEUvW918PPPUAAwPoAAAAANGAbcIAAAAA0rGijgAY/5EDygNAAAAABAACAAAAAAAAAAEAAAQk/qwAAAPoAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAALA+gAAAJYAG8CWAA+AlgAMwJYABgCWAAzAlgAMwJYAEICWAAqAlgAMwPoADIAOgAzADkAMABpADkAOgAjACEAJAAnACMAIAAkACoANAAsACQAIwAzACcANgA3ADgALAApADIAHwBsAEQALQBZACMANQA3AGAAKQAtADkAPwBHAD8ASQAxAEIAOQBLAC4AKgAqACYALQAjACQALAA5ACUALQAnACIAQQCLAC4AKgAnACYAIQA0ACcAJQAmACoAKgCBAFIAlwAxACIAIQAjACMAOwAzADoALwAyADQAMQCkADAAKAAtACsAKgA6AC4ATgAsACIAOgAAAAYAHgBUAJ4AvAD6AUIBVgG2Af4CDAI8AoQCtAL+AywDWgOAA6ID6AQsBG4EqATwBWoFugYABkoGkAbiBzoHfgfOCCAIkAjSCSIJVgmKCbwJ+go+CnoKtAsKCygLegumDAoMdgygDPQNTA2+DgoOIg5UDpAOxA7kDygPcA/cEDgQmhEgEXIR/BIwEpYTChNME3gT4hQYFGwUshUAFVoV8BZcFq4W3hccF1wXsBfWGCIYbBi+GRgZbhnOGj4aqBsOG54b8hx0HKIdIB10HaweBB5kHrYfAB90H8YgRCCKAAEAAABvAGAACQAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAQAMYAAQAAAAAAAAAhAAAAAQAAAAAAAQAMACEAAQAAAAAAAgAHAC0AAQAAAAAAAwAcADQAAQAAAAAABAAPAFAAAQAAAAAABQAcAF8AAQAAAAAABgAKAHsAAQAAAAAABwANAIUAAwABBAkAAABCAJIAAwABBAkAAQAYANQAAwABBAkAAgAOAOwAAwABBAkAAwA4APoAAwABBAkABAAeATIAAwABBAkABQA4AVAAAwABBAkABgAUAYgAAwABBAkABwAaAZxDb3B5cmlnaHQoYykgUVpEIENvcnBvcmF0aW9uLjIwMjFRWkQtUElOR0ZBTkdSZWd1bGFyUVpEIFBpbmdGYW5nIFNDOlZlcnNpb24gMS4yMFFaRCBQaW5nRmFuZyBTQ1ZlcnNpb24gMS4yMCBKYW51YXJ5IDUsIDIwMTZQaW5nRmFuZ1NDQnkgUVpEIENYWSBGRQBDAG8AcAB5AHIAaQBnAGgAdAAoAGMAKQAgAFEAWgBEACAAQwBvAHIAcABvAHIAYQB0AGkAbwBuAC4AMgAwADIAMQBRAFoARAAtAFAASQBOAEcARgBBAE4ARwBSAGUAZwB1AGwAYQByAFEAWgBEACAAUABpAG4AZwBGAGEAbgBnACAAUwBDADoAVgBlAHIAcwBpAG8AbgAgADEALgAyADAAUQBaAEQAIABQAGkAbgBnAEYAYQBuAGcAIABTAEMAVgBlAHIAcwBpAG8AbgAgADEALgAyADAAIABKAGEAbgB1AGEAcgB5ACAANQAsACAAMgAwADEANgBQAGkAbgBnAEYAYQBuAGcAUwBDAEIAeQAgAFEAWgBEACAAQwBYAFkAIABGAEUAAwAAAAAAAP+cAEAAAAAAAAAAAAAAAAAAAAAAAAAAbw== '
 22 |         # >>> font = FontDecrypt(str_base64=str_base64_)
 23 |         # >>> font.decrypt('𤵼𥙌腐进𤵝防护𥪵𤹩腐蚀𧒄𥉥𥪵𥌙测试𢺔𦰤𤵢𤵩𥤣𥎧𤹩𥤣𥎧')
 24 |         # 2022-01-20 17:01:47.932 | SUCCESS  | __main__:decrypt:89 - Out Decrypted Strings：中国腐进与防护学会腐蚀电化学及测试方法专业委员会委员
 25 |         # >>>
 26 |         # >>> # 字体链接
 27 |         # >>> font_url_ = 'https://ss.cods.org.cn/css/woff/791831.woff'
 28 |         # >>> font = FontDecrypt(font_url=font_url_)
 29 |         # >>> font.decrypt('┗┐┑┕┏┓┐┐┤┘┑┛┣┢┏┨┐┢')
 30 |         # 2022-01-20 17:01:48.519 | SUCCESS  | __main__:decrypt:89 - Out Decrypted Strings：92371522mA3dlk1q2k
 31 |         # >>>
 32 | 
 33 |     """
 34 | 
 35 |     def __init__(self, font_url=None, str_base64=None, fontsize=30, host="http://127.0.0.1:9898/"):
 36 |         """
 37 |         :param font_url: 仅支持woff和ttf格式字体
 38 |         :param str_base64: 字体base64
 39 |         :param fontsize:
 40 |         """
 41 |         self.img_mode = 'RGB'
 42 |         self.bg_color = (255, 255, 255)
 43 |         self.fg_color = (0, 0, 0)
 44 |         self.fontsize = fontsize
 45 |         self.font_url = font_url
 46 |         self.str_base64 = str_base64
 47 |         self.headers = {
 48 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
 49 |                           ' Chrome/93.0.4577.63 Safari/537.36'
 50 |         }
 51 |         self.encrypt_str = None
 52 |         self.font = None
 53 |         self.host = host
 54 |         self.font_process()
 55 | 
 56 |     def font_base64(self):
 57 |         url = 'https://www.motobit.com/util/base64-decoder-encoder.asp?charset=iso-8859-1&acharset='
 58 |         data = {
 59 |             'tobase64text': self.str_base64,
 60 |             'tobase64': 'Convert the source data',
 61 |             'tobase64file': '(binary)',
 62 |             'maxLineChars': '76',
 63 |             'todo': 'decode',
 64 |             'output': 'file',
 65 |             # 'filename1': '1.ttf'
 66 |         }
 67 |         r = httpx.post(url=url, headers=self.headers, data=data)
 68 |         return r.content
 69 | 
 70 |     def font_process(self):
 71 |         """
 72 |         TODO 修改FreeTypeFont对象源码支持字体bytes类型
 73 | 
 74 |         ***
 75 |         def load_from_bytes(f):
 76 |             if isinstance(f, bytes):    # 直接传入bytes
 77 |                 self.font_bytes = f
 78 |             else:
 79 |                 self.font_bytes = f.read()
 80 |             self.font = core.getfont(
 81 |                 "", size, index, encoding, self.font_bytes, layout_engine
 82 |             )
 83 | 
 84 |         if isinstance(font, bytes): # 直接传入bytes
 85 |             load_from_bytes(font)
 86 |         elif isPath(font):
 87 |             ***
 88 |         """
 89 |         if self.font_url:
 90 |             font_content = httpx.get(self.font_url, headers=self.headers).content
 91 |         else:
 92 |             font_content = self.font_base64()
 93 |         self.font = ImageFont.FreeTypeFont(font_content, self.fontsize)
 94 |         ttf = TTFont(BytesIO(font_content))
 95 |         # 混淆后字体列表
 96 |         self.encrypt_str = [chr(string) for string in ttf.getBestCmap().keys()]
 97 |         logger.debug(f'Encrypted Words List：{self.encrypt_str}')
 98 | 
 99 |     def draw_img(self, letters: str):
100 |         letter_width, letter_height = self.font.getsize(letters)
101 |         img_size = (letter_width + 10, letter_height + 10)
102 |         img_width, img_height = img_size
103 |         img = Image.new(self.img_mode, img_size, self.bg_color)
104 |         draw_brush = ImageDraw.Draw(img)
105 |         text_y = (img_height - letter_height + 1) / 2
106 |         text_y = int(text_y)
107 |         text_x = int((img_width - letter_width + 1) / 2)
108 |         draw_brush.text((text_x, text_y), letters, fill=self.fg_color, font=self.font)
109 |         return img
110 | 
111 |     def orc(self, word: str):
112 |         import base64
113 |         img = self.draw_img(word)
114 |         img_object = BytesIO()
115 |         img.save(img_object, 'JPEG')
116 |         img_bytes = img_object.getvalue()
117 |         api_url = f"{self.host}/ocr/b64"
118 |         res = httpx.post(api_url, data=base64.b64encode(img_bytes).decode())
119 |         return res.text
120 | 
121 |     def decrypt(self, word: str):
122 |         logger.debug(f'In Encrypted Strings：{word}')
123 |         string = ''
124 |         for letter in word:
125 |             if letter in self.encrypt_str:
126 |                 ocr_str = self.orc(letter)
127 |                 if ocr_str:
128 |                     string += self.orc(letter)
129 |                 else:
130 |                     string += letter
131 |             else:
132 |                 string += letter
133 |         logger.success(f'Out Decrypted Strings：{string}')
134 |         return string
135 | 


--------------------------------------------------------------------------------
/CrawlersTools/js_crawler/transfer_js.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/12 16:59
 4 | # @Author  : MuggleK
 5 | # @File    : transfer_js.py
 6 | 
 7 | import ctypes
 8 | import difflib
 9 | 
10 | 
11 | def int_overflow(val: int):
12 |     """
13 |     Process JavaScript nums Overflow
14 |     :param val:
15 |     :return:
16 |     """
17 |     maxint = 2147483647
18 |     if not -maxint - 1 <= val <= maxint:
19 |         val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
20 |     return val
21 | 
22 | 
23 | def right_shift(n, i):
24 |     """
25 |     Python Operator ">>"
26 |     :param n:
27 |     :param i:
28 |     :return:
29 |     """
30 |     if n < 0:
31 |         n = ctypes.c_uint32(n).value
32 |     if i < 0:
33 |         return -int_overflow(n << abs(i))
34 |     if i != 0:
35 |         return int_overflow(n >> i)
36 |     else:
37 |         return n
38 | 
39 | 
40 | def string_similar(s1: str, s2: str):
41 |     """
42 |     Compare Strings Similar Percentage
43 |     :param s1:
44 |     :param s2:
45 |     :return: :float: percentage
46 |     """
47 |     return difflib.SequenceMatcher(None, s1, s2).quick_ratio()
48 | 


--------------------------------------------------------------------------------
/CrawlersTools/logs/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/11 21:05
 4 | # @Author  : MuggleK
 5 | # @File    : logs.py
 6 | 
 7 | from .logger import init_logger
 8 | from .handlers import default_handler, logstash_handler
 9 | 
10 | from CrawlersTools.logs.log import Logging
11 | 


--------------------------------------------------------------------------------
/CrawlersTools/logs/formatters.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from logging import Formatter
 3 | from typing import Tuple, List, Optional, Union
 4 | 
 5 | EXTRA_IGNORE_FIELDS_DEFAULT = (
 6 |     "name",
 7 |     "msg",
 8 |     "args",
 9 |     "levelno",
10 |     "pathname",
11 |     "filename",
12 |     "module",
13 |     "exc_info",
14 |     "exc_text",
15 |     "stack_info",
16 |     "lineno",
17 |     "funcName",
18 |     "created",
19 |     "msecs",
20 |     "relativeCreated",
21 |     "thread",
22 |     "threadName",
23 |     "processName",
24 |     "process",
25 | )
26 | 
27 | 
28 | class JsonFormatter(Formatter):
29 |     """格式化日志到Json，并删除某些字段"""
30 | 
31 |     def __init__(
32 |         self,
33 |         extra_ignore_keys: Optional[Union[List[str], Tuple[str]]] = EXTRA_IGNORE_FIELDS_DEFAULT,
34 |         with_timestamp: bool = True,
35 |         **kwargs
36 |     ):
37 |         """
38 |         :param ignore_fields: 需要从 record[extra] 里忽略(排除)的字段
39 |         :param kwargs: 这里的 key:val 会添加到格式化后的消息中 eg: app=explore
40 |         """
41 |         super(JsonFormatter, self).__init__()
42 |         self.extra_ignore_keys = extra_ignore_keys
43 |         self.with_timestamp = with_timestamp
44 |         self.kwargs = kwargs
45 | 
46 |     def formatException(self, exc_info):
47 |         exc_text = super(JsonFormatter, self).formatException(exc_info)
48 |         return repr(exc_text)
49 | 
50 |     def format(self, record):
51 |         message = {
52 |             **self.kwargs,
53 |             **self.get_extra_info(record),
54 |         }
55 |         if self.with_timestamp:
56 |             message.update({"timestamp": self.format_timestamp(record.created)})
57 | 
58 |         if record.exc_info:
59 |             message["message"] = self.formatException(record.exc_info)
60 |             message["stack_trace"] = "".join(record.getMessage().split("\n"))
61 |         else:
62 |             message["message"] = record.getMessage()
63 | 
64 |         return json.dumps(message)
65 | 
66 |     @classmethod
67 |     def format_timestamp(cls, time):
68 |         return int(time * 1000)
69 | 
70 |     def get_extra_info(self, record):
71 |         return {
72 |             attr_name: record.__dict__[attr_name]
73 |             for attr_name in record.__dict__
74 |             if attr_name not in self.extra_ignore_keys
75 |         }
76 | 


--------------------------------------------------------------------------------
/CrawlersTools/logs/handlers.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import sys
 3 | 
 4 | from .formatters import JsonFormatter
 5 | 
 6 | DEFAULT_HANDLER_FORMAT = "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level:8}</level> | " \
 7 |                          "<level>{name}</level>:<level>{function}</level>:<level>{line}</level> - " \
 8 |                          "<level>{message}</level>"
 9 | LOGSTASH_HANDLER_FORMAT = "{message}"
10 | 
11 | 
12 | def default_handler(level="DEBUG", format=DEFAULT_HANDLER_FORMAT, **kwargs) -> dict:
13 |     return dict(sink=sys.stderr, level=level, format=format, **kwargs)
14 | 
15 | 
16 | class LogstashHandler(logging.StreamHandler):
17 |     def __init__(self, formatter=None):
18 |         super().__init__()
19 |         self.formatter = formatter
20 | 
21 | 
22 | def logstash_handler(
23 |         level="INFO",
24 |         format=LOGSTASH_HANDLER_FORMAT,
25 |         extra=None,
26 |         **kwargs
27 | ) -> dict:
28 |     if extra is None:
29 |         extra = {}
30 |     elif not isinstance(extra, dict):
31 |         raise TypeError(
32 |             "The 'extra' parameter should be a dict (or None), not: '%s'"
33 |             % type(extra).__name__
34 |         )
35 | 
36 |     return dict(
37 |         sink=LogstashHandler(JsonFormatter(**extra)),
38 |         level=level,
39 |         format=format,
40 |         **kwargs
41 |     )
42 | 


--------------------------------------------------------------------------------
/CrawlersTools/logs/log.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/6/21 17:08
 4 | # @Author  : MuggleK
 5 | # @File    : logs.py
 6 | 
 7 | import time
 8 | 
 9 | from loguru import logger
10 | 
11 | time_format = time.strftime("%Y_%m_%d")
12 | log_format = "{time:YYYY-MM-DD HH:mm:ss}|{level}| {name}:{function}:{line}| {message}"
13 | 
14 | 
15 | class Logging(object):
16 |     """
17 |     Usage::
18 | 
19 |         # >>>
20 |         # >>> logger = Logging('logs')
21 |         # >>> logger.info('Logging Example')
22 |         # 2022-01-20 17:27:32.194 | INFO     | __main__:info:149 - Logging Example
23 |         # >>>
24 |     """
25 | 
26 |     __instance = None
27 | 
28 |     def __new__(cls, log_path, *args, **kwargs):
29 |         if not cls.__instance:
30 |             cls.__instance = super(Logging, cls).__new__(cls, *args, **kwargs)
31 | 
32 |         return cls.__instance
33 | 
34 |     def __init__(self, log_path, expire_date="10 days"):
35 |         logger.add(f"{log_path}/log_{time_format}_info.log", encoding="utf-8", enqueue=True, retention="1 months", level="INFO", format=log_format)
36 |         logger.add(f"{log_path}/log_{time_format}_error.log", encoding="utf-8", enqueue=True, retention=expire_date, level="ERROR", format=log_format)
37 |         logger.add(f"{log_path}/log_{time_format}_debug.log", encoding="utf-8", enqueue=True, retention=expire_date, level="DEBUG", format=log_format)
38 | 
39 |     @staticmethod
40 |     def info(msg):
41 |         return logger.info(msg)
42 | 
43 |     @staticmethod
44 |     def debug(msg):
45 |         return logger.debug(msg)
46 | 
47 |     @staticmethod
48 |     def warning(msg):
49 |         return logger.warning(msg)
50 | 
51 |     @staticmethod
52 |     def error(msg):
53 |         return logger.error(msg)
54 | 
55 |     @staticmethod
56 |     def success(msg):
57 |         return logger.success(msg)
58 | 


--------------------------------------------------------------------------------
/CrawlersTools/logs/logger.py:
--------------------------------------------------------------------------------
 1 | """
 2 | - logging 日志拦截转发到 loguru
 3 | - 日志输出为json并精简&自定义字段
 4 | - 日志拦截与输出的 集成&单独 方法
 5 | """
 6 | import time
 7 | from typing import List, Optional, Dict
 8 | 
 9 | from loguru import logger
10 | 
11 | from .handlers import default_handler, DEFAULT_HANDLER_FORMAT
12 | 
13 | TIME_FORMAT = time.strftime("%Y_%m_%d")
14 | 
15 | 
16 | def init_logger(
17 |         handlers: Optional[List[Dict]] = None,
18 |         add_file_handler: bool = False,
19 |         log_path: str = "./",
20 |         file_handler_level: str = "DEBUG",
21 |         file_handler_format: str = DEFAULT_HANDLER_FORMAT,
22 |         **kwargs
23 | ):
24 |     """
25 |     一键配置 loguru ，所属程序本身的日志可直接 from loguru import logger ，即可正常处理
26 | 
27 |     :param handlers: 日志处理的 handlers ，参见 loguru.configure ，默认配置了 default_handler ，其他预置的可以从 .logger.handlers 导入
28 |     :param add_file_handler: 开启后，会添加一个默认的文件输出 handler
29 |     :param log_path: 日志文件的路径，默认当前目录
30 |     :param file_handler_level: 文件输出 handler 的日志级别，默认 DEBUG
31 |     :param file_handler_format: 文件输出 handler 的日志格式，默认 DEFAULT_HANDLER_FORMAT
32 |     :param kwargs: 其他要传递给 logger.configure 的参数
33 |     """
34 |     if handlers is None:
35 |         handlers = [default_handler()]
36 |     elif not isinstance(handlers, list):
37 |         raise TypeError(
38 |             "The 'handlers' parameter should be a list (or None), not: '%s'"
39 |             % type(handlers).__name__
40 |         )
41 |     
42 |     extra = kwargs.pop("extra", {})
43 |     if not isinstance(extra, dict):
44 |         raise TypeError(
45 |             "The 'extra' parameter should be a dict (or None), not: '%s'"
46 |             % type(extra).__name__
47 |         )
48 | 
49 |     logger.configure(handlers=handlers, extra=extra, **kwargs)
50 | 
51 |     if add_file_handler:
52 |         expire_date = kwargs.pop("expire_date", "1 days")
53 |         logger.add(
54 |             f"{log_path}/log_{TIME_FORMAT}_{file_handler_level.lower()}.log",
55 |             encoding="UTF-8", enqueue=True, retention=expire_date,
56 |             level=file_handler_level, format=file_handler_format
57 |         )
58 | 


--------------------------------------------------------------------------------
/CrawlersTools/pipelines/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/15 11:29
 4 | # @Author  : MuggleK
 5 | # @File    : __init__.py
 6 | 
 7 | from CrawlersTools.pipelines.mongo_pipeline import MongoPipeline
 8 | 
 9 | from CrawlersTools.pipelines.mysql_pipeline import MysqlPipeline
10 | 
11 | from CrawlersTools.pipelines.redis_pipeline import RedisPipeline
12 | 


--------------------------------------------------------------------------------
/CrawlersTools/pipelines/kafka_operate.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | # -*- coding: utf-8 -*-
 3 | # @Time    : 2022/9/5 15:32
 4 | # @Author  : xc
 5 | # @File    : kafka_operate
 6 | # @Software: PyCharm
 7 | 
 8 | 
 9 | import json
10 | from kafka import KafkaProducer
11 | from loguru import logger
12 | 
13 | 
14 | class KProducer:
15 |     def __init__(self, bootstrap_servers, topic):
16 |         """
17 |         kafka 生产者
18 |         :param bootstrap_servers: 地址
19 |         :param topic:  topic
20 |         """
21 |         self.producer = KafkaProducer(
22 |             bootstrap_servers=bootstrap_servers,
23 |             value_serializer=lambda m: json.dumps(m).encode('ascii'), )  # json 格式化发送的内容
24 |         self.topic = topic
25 | 
26 |     def sync_producer(self, data_li: list):
27 |         """
28 |         同步发送 数据
29 |         :param data_li:  发送数据
30 |         :return:
31 |         """
32 |         for data in data_li:
33 |             future = self.producer.send(self.topic, data)
34 |             record_metadata = future.get(timeout=10)  # 同步确认消费
35 |             partition = record_metadata.partition  # 数据所在的分区
36 |             offset = record_metadata.offset  # 数据所在分区的位置
37 |             logger.success('save success, partition: {}, offset: {}'.format(partition, offset))
38 | 


--------------------------------------------------------------------------------
/CrawlersTools/pipelines/mongo_pipeline.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/12 9:12
 4 | # @Author  : MuggleK
 5 | # @File    : mongo_pipeline.py
 6 | 
 7 | from pymongo import MongoClient
 8 | 
 9 | 
10 | class MongoPipeline:
11 |     """
12 |     A Mongo Pipeline to Create or Insert or Update or Delete Collection
13 | 
14 |     Usage:
15 | 
16 |     ```python
17 |     >>> mongo_client = MongoPipeline()
18 |     >>> record = mongo_client.find_one("test_collection", '{"company_name": "qzd"}')
19 |     ```
20 |     """
21 | 
22 |     collection = None
23 |     conn = None
24 | 
25 |     def __init__(self, host="127.0.0.1", port="27017", username="root", password="root", database="crawl_data"):
26 | 
27 |         self.server = '''mongodb://%s:%s@%s:%s/%s''' % (username, password, host, port, database)
28 |         self.client = MongoClient(host=self.server, readPreference="secondaryPreferred")
29 |         self.db = self.client.get_database(database)
30 | 
31 |     def close(self):
32 |         return self.client.close()
33 | 
34 |     def set_collection(self, name):
35 |         self.collection = self.db.get_collection(name)
36 | 
37 |     def find(self, collection_name, query=None, ref_query=None):
38 |         """
39 |         from query phrase to find docs
40 | 
41 |         :param collection_name:
42 |         :param query: query phrase
43 |         :param ref_query: reserve phrase
44 |         :return:
45 |         """
46 |         records = self.db.get_collection(collection_name).find(query, ref_query)
47 |         return records
48 | 
49 |     def find_one(self, collection_name, query=None, ref_query=None):
50 |         records = self.db.get_collection(collection_name).find_one(query, ref_query)
51 |         return records
52 | 
53 |     def update(self, collection_name, query, update, many=False):
54 |         if many:
55 |             self.db.get_collection(collection_name).update_many(query, update, upsert=True)
56 |             return
57 |         self.db.get_collection(collection_name).update_one(query, update, upsert=True)
58 | 
59 |     def aggregate(self, collection_name, query):
60 |         records = self.db.get_collection(collection_name).aggregate(query)
61 |         for record in records:
62 |             yield record
63 | 


--------------------------------------------------------------------------------
/CrawlersTools/pipelines/mysql_pipeline.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Project : CrawlersTools
  3 | # @Time    : 2022/8/12 9:12
  4 | # @Author  : MuggleK
  5 | # @File    : mysql_pipeline.py
  6 | 
  7 | import time
  8 | from typing import Tuple, Optional
  9 | 
 10 | import pymysql
 11 | from DBUtils.PooledDB import PooledDB
 12 | from loguru import logger
 13 | from pymysql import ProgrammingError
 14 | from pymysql.converters import escape_string
 15 | 
 16 | 
 17 | def cursor_handler(func):
 18 |     def wrapper(self, *args, **kwargs):
 19 |         sql_conn, cursor = self.ping()
 20 |         if not (sql_conn and cursor):
 21 |             logger.warning(f"Mysql Connection occur Error，args：{args}, kwargs: {kwargs}")
 22 |             return
 23 | 
 24 |         try:
 25 |             kwargs.update({'cursor': cursor})
 26 |             result = func(self, *args, **kwargs)
 27 |             sql_conn.commit()
 28 |             return result
 29 |         finally:
 30 |             cursor.close()
 31 |             sql_conn.close()
 32 | 
 33 |     return wrapper
 34 | 
 35 | 
 36 | class MysqlPipeline(object):
 37 |     """
 38 |     A Mysql Pipeline to Create or Insert or Update or Delete Table
 39 | 
 40 |     Usage::
 41 | 
 42 |         # >>>
 43 |         # >>> mysql_pool = MysqlPipeline(host='127.0.0.1', username='root', password='mysql', db='test')
 44 |         # >>> mysql_pool.insert(item, 'test_table')
 45 |         # >>>
 46 | 
 47 |     """
 48 | 
 49 |     table_columns_map = dict()  # 缓存每个table的结构，避免每次都要查询数据库
 50 | 
 51 |     def __init__(self, host: str = '127.0.0.1', username: str = 'root',
 52 |                  password: str = '', db: str = 'test', port: int = 3306,
 53 |                  drop_column: Optional[Tuple] = ('id', 'crawl_time'),
 54 |                  pool_num: int = 10
 55 |                  ):
 56 |         """
 57 |         :param host:
 58 |         :param username:
 59 |         :param password:
 60 |         :param db:
 61 |         :param port:
 62 |         :param drop_column: type:list 插入数据中不需要手动添加的字段,例如自增主键id,自增时间戳等
 63 |         :param pool_num:
 64 |         """
 65 |         self.host = host
 66 |         self.username = username
 67 |         self.password = password
 68 |         self.db = db
 69 |         self.port = port
 70 |         self.drop_column = drop_column
 71 |         self.pool_num = pool_num
 72 | 
 73 |         self.sql_pool = PooledDB(
 74 |             pymysql, self.pool_num, host=self.host,
 75 |             user=self.username, passwd=self.password, db=self.db,
 76 |             port=self.port, charset='utf8', use_unicode=True
 77 |         )
 78 | 
 79 |     def ping(self):
 80 |         """
 81 |         重写pymysql中的ping函数并新增重试机制,以保持conn和cursor
 82 | 
 83 |         :return:
 84 |         """
 85 |         for _ in range(5):
 86 |             try:
 87 |                 sql_conn = self.sql_pool.connection()
 88 |                 cursor = sql_conn.cursor()
 89 |                 return sql_conn, cursor
 90 |             except Exception as e:
 91 |                 logger.debug(f"Mysql Lost Connection for Host : {self.host} Retrying, Error: {e}")
 92 | 
 93 |             try:
 94 |                 self.sql_pool = PooledDB(
 95 |                     pymysql, self.pool_num, host=self.host, user=self.username,
 96 |                     passwd=self.password, db=self.db, port=self.port,
 97 |                     charset='utf8', use_unicode=True
 98 |                 )
 99 |                 sql_conn = self.sql_pool.connection()
100 |                 cursor = sql_conn.cursor()
101 |                 return sql_conn, cursor
102 |             except Exception as err:
103 |                 logger.debug(f"Waiting for 5s to Connect, Error: {err}")
104 |                 time.sleep(5)
105 |                 continue
106 | 
107 |         logger.error(f"Mysql Connects for Host : {self.host} Over Max Retries")
108 |         return None, None
109 | 
110 |     def add_columns_map(self, table_name):
111 |         sql = f"select column_name from information_schema.columns " \
112 |               f"where table_name='{table_name}' and table_schema='{self.db}'"
113 |         column_list = self.execute_sql(sql)
114 |         columns = [i[0] for i in column_list if i[0] not in self.drop_column]
115 |         self.table_columns_map[table_name] = columns
116 |         return columns
117 | 
118 |     @cursor_handler
119 |     def execute_sql(self, sql, mode='fetch', cursor=None):
120 |         if mode == 'fetch':
121 |             cursor.execute(sql)
122 |             result = cursor.fetchall()
123 |             return result
124 |         cursor.execute(sql)
125 | 
126 |     def insert(self, item, table_name):
127 | 
128 |         if not item:
129 |             logger.error("item is Empty")
130 |             return
131 | 
132 |         table_columns = self.table_columns_map.get(table_name) or self.add_columns_map(table_name)
133 |         if not table_columns:
134 |             raise ProgrammingError(f"Table '{self.db}.{table_name}' doesn't exist")
135 | 
136 |         # 格式化sql语句 处理"None" -> NULL
137 |         format_str = ','.join(["%s" for _ in table_columns])
138 |         insert_sql = 'INSERT IGNORE INTO %s (%s) VALUES (%s)' % (table_name, ','.join(table_columns), format_str)
139 |         item_values = [None if item.get(key) == "None" else item.get(key) for key in table_columns]
140 |         execute_data = tuple([escape_string('%r') % str(i) if i else "NULL" for i in item_values])
141 | 
142 |         self.execute_sql(insert_sql % execute_data)
143 |         logger.info(f"Insert Ignore Successfully：{table_name} -> {item}")
144 | 


--------------------------------------------------------------------------------
/CrawlersTools/pipelines/redis_pipeline.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/12 9:12
 4 | # @Author  : MuggleK
 5 | # @File    : redis_pipeline.py
 6 | 
 7 | import redis
 8 | 
 9 | 
10 | class RedisPipeline(object):
11 |     def __init__(self, name, namespace, **redis_kwargs):
12 |         self.__db = redis.Redis(**redis_kwargs)
13 |         self.key = '%s:%s' % (namespace, name)
14 | 
15 |     def qsize(self):
16 |         return self.__db.llen(self.key)
17 | 
18 |     def put(self, item):
19 |         self.__db.rpush(self.key, item)
20 | 
21 |     def get_wait(self, timeout=None):
22 |         item = self.__db.blpop(self.key, timeout=timeout)
23 |         return item
24 | 
25 |     def get_nowait(self):
26 |         item = self.__db.lpop(self.key)
27 |         return item
28 | 


--------------------------------------------------------------------------------
/CrawlersTools/preprocess/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/15 11:29
 4 | # @Author  : MuggleK
 5 | # @File    : __init__.py
 6 | 
 7 | from CrawlersTools.preprocess.bloom_filter import BloomFilter
 8 | 
 9 | from CrawlersTools.preprocess.time_process import TimeProcessor
10 | 


--------------------------------------------------------------------------------
/CrawlersTools/preprocess/bloom_filter.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/12 14:35
 4 | # @Author  : MuggleK
 5 | # @File    : bloom_filter.py
 6 | 
 7 | import hashlib
 8 | 
 9 | 
10 | def sha1(data):
11 |     """
12 |     BloomFilter fingerprint Function
13 |     """
14 |     hash_object = hashlib.sha1(data.encode('utf-8'))
15 |     hex_dig = hash_object.hexdigest()
16 |     return hex_dig
17 | 
18 | 
19 | class SimpleHash(object):
20 |     """
21 |     BloomFilter Hash Function
22 |     """
23 |     def __init__(self, cap, seed):
24 |         self.cap = cap
25 |         self.seed = seed
26 | 
27 |     def hash(self, value):
28 |         ret = 0
29 |         for i in range(len(value)):
30 |             ret += self.seed * ret + ord(value[i])
31 |         return (self.cap - 1) & ret
32 | 
33 | 
34 | class BloomFilter(object):
35 |     """
36 |     Usage::
37 | 
38 |       # >>> bf = BloomFilter(server, key, block_num=1)  # you can increase block_num if you are filtering too many urls
39 |       # ... if is_contains(fp):
40 |       # ...     print(f"{fp} 已存在")
41 |       # ... else:
42 |       # ...     bf.insert(fp)
43 |       # >>>
44 | 
45 |     """
46 |     def __init__(self, server, key, block_num=1, filter_level=0):
47 |         """
48 | 
49 |         :param server: Redis Server
50 |         :param key: Redis Key
51 |         :param block_num:
52 |         :param filter_level: Filter data Magnitude 0：total data less than 100W. 1: Exceed 100W
53 |         """
54 |         self.bit_size = 1 << 31 if filter_level else 1 << 29
55 |         self.seeds = [5, 7, 11, 13, 31] if filter_level else [5, 7, 11, 13, 31, 37, 61]
56 |         self.server = server
57 |         self.key = key
58 |         self.block_num = block_num
59 |         self.hash_func = []
60 |         for seed in self.seeds:
61 |             self.hash_func.append(SimpleHash(self.bit_size, seed))
62 | 
63 |     def is_contains(self, str_input) -> bool:
64 |         """
65 |         param str_input: source string
66 |         :return:
67 |         """
68 |         if not str_input:
69 |             return False
70 |         ret = True
71 | 
72 |         fp = sha1(str_input)
73 |         name = f"{self.key}{str(int(fp[0:2], 16) % self.block_num)}"
74 |         for f in self.hash_func:
75 |             loc = f.hash(str_input)
76 |             ret = ret & self.server.getbit(name, loc)
77 |         return bool(ret)
78 | 
79 |     def insert(self, str_input):
80 |         """
81 |         param str_input: source string
82 |         :return:
83 |         """
84 |         fp = sha1(str_input)
85 |         name = f"{self.key}{str(int(fp[0:2], 16) % self.block_num)}"
86 |         for f in self.hash_func:
87 |             loc = f.hash(str_input)
88 |             self.server.setbit(name, loc, 1)
89 | 


--------------------------------------------------------------------------------
/CrawlersTools/preprocess/time_process.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/12 14:48
 4 | # @Author  : MuggleK
 5 | # @File    : time_process.py
 6 | 
 7 | import re
 8 | from datetime import datetime, timedelta
 9 | 
10 | from sinan import Sinan
11 | 
12 | from CrawlersTools.projects.filters import empty_text
13 | 
14 | 
15 | class TimeProcessor:
16 | 
17 |     datetime_pattern = r"([0-9]{4}).*?([0-1]{0,1}[0-9]).*?([0-3]{0,1}[0-9])"
18 | 
19 |     def __init__(self):
20 |         self.fmt = "%Y-%m-%d"  # 暂时只处理年月日
21 | 
22 |     def format(self, string, struct=False):
23 |         string = empty_text(string)
24 |         try:
25 |             return self.process_timestamp(string, struct)
26 |         except ValueError:
27 |             # print(f"非时间戳格式：{string}")
28 |             pass
29 | 
30 |         date = Sinan(string).parse(display_status=False).get("datetime", [""])[0].split(' ')[0]  # 错误的时分秒
31 |         if not date:
32 |             re_res = re.search(self.datetime_pattern, string)
33 |             if re_res is not None:
34 |                 date = f"{re_res.group(1)}-{re_res.group(2)}-{re_res.group(3)}"
35 |             else:
36 |                 # 提取不出时间或者格式不满足 datetime_pattern的直接返回
37 |                 return
38 | 
39 |         if struct:
40 |             return datetime.strptime(date, self.fmt)
41 |         return date
42 | 
43 |     def process_timestamp(self, timestamp, struct):
44 |         timestamp = int(str(timestamp)[:10])
45 |         source_time = datetime(1970, 1, 1)
46 |         struct_time = (
47 |             datetime.fromtimestamp(timestamp) if timestamp >= 0 else source_time + timedelta(seconds=timestamp)
48 |         )
49 |         if struct:
50 |             return struct_time
51 |         return struct_time.strftime(self.fmt)
52 | 
53 |     def compare_date(self, time_min, time_max) -> bool:
54 |         if not (time_min and time_max):
55 |             return False
56 | 
57 |         time_min_format = time_min if isinstance(time_min, datetime) else self.format(time_min, struct=True)
58 |         time_max_format = time_max if isinstance(time_max, datetime) else self.format(time_max, struct=True)
59 |         if not (time_min_format and time_max_format):
60 |             return False
61 | 
62 |         if time_min_format.date() <= time_max_format.date():
63 |             return True
64 |         return False
65 | 


--------------------------------------------------------------------------------
/CrawlersTools/projects/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time    : 2022/8/15 11:29
4 | # @Author  : MuggleK
5 | # @File    : __init__.py
6 | 


--------------------------------------------------------------------------------
/CrawlersTools/projects/filters.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/12 14:48
 4 | # @Author  : MuggleK
 5 | # @File    : filters.py
 6 | 
 7 | import re
 8 | from functools import reduce
 9 | from urllib.parse import urlparse
10 | 
11 | from loguru import logger
12 | 
13 | 
14 | def empty_text(lis):
15 |     word = ""
16 |     for i in lis:
17 |         word += i.strip()
18 |     return word
19 | 
20 | 
21 | def filter_title(title: str, remove_list: list):
22 |     """
23 | 
24 |     :param title: 文章标题
25 |     :param remove_list: 过滤关键词列表
26 |     :return:
27 |     """
28 |     if not title:
29 |         return False
30 |     for r in remove_list:
31 |         and_lists = r.split("and")
32 |         if len(and_lists) == 1:
33 |             if and_lists[0] in title:
34 |                 logger.debug(f"过滤标题: {title}  过滤词: {r}")
35 |                 return True
36 |         else:
37 |             total = [1 for a in and_lists if a in title]
38 |             result = reduce(lambda x, y: x + y, total)
39 |             if len(and_lists) != result:
40 |                 continue
41 |             return True
42 | 
43 | 
44 | def filter_text(text, removes: list):
45 |     """
46 |     :param text: 正文字段
47 |     :param removes: 需要去掉的特殊字段：扫一扫，【关闭】，【打印】
48 |     :return:
49 |     """
50 |     if removes:
51 |         for remove in removes:
52 |             text = text.replace(remove, '')
53 |     return text
54 | 
55 | 
56 | def filter_allowed_url(url, main_url, other_domains):
57 |     other_domains = other_domains if other_domains else []
58 |     main_url = main_url[0] if isinstance(main_url, list) else main_url
59 |     allowed_domains = [urlparse(main_url).netloc] + other_domains if urlparse(main_url).netloc else other_domains + [main_url]
60 |     for domain in allowed_domains:
61 |         if (not domain) or re.search(domain, url):
62 |             return True
63 | 


--------------------------------------------------------------------------------
/CrawlersTools/projects/upload_oss.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Project : CrawlersTools
  3 | # @Time    : 2022/8/12 20:20
  4 | # @Author  : MuggleK
  5 | # @File    : upload_oss.py
  6 | 
  7 | import base64
  8 | import hashlib
  9 | import re
 10 | 
 11 | import httpx
 12 | from loguru import logger
 13 | 
 14 | from CrawlersTools import base_requests
 15 | from CrawlersTools.requests.proxy import get_proxies
 16 | 
 17 | 
 18 | class UploadOss(object):
 19 |     """
 20 |     A Class for QZD Upload file to oss
 21 | 
 22 |     Usage:
 23 | 
 24 |     ```python
 25 |     >>> upload = UploadOss('(pdf|txt|doc|docx|xlsx|xls|csv|wps|hlp|rtf|ppt|pptx|zip|rar|jar|gz|jpg|jpeg|png|tif|gif|bmp)', "https://***")
 26 |     >>> oss_url, oss_uuid = upload.download("http://xxgk.haiyan.gov.cn/gov/jcms_files/jcms1/web7/site/zfxxgk/download/downfile.jsp?classid=0&filename=140901165845693.xls", '附件')
 27 |     ```
 28 |     """
 29 | 
 30 |     def __init__(self, oss_url, suffix_reg, oss_code=None, client_code=None):
 31 |         self.suffix_reg = suffix_reg
 32 |         self.oss_url = oss_url
 33 |         self.oss_code = oss_code
 34 |         self.client_code = client_code
 35 | 
 36 |     def download(self, file_url, file_name, headers=None, verify=True):
 37 |         """
 38 | 
 39 |         :param file_url:
 40 |         :param file_name:
 41 |         :param headers:
 42 |         :param verify:
 43 |         :return:
 44 |         """
 45 |         location = global_uuid = ""
 46 |         proxy = None
 47 |         headers = headers if headers else {
 48 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"}
 49 |         for _ in range(3):
 50 |             try:
 51 |                 if ";base64," in file_url:
 52 |                     suffix = "png"
 53 |                     logger.debug(f"正在上传base64图片: {file_name}: {file_url}")
 54 |                     hl = hashlib.md5()
 55 |                     hl.update(file_url.encode(encoding='utf-8'))
 56 |                     file_name = hl.hexdigest() + f".{suffix}"
 57 |                     a = file_url.split(";base64,")[-1]
 58 |                     a = a + '=' * (4 - len(a) % 4) if len(a) % 4 != 0 else a
 59 |                     base64str = base64.b64decode(a)
 60 |                     upload_result = self.post_file(file_name, base64str)
 61 |                     location = upload_result.get("downloadLocation")
 62 |                     global_uuid = upload_result.get("globalUuid")
 63 |                     logger.debug(f"文件上传成功: {file_name}: {file_url}")
 64 |                 else:
 65 |                     suffix = self.complete_name(file_url, file_name, self.suffix_reg)
 66 |                     if not file_url.startswith("http") and not suffix:
 67 |                         return location, global_uuid
 68 |                     file_name = f"{file_name}.{suffix}"
 69 |                     logger.debug(f"正在上传文件: {file_name}: {file_url}")
 70 |                     res = base_requests(file_url, timeout=60, headers=headers, verify=verify, proxies=proxy)
 71 |                     if 200 <= res.status_code < 400:
 72 |                         upload_result = self.post_file(file_name, res)
 73 |                         location = upload_result.get("downloadLocation")
 74 |                         global_uuid = upload_result.get("globalUuid")
 75 |                         logger.debug(f"文件上传成功: {file_name}: {file_url}")
 76 |                         break
 77 |                     elif res.status_code == 404 or res.status_code == 500:
 78 |                         logger.debug(f"文件地址无效: {file_name}: {file_url}")
 79 |                         break
 80 |             except Exception as e:
 81 |                 logger.warning(f"文件上传异常: {file_name}: {e}")
 82 |                 proxy = get_proxies(http2=True)
 83 |                 continue
 84 |         else:
 85 |             logger.error(f"文件上传失败: {file_name}: {file_url}")
 86 | 
 87 |         return location, global_uuid
 88 | 
 89 |     def post_file(self, name, resp):
 90 |         params_json = {
 91 |             "name": name,
 92 |             "appCode": self.oss_code,
 93 |             "appClientCode": self.client_code,
 94 |             "appOrgCode": "",
 95 |             "appUserId": "",
 96 |             "ownCatalogUuid": ""
 97 |         }
 98 |         json_data = httpx.post(self.oss_url, json=params_json).json()
 99 |         if json_data.get("msg") == "SUCCESS":
100 |             token_data = json_data.get("data", {})
101 | 
102 |             str_dic = {
103 |                 "key": token_data.get("dir") + token_data.get("name"),
104 |                 "policy": token_data.get("policy"),
105 |                 "OSSAccessKeyId": token_data.get("accessid"),
106 |                 "success_action_status": 200,
107 |                 "callback": token_data.get("callback"),
108 |                 "signature": token_data.get("signature"),
109 |             }
110 | 
111 |             files = {'file': resp.content}
112 |             response = httpx.post(token_data.get("host"), data=str_dic, files=files)
113 |             if response.status_code == 200:
114 |                 res_data = response.json()
115 |                 if res_data.get("msg") == "SUCCESS":
116 |                     return res_data["data"]
117 | 
118 |         raise ValueError(f"文件上传oss失败：{name}")
119 | 
120 |     @staticmethod
121 |     def complete_name(url, name, suffix_reg):
122 |         """
123 |         附件.xls.doc 可上传, 接口会默认取最后一个
124 |         优先取 file_url 后缀
125 |         """
126 |         is_name_suffix = re.search(suffix_reg, name, re.I)
127 |         is_url_suffix = re.search(suffix_reg, url, re.I)
128 |         name_suffix = is_name_suffix.group(1) if is_name_suffix else ""
129 |         url_suffix = is_url_suffix.group(1) if is_url_suffix else ""
130 |         if url_suffix:
131 |             suffix = url_suffix
132 |         elif name_suffix:
133 |             suffix = name_suffix
134 |         else:
135 |             suffix = ""
136 | 
137 |         return suffix
138 | 


--------------------------------------------------------------------------------
/CrawlersTools/requests/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/6/21 17:08
 4 | # @Author  : MuggleK
 5 | # @File    : __init__.py
 6 | 
 7 | import os
 8 | 
 9 | from CrawlersTools.requests.base_requests import BaseRequests
10 | from CrawlersTools.requests.proxy import get_proxies
11 | from CrawlersTools.requests.random_ua import UserAgent
12 | 
13 | PROJECT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
14 | 
15 | 
16 | base_requests = BaseRequests().base_requests
17 | 


--------------------------------------------------------------------------------
/CrawlersTools/requests/base_requests.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Project : CrawlersTools
  3 | # @Time    : 2022/6/21 17:08
  4 | # @Author  : MuggleK
  5 | # @File    : base_requests.py
  6 | 
  7 | import json
  8 | import random
  9 | import re
 10 | import time
 11 | 
 12 | from chardet import detect
 13 | from httpx import Client, Response
 14 | from loguru import logger
 15 | 
 16 | from CrawlersTools.requests.proxy import get_proxies
 17 | from CrawlersTools.requests.random_ua import UserAgent
 18 | 
 19 | 
 20 | class BaseRequests(object):
 21 |     """
 22 |     A Rquests Class base on httpx
 23 | 
 24 |     Usage:
 25 | 
 26 |     ```python
 27 |     >>> base_requests = BaseRequests().base_requests
 28 |     >>> response = base_requests('https://example.org')
 29 |     ```
 30 |     """
 31 | 
 32 |     def base_requests(
 33 |         self,
 34 |         url: str,
 35 |         session: object = None,
 36 |         headers=UserAgent(),
 37 |         method: str = "get",
 38 |         proxies: dict = None,
 39 |         proxy_url: str = None,
 40 |         http2: bool = False,
 41 |         encoding: str = None,
 42 |         retry: int = 3,
 43 |         **kwargs
 44 |     ) -> Response:
 45 |         """
 46 |         内置ali_waf & 加速乐解密
 47 | 
 48 |         :param url: 请求链接
 49 |         :param session: 维持session可从外部传入
 50 |         :param headers: 请求头
 51 |         :param method:  具体请求方式
 52 |         :param proxies: ip代理，配合proxy_url可失效自动切换
 53 |         :param proxy_url:  获取代理链接
 54 |         :param http2:   是否使用http2.0协议
 55 |         :param retry:   请求重试次数，默认3次
 56 |         :param encoding:   指定编码，默认detect解析，效果同requests的apparent_encoding
 57 |         :param kwargs:  请求时需携带的其他参数
 58 |         :return: Response
 59 |         :exception: 1.代理失效&超过重试次数返回None 2.waf或加速乐解密失败返回None
 60 |         """
 61 |         for _ in range(retry):
 62 |             try:
 63 |                 proxies = proxies if proxies else get_proxies(proxy_url, http2=True)
 64 |                 session = session or Client(
 65 |                     http2=http2,
 66 |                     headers=headers,
 67 |                     proxies=proxies,
 68 |                     timeout=kwargs.get("timeout", 20),
 69 |                     verify=kwargs.get("verify", True),
 70 |                     follow_redirects=kwargs.get("allow_redirects", False)
 71 |                 )
 72 |                 response = session.request(
 73 |                     method=method.lower(),
 74 |                     url=url,
 75 |                     headers=headers,
 76 |                     content=kwargs.get("content"),
 77 |                     data=kwargs.get("data"),
 78 |                     files=kwargs.get("files"),
 79 |                     json=kwargs.get("json"),
 80 |                     params=kwargs.get("params"),
 81 |                     timeout=kwargs.get("timeout", 20),
 82 |                     follow_redirects=kwargs.get("allow_redirects", False)
 83 |                 )
 84 |                 response.encoding = encoding if encoding else detect(response.content)['encoding']  # chardet 更准确
 85 |                 if 200 <= response.status_code < 300 or response.status_code == 412:
 86 |                     if 'arg1=' in response.text:
 87 |                         acw_tc_cookie = f'acw_tc={session.cookies.get("acw_tc")};'
 88 |                         headers["Cookie"] = headers["Cookie"] + acw_tc_cookie if headers.get("Cookie") else acw_tc_cookie
 89 |                         reg_arg1 = re.findall("var arg1='(.*)';", response.text)[0]
 90 |                         arg2 = self.ali_waf(reg_arg1)
 91 |                         headers['cookie'] += f'acw_sc__v2={arg2}'
 92 |                         continue
 93 |                     return response
 94 |                 elif response.status_code == 521:
 95 |                     if 'document.cookie' in response.text:
 96 |                         cookie_key = [key for key in list(session.cookies.keys()) if key.startswith("__jsluid")][0]
 97 |                         headers["Cookie"] = headers["Cookie"] if headers.get("Cookie") else f'{cookie_key}={session.cookies.get(cookie_key)};'
 98 |                         headers["Cookie"] += f'{self.process_fuck_js(response.text)};'
 99 |                         continue
100 |                     elif 'chars' in response.text:
101 |                         __jsl_clearance_s = self.process_clearance(response.text)
102 |                         headers["Cookie"] = '='.join(headers["Cookie"].split('=')[:-1]) + f'={__jsl_clearance_s};'
103 |                         continue
104 |                 else:
105 |                     proxies = None
106 |                     time.sleep(random.uniform(0, 1))
107 |                     continue
108 |             except Exception as err:
109 |                 logger.error(f'url：{url} error：{err} proxies：{proxies}')
110 |                 proxies = None
111 |                 time.sleep(random.uniform(0, 1))
112 |                 continue
113 | 
114 |     @staticmethod
115 |     def ali_waf(arg1):
116 |         """
117 |         acw_sc__v2算法
118 |         :param arg1:
119 |         :return:
120 |         """
121 |         list1 = [15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32,
122 |                  26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36]
123 |         dict1 = {}
124 |         for i in range(len(arg1)):
125 |             string = arg1[i]
126 |             for j in range(len(list1)):
127 |                 if list1[j] == i + 1:
128 |                     dict1[j] = string
129 |         str1 = ''.join([dict1.get(i) for i in range(40)])
130 | 
131 |         str1_list = list(str1)
132 |         str2 = "3000176000856006061501533003690027800375"
133 |         str2_list = list(str2)
134 |         str4 = ''
135 |         for m in range(0, len(str1_list), 2):
136 |             int1 = int(''.join(str1_list[m:m + 2]), 16)
137 |             int2 = int(''.join(str2_list[m:m + 2]), 16)
138 |             str3 = str(hex(int1 ^ int2))[2:]
139 |             if len(str3) == 1:
140 |                 str3 = '0' + str3
141 |             str4 += str3
142 |         return str4
143 | 
144 |     @staticmethod
145 |     def process_fuck_js(js_text):
146 |         import execjs
147 | 
148 |         js_text = js_text.split(';location.href=loc')[0].split('document.cookie=')[-1]
149 |         r = execjs.eval(js_text).split(';')[0]
150 |         return r
151 | 
152 |     @staticmethod
153 |     def process_clearance(html):
154 |         import hashlib
155 | 
156 |         data = json.loads(re.findall(r'go\((.*?)\)', html)[1])
157 |         chars_length = len(data.get('chars'))
158 |         for i in range(chars_length):
159 |             for j in range(chars_length):
160 |                 result = data.get('bts')[0] + data.get('chars')[i] + data.get('chars')[j] + data.get('bts')[1]
161 |                 b = eval('hashlib.{}()'.format(data.get('ha')))
162 |                 b.update(result.encode(encoding='utf-8'))
163 |                 res = b.hexdigest()
164 |                 if res == data.get('ct'):
165 |                     return result
166 | 


--------------------------------------------------------------------------------
/CrawlersTools/requests/proxy.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/6/21 17:06
 4 | # @Author  : MuggleK
 5 | # @File    : proxy.py
 6 | 
 7 | import httpx
 8 | 
 9 | from loguru import logger
10 | 
11 | 
12 | def get_proxies(proxy_url=None, http2=False):
13 |     """
14 |     默认httpx代理模式
15 |     @param proxy_url: 代理请求链接
16 |     @param http2: 默认http1.1规则
17 |     @return:
18 |     """
19 |     if not proxy_url: return
20 | 
21 |     protocol = 'http://'
22 |     try:
23 |         proxy = httpx.get(proxy_url).text.strip()
24 |         proxy = protocol + proxy
25 |         if http2:
26 |             return {protocol: proxy, 'https://': proxy}
27 |         return {"http": proxy, "https": proxy}
28 |     except Exception as err:
29 |         logger.error(f'获取代理失败：{err}')
30 | 


--------------------------------------------------------------------------------
/CrawlersTools/requests/random_ua.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Project : CrawlersTools
  3 | # @Time    : 2022/6/21 16:44
  4 | # @Author  : MuggleK
  5 | # @File    : random_ua.py
  6 | 
  7 | import random
  8 | 
  9 | 
 10 | class UserAgent(object):
 11 |     __PC_USER_AGENTS = (
 12 |         # chrome
 13 |         "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
 14 |         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
 15 |         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
 16 |         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
 17 |         "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
 18 |         "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
 19 |         "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
 20 |         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
 21 |         "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
 22 |         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
 23 |         # opera
 24 |         "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
 25 |         "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
 26 |         "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
 27 |         "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
 28 |         "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
 29 |         "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
 30 |         "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
 31 |         "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
 32 |         "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
 33 |         "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
 34 |         # firefox
 35 |         "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
 36 |         "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
 37 |         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
 38 |         "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
 39 |         "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
 40 |         "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
 41 |         "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
 42 |         "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
 43 |         "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
 44 |         "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101  Firefox/28.0",
 45 |         # safari
 46 |         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
 47 |         "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
 48 |         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
 49 |         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
 50 |         "Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko ) Version/5.1 Mobile/9B176 Safari/7534.48.3",
 51 |         "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
 52 |         "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
 53 |         "Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
 54 |         "Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
 55 |         "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
 56 |     )
 57 | 
 58 |     __MOBILE_USER_AGENTS = (
 59 |         # QQ浏览器
 60 |         "Mozilla/5.0 (Linux; Android 6.0; Le X620 Build/HEXCNFN5902012151S; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043015 Safari/537.36 V1_AND_SQ_6.6.9_482_YYB_D QQ/6.6.9.3060 NetType/2G WebP/0.3.0 Pixel/1080",
 61 |         "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-cn; LG-D859 Build/KVT49L.D85910d) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.4 Mobile Safari/537.36",
 62 |         "Mozilla/5.0 (Linux; Android 6.0.1; ONEPLUS A3000 Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043405 Safari/537.36 V1_AND_SQ_7.1.0_692_YYB_D QQ/7.1.0.3175 NetType/WIFI WebP/0.3.0 Pixel/1080",
 63 |         "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-cn; SM-N9006 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.5 Mobile Safari/537.36",
 64 |         "Mozilla/5.0 (Linux; Android 7.0; EVA-AL00 Build/HUAWEIEVA-AL00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36 MQQBrowser/6.6",
 65 |         "Mozilla/5.0 (Linux; Android 5.0.2; Redmi Note 2 Build/LRX22G; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043221 Safari/537.36 V1_AND_SQ_7.0.0_676_YYB_D QQ/7.0.0.3135 NetType/WIFI WebP/0.3.0 Pixel/1920",
 66 |         "Mozilla/5.0 (Linux; Android 7.1.2; Redmi Note 3 Build/NJH47F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043602 Safari/537.36 V1_AND_SQ_7.2.5_744_YYB_D QQ/7.2.5.3305 NetType/WIFI WebP/0.3.0 Pixel/1080",
 67 |         "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; 1505-A02 Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.8 Mobile Safari/537.36",
 68 |         "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; MI 5s Plus Build/MXB48T) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.2 Mobile Safari/537.36",
 69 |         "Mozilla/5.0 (Linux; Android 6.0.1; SM-A8000 Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043305 Safari/537.36 V1_AND_SQ_7.0.0_676_YYB_D QQ/7.0.0.3135 NetType/WIFI WebP/0.3.0 Pixel/1080",
 70 |         # UC浏览器
 71 |         "Mozilla/5.0 (Linux; U; Android 8.0.0; zh-CN) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/11.9.5.975 Mobile Safari/537.36",
 72 |         "Mozilla/5.0 (Linux; U; Android 7.0; zh-CN; MI 5 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.5.5.943 Mobile Safari/537.36",
 73 |         "Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; MHA-AL00 Build/HUAWEIMHA-AL00) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 UCBrowser/1.0.0.100 U3/0.8.0 Mobile Safari/534.30 AliApp(TB/6.6.1) WindVane/8.0.0 1080X1830 GCanvas/1.4.2.21",
 74 |         "Mozilla/5.0 (Linux; U; Android 7.0; zh-CN; DUK-AL20 Build/HUAWEIDUK-AL20) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/11.6.4.950 UCBS/2.11.1.27 Mobile Safari/537.36 AliApp(TB/7.2.3) WindVane/8.0.0 1080X1812",
 75 |         "Mozilla/5.0 (Linux; U; Android 5.1; zh-CN; M681C Build/LMY47I) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.4.1.939 Mobile Safari/537.36",
 76 |         "Mozilla/5.0 (Linux; U; Android 5.0.2; zh-cn; vivo Y31 Build/LRX22G) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 UCBrowser/1.0.0.100 U3/0.8.0 Mobile Safari/534.30 AliApp(TB/6.6.0) WindVane/8.0.0 540X960 GCanvas/1.4.2.21",
 77 |         "Mozilla/5.0 (Linux; U; Android 7.1.1; zh-CN; MI MAX 2 Build/NMF26F) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.7.2.954 Mobile Safari/537.36",
 78 |         "Mozilla/5.0 (Linux; U; Android 5.1.1; zh-cn; R7Plusm Build/LMY47V) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 UCBrowser/1.0.0.100 U3/0.8.0 Mobile Safari/534.30 AliApp(TB/6.5.2) WindVane/8.0.0 1080X1800 GCanvas/1.4.2.21",
 79 |         "Mozilla/5.0 (Linux; U; Android 6.0; zh-CN; HUAWEI MT7-TL10 Build/HuaweiMT7-TL10) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.3.0.907 Mobile Safari/537.36",
 80 |         "Mozilla/5.0 (Linux; U; Android 7.0; zh-CN; KNT-AL20 Build/HUAWEIKNT-AL20) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.6.8.952 Mobile Safari/537.36",
 81 |         # MIUI浏览器
 82 |         "Mozilla/5.0 (Linux; U; Android 4.4.4; zh-cn; Coolpad 8675-A Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/46.0.2490.85 Mobile Safari/537.36 XiaoMi/MiuiBrowser/2.1.1",
 83 |         "Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; MI 5 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/9.2.7",
 84 |         "Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; MI 5 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/8.8.3",
 85 |         "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; MI NOTE LTE Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/9.1.3",
 86 |         "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-cn; HM NOTE 1TD Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/8.5.14",
 87 |         "Mozilla/5.0 (Linux; U; Android 6.0; zh-cn; Redmi Note 4 Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/9.2.8",
 88 |         "Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; Mi Note 2 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/9.2.8",
 89 |         "Mozilla/5.0 (Linux; U; Android 4.4.4; zh-cn; HM NOTE 1LTE Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/9.3.2",
 90 |         "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; MI 4W Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/46.0.2490.85 Mobile Safari/537.36 XiaoMi/MiuiBrowser/8.2.15",
 91 |         "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; MI MAX Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/8.6.5",
 92 |         # 傲游浏览器
 93 |         "Mozilla/5.0 (Linux; U; Android 2.3.5; zh-cn; U8800 Build/HuaweiU8800) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
 94 |         "Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M031 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
 95 |         "Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/537.4 Maxthon/%s",
 96 |         "Mozilla/5.0 (Android 5.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.9.4.1000 Chrome/39.0.2146.0 Safari/537.36",
 97 |         "Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 Mobile Safari/537.36 Maxthon/3047",
 98 |         "Mozilla/5.0 (Windows NT 6.1; Android) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.9.2.1000",
 99 |         "Mozilla/5.0 (Linux; U; Android 4.2.1; zh-cn; M040 Build/JOP40D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30 Maxthon/4.1.3.2000",
100 |         "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Mobile/11B554a",
101 |         "Mozilla/5.0 (Linux; Android 7.0; MI 5 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36 Maxthon/3047",
102 |         "Mozilla/5.0 (Linux; Android 7.0; MI 5 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36 Maxthon/3048",
103 |         # 百度浏览器
104 |         "Mozilla/5.0 (Linux; Android 5.1.1; vivo X6SPlus D Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 T7/9.1 baidubrowser/7.15.15.0 (Baidu; P1 5.1.1)",
105 |         "Mozilla/5.0 (Linux; Android 7.0; HUAWEI NXT-AL10 Build/HUAWEINXT-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 baidubrowser/7.9.12.0 (Baidu; P1 7.0)",
106 |         "Mozilla/5.0 (Linux; Android 6.0.1; SM-C7000 Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 baidubrowser/7.13.13.0 (Baidu; P1 6.0.1)",
107 |         "Mozilla/5.0 (Linux; Android 6.0; HUAWEI NXT-AL10 Build/HUAWEINXT-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 baidubrowser/7.9.12.0 (Baidu; P1 6.0)",
108 |         "Mozilla/5.0 (Linux; U; Android 4.4.4; zh-cn; H2 Build/KTU84P) AppleWebKit/534.24 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.24 T5/2.0 baidubrowser/5.6.4.7 (Baidu; P1 4.4.4)",
109 |         "Mozilla/5.0 (Linux; Android 4.4.4; SM-N935F Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/33.0.0.0 Safari/537.36 baidubrowser/7.13.13.0 (Baidu; P1 4.4.4)",
110 |         "Mozilla/5.0 (Linux; Android 4.2.2; NX403A Build/JDQ39; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 baidubrowser/7.10.12.0 (Baidu; P1 4.2.2)",
111 |         "Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M031 Build/IML74K) AppleWebKit/530.17 (KHTML, like Gecko) FlyFlow/2.3 Version/4.0 Mobile Safari/530.17 baidubrowser/023_1.41.3.2_diordna_069_046/uzieM_51_3.0.4_130M/1200a/963E77C7DAC3FA587DF3A7798517939D%7C408994110686468/1",
112 |         "Mozilla/5.0 (Linux; Android 6.0; HUAWEI VNS-DL00 Build/HUAWEIVNS-DL00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 baidubrowser/7.9.12.0 (Baidu; P1 6.0)",
113 |         "Mozilla/5.0 (Linux; Android 6.0.1; MI 5s Build/MXB48T; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 baidubrowser/7.9.12.0 (Baidu; P1 6.0.1)",
114 |     )
115 | 
116 |     ua_name = 'User-Agent'
117 | 
118 |     def __new__(cls, mobile_ua=False, default=False):
119 |         if default:
120 |             return cls.default_ua()
121 | 
122 |         if mobile_ua:
123 |             return {cls.ua_name: random.choice(cls.__MOBILE_USER_AGENTS)}
124 |         else:
125 |             return {cls.ua_name: random.choice(cls.__PC_USER_AGENTS)}
126 | 
127 |     @staticmethod
128 |     def random_ua():
129 |         s_ver = [str(random.randint(10, 99)), '0', str(random.randint(1000, 9999)), str(random.randint(100, 999))]
130 |         version = '.'.join(s_ver)
131 |         webkit = 'AppleWebKit/537.36 (KHTML, like Gecko)'
132 |         mac = '_'.join([str(random.randint(8, 12)) for _ in range(2)] + [str(random.randint(1, 10))])
133 |         typeid = random.randint(1, 6)
134 |         if typeid == 1:
135 |             ua_ua = 'Mozilla/5.0 (Windows NT 7.1; WOW64) %s Chrome/%s Safari/537.36' % (webkit, version)
136 |         elif typeid == 2:
137 |             ua_ua = 'Mozilla/5.0 (Windows NT 10.1; WOW64) %s Chrome/%s Safari/537.36' % (webkit, version)
138 |         elif typeid == 3:
139 |             ua_ua = 'Mozilla/5.0 (Windows NT 8.1; WOW64) %s Chrome/%s Safari/537.36' % (webkit, version)
140 |         else:
141 |             ua_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X %s) %s Chrome/%s Safari/537.36' % (mac, webkit, version)
142 |         return ua_ua
143 | 
144 |     @classmethod
145 |     def default_ua(cls):
146 |         return {cls.ua_name: cls.random_ua()}
147 | 


--------------------------------------------------------------------------------
/CrawlersTools/requirements.txt:
--------------------------------------------------------------------------------
 1 | auto_mix_prep
 2 | DBUtils==1.3
 3 | fontTools
 4 | httpx
 5 | httpx[http2]
 6 | loguru
 7 | Pillow
 8 | PyExecJS==1.5.1
 9 | pymongo
10 | PyMySQL
11 | redis
12 | tqdm
13 | PyYAML
14 | lxml
15 | numpy
16 | Distance
17 | chardet
18 | sinan
19 | kafka-python
20 | 


--------------------------------------------------------------------------------
/CrawlersTools/schedules/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Project : CrawlersTools
3 | # @Time    : 2022/8/15 11:29
4 | # @Author  : MuggleK
5 | # @File    : __init__.py
6 | 
7 | from CrawlersTools.schedules.auto_thread import AutoThread
8 | 


--------------------------------------------------------------------------------
/CrawlersTools/schedules/auto_thread.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/6/21 17:19
 4 | # @Author  : MuggleK
 5 | # @File    : auto_thread.py
 6 | 
 7 | import time
 8 | from threading import Lock, Thread, active_count
 9 | from traceback import format_exc
10 | 
11 | from loguru import logger
12 | from tqdm import tqdm
13 | 
14 | thread_lock = Lock()
15 | 
16 | 
17 | class ExcThread(Thread):
18 |     """
19 |     主动捕获子线程异常
20 |     """
21 | 
22 |     def __init__(self, target, args=(), kwargs=None):
23 |         super(ExcThread, self).__init__()
24 |         self._target = target
25 |         self._args = args
26 |         self._kwargs = kwargs or dict()
27 | 
28 |     def run(self):
29 |         try:
30 |             if self._target:
31 |                 self._target(*self._args, **self._kwargs)
32 |         except:
33 |             logger.error(f'self._target:{self._target} args：{self._args} kwargs：{self._kwargs}，{format_exc()}')
34 | 
35 | 
36 | class AutoThread(object):
37 |     """
38 |     动态线程调度, 传入任务队列可为列表（初始化转换成生成器），也可为生成器
39 |     usage:
40 |         a_thread = AutoThread(20, fun, arg_list)
41 |         a_thread.main_thread()
42 | 
43 |     ps: 支持两种并发方式：1.并发函数 2.并发传参 3.并发函数和传参
44 |     """
45 | 
46 |     def __init__(self, thread_num: int, fun, arg_list=None):
47 |         self.thread_num = thread_num
48 |         if isinstance(fun, tuple): fun = list(fun)
49 |         if isinstance(arg_list, tuple): arg_list = list(arg_list)
50 |         self.fun_list = fun if callable(fun) else list(fun)     # 待带调用对象只能是方法或方法列表，元组
51 |         self.arg_list = arg_list
52 |         self.os_threads = active_count()
53 | 
54 |     def process_task(self):
55 |         if callable(self.fun_list):
56 |             # 1.并发函数
57 |             tasks = [{'fun': self.fun_list, 'args': arg} for arg in self.arg_list]
58 |         elif isinstance(self.fun_list, list) and not isinstance(self.arg_list, list):
59 |             # 2.并发传参
60 |             tasks = [{'fun': fun, 'args': self.arg_list} for fun in self.fun_list]
61 |         else:
62 |             assert len(self.fun_list) == len(self.arg_list), '并发函数和传参长度不一致'
63 |             # 3.并发函数和传参
64 |             tasks = [{'fun': fun, 'args': arg} for fun, arg in zip(self.fun_list, self.arg_list)]
65 |         return tasks
66 | 
67 |     def wait(self):
68 |         """
69 |         等待所有线程结束, 比较 当前存活线程和(主线程 + tqdm线程)
70 |         """
71 |         while active_count() > self.os_threads + 1:
72 |             time.sleep(.25)
73 | 
74 |     def main_thread(self):
75 |         loop_flag = True
76 |         tasks = self.process_task()
77 |         with tqdm(total=len(tasks)) as pbar:
78 |             while loop_flag:
79 |                 active_thread = active_count()
80 |                 if active_thread >= self.thread_num:
81 |                     time.sleep(.25)
82 |                     continue
83 |                 for _ in range(self.thread_num - active_thread + self.os_threads):
84 |                     thread_lock.acquire()
85 |                     task = tasks.pop() if tasks else None
86 |                     thread_lock.release()
87 |                     if task is None:
88 |                         loop_flag = False
89 |                         break
90 |                     child_thread = ExcThread(target=task["fun"]) if task["args"] is None else ExcThread(
91 |                         target=task["fun"], args=(task["args"],))
92 | 
93 |                     child_thread.start()
94 |                     pbar.update(1)
95 | 


--------------------------------------------------------------------------------
/CrawlersTools/utils/str_compare.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | 
 5 | def str_compare(str1: str, str2: str, half_compare=False) -> bool:
 6 |     """
 7 |     比较两个字符串是否相等，当half_compare为True时会把字符串转为半角字符串之后在比较
 8 |     适用：比较两个公司名，原则上 特斯拉（上海）有限公司 和 特斯拉(上海)有限公司 这两个公司是同一个，但是因为一个是全角括号，一个是半角括号，
 9 |         直接比较会导致两个公司名不相等，这时候转换为半角在进行比较则不会出现这个问题.
10 |     """
11 |     str1 = full_str_to_half_str(str1) if half_compare else str1
12 |     str2 = full_str_to_half_str(str2) if half_compare else str2
13 |     if str1 == str2:
14 |         return True
15 |     return False
16 | 
17 | 
18 | def full_str_to_half_str(full_str: str) -> str:
19 |     # 全角字符串转半角字符串
20 |     half_str = ""
21 |     for _str in full_str:
22 |         _str_code = ord(_str)
23 |         if _str_code == 12288:  # 全角空格转半角空格
24 |             _str_code = 32
25 |         elif 65281 <= _str_code <= 65374:
26 |             _str_code -= 65248
27 |         half_str += chr(_str_code)
28 |     return half_str
29 | 
30 | 
31 | if __name__ == '__main__':
32 |     print(str_compare('特斯拉（上海）有限公司', '特斯拉(上海)有限公司'))
33 |     print(str_compare('特斯拉（上海）有限公司', '特斯拉(上海)有限公司', half_compare=True)) # 全角括号转半角括号
34 |     print(str_compare('　特斯拉(上海)有限公司', ' 特斯拉(上海)有限公司'))
35 |     print(str_compare('　特斯拉(上海)有限公司', ' 特斯拉(上海)有限公司', half_compare=True))  # 全角空格转半角空格
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # CrawlersTools
  2 | ![](https://img.shields.io/badge/python-3.7.3-brightgreen)
  3 | 
  4 | CrawlersTools is a comprehensive tools for crawlers base on Python3.
  5 | 
  6 | ---
  7 | 
  8 | Install CrawlersTools using pip:
  9 | 
 10 | ```shell
 11 | pip install CrawlersTools
 12 | ```
 13 | 
 14 | CrawlersTools requires Python 3.7+.
 15 | 
 16 | 
 17 | ## Modules Introduction
 18 | 
 19 | 
 20 | ### Extraction of Policy HTML
 21 | With this module,you can extract policy html automatically.
 22 | #### Extraction of List HTML
 23 | Usage example:
 24 | ```shell
 25 | from CrawlersTools import ListExtractor
 26 | 
 27 | list_extract = ListExtractor()
 28 | extract_result = list_extract.extract(html)
 29 | print(extract_result)
 30 | ```
 31 | Outputs:
 32 | ```shell
 33 | [{
 34 |     'title': '关于对迎泽区2023年乡村振兴项目储备库的公示',
 35 |     'url': '/p222/ncjbmxxgk/20221111/30003142.html'
 36 | }, {
 37 |     'title': '迎泽区2022年第四批实际种粮农民一次性补贴发放明细公示',
 38 |     'url': '/p222/ncjbmxxgk/20221027/1249082.html'
 39 | }, {
 40 |     'title': '迎泽区2022年第三批实际种粮农民一次性补贴发放明细公示',
 41 |     'url': '/p222/ncjbmxxgk/20221027/1249081.html'
 42 | }, {
 43 |     'title': '迎泽区2022年第二批实际种粮农民一次性补贴发放明细公示',
 44 |     'url': '/p222/ncjbmxxgk/20221027/1249078.html'
 45 | }, {
 46 |     'title': '迎泽区2022年第一批实际种粮农民一次性补贴发放明细公示',
 47 |     'url': '/p222/ncjbmxxgk/20221027/1249077.html'
 48 | }, ***]
 49 | ```
 50 | 
 51 | #### Extraction of Detail HTML
 52 | Usage example:
 53 | ```shell
 54 | from CrawlersTools import PolicyExtractor
 55 | 
 56 | detail_extract = PolicyExtractor()
 57 | detail_result = detail_extract.extract(html)
 58 | print(detail_result)
 59 | ```
 60 | Outputs:
 61 | ```shell
 62 | {
 63 |   'title': '玉溪市2018年工业经济运行工作指导意见', 
 64 |   'publish_time': '2018-03-15', 
 65 |   'content': '2017年，在作为全市工业主要支柱产业的烟草制品业增加值增速持续负增长，虽然全市非烟产业平稳高速增长，但占比小、支撑作用不明显，全市工业稳增长面临严峻困难和挑战的形势下，市委、市政府早谋划、早部署，及时出台稳增长促转型系列政策措施，工业经济运行缓中趋稳，稳中有升，结构调整和转型升级取得积极进展。全年全部工业完成增加值\n662.5\n亿元，增长\n7.1%\n，拉动\nGDP\n增长\n3.6\n个百分点，对经济增长的贡献率为\n38.1%\n，其中：规模以上工业完成增加值\n623.7\n亿元，增长\n7.2%\n。\n2018\n年是贯彻党的十九大精神的开局之年，是改革开放\n40\n周年，也是决胜全面建成小康社会、实施\n“\n十三五\n”\n规划承上启下的关键一年。当前，我国经济正处在转变发展方式、优化经济结构、转换增长动力的攻关期。十九大报告作出这样的判断：我国经济已由高速增长阶段转向高质量发展阶段。\n“\n十三五\n”\n行至一半，\n2018\n年成为承上启下的关键一年，全市工业经济运行调节要全面贯彻党的十九大精神，以习近平新时代中国特色社会主义思想为指导，牢固树立\n“\n四个意识\n”\n和\n“\n四个自信\n”\n，坚持稳中求进工作总基调，坚持新发展理念，紧扣我国社会主要矛盾变化，着眼建设现代化经济体系，立足制造强国、网络强国建设全局，以供给侧结构性改革为主线，以提高制造业供给体系质量为主攻方向，深入实施\n“\n中国制造\n2025”\n，强化创新驱动、改革推动、融合带动，推动质量变革、效率变革、动力变革，加快培育壮大新动能、改造提升传统动能，实现工业及信息化业持续健康发展。\n一、2018年工业经济运行形势研判\n（一）有利条件\n当前，玉溪工业发展正处于工业化中期的前半阶段，资本、技术等要素边际贡献率较高，工业发展空间仍然较大，支撑工业平稳较快增长的有利条件和因素不断增多。\n一是宏观经济运行企稳。\n全球经济\n“\n低增长、低通胀、低利率\n”\n态势将会有所回暖，传统产业负增长的情况会有所改变。同时，随着滇中城市经济圈和\n“\n五大基础设施网络\n”\n等一系列项目实施，国家、省、市出台的稳增长、供给侧结构性改革等系列政策效应逐步释放，经济增长的内生动力不断增强，国内经济稳中向好，全市经济稳中有进，为工业稳增长调结构提供了良好的宏观环境。\n二是面临良好发展机遇和政策环境。\n玉溪地处滇中区域，位于泛亚铁路中线和东线的交汇点，是昆曼高速的重要节点，有助于在更高、更广的平台上深化对内和对外两个开放，辐射带动滇中城市经济圈发展，开创面向南亚、东南亚开放合作的新局面。且玉溪处于多个重大国家战略的交汇点，\n“\n一带一路\n”“\n长江经济带\n”\n创造新战略空间，有望加快面向南亚东南亚辐射中心建设和沿边地区开发开放，同时在基建、能源、金融、贸易合作等方面将获得更为广阔的发展空间。市委市政府高位推进\n“\n五网\n”\n建设，加快培育七大重点产业，实施工业经济攻坚战，将助力补齐工业经济发展短板，打造工业增长新引擎。\n三是工业企业生产经营环境有所改善。\n全市大宗工业品价格稳步回升，工业通缩局面得到缓解。工业企业经营环境逐步改善，将在一定程度上改善企业市场预期，增强库存回补动力和投资意愿。创新驱动发展战略深入实施，大众创业、万众创新成为经济提质增效的新引擎。\n“\n互联网\n+”\n加快跨界融合，将极大提升实体经济的创新力和生产力，企业生产经营环境不断得到创新及改善。\n（二）不利条件\n在内外需求偏弱，周期性和结构性矛盾叠加的背景下，\n2018\n年玉溪工业经济平稳健康运行仍然面临诸多制约因素，主要表现为：\n一是结构性矛盾制约。\n全市工业结构中，\n2017\n年烟草制品业占比达\n62.68%\n，从\n2017\n年运行趋势看，烟草行业仍将面临三个不变，即国家控烟的力度不会改变、对烟草行业的政策调控不会改变、烟草行业运行环境难有较大改变，烟草行业保增长的压力仍较为突出。传统行业中，钢铁行业虽呈现恢复性增长，但受工业增加值率和价格指数影响，对全市工业增长拉动较小。新兴产业中，装备制造、生物医药和电子信息制造产业虽然增速较快，但在全市规上工业增加值中的占比仅分别为\n4.2%\n、\n1.5%\n、\n0.5%\n，占比小，拉动增长较弱，全市工业经济结构性矛盾依然突出。\n二是要素成本制约。\n近年来，资金、用工、土地等要素刚性成本和环保、物流等隐性成本不断上升，给企业生产经营造成较大压力。企业成本上升明显，工业企业融资难、融资贵矛盾仍未有效缓解，金融资本一定程度上\n“\n脱实向虚\n”\n。企业难贷款和银行贷款难\n“\n两难矛盾\n”\n以及慎贷、限贷、抽贷现象仍然不同程度存在。\n三是新增增量制约。\n由于工业企业生产经营困难、效益下滑影响，导致对未来市场的预期和信心不足。民营企业由于缺乏担保，实际资金支付率和到付率偏低，加之投资回报率下降、用地成本高、落地困难等多种因素影响，民间投资意愿下降，新增投资大幅下滑。\n四是新投产项目匮乏。\n全市新建成投产且能纳入规上统计的项目较少，尤其是缺乏大项目形成新的增量。招商优质项目不多，带动性不强，投资额度大的项目不多，跨越发展缺少大项目支撑，工业经济缺乏增量支撑。\n综上所述，\n2018\n年全市工业经济处于新旧动能转换，并将加速向纵深推进期，工业经济将在合理区间稳定运行，工业发展质量将稳步提升。但仍需要密切关注影响工业经济平衡充分发展的一些突出问题：供给体系质量与消费升级需求不匹配、新技术对传统产业的渗透融合不深入、工业领域民间投资意愿不强烈等，经济继续下行的压力和风险较大，延续低速低效运行趋势的可能性较大，完成全年工业经济发展目标任务十分艰巨，全市工信系统的广大干部职工务必高度重视，振奋精神，迎难而上，恪尽职守，竭尽全力，全面完成市委市政府下达各县区的工业经济发展目标。\n二、2018年工业经济运行总体思路和主要指标\n（一）总体思路\n全面贯彻落实十九大、中央经济工作会议、省委十届四次全会、市委五届五次全会，以及\n2018\n年全省工业和信息化工作会议精神，牢固树立\n“\n创新、协调、绿色、开放、共享\n”\n的发展新理念，坚持稳中求进工作总基调，适应新常态，谋划新思路，把稳增长作为首要任务，坚持全局视野、问题导向、创新意识、聚焦目标任务的运行调节原则，强化重点监测、准确定位、分类指导、精准施策，实现将短期需求管理与中长期供给管理相结合、从侧重需求侧向供需两侧并重逐步转变，实施重点行业挖潜、重大项目驱动、产融合作推进、达规企业培育\n“\n四大工程\n”\n，增强工业持续增长动力，促进工业经济平稳增长。\n（二）主要目标\n2018\n年非烟规上工业增加值增长\n21%\n。\n三、主要措施\n（一）压实各县区、工业园区主体责任。\n督促各县区、工业园区要扛起责任使命，压紧压实工业经济发展主体责任，扎实推进工业经济发展工作，着力营造良好的发展环境。实现责任全厘清，层层建立主体责任清单，主要领导带头，以上率下，重视工业、研究工业、推动工业发展。\n（二）提前谋划，实现高开稳走。\n提前谋划部署，科学有序部署，减少工业经济大幅波动，促进全年工业经济平稳发展。抓好节后企业复产复工，确保实现一季度工业经济\n“\n开门红\n”\n。及早部署和安排\n2018\n年工业投资重点项目，促进项目早启动、早投产、早落地，支撑全市工业经济平稳发展。\n（三）抓好成本费用调查工作。\n市工信委和市统计局联合组织对全市\n94\n户成本费用调查工业企业开展业务培训。各级工信、统计部门要深入填报企业开展业务指导，进一步提高全市规模以上工业企业成本费用调查数据质量，客观真实反映工业各行业增加值率情况。\n（四）抓好在库企业生产经营监测工作。\n一是\n组建\n10\n个帮扶督导组，对\n9\n个县区和高新区全覆盖，每月一调研一督导，继续抓好扩销促产、电力市场化交易等措施落实，强化稳增长政策落到实处，激励在库工业企业做好生产经营工作。\n二是\n加强分析预测，认真研究提出工业经济平稳发展的支撑点，以更加有力的措施确保工业经济增长态势。\n三是\n抓好重点企业稳增长工作。对全市工业经济贡献较大，占比较高的重点企业，实行面对面帮、点对点扶，保障重点企业生产经营中的各种生产要素，确保企业满负荷生产，支撑全市工业经济稳定增长。\n（五）抓好负增长企业扭负为正工作。\n以企业帮扶为手段，进一步优化发展环境。创新帮扶工作机制，切实帮助企业解决具体问题，重点排查帮扶企业存在的突出矛盾和问题，采取\n“\n一企一策\n”\n的办法，分类指导，重点帮扶，加大惠企政策落实力度，努力实现负增长企业达产增效。\n（六）创新工业企业纳规培育工作机制。\n依托全市工业企业纳规工作联席会议制度，比对筛选全市符合纳规条件工业企业，做好政策宣传，企业帮扶工作，推动全市工业企业纳规工作。'
 66 | }
 67 | ```
 68 | 
 69 | 
 70 | ### Font Decryption
 71 | Font encryption is one of the common scenarios for crawlers.With this module,you can only
 72 | provide font file and encrypted strings to get decrypted strings.
 73 | 
 74 | FreeTypeFont isn`t compatible with bytes, so you must change the source code in advance.
 75 | ```shell
 76 | def load_from_bytes(f):
 77 |     if isinstance(f, bytes):    # 直接传入bytes
 78 |         self.font_bytes = f
 79 |     else:
 80 |         self.font_bytes = f.read()
 81 |     self.font = core.getfont(
 82 |         "", size, index, encoding, self.font_bytes, layout_engine
 83 |     )
 84 | 
 85 | if isinstance(font, bytes): # 直接传入bytes
 86 |     load_from_bytes(font)
 87 | elif isPath(font):
 88 |     ***
 89 | ```
 90 | 
 91 | Usage example:
 92 | ```shell
 93 | from CrawlersTools.js_crawler import FontDecrypt
 94 | 
 95 | str_base64_ = 'd09GRgABAAAAAE34AAoAAAAATbAAATMzAAAAAAAAAAAAAAAAAAAAAAAAAABPUy8yAAAA9AAAAGAAAABgZ5ijY2NtYXAAAAFUAAAGogAABqJE+hJPZ2x5ZgAAB/gAAEEUAABBFJTJlvxoZWFkAABJDAAAADYAAAA2ByuHNWhoZWEAAElEAAAAJAAAACQEJgKgaG10eAAASWgAAAD0AAAA9CckDMBsb2NhAABKXAAAAOAAAADgLkc+mG1heHAAAEs8AAAAIAAAACAAeQBhbmFtZQAAS1wAAAJ8AAACfKf0GLVwb3N0AABN2AAAACAAAAAg/58ArwAEA8cBkAAFAAQB9AH0AAAAAAH0AfQAAAH0AEAAyAgBAgsEAAAAAAAAAKAAAv8QAAAAAAAAFgAAAABBUFBMAEAAMZfnA1z/dAGQBCQBVAAEAAEAAAAAAlgDXAAAACAAAwAAAAQAAAADAAAAJAABAAAAAAB8AAMAAQAAACQAAwAKAAABggAEAFgAAAASABAAAwACADEAMgAzADQANQA2ADgAOf//AAAAMQAyADMANAA1ADYANwA5////1P/V/9H/z//N/8v/0f/NAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAEGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABQcEAwIBCAkGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAwAAAAABSAAAAAAAAAAbAAAADEAAAAxAAAABQAAADIAAAAyAAAABwAAADMAAAAzAAAABAAAADQAAAA0AAAAAwAAADUAAAA1AAAAAgAAADYAAAA2AAAAAQAAADcAAAA4AAAACAAAADkAAAA5AAAABgACLk8AAi5PAAAACgACLlcAAi5XAAAAEgACLm8AAi5vAAAALQACLnMAAi5zAAAALwACLnUAAi51AAAAMgACLn4AAi5+AAAANwACLoMAAi6DAAAAOwACLpEAAi6RAAAARAACLpQAAi6UAAAARwACLpgAAi6YAAAASAACLrIAAi6zAAAAUgACLrsAAi67AAAAVgACLtIAAi7SAAAAYwACLvUAAi71AAAAagACLwsAAi8LAAAAbgACTV0AAk1dAAAACwACTWIAAk1iAAAADAACTWkAAk1pAAAADQACTWsAAk1rAAAADgACTXwAAk18AAAADwACTYoAAk2KAAAAEAACTd0AAk3dAAAAEQACTmkAAk5pAAAAEwACTpwAAk6cAAAAFAACTqIAAk6iAAAAFQACTqsAAk6rAAAAFgACTzAAAk8wAAAAFwACTz0AAk89AAAAGAACUJgAAlCYAAAAGQACUMUAAlDFAAAAGgACUVUAAlFVAAAAGwACUXgAAlF4AAAAHAACUYUAAlGFAAAAHQACUfcAAlH3AAAAHgACUmUAAlJlAAAAHwACUp0AAlKdAAAAIAACUqYAAlKmAAAAIQACUqkAAlKpAAAAIgACUxkAAlMZAAAAIwACUyAAAlMgAAAAJAACU0YAAlNGAAAAJQACU1cAAlNXAAAAJgACU2AAAlNgAAAAJwACU6cAAlOnAAAAKAACU9sAAlPbAAAAKQACVkwAAlZMAAAAKgACVncAAlZ3AAAAKwACV0kAAldJAAAALAACWGkAAlhpAAAALgACWKUAAlilAAAAMAACWSMAAlkjAAAAMQACWrUAAlq1AAAAMwACWu0AAlrtAAAANAACWwUAAlsFAAAANQACW0sAAltLAAAANgACXVEAAl1RAAAAOAACXVcAAl1XAAAAOQACXcMAAl3DAAAAOgACXeMAAl3jAAAAPAACX3YAAl92AAAAPQACYV8AAmFfAAAAPgACYc8AAmHPAAAAPwACYlAAAmJQAAAAQAACYtcAAmLXAAAAQQACZIgAAmSIAAAAQgACZKgAAmSoAAAAQwACZOgAAmToAAAARQACZP8AAmT/AAAARgACZm4AAmZuAAAASQACZn4AAmZ+AAAASgACZokAAmaJAAAASwACZp8AAmafAAAATAACZ14AAmdeAAAATQACbCQAAmwkAAAATgACcIUAAnCFAAAATwACcbgAAnG4AAAAUAACc1UAAnNVAAAAUQACdIQAAnSEAAAAVAACddMAAnXTAAAAVQACdlAAAnZQAAAAVwACd2MAAndjAAAAWAACeRwAAnkcAAAAWQACeSAAAnkgAAAAWgACeVoAAnlaAAAAWwACecUAAnnFAAAAXAACepgAAnqYAAAAXQACfEoAAnxKAAAAXgACfh4AAn4eAAAAXwACfi4AAn4uAAAAYAACgAEAAoABAAAAYQACgEwAAoBMAAAAYgACgwYAAoMGAAAAZAACh7cAAoe3AAAAZQACivAAAorwAAAAZgACiwkAAosJAAAAZwACi00AAotNAAAAaAACkRwAApEcAAAAaQAClbEAApWxAAAAawACl8gAApfIAAAAbAACl+cAApfnAAAAbQAAAAAAAAAAAAAAAAAAAAEAbwAAAWkCygAKAAABESMRBgc1Njc2NwFpUkJmLzUxJQLK/TYCZkQeUg0fHioAAAEAPgAAAhoC2AAhAAAAFxYVFAcGBwYHIRUhNDc2NzY3NjU0JyYjIgcGByM0NzYzAZlAQEYpa4gUAXf+JEcubVUbNycnRksmKwFSQ0JtAtg5Ol1cTCtIW0lJY040Sz0cPERBJCQyMV53SEgAAAEAM//yAiYC2AAyAAAAFxYVFAcWFxYVFAcGIyInJiczFhcWMzI3NjU0JyYrATUzMjc2NTQnJiMiBwYHIzY3NjMBmz4+dUAgJERFb2RCTAlTAzUuQkwvKyopSzo3RyYmJSZHSScpCFEJQz5nAtg0NVl5JxQpKj9hPj80P3hULCUsKz5AJCNAICE8OyIjJiZJZzs6AAACABgAAAJBAsoACgAOAAABETMVIxUjNSE1AQcBIREBzXR0Tv6ZAWgE/uEBIgLK/htDoqJOAdpr/oYBegABADP/8gIlAsoAJwAAARUhBzM2NzYzMhcWFRQHBiMiJyYnMxYXFjMyNzY1NCcmIyIHBgcjEwII/qQYBB4vKDNmP0BNSWlfQkoIUQYwK0JLMTEuK08zKCsVTiYCyknvJBIRQENybEVBMzliRCQgMDFKVDAvFhUrAYoAAAAAAgAz//ICJgLYACAAMAAAABcjJiMiBwYdARQXMzY3NjMyFxYVFAcGIyInJjU0NzYzAgcGFRQXFjMyNzY1NCcmIwH8HlEYfFUvLwEEHDEvQGY/PkRGZ4FDPkNGekkvLiwtSkcuLSwsSgLYv3pTTHwKCAg5HR9CQ2dpRkZhWqWsanD+tzAvS0wvMzEzTEsuLwAAAQBCAAACFwLKAAYAABMhFQEjASFCAdX++VcBCv5/AspD/XkCfwAAAAMAKv/yAi4C2AAhADEAQQAAABcWFRQHBgcVFhcWFRQHBiMiJyY1NDc2NzUmJyY1NDc2MwYHBhUUFxYzMjc2NTQnJiMCBwYVFBcWMzI3NjU0JyYjAZ0+OhsfNjgnKkJGenxEQiooNzgdGzo+cU0qJSEoU1EqISUrTFcuLCwuV1YwKysvVwLYOjVOOScrEwIOMDNFXzk7OzlfRTMxDQITKyc5TzQ6QychNTchKCghNzQiJ/7DKyZCQSYpKig+QCgrAAIAM//yAiYC2AAgADAAAAAXFhUUBwYjIiczFjMyNzY9ASY1IwYHBiMiJyY1NDc2MwYHBhUUFxYzMjc2NTQnJiMBpUI/Q0l3xx1RF31SMi8BBBsyMj1nPz1EQ2pHLSwsLEpFMC4sLUoC2GBapqhucL96U1B4CgkHNiAfREBoa0RGRTEwT00sLzAxSUoxMwAAAAEAMgFLA7UBkwADAAATIRUhMgOD/H0Bk0gAAAACADr/pgN4AzUAFQAZAAABByEQBwYjIi8BFjMyNzYRIRMXByEVASEVIQE7EQIsLi2VP0cTdBaBGxv9yy5JDQI1/MICb/2RAmye/lxFPwNCBz0/ASoBqwZ9Rv56RgAAAAABADP/kQO1AzgAKwAAEyE2NyM1ITY3Fw8BBgchFSEGByEVIQYPAiEVBgcWFwcmJzcWFzY3ITY3ITMBHg0S+QEICRBKBgUHBQGv/kIOEAIX/dYJDAsLAdxKmFNaKN7yJmR0iVH+Eicd/vYCADFLQylQBh4dIhZDPj5GHyUhHz5ybR4rP25BOBsoWmRpXgAAAAADADn/xgOvAycACwARABcAAAEzETMRMxEhFSE1IQECByc2EwQTByYnNwFjSY5JASz8igEqAjE8W0FYQP2KPUI2UT0DJ/zlAxv85UZGAlP++sMXsgESzP79EfDeFAAAAAADADD/ngO2AzsAIQAnAC0AABMzNjcXBgchFSEGByE1MxUhFSERFCsBJxYzMjURISc2NyMBBgcnNjcEFwcmJzdO9xwbRxUbAf794khVAQ5JATv+xV9oDzcnL/6mEV1O1gEEY5Euj2ECN181TpIwArg9RhA1PkaNapSUQ/7CXEcFLAEsP2SX/kuIZzxhfnJ5NG6ELgAAAwBp/5UDgAM1AA8AEwAXAAABMxUhESM1IREjESEVIxEhASERIQEhESEBz0oBZ0j+4Ur+4kgBZv7iAR7+4gFoAR/+4QM1tP4qP/6rAVU/Adb+rwEL/vUBCwAAAAABADn/vgOwAzMAGgAAASE1ISYnNxYXByEVIRUhFSERIRUhNSERITUhAdD+jAF6GSVGJBYnAZz+jAFE/rwBl/yJAZf+vAFEAkBHTUwTU0oPR+xD/vRHRwEMQwAAAAEAOv+nA64DCgAWAAATIRUhESEVIREUKwEnFjMyNREhNSERIXkC7/65AY3+c2WcD2YsNf5iAZ7+oQMKRv7wSP6dYkUEMwFRSAEQAAABACP/ogPDAy8ADgAAARQHEgUHJAMCBSckEzY1AhgHVAFeL/7Nalr+siwBnA4CAy9xbf5o1z/JAVP+utdA/QGxiBcAAAMAIf+dA8QDMQAKAA4AJgAAARYXByYnBgUnJDcDIRUhByEVIQYHNjcmJzcWFwcmJwYFJzY3NjcjAg6/9yf2s6L+9SYBCbDaAev+FZ0DJP4pSGb71zw0OIpIPBsc6/6bFBoMYz77AzHVe0WB1MaQRYfK/s1EhUaNaQ4dTjokmm8qKigmFT8HDHJ3AAAABAAk/5oDvwM3AAoAFAAeACQAAAEGBxEjEQYHJzY3FyEmJzcWFyEVIQMhNhMXAgczFSESFwcmJzcBUCc6SDM5F6BJPgENFB1MGBUBBf2mEQFYTDlFNEzj/XvGL0MoQEIDGHZm/V4CNEQ8TbXnp0pIDUVaRf3HxgEyEv7x10cBc/oS4NkUAAAAAAIAJ/+bA8ADNQAKACoAABMGByc2NxcGBxEjEyE1MxUhFSMWFwcmAyMRMxUjFSM1IzUzESMCByc2EyOhLTgVnEVBKDlHlwEJRwER00ezL7hHCpubR5mZC0y5JatLygG3QUBIwPcegGz9cALrrKxG+r873wEV/m9Fzs5FAZH+yMY9rgETAAIAI/+bA7MDNwAWACEAAAEGByc2NxcGByEVIRUhFSEVIRUhFSMRBQYHJzY3FwYHESMB5ERbM4VUSBoWAbD+sAEk/twBKv7WSf6KMDkYo0lEKD1KAl2QbT2e/BBKOUelRqlF6QLCkD85TLHiIHNo/WIAAAAABgAg/5oDwAM6AAoAFQAZAB0AJQApAAATBgcnNjcXBgcRIxMhJic3FhcHIRUhFyEVIRUhFSEFESM1IRUjERchNSGaMDMXkkRDJDRHmwE0Fh5FHhMmATv9dVwB2f4nAdn+JwHrRv6ORkYBcv6OAcxGOU207B91ZP1YAxM8OBNAOQ5EUUFMQlL+pTY2AVvhngAABwAk/5UDuwM0ABcAHgApAC0ANwA9AEcAAAEGByEVBgcWFwcmJwYHJzY3JicGByc2NxYXNjcjDwEnBgcRIxEGByc2NxczESMBBgcGByc2NzY3FwYHJzY3ATY3NjcXBgcGBwKJDBQBHjFlWHIckWJpkRt4XEAkHCcpey0bT1kz+wUF+CUzRTA2FZVDQUFBAcQ4Oz9mI1ZDOTWPd+kd3XD+iqpra0szQ4aLjAMrJCc+Y0gnG0MkNjkkQRwrMTkbHTZXfuYzOUoGB4N0YP1bAjNFO0i06LH9ygEfLyAhITkWHh0ob5VCPTqF/pYtPz5jJmhOTSYAAAMAKv+fA7oDMAAlACsAMQAAATMRIRUhERQ7ATI3NjcXBgcGKwEiNREjFQYHBgcnNjc2NzUhNSEBBgcnNjcEFwcmJzcBz0oBkP7hKnkhDxIGRQofGzycXKkETUqrKaVBPQP+8AGPAY8zU0FRM/3+QkM/TUQDMP6NR/6xLRUbgBaeIh5iAV4Hu3JoO0A5XGKZB0cBI3t0IXZ4cX0he2sgAAAGADT/nwO1AzUAEwAXABsAHwAlACsAABMzNTMVITUzFTMVIxEzFSE1MxEjEyE1ITUhNSE1ITUhEwYHJzY3BBcHJic3RqlJAXhJqam8/H+7qfIBeP6IAXj+iAF4/ohnerUtsncBsogolJokAtlcXFxcRP42Q0MByv42cj5vPm39rVxHOkNUQFQ9YTc2AAMALP+cA7sDKQAJACQAKgAAAQYHBgcnNjc2NwM1IQYHBgcGByMnFjMyNzY3IQYHBgcnNjc2NyQXByYnNwGoRkdJfydpUEs7rwJfBBAPIyNNuhNaUFMbGQn+9SQ/SZUshkM9HwFg3DDlWj4C+YFPU1k/Pl1Vcf5CRbZ6dSssBEUDRkbtpmBpVDlLV1WT5oU5lesiAAADACT/nAOKAy4AHgAiAC0AABMzNQYHJzY3FwYHFTMVIxUWFwcmJxEjEQYHJzY3NSMlMxEjFycWMzI1ETMRFCM6x19PFvaiI1RaublgYCtVQElHdiCaQ8cCHEhIRA9gKixJXQIjiBEFQxRBPSEVl0UuU2VAbEL+NgHCkHlTiK0B9f2WxEYEMQMW/NhhAAMAI/+bA5YDLgAsADAAOwAAEwYHJzY3FwYHMzUzFTMVIxUzFSMVMxEUKwEnFzI9ASMRIxEjESMRMzUjNTM1JTMRIxcnFjMyNREzERQjnBskOksVRAgLdEi7u9jYw1o1EjErfkh/RsXy8gFbRkYkEFA6LkdcAm88JypRjgszJXx8Q4FDaf74WUQDJ7b+nAFk/tsBaGlDgWX9lsVFBTADGfzVXgAAAwAz/5gDowMuAAMAGwA3AAATIRUhByEVIwYHNjcmJzcWFwcmJwYHJzY3NhMjATMVMxAHBiMiLwEWMzI3NhMjFQIHJzYTNSM1M1oBav6WJwG35zsudXcpJD1JPj4NEZ68EQkHLUuLAkxJ2w4PeRpCEEwXQAsKA5QDzzm/A3x8AutFyUbiZBgsVj8egpYgIyY5IUQDAz4BEAGXtv4XeXADQARNUAGxEv5FzjK6AZ0SRQACACf/mgOvAzYACgAoAAABBgcRIxEGByc2NwEiNREGByc2NxEzETY3FwYHERQ7ATI3NjcXBgcGIwGcM1FIQFIX0mABEms6RyBeQ0iceDKLuzSgJxMQBkMLGRtIAxZ7a/1qAj5GREu06fx2agExISJELCcBkf6baIc5lXb+ri8ZF5YWrSQkAAAAAwA2/5sDrgMzAAsAKQA0AAA3ITUzFSEVIRUjNSElIj0BBgcnNjc1MxU2NxcGBxUUOwEyNzY3FwYHBiMlBgcnNjcXBgcRI0QBikoBjv5ySv52AiZaNl0oYFtFl2UxgqsqligQFgdEDCIeQP2sPUQXvFY9L0JG3WJiRvz8t1dOFyE8HyjlxUpcMm5SXyMPFmEWfhwY5DUtSnueHFlI/rUAAAIAN/+dA7EDNwAcADgAABMhNxcGByEVIQYHIREUKwEnFzI1ESERIxEhNjchBBczNjcXBgczFSMVMxUjFSM1IzUzNSM1MyYnNzcBngxJAwgBkv5kCQwBfFliE10q/YBHAUkOBv5sAW0baSIaQhofcsjm5kbl5cVuFx9AAtNkCiI4RDYv/dhfRgIrAdT9twKNRCH/Rz9DGDkxP2lAtbVAaT83LhgACAA4/50DsAM2AAMABwALAA8ALQBAAEwAUgAAARUzNTMVMz0CIxUjNSMdAiMRMzUhNSE1MxUzJic3FhczFSEVMxEjNSMVIzUFITUzFTMVIxUUKwEnFjMyPQEhATM1MxUzFSMRIxEjABcHJic3AcWfQ6CgQ59D4v7/AQFDeQ8ZPhYPUP7840OgQ/7lAaxGdXVRZw81KSP+VP7vaUdlZUdpAe48LTZdKwGzQUFBQTNCQkJCp10BfkI+UVEjKQotKT5C/oJdVFSLOTk+jExDBR97AcX09Eb9qAJY/j49LjxCKgACACz/lgOxAwUAGwAlAAABIzUhFQYHMxUGBxYXByYnBgcnNjcmJwIHJzYTADcjNjchFQcWFwEprAJUGx3KLJF1li+jcYHFKrt6jEMyzzf5BAHCLdkhIv7xATbCAr5HL3tmKs2NVkFASlxnQ0M/W4Wj/rKuMc0Bxf61nWOncxjrpwAAAAADACn/mQOvAzgAIwAoAC4AABsBFwczNjcXBgchFSEGByEVBgcWFwcmJwYHJzY3JicGByc2ExcWFzY3EhcHJic3dE9HOa8eEkkSHQHU/hkVEQGZKXp5ryXAfnm4JqdxdDJtrCXaaTEzeG8qKUozPXMwAf4BFgzFdn8JdnZFRys+gmhIPENHUlVIQz1KWWrsjEK/AU+0ZldXZgGWVTNNXC8AAwAy/6MDtQMNAAMABwAdAAATIREhJTUhFQc1IRUhByEGBwYjIi8BFjMyNzY3ITfPAkr9tgIA/krnA4P9qyIB6QklJGZKOxhQPVETEgn+Ci8DDf7yQYyM3EVFde8wLQZCBiEhh7YAAAAEAB//nQPGAzIACgASABYAGgAAARYXByYnBgUnJDcBESM1IRUjERMhNSEDIRUhAg6/+Sf6saL+8yYBCrEBVkb+FEZGAez+FA4CCP34AzLdfkWH2MqXRYzR/f7+bTw8AZP+7s4BCEUAAAMAbP+eA4QDNgADAAcAGwAAASERISU1IxUTBgchERQrAScXMjURIREjESE2NwE9AXH+jwEr5XwSIgG5W5YUkiv9eEgBEyQSAc3+okTW1gJ5TUD9ZWJJAywCRv1EAwFHUAAAAAAFAET/lAOaAw0ABwALAA8AGQAfAAABESMRIREjERMhFSElNSEVEwYHBgUnNjc2NwAXByYnNwM4Sf4QSTQCFP3sAcz+e/EVSE7+6Rz0T0YMAR+iJqyxJgHH/pQBKv7GAXwBRvdBdHT+5txOXhlBD1JOt/7gUzxdMjkAAAAAAwAt/6MDjgMnAAcACwAqAAABESM1IxUjERMzESMFMzUGByc2NxcGBxUzFSMVFhcHJicRIxEGByc2NzUjA45G80VF8/P98q9LThffliJTTqurV1gqUzJGQGsejTyvAu38109ZAzP9agJTjJAMBkASOTweEZ5DMkxjPnE5/j0BvY90ToSoBAAAAAQAWf+eA48DAwATABsAHwAlAAATIRUjFTMVIxUzFSE1MzUjNTM1IyURIzUhFSMREyERIQAXByYnN/wB79Kzs+/92e6zs9MCk0n9XElJAqT9XAITGzgeJDkCZEGQQJ9CQp9AkOD8mysrA2X9CgKy/ks6GkQxEwAAAAIAI/+ZA7EDOAAPACQAAAEzNTMVMxUjESEVITUhESMnBgcnNjchNSE2NxcGByEVIQYHESMBZ+VJ7+8BHP2NAQ7loTw6Lbtp/v8BIyETSxgXAff96TpSRwFfurpG/uxHRwEUGj4wO5rIR0tECVA2R3Zn/hQABQA1/74DtQMyAB8AIwAnACsAOwAAEzM1MxUhNTMVMxUjETMVIxYXByYnIQYHJzY3IzUzESMTITUhNSE1ITUhNSETMxUzFSMVIRUhNSE1IzUzULFGAVlGsbHA0EiXHbpO/sZOriWUSNDAsfcBWf6nAVn+pwFZ/qeKRsvLAVf9BQFezc0C5kxMTEw9/rVAWjM+Tn19UT05WEABS/61SDlHOUr+UlBAakNDakAAAAEAN//QA7EDMQAPAAATIREzESEVIREhFSE1IREhNwGZSgGX/mkBcfzUAXH+ZwHuAUP+vUX+bUZGAZMAAAIAYP+sA5cDNAAWAC0AAAEGByEVAgUnNjcmJzcWFzY3IQYHJyQ3Ehc2NyEGByckNxcGByEVAgUnNjcmJzcCMBseAR+i/kkfiGovMz85LZxV/tx1oCoBDIMXMKVh/tiGsysBRp0+KSQBEtX9ux3sqDY0QAMSHhs5/uAjQQkfOS8eNjg/fEUkQjOP/VA/R4ZAH0AhpiIpGzn+ng1FAjhHMx4AAAAAAQAp/5kDvgMwABYAABMhNjc1MxUUByEVIRIFByQDAgUnJBMhPQGFCwJKDQGf/nZpATQt/tJwXf7ALQE/Tv6HAipTYlFBYGVG/q23OroBUv6UpzydAXIAAAQALf+dA7gDMAAUAB4AJAA7AAATITY3MxQHIRUhFgUHJCcGBSckNyEBMxEjNQYHJzY3JhcHJic3JQYHMxUGBSc2NyYnNxYXNjcjBgcnNjdFAYsFAUYFAYz+hWMBLR3+tVdc/q4eAUFN/ooBC0pKb4sdoXZ6SCdSVScCShIR8kv+riNzVTUnMi8vVibcN1EsjTMBAyQwKStGoTlDSLOwTkNEmQJv/jOPOy1CMD+cPzxJKDklKxs9410/Hi01HiUnMD9RPDE6UXMAAwA5/5sDqwMqAB4AOgBDAAABFSM1IwYFJzY3ITUhNQYjJyAlFwYHFSEVIRYXByYnASE/ATY3FwchFSMGBxYXByYnBgUnMjcmJzY3IyEGDwIWFzY3AhdIDF7+/SnoWf7lAXCdhRYBiAEDI4KsAXb+32nWIP1r/iUBHQ8PEgpGLQHptTdGcZwsh5qg/vco45eDfjIq6wFADxQQEGh1TTECLbOzfl1BRVVDUQdAMD0bDFZDXThEU4b+8hYWGhAVQUNlQCQ9OzozZQY/RykcPTcTGRUTFyI5VAAAAQA//6MDqQMFABkAABMhNTY3ITUhFQYHFSEVIREUKwEnFjMyNREhPwGlf3f9zgKUeJcBfP6EW6wTS1cv/lsBg45JZ0REcmNpRv7DXUYDMQEmAAAABABH/6ADogM7AA0AKQAvADUAAAEVIzUhFSM1ITY3FwYHBSEVBgcVIRUhFRQrAScWMxYzMj0BITUhNTY3IRIXByYnNwQXByYnNwOaR/1BRgI9QTFJNDn+DgIbZncBff6DVn4RIhYhHSb+awGVfkH+TDAsPjQ2QQEmK0AtM0ECZuCbm+BfchtrS5pAQTozRKlRQgEBKpBEUzckAU1OH1lDIDtXIF1IIAAABAA//5YDpgM7AA0AEwAZADQAAAEVIzUhFSM1ISYnNxYXAhcHJic3NhcHJic3BTMVBgchFSEGBxYXByYnBgUnJD8CITUhNjcDjEn9YkkBfBUWTBIU1VolZGgkpV8kZWIjAS9JBSYBRv6bDBTOsii/xHn+/icBFWwMC/5TAdIsBQLJyIWKzTsrDCtH/tA9OUgmNXA/OUglMwtvg2NDFhtKbD55SXZGPUp4EBBDYoQAAAACAEn/nQOxAzgAOgBIAAAABxYXFAcGKwEnMzY3NjUmJwYFJyQ3JicGByc2NyYnDwEGByc2NzY3ITUhFSMGBxYXNjcXBgcWFwcmJwIXIRUjNSEVIzUhJic3Am4aGQEgJYEiGDBZFhsCAqf+8CQBLKEOD43zJP+GDhoLC2epJZFzQyT+9wJX5TMfNR+MfTFSWkagM6VIgRIBeEf9QkcBjBATSQEYB0tRdi40PwIZIV0gEoNTPliFLB9vRT5DZxcdBQYwMDsgMR4UQ0MjEjY/J1IxNiO8cTuGzQHyO65ra64tJQ0AAAAEADH/mQO2AwoAEgAmACoAMAAAEyE1MxUzFSMVFCsBJxYzMj0BIQEVIRUUMyE2NzY3FwYHBgchIjURFyE1IRIXByYnNzECYUjc3FuAEEYxLP2fAu79wysCFx0MCgZEBw0VP/26WUgB9f4LlUk2OmwzAQc/P0bUVEcFJMICSe5GKwETEzwWQSAsA1QBTqxr/Y9SNkpYMQAAAAEAQv/LA6UC7QALAAABITUhFSERIRUhNSEB0/6kAwP+owGI/J0BkQKnRkb9akZGAAEAOf+eA68DOwAfAAATISYnNxYXIRUhFSERFCsBJxcyNREjESMRIxEjESE1ITkBmRcfThcaAZT+agFFW2QSXyr9SvFHATj+agK+PDUMMUxFg/6VW0UEKgEW/e0CE/54Ac2DAAADAEv/nAO7AywAGQAdACUAAAEhFSMVMxEUKwEnFzI1ESMRIxEjESMRMzUjBTMRIxMzEQIHJzYTAYoCMfXOTEQTQB6JRolFzvb+zUZGsEYEyDa5AwMIRnv+FlNGBCABmP2cAmT9/wJEexn98gKR/iT+1oovegELAAACAC7/mQOsAzoAHAAgAAATBgcnNjcXBgchFSEVIRUhFSEVIRUjNSE1MxEhNQMzNSP8P18wmDRLDhgCbf7GARf+6QFa/qZJ/jKlASnh4eECe2tHPHLDDjI5RqBF2Ebf30YBHaD+Q9gAAAEAKv+UA7EDQAAPAAAAFyEVIRUCByc2ExEhJic3AikcAWz9GwZhO1UDAXQXIE8DDEVF4P63xTWtASwBJTc0DgAEACr/nAOxAzkACQAPABUAJQAANyE2ExcCBzMVIRIXByYnNxYXByYnNzYXIRUhFQIHJzYTESEmJzffAaFdRUdBWtP9PqgxRCo9P/MrRSE2Px4YAW79GwZhO1UDAXMYHk8N1gE5E/7c2EYBh9IRw6kUXNgQw7QU1zhG3/61xTWtAS4BJTAqDgAAAwAm/6ADvAMuAAUALAAwAAATBgcnNjcEFzY3FwYHMzUzFTMVIxUzFSMVMxUhNSE1IzUzNSMGByc2NwcmJzcnMxEjgwMfOyAEARsaNAdBAwmPStray8v0/aQBHuXlnBs0NhENMhciNY9JSQJskowRgZAyYoSTCT85u7tG8Uf0Rkb0R/FuXDAhHg1dUxGp/HIAAQAt/5wDwwM0AEYAAAAXBzMVIRYXFhc2NxcGBxYXFjM2NxcGIyInJicGByc2NyYnJichFTMGBwYHIi8BFjM2NzY3IxUCByc2NxEhJjUzFBczJic3Ay80Lnr+zAocBw1KL0E9XxYXMh4cFkEkQz9IHBZojiiXZxcPIAz+qfUDFBNLFUAWOyghCQgDrQZwN18FAZ0DSgO7LUwuAv85LkW3bx0pdqMcyIYxIkgCmiTIWiUueUA8RII8PHnBpvxLQAMEQQMBKy2rHv70ozGJ9QFNS1BTSDY6LAAAAwAj/5kDxwMwAB4APABBAAAXJxYzMj0BBgcnNjc1IzUzNTMVMxUjFTY3FQYHERQjEzM1MxUzFSMVMxUGBxYXByYnBgcnNjcmJyM1MzUjFxYXNjdKDysgKUE5EkFLeXlGampDKTA8V+XwRvHxyjR6YpAmlmd1qCKgbX0qNd3wiyZuaDFZRAUt7xoTRxAc7EW3t0XPIRdGGxz+4VoC+Y6ORaNApXVMOzxCVV08PDVVeptEo+eCaWWGAAQAJP+kA7oDMQAeAC4AQQBHAAAXJxYzMj0BBgcnNjc1IzUzNTMVMxUjFTY3FQYHERQjATMVMxUjFTMVITUzNSM1MwMhNTMVMxUjERQrAScWMzI9ASEWFwcmJzdLDzYYJ0wvEkhFenpKZ2cwPDY2WAHQSsbG8P3d6cTE8AFmSImJWIcQWCYp/pqwMzkvVzhbRgQp8hoOSBEW4ka9vUbGFB5JGRf+4FkDjGtBeEREeEH+jEtLRf7xWkUEK/2ARilIWyMAAAAHACz/ngPCAzAAGwAgACQAKgAwADYAVgAAARUjFQYHFhcHJicGByc2NyYnIzUjNSE2NxcGBwUWFzY3JSE1ITYXByYnNwYXByYnNyUGIScgNwEnFjMyPQEGByc2NzUjNTM1MxUzFSMVNj8BFQYHERQjA6Y6M3Npkyqma3e2JqhuZiwpMAGFNihDKyz+0C1eYzX+gwG8/kT5GDwaKj6EFjwbKD4B3Nb+oxUBS9v8zA8rIho4LRE3P29vR11dExosKTBIAf++KnhZPipAMkpJMUMoPVd0Lr5XaRlkQ+xcSEddQmWwPBZERhBNOBZEQhFCKUEk/HtFBRn8Fg5HDhfwRbe3RdIJDRhJFhf+1UgAAwA5/5oDxwMyAAYAHwAwAAABBgcWFzY3MwIHFhcHJicGByc2NyYnBgcnNhMXBgchFQE2NxcGByc2NREzNSE1IREjAlQLDTBPVgNFBm5fgymFXmKkLatgTDEfISxtJ0YKEgE//QqEfwukuxUg4P7uAVjgAlggI62Dm9j+/bCESzxLhH9NO0uDe587KziYAQgMPktF/d4sPkJOPT4MJwF07UX+jAADACX/nQPGAzIAGAAfAF8AACQnBgcnNjcmJwYHJzYTFwYHIRUjBgcWFwcBDwEWFzY3JTM1MxUzFSMVMzY3FwYHMxUjBgczFQYHFTY3FQYHFRQrAScWMzI9AQYHJzY3NTY3IwYHJzY3IzUzNjcjNTM1IwMrR1ePKJZRQCIcJytqJkcLEgEBPgdZTHAs/vkKCiQ+QAb9QINFa2s6Tjs4NztKfSsnqjNFXVVOZFVkEj8bK29tCm15ODGbT0wtLC4XajEt+LKDCXSFWUFXinqONzI2mQEDDEZIReyrgV8/ArYcG5p8krt/WVlAZmN2HWhUQzMnNDw8MQ4XQxUOdFdBBS5XDQpFBQ5JKjRBKjMZH0ApMUNmAAACAC3/mAO+AzgAGAAdAAATISYnNxYXIRUjBgcWFwcmJwYHJzY3JicjMxYXNjdAAZ4pO0g2MgF+nEqTnvEn+KKr/yb9oa1EmuFAnIU/AohRRRpEbEjun4ZZPF2LnUo/RJKq6MqWkc8AAAYAJ/+fA8QDLQAFAAsAIgAuADQAOgAAEhcHJic3JQYHJzY3ATMRMxEzFSMVFhcHJicRIxEGByc2NyMBMxE3FwcRIxEFJyUCFwcmJzcSFwcmJzecFjgaITwBTQkmNyAJ/quzSZWVRlErQStJQ2EegTqsAs1Hbgx6R/7fCgEraUUxPGkwSkExPGYwAnxtEHFbEwV5YhJWe/7nAVv+pUYaP1k/Yjb+bAGXkmpLfokBoP3JEUgS/vMBAixILQGxSTFHTy7+yEoySlIuAAAAAAUAIv+YA8QDPQATAB0AQABGAEwAAAEGBxUhFSMRIxEjFQIHJzY3ETY3BTMmJzcWFzMVIQczJic3FhczNjcXBgczFSMVMxUjERQrAScWMzI1ESM1MzUjFwYHJzY3BBcHJic3A6KHngFHbEeUCGszWQezlPy/wRkbShgYvP49CnMWH0AeF24cGEEaGnHBsLBXNg8bEie6usqYJkc8RiEBTxY8Hig8Auc0CbZF/eoCFnf+/Zg2geQBrANATTIlDC02QqE7Nhc6Tj5MFkUvQXBC/ulVQwUoAQZCcPaVcyZrhWFKF2NWEAAAAAABAEH/mQOvAzUAJgAAEyEmJzcWFyEVIQcVIQIHBgciLwEWMzY3NjchBgcGByc2NzY3NjchQQGvIClEKSUBdv3xCQGfAxQedhtYFk40TBIIA/6iEzdJhDSgNDIIBQH+6QKdRzgZPlpFsgL+2VaABARBAwNpN9h3ZYFqMohycpBfMgAAAAADAIv/mQMtAwkAAwAHABkAAAEVIT0CIRUDBgcnNjcRIREUKwEnFjMyPQEBOQGr/lUEEmE3YgMCPVWBFFwiIwHStLRDr6/+xsN/MobQAej88lZHBCzHAAAACAAu/5kDkQMuABMAFwAbAB8AOQA/AEQASAAAEzMVMzUzFTMVIxEzFSE1MxEjNTMTMzUjEyMVMwczNSMSFzY3ESERFCsBJxYzMj0BIwYHJzY3ByYnNwcGByc2NwEVBzM9AiMVfUbDRjw8Qf4nST4+RsPDw8PDw8PD9ShiAwFMWGoUSh0nwhVjNwICNSdEOIk6SjlNNgHbAb69Ay5wbm5D/llERAGnQ/4WYgFFZKJj/kdGitQB1Pz2XEcEM8fFhDMCBCZMWyMbbU8pS2QBX6YMskKvrwACACr/ngO6AzMAFwAdAAATITUzFSEVIRYXByQDIxEjESMCBSc2EyEkFwcmJzdMAYRIAYX+w2P3MP75ZwRIBG/+9Cf0av7EAscwOTVUOAJE7+9F+sI94QEY/Z8CYf7Ix0CuARHVRSlQWCMAAAACACf/mAPCAzAAFgA4AAATMzUzFTMVIxUWFwcmJxEjEQYHJzY3IwA3NjURIREUOwEyNzY3NjUXFAcGBwYrASI1ESMRFAcGByc6lUh8fEhFKjkqSDNWH3QxkgFrGRQBQxciEgoMAwdCBwYYGS8pYLIbGUc7Am3Dw0UzS1U+Xzr98QH8kn5Ok7r+BGpemAF7/Q0cBgkUI4oVYUM1FRFZArb+yaVvc201AAACACb/oAOvAzEAFwAuAAABNSE1MxUzFSMRFCsBJxYzMjURAgcnNhMlMzUzFTMVIxUWFwcmJxEjEQYHJzY3IwHVAR9Ic3NbahBEHSxp3yboZv1moUiFhU9NKkIwSDVfIXw2ngImRcbGRf3lY0YEMwHP/um6QbIBGEfDw0U7Slk+XTn9/gIMoIVSk7wAAAUAIf+gA7oDMAAFAA8AJgAqAC4AAAAXByYnNwUGBzMVITUhNjcBIxEGByc2NyM1MzUzFTMVIxUWFwcmJxMhFSETIRUhAj4uQCw2PwFqLDOI/hEBHDkq/edJNVUfdTCOkkl4eElLKzcycAI3/clKAbD+UALTXiBdUx8NiF5GRnWK/HACBJh7TpC5RcPDRThKXz9WQv54RgFbRgAAAAQANP+ZA7gDMQAjACkALwA1AAABMxUhFSEVIRUhBgc2NyYnNxYXByYnBgUnPwE2EyM1ITUjNTMkFwcmJzcSFwcmJzcTBgcnNjcCRkoBAf7/ARz+uGBIv7IlM0VaREQVFdz+8REIB0dy7QEd7e3+mkUzOmwzQz40NmIztUFPSFo9AzGhRrtG5GAUJk5VF5erHjYwLBo/BAM+AQxGu0ZITDRGUzH+yEwzR1Ux/tC/pR+xrgAAAAkAJ/+WA8IDNAAFACAAJgAsADIAOABAAEcAWAAAABcHJic3JzMVBgczFSMWFwcmJwYHBgcnNjc2NyM1MzY3AQYHJzY3BBcHJic3FhcHJic3BBcHJic3AAcWFzY3Jic2NyMGBxYXNxUCBSc2NyYnBgcnNjcXDwEDcyQ4JD81pkcBBdbCLbUnqjwRF0GSJ4s3GQ2BiwUB/mgkOUc7KQEAD0cLFEP1HUgZJkUBCj1APEc8/Z4iU0EhG01JyhSeEBFLSHVZ/rMieFhCTxsVLKc7RgkJAt05Jz5JIQpzMy1GuWs+brM8LndBPkFqOUFGLzH90o5qGWKPhW4ReW4OYoQRimkPXoAhh28dAWUpIigqLSoXLUolHxgkwDv+kZw/MlYsIRwTOZTXCR0dAAAAAAIAJf+cA5kDMgAmAEUAAAEGByEQBwYjIi8BFjMyNzYTIwYHBgcnNjc2NyMCByc2EyMGByc2NwEGByc2NxcGBzM1MxUzFSMVNjcVBgcRIxEGByc2NxECOhEUAYQPEJIVNhFAElwNCgNEITpGhyx8QDodTjyoL5o5RSgzNWAq/pYSFj84CD8DCUZGVFQpJxo2RlVIEVpUAyBTPf4LdYQDQQRgUAG4/JCeezlwjIvl/rqVNoMBIl8/LXjh/uhRQRKdsQVDQNDQRvATFUQOGv62ASwhF0YVIQEOAAYAJv+6A7oDCgATABcAGwAfACMAOgAAASERIxUzFSMVMxUhNTM1IzUzNSMlNSMVIzUjFTUzNSMzFTM1ATM1IzUhFSMVMxUjFTY3FQYHJzY3NSMBqQHhzNzc/P3A/N7ezQGch0iIiIjQh/z8a3kBO3htbTw8ia0SP0drAwr+MXpDf0VFf0N6QIiIiIjGiYmJ/vXsRETsRuAVG0U8LEUNE/gAAAAAAQAq/74DswMpAB4AABMGByc2NxcGBzM1MxUhFSEVIRUhFSEVITUhNSE1ITXxPF0ujjFJEBfWSQFJ/rcBNf7LAYn8jwGf/toBJgIxiF07lPoNTUSyskbnRv5ISP5G5wAABQAq/6UDdgMJAAMAGQAdACEAJQAAExUzNQUGByc2NxEhERQrAScWMzI9ASERIxElNSEVJTUhFScjFTPV/v7+EV83YAMC6WlzE1AfN/7wSgFa/vABEP7wSv7+AdSvr/HBfTGGzQHg/RNpRwQ/rv7bASVCr6/yrKysrAAAAAUAgf+rA74DLwADAB8AIwAnACsAABMVMzUVIxUjESE1MxUhESEVFDsBMjc2NxcGBwYjISI1NzM1Izc1IxUnIxUzyfb2SAE+SQFE/rxC5CQTDwZECRocRf7/ekn9/f39Sfb2AY2WltlVAkeJif4Oe0oREmUXgRoag8mWQpOTk5MABQBS/6gDmQMzAA0AEQAVABsAMgAAAREjNSMVIxEzNjcXBgcDMzUjNTM1IwAXByYnNyUQBwYjIi8BFjMyNzYTIQYHJzY3FwYHAZRGtkZdHQ5IExxbtra2tgISPDk9aDoBVw8PfQZgET8uRgwJA/7tKzkraCdHFBgChP1WOVYCx1JcCl9F/dLbQc7+5WMoY3sk5/4NdXMEQgNNVAGxYEs5kdcJYUYAAAAABACX/50DUQL6AAcACwAPABMAAAERIzUhFSMREyE1ITUhNSE1ITUhA1FJ/dhJSQIo/dgCKP3YAij92AL6/KNBQQNd/SqtRK9ErAAAAAYAMf+dA7kDMwADABoAHgAiACYALAAAATMRIwcGByckNyYnNxYXByYnBgchESM1IRUjNyE1ITUhNSE1ITUhEwYHJzY3AcRHR+RZOB4BzvQ6OTCabzImO5TfAY5G/itGRgHV/isB1f4rAdX+KzJahCqAVQMz/wC4EwpBS6cuKS5rbzMrMmlJ/estL29WOlA6UQFWb1Q2UWkABAAi/5sDwAL/AAMAGQArAC8AAAEjETMVIwIHJzYTIzUzESM1IRUjETMVIxEjATUhFSMGBzMRIxUjEQYHJzY3EzMRIwMEnJydDKc1mQltblgB21x3d0X9QAFanhUpuaxCHyYmdCYTaWkCu/7rRf7LkTCBARVFARVERP7rRf49AxdGRndr/jtPAaI5Mz6n2/2YAUYAAAQAIf+YA5sDLQAPABMAFwA1AAABMxUzESM1IxEjESMVIxEzAzMRIxMzESMlNjcXBgcVMxUjFRYXByYnESMRBgcnNjcjNTM1BgcCi0nHRoFJfUXCfX19xoGB/WO8ihlEPoGBSE8oQC9GN1gedDKTmjdSAy2w/jsz/rEBTzMBxf6xAQv+9QELrRIyQxUOpUUyRFs8YDz+LwHMlHNLhqxFlwkKAAAABAAj/58DwQMtAAsAEQAXADUAACUHESMRBSclETMRNwAXByYnNxIXByYnNyUzNQYHJzY3FwYHFTMVIxUWFwcmJxEjEQYHJzY3IwPBZUn+yAoBQkla/uZJMUN0M1RHM0F4M/4tpEpLDsCaGTxMiopXVSlTMEg9ZB9+OpzNEP7iARIzRjQCNf3XDwGaTjJLUDH+xk0zSlkxEJQLB0QPLkQRDqBFMUZSPWEv/jcB05V2UICnAAAABAAj/6ADuQMiAAMABwAbADkAAAEhESElNSEVByEVIxUzFSMVMxUhNTM1IzUzNSMlMzUGByc2NxcGBxUzFSMVFhcHJicRIxEGByc2NyMB1wGf/mEBWf7ubgHx2ru78v3P9ba2zf6Kkk02DryQGT1SgYFHSCg4L0U0Vh9yMIsDDP7fQaCgnkKAQYdERIdBgNORCwZEEi1EEhCdRTVFVjxXPv49AcOSd1CGqgAAAAQAO/+eA68DNQAFAAsAGQA+AAAAFwcmJzcHBgcnNjc2FyEVIzUhFSM1ISYnNwMzFQchERQ7ATI3NjUXBgcGKwEiNREjBgcGByc2NzY3IzUhNjcC9YAniZQknXy5H7N2fhIBckj9WkgBeBMXT21JAwECHkolCQZCCQgQVVVaxRc4Vb8nuksoEvsBBgICAhNuPHhJNi52TEFDcKY4yYaFyDAnDP6MSTf+xBwnPEMTgBg/VAEEbkprPD47XjtNRBMkAAQAM/+cA64DPgAiACgAOQBKAAATITUzFSEVIRUhFSMVMxUjFRQrAScWMzI9ASE1ITUhNSE1IRIXByYnNwMGByc2NxcGBzMVIxYXByYnJQYHJzY3FwYHIRUjFhcHJieoAShIAST+3AGWv5SUWZEPUDcq/c0CM/2SAZf+2MMtODBWOk4mLz5bKUUOC/+vKRpBJSgBdxwkPEogRAwMASK1KRg+JSoCBz4+QF4/YUGZU0QFJYhBYT9e/lQ8J0BQJAJASDonbX8PKxo9NC0YRjMCODAlYnEOJCE9NC4YQzcAAAAAAwA6/58DqgMlADQAOgBAAAAFJxYzMj0BBgcnNjc2NwYHJzY3NjcGIycgJRcGBwYHNjc2NxcGBzY3Jic3FhcHJicGBxEUIyQXByYnNwUGByc2NwFfEDwrJa2MFB4he5lpsA8XGkxlf5AVAa8BHSOg0Ftiaok6UUL9vf2mMB42fEE6GShajlUBdWYzXJ4w/rZlly2TY2FHBSX1CwZAAwo2cggLQAMKJm4GPz89JRBqQAUMLkkh32MOFD4jIZFpKSw7Cgz+9FW3bzNqdy40elw6V3IAAAAEAC//vgO9AzIACwAbACEAPgAAJSM1IRUjFTMVITUzAgcnJDchNSEVBgcWFwcmJwEGByc2NwMGBzY/AjY3FwYHNjcVBgcnNjc2NwYHJzY3NjcCc8QB08Xz/dLxJ6ctARht/qoBrCdFgVU1ToL+wIyzCruOWkxLR0QODxIMQoheZ2qYkRMSDj1SakASEQ1PO+FGRt1GRgGXQTtpwUU5WUpSTTVPV/48Ry5FK0kCncJwBgwbHCEYGfpvFiZCNRRCBw0+jA8IQggThrAAAAAFADL/ngO+AzoABQAgAD4ASABaAAAlBgcnNjcDBgc/ATY3FwYHNjcVBgcnNjc2NwYHJzY3NjcTPwE2NyM1MyYnNxYXMxUhBgc2NyYnNxYXByYnBgcXBgcGByc2NzY3ASI1ETMRFBczNjc2NRcUBwYHAX6Oqgm5iFtLTkJCGxxBhFdraJmVFBEQPFBoPBMRD089sggGMlGr/xQZSRcV4f7OQTSTgyMkN2k1OBQatOGXBygsZS5ZJSQGAR5aRRdBIQYHPhQUQTtHKUgpRAKXwHIKDDM4GflsGy5DQBlCCA4/jBMJQgcVhbT+iQMCJZxFNzAMNT5FgT0NFTMuIoZfJyYnHREpvFpdQjk5TE6p/ltXAVf+tR8DAyExURaNIh8DAAAFADT/nAO2AzQAFgAeACIAJgA0AAATISYnNxYXIRUjFhcHJicGBSc2NzY3KQEGByQ3Jic3ARUhPQIhHQIjESERFCsBJxcyPQE0AZ8TF1MSFgGS64hKPCMo5P5eEBIRRFb+9wFjWEgBFNMnJzL+UAHH/jlIAlhgZhNhLwLYKiUNIzlEbFMqKiYTED8DBypJUiwJESUfIP47T08+VVXLpgIE/ltdRgMpOAAGADH/mgOxAzEAAwAHABUALQBFAFkAADcVMz0CIx0CIxEhERQrAScXMj0BEyI1ETMVNjcXBgcVFDsBMjc2NxcGBwYjAxQ7ATI3NjcXBgcGKwEiNREzFTY3FwYHAhcHJicGByc2NzY3FwYHNjcmJzex6+tFAXZUUxJNJv1dR6JsF4ShKoMeDA4GQwocGTS5J4MeDA8GQgocGTWkWkeXdReCoZEvOxAUm+QUEg9fO0RGUoWGIiI4811dPGBg1r8CNP4rWEQDJlEBJmQBToUZHD8hF3gyDxNcFXccF/6RJA8TXhV5HBdWAVV7DyRAJg8B01UqISEZDDwFDmJ2FYdLCBE0MCMAAAQApP+dA0QDNAANABEAFQAZAAABESM1IRUjETM2NxcGBwMhNSE1ITUhNSE1IQNESf3ySdYjDkkRHtgCDv3yAg798gIO/fICuvzjPj4DHUA6Cjw0/WajQplClQAABAAw/50DtgM2ABMAOgBPAFUAABMzNTMVITUzFTMVIxUjNSEVIzUjBQYHFhUUBwYjIi8BFjMyNzY3NCcGByc2NyYnBgcnNjcmJzcWFzY3FzM2NTMUBzMVIxYXByYnBgcnNjcjJBcHJic3POpKAQhK6upK/vhK6gFVODNKJxw4LzAUNC4lEBECBk9yJ4lREBZDWSZaQi4/NUArMCZMzgNIAubgPbswrTk8pzDDG8oBxio2LEc2AuRSUlJSQUlJSUmGSC19tYQzHgJCBRkfYzAqUDs7P1s3JzAsPCU0NyQtKDUvMq9sOyeARPWDNoPLy4c0pNq6OCY/RCEAAAAAAQAo/54DsQMvADYAABMhNTMVIRUhFSEVIRUhFSEWFzY3FwYHFhcHJAMjBgcVNjcXBgcnNj0BBgcnNjchNSE1ITUhNSFoAWdKAWf+mQEt/tMBiv6UKTpVTTJOXmOMJv7ofiM0U2Z5Dp2fGCBTYybvb/6/AYr+0wEt/pkC2lVVQl9CX0FgSydSME4paTU/cQFFSz/VHjFDPyc+DBidMyc+VIJBX0JfAAAAAwAt/5oDwAMyAAsAEQAeAAABIREzESEVIREjESECFwcmJzcDMxE2NxcGByc2NREjAWYBD0oBAf7/Sv7xXUQxPm4yddhGMBJabhwXkwIFAS3+00j93QIjAR1NMklSMf7Y/kM7LUpXSUESGAGPAAAAAAQAK/+tA8oDMQAKACIAKAA1AAABFhcHJicGByc2NwMiNREzFTY3FwYHFRQ7ATI3NjcXBgcGIwAXByYnNwMzETY3FwYHJzY1ESMCo1bRJ8NdcKsnumcqZUiUiSmumDO2LxMbCUMMJyFI/fNEMT5uMnLTM0ASWW4cGI0DMb2MPYO/1nU9gtD8fG4BsbI4ZTpyOKk5FhuBFqAiHgMtTTJJUjH+2P5EKj9IWU1AFBgBkAAABwAq/58DxgMmABsAHwAjACcAKwAxAD4AAAEhESMVIRUjFhcHJicjESMRIwYHJzY3IzUhNSMlNSMVIzUjFTUzNSMzFTM1JBcHJic3AzMRNjcXBgcnNjURIwGOAe3WAQjGQ5wupkQJQwlJqiWaRsUBBtQBq5RDkpKS1ZT9yUcxPGwzedEnNBNQXRwViwMK/oJdQ4NpOYSh/rMBTbN0PV2NQ109Y2NjY55kZGQTTzFHUDH+2f5UIzRKUENAEhgBgwAABQA6/7YDrgMqACYAKgAuADIANgAAEyE1ITUhNQYHJyQlFwYHFSEVIRUhESEVIRUhFSEVITUhNSE1ITUhJTUjFSM1IxU1MzUjIRUzNZoBNf55AYeYpAwBhwEqFoO0AYn+dwE2/soBZv6aAZf8jAGV/p0BY/7LAm3wSPDw8AE48AILQD5ICQY7DCE8DgtMPkD+tEU6Sz8/SzpFOlFRUVGJT09PAAACAC7/tgO5AykAKQAuAAATBgcnJDczFgUHJicVIxUhFSERMzY3FwYHMxUhNTMmJzcWFzMRITUhNSMlJicGB+5MSioBBqQ6jwEYKUtJ4wFk/pxvMydCJi3P/KnPHjJBLShz/psBZeMB/ZJkcooCEy4iPHW1snU8Hys3kEP+9WJwF2lSRUVcVxhRegELQ5BBVnV2VQAABABO/5sDwgM2AA0AEQAzAEwAAAEVIzUhFSM1MyYnNxYXByEVIQchFSMRFDsBMjc2NxcGBwYHIyInJjURIwYHBgcnNjc2NyMBIRUGBxYXBgcGBycWMzY3NjU0JzY3IxEjA6NF/npG4xETShMQ4gF+/oJKAhymHSoSChAFQgkWFTBPJhQVXQwoM3UtZy0mCYv+vAEhKDhVBgIcKGUYIAgxExFcPSOURgLXy4eHyy8kDC0yvkFxRP7xKAoNdhaEHBYCGRolASCgSlo/OzdHQ4cB5jiGlYZoRB4jAkkEAw4PImR/moH81QAAAAQALP+VA8cDBgAOACAAJgAyAAATIzUhFSMRNjcVBgcnNjcTIRUjBgczESMRIREjETM2NyMAFwcmJzcDMxUGBwYHJzY3NjeudwE1dEkyhK4VTDbkAirvCQzURv6/RbAOCPQB41I0S4EwekgGUFCzJ69FQAQCiUZG/jwZFUg3L0cTDwJYRjY1/ioBlP5sAdYvPP1hWDRXYy4BVqyjYFkzPi9OUoIABwAi/5sDwAMKACAAJAAoACwAPgBEAFAAAAQ3ByMgJyYnBgcnNjcXBgcWFzUjNSEVIxUzFSMVFhcWIQEhESElNSMVNTM1IyUhFSMGBzMRIxEjESMRMzY3IwAXByYnNwMzFQYHBgcnNjc2NwNcZAzF/plaTj8bMDRQCEADBycvxgGooomJCQZXATP9jgFV/qsBD8nJyQFDAbC0BQilQ/JAjwgFuwF9PzM6XzBpQgM0MnQnaywoAxAESCchWGBRLoirCDYpPSPzQkJqQG0DAxsDGv6jPFVVjVg7Qiwk/mwBVv6qAZQiLv3RSDJITC4BKo2NV04uPidESG8AAAAABgA6/5wDrQM3AAkAFwAbAB8AIwAnAAAAFyEVITUhJic3EycXMjURIREjESERFCMBIRUhJTUhFRchFSElNSEVAhMOAYz8jQGaDBZPuBFQPv18RQMObv3QAi/90QHp/l0NAYn+dwFJ/vgDCi1CQiIqDvxoQQMwARD+fwHA/qJfArvCOFJS88g2XFwAAAABAAAAATMzEUvW918PPPUAAwPoAAAAANGAbcIAAAAA0rGijgAY/5EDygNAAAAABAACAAAAAAAAAAEAAAQk/qwAAAPoAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAALA+gAAAJYAG8CWAA+AlgAMwJYABgCWAAzAlgAMwJYAEICWAAqAlgAMwPoADIAOgAzADkAMABpADkAOgAjACEAJAAnACMAIAAkACoANAAsACQAIwAzACcANgA3ADgALAApADIAHwBsAEQALQBZACMANQA3AGAAKQAtADkAPwBHAD8ASQAxAEIAOQBLAC4AKgAqACYALQAjACQALAA5ACUALQAnACIAQQCLAC4AKgAnACYAIQA0ACcAJQAmACoAKgCBAFIAlwAxACIAIQAjACMAOwAzADoALwAyADQAMQCkADAAKAAtACsAKgA6AC4ATgAsACIAOgAAAAYAHgBUAJ4AvAD6AUIBVgG2Af4CDAI8AoQCtAL+AywDWgOAA6ID6AQsBG4EqATwBWoFugYABkoGkAbiBzoHfgfOCCAIkAjSCSIJVgmKCbwJ+go+CnoKtAsKCygLegumDAoMdgygDPQNTA2+DgoOIg5UDpAOxA7kDygPcA/cEDgQmhEgEXIR/BIwEpYTChNME3gT4hQYFGwUshUAFVoV8BZcFq4W3hccF1wXsBfWGCIYbBi+GRgZbhnOGj4aqBsOG54b8hx0HKIdIB10HaweBB5kHrYfAB90H8YgRCCKAAEAAABvAGAACQAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAQAMYAAQAAAAAAAAAhAAAAAQAAAAAAAQAMACEAAQAAAAAAAgAHAC0AAQAAAAAAAwAcADQAAQAAAAAABAAPAFAAAQAAAAAABQAcAF8AAQAAAAAABgAKAHsAAQAAAAAABwANAIUAAwABBAkAAABCAJIAAwABBAkAAQAYANQAAwABBAkAAgAOAOwAAwABBAkAAwA4APoAAwABBAkABAAeATIAAwABBAkABQA4AVAAAwABBAkABgAUAYgAAwABBAkABwAaAZxDb3B5cmlnaHQoYykgUVpEIENvcnBvcmF0aW9uLjIwMjFRWkQtUElOR0ZBTkdSZWd1bGFyUVpEIFBpbmdGYW5nIFNDOlZlcnNpb24gMS4yMFFaRCBQaW5nRmFuZyBTQ1ZlcnNpb24gMS4yMCBKYW51YXJ5IDUsIDIwMTZQaW5nRmFuZ1NDQnkgUVpEIENYWSBGRQBDAG8AcAB5AHIAaQBnAGgAdAAoAGMAKQAgAFEAWgBEACAAQwBvAHIAcABvAHIAYQB0AGkAbwBuAC4AMgAwADIAMQBRAFoARAAtAFAASQBOAEcARgBBAE4ARwBSAGUAZwB1AGwAYQByAFEAWgBEACAAUABpAG4AZwBGAGEAbgBnACAAUwBDADoAVgBlAHIAcwBpAG8AbgAgADEALgAyADAAUQBaAEQAIABQAGkAbgBnAEYAYQBuAGcAIABTAEMAVgBlAHIAcwBpAG8AbgAgADEALgAyADAAIABKAGEAbgB1AGEAcgB5ACAANQAsACAAMgAwADEANgBQAGkAbgBnAEYAYQBuAGcAUwBDAEIAeQAgAFEAWgBEACAAQwBYAFkAIABGAEUAAwAAAAAAAP+cAEAAAAAAAAAAAAAAAAAAAAAAAAAAbw== '
 96 | font = FontDecrypt(str_base64=str_base64_)
 97 | font.decrypt('𤵼𥙌腐进𤵝防护𥪵𤹩腐蚀𧒄𥉥𥪵𥌙测试𢺔𦰤𤵢𤵩𥤣𥎧𤹩𥤣𥎧')
 98 | ```
 99 | Outputs:
100 | ```shell
101 | 2022-01-20 17:01:47.932 | SUCCESS  | __main__:decrypt:89 - Out Decrypted Strings：中国腐进与防护学会腐蚀电化学及测试方法专业委员会委员
102 | ```
103 | 
104 | 
105 | ### Database Pipelines
106 | With this module,insert data into database is more conveniently.
107 | #### MySQL Pipeline
108 | There is no need to care the order of items when insert data.
109 | 
110 | Usage example:
111 | ```shell
112 | from CrawlersTools import MysqlPipeline
113 | 
114 | mysql_client = MysqlPipeline(host='127.0.0.1', username='root', password='', db='project', drop_column=["id"])
115 | item = {"name": "MuggleK", "age": "18"}
116 | mysql_client.insert_one(item, table_name='test_table')
117 | ```
118 | Outputs:
119 | ```shell
120 | 2022-11-25 11:25:18.291 | SUCCESS  | pipelines.mysql_pipeline:insert:160 - Success: Insert {"name": "MuggleK", "age": "18"} -> test_table
121 | ```
122 | 
123 | 
124 | ### Http Request Base on Httpx
125 | Usage example:
126 | ```shell
127 | from CrawlersTools import base_requests
128 | 
129 | response = base_requests('https://example.org')
130 | print(response)
131 | ```
132 | Outputs:
133 | ```shell
134 | <Response [200 OK]>
135 | ```
136 | 
137 | 
138 | ### Dynamic Thread
139 | This concurrent mode will create a child thread for every task below max threads and when children thread less than max threads,the main thread will create child thread for new task.
140 | 
141 | Usage example:
142 | ```shell
143 | from CrawlersTools import AutoThread
144 | 
145 | a_thread = AutoThread(thread_num=20, fun, arg_list)
146 | a_thread.main_thread()
147 | ```
148 | 
149 | 
150 | ## Reference
151 | 
152 | ### Paper
153 | 
154 | * [面向不规则列表的网页数据抽取技术的研究](http://www.cnki.com.cn/Article/CJFDTotal-JSYJ201509023.htm)
155 | * [基于文本及符号密度的网页正文提取方法](https://kns.cnki.net/KCMS/detail/detail.aspx?dbcode=CJFQ&dbname=CJFDLAST2019&filename=GWDZ201908029&v=MDY4MTRxVHJXTTFGckNVUkxPZmJ1Wm5GQ2poVXJyQklqclBkTEc0SDlqTXA0OUhiWVI4ZVgxTHV4WVM3RGgxVDM=)
156 | * [基于块密度加权标签路径特征的Web新闻在线抽取](https://kns.cnki.net/kcms/detail/detail.aspx?filename=PZKX201708010&dbcode=CJFQ&dbname=CJFD2017&v=)
157 | * [基于DOM树和视觉特征的网页信息自动抽取](http://www.cnki.com.cn/Article/CJFDTOTAL-JSJC201310069.htm)
158 | * [基于行块分布函数的通用网页正文抽取算法](https://code.google.com/archive/p/cx-extractor/)
159 | ### Project
160 | 
161 | * [GeneralNewsExtractor](https://github.com/kingname/GeneralNewsExtractor)
162 | * [GerapyAutoExtractor](https://github.com/Gerapy/GerapyAutoExtractor)
163 | * [artical-spider](https://github.com/govzhz/artical-spider)
164 | * [html-extractor](https://github.com/cnyangkui/html-extractor)
165 | * [vthread](https://github.com/cilame/vthread)
166 | * [usepy](https://github.com/use-py/usepy)
167 | 
168 | 
169 | ## Contribute
170 | 
171 | If you want to contribute with CrawlersTools check out the [GitHub](https://github.com/MuggleK/CrawlersTools).
172 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Project : CrawlersTools
 3 | # @Time    : 2022/8/12 21:00
 4 | # @Author  : MuggleK
 5 | # @File    : setup.py
 6 | 
 7 | from setuptools import setup, find_packages
 8 | 
 9 | with open("README.md", "r", encoding='utf-8') as f:
10 |     long_description = f.read()
11 | 
12 | setup(
13 |     name='CrawlersTools',  # 包名
14 |     version='1.4.81',  # 版本号
15 |     description='Tools for Crawlers',
16 |     long_description=long_description,
17 |     long_description_content_type="text/markdown",
18 |     author='MuggleK',
19 |     author_email='peichangchuan@gmail.com',
20 |     url='https://github.com/MuggleK/CrawlersTools',
21 |     install_requires=[
22 |         "auto_mix_prep",
23 |         "DBUtils==1.3",
24 |         "fontTools",
25 |         "httpx",
26 |         "httpx[http2]",
27 |         "loguru",
28 |         "Pillow",
29 |         "PyExecJS==1.5.1",
30 |         "pymongo",
31 |         "PyMySQL",
32 |         "redis",
33 |         "tqdm",
34 |         "PyYAML",
35 |         "lxml",
36 |         "numpy",
37 |         "Distance",
38 |         "chardet",
39 |         "sinan",
40 |         "kafka-python"
41 |     ],
42 |     license='BSD License',
43 |     packages=find_packages(where='.', exclude=(), include=('*',)),
44 |     platforms=["all"],
45 |     classifiers=[
46 |         'Intended Audience :: Developers',
47 |         'Operating System :: OS Independent',
48 |         'Natural Language :: Chinese (Simplified)',
49 |         'Programming Language :: Python :: 3.7',
50 |         'Topic :: Software Development :: Libraries'
51 |     ],
52 | )
53 | 


--------------------------------------------------------------------------------