├── docker-compose.yml
├── .github
│   └── workflows
│       ├── build_docker.yml
│       └── test_scraping.yml
├── setup.cfg
├── docs
│   ├── modules.rst
│   ├── setup.rst
│   ├── pydork.rst
│   ├── index.rst
│   ├── Makefile
│   ├── make.bat
│   └── conf.py
├── completion
│   ├── pydork-completion.bash
│   └── _pydork
├── pydork
│   ├── engine_yandex.py
│   ├── messages.py
│   ├── common.py
│   ├── engine_duckduckgo.py
│   ├── __init__.py
│   ├── engine_yahoo.py
│   ├── recaptcha.py
│   ├── engine_bing.py
│   ├── engine_baidu.py
│   ├── sub_commands.py
│   ├── test_engine.py
│   ├── test_engine_selenium.py
│   ├── engine_google.py
│   ├── engine.py
│   └── engine_common.py
├── Dockerfile
├── LICENSE
├── .gitignore
├── setup.py
├── README.md
└── README.rst

/docker-compose.yml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.github/workflows/build_docker.yml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[build_sphinx]
source-dir = docs/
build-dir = docs/_build
all_files = 1
--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
pydork
======

.. toctree::
   :maxdepth: 4

   pydork
   setup
--------------------------------------------------------------------------------
/docs/setup.rst:
--------------------------------------------------------------------------------
setup module
============

.. automodule:: setup
   :members:
   :undoc-members:
   :show-inheritance:
--------------------------------------------------------------------------------
/docs/pydork.rst:
--------------------------------------------------------------------------------
pydork package
==============

.. automodule:: pydork
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

pydork.engine module
--------------------

.. automodule:: pydork.engine
   :members:
   :undoc-members:
   :show-inheritance:
--------------------------------------------------------------------------------
/completion/pydork-completion.bash:
--------------------------------------------------------------------------------
#!bash
# =======================================================

_pydork() {
    local cur
    local cmd

    cur=${COMP_WORDS[$COMP_CWORD]}
    cmd=(${COMP_WORDS[@]})

    if [[ "$cur" == -* ]]; then
        COMPREPLY=($(compgen -W "-h --help" -- $cur))
        return 0
    fi
}

complete -F _pydork -o default pydork
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
.. pydork documentation master file, created by
   sphinx-quickstart on Sun Feb 13 19:47:15 2022.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to pydork's documentation!
==================================

.. toctree::
   :maxdepth: 2
   :caption: Contents:
   :glob:

   pydork

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
--------------------------------------------------------------------------------
/pydork/engine_yandex.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================

"""engine_yandex
* Module holding the search class for Yandex (yandex.com).
"""

from .common import Color
from .engine_common import CommonEngine


class Yandex(CommonEngine):
    """Yandex

    Search engine class for Yandex (currently a stub).
    """

    def __init__(self):
        pass
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================

FROM debian:bullseye

ENV DEBIAN_FRONTEND noninteractive

# apt update
RUN apt update

# apt install
RUN apt install -y \
    firefox-esr \
    python3-pip

RUN pip3 install --upgrade pip
RUN pip3 install --upgrade pip setuptools

# copy directory
COPY ./ /opt/pydork
WORKDIR /opt/pydork

# listing /opt/pydork
RUN ls -la /opt/pydork

# pip install
RUN pip3 install --use-pep517 ./
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 blacknon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# vscode
.vscode

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# Selenium
geckodriver.log
--------------------------------------------------------------------------------
/completion/_pydork:
--------------------------------------------------------------------------------
#compdef pydork
# =======================================================


_pydork() {
    local context curcontext=$curcontext state line
    declare -A opt_args
    local ret=1

    # args
    _arguments -C \
        '(-h --help)'{-h,--help}'[show help]' \
        '(-v --version)'{-v,--version}'[show version]' \
        '1: :__pydork_commands' \
        '*:: :->modes' \
        && ret=0

    # args and subcommand
    case $state in
        modes)
            case $words[1] in
                search)
                    _arguments -C \
                        '(-h --help)'{-h,--help}'[show help]' \
                        '-t[search engine]:_values:(baidu bing duckduckgo google yahoo)' \
                        '(-n --num)'{-n,--num}'[get search result num (int)]:_values:(100 200 300 400 500)' \
                        '(-P --proxy)'{-P,--proxy}'[proxy server]' \
                        '(-s --selenium)'{-s,--selenium}'[use Selenium]' \
                        '(-S --splash)'{-S,--splash}'[use Splash]' \
                        '(-T --title)'{-T,--title}'[also get the title of each search result]' \
                        '(-0 --nullchar)'{-0,--nullchar}'[use the null character as the delimiter]' \
                        '--color[output color(default:auto)]:_values:(auto always none)' \
                        '--debug[debug mode]' \
                        '(-)*:: :->null_state' \
                        && ret=0
                    ;;

                suggest)
                    _arguments -C \
                        '(-h --help)'{-h,--help}'[show help]' \
                        '-t[search engine]:_values:(baidu bing duckduckgo google yahoo)' \
                        '--jap[additionally search Japanese-character candidates when getting suggestions]' \
                        '--alph[additionally search alphabet candidates when getting suggestions]' \
                        '--num[additionally search numeric candidates when getting suggestions]' \
                        '(-P --proxy)'{-P,--proxy}'[proxy server]' \
                        '--color[output color(default:auto)]:_values:(auto always none)' \
                        '(-)*:: :->null_state' \
                        && ret=0
                    ;;
            esac
            ;;
    esac

    return ret
}

__pydork_commands () {
    local -a _c
    _c=(
        'search:URL search mode'
        'suggest:suggest retrieval mode'
    )

    _describe -t commands Commands _c
}
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
import pathlib

path = pathlib.Path("../../pydork")
sys.path.insert(0, os.path.abspath(path))


# -- Project information -----------------------------------------------------

project = 'pydork'
copyright = '2022, blacknon'
author = 'blacknon'

# The full version, including alpha/beta/rc tags
release = '1.1.0'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
--------------------------------------------------------------------------------
/.github/workflows/test_scraping.yml:
--------------------------------------------------------------------------------
# Copyright (c) 2021 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.


name: Scraping test job
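# each matrix target below maps a search engine to the engine-specific
# unittest methods in pydork/test_engine_selenium.py that the steps run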

on:
  push:
    branches:
      - 'master'
      - 'develop'

jobs:
  # run the scraping tests against each search engine
  scraping:
    strategy:
      matrix:
        include:
          - target: google
            search_text: test_google_text_search
            search_image: test_google_image_search
            suggest: test_google_suggest
            suggest_jap: test_google_suggest_with_jap
            suggest_alph: test_google_suggest_with_alph
            suggest_num: test_google_suggest_with_num

          - target: bing
            search_text: test_bing_text_search
            search_image: test_bing_image_search
            suggest: test_bing_suggest
            suggest_jap: test_bing_suggest_with_jap
            suggest_alph: test_bing_suggest_with_alph
            suggest_num: test_bing_suggest_with_num

          - target: baidu
            search_text: test_baidu_text_search
            search_image: test_baidu_image_search
            suggest: test_baidu_suggest
            suggest_jap: test_baidu_suggest_with_jap
            suggest_alph: test_baidu_suggest_with_alph
            suggest_num: test_baidu_suggest_with_num

          - target: yahoo
            search_text: test_yahoo_text_search
            search_image: test_yahoo_image_search
            suggest: test_yahoo_suggest
            suggest_jap: test_yahoo_suggest_with_jap
            suggest_alph: test_yahoo_suggest_with_alph
            suggest_num: test_yahoo_suggest_with_num

          - target: duckduckgo
            search_text: test_duckduckgo_text_search
            search_image: test_duckduckgo_image_search
            suggest: test_duckduckgo_suggest
            suggest_jap: test_duckduckgo_suggest_with_jap
            suggest_alph: test_duckduckgo_suggest_with_alph
            suggest_num: test_duckduckgo_suggest_with_num

    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1

      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.9'
          architecture: 'x64'

      - name: Get Python version
        run: python -V

      - name: Install Selenium
        run: pip install get-chrome-driver get-gecko-driver --upgrade

      - name: Install Sphinx
        run: pip install sphinx sphinx-rtd-theme sphinx-autobuild

      - name: Install dependencies
        run: pip install ./

      - name: Run Test Text Search
        run: python -m unittest pydork.test_engine_selenium.SearchEngineTestCaseWithSelenium.${{ matrix.search_text }} -v

      - name: Run Test Image Search
        run: python -m unittest pydork.test_engine_selenium.SearchEngineTestCaseWithSelenium.${{ matrix.search_image }} -v

      - name: Run Test Suggests
        run: |
          python -m unittest pydork.test_engine_selenium.SearchEngineTestCaseWithSelenium.${{ matrix.suggest }} -v
          python -m unittest pydork.test_engine_selenium.SearchEngineTestCaseWithSelenium.${{ matrix.suggest_num }} -v
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================


import os
import platform

import setuptools

cmdclass = {}
try:
    from sphinx.setup_command import BuildDoc
    cmdclass = {'build_sphinx': BuildDoc}
except ImportError:
    pass

try:
    with open('README.rst') as f:
        readme = f.read()
except IOError:
    readme = ''


# helper that builds the data_files entries for the shell completion files
def get_data_files():
    # return the install location of the completion file for the given shell
    def get_completefile_install_location(shell):
        # path prefix
        prefix = ''

        # OS name
        uname = platform.uname()[0]

        # switch the prefix depending on whether we are running as root
        if os.geteuid() == 0:
            ''' system-wide installation '''
            if uname == 'Linux' and shell == 'bash':
                prefix = '/'
            elif uname == 'Linux' and shell == 'zsh':
                prefix = '/usr/local'
            elif uname == 'Darwin' and shell == 'bash':
                prefix = '/'
            elif uname == 'Darwin' and shell == 'zsh':
                prefix = '/usr'

        # choose the install location for each shell
        if shell == 'bash':
            location = os.path.join(prefix, 'etc/bash_completion.d')
        elif shell == 'zsh':
            location = os.path.join(prefix, 'share/zsh/site-functions')
        else:
            raise ValueError('unsupported shell: {0}'.format(shell))

        # return the location
        return location

    # collect the locations as a dict
    loc = {
        'bash': get_completefile_install_location('bash'),
        'zsh': get_completefile_install_location('zsh')
    }

    # target files, as a dict keyed by shell
    files = dict(
        bash=['completion/pydork-completion.bash'],
        zsh=[
            'completion/pydork-completion.bash',
            'completion/_pydork'
        ]
    )

    # return in data_files format
    data_files = []
    data_files.append((loc['bash'], files['bash']))
    data_files.append((loc['zsh'], files['zsh']))

    return data_files


name = 'pydork'
version = '1.1.7'
release = '1.1.7'

if __name__ == "__main__":
    setuptools.setup(
        name=name,
        version=version,
        author='blacknon',
        author_email='blacknon@orebibou.com',
        maintainer='blacknon',
        maintainer_email='blacknon@orebibou.com',
        description='Scraping and listing text and image searches on Google, Bing, DuckDuckGo, Baidu, Yahoo! JAPAN.',
        long_description=readme,
        license='MIT License',
        install_requires=[
            'bs4',
            'get-chrome-driver',
            'get-gecko-driver',
            'chromedriver_autoinstaller',
            'geckodriver_autoinstaller',
            'fake_useragent',
            'lxml',
            'requests[socks]',
            'selenium==4.7.2',
            'selenium_requests',
            'pickle-mixin',
            'sphinx',
            'sphinx-rtd-theme',
            'sphinx-autobuild'
        ],
        url='https://github.com/blacknon/pydork',
        packages=setuptools.find_packages(),
        py_modules=['pydork'],
        entry_points={
            'console_scripts': [
                'pydork = pydork:main',
            ],
        },
        classifiers=[
            'Programming Language :: Python :: 3',
            'Programming Language :: Python :: 3.7',
            'Programming Language :: Python :: 3.8',
            'Programming Language :: Python :: 3.9',
            'Programming Language :: Python :: 3.10',
            'Programming Language :: Python :: 3.11',
            'License :: OSI Approved :: MIT License',
        ],
        data_files=get_data_files(),
        cmdclass=cmdclass,
        command_options={
            'build_sphinx': {
                'project': ('setup.py', name),
                'version': ('setup.py', version),
                'release': ('setup.py', release)}},
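        # NOTE: Sphinx must also be available at build time so that the
        # `build_sphinx` command registered in cmdclass above can run.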
        setup_requires=[
            "sphinx",
            "sphinx-rtd-theme",
            "sphinx-autobuild",
        ],
    )
--------------------------------------------------------------------------------
/pydork/messages.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================

"""messages
* Module holding the text data used to provide the command's help messages in both English and Japanese.
"""

import os

lang = os.getenv('LANG')

if lang == 'ja_JP.UTF-8':
    description = "各種検索エンジンから指定したクエリの結果(url)およびSuggestを取得する"

    # common_args_map
    help_message_query = "検索文字列(クエリ)"
    help_message_op_file = "検索文字列(クエリ)が書かれているファイル"
    help_message_op_template_file = "検索文字列(クエリ)が書かれているテンプレートファイル(jinja2)"
    help_message_op_template_variable = "テンプレートファイル(jinja2)で使用する変数セット(json)"
    help_message_op_search_type = "使用する検索エンジンを指定"
    help_message_op_lang = "言語を指定"
    help_message_op_country = "国を指定"
    help_message_op_proxy_server = "プロキシサーバーを指定(例:socks5://hogehoge:8080, https://fugafuga:18080)"
    help_message_op_json = "json形式で出力する"
    help_message_op_insecure = "sslエラーを無視する"
    help_message_op_selenium = "Selenium(headless browser)を使用する(排他: Splashより優先)"
    help_message_op_splash = "Splash(headless browser)を使用する(排他: Seleniumの方が優先)"
    help_message_op_browser_endpoint = "Selenium/Splash等のヘッドレスブラウザのエンドポイントを指定(例: localhost:8050)"
    help_message_op_browser = "Seleniumで使用するBrowserを指定"
    help_message_op_color = "color出力の切り替え"
    help_message_op_cookies_dir = "使用するcookieファイルの格納先ディレクトリのPATH(各検索エンジンごとでcookieファイルを個別保存)"
    help_message_op_delete_cookies = "検索クエリ実行ごとにCookieを削除する"

    # other_map
    help_message_op_title = "検索結果のタイトルをセットで出力する"
    help_message_op_null_char = "null characterを区切り文字として使用する"
    help_message_op_num = "検索結果の取得数を指定する"
    help_message_op_debug = "debugモードを有効にする"
    help_message_op_disable_headless = "Seleniumでheadlessモードを無効化する(手動でのReCaptcha対応時に必要)"
    help_message_op_start = "期間指定(開始)"
    help_message_op_end = "期間指定(終了)"
    help_message_op_image_pagelink = "画像ファイルがあるhtmlのURLも出力する"

    # suggest_map
    help_message_op_suggest_jap = "日本語の文字を検索キーワードに追加してサジェストを取得"
    help_message_op_suggest_alph = "アルファベット文字を検索キーワードに追加してサジェストを取得"
    help_message_op_suggest_num = "数字を検索キーワードに追加してサジェストを取得"


else:
    description = "Obtain results (url) and Suggest for a specified query from various search engines"

    # common_args_map
    help_message_query = "search string(query)"
    help_message_op_file = "File containing search strings(queries)"
    help_message_op_template_file = "Template file (jinja2) containing search strings (queries)"
    help_message_op_template_variable = "Variable set (json) used in template file (jinja2)"
    help_message_op_search_type = "Specify which search engine to use"
    help_message_op_lang = "Specify language"
    help_message_op_country = "Specify country"
    help_message_op_proxy_server = "Specify proxy server(example: socks5://hogehoge:8080, https://fugafuga:18080)"
    help_message_op_json = "Output in json format"
    help_message_op_insecure = "ignore ssl errors"
    help_message_op_selenium = "Use Selenium (headless browser) (exclusive: takes precedence over Splash)"
    help_message_op_splash = "Use Splash (headless browser) (exclusive: Selenium takes precedence)"
    help_message_op_browser_endpoint = "Specify the endpoint for headless browsers such as Selenium/Splash (example: localhost:8050)"
    help_message_op_browser = "Specify Browser to use with Selenium"
    help_message_op_color = "Switching color output"
    help_message_op_cookies_dir = "PATH of the directory where the cookie files to be used are stored (cookie files are stored separately for each search engine)"
    help_message_op_delete_cookies = "Delete cookies on every search query execution"

    # other_map
    help_message_op_title = "Output the title together with each search result"
    help_message_op_null_char = "Use null character as delimiter"
    help_message_op_num = "Specify the number of search results to retrieve"
    help_message_op_debug = "Enable debug mode"
    help_message_op_disable_headless = "Disable headless mode in Selenium (required for manual ReCaptcha support)"
    help_message_op_start = "Search period (start)"
    help_message_op_end = "Search period (end)"
    help_message_op_image_pagelink = "Also output the html URL where the image files are located."

    # suggest_map
    help_message_op_suggest_jap = "Add Japanese characters to search keywords to get suggestions"
    help_message_op_suggest_alph = "Add alphabetic characters to search keywords to get suggestions"
    help_message_op_suggest_num = "Add numbers to search keywords to get suggestions"
--------------------------------------------------------------------------------
/pydork/common.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================


"""common
* A grab-bag module of shared and miscellaneous helpers.
"""

import sys
import datetime

from string import Template


# class used to colorize console output
class Color:
    """Color

    Class that holds, as attributes, the strings used to colorize console output.
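    Each attribute holds a raw ANSI SGR escape sequence (for example, RED is
    "ESC[31m", which turns the foreground red, and END resets all attributes).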

    Examples:
        c = Color(Color.BLUE)
        print(c.out('hogehoge'))
    """
    # color_code
    BLACK = '\033[30m'
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    PURPLE = '\033[35m'
    CYAN = '\033[36m'
    WHITE = '\033[37m'
    GRAY = '\033[1;30m'

    # text effects
    BOLD = '\033[1m'
    ITALIC = '\033[3m'
    UNDERLINE = '\033[4m'
    INVISIBLE = '\033[08m'
    REVERCE = '\033[07m'

    # reset all effects
    END = '\033[0m'

    def __init__(self, color_code: str):
        """
        Args:
            color_code (str): the color code to use

        """
        self.COLOR_CODE = color_code

    def out(self, text: str, is_bold=False, is_underline=False, is_reverse=False, is_italic=False):
        # wrap text in the color code
        text = self.COLOR_CODE + text + self.END

        # make the text bold when is_bold is set
        if is_bold:
            text = self.BOLD + text + self.END

        # underline the text when is_underline is set
        if is_underline:
            text = self.UNDERLINE + text + self.END

        # invert the colors when is_reverse is set
        if is_reverse:
            text = self.REVERCE + text + self.END

        # italicize the text when is_italic is set
        if is_italic:
            text = self.ITALIC + text + self.END

        return text


# class that controls message output
class Message:
    """Message

    Class that simplifies message output.

    Examples:

    """

    def __init__(self):
        # command flag
        self.IS_COMMAND = False

        # debug flag
        self.IS_DEBUG = False

        # timestamp flag
        self.IS_TIMESTAMP = False

        # engine data
        self.ENGINE_COLOR = Color('')
        self.ENGINE_NAME = ''
        self.ENGINE = ''

        # header
        self.HEADER = ''

    def set_is_command(self, is_command: bool):
        self.IS_COMMAND = is_command

    def set_is_debug(self, is_debug: bool):
        self.IS_DEBUG = is_debug

    def set_engine(self, engine: str, color: str):
        self.ENGINE_COLOR = Color(color)
        self.ENGINE_NAME = engine
        self.ENGINE = self.ENGINE_COLOR.out(engine)

    def set_header(self, text):
        self.HEADER = text

    def replace(self, text):
        """replace

        Replace the variables in a template text with self attributes and the current time.

        Args:
            text (str): template text to run the replacement on
        """

        # get the current time
        dt_now = datetime.datetime.now()

        # build the dict used for substitution
        data = {
            # time information
            'YEAR': dt_now.year,
            'MONTH': dt_now.month,
            'DAY': dt_now.day,
            'HOUR': dt_now.hour,
            'MINUTE': dt_now.minute,
            'SECOND': dt_now.second,

            # search engine (color)
            'ENGINE': self.ENGINE,  # colored
            'ENGINE_NAME': self.ENGINE_NAME,  # plain
        }

        # create the template
        template = Template(text)

        # run the substitution
        result = template.safe_substitute(data)

        return result

    def print_line(self, *text, use_header=True, separator=' ', file=sys.stdout, header=None):
        """print_line

        Print a message (single line).

        Args:
            text: text lines to print as the message
            use_header: whether to print the header given by `header` at the start of the line
            separator: separator string used when printing
            file: output file (stdout by default)
            header: string to use as the header
        """
        # build the header
        if header is None:
            header = self.HEADER

        header = self.replace(header)

        # print the text
        if use_header:
            print(header, *text, sep=separator, file=file)
        else:
            print(*text, sep=separator, file=file)

    def print_text(self, text, mode='message', use_header=True, separator=' ', file=sys.stdout, header=None):
        """print_text

        Print a message (multi-line text).

        Args:
            text: text to print as the message
            mode: output mode of the message (`message`, `error`, `warn`, `info`, `debug`)
            use_header: whether to print the header given by `header` at the start of each line
            separator: separator string used when printing
            file: output file (stdout by default)
            header: string to use as the header
        """
        # only print when is_command is enabled
        if not self.IS_COMMAND:
            return

        # for debug and info, only print when self.IS_DEBUG is enabled
        if mode in ('info', 'debug'):
            # do not print unless self.IS_DEBUG is set
            if not self.IS_DEBUG:
                return

        # build the output text
        text = self.replace(text)

        # case
        text_color: Color = Color(Color.END)
        if mode == 'message':  # when mode is `message`
            text_color = Color(Color.WHITE)

        elif mode == 'error':
            text_color = Color(Color.RED)
            file = sys.stderr

        elif mode == 'warn':
            text_color = Color(Color.YELLOW)
            file = sys.stderr

        elif mode == 'info':
            text_color = Color(Color.GREEN)
            file = sys.stderr

        elif mode == 'debug':
            text_color = Color(Color.GRAY)
            file = sys.stderr

        # define the default header
        if mode in ('info', 'debug'):
            if header is None:
                header = self.HEADER

            header = Color.REVERCE + \
                self.replace(header) + Color.END

        # TODO: add a regex-based step here to extract the highlighted parts

        # print the text
        for line in text.splitlines():
            self.print_line(text_color.out(line),
                            separator=separator, use_header=use_header, file=file, header=header)

        return


# function that adds `num` to each dict in the given list
def set_counter(links: list):
    """set_counter

    Add a `num` key to each element of links(list) and fill it with sequential numbers.

    Args:
        links(list): list of links. ex) [{'link': 'http://...', 'title': 'hogehoge...'}, {'link': '...', 'title': '...'}, ... ]
    Returns:
        result(list): [{'link': 'http://...', 'title': 'hogehoge...', 'num': 1}, {'link': '...', 'title': '...', 'num': 2}, ... ]
    """
    # create result(list)
    result = list()

    num = 1
    for d in links:
        d["num"] = num
        num += 1
        result.append(d)

    return result
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
PyDork
======

## Description

Scraping and listing text and image searches on **Google**, **Bing**, **DuckDuckGo**, **Baidu**, **Yahoo! JAPAN**.

## Install

```bash
pip install pydork
```

## Build

### Documents

```bash
python setup.py build_sphinx
```

### Docker image

```bash
docker build -t "pydork" --progress=plain .
```

## How to use

### commandline tool

```shell
$ # search text at google
$ pydork search -n 10 -t google -- 'super mario'
Google: Text Search: super mario
Google: Finally got 10 links.
[GoogleSearch]: https://www.nintendo.co.jp/character/mario/
[GoogleSearch]: https://www.nintendo.co.jp/software/smb1/index.html
[GoogleSearch]: https://www.nintendo.co.jp/switch/adala/index.html
[GoogleSearch]: https://www.nintendo.co.jp/switch/ayama/index.html
[GoogleSearch]: https://www.nintendo.co.jp/switch/aaaca/index.html
[GoogleSearch]: https://supermariorun.com/ja/
[GoogleSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA
[GoogleSearch]: https://store-jp.nintendo.com/list/software/70010000034626.html
[GoogleSearch]: https://www.youtube.com/watch?v=z5nqRrqFFZI
[GoogleSearch]: https://www.nintendo.com/games/detail/super-mario-3d-world-plus-bowsers-fury-switch/

$ # search text at google, bing, duckduckgo, with selenium
$ pydork search -s -n 10 -t google bing duckduckgo -- 'super mario'
Google: Text Search: super mario
Bing: Text Search: super mario
DuckDuckGo: Text Search: super mario
Bing: Finally got 10 links.
[BingSearch]: https://www.nintendo.co.jp/software/smb1/index.html
[BingSearch]: https://www.nintendo.co.jp/character/mario/index.html
[BingSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA
[BingSearch]: https://supermarioplay.com/
[BingSearch]: https://www.lego.com/ja-jp/campaigns/jp/supermario
[BingSearch]: https://supermariorun.com/ja/
[BingSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%83%96%E3%83%A9%E3%82%B6%E3%83%BC%E3%82%BA
[BingSearch]: https://supermariobros.io/
[BingSearch]: https://supermario-bros.co/
[BingSearch]: https://game-ac.com/free/mario/
Google: Finally got 10 links.
[GoogleSearch]: https://www.nintendo.co.jp/character/mario/
[GoogleSearch]: https://www.nintendo.co.jp/software/smb1/index.html
[GoogleSearch]: https://www.nintendo.co.jp/switch/adala/index.html
[GoogleSearch]: https://www.nintendo.co.jp/switch/ayama/index.html
[GoogleSearch]: https://www.nintendo.co.jp/switch/aaaca/index.html
[GoogleSearch]: https://supermariorun.com/ja/
[GoogleSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA
[GoogleSearch]: https://store-jp.nintendo.com/list/software/70010000034626.html
[GoogleSearch]: https://store-jp.nintendo.com/feature_mar004.html
[GoogleSearch]: https://www.nintendo.com/games/detail/super-mario-3d-world-plus-bowsers-fury-switch/
DuckDuckGo: Finally got 10 links.
[DuckDuckGoSearch]: https://supermariobros.io/
[DuckDuckGoSearch]: https://supermarioplay.com/
[DuckDuckGoSearch]: https://mario.nintendo.com/
[DuckDuckGoSearch]: https://en.wikipedia.org/wiki/Super_Mario
[DuckDuckGoSearch]: https://supermario-game.com/
[DuckDuckGoSearch]: https://www.mario-flash.com/
[DuckDuckGoSearch]: https://supermario-bros.co/
[DuckDuckGoSearch]: https://www.youtube.com/watch?v=4noiYiEYg6Q
[DuckDuckGoSearch]: https://www.crazygames.com/t/mario
[DuckDuckGoSearch]: https://arcadespot.com/game/super-mario-64/

$ # search image at google, yahoo.co.jp with selenium and set html title...
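$ # (flags used below: -T also prints page titles, -s uses Selenium,
$ #  -n sets the result count, -t picks the target engines)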
$ pydork image -T -s -n 10 -t google yahoo -- 'legend of zelda'
Yahoo: Image Search: legend of zelda
Google: Image Search: legend of zelda
Yahoo: Finally got 10 links.
[YahooSearch]: Amazon.co.jp: The Legend of Zelda: Breath of the Wild ...: https://m.media-amazon.com/images/I/81iU0U8VZML._AC_SL1500_.jpg
[YahooSearch]: Amazon | Legend of Zelda Link's Awakening(輸入版:北米 ...: https://m.media-amazon.com/images/I/91z5JYtUZAS._AC_SY445_.jpg
[YahooSearch]: Amazon | The Legend of Zelda: Breath of the Wild (輸入版 ...: https://m.media-amazon.com/images/I/61wcjVPx4sL._AC_SX466_.jpg
[YahooSearch]: Amazon | The Legend of Zelda Encyclopedia | Nintendo | Video ...: https://images-na.ssl-images-amazon.com/images/I/91zJdQWSE0L.jpg
[YahooSearch]: the-legend-of-zelda-breath-of- ...: https://www.nintendo.com//content/dam/noa/en_US/games/switch/t/the-legend-of-zelda-breath-of-the-wild-switch/the-legend-of-zelda-breath-of-the-wild-switch-hero.jpg
[YahooSearch]: Amazon | The Legend of Zelda: Twilight Princess, Vol. 7 (7 ...: https://images-na.ssl-images-amazon.com/images/I/81-c6fHsctL.jpg
[YahooSearch]: The Legend of Zelda™: Breath of the Wild - My Nintendo Store: https://assets.nintendo.eu/image/upload/f_auto,q_auto,t_product_tile_desktop/MNS/NOE/70010000000023/SQ_NSwitch_TheLegendOfZeldaBreathOfTheWild_E
[YahooSearch]: Amazon | Legend of Zelda 2020 Wall Calendar | Nintendo ...: https://images-na.ssl-images-amazon.com/images/I/61R+rBBQxaL._SX258_BO1,204,203,200_.jpg
[YahooSearch]: 359点のThe Legend Of Zeldaのストックフォト - Getty Images: https://media.gettyimages.com/photos/link-figurine-from-legend-of-zelda-with-shop-staff-inside-nintendo-picture-id1231509485?s=612x612
[YahooSearch]: Evolution of Legend of Zelda 1986-2020 - YouTube: https://i.ytimg.com/vi/1FwoEgUBgE0/maxresdefault.jpg
Google: Finally got 10 links.
[GoogleSearch]: LATEST* The Legend Of Zelda Breath Of The Wild 2: Nintendo Direct E3 2021, Release Date, Leaked Info, Gameplay, Setting, Story Info, Trailers, & More: https://cdn.realsport101.com/images/ncavvykf/realsport-production/2db4094078e3c7e7442e33afb8e8e5e6082d3849-1920x1080.png?rect=0,1,1920,1077&w=328&h=184&auto=format
[GoogleSearch]: Jual The Legend of Zelda: Breath of the Wild Special Edition [EU] - Jakarta Barat - Lionheartno Games Store | Tokopedia: https://images.tokopedia.net/img/cache/700/product-1/2017/1/16/9470651/9470651_4508d715-ecf7-452a-8150-df1a6a0c47ab_771_424.jpg
[GoogleSearch]: The Legend of Zelda: Breath of the Wild – Link has never been set so free | Nintendo Switch | The Guardian: https://i.guim.co.uk/img/media/22d6b308c89e62e229feb220208a639836e31fd9/60_0_1800_1080/master/1800.png?width=700&quality=85&auto=format&fit=max&s=25c588a5203feea6061c32112a66ebdc
[GoogleSearch]: Kaos The Legend of Zelda c Nintendo, Fesyen Pria, Pakaian , Atasan di Carousell: https://media.karousell.com/media/photos/products/2021/9/22/kaos_the_legend_of_zelda_c_nin_1632313294_5b47ea62_progressive.jpg
[GoogleSearch]: Sales of The Legend of Zelda titles worldwide 2019 | Statista: https://cdn.statcdn.com/Statistic/985000/985767-blank-355.png
[GoogleSearch]: Legend Of Zelda Monsters | Minimalis: http://tse2.mm.bing.net/th?id=OIP.wUtxfbukexwonASdvmIirgHaEK&pid=15.1
[GoogleSearch]: Everything The Legend of Zelda: Breath of the Wild 2 is hiding: full analysis - The Legend of Zelda: Breath of the Wild II - Gamereactor: https://www.gamereactor.eu/media/08/legendzelda_3500863.jpg
[GoogleSearch]: The Legend of Zelda: A Link Between Worlds (Video Game 2013) - IMDb: https://m.media-amazon.com/images/M/MV5BZDI2M2IwMDItOTU4MS00YzdjLWJmYjItMzA3MjJjMDk2YjBiXkEyXkFqcGdeQXVyNjY5NTM5MjA@._V1_.jpg
[GoogleSearch]: The Complete Chronological Order Of Legend Of Zelda Games: https://static0.gamerantimages.com/wordpress/wp-content/uploads/2021/01/Zelda-Four-Swords-Adventures-Links.jpg?q=50&fit=crop&w=1400&dpr=1.5
[GoogleSearch]: Sword Slash Png - Legend Of Zelda Skyward Sword Artwork Clipart (#1717847) - PikPng: https://cpng.pikpng.com/pngl/s/90-907142_the-legend-of-zelda-legend-of-zelda-skyward.png

```

### python library

```python
from pydork.engine import SearchEngine

# SearchEngine
search_engine = SearchEngine()

search_engine.set('google')
search_result = search_engine.search('final fantasy')
```
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
PyDork
======

Description
-----------

Scraping and listing text and image searches on Google, Bing,
DuckDuckGo, Baidu, Yahoo! JAPAN.

Install
-------

.. code:: bash

   git clone https://github.com/blacknon/pydork
   cd pydork
   pip install ./

How to use
----------

commandline tool
~~~~~~~~~~~~~~~~

.. code:: shell

   $ # search text at google
   $ pydork search -n 10 -t google -- 'super mario'
   Google: Text Search: super mario
   Google: Finally got 10 links.
   [GoogleSearch]: https://www.nintendo.co.jp/character/mario/
   [GoogleSearch]: https://www.nintendo.co.jp/software/smb1/index.html
   [GoogleSearch]: https://www.nintendo.co.jp/switch/adala/index.html
   [GoogleSearch]: https://www.nintendo.co.jp/switch/ayama/index.html
   [GoogleSearch]: https://www.nintendo.co.jp/switch/aaaca/index.html
   [GoogleSearch]: https://supermariorun.com/ja/
   [GoogleSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA
   [GoogleSearch]: https://store-jp.nintendo.com/list/software/70010000034626.html
   [GoogleSearch]: https://www.youtube.com/watch?v=z5nqRrqFFZI
   [GoogleSearch]: https://www.nintendo.com/games/detail/super-mario-3d-world-plus-bowsers-fury-switch/

   $ # search text at google, bing, duckduckgo, with selenium
   $ pydork search -s -n 10 -t google bing duckduckgo -- 'super mario'
   Google: Text Search: super mario
   Bing: Text Search: super mario
   DuckDuckGo: Text Search: super mario
   Bing: Finally got 10 links.
   [BingSearch]: https://www.nintendo.co.jp/software/smb1/index.html
   [BingSearch]: https://www.nintendo.co.jp/character/mario/index.html
   [BingSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA
   [BingSearch]: https://supermarioplay.com/
   [BingSearch]: https://www.lego.com/ja-jp/campaigns/jp/supermario
   [BingSearch]: https://supermariorun.com/ja/
   [BingSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%83%96%E3%83%A9%E3%82%B6%E3%83%BC%E3%82%BA
   [BingSearch]: https://supermariobros.io/
   [BingSearch]: https://supermario-bros.co/
   [BingSearch]: https://game-ac.com/free/mario/
   Google: Finally got 10 links.
   [GoogleSearch]: https://www.nintendo.co.jp/character/mario/
   [GoogleSearch]: https://www.nintendo.co.jp/software/smb1/index.html
   [GoogleSearch]: https://www.nintendo.co.jp/switch/adala/index.html
   [GoogleSearch]: https://www.nintendo.co.jp/switch/ayama/index.html
   [GoogleSearch]: https://www.nintendo.co.jp/switch/aaaca/index.html
   [GoogleSearch]: https://supermariorun.com/ja/
   [GoogleSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA
   [GoogleSearch]: https://store-jp.nintendo.com/list/software/70010000034626.html
   [GoogleSearch]: https://store-jp.nintendo.com/feature_mar004.html
   [GoogleSearch]: https://www.nintendo.com/games/detail/super-mario-3d-world-plus-bowsers-fury-switch/
   DuckDuckGo: Finally got 10 links.
   [DuckDuckGoSearch]: https://supermariobros.io/
   [DuckDuckGoSearch]: https://supermarioplay.com/
   [DuckDuckGoSearch]: https://mario.nintendo.com/
   [DuckDuckGoSearch]: https://en.wikipedia.org/wiki/Super_Mario
   [DuckDuckGoSearch]: https://supermario-game.com/
   [DuckDuckGoSearch]: https://www.mario-flash.com/
   [DuckDuckGoSearch]: https://supermario-bros.co/
   [DuckDuckGoSearch]: https://www.youtube.com/watch?v=4noiYiEYg6Q
   [DuckDuckGoSearch]: https://www.crazygames.com/t/mario
   [DuckDuckGoSearch]: https://arcadespot.com/game/super-mario-64/

   $ # search image at google, yahoo.co.jp with selenium and set html title...
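   $ # (flags used below: -T also prints page titles, -s uses Selenium,
   $ #  -n sets the result count, -t picks the target engines)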
   $ pydork image -T -s -n 10 -t google yahoo -- 'legend of zelda'
   Yahoo: Image Search: legend of zelda
   Google: Image Search: legend of zelda
   Yahoo: Finally got 10 links.
   [YahooSearch]: Amazon.co.jp: The Legend of Zelda: Breath of the Wild ...: https://m.media-amazon.com/images/I/81iU0U8VZML._AC_SL1500_.jpg
   [YahooSearch]: Amazon | Legend of Zelda Link's Awakening(輸入版:北米 ...: https://m.media-amazon.com/images/I/91z5JYtUZAS._AC_SY445_.jpg
   [YahooSearch]: Amazon | The Legend of Zelda: Breath of the Wild (輸入版 ...: https://m.media-amazon.com/images/I/61wcjVPx4sL._AC_SX466_.jpg
   [YahooSearch]: Amazon | The Legend of Zelda Encyclopedia | Nintendo | Video ...: https://images-na.ssl-images-amazon.com/images/I/91zJdQWSE0L.jpg
   [YahooSearch]: the-legend-of-zelda-breath-of- ...: https://www.nintendo.com//content/dam/noa/en_US/games/switch/t/the-legend-of-zelda-breath-of-the-wild-switch/the-legend-of-zelda-breath-of-the-wild-switch-hero.jpg
   [YahooSearch]: Amazon | The Legend of Zelda: Twilight Princess, Vol. 7 (7 ...: https://images-na.ssl-images-amazon.com/images/I/81-c6fHsctL.jpg
   [YahooSearch]: The Legend of Zelda™: Breath of the Wild - My Nintendo Store: https://assets.nintendo.eu/image/upload/f_auto,q_auto,t_product_tile_desktop/MNS/NOE/70010000000023/SQ_NSwitch_TheLegendOfZeldaBreathOfTheWild_E
   [YahooSearch]: Amazon | Legend of Zelda 2020 Wall Calendar | Nintendo ...: https://images-na.ssl-images-amazon.com/images/I/61R+rBBQxaL._SX258_BO1,204,203,200_.jpg
   [YahooSearch]: 359点のThe Legend Of Zeldaのストックフォト - Getty Images: https://media.gettyimages.com/photos/link-figurine-from-legend-of-zelda-with-shop-staff-inside-nintendo-picture-id1231509485?s=612x612
   [YahooSearch]: Evolution of Legend of Zelda 1986-2020 - YouTube: https://i.ytimg.com/vi/1FwoEgUBgE0/maxresdefault.jpg
   Google: Finally got 10 links.
   [GoogleSearch]: LATEST* The Legend Of Zelda Breath Of The Wild 2: Nintendo Direct E3 2021, Release Date, Leaked Info, Gameplay, Setting, Story Info, Trailers, & More: https://cdn.realsport101.com/images/ncavvykf/realsport-production/2db4094078e3c7e7442e33afb8e8e5e6082d3849-1920x1080.png?rect=0,1,1920,1077&w=328&h=184&auto=format
   [GoogleSearch]: Jual The Legend of Zelda: Breath of the Wild Special Edition [EU] - Jakarta Barat - Lionheartno Games Store | Tokopedia: https://images.tokopedia.net/img/cache/700/product-1/2017/1/16/9470651/9470651_4508d715-ecf7-452a-8150-df1a6a0c47ab_771_424.jpg
   [GoogleSearch]: The Legend of Zelda: Breath of the Wild – Link has never been set so free | Nintendo Switch | The Guardian: https://i.guim.co.uk/img/media/22d6b308c89e62e229feb220208a639836e31fd9/60_0_1800_1080/master/1800.png?width=700&quality=85&auto=format&fit=max&s=25c588a5203feea6061c32112a66ebdc
   [GoogleSearch]: Kaos The Legend of Zelda c Nintendo, Fesyen Pria, Pakaian , Atasan di Carousell: https://media.karousell.com/media/photos/products/2021/9/22/kaos_the_legend_of_zelda_c_nin_1632313294_5b47ea62_progressive.jpg
   [GoogleSearch]: Sales of The Legend of Zelda titles worldwide 2019 | Statista: https://cdn.statcdn.com/Statistic/985000/985767-blank-355.png
   [GoogleSearch]: Legend Of Zelda Monsters | Minimalis: http://tse2.mm.bing.net/th?id=OIP.wUtxfbukexwonASdvmIirgHaEK&pid=15.1
   [GoogleSearch]: Everything The Legend of Zelda: Breath of the Wild 2 is hiding: full analysis - The Legend of Zelda: Breath of the Wild II - Gamereactor: https://www.gamereactor.eu/media/08/legendzelda_3500863.jpg
   [GoogleSearch]: The Legend of Zelda: A Link Between Worlds (Video Game 2013) - IMDb: https://m.media-amazon.com/images/M/MV5BZDI2M2IwMDItOTU4MS00YzdjLWJmYjItMzA3MjJjMDk2YjBiXkEyXkFqcGdeQXVyNjY5NTM5MjA@._V1_.jpg
   [GoogleSearch]: The Complete Chronological Order Of Legend Of Zelda Games: https://static0.gamerantimages.com/wordpress/wp-content/uploads/2021/01/Zelda-Four-Swords-Adventures-Links.jpg?q=50&fit=crop&w=1400&dpr=1.5
   [GoogleSearch]: Sword Slash Png - Legend Of Zelda Skyward Sword Artwork Clipart (#1717847) - PikPng: https://cpng.pikpng.com/pngl/s/90-907142_the-legend-of-zelda-legend-of-zelda-skyward.png

python library
~~~~~~~~~~~~~~

.. code:: python

   from pydork.engine import SearchEngine

   # SearchEngine
   search_engine = SearchEngine()

   search_engine.set('google')
   search_result = search_engine.search('final fantasy')
--------------------------------------------------------------------------------
/pydork/engine_duckduckgo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================


"""engine_duckduckgo
* Module holding the search class for DuckDuckGo.
"""


import json
import re
import sys

from time import sleep
from urllib import parse
from bs4 import BeautifulSoup

from .common import Color
from .engine_common import CommonEngine


class DuckDuckGo(CommonEngine):
    """DuckDuckGo

    Search engine class for DuckDuckGo.
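    A usage sketch via the SearchEngine facade, following the README example
    (the engine name string is assumed to match the entries in ENGINES):

        from pydork.engine import SearchEngine

        search_engine = SearchEngine()
        search_engine.set('duckduckgo')
        search_result = search_engine.search('super mario')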
30 | """ 31 | 32 | def __init__(self): 33 | # CommonEngineの処理を呼出し 34 | super().__init__() 35 | 36 | self.NAME = 'DuckDuckGo' 37 | self.COLOR = Color.BLUE 38 | self.COLOR_NAME = self.COLOR + self.NAME + Color.END 39 | 40 | # リクエスト先のURLを指定 41 | self.PRE_URL = 'https://duckduckgo.com/' 42 | self.ENGINE_TOP_URL = 'https://duckduckgo.com/' 43 | self.SEARCH_URL = 'https://links.duckduckgo.com/d.js' 44 | self.IMAGE_URL = 'https://duckduckgo.com/i.js' 45 | self.SUGGEST_URL = 'https://duckduckgo.com/ac/' 46 | 47 | def request_selenium(self, url: str, method='GET', data=None): 48 | if self.SUGGEST_URL in url: 49 | # 最初にTOPページを表示 50 | self.driver.get(self.ENGINE_TOP_URL) 51 | 52 | self.driver.implicitly_wait(3) 53 | 54 | # javascriptからリクエストを投げてjsonを取得 55 | exec_java_script = 'return fetch("{}").then(response=>response.json())'.format( 56 | url) 57 | result = self.driver.execute_script(exec_java_script) 58 | 59 | result = json.dumps(result) 60 | 61 | else: 62 | result = super().request_selenium(url, method, data) 63 | 64 | return result 65 | 66 | def gen_search_url(self, keyword: str, type: str): 67 | """gen_search_url 68 | 69 | 検索用のurlを生成する. 70 | 71 | Args: 72 | keyword (str): 検索クエリ. 73 | type (str): 検索タイプ. 74 | 75 | Returns: 76 | dict: 検索用url 77 | """ 78 | 79 | # 前処理リクエスト用パラメータの設定 80 | pre_param = { 81 | 'q': keyword, # 検索キーワード 82 | 't': 'h_' 83 | } 84 | 85 | try: 86 | # 前処理リクエスのセッションを生成する 87 | pre_params = parse.urlencode(pre_param) 88 | pre_url = self.PRE_URL + '?' + pre_params 89 | 90 | # 前処理リクエスト1を実行 91 | self.get_result('https://duckduckgo.com/?t=h_') 92 | 93 | # 待機時間を入れる 94 | sleep(1) 95 | 96 | # 前処理リクエスト2を実行 97 | pre_html = self.get_result(pre_url) 98 | sleep(1) 99 | 100 | r = re.findall( 101 | r"(?<=vqd\=)[0-9-]+", pre_html 102 | ) 103 | 104 | # get vqd 105 | vqd = r[0] 106 | 107 | except Exception: 108 | return 109 | 110 | if type == 'text': 111 | # 検索urlを指定 112 | search_url = self.SEARCH_URL 113 | 114 | # 検索パラメータの設定 115 | url_param = { 116 | 'q': keyword, # 検索キーワード 117 | 's': 0, # 取得開始件数 118 | 'vqd': vqd 119 | } 120 | 121 | # lang/localeが設定されている場合 122 | if self.LANG != '' and self.LOCALE != '': 123 | url_param['l'] = self.LANG + '_' + self.LOCALE 124 | 125 | # rangeが設定されている場合(DuckDuckGoにはレンジ指定がないらしいので、追加されたら記述する) 126 | 127 | elif type == 'image': 128 | # 検索urlを指定 129 | search_url = self.IMAGE_URL 130 | 131 | # 検索パラメータの設定 132 | url_param = { 133 | 'q': keyword, # 検索キーワード 134 | 'o': 'json', # output format 135 | 'p': 1, 136 | 's': 0, # 取得開始件数 137 | 'u': 'bing', # TODO: 利用する検索エンジン(おそらく).後でオプションで指定できるようにする. 138 | 'f': ',,,,,', 139 | 'vqd': vqd 140 | } 141 | 142 | # lang/localeが設定されている場合 143 | if self.LANG != '' and self.LOCALE != '': 144 | url_param['l'] = self.LANG + '-' + self.LANG 145 | 146 | # set next_url 147 | params = parse.urlencode(url_param) 148 | self.next_url = search_url + '?' + params 149 | 150 | # while loop 151 | page = 0 152 | while True: 153 | if self.next_url == "": 154 | break 155 | 156 | # get next_url 157 | target_url = self.next_url 158 | 159 | yield 'GET', target_url, None 160 | 161 | page += 1 162 | 163 | def gen_suggest_url(self, keyword: str): 164 | """gen_suggest_url 165 | 166 | サジェスト取得用のurlを生成する. 167 | 168 | Args: 169 | keyword (str): 検索クエリ. 170 | 171 | Returns: 172 | dict: サジェスト取得用url 173 | """ 174 | url_param = { 175 | 'q': keyword, # 検索キーワード 176 | 'kl': 'wt-wt' 177 | } 178 | 179 | params = parse.urlencode(url_param) 180 | url = self.SUGGEST_URL + '?' 

        return url

    def get_links(self, source_url: str, html: str, type: str):
        """get_links

        Parse the received html and return the search results as a list.

        Args:
            source_url (str): url of the search results being parsed.
            html (str): html of the search results to parse.
            type (str): search type (`text` or `image`).

        Returns:
            list: search results (`[{'title': 'title...', 'url': 'https://hogehoge....'}, {...}]`)
        """
        links = list()

        # url used for next_url
        url = ""
        vqd = ""

        if type == 'text':
            # massage the payload so it can be handled as a dict
            r = re.findall(
                r"DDG\.pageLayout\.load\(\'d\',(.+)\]\)\;", html
            )

            try:
                r_dict = json.loads(r[0] + "]")
            except Exception:
                return links

            for r_data in r_dict:
                if "u" in r_data and "s" in r_data:
                    d = {
                        "link": r_data["u"],
                        "title": BeautifulSoup(
                            r_data["t"], "lxml").text,
                        "text": BeautifulSoup(
                            r_data["a"], "lxml").text,
                        "source_url": source_url,
                    }
                    links.append(d)

                elif "n" in r_data:
                    base_uri = '{uri.scheme}://{uri.netloc}'.format(
                        uri=parse.urlparse(self.SEARCH_URL)
                    )
                    url = base_uri + r_data["n"]

        elif type == 'image':
            # when selenium/splash is in use, overwrite html
            if self.USE_SELENIUM or self.USE_SPLASH:
                soup = BeautifulSoup(html, "lxml")
                selected_one = soup.select_one('html > body > pre')
                html = selected_one.text

            # load as json
            try:
                data = json.loads(html)
            except Exception as e:
                print(e, file=sys.stderr)
                return links

            if 'results' in data:
                results = data['results']

                for r in results:
                    d = {
                        'link': r['image'],
                        'title': r['title'],
                        'pagelink': r['url']
                    }
                    links.append(d)

            if 'vqd' in data:
                vqd = list(data['vqd'].values())[0]

            # build the url used for next_url
            if 'next' in data:
                next_path = data['next']
                next_path = next_path + '&vqd=' + vqd
                base_url = '{uri.scheme}://{uri.netloc}/'.format(
                    uri=parse.urlparse(self.IMAGE_URL)
                )
                url = base_url + next_path

        if url != "":
            self.next_url = url

        return links

    def get_suggest_list(self, suggests: list, char: str, html: str):
        """get_suggest_list

        Get the suggestions from html as a list.

        Args:
            suggests (list): the base list to which suggestions are added.
            char (str): the suggest string.
            html (str): html to parse.

        Returns:
            dict: suggest array
        """

        data = json.loads(html)
        suggests[char if char == '' else char[-1]] = [e['phrase']
                                                      for e in data]

        return suggests
--------------------------------------------------------------------------------
/pydork/__init__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================

from .sub_commands import run_subcommand
from .engine import ENGINES
from . import messages
import messages 11 | 12 | from pkg_resources import get_distribution 13 | from datetime import datetime 14 | 15 | import copy 16 | import argparse 17 | 18 | # TODO: returnではなくyieldに切り替えて、返り値をgeneratorにすることである程度途中状態でも状況を見れるような仕組みとする 19 | 20 | 21 | # version (setup.pyから取得してくる) 22 | __version__ = get_distribution('pydork').version 23 | 24 | 25 | # main 26 | def main(): 27 | # parserの作成 28 | parser = argparse.ArgumentParser( 29 | description=messages.description) 30 | subparsers = parser.add_subparsers() 31 | 32 | # ENGINESに`all`を追加 33 | engines_list = copy.deepcopy(ENGINES) 34 | engines_list.append('all') 35 | 36 | # サブコマンド共通の引数 37 | common_args_map = [ 38 | { 39 | "args": ["query"], 40 | "action": "store", 41 | "type": str, 42 | "nargs": "?", 43 | "default": "", 44 | "help": messages.help_message_query, 45 | }, 46 | { 47 | "args": ["-f", "--file"], 48 | "action": "store", 49 | "type": str, 50 | "default": "", 51 | "help": messages.help_message_op_file, 52 | }, 53 | { 54 | "args": ["-F", "--template_file"], 55 | "action": "store", 56 | "type": str, 57 | "default": "", 58 | "help": messages.help_message_op_template_file, 59 | }, 60 | { 61 | "args": ["-V", "--template_variable"], 62 | "action": "store", 63 | "type": str, 64 | "default": "", 65 | "help": messages.help_message_op_template_variable, 66 | }, 67 | { 68 | "args": ["-t", "--search_type"], 69 | "default": ["google"], 70 | "choices": engines_list, 71 | "nargs": "+", 72 | "type": str, 73 | "help": messages.help_message_op_search_type, 74 | }, 75 | { 76 | "args": ["-l", "--lang"], 77 | "default": "ja", 78 | "choices": ["ja", "en"], 79 | "type": str, 80 | "help": messages.help_message_op_lang, 81 | }, 82 | { 83 | "args": ["-c", "--country"], 84 | "default": "JP", 85 | "choices": ["JP", "US"], 86 | "type": str, 87 | "help": messages.help_message_op_country, 88 | }, 89 | { 90 | "args": ["-P", "--proxy"], 91 | "default": "", 92 | "type": str, 93 | "help": messages.help_message_op_proxy_server, 94 | }, 95 | { 96 | "args": ["-j", "--json"], 97 | "action": "store_true", 98 | "help": messages.help_message_op_json, 99 | }, 100 | { 101 | "args": ["-k", "--insecure"], 102 | "action": "store_true", 103 | "help": messages.help_message_op_insecure, 104 | }, 105 | { 106 | "args": ["-s", "--selenium"], 107 | "action": "store_true", 108 | "help": messages.help_message_op_selenium, 109 | }, 110 | { 111 | "args": ["-S", "--splash"], 112 | "action": "store_true", 113 | "help": messages.help_message_op_splash, 114 | }, 115 | { 116 | "args": ["-b", "--browser-endpoint"], 117 | "default": "", 118 | "type": str, 119 | "help": messages.help_message_op_browser_endpoint, 120 | }, 121 | { 122 | "args": ["-B", "--browser"], 123 | "default": "firefox", 124 | "choices": ["chrome", "firefox"], 125 | "type": str, 126 | "help": messages.help_message_op_browser, 127 | }, 128 | { 129 | "args": ["--color"], 130 | "default": "auto", 131 | "choices": ["auto", "none", "always"], 132 | "type": str, 133 | "help": messages.help_message_op_color, 134 | }, 135 | { 136 | "args": ["--cookies"], 137 | "default": "~/.pydork_cookies", 138 | "type": str, 139 | "help": messages.help_message_op_cookies_dir, 140 | }, 141 | { 142 | "args": ["--delete-cookies"], 143 | "action": "store_true", 144 | "help": messages.help_message_op_delete_cookies, 145 | }, 146 | ] 147 | 148 | # サブコマンド `search` の引数 149 | search_args_map = [ 150 | { 151 | "args": ["-T", "--title"], 152 | "action": "store_true", 153 | "help": messages.help_message_op_title, 154 | }, 155 | { 156 | "args": ["-0", "--nullchar"], 157 | 
"action": "store_true", 158 | "help": messages.help_message_op_null_char, 159 | }, 160 | { 161 | "args": ["-n", "--num"], 162 | "default": 300, 163 | "type": int, 164 | "help": messages.help_message_op_num, 165 | }, 166 | { 167 | "args": ["--start"], 168 | "type": lambda s: datetime.strptime(s, '%Y-%m-%d'), 169 | "help": messages.help_message_op_start, 170 | }, 171 | { 172 | "args": ["--end"], 173 | "type": lambda s: datetime.strptime(s, '%Y-%m-%d'), 174 | "help": messages.help_message_op_end, 175 | }, 176 | { 177 | "args": ["--debug"], 178 | "action": "store_true", 179 | "help": messages.help_message_op_debug, 180 | }, 181 | { 182 | "args": ["--disable-headless"], 183 | "action": "store_true", 184 | "help": messages.help_message_op_disable_headless, 185 | }, 186 | ] 187 | search_args_map.extend(copy.deepcopy(common_args_map)) 188 | 189 | # サブコマンド `image` の引数 190 | image_args_map = [ 191 | { 192 | "args": ["-T", "--title"], 193 | "action": "store_true", 194 | "help": messages.help_message_op_title, 195 | }, 196 | { 197 | "args": ["-p", "--pagelink"], 198 | "action": "store_true", 199 | "help": messages.help_message_op_image_pagelink, 200 | }, 201 | { 202 | "args": ["-0", "--nullchar"], 203 | "action": "store_true", 204 | "help": messages.help_message_op_null_char, 205 | }, 206 | { 207 | "args": ["-n", "--num"], 208 | "default": 300, 209 | "type": int, 210 | "help": messages.help_message_op_num, 211 | }, 212 | # { 213 | # "args": ["--start"], 214 | # "type": lambda s: datetime.strptime(s, '%Y-%m-%d'), 215 | # "help": messages.help_message_op_start, 216 | # }, 217 | # { 218 | # "args": ["--end"], 219 | # "type": lambda s: datetime.strptime(s, '%Y-%m-%d'), 220 | # "help": messages.help_message_op_end, 221 | # }, 222 | { 223 | "args": ["--debug"], 224 | "action": "store_true", 225 | "help": messages.help_message_op_debug, 226 | }, 227 | { 228 | "args": ["--disable-headless"], 229 | "action": "store_true", 230 | "help": messages.help_message_op_disable_headless, 231 | }, 232 | ] 233 | image_args_map.extend(copy.deepcopy(common_args_map)) 234 | 235 | # サブコマンド `suggest` の引数 236 | suggest_args_map = [ 237 | { 238 | "args": ["--jap"], 239 | "action": "store_true", 240 | "help": messages.help_message_op_suggest_jap 241 | }, 242 | { 243 | "args": ["--alph"], 244 | "action": "store_true", 245 | "help": messages.help_message_op_suggest_alph 246 | }, 247 | { 248 | "args": ["--num"], 249 | "action": "store_true", 250 | "help": messages.help_message_op_suggest_num 251 | }, 252 | ] 253 | suggest_args_map.extend(copy.deepcopy(common_args_map)) 254 | 255 | # search 256 | # ---------- 257 | parser_search = subparsers.add_parser( 258 | 'search', 259 | help='search mode. see `search -h`' 260 | ) 261 | 262 | # add_argument 263 | for element in search_args_map: 264 | args = element['args'] 265 | element.pop('args') 266 | parser_search.add_argument(*args, **element) 267 | 268 | # set parser_search 269 | parser_search.set_defaults(handler=run_subcommand, subcommand="search") 270 | 271 | # image 272 | # ---------- 273 | parser_image = subparsers.add_parser( 274 | 'image', 275 | help='search mode. 
see `search -h`' 276 | ) 277 | 278 | # add_argument 279 | for element in image_args_map: 280 | args = element['args'] 281 | element.pop('args') 282 | parser_image.add_argument(*args, **element) 283 | 284 | # set parser_image 285 | parser_image.set_defaults(handler=run_subcommand, subcommand="image") 286 | 287 | # suggest 288 | # ---------- 289 | parser_suggest = subparsers.add_parser( 290 | 'suggest', 291 | help='suggest mode. see `suggest -h`' 292 | ) 293 | 294 | # add_argument 295 | for element in suggest_args_map: 296 | args = element['args'] 297 | element.pop('args') 298 | parser_suggest.add_argument(*args, **element) 299 | 300 | parser_suggest.set_defaults(handler=run_subcommand, subcommand="suggest") 301 | 302 | # --version(-v)オプションのparser定義 303 | parser.add_argument( 304 | '-v', 305 | '--version', 306 | action='version', 307 | version='%(prog)s version:{version}'.format(version=__version__) 308 | ) 309 | 310 | args = parser.parse_args() 311 | if hasattr(args, 'handler'): 312 | args.handler(args.subcommand, args) 313 | else: 314 | # 未知のサブコマンドの場合はヘルプを表示 315 | parser.print_help() 316 | 317 | 318 | if __name__ == '__main__': 319 | main() 320 | -------------------------------------------------------------------------------- /pydork/engine_yahoo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | 9 | """engine_yahoo 10 | * Yahoo(yahoo.co.jp)用の検索用Classを持つモジュール. 11 | """ 12 | 13 | 14 | import json 15 | import re 16 | import sys 17 | 18 | from urllib import parse 19 | from bs4 import BeautifulSoup 20 | 21 | from .common import Color 22 | from .engine_common import CommonEngine 23 | 24 | 25 | class Yahoo(CommonEngine): 26 | """Yahoo 27 | 28 | Yahoo(yahoo.co.jp)用の検索エンジン用Class. 29 | """ 30 | 31 | def __init__(self): 32 | # CommonEngineの処理を呼出し 33 | super().__init__() 34 | 35 | self.NAME = 'Yahoo' 36 | self.COLOR = Color.YELLOW 37 | self.COLOR_NAME = self.COLOR + self.NAME + Color.END 38 | 39 | # リクエスト先のURLを指定 40 | self.ENGINE_TOP_URL = 'https://www.yahoo.co.jp/' 41 | self.SEARCH_URL = 'https://search.yahoo.co.jp/search' 42 | self.IMAGE_PRE_URL = 'https://search.yahoo.co.jp/image/search' 43 | self.IMAGE_URL = 'https://search.yahoo.co.jp/image/api/search' 44 | self.SUGGEST_URL = 'https://ff.search.yahoo.com/gossip' 45 | 46 | def gen_search_url(self, keyword: str, type: str): 47 | """gen_search_url 48 | 49 | 検索用のurlを生成する. 50 | 51 | Args: 52 | keyword (str): 検索クエリ. 53 | type (str): 検索タイプ. 54 | 55 | Returns: 56 | dict: 検索用url 57 | """ 58 | search_url = '' 59 | 60 | # 検索タイプがtextの場合 61 | if type == 'text': 62 | # 検索urlを指定 63 | search_url = self.SEARCH_URL 64 | 65 | # 検索パラメータの設定 66 | url_param = { 67 | 'p': keyword, # 検索キーワード 68 | 'num': '100', # 指定不可(削除) 69 | 'day_from': '', # 開始日時(yyyy/mm/dd) 70 | 'day_to': '', # 終了日時(yyyy/mm/dd) 71 | 'b': '', # 開始位置 72 | 'nfpr': '1', # もしかして検索(Escape hatch)の無効化 73 | 'qrw': '0' # もしかして検索(Escape hatch)の無効化 74 | } 75 | 76 | # lang/localeが設定されている場合 77 | if self.LANG != '' and self.LOCALE != '': 78 | url_param['hl'] = self.LANG 79 | url_param['gl'] = self.LOCALE 80 | 81 | # rangeが設定されている場合 82 | try: 83 | start = self.RANGE_START 84 | end = self.RANGE_END 85 | 86 | # ex.) 
day_from=2019/09/01&day_to=2019/09/30 87 | # パラメータが2つ存在している 88 | day_from = start.strftime("%Y/%m/%d") 89 | day_to = end.strftime("%Y/%m/%d") 90 | 91 | # GETパラメータに日時データを追加 92 | url_param['day_from'] = day_from 93 | url_param['day_to'] = day_to 94 | 95 | except AttributeError: 96 | None 97 | 98 | # 検索タイプがimageの場合 99 | elif type == 'image': 100 | # 前処理(パラメータ`cr`の取得)を実行 101 | cr = self.get_image_search_cr(keyword) 102 | 103 | # 検索urlを指定 104 | search_url = self.IMAGE_URL 105 | 106 | # 検索パラメータの設定 107 | url_param = { 108 | 'p': keyword, # 検索キーワード 109 | 'fr': 'top_ga1_sa', 110 | 'ei': 'UTF-8', 111 | 'aq': '-1', 112 | 'n': '20', # 指定不可(削除) 113 | 'vm': 'i', 114 | 'se': '0', 115 | 'ue': '0', 116 | 'cr': cr, 117 | # 'day_from': '', # 開始日時(yyyy/mm/dd) 118 | # 'day_to': '', # 終了日時(yyyy/mm/dd) 119 | 'b': '', # 開始位置 120 | 'nfpr': '1', # もしかして検索(Escape hatch)の無効化 121 | 'qrw': '0' # もしかして検索(Escape hatch)の無効化 122 | } 123 | 124 | page = 0 125 | while True: 126 | # parameterにページを開始する番号を指定 127 | if type == 'text': 128 | url_param['b'] = str(page * 10) 129 | elif type == 'image': 130 | url_param['b'] = str(page * 10) 131 | 132 | # パラメータをセット 133 | params = parse.urlencode(url_param) 134 | 135 | target_url = search_url + '?' + params 136 | 137 | yield 'GET', target_url, None 138 | 139 | page += 1 140 | 141 | def gen_suggest_url(self, keyword: str): 142 | """gen_suggest_url 143 | 144 | サジェスト取得用のurlを生成する. 145 | 146 | Args: 147 | keyword (str): 検索クエリ. 148 | 149 | Returns: 150 | dict: サジェスト取得用url 151 | """ 152 | url_param = { 153 | 'command': keyword, # 検索キーワード 154 | 'output': 'json', 155 | } 156 | 157 | params = parse.urlencode(url_param) 158 | url = self.SUGGEST_URL + '?' + params 159 | 160 | return url 161 | 162 | def get_links(self, url: str, html: str, type: str): 163 | """get_links 164 | 165 | 受け付けたhtmlを解析し、検索結果をlistに加工して返す関数. 166 | 167 | Args: 168 | url (str): 解析する検索結果のurl. 169 | html (str): 解析する検索結果のhtml. 170 | type (str): 検索タイプ([text, image]).現時点ではtextのみ対応. 
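        Note:
            Splash/Selenium経由の場合、検索結果はhtml内の `#__NEXT_DATA__` にあるjsonから取得する.
            以下は抽出イメージの最小スケッチ(jsonのキー構造は実際のレスポンスに依存する想定):

                import json
                from bs4 import BeautifulSoup

                soup = BeautifulSoup(html, 'lxml')
                j = json.loads(soup.select_one('#__NEXT_DATA__').string)
                algos = j['props']['initialProps']['pageProps']['pageData']['algos']
                results = [{'link': e['url'], 'title': e['title'], 'text': e['description']} for e in algos]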
171 | 172 | Returns: 173 | list: 検索結果(`[{'title': 'title...', 'url': 'https://hogehoge....'}, {...}]`) 174 | """ 175 | 176 | if type == 'text': 177 | if self.USE_SPLASH or self.USE_SELENIUM: 178 | self.SOUP_SELECT_JSON = '#__NEXT_DATA__' 179 | self.SOUP_SELECT_IMAGE = '.rg_meta.notranslate' 180 | self.SOUP_SELECT_TEXT = '' 181 | 182 | # Yahooの場合、jsonから検索結果を取得する 183 | soup = BeautifulSoup(html, 'lxml') 184 | elements = soup.select(self.SOUP_SELECT_JSON) 185 | element = elements[0].string 186 | 187 | # debug 188 | if self.IS_DEBUG: 189 | print(Color.PURPLE + '[JsonElement]' + Color.END, 190 | file=sys.stderr) 191 | print(Color.PURPLE + element + Color.END, 192 | file=sys.stderr) # type: ignore 193 | 194 | # jsonからデータを抽出  195 | j = json.loads(element) # type: ignore 196 | 197 | # debug 198 | if self.IS_DEBUG: 199 | print(Color.PURPLE + '[Json]' + Color.END, file=sys.stderr) 200 | print(Color.PURPLE + json.dumps(j) + Color.END, 201 | file=sys.stderr) 202 | 203 | jd = j['props']['initialProps']['pageProps']['pageData']['algos'] 204 | 205 | elinks = [e['url'] for e in jd] 206 | etitles = [e['title'] for e in jd] 207 | etexts = [e['description'] for e in jd] 208 | 209 | links = self.create_text_links(url, elinks, etitles, etexts) 210 | 211 | else: 212 | self.SOUP_SELECT_URL = '.sw-Card__headerSpace > .sw-Card__title > a' 213 | self.SOUP_SELECT_TITLE = '.sw-Card__headerSpace > .sw-Card__title > a > h3' 214 | self.SOUP_SELECT_TEXT = '.sw-Card__floatContainer > .sw-Card__summary' 215 | 216 | # CommonEngineの処理を呼び出す 217 | links = super().get_links(url, html, type) 218 | 219 | elif type == 'image': 220 | # CommonEngineの処理を呼び出す 221 | links = super().get_links(url, html, type) 222 | 223 | return links 224 | 225 | # 画像検索ページの検索結果(links(list()))を生成するfunction 226 | def get_image_links(self, soup: BeautifulSoup): 227 | """get_image_links 228 | BeautifulSoupから画像検索ページを解析して結果を返す関数. 229 | 230 | Args: 231 | soup (BeautifulSoup): 解析するBeautifulSoupオブジェクト. 232 | 233 | Returns: 234 | list: 検索結果(`[{'title': 'title...', 'link': 'https://hogehoge....'}, {...}]`) 235 | """ 236 | 237 | result = [] # image url 238 | 239 | try: 240 | data = json.loads(soup.text) 241 | except Exception: 242 | return result 243 | 244 | for d in data['algos']: 245 | etitle = d['title'] 246 | elink = d['refererUrl'] 247 | eimage = d['original']['url'] 248 | 249 | el = { 250 | 'title': etitle, 251 | 'pagelink': elink, 252 | 'link': eimage, 253 | } 254 | 255 | result.append(el) 256 | 257 | return result 258 | 259 | def get_suggest_list(self, suggests: list, char: str, html: str): 260 | """get_suggest_list 261 | 262 | htmlからsuggestを配列で取得する関数. 263 | 264 | Args: 265 | suggests (list): suggestを追加するための大本のlist. 266 | char (str): サジェストの文字列. 267 | html (str): 解析を行うhtml. 
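        Note:
            サジェストAPI(SUGGEST_URL)は `{"gossip": {"results": [{"key": "..."}]}}` 形式の
            jsonを返す想定. 抽出部分の最小スケッチ(レスポンス構造は変わりうる):

                import json

                data = json.loads(html)
                words = [e['key'] for e in data['gossip']['results']]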
268 | 269 | Returns: 270 | dict: サジェスト配列 271 | """ 272 | if self.USE_SELENIUM and self.SELENIUM_BROWSER == 'firefox': 273 | soup = BeautifulSoup(html, features="lxml") 274 | html = soup.find("pre").text 275 | data = json.loads(html) 276 | suggests[char if char == '' else char[-1]] = [e['key'] # type: ignore 277 | for e in data['gossip']['results']] 278 | 279 | return suggests 280 | 281 | def get_image_search_cr(self, keyword: str): 282 | """get_image_search_cr 283 | 284 | Yahooの画像検索時に必要になるcrumb(cr)パラメータを取得するための前処理リクエストを行う関数 285 | 286 | Args: 287 | keyword (str): 検索キーワード 288 | 289 | Returns: 290 | str: crumbパラメータの値 291 | """ 292 | 293 | result = '' 294 | 295 | # urlパラメータを設定 296 | url_param = { 297 | 'p': keyword, 298 | 'fr': 'top_ga1_sa', 299 | 'ei': 'UTF-8', 300 | 'aq': '-1', 301 | } 302 | params = parse.urlencode(url_param) 303 | 304 | # 前処理リクエストを投げる 305 | pre_result = self.get_result(self.IMAGE_PRE_URL + '?' + params) 306 | 307 | # 前処理リクエストから、crumbパラメータの値を取得する(正規表現) 308 | pattern = r'{ *"crumb": *"[^"]+" *}' 309 | data = re.findall(pattern, pre_result) 310 | 311 | if len(data) > 0: 312 | d = data[0] 313 | jd = json.loads(d) 314 | 315 | result = jd['crumb'] 316 | 317 | return result 318 | -------------------------------------------------------------------------------- /pydork/recaptcha.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | """engine 9 | * ReCaptcha関連のClassを集約するモジュールファイル 10 | """ 11 | 12 | import json 13 | import requests 14 | 15 | from urllib import parse 16 | from urllib.parse import urlparse 17 | from bs4 import BeautifulSoup 18 | from time import sleep 19 | 20 | from .common import Color, Message 21 | 22 | 23 | # 2CaptchaのAPIへPOSTするためのClass 24 | class TwoCaptcha: 25 | """TwoCaptcha 26 | 27 | 2CaptchaのAPIへPOSTし、ReCaptchaを突破するためのClass. 28 | 29 | Note: 30 | 公式ライブラリ側でCookieのPOSTに対応していなかったため作成. 31 | 32 | 33 | """ 34 | 35 | def __init__(self, apikey: str): 36 | """__init__ 37 | 38 | Args: 39 | apikey (str): 2CaptchaのAPI Key. 
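        Example:
            利用イメージの最小スケッチ(APIキーを環境変数 TWOCAPTCHA_APIKEY から渡すのは仮の想定):

                import os
                from pydork.recaptcha import TwoCaptcha

                tc = TwoCaptcha(apikey=os.environ.get('TWOCAPTCHA_APIKEY', ''))
                tc.set_debug(True)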
40 | """ 41 | 42 | # apiへリクエストを投げるためのsession 43 | self.session = requests.Session() 44 | 45 | # api_url 46 | self.api_in_url = 'https://2captcha.com/in.php' 47 | self.api_res_url = 'https://2captcha.com/res.php' 48 | 49 | # api_key 50 | self.api_key = apikey 51 | 52 | # proxy 53 | self.proxy = None 54 | self.user_agent = None 55 | 56 | # flag 57 | self.IS_DEBUG = False 58 | self.IS_COMMAND = False 59 | 60 | # Message 61 | self.MESSAGE = None 62 | 63 | def set_debug(self, is_debug: bool): 64 | """set_debug 65 | 66 | Args: 67 | is_debug (bool): debug modeが有効ならTrue 68 | """ 69 | 70 | self.IS_DEBUG = is_debug 71 | 72 | def set_command(self, is_command: bool): 73 | """set_command 74 | 75 | Args: 76 | is_command (bool): command modeが有効ならTrue 77 | """ 78 | 79 | self.IS_COMMAND = is_command 80 | 81 | def set_user_agent(self, user_agent: str): 82 | """set_user_agent 83 | 84 | Args: 85 | user_agent (str): 2Captchaに送るUser Agent 86 | """ 87 | 88 | self.user_agent = user_agent 89 | 90 | def set_messages(self, message: Message): 91 | """set_message 92 | 93 | Args: 94 | message (Message): 利用するcommon.Messageを指定 95 | """ 96 | 97 | self.MESSAGE = message 98 | 99 | # googleのReCaptcha画面からデータを抽出する 100 | def get_google_recaptcha_data(self, html: str): 101 | """get_google_recaptcha_data 102 | 103 | ReCapthcaのhtmlからsitekey, data-sの値を抽出する. 104 | 105 | 106 | Args: 107 | html (str): 解析するReCaptcha画面のhtmlデータ 108 | 109 | Returns: 110 | sitekey (str): 2Captchaへ送るsitekey 111 | data-s (str): 2Captchaへ送るdata-s 112 | """ 113 | 114 | # resultの初期値設定 115 | sitekey = None 116 | data_s = None 117 | 118 | # ReCaptchaのタグ・要素データを宣言 119 | recaptcha_tag = '#captcha-form > #recaptcha' 120 | sitekey_el_name = 'data-sitekey' 121 | data_s_el_name = 'data-s' 122 | 123 | # htmlをBeautifulSoupで解析する 124 | soup = BeautifulSoup(html, 'lxml') 125 | 126 | # 要素を抽出する 127 | if recaptcha_tag != '': 128 | elements = soup.select(recaptcha_tag) 129 | 130 | # 要素のチェック 131 | if len(elements) > 0: 132 | el = elements[0] 133 | 134 | try: 135 | sitekey = el[sitekey_el_name] 136 | data_s = el[data_s_el_name] 137 | 138 | return sitekey, data_s 139 | 140 | except AttributeError: 141 | None 142 | 143 | return sitekey, data_s 144 | 145 | def in_php(self, data: dict): 146 | """in_php 147 | 148 | Args: 149 | data (dict): in.phpにpostするデータ(dict) 150 | 151 | Returns: 152 | bool: 処理が正常終了か否か 153 | str: request_code 154 | """ 155 | 156 | res = self.session.post(self.api_in_url, data=data) 157 | 158 | if self.MESSAGE is not None: 159 | self.MESSAGE.print_text( 160 | '2Captcha Response in.php from `{}`: {}'.format( 161 | self.api_in_url, res.text), 162 | mode='debug', 163 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 164 | '[DEBUG]: [2CaptchaIn]' + Color.END, 165 | separator=": " 166 | ) 167 | 168 | # status codeを確認 169 | if res.status_code == 200: 170 | d = json.loads(res.text) 171 | if d['status'] == 1: 172 | request_id = d['request'] 173 | 174 | return True, request_id 175 | 176 | # request codeを取得できなかった場合、 177 | return False, None 178 | 179 | def res_php(self, request_id: str): 180 | """res_php 181 | 182 | Args: 183 | request_id (str): 2Captchaのres.php(2Captchaの突破状況確認するpath)で利用するrequest_id. 184 | 185 | Returns: 186 | (str): res.phpからのresponse結果を返す 187 | """ 188 | 189 | url_param = { 190 | 'key': self.api_key, 191 | 'action': 'get', 192 | 'json': 1, 193 | 'id': request_id 194 | } 195 | params = parse.urlencode(url_param) 196 | target_url = self.api_res_url + '?' 
+ params 197 | 198 | result = self.session.get(target_url) 199 | 200 | if self.MESSAGE is not None: 201 | self.MESSAGE.print_text( 202 | '2Captcha res.php Response from `{}`: {}'.format( 203 | target_url, result.text), 204 | mode='debug', 205 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 206 | '[DEBUG]: [2CaptchaRes]' + Color.END, 207 | separator=": " 208 | ) 209 | 210 | return result 211 | 212 | # 解析結果を渡す 213 | def google_recaptcha(self, html: str, url: str, cookies: list, proxy: str): 214 | """google_recaptcha 215 | 216 | Args: 217 | html (str): ReCaptchaのhtml. 218 | url (str): ReCaptchaが表示されてしまったurl(元のurl) 219 | cookies (list): cookiesを渡す. 220 | proxy (str): proxyをuriで渡す. 221 | 222 | Returns: 223 | (str): Google ReCaptchaで使用するcodeを返す. 224 | """ 225 | 226 | # code 227 | code = None 228 | result = None 229 | 230 | # set proxy 231 | self.proxy = proxy 232 | 233 | # sitekey, data-sを取得する 234 | sitekey, data_s = self.get_google_recaptcha_data(html) 235 | 236 | # proxyをuriから整形する 237 | proxy_parse = urlparse(proxy) 238 | proxy_type = proxy_parse.scheme.upper() 239 | proxy_uri = proxy_parse.netloc 240 | 241 | # cookieを整形する 242 | cookie_elements = [] 243 | for cookie in cookies: 244 | cookie_element = cookie['name'] + ':' + cookie['value'] 245 | cookie_elements.append(cookie_element) 246 | 247 | cookie_data = ';'.join(cookie_elements) 248 | 249 | # postリクエストで使用するデータを生成する 250 | payload = { 251 | 'key': self.api_key, 252 | 'pageurl': url, 253 | 'method': 'userrecaptcha', 254 | 'json': 1, 255 | 'googlekey': sitekey, 256 | 'data-s': data_s, 257 | 'proxytype': proxy_type, 258 | 'proxy': proxy_uri, 259 | 'cookies': cookie_data, 260 | 'callback': 'submitCallback', 261 | } 262 | 263 | if self.user_agent is not None: 264 | payload['userAgent'] = self.user_agent 265 | 266 | while True: 267 | # debug message 268 | if self.MESSAGE is not None: 269 | self.MESSAGE.print_text( 270 | 'Send ReCaptcha Data to `{}`.'.format( 271 | self.api_in_url), 272 | mode='info', 273 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 274 | '[DEBUG]: [ReCaptcha]' + Color.END, 275 | separator=": " 276 | ) 277 | 278 | # リクエストを送信 279 | ok, request_id = self.in_php(payload) 280 | 281 | if not ok: 282 | # debug message 283 | if self.MESSAGE is not None: 284 | self.MESSAGE.print_text( 285 | 'Failed Send ReCaptcha Data. 
data: {}'.format( 286 | payload), 287 | mode='warn', 288 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 289 | '[DEBUG]: [ReCaptcha]' + Color.END, 290 | separator=": " 291 | ) 292 | 293 | break 294 | 295 | # message 296 | if self.MESSAGE is not None: 297 | self.MESSAGE.print_text( 298 | 'Get request_id: {}'.format(request_id), 299 | mode='info', 300 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 301 | '[DEBUG]: [2Captcha]' + Color.END, 302 | separator=": " 303 | ) 304 | 305 | self.MESSAGE.print_text( 306 | 'Check ReCaptcha Response Status from: {}'.format( 307 | self.api_res_url), 308 | mode='info', 309 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 310 | '[DEBUG]: [2Captcha]' + Color.END, 311 | separator=": " 312 | ) 313 | 314 | # res_phpのチェックループ 315 | while True: 316 | res = self.res_php(request_id) 317 | 318 | # レスポンス(json)から読み込む 319 | data = json.loads(res.text) 320 | 321 | # codeを取得 322 | code = data['request'] 323 | 324 | if data['status'] == 1: 325 | result = code 326 | return result 327 | 328 | if code != 'CAPCHA_NOT_READY': 329 | break 330 | 331 | sleep(30) 332 | 333 | if code is None: 334 | code = 'None' 335 | 336 | # debug messages 337 | if self.MESSAGE is not None: 338 | self.MESSAGE.print_text( 339 | 'Bypass NG ReCaptcha Data. code: {}'.format(code), 340 | mode='warn', 341 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 342 | '[DEBUG]: [2Captcha]' + Color.END, 343 | separator=": " 344 | ) 345 | 346 | return result 347 | -------------------------------------------------------------------------------- /pydork/engine_bing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | 9 | """engine_bing 10 | * Bing用の検索用Classを持つモジュール. 11 | """ 12 | 13 | import requests 14 | import datetime 15 | import json 16 | import asyncio 17 | import re 18 | 19 | from urllib import parse 20 | from bs4 import BeautifulSoup 21 | 22 | from .common import Color 23 | from .engine_common import CommonEngine 24 | 25 | 26 | class Bing(CommonEngine): 27 | """Bing 28 | 29 | Bing用の検索エンジン用Class. 30 | """ 31 | 32 | def __init__(self): 33 | # CommonEngineの処理を呼出し 34 | super().__init__() 35 | 36 | self.NAME = 'Bing' 37 | self.COLOR = Color.CYAN 38 | self.COLOR_NAME = self.COLOR + self.NAME + Color.END 39 | 40 | # リクエスト先のURLを指定 41 | self.ENGINE_TOP_URL = 'https://www.bing.com/' 42 | self.SEARCH_URL = 'https://www.bing.com/search' 43 | self.IMAGE_URL = 'https://www.bing.com/images/async' 44 | self.SUGGEST_URL = 'https://www.bing.com/AS/Suggestions' 45 | 46 | def gen_search_url(self, keyword: str, type: str): 47 | """gen_search_url 48 | 49 | 検索用のurlを生成する. 50 | 51 | Args: 52 | keyword (str): 検索クエリ. 53 | type (str): 検索タイプ. 
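        Note:
            期間指定(RANGE_START/RANGE_END)は `filters=ex1:"ez5_{開始}_{終了}"` 形式で、
            値は1970-01-01からの経過日数になる. 計算イメージ(2019-09-01なら18140日になるはず):

                import datetime

                unix_day = datetime.datetime.strptime('1970-01-01', '%Y-%m-%d')
                start = datetime.datetime.strptime('2019-09-01', '%Y-%m-%d')
                cd_min = (start - unix_day).days  # -> 18140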
54 | 55 | Returns: 56 | dict: 検索用url 57 | """ 58 | 59 | search_url = '' 60 | 61 | # NOTE: 62 | # 2023/07/27にて、queryが以下のように切り替わったため修正 63 | # - `https://www.bing.com/search?q=site%3aorebibou.com&search=%e9%80%81%e4%bf%a1&rdr=1&rdrig=D4B6730A85514F25BAE1E9BDC04F1C28&cc=us&setlang=en` 64 | # ```json 65 | # { 66 | # 'q': ['site:orebibou.com'], 67 | # 'search': ['送信'], 68 | # 'rdr': ['1'], 69 | # 'rdrig': ['D4B6730A85514F25BAE1E9BDC04F1C28'], 70 | # 'cc': ['us'], 71 | # 'setlang': ['en'] 72 | # } 73 | # ``` 74 | # - `https://www.bing.com/search?q=site%3aorebibou.com&search=%E9%80%81%E4%BF%A1&rdr=1&rdrig=D4B6730A85514F25BAE1E9BDC04F1C28&cc=us&setlang=en&FPIG=B035C5DE50AE4A328CB93C767B02D08B&first=11&FORM=PERE&count=100` 75 | 76 | # 検索タイプがtextの場合 77 | if type == 'text': 78 | # 検索urlを指定 79 | search_url = self.SEARCH_URL 80 | 81 | # 検索パラメータの設定 82 | url_param = { 83 | 'q': keyword, # 検索キーワード 84 | 'count': '100', # 1ページごとの表示件数 85 | 'search': '送信', 86 | 'rdr': '1', 87 | 'from': 'PERE', 88 | 'cc': 'us', 89 | 'setlang': 'en', 90 | 'filters': '', # 期間含めフィルターとして指定するパラメータ 91 | 'first': '' # 開始位置 92 | } 93 | 94 | # lang/localeが設定されている場合 95 | if self.LANG != '': 96 | url_param['setlang'] = self.LANG.lower() 97 | if self.LOCALE != '': 98 | url_param['cc'] = self.LOCALE.lower() 99 | 100 | # rangeが設定されている場合 101 | try: 102 | start = self.RANGE_START 103 | end = self.RANGE_END 104 | 105 | unix_day = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d") 106 | cd_min = (start - unix_day).days 107 | cd_max = (end - unix_day).days 108 | 109 | # GETパラメータに日時データを追加 110 | url_param['filters'] = 'ex1:"ez5_{0}_{1}"'.format( 111 | cd_min, cd_max) 112 | 113 | except AttributeError: 114 | None 115 | 116 | # 検索タイプがimageの場合 117 | elif type == 'image': 118 | # 検索urlを指定 119 | search_url = self.IMAGE_URL 120 | 121 | # 検索パラメータの設定 122 | url_param = { 123 | 'q': keyword, # 検索キーワード 124 | 'count': '100', # 1回ごとの件数 125 | 'first': '', # 検索位置 126 | 'tsc': 'ImageBasicHover', 127 | 'layout': 'RowBased', 128 | } 129 | 130 | # rangeが指定されている場合 131 | # TODO: 日時パラメータを追加(ex: `qft=+filterui%3aage-lt43200`) 132 | 133 | page = 0 134 | while True: 135 | # parameterにページを開始する番号を指定 136 | url_param['first'] = str(page * 100) 137 | params = parse.urlencode(url_param) 138 | 139 | target_url = search_url + '?' + params 140 | 141 | yield 'GET', target_url, None 142 | 143 | page += 1 144 | 145 | def gen_suggest_url(self, keyword: str): 146 | """gen_suggest_url 147 | 148 | サジェスト取得用のurlを生成する. 149 | 150 | Args: 151 | keyword (str): 検索クエリ. 152 | 153 | Returns: 154 | dict: サジェスト取得用url 155 | """ 156 | 157 | url_param = { 158 | 'qry': keyword, # 検索キーワード 159 | 'cvid': 'F5F47E4155E44D86A86690B49023B0EF' 160 | } 161 | 162 | params = parse.urlencode(url_param) 163 | url = self.SUGGEST_URL + '?' + params 164 | 165 | return url 166 | 167 | def get_links(self, url: str, html: str, type: str): 168 | """get_links 169 | 170 | 受け付けたhtmlを解析し、検索結果をlistに加工して返す関数. 171 | 172 | Args: 173 | html (str): 解析する検索結果のhtml. 174 | type (str): 検索タイプ([text, image]).現時点ではtextのみ対応. 
175 | 176 | Returns: 177 | list: 検索結果(`[{'title': 'title...', 'url': 'https://hogehoge....'}, {...}]`) 178 | """ 179 | 180 | if type == 'text': 181 | self.SOUP_SELECT_URL = 'h2 > a' 182 | self.SOUP_SELECT_TITLE = 'h2 > a' 183 | self.SOUP_SELECT_TEXT = 'li > div > p' 184 | 185 | elif type == 'image': 186 | self.SOUP_SELECT_URL = '.imgpt > .iusc' 187 | 188 | # CommonEngineの処理を呼び出す 189 | links = super().get_links(url, html, type) 190 | 191 | return links 192 | 193 | # 画像検索ページの検索結果(links(list()))を生成するfunction 194 | def get_image_links(self, soup: BeautifulSoup): 195 | """get_image_links 196 | BeautifulSoupから画像検索ページを解析して結果を返す関数. 197 | 198 | Args: 199 | soup (BeautifulSoup): 解析するBeautifulSoupオブジェクト. 200 | 201 | Returns: 202 | list: 検索結果(`[{'title': 'title...', 'link': 'https://hogehoge....'}, {...}]`) 203 | """ 204 | 205 | elements = soup.select(self.SOUP_SELECT_URL) 206 | edata = [e['m'] for e in elements] 207 | 208 | result = [] # image url 209 | for e in edata: 210 | # json化 211 | je = json.loads(e) 212 | 213 | etitle = je['t'] 214 | elink = je['purl'] 215 | eimage = je['murl'] 216 | 217 | el = { 218 | 'title': etitle, 219 | 'pagelink': elink, 220 | 'link': eimage, 221 | } 222 | 223 | result.append(el) 224 | 225 | return result 226 | 227 | def get_suggest_list(self, suggests: list, char: str, html: str): 228 | """get_suggest_list 229 | 230 | htmlからsuggestを配列で取得する関数. 231 | 232 | Args: 233 | suggests (list): suggestを追加するための大本のlist. 234 | char (str): サジェストの文字列. 235 | html (str): 解析を行うhtml. 236 | 237 | Returns: 238 | dict: サジェスト配列 239 | """ 240 | soup = BeautifulSoup(html, 'lxml') 241 | elements = soup.select('ul > li') 242 | suggests[char if char == '' else char[-1]] = [e['query'] 243 | for e in elements] 244 | return suggests 245 | 246 | def processings_elist(self, elinks, etitles, etexts: list): 247 | """processings_elist 248 | 249 | self.get_links 内で、取得直後のelinks, etitlesに加工を加えるための関数. 250 | requestsを用いて、リダイレクトリンクから遷移先urlを取得していく. 
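        Note:
            Bingの検索結果リンクは `https://www.bing.com/ck/a?...` 形式の遷移ページを挟むため、
            遷移ページ中の `var u = "..."` 行から実urlを取り出している(resolv_url参照).
            抽出部分の最小スケッチ(resは遷移ページのhtml文字列を想定):

                import re

                for line in res.splitlines():
                    if re.match('^ +var u', line):
                        url = re.findall('"([^"]*)"', line)[0]
                        break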
251 | 252 | Args: 253 | elinks (list): elinks(検索結果のlink)の配列 254 | etitles (list): etitles(検索結果のtitle)の配列 255 | etexts (list): etexts(検索結果のtext)の配列 256 | 257 | Returns: 258 | elinks (list): elinks(検索結果のlink)の配列 259 | etitles (list): etitles(検索結果のtitle)の配列 260 | etexts (list): etexts(検索結果のtext)の配列 261 | """ 262 | 263 | # 通常のスクレイピングとは別にセッションを作成 264 | session = requests.session() 265 | 266 | # pool sizeを調整 267 | adapter = requests.adapters.HTTPAdapter( 268 | pool_connections=100, pool_maxsize=100) 269 | session.mount('https://', adapter) 270 | 271 | # proxyを設定 272 | if self.PROXY != '': 273 | proxies = { 274 | 'http': self.PROXY, 275 | 'https': self.PROXY 276 | } 277 | session.proxies = proxies 278 | 279 | # user-agentを設定 280 | if self.USER_AGENT != '': 281 | session.headers.update( 282 | { 283 | 'User-Agent': self.USER_AGENT, 284 | 'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3' 285 | } 286 | ) 287 | 288 | # asyncio loopを作成 289 | loop = asyncio.new_event_loop() 290 | asyncio.set_event_loop(loop) 291 | 292 | # リダイレクト先のurlに置き換え 293 | elinks = loop.run_until_complete( 294 | resolv_links(loop, session, elinks)) 295 | loop.close() 296 | 297 | return elinks, etitles, etexts 298 | 299 | 300 | async def resolv_links(loop: asyncio.AbstractEventLoop, session: requests.Session, links: list): 301 | """resolv_links 302 | 303 | リダイレクト先のurlをパラレルで取得する(Bingで使用) 304 | 305 | Args: 306 | loop (asyncio.AbstractEventLoop): loop 307 | session (requests.Session): 使用するSession 308 | links (list): リダイレクト先を取得するurlのリスト 309 | 310 | Returns: 311 | data (list): リダイレクト先を取得したurlのリスト 312 | """ 313 | 314 | async def req(session: requests.Session, url: str): 315 | task = await loop.run_in_executor(None, resolv_url, session, url) 316 | return task 317 | 318 | tasks = [] 319 | for link in links: 320 | # urlをパース 321 | url = parse.urlparse(link) 322 | 323 | # bingの遷移ページの場合はリダイレクトして処理 324 | if url.netloc == 'www.bing.com' and url.path == '/ck/a': 325 | task = req(session, link) 326 | tasks.append(task) 327 | 328 | data = await asyncio.gather(*tasks) 329 | 330 | return data 331 | 332 | 333 | def resolv_url(session: requests.Session, url: str): 334 | """resolv_url 335 | リダイレクト先のurlを取得する(Bingで使用) 336 | Args: 337 | session (request.Session): リダイレクト先を取得する際に使用するSession 338 | url (str): リダイレクト先を取得するurl 339 | Returns: 340 | url (str): リダイレクト先のurl 341 | """ 342 | 343 | while True: 344 | try: 345 | # リダイレクト先のbodyを取得する 346 | res = session.get(url).text 347 | 348 | except requests.RequestException: 349 | continue 350 | except ConnectionError: 351 | continue 352 | else: 353 | # resから1行ずつチェック 354 | for line in res.splitlines(): 355 | if re.match('^ +var u', line): 356 | text = re.findall('"([^"]*)"', line) 357 | url = text[0] 358 | break 359 | break 360 | 361 | return url 362 | -------------------------------------------------------------------------------- /pydork/engine_baidu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | 9 | """engine_baidu 10 | * Baidu用の検索用Classを持つモジュール. 
11 | """ 12 | 13 | import requests 14 | import json 15 | import asyncio 16 | import sys 17 | 18 | from urllib import parse 19 | from bs4 import BeautifulSoup 20 | 21 | from .engine_common import CommonEngine 22 | from .common import Color 23 | 24 | 25 | class Baidu(CommonEngine): 26 | """Baidu 27 | 28 | Baidu用の検索エンジン用Class. 29 | 30 | Note: 31 | 検索結果に直接urlが記載されておらず、リンクを踏んで移動先のurlを取得する必要がある。 32 | そのため、検索結果を取得してからパラレルで検索結果urlからリンク先urlを取得している。 33 | なお、その際のリクエストはSelenium/Splashを使用している場合でもrequestsを使っている。 34 | """ 35 | 36 | def __init__(self): 37 | # CommonEngineの処理を呼出し 38 | super().__init__() 39 | 40 | self.NAME = 'Baidu' 41 | self.COLOR = Color.RED 42 | self.COLOR_NAME = self.COLOR + self.NAME + Color.END 43 | 44 | # リクエスト先のURLを指定 45 | self.ENGINE_TOP_URL = 'https://www.baidu.com/' 46 | self.SEARCH_URL = 'https://www.baidu.com/s' 47 | self.IMAGE_URL = 'https://image.baidu.com/search/acjson' 48 | self.SUGGEST_URL = 'https://www.baidu.com/sugrec' 49 | 50 | def gen_search_url(self, keyword: str, type: str): 51 | """gen_search_url 52 | 53 | 検索用のurlを生成する. 54 | 55 | Args: 56 | keyword (str): 検索クエリ. 57 | type (str): 検索タイプ. 58 | 59 | Returns: 60 | dict: 検索用url 61 | """ 62 | 63 | if type == 'text': 64 | # 1ページごとの表示件数 65 | view_num = 50 66 | 67 | # 検索urlを指定 68 | search_url = self.SEARCH_URL 69 | 70 | # 検索パラメータの設定 71 | url_param = { 72 | 'wd': keyword, # 検索キーワード 73 | 'rn': view_num, # 1ページごとの表示件数 74 | 'filter': '0', # aaa 75 | 'ia': 'web', # 76 | 'pn': '' # 開始位置 77 | } 78 | 79 | elif type == 'image': 80 | # 1ページごとの表示件数 81 | view_num = 30 82 | 83 | # example: 84 | # 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=10696586825489113064&ipn=rj&ct=201326592&is=&fp=result&queryWord=poop&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=©right=&word=poop&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=1&fr=&expermode=&force=&pn=30&rn=30&gsm=1e&1617708591950=' 85 | # 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=11967476791890431299&ipn=rj&ct=201326592&is=&fp=result&queryWord=poop&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=©right=&word=poop&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=1&fr=&expermode=&nojc=&pn=60&rn=30&gsm=3c&1629026924429=' 86 | 87 | # 検索urlを指定 88 | search_url = self.IMAGE_URL 89 | 90 | # 検索パラメータの設定 91 | url_param = { 92 | 'tn': 'resultjson_com', 93 | 'fp': 'result', 94 | 'queryWord': keyword, 95 | 'word': keyword, 96 | 'logid': '12399428100030957064', 97 | 'ipn': 'rj', 98 | 'ct': '201326592', 99 | 'lm': '-1', 100 | 'cl': 2, 101 | 'ie': 'utf-8', 102 | 'nc': 1, 103 | 'pn': 0, # 開始位置 104 | 'rn': view_num, 105 | 'gsm': '3c', 106 | } 107 | 108 | page = 0 109 | while True: 110 | # parameterにページを開始する番号を指定 111 | url_param['pn'] = str(page * view_num) 112 | params = parse.urlencode(url_param) 113 | 114 | target_url = search_url + '?' + params 115 | 116 | yield 'GET', target_url, None 117 | 118 | page += 1 119 | 120 | def gen_suggest_url(self, keyword: str): 121 | """gen_suggest_url 122 | 123 | サジェスト取得用のurlを生成する. 124 | 125 | Args: 126 | keyword (str): 検索クエリ. 127 | 128 | Returns: 129 | dict: サジェスト取得用url 130 | """ 131 | 132 | url_param = { 133 | 'wd': keyword, # 検索キーワード 134 | 'prod': 'pc' # 135 | } 136 | 137 | params = parse.urlencode(url_param) 138 | url = self.SUGGEST_URL + '?' + params 139 | 140 | return url 141 | 142 | def get_links(self, url: str, html: str, type: str): 143 | """get_links 144 | 145 | 受け付けたhtmlを解析し、検索結果をlistに加工して返す関数. 146 | 147 | Args: 148 | html (str): 解析する検索結果のhtml. 149 | type (str): 検索タイプ([text, image]).現時点ではtextのみ対応. 
150 | 151 | Returns: 152 | list: 検索結果(`[{'title': 'title...', 'url': 'https://hogehoge....'}, {...}]`) 153 | """ 154 | 155 | links = [] 156 | 157 | if type == 'text': 158 | # Splash経由で通信している場合 159 | self.SOUP_SELECT_URL = '.tts-title > a' 160 | self.SOUP_SELECT_TITLE = '.tts-title > a' 161 | self.SOUP_SELECT_TEXT = '.c-gap-top-small > span' 162 | 163 | # CommonEngineの処理を呼び出す 164 | links = super().get_links(url, html, type) 165 | 166 | elif type == 'image': 167 | # unicode escape 168 | # html = html.encode().decode("unicode-escape") 169 | html = html.replace("\\'", "'") 170 | 171 | # seleniumを使用している場合、htmlで返ってくるためjson要素のみを抽出する 172 | if self.USE_SELENIUM: 173 | html_text = "" 174 | soup = BeautifulSoup(html, "lxml") 175 | 176 | for text in soup.find_all(text=True): 177 | if text.strip(): 178 | html_text += text 179 | 180 | html = html_text 181 | 182 | # json load 183 | try: 184 | json_data = json.loads(html, strict=False) 185 | except Exception as e: 186 | print(e, file=sys.stderr) 187 | return links 188 | 189 | if 'data' in json_data: 190 | data = json_data['data'] 191 | 192 | for d in data: 193 | if 'replaceUrl' in d: 194 | result = dict() 195 | 196 | # 画像ファイルのurlをパラメータに持つvalueを取得する 197 | replace_url = d['replaceUrl'][0]['ObjURL'] 198 | replace_url = replace_url.replace( 199 | 'image_search/', 'image_search/?') 200 | 201 | # url valueをparse 202 | replace_url_query = parse.urlparse(replace_url).query 203 | 204 | # パラメータを取得 205 | replace_url_query_dict = parse.parse_qs( 206 | replace_url_query) 207 | 208 | if 'src' not in replace_url_query_dict: 209 | continue 210 | 211 | # 画像urlを取得 212 | result['link'] = replace_url_query_dict['src'][0] 213 | 214 | if 'fromPageTitle' in d: 215 | result['title'] = d['fromPageTitle'] 216 | 217 | links.append(result) 218 | 219 | return links 220 | 221 | def get_suggest_list(self, suggests: list, char: str, html: str): 222 | """get_suggest_list 223 | 224 | htmlからsuggestを配列で取得する関数. 225 | 226 | Args: 227 | suggests (list): suggestを追加するための大本のlist. 228 | char (str): サジェストの文字列. 229 | html (str): 解析を行うhtml. 230 | 231 | Returns: 232 | dict: サジェスト配列 233 | """ 234 | 235 | try: 236 | data = json.loads(html) 237 | except Exception: 238 | soup = BeautifulSoup(html, "lxml") 239 | json_data = soup.select_one('html > body') 240 | data = json.loads(json_data.text) 241 | 242 | if 'g' in data: 243 | suggests[char if char == '' else char[-1] 244 | ] = [e['q'] 245 | for e in data['g']] 246 | return suggests 247 | 248 | def processings_elist(self, elinks, etitles, etexts: list): 249 | """processings_elist 250 | 251 | self.get_links 内で、取得直後のelinks, etitlesに加工を加えるための関数. 252 | requestsを用いて、リダイレクトリンクから遷移先urlを取得していく. 
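        Note:
            Baiduの検索結果リンクはリダイレクタのため、HEADリクエストのLocationヘッダから
            遷移先urlを取得している(resolv_url参照). 取得部分の最小スケッチ(urlは検索結果のリンクを想定):

                import requests

                session = requests.session()
                location = session.head(url, allow_redirects=False).headers['Location']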
253 | 254 | Args: 255 | elinks (list): elinks(検索結果のlink)の配列 256 | etitles (list): etitles(検索結果のtitle)の配列 257 | etexts (list): etexts(検索結果のtext)の配列 258 | 259 | Returns: 260 | elinks (list): elinks(検索結果のlink)の配列 261 | etitles (list): etitles(検索結果のtitle)の配列 262 | etexts (list): etexts(検索結果のtext)の配列 263 | """ 264 | 265 | # 通常のスクレイピングとは別にセッションを作成 266 | session = requests.session() 267 | 268 | # pool sizeを調整 269 | adapter = requests.adapters.HTTPAdapter( 270 | pool_connections=100, pool_maxsize=100) 271 | session.mount('http://', adapter) 272 | 273 | # proxyを設定 274 | if self.PROXY != '': 275 | proxies = { 276 | 'http': self.PROXY, 277 | 'https': self.PROXY 278 | } 279 | session.proxies = proxies 280 | 281 | # user-agentを設定 282 | if self.USER_AGENT != '': 283 | session.headers.update( 284 | { 285 | 'User-Agent': self.USER_AGENT, 286 | 'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3' 287 | } 288 | ) 289 | 290 | # asyncio loopを作成 291 | loop = asyncio.new_event_loop() 292 | asyncio.set_event_loop(loop) 293 | 294 | # リダイレクト先のurlに置き換え 295 | elinks = loop.run_until_complete( 296 | resolv_links(loop, session, elinks)) 297 | loop.close() 298 | 299 | return elinks, etitles, etexts 300 | 301 | 302 | async def resolv_links(loop: asyncio.AbstractEventLoop, session: requests.Session, links: list): 303 | """resolv_links 304 | 305 | リダイレクト先のurlをパラレルで取得する(Baiduで使用) 306 | 307 | Args: 308 | loop (asyncio.AbstractEventLoop): loop 309 | session (requests.Session): 使用するSession 310 | links (list): リダイレクト先を取得するurlのリスト 311 | 312 | Returns: 313 | data (list): リダイレクト先を取得したurlのリスト 314 | """ 315 | 316 | async def req(session: requests.Session, url: str): 317 | task = await loop.run_in_executor(None, resolv_url, session, url) 318 | return task 319 | 320 | tasks = [] 321 | for link in links: 322 | task = req(session, link) 323 | tasks.append(task) 324 | 325 | data = await asyncio.gather(*tasks) 326 | 327 | return data 328 | 329 | 330 | def resolv_url(session: requests.Session, url: str): 331 | """resolv_url 332 | 333 | リダイレクト先のurlを取得する(Baiduで使用) 334 | 335 | Args: 336 | session (request.Session): リダイレクト先を取得する際に使用するSession 337 | url (str): リダイレクト先を取得するurl 338 | Returns: 339 | url (str): リダイレクト先のurl 340 | """ 341 | while True: 342 | try: 343 | res_header = session.head(url, allow_redirects=False).headers 344 | except requests.RequestException: 345 | continue 346 | except ConnectionError: 347 | continue 348 | else: 349 | url = res_header['Location'] 350 | break 351 | 352 | return url 353 | -------------------------------------------------------------------------------- /pydork/sub_commands.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | """subcommands 9 | * pydorkをコマンドとして動作させる際の処理を記載しているモジュール 10 | """ 11 | 12 | 13 | import sys 14 | import threading 15 | import json 16 | import os 17 | import pathlib 18 | 19 | from typing import List 20 | from argparse import Namespace 21 | from jinja2 import Template 22 | 23 | from .engine import SearchEngine, ENGINES 24 | from .common import Color 25 | from .common import Message 26 | 27 | 28 | # サブコマンドの動作集約用関数 29 | def run_subcommand(subcommand, args): 30 | """run_subcommand 31 | 32 | Args: 33 | subcommand (str): 使用するサブコマンド([search, suggest]). 
34 | args (Namespace): argparseで取得した引数(Namespace). 35 | """ 36 | 37 | # template file用の変数セット(dict) 38 | template_variable = {} 39 | 40 | # query及びfileがともに指定なしの場合、エラーにして返す 41 | if args.query == "" and args.file == "" and args.template_file == "": 42 | print("Error: クエリもしくはファイルを指定してください.", file=sys.stderr) 43 | return 44 | 45 | # args.fileのチェック 46 | if args.file != "": 47 | if not os.path.exists(args.file): 48 | print("Error: ファイルが存在しません.", file=sys.stderr) 49 | return 50 | 51 | # args.template_fileのチェック 52 | if args.template_file != "": 53 | if not os.path.exists(args.template_file): 54 | print("Error: ファイルが存在しません.", file=sys.stderr) 55 | return 56 | 57 | if args.template_variable == "": 58 | print("Error: テンプレート変数が指定されていません.", file=sys.stderr) 59 | return 60 | 61 | try: 62 | template_variable = json.loads(args.template_variable) 63 | except Exception: 64 | print("Error: テンプレート変数の形式がまちがっています.", file=sys.stderr) 65 | return 66 | 67 | # 各サブコマンドのチェック 68 | target = None 69 | search_mode = '' 70 | if subcommand == 'search': 71 | # チェック処理 72 | if ((args.start is None and args.end is not None) or (args.start is not None and args.end is None)): 73 | print( 74 | Color.GRAY + "期間を指定する場合は--start, --endの両方を指定してください" + Color.END, 75 | file=sys.stderr 76 | ) 77 | return 78 | target = run_search 79 | search_mode = 'text' 80 | 81 | elif subcommand == 'image': 82 | target = run_search 83 | search_mode = 'image' 84 | 85 | elif subcommand == 'suggest': 86 | target = run_suggest 87 | 88 | # create query_list 89 | query_list = generate_query_list(args) 90 | 91 | # append query in template file 92 | if args.template_file != "": 93 | # template fileのfullpathを取得 94 | template_file = pathlib.Path(args.template_file).expanduser() 95 | 96 | # args.template_variableをjsonとして読み込む. 97 | 98 | with open(template_file) as f: 99 | template_data = f.read() 100 | 101 | # template fileから値を取得 102 | tmpl = Template(template_data) 103 | 104 | # 設定情報を取得 105 | tmpl_params = template_variable 106 | 107 | # レンダリング処理を実行 108 | rendered_query_strings = tmpl.render(tmpl_params) 109 | 110 | # templateを1行ずつqueryに追加 111 | template_file_querys = [s.strip() 112 | for s in rendered_query_strings.splitlines()] 113 | query_list.extend(template_file_querys) 114 | 115 | # engine_listへ、選択されているsearch engineを入れていく 116 | engine_list = [] 117 | for search_type in args.search_type: 118 | # if all 119 | if search_type == 'all': 120 | for engine in ENGINES: 121 | engine_list.append(engine) 122 | continue 123 | 124 | # if in searchengine 125 | if search_type in ENGINES: 126 | engine_list.append(search_type) 127 | continue 128 | 129 | # engine_listから、重複したリストを削除 130 | engine_list = list(set(engine_list)) 131 | 132 | tasks = [] 133 | thread_result = dict() 134 | lock = threading.Lock() 135 | for engine in engine_list: 136 | task = threading.Thread( 137 | target=target, args=(engine, query_list, args, thread_result, True, lock, search_mode)) 138 | tasks.append(task) 139 | 140 | for task in tasks: 141 | task.start() 142 | 143 | for task in tasks: 144 | task.join() 145 | 146 | # json出力が有効だった場合、json形式で出力 147 | if args.json: 148 | print(json.dumps(thread_result, ensure_ascii=False, indent=2)) 149 | 150 | 151 | # SearchEngineのオプション設定用関数 152 | def set_se_options(se: SearchEngine, args: Namespace): 153 | """set_se_options 154 | 155 | Args: 156 | se (SearchEngine): argsの情報を元に、オプションを設定するSearchEngine. 157 | args (Namespace): argparseで取得した引数(Namespace). 158 | 159 | Returns: 160 | SearchEngine: オプションを設定したSearchEngine. 
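    Example:
        呼び出しイメージの最小スケッチ(argsはargparseで生成したNamespaceを想定):

            from pydork.engine import SearchEngine

            se = SearchEngine()
            se.set('google')
            se = set_se_options(se, args)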
161 | """ 162 | 163 | # set debug flag 164 | if 'debug' in args: 165 | se.set_is_debug(args.debug) 166 | 167 | # set ssl verify 168 | if 'insecure' in args: 169 | se.set_ignore_ssl(args.insecure) 170 | 171 | # set is_command flag 172 | se.set_is_command(True) 173 | 174 | # set disable headless 175 | if 'disable_headless' in args: 176 | se.set_disable_headless(args.disable_headless) 177 | 178 | # proxy 179 | if args.proxy != '': 180 | se.set_proxy(args.proxy) 181 | 182 | # Selenium 183 | if args.selenium: 184 | # set default endpoint 185 | endpoint = None 186 | 187 | # if set browser-endpoint 188 | if args.browser_endpoint != "": 189 | endpoint = args.browser_endpoint 190 | 191 | # set selenium 192 | se.set_selenium(endpoint, args.browser) 193 | 194 | # Splush 195 | if args.splash: 196 | # set default endpoint 197 | endpoint = 'localhost:8050' 198 | 199 | # if set browser-endpoint 200 | if args.browser_endpoint != "": 201 | endpoint = args.browser_endpoint 202 | 203 | # set splash 204 | se.set_splash(endpoint) 205 | 206 | # useragent 207 | se.set_user_agent() 208 | 209 | # lang/country code 210 | se.set_lang(args.lang, args.country) 211 | 212 | # set cookie driver(last set) 213 | se.set_cookie_files(args.cookies) 214 | 215 | # set cookie file delete 216 | se.set_cookie_files_delete(args.delete_cookies) 217 | 218 | return se 219 | 220 | 221 | # 検索結果を出力する 222 | def print_search_result(result, args: Namespace, message: Message): 223 | """print_search_result 224 | 225 | 226 | Args: 227 | result : SearchEngine.searchのresult. 228 | args (Namespace): argparseで取得した引数(Namespace). 229 | message (common.Message): 出力用Class. 230 | """ 231 | 232 | # 区切り文字を指定 233 | sep = ': ' 234 | if args.nullchar: 235 | sep = '\0' 236 | 237 | # title出力を行うか確認 238 | title_mode = False 239 | if 'title' in args: 240 | title_mode = args.title 241 | 242 | # pageurl出力を行うか確認 243 | pagelink_mode = False 244 | if 'pagelink' in args: 245 | pagelink_mode = args.pagelink 246 | 247 | for d in result: 248 | data = [] 249 | link = d['link'] 250 | 251 | # 出力dataにlinkを追加 252 | data.insert(0, link) 253 | 254 | # pageurlの有無を確認 255 | if 'pagelink' in d and pagelink_mode: 256 | pagelink = d['pagelink'] 257 | 258 | # pagelinkの色指定 259 | if args.color == 'always' or (args.color == 'auto' and sys.stdout.isatty()): 260 | pagelink = Color.GRAY + Color.UNDERLINE + pagelink + Color.END 261 | 262 | data.insert(0, pagelink) 263 | 264 | # titleの有無を確認 265 | if 'title' in d and title_mode: 266 | title = d['title'] 267 | 268 | # titleの色指定 269 | if args.color == 'always' or (args.color == 'auto' and sys.stdout.isatty()): 270 | title = Color.GRAY + title + Color.END 271 | 272 | data.insert(0, title) 273 | 274 | message.print_line(*data, separator=sep) 275 | 276 | 277 | # generate 278 | def generate_query_list(args: Namespace): 279 | """generate_query_list 280 | 281 | """ 282 | # create query_list 283 | query_list: List[str] = list() 284 | 285 | # append query 286 | if args.query != "": 287 | query_list.append(args.query) 288 | 289 | # append query in file 290 | if args.file != "": 291 | # fileのfull pathを取得 292 | file = pathlib.Path(args.file).expanduser() 293 | 294 | # ファイルを開いて1行ずつqueryに追加する 295 | with open(file) as f: 296 | file_querys = [s.strip() for s in f.readlines()] 297 | query_list.extend(file_querys) 298 | 299 | return query_list 300 | 301 | 302 | # 検索 303 | def run_search(engine: str, query_list: list, args, thread_result: dict, cmd=False, lock=None, mode='text'): 304 | """search 305 | 306 | Args: 307 | engine (str): 使用する検索エンジン(.engine.ENGINES). 
308 | query_list(list): 検索クエリのリスト. 309 | args (Namespace): argparseで取得した引数(Namespace). 310 | thread_result(dict): 結果を1箇所に集約するためのresult dict. json出力するときのみ使用. 311 | cmd (bool, optional): commandで実行しているか否か. Defaults to False. 312 | lock (threading.Lock): threadingのマルチスレッドで使用するLock.現在は未使用. Defaults to None. 313 | type (str, optional): 検索タイプ. `text` or `image`. 314 | """ 315 | 316 | # start SearchEngine class 317 | se = SearchEngine() 318 | 319 | # Set Engine 320 | se.set(engine) 321 | 322 | # Set SearchEngine options 323 | se = set_se_options(se, args) 324 | 325 | # Set lock 326 | se.set_lock(lock) 327 | 328 | # Set color 329 | if args.color == 'always' or (args.color == 'auto' and sys.stdout.isatty()): 330 | se.set_is_color(True) 331 | 332 | # 検索タイプを設定(テキスト or 画像) 333 | search_type = mode 334 | 335 | # 区切り文字を指定 336 | sep = ': ' 337 | if args.nullchar: 338 | sep = '\0' 339 | 340 | # json出力時の変数を宣言 341 | all_result_json = list() 342 | 343 | # query_listの内容を順番に処理 344 | for query in query_list: 345 | # 検索を実行 346 | result = se.search( 347 | query, search_type=search_type, 348 | maximum=args.num 349 | ) 350 | 351 | # debug 352 | se.ENGINE.MESSAGE.print_text( 353 | json.dumps(result), 354 | separator=sep, 355 | header=se.ENGINE.MESSAGE.HEADER + ': ' + 356 | Color.GRAY + '[DEBUG]: [Result]' + Color.END, 357 | mode="debug", 358 | ) 359 | 360 | if args.json: 361 | # all_result_jsonへ組み込むためのjson方式へ加工. 362 | append_result = { 363 | 'query': query, 364 | 'result': result 365 | } 366 | all_result_json.append(append_result) 367 | 368 | else: 369 | print_search_result(result, args, se.ENGINE.MESSAGE) 370 | 371 | if args.json: 372 | thread_result[engine] = all_result_json 373 | 374 | 375 | # サジェスト 376 | def run_suggest(engine: str, query_list: list, args: Namespace, thread_result: dict, cmd=False, lock=None, mode=''): 377 | """suggest 378 | 379 | Args: 380 | engine (str): 使用する検索エンジン(.engine.ENGINES). 381 | query_list(list): 検索クエリのリスト. 382 | args (Namespace): argparseで取得した引数(Namespace). 383 | thread_result(dict): 結果を1箇所に集約するためのresult dict. json出力するときのみ使用. 384 | cmd (bool, optional): commandで実行しているか否か. Defaults to False. 385 | lock (threading.Lock): threadingのマルチスレッドで使用するLock.現在は未使用. Defaults to None. 386 | mode (str, optional): マルチスレッドでsearchある程度共用で使えるようにするための引数. 利用していない. Defaults to ''. 
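    Note:
        SearchEngine.suggest は文字(空文字含む)をキー、サジェスト文字列のlistを値とする
        dictを返す想定のため、values()を順に出力している. イメージ(返り値の中身は仮):

            result = se.suggest('linux')
            # 例: {'': ['linux kernel', 'linux mint']} のような形になる想定
            for words in result.values():
                for w in words:
                    print(w)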
387 | """ 388 | 389 | # start search engine class 390 | se = SearchEngine() 391 | 392 | # Set Engine 393 | se.set(engine) 394 | 395 | # Set Message() 396 | msg = Message() 397 | msg.set_engine(se.ENGINE.NAME, se.ENGINE.COLOR) 398 | if 'debug' in args: 399 | msg.set_is_debug(args.debug) 400 | msg.set_is_command(True) 401 | 402 | # set msg to se 403 | se.ENGINE.set_messages(msg) 404 | 405 | # Set SearchEngine options 406 | se = set_se_options(se, args) 407 | 408 | # Set lock 409 | se.set_lock(lock) 410 | 411 | # Header 412 | header = '[${ENGINE_NAME}Suggest]' 413 | if args.color == 'always' or (args.color == 'auto' and sys.stdout.isatty()): 414 | sc = Color(se.ENGINE.COLOR) 415 | header = sc.out(header) 416 | se.ENGINE.MESSAGE.set_header(header) 417 | 418 | # json出力時の変数を宣言 419 | all_result_json = list() 420 | 421 | # Suggestを取得 422 | for query in query_list: 423 | result = se.suggest( 424 | query, 425 | jap=args.jap, 426 | alph=args.alph, 427 | num=args.num, 428 | ) 429 | 430 | for words in result.values(): 431 | if args.json: 432 | append_result = { 433 | 'query': query, 434 | 'result': words 435 | } 436 | all_result_json.append(append_result) 437 | 438 | else: 439 | for w in words: 440 | se.ENGINE.MESSAGE.print_line(w, separator=": ") 441 | 442 | if args.json: 443 | thread_result[engine] = all_result_json 444 | -------------------------------------------------------------------------------- /pydork/test_engine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | 9 | """test_engine_google 10 | * SearchEngine Classのテストコード. 11 | * 各検索エンジンの動作テストを行う 12 | """ 13 | 14 | # TODO: splash/selenium経由での通信のテストも追加する(dockerでのコンテナ環境が前提になると思われる) 15 | 16 | 17 | import unittest 18 | 19 | from .engine import SearchEngine 20 | 21 | # 変数 22 | SEARCH_TEXT = 'Linux' 23 | 24 | 25 | class SearchEngineTestCase(unittest.TestCase): 26 | def setUp(self): 27 | """setUp 28 | 29 | テストメソッド実行前処理. 
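        Note:
            このテストは実際に各検索エンジンへリクエストを投げるため、結果はネットワーク環境に依存する.
            実行イメージ(モジュールパスはリポジトリ構成からの想定):

                python3 -m unittest pydork.test_engine -v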
30 | """ 31 | # SearchEngine 32 | self.search_engine = SearchEngine() 33 | 34 | print("setUp!!") 35 | 36 | def tearDown(self): 37 | """tearDown 38 | 39 | テストメソッド実行後処理 40 | """ 41 | 42 | print("tearDown!!") 43 | 44 | def common_settings(self): 45 | # command modeを有効化 46 | self.search_engine.set_is_command(True) 47 | 48 | # debug modeを有効化 49 | self.search_engine.set_is_debug(True) 50 | 51 | # user agentを定義 52 | self.search_engine.set_user_agent() 53 | 54 | # ========== 55 | # Baidu 56 | # ========== 57 | def test_baidu_text_search(self): 58 | print('Test Baidu text search.') 59 | 60 | # 検索エンジンを指定(ここではBaiduを使用) 61 | self.search_engine.set('baidu') 62 | 63 | # 共通系の中間前処理を実行 64 | self.common_settings() 65 | 66 | # 検索を実行 67 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 68 | 69 | print("{} count.".format(len(data))) 70 | self.assertNotEqual(0, len(data)) 71 | 72 | def test_baidu_image_search(self): 73 | print('Test Baidu image search.') 74 | 75 | # 検索エンジンを指定(ここではBaiduを使用) 76 | self.search_engine.set('baidu') 77 | 78 | # 共通系の中間前処理を実行 79 | self.common_settings() 80 | 81 | # 検索を実行 82 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 83 | 84 | print("{} count.".format(len(data))) 85 | self.assertNotEqual(0, len(data)) 86 | 87 | def test_baidu_suggest(self): 88 | print('Test Baidu text suggest.') 89 | 90 | # 検索エンジンを指定(ここではBaiduを使用) 91 | self.search_engine.set('baidu') 92 | 93 | # 共通系の中間前処理を実行 94 | self.common_settings() 95 | 96 | # 検索を実行 97 | data = self.search_engine.suggest( 98 | SEARCH_TEXT) 99 | 100 | print("{} count.".format(len(data))) 101 | self.assertNotEqual(0, len(data)) 102 | 103 | def test_baidu_suggest_with_alph(self): 104 | print('Test Baidu text suggest with alph.') 105 | 106 | # 検索エンジンを指定(ここではBaiduを使用) 107 | self.search_engine.set('baidu') 108 | 109 | # 共通系の中間前処理を実行 110 | self.common_settings() 111 | 112 | # 検索を実行 113 | data = self.search_engine.suggest( 114 | SEARCH_TEXT, alph=True) 115 | 116 | self.assertNotEqual(0, len(data)) 117 | 118 | def test_baidu_suggest_with_num(self): 119 | print('Test Baidu text suggest with num.') 120 | 121 | # 検索エンジンを指定(ここではBaiduを使用) 122 | self.search_engine.set('baidu') 123 | 124 | # 共通系の中間前処理を実行 125 | self.common_settings() 126 | 127 | # 検索を実行 128 | data = self.search_engine.suggest( 129 | SEARCH_TEXT, num=True) 130 | 131 | print("{} count.".format(len(data))) 132 | self.assertNotEqual(0, len(data)) 133 | 134 | # ========== 135 | # Bing 136 | # ========== 137 | def test_bing_text_search(self): 138 | print('Test Bing text search.') 139 | 140 | # 検索エンジンを指定(ここではBingを使用) 141 | self.search_engine.set('bing') 142 | 143 | # 共通系の中間前処理を実行 144 | self.common_settings() 145 | 146 | # 検索を実行 147 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 148 | 149 | print("{} count.".format(len(data))) 150 | self.assertNotEqual(0, len(data)) 151 | 152 | def test_bing_image_search(self): 153 | print('Test Bing image search.') 154 | 155 | # 検索エンジンを指定(ここではBingを使用) 156 | self.search_engine.set('bing') 157 | 158 | # 共通系の中間前処理を実行 159 | self.common_settings() 160 | 161 | # 検索を実行 162 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 163 | 164 | print("{} count.".format(len(data))) 165 | self.assertEqual(30, len(data)) 166 | 167 | def test_bing_suggest(self): 168 | print('Test Bing text suggest.') 169 | 170 | # 検索エンジンを指定(ここではBingを使用) 171 | self.search_engine.set('bing') 172 | 173 | # 共通系の中間前処理を実行 174 | self.common_settings() 175 | 176 | # 検索を実行 177 | data = self.search_engine.suggest( 178 | SEARCH_TEXT) 179 | 180 | 
print("{} count.".format(len(data))) 181 | self.assertNotEqual(0, len(data)) 182 | 183 | def test_bing_suggest_with_jap(self): 184 | print('Test Bing text suggest with jap.') 185 | 186 | # 検索エンジンを指定(ここではBingを使用) 187 | self.search_engine.set('bing') 188 | 189 | # 共通系の中間前処理を実行 190 | self.common_settings() 191 | 192 | # 検索を実行 193 | data = self.search_engine.suggest( 194 | SEARCH_TEXT, jap=True) 195 | 196 | print("{} count.".format(len(data))) 197 | self.assertNotEqual(0, len(data)) 198 | 199 | def test_bing_suggest_with_alph(self): 200 | print('Test Bing text suggest with alph.') 201 | 202 | # 検索エンジンを指定(ここではBingを使用) 203 | self.search_engine.set('bing') 204 | 205 | # 共通系の中間前処理を実行 206 | self.common_settings() 207 | 208 | # 検索を実行 209 | data = self.search_engine.suggest( 210 | SEARCH_TEXT, alph=True) 211 | 212 | self.assertNotEqual(0, len(data)) 213 | 214 | def test_bing_suggest_with_num(self): 215 | print('Test Bing text suggest with num.') 216 | 217 | # 検索エンジンを指定(ここではBingを使用) 218 | self.search_engine.set('bing') 219 | 220 | # 共通系の中間前処理を実行 221 | self.common_settings() 222 | 223 | # 検索を実行 224 | data = self.search_engine.suggest( 225 | SEARCH_TEXT, num=True) 226 | 227 | print("{} count.".format(len(data))) 228 | self.assertNotEqual(0, len(data)) 229 | 230 | # ========== 231 | # DuckDuckGo 232 | # ========== 233 | def test_duckduckgo_text_search(self): 234 | print('Test DuckDuckGo text search.') 235 | 236 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 237 | self.search_engine.set('duckduckgo') 238 | 239 | # 共通系の中間前処理を実行 240 | self.common_settings() 241 | 242 | # 検索を実行 243 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 244 | 245 | print("{} count.".format(len(data))) 246 | self.assertEqual(30, len(data)) 247 | 248 | def test_duckduckgo_image_search(self): 249 | print('Test DuckDuckGo image search.') 250 | 251 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 252 | self.search_engine.set('duckduckgo') 253 | 254 | # 共通系の中間前処理を実行 255 | self.common_settings() 256 | 257 | # 検索を実行 258 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 259 | 260 | print("{} count.".format(len(data))) 261 | self.assertEqual(30, len(data)) 262 | 263 | def test_duckduckgo_suggest(self): 264 | print('Test DuckDuckGo text suggest.') 265 | 266 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 267 | self.search_engine.set('duckduckgo') 268 | 269 | # 共通系の中間前処理を実行 270 | self.common_settings() 271 | 272 | # 検索を実行 273 | data = self.search_engine.suggest( 274 | SEARCH_TEXT) 275 | 276 | print("{} count.".format(len(data))) 277 | self.assertNotEqual(0, len(data)) 278 | 279 | def test_duckduckgo_suggest_with_jap(self): 280 | print('Test DuckDuckGo text suggest with jap.') 281 | 282 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 283 | self.search_engine.set('duckduckgo') 284 | 285 | # 共通系の中間前処理を実行 286 | self.common_settings() 287 | 288 | # 検索を実行 289 | data = self.search_engine.suggest( 290 | SEARCH_TEXT, jap=True) 291 | 292 | print("{} count.".format(len(data))) 293 | self.assertNotEqual(0, len(data)) 294 | 295 | def test_duckduckgo_suggest_with_alph(self): 296 | print('Test DuckDuckGo text suggest with alph.') 297 | 298 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 299 | self.search_engine.set('duckduckgo') 300 | 301 | # 共通系の中間前処理を実行 302 | self.common_settings() 303 | 304 | # 検索を実行 305 | data = self.search_engine.suggest( 306 | SEARCH_TEXT, alph=True) 307 | 308 | self.assertNotEqual(0, len(data)) 309 | 310 | def test_duckduckgo_suggest_with_num(self): 311 | print('Test DuckDuckGo text suggest with num.') 312 | 313 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 314 | 
self.search_engine.set('duckduckgo') 315 | 316 | # 共通系の中間前処理を実行 317 | self.common_settings() 318 | 319 | # 検索を実行 320 | data = self.search_engine.suggest( 321 | SEARCH_TEXT, num=True) 322 | 323 | print("{} count.".format(len(data))) 324 | self.assertNotEqual(0, len(data)) 325 | 326 | # ========== 327 | # Google 328 | # ========== 329 | def test_google_text_search(self): 330 | print('Test Google text search.') 331 | 332 | # 検索エンジンを指定(ここではGoogleを使用) 333 | self.search_engine.set('google') 334 | 335 | # 共通系の中間前処理を実行 336 | self.common_settings() 337 | 338 | # 検索を実行 339 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 340 | 341 | print("{} count.".format(len(data))) 342 | self.assertEqual(30, len(data)) 343 | 344 | def test_google_image_search(self): 345 | print('Test Google image search.') 346 | 347 | # 検索エンジンを指定(ここではGoogleを使用) 348 | self.search_engine.set('google') 349 | 350 | # 共通系の中間前処理を実行 351 | self.common_settings() 352 | 353 | # 検索を実行 354 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 355 | 356 | print("{} count.".format(len(data))) 357 | self.assertEqual(30, len(data)) 358 | 359 | def test_google_suggest(self): 360 | print('Test Google text suggest.') 361 | 362 | # 検索エンジンを指定(ここではGoogleを使用) 363 | self.search_engine.set('google') 364 | 365 | # 共通系の中間前処理を実行 366 | self.common_settings() 367 | 368 | # 検索を実行 369 | data = self.search_engine.suggest( 370 | SEARCH_TEXT) 371 | 372 | print("{} count.".format(len(data))) 373 | self.assertNotEqual(0, len(data)) 374 | 375 | def test_google_suggest_with_jap(self): 376 | print('Test Google text suggest with jap.') 377 | 378 | # 検索エンジンを指定(ここではGoogleを使用) 379 | self.search_engine.set('google') 380 | 381 | # 共通系の中間前処理を実行 382 | self.common_settings() 383 | 384 | # 検索を実行 385 | data = self.search_engine.suggest( 386 | SEARCH_TEXT, jap=True) 387 | 388 | print("{} count.".format(len(data))) 389 | self.assertNotEqual(0, len(data)) 390 | 391 | def test_google_suggest_with_alph(self): 392 | print('Test Google text suggest with alph.') 393 | 394 | # 検索エンジンを指定(ここではGoogleを使用) 395 | self.search_engine.set('google') 396 | 397 | # 共通系の中間前処理を実行 398 | self.common_settings() 399 | 400 | # 検索を実行 401 | data = self.search_engine.suggest( 402 | SEARCH_TEXT, alph=True) 403 | 404 | self.assertNotEqual(0, len(data)) 405 | 406 | def test_google_suggest_with_num(self): 407 | print('Test Google text suggest with num.') 408 | 409 | # 検索エンジンを指定(ここではGoogleを使用) 410 | self.search_engine.set('google') 411 | 412 | # 共通系の中間前処理を実行 413 | self.common_settings() 414 | 415 | # 検索を実行 416 | data = self.search_engine.suggest( 417 | SEARCH_TEXT, num=True) 418 | 419 | print("{} count.".format(len(data))) 420 | self.assertNotEqual(0, len(data)) 421 | 422 | # ========== 423 | # Yahoo 424 | # ========== 425 | def test_yahoo_text_search(self): 426 | print('Test Yahoo text search.') 427 | 428 | # 検索エンジンを指定(ここではYahooを使用) 429 | self.search_engine.set('yahoo') 430 | 431 | # 共通系の中間前処理を実行 432 | self.common_settings() 433 | 434 | # 検索を実行 435 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 436 | 437 | print("{} count.".format(len(data))) 438 | self.assertEqual(30, len(data)) 439 | 440 | def test_yahoo_image_search(self): 441 | print('Test Yahoo image search.') 442 | 443 | # 検索エンジンを指定(ここではYahooを使用) 444 | self.search_engine.set('yahoo') 445 | 446 | # 共通系の中間前処理を実行 447 | self.common_settings() 448 | 449 | # 検索を実行 450 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 451 | 452 | print("{} count.".format(len(data))) 453 | self.assertEqual(30, 
len(data)) 454 | 455 | def test_yahoo_suggest(self): 456 | print('Test Yahoo text suggest.') 457 | 458 | # 検索エンジンを指定(ここではYahooを使用) 459 | self.search_engine.set('yahoo') 460 | 461 | # 共通系の中間前処理を実行 462 | self.common_settings() 463 | 464 | # 検索を実行 465 | data = self.search_engine.suggest( 466 | SEARCH_TEXT) 467 | 468 | print("{} count.".format(len(data))) 469 | self.assertNotEqual(0, len(data)) 470 | 471 | def test_yahoo_suggest_with_jap(self): 472 | print('Test Yahoo text suggest with jap.') 473 | 474 | # 検索エンジンを指定(ここではYahooを使用) 475 | self.search_engine.set('yahoo') 476 | 477 | # 共通系の中間前処理を実行 478 | self.common_settings() 479 | 480 | # 検索を実行 481 | data = self.search_engine.suggest( 482 | SEARCH_TEXT, jap=True) 483 | 484 | print("{} count.".format(len(data))) 485 | self.assertNotEqual(0, len(data)) 486 | 487 | def test_yahoo_suggest_with_alph(self): 488 | print('Test Yahoo text suggest with alph.') 489 | 490 | # 検索エンジンを指定(ここではYahooを使用) 491 | self.search_engine.set('yahoo') 492 | 493 | # 共通系の中間前処理を実行 494 | self.common_settings() 495 | 496 | # 検索を実行 497 | data = self.search_engine.suggest( 498 | SEARCH_TEXT, alph=True) 499 | 500 | self.assertNotEqual(0, len(data)) 501 | 502 | def test_yahoo_suggest_with_num(self): 503 | print('Test Yahoo text suggest with num.') 504 | 505 | # 検索エンジンを指定(ここではYahooを使用) 506 | self.search_engine.set('yahoo') 507 | 508 | # 共通系の中間前処理を実行 509 | self.common_settings() 510 | 511 | # 検索を実行 512 | data = self.search_engine.suggest( 513 | SEARCH_TEXT, num=True) 514 | 515 | print("{} count.".format(len(data))) 516 | self.assertNotEqual(0, len(data)) 517 | -------------------------------------------------------------------------------- /pydork/test_engine_selenium.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | 9 | """test_engine_google 10 | * SearchEngine Classのテストコード. 11 | * 各検索エンジンの動作テストを行う 12 | """ 13 | 14 | # TODO: splash/selenium経由での通信のテストも追加する(dockerでのコンテナ環境が前提になると思われる) 15 | 16 | 17 | import unittest 18 | 19 | from .engine import SearchEngine 20 | 21 | # 変数 22 | SEARCH_TEXT = 'Linux' 23 | 24 | 25 | class SearchEngineTestCaseWithSelenium(unittest.TestCase): 26 | def setUp(self): 27 | """setUp 28 | 29 | テストメソッド実行前処理. 
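Editor's note: every test method in test_engine.py above follows the same five-step shape (set the engine, run common_settings, search or suggest, print the count, assert on the result length). A minimal sketch of the same smoke check driven by unittest's subTest, looping over the ENGINES list exported by pydork.engine; the test class name below is hypothetical, not part of the repository:

import unittest

from pydork.engine import ENGINES, SearchEngine

SEARCH_TEXT = 'Linux'


class SearchEngineSmokeTestCase(unittest.TestCase):
    def test_text_search_all_engines(self):
        # One subTest per supported engine, so one failing engine
        # does not hide the results of the others.
        for engine in ENGINES:
            with self.subTest(engine=engine):
                search_engine = SearchEngine()
                search_engine.set(engine)
                search_engine.set_is_command(True)
                search_engine.set_is_debug(True)
                search_engine.set_user_agent()
                data = search_engine.search(SEARCH_TEXT, maximum=30)
                self.assertNotEqual(0, len(data))


if __name__ == '__main__':
    unittest.main()
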
30 | """ 31 | # SearchEngine 32 | self.search_engine = SearchEngine() 33 | 34 | print("setUp!!") 35 | 36 | def tearDown(self): 37 | """tearDown 38 | 39 | テストメソッド実行後処理 40 | """ 41 | 42 | print("tearDown!!") 43 | 44 | def common_settings(self): 45 | # command modeを有効化 46 | self.search_engine.set_is_command(True) 47 | 48 | # debug modeを有効化 49 | self.search_engine.set_is_debug(True) 50 | 51 | # seleniumを有効化 52 | self.search_engine.set_selenium(None, 'chrome') 53 | 54 | # user agentを定義 55 | self.search_engine.set_user_agent() 56 | 57 | # ========== 58 | # Baidu 59 | # ========== 60 | def test_baidu_text_search(self): 61 | print('Test Baidu text search.') 62 | 63 | # 検索エンジンを指定(ここではBaiduを使用) 64 | self.search_engine.set('baidu') 65 | 66 | # 共通系の中間前処理を実行 67 | self.common_settings() 68 | 69 | # 検索を実行 70 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 71 | 72 | print("{} count.".format(len(data))) 73 | self.assertNotEqual(0, len(data)) 74 | 75 | def test_baidu_image_search(self): 76 | print('Test Baidu image search.') 77 | 78 | # 検索エンジンを指定(ここではBaiduを使用) 79 | self.search_engine.set('baidu') 80 | 81 | # 共通系の中間前処理を実行 82 | self.common_settings() 83 | 84 | # 検索を実行 85 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 86 | 87 | print("{} count.".format(len(data))) 88 | self.assertNotEqual(0, len(data)) 89 | 90 | def test_baidu_suggest(self): 91 | print('Test Baidu text suggest.') 92 | 93 | # 検索エンジンを指定(ここではBaiduを使用) 94 | self.search_engine.set('baidu') 95 | 96 | # 共通系の中間前処理を実行 97 | self.common_settings() 98 | 99 | # 検索を実行 100 | data = self.search_engine.suggest( 101 | SEARCH_TEXT) 102 | 103 | print("{} count.".format(len(data))) 104 | self.assertNotEqual(0, len(data)) 105 | 106 | def test_baidu_suggest_with_alph(self): 107 | print('Test Baidu text suggest with alph.') 108 | 109 | # 検索エンジンを指定(ここではBaiduを使用) 110 | self.search_engine.set('baidu') 111 | 112 | # 共通系の中間前処理を実行 113 | self.common_settings() 114 | 115 | # 検索を実行 116 | data = self.search_engine.suggest( 117 | SEARCH_TEXT, alph=True) 118 | 119 | self.assertNotEqual(0, len(data)) 120 | 121 | def test_baidu_suggest_with_num(self): 122 | print('Test Baidu text suggest with num.') 123 | 124 | # 検索エンジンを指定(ここではBaiduを使用) 125 | self.search_engine.set('baidu') 126 | 127 | # 共通系の中間前処理を実行 128 | self.common_settings() 129 | 130 | # 検索を実行 131 | data = self.search_engine.suggest( 132 | SEARCH_TEXT, num=True) 133 | 134 | print("{} count.".format(len(data))) 135 | self.assertNotEqual(0, len(data)) 136 | 137 | # ========== 138 | # Bing 139 | # ========== 140 | def test_bing_text_search(self): 141 | print('Test Bing text search.') 142 | 143 | # 検索エンジンを指定(ここではBingを使用) 144 | self.search_engine.set('bing') 145 | 146 | # 共通系の中間前処理を実行 147 | self.common_settings() 148 | 149 | # 検索を実行 150 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 151 | 152 | print("{} count.".format(len(data))) 153 | self.assertNotEqual(0, len(data)) 154 | 155 | def test_bing_image_search(self): 156 | print('Test Bing image search.') 157 | 158 | # 検索エンジンを指定(ここではBingを使用) 159 | self.search_engine.set('bing') 160 | 161 | # 共通系の中間前処理を実行 162 | self.common_settings() 163 | 164 | # 検索を実行 165 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 166 | 167 | print("{} count.".format(len(data))) 168 | self.assertNotEqual(0, len(data)) 169 | 170 | def test_bing_suggest(self): 171 | print('Test Bing text suggest.') 172 | 173 | # 検索エンジンを指定(ここではBingを使用) 174 | self.search_engine.set('bing') 175 | 176 | # 共通系の中間前処理を実行 177 | self.common_settings() 178 | 179 
| # 検索を実行 180 | data = self.search_engine.suggest( 181 | SEARCH_TEXT) 182 | 183 | print("{} count.".format(len(data))) 184 | self.assertNotEqual(0, len(data)) 185 | 186 | def test_bing_suggest_with_jap(self): 187 | print('Test Bing text suggest with jap.') 188 | 189 | # 検索エンジンを指定(ここではBingを使用) 190 | self.search_engine.set('bing') 191 | 192 | # 共通系の中間前処理を実行 193 | self.common_settings() 194 | 195 | # 検索を実行 196 | data = self.search_engine.suggest( 197 | SEARCH_TEXT, jap=True) 198 | 199 | print("{} count.".format(len(data))) 200 | self.assertNotEqual(0, len(data)) 201 | 202 | def test_bing_suggest_with_alph(self): 203 | print('Test Bing text suggest with alph.') 204 | 205 | # 検索エンジンを指定(ここではBingを使用) 206 | self.search_engine.set('bing') 207 | 208 | # 共通系の中間前処理を実行 209 | self.common_settings() 210 | 211 | # 検索を実行 212 | data = self.search_engine.suggest( 213 | SEARCH_TEXT, alph=True) 214 | 215 | self.assertNotEqual(0, len(data)) 216 | 217 | def test_bing_suggest_with_num(self): 218 | print('Test Bing text suggest with num.') 219 | 220 | # 検索エンジンを指定(ここではBingを使用) 221 | self.search_engine.set('bing') 222 | 223 | # 共通系の中間前処理を実行 224 | self.common_settings() 225 | 226 | # 検索を実行 227 | data = self.search_engine.suggest( 228 | SEARCH_TEXT, num=True) 229 | 230 | print("{} count.".format(len(data))) 231 | self.assertNotEqual(0, len(data)) 232 | 233 | # ========== 234 | # DuckDuckGo 235 | # ========== 236 | def test_duckduckgo_text_search(self): 237 | print('Test DuckDuckGo text search.') 238 | 239 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 240 | self.search_engine.set('duckduckgo') 241 | 242 | # 共通系の中間前処理を実行 243 | self.common_settings() 244 | 245 | # 検索を実行 246 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 247 | 248 | print("{} count.".format(len(data))) 249 | self.assertEqual(30, len(data)) 250 | 251 | def test_duckduckgo_image_search(self): 252 | print('Test DuckDuckGo image search.') 253 | 254 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 255 | self.search_engine.set('duckduckgo') 256 | 257 | # 共通系の中間前処理を実行 258 | self.common_settings() 259 | 260 | # 検索を実行 261 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 262 | 263 | print("{} count.".format(len(data))) 264 | self.assertEqual(30, len(data)) 265 | 266 | def test_duckduckgo_suggest(self): 267 | print('Test DuckDuckGo text suggest.') 268 | 269 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 270 | self.search_engine.set('duckduckgo') 271 | 272 | # 共通系の中間前処理を実行 273 | self.common_settings() 274 | 275 | # 検索を実行 276 | data = self.search_engine.suggest( 277 | SEARCH_TEXT) 278 | 279 | print("{} count.".format(len(data))) 280 | self.assertNotEqual(0, len(data)) 281 | 282 | def test_duckduckgo_suggest_with_jap(self): 283 | print('Test DuckDuckGo text suggest with jap.') 284 | 285 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 286 | self.search_engine.set('duckduckgo') 287 | 288 | # 共通系の中間前処理を実行 289 | self.common_settings() 290 | 291 | # 検索を実行 292 | data = self.search_engine.suggest( 293 | SEARCH_TEXT, jap=True) 294 | 295 | print("{} count.".format(len(data))) 296 | self.assertNotEqual(0, len(data)) 297 | 298 | def test_duckduckgo_suggest_with_alph(self): 299 | print('Test DuckDuckGo text suggest with alph.') 300 | 301 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 302 | self.search_engine.set('duckduckgo') 303 | 304 | # 共通系の中間前処理を実行 305 | self.common_settings() 306 | 307 | # 検索を実行 308 | data = self.search_engine.suggest( 309 | SEARCH_TEXT, alph=True) 310 | 311 | self.assertNotEqual(0, len(data)) 312 | 313 | def test_duckduckgo_suggest_with_num(self): 314 | print('Test DuckDuckGo text suggest 
with num.') 315 | 316 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 317 | self.search_engine.set('duckduckgo') 318 | 319 | # 共通系の中間前処理を実行 320 | self.common_settings() 321 | 322 | # 検索を実行 323 | data = self.search_engine.suggest( 324 | SEARCH_TEXT, num=True) 325 | 326 | print("{} count.".format(len(data))) 327 | self.assertNotEqual(0, len(data)) 328 | 329 | # ========== 330 | # Google 331 | # ========== 332 | def test_google_text_search(self): 333 | print('Test Google text search.') 334 | 335 | # 検索エンジンを指定(ここではGoogleを使用) 336 | self.search_engine.set('google') 337 | 338 | # 共通系の中間前処理を実行 339 | self.common_settings() 340 | 341 | # 検索を実行 342 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 343 | 344 | print("{} count.".format(len(data))) 345 | self.assertEqual(30, len(data)) 346 | 347 | def test_google_image_search(self): 348 | print('Test Google image search.') 349 | 350 | # 検索エンジンを指定(ここではGoogleを使用) 351 | self.search_engine.set('google') 352 | 353 | # 共通系の中間前処理を実行 354 | self.common_settings() 355 | 356 | # 検索を実行 357 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 358 | 359 | print("{} count.".format(len(data))) 360 | self.assertEqual(30, len(data)) 361 | 362 | def test_google_suggest(self): 363 | print('Test Google text suggest.') 364 | 365 | # 検索エンジンを指定(ここではGoogleを使用) 366 | self.search_engine.set('google') 367 | 368 | # 共通系の中間前処理を実行 369 | self.common_settings() 370 | 371 | # 検索を実行 372 | data = self.search_engine.suggest( 373 | SEARCH_TEXT) 374 | 375 | print("{} count.".format(len(data))) 376 | self.assertNotEqual(0, len(data)) 377 | 378 | def test_google_suggest_with_jap(self): 379 | print('Test Google text suggest with jap.') 380 | 381 | # 検索エンジンを指定(ここではGoogleを使用) 382 | self.search_engine.set('google') 383 | 384 | # 共通系の中間前処理を実行 385 | self.common_settings() 386 | 387 | # 検索を実行 388 | data = self.search_engine.suggest( 389 | SEARCH_TEXT, jap=True) 390 | 391 | print("{} count.".format(len(data))) 392 | self.assertNotEqual(0, len(data)) 393 | 394 | def test_google_suggest_with_alph(self): 395 | print('Test Google text suggest with alph.') 396 | 397 | # 検索エンジンを指定(ここではGoogleを使用) 398 | self.search_engine.set('google') 399 | 400 | # 共通系の中間前処理を実行 401 | self.common_settings() 402 | 403 | # 検索を実行 404 | data = self.search_engine.suggest( 405 | SEARCH_TEXT, alph=True) 406 | 407 | self.assertNotEqual(0, len(data)) 408 | 409 | def test_google_suggest_with_num(self): 410 | print('Test Google text suggest with num.') 411 | 412 | # 検索エンジンを指定(ここではGoogleを使用) 413 | self.search_engine.set('google') 414 | 415 | # 共通系の中間前処理を実行 416 | self.common_settings() 417 | 418 | # 検索を実行 419 | data = self.search_engine.suggest( 420 | SEARCH_TEXT, num=True) 421 | 422 | print("{} count.".format(len(data))) 423 | self.assertNotEqual(0, len(data)) 424 | 425 | # ========== 426 | # Yahoo 427 | # ========== 428 | def test_yahoo_text_search(self): 429 | print('Test Yahoo text search.') 430 | 431 | # 検索エンジンを指定(ここではYahooを使用) 432 | self.search_engine.set('yahoo') 433 | 434 | # 共通系の中間前処理を実行 435 | self.common_settings() 436 | 437 | # 検索を実行 438 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 439 | 440 | print("{} count.".format(len(data))) 441 | self.assertEqual(30, len(data)) 442 | 443 | def test_yahoo_image_search(self): 444 | print('Test Yahoo image search.') 445 | 446 | # 検索エンジンを指定(ここではYahooを使用) 447 | self.search_engine.set('yahoo') 448 | 449 | # 共通系の中間前処理を実行 450 | self.common_settings() 451 | 452 | # 検索を実行 453 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 454 | 455 | 
print("{} count.".format(len(data))) 456 | self.assertEqual(30, len(data)) 457 | 458 | def test_yahoo_suggest(self): 459 | print('Test Yahoo text suggest.') 460 | 461 | # 検索エンジンを指定(ここではYahooを使用) 462 | self.search_engine.set('yahoo') 463 | 464 | # 共通系の中間前処理を実行 465 | self.common_settings() 466 | 467 | # 検索を実行 468 | data = self.search_engine.suggest( 469 | SEARCH_TEXT) 470 | 471 | print("{} count.".format(len(data))) 472 | self.assertNotEqual(0, len(data)) 473 | 474 | def test_yahoo_suggest_with_jap(self): 475 | print('Test Yahoo text suggest with jap.') 476 | 477 | # 検索エンジンを指定(ここではYahooを使用) 478 | self.search_engine.set('yahoo') 479 | 480 | # 共通系の中間前処理を実行 481 | self.common_settings() 482 | 483 | # 検索を実行 484 | data = self.search_engine.suggest( 485 | SEARCH_TEXT, jap=True) 486 | 487 | print("{} count.".format(len(data))) 488 | self.assertNotEqual(0, len(data)) 489 | 490 | def test_yahoo_suggest_with_alph(self): 491 | print('Test Yahoo text suggest with alph.') 492 | 493 | # 検索エンジンを指定(ここではYahooを使用) 494 | self.search_engine.set('yahoo') 495 | 496 | # 共通系の中間前処理を実行 497 | self.common_settings() 498 | 499 | # 検索を実行 500 | data = self.search_engine.suggest( 501 | SEARCH_TEXT, alph=True) 502 | 503 | self.assertNotEqual(0, len(data)) 504 | 505 | def test_yahoo_suggest_with_num(self): 506 | print('Test Yahoo text suggest with num.') 507 | 508 | # 検索エンジンを指定(ここではYahooを使用) 509 | self.search_engine.set('yahoo') 510 | 511 | # 共通系の中間前処理を実行 512 | self.common_settings() 513 | 514 | # 検索を実行 515 | data = self.search_engine.suggest( 516 | SEARCH_TEXT, num=True) 517 | 518 | print("{} count.".format(len(data))) 519 | self.assertNotEqual(0, len(data)) 520 | -------------------------------------------------------------------------------- /pydork/engine_google.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | 9 | """engine_google 10 | * Google用の検索用Classを持つモジュール. 11 | """ 12 | 13 | # import sys 14 | 15 | import json 16 | import os 17 | 18 | from time import sleep 19 | from json.decoder import JSONDecodeError 20 | from urllib import parse 21 | from lxml import etree 22 | # from bs4 import BeautifulSoup 23 | 24 | from .common import Color 25 | from .recaptcha import TwoCaptcha 26 | from .engine_common import CommonEngine 27 | 28 | 29 | # Google画像検索で使用するパラメータID 30 | RPC_ID = "HoAMBc" 31 | 32 | 33 | class Google(CommonEngine): 34 | """Google 35 | 36 | Google用の検索エンジン用Class. 37 | """ 38 | 39 | def __init__(self): 40 | # CommonEngineの処理を呼出し 41 | super().__init__() 42 | 43 | self.NAME = 'Google' 44 | self.COLOR = Color.PURPLE 45 | self.COLOR_NAME = self.COLOR + self.NAME + Color.END 46 | 47 | # リクエスト先のURLを指定 48 | self.ENGINE_TOP_URL = 'https://www.google.com/' 49 | self.SEARCH_URL = 'https://www.google.com/search' 50 | self.IMAGE_URL = 'https://www.google.com/_/VisualFrontendUi/data/batchexecute' 51 | self.SUGGEST_URL = 'http://www.google.com/complete/search' 52 | 53 | # 次の検索ページのURL(`self.get_nextpage_url`の処理で取得する) 54 | self.SEARCH_NEXT_URL = None 55 | 56 | # ReCaptcha画面かどうかの識別用 57 | self.SOUP_RECAPTCHA_TAG = '#captcha-form > #recaptcha' 58 | 59 | def gen_search_url(self, keyword: str, type: str): 60 | """gen_search_url 61 | 62 | 検索用のurlを生成する. 63 | 64 | Args: 65 | keyword (str): 検索クエリ. 66 | type (str): 検索タイプ. 
/pydork/engine_google.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # Copyright (c) 2023 Blacknon. All rights reserved.
4 | # Use of this source code is governed by an MIT license
5 | # that can be found in the LICENSE file.
6 | # =======================================================
7 | 
8 | 
9 | """engine_google
10 | * Module that holds the search class for Google.
11 | """
12 | 
13 | # import sys
14 | 
15 | import json
16 | import os
17 | 
18 | from time import sleep
19 | from json.decoder import JSONDecodeError
20 | from urllib import parse
21 | from lxml import etree
22 | # from bs4 import BeautifulSoup
23 | 
24 | from .common import Color
25 | from .recaptcha import TwoCaptcha
26 | from .engine_common import CommonEngine
27 | 
28 | 
29 | # Parameter ID used by Google image search
30 | RPC_ID = "HoAMBc"
31 | 
32 | 
33 | class Google(CommonEngine):
34 |     """Google
35 | 
36 |     Search engine class for Google.
37 |     """
38 | 
39 |     def __init__(self):
40 |         # Call CommonEngine's __init__
41 |         super().__init__()
42 | 
43 |         self.NAME = 'Google'
44 |         self.COLOR = Color.PURPLE
45 |         self.COLOR_NAME = self.COLOR + self.NAME + Color.END
46 | 
47 |         # URLs used for requests
48 |         self.ENGINE_TOP_URL = 'https://www.google.com/'
49 |         self.SEARCH_URL = 'https://www.google.com/search'
50 |         self.IMAGE_URL = 'https://www.google.com/_/VisualFrontendUi/data/batchexecute'
51 |         self.SUGGEST_URL = 'http://www.google.com/complete/search'
52 | 
53 |         # URL of the next search page (obtained in `self.get_nextpage_url`)
54 |         self.SEARCH_NEXT_URL = None
55 | 
56 |         # Used to detect the ReCaptcha page
57 |         self.SOUP_RECAPTCHA_TAG = '#captcha-form > #recaptcha'
58 | 
59 |     def gen_search_url(self, keyword: str, type: str):
60 |         """gen_search_url
61 | 
62 |         Generate the search URLs.
63 | 
64 |         Args:
65 |             keyword (str): search query.
66 |             type (str): search type.
67 | 
68 |         Returns:
69 |             generator: yields a (method, url, data) tuple per result page.
70 |         """
71 | 
72 |         search_url = ''
73 | 
74 |         if type == 'text':
75 |             # Set the search URL
76 |             search_url = self.SEARCH_URL
77 | 
78 |             # Set the search parameters
79 |             url_param = {
80 |                 'q': keyword,  # search keyword
81 |                 'oq': keyword,  # search keyword
82 |                 'num': 100,  # number of results per page
83 |                 'filter': 0,  # filtering of similar pages (0: disabled, 1: enabled)
84 |                 'nfpr': 1  # disable "did you mean" auto-correction
85 |             }
86 | 
87 |             # If lang/locale are set
88 |             if self.LANG != '' and self.LOCALE != '':
89 |                 url_param['hl'] = self.LANG
90 |                 url_param['gl'] = self.LOCALE
91 | 
92 |             # If a date range is set
93 |             try:
94 |                 start = self.RANGE_START
95 |                 end = self.RANGE_END
96 | 
97 |                 cd_min = start.strftime("%m/%d/%Y")
98 |                 cd_max = end.strftime("%m/%d/%Y")
99 | 
100 |                 # Add the date range to the GET parameters
101 |                 url_param['tbs'] = "cdr:1,cd_min:{0},cd_max:{1}".format(
102 |                     cd_min, cd_max)
103 | 
104 |             except AttributeError:
105 |                 None
106 | 
107 |             page = 0
108 |             while True:
109 |                 # Set the result offset for this page in the parameters
110 |                 url_param['start'] = str(page * 100)
111 |                 params = parse.urlencode(url_param)
112 | 
113 |                 target_url = search_url + '?' + params
114 | 
115 |                 yield 'GET', target_url, None
116 |                 page += 1
117 | 
118 |         elif type == 'image':
119 |             # Set the search URL
120 |             search_url = self.IMAGE_URL
121 | 
122 |             # Set the Referer
123 |             if not self.USE_SELENIUM:
124 |                 self.session.headers.update(
125 |                     {"Referer": "https://www.google.com/"}
126 |                 )
127 | 
128 |             # Set the search parameters
129 |             url_param = {
130 |                 'rpcids': 'HoAMBc',
131 |                 'hl': 'id',
132 |                 'authuser': '0',
133 |                 'soc-app': '162',
134 |                 'soc-platform': '1',
135 |                 'soc-device': '1',
136 |                 'rt': 'c'
137 |             }
138 | 
139 |             # Create the cursor-position parameters for the images
140 |             self.image_next_cursor = None
141 |             self.image_cursor = []
142 | 
143 |             page = 0
144 |             while True:
145 |                 # Build the POST data
146 |                 data = {
147 |                     "f.req": build_rpc_request(keyword, (self.image_cursor, self.image_next_cursor), page),
148 |                     "at": "ABrGKkQnVYg89U_cdKuhNZ5hM4vx:1616119655028",
149 |                     # "": "",
150 |                 }
151 | 
152 |                 params = parse.urlencode(url_param)
153 |                 target_url = search_url + '?' + params
154 | 
155 |                 yield 'POST', target_url, data
156 | 
157 |     def gen_suggest_url(self, keyword: str):
158 |         """gen_suggest_url
159 | 
160 |         Generate the URL for fetching suggestions.
161 | 
162 |         Args:
163 |             keyword (str): search query.
164 | 
165 |         Returns:
166 |             str: suggest URL
167 |         """
168 | 
169 |         url_param = {
170 |             'q': keyword,  # search keyword
171 |             'output': 'toolbar',
172 |             'ie': 'utf-8',
173 |             'oe': 'utf-8',
174 |         }
175 | 
176 |         params = parse.urlencode(url_param)
177 |         url = self.SUGGEST_URL + '?' + params
178 | 
179 |         return url
180 | 
181 |     def get_links(self, url: str, html: str, type: str):
182 |         """get_links
183 | 
184 |         Parse the given html and return the search results as a list.
185 | 
186 |         Args:
187 |             url (str): URL of the search results to parse.
188 |             html (str): html of the search results to parse.
189 |             type (str): search type ([text, image]).
190 | 
191 |         Returns:
192 |             list: search results (variable name: links). (`[{'title': 'title...', 'url': 'https://hogehoge....'}, {...}]`)
193 |         """
194 | 
195 |         # For text search
196 |         if type == 'text':
197 |             # Selector definitions (requests / selenium)
198 |             self.SOUP_SELECT_URL = '#main > div > div > .kCrYT > a'
199 |             self.SOUP_SELECT_TITLE = '#main > div > div > .kCrYT > a > h3 > div'
200 |             self.SOUP_SELECT_TEXT = '#main > div > div > .kCrYT > div > div > div > div > div'
201 |             self.SOUP_SELECT_NEXT_URL = ''
202 | 
203 |             # When communicating via Selenium
204 |             if self.USE_SELENIUM:
205 |                 self.SOUP_SELECT_URL = '.yuRUbf > div > span > a'
206 |                 self.SOUP_SELECT_TITLE = '.yuRUbf > div > span > a > h3'
207 |                 self.SOUP_SELECT_TEXT = '.yXK7lf'
208 |                 self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'
209 | 
210 |             # When communicating via Splash
211 |             elif self.USE_SPLASH:
212 |                 self.SOUP_SELECT_URL = '.yuRUbf > div > span > a.href'
213 |                 self.SOUP_SELECT_TITLE = '.yuRUbf > div > span > a > h3'
214 |                 self.SOUP_SELECT_TEXT = '.yXK7lf'
215 |                 self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'
216 | 
217 |             # TODO: Rewrite SEARCH_NEXT_URL
218 |             # self.get_nextpage_url(html)
219 | 
220 |             # Delegate to CommonEngine's handling
221 |             links = super().get_links(url, html, type)
222 | 
223 |         # For image search
224 |         elif type == 'image':
225 |             links = self.get_image_links(html)
226 | 
227 |         return links
228 | 
229 |     def get_image_links(self, html: str):
230 |         """get_image_links
231 | 
232 |         Parse the image search page and return the results.
233 |         Uses Selenium to scroll to the end of the page automatically and fetch the continuation.
234 |         Extraction takes quite a while because clicks and similar interactions are involved.
235 | 
236 |         Args:
237 |             html (str): html of the search results to parse.
238 | 
239 |         Returns:
240 |             list: search results (`[{'title': 'title...', 'link': 'https://hogehoge....'}, {...}]`)
241 | 
242 |         Reference:
243 |             - https://github.com/Wikidepia/py-googleimages/blob/b781b79e9bf40d29cf6fcbdcf625303abf3718bd/googleimages/client.py
244 |         """
245 | 
246 |         links = list()
247 | 
248 |         # Loop over the response line by line
249 |         for line in html.split("\n"):
250 |             if RPC_ID not in line:
251 |                 continue
252 | 
253 |             # Make it json readable
254 |             line_cl = line.replace("\\n", "")  # Remove \n
255 | 
256 |             lineson = json.loads(line_cl)
257 | 
258 |             data = pjson_loads(lineson[0][2])
259 | 
260 |             # Update the image cursor position
261 |             self.image_next_cursor = data[-2]
262 |             self.image_img_cursor = data[31][0][12][11][5]
263 | 
264 |             for img in data[31][0][12][2]:
265 |                 # Check the img value
266 |                 if img[1] is None:
267 |                     continue
268 | 
269 |                 link = img[1][3][0]  # URL of the image file
270 |                 title = img[1][9]['2003'][3]  # title of the page containing the image
271 |                 pagelink = img[1][9]['2003'][2]  # URL of the page containing the image
272 |                 links.append(
273 |                     {
274 |                         "link": link,
275 |                         "title": title,
276 |                         "pagelink": pagelink,
277 |                     }
278 |                 )
279 | 
280 |         return links
281 | 
282 |     def get_suggest_list(self, suggests: list, char: str, html: str):
283 |         """get_suggest_list
284 | 
285 |         Extract the suggestions from the html.
286 | 
287 |         Args:
288 |             suggests (list): the base container that suggestions are added to.
289 |             char (str): the suggest character.
290 |             html (str): html to parse.
291 | 
292 |         Returns:
293 |             dict: suggestions keyed by trailing character
294 |         """
295 | 
296 |         sug_root = etree.XML(html)
297 |         sug_data = sug_root.xpath("//suggestion")
298 |         data = [s.get("data") for s in sug_data]
299 | 
300 |         suggests[char if char == '' else char[-1]] = data  # type: ignore
301 | 
302 |         return suggests
303 | 
304 |     def processings_elist(self, elinks, etitles, etexts: list):
305 |         """processings_elist
306 | 
307 |         Post-process elinks and etitles right after extraction inside self.get_links.
308 | 
309 |         Args:
310 |             elinks (list): list of elinks (result links)
311 |             etitles (list): list of etitles (result titles)
312 |             etexts (list): list of etexts (result snippets)
313 | 
314 |         Returns:
315 |             elinks (list): list of elinks (result links)
316 |             etitles (list): list of etitles (result titles)
317 |             etexts (list): list of etexts (result snippets)
318 |         """
319 | 
320 |         # When not using Firefox via selenium, and not using splash
321 |         new_elinks = []
322 |         for elink in elinks:
323 |             parsed = parse.urlparse(elink)
324 |             parsed_query = parse.parse_qs(parsed.query)
325 | 
326 |             if 'url' in parsed_query and elink[0] == '/':
327 |                 parsed_q = parsed_query['url']
328 |                 if len(parsed_q) > 0:
329 |                     new_elink = parsed_q[0]
330 |                     new_elinks.append(new_elink)
331 |             else:
332 |                 new_elinks.append(elink)
333 |         elinks = list(dict.fromkeys(new_elinks))
334 | 
335 |         return elinks, etitles, etexts
336 | 
337 |     def bypass_recaptcha_selenium(self, url: str, html: str):
338 |         """bypass_recaptcha_selenium
339 | 
340 |         Bypass ReCaptcha with Selenium.
341 |         For automatic bypass via 2Captcha, cookies and a proxy are required.
342 | 
343 |         Args:
344 |             url (str): URL of the request that got redirected to the ReCaptcha page
345 |             html (str): html of the ReCaptcha page
346 | 
347 |         Returns:
348 |             str: html of the target URL after bypassing ReCaptcha
349 |         """
350 | 
351 |         # Pre-define result
352 |         result = None
353 | 
354 |         # Read the environment variable
355 |         TC_API_KEY = os.getenv('API_KEY_2CAPTCHA')
356 | 
357 |         # With Selenium (headless disabled), allow the bypass to be done manually
358 |         if self.IS_DISABLE_HEADLESS:
359 |             while True:
360 |                 # Get the current URL from Selenium
361 |                 current_url = self.driver.current_url
362 |                 current_url_parse = parse.urlparse(current_url)
363 | 
364 |                 # Check whether the path of current_url is `/sorry/index`
365 |                 current_url_path = current_url_parse.path
366 |                 if current_url_path != '/sorry/index':
367 |                     break
368 | 
369 |                 # Wait
370 |                 sleep(1)
371 | 
372 |             sleep(5)
373 | 
374 |             # Get the html of the current page (the page navigated to after ReCaptcha)
375 |             result = self.driver.page_source
376 | 
377 |         # When self.IS_DISABLE_HEADLESS is False and `API_KEY_2CAPTCHA` is defined
378 |         elif TC_API_KEY is not None:
379 |             # Create the solver
380 |             solver = TwoCaptcha(TC_API_KEY)
381 | 
382 |             # flag set
383 |             solver.set_debug(self.IS_DEBUG)
384 |             solver.set_command(self.IS_COMMAND)
385 |             solver.set_user_agent(self.USER_AGENT)
386 |             solver.set_messages(self.MESSAGE)
387 | 
388 |             # Get the response from the solver
389 |             code = solver.google_recaptcha(
390 |                 html=html,
391 |                 url=url,
392 |                 cookies=self.driver.get_cookies(),
393 |                 proxy=self.PROXY,
394 | 
395 |             )
396 | 
397 |             # If solving the ReCaptcha failed
398 |             if code is None:
399 |                 return result
400 | 
401 |             # Put the solution code into the designated textarea
402 |             self.driver.execute_script("""
403 |                 document.getElementById(
404 |                     "g-recaptcha-response").innerHTML = arguments[0]
405 |                 """, code)
406 | 
407 |             # Click the button
408 |             self.driver.execute_script(
409 |                 'var element=document.getElementById("g-recaptcha-response"); element.style.display="none";')
410 | 
411 |             self.driver.execute_script('submitCallback()')
412 | 
413 |             sleep(10)
414 | 
415 |             # Get the result
416 |             result = self.driver.page_source
417 | 
418 |         return result
419 | 
420 | 
421 | def build_rpc_request(keyword: str, cursor: list, page: int):
422 |     """build_rpc_request
423 | 
424 |     Build the rpc payload used by image search.
425 | 
426 |     Original:
427 |         https://github.com/Wikidepia/py-googleimages/blob/b781b79e9bf40d29cf6fcbdcf625303abf3718bd/googleimages/utils.py
428 | 
429 |     Args:
430 |         keyword (str): [description]
431 |         cursor (list): [description]
432 |         page (int): [description]
433 | 
434 |     Returns:
435 |         [type]: [description]
436 |     """
437 | 
438 |     RPC_ID = "HoAMBc"
439 | 
440 |     return json.dumps(
441 |         [
442 |             [
443 |                 [
444 |                     RPC_ID,
445 |                     json.dumps(
446 |                         [
447 |                             None,
448 |                             None,
449 |                             [
450 |                                 1,
451 |                                 None,
452 |                                 450,
453 |                                 1,
454 |                                 1280,
455 |                                 cursor[0],
456 |                                 [],
457 |                                 [],
458 |                                 None,
459 |                                 None,
460 |                                 None,
461 |                                 0,
462 |                                 310,
463 |                                 [],
464 |                             ],
465 |                             None,
466 |                             None,
467 |                             None,
468 |                             None,
469 |                             None,
470 |                             None,
471 |                             None,
472 |                             None,
473 |                             None,
474 |                             None,
475 |                             None,
476 |                             None,
477 |                             None,
478 |                             None,
479 |                             None,
480 |                             None,
481 |                             None,
482 |                             None,
483 |                             None,
484 |                             None,
485 |                             None,
486 |                             None,
487 |                             None,
488 |                             None,
489 |                             None,
490 |                             [
491 |                                 keyword,
492 |                                 None,
493 |                                 None,
494 |                                 "strict",
495 |                                 None,
496 |                                 None,
497 |                                 None,
498 |                                 None,
499 |                                 None,
500 |                                 None,
501 |                                 None,
502 |                                 None,
503 |                                 None,
504 |                                 None,
505 |                                 None,
506 |                                 None,
507 |                                 None,
508 |                                 None,
509 |                                 None,
510 |                                 None,
511 |                                 None,
512 |                                 "lnms",
513 |                             ],
514 |                             None,
515 |                             None,
516 |                             None,
517 |                             None,
518 |                             None,
519 |                             None,
520 |                             None,
521 |                             None,
522 |                             [
523 |                                 cursor[1],
524 |                                 "CAM=",
525 |                                 "CgtHUklEX1NUQVRFMBAaIAA=",
526 |                             ],
527 |                         ],
528 |                         separators=(",", ":"),
529 |                     ),
530 |                     None,
531 |                     "generic",
532 |                 ],
533 |             ]
534 |         ],
535 |         separators=(",", ":"),
536 |     )
537 | 
538 | 
539 | def pjson_loads(text):
540 |     """pjson_loads
541 | 
542 |     Tolerant JSON parser for the data used by image search.
543 | 
544 |     Original:
545 |         https://github.com/Wikidepia/py-googleimages/blob/b781b79e9bf40d29cf6fcbdcf625303abf3718bd/googleimages/utils.py
546 | 
547 |     Args:
548 |         text ([type]): [description]
549 | 
550 |     Returns:
551 |         [type]: [description]
552 |     """
553 |     while True:
554 |         try:
555 |             data = json.loads(text, strict=False)
556 |         except JSONDecodeError as exc:
557 |             if exc.msg == "Invalid \\escape":
558 |                 text = text[: exc.pos] + "\\" + text[exc.pos:]
559 |             else:
560 |                 raise
561 |         else:
562 |             return data
563 | 
--------------------------------------------------------------------------------
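Editor's note: gen_search_url above is a generator, so each next() yields the request for one result page; for text search the start parameter advances in steps of 100. A minimal sketch of inspecting the first two generated URLs without sending any request; the keyword is illustrative:

from pydork.engine_google import Google

engine = Google()
engine.set_lang('ja', 'JP')

gen = engine.gen_search_url('site:example.com', 'text')
for _ in range(2):
    method, url, data = next(gen)
    # First iteration: ...&start=0, second iteration: ...&start=100
    print(method, url, data)
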
/pydork/engine.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # Copyright (c) 2023 Blacknon. All rights reserved.
4 | # Use of this source code is governed by an MIT license
5 | # that can be found in the LICENSE file.
6 | # =======================================================
7 | 
8 | 
9 | # TODO: Add a hit number to the json output (for checking whether SEO measures are working)
10 | 
11 | """engine
12 | * Module for performing searches with SearchEngine
13 | """
14 | 
15 | 
16 | import os
17 | import pathlib
18 | import sys
19 | 
20 | from time import sleep
21 | from string import ascii_lowercase, digits
22 | from datetime import datetime
23 | 
24 | from .common import Color, Message
25 | from .common import set_counter
26 | from .engine_baidu import Baidu
27 | from .engine_bing import Bing
28 | from .engine_duckduckgo import DuckDuckGo
29 | from .engine_google import Google
30 | from .engine_yahoo import Yahoo
31 | 
32 | 
33 | # List of supported search engines
34 | ENGINES = ['baidu', 'bing', 'duckduckgo', 'google', 'yahoo']
35 | 
36 | 
37 | # Wrapper class that bundles the handling of the individual SearchEngine classes
38 | class SearchEngine:
39 |     """SearchEngine
40 | 
41 |     Wrapper class for performing a search against the specified search engine.
42 | 
43 |     Examples:
44 |         >>> search_engine = SearchEngine()
45 |         >>> search_engine.set('google')
46 |         >>>
47 |         >>> # Text search in the accepted query
48 |         >>> search_result = search_engine.search('zelda')
49 |         >>>
50 |         >>> # Image search in the accepted query
51 |         >>> search_result = search_engine.search('zelda', search_type='image')
52 |         >>>
53 |         >>> # Get Suggest in the accepted query
54 |         >>> search_result = search_engine.suggest('zelda')
55 |     """
56 | 
57 |     def __init__(self):
58 |         None
59 | 
60 |     # Specify which search engine to use
61 |     def set(self, engine: str):
62 |         """set
63 | 
64 |         A function that specifies which search engine to use.
65 | 
66 |         Args:
67 |             engine (str): Specify the search engine to use for the search (see const ENGINES)
68 |         """
69 | 
70 |         # TODO: Validate the value and raise an error for anything not allowed
71 |         if engine == 'baidu':
72 |             self.ENGINE = Baidu()
73 | 
74 |         elif engine == 'bing':
75 |             self.ENGINE = Bing()
76 | 
77 |         elif engine == 'duckduckgo':
78 |             self.ENGINE = DuckDuckGo()
79 | 
80 |         elif engine == 'google':
81 |             self.ENGINE = Google()
82 | 
83 |         elif engine == 'yahoo':
84 |             self.ENGINE = Yahoo()
85 | 
86 |         else:
87 |             raise Exception('Error!')
88 | 
89 |         self.IS_COLOR = False
90 | 
91 |         # Define the Message
92 |         self.MESSAGE = Message()
93 |         self.MESSAGE.set_engine(self.ENGINE.NAME, self.ENGINE.COLOR)
94 | 
95 |     # Pass a lock for multithreading (currently unused?)
96 |     def set_lock(self, lock):
97 |         """set_lock
98 | 
99 |         Function to pass lock for multithreading
100 | 
101 |         Args:
102 |             lock (threading.Lock): multithreading lock object
103 |         """
104 |         self.ENGINE.LOCK = lock
105 | 
106 |     # Enable the debug flag
107 |     def set_is_debug(self, is_debug: bool):
108 |         """set_is_debug
109 | 
110 |         set debug flag
111 | 
112 |         Args:
113 |             is_debug (bool): debug flag (Enable debug with `True`).
114 |         """
115 | 
116 |         self.ENGINE.IS_DEBUG = is_debug  # type: ignore
117 | 
118 |     # Enable the command flag (running in command mode)
119 |     def set_is_command(self, is_command: bool):
120 |         """set_is_command
121 | 
122 |         set command flag.
123 |         When the command flag is enabled, the contents used in the command will be output to the console.
124 | 
125 |         Args:
126 |             is_command (bool): command flag (Enable command mode with `True`).
127 |         """
128 |         self.ENGINE.IS_COMMAND = is_command  # type: ignore
129 | 
130 |     # Whether color output is enabled
131 |     def set_is_color(self, is_color: bool = False):
132 |         """set_is_color
133 | 
134 |         Specifies whether to display the output in color.
135 | 
136 |         Args:
137 |             is_color (bool): color flag (Enable color mode with `True`).
138 |         """
139 |         self.IS_COLOR = is_color
140 | 
141 |     # Enable the disable-headless flag (run Selenium with a visible browser window)
142 |     def set_disable_headless(self, disable_headless: bool):
143 |         """set_disable_headless
144 | 
145 |         Function to Disable Selenium's headless option.
146 |         Used when manually bypassing ReCaptcha or when debugging.
147 | 
148 |         Args:
149 |             disable_headless (bool): Disable Selenium headless option (disable with True)
150 | 
151 |         Examples:
152 |             >>> search_engine = SearchEngine()
153 |             >>> search_engine.set('google')
154 |             >>>
155 |             >>> # Set Selenium
156 |             >>> search_engine.set_selenium()
157 |             >>>
158 |             >>> # Disable headless mode
159 |             >>> search_engine.set_disable_headless(True)
160 |             >>>
161 |             >>> # Open browser and search query
162 |             >>> search_engine.search('mario')
163 | 
164 |         """
165 | 
166 |         self.ENGINE.IS_DISABLE_HEADLESS = disable_headless  # type: ignore
167 | 
168 |     # Build the cookie file path from the directory that holds the cookie files
169 |     def set_cookie_files(self, cookie_dir: str):
170 |         """set_cookie_files
171 | 
172 |         Function to specify and generate the cookie file name to be used by passing the directory to put the cookie file.
173 |         Currently, cookie files are only used with Selenium.
174 | 
175 |         Args:
176 |             cookie_dir (str): Directory path where cookie files are placed.
177 |         """
178 | 
179 |         # Convert to an absolute path
180 |         cookie_dir = pathlib.Path(cookie_dir).expanduser()  # type: ignore
181 |         cookie_dir = pathlib.Path(cookie_dir).resolve()  # type: ignore
182 | 
183 |         # Check existence and create the directory if it is missing
184 |         if not os.path.exists(cookie_dir):
185 |             # TODO: Handle the error case where a file, not a directory, exists at the path
186 | 
187 |             # Create the directory
188 |             os.mkdir(cookie_dir)
189 | 
190 |         # Switch the postfix according to the access method in use
191 |         postfix = ''
192 |         if self.ENGINE.USE_SELENIUM:
193 |             postfix = '_selenium'
194 |         elif self.ENGINE.USE_SPLASH:
195 |             postfix = '_splash'
196 |         else:
197 |             postfix = '_requests'
198 | 
199 |         # Build the path with the prefix and postfix attached
200 |         cookie_file = os.path.join(
201 |             cookie_dir, '.cookie_' + self.ENGINE.NAME.lower() + postfix)
202 | 
203 |         # Check existence and create the file if it is missing
204 |         if not os.path.exists(cookie_file):
205 |             open(cookie_file, 'a').close()
206 | 
207 |         # Set it on the ENGINE instance
208 |         self.ENGINE.COOKIE_FILE = cookie_file  # type: ignore
209 | 
210 |     # Specify whether cookies are deleted and recreated on every query
211 |     def set_cookie_files_delete(self, is_delete_cookie: bool):
212 |         """set_cookie_files_delete
213 | 
214 |         Function that specifies whether the cookie should be deleted and recreated each time the query is executed.
215 | 
216 |         Args:
217 |             is_delete_cookie (bool): delete flag.
218 |         """
219 | 
220 |         # Set it on the ENGINE instance
221 |         self.ENGINE.COOKIE_FILE_DELETE = is_delete_cookie  # type: ignore
222 | 
223 |     # Accept the language/country settings passed to the search engine
224 |     def set_lang(self, lang: str = "ja", locale: str = "JP"):
225 |         """set_lang
226 | 
227 |         Function to set the language / country specified by the search engine.
228 | 
229 |         Args:
230 |             lang (str): Language ([ja,en])
231 |             locale (str): Locale ([JP,US])
232 |         """
233 |         self.ENGINE.set_lang(lang, locale)
234 | 
235 |     # Specify the date range for the search
236 |     def set_range(self, start: datetime, end: datetime):
237 |         """set_range
238 | 
239 |         Specify the date of the search range.
240 | 
241 |         Args:
242 |             start (datetime): start time(datetime)
243 |             end (datetime): end time(datetime)
244 |         """
245 | 
246 |         self.ENGINE.set_range(start, end)
247 | 
248 |     # Accept the proxy settings
249 |     def set_proxy(self, proxy: str):
250 |         """set_proxy
251 | 
252 |         Set the proxy server to be used when searching.
253 | 
254 |         Args:
255 |             proxy (str): proxy uri(ex. socks5://localhost:11080, http://hogehoge:8080)
256 |         """
257 |         self.ENGINE.set_proxy(proxy)
258 | 
259 |     # Enable selenium
260 |     def set_selenium(self, uri: str = None, browser: str = None):  # type: ignore
261 |         """set_selenium
262 | 
263 |         Use Selenium (priority over Splash).
264 | 
265 |         Args:
266 |             uri (str, optional): Specify the `host:port` of Selenium (used when Selenium is started by docker etc.). Defaults to None.
267 |             browser (str, optional): Specify Browser to use with Selenium ([chrome, firefox]). Defaults to None.
268 |         """
269 | 
270 |         self.ENGINE.set_selenium(uri, browser)
271 | 
272 |     # Enable splash
273 |     def set_splash(self, splash_url: str):
274 |         """set_splash
275 | 
276 |         Use Splash (Selenium has priority).
277 | 
278 |         Args:
279 |             splash_url (str): Splash uri(ex: `localhost:8050`)
280 |         """
281 | 
282 |         self.ENGINE.set_splash(splash_url)
283 | 
284 |     # Accept the user_agent value
285 |     def set_user_agent(self, useragent: str = None):  # type: ignore
286 |         """set_user_agent
287 | 
288 |         Specify the UserAgent.
289 |         If not specified, FakeUA or hard-coded UserAgent will be used.
290 | 
291 | 
292 |         Args:
293 |             useragent (str, optional): useragent. Defaults to None.
294 |         """
295 | 
296 |         self.ENGINE.set_user_agent(useragent)
297 | 
298 |     # Disable ssl verification
299 |     def set_ignore_ssl(self, verify: bool):
300 |         """set_ignore_ssl
301 | 
302 |         Ignore ssl verify.
303 | 
304 |         Args:
305 |             verify (bool): bool.
306 |         """
307 |         self.ENGINE.set_ignore_ssl(verify)
308 | 
309 |     # Run a search
310 |     def search(self, keyword: str, search_type='text', maximum=100):
311 |         """search
312 | 
313 |         Search with a search engine.
314 | 
315 |         Args:
316 |             keyword (str): query.
317 |             search_type (str, optional): search type. text or image. Defaults to 'text'.
318 |             maximum (int, optional): Max count of searches. Defaults to 100.
319 | 
320 |         Returns:
321 |             [list]: [{'link': 'http://...', 'title': 'hogehoge...'}, {'link': '...', 'title': '...'}, ... ]
322 |         """
323 | 
324 |         # Pass is_command/is_debug to ENGINE.MESSAGE
325 |         self.MESSAGE.set_is_command(self.ENGINE.IS_COMMAND)
326 |         self.MESSAGE.set_is_debug(self.ENGINE.IS_DEBUG)
327 | 
328 |         # Set header
329 |         header = '[${ENGINE_NAME}Search]'
330 |         if self.IS_COLOR:
331 |             sc = Color(self.ENGINE.COLOR)
332 |             header = sc.out(header)
333 |         self.MESSAGE.set_header(header)
334 | 
335 |         # Pass Message() to ENGINE
336 |         self.ENGINE.set_messages(self.MESSAGE)
337 | 
338 |         if self.ENGINE.LANG == "" and self.ENGINE.LOCALE == "":
339 |             self.set_lang()
340 | 
341 |         # Output a message (only when run as a command)
342 |         colored_keyword = self.ENGINE.MESSAGE.ENGINE_COLOR.out(keyword)
343 |         self.ENGINE.MESSAGE.print_text(
344 |             "$ENGINE: {} Search: {}".format(
345 |                 search_type.capitalize(), colored_keyword),
346 |             use_header=False,
347 |             file=sys.stderr
348 | 
349 |         )
350 |         result, total = [], 0
351 | 
352 |         # If maximum is 0, zero results would be returned, so return as is
353 |         if maximum == 0:
354 |             return result
355 | 
356 |         # Set the ENGINE's proxy and browser options per access method (Selenium, Splash, requests) and create the browser (session)
357 |         self.ENGINE.create_session()
358 | 
359 |         # Start the search
360 |         gen_url = self.ENGINE.gen_search_url(keyword, search_type)
361 |         while True:
362 |             # Get the request URL
363 |             try:
364 |                 method, url, data = next(gen_url)
365 |             except Exception:
366 |                 break
367 | 
368 |             # debug
369 |             self.ENGINE.MESSAGE.print_text(
370 |                 url,
371 |                 mode='debug',
372 |                 separator=": ",  # type: ignore
373 |                 header=self.ENGINE.MESSAGE.HEADER + ': ' + \
374 |                 Color.GRAY + '[DEBUG]: [TargetURL]' + Color.END
375 |             )
376 | 
377 |             # debug
378 |             self.ENGINE.MESSAGE.print_text(
379 |                 self.ENGINE.USER_AGENT,
380 |                 mode='debug',
381 |                 separator=": ",  # type: ignore
382 |                 header=self.ENGINE.MESSAGE.HEADER + ': ' + \
383 |                 Color.GRAY + '[DEBUG]: [UserAgent]' + Color.END
384 |             )
385 | 
386 |             # Fetch the search results
387 |             html = self.ENGINE.get_result(
388 |                 url, method=method, data=data)  # type: ignore
389 | 
390 |             # debug
391 |             self.ENGINE.MESSAGE.print_text(
392 |                 html,
393 |                 mode='debug',
394 |                 separator=": ",  # type: ignore
395 |                 header=self.ENGINE.MESSAGE.HEADER + ': ' + \
396 |                 Color.GRAY + '[DEBUG]: [Response]' + Color.END
397 |             )
398 | 
399 |             # Initial value
400 |             is_recaptcha = False
401 | 
402 |             while True:
403 |                 # Check whether this is a ReCaptcha page
404 |                 if html is not None:
405 |                     is_recaptcha = self.ENGINE.check_recaptcha(html)
406 |                 else:
407 |                     break
408 | 
409 |                 if is_recaptcha:
410 |                     # Output when running as a command
411 |                     self.ENGINE.MESSAGE.print_text(
412 |                         'Oh, redirected to the ReCaptcha window.',
413 |                         mode='warn',
414 |                         header=self.ENGINE.MESSAGE.ENGINE,
415 |                         separator=": "
416 |                     )
417 | 
418 |                     # When a headless browser is in use
419 |                     if self.ENGINE.USE_SELENIUM or self.ENGINE.USE_SPLASH:
420 |                         # Hand off to the bypass function
421 |                         html = self.ENGINE.bypass_recaptcha(
422 |                             url, html)  # type: ignore
423 | 
424 |                         if html is not None:
425 |                             # debug
426 |                             self.ENGINE.MESSAGE.print_text(
427 |                                 html,
428 |                                 mode='debug',  # type: ignore
429 |                                 header=self.ENGINE.MESSAGE.HEADER + ': ' + Color.GRAY + \
430 |                                 '[DEBUG]: [ReCaptchaedResponse]' + Color.END,
431 |                                 separator=": "
432 |                             )
433 | 
434 |                     else:
435 |                         # Without a headless browser, output an error message saying ReCaptcha is not supported
436 |                         None
437 | 
438 |                 else:  # If is_recaptcha is False, break out of the while loop
439 |                     break
440 | 
441 |             # If html is None and the ReCaptcha check returned True
442 |             if html is None and is_recaptcha:
443 |                 # Output when running as a command
444 |                 self.ENGINE.MESSAGE.print_text(
445 |                     'Failed ReCaptcha. exit process.',
446 |                     mode='warn',
447 |                     header=self.ENGINE.MESSAGE.ENGINE,
448 |                     separator=": "
449 |                 )
450 | 
451 |                 break
452 | 
453 |             # TODO: Also pass result to the function and have it check for duplicates
454 |             # Parse the search results and get the list of URLs
455 |             links = self.ENGINE.get_links(
456 |                 url, html, search_type)  # type: ignore
457 | 
458 |             # Act according to the number of links
459 |             if not len(links):
460 |                 # Output when running as a command
461 |                 self.ENGINE.MESSAGE.print_text(
462 |                     'No more links.',
463 |                     header=self.ENGINE.MESSAGE.ENGINE,
464 |                     separator=": ",
465 |                     file=sys.stderr,
466 |                 )
467 | 
468 |                 # Break out of the loop
469 |                 if self.ENGINE.NAME == "Google":
470 |                     if self.ENGINE.SEARCH_NEXT_URL is None:  # type: ignore
471 |                         break
472 |                 else:
473 |                     break
474 | 
475 |             # If more than maximum would be returned, add links up to that count and break
476 |             elif len(links) > maximum - total:
477 |                 result += links[:maximum - total]
478 |                 break
479 | 
480 |             # TODO: Find a way to factor this out as Bing-only handling
481 |             elif len(links) < 10 and self.ENGINE.NAME == "Bing":
482 |                 # For Bing, the next page appears even below the count, so break on the count
483 |                 result += links[:maximum - total]
484 |                 break
485 | 
486 |             else:
487 |                 result += links
488 |                 total += len(links)
489 | 
490 |             # Wait 3 seconds, since consecutive requests cause problems
491 |             sleep(3)
492 | 
493 |         # Assign result numbers
494 |         result = set_counter(result)
495 | 
496 |         # Output when running as a command
497 |         self.ENGINE.MESSAGE.print_text(
498 |             # type: ignore
499 |             'Finally got ' + self.ENGINE.COLOR + \
500 |             str(len(result)) + Color.END + ' links.',
501 |             header=self.ENGINE.MESSAGE.ENGINE,
502 |             separator=": ",
503 |             file=sys.stderr,
504 |         )
505 | 
506 |         # save cookies
507 |         if self.ENGINE.COOKIE_FILE != '':
508 |             self.ENGINE.write_cookies()
509 | 
510 |         # delete cookie file
511 |         if self.ENGINE.COOKIE_FILE_DELETE:
512 |             os.remove(self.ENGINE.COOKIE_FILE)
513 | 
514 |         # Close the session
515 |         self.ENGINE.close_session()
516 | 
517 |         return result
518 | 
519 |     # Fetch suggestions
520 |     def suggest(self, keyword: str, jap=False, alph=False, num=False):
521 |         """suggest
522 | 
523 |         get suggest with a search engine.
524 | 
525 |         Args:
526 |             keyword (str): query
527 |             jap (bool, optional): with japanese char. Defaults to False.
528 |             alph (bool, optional): with alphabet char. Defaults to False.
529 |             num (bool, optional): with number. Defaults to False.
530 | 
531 |         Returns:
532 |             [dict]: {'with char': ['suggest1', 'suggest2' ...]}
533 |         """
534 | 
535 |         # Set the ENGINE's proxy and browser options per access method (Selenium, Splash, requests) and create the browser (session)
536 |         self.ENGINE.create_session()
537 | 
538 |         # Build the character list
539 |         chars = ['', ' ']
540 | 
541 |         # If the jap flag is set, append Japanese characters to the keyword when fetching suggestions
542 |         chars += [' ' + chr(i) for i in range(12353, 12436)] if jap else []
543 | 
544 |         # If the alph flag is set, append alphabet characters to the keyword when fetching suggestions
545 |         chars += [' ' + char for char in ascii_lowercase] if alph else []
546 | 
547 |         # If the num flag is set, append digits to the keyword when fetching suggestions
548 |         chars += [' ' + char for char in digits] if num else []
549 | 
550 |         # Fetch the suggestions
551 |         suggests = {}
552 |         for char in chars:
553 |             word = keyword + char
554 |             url = self.ENGINE.gen_suggest_url(word)
555 |             html = self.ENGINE.get_result(url)
556 | 
557 |             # TODO: json/text conversion needs to be implemented separately for each engine
558 |             suggests = self.ENGINE.get_suggest_list(
559 |                 suggests, char, html)  # type: ignore
560 | 
561 |             sleep(0.5)
562 | 
563 |         # Close the session
564 |         self.ENGINE.close_session()
565 | 
566 |         return suggests
567 | 
--------------------------------------------------------------------------------
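Editor's note: a minimal sketch tying together the SearchEngine setters defined above; the engine name, dates, query, and result count are illustrative values, and whether an engine honors the date range depends on its own URL generator. Per the search() docstring, each result dict carries at least 'link' and 'title':

from datetime import datetime

from pydork.engine import SearchEngine

search_engine = SearchEngine()
search_engine.set('bing')
search_engine.set_is_command(True)

# Restrict results to pages from calendar year 2023.
search_engine.set_range(datetime(2023, 1, 1), datetime(2023, 12, 31))

for item in search_engine.search('pydork', search_type='text', maximum=10):
    print(item['title'], item['link'])
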
44 | """ 45 | 46 | # Class作成時の処理 47 | def __init__(self): 48 | # headless browserの利用有無フラグ(デフォルト: False) 49 | self.USE_SELENIUM = False 50 | self.USE_SPLASH = False 51 | 52 | # 初期値の作成 53 | self.LOCK = None 54 | self.COOKIE_FILE = '' 55 | self.COOKIE_FILE_DELETE = False 56 | self.SPLASH_URI = '' 57 | self.PROXY = '' 58 | self.USER_AGENT = '' 59 | self.LANG = '' 60 | self.LOCALE = '' 61 | self.IS_DEBUG = False 62 | self.IS_COMMAND = False 63 | self.IS_DISABLE_HEADLESS = False 64 | self.MESSAGE: Message 65 | self.IGNORE_SSL_VERIFY = False 66 | 67 | # ReCaptcha画面かどうかの識別用(初期値(ブランク)) 68 | self.RECAPTCHA_SITEKEY = '' 69 | self.SOUP_RECAPTCHA_TAG = '' 70 | self.SOUP_RECAPTCHA_SITEKEY = '' 71 | 72 | # 検索エンジンにわたす言語・国の設定を受け付ける 73 | def set_lang(self, lang: str, locale: str): 74 | """set_lang 75 | 76 | 検索エンジンで指定する言語・国の設定を行う関数 77 | 78 | Args: 79 | lang (str): 検索エンジンのパラメータで指定する言語を指定する([ja,en]) 80 | locale (str): 検索エンジンのパラメータで指定する国を指定する([JP,US]) 81 | """ 82 | 83 | self.LANG = lang 84 | self.LOCALE = locale 85 | 86 | # 検索時の日時範囲を指定 87 | def set_range(self, start: datetime, end: datetime): 88 | """set_range 89 | 90 | 検索エンジンで指定する日付範囲を指定する 91 | 92 | Args: 93 | start (datetime): 検索対象ページの対象範囲開始日時(datetime) 94 | end (datetime): 検索対象ページの対象範囲終了日時(datetime) 95 | """ 96 | self.RANGE_START = start 97 | self.RANGE_END = end 98 | 99 | # user_agentの設定値を受け付ける(引数がない場合はランダム。Seleniumの際は自動的に使用したbrowserのagentを指定) 100 | def set_user_agent(self, user_agent: str = None, browser: str = None): # type: ignore 101 | """set_user_agent 102 | 103 | user_agentの値を受け付ける. 104 | user_agentの指定がない場合、 Chromeを使用したものとする. 105 | また、もし`browser`が指定されている場合はそのブラウザのUser Agentを指定する. 106 | 107 | 注) seleniumを利用する場合、事前に有効にする必要がある。 108 | 109 | Args: 110 | user_agent (str, optional): User Agentを指定する. Defaults to None. 111 | browser (str, optional): Seleniumで使用するBrowserを指定する([chrome, firefox]). Defaults to None. 112 | """ 113 | 114 | if user_agent is None: 115 | # seleniumが有効になっている場合、そのままSeleniumで利用するブラウザのUAを使用する 116 | if self.USE_SELENIUM: 117 | user_agent = '' 118 | else: 119 | try: 120 | ua = UserAgent(verify_ssl=False, use_cache_server=True) 121 | if user_agent is None: 122 | if browser is None: 123 | user_agent = ua.firefox 124 | 125 | elif browser == 'chrome': 126 | user_agent = ua.chrome 127 | 128 | elif browser == 'firefox': 129 | user_agent = ua.chrome 130 | 131 | except Exception: 132 | user_agent = 'Mozilla/5.0 (Linux; Android 10; SM-A205U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Mobile Safari/537.36.' 133 | 134 | self.USER_AGENT = user_agent 135 | 136 | # seleniumを有効にする 137 | # - splashより優先 138 | # - host, browserは、指定がない場合はそれぞれデフォルト設定(hostは指定なし、browserはchrome)での動作 139 | # - browserは `chrome` or `firefox` のみ受け付ける 140 | def set_selenium(self, uri: str = None, browser: str = None): # type: ignore 141 | """set_selenium 142 | 143 | 検索時にSelenium経由で通信を行う. 144 | 他のHeadless Browserと比較して最優先(Splash等が有効でもこちらが優先される). 145 | 146 | Args: 147 | uri (str, optional): APIのURIを指定(localhost:4444). Defaults to None. 148 | browser (str, optional): 使用するブラウザを指定([chrome, firefox]). Defaults to None. 
149 | """ 150 | 151 | # 入力値検証(browser: chrome or firefox) 152 | if browser is None: 153 | browser = 'chrome' 154 | 155 | # USE_SELENIUM to True 156 | self.USE_SELENIUM = True 157 | self.SELENIUM_URI = uri 158 | self.SELENIUM_BROWSER = browser 159 | 160 | # proxyの設定を受け付ける 161 | def set_proxy(self, proxy: str): 162 | """set_proxy 163 | 164 | 検索時に使用するProxyを指定する(uri指定) 165 | 166 | Args: 167 | proxy (str): ProxyのURIを指定する(socks5://localhost:11080, http://hogehoge:8080) 168 | """ 169 | 170 | self.PROXY = proxy 171 | 172 | # splash urlの値を受け付ける 173 | def set_splash(self, splash_url: str): 174 | """set_splash 175 | 176 | 検索時にSplashを有効にする. 177 | (Seleniumと同時に有効化されている場合、Seleniumを優先する) 178 | 179 | Args: 180 | splash_url (str): Splashのアクセス先URIを指定する(ex: `localhost:8050`) 181 | """ 182 | 183 | self.USE_SPLASH = True 184 | self.SPLASH_URI = splash_url 185 | 186 | # common.Messageを受け付ける 187 | def set_messages(self, message: Message): 188 | self.MESSAGE = message 189 | 190 | # sslのチェックを無効にする 191 | def set_ignore_ssl(self, verify: bool): 192 | self.IGNORE_SSL_VERIFY = verify 193 | 194 | # cookieをcookiefileから取得する 195 | def read_cookies(self): 196 | """read_cookies 197 | 198 | `self.COOKIE_FILE` からcookieを読み込む. 199 | 現時点ではSeleniumでのみ動作. 200 | """ 201 | 202 | # cookieファイルが存在しない場合、空ファイルで作成する 203 | exist_cookie_file = os.path.isfile(self.COOKIE_FILE) 204 | if not exist_cookie_file: 205 | cookie_file = open(self.COOKIE_FILE, 'w') 206 | cookie_file.write('') 207 | cookie_file.close() 208 | 209 | # cookieファイルのサイズを取得 210 | file_size = os.path.getsize(self.COOKIE_FILE) 211 | 212 | # cookieファイルのサイズが0以上の場合 213 | if file_size > 0: 214 | # cookie fileからcookieの取得 215 | cookies = pickle.load(open(self.COOKIE_FILE, "rb")) 216 | 217 | # seleniumを使う場合 218 | if self.USE_SELENIUM: 219 | # 事前アクセスが必要になるため、検索対象ドメインのTOPページにアクセスしておく 220 | self.driver.get(self.ENGINE_TOP_URL) # type: ignore 221 | 222 | # cookieを設定していく 223 | for cookie in cookies: 224 | try: 225 | self.driver.add_cookie(cookie) 226 | except Exception: 227 | pass 228 | 229 | # splashを使う場合 230 | elif self.USE_SPLASH: 231 | # NOTE: 動作しないためコメントアウト 232 | # TODO: 確認して修正 233 | # self.session.cookies.update(cookies) 234 | None 235 | 236 | # requestを使う場合 237 | else: 238 | # NOTE: 動作しないためコメントアウト 239 | # TODO: 確認して修正 240 | # self.session.cookies.update(cookies) 241 | None 242 | 243 | # cookieをcookiefileに書き込む 244 | def write_cookies(self): 245 | """write_cookies 246 | 247 | cookiesを `self.COOKIE_FILE` に書き込む. 248 | 249 | """ 250 | 251 | cookies = None 252 | 253 | # seleniumを使う場合 254 | if self.USE_SELENIUM: 255 | cookies = self.driver.get_cookies() 256 | 257 | # splashを使う場合 258 | elif self.USE_SPLASH: 259 | cookies = self.session.cookies 260 | 261 | # requestを使う場合 262 | else: 263 | cookies = self.session.cookies 264 | 265 | # cookieを書き込み 266 | with open(self.COOKIE_FILE, 'wb') as f: 267 | pickle.dump(cookies, f) 268 | 269 | # seleniumのOptionsを作成 270 | def create_selenium_options(self): 271 | """create_selenium_options 272 | 273 | Seleniumのoptionsを生成して返す. 274 | 275 | Returns: 276 | Options: 指定されたブラウザに応じたSeleniumのOptionsを返す. 
    # Build the Selenium Options
    def create_selenium_options(self):
        """create_selenium_options

        Generate and return the Selenium options.

        Returns:
            Options: Selenium Options for the configured browser.
        """

        # Per-browser handling
        if self.SELENIUM_BROWSER == 'chrome':
            options = ChromeOptions()

            # set ssl verify
            if self.IGNORE_SSL_VERIFY:
                options.add_argument('ignore-certificate-errors')

        elif self.SELENIUM_BROWSER == 'firefox':
            options = FirefoxOptions()

        # set headless option
        if not self.IS_DISABLE_HEADLESS:
            options.add_argument('--headless')

        # set user_agent option
        if self.USER_AGENT != '':
            options.add_argument('--user-agent=%s' % self.USER_AGENT)

        return options

    # Create the Selenium driver
    def create_selenium_driver(self):
        """create_selenium_driver

        Create the driver used by Selenium.
        The Options are also created in this function.
        """

        # Get the options
        options = self.create_selenium_options()

        # Create the driver for the configured browser
        if self.SELENIUM_BROWSER == 'chrome':
            # Add the proxy
            if self.PROXY != '':
                options.add_argument('--proxy-server=%s' % self.PROXY)

            try:
                chromedriver_autoinstaller.install()
            except Exception:
                pass

            self.driver = Chrome(options=options)

        elif self.SELENIUM_BROWSER == 'firefox':
            # Create the profile
            profile = webdriver.FirefoxProfile()
            profile.set_preference('devtools.jsonview.enabled', False)
            profile.set_preference('plain_text.wrap_long_lines', False)
            profile.set_preference('view_source.wrap_long_lines', False)

            # Add the proxy
            if self.PROXY != '':
                # Parse self.PROXY
                parsed_uri = parse.urlparse(self.PROXY)

                # socks4 / socks5 (the two branches differ only in the version number)
                if parsed_uri.scheme in ('socks4', 'socks5'):
                    socks_version = 5 if parsed_uri.scheme == 'socks5' else 4

                    # Add the proxy preferences
                    profile.set_preference('network.proxy.type', 1)
                    profile.set_preference(
                        'network.proxy.socks_version', socks_version)
                    profile.set_preference(
                        'network.proxy.socks', parsed_uri.hostname)
                    profile.set_preference(
                        'network.proxy.socks_port', parsed_uri.port)
                    profile.set_preference('network.proxy.no_proxies_on', '')
                    profile.set_preference(
                        'network.proxy.socks_remote_dns', True)
                    profile.update_preferences()

            # set ssl verify (Firefox handles this via the profile, so it lives here)
            if self.IGNORE_SSL_VERIFY:
                profile.accept_untrusted_certs = True

            try:
                geckodriver_autoinstaller.install()
            except Exception:
                pass
            self.driver = Firefox(options=options, firefox_profile=profile)

        # Record the User Agent the driven browser actually reports
        user_agent = self.driver.execute_script("return navigator.userAgent")
        self.set_user_agent(user_agent)

        return
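    # Parsing sketch (illustrative comment, not part of the original source):
    # `parse.urlparse` splits the proxy URI into exactly the pieces fed to the
    # Firefox preferences above, e.g.:
    #
    #     parsed = parse.urlparse('socks5://localhost:11080')
    #     parsed.scheme    # 'socks5'
    #     parsed.hostname  # 'localhost'
    #     parsed.port      # 11080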
    # Send a request through Selenium
    def request_selenium(self, url: str, method='GET', data=None):
        """request_selenium

        Send a GET/POST request through Selenium and return the result as html (a string).

        Args:
            url (str): url to request.
            method (str): request method.
            data (str): data used with the POST method.

        Returns:
            str: html string.
        """

        if method == 'GET':
            self.driver.get(url)

            # Wait until the document has finished loading
            WebDriverWait(self.driver, 15).until(
                lambda driver: driver.execute_script(
                    'return document.readyState') == 'complete')

            # Give slower engines a generous implicit wait for element lookups
            if self.NAME in ('Bing', 'Baidu', 'DuckDuckGo'):  # type: ignore
                self.driver.implicitly_wait(20)

            # get result
            result = self.driver.page_source

        elif method == 'POST':
            # POST via the driver's requests-style `request()` helper
            response = self.driver.request('POST', url, data=data)

            # Wait until the document has finished loading
            WebDriverWait(self.driver, 15).until(
                lambda driver: driver.execute_script(
                    'return document.readyState') == 'complete')

            # Give slower engines a generous implicit wait for element lookups
            if self.NAME in ('Bing', 'Baidu', 'DuckDuckGo'):  # type: ignore
                self.driver.implicitly_wait(20)

            # get result
            result = response.text

        return result

    # Send a request through Splash
    def request_splash(self, url: str, method='GET', data=None):
        """request_splash

        Send a GET/POST request through Splash and return the result as html (a string).

        Args:
            url (str): url to request.
            method (str): request method.
            data (str): data used with the POST method.

        Returns:
            str: html string.
        """

        # Build the url
        splash_url = 'http://' + self.SPLASH_URI + '/render.html'

        # param
        params = {
            'url': url
        }

        # When a proxy is configured
        if self.PROXY != '':
            params['proxy'] = self.PROXY

        # Send the request and get the response
        if method == 'GET':
            result = self.session.get(splash_url, params=params).text

        # NOTE: Google image search POSTs cannot be rendered by Splash, so as a special case use requests directly.
        # TODO: rewrite once Splash can render it.
        elif method == 'POST' and self.NAME == 'Google' and self.IMAGE_URL in url:  # type: ignore
            # create session
            session = requests.session()

            # Set the proxy
            if self.PROXY != '':
                proxies = {
                    'http': self.PROXY,
                    'https': self.PROXY
                }
                session.proxies = proxies

            # Set the user-agent
            if self.USER_AGENT != '':
                session.headers.update(
                    {
                        'User-Agent': self.USER_AGENT,
                        'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3'
                    }
                )

            result = session.post(url, data=data).text

        elif method == 'POST':
            headers = {'Content-Type': 'application/json'}
            params['http_method'] = 'POST'
            params['body'] = parse.urlencode(data)  # type: ignore

            result = self.session.post(
                splash_url,
                headers=headers,
                json=params
            ).text

        return result
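    # Splash sketch (illustrative comment, not part of the original source):
    # render.html is Splash's plain HTML-rendering endpoint, so outside this
    # class the same GET boils down to (assuming a local Splash on :8050):
    #
    #     import requests
    #     html = requests.get('http://localhost:8050/render.html',
    #                         params={'url': 'https://example.com'}).text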
    # Create/configure the headless browser (Selenium/Splash) or requests session, and load cookies
    def create_session(self):
        """create_session

        Create the driver or session for the configured transport (depending on whether a headless browser such as Selenium is used).
        Loading cookies and setting the proxy, when required, also happen in this function.
        """

        # When using Selenium
        if self.USE_SELENIUM:
            self.create_selenium_driver()

        # When using Splash
        elif self.USE_SPLASH:
            # create session
            self.session = requests.session()

            # Set the user-agent
            if self.USER_AGENT != '':
                self.session.headers.update(
                    {
                        'User-Agent': self.USER_AGENT,
                        'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3'
                    }
                )

        # When using requests
        else:
            # create session
            self.session = requests.session()

            # Raise the redirect limit to 60 (for Baidu)
            self.session.max_redirects = 60

            # Set the proxy
            if self.PROXY != '':
                proxies = {
                    'http': self.PROXY,
                    'https': self.PROXY
                }
                self.session.proxies = proxies

            # Set the user-agent
            if self.USER_AGENT != '':
                self.session.headers.update(
                    {
                        'User-Agent': self.USER_AGENT,
                        'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3'
                    }
                )

        # If a cookie file is configured, load it
        if self.COOKIE_FILE != '':
            self.read_cookies()

        return

    # Close the session
    def close_session(self):
        if self.USE_SELENIUM:
            self.driver.quit()
        else:
            self.session.close()

    # Send a request and get the html (a wrapper that dispatches to selenium/splash/requests)
    def get_result(self, url: str, method='GET', data=None):
        """get_result

        Send a request to the url via the configured transport and return the html as a string.

        Args:
            url (str): url to request.
            method (str): request method.
            data (str): data used with the POST method.

        Returns:
            str: html string.
        """

        # Priority 1: access via Selenium
        if self.USE_SELENIUM:
            result = self.request_selenium(url, method=method, data=data)

            # NOTE: browser scrolling via Selenium. No longer needed for Google and it
            #       only slowed things down, so it is commented out for now.
            # for i in range(0, 10):
            #     self.driver.execute_script(
            #         "window.scrollTo(0,document.body.scrollHeight)"
            #     )
            #     time.sleep(0.5)

        # Priority 2: access via Splash (when Selenium is not enabled)
        elif self.USE_SPLASH:
            result = self.request_splash(url, method=method, data=data)

        # Priority 3: request via requests.session (when neither Selenium nor Splash is enabled)
        else:
            if method == 'GET':
                result = self.session.get(
                    url, verify=not self.IGNORE_SSL_VERIFY).text
            elif method == 'POST':
                result = self.session.post(
                    url, verify=not self.IGNORE_SSL_VERIFY, data=data).text

        return result

    # Generate the search url
    def gen_search_url(self, keyword: str, type: str):
        """gen_search_url

        Generate the url used for searching.
        Meant to be overridden by each search engine.

        Args:
            keyword (str): search query.
            type (str): search type.

        Returns:
            str: request method.
            dict: search url.
            dict: data.
        """

        result = {}
        return 'GET', result, None
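    # Override sketch (illustrative comment, not part of the original source):
    # a concrete engine returns the method, the fully-built query url(s), and
    # any POST body. The exact dict layout is engine-specific; a hypothetical
    # text-search override might look like:
    #
    #     def gen_search_url(self, keyword: str, type: str):
    #         search_url = 'https://www.example-engine.test/search?' + \
    #             parse.urlencode({'q': keyword, 'hl': self.LANG})
    #         return 'GET', {'url': search_url}, None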
    # Aggregating function that extracts links from text/image search results
    def get_links(self, source_url, html: str, type: str):
        """get_links

        Parse the given html and return the search results as a processed list.

        Args:
            source_url (str): url of the search results being parsed.
            html (str): html of the search results being parsed.
            type (str): search type ([text, image]); currently only text is handled here.

        Returns:
            list: search results (`[{'title': 'title...', 'link': 'https://hogehoge....'}, {...}]`)
        """

        # Parse with BeautifulSoup
        soup = BeautifulSoup(html, 'lxml')

        if type == 'text':
            # Get the link/title combinations
            elinks, etitles, etexts = self.get_text_links(soup)

            # before processing elinks
            self.MESSAGE.print_text(
                ','.join(elinks),  # type: ignore
                header=self.MESSAGE.HEADER + ': ' + Color.BLUE +
                '[BeforeProcessing elinks]' + Color.END,
                separator=" :",
                mode="debug",
            )

            # before processing etitles
            self.MESSAGE.print_text(
                ','.join(etitles),  # type: ignore
                header=self.MESSAGE.HEADER + ': ' + Color.BLUE +
                '[BeforeProcessing etitles]' + Color.END,
                separator=" :",
                mode="debug",
            )

            # Hand off to the post-processing function (overridden per engine)
            elinks, etitles, etexts = self.processings_elist(
                elinks, etitles, etexts)

            # after processing elinks
            self.MESSAGE.print_text(
                ','.join(elinks),  # type: ignore
                header=self.MESSAGE.HEADER + ': ' +
                Color.GREEN + '[AfterProcessing elinks]' + Color.END,
                separator=" :",
                mode="debug",
            )

            # after processing etitles
            self.MESSAGE.print_text(
                ','.join(etitles),  # type: ignore
                header=self.MESSAGE.HEADER + ': ' +
                Color.GREEN + '[AfterProcessing etitles]' + Color.END,
                separator=" :",
                mode="debug",
            )

            # Convert into a list of dicts
            # [{'title': 'title...', 'link': 'https://hogehoge....'}, {...}]
            links = self.create_text_links(source_url, elinks, etitles, etexts)

            return links

        elif type == 'image':
            links = self.get_image_links(soup)

            return links

    # Generate the text search results (links([{link: ..., title: ...},...])) for a page
    def get_text_links(self, soup: BeautifulSoup):
        """get_text_links

        Parse a text search results page from BeautifulSoup and return the results.

        Args:
            soup (BeautifulSoup): BeautifulSoup object to parse.

        Returns:
            list: link results ([xxx,xxx,xxx...])
            list: title results ([xxx,xxx,xxx...])
            list: text results ([xxx,xxx,xxx...])
        """
        # Get the link urls
        self.MESSAGE.print_text(
            self.SOUP_SELECT_URL,  # type: ignore
            header=self.MESSAGE.HEADER + ': ' +
            Color.GREEN + '[get_text_link.SOUP_SELECT_URL]' + Color.END,
            separator=" :",
            mode="debug",
        )
        elements = soup.select(self.SOUP_SELECT_URL)
        elinks = [e['href'] for e in elements]

        # Get the link titles
        self.MESSAGE.print_text(
            self.SOUP_SELECT_TITLE,  # type: ignore
            header=self.MESSAGE.HEADER + ': ' +
            Color.GREEN + '[get_text_link.SOUP_SELECT_TITLE]' + Color.END,
            separator=" :",
            mode="debug",
        )
        elements = soup.select(self.SOUP_SELECT_TITLE)
        etitles = [e.text for e in elements]

        # Get the link texts
        self.MESSAGE.print_text(
            self.SOUP_SELECT_TEXT,  # type: ignore
            header=self.MESSAGE.HEADER + ': ' +
            Color.GREEN + '[get_text_link.SOUP_SELECT_TEXT]' + Color.END,
            separator=" :",
            mode="debug",
        )
        elements = soup.select(self.SOUP_SELECT_TEXT)
        etext = [e.text for e in elements]

        return elinks, etitles, etext
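    # Selector sketch (illustrative comment, not part of the original source):
    # SOUP_SELECT_URL/_TITLE/_TEXT are CSS selectors each engine defines; the
    # extraction above is plain BeautifulSoup. With made-up selectors:
    #
    #     soup = BeautifulSoup(
    #         '<div class="r"><a href="https://example.com">'
    #         '<h3>Example</h3></a></div>', 'lxml')
    #     [e['href'] for e in soup.select('div.r > a')]    # ['https://example.com']
    #     [e.text for e in soup.select('div.r > a > h3')]  # ['Example']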
    # Generate the image search results (links(list())) for a page
    def get_image_links(self, soup: BeautifulSoup):
        """get_image_links

        Parse an image search results page from BeautifulSoup and return the results.
        (The actual work is done in each search engine's own function.)

        Args:
            soup (BeautifulSoup): BeautifulSoup object to parse.

        Returns:
            list: search results (`[{'title': 'title...', 'link': 'https://hogehoge....'}, {...}]`)
        """

        links = []

        return links

    # Hook for extra processing when generating elinks/etitles
    def processings_elist(self, elinks, etitles, etexts: list):
        """processings_elist

        Post-process the freshly extracted elinks/etitles inside self.get_links.
        Overridden by each search engine's class as needed.

        Args:
            elinks (list): list of elinks (result links)
            etitles (list): list of etitles (result titles)
            etexts (list): list of etexts (result texts)

        Returns:
            elinks (list): list of elinks (result links)
            etitles (list): list of etitles (result titles)
            etexts (list): list of etexts (result texts)
        """

        return elinks, etitles, etexts

    # Generate links ([{link: ..., title: ...},...]) from one page of text search results
    def create_text_links(self, source_url: str, elinks, etitles, etext: list):
        """create_text_links

        Build links (the data returned by get_links) from elinks and etitles.

        Args:
            source_url (str): url the results came from.
            elinks (list): list of elinks (result links)
            etitles (list): list of etitles (result titles)
            etext (list): list of etext (result texts)

        Returns:
            list: search results (`[{'title': 'title...', 'link': 'https://hogehoge....', 'text': 'hogehoge fugafuga...'}, {...}]`)
        """

        links = list()
        n = 0
        before_link = ""
        for link in elinks:
            d = dict()
            d['link'] = link

            # Add the etitle (title of the url) to the dict
            if len(etitles) > n:
                d['title'] = etitles[n]

            # Add the etext (result text for the url) to the dict
            if len(etext) > n:
                d['text'] = etext[n]

            # Add the source url to the dict
            d['source_url'] = source_url

            # Skip consecutive duplicates of the same link
            if before_link != link:
                links.append(d)

            before_link = link
            n += 1

        return links

    # Generate the url used to fetch suggestions
    def gen_suggest_url(self, keyword: str):
        """gen_suggest_url

        Generate the url used to fetch suggestions.
        Meant to be overridden by each search engine.

        Args:
            keyword (str): search query.

        Returns:
            dict: suggest url
        """

        result = {}
        return result

    # Fetch the suggestions
    def get_suggest_list(self, suggests: list, char: str, html: str):
        """get_suggest_list

        Extract the suggestions from html as a list.
        The actual work is done by each search engine's class override.

        Args:
            suggests (list): base list that suggestions are appended to.
            char (str): suggestion seed string.
            html (str): html to parse.

        Returns:
            dict: suggestions
        """
        result = {}
        return result

    # Detect whether a page is a ReCaptcha screen
    def check_recaptcha(self, html: str):
        """check_recaptcha

        Use `self.SOUP_RECAPTCHA_TAG` to decide whether the html is a ReCaptcha screen.

        Args:
            html (str): html of the page to check

        Returns:
            bool: whether it is a ReCaptcha screen (True if so)
        """

        result = False

        # Check with BeautifulSoup
        soup = BeautifulSoup(html, 'lxml')

        # Check whether the element exists
        if self.SOUP_RECAPTCHA_TAG != '':
            elements = soup.select(self.SOUP_RECAPTCHA_TAG)

            # Check the elements
            if len(elements) > 0:
                result = True

        return result
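    # Detection sketch (illustrative comment, not part of the original
    # source): with a hypothetical selector such as
    # SOUP_RECAPTCHA_TAG = 'div.g-recaptcha', the check above reduces to:
    #
    #     soup = BeautifulSoup('<div class="g-recaptcha"></div>', 'lxml')
    #     len(soup.select('div.g-recaptcha')) > 0   # True -> ReCaptcha screen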
    # Bypass ReCaptcha (wrapper)
    def bypass_recaptcha(self, url: str, html: str):
        """bypass_recaptcha

        Bypass a ReCaptcha screen.
        The actual work is delegated to the Selenium/Splash handler for the browser in use.

        Args:
            url (str): url of the request that ran into the ReCaptcha screen
            html (str): html of the ReCaptcha screen

        Returns:
            str: html of the url after getting past the ReCaptcha
        """

        # When using Selenium
        if self.USE_SELENIUM:
            html = self.bypass_recaptcha_selenium(url, html)

        # When using Splash
        elif self.USE_SPLASH:
            html = self.bypass_recaptcha_splash(url, html)

        return html

    # Bypass ReCaptcha with Selenium
    def bypass_recaptcha_selenium(self, url: str, html: str):
        """bypass_recaptcha_selenium

        Get past a ReCaptcha with Selenium.
        The actual work is implemented in each search engine's class.

        Args:
            url (str): url of the request that ran into the ReCaptcha screen
            html (str): html of the ReCaptcha screen

        Returns:
            str: html of the url after getting past the ReCaptcha
        """

        return html

    # Bypass ReCaptcha with Splash
    def bypass_recaptcha_splash(self, url: str, html: str):
        """bypass_recaptcha_splash

        Get past a ReCaptcha with Splash.
        The actual work is implemented in each search engine's class.

        Args:
            url (str): url of the request that ran into the ReCaptcha screen
            html (str): html of the ReCaptcha screen

        Returns:
            str: html of the url after getting past the ReCaptcha
        """

        return html
--------------------------------------------------------------------------------