├── docker-compose.yml
├── .github
│   └── workflows
│       ├── build_docker.yml
│       └── test_scraping.yml
├── setup.cfg
├── docs
│   ├── modules.rst
│   ├── setup.rst
│   ├── pydork.rst
│   ├── index.rst
│   ├── Makefile
│   ├── make.bat
│   └── conf.py
├── completion
│   ├── pydork-completion.bash
│   └── _pydork
├── pydork
│   ├── engine_yandex.py
│   ├── messages.py
│   ├── common.py
│   ├── engine_duckduckgo.py
│   ├── __init__.py
│   ├── engine_yahoo.py
│   ├── recaptcha.py
│   ├── engine_bing.py
│   ├── engine_baidu.py
│   ├── sub_commands.py
│   ├── test_engine.py
│   ├── test_engine_selenium.py
│   ├── engine_google.py
│   ├── engine.py
│   └── engine_common.py
├── Dockerfile
├── LICENSE
├── .gitignore
├── setup.py
├── README.md
└── README.rst

/docker-compose.yml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.github/workflows/build_docker.yml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[build_sphinx]
source-dir = docs/
build-dir = docs/_build
all_files = 1
--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
pydork
======

.. toctree::
   :maxdepth: 4

   pydork
   setup
--------------------------------------------------------------------------------
/docs/setup.rst:
--------------------------------------------------------------------------------
setup module
============

.. automodule:: setup
   :members:
   :undoc-members:
   :show-inheritance:
--------------------------------------------------------------------------------
/docs/pydork.rst:
--------------------------------------------------------------------------------
pydork package
==============

.. automodule:: pydork
   :members:
   :undoc-members:
   :show-inheritance:

Submodules
----------

pydork.engine module
--------------------

.. automodule:: pydork.engine
   :members:
   :undoc-members:
   :show-inheritance:
--------------------------------------------------------------------------------
/completion/pydork-completion.bash:
--------------------------------------------------------------------------------
#!bash
# =======================================================

_pydork() {
    local cur
    local cmd

    cur=${COMP_WORDS[$COMP_CWORD]}
    cmd=(${COMP_WORDS[@]})

    if [[ "$cur" == -* ]]; then
        COMPREPLY=($(compgen -W "-h --help" -- $cur))
        return 0
    fi
}

complete -F _pydork -o default pydork
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
.. pydork documentation master file, created by
   sphinx-quickstart on Sun Feb 13 19:47:15 2022.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to pydork's documentation!
==================================

.. toctree::
   :maxdepth: 2
   :caption: Contents:
   :glob:

   pydork

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
--------------------------------------------------------------------------------
/pydork/engine_yandex.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================

"""engine_yandex
* Module holding the search class for Yandex (yandex.com).
"""

from .common import Color
from .engine_common import CommonEngine


class Yandex(CommonEngine):
    """Yandex

    Search engine class for Yandex (currently a stub).
    """

    def __init__(self):
        pass
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================

FROM debian:bullseye

ENV DEBIAN_FRONTEND noninteractive

# apt update
RUN apt update

# apt install
RUN apt install -y \
    firefox-esr \
    python3-pip

RUN pip3 install --upgrade pip
RUN pip3 install --upgrade pip setuptools

# copy directory
COPY ./ /opt/pydork
WORKDIR /opt/pydork

# listing /opt/pydork
RUN ls -la /opt/pydork

# pip install
RUN pip3 install --use-pep517 ./
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 blacknon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# vscode
.vscode

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# Selenium
geckodriver.log
--------------------------------------------------------------------------------
/completion/_pydork:
--------------------------------------------------------------------------------
#compdef pydork
# =======================================================


_pydork() {
    local context curcontext=$curcontext state line
    declare -A opt_args
    local ret=1

    # args
    _arguments -C \
        '(-h --help)'{-h,--help}'[show help]' \
        '(-v --version)'{-v,--version}'[show version]' \
        '1: :__pydork_commands' \
        '*:: :->modes' \
        && ret=0

    # args and subcommand
    case $state in
        modes)
            case $words[1] in
                search)
                    _arguments -C \
                        '(-h --help)'{-h,--help}'[show help]' \
                        '-t[search engine]:_values:(baidu bing duckduckgo google yahoo)' \
                        '(-n --num)'{-n,--num}'[get search result num (int)]:_values:(100 200 300 400 500)' \
                        '(-P --proxy)'{-P,--proxy}'[proxy server]' \
                        '(-s --selenium)'{-s,--selenium}'[use Selenium]' \
                        '(-S --splash)'{-S,--splash}'[use Splash]' \
                        '(-T --title)'{-T,--title}'[also get the title of each search result]' \
                        '(-0 --nullchar)'{-0,--nullchar}'[use the null character as the delimiter]' \
                        '--color[output color(default:auto)]:_values:(auto always none)' \
                        '--debug[debug mode]' \
                        '(-)*:: :->null_state' \
                        && ret=0
                    ;;

                suggest)
                    _arguments -C \
                        '(-h --help)'{-h,--help}'[show help]' \
                        '-t[search engine]:_values:(baidu bing duckduckgo google yahoo)' \
                        '--jap[additionally search Japanese-character candidates when getting suggestions]' \
                        '--alph[additionally search alphabet candidates when getting suggestions]' \
                        '--num[additionally search numeric candidates when getting suggestions]' \
                        '(-P --proxy)'{-P,--proxy}'[proxy server]' \
                        '--color[output color(default:auto)]:_values:(auto always none)' \
                        '(-)*:: :->null_state' \
                        && ret=0
                    ;;
            esac
            ;;
    esac

    return ret
}

__pydork_commands () {
    local -a _c
    _c=(
        'search:URL search mode'
        'suggest:suggest retrieval mode'
    )

    _describe -t commands Commands _c
}
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
import pathlib

path = pathlib.Path("../../pydork")
sys.path.insert(0, os.path.abspath(path))


# -- Project information -----------------------------------------------------

project = 'pydork'
copyright = '2022, blacknon'
author = 'blacknon'

# The full version, including alpha/beta/rc tags
release = '1.1.0'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
--------------------------------------------------------------------------------
/.github/workflows/test_scraping.yml:
--------------------------------------------------------------------------------
# Copyright (c) 2021 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.


name: Scraping test job
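# each matrix target below maps a search engine to the engine-specific
# unittest methods in pydork/test_engine_selenium.py that the steps run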

on:
  push:
    branches:
      - 'master'
      - 'develop'

jobs:
  # run the scraping tests against each search engine
  scraping:
    strategy:
      matrix:
        include:
          - target: google
            search_text: test_google_text_search
            search_image: test_google_image_search
            suggest: test_google_suggest
            suggest_jap: test_google_suggest_with_jap
            suggest_alph: test_google_suggest_with_alph
            suggest_num: test_google_suggest_with_num

          - target: bing
            search_text: test_bing_text_search
            search_image: test_bing_image_search
            suggest: test_bing_suggest
            suggest_jap: test_bing_suggest_with_jap
            suggest_alph: test_bing_suggest_with_alph
            suggest_num: test_bing_suggest_with_num

          - target: baidu
            search_text: test_baidu_text_search
            search_image: test_baidu_image_search
            suggest: test_baidu_suggest
            suggest_jap: test_baidu_suggest_with_jap
            suggest_alph: test_baidu_suggest_with_alph
            suggest_num: test_baidu_suggest_with_num

          - target: yahoo
            search_text: test_yahoo_text_search
            search_image: test_yahoo_image_search
            suggest: test_yahoo_suggest
            suggest_jap: test_yahoo_suggest_with_jap
            suggest_alph: test_yahoo_suggest_with_alph
            suggest_num: test_yahoo_suggest_with_num

          - target: duckduckgo
            search_text: test_duckduckgo_text_search
            search_image: test_duckduckgo_image_search
            suggest: test_duckduckgo_suggest
            suggest_jap: test_duckduckgo_suggest_with_jap
            suggest_alph: test_duckduckgo_suggest_with_alph
            suggest_num: test_duckduckgo_suggest_with_num

    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1

      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.9'
          architecture: 'x64'

      - name: Get Python version
        run: python -V

      - name: Install Selenium
        run: pip install get-chrome-driver get-gecko-driver --upgrade

      - name: Install Sphinx
        run: pip install sphinx sphinx-rtd-theme sphinx-autobuild

      - name: Install dependencies
        run: pip install ./

      - name: Run Test Text Search
        run: python -m unittest pydork.test_engine_selenium.SearchEngineTestCaseWithSelenium.${{ matrix.search_text }} -v

      - name: Run Test Image Search
        run: python -m unittest pydork.test_engine_selenium.SearchEngineTestCaseWithSelenium.${{ matrix.search_image }} -v

      - name: Run Test Suggests
        run: |
          python -m unittest pydork.test_engine_selenium.SearchEngineTestCaseWithSelenium.${{ matrix.suggest }} -v
          python -m unittest pydork.test_engine_selenium.SearchEngineTestCaseWithSelenium.${{ matrix.suggest_num }} -v
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================


import os
import platform

import setuptools

cmdclass = {}
try:
    from sphinx.setup_command import BuildDoc
    cmdclass = {'build_sphinx': BuildDoc}
except ImportError:
    pass

try:
    with open('README.rst') as f:
        readme = f.read()
except IOError:
    readme = ''


# helper that builds the data_files entries for the shell completion files
def get_data_files():
    # return the install location of the completion file for the given shell
    def get_completefile_install_location(shell):
        # path prefix
        prefix = ''

        # OS name
        uname = platform.uname()[0]

        # switch the prefix depending on whether we are running as root
        if os.geteuid() == 0:
            ''' system-wide installation '''
            if uname == 'Linux' and shell == 'bash':
                prefix = '/'
            elif uname == 'Linux' and shell == 'zsh':
                prefix = '/usr/local'
            elif uname == 'Darwin' and shell == 'bash':
                prefix = '/'
            elif uname == 'Darwin' and shell == 'zsh':
                prefix = '/usr'

        # choose the install location for each shell
        if shell == 'bash':
            location = os.path.join(prefix, 'etc/bash_completion.d')
        elif shell == 'zsh':
            location = os.path.join(prefix, 'share/zsh/site-functions')
        else:
            raise ValueError('unsupported shell: {0}'.format(shell))

        # return the location
        return location

    # collect the locations as a dict
    loc = {
        'bash': get_completefile_install_location('bash'),
        'zsh': get_completefile_install_location('zsh')
    }

    # target files, as a dict keyed by shell
    files = dict(
        bash=['completion/pydork-completion.bash'],
        zsh=[
            'completion/pydork-completion.bash',
            'completion/_pydork'
        ]
    )

    # return in data_files format
    data_files = []
    data_files.append((loc['bash'], files['bash']))
    data_files.append((loc['zsh'], files['zsh']))

    return data_files


name = 'pydork'
version = '1.1.7'
release = '1.1.7'

if __name__ == "__main__":
    setuptools.setup(
        name=name,
        version=version,
        author='blacknon',
        author_email='blacknon@orebibou.com',
        maintainer='blacknon',
        maintainer_email='blacknon@orebibou.com',
        description='Scraping and listing text and image searches on Google, Bing, DuckDuckGo, Baidu, Yahoo! JAPAN.',
        long_description=readme,
        license='MIT License',
        install_requires=[
            'bs4',
            'get-chrome-driver',
            'get-gecko-driver',
            'chromedriver_autoinstaller',
            'geckodriver_autoinstaller',
            'fake_useragent',
            'lxml',
            'requests[socks]',
            'selenium==4.7.2',
            'selenium_requests',
            'pickle-mixin',
            'sphinx',
            'sphinx-rtd-theme',
            'sphinx-autobuild'
        ],
        url='https://github.com/blacknon/pydork',
        packages=setuptools.find_packages(),
        py_modules=['pydork'],
        entry_points={
            'console_scripts': [
                'pydork = pydork:main',
            ],
        },
        classifiers=[
            'Programming Language :: Python :: 3',
            'Programming Language :: Python :: 3.7',
            'Programming Language :: Python :: 3.8',
            'Programming Language :: Python :: 3.9',
            'Programming Language :: Python :: 3.10',
            'Programming Language :: Python :: 3.11',
            'License :: OSI Approved :: MIT License',
        ],
        data_files=get_data_files(),
        cmdclass=cmdclass,
        command_options={
            'build_sphinx': {
                'project': ('setup.py', name),
                'version': ('setup.py', version),
                'release': ('setup.py', release)}},
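        # NOTE: Sphinx must also be available at build time so that the
        # `build_sphinx` command registered in cmdclass above can run.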
        setup_requires=[
            "sphinx",
            "sphinx-rtd-theme",
            "sphinx-autobuild",
        ],
    )
--------------------------------------------------------------------------------
/pydork/messages.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================

"""messages
* Module holding the text data used to provide the command's help messages in both English and Japanese.
"""

import os

lang = os.getenv('LANG')

if lang == 'ja_JP.UTF-8':
    description = "各種検索エンジンから指定したクエリの結果(url)およびSuggestを取得する"

    # common_args_map
    help_message_query = "検索文字列(クエリ)"
    help_message_op_file = "検索文字列(クエリ)が書かれているファイル"
    help_message_op_template_file = "検索文字列(クエリ)が書かれているテンプレートファイル(jinja2)"
    help_message_op_template_variable = "テンプレートファイル(jinja2)で使用する変数セット(json)"
    help_message_op_search_type = "使用する検索エンジンを指定"
    help_message_op_lang = "言語を指定"
    help_message_op_country = "国を指定"
    help_message_op_proxy_server = "プロキシサーバーを指定(例:socks5://hogehoge:8080, https://fugafuga:18080)"
    help_message_op_json = "json形式で出力する"
    help_message_op_insecure = "sslエラーを無視する"
    help_message_op_selenium = "Selenium(headless browser)を使用する(排他: Splashより優先)"
    help_message_op_splash = "Splash(headless browser)を使用する(排他: Seleniumの方が優先)"
    help_message_op_browser_endpoint = "Selenium/Splash等のヘッドレスブラウザのエンドポイントを指定(例: localhost:8050)"
    help_message_op_browser = "Seleniumで使用するBrowserを指定"
    help_message_op_color = "color出力の切り替え"
    help_message_op_cookies_dir = "使用するcookieファイルの格納先ディレクトリのPATH(各検索エンジンごとでcookieファイルを個別保存)"
    help_message_op_delete_cookies = "検索クエリ実行ごとにCookieを削除する"

    # other_map
    help_message_op_title = "検索結果のタイトルをセットで出力する"
    help_message_op_null_char = "null characterを区切り文字として使用する"
    help_message_op_num = "検索結果の取得数を指定する"
    help_message_op_debug = "debugモードを有効にする"
    help_message_op_disable_headless = "Seleniumでheadlessモードを無効化する(手動でのReCaptcha対応時に必要)"
    help_message_op_start = "期間指定(開始)"
    help_message_op_end = "期間指定(終了)"
    help_message_op_image_pagelink = "画像ファイルがあるhtmlのURLも出力する"

    # suggest_map
    help_message_op_suggest_jap = "日本語の文字を検索キーワードに追加してサジェストを取得"
    help_message_op_suggest_alph = "アルファベット文字を検索キーワードに追加してサジェストを取得"
    help_message_op_suggest_num = "数字を検索キーワードに追加してサジェストを取得"


else:
    description = "Obtain results (url) and Suggest for a specified query from various search engines"

    # common_args_map
    help_message_query = "search string(query)"
    help_message_op_file = "File containing search strings(queries)"
    help_message_op_template_file = "Template file (jinja2) containing search strings (queries)"
    help_message_op_template_variable = "Variable set (json) used in template file (jinja2)"
    help_message_op_search_type = "Specify which search engine to use"
    help_message_op_lang = "Specify language"
    help_message_op_country = "Specify country"
    help_message_op_proxy_server = "Specify proxy server(example: socks5://hogehoge:8080, https://fugafuga:18080)"
    help_message_op_json = "Output in json format"
    help_message_op_insecure = "ignore ssl errors"
    help_message_op_selenium = "Use Selenium (headless browser) (exclusive: takes precedence over Splash)"
    help_message_op_splash = "Use Splash (headless browser) (exclusive: Selenium takes precedence)"
    help_message_op_browser_endpoint = "Specify the endpoint for headless browsers such as Selenium/Splash (example: localhost:8050)"
    help_message_op_browser = "Specify Browser to use with Selenium"
    help_message_op_color = "Switching color output"
    help_message_op_cookies_dir = "PATH of the directory where the cookie files to be used are stored (cookie files are stored separately for each search engine)"
    help_message_op_delete_cookies = "Delete cookies on every search query execution"

    # other_map
    help_message_op_title = "Output the title together with each search result"
    help_message_op_null_char = "Use null character as delimiter"
    help_message_op_num = "Specify the number of search results to retrieve"
    help_message_op_debug = "Enable debug mode"
    help_message_op_disable_headless = "Disable headless mode in Selenium (required for manual ReCaptcha support)"
    help_message_op_start = "Search period (start)"
    help_message_op_end = "Search period (end)"
    help_message_op_image_pagelink = "Also output the html URL where the image files are located."

    # suggest_map
    help_message_op_suggest_jap = "Add Japanese characters to search keywords to get suggestions"
    help_message_op_suggest_alph = "Add alphabetic characters to search keywords to get suggestions"
    help_message_op_suggest_num = "Add numbers to search keywords to get suggestions"
--------------------------------------------------------------------------------
/pydork/common.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================


"""common
* A grab-bag module of shared and miscellaneous helpers.
"""

import sys
import datetime

from string import Template


# class used to colorize console output
class Color:
    """Color

    Class that holds, as attributes, the strings used to colorize console output.
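    Each attribute holds a raw ANSI SGR escape sequence (for example, RED is
    "ESC[31m", which turns the foreground red, and END resets all attributes).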

    Examples:
        c = Color(Color.BLUE)
        print(c.out('hogehoge'))
    """
    # color_code
    BLACK = '\033[30m'
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    PURPLE = '\033[35m'
    CYAN = '\033[36m'
    WHITE = '\033[37m'
    GRAY = '\033[1;30m'

    # text effects
    BOLD = '\033[1m'
    ITALIC = '\033[3m'
    UNDERLINE = '\033[4m'
    INVISIBLE = '\033[08m'
    REVERCE = '\033[07m'

    # reset all effects
    END = '\033[0m'

    def __init__(self, color_code: str):
        """
        Args:
            color_code (str): the color code to use

        """
        self.COLOR_CODE = color_code

    def out(self, text: str, is_bold=False, is_underline=False, is_reverse=False, is_italic=False):
        # wrap text in the color code
        text = self.COLOR_CODE + text + self.END

        # make the text bold when is_bold is set
        if is_bold:
            text = self.BOLD + text + self.END

        # underline the text when is_underline is set
        if is_underline:
            text = self.UNDERLINE + text + self.END

        # invert the colors when is_reverse is set
        if is_reverse:
            text = self.REVERCE + text + self.END

        # italicize the text when is_italic is set
        if is_italic:
            text = self.ITALIC + text + self.END

        return text


# class that controls message output
class Message:
    """Message

    Class that simplifies message output.

    Examples:

    """

    def __init__(self):
        # command flag
        self.IS_COMMAND = False

        # debug flag
        self.IS_DEBUG = False

        # timestamp flag
        self.IS_TIMESTAMP = False

        # engine data
        self.ENGINE_COLOR = Color('')
        self.ENGINE_NAME = ''
        self.ENGINE = ''

        # header
        self.HEADER = ''

    def set_is_command(self, is_command: bool):
        self.IS_COMMAND = is_command

    def set_is_debug(self, is_debug: bool):
        self.IS_DEBUG = is_debug

    def set_engine(self, engine: str, color: str):
        self.ENGINE_COLOR = Color(color)
        self.ENGINE_NAME = engine
        self.ENGINE = self.ENGINE_COLOR.out(engine)

    def set_header(self, text):
        self.HEADER = text

    def replace(self, text):
        """replace

        Replace the variables in a template text with self attributes and the current time.

        Args:
            text (str): template text to run the replacement on
        """

        # get the current time
        dt_now = datetime.datetime.now()

        # build the dict used for substitution
        data = {
            # time information
            'YEAR': dt_now.year,
            'MONTH': dt_now.month,
            'DAY': dt_now.day,
            'HOUR': dt_now.hour,
            'MINUTE': dt_now.minute,
            'SECOND': dt_now.second,

            # search engine (color)
            'ENGINE': self.ENGINE,  # colored
            'ENGINE_NAME': self.ENGINE_NAME,  # plain
        }

        # create the template
        template = Template(text)

        # run the substitution
        result = template.safe_substitute(data)

        return result

    def print_line(self, *text, use_header=True, separator=' ', file=sys.stdout, header=None):
        """print_line

        Print a message (single line).

        Args:
            text: text lines to print as the message
            use_header: whether to print the header given by `header` at the start of the line
            separator: separator string used when printing
            file: output file (stdout by default)
            header: string to use as the header
        """
        # build the header
        if header is None:
            header = self.HEADER

        header = self.replace(header)

        # print the text
        if use_header:
            print(header, *text, sep=separator, file=file)
        else:
            print(*text, sep=separator, file=file)

    def print_text(self, text, mode='message', use_header=True, separator=' ', file=sys.stdout, header=None):
        """print_text

        Print a message (multi-line text).

        Args:
            text: text to print as the message
            mode: output mode of the message (`message`, `error`, `warn`, `info`, `debug`)
            use_header: whether to print the header given by `header` at the start of each line
            separator: separator string used when printing
            file: output file (stdout by default)
            header: string to use as the header
        """
        # only print when is_command is enabled
        if not self.IS_COMMAND:
            return

        # for debug and info, only print when self.IS_DEBUG is enabled
        if mode in ('info', 'debug'):
            # do not print unless self.IS_DEBUG is set
            if not self.IS_DEBUG:
                return

        # build the output text
        text = self.replace(text)

        # case
        text_color: Color = Color(Color.END)
        if mode == 'message':  # when mode is `message`
            text_color = Color(Color.WHITE)

        elif mode == 'error':
            text_color = Color(Color.RED)
            file = sys.stderr

        elif mode == 'warn':
            text_color = Color(Color.YELLOW)
            file = sys.stderr

        elif mode == 'info':
            text_color = Color(Color.GREEN)
            file = sys.stderr

        elif mode == 'debug':
            text_color = Color(Color.GRAY)
            file = sys.stderr

        # define the default header
        if mode in ('info', 'debug'):
            if header is None:
                header = self.HEADER

            header = Color.REVERCE + \
                self.replace(header) + Color.END

        # TODO: add a regex-based step here to extract the highlighted parts

        # print the text
        for line in text.splitlines():
            self.print_line(text_color.out(line),
                            separator=separator, use_header=use_header, file=file, header=header)

        return


# function that adds `num` to each dict in the given list
def set_counter(links: list):
    """set_counter

    Add a `num` key to each element of links(list) and fill it with sequential numbers.

    Args:
        links(list): list of links. ex) [{'link': 'http://...', 'title': 'hogehoge...'}, {'link': '...', 'title': '...'}, ... ]
    Returns:
        result(list): [{'link': 'http://...', 'title': 'hogehoge...', 'num': 1}, {'link': '...', 'title': '...', 'num': 2}, ... ]
    """
    # create result(list)
    result = list()

    num = 1
    for d in links:
        d["num"] = num
        num += 1
        result.append(d)

    return result
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
PyDork
======

## Description

Scraping and listing text and image searches on **Google**, **Bing**, **DuckDuckGo**, **Baidu**, **Yahoo! JAPAN**.

## Install

```bash
pip install pydork
```

## Build

### Documents

```bash
python setup.py build_sphinx
```

### Docker image

```bash
docker build -t "pydork" --progress=plain .
```

## How to use

### commandline tool

```shell
$ # search text at google
$ pydork search -n 10 -t google -- 'super mario'
Google: Text Search: super mario
Google: Finally got 10 links.
[GoogleSearch]: https://www.nintendo.co.jp/character/mario/
[GoogleSearch]: https://www.nintendo.co.jp/software/smb1/index.html
[GoogleSearch]: https://www.nintendo.co.jp/switch/adala/index.html
[GoogleSearch]: https://www.nintendo.co.jp/switch/ayama/index.html
[GoogleSearch]: https://www.nintendo.co.jp/switch/aaaca/index.html
[GoogleSearch]: https://supermariorun.com/ja/
[GoogleSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA
[GoogleSearch]: https://store-jp.nintendo.com/list/software/70010000034626.html
[GoogleSearch]: https://www.youtube.com/watch?v=z5nqRrqFFZI
[GoogleSearch]: https://www.nintendo.com/games/detail/super-mario-3d-world-plus-bowsers-fury-switch/

$ # search text at google, bing, duckduckgo, with selenium
$ pydork search -s -n 10 -t google bing duckduckgo -- 'super mario'
Google: Text Search: super mario
Bing: Text Search: super mario
DuckDuckGo: Text Search: super mario
Bing: Finally got 10 links.
[BingSearch]: https://www.nintendo.co.jp/software/smb1/index.html
[BingSearch]: https://www.nintendo.co.jp/character/mario/index.html
[BingSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA
[BingSearch]: https://supermarioplay.com/
[BingSearch]: https://www.lego.com/ja-jp/campaigns/jp/supermario
[BingSearch]: https://supermariorun.com/ja/
[BingSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%83%96%E3%83%A9%E3%82%B6%E3%83%BC%E3%82%BA
[BingSearch]: https://supermariobros.io/
[BingSearch]: https://supermario-bros.co/
[BingSearch]: https://game-ac.com/free/mario/
Google: Finally got 10 links.
[GoogleSearch]: https://www.nintendo.co.jp/character/mario/
[GoogleSearch]: https://www.nintendo.co.jp/software/smb1/index.html
[GoogleSearch]: https://www.nintendo.co.jp/switch/adala/index.html
[GoogleSearch]: https://www.nintendo.co.jp/switch/ayama/index.html
[GoogleSearch]: https://www.nintendo.co.jp/switch/aaaca/index.html
[GoogleSearch]: https://supermariorun.com/ja/
[GoogleSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA
[GoogleSearch]: https://store-jp.nintendo.com/list/software/70010000034626.html
[GoogleSearch]: https://store-jp.nintendo.com/feature_mar004.html
[GoogleSearch]: https://www.nintendo.com/games/detail/super-mario-3d-world-plus-bowsers-fury-switch/
DuckDuckGo: Finally got 10 links.
[DuckDuckGoSearch]: https://supermariobros.io/
[DuckDuckGoSearch]: https://supermarioplay.com/
[DuckDuckGoSearch]: https://mario.nintendo.com/
[DuckDuckGoSearch]: https://en.wikipedia.org/wiki/Super_Mario
[DuckDuckGoSearch]: https://supermario-game.com/
[DuckDuckGoSearch]: https://www.mario-flash.com/
[DuckDuckGoSearch]: https://supermario-bros.co/
[DuckDuckGoSearch]: https://www.youtube.com/watch?v=4noiYiEYg6Q
[DuckDuckGoSearch]: https://www.crazygames.com/t/mario
[DuckDuckGoSearch]: https://arcadespot.com/game/super-mario-64/

$ # search image at google, yahoo.co.jp with selenium and set html title...
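$ # (flags used below: -T also prints page titles, -s uses Selenium,
$ #  -n sets the result count, -t picks the target engines)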
$ pydork image -T -s -n 10 -t google yahoo -- 'legend of zelda'
Yahoo: Image Search: legend of zelda
Google: Image Search: legend of zelda
Yahoo: Finally got 10 links.
[YahooSearch]: Amazon.co.jp: The Legend of Zelda: Breath of the Wild ...: https://m.media-amazon.com/images/I/81iU0U8VZML._AC_SL1500_.jpg
[YahooSearch]: Amazon | Legend of Zelda Link's Awakening(輸入版:北米 ...: https://m.media-amazon.com/images/I/91z5JYtUZAS._AC_SY445_.jpg
[YahooSearch]: Amazon | The Legend of Zelda: Breath of the Wild (輸入版 ...: https://m.media-amazon.com/images/I/61wcjVPx4sL._AC_SX466_.jpg
[YahooSearch]: Amazon | The Legend of Zelda Encyclopedia | Nintendo | Video ...: https://images-na.ssl-images-amazon.com/images/I/91zJdQWSE0L.jpg
[YahooSearch]: the-legend-of-zelda-breath-of- ...: https://www.nintendo.com//content/dam/noa/en_US/games/switch/t/the-legend-of-zelda-breath-of-the-wild-switch/the-legend-of-zelda-breath-of-the-wild-switch-hero.jpg
[YahooSearch]: Amazon | The Legend of Zelda: Twilight Princess, Vol. 7 (7 ...: https://images-na.ssl-images-amazon.com/images/I/81-c6fHsctL.jpg
[YahooSearch]: The Legend of Zelda™: Breath of the Wild - My Nintendo Store: https://assets.nintendo.eu/image/upload/f_auto,q_auto,t_product_tile_desktop/MNS/NOE/70010000000023/SQ_NSwitch_TheLegendOfZeldaBreathOfTheWild_E
[YahooSearch]: Amazon | Legend of Zelda 2020 Wall Calendar | Nintendo ...: https://images-na.ssl-images-amazon.com/images/I/61R+rBBQxaL._SX258_BO1,204,203,200_.jpg
[YahooSearch]: 359点のThe Legend Of Zeldaのストックフォト - Getty Images: https://media.gettyimages.com/photos/link-figurine-from-legend-of-zelda-with-shop-staff-inside-nintendo-picture-id1231509485?s=612x612
[YahooSearch]: Evolution of Legend of Zelda 1986-2020 - YouTube: https://i.ytimg.com/vi/1FwoEgUBgE0/maxresdefault.jpg
Google: Finally got 10 links.
[GoogleSearch]: LATEST* The Legend Of Zelda Breath Of The Wild 2: Nintendo Direct E3 2021, Release Date, Leaked Info, Gameplay, Setting, Story Info, Trailers, & More: https://cdn.realsport101.com/images/ncavvykf/realsport-production/2db4094078e3c7e7442e33afb8e8e5e6082d3849-1920x1080.png?rect=0,1,1920,1077&w=328&h=184&auto=format
[GoogleSearch]: Jual The Legend of Zelda: Breath of the Wild Special Edition [EU] - Jakarta Barat - Lionheartno Games Store | Tokopedia: https://images.tokopedia.net/img/cache/700/product-1/2017/1/16/9470651/9470651_4508d715-ecf7-452a-8150-df1a6a0c47ab_771_424.jpg
[GoogleSearch]: The Legend of Zelda: Breath of the Wild – Link has never been set so free | Nintendo Switch | The Guardian: https://i.guim.co.uk/img/media/22d6b308c89e62e229feb220208a639836e31fd9/60_0_1800_1080/master/1800.png?width=700&quality=85&auto=format&fit=max&s=25c588a5203feea6061c32112a66ebdc
[GoogleSearch]: Kaos The Legend of Zelda c Nintendo, Fesyen Pria, Pakaian , Atasan di Carousell: https://media.karousell.com/media/photos/products/2021/9/22/kaos_the_legend_of_zelda_c_nin_1632313294_5b47ea62_progressive.jpg
[GoogleSearch]: Sales of The Legend of Zelda titles worldwide 2019 | Statista: https://cdn.statcdn.com/Statistic/985000/985767-blank-355.png
[GoogleSearch]: Legend Of Zelda Monsters | Minimalis: http://tse2.mm.bing.net/th?id=OIP.wUtxfbukexwonASdvmIirgHaEK&pid=15.1
[GoogleSearch]: Everything The Legend of Zelda: Breath of the Wild 2 is hiding: full analysis - The Legend of Zelda: Breath of the Wild II - Gamereactor: https://www.gamereactor.eu/media/08/legendzelda_3500863.jpg
[GoogleSearch]: The Legend of Zelda: A Link Between Worlds (Video Game 2013) - IMDb: https://m.media-amazon.com/images/M/MV5BZDI2M2IwMDItOTU4MS00YzdjLWJmYjItMzA3MjJjMDk2YjBiXkEyXkFqcGdeQXVyNjY5NTM5MjA@._V1_.jpg
[GoogleSearch]: The Complete Chronological Order Of Legend Of Zelda Games: https://static0.gamerantimages.com/wordpress/wp-content/uploads/2021/01/Zelda-Four-Swords-Adventures-Links.jpg?q=50&fit=crop&w=1400&dpr=1.5
[GoogleSearch]: Sword Slash Png - Legend Of Zelda Skyward Sword Artwork Clipart (#1717847) - PikPng: https://cpng.pikpng.com/pngl/s/90-907142_the-legend-of-zelda-legend-of-zelda-skyward.png

```

### python library

```python
from pydork.engine import SearchEngine

# SearchEngine
search_engine = SearchEngine()

search_engine.set('google')
search_result = search_engine.search('final fantasy')
```
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
PyDork
======

Description
-----------

Scraping and listing text and image searches on Google, Bing,
DuckDuckGo, Baidu, Yahoo! JAPAN.

Install
-------

.. code:: bash

   git clone https://github.com/blacknon/pydork
   cd pydork
   pip install ./

How to use
----------

commandline tool
~~~~~~~~~~~~~~~~

.. code:: shell

   $ # search text at google
   $ pydork search -n 10 -t google -- 'super mario'
   Google: Text Search: super mario
   Google: Finally got 10 links.
   [GoogleSearch]: https://www.nintendo.co.jp/character/mario/
   [GoogleSearch]: https://www.nintendo.co.jp/software/smb1/index.html
   [GoogleSearch]: https://www.nintendo.co.jp/switch/adala/index.html
   [GoogleSearch]: https://www.nintendo.co.jp/switch/ayama/index.html
   [GoogleSearch]: https://www.nintendo.co.jp/switch/aaaca/index.html
   [GoogleSearch]: https://supermariorun.com/ja/
   [GoogleSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA
   [GoogleSearch]: https://store-jp.nintendo.com/list/software/70010000034626.html
   [GoogleSearch]: https://www.youtube.com/watch?v=z5nqRrqFFZI
   [GoogleSearch]: https://www.nintendo.com/games/detail/super-mario-3d-world-plus-bowsers-fury-switch/

   $ # search text at google, bing, duckduckgo, with selenium
   $ pydork search -s -n 10 -t google bing duckduckgo -- 'super mario'
   Google: Text Search: super mario
   Bing: Text Search: super mario
   DuckDuckGo: Text Search: super mario
   Bing: Finally got 10 links.
   [BingSearch]: https://www.nintendo.co.jp/software/smb1/index.html
   [BingSearch]: https://www.nintendo.co.jp/character/mario/index.html
   [BingSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA
   [BingSearch]: https://supermarioplay.com/
   [BingSearch]: https://www.lego.com/ja-jp/campaigns/jp/supermario
   [BingSearch]: https://supermariorun.com/ja/
   [BingSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%83%96%E3%83%A9%E3%82%B6%E3%83%BC%E3%82%BA
   [BingSearch]: https://supermariobros.io/
   [BingSearch]: https://supermario-bros.co/
   [BingSearch]: https://game-ac.com/free/mario/
   Google: Finally got 10 links.
   [GoogleSearch]: https://www.nintendo.co.jp/character/mario/
   [GoogleSearch]: https://www.nintendo.co.jp/software/smb1/index.html
   [GoogleSearch]: https://www.nintendo.co.jp/switch/adala/index.html
   [GoogleSearch]: https://www.nintendo.co.jp/switch/ayama/index.html
   [GoogleSearch]: https://www.nintendo.co.jp/switch/aaaca/index.html
   [GoogleSearch]: https://supermariorun.com/ja/
   [GoogleSearch]: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%83%BC%E3%83%91%E3%83%BC%E3%83%9E%E3%83%AA%E3%82%AA%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA
   [GoogleSearch]: https://store-jp.nintendo.com/list/software/70010000034626.html
   [GoogleSearch]: https://store-jp.nintendo.com/feature_mar004.html
   [GoogleSearch]: https://www.nintendo.com/games/detail/super-mario-3d-world-plus-bowsers-fury-switch/
   DuckDuckGo: Finally got 10 links.
   [DuckDuckGoSearch]: https://supermariobros.io/
   [DuckDuckGoSearch]: https://supermarioplay.com/
   [DuckDuckGoSearch]: https://mario.nintendo.com/
   [DuckDuckGoSearch]: https://en.wikipedia.org/wiki/Super_Mario
   [DuckDuckGoSearch]: https://supermario-game.com/
   [DuckDuckGoSearch]: https://www.mario-flash.com/
   [DuckDuckGoSearch]: https://supermario-bros.co/
   [DuckDuckGoSearch]: https://www.youtube.com/watch?v=4noiYiEYg6Q
   [DuckDuckGoSearch]: https://www.crazygames.com/t/mario
   [DuckDuckGoSearch]: https://arcadespot.com/game/super-mario-64/

   $ # search image at google, yahoo.co.jp with selenium and set html title...
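   $ # (flags used below: -T also prints page titles, -s uses Selenium,
   $ #  -n sets the result count, -t picks the target engines)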
   $ pydork image -T -s -n 10 -t google yahoo -- 'legend of zelda'
   Yahoo: Image Search: legend of zelda
   Google: Image Search: legend of zelda
   Yahoo: Finally got 10 links.
   [YahooSearch]: Amazon.co.jp: The Legend of Zelda: Breath of the Wild ...: https://m.media-amazon.com/images/I/81iU0U8VZML._AC_SL1500_.jpg
   [YahooSearch]: Amazon | Legend of Zelda Link's Awakening(輸入版:北米 ...: https://m.media-amazon.com/images/I/91z5JYtUZAS._AC_SY445_.jpg
   [YahooSearch]: Amazon | The Legend of Zelda: Breath of the Wild (輸入版 ...: https://m.media-amazon.com/images/I/61wcjVPx4sL._AC_SX466_.jpg
   [YahooSearch]: Amazon | The Legend of Zelda Encyclopedia | Nintendo | Video ...: https://images-na.ssl-images-amazon.com/images/I/91zJdQWSE0L.jpg
   [YahooSearch]: the-legend-of-zelda-breath-of- ...: https://www.nintendo.com//content/dam/noa/en_US/games/switch/t/the-legend-of-zelda-breath-of-the-wild-switch/the-legend-of-zelda-breath-of-the-wild-switch-hero.jpg
   [YahooSearch]: Amazon | The Legend of Zelda: Twilight Princess, Vol. 7 (7 ...: https://images-na.ssl-images-amazon.com/images/I/81-c6fHsctL.jpg
   [YahooSearch]: The Legend of Zelda™: Breath of the Wild - My Nintendo Store: https://assets.nintendo.eu/image/upload/f_auto,q_auto,t_product_tile_desktop/MNS/NOE/70010000000023/SQ_NSwitch_TheLegendOfZeldaBreathOfTheWild_E
   [YahooSearch]: Amazon | Legend of Zelda 2020 Wall Calendar | Nintendo ...: https://images-na.ssl-images-amazon.com/images/I/61R+rBBQxaL._SX258_BO1,204,203,200_.jpg
   [YahooSearch]: 359点のThe Legend Of Zeldaのストックフォト - Getty Images: https://media.gettyimages.com/photos/link-figurine-from-legend-of-zelda-with-shop-staff-inside-nintendo-picture-id1231509485?s=612x612
   [YahooSearch]: Evolution of Legend of Zelda 1986-2020 - YouTube: https://i.ytimg.com/vi/1FwoEgUBgE0/maxresdefault.jpg
   Google: Finally got 10 links.
   [GoogleSearch]: LATEST* The Legend Of Zelda Breath Of The Wild 2: Nintendo Direct E3 2021, Release Date, Leaked Info, Gameplay, Setting, Story Info, Trailers, & More: https://cdn.realsport101.com/images/ncavvykf/realsport-production/2db4094078e3c7e7442e33afb8e8e5e6082d3849-1920x1080.png?rect=0,1,1920,1077&w=328&h=184&auto=format
   [GoogleSearch]: Jual The Legend of Zelda: Breath of the Wild Special Edition [EU] - Jakarta Barat - Lionheartno Games Store | Tokopedia: https://images.tokopedia.net/img/cache/700/product-1/2017/1/16/9470651/9470651_4508d715-ecf7-452a-8150-df1a6a0c47ab_771_424.jpg
   [GoogleSearch]: The Legend of Zelda: Breath of the Wild – Link has never been set so free | Nintendo Switch | The Guardian: https://i.guim.co.uk/img/media/22d6b308c89e62e229feb220208a639836e31fd9/60_0_1800_1080/master/1800.png?width=700&quality=85&auto=format&fit=max&s=25c588a5203feea6061c32112a66ebdc
   [GoogleSearch]: Kaos The Legend of Zelda c Nintendo, Fesyen Pria, Pakaian , Atasan di Carousell: https://media.karousell.com/media/photos/products/2021/9/22/kaos_the_legend_of_zelda_c_nin_1632313294_5b47ea62_progressive.jpg
   [GoogleSearch]: Sales of The Legend of Zelda titles worldwide 2019 | Statista: https://cdn.statcdn.com/Statistic/985000/985767-blank-355.png
   [GoogleSearch]: Legend Of Zelda Monsters | Minimalis: http://tse2.mm.bing.net/th?id=OIP.wUtxfbukexwonASdvmIirgHaEK&pid=15.1
   [GoogleSearch]: Everything The Legend of Zelda: Breath of the Wild 2 is hiding: full analysis - The Legend of Zelda: Breath of the Wild II - Gamereactor: https://www.gamereactor.eu/media/08/legendzelda_3500863.jpg
   [GoogleSearch]: The Legend of Zelda: A Link Between Worlds (Video Game 2013) - IMDb: https://m.media-amazon.com/images/M/MV5BZDI2M2IwMDItOTU4MS00YzdjLWJmYjItMzA3MjJjMDk2YjBiXkEyXkFqcGdeQXVyNjY5NTM5MjA@._V1_.jpg
   [GoogleSearch]: The Complete Chronological Order Of Legend Of Zelda Games: https://static0.gamerantimages.com/wordpress/wp-content/uploads/2021/01/Zelda-Four-Swords-Adventures-Links.jpg?q=50&fit=crop&w=1400&dpr=1.5
   [GoogleSearch]: Sword Slash Png - Legend Of Zelda Skyward Sword Artwork Clipart (#1717847) - PikPng: https://cpng.pikpng.com/pngl/s/90-907142_the-legend-of-zelda-legend-of-zelda-skyward.png

python library
~~~~~~~~~~~~~~

.. code:: python

   from pydork.engine import SearchEngine

   # SearchEngine
   search_engine = SearchEngine()

   search_engine.set('google')
   search_result = search_engine.search('final fantasy')
--------------------------------------------------------------------------------
/pydork/engine_duckduckgo.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================


"""engine_duckduckgo
* Module holding the search class for DuckDuckGo.
"""


import json
import re
import sys

from time import sleep
from urllib import parse
from bs4 import BeautifulSoup

from .common import Color
from .engine_common import CommonEngine


class DuckDuckGo(CommonEngine):
    """DuckDuckGo

    Search engine class for DuckDuckGo.
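    A usage sketch via the SearchEngine facade, following the README example
    (the engine name string is assumed to match the entries in ENGINES):

        from pydork.engine import SearchEngine

        search_engine = SearchEngine()
        search_engine.set('duckduckgo')
        search_result = search_engine.search('super mario')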
30 | """ 31 | 32 | def __init__(self): 33 | # CommonEngineの処理を呼出し 34 | super().__init__() 35 | 36 | self.NAME = 'DuckDuckGo' 37 | self.COLOR = Color.BLUE 38 | self.COLOR_NAME = self.COLOR + self.NAME + Color.END 39 | 40 | # リクエスト先のURLを指定 41 | self.PRE_URL = 'https://duckduckgo.com/' 42 | self.ENGINE_TOP_URL = 'https://duckduckgo.com/' 43 | self.SEARCH_URL = 'https://links.duckduckgo.com/d.js' 44 | self.IMAGE_URL = 'https://duckduckgo.com/i.js' 45 | self.SUGGEST_URL = 'https://duckduckgo.com/ac/' 46 | 47 | def request_selenium(self, url: str, method='GET', data=None): 48 | if self.SUGGEST_URL in url: 49 | # 最初にTOPページを表示 50 | self.driver.get(self.ENGINE_TOP_URL) 51 | 52 | self.driver.implicitly_wait(3) 53 | 54 | # javascriptからリクエストを投げてjsonを取得 55 | exec_java_script = 'return fetch("{}").then(response=>response.json())'.format( 56 | url) 57 | result = self.driver.execute_script(exec_java_script) 58 | 59 | result = json.dumps(result) 60 | 61 | else: 62 | result = super().request_selenium(url, method, data) 63 | 64 | return result 65 | 66 | def gen_search_url(self, keyword: str, type: str): 67 | """gen_search_url 68 | 69 | 検索用のurlを生成する. 70 | 71 | Args: 72 | keyword (str): 検索クエリ. 73 | type (str): 検索タイプ. 74 | 75 | Returns: 76 | dict: 検索用url 77 | """ 78 | 79 | # 前処理リクエスト用パラメータの設定 80 | pre_param = { 81 | 'q': keyword, # 検索キーワード 82 | 't': 'h_' 83 | } 84 | 85 | try: 86 | # 前処理リクエスのセッションを生成する 87 | pre_params = parse.urlencode(pre_param) 88 | pre_url = self.PRE_URL + '?' + pre_params 89 | 90 | # 前処理リクエスト1を実行 91 | self.get_result('https://duckduckgo.com/?t=h_') 92 | 93 | # 待機時間を入れる 94 | sleep(1) 95 | 96 | # 前処理リクエスト2を実行 97 | pre_html = self.get_result(pre_url) 98 | sleep(1) 99 | 100 | r = re.findall( 101 | r"(?<=vqd\=)[0-9-]+", pre_html 102 | ) 103 | 104 | # get vqd 105 | vqd = r[0] 106 | 107 | except Exception: 108 | return 109 | 110 | if type == 'text': 111 | # 検索urlを指定 112 | search_url = self.SEARCH_URL 113 | 114 | # 検索パラメータの設定 115 | url_param = { 116 | 'q': keyword, # 検索キーワード 117 | 's': 0, # 取得開始件数 118 | 'vqd': vqd 119 | } 120 | 121 | # lang/localeが設定されている場合 122 | if self.LANG != '' and self.LOCALE != '': 123 | url_param['l'] = self.LANG + '_' + self.LOCALE 124 | 125 | # rangeが設定されている場合(DuckDuckGoにはレンジ指定がないらしいので、追加されたら記述する) 126 | 127 | elif type == 'image': 128 | # 検索urlを指定 129 | search_url = self.IMAGE_URL 130 | 131 | # 検索パラメータの設定 132 | url_param = { 133 | 'q': keyword, # 検索キーワード 134 | 'o': 'json', # output format 135 | 'p': 1, 136 | 's': 0, # 取得開始件数 137 | 'u': 'bing', # TODO: 利用する検索エンジン(おそらく).後でオプションで指定できるようにする. 138 | 'f': ',,,,,', 139 | 'vqd': vqd 140 | } 141 | 142 | # lang/localeが設定されている場合 143 | if self.LANG != '' and self.LOCALE != '': 144 | url_param['l'] = self.LANG + '-' + self.LANG 145 | 146 | # set next_url 147 | params = parse.urlencode(url_param) 148 | self.next_url = search_url + '?' + params 149 | 150 | # while loop 151 | page = 0 152 | while True: 153 | if self.next_url == "": 154 | break 155 | 156 | # get next_url 157 | target_url = self.next_url 158 | 159 | yield 'GET', target_url, None 160 | 161 | page += 1 162 | 163 | def gen_suggest_url(self, keyword: str): 164 | """gen_suggest_url 165 | 166 | サジェスト取得用のurlを生成する. 167 | 168 | Args: 169 | keyword (str): 検索クエリ. 170 | 171 | Returns: 172 | dict: サジェスト取得用url 173 | """ 174 | url_param = { 175 | 'q': keyword, # 検索キーワード 176 | 'kl': 'wt-wt' 177 | } 178 | 179 | params = parse.urlencode(url_param) 180 | url = self.SUGGEST_URL + '?' 

        return url

    def get_links(self, source_url: str, html: str, type: str):
        """get_links

        Parse the received html and return the search results as a list.

        Args:
            source_url (str): url of the search results being parsed.
            html (str): html of the search results to parse.
            type (str): search type (`text` or `image`).

        Returns:
            list: search results (`[{'title': 'title...', 'url': 'https://hogehoge....'}, {...}]`)
        """
        links = list()

        # url used for next_url
        url = ""
        vqd = ""

        if type == 'text':
            # massage the payload so it can be handled as a dict
            r = re.findall(
                r"DDG\.pageLayout\.load\(\'d\',(.+)\]\)\;", html
            )

            try:
                r_dict = json.loads(r[0] + "]")
            except Exception:
                return links

            for r_data in r_dict:
                if "u" in r_data and "s" in r_data:
                    d = {
                        "link": r_data["u"],
                        "title": BeautifulSoup(
                            r_data["t"], "lxml").text,
                        "text": BeautifulSoup(
                            r_data["a"], "lxml").text,
                        "source_url": source_url,
                    }
                    links.append(d)

                elif "n" in r_data:
                    base_uri = '{uri.scheme}://{uri.netloc}'.format(
                        uri=parse.urlparse(self.SEARCH_URL)
                    )
                    url = base_uri + r_data["n"]

        elif type == 'image':
            # when selenium/splash is in use, overwrite html
            if self.USE_SELENIUM or self.USE_SPLASH:
                soup = BeautifulSoup(html, "lxml")
                selected_one = soup.select_one('html > body > pre')
                html = selected_one.text

            # load as json
            try:
                data = json.loads(html)
            except Exception as e:
                print(e, file=sys.stderr)
                return links

            if 'results' in data:
                results = data['results']

                for r in results:
                    d = {
                        'link': r['image'],
                        'title': r['title'],
                        'pagelink': r['url']
                    }
                    links.append(d)

            if 'vqd' in data:
                vqd = list(data['vqd'].values())[0]

            # build the url used for next_url
            if 'next' in data:
                next_path = data['next']
                next_path = next_path + '&vqd=' + vqd
                base_url = '{uri.scheme}://{uri.netloc}/'.format(
                    uri=parse.urlparse(self.IMAGE_URL)
                )
                url = base_url + next_path

        if url != "":
            self.next_url = url

        return links

    def get_suggest_list(self, suggests: list, char: str, html: str):
        """get_suggest_list

        Get the suggestions from html as a list.

        Args:
            suggests (list): the base list to which suggestions are added.
            char (str): the suggest string.
            html (str): html to parse.

        Returns:
            dict: suggest array
        """

        data = json.loads(html)
        suggests[char if char == '' else char[-1]] = [e['phrase']
                                                      for e in data]

        return suggests
--------------------------------------------------------------------------------
/pydork/__init__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 Blacknon. All rights reserved.
# Use of this source code is governed by an MIT license
# that can be found in the LICENSE file.
# =======================================================

from .sub_commands import run_subcommand
from .engine import ENGINES
from . import messages
import messages 11 | 12 | from pkg_resources import get_distribution 13 | from datetime import datetime 14 | 15 | import copy 16 | import argparse 17 | 18 | # TODO: returnではなくyieldに切り替えて、返り値をgeneratorにすることである程度途中状態でも状況を見れるような仕組みとする 19 | 20 | 21 | # version (setup.pyから取得してくる) 22 | __version__ = get_distribution('pydork').version 23 | 24 | 25 | # main 26 | def main(): 27 | # parserの作成 28 | parser = argparse.ArgumentParser( 29 | description=messages.description) 30 | subparsers = parser.add_subparsers() 31 | 32 | # ENGINESに`all`を追加 33 | engines_list = copy.deepcopy(ENGINES) 34 | engines_list.append('all') 35 | 36 | # サブコマンド共通の引数 37 | common_args_map = [ 38 | { 39 | "args": ["query"], 40 | "action": "store", 41 | "type": str, 42 | "nargs": "?", 43 | "default": "", 44 | "help": messages.help_message_query, 45 | }, 46 | { 47 | "args": ["-f", "--file"], 48 | "action": "store", 49 | "type": str, 50 | "default": "", 51 | "help": messages.help_message_op_file, 52 | }, 53 | { 54 | "args": ["-F", "--template_file"], 55 | "action": "store", 56 | "type": str, 57 | "default": "", 58 | "help": messages.help_message_op_template_file, 59 | }, 60 | { 61 | "args": ["-V", "--template_variable"], 62 | "action": "store", 63 | "type": str, 64 | "default": "", 65 | "help": messages.help_message_op_template_variable, 66 | }, 67 | { 68 | "args": ["-t", "--search_type"], 69 | "default": ["google"], 70 | "choices": engines_list, 71 | "nargs": "+", 72 | "type": str, 73 | "help": messages.help_message_op_search_type, 74 | }, 75 | { 76 | "args": ["-l", "--lang"], 77 | "default": "ja", 78 | "choices": ["ja", "en"], 79 | "type": str, 80 | "help": messages.help_message_op_lang, 81 | }, 82 | { 83 | "args": ["-c", "--country"], 84 | "default": "JP", 85 | "choices": ["JP", "US"], 86 | "type": str, 87 | "help": messages.help_message_op_country, 88 | }, 89 | { 90 | "args": ["-P", "--proxy"], 91 | "default": "", 92 | "type": str, 93 | "help": messages.help_message_op_proxy_server, 94 | }, 95 | { 96 | "args": ["-j", "--json"], 97 | "action": "store_true", 98 | "help": messages.help_message_op_json, 99 | }, 100 | { 101 | "args": ["-k", "--insecure"], 102 | "action": "store_true", 103 | "help": messages.help_message_op_insecure, 104 | }, 105 | { 106 | "args": ["-s", "--selenium"], 107 | "action": "store_true", 108 | "help": messages.help_message_op_selenium, 109 | }, 110 | { 111 | "args": ["-S", "--splash"], 112 | "action": "store_true", 113 | "help": messages.help_message_op_splash, 114 | }, 115 | { 116 | "args": ["-b", "--browser-endpoint"], 117 | "default": "", 118 | "type": str, 119 | "help": messages.help_message_op_browser_endpoint, 120 | }, 121 | { 122 | "args": ["-B", "--browser"], 123 | "default": "firefox", 124 | "choices": ["chrome", "firefox"], 125 | "type": str, 126 | "help": messages.help_message_op_browser, 127 | }, 128 | { 129 | "args": ["--color"], 130 | "default": "auto", 131 | "choices": ["auto", "none", "always"], 132 | "type": str, 133 | "help": messages.help_message_op_color, 134 | }, 135 | { 136 | "args": ["--cookies"], 137 | "default": "~/.pydork_cookies", 138 | "type": str, 139 | "help": messages.help_message_op_cookies_dir, 140 | }, 141 | { 142 | "args": ["--delete-cookies"], 143 | "action": "store_true", 144 | "help": messages.help_message_op_delete_cookies, 145 | }, 146 | ] 147 | 148 | # サブコマンド `search` の引数 149 | search_args_map = [ 150 | { 151 | "args": ["-T", "--title"], 152 | "action": "store_true", 153 | "help": messages.help_message_op_title, 154 | }, 155 | { 156 | "args": ["-0", "--nullchar"], 157 | 
"action": "store_true", 158 | "help": messages.help_message_op_null_char, 159 | }, 160 | { 161 | "args": ["-n", "--num"], 162 | "default": 300, 163 | "type": int, 164 | "help": messages.help_message_op_num, 165 | }, 166 | { 167 | "args": ["--start"], 168 | "type": lambda s: datetime.strptime(s, '%Y-%m-%d'), 169 | "help": messages.help_message_op_start, 170 | }, 171 | { 172 | "args": ["--end"], 173 | "type": lambda s: datetime.strptime(s, '%Y-%m-%d'), 174 | "help": messages.help_message_op_end, 175 | }, 176 | { 177 | "args": ["--debug"], 178 | "action": "store_true", 179 | "help": messages.help_message_op_debug, 180 | }, 181 | { 182 | "args": ["--disable-headless"], 183 | "action": "store_true", 184 | "help": messages.help_message_op_disable_headless, 185 | }, 186 | ] 187 | search_args_map.extend(copy.deepcopy(common_args_map)) 188 | 189 | # サブコマンド `image` の引数 190 | image_args_map = [ 191 | { 192 | "args": ["-T", "--title"], 193 | "action": "store_true", 194 | "help": messages.help_message_op_title, 195 | }, 196 | { 197 | "args": ["-p", "--pagelink"], 198 | "action": "store_true", 199 | "help": messages.help_message_op_image_pagelink, 200 | }, 201 | { 202 | "args": ["-0", "--nullchar"], 203 | "action": "store_true", 204 | "help": messages.help_message_op_null_char, 205 | }, 206 | { 207 | "args": ["-n", "--num"], 208 | "default": 300, 209 | "type": int, 210 | "help": messages.help_message_op_num, 211 | }, 212 | # { 213 | # "args": ["--start"], 214 | # "type": lambda s: datetime.strptime(s, '%Y-%m-%d'), 215 | # "help": messages.help_message_op_start, 216 | # }, 217 | # { 218 | # "args": ["--end"], 219 | # "type": lambda s: datetime.strptime(s, '%Y-%m-%d'), 220 | # "help": messages.help_message_op_end, 221 | # }, 222 | { 223 | "args": ["--debug"], 224 | "action": "store_true", 225 | "help": messages.help_message_op_debug, 226 | }, 227 | { 228 | "args": ["--disable-headless"], 229 | "action": "store_true", 230 | "help": messages.help_message_op_disable_headless, 231 | }, 232 | ] 233 | image_args_map.extend(copy.deepcopy(common_args_map)) 234 | 235 | # サブコマンド `suggest` の引数 236 | suggest_args_map = [ 237 | { 238 | "args": ["--jap"], 239 | "action": "store_true", 240 | "help": messages.help_message_op_suggest_jap 241 | }, 242 | { 243 | "args": ["--alph"], 244 | "action": "store_true", 245 | "help": messages.help_message_op_suggest_alph 246 | }, 247 | { 248 | "args": ["--num"], 249 | "action": "store_true", 250 | "help": messages.help_message_op_suggest_num 251 | }, 252 | ] 253 | suggest_args_map.extend(copy.deepcopy(common_args_map)) 254 | 255 | # search 256 | # ---------- 257 | parser_search = subparsers.add_parser( 258 | 'search', 259 | help='search mode. see `search -h`' 260 | ) 261 | 262 | # add_argument 263 | for element in search_args_map: 264 | args = element['args'] 265 | element.pop('args') 266 | parser_search.add_argument(*args, **element) 267 | 268 | # set parser_search 269 | parser_search.set_defaults(handler=run_subcommand, subcommand="search") 270 | 271 | # image 272 | # ---------- 273 | parser_image = subparsers.add_parser( 274 | 'image', 275 | help='search mode. 
see `search -h`' 276 | ) 277 | 278 | # add_argument 279 | for element in image_args_map: 280 | args = element['args'] 281 | element.pop('args') 282 | parser_image.add_argument(*args, **element) 283 | 284 | # set parser_image 285 | parser_image.set_defaults(handler=run_subcommand, subcommand="image") 286 | 287 | # suggest 288 | # ---------- 289 | parser_suggest = subparsers.add_parser( 290 | 'suggest', 291 | help='suggest mode. see `suggest -h`' 292 | ) 293 | 294 | # add_argument 295 | for element in suggest_args_map: 296 | args = element['args'] 297 | element.pop('args') 298 | parser_suggest.add_argument(*args, **element) 299 | 300 | parser_suggest.set_defaults(handler=run_subcommand, subcommand="suggest") 301 | 302 | # --version(-v)オプションのparser定義 303 | parser.add_argument( 304 | '-v', 305 | '--version', 306 | action='version', 307 | version='%(prog)s version:{version}'.format(version=__version__) 308 | ) 309 | 310 | args = parser.parse_args() 311 | if hasattr(args, 'handler'): 312 | args.handler(args.subcommand, args) 313 | else: 314 | # 未知のサブコマンドの場合はヘルプを表示 315 | parser.print_help() 316 | 317 | 318 | if __name__ == '__main__': 319 | main() 320 | -------------------------------------------------------------------------------- /pydork/engine_yahoo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | 9 | """engine_yahoo 10 | * Yahoo(yahoo.co.jp)用の検索用Classを持つモジュール. 11 | """ 12 | 13 | 14 | import json 15 | import re 16 | import sys 17 | 18 | from urllib import parse 19 | from bs4 import BeautifulSoup 20 | 21 | from .common import Color 22 | from .engine_common import CommonEngine 23 | 24 | 25 | class Yahoo(CommonEngine): 26 | """Yahoo 27 | 28 | Yahoo(yahoo.co.jp)用の検索エンジン用Class. 29 | """ 30 | 31 | def __init__(self): 32 | # CommonEngineの処理を呼出し 33 | super().__init__() 34 | 35 | self.NAME = 'Yahoo' 36 | self.COLOR = Color.YELLOW 37 | self.COLOR_NAME = self.COLOR + self.NAME + Color.END 38 | 39 | # リクエスト先のURLを指定 40 | self.ENGINE_TOP_URL = 'https://www.yahoo.co.jp/' 41 | self.SEARCH_URL = 'https://search.yahoo.co.jp/search' 42 | self.IMAGE_PRE_URL = 'https://search.yahoo.co.jp/image/search' 43 | self.IMAGE_URL = 'https://search.yahoo.co.jp/image/api/search' 44 | self.SUGGEST_URL = 'https://ff.search.yahoo.com/gossip' 45 | 46 | def gen_search_url(self, keyword: str, type: str): 47 | """gen_search_url 48 | 49 | 検索用のurlを生成する. 50 | 51 | Args: 52 | keyword (str): 検索クエリ. 53 | type (str): 検索タイプ. 54 | 55 | Returns: 56 | dict: 検索用url 57 | """ 58 | search_url = '' 59 | 60 | # 検索タイプがtextの場合 61 | if type == 'text': 62 | # 検索urlを指定 63 | search_url = self.SEARCH_URL 64 | 65 | # 検索パラメータの設定 66 | url_param = { 67 | 'p': keyword, # 検索キーワード 68 | 'num': '100', # 指定不可(削除) 69 | 'day_from': '', # 開始日時(yyyy/mm/dd) 70 | 'day_to': '', # 終了日時(yyyy/mm/dd) 71 | 'b': '', # 開始位置 72 | 'nfpr': '1', # もしかして検索(Escape hatch)の無効化 73 | 'qrw': '0' # もしかして検索(Escape hatch)の無効化 74 | } 75 | 76 | # lang/localeが設定されている場合 77 | if self.LANG != '' and self.LOCALE != '': 78 | url_param['hl'] = self.LANG 79 | url_param['gl'] = self.LOCALE 80 | 81 | # rangeが設定されている場合 82 | try: 83 | start = self.RANGE_START 84 | end = self.RANGE_END 85 | 86 | # ex.) 
day_from=2019/09/01&day_to=2019/09/30 87 | # パラメータが2つ存在している 88 | day_from = start.strftime("%Y/%m/%d") 89 | day_to = end.strftime("%Y/%m/%d") 90 | 91 | # GETパラメータに日時データを追加 92 | url_param['day_from'] = day_from 93 | url_param['day_to'] = day_to 94 | 95 | except AttributeError: 96 | None 97 | 98 | # 検索タイプがimageの場合 99 | elif type == 'image': 100 | # 前処理(パラメータ`cr`の取得)を実行 101 | cr = self.get_image_search_cr(keyword) 102 | 103 | # 検索urlを指定 104 | search_url = self.IMAGE_URL 105 | 106 | # 検索パラメータの設定 107 | url_param = { 108 | 'p': keyword, # 検索キーワード 109 | 'fr': 'top_ga1_sa', 110 | 'ei': 'UTF-8', 111 | 'aq': '-1', 112 | 'n': '20', # 指定不可(削除) 113 | 'vm': 'i', 114 | 'se': '0', 115 | 'ue': '0', 116 | 'cr': cr, 117 | # 'day_from': '', # 開始日時(yyyy/mm/dd) 118 | # 'day_to': '', # 終了日時(yyyy/mm/dd) 119 | 'b': '', # 開始位置 120 | 'nfpr': '1', # もしかして検索(Escape hatch)の無効化 121 | 'qrw': '0' # もしかして検索(Escape hatch)の無効化 122 | } 123 | 124 | page = 0 125 | while True: 126 | # parameterにページを開始する番号を指定 127 | if type == 'text': 128 | url_param['b'] = str(page * 10) 129 | elif type == 'image': 130 | url_param['b'] = str(page * 10) 131 | 132 | # パラメータをセット 133 | params = parse.urlencode(url_param) 134 | 135 | target_url = search_url + '?' + params 136 | 137 | yield 'GET', target_url, None 138 | 139 | page += 1 140 | 141 | def gen_suggest_url(self, keyword: str): 142 | """gen_suggest_url 143 | 144 | サジェスト取得用のurlを生成する. 145 | 146 | Args: 147 | keyword (str): 検索クエリ. 148 | 149 | Returns: 150 | dict: サジェスト取得用url 151 | """ 152 | url_param = { 153 | 'command': keyword, # 検索キーワード 154 | 'output': 'json', 155 | } 156 | 157 | params = parse.urlencode(url_param) 158 | url = self.SUGGEST_URL + '?' + params 159 | 160 | return url 161 | 162 | def get_links(self, url: str, html: str, type: str): 163 | """get_links 164 | 165 | 受け付けたhtmlを解析し、検索結果をlistに加工して返す関数. 166 | 167 | Args: 168 | url (str): 解析する検索結果のurl. 169 | html (str): 解析する検索結果のhtml. 170 | type (str): 検索タイプ([text, image]).現時点ではtextのみ対応. 
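        Note:
            Splash/Selenium経由の場合、検索結果はhtml内の `#__NEXT_DATA__` にあるjsonから取得する.
            以下は抽出イメージの最小スケッチ(jsonのキー構造は実際のレスポンスに依存する想定):

                import json
                from bs4 import BeautifulSoup

                soup = BeautifulSoup(html, 'lxml')
                j = json.loads(soup.select_one('#__NEXT_DATA__').string)
                algos = j['props']['initialProps']['pageProps']['pageData']['algos']
                results = [{'link': e['url'], 'title': e['title'], 'text': e['description']} for e in algos]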
171 | 172 | Returns: 173 | list: 検索結果(`[{'title': 'title...', 'url': 'https://hogehoge....'}, {...}]`) 174 | """ 175 | 176 | if type == 'text': 177 | if self.USE_SPLASH or self.USE_SELENIUM: 178 | self.SOUP_SELECT_JSON = '#__NEXT_DATA__' 179 | self.SOUP_SELECT_IMAGE = '.rg_meta.notranslate' 180 | self.SOUP_SELECT_TEXT = '' 181 | 182 | # Yahooの場合、jsonから検索結果を取得する 183 | soup = BeautifulSoup(html, 'lxml') 184 | elements = soup.select(self.SOUP_SELECT_JSON) 185 | element = elements[0].string 186 | 187 | # debug 188 | if self.IS_DEBUG: 189 | print(Color.PURPLE + '[JsonElement]' + Color.END, 190 | file=sys.stderr) 191 | print(Color.PURPLE + element + Color.END, 192 | file=sys.stderr) # type: ignore 193 | 194 | # jsonからデータを抽出  195 | j = json.loads(element) # type: ignore 196 | 197 | # debug 198 | if self.IS_DEBUG: 199 | print(Color.PURPLE + '[Json]' + Color.END, file=sys.stderr) 200 | print(Color.PURPLE + json.dumps(j) + Color.END, 201 | file=sys.stderr) 202 | 203 | jd = j['props']['initialProps']['pageProps']['pageData']['algos'] 204 | 205 | elinks = [e['url'] for e in jd] 206 | etitles = [e['title'] for e in jd] 207 | etexts = [e['description'] for e in jd] 208 | 209 | links = self.create_text_links(url, elinks, etitles, etexts) 210 | 211 | else: 212 | self.SOUP_SELECT_URL = '.sw-Card__headerSpace > .sw-Card__title > a' 213 | self.SOUP_SELECT_TITLE = '.sw-Card__headerSpace > .sw-Card__title > a > h3' 214 | self.SOUP_SELECT_TEXT = '.sw-Card__floatContainer > .sw-Card__summary' 215 | 216 | # CommonEngineの処理を呼び出す 217 | links = super().get_links(url, html, type) 218 | 219 | elif type == 'image': 220 | # CommonEngineの処理を呼び出す 221 | links = super().get_links(url, html, type) 222 | 223 | return links 224 | 225 | # 画像検索ページの検索結果(links(list()))を生成するfunction 226 | def get_image_links(self, soup: BeautifulSoup): 227 | """get_image_links 228 | BeautifulSoupから画像検索ページを解析して結果を返す関数. 229 | 230 | Args: 231 | soup (BeautifulSoup): 解析するBeautifulSoupオブジェクト. 232 | 233 | Returns: 234 | list: 検索結果(`[{'title': 'title...', 'link': 'https://hogehoge....'}, {...}]`) 235 | """ 236 | 237 | result = [] # image url 238 | 239 | try: 240 | data = json.loads(soup.text) 241 | except Exception: 242 | return result 243 | 244 | for d in data['algos']: 245 | etitle = d['title'] 246 | elink = d['refererUrl'] 247 | eimage = d['original']['url'] 248 | 249 | el = { 250 | 'title': etitle, 251 | 'pagelink': elink, 252 | 'link': eimage, 253 | } 254 | 255 | result.append(el) 256 | 257 | return result 258 | 259 | def get_suggest_list(self, suggests: list, char: str, html: str): 260 | """get_suggest_list 261 | 262 | htmlからsuggestを配列で取得する関数. 263 | 264 | Args: 265 | suggests (list): suggestを追加するための大本のlist. 266 | char (str): サジェストの文字列. 267 | html (str): 解析を行うhtml. 
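        Note:
            サジェストAPI(SUGGEST_URL)は `{"gossip": {"results": [{"key": "..."}]}}` 形式の
            jsonを返す想定. 抽出部分の最小スケッチ(レスポンス構造は変わりうる):

                import json

                data = json.loads(html)
                words = [e['key'] for e in data['gossip']['results']]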
268 | 269 | Returns: 270 | dict: サジェスト配列 271 | """ 272 | if self.USE_SELENIUM and self.SELENIUM_BROWSER == 'firefox': 273 | soup = BeautifulSoup(html, features="lxml") 274 | html = soup.find("pre").text 275 | data = json.loads(html) 276 | suggests[char if char == '' else char[-1]] = [e['key'] # type: ignore 277 | for e in data['gossip']['results']] 278 | 279 | return suggests 280 | 281 | def get_image_search_cr(self, keyword: str): 282 | """get_image_search_cr 283 | 284 | Yahooの画像検索時に必要になるcrumb(cr)パラメータを取得するための前処理リクエストを行う関数 285 | 286 | Args: 287 | keyword (str): 検索キーワード 288 | 289 | Returns: 290 | str: crumbパラメータの値 291 | """ 292 | 293 | result = '' 294 | 295 | # urlパラメータを設定 296 | url_param = { 297 | 'p': keyword, 298 | 'fr': 'top_ga1_sa', 299 | 'ei': 'UTF-8', 300 | 'aq': '-1', 301 | } 302 | params = parse.urlencode(url_param) 303 | 304 | # 前処理リクエストを投げる 305 | pre_result = self.get_result(self.IMAGE_PRE_URL + '?' + params) 306 | 307 | # 前処理リクエストから、crumbパラメータの値を取得する(正規表現) 308 | pattern = r'{ *"crumb": *"[^"]+" *}' 309 | data = re.findall(pattern, pre_result) 310 | 311 | if len(data) > 0: 312 | d = data[0] 313 | jd = json.loads(d) 314 | 315 | result = jd['crumb'] 316 | 317 | return result 318 | -------------------------------------------------------------------------------- /pydork/recaptcha.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | """engine 9 | * ReCaptcha関連のClassを集約するモジュールファイル 10 | """ 11 | 12 | import json 13 | import requests 14 | 15 | from urllib import parse 16 | from urllib.parse import urlparse 17 | from bs4 import BeautifulSoup 18 | from time import sleep 19 | 20 | from .common import Color, Message 21 | 22 | 23 | # 2CaptchaのAPIへPOSTするためのClass 24 | class TwoCaptcha: 25 | """TwoCaptcha 26 | 27 | 2CaptchaのAPIへPOSTし、ReCaptchaを突破するためのClass. 28 | 29 | Note: 30 | 公式ライブラリ側でCookieのPOSTに対応していなかったため作成. 31 | 32 | 33 | """ 34 | 35 | def __init__(self, apikey: str): 36 | """__init__ 37 | 38 | Args: 39 | apikey (str): 2CaptchaのAPI Key. 
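        Example:
            利用イメージの最小スケッチ(APIキーを環境変数 TWOCAPTCHA_APIKEY から渡すのは仮の想定):

                import os
                from pydork.recaptcha import TwoCaptcha

                tc = TwoCaptcha(apikey=os.environ.get('TWOCAPTCHA_APIKEY', ''))
                tc.set_debug(True)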
40 | """ 41 | 42 | # apiへリクエストを投げるためのsession 43 | self.session = requests.Session() 44 | 45 | # api_url 46 | self.api_in_url = 'https://2captcha.com/in.php' 47 | self.api_res_url = 'https://2captcha.com/res.php' 48 | 49 | # api_key 50 | self.api_key = apikey 51 | 52 | # proxy 53 | self.proxy = None 54 | self.user_agent = None 55 | 56 | # flag 57 | self.IS_DEBUG = False 58 | self.IS_COMMAND = False 59 | 60 | # Message 61 | self.MESSAGE = None 62 | 63 | def set_debug(self, is_debug: bool): 64 | """set_debug 65 | 66 | Args: 67 | is_debug (bool): debug modeが有効ならTrue 68 | """ 69 | 70 | self.IS_DEBUG = is_debug 71 | 72 | def set_command(self, is_command: bool): 73 | """set_command 74 | 75 | Args: 76 | is_command (bool): command modeが有効ならTrue 77 | """ 78 | 79 | self.IS_COMMAND = is_command 80 | 81 | def set_user_agent(self, user_agent: str): 82 | """set_user_agent 83 | 84 | Args: 85 | user_agent (str): 2Captchaに送るUser Agent 86 | """ 87 | 88 | self.user_agent = user_agent 89 | 90 | def set_messages(self, message: Message): 91 | """set_message 92 | 93 | Args: 94 | message (Message): 利用するcommon.Messageを指定 95 | """ 96 | 97 | self.MESSAGE = message 98 | 99 | # googleのReCaptcha画面からデータを抽出する 100 | def get_google_recaptcha_data(self, html: str): 101 | """get_google_recaptcha_data 102 | 103 | ReCapthcaのhtmlからsitekey, data-sの値を抽出する. 104 | 105 | 106 | Args: 107 | html (str): 解析するReCaptcha画面のhtmlデータ 108 | 109 | Returns: 110 | sitekey (str): 2Captchaへ送るsitekey 111 | data-s (str): 2Captchaへ送るdata-s 112 | """ 113 | 114 | # resultの初期値設定 115 | sitekey = None 116 | data_s = None 117 | 118 | # ReCaptchaのタグ・要素データを宣言 119 | recaptcha_tag = '#captcha-form > #recaptcha' 120 | sitekey_el_name = 'data-sitekey' 121 | data_s_el_name = 'data-s' 122 | 123 | # htmlをBeautifulSoupで解析する 124 | soup = BeautifulSoup(html, 'lxml') 125 | 126 | # 要素を抽出する 127 | if recaptcha_tag != '': 128 | elements = soup.select(recaptcha_tag) 129 | 130 | # 要素のチェック 131 | if len(elements) > 0: 132 | el = elements[0] 133 | 134 | try: 135 | sitekey = el[sitekey_el_name] 136 | data_s = el[data_s_el_name] 137 | 138 | return sitekey, data_s 139 | 140 | except AttributeError: 141 | None 142 | 143 | return sitekey, data_s 144 | 145 | def in_php(self, data: dict): 146 | """in_php 147 | 148 | Args: 149 | data (dict): in.phpにpostするデータ(dict) 150 | 151 | Returns: 152 | bool: 処理が正常終了か否か 153 | str: request_code 154 | """ 155 | 156 | res = self.session.post(self.api_in_url, data=data) 157 | 158 | if self.MESSAGE is not None: 159 | self.MESSAGE.print_text( 160 | '2Captcha Response in.php from `{}`: {}'.format( 161 | self.api_in_url, res.text), 162 | mode='debug', 163 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 164 | '[DEBUG]: [2CaptchaIn]' + Color.END, 165 | separator=": " 166 | ) 167 | 168 | # status codeを確認 169 | if res.status_code == 200: 170 | d = json.loads(res.text) 171 | if d['status'] == 1: 172 | request_id = d['request'] 173 | 174 | return True, request_id 175 | 176 | # request codeを取得できなかった場合、 177 | return False, None 178 | 179 | def res_php(self, request_id: str): 180 | """res_php 181 | 182 | Args: 183 | request_id (str): 2Captchaのres.php(2Captchaの突破状況確認するpath)で利用するrequest_id. 184 | 185 | Returns: 186 | (str): res.phpからのresponse結果を返す 187 | """ 188 | 189 | url_param = { 190 | 'key': self.api_key, 191 | 'action': 'get', 192 | 'json': 1, 193 | 'id': request_id 194 | } 195 | params = parse.urlencode(url_param) 196 | target_url = self.api_res_url + '?' 
+ params 197 | 198 | result = self.session.get(target_url) 199 | 200 | if self.MESSAGE is not None: 201 | self.MESSAGE.print_text( 202 | '2Captcha res.php Response from `{}`: {}'.format( 203 | target_url, result.text), 204 | mode='debug', 205 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 206 | '[DEBUG]: [2CaptchaRes]' + Color.END, 207 | separator=": " 208 | ) 209 | 210 | return result 211 | 212 | # 解析結果を渡す 213 | def google_recaptcha(self, html: str, url: str, cookies: list, proxy: str): 214 | """google_recaptcha 215 | 216 | Args: 217 | html (str): ReCaptchaのhtml. 218 | url (str): ReCaptchaが表示されてしまったurl(元のurl) 219 | cookies (list): cookiesを渡す. 220 | proxy (str): proxyをuriで渡す. 221 | 222 | Returns: 223 | (str): Google ReCaptchaで使用するcodeを返す. 224 | """ 225 | 226 | # code 227 | code = None 228 | result = None 229 | 230 | # set proxy 231 | self.proxy = proxy 232 | 233 | # sitekey, data-sを取得する 234 | sitekey, data_s = self.get_google_recaptcha_data(html) 235 | 236 | # proxyをuriから整形する 237 | proxy_parse = urlparse(proxy) 238 | proxy_type = proxy_parse.scheme.upper() 239 | proxy_uri = proxy_parse.netloc 240 | 241 | # cookieを整形する 242 | cookie_elements = [] 243 | for cookie in cookies: 244 | cookie_element = cookie['name'] + ':' + cookie['value'] 245 | cookie_elements.append(cookie_element) 246 | 247 | cookie_data = ';'.join(cookie_elements) 248 | 249 | # postリクエストで使用するデータを生成する 250 | payload = { 251 | 'key': self.api_key, 252 | 'pageurl': url, 253 | 'method': 'userrecaptcha', 254 | 'json': 1, 255 | 'googlekey': sitekey, 256 | 'data-s': data_s, 257 | 'proxytype': proxy_type, 258 | 'proxy': proxy_uri, 259 | 'cookies': cookie_data, 260 | 'callback': 'submitCallback', 261 | } 262 | 263 | if self.user_agent is not None: 264 | payload['userAgent'] = self.user_agent 265 | 266 | while True: 267 | # debug message 268 | if self.MESSAGE is not None: 269 | self.MESSAGE.print_text( 270 | 'Send ReCaptcha Data to `{}`.'.format( 271 | self.api_in_url), 272 | mode='info', 273 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 274 | '[DEBUG]: [ReCaptcha]' + Color.END, 275 | separator=": " 276 | ) 277 | 278 | # リクエストを送信 279 | ok, request_id = self.in_php(payload) 280 | 281 | if not ok: 282 | # debug message 283 | if self.MESSAGE is not None: 284 | self.MESSAGE.print_text( 285 | 'Failed Send ReCaptcha Data. 
data: {}'.format( 286 | payload), 287 | mode='warn', 288 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 289 | '[DEBUG]: [ReCaptcha]' + Color.END, 290 | separator=": " 291 | ) 292 | 293 | break 294 | 295 | # message 296 | if self.MESSAGE is not None: 297 | self.MESSAGE.print_text( 298 | 'Get request_id: {}'.format(request_id), 299 | mode='info', 300 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 301 | '[DEBUG]: [2Captcha]' + Color.END, 302 | separator=": " 303 | ) 304 | 305 | self.MESSAGE.print_text( 306 | 'Check ReCaptcha Response Status from: {}'.format( 307 | self.api_res_url), 308 | mode='info', 309 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 310 | '[DEBUG]: [2Captcha]' + Color.END, 311 | separator=": " 312 | ) 313 | 314 | # res_phpのチェックループ 315 | while True: 316 | res = self.res_php(request_id) 317 | 318 | # レスポンス(json)から読み込む 319 | data = json.loads(res.text) 320 | 321 | # codeを取得 322 | code = data['request'] 323 | 324 | if data['status'] == 1: 325 | result = code 326 | return result 327 | 328 | if code != 'CAPCHA_NOT_READY': 329 | break 330 | 331 | sleep(30) 332 | 333 | if code is None: 334 | code = 'None' 335 | 336 | # debug messages 337 | if self.MESSAGE is not None: 338 | self.MESSAGE.print_text( 339 | 'Bypass NG ReCaptcha Data. code: {}'.format(code), 340 | mode='warn', 341 | header=self.MESSAGE.HEADER + ': ' + Color.GRAY + 342 | '[DEBUG]: [2Captcha]' + Color.END, 343 | separator=": " 344 | ) 345 | 346 | return result 347 | -------------------------------------------------------------------------------- /pydork/engine_bing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | 9 | """engine_bing 10 | * Bing用の検索用Classを持つモジュール. 11 | """ 12 | 13 | import requests 14 | import datetime 15 | import json 16 | import asyncio 17 | import re 18 | 19 | from urllib import parse 20 | from bs4 import BeautifulSoup 21 | 22 | from .common import Color 23 | from .engine_common import CommonEngine 24 | 25 | 26 | class Bing(CommonEngine): 27 | """Bing 28 | 29 | Bing用の検索エンジン用Class. 30 | """ 31 | 32 | def __init__(self): 33 | # CommonEngineの処理を呼出し 34 | super().__init__() 35 | 36 | self.NAME = 'Bing' 37 | self.COLOR = Color.CYAN 38 | self.COLOR_NAME = self.COLOR + self.NAME + Color.END 39 | 40 | # リクエスト先のURLを指定 41 | self.ENGINE_TOP_URL = 'https://www.bing.com/' 42 | self.SEARCH_URL = 'https://www.bing.com/search' 43 | self.IMAGE_URL = 'https://www.bing.com/images/async' 44 | self.SUGGEST_URL = 'https://www.bing.com/AS/Suggestions' 45 | 46 | def gen_search_url(self, keyword: str, type: str): 47 | """gen_search_url 48 | 49 | 検索用のurlを生成する. 50 | 51 | Args: 52 | keyword (str): 検索クエリ. 53 | type (str): 検索タイプ. 
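        Note:
            期間指定(RANGE_START/RANGE_END)は `filters=ex1:"ez5_{開始}_{終了}"` 形式で、
            値は1970-01-01からの経過日数になる. 計算イメージ(2019-09-01なら18140日になるはず):

                import datetime

                unix_day = datetime.datetime.strptime('1970-01-01', '%Y-%m-%d')
                start = datetime.datetime.strptime('2019-09-01', '%Y-%m-%d')
                cd_min = (start - unix_day).days  # -> 18140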
54 | 55 | Returns: 56 | dict: 検索用url 57 | """ 58 | 59 | search_url = '' 60 | 61 | # NOTE: 62 | # 2023/07/27にて、queryが以下のように切り替わったため修正 63 | # - `https://www.bing.com/search?q=site%3aorebibou.com&search=%e9%80%81%e4%bf%a1&rdr=1&rdrig=D4B6730A85514F25BAE1E9BDC04F1C28&cc=us&setlang=en` 64 | # ```json 65 | # { 66 | # 'q': ['site:orebibou.com'], 67 | # 'search': ['送信'], 68 | # 'rdr': ['1'], 69 | # 'rdrig': ['D4B6730A85514F25BAE1E9BDC04F1C28'], 70 | # 'cc': ['us'], 71 | # 'setlang': ['en'] 72 | # } 73 | # ``` 74 | # - `https://www.bing.com/search?q=site%3aorebibou.com&search=%E9%80%81%E4%BF%A1&rdr=1&rdrig=D4B6730A85514F25BAE1E9BDC04F1C28&cc=us&setlang=en&FPIG=B035C5DE50AE4A328CB93C767B02D08B&first=11&FORM=PERE&count=100` 75 | 76 | # 検索タイプがtextの場合 77 | if type == 'text': 78 | # 検索urlを指定 79 | search_url = self.SEARCH_URL 80 | 81 | # 検索パラメータの設定 82 | url_param = { 83 | 'q': keyword, # 検索キーワード 84 | 'count': '100', # 1ページごとの表示件数 85 | 'search': '送信', 86 | 'rdr': '1', 87 | 'from': 'PERE', 88 | 'cc': 'us', 89 | 'setlang': 'en', 90 | 'filters': '', # 期間含めフィルターとして指定するパラメータ 91 | 'first': '' # 開始位置 92 | } 93 | 94 | # lang/localeが設定されている場合 95 | if self.LANG != '': 96 | url_param['setlang'] = self.LANG.lower() 97 | if self.LOCALE != '': 98 | url_param['cc'] = self.LOCALE.lower() 99 | 100 | # rangeが設定されている場合 101 | try: 102 | start = self.RANGE_START 103 | end = self.RANGE_END 104 | 105 | unix_day = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d") 106 | cd_min = (start - unix_day).days 107 | cd_max = (end - unix_day).days 108 | 109 | # GETパラメータに日時データを追加 110 | url_param['filters'] = 'ex1:"ez5_{0}_{1}"'.format( 111 | cd_min, cd_max) 112 | 113 | except AttributeError: 114 | None 115 | 116 | # 検索タイプがimageの場合 117 | elif type == 'image': 118 | # 検索urlを指定 119 | search_url = self.IMAGE_URL 120 | 121 | # 検索パラメータの設定 122 | url_param = { 123 | 'q': keyword, # 検索キーワード 124 | 'count': '100', # 1回ごとの件数 125 | 'first': '', # 検索位置 126 | 'tsc': 'ImageBasicHover', 127 | 'layout': 'RowBased', 128 | } 129 | 130 | # rangeが指定されている場合 131 | # TODO: 日時パラメータを追加(ex: `qft=+filterui%3aage-lt43200`) 132 | 133 | page = 0 134 | while True: 135 | # parameterにページを開始する番号を指定 136 | url_param['first'] = str(page * 100) 137 | params = parse.urlencode(url_param) 138 | 139 | target_url = search_url + '?' + params 140 | 141 | yield 'GET', target_url, None 142 | 143 | page += 1 144 | 145 | def gen_suggest_url(self, keyword: str): 146 | """gen_suggest_url 147 | 148 | サジェスト取得用のurlを生成する. 149 | 150 | Args: 151 | keyword (str): 検索クエリ. 152 | 153 | Returns: 154 | dict: サジェスト取得用url 155 | """ 156 | 157 | url_param = { 158 | 'qry': keyword, # 検索キーワード 159 | 'cvid': 'F5F47E4155E44D86A86690B49023B0EF' 160 | } 161 | 162 | params = parse.urlencode(url_param) 163 | url = self.SUGGEST_URL + '?' + params 164 | 165 | return url 166 | 167 | def get_links(self, url: str, html: str, type: str): 168 | """get_links 169 | 170 | 受け付けたhtmlを解析し、検索結果をlistに加工して返す関数. 171 | 172 | Args: 173 | html (str): 解析する検索結果のhtml. 174 | type (str): 検索タイプ([text, image]).現時点ではtextのみ対応. 
175 | 176 | Returns: 177 | list: 検索結果(`[{'title': 'title...', 'url': 'https://hogehoge....'}, {...}]`) 178 | """ 179 | 180 | if type == 'text': 181 | self.SOUP_SELECT_URL = 'h2 > a' 182 | self.SOUP_SELECT_TITLE = 'h2 > a' 183 | self.SOUP_SELECT_TEXT = 'li > div > p' 184 | 185 | elif type == 'image': 186 | self.SOUP_SELECT_URL = '.imgpt > .iusc' 187 | 188 | # CommonEngineの処理を呼び出す 189 | links = super().get_links(url, html, type) 190 | 191 | return links 192 | 193 | # 画像検索ページの検索結果(links(list()))を生成するfunction 194 | def get_image_links(self, soup: BeautifulSoup): 195 | """get_image_links 196 | BeautifulSoupから画像検索ページを解析して結果を返す関数. 197 | 198 | Args: 199 | soup (BeautifulSoup): 解析するBeautifulSoupオブジェクト. 200 | 201 | Returns: 202 | list: 検索結果(`[{'title': 'title...', 'link': 'https://hogehoge....'}, {...}]`) 203 | """ 204 | 205 | elements = soup.select(self.SOUP_SELECT_URL) 206 | edata = [e['m'] for e in elements] 207 | 208 | result = [] # image url 209 | for e in edata: 210 | # json化 211 | je = json.loads(e) 212 | 213 | etitle = je['t'] 214 | elink = je['purl'] 215 | eimage = je['murl'] 216 | 217 | el = { 218 | 'title': etitle, 219 | 'pagelink': elink, 220 | 'link': eimage, 221 | } 222 | 223 | result.append(el) 224 | 225 | return result 226 | 227 | def get_suggest_list(self, suggests: list, char: str, html: str): 228 | """get_suggest_list 229 | 230 | htmlからsuggestを配列で取得する関数. 231 | 232 | Args: 233 | suggests (list): suggestを追加するための大本のlist. 234 | char (str): サジェストの文字列. 235 | html (str): 解析を行うhtml. 236 | 237 | Returns: 238 | dict: サジェスト配列 239 | """ 240 | soup = BeautifulSoup(html, 'lxml') 241 | elements = soup.select('ul > li') 242 | suggests[char if char == '' else char[-1]] = [e['query'] 243 | for e in elements] 244 | return suggests 245 | 246 | def processings_elist(self, elinks, etitles, etexts: list): 247 | """processings_elist 248 | 249 | self.get_links 内で、取得直後のelinks, etitlesに加工を加えるための関数. 250 | requestsを用いて、リダイレクトリンクから遷移先urlを取得していく. 
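        Note:
            Bingの検索結果リンクは `https://www.bing.com/ck/a?...` 形式の遷移ページを挟むため、
            遷移ページ中の `var u = "..."` 行から実urlを取り出している(resolv_url参照).
            抽出部分の最小スケッチ(resは遷移ページのhtml文字列を想定):

                import re

                for line in res.splitlines():
                    if re.match('^ +var u', line):
                        url = re.findall('"([^"]*)"', line)[0]
                        break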
251 | 252 | Args: 253 | elinks (list): elinks(検索結果のlink)の配列 254 | etitles (list): etitles(検索結果のtitle)の配列 255 | etexts (list): etexts(検索結果のtext)の配列 256 | 257 | Returns: 258 | elinks (list): elinks(検索結果のlink)の配列 259 | etitles (list): etitles(検索結果のtitle)の配列 260 | etexts (list): etexts(検索結果のtext)の配列 261 | """ 262 | 263 | # 通常のスクレイピングとは別にセッションを作成 264 | session = requests.session() 265 | 266 | # pool sizeを調整 267 | adapter = requests.adapters.HTTPAdapter( 268 | pool_connections=100, pool_maxsize=100) 269 | session.mount('https://', adapter) 270 | 271 | # proxyを設定 272 | if self.PROXY != '': 273 | proxies = { 274 | 'http': self.PROXY, 275 | 'https': self.PROXY 276 | } 277 | session.proxies = proxies 278 | 279 | # user-agentを設定 280 | if self.USER_AGENT != '': 281 | session.headers.update( 282 | { 283 | 'User-Agent': self.USER_AGENT, 284 | 'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3' 285 | } 286 | ) 287 | 288 | # asyncio loopを作成 289 | loop = asyncio.new_event_loop() 290 | asyncio.set_event_loop(loop) 291 | 292 | # リダイレクト先のurlに置き換え 293 | elinks = loop.run_until_complete( 294 | resolv_links(loop, session, elinks)) 295 | loop.close() 296 | 297 | return elinks, etitles, etexts 298 | 299 | 300 | async def resolv_links(loop: asyncio.AbstractEventLoop, session: requests.Session, links: list): 301 | """resolv_links 302 | 303 | リダイレクト先のurlをパラレルで取得する(Bingで使用) 304 | 305 | Args: 306 | loop (asyncio.AbstractEventLoop): loop 307 | session (requests.Session): 使用するSession 308 | links (list): リダイレクト先を取得するurlのリスト 309 | 310 | Returns: 311 | data (list): リダイレクト先を取得したurlのリスト 312 | """ 313 | 314 | async def req(session: requests.Session, url: str): 315 | task = await loop.run_in_executor(None, resolv_url, session, url) 316 | return task 317 | 318 | tasks = [] 319 | for link in links: 320 | # urlをパース 321 | url = parse.urlparse(link) 322 | 323 | # bingの遷移ページの場合はリダイレクトして処理 324 | if url.netloc == 'www.bing.com' and url.path == '/ck/a': 325 | task = req(session, link) 326 | tasks.append(task) 327 | 328 | data = await asyncio.gather(*tasks) 329 | 330 | return data 331 | 332 | 333 | def resolv_url(session: requests.Session, url: str): 334 | """resolv_url 335 | リダイレクト先のurlを取得する(Bingで使用) 336 | Args: 337 | session (request.Session): リダイレクト先を取得する際に使用するSession 338 | url (str): リダイレクト先を取得するurl 339 | Returns: 340 | url (str): リダイレクト先のurl 341 | """ 342 | 343 | while True: 344 | try: 345 | # リダイレクト先のbodyを取得する 346 | res = session.get(url).text 347 | 348 | except requests.RequestException: 349 | continue 350 | except ConnectionError: 351 | continue 352 | else: 353 | # resから1行ずつチェック 354 | for line in res.splitlines(): 355 | if re.match('^ +var u', line): 356 | text = re.findall('"([^"]*)"', line) 357 | url = text[0] 358 | break 359 | break 360 | 361 | return url 362 | -------------------------------------------------------------------------------- /pydork/engine_baidu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | 9 | """engine_baidu 10 | * Baidu用の検索用Classを持つモジュール. 
11 | """ 12 | 13 | import requests 14 | import json 15 | import asyncio 16 | import sys 17 | 18 | from urllib import parse 19 | from bs4 import BeautifulSoup 20 | 21 | from .engine_common import CommonEngine 22 | from .common import Color 23 | 24 | 25 | class Baidu(CommonEngine): 26 | """Baidu 27 | 28 | Baidu用の検索エンジン用Class. 29 | 30 | Note: 31 | 検索結果に直接urlが記載されておらず、リンクを踏んで移動先のurlを取得する必要がある。 32 | そのため、検索結果を取得してからパラレルで検索結果urlからリンク先urlを取得している。 33 | なお、その際のリクエストはSelenium/Splashを使用している場合でもrequestsを使っている。 34 | """ 35 | 36 | def __init__(self): 37 | # CommonEngineの処理を呼出し 38 | super().__init__() 39 | 40 | self.NAME = 'Baidu' 41 | self.COLOR = Color.RED 42 | self.COLOR_NAME = self.COLOR + self.NAME + Color.END 43 | 44 | # リクエスト先のURLを指定 45 | self.ENGINE_TOP_URL = 'https://www.baidu.com/' 46 | self.SEARCH_URL = 'https://www.baidu.com/s' 47 | self.IMAGE_URL = 'https://image.baidu.com/search/acjson' 48 | self.SUGGEST_URL = 'https://www.baidu.com/sugrec' 49 | 50 | def gen_search_url(self, keyword: str, type: str): 51 | """gen_search_url 52 | 53 | 検索用のurlを生成する. 54 | 55 | Args: 56 | keyword (str): 検索クエリ. 57 | type (str): 検索タイプ. 58 | 59 | Returns: 60 | dict: 検索用url 61 | """ 62 | 63 | if type == 'text': 64 | # 1ページごとの表示件数 65 | view_num = 50 66 | 67 | # 検索urlを指定 68 | search_url = self.SEARCH_URL 69 | 70 | # 検索パラメータの設定 71 | url_param = { 72 | 'wd': keyword, # 検索キーワード 73 | 'rn': view_num, # 1ページごとの表示件数 74 | 'filter': '0', # aaa 75 | 'ia': 'web', # 76 | 'pn': '' # 開始位置 77 | } 78 | 79 | elif type == 'image': 80 | # 1ページごとの表示件数 81 | view_num = 30 82 | 83 | # example: 84 | # 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=10696586825489113064&ipn=rj&ct=201326592&is=&fp=result&queryWord=poop&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=©right=&word=poop&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=1&fr=&expermode=&force=&pn=30&rn=30&gsm=1e&1617708591950=' 85 | # 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=11967476791890431299&ipn=rj&ct=201326592&is=&fp=result&queryWord=poop&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=©right=&word=poop&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=1&fr=&expermode=&nojc=&pn=60&rn=30&gsm=3c&1629026924429=' 86 | 87 | # 検索urlを指定 88 | search_url = self.IMAGE_URL 89 | 90 | # 検索パラメータの設定 91 | url_param = { 92 | 'tn': 'resultjson_com', 93 | 'fp': 'result', 94 | 'queryWord': keyword, 95 | 'word': keyword, 96 | 'logid': '12399428100030957064', 97 | 'ipn': 'rj', 98 | 'ct': '201326592', 99 | 'lm': '-1', 100 | 'cl': 2, 101 | 'ie': 'utf-8', 102 | 'nc': 1, 103 | 'pn': 0, # 開始位置 104 | 'rn': view_num, 105 | 'gsm': '3c', 106 | } 107 | 108 | page = 0 109 | while True: 110 | # parameterにページを開始する番号を指定 111 | url_param['pn'] = str(page * view_num) 112 | params = parse.urlencode(url_param) 113 | 114 | target_url = search_url + '?' + params 115 | 116 | yield 'GET', target_url, None 117 | 118 | page += 1 119 | 120 | def gen_suggest_url(self, keyword: str): 121 | """gen_suggest_url 122 | 123 | サジェスト取得用のurlを生成する. 124 | 125 | Args: 126 | keyword (str): 検索クエリ. 127 | 128 | Returns: 129 | dict: サジェスト取得用url 130 | """ 131 | 132 | url_param = { 133 | 'wd': keyword, # 検索キーワード 134 | 'prod': 'pc' # 135 | } 136 | 137 | params = parse.urlencode(url_param) 138 | url = self.SUGGEST_URL + '?' + params 139 | 140 | return url 141 | 142 | def get_links(self, url: str, html: str, type: str): 143 | """get_links 144 | 145 | 受け付けたhtmlを解析し、検索結果をlistに加工して返す関数. 146 | 147 | Args: 148 | html (str): 解析する検索結果のhtml. 149 | type (str): 検索タイプ([text, image]).現時点ではtextのみ対応. 
150 | 151 | Returns: 152 | list: 検索結果(`[{'title': 'title...', 'url': 'https://hogehoge....'}, {...}]`) 153 | """ 154 | 155 | links = [] 156 | 157 | if type == 'text': 158 | # Splash経由で通信している場合 159 | self.SOUP_SELECT_URL = '.tts-title > a' 160 | self.SOUP_SELECT_TITLE = '.tts-title > a' 161 | self.SOUP_SELECT_TEXT = '.c-gap-top-small > span' 162 | 163 | # CommonEngineの処理を呼び出す 164 | links = super().get_links(url, html, type) 165 | 166 | elif type == 'image': 167 | # unicode escape 168 | # html = html.encode().decode("unicode-escape") 169 | html = html.replace("\\'", "'") 170 | 171 | # seleniumを使用している場合、htmlで返ってくるためjson要素のみを抽出する 172 | if self.USE_SELENIUM: 173 | html_text = "" 174 | soup = BeautifulSoup(html, "lxml") 175 | 176 | for text in soup.find_all(text=True): 177 | if text.strip(): 178 | html_text += text 179 | 180 | html = html_text 181 | 182 | # json load 183 | try: 184 | json_data = json.loads(html, strict=False) 185 | except Exception as e: 186 | print(e, file=sys.stderr) 187 | return links 188 | 189 | if 'data' in json_data: 190 | data = json_data['data'] 191 | 192 | for d in data: 193 | if 'replaceUrl' in d: 194 | result = dict() 195 | 196 | # 画像ファイルのurlをパラメータに持つvalueを取得する 197 | replace_url = d['replaceUrl'][0]['ObjURL'] 198 | replace_url = replace_url.replace( 199 | 'image_search/', 'image_search/?') 200 | 201 | # url valueをparse 202 | replace_url_query = parse.urlparse(replace_url).query 203 | 204 | # パラメータを取得 205 | replace_url_query_dict = parse.parse_qs( 206 | replace_url_query) 207 | 208 | if 'src' not in replace_url_query_dict: 209 | continue 210 | 211 | # 画像urlを取得 212 | result['link'] = replace_url_query_dict['src'][0] 213 | 214 | if 'fromPageTitle' in d: 215 | result['title'] = d['fromPageTitle'] 216 | 217 | links.append(result) 218 | 219 | return links 220 | 221 | def get_suggest_list(self, suggests: list, char: str, html: str): 222 | """get_suggest_list 223 | 224 | htmlからsuggestを配列で取得する関数. 225 | 226 | Args: 227 | suggests (list): suggestを追加するための大本のlist. 228 | char (str): サジェストの文字列. 229 | html (str): 解析を行うhtml. 230 | 231 | Returns: 232 | dict: サジェスト配列 233 | """ 234 | 235 | try: 236 | data = json.loads(html) 237 | except Exception: 238 | soup = BeautifulSoup(html, "lxml") 239 | json_data = soup.select_one('html > body') 240 | data = json.loads(json_data.text) 241 | 242 | if 'g' in data: 243 | suggests[char if char == '' else char[-1] 244 | ] = [e['q'] 245 | for e in data['g']] 246 | return suggests 247 | 248 | def processings_elist(self, elinks, etitles, etexts: list): 249 | """processings_elist 250 | 251 | self.get_links 内で、取得直後のelinks, etitlesに加工を加えるための関数. 252 | requestsを用いて、リダイレクトリンクから遷移先urlを取得していく. 
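        Note:
            Baiduの検索結果リンクはリダイレクタのため、HEADリクエストのLocationヘッダから
            遷移先urlを取得している(resolv_url参照). 取得部分の最小スケッチ(urlは検索結果のリンクを想定):

                import requests

                session = requests.session()
                location = session.head(url, allow_redirects=False).headers['Location']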
253 | 254 | Args: 255 | elinks (list): elinks(検索結果のlink)の配列 256 | etitles (list): etitles(検索結果のtitle)の配列 257 | etexts (list): etexts(検索結果のtext)の配列 258 | 259 | Returns: 260 | elinks (list): elinks(検索結果のlink)の配列 261 | etitles (list): etitles(検索結果のtitle)の配列 262 | etexts (list): etexts(検索結果のtext)の配列 263 | """ 264 | 265 | # 通常のスクレイピングとは別にセッションを作成 266 | session = requests.session() 267 | 268 | # pool sizeを調整 269 | adapter = requests.adapters.HTTPAdapter( 270 | pool_connections=100, pool_maxsize=100) 271 | session.mount('http://', adapter) 272 | 273 | # proxyを設定 274 | if self.PROXY != '': 275 | proxies = { 276 | 'http': self.PROXY, 277 | 'https': self.PROXY 278 | } 279 | session.proxies = proxies 280 | 281 | # user-agentを設定 282 | if self.USER_AGENT != '': 283 | session.headers.update( 284 | { 285 | 'User-Agent': self.USER_AGENT, 286 | 'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3' 287 | } 288 | ) 289 | 290 | # asyncio loopを作成 291 | loop = asyncio.new_event_loop() 292 | asyncio.set_event_loop(loop) 293 | 294 | # リダイレクト先のurlに置き換え 295 | elinks = loop.run_until_complete( 296 | resolv_links(loop, session, elinks)) 297 | loop.close() 298 | 299 | return elinks, etitles, etexts 300 | 301 | 302 | async def resolv_links(loop: asyncio.AbstractEventLoop, session: requests.Session, links: list): 303 | """resolv_links 304 | 305 | リダイレクト先のurlをパラレルで取得する(Baiduで使用) 306 | 307 | Args: 308 | loop (asyncio.AbstractEventLoop): loop 309 | session (requests.Session): 使用するSession 310 | links (list): リダイレクト先を取得するurlのリスト 311 | 312 | Returns: 313 | data (list): リダイレクト先を取得したurlのリスト 314 | """ 315 | 316 | async def req(session: requests.Session, url: str): 317 | task = await loop.run_in_executor(None, resolv_url, session, url) 318 | return task 319 | 320 | tasks = [] 321 | for link in links: 322 | task = req(session, link) 323 | tasks.append(task) 324 | 325 | data = await asyncio.gather(*tasks) 326 | 327 | return data 328 | 329 | 330 | def resolv_url(session: requests.Session, url: str): 331 | """resolv_url 332 | 333 | リダイレクト先のurlを取得する(Baiduで使用) 334 | 335 | Args: 336 | session (request.Session): リダイレクト先を取得する際に使用するSession 337 | url (str): リダイレクト先を取得するurl 338 | Returns: 339 | url (str): リダイレクト先のurl 340 | """ 341 | while True: 342 | try: 343 | res_header = session.head(url, allow_redirects=False).headers 344 | except requests.RequestException: 345 | continue 346 | except ConnectionError: 347 | continue 348 | else: 349 | url = res_header['Location'] 350 | break 351 | 352 | return url 353 | -------------------------------------------------------------------------------- /pydork/sub_commands.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | """subcommands 9 | * pydorkをコマンドとして動作させる際の処理を記載しているモジュール 10 | """ 11 | 12 | 13 | import sys 14 | import threading 15 | import json 16 | import os 17 | import pathlib 18 | 19 | from typing import List 20 | from argparse import Namespace 21 | from jinja2 import Template 22 | 23 | from .engine import SearchEngine, ENGINES 24 | from .common import Color 25 | from .common import Message 26 | 27 | 28 | # サブコマンドの動作集約用関数 29 | def run_subcommand(subcommand, args): 30 | """run_subcommand 31 | 32 | Args: 33 | subcommand (str): 使用するサブコマンド([search, suggest]). 
34 | args (Namespace): argparseで取得した引数(Namespace). 35 | """ 36 | 37 | # template file用の変数セット(dict) 38 | template_variable = {} 39 | 40 | # query及びfileがともに指定なしの場合、エラーにして返す 41 | if args.query == "" and args.file == "" and args.template_file == "": 42 | print("Error: クエリもしくはファイルを指定してください.", file=sys.stderr) 43 | return 44 | 45 | # args.fileのチェック 46 | if args.file != "": 47 | if not os.path.exists(args.file): 48 | print("Error: ファイルが存在しません.", file=sys.stderr) 49 | return 50 | 51 | # args.template_fileのチェック 52 | if args.template_file != "": 53 | if not os.path.exists(args.template_file): 54 | print("Error: ファイルが存在しません.", file=sys.stderr) 55 | return 56 | 57 | if args.template_variable == "": 58 | print("Error: テンプレート変数が指定されていません.", file=sys.stderr) 59 | return 60 | 61 | try: 62 | template_variable = json.loads(args.template_variable) 63 | except Exception: 64 | print("Error: テンプレート変数の形式がまちがっています.", file=sys.stderr) 65 | return 66 | 67 | # 各サブコマンドのチェック 68 | target = None 69 | search_mode = '' 70 | if subcommand == 'search': 71 | # チェック処理 72 | if ((args.start is None and args.end is not None) or (args.start is not None and args.end is None)): 73 | print( 74 | Color.GRAY + "期間を指定する場合は--start, --endの両方を指定してください" + Color.END, 75 | file=sys.stderr 76 | ) 77 | return 78 | target = run_search 79 | search_mode = 'text' 80 | 81 | elif subcommand == 'image': 82 | target = run_search 83 | search_mode = 'image' 84 | 85 | elif subcommand == 'suggest': 86 | target = run_suggest 87 | 88 | # create query_list 89 | query_list = generate_query_list(args) 90 | 91 | # append query in template file 92 | if args.template_file != "": 93 | # template fileのfullpathを取得 94 | template_file = pathlib.Path(args.template_file).expanduser() 95 | 96 | # args.template_variableをjsonとして読み込む. 97 | 98 | with open(template_file) as f: 99 | template_data = f.read() 100 | 101 | # template fileから値を取得 102 | tmpl = Template(template_data) 103 | 104 | # 設定情報を取得 105 | tmpl_params = template_variable 106 | 107 | # レンダリング処理を実行 108 | rendered_query_strings = tmpl.render(tmpl_params) 109 | 110 | # templateを1行ずつqueryに追加 111 | template_file_querys = [s.strip() 112 | for s in rendered_query_strings.splitlines()] 113 | query_list.extend(template_file_querys) 114 | 115 | # engine_listへ、選択されているsearch engineを入れていく 116 | engine_list = [] 117 | for search_type in args.search_type: 118 | # if all 119 | if search_type == 'all': 120 | for engine in ENGINES: 121 | engine_list.append(engine) 122 | continue 123 | 124 | # if in searchengine 125 | if search_type in ENGINES: 126 | engine_list.append(search_type) 127 | continue 128 | 129 | # engine_listから、重複したリストを削除 130 | engine_list = list(set(engine_list)) 131 | 132 | tasks = [] 133 | thread_result = dict() 134 | lock = threading.Lock() 135 | for engine in engine_list: 136 | task = threading.Thread( 137 | target=target, args=(engine, query_list, args, thread_result, True, lock, search_mode)) 138 | tasks.append(task) 139 | 140 | for task in tasks: 141 | task.start() 142 | 143 | for task in tasks: 144 | task.join() 145 | 146 | # json出力が有効だった場合、json形式で出力 147 | if args.json: 148 | print(json.dumps(thread_result, ensure_ascii=False, indent=2)) 149 | 150 | 151 | # SearchEngineのオプション設定用関数 152 | def set_se_options(se: SearchEngine, args: Namespace): 153 | """set_se_options 154 | 155 | Args: 156 | se (SearchEngine): argsの情報を元に、オプションを設定するSearchEngine. 157 | args (Namespace): argparseで取得した引数(Namespace). 158 | 159 | Returns: 160 | SearchEngine: オプションを設定したSearchEngine. 
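    Example:
        呼び出しイメージの最小スケッチ(argsはargparseで生成したNamespaceを想定):

            from pydork.engine import SearchEngine

            se = SearchEngine()
            se.set('google')
            se = set_se_options(se, args)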
161 | """ 162 | 163 | # set debug flag 164 | if 'debug' in args: 165 | se.set_is_debug(args.debug) 166 | 167 | # set ssl verify 168 | if 'insecure' in args: 169 | se.set_ignore_ssl(args.insecure) 170 | 171 | # set is_command flag 172 | se.set_is_command(True) 173 | 174 | # set disable headless 175 | if 'disable_headless' in args: 176 | se.set_disable_headless(args.disable_headless) 177 | 178 | # proxy 179 | if args.proxy != '': 180 | se.set_proxy(args.proxy) 181 | 182 | # Selenium 183 | if args.selenium: 184 | # set default endpoint 185 | endpoint = None 186 | 187 | # if set browser-endpoint 188 | if args.browser_endpoint != "": 189 | endpoint = args.browser_endpoint 190 | 191 | # set selenium 192 | se.set_selenium(endpoint, args.browser) 193 | 194 | # Splush 195 | if args.splash: 196 | # set default endpoint 197 | endpoint = 'localhost:8050' 198 | 199 | # if set browser-endpoint 200 | if args.browser_endpoint != "": 201 | endpoint = args.browser_endpoint 202 | 203 | # set splash 204 | se.set_splash(endpoint) 205 | 206 | # useragent 207 | se.set_user_agent() 208 | 209 | # lang/country code 210 | se.set_lang(args.lang, args.country) 211 | 212 | # set cookie driver(last set) 213 | se.set_cookie_files(args.cookies) 214 | 215 | # set cookie file delete 216 | se.set_cookie_files_delete(args.delete_cookies) 217 | 218 | return se 219 | 220 | 221 | # 検索結果を出力する 222 | def print_search_result(result, args: Namespace, message: Message): 223 | """print_search_result 224 | 225 | 226 | Args: 227 | result : SearchEngine.searchのresult. 228 | args (Namespace): argparseで取得した引数(Namespace). 229 | message (common.Message): 出力用Class. 230 | """ 231 | 232 | # 区切り文字を指定 233 | sep = ': ' 234 | if args.nullchar: 235 | sep = '\0' 236 | 237 | # title出力を行うか確認 238 | title_mode = False 239 | if 'title' in args: 240 | title_mode = args.title 241 | 242 | # pageurl出力を行うか確認 243 | pagelink_mode = False 244 | if 'pagelink' in args: 245 | pagelink_mode = args.pagelink 246 | 247 | for d in result: 248 | data = [] 249 | link = d['link'] 250 | 251 | # 出力dataにlinkを追加 252 | data.insert(0, link) 253 | 254 | # pageurlの有無を確認 255 | if 'pagelink' in d and pagelink_mode: 256 | pagelink = d['pagelink'] 257 | 258 | # pagelinkの色指定 259 | if args.color == 'always' or (args.color == 'auto' and sys.stdout.isatty()): 260 | pagelink = Color.GRAY + Color.UNDERLINE + pagelink + Color.END 261 | 262 | data.insert(0, pagelink) 263 | 264 | # titleの有無を確認 265 | if 'title' in d and title_mode: 266 | title = d['title'] 267 | 268 | # titleの色指定 269 | if args.color == 'always' or (args.color == 'auto' and sys.stdout.isatty()): 270 | title = Color.GRAY + title + Color.END 271 | 272 | data.insert(0, title) 273 | 274 | message.print_line(*data, separator=sep) 275 | 276 | 277 | # generate 278 | def generate_query_list(args: Namespace): 279 | """generate_query_list 280 | 281 | """ 282 | # create query_list 283 | query_list: List[str] = list() 284 | 285 | # append query 286 | if args.query != "": 287 | query_list.append(args.query) 288 | 289 | # append query in file 290 | if args.file != "": 291 | # fileのfull pathを取得 292 | file = pathlib.Path(args.file).expanduser() 293 | 294 | # ファイルを開いて1行ずつqueryに追加する 295 | with open(file) as f: 296 | file_querys = [s.strip() for s in f.readlines()] 297 | query_list.extend(file_querys) 298 | 299 | return query_list 300 | 301 | 302 | # 検索 303 | def run_search(engine: str, query_list: list, args, thread_result: dict, cmd=False, lock=None, mode='text'): 304 | """search 305 | 306 | Args: 307 | engine (str): 使用する検索エンジン(.engine.ENGINES). 
308 | query_list(list): 検索クエリのリスト. 309 | args (Namespace): argparseで取得した引数(Namespace). 310 | thread_result(dict): 結果を1箇所に集約するためのresult dict. json出力するときのみ使用. 311 | cmd (bool, optional): commandで実行しているか否か. Defaults to False. 312 | lock (threading.Lock): threadingのマルチスレッドで使用するLock.現在は未使用. Defaults to None. 313 | type (str, optional): 検索タイプ. `text` or `image`. 314 | """ 315 | 316 | # start SearchEngine class 317 | se = SearchEngine() 318 | 319 | # Set Engine 320 | se.set(engine) 321 | 322 | # Set SearchEngine options 323 | se = set_se_options(se, args) 324 | 325 | # Set lock 326 | se.set_lock(lock) 327 | 328 | # Set color 329 | if args.color == 'always' or (args.color == 'auto' and sys.stdout.isatty()): 330 | se.set_is_color(True) 331 | 332 | # 検索タイプを設定(テキスト or 画像) 333 | search_type = mode 334 | 335 | # 区切り文字を指定 336 | sep = ': ' 337 | if args.nullchar: 338 | sep = '\0' 339 | 340 | # json出力時の変数を宣言 341 | all_result_json = list() 342 | 343 | # query_listの内容を順番に処理 344 | for query in query_list: 345 | # 検索を実行 346 | result = se.search( 347 | query, search_type=search_type, 348 | maximum=args.num 349 | ) 350 | 351 | # debug 352 | se.ENGINE.MESSAGE.print_text( 353 | json.dumps(result), 354 | separator=sep, 355 | header=se.ENGINE.MESSAGE.HEADER + ': ' + 356 | Color.GRAY + '[DEBUG]: [Result]' + Color.END, 357 | mode="debug", 358 | ) 359 | 360 | if args.json: 361 | # all_result_jsonへ組み込むためのjson方式へ加工. 362 | append_result = { 363 | 'query': query, 364 | 'result': result 365 | } 366 | all_result_json.append(append_result) 367 | 368 | else: 369 | print_search_result(result, args, se.ENGINE.MESSAGE) 370 | 371 | if args.json: 372 | thread_result[engine] = all_result_json 373 | 374 | 375 | # サジェスト 376 | def run_suggest(engine: str, query_list: list, args: Namespace, thread_result: dict, cmd=False, lock=None, mode=''): 377 | """suggest 378 | 379 | Args: 380 | engine (str): 使用する検索エンジン(.engine.ENGINES). 381 | query_list(list): 検索クエリのリスト. 382 | args (Namespace): argparseで取得した引数(Namespace). 383 | thread_result(dict): 結果を1箇所に集約するためのresult dict. json出力するときのみ使用. 384 | cmd (bool, optional): commandで実行しているか否か. Defaults to False. 385 | lock (threading.Lock): threadingのマルチスレッドで使用するLock.現在は未使用. Defaults to None. 386 | mode (str, optional): マルチスレッドでsearchある程度共用で使えるようにするための引数. 利用していない. Defaults to ''. 
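    Note:
        SearchEngine.suggest は文字(空文字含む)をキー、サジェスト文字列のlistを値とする
        dictを返す想定のため、values()を順に出力している. イメージ(返り値の中身は仮):

            result = se.suggest('linux')
            # 例: {'': ['linux kernel', 'linux mint']} のような形になる想定
            for words in result.values():
                for w in words:
                    print(w)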
387 | """ 388 | 389 | # start search engine class 390 | se = SearchEngine() 391 | 392 | # Set Engine 393 | se.set(engine) 394 | 395 | # Set Message() 396 | msg = Message() 397 | msg.set_engine(se.ENGINE.NAME, se.ENGINE.COLOR) 398 | if 'debug' in args: 399 | msg.set_is_debug(args.debug) 400 | msg.set_is_command(True) 401 | 402 | # set msg to se 403 | se.ENGINE.set_messages(msg) 404 | 405 | # Set SearchEngine options 406 | se = set_se_options(se, args) 407 | 408 | # Set lock 409 | se.set_lock(lock) 410 | 411 | # Header 412 | header = '[${ENGINE_NAME}Suggest]' 413 | if args.color == 'always' or (args.color == 'auto' and sys.stdout.isatty()): 414 | sc = Color(se.ENGINE.COLOR) 415 | header = sc.out(header) 416 | se.ENGINE.MESSAGE.set_header(header) 417 | 418 | # json出力時の変数を宣言 419 | all_result_json = list() 420 | 421 | # Suggestを取得 422 | for query in query_list: 423 | result = se.suggest( 424 | query, 425 | jap=args.jap, 426 | alph=args.alph, 427 | num=args.num, 428 | ) 429 | 430 | for words in result.values(): 431 | if args.json: 432 | append_result = { 433 | 'query': query, 434 | 'result': words 435 | } 436 | all_result_json.append(append_result) 437 | 438 | else: 439 | for w in words: 440 | se.ENGINE.MESSAGE.print_line(w, separator=": ") 441 | 442 | if args.json: 443 | thread_result[engine] = all_result_json 444 | -------------------------------------------------------------------------------- /pydork/test_engine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | 9 | """test_engine_google 10 | * SearchEngine Classのテストコード. 11 | * 各検索エンジンの動作テストを行う 12 | """ 13 | 14 | # TODO: splash/selenium経由での通信のテストも追加する(dockerでのコンテナ環境が前提になると思われる) 15 | 16 | 17 | import unittest 18 | 19 | from .engine import SearchEngine 20 | 21 | # 変数 22 | SEARCH_TEXT = 'Linux' 23 | 24 | 25 | class SearchEngineTestCase(unittest.TestCase): 26 | def setUp(self): 27 | """setUp 28 | 29 | テストメソッド実行前処理. 
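        Note:
            このテストは実際に各検索エンジンへリクエストを投げるため、結果はネットワーク環境に依存する.
            実行イメージ(モジュールパスはリポジトリ構成からの想定):

                python3 -m unittest pydork.test_engine -v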
30 | """ 31 | # SearchEngine 32 | self.search_engine = SearchEngine() 33 | 34 | print("setUp!!") 35 | 36 | def tearDown(self): 37 | """tearDown 38 | 39 | テストメソッド実行後処理 40 | """ 41 | 42 | print("tearDown!!") 43 | 44 | def common_settings(self): 45 | # command modeを有効化 46 | self.search_engine.set_is_command(True) 47 | 48 | # debug modeを有効化 49 | self.search_engine.set_is_debug(True) 50 | 51 | # user agentを定義 52 | self.search_engine.set_user_agent() 53 | 54 | # ========== 55 | # Baidu 56 | # ========== 57 | def test_baidu_text_search(self): 58 | print('Test Baidu text search.') 59 | 60 | # 検索エンジンを指定(ここではBaiduを使用) 61 | self.search_engine.set('baidu') 62 | 63 | # 共通系の中間前処理を実行 64 | self.common_settings() 65 | 66 | # 検索を実行 67 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 68 | 69 | print("{} count.".format(len(data))) 70 | self.assertNotEqual(0, len(data)) 71 | 72 | def test_baidu_image_search(self): 73 | print('Test Baidu image search.') 74 | 75 | # 検索エンジンを指定(ここではBaiduを使用) 76 | self.search_engine.set('baidu') 77 | 78 | # 共通系の中間前処理を実行 79 | self.common_settings() 80 | 81 | # 検索を実行 82 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 83 | 84 | print("{} count.".format(len(data))) 85 | self.assertNotEqual(0, len(data)) 86 | 87 | def test_baidu_suggest(self): 88 | print('Test Baidu text suggest.') 89 | 90 | # 検索エンジンを指定(ここではBaiduを使用) 91 | self.search_engine.set('baidu') 92 | 93 | # 共通系の中間前処理を実行 94 | self.common_settings() 95 | 96 | # 検索を実行 97 | data = self.search_engine.suggest( 98 | SEARCH_TEXT) 99 | 100 | print("{} count.".format(len(data))) 101 | self.assertNotEqual(0, len(data)) 102 | 103 | def test_baidu_suggest_with_alph(self): 104 | print('Test Baidu text suggest with alph.') 105 | 106 | # 検索エンジンを指定(ここではBaiduを使用) 107 | self.search_engine.set('baidu') 108 | 109 | # 共通系の中間前処理を実行 110 | self.common_settings() 111 | 112 | # 検索を実行 113 | data = self.search_engine.suggest( 114 | SEARCH_TEXT, alph=True) 115 | 116 | self.assertNotEqual(0, len(data)) 117 | 118 | def test_baidu_suggest_with_num(self): 119 | print('Test Baidu text suggest with num.') 120 | 121 | # 検索エンジンを指定(ここではBaiduを使用) 122 | self.search_engine.set('baidu') 123 | 124 | # 共通系の中間前処理を実行 125 | self.common_settings() 126 | 127 | # 検索を実行 128 | data = self.search_engine.suggest( 129 | SEARCH_TEXT, num=True) 130 | 131 | print("{} count.".format(len(data))) 132 | self.assertNotEqual(0, len(data)) 133 | 134 | # ========== 135 | # Bing 136 | # ========== 137 | def test_bing_text_search(self): 138 | print('Test Bing text search.') 139 | 140 | # 検索エンジンを指定(ここではBingを使用) 141 | self.search_engine.set('bing') 142 | 143 | # 共通系の中間前処理を実行 144 | self.common_settings() 145 | 146 | # 検索を実行 147 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 148 | 149 | print("{} count.".format(len(data))) 150 | self.assertNotEqual(0, len(data)) 151 | 152 | def test_bing_image_search(self): 153 | print('Test Bing image search.') 154 | 155 | # 検索エンジンを指定(ここではBingを使用) 156 | self.search_engine.set('bing') 157 | 158 | # 共通系の中間前処理を実行 159 | self.common_settings() 160 | 161 | # 検索を実行 162 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 163 | 164 | print("{} count.".format(len(data))) 165 | self.assertEqual(30, len(data)) 166 | 167 | def test_bing_suggest(self): 168 | print('Test Bing text suggest.') 169 | 170 | # 検索エンジンを指定(ここではBingを使用) 171 | self.search_engine.set('bing') 172 | 173 | # 共通系の中間前処理を実行 174 | self.common_settings() 175 | 176 | # 検索を実行 177 | data = self.search_engine.suggest( 178 | SEARCH_TEXT) 179 | 180 | 
print("{} count.".format(len(data))) 181 | self.assertNotEqual(0, len(data)) 182 | 183 | def test_bing_suggest_with_jap(self): 184 | print('Test Bing text suggest with jap.') 185 | 186 | # 検索エンジンを指定(ここではBingを使用) 187 | self.search_engine.set('bing') 188 | 189 | # 共通系の中間前処理を実行 190 | self.common_settings() 191 | 192 | # 検索を実行 193 | data = self.search_engine.suggest( 194 | SEARCH_TEXT, jap=True) 195 | 196 | print("{} count.".format(len(data))) 197 | self.assertNotEqual(0, len(data)) 198 | 199 | def test_bing_suggest_with_alph(self): 200 | print('Test Bing text suggest with alph.') 201 | 202 | # 検索エンジンを指定(ここではBingを使用) 203 | self.search_engine.set('bing') 204 | 205 | # 共通系の中間前処理を実行 206 | self.common_settings() 207 | 208 | # 検索を実行 209 | data = self.search_engine.suggest( 210 | SEARCH_TEXT, alph=True) 211 | 212 | self.assertNotEqual(0, len(data)) 213 | 214 | def test_bing_suggest_with_num(self): 215 | print('Test Bing text suggest with num.') 216 | 217 | # 検索エンジンを指定(ここではBingを使用) 218 | self.search_engine.set('bing') 219 | 220 | # 共通系の中間前処理を実行 221 | self.common_settings() 222 | 223 | # 検索を実行 224 | data = self.search_engine.suggest( 225 | SEARCH_TEXT, num=True) 226 | 227 | print("{} count.".format(len(data))) 228 | self.assertNotEqual(0, len(data)) 229 | 230 | # ========== 231 | # DuckDuckGo 232 | # ========== 233 | def test_duckduckgo_text_search(self): 234 | print('Test DuckDuckGo text search.') 235 | 236 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 237 | self.search_engine.set('duckduckgo') 238 | 239 | # 共通系の中間前処理を実行 240 | self.common_settings() 241 | 242 | # 検索を実行 243 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 244 | 245 | print("{} count.".format(len(data))) 246 | self.assertEqual(30, len(data)) 247 | 248 | def test_duckduckgo_image_search(self): 249 | print('Test DuckDuckGo image search.') 250 | 251 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 252 | self.search_engine.set('duckduckgo') 253 | 254 | # 共通系の中間前処理を実行 255 | self.common_settings() 256 | 257 | # 検索を実行 258 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 259 | 260 | print("{} count.".format(len(data))) 261 | self.assertEqual(30, len(data)) 262 | 263 | def test_duckduckgo_suggest(self): 264 | print('Test DuckDuckGo text suggest.') 265 | 266 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 267 | self.search_engine.set('duckduckgo') 268 | 269 | # 共通系の中間前処理を実行 270 | self.common_settings() 271 | 272 | # 検索を実行 273 | data = self.search_engine.suggest( 274 | SEARCH_TEXT) 275 | 276 | print("{} count.".format(len(data))) 277 | self.assertNotEqual(0, len(data)) 278 | 279 | def test_duckduckgo_suggest_with_jap(self): 280 | print('Test DuckDuckGo text suggest with jap.') 281 | 282 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 283 | self.search_engine.set('duckduckgo') 284 | 285 | # 共通系の中間前処理を実行 286 | self.common_settings() 287 | 288 | # 検索を実行 289 | data = self.search_engine.suggest( 290 | SEARCH_TEXT, jap=True) 291 | 292 | print("{} count.".format(len(data))) 293 | self.assertNotEqual(0, len(data)) 294 | 295 | def test_duckduckgo_suggest_with_alph(self): 296 | print('Test DuckDuckGo text suggest with alph.') 297 | 298 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 299 | self.search_engine.set('duckduckgo') 300 | 301 | # 共通系の中間前処理を実行 302 | self.common_settings() 303 | 304 | # 検索を実行 305 | data = self.search_engine.suggest( 306 | SEARCH_TEXT, alph=True) 307 | 308 | self.assertNotEqual(0, len(data)) 309 | 310 | def test_duckduckgo_suggest_with_num(self): 311 | print('Test DuckDuckGo text suggest with num.') 312 | 313 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 314 | 
self.search_engine.set('duckduckgo') 315 | 316 | # 共通系の中間前処理を実行 317 | self.common_settings() 318 | 319 | # 検索を実行 320 | data = self.search_engine.suggest( 321 | SEARCH_TEXT, num=True) 322 | 323 | print("{} count.".format(len(data))) 324 | self.assertNotEqual(0, len(data)) 325 | 326 | # ========== 327 | # Google 328 | # ========== 329 | def test_google_text_search(self): 330 | print('Test Google text search.') 331 | 332 | # 検索エンジンを指定(ここではGoogleを使用) 333 | self.search_engine.set('google') 334 | 335 | # 共通系の中間前処理を実行 336 | self.common_settings() 337 | 338 | # 検索を実行 339 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 340 | 341 | print("{} count.".format(len(data))) 342 | self.assertEqual(30, len(data)) 343 | 344 | def test_google_image_search(self): 345 | print('Test Google image search.') 346 | 347 | # 検索エンジンを指定(ここではGoogleを使用) 348 | self.search_engine.set('google') 349 | 350 | # 共通系の中間前処理を実行 351 | self.common_settings() 352 | 353 | # 検索を実行 354 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 355 | 356 | print("{} count.".format(len(data))) 357 | self.assertEqual(30, len(data)) 358 | 359 | def test_google_suggest(self): 360 | print('Test Google text suggest.') 361 | 362 | # 検索エンジンを指定(ここではGoogleを使用) 363 | self.search_engine.set('google') 364 | 365 | # 共通系の中間前処理を実行 366 | self.common_settings() 367 | 368 | # 検索を実行 369 | data = self.search_engine.suggest( 370 | SEARCH_TEXT) 371 | 372 | print("{} count.".format(len(data))) 373 | self.assertNotEqual(0, len(data)) 374 | 375 | def test_google_suggest_with_jap(self): 376 | print('Test Google text suggest with jap.') 377 | 378 | # 検索エンジンを指定(ここではGoogleを使用) 379 | self.search_engine.set('google') 380 | 381 | # 共通系の中間前処理を実行 382 | self.common_settings() 383 | 384 | # 検索を実行 385 | data = self.search_engine.suggest( 386 | SEARCH_TEXT, jap=True) 387 | 388 | print("{} count.".format(len(data))) 389 | self.assertNotEqual(0, len(data)) 390 | 391 | def test_google_suggest_with_alph(self): 392 | print('Test Google text suggest with alph.') 393 | 394 | # 検索エンジンを指定(ここではGoogleを使用) 395 | self.search_engine.set('google') 396 | 397 | # 共通系の中間前処理を実行 398 | self.common_settings() 399 | 400 | # 検索を実行 401 | data = self.search_engine.suggest( 402 | SEARCH_TEXT, alph=True) 403 | 404 | self.assertNotEqual(0, len(data)) 405 | 406 | def test_google_suggest_with_num(self): 407 | print('Test Google text suggest with num.') 408 | 409 | # 検索エンジンを指定(ここではGoogleを使用) 410 | self.search_engine.set('google') 411 | 412 | # 共通系の中間前処理を実行 413 | self.common_settings() 414 | 415 | # 検索を実行 416 | data = self.search_engine.suggest( 417 | SEARCH_TEXT, num=True) 418 | 419 | print("{} count.".format(len(data))) 420 | self.assertNotEqual(0, len(data)) 421 | 422 | # ========== 423 | # Yahoo 424 | # ========== 425 | def test_yahoo_text_search(self): 426 | print('Test Yahoo text search.') 427 | 428 | # 検索エンジンを指定(ここではYahooを使用) 429 | self.search_engine.set('yahoo') 430 | 431 | # 共通系の中間前処理を実行 432 | self.common_settings() 433 | 434 | # 検索を実行 435 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 436 | 437 | print("{} count.".format(len(data))) 438 | self.assertEqual(30, len(data)) 439 | 440 | def test_yahoo_image_search(self): 441 | print('Test Yahoo image search.') 442 | 443 | # 検索エンジンを指定(ここではYahooを使用) 444 | self.search_engine.set('yahoo') 445 | 446 | # 共通系の中間前処理を実行 447 | self.common_settings() 448 | 449 | # 検索を実行 450 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 451 | 452 | print("{} count.".format(len(data))) 453 | self.assertEqual(30, 
len(data)) 454 | 455 | def test_yahoo_suggest(self): 456 | print('Test Yahoo text suggest.') 457 | 458 | # 検索エンジンを指定(ここではYahooを使用) 459 | self.search_engine.set('yahoo') 460 | 461 | # 共通系の中間前処理を実行 462 | self.common_settings() 463 | 464 | # 検索を実行 465 | data = self.search_engine.suggest( 466 | SEARCH_TEXT) 467 | 468 | print("{} count.".format(len(data))) 469 | self.assertNotEqual(0, len(data)) 470 | 471 | def test_yahoo_suggest_with_jap(self): 472 | print('Test Yahoo text suggest with jap.') 473 | 474 | # 検索エンジンを指定(ここではYahooを使用) 475 | self.search_engine.set('yahoo') 476 | 477 | # 共通系の中間前処理を実行 478 | self.common_settings() 479 | 480 | # 検索を実行 481 | data = self.search_engine.suggest( 482 | SEARCH_TEXT, jap=True) 483 | 484 | print("{} count.".format(len(data))) 485 | self.assertNotEqual(0, len(data)) 486 | 487 | def test_yahoo_suggest_with_alph(self): 488 | print('Test Yahoo text suggest with alph.') 489 | 490 | # 検索エンジンを指定(ここではYahooを使用) 491 | self.search_engine.set('yahoo') 492 | 493 | # 共通系の中間前処理を実行 494 | self.common_settings() 495 | 496 | # 検索を実行 497 | data = self.search_engine.suggest( 498 | SEARCH_TEXT, alph=True) 499 | 500 | self.assertNotEqual(0, len(data)) 501 | 502 | def test_yahoo_suggest_with_num(self): 503 | print('Test Yahoo text suggest with num.') 504 | 505 | # 検索エンジンを指定(ここではYahooを使用) 506 | self.search_engine.set('yahoo') 507 | 508 | # 共通系の中間前処理を実行 509 | self.common_settings() 510 | 511 | # 検索を実行 512 | data = self.search_engine.suggest( 513 | SEARCH_TEXT, num=True) 514 | 515 | print("{} count.".format(len(data))) 516 | self.assertNotEqual(0, len(data)) 517 | -------------------------------------------------------------------------------- /pydork/test_engine_selenium.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | 9 | """test_engine_google 10 | * SearchEngine Classのテストコード. 11 | * 各検索エンジンの動作テストを行う 12 | """ 13 | 14 | # TODO: splash/selenium経由での通信のテストも追加する(dockerでのコンテナ環境が前提になると思われる) 15 | 16 | 17 | import unittest 18 | 19 | from .engine import SearchEngine 20 | 21 | # 変数 22 | SEARCH_TEXT = 'Linux' 23 | 24 | 25 | class SearchEngineTestCaseWithSelenium(unittest.TestCase): 26 | def setUp(self): 27 | """setUp 28 | 29 | テストメソッド実行前処理. 
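Editor's note: every test method in test_engine.py above follows the same five-step shape (set the engine, run common_settings, search or suggest, print the count, assert on the result length). A minimal sketch of the same smoke check driven by unittest's subTest, looping over the ENGINES list exported by pydork.engine; the test class name below is hypothetical, not part of the repository:

import unittest

from pydork.engine import ENGINES, SearchEngine

SEARCH_TEXT = 'Linux'


class SearchEngineSmokeTestCase(unittest.TestCase):
    def test_text_search_all_engines(self):
        # One subTest per supported engine, so one failing engine
        # does not hide the results of the others.
        for engine in ENGINES:
            with self.subTest(engine=engine):
                search_engine = SearchEngine()
                search_engine.set(engine)
                search_engine.set_is_command(True)
                search_engine.set_is_debug(True)
                search_engine.set_user_agent()
                data = search_engine.search(SEARCH_TEXT, maximum=30)
                self.assertNotEqual(0, len(data))


if __name__ == '__main__':
    unittest.main()
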
30 | """ 31 | # SearchEngine 32 | self.search_engine = SearchEngine() 33 | 34 | print("setUp!!") 35 | 36 | def tearDown(self): 37 | """tearDown 38 | 39 | テストメソッド実行後処理 40 | """ 41 | 42 | print("tearDown!!") 43 | 44 | def common_settings(self): 45 | # command modeを有効化 46 | self.search_engine.set_is_command(True) 47 | 48 | # debug modeを有効化 49 | self.search_engine.set_is_debug(True) 50 | 51 | # seleniumを有効化 52 | self.search_engine.set_selenium(None, 'chrome') 53 | 54 | # user agentを定義 55 | self.search_engine.set_user_agent() 56 | 57 | # ========== 58 | # Baidu 59 | # ========== 60 | def test_baidu_text_search(self): 61 | print('Test Baidu text search.') 62 | 63 | # 検索エンジンを指定(ここではBaiduを使用) 64 | self.search_engine.set('baidu') 65 | 66 | # 共通系の中間前処理を実行 67 | self.common_settings() 68 | 69 | # 検索を実行 70 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 71 | 72 | print("{} count.".format(len(data))) 73 | self.assertNotEqual(0, len(data)) 74 | 75 | def test_baidu_image_search(self): 76 | print('Test Baidu image search.') 77 | 78 | # 検索エンジンを指定(ここではBaiduを使用) 79 | self.search_engine.set('baidu') 80 | 81 | # 共通系の中間前処理を実行 82 | self.common_settings() 83 | 84 | # 検索を実行 85 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 86 | 87 | print("{} count.".format(len(data))) 88 | self.assertNotEqual(0, len(data)) 89 | 90 | def test_baidu_suggest(self): 91 | print('Test Baidu text suggest.') 92 | 93 | # 検索エンジンを指定(ここではBaiduを使用) 94 | self.search_engine.set('baidu') 95 | 96 | # 共通系の中間前処理を実行 97 | self.common_settings() 98 | 99 | # 検索を実行 100 | data = self.search_engine.suggest( 101 | SEARCH_TEXT) 102 | 103 | print("{} count.".format(len(data))) 104 | self.assertNotEqual(0, len(data)) 105 | 106 | def test_baidu_suggest_with_alph(self): 107 | print('Test Baidu text suggest with alph.') 108 | 109 | # 検索エンジンを指定(ここではBaiduを使用) 110 | self.search_engine.set('baidu') 111 | 112 | # 共通系の中間前処理を実行 113 | self.common_settings() 114 | 115 | # 検索を実行 116 | data = self.search_engine.suggest( 117 | SEARCH_TEXT, alph=True) 118 | 119 | self.assertNotEqual(0, len(data)) 120 | 121 | def test_baidu_suggest_with_num(self): 122 | print('Test Baidu text suggest with num.') 123 | 124 | # 検索エンジンを指定(ここではBaiduを使用) 125 | self.search_engine.set('baidu') 126 | 127 | # 共通系の中間前処理を実行 128 | self.common_settings() 129 | 130 | # 検索を実行 131 | data = self.search_engine.suggest( 132 | SEARCH_TEXT, num=True) 133 | 134 | print("{} count.".format(len(data))) 135 | self.assertNotEqual(0, len(data)) 136 | 137 | # ========== 138 | # Bing 139 | # ========== 140 | def test_bing_text_search(self): 141 | print('Test Bing text search.') 142 | 143 | # 検索エンジンを指定(ここではBingを使用) 144 | self.search_engine.set('bing') 145 | 146 | # 共通系の中間前処理を実行 147 | self.common_settings() 148 | 149 | # 検索を実行 150 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 151 | 152 | print("{} count.".format(len(data))) 153 | self.assertNotEqual(0, len(data)) 154 | 155 | def test_bing_image_search(self): 156 | print('Test Bing image search.') 157 | 158 | # 検索エンジンを指定(ここではBingを使用) 159 | self.search_engine.set('bing') 160 | 161 | # 共通系の中間前処理を実行 162 | self.common_settings() 163 | 164 | # 検索を実行 165 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 166 | 167 | print("{} count.".format(len(data))) 168 | self.assertNotEqual(0, len(data)) 169 | 170 | def test_bing_suggest(self): 171 | print('Test Bing text suggest.') 172 | 173 | # 検索エンジンを指定(ここではBingを使用) 174 | self.search_engine.set('bing') 175 | 176 | # 共通系の中間前処理を実行 177 | self.common_settings() 178 | 179 
| # 検索を実行 180 | data = self.search_engine.suggest( 181 | SEARCH_TEXT) 182 | 183 | print("{} count.".format(len(data))) 184 | self.assertNotEqual(0, len(data)) 185 | 186 | def test_bing_suggest_with_jap(self): 187 | print('Test Bing text suggest with jap.') 188 | 189 | # 検索エンジンを指定(ここではBingを使用) 190 | self.search_engine.set('bing') 191 | 192 | # 共通系の中間前処理を実行 193 | self.common_settings() 194 | 195 | # 検索を実行 196 | data = self.search_engine.suggest( 197 | SEARCH_TEXT, jap=True) 198 | 199 | print("{} count.".format(len(data))) 200 | self.assertNotEqual(0, len(data)) 201 | 202 | def test_bing_suggest_with_alph(self): 203 | print('Test Bing text suggest with alph.') 204 | 205 | # 検索エンジンを指定(ここではBingを使用) 206 | self.search_engine.set('bing') 207 | 208 | # 共通系の中間前処理を実行 209 | self.common_settings() 210 | 211 | # 検索を実行 212 | data = self.search_engine.suggest( 213 | SEARCH_TEXT, alph=True) 214 | 215 | self.assertNotEqual(0, len(data)) 216 | 217 | def test_bing_suggest_with_num(self): 218 | print('Test Bing text suggest with num.') 219 | 220 | # 検索エンジンを指定(ここではBingを使用) 221 | self.search_engine.set('bing') 222 | 223 | # 共通系の中間前処理を実行 224 | self.common_settings() 225 | 226 | # 検索を実行 227 | data = self.search_engine.suggest( 228 | SEARCH_TEXT, num=True) 229 | 230 | print("{} count.".format(len(data))) 231 | self.assertNotEqual(0, len(data)) 232 | 233 | # ========== 234 | # DuckDuckGo 235 | # ========== 236 | def test_duckduckgo_text_search(self): 237 | print('Test DuckDuckGo text search.') 238 | 239 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 240 | self.search_engine.set('duckduckgo') 241 | 242 | # 共通系の中間前処理を実行 243 | self.common_settings() 244 | 245 | # 検索を実行 246 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 247 | 248 | print("{} count.".format(len(data))) 249 | self.assertEqual(30, len(data)) 250 | 251 | def test_duckduckgo_image_search(self): 252 | print('Test DuckDuckGo image search.') 253 | 254 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 255 | self.search_engine.set('duckduckgo') 256 | 257 | # 共通系の中間前処理を実行 258 | self.common_settings() 259 | 260 | # 検索を実行 261 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 262 | 263 | print("{} count.".format(len(data))) 264 | self.assertEqual(30, len(data)) 265 | 266 | def test_duckduckgo_suggest(self): 267 | print('Test DuckDuckGo text suggest.') 268 | 269 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 270 | self.search_engine.set('duckduckgo') 271 | 272 | # 共通系の中間前処理を実行 273 | self.common_settings() 274 | 275 | # 検索を実行 276 | data = self.search_engine.suggest( 277 | SEARCH_TEXT) 278 | 279 | print("{} count.".format(len(data))) 280 | self.assertNotEqual(0, len(data)) 281 | 282 | def test_duckduckgo_suggest_with_jap(self): 283 | print('Test DuckDuckGo text suggest with jap.') 284 | 285 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 286 | self.search_engine.set('duckduckgo') 287 | 288 | # 共通系の中間前処理を実行 289 | self.common_settings() 290 | 291 | # 検索を実行 292 | data = self.search_engine.suggest( 293 | SEARCH_TEXT, jap=True) 294 | 295 | print("{} count.".format(len(data))) 296 | self.assertNotEqual(0, len(data)) 297 | 298 | def test_duckduckgo_suggest_with_alph(self): 299 | print('Test DuckDuckGo text suggest with alph.') 300 | 301 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 302 | self.search_engine.set('duckduckgo') 303 | 304 | # 共通系の中間前処理を実行 305 | self.common_settings() 306 | 307 | # 検索を実行 308 | data = self.search_engine.suggest( 309 | SEARCH_TEXT, alph=True) 310 | 311 | self.assertNotEqual(0, len(data)) 312 | 313 | def test_duckduckgo_suggest_with_num(self): 314 | print('Test DuckDuckGo text suggest 
with num.') 315 | 316 | # 検索エンジンを指定(ここではDuckDuckGoを使用) 317 | self.search_engine.set('duckduckgo') 318 | 319 | # 共通系の中間前処理を実行 320 | self.common_settings() 321 | 322 | # 検索を実行 323 | data = self.search_engine.suggest( 324 | SEARCH_TEXT, num=True) 325 | 326 | print("{} count.".format(len(data))) 327 | self.assertNotEqual(0, len(data)) 328 | 329 | # ========== 330 | # Google 331 | # ========== 332 | def test_google_text_search(self): 333 | print('Test Google text search.') 334 | 335 | # 検索エンジンを指定(ここではGoogleを使用) 336 | self.search_engine.set('google') 337 | 338 | # 共通系の中間前処理を実行 339 | self.common_settings() 340 | 341 | # 検索を実行 342 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 343 | 344 | print("{} count.".format(len(data))) 345 | self.assertEqual(30, len(data)) 346 | 347 | def test_google_image_search(self): 348 | print('Test Google image search.') 349 | 350 | # 検索エンジンを指定(ここではGoogleを使用) 351 | self.search_engine.set('google') 352 | 353 | # 共通系の中間前処理を実行 354 | self.common_settings() 355 | 356 | # 検索を実行 357 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 358 | 359 | print("{} count.".format(len(data))) 360 | self.assertEqual(30, len(data)) 361 | 362 | def test_google_suggest(self): 363 | print('Test Google text suggest.') 364 | 365 | # 検索エンジンを指定(ここではGoogleを使用) 366 | self.search_engine.set('google') 367 | 368 | # 共通系の中間前処理を実行 369 | self.common_settings() 370 | 371 | # 検索を実行 372 | data = self.search_engine.suggest( 373 | SEARCH_TEXT) 374 | 375 | print("{} count.".format(len(data))) 376 | self.assertNotEqual(0, len(data)) 377 | 378 | def test_google_suggest_with_jap(self): 379 | print('Test Google text suggest with jap.') 380 | 381 | # 検索エンジンを指定(ここではGoogleを使用) 382 | self.search_engine.set('google') 383 | 384 | # 共通系の中間前処理を実行 385 | self.common_settings() 386 | 387 | # 検索を実行 388 | data = self.search_engine.suggest( 389 | SEARCH_TEXT, jap=True) 390 | 391 | print("{} count.".format(len(data))) 392 | self.assertNotEqual(0, len(data)) 393 | 394 | def test_google_suggest_with_alph(self): 395 | print('Test Google text suggest with alph.') 396 | 397 | # 検索エンジンを指定(ここではGoogleを使用) 398 | self.search_engine.set('google') 399 | 400 | # 共通系の中間前処理を実行 401 | self.common_settings() 402 | 403 | # 検索を実行 404 | data = self.search_engine.suggest( 405 | SEARCH_TEXT, alph=True) 406 | 407 | self.assertNotEqual(0, len(data)) 408 | 409 | def test_google_suggest_with_num(self): 410 | print('Test Google text suggest with num.') 411 | 412 | # 検索エンジンを指定(ここではGoogleを使用) 413 | self.search_engine.set('google') 414 | 415 | # 共通系の中間前処理を実行 416 | self.common_settings() 417 | 418 | # 検索を実行 419 | data = self.search_engine.suggest( 420 | SEARCH_TEXT, num=True) 421 | 422 | print("{} count.".format(len(data))) 423 | self.assertNotEqual(0, len(data)) 424 | 425 | # ========== 426 | # Yahoo 427 | # ========== 428 | def test_yahoo_text_search(self): 429 | print('Test Yahoo text search.') 430 | 431 | # 検索エンジンを指定(ここではYahooを使用) 432 | self.search_engine.set('yahoo') 433 | 434 | # 共通系の中間前処理を実行 435 | self.common_settings() 436 | 437 | # 検索を実行 438 | data = self.search_engine.search(SEARCH_TEXT, maximum=30) 439 | 440 | print("{} count.".format(len(data))) 441 | self.assertEqual(30, len(data)) 442 | 443 | def test_yahoo_image_search(self): 444 | print('Test Yahoo image search.') 445 | 446 | # 検索エンジンを指定(ここではYahooを使用) 447 | self.search_engine.set('yahoo') 448 | 449 | # 共通系の中間前処理を実行 450 | self.common_settings() 451 | 452 | # 検索を実行 453 | data = self.search_engine.search(SEARCH_TEXT, type='image', maximum=30) 454 | 455 | 
print("{} count.".format(len(data))) 456 | self.assertEqual(30, len(data)) 457 | 458 | def test_yahoo_suggest(self): 459 | print('Test Yahoo text suggest.') 460 | 461 | # 検索エンジンを指定(ここではYahooを使用) 462 | self.search_engine.set('yahoo') 463 | 464 | # 共通系の中間前処理を実行 465 | self.common_settings() 466 | 467 | # 検索を実行 468 | data = self.search_engine.suggest( 469 | SEARCH_TEXT) 470 | 471 | print("{} count.".format(len(data))) 472 | self.assertNotEqual(0, len(data)) 473 | 474 | def test_yahoo_suggest_with_jap(self): 475 | print('Test Yahoo text suggest with jap.') 476 | 477 | # 検索エンジンを指定(ここではYahooを使用) 478 | self.search_engine.set('yahoo') 479 | 480 | # 共通系の中間前処理を実行 481 | self.common_settings() 482 | 483 | # 検索を実行 484 | data = self.search_engine.suggest( 485 | SEARCH_TEXT, jap=True) 486 | 487 | print("{} count.".format(len(data))) 488 | self.assertNotEqual(0, len(data)) 489 | 490 | def test_yahoo_suggest_with_alph(self): 491 | print('Test Yahoo text suggest with alph.') 492 | 493 | # 検索エンジンを指定(ここではYahooを使用) 494 | self.search_engine.set('yahoo') 495 | 496 | # 共通系の中間前処理を実行 497 | self.common_settings() 498 | 499 | # 検索を実行 500 | data = self.search_engine.suggest( 501 | SEARCH_TEXT, alph=True) 502 | 503 | self.assertNotEqual(0, len(data)) 504 | 505 | def test_yahoo_suggest_with_num(self): 506 | print('Test Yahoo text suggest with num.') 507 | 508 | # 検索エンジンを指定(ここではYahooを使用) 509 | self.search_engine.set('yahoo') 510 | 511 | # 共通系の中間前処理を実行 512 | self.common_settings() 513 | 514 | # 検索を実行 515 | data = self.search_engine.suggest( 516 | SEARCH_TEXT, num=True) 517 | 518 | print("{} count.".format(len(data))) 519 | self.assertNotEqual(0, len(data)) 520 | -------------------------------------------------------------------------------- /pydork/engine_google.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2023 Blacknon. All rights reserved. 4 | # Use of this source code is governed by an MIT license 5 | # that can be found in the LICENSE file. 6 | # ======================================================= 7 | 8 | 9 | """engine_google 10 | * Google用の検索用Classを持つモジュール. 11 | """ 12 | 13 | # import sys 14 | 15 | import json 16 | import os 17 | 18 | from time import sleep 19 | from json.decoder import JSONDecodeError 20 | from urllib import parse 21 | from lxml import etree 22 | # from bs4 import BeautifulSoup 23 | 24 | from .common import Color 25 | from .recaptcha import TwoCaptcha 26 | from .engine_common import CommonEngine 27 | 28 | 29 | # Google画像検索で使用するパラメータID 30 | RPC_ID = "HoAMBc" 31 | 32 | 33 | class Google(CommonEngine): 34 | """Google 35 | 36 | Google用の検索エンジン用Class. 37 | """ 38 | 39 | def __init__(self): 40 | # CommonEngineの処理を呼出し 41 | super().__init__() 42 | 43 | self.NAME = 'Google' 44 | self.COLOR = Color.PURPLE 45 | self.COLOR_NAME = self.COLOR + self.NAME + Color.END 46 | 47 | # リクエスト先のURLを指定 48 | self.ENGINE_TOP_URL = 'https://www.google.com/' 49 | self.SEARCH_URL = 'https://www.google.com/search' 50 | self.IMAGE_URL = 'https://www.google.com/_/VisualFrontendUi/data/batchexecute' 51 | self.SUGGEST_URL = 'http://www.google.com/complete/search' 52 | 53 | # 次の検索ページのURL(`self.get_nextpage_url`の処理で取得する) 54 | self.SEARCH_NEXT_URL = None 55 | 56 | # ReCaptcha画面かどうかの識別用 57 | self.SOUP_RECAPTCHA_TAG = '#captcha-form > #recaptcha' 58 | 59 | def gen_search_url(self, keyword: str, type: str): 60 | """gen_search_url 61 | 62 | 検索用のurlを生成する. 63 | 64 | Args: 65 | keyword (str): 検索クエリ. 66 | type (str): 検索タイプ. 
/pydork/engine_google.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # Copyright (c) 2023 Blacknon. All rights reserved.
4 | # Use of this source code is governed by an MIT license
5 | # that can be found in the LICENSE file.
6 | # =======================================================
7 | 
8 | 
9 | """engine_google
10 | * Module that holds the search class for Google.
11 | """
12 | 
13 | # import sys
14 | 
15 | import json
16 | import os
17 | 
18 | from time import sleep
19 | from json.decoder import JSONDecodeError
20 | from urllib import parse
21 | from lxml import etree
22 | # from bs4 import BeautifulSoup
23 | 
24 | from .common import Color
25 | from .recaptcha import TwoCaptcha
26 | from .engine_common import CommonEngine
27 | 
28 | 
29 | # Parameter ID used by Google image search
30 | RPC_ID = "HoAMBc"
31 | 
32 | 
33 | class Google(CommonEngine):
34 |     """Google
35 | 
36 |     Search engine class for Google.
37 |     """
38 | 
39 |     def __init__(self):
40 |         # Call CommonEngine's __init__
41 |         super().__init__()
42 | 
43 |         self.NAME = 'Google'
44 |         self.COLOR = Color.PURPLE
45 |         self.COLOR_NAME = self.COLOR + self.NAME + Color.END
46 | 
47 |         # URLs used for requests
48 |         self.ENGINE_TOP_URL = 'https://www.google.com/'
49 |         self.SEARCH_URL = 'https://www.google.com/search'
50 |         self.IMAGE_URL = 'https://www.google.com/_/VisualFrontendUi/data/batchexecute'
51 |         self.SUGGEST_URL = 'http://www.google.com/complete/search'
52 | 
53 |         # URL of the next search page (obtained in `self.get_nextpage_url`)
54 |         self.SEARCH_NEXT_URL = None
55 | 
56 |         # Used to detect the ReCaptcha page
57 |         self.SOUP_RECAPTCHA_TAG = '#captcha-form > #recaptcha'
58 | 
59 |     def gen_search_url(self, keyword: str, type: str):
60 |         """gen_search_url
61 | 
62 |         Generate the search URLs.
63 | 
64 |         Args:
65 |             keyword (str): search query.
66 |             type (str): search type.
67 | 
68 |         Returns:
69 |             generator: yields a (method, url, data) tuple per result page.
70 |         """
71 | 
72 |         search_url = ''
73 | 
74 |         if type == 'text':
75 |             # Set the search URL
76 |             search_url = self.SEARCH_URL
77 | 
78 |             # Set the search parameters
79 |             url_param = {
80 |                 'q': keyword,  # search keyword
81 |                 'oq': keyword,  # search keyword
82 |                 'num': 100,  # number of results per page
83 |                 'filter': 0,  # filtering of similar pages (0: disabled, 1: enabled)
84 |                 'nfpr': 1  # disable "did you mean" auto-correction
85 |             }
86 | 
87 |             # If lang/locale are set
88 |             if self.LANG != '' and self.LOCALE != '':
89 |                 url_param['hl'] = self.LANG
90 |                 url_param['gl'] = self.LOCALE
91 | 
92 |             # If a date range is set
93 |             try:
94 |                 start = self.RANGE_START
95 |                 end = self.RANGE_END
96 | 
97 |                 cd_min = start.strftime("%m/%d/%Y")
98 |                 cd_max = end.strftime("%m/%d/%Y")
99 | 
100 |                 # Add the date range to the GET parameters
101 |                 url_param['tbs'] = "cdr:1,cd_min:{0},cd_max:{1}".format(
102 |                     cd_min, cd_max)
103 | 
104 |             except AttributeError:
105 |                 None
106 | 
107 |             page = 0
108 |             while True:
109 |                 # Set the result offset for this page in the parameters
110 |                 url_param['start'] = str(page * 100)
111 |                 params = parse.urlencode(url_param)
112 | 
113 |                 target_url = search_url + '?' + params
114 | 
115 |                 yield 'GET', target_url, None
116 |                 page += 1
117 | 
118 |         elif type == 'image':
119 |             # Set the search URL
120 |             search_url = self.IMAGE_URL
121 | 
122 |             # Set the Referer
123 |             if not self.USE_SELENIUM:
124 |                 self.session.headers.update(
125 |                     {"Referer": "https://www.google.com/"}
126 |                 )
127 | 
128 |             # Set the search parameters
129 |             url_param = {
130 |                 'rpcids': 'HoAMBc',
131 |                 'hl': 'id',
132 |                 'authuser': '0',
133 |                 'soc-app': '162',
134 |                 'soc-platform': '1',
135 |                 'soc-device': '1',
136 |                 'rt': 'c'
137 |             }
138 | 
139 |             # Create the cursor-position parameters for the images
140 |             self.image_next_cursor = None
141 |             self.image_cursor = []
142 | 
143 |             page = 0
144 |             while True:
145 |                 # Build the POST data
146 |                 data = {
147 |                     "f.req": build_rpc_request(keyword, (self.image_cursor, self.image_next_cursor), page),
148 |                     "at": "ABrGKkQnVYg89U_cdKuhNZ5hM4vx:1616119655028",
149 |                     # "": "",
150 |                 }
151 | 
152 |                 params = parse.urlencode(url_param)
153 |                 target_url = search_url + '?' + params
154 | 
155 |                 yield 'POST', target_url, data
156 | 
157 |     def gen_suggest_url(self, keyword: str):
158 |         """gen_suggest_url
159 | 
160 |         Generate the URL for fetching suggestions.
161 | 
162 |         Args:
163 |             keyword (str): search query.
164 | 
165 |         Returns:
166 |             str: suggest URL
167 |         """
168 | 
169 |         url_param = {
170 |             'q': keyword,  # search keyword
171 |             'output': 'toolbar',
172 |             'ie': 'utf-8',
173 |             'oe': 'utf-8',
174 |         }
175 | 
176 |         params = parse.urlencode(url_param)
177 |         url = self.SUGGEST_URL + '?' + params
178 | 
179 |         return url
180 | 
181 |     def get_links(self, url: str, html: str, type: str):
182 |         """get_links
183 | 
184 |         Parse the given html and return the search results as a list.
185 | 
186 |         Args:
187 |             url (str): URL of the search results to parse.
188 |             html (str): html of the search results to parse.
189 |             type (str): search type ([text, image]).
190 | 
191 |         Returns:
192 |             list: search results (variable name: links). (`[{'title': 'title...', 'url': 'https://hogehoge....'}, {...}]`)
193 |         """
194 | 
195 |         # For text search
196 |         if type == 'text':
197 |             # Selector definitions (requests / selenium)
198 |             self.SOUP_SELECT_URL = '#main > div > div > .kCrYT > a'
199 |             self.SOUP_SELECT_TITLE = '#main > div > div > .kCrYT > a > h3 > div'
200 |             self.SOUP_SELECT_TEXT = '#main > div > div > .kCrYT > div > div > div > div > div'
201 |             self.SOUP_SELECT_NEXT_URL = ''
202 | 
203 |             # When communicating via Selenium
204 |             if self.USE_SELENIUM:
205 |                 self.SOUP_SELECT_URL = '.yuRUbf > div > span > a'
206 |                 self.SOUP_SELECT_TITLE = '.yuRUbf > div > span > a > h3'
207 |                 self.SOUP_SELECT_TEXT = '.yXK7lf'
208 |                 self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'
209 | 
210 |             # When communicating via Splash
211 |             elif self.USE_SPLASH:
212 |                 self.SOUP_SELECT_URL = '.yuRUbf > div > span > a.href'
213 |                 self.SOUP_SELECT_TITLE = '.yuRUbf > div > span > a > h3'
214 |                 self.SOUP_SELECT_TEXT = '.yXK7lf'
215 |                 self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'
216 | 
217 |             # TODO: Rewrite SEARCH_NEXT_URL
218 |             # self.get_nextpage_url(html)
219 | 
220 |             # Delegate to CommonEngine's handling
221 |             links = super().get_links(url, html, type)
222 | 
223 |         # For image search
224 |         elif type == 'image':
225 |             links = self.get_image_links(html)
226 | 
227 |         return links
228 | 
229 |     def get_image_links(self, html: str):
230 |         """get_image_links
231 | 
232 |         Parse the image search page and return the results.
233 |         Uses Selenium to scroll to the end of the page automatically and fetch the continuation.
234 |         Extraction takes quite a while because clicks and similar interactions are involved.
235 | 
236 |         Args:
237 |             html (str): html of the search results to parse.
238 | 
239 |         Returns:
240 |             list: search results (`[{'title': 'title...', 'link': 'https://hogehoge....'}, {...}]`)
241 | 
242 |         Reference:
243 |             - https://github.com/Wikidepia/py-googleimages/blob/b781b79e9bf40d29cf6fcbdcf625303abf3718bd/googleimages/client.py
244 |         """
245 | 
246 |         links = list()
247 | 
248 |         # Loop over the response line by line
249 |         for line in html.split("\n"):
250 |             if RPC_ID not in line:
251 |                 continue
252 | 
253 |             # Make it json readable
254 |             line_cl = line.replace("\\n", "")  # Remove \n
255 | 
256 |             lineson = json.loads(line_cl)
257 | 
258 |             data = pjson_loads(lineson[0][2])
259 | 
260 |             # Update the image cursor position
261 |             self.image_next_cursor = data[-2]
262 |             self.image_img_cursor = data[31][0][12][11][5]
263 | 
264 |             for img in data[31][0][12][2]:
265 |                 # Check the img value
266 |                 if img[1] is None:
267 |                     continue
268 | 
269 |                 link = img[1][3][0]  # URL of the image file
270 |                 title = img[1][9]['2003'][3]  # title of the page containing the image
271 |                 pagelink = img[1][9]['2003'][2]  # URL of the page containing the image
272 |                 links.append(
273 |                     {
274 |                         "link": link,
275 |                         "title": title,
276 |                         "pagelink": pagelink,
277 |                     }
278 |                 )
279 | 
280 |         return links
281 | 
282 |     def get_suggest_list(self, suggests: list, char: str, html: str):
283 |         """get_suggest_list
284 | 
285 |         Extract the suggestions from the html.
286 | 
287 |         Args:
288 |             suggests (list): the base container that suggestions are added to.
289 |             char (str): the suggest character.
290 |             html (str): html to parse.
291 | 
292 |         Returns:
293 |             dict: suggestions keyed by trailing character
294 |         """
295 | 
296 |         sug_root = etree.XML(html)
297 |         sug_data = sug_root.xpath("//suggestion")
298 |         data = [s.get("data") for s in sug_data]
299 | 
300 |         suggests[char if char == '' else char[-1]] = data  # type: ignore
301 | 
302 |         return suggests
303 | 
304 |     def processings_elist(self, elinks, etitles, etexts: list):
305 |         """processings_elist
306 | 
307 |         Post-process elinks and etitles right after extraction inside self.get_links.
308 | 
309 |         Args:
310 |             elinks (list): list of elinks (result links)
311 |             etitles (list): list of etitles (result titles)
312 |             etexts (list): list of etexts (result snippets)
313 | 
314 |         Returns:
315 |             elinks (list): list of elinks (result links)
316 |             etitles (list): list of etitles (result titles)
317 |             etexts (list): list of etexts (result snippets)
318 |         """
319 | 
320 |         # When not using Firefox via selenium, and not using splash
321 |         new_elinks = []
322 |         for elink in elinks:
323 |             parsed = parse.urlparse(elink)
324 |             parsed_query = parse.parse_qs(parsed.query)
325 | 
326 |             if 'url' in parsed_query and elink[0] == '/':
327 |                 parsed_q = parsed_query['url']
328 |                 if len(parsed_q) > 0:
329 |                     new_elink = parsed_q[0]
330 |                     new_elinks.append(new_elink)
331 |             else:
332 |                 new_elinks.append(elink)
333 |         elinks = list(dict.fromkeys(new_elinks))
334 | 
335 |         return elinks, etitles, etexts
336 | 
337 |     def bypass_recaptcha_selenium(self, url: str, html: str):
338 |         """bypass_recaptcha_selenium
339 | 
340 |         Bypass ReCaptcha with Selenium.
341 |         For automatic bypass via 2Captcha, cookies and a proxy are required.
342 | 
343 |         Args:
344 |             url (str): URL of the request that got redirected to the ReCaptcha page
345 |             html (str): html of the ReCaptcha page
346 | 
347 |         Returns:
348 |             str: html of the target URL after bypassing ReCaptcha
349 |         """
350 | 
351 |         # Pre-define result
352 |         result = None
353 | 
354 |         # Read the environment variable
355 |         TC_API_KEY = os.getenv('API_KEY_2CAPTCHA')
356 | 
357 |         # With Selenium (headless disabled), allow the bypass to be done manually
358 |         if self.IS_DISABLE_HEADLESS:
359 |             while True:
360 |                 # Get the current URL from Selenium
361 |                 current_url = self.driver.current_url
362 |                 current_url_parse = parse.urlparse(current_url)
363 | 
364 |                 # Check whether the path of current_url is `/sorry/index`
365 |                 current_url_path = current_url_parse.path
366 |                 if current_url_path != '/sorry/index':
367 |                     break
368 | 
369 |                 # Wait
370 |                 sleep(1)
371 | 
372 |             sleep(5)
373 | 
374 |             # Get the html of the current page (the page navigated to after ReCaptcha)
375 |             result = self.driver.page_source
376 | 
377 |         # When self.IS_DISABLE_HEADLESS is False and `API_KEY_2CAPTCHA` is defined
378 |         elif TC_API_KEY is not None:
379 |             # Create the solver
380 |             solver = TwoCaptcha(TC_API_KEY)
381 | 
382 |             # flag set
383 |             solver.set_debug(self.IS_DEBUG)
384 |             solver.set_command(self.IS_COMMAND)
385 |             solver.set_user_agent(self.USER_AGENT)
386 |             solver.set_messages(self.MESSAGE)
387 | 
388 |             # Get the response from the solver
389 |             code = solver.google_recaptcha(
390 |                 html=html,
391 |                 url=url,
392 |                 cookies=self.driver.get_cookies(),
393 |                 proxy=self.PROXY,
394 | 
395 |             )
396 | 
397 |             # If solving the ReCaptcha failed
398 |             if code is None:
399 |                 return result
400 | 
401 |             # Put the solution code into the designated textarea
402 |             self.driver.execute_script("""
403 |                 document.getElementById(
404 |                     "g-recaptcha-response").innerHTML = arguments[0]
405 |                 """, code)
406 | 
407 |             # Click the button
408 |             self.driver.execute_script(
409 |                 'var element=document.getElementById("g-recaptcha-response"); element.style.display="none";')
410 | 
411 |             self.driver.execute_script('submitCallback()')
412 | 
413 |             sleep(10)
414 | 
415 |             # Get the result
416 |             result = self.driver.page_source
417 | 
418 |         return result
419 | 
420 | 
421 | def build_rpc_request(keyword: str, cursor: list, page: int):
422 |     """build_rpc_request
423 | 
424 |     Build the rpc payload used by image search.
425 | 
426 |     Original:
427 |         https://github.com/Wikidepia/py-googleimages/blob/b781b79e9bf40d29cf6fcbdcf625303abf3718bd/googleimages/utils.py
428 | 
429 |     Args:
430 |         keyword (str): [description]
431 |         cursor (list): [description]
432 |         page (int): [description]
433 | 
434 |     Returns:
435 |         [type]: [description]
436 |     """
437 | 
438 |     RPC_ID = "HoAMBc"
439 | 
440 |     return json.dumps(
441 |         [
442 |             [
443 |                 [
444 |                     RPC_ID,
445 |                     json.dumps(
446 |                         [
447 |                             None,
448 |                             None,
449 |                             [
450 |                                 1,
451 |                                 None,
452 |                                 450,
453 |                                 1,
454 |                                 1280,
455 |                                 cursor[0],
456 |                                 [],
457 |                                 [],
458 |                                 None,
459 |                                 None,
460 |                                 None,
461 |                                 0,
462 |                                 310,
463 |                                 [],
464 |                             ],
465 |                             None,
466 |                             None,
467 |                             None,
468 |                             None,
469 |                             None,
470 |                             None,
471 |                             None,
472 |                             None,
473 |                             None,
474 |                             None,
475 |                             None,
476 |                             None,
477 |                             None,
478 |                             None,
479 |                             None,
480 |                             None,
481 |                             None,
482 |                             None,
483 |                             None,
484 |                             None,
485 |                             None,
486 |                             None,
487 |                             None,
488 |                             None,
489 |                             None,
490 |                             [
491 |                                 keyword,
492 |                                 None,
493 |                                 None,
494 |                                 "strict",
495 |                                 None,
496 |                                 None,
497 |                                 None,
498 |                                 None,
499 |                                 None,
500 |                                 None,
501 |                                 None,
502 |                                 None,
503 |                                 None,
504 |                                 None,
505 |                                 None,
506 |                                 None,
507 |                                 None,
508 |                                 None,
509 |                                 None,
510 |                                 None,
511 |                                 None,
512 |                                 "lnms",
513 |                             ],
514 |                             None,
515 |                             None,
516 |                             None,
517 |                             None,
518 |                             None,
519 |                             None,
520 |                             None,
521 |                             None,
522 |                             [
523 |                                 cursor[1],
524 |                                 "CAM=",
525 |                                 "CgtHUklEX1NUQVRFMBAaIAA=",
526 |                             ],
527 |                         ],
528 |                         separators=(",", ":"),
529 |                     ),
530 |                     None,
531 |                     "generic",
532 |                 ],
533 |             ]
534 |         ],
535 |         separators=(",", ":"),
536 |     )
537 | 
538 | 
539 | def pjson_loads(text):
540 |     """pjson_loads
541 | 
542 |     Tolerant JSON parser for the data used by image search.
543 | 
544 |     Original:
545 |         https://github.com/Wikidepia/py-googleimages/blob/b781b79e9bf40d29cf6fcbdcf625303abf3718bd/googleimages/utils.py
546 | 
547 |     Args:
548 |         text ([type]): [description]
549 | 
550 |     Returns:
551 |         [type]: [description]
552 |     """
553 |     while True:
554 |         try:
555 |             data = json.loads(text, strict=False)
556 |         except JSONDecodeError as exc:
557 |             if exc.msg == "Invalid \\escape":
558 |                 text = text[: exc.pos] + "\\" + text[exc.pos:]
559 |             else:
560 |                 raise
561 |         else:
562 |             return data
563 | 
--------------------------------------------------------------------------------
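Editor's note: gen_search_url above is a generator, so each next() yields the request for one result page; for text search the start parameter advances in steps of 100. A minimal sketch of inspecting the first two generated URLs without sending any request; the keyword is illustrative:

from pydork.engine_google import Google

engine = Google()
engine.set_lang('ja', 'JP')

gen = engine.gen_search_url('site:example.com', 'text')
for _ in range(2):
    method, url, data = next(gen)
    # First iteration: ...&start=0, second iteration: ...&start=100
    print(method, url, data)
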
/pydork/engine.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # Copyright (c) 2023 Blacknon. All rights reserved.
4 | # Use of this source code is governed by an MIT license
5 | # that can be found in the LICENSE file.
6 | # =======================================================
7 | 
8 | 
9 | # TODO: Add a hit number to the json output (for checking whether SEO measures are working)
10 | 
11 | """engine
12 | * Module for performing searches with SearchEngine
13 | """
14 | 
15 | 
16 | import os
17 | import pathlib
18 | import sys
19 | 
20 | from time import sleep
21 | from string import ascii_lowercase, digits
22 | from datetime import datetime
23 | 
24 | from .common import Color, Message
25 | from .common import set_counter
26 | from .engine_baidu import Baidu
27 | from .engine_bing import Bing
28 | from .engine_duckduckgo import DuckDuckGo
29 | from .engine_google import Google
30 | from .engine_yahoo import Yahoo
31 | 
32 | 
33 | # List of supported search engines
34 | ENGINES = ['baidu', 'bing', 'duckduckgo', 'google', 'yahoo']
35 | 
36 | 
37 | # Wrapper class that bundles the handling of the individual SearchEngine classes
38 | class SearchEngine:
39 |     """SearchEngine
40 | 
41 |     Wrapper class for performing a search against the specified search engine.
42 | 
43 |     Examples:
44 |         >>> search_engine = SearchEngine()
45 |         >>> search_engine.set('google')
46 |         >>>
47 |         >>> # Text search in the accepted query
48 |         >>> search_result = search_engine.search('zelda')
49 |         >>>
50 |         >>> # Image search in the accepted query
51 |         >>> search_result = search_engine.search('zelda', search_type='image')
52 |         >>>
53 |         >>> # Get Suggest in the accepted query
54 |         >>> search_result = search_engine.suggest('zelda')
55 |     """
56 | 
57 |     def __init__(self):
58 |         None
59 | 
60 |     # Specify which search engine to use
61 |     def set(self, engine: str):
62 |         """set
63 | 
64 |         A function that specifies which search engine to use.
65 | 
66 |         Args:
67 |             engine (str): Specify the search engine to use for the search (see const ENGINES)
68 |         """
69 | 
70 |         # TODO: Validate the value and raise an error for anything not allowed
71 |         if engine == 'baidu':
72 |             self.ENGINE = Baidu()
73 | 
74 |         elif engine == 'bing':
75 |             self.ENGINE = Bing()
76 | 
77 |         elif engine == 'duckduckgo':
78 |             self.ENGINE = DuckDuckGo()
79 | 
80 |         elif engine == 'google':
81 |             self.ENGINE = Google()
82 | 
83 |         elif engine == 'yahoo':
84 |             self.ENGINE = Yahoo()
85 | 
86 |         else:
87 |             raise Exception('Error!')
88 | 
89 |         self.IS_COLOR = False
90 | 
91 |         # Define the Message
92 |         self.MESSAGE = Message()
93 |         self.MESSAGE.set_engine(self.ENGINE.NAME, self.ENGINE.COLOR)
94 | 
95 |     # Pass a lock for multithreading (currently unused?)
96 |     def set_lock(self, lock):
97 |         """set_lock
98 | 
99 |         Function to pass lock for multithreading
100 | 
101 |         Args:
102 |             lock (threading.Lock): multithreading lock object
103 |         """
104 |         self.ENGINE.LOCK = lock
105 | 
106 |     # Enable the debug flag
107 |     def set_is_debug(self, is_debug: bool):
108 |         """set_is_debug
109 | 
110 |         set debug flag
111 | 
112 |         Args:
113 |             is_debug (bool): debug flag (Enable debug with `True`).
114 |         """
115 | 
116 |         self.ENGINE.IS_DEBUG = is_debug  # type: ignore
117 | 
118 |     # Enable the command flag (running in command mode)
119 |     def set_is_command(self, is_command: bool):
120 |         """set_is_command
121 | 
122 |         set command flag.
123 |         When the command flag is enabled, the contents used in the command will be output to the console.
124 | 
125 |         Args:
126 |             is_command (bool): command flag (Enable command mode with `True`).
127 |         """
128 |         self.ENGINE.IS_COMMAND = is_command  # type: ignore
129 | 
130 |     # Whether color output is enabled
131 |     def set_is_color(self, is_color: bool = False):
132 |         """set_is_color
133 | 
134 |         Specifies whether to display the output in color.
135 | 
136 |         Args:
137 |             is_color (bool): color flag (Enable color mode with `True`).
138 |         """
139 |         self.IS_COLOR = is_color
140 | 
141 |     # Enable the disable-headless flag (run Selenium with a visible browser window)
142 |     def set_disable_headless(self, disable_headless: bool):
143 |         """set_disable_headless
144 | 
145 |         Function to Disable Selenium's headless option.
146 |         Used when manually bypassing ReCaptcha or when debugging.
147 | 
148 |         Args:
149 |             disable_headless (bool): Disable Selenium headless option (disable with True)
150 | 
151 |         Examples:
152 |             >>> search_engine = SearchEngine()
153 |             >>> search_engine.set('google')
154 |             >>>
155 |             >>> # Set Selenium
156 |             >>> search_engine.set_selenium()
157 |             >>>
158 |             >>> # Disable headless mode
159 |             >>> search_engine.set_disable_headless(True)
160 |             >>>
161 |             >>> # Open browser and search query
162 |             >>> search_engine.search('mario')
163 | 
164 |         """
165 | 
166 |         self.ENGINE.IS_DISABLE_HEADLESS = disable_headless  # type: ignore
167 | 
168 |     # Build the cookie file path from the directory that holds the cookie files
169 |     def set_cookie_files(self, cookie_dir: str):
170 |         """set_cookie_files
171 | 
172 |         Function to specify and generate the cookie file name to be used by passing the directory to put the cookie file.
173 |         Currently, cookie files are only used with Selenium.
174 | 
175 |         Args:
176 |             cookie_dir (str): Directory path where cookie files are placed.
177 |         """
178 | 
179 |         # Convert to an absolute path
180 |         cookie_dir = pathlib.Path(cookie_dir).expanduser()  # type: ignore
181 |         cookie_dir = pathlib.Path(cookie_dir).resolve()  # type: ignore
182 | 
183 |         # Check existence and create the directory if it is missing
184 |         if not os.path.exists(cookie_dir):
185 |             # TODO: Handle the error case where a file, not a directory, exists at the path
186 | 
187 |             # Create the directory
188 |             os.mkdir(cookie_dir)
189 | 
190 |         # Switch the postfix according to the access method in use
191 |         postfix = ''
192 |         if self.ENGINE.USE_SELENIUM:
193 |             postfix = '_selenium'
194 |         elif self.ENGINE.USE_SPLASH:
195 |             postfix = '_splash'
196 |         else:
197 |             postfix = '_requests'
198 | 
199 |         # Build the path with the prefix and postfix attached
200 |         cookie_file = os.path.join(
201 |             cookie_dir, '.cookie_' + self.ENGINE.NAME.lower() + postfix)
202 | 
203 |         # Check existence and create the file if it is missing
204 |         if not os.path.exists(cookie_file):
205 |             open(cookie_file, 'a').close()
206 | 
207 |         # Set it on the ENGINE instance
208 |         self.ENGINE.COOKIE_FILE = cookie_file  # type: ignore
209 | 
210 |     # Specify whether cookies are deleted and recreated on every query
211 |     def set_cookie_files_delete(self, is_delete_cookie: bool):
212 |         """set_cookie_files_delete
213 | 
214 |         Function that specifies whether the cookie should be deleted and recreated each time the query is executed.
215 | 
216 |         Args:
217 |             is_delete_cookie (bool): delete flag.
218 |         """
219 | 
220 |         # Set it on the ENGINE instance
221 |         self.ENGINE.COOKIE_FILE_DELETE = is_delete_cookie  # type: ignore
222 | 
223 |     # Accept the language/country settings passed to the search engine
224 |     def set_lang(self, lang: str = "ja", locale: str = "JP"):
225 |         """set_lang
226 | 
227 |         Function to set the language / country specified by the search engine.
228 | 
229 |         Args:
230 |             lang (str): Language ([ja,en])
231 |             locale (str): Locale ([JP,US])
232 |         """
233 |         self.ENGINE.set_lang(lang, locale)
234 | 
235 |     # Specify the date range for the search
236 |     def set_range(self, start: datetime, end: datetime):
237 |         """set_range
238 | 
239 |         Specify the date of the search range.
240 | 
241 |         Args:
242 |             start (datetime): start time(datetime)
243 |             end (datetime): end time(datetime)
244 |         """
245 | 
246 |         self.ENGINE.set_range(start, end)
247 | 
248 |     # Accept the proxy settings
249 |     def set_proxy(self, proxy: str):
250 |         """set_proxy
251 | 
252 |         Set the proxy server to be used when searching.
253 | 
254 |         Args:
255 |             proxy (str): proxy uri(ex. socks5://localhost:11080, http://hogehoge:8080)
256 |         """
257 |         self.ENGINE.set_proxy(proxy)
258 | 
259 |     # Enable selenium
260 |     def set_selenium(self, uri: str = None, browser: str = None):  # type: ignore
261 |         """set_selenium
262 | 
263 |         Use Selenium (priority over Splash).
264 | 
265 |         Args:
266 |             uri (str, optional): Specify the `host:port` of Selenium (used when Selenium is started by docker etc.). Defaults to None.
267 |             browser (str, optional): Specify Browser to use with Selenium ([chrome, firefox]). Defaults to None.
268 |         """
269 | 
270 |         self.ENGINE.set_selenium(uri, browser)
271 | 
272 |     # Enable splash
273 |     def set_splash(self, splash_url: str):
274 |         """set_splash
275 | 
276 |         Use Splash (Selenium has priority).
277 | 
278 |         Args:
279 |             splash_url (str): Splash uri(ex: `localhost:8050`)
280 |         """
281 | 
282 |         self.ENGINE.set_splash(splash_url)
283 | 
284 |     # Accept the user_agent value
285 |     def set_user_agent(self, useragent: str = None):  # type: ignore
286 |         """set_user_agent
287 | 
288 |         Specify the UserAgent.
289 |         If not specified, FakeUA or hard-coded UserAgent will be used.
290 | 
291 | 
292 |         Args:
293 |             useragent (str, optional): useragent. Defaults to None.
294 |         """
295 | 
296 |         self.ENGINE.set_user_agent(useragent)
297 | 
298 |     # Disable ssl verification
299 |     def set_ignore_ssl(self, verify: bool):
300 |         """set_ignore_ssl
301 | 
302 |         Ignore ssl verify.
303 | 
304 |         Args:
305 |             verify (bool): bool.
306 |         """
307 |         self.ENGINE.set_ignore_ssl(verify)
308 | 
309 |     # Run a search
310 |     def search(self, keyword: str, search_type='text', maximum=100):
311 |         """search
312 | 
313 |         Search with a search engine.
314 | 
315 |         Args:
316 |             keyword (str): query.
317 |             search_type (str, optional): search type. text or image. Defaults to 'text'.
318 |             maximum (int, optional): Max count of searches. Defaults to 100.
319 | 
320 |         Returns:
321 |             [list]: [{'link': 'http://...', 'title': 'hogehoge...'}, {'link': '...', 'title': '...'}, ... ]
322 |         """
323 | 
324 |         # Pass is_command/is_debug to ENGINE.MESSAGE
325 |         self.MESSAGE.set_is_command(self.ENGINE.IS_COMMAND)
326 |         self.MESSAGE.set_is_debug(self.ENGINE.IS_DEBUG)
327 | 
328 |         # Set header
329 |         header = '[${ENGINE_NAME}Search]'
330 |         if self.IS_COLOR:
331 |             sc = Color(self.ENGINE.COLOR)
332 |             header = sc.out(header)
333 |         self.MESSAGE.set_header(header)
334 | 
335 |         # Pass Message() to ENGINE
336 |         self.ENGINE.set_messages(self.MESSAGE)
337 | 
338 |         if self.ENGINE.LANG == "" and self.ENGINE.LOCALE == "":
339 |             self.set_lang()
340 | 
341 |         # Output a message (only when run as a command)
342 |         colored_keyword = self.ENGINE.MESSAGE.ENGINE_COLOR.out(keyword)
343 |         self.ENGINE.MESSAGE.print_text(
344 |             "$ENGINE: {} Search: {}".format(
345 |                 search_type.capitalize(), colored_keyword),
346 |             use_header=False,
347 |             file=sys.stderr
348 | 
349 |         )
350 |         result, total = [], 0
351 | 
352 |         # If maximum is 0, zero results would be returned, so return as is
353 |         if maximum == 0:
354 |             return result
355 | 
356 |         # Set the ENGINE's proxy and browser options per access method (Selenium, Splash, requests) and create the browser (session)
357 |         self.ENGINE.create_session()
358 | 
359 |         # Start the search
360 |         gen_url = self.ENGINE.gen_search_url(keyword, search_type)
361 |         while True:
362 |             # Get the request URL
363 |             try:
364 |                 method, url, data = next(gen_url)
365 |             except Exception:
366 |                 break
367 | 
368 |             # debug
369 |             self.ENGINE.MESSAGE.print_text(
370 |                 url,
371 |                 mode='debug',
372 |                 separator=": ",  # type: ignore
373 |                 header=self.ENGINE.MESSAGE.HEADER + ': ' + \
374 |                 Color.GRAY + '[DEBUG]: [TargetURL]' + Color.END
375 |             )
376 | 
377 |             # debug
378 |             self.ENGINE.MESSAGE.print_text(
379 |                 self.ENGINE.USER_AGENT,
380 |                 mode='debug',
381 |                 separator=": ",  # type: ignore
382 |                 header=self.ENGINE.MESSAGE.HEADER + ': ' + \
383 |                 Color.GRAY + '[DEBUG]: [UserAgent]' + Color.END
384 |             )
385 | 
386 |             # Fetch the search results
387 |             html = self.ENGINE.get_result(
388 |                 url, method=method, data=data)  # type: ignore
389 | 
390 |             # debug
391 |             self.ENGINE.MESSAGE.print_text(
392 |                 html,
393 |                 mode='debug',
394 |                 separator=": ",  # type: ignore
395 |                 header=self.ENGINE.MESSAGE.HEADER + ': ' + \
396 |                 Color.GRAY + '[DEBUG]: [Response]' + Color.END
397 |             )
398 | 
399 |             # Initial value
400 |             is_recaptcha = False
401 | 
402 |             while True:
403 |                 # Check whether this is a ReCaptcha page
404 |                 if html is not None:
405 |                     is_recaptcha = self.ENGINE.check_recaptcha(html)
406 |                 else:
407 |                     break
408 | 
409 |                 if is_recaptcha:
410 |                     # Output when running as a command
411 |                     self.ENGINE.MESSAGE.print_text(
412 |                         'Oh, redirected to the ReCaptcha window.',
413 |                         mode='warn',
414 |                         header=self.ENGINE.MESSAGE.ENGINE,
415 |                         separator=": "
416 |                     )
417 | 
418 |                     # When a headless browser is in use
419 |                     if self.ENGINE.USE_SELENIUM or self.ENGINE.USE_SPLASH:
420 |                         # Hand off to the bypass function
421 |                         html = self.ENGINE.bypass_recaptcha(
422 |                             url, html)  # type: ignore
423 | 
424 |                         if html is not None:
425 |                             # debug
426 |                             self.ENGINE.MESSAGE.print_text(
427 |                                 html,
428 |                                 mode='debug',  # type: ignore
429 |                                 header=self.ENGINE.MESSAGE.HEADER + ': ' + Color.GRAY + \
430 |                                 '[DEBUG]: [ReCaptchaedResponse]' + Color.END,
431 |                                 separator=": "
432 |                             )
433 | 
434 |                     else:
435 |                         # Without a headless browser, output an error message saying ReCaptcha is not supported
436 |                         None
437 | 
438 |                 else:  # If is_recaptcha is False, break out of the while loop
439 |                     break
440 | 
441 |             # If html is None and the ReCaptcha check returned True
442 |             if html is None and is_recaptcha:
443 |                 # Output when running as a command
444 |                 self.ENGINE.MESSAGE.print_text(
445 |                     'Failed ReCaptcha. exit process.',
446 |                     mode='warn',
447 |                     header=self.ENGINE.MESSAGE.ENGINE,
448 |                     separator=": "
449 |                 )
450 | 
451 |                 break
452 | 
453 |             # TODO: Also pass result to the function and have it check for duplicates
454 |             # Parse the search results and get the list of URLs
455 |             links = self.ENGINE.get_links(
456 |                 url, html, search_type)  # type: ignore
457 | 
458 |             # Act according to the number of links
459 |             if not len(links):
460 |                 # Output when running as a command
461 |                 self.ENGINE.MESSAGE.print_text(
462 |                     'No more links.',
463 |                     header=self.ENGINE.MESSAGE.ENGINE,
464 |                     separator=": ",
465 |                     file=sys.stderr,
466 |                 )
467 | 
468 |                 # Break out of the loop
469 |                 if self.ENGINE.NAME == "Google":
470 |                     if self.ENGINE.SEARCH_NEXT_URL is None:  # type: ignore
471 |                         break
472 |                 else:
473 |                     break
474 | 
475 |             # If more than maximum would be returned, add links up to that count and break
476 |             elif len(links) > maximum - total:
477 |                 result += links[:maximum - total]
478 |                 break
479 | 
480 |             # TODO: Find a way to factor this out as Bing-only handling
481 |             elif len(links) < 10 and self.ENGINE.NAME == "Bing":
482 |                 # For Bing, the next page appears even below the count, so break on the count
483 |                 result += links[:maximum - total]
484 |                 break
485 | 
486 |             else:
487 |                 result += links
488 |                 total += len(links)
489 | 
490 |             # Wait 3 seconds, since consecutive requests cause problems
491 |             sleep(3)
492 | 
493 |         # Assign result numbers
494 |         result = set_counter(result)
495 | 
496 |         # Output when running as a command
497 |         self.ENGINE.MESSAGE.print_text(
498 |             # type: ignore
499 |             'Finally got ' + self.ENGINE.COLOR + \
500 |             str(len(result)) + Color.END + ' links.',
501 |             header=self.ENGINE.MESSAGE.ENGINE,
502 |             separator=": ",
503 |             file=sys.stderr,
504 |         )
505 | 
506 |         # save cookies
507 |         if self.ENGINE.COOKIE_FILE != '':
508 |             self.ENGINE.write_cookies()
509 | 
510 |         # delete cookie file
511 |         if self.ENGINE.COOKIE_FILE_DELETE:
512 |             os.remove(self.ENGINE.COOKIE_FILE)
513 | 
514 |         # Close the session
515 |         self.ENGINE.close_session()
516 | 
517 |         return result
518 | 
519 |     # Fetch suggestions
520 |     def suggest(self, keyword: str, jap=False, alph=False, num=False):
521 |         """suggest
522 | 
523 |         get suggest with a search engine.
524 | 
525 |         Args:
526 |             keyword (str): query
527 |             jap (bool, optional): with japanese char. Defaults to False.
528 |             alph (bool, optional): with alphabet char. Defaults to False.
529 |             num (bool, optional): with number. Defaults to False.
530 | 
531 |         Returns:
532 |             [dict]: {'with char': ['suggest1', 'suggest2' ...]}
533 |         """
534 | 
535 |         # Set the ENGINE's proxy and browser options per access method (Selenium, Splash, requests) and create the browser (session)
536 |         self.ENGINE.create_session()
537 | 
538 |         # Build the character list
539 |         chars = ['', ' ']
540 | 
541 |         # If the jap flag is set, append Japanese characters to the keyword when fetching suggestions
542 |         chars += [' ' + chr(i) for i in range(12353, 12436)] if jap else []
543 | 
544 |         # If the alph flag is set, append alphabet characters to the keyword when fetching suggestions
545 |         chars += [' ' + char for char in ascii_lowercase] if alph else []
546 | 
547 |         # If the num flag is set, append digits to the keyword when fetching suggestions
548 |         chars += [' ' + char for char in digits] if num else []
549 | 
550 |         # Fetch the suggestions
551 |         suggests = {}
552 |         for char in chars:
553 |             word = keyword + char
554 |             url = self.ENGINE.gen_suggest_url(word)
555 |             html = self.ENGINE.get_result(url)
556 | 
557 |             # TODO: json/text conversion needs to be implemented separately for each engine
558 |             suggests = self.ENGINE.get_suggest_list(
559 |                 suggests, char, html)  # type: ignore
560 | 
561 |             sleep(0.5)
562 | 
563 |         # Close the session
564 |         self.ENGINE.close_session()
565 | 
566 |         return suggests
567 | 
--------------------------------------------------------------------------------
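Editor's note: a minimal sketch tying together the SearchEngine setters defined above; the engine name, dates, query, and result count are illustrative values, and whether an engine honors the date range depends on its own URL generator. Per the search() docstring, each result dict carries at least 'link' and 'title':

from datetime import datetime

from pydork.engine import SearchEngine

search_engine = SearchEngine()
search_engine.set('bing')
search_engine.set_is_command(True)

# Restrict results to pages from calendar year 2023.
search_engine.set_range(datetime(2023, 1, 1), datetime(2023, 12, 31))

for item in search_engine.search('pydork', search_type='text', maximum=10):
    print(item['title'], item['link'])
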
44 | """ 45 | 46 | # Class作成時の処理 47 | def __init__(self): 48 | # headless browserの利用有無フラグ(デフォルト: False) 49 | self.USE_SELENIUM = False 50 | self.USE_SPLASH = False 51 | 52 | # 初期値の作成 53 | self.LOCK = None 54 | self.COOKIE_FILE = '' 55 | self.COOKIE_FILE_DELETE = False 56 | self.SPLASH_URI = '' 57 | self.PROXY = '' 58 | self.USER_AGENT = '' 59 | self.LANG = '' 60 | self.LOCALE = '' 61 | self.IS_DEBUG = False 62 | self.IS_COMMAND = False 63 | self.IS_DISABLE_HEADLESS = False 64 | self.MESSAGE: Message 65 | self.IGNORE_SSL_VERIFY = False 66 | 67 | # ReCaptcha画面かどうかの識別用(初期値(ブランク)) 68 | self.RECAPTCHA_SITEKEY = '' 69 | self.SOUP_RECAPTCHA_TAG = '' 70 | self.SOUP_RECAPTCHA_SITEKEY = '' 71 | 72 | # 検索エンジンにわたす言語・国の設定を受け付ける 73 | def set_lang(self, lang: str, locale: str): 74 | """set_lang 75 | 76 | 検索エンジンで指定する言語・国の設定を行う関数 77 | 78 | Args: 79 | lang (str): 検索エンジンのパラメータで指定する言語を指定する([ja,en]) 80 | locale (str): 検索エンジンのパラメータで指定する国を指定する([JP,US]) 81 | """ 82 | 83 | self.LANG = lang 84 | self.LOCALE = locale 85 | 86 | # 検索時の日時範囲を指定 87 | def set_range(self, start: datetime, end: datetime): 88 | """set_range 89 | 90 | 検索エンジンで指定する日付範囲を指定する 91 | 92 | Args: 93 | start (datetime): 検索対象ページの対象範囲開始日時(datetime) 94 | end (datetime): 検索対象ページの対象範囲終了日時(datetime) 95 | """ 96 | self.RANGE_START = start 97 | self.RANGE_END = end 98 | 99 | # user_agentの設定値を受け付ける(引数がない場合はランダム。Seleniumの際は自動的に使用したbrowserのagentを指定) 100 | def set_user_agent(self, user_agent: str = None, browser: str = None): # type: ignore 101 | """set_user_agent 102 | 103 | user_agentの値を受け付ける. 104 | user_agentの指定がない場合、 Chromeを使用したものとする. 105 | また、もし`browser`が指定されている場合はそのブラウザのUser Agentを指定する. 106 | 107 | 注) seleniumを利用する場合、事前に有効にする必要がある。 108 | 109 | Args: 110 | user_agent (str, optional): User Agentを指定する. Defaults to None. 111 | browser (str, optional): Seleniumで使用するBrowserを指定する([chrome, firefox]). Defaults to None. 112 | """ 113 | 114 | if user_agent is None: 115 | # seleniumが有効になっている場合、そのままSeleniumで利用するブラウザのUAを使用する 116 | if self.USE_SELENIUM: 117 | user_agent = '' 118 | else: 119 | try: 120 | ua = UserAgent(verify_ssl=False, use_cache_server=True) 121 | if user_agent is None: 122 | if browser is None: 123 | user_agent = ua.firefox 124 | 125 | elif browser == 'chrome': 126 | user_agent = ua.chrome 127 | 128 | elif browser == 'firefox': 129 | user_agent = ua.chrome 130 | 131 | except Exception: 132 | user_agent = 'Mozilla/5.0 (Linux; Android 10; SM-A205U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Mobile Safari/537.36.' 133 | 134 | self.USER_AGENT = user_agent 135 | 136 | # seleniumを有効にする 137 | # - splashより優先 138 | # - host, browserは、指定がない場合はそれぞれデフォルト設定(hostは指定なし、browserはchrome)での動作 139 | # - browserは `chrome` or `firefox` のみ受け付ける 140 | def set_selenium(self, uri: str = None, browser: str = None): # type: ignore 141 | """set_selenium 142 | 143 | 検索時にSelenium経由で通信を行う. 144 | 他のHeadless Browserと比較して最優先(Splash等が有効でもこちらが優先される). 145 | 146 | Args: 147 | uri (str, optional): APIのURIを指定(localhost:4444). Defaults to None. 148 | browser (str, optional): 使用するブラウザを指定([chrome, firefox]). Defaults to None. 
149 | """ 150 | 151 | # 入力値検証(browser: chrome or firefox) 152 | if browser is None: 153 | browser = 'chrome' 154 | 155 | # USE_SELENIUM to True 156 | self.USE_SELENIUM = True 157 | self.SELENIUM_URI = uri 158 | self.SELENIUM_BROWSER = browser 159 | 160 | # proxyの設定を受け付ける 161 | def set_proxy(self, proxy: str): 162 | """set_proxy 163 | 164 | 検索時に使用するProxyを指定する(uri指定) 165 | 166 | Args: 167 | proxy (str): ProxyのURIを指定する(socks5://localhost:11080, http://hogehoge:8080) 168 | """ 169 | 170 | self.PROXY = proxy 171 | 172 | # splash urlの値を受け付ける 173 | def set_splash(self, splash_url: str): 174 | """set_splash 175 | 176 | 検索時にSplashを有効にする. 177 | (Seleniumと同時に有効化されている場合、Seleniumを優先する) 178 | 179 | Args: 180 | splash_url (str): Splashのアクセス先URIを指定する(ex: `localhost:8050`) 181 | """ 182 | 183 | self.USE_SPLASH = True 184 | self.SPLASH_URI = splash_url 185 | 186 | # common.Messageを受け付ける 187 | def set_messages(self, message: Message): 188 | self.MESSAGE = message 189 | 190 | # sslのチェックを無効にする 191 | def set_ignore_ssl(self, verify: bool): 192 | self.IGNORE_SSL_VERIFY = verify 193 | 194 | # cookieをcookiefileから取得する 195 | def read_cookies(self): 196 | """read_cookies 197 | 198 | `self.COOKIE_FILE` からcookieを読み込む. 199 | 現時点ではSeleniumでのみ動作. 200 | """ 201 | 202 | # cookieファイルが存在しない場合、空ファイルで作成する 203 | exist_cookie_file = os.path.isfile(self.COOKIE_FILE) 204 | if not exist_cookie_file: 205 | cookie_file = open(self.COOKIE_FILE, 'w') 206 | cookie_file.write('') 207 | cookie_file.close() 208 | 209 | # cookieファイルのサイズを取得 210 | file_size = os.path.getsize(self.COOKIE_FILE) 211 | 212 | # cookieファイルのサイズが0以上の場合 213 | if file_size > 0: 214 | # cookie fileからcookieの取得 215 | cookies = pickle.load(open(self.COOKIE_FILE, "rb")) 216 | 217 | # seleniumを使う場合 218 | if self.USE_SELENIUM: 219 | # 事前アクセスが必要になるため、検索対象ドメインのTOPページにアクセスしておく 220 | self.driver.get(self.ENGINE_TOP_URL) # type: ignore 221 | 222 | # cookieを設定していく 223 | for cookie in cookies: 224 | try: 225 | self.driver.add_cookie(cookie) 226 | except Exception: 227 | pass 228 | 229 | # splashを使う場合 230 | elif self.USE_SPLASH: 231 | # NOTE: 動作しないためコメントアウト 232 | # TODO: 確認して修正 233 | # self.session.cookies.update(cookies) 234 | None 235 | 236 | # requestを使う場合 237 | else: 238 | # NOTE: 動作しないためコメントアウト 239 | # TODO: 確認して修正 240 | # self.session.cookies.update(cookies) 241 | None 242 | 243 | # cookieをcookiefileに書き込む 244 | def write_cookies(self): 245 | """write_cookies 246 | 247 | cookiesを `self.COOKIE_FILE` に書き込む. 248 | 249 | """ 250 | 251 | cookies = None 252 | 253 | # seleniumを使う場合 254 | if self.USE_SELENIUM: 255 | cookies = self.driver.get_cookies() 256 | 257 | # splashを使う場合 258 | elif self.USE_SPLASH: 259 | cookies = self.session.cookies 260 | 261 | # requestを使う場合 262 | else: 263 | cookies = self.session.cookies 264 | 265 | # cookieを書き込み 266 | with open(self.COOKIE_FILE, 'wb') as f: 267 | pickle.dump(cookies, f) 268 | 269 | # seleniumのOptionsを作成 270 | def create_selenium_options(self): 271 | """create_selenium_options 272 | 273 | Seleniumのoptionsを生成して返す. 274 | 275 | Returns: 276 | Options: 指定されたブラウザに応じたSeleniumのOptionsを返す. 
    # Build the Selenium Options
    def create_selenium_options(self):
        """create_selenium_options

        Generate and return the Selenium options.

        Returns:
            Options: Selenium Options for the configured browser.
        """

        # Per-browser handling
        if self.SELENIUM_BROWSER == 'chrome':
            options = ChromeOptions()

            # set ssl verify
            if self.IGNORE_SSL_VERIFY:
                options.add_argument('ignore-certificate-errors')

        elif self.SELENIUM_BROWSER == 'firefox':
            options = FirefoxOptions()

        # set headless option
        if not self.IS_DISABLE_HEADLESS:
            options.add_argument('--headless')

        # set user_agent option
        if self.USER_AGENT != '':
            options.add_argument('--user-agent=%s' % self.USER_AGENT)

        return options

    # Create the Selenium driver
    def create_selenium_driver(self):
        """create_selenium_driver

        Create the driver used by Selenium.
        The Options are also created in this function.
        """

        # Get the options
        options = self.create_selenium_options()

        # Create the driver for the configured browser
        if self.SELENIUM_BROWSER == 'chrome':
            # Add the proxy
            if self.PROXY != '':
                options.add_argument('--proxy-server=%s' % self.PROXY)

            try:
                chromedriver_autoinstaller.install()
            except Exception:
                pass

            self.driver = Chrome(options=options)

        elif self.SELENIUM_BROWSER == 'firefox':
            # Create the profile
            profile = webdriver.FirefoxProfile()
            profile.set_preference('devtools.jsonview.enabled', False)
            profile.set_preference('plain_text.wrap_long_lines', False)
            profile.set_preference('view_source.wrap_long_lines', False)

            # Add the proxy
            if self.PROXY != '':
                # Parse self.PROXY
                parsed_uri = parse.urlparse(self.PROXY)

                # socks4 / socks5 (the two branches differ only in the version number)
                if parsed_uri.scheme in ('socks4', 'socks5'):
                    socks_version = 5 if parsed_uri.scheme == 'socks5' else 4

                    # Add the proxy preferences
                    profile.set_preference('network.proxy.type', 1)
                    profile.set_preference(
                        'network.proxy.socks_version', socks_version)
                    profile.set_preference(
                        'network.proxy.socks', parsed_uri.hostname)
                    profile.set_preference(
                        'network.proxy.socks_port', parsed_uri.port)
                    profile.set_preference('network.proxy.no_proxies_on', '')
                    profile.set_preference(
                        'network.proxy.socks_remote_dns', True)
                    profile.update_preferences()

            # set ssl verify (Firefox handles this via the profile, so it lives here)
            if self.IGNORE_SSL_VERIFY:
                profile.accept_untrusted_certs = True

            try:
                geckodriver_autoinstaller.install()
            except Exception:
                pass
            self.driver = Firefox(options=options, firefox_profile=profile)

        # Record the User Agent the driven browser actually reports
        user_agent = self.driver.execute_script("return navigator.userAgent")
        self.set_user_agent(user_agent)

        return
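    # Parsing sketch (illustrative comment, not part of the original source):
    # `parse.urlparse` splits the proxy URI into exactly the pieces fed to the
    # Firefox preferences above, e.g.:
    #
    #     parsed = parse.urlparse('socks5://localhost:11080')
    #     parsed.scheme    # 'socks5'
    #     parsed.hostname  # 'localhost'
    #     parsed.port      # 11080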
    # Send a request through Selenium
    def request_selenium(self, url: str, method='GET', data=None):
        """request_selenium

        Send a GET/POST request through Selenium and return the result as html (a string).

        Args:
            url (str): url to request.
            method (str): request method.
            data (str): data used with the POST method.

        Returns:
            str: html string.
        """

        if method == 'GET':
            self.driver.get(url)

            # Wait until the document has finished loading
            WebDriverWait(self.driver, 15).until(
                lambda driver: driver.execute_script(
                    'return document.readyState') == 'complete')

            # Give slower engines a generous implicit wait for element lookups
            if self.NAME in ('Bing', 'Baidu', 'DuckDuckGo'):  # type: ignore
                self.driver.implicitly_wait(20)

            # get result
            result = self.driver.page_source

        elif method == 'POST':
            # POST via the driver's requests-style `request()` helper
            response = self.driver.request('POST', url, data=data)

            # Wait until the document has finished loading
            WebDriverWait(self.driver, 15).until(
                lambda driver: driver.execute_script(
                    'return document.readyState') == 'complete')

            # Give slower engines a generous implicit wait for element lookups
            if self.NAME in ('Bing', 'Baidu', 'DuckDuckGo'):  # type: ignore
                self.driver.implicitly_wait(20)

            # get result
            result = response.text

        return result

    # Send a request through Splash
    def request_splash(self, url: str, method='GET', data=None):
        """request_splash

        Send a GET/POST request through Splash and return the result as html (a string).

        Args:
            url (str): url to request.
            method (str): request method.
            data (str): data used with the POST method.

        Returns:
            str: html string.
        """

        # Build the url
        splash_url = 'http://' + self.SPLASH_URI + '/render.html'

        # param
        params = {
            'url': url
        }

        # When a proxy is configured
        if self.PROXY != '':
            params['proxy'] = self.PROXY

        # Send the request and get the response
        if method == 'GET':
            result = self.session.get(splash_url, params=params).text

        # NOTE: Google image search POSTs cannot be rendered by Splash, so as a special case use requests directly.
        # TODO: rewrite once Splash can render it.
        elif method == 'POST' and self.NAME == 'Google' and self.IMAGE_URL in url:  # type: ignore
            # create session
            session = requests.session()

            # Set the proxy
            if self.PROXY != '':
                proxies = {
                    'http': self.PROXY,
                    'https': self.PROXY
                }
                session.proxies = proxies

            # Set the user-agent
            if self.USER_AGENT != '':
                session.headers.update(
                    {
                        'User-Agent': self.USER_AGENT,
                        'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3'
                    }
                )

            result = session.post(url, data=data).text

        elif method == 'POST':
            headers = {'Content-Type': 'application/json'}
            params['http_method'] = 'POST'
            params['body'] = parse.urlencode(data)  # type: ignore

            result = self.session.post(
                splash_url,
                headers=headers,
                json=params
            ).text

        return result
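    # Splash sketch (illustrative comment, not part of the original source):
    # render.html is Splash's plain HTML-rendering endpoint, so outside this
    # class the same GET boils down to (assuming a local Splash on :8050):
    #
    #     import requests
    #     html = requests.get('http://localhost:8050/render.html',
    #                         params={'url': 'https://example.com'}).text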
    # Create/configure the headless browser (Selenium/Splash) or requests session, and load cookies
    def create_session(self):
        """create_session

        Create the driver or session for the configured transport (depending on whether a headless browser such as Selenium is used).
        Loading cookies and setting the proxy, when required, also happen in this function.
        """

        # When using Selenium
        if self.USE_SELENIUM:
            self.create_selenium_driver()

        # When using Splash
        elif self.USE_SPLASH:
            # create session
            self.session = requests.session()

            # Set the user-agent
            if self.USER_AGENT != '':
                self.session.headers.update(
                    {
                        'User-Agent': self.USER_AGENT,
                        'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3'
                    }
                )

        # When using requests
        else:
            # create session
            self.session = requests.session()

            # Raise the redirect limit to 60 (for Baidu)
            self.session.max_redirects = 60

            # Set the proxy
            if self.PROXY != '':
                proxies = {
                    'http': self.PROXY,
                    'https': self.PROXY
                }
                self.session.proxies = proxies

            # Set the user-agent
            if self.USER_AGENT != '':
                self.session.headers.update(
                    {
                        'User-Agent': self.USER_AGENT,
                        'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3'
                    }
                )

        # If a cookie file is configured, load it
        if self.COOKIE_FILE != '':
            self.read_cookies()

        return

    # Close the session
    def close_session(self):
        if self.USE_SELENIUM:
            self.driver.quit()
        else:
            self.session.close()

    # Send a request and get the html (a wrapper that dispatches to selenium/splash/requests)
    def get_result(self, url: str, method='GET', data=None):
        """get_result

        Send a request to the url via the configured transport and return the html as a string.

        Args:
            url (str): url to request.
            method (str): request method.
            data (str): data used with the POST method.

        Returns:
            str: html string.
        """

        # Priority 1: access via Selenium
        if self.USE_SELENIUM:
            result = self.request_selenium(url, method=method, data=data)

            # NOTE: browser scrolling via Selenium. No longer needed for Google and it
            #       only slowed things down, so it is commented out for now.
            # for i in range(0, 10):
            #     self.driver.execute_script(
            #         "window.scrollTo(0,document.body.scrollHeight)"
            #     )
            #     time.sleep(0.5)

        # Priority 2: access via Splash (when Selenium is not enabled)
        elif self.USE_SPLASH:
            result = self.request_splash(url, method=method, data=data)

        # Priority 3: request via requests.session (when neither Selenium nor Splash is enabled)
        else:
            if method == 'GET':
                result = self.session.get(
                    url, verify=not self.IGNORE_SSL_VERIFY).text
            elif method == 'POST':
                result = self.session.post(
                    url, verify=not self.IGNORE_SSL_VERIFY, data=data).text

        return result

    # Generate the search url
    def gen_search_url(self, keyword: str, type: str):
        """gen_search_url

        Generate the url used for searching.
        Meant to be overridden by each search engine.

        Args:
            keyword (str): search query.
            type (str): search type.

        Returns:
            str: request method.
            dict: search url.
            dict: data.
        """

        result = {}
        return 'GET', result, None
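    # Override sketch (illustrative comment, not part of the original source):
    # a concrete engine returns the method, the fully-built query url(s), and
    # any POST body. The exact dict layout is engine-specific; a hypothetical
    # text-search override might look like:
    #
    #     def gen_search_url(self, keyword: str, type: str):
    #         search_url = 'https://www.example-engine.test/search?' + \
    #             parse.urlencode({'q': keyword, 'hl': self.LANG})
    #         return 'GET', {'url': search_url}, None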
    # Aggregating function that extracts links from text/image search results
    def get_links(self, source_url, html: str, type: str):
        """get_links

        Parse the given html and return the search results as a processed list.

        Args:
            source_url (str): url of the search results being parsed.
            html (str): html of the search results being parsed.
            type (str): search type ([text, image]); currently only text is handled here.

        Returns:
            list: search results (`[{'title': 'title...', 'link': 'https://hogehoge....'}, {...}]`)
        """

        # Parse with BeautifulSoup
        soup = BeautifulSoup(html, 'lxml')

        if type == 'text':
            # Get the link/title combinations
            elinks, etitles, etexts = self.get_text_links(soup)

            # before processing elinks
            self.MESSAGE.print_text(
                ','.join(elinks),  # type: ignore
                header=self.MESSAGE.HEADER + ': ' + Color.BLUE +
                '[BeforeProcessing elinks]' + Color.END,
                separator=" :",
                mode="debug",
            )

            # before processing etitles
            self.MESSAGE.print_text(
                ','.join(etitles),  # type: ignore
                header=self.MESSAGE.HEADER + ': ' + Color.BLUE +
                '[BeforeProcessing etitles]' + Color.END,
                separator=" :",
                mode="debug",
            )

            # Hand off to the post-processing function (overridden per engine)
            elinks, etitles, etexts = self.processings_elist(
                elinks, etitles, etexts)

            # after processing elinks
            self.MESSAGE.print_text(
                ','.join(elinks),  # type: ignore
                header=self.MESSAGE.HEADER + ': ' +
                Color.GREEN + '[AfterProcessing elinks]' + Color.END,
                separator=" :",
                mode="debug",
            )

            # after processing etitles
            self.MESSAGE.print_text(
                ','.join(etitles),  # type: ignore
                header=self.MESSAGE.HEADER + ': ' +
                Color.GREEN + '[AfterProcessing etitles]' + Color.END,
                separator=" :",
                mode="debug",
            )

            # Convert into a list of dicts
            # [{'title': 'title...', 'link': 'https://hogehoge....'}, {...}]
            links = self.create_text_links(source_url, elinks, etitles, etexts)

            return links

        elif type == 'image':
            links = self.get_image_links(soup)

            return links

    # Generate the text search results (links([{link: ..., title: ...},...])) for a page
    def get_text_links(self, soup: BeautifulSoup):
        """get_text_links

        Parse a text search results page from BeautifulSoup and return the results.

        Args:
            soup (BeautifulSoup): BeautifulSoup object to parse.

        Returns:
            list: link results ([xxx,xxx,xxx...])
            list: title results ([xxx,xxx,xxx...])
            list: text results ([xxx,xxx,xxx...])
        """
        # Get the link urls
        self.MESSAGE.print_text(
            self.SOUP_SELECT_URL,  # type: ignore
            header=self.MESSAGE.HEADER + ': ' +
            Color.GREEN + '[get_text_link.SOUP_SELECT_URL]' + Color.END,
            separator=" :",
            mode="debug",
        )
        elements = soup.select(self.SOUP_SELECT_URL)
        elinks = [e['href'] for e in elements]

        # Get the link titles
        self.MESSAGE.print_text(
            self.SOUP_SELECT_TITLE,  # type: ignore
            header=self.MESSAGE.HEADER + ': ' +
            Color.GREEN + '[get_text_link.SOUP_SELECT_TITLE]' + Color.END,
            separator=" :",
            mode="debug",
        )
        elements = soup.select(self.SOUP_SELECT_TITLE)
        etitles = [e.text for e in elements]

        # Get the link texts
        self.MESSAGE.print_text(
            self.SOUP_SELECT_TEXT,  # type: ignore
            header=self.MESSAGE.HEADER + ': ' +
            Color.GREEN + '[get_text_link.SOUP_SELECT_TEXT]' + Color.END,
            separator=" :",
            mode="debug",
        )
        elements = soup.select(self.SOUP_SELECT_TEXT)
        etext = [e.text for e in elements]

        return elinks, etitles, etext
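    # Selector sketch (illustrative comment, not part of the original source):
    # SOUP_SELECT_URL/_TITLE/_TEXT are CSS selectors each engine defines; the
    # extraction above is plain BeautifulSoup. With made-up selectors:
    #
    #     soup = BeautifulSoup(
    #         '<div class="r"><a href="https://example.com">'
    #         '<h3>Example</h3></a></div>', 'lxml')
    #     [e['href'] for e in soup.select('div.r > a')]    # ['https://example.com']
    #     [e.text for e in soup.select('div.r > a > h3')]  # ['Example']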
    # Generate the image search results (links(list())) for a page
    def get_image_links(self, soup: BeautifulSoup):
        """get_image_links

        Parse an image search results page from BeautifulSoup and return the results.
        (The actual work is done in each search engine's own function.)

        Args:
            soup (BeautifulSoup): BeautifulSoup object to parse.

        Returns:
            list: search results (`[{'title': 'title...', 'link': 'https://hogehoge....'}, {...}]`)
        """

        links = []

        return links

    # Hook for extra processing when generating elinks/etitles
    def processings_elist(self, elinks, etitles, etexts: list):
        """processings_elist

        Post-process the freshly extracted elinks/etitles inside self.get_links.
        Overridden by each search engine's class as needed.

        Args:
            elinks (list): list of elinks (result links)
            etitles (list): list of etitles (result titles)
            etexts (list): list of etexts (result texts)

        Returns:
            elinks (list): list of elinks (result links)
            etitles (list): list of etitles (result titles)
            etexts (list): list of etexts (result texts)
        """

        return elinks, etitles, etexts

    # Generate links ([{link: ..., title: ...},...]) from one page of text search results
    def create_text_links(self, source_url: str, elinks, etitles, etext: list):
        """create_text_links

        Build links (the data returned by get_links) from elinks and etitles.

        Args:
            source_url (str): url the results came from.
            elinks (list): list of elinks (result links)
            etitles (list): list of etitles (result titles)
            etext (list): list of etext (result texts)

        Returns:
            list: search results (`[{'title': 'title...', 'link': 'https://hogehoge....', 'text': 'hogehoge fugafuga...'}, {...}]`)
        """

        links = list()
        n = 0
        before_link = ""
        for link in elinks:
            d = dict()
            d['link'] = link

            # Add the etitle (title of the url) to the dict
            if len(etitles) > n:
                d['title'] = etitles[n]

            # Add the etext (result text for the url) to the dict
            if len(etext) > n:
                d['text'] = etext[n]

            # Add the source url to the dict
            d['source_url'] = source_url

            # Skip consecutive duplicates of the same link
            if before_link != link:
                links.append(d)

            before_link = link
            n += 1

        return links

    # Generate the url used to fetch suggestions
    def gen_suggest_url(self, keyword: str):
        """gen_suggest_url

        Generate the url used to fetch suggestions.
        Meant to be overridden by each search engine.

        Args:
            keyword (str): search query.

        Returns:
            dict: suggest url
        """

        result = {}
        return result

    # Fetch the suggestions
    def get_suggest_list(self, suggests: list, char: str, html: str):
        """get_suggest_list

        Extract the suggestions from html as a list.
        The actual work is done by each search engine's class override.

        Args:
            suggests (list): base list that suggestions are appended to.
            char (str): suggestion seed string.
            html (str): html to parse.

        Returns:
            dict: suggestions
        """
        result = {}
        return result

    # Detect whether a page is a ReCaptcha screen
    def check_recaptcha(self, html: str):
        """check_recaptcha

        Use `self.SOUP_RECAPTCHA_TAG` to decide whether the html is a ReCaptcha screen.

        Args:
            html (str): html of the page to check

        Returns:
            bool: whether it is a ReCaptcha screen (True if so)
        """

        result = False

        # Check with BeautifulSoup
        soup = BeautifulSoup(html, 'lxml')

        # Check whether the element exists
        if self.SOUP_RECAPTCHA_TAG != '':
            elements = soup.select(self.SOUP_RECAPTCHA_TAG)

            # Check the elements
            if len(elements) > 0:
                result = True

        return result
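    # Detection sketch (illustrative comment, not part of the original
    # source): with a hypothetical selector such as
    # SOUP_RECAPTCHA_TAG = 'div.g-recaptcha', the check above reduces to:
    #
    #     soup = BeautifulSoup('<div class="g-recaptcha"></div>', 'lxml')
    #     len(soup.select('div.g-recaptcha')) > 0   # True -> ReCaptcha screen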
    # Bypass ReCaptcha (wrapper)
    def bypass_recaptcha(self, url: str, html: str):
        """bypass_recaptcha

        Bypass a ReCaptcha screen.
        The actual work is delegated to the Selenium/Splash handler for the browser in use.

        Args:
            url (str): url of the request that ran into the ReCaptcha screen
            html (str): html of the ReCaptcha screen

        Returns:
            str: html of the url after getting past the ReCaptcha
        """

        # When using Selenium
        if self.USE_SELENIUM:
            html = self.bypass_recaptcha_selenium(url, html)

        # When using Splash
        elif self.USE_SPLASH:
            html = self.bypass_recaptcha_splash(url, html)

        return html

    # Bypass ReCaptcha with Selenium
    def bypass_recaptcha_selenium(self, url: str, html: str):
        """bypass_recaptcha_selenium

        Get past a ReCaptcha with Selenium.
        The actual work is implemented in each search engine's class.

        Args:
            url (str): url of the request that ran into the ReCaptcha screen
            html (str): html of the ReCaptcha screen

        Returns:
            str: html of the url after getting past the ReCaptcha
        """

        return html

    # Bypass ReCaptcha with Splash
    def bypass_recaptcha_splash(self, url: str, html: str):
        """bypass_recaptcha_splash

        Get past a ReCaptcha with Splash.
        The actual work is implemented in each search engine's class.

        Args:
            url (str): url of the request that ran into the ReCaptcha screen
            html (str): html of the ReCaptcha screen

        Returns:
            str: html of the url after getting past the ReCaptcha
        """

        return html
--------------------------------------------------------------------------------