├── .all-contributorsrc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE │ ├── bug_fix.md │ ├── documentation_related.md │ ├── engine_implementation.md │ └── feature_implementation.md └── workflows │ ├── deploy.yml │ └── test.yml ├── .gitignore ├── .pylintrc ├── .readthedocs.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── assets ├── animate.gif └── example.gif ├── docs ├── Makefile ├── documentation.md ├── engines.md ├── faq.md ├── make.bat ├── source │ ├── conf.py │ ├── index.rst │ ├── modules.rst │ ├── search_engine_parser.core.engines.rst │ ├── search_engine_parser.core.rst │ ├── search_engine_parser.rst │ └── search_engine_parser.tests.rst └── supported_engines.md ├── fixtures ├── aol-Hello-synopsis.yaml ├── ask-Hello-synopsis.yaml ├── baidu-Hello-synopsis.yaml ├── bing-Hello-synopsis.yaml ├── coursera-Hello-synopsis.yaml ├── duckduckgo-Hello-synopsis.yaml ├── github-Hello-synopsis.yaml ├── google-Hello-synopsis.yaml ├── google-test-diff-synopsis.yaml ├── googlenews-Hello-synopsis.yaml ├── googlescholar-Hello-synopsis.yaml ├── myanimelist-Hello-synopsis.yaml ├── stackoverflow-Hello-synopsis.yaml ├── yahoo-Hello-synopsis.yaml ├── yahoo-test-diff-synopsis.yaml ├── yandex-Hello-synopsis.yaml └── youtube-Hello-synopsis.yaml ├── requirements ├── cli.txt ├── dev.txt └── main.txt ├── scripts ├── docs.sh ├── post_deploy_test.sh └── pre_deploy_test.sh ├── search_engine_parser ├── .gitignore ├── __init__.py ├── core │ ├── __init__.py │ ├── base.py │ ├── cli.py │ ├── engines │ │ ├── __init__.py │ │ ├── aol.py │ │ ├── ask.py │ │ ├── baidu.py │ │ ├── bing.py │ │ ├── coursera.py │ │ ├── duckduckgo.py │ │ ├── github.py │ │ ├── google.py │ │ ├── googlenews.py │ │ ├── googlescholar.py │ │ ├── myanimelist.py │ │ ├── stackoverflow.py │ │ ├── yahoo.py │ │ ├── yandex.py │ │ └── youtube.py │ ├── exceptions.py │ └── utils.py └── tests │ ├── __init__.py │ ├── test_base.py │ └── test_cli.py └── setup.py /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | "README.md" 4 | ], 5 | "imageSize": 100, 6 | "commit": false, 7 | "contributors": [ 8 | { 9 | "login": "Rexogamer", 10 | "name": "Ed Luff", 11 | "avatar_url": "https://avatars0.githubusercontent.com/u/42586271?v=4", 12 | "profile": "https://github.com/Rexogamer", 13 | "contributions": [ 14 | "code" 15 | ] 16 | }, 17 | { 18 | "login": "deven96", 19 | "name": "Diretnan Domnan", 20 | "avatar_url": "https://avatars3.githubusercontent.com/u/23453888?v=4", 21 | "profile": "http://diretnandomnan.webnode.com", 22 | "contributions": [ 23 | "infra", 24 | "test", 25 | "tool", 26 | "code" 27 | ] 28 | }, 29 | { 30 | "login": "MeNsaaH", 31 | "name": "MeNsaaH", 32 | "avatar_url": "https://avatars3.githubusercontent.com/u/24734308?v=4", 33 | "profile": "http://mensaah.github.io", 34 | "contributions": [ 35 | "infra", 36 | "test", 37 | "tool", 38 | "code" 39 | ] 40 | }, 41 | { 42 | "login": "PalAditya", 43 | "name": "Aditya Pal", 44 | "avatar_url": "https://avatars2.githubusercontent.com/u/25523604?v=4", 45 | "profile": "https://github.com/PalAditya", 46 | "contributions": [ 47 | "test", 48 | "code", 49 | "doc" 50 | ] 51 | }, 52 | { 53 | "login": "AvinashReddy3108", 54 | "name": "Avinash Reddy", 55 | "avatar_url": "https://avatars1.githubusercontent.com/u/27774996?v=4", 56 | "profile": "http://energized.pro", 57 | "contributions": [ 58 | "bug" 59 | ] 60 | }, 61 | { 62 | "login": "Iamdavidonuh", 63 | 
"name": "David Onuh", 64 | "avatar_url": "https://avatars3.githubusercontent.com/u/37768509?v=4", 65 | "profile": "https://github.com/Iamdavidonuh", 66 | "contributions": [ 67 | "code", 68 | "test" 69 | ] 70 | }, 71 | { 72 | "login": "sp1thas", 73 | "name": "Panagiotis Simakis", 74 | "avatar_url": "https://avatars2.githubusercontent.com/u/8322266?v=4", 75 | "profile": "http://simakis.me", 76 | "contributions": [ 77 | "code", 78 | "test" 79 | ] 80 | }, 81 | { 82 | "login": "reiarthur", 83 | "name": "reiarthur", 84 | "avatar_url": "https://avatars2.githubusercontent.com/u/20190646?v=4", 85 | "profile": "https://github.com/reiarthur", 86 | "contributions": [ 87 | "code" 88 | ] 89 | }, 90 | { 91 | "login": "ashokkumarta", 92 | "name": "Ashokkumar TA", 93 | "avatar_url": "https://avatars0.githubusercontent.com/u/5450267?v=4", 94 | "profile": "http://ashokkumarta.blogspot.com/", 95 | "contributions": [ 96 | "code" 97 | ] 98 | }, 99 | { 100 | "login": "ateuber", 101 | "name": "Andreas Teuber", 102 | "avatar_url": "https://avatars2.githubusercontent.com/u/44349054?v=4", 103 | "profile": "https://github.com/ateuber", 104 | "contributions": [ 105 | "code" 106 | ] 107 | }, 108 | { 109 | "login": "mi096684", 110 | "name": "mi096684", 111 | "avatar_url": "https://avatars3.githubusercontent.com/u/22032932?v=4", 112 | "profile": "https://github.com/mi096684", 113 | "contributions": [ 114 | "bug" 115 | ] 116 | }, 117 | { 118 | "login": "devajithvs", 119 | "name": "devajithvs", 120 | "avatar_url": "https://avatars1.githubusercontent.com/u/29475282?v=4", 121 | "profile": "https://github.com/devajithvs", 122 | "contributions": [ 123 | "code" 124 | ] 125 | }, 126 | { 127 | "login": "zakaryan2004", 128 | "name": "Geg Zakaryan", 129 | "avatar_url": "https://avatars3.githubusercontent.com/u/29994884?v=4", 130 | "profile": "https://github.com/zakaryan2004", 131 | "contributions": [ 132 | "code", 133 | "bug" 134 | ] 135 | }, 136 | { 137 | "login": "redrussianarmy", 138 | "name": "Hakan Boğan", 139 | "avatar_url": "https://avatars1.githubusercontent.com/u/24498747?v=4", 140 | "profile": "https://www.hakanbogan.com", 141 | "contributions": [ 142 | "bug" 143 | ] 144 | }, 145 | { 146 | "login": "NicKoehler", 147 | "name": "NicKoehler", 148 | "avatar_url": "https://avatars3.githubusercontent.com/u/53040044?v=4", 149 | "profile": "https://github.com/NicKoehler", 150 | "contributions": [ 151 | "bug", 152 | "code" 153 | ] 154 | }, 155 | { 156 | "login": "chris4540", 157 | "name": "ChrisLin", 158 | "avatar_url": "https://avatars1.githubusercontent.com/u/12794588?v=4", 159 | "profile": "https://github.com/chris4540", 160 | "contributions": [ 161 | "bug", 162 | "code" 163 | ] 164 | }, 165 | { 166 | "login": "pgrandinetti", 167 | "name": "Pietro", 168 | "avatar_url": "https://avatars.githubusercontent.com/u/10454135?v=4", 169 | "profile": "http://pete.world", 170 | "contributions": [ 171 | "code", 172 | "bug" 173 | ] 174 | } 175 | ], 176 | "contributorsPerLine": 7, 177 | "projectName": "search-engine-parser", 178 | "projectOwner": "bisoncorps", 179 | "repoType": "github", 180 | "repoHost": "https://github.com", 181 | "skipCi": true 182 | } 183 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: 'bug' 6 | assignees: '@deven96' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A 
clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Open python console to 16 | 2. Import search_engine_parser 17 | 3. Search using .... Engine 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. Windows] 28 | - Python Version [e.g. 3.6.5] 29 | - Search-engine-parser version [e.g. 0.5.1] 30 | 31 | 32 | **Additional context** 33 | Add any other context about the problem here. 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: 'enhancement' 6 | assignees: '@deven96' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/bug_fix.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Fix 3 | title: '' 4 | labels: 'patch', 'needs-review' 5 | assignees: '@MeNsaaH' 6 | 7 | --- 8 | 9 | **Issue relating to the bug** 10 | Issue number relating to the bug e.g #13 11 | 12 | **Simple summary of steps Taken to fix the bug** 13 | A clear and concise description of what the fix is. Ex. I added a browser header to the base engine `search_engine_parser/core/engines/base` to prevent captchas. 14 | 15 | **Describe alternatives you've considered** 16 | A clear and concise description of any alternative solutions you've considered. 17 | 18 | **Additional context** 19 | Add any other context or screenshots about the fix here. 20 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/documentation_related.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation Related 3 | about: Added documentation to the project 4 | title: '' 5 | labels: 'documentation', 'needs-review' 6 | assignees: '@MenSaaH' 7 | 8 | --- 9 | 10 | **Describe the change to the documentation** 11 | A clear and concise description of what the change/addition is. 12 | 13 | **Issue fix?** 14 | Issue number that this documentation PR fixes. 15 | 16 | **Screenshots** 17 | If applicable, add screenshots of the sphinx documentation rendered on your local machine. 18 | 19 | **Additional context** 20 | Add any other context about the PR here. 
21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/engine_implementation.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Engine Implementation 3 | about: Implemented a new engine 4 | title: 'Name of Engine goes here' 5 | labels: 'engine', 'needs-review' 6 | assignees: '@deven96', '@MenSaaH' 7 | 8 | --- 9 | 10 | **Issue relating to the engine request** 11 | Issue number relating to the engine e.g #13 12 | 13 | **Summary of steps Taken to implement the engine** 14 | A clear and concise description of what the engine is. 15 | 16 | ```t 17 | Ex. I added the GitHub engine, `github.py` to the `search_engine_parser/core/engines` directory and made the necessary imports. 18 | This engine integrates GitHub search capabilities and returns stars, repository info, descriptions, links and titles. 19 | ``` 20 | 21 | **Describe any issues you've faced or inconsistencies in the engine** 22 | A clear and concise description of any issues you've faced. Ex. I was unable to parse 10 results per page due to [...] 23 | 24 | **Additional context** 25 | Add any other context or screenshots about the engine here. 26 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/feature_implementation.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Implementation 3 | about: '' 4 | title: '' 5 | labels: 'feature', 'needs-review' 6 | assignees: '@deven96', '@MenSaaH' 7 | 8 | --- 9 | 10 | **Issue relating to the feature** 11 | Issue number relating to the feature e.g #13 12 | 13 | **Summary of steps Taken to implement the feature** 14 | A clear and concise description of what the feature is. 15 | 16 | ```t 17 | Ex. I added a browser header to the base engine `search_engine_parser/core/engines/base` to prevent captchas. 18 | ``` 19 | 20 | **Describe any issues you've faced or inconsistencies in implementing the feature** 21 | A clear and concise description of any issues you've faced. Ex. Captchas still occur after a certain amount of usage [...] 22 | 23 | **Additional context** 24 | Add any other context or screenshots about the feature here. 25 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to Pypi 2 | on: 3 | push: 4 | tags: 5 | - 'v*.*.*' 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v1 12 | 13 | - name: Set up Python 3.7 14 | uses: actions/setup-python@v1 15 | with: 16 | python-version: 3.7 17 | 18 | - name: Install Dependencies 19 | run: pip install -r requirements/dev.txt 20 | 21 | - name: Set env 22 | run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV 23 | 24 | - name: update Package version 25 | run: sed -i "s/.*__version__.*/__version__ = \"${{ env.RELEASE_VERSION }}\"/g" search_engine_parser/__init__.py 26 | 27 | - name: Install pypa/build 28 | run: python -m pip install build --user 29 | 30 | - name: Build a binary wheel and a source tarball 31 | run: python -m build --sdist --wheel --outdir dist/ . 
32 | 33 | - name: Build Changelog 34 | id: github_release 35 | uses: mikepenz/release-changelog-builder-action@v3 36 | env: 37 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 38 | 39 | - name: Create Release 40 | uses: softprops/action-gh-release@v0.1.14 41 | with: 42 | body: ${{steps.github_release.outputs.changelog}} 43 | 44 | - name: Publish package 45 | uses: pypa/gh-action-pypi-publish@release/v1 46 | with: 47 | user: __token__ 48 | password: ${{ secrets.PYPI_API_TOKEN }} 49 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: 3 | push: 4 | branches: 5 | - master 6 | paths: 7 | - '**.py' 8 | - 'requirements/**' 9 | pull_request: 10 | branches: 11 | - master 12 | paths: 13 | - '**.py' 14 | - 'requirements/**' 15 | 16 | jobs: 17 | test: 18 | strategy: 19 | matrix: 20 | python: ["3.6", "3.7", "3.8", "3.9"] 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: actions/checkout@v1 24 | 25 | - name: Set up Python 3.7 26 | uses: actions/setup-python@v1 27 | with: 28 | python-version: ${{ matrix.python }} 29 | 30 | - name: Install Dependencies 31 | run: pip install -r requirements/dev.txt 32 | 33 | - name: Run tests 34 | run: pytest -s 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | #search_engine_parser cache 107 | **/cache/** 108 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. 
Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=1 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | 29 | # List of plugins (as comma separated values of python modules names) to load, 30 | # usually to register additional checkers. 31 | load-plugins= 32 | 33 | # Pickle collected data for later comparisons. 34 | persistent=yes 35 | 36 | # Specify a configuration file. #rcfile= 37 | 38 | # When enabled, pylint would attempt to guess common misconfiguration and emit 39 | # user-friendly hints instead of false-positive error messages. 40 | suggestion-mode=yes 41 | 42 | # Allow loading of arbitrary C extensions. Extensions are imported into the 43 | # active Python interpreter and may run arbitrary code. 44 | unsafe-load-any-extension=no 45 | 46 | 47 | [MESSAGES CONTROL] 48 | 49 | # Only show warnings with the listed confidence levels. Leave empty to show 50 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 51 | confidence= 52 | 53 | # Disable the message, report, category or checker with the given id(s). You 54 | # can either give multiple identifiers separated by comma (,) or put this 55 | # option multiple times (only on the command line, not in the configuration 56 | # file where it should appear only once). You can also use "--disable=all" to 57 | # disable everything first and then reenable specific checks. For example, if 58 | # you want to run only the similarities checker, you can use "--disable=all 59 | # --enable=similarities". If you want to run only the classes checker, but have 60 | # no Warning level messages displayed, use "--disable=all --enable=classes 61 | # --disable=W". 
62 | disable=print-statement, 63 | parameter-unpacking, 64 | unpacking-in-except, 65 | old-raise-syntax, 66 | backtick, 67 | long-suffix, 68 | old-ne-operator, 69 | old-octal-literal, 70 | import-star-module-level, 71 | non-ascii-bytes-literal, 72 | raw-checker-failed, 73 | bad-inline-option, 74 | locally-disabled, 75 | file-ignored, 76 | suppressed-message, 77 | useless-suppression, 78 | deprecated-pragma, 79 | use-symbolic-message-instead, 80 | apply-builtin, 81 | basestring-builtin, 82 | buffer-builtin, 83 | cmp-builtin, 84 | coerce-builtin, 85 | execfile-builtin, 86 | file-builtin, 87 | long-builtin, 88 | raw_input-builtin, 89 | reduce-builtin, 90 | standarderror-builtin, 91 | unicode-builtin, 92 | xrange-builtin, 93 | coerce-method, 94 | delslice-method, 95 | getslice-method, 96 | setslice-method, 97 | no-absolute-import, 98 | old-division, 99 | dict-iter-method, 100 | dict-view-method, 101 | next-method-called, 102 | metaclass-assignment, 103 | indexing-exception, 104 | raising-string, 105 | reload-builtin, 106 | oct-method, 107 | hex-method, 108 | nonzero-method, 109 | cmp-method, 110 | input-builtin, 111 | round-builtin, 112 | missing-docstring, 113 | intern-builtin, 114 | unichr-builtin, 115 | map-builtin-not-iterating, 116 | zip-builtin-not-iterating, 117 | range-builtin-not-iterating, 118 | filter-builtin-not-iterating, 119 | using-cmp-argument, 120 | eq-without-hash, 121 | div-method, 122 | idiv-method, 123 | rdiv-method, 124 | exception-message-attribute, 125 | invalid-str-codec, 126 | sys-max-int, 127 | bad-python3-import, 128 | deprecated-string-function, 129 | deprecated-str-translate-call, 130 | deprecated-itertools-function, 131 | deprecated-types-field, 132 | next-method-defined, 133 | dict-items-not-iterating, 134 | dict-keys-not-iterating, 135 | dict-values-not-iterating, 136 | deprecated-operator-function, 137 | deprecated-urllib-function, 138 | xreadlines-attribute, 139 | deprecated-sys-function, 140 | exception-escape, 141 | comprehension-escape, 142 | R0801 143 | 144 | # Enable the message, report, category or checker with the given id(s). You can 145 | # either give multiple identifier separated by comma (,) or put this option 146 | # multiple time (only on the command line, not in the configuration file where 147 | # it should appear only once). See also the "--disable" option for examples. 148 | enable=c-extension-no-member 149 | 150 | 151 | [REPORTS] 152 | 153 | # Python expression which should return a note less than 10 (10 is the highest 154 | # note). You have access to the variables errors warning, statement which 155 | # respectively contain the number of errors / warnings messages and the total 156 | # number of statements analyzed. This is used by the global evaluation report 157 | # (RP0004). 158 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 159 | 160 | # Template used to display messages. This is a python new-style format string 161 | # used to format the message information. See doc for all details. 162 | #msg-template= 163 | 164 | # Set the output format. Available formats are text, parseable, colorized, json 165 | # and msvs (visual studio). You can also give a reporter class, e.g. 166 | # mypackage.mymodule.MyReporterClass. 167 | output-format=text 168 | 169 | # Tells whether to display a full report or only the messages. 170 | reports=no 171 | 172 | # Activate the evaluation score. 
173 | score=yes 174 | 175 | 176 | [REFACTORING] 177 | 178 | # Maximum number of nested blocks for function / method body 179 | max-nested-blocks=5 180 | 181 | # Complete name of functions that never returns. When checking for 182 | # inconsistent-return-statements if a never returning function is called then 183 | # it will be considered as an explicit return statement and no message will be 184 | # printed. 185 | never-returning-functions=sys.exit 186 | 187 | 188 | [MISCELLANEOUS] 189 | 190 | # List of note tags to take in consideration, separated by a comma. 191 | notes=FIXME, 192 | XXX, 193 | TODO 194 | 195 | 196 | [LOGGING] 197 | 198 | # Format style used to check logging format string. `old` means using % 199 | # formatting, while `new` is for `{}` formatting. 200 | logging-format-style=old 201 | 202 | # Logging modules to check that the string format arguments are in logging 203 | # function parameter format. 204 | logging-modules=logging 205 | 206 | 207 | [STRING] 208 | 209 | # This flag controls whether the implicit-str-concat-in-sequence should 210 | # generate a warning on implicit string concatenation in sequences defined over 211 | # several lines. 212 | check-str-concat-over-line-jumps=no 213 | 214 | 215 | [SPELLING] 216 | 217 | # Limits count of emitted suggestions for spelling mistakes. 218 | max-spelling-suggestions=4 219 | 220 | # Spelling dictionary name. Available dictionaries: none. To make it working 221 | # install python-enchant package.. 222 | spelling-dict= 223 | 224 | # List of comma separated words that should not be checked. 225 | spelling-ignore-words= 226 | 227 | # A path to a file that contains private dictionary; one word per line. 228 | spelling-private-dict-file= 229 | 230 | # Tells whether to store unknown words to indicated private dictionary in 231 | # --spelling-private-dict-file option instead of raising a message. 232 | spelling-store-unknown-words=no 233 | 234 | 235 | [FORMAT] 236 | 237 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 238 | expected-line-ending-format= 239 | 240 | # Regexp for a line that is allowed to be longer than the limit. 241 | ignore-long-lines=^\s*(# )??$ 242 | 243 | # Number of spaces of indent required inside a hanging or continued line. 244 | indent-after-paren=4 245 | 246 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 247 | # tab). 248 | indent-string=' ' 249 | 250 | # Maximum number of characters on a single line. 251 | max-line-length=100 252 | 253 | # Maximum number of lines in a module. 254 | max-module-lines=1000 255 | 256 | # List of optional constructs for which whitespace checking is disabled. `dict- 257 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 258 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 259 | # `empty-line` allows space-only lines. 260 | no-space-check=trailing-comma, 261 | dict-separator 262 | 263 | # Allow the body of a class to be on the same line as the declaration if body 264 | # contains single statement. 265 | single-line-class-stmt=no 266 | 267 | # Allow the body of an if to be on the same line as the test if there is no 268 | # else. 269 | single-line-if-stmt=no 270 | 271 | 272 | [BASIC] 273 | 274 | # Naming style matching correct argument names. 275 | argument-naming-style=snake_case 276 | 277 | # Regular expression matching correct argument names. Overrides argument- 278 | # naming-style. 279 | #argument-rgx= 280 | 281 | # Naming style matching correct attribute names. 
282 | attr-naming-style=snake_case 283 | 284 | # Regular expression matching correct attribute names. Overrides attr-naming- 285 | # style. 286 | #attr-rgx= 287 | 288 | # Bad variable names which should always be refused, separated by a comma. 289 | bad-names=foo, 290 | bar, 291 | baz, 292 | toto, 293 | tutu, 294 | tata 295 | 296 | # Naming style matching correct class attribute names. 297 | class-attribute-naming-style=any 298 | 299 | # Regular expression matching correct class attribute names. Overrides class- 300 | # attribute-naming-style. 301 | #class-attribute-rgx= 302 | 303 | # Naming style matching correct class names. 304 | class-naming-style=PascalCase 305 | 306 | # Regular expression matching correct class names. Overrides class-naming- 307 | # style. 308 | #class-rgx= 309 | 310 | # Naming style matching correct constant names. 311 | const-naming-style=UPPER_CASE 312 | 313 | # Regular expression matching correct constant names. Overrides const-naming- 314 | # style. 315 | #const-rgx= 316 | 317 | # Minimum line length for functions/classes that require docstrings, shorter 318 | # ones are exempt. 319 | docstring-min-length=-1 320 | 321 | # Naming style matching correct function names. 322 | function-naming-style=snake_case 323 | 324 | # Regular expression matching correct function names. Overrides function- 325 | # naming-style. 326 | #function-rgx= 327 | 328 | # Good variable names which should always be accepted, separated by a comma. 329 | good-names=i, 330 | j, 331 | k, 332 | ex, 333 | Run, 334 | _ 335 | 336 | # Include a hint for the correct naming format with invalid-name. 337 | include-naming-hint=no 338 | 339 | # Naming style matching correct inline iteration names. 340 | inlinevar-naming-style=any 341 | 342 | # Regular expression matching correct inline iteration names. Overrides 343 | # inlinevar-naming-style. 344 | #inlinevar-rgx= 345 | 346 | # Naming style matching correct method names. 347 | method-naming-style=snake_case 348 | 349 | # Regular expression matching correct method names. Overrides method-naming- 350 | # style. 351 | #method-rgx= 352 | 353 | # Naming style matching correct module names. 354 | module-naming-style=snake_case 355 | 356 | # Regular expression matching correct module names. Overrides module-naming- 357 | # style. 358 | #module-rgx= 359 | 360 | # Colon-delimited sets of names that determine each other's naming style when 361 | # the name regexes allow several styles. 362 | name-group= 363 | 364 | # Regular expression which should only match function or class names that do 365 | # not require a docstring. 366 | no-docstring-rgx=^_ 367 | 368 | # List of decorators that produce properties, such as abc.abstractproperty. Add 369 | # to this list to register other decorators that produce valid properties. 370 | # These decorators are taken in consideration only for invalid-name. 371 | property-classes=abc.abstractproperty 372 | 373 | # Naming style matching correct variable names. 374 | variable-naming-style=snake_case 375 | 376 | # Regular expression matching correct variable names. Overrides variable- 377 | # naming-style. 378 | #variable-rgx= 379 | 380 | 381 | [TYPECHECK] 382 | 383 | # List of decorators that produce context managers, such as 384 | # contextlib.contextmanager. Add to this list to register other decorators that 385 | # produce valid context managers. 
386 | contextmanager-decorators=contextlib.contextmanager 387 | 388 | # List of members which are set dynamically and missed by pylint inference 389 | # system, and so shouldn't trigger E1101 when accessed. Python regular 390 | # expressions are accepted. 391 | generated-members= 392 | 393 | # Tells whether missing members accessed in mixin class should be ignored. A 394 | # mixin class is detected if its name ends with "mixin" (case insensitive). 395 | ignore-mixin-members=yes 396 | 397 | # Tells whether to warn about missing members when the owner of the attribute 398 | # is inferred to be None. 399 | ignore-none=yes 400 | 401 | # This flag controls whether pylint should warn about no-member and similar 402 | # checks whenever an opaque object is returned when inferring. The inference 403 | # can return multiple potential results while evaluating a Python object, but 404 | # some branches might not be evaluated, which results in partial inference. In 405 | # that case, it might be useful to still emit no-member and other checks for 406 | # the rest of the inferred objects. 407 | ignore-on-opaque-inference=yes 408 | 409 | # List of class names for which member attributes should not be checked (useful 410 | # for classes with dynamically set attributes). This supports the use of 411 | # qualified names. 412 | ignored-classes=optparse.Values,thread._local,_thread._local 413 | 414 | # List of module names for which member attributes should not be checked 415 | # (useful for modules/projects where namespaces are manipulated during runtime 416 | # and thus existing member attributes cannot be deduced by static analysis. It 417 | # supports qualified module names, as well as Unix pattern matching. 418 | ignored-modules= 419 | 420 | # Show a hint with possible names when a member name was not found. The aspect 421 | # of finding the hint is based on edit distance. 422 | missing-member-hint=yes 423 | 424 | # The minimum edit distance a name should have in order to be considered a 425 | # similar match for a missing member name. 426 | missing-member-hint-distance=1 427 | 428 | # The total number of similar names that should be taken in consideration when 429 | # showing a hint for a missing member. 430 | missing-member-max-choices=1 431 | 432 | 433 | [VARIABLES] 434 | 435 | # List of additional names supposed to be defined in builtins. Remember that 436 | # you should avoid defining new builtins when possible. 437 | additional-builtins= 438 | 439 | # Tells whether unused global variables should be treated as a violation. 440 | allow-global-unused-variables=yes 441 | 442 | # List of strings which can identify a callback function by name. A callback 443 | # name must start or end with one of those strings. 444 | callbacks=cb_, 445 | _cb 446 | 447 | # A regular expression matching the name of dummy variables (i.e. expected to 448 | # not be used). 449 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 450 | 451 | # Argument names that match this expression will be ignored. Default to name 452 | # with leading underscore. 453 | ignored-argument-names=_.*|^ignored_|^unused_ 454 | 455 | # Tells whether we should check for unused import in __init__ files. 456 | init-import=no 457 | 458 | # List of qualified module names which can have objects that can redefine 459 | # builtins. 460 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 461 | 462 | 463 | [SIMILARITIES] 464 | 465 | # Ignore comments when computing similarities. 
466 | ignore-comments=yes 467 | 468 | # Ignore docstrings when computing similarities. 469 | ignore-docstrings=yes 470 | 471 | # Ignore imports when computing similarities. 472 | ignore-imports=no 473 | 474 | # Minimum lines number of a similarity. 475 | min-similarity-lines=4 476 | 477 | 478 | [IMPORTS] 479 | 480 | # Allow wildcard imports from modules that define __all__. 481 | allow-wildcard-with-all=no 482 | 483 | # Analyse import fallback blocks. This can be used to support both Python 2 and 484 | # 3 compatible code, which means that the block might have code that exists 485 | # only in one or another interpreter, leading to false positives when analysed. 486 | analyse-fallback-blocks=no 487 | 488 | # Deprecated modules which should not be used, separated by a comma. 489 | deprecated-modules=optparse,tkinter.tix 490 | 491 | # Create a graph of external dependencies in the given file (report RP0402 must 492 | # not be disabled). 493 | ext-import-graph= 494 | 495 | # Create a graph of every (i.e. internal and external) dependencies in the 496 | # given file (report RP0402 must not be disabled). 497 | import-graph= 498 | 499 | # Create a graph of internal dependencies in the given file (report RP0402 must 500 | # not be disabled). 501 | int-import-graph= 502 | 503 | # Force import order to recognize a module as part of the standard 504 | # compatibility libraries. 505 | known-standard-library= 506 | 507 | # Force import order to recognize a module as part of a third party library. 508 | known-third-party=enchant 509 | 510 | 511 | [DESIGN] 512 | 513 | # Maximum number of arguments for function / method. 514 | max-args=5 515 | 516 | # Maximum number of attributes for a class (see R0902). 517 | max-attributes=7 518 | 519 | # Maximum number of boolean expressions in an if statement. 520 | max-bool-expr=5 521 | 522 | # Maximum number of branch for function / method body. 523 | max-branches=12 524 | 525 | # Maximum number of locals for function / method body. 526 | max-locals=15 527 | 528 | # Maximum number of parents for a class (see R0901). 529 | max-parents=7 530 | 531 | # Maximum number of public methods for a class (see R0904). 532 | max-public-methods=20 533 | 534 | # Maximum number of return / yield for function / method body. 535 | max-returns=6 536 | 537 | # Maximum number of statements in function / method body. 538 | max-statements=50 539 | 540 | # Minimum number of public methods for a class (see R0903). 541 | min-public-methods=2 542 | 543 | 544 | [CLASSES] 545 | 546 | # List of method names used to declare (i.e. assign) instance attributes. 547 | defining-attr-methods=__init__, 548 | __new__, 549 | setUp 550 | 551 | # List of member names, which should be excluded from the protected access 552 | # warning. 553 | exclude-protected=_asdict, 554 | _fields, 555 | _replace, 556 | _source, 557 | _make 558 | 559 | # List of valid names for the first argument in a class method. 560 | valid-classmethod-first-arg=cls 561 | 562 | # List of valid names for the first argument in a metaclass class method. 563 | valid-metaclass-classmethod-first-arg=cls 564 | 565 | 566 | [EXCEPTIONS] 567 | 568 | # Exceptions that will emit a warning when being caught. Defaults to 569 | # "BaseException, Exception". 
570 | overgeneral-exceptions=BaseException, 571 | Exception 572 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.7 22 | install: 23 | - requirements: requirements/dev.txt -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. 
Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at diretnan.bisoncorps@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## How to contribute to Search Engine Parser 2 | 3 | All Contributions to the code base or documentation must be done on a branch with intuitive name e.g `aol-#13-patch`, `yandex-engine-implementation` 4 | 5 | #### **Did you find a bug?** 6 | 7 | 8 | * **Ensure the bug was not already reported** by searching on GitHub under [Issues](https://github.com/bisoncorps/search-engine-parser/issues). 9 | 10 | * If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/bisoncorps/search-engine-parser/issues/new). If possible, be sure to make use of the [bug template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/ISSUE_TEMPLATE/bug_report.md) with label `bug` 11 | 12 | * Ensure the issue description clearly describes the bug.Include the relevant issue number if applicable. 13 | 14 | #### **Did you write a patch that fixes a bug?** 15 | 16 | * Ensure the bug is first reported by searching on GitHub under [Issues](https://github.com/bisoncorps/search-engine-parser/issues) using label `bug` 17 | 18 | * If issue does not exist, open an issue with the [bug report template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/ISSUE_TEMPLATE/bug_report.md) 19 | 20 | * Open a new GitHub pull request with the patch using [bug fix template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/PULL_REQUEST_TEMPLATE/bug_fix.md). 21 | 22 | * Ensure the PR description clearly describes the solution. Include the relevant issue number if applicable. 23 | 24 | 25 | #### **Do you intend to add a new feature or change an existing one?** 26 | 27 | * **Ensure the feature was not already requested** by searching on GitHub under [Issues](https://github.com/bisoncorps/search-engine-parser/issues). 
Search using the `enhancement` or `feature` labels 28 | 29 | * Suggest your feature/change in the [search-engine-parser mailing list](https://groups.google.com/forum/?fromgroups#!forum/searchengineparser) and start writing code. 30 | 31 | * Do not open an issue on GitHub until you have collected positive feedback about the change. 32 | 33 | * Raise an issue using the [feature request template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/ISSUE_TEMPLATE/feature_request.md) with labels `enhancement` 34 | 35 | * Upon implementing the feature, make a PR using the [feature implementation template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/PULL_REQUEST_TEMPLATE/feature_implementation.md) 36 | 37 | ##### **Engines** 38 | 39 | * Refer to the [SearchEngineParser Engines Documentation](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/engines.md) for help on implementing Engines 40 | 41 | * If an issue for the Engine does not already exist under [Issues], suggest the engine in the [search-engine-parser mailing list](https://groups.google.com/forum/?fromgroups#!forum/searchengineparser) 42 | 43 | * If the Engine to be included is accepted, raise an issue using the [feature template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/ISSUE_TEMPLATE/feature_request.md) and labels `enhancement` and `engine` 44 | 45 | * Upon implementing the Engine, make a PR using the [engine implementation template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/PULL_REQUEST_TEMPLATE/engine_implementation.md) 46 | 47 | 48 | #### **Do you have questions about the source code?** 49 | 50 | * Ask any question about how to use SearchEngineParser [search-engine-parser mailing list](https://groups.google.com/forum/?fromgroups#!forum/searchengineparser). 51 | 52 | #### **Do you want to contribute to the search-engine-parser documentation?** 53 | 54 | * Please read [Contributing to the SearchEngineParser Documentation](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/documentation.md). 55 | 56 | 57 | 58 | `NOTE: There are exceptions in every case and we know that too!` 59 | 60 | SearchEngineParser is a volunteer effort. We encourage you to pitch in and [join the team](https://github.com/bisoncorps/search-engine-parser/blob/master/README.md#contributors)! 61 | 62 | 63 | Thanks! 64 | 65 | Bisoncorps Team - `B`uilding `I`nteresting `S`oftware `O`pensourced for huma`NS` :heart: :heart: 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 bison_corps/search-engine-parser 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include requirements/*.txt 3 | include README.md 4 | recursive-include search_engine_parser *.py 5 | prune docs/ 6 | prune scripts/ 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Search Engine Parser 2 | 3 | "If it is a search engine, then it can be parsed" - some random guy 4 | 5 | ![Demo](https://github.com/bisoncorps/search-engine-parser/raw/master/assets/animate.gif) 6 | 7 | [![Python 3.6|3.7|3.8|3.9](https://img.shields.io/badge/python-3.5%7C3.6%7C3.7%7C3.8-blue)](https://www.python.org/downloads/) 8 | [![PyPI version](https://img.shields.io/pypi/v/search-engine-parser)](https://pypi.org/project/search-engine-parser/) 9 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/search-engine-parser)](https://pypi.org/project/search-engine-parser/) 10 | [![Deploy to Pypi](https://github.com/bisohns/search-engine-parser/actions/workflows/deploy.yml/badge.svg)](https://github.com/bisohns/search-engine-parser/actions/workflows/deploy.yml) 11 | [![Test](https://github.com/bisohns/search-engine-parser/actions/workflows/test.yml/badge.svg)](https://github.com/bisohns/search-engine-parser/actions/workflows/test.yml) 12 | [![Documentation Status](https://readthedocs.org/projects/search-engine-parser/badge/?version=latest)](https://search-engine-parser.readthedocs.io/en/latest/?badge=latest) 13 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 14 | [![All Contributors](https://img.shields.io/badge/all_contributors-10-orange.svg)](#contributors) 15 |
16 | 17 | search-engine-parser is a package that lets you query popular search engines and scrape for result titles, links, descriptions and more. It aims to scrape the widest range of search engines. 18 | View all supported engines [here.](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/supported_engines.md) 19 | 20 | - [Search Engine Parser](#search-engine-parser) 21 | - [Popular Supported Engines](#popular-supported-engines) 22 | - [Installation](#installation) 23 | - [Development](#development) 24 | - [Code Documentation](#code-documentation) 25 | - [Running the tests](#running-the-tests) 26 | - [Usage](#usage) 27 | - [Code](#code) 28 | - [Command line](#command-line) 29 | - [FAQ](docs/faq.md) 30 | - [Code of Conduct](#code-of-conduct) 31 | - [Contribution](#contribution) 32 | - [License (MIT)](#license-mit) 33 | 34 | ## Popular Supported Engines 35 | Popular search engines supported include: 36 | 37 | - Google 38 | - DuckDuckGo 39 | - GitHub 40 | - StackOverflow 41 | - Baidu 42 | - YouTube 43 | 44 | View all supported engines [here.](docs/supported_engines.md) 45 | 46 | ## Installation 47 | Install from PyPi: 48 | 49 | ```bash 50 | # install only package dependencies 51 | pip install search-engine-parser 52 | # Installs `pysearch` cli tool 53 | pip install "search-engine-parser[cli]" 54 | ``` 55 | 56 | or from master: 57 | ```bash 58 | pip install git+https://github.com/bisoncorps/search-engine-parser 59 | ``` 60 | 61 | ## Development 62 | Clone the repository: 63 | 64 | ```bash 65 | git clone git@github.com:bisoncorps/search-engine-parser.git 66 | ``` 67 | 68 | Then create a virtual environment and install the required packages: 69 | 70 | ```bash 71 | mkvirtualenv search_engine_parser 72 | pip install -r requirements/dev.txt 73 | ``` 74 | 75 | 76 | ## Code Documentation 77 | Code docs can be found on [Read the Docs](https://search-engine-parser.readthedocs.io/en/latest). 78 | 79 | ## Running the tests 80 | ```bash 81 | pytest 82 | ``` 83 | 84 | ## Usage 85 | 86 | ### Code 87 | Query results can be scraped from popular search engines, as shown in the example snippet below. 88 | 89 | ```python 90 | import pprint 91 | 92 | from search_engine_parser.core.engines.bing import Search as BingSearch 93 | from search_engine_parser.core.engines.google import Search as GoogleSearch 94 | from search_engine_parser.core.engines.yahoo import Search as YahooSearch 95 | 96 | search_args = ('preaching to the choir', 1) 97 | gsearch = GoogleSearch() 98 | ysearch = YahooSearch() 99 | bsearch = BingSearch() 100 | gresults = gsearch.search(*search_args) 101 | yresults = ysearch.search(*search_args) 102 | bresults = bsearch.search(*search_args) 103 | a = { 104 | "Google": gresults, 105 | "Yahoo": yresults, 106 | "Bing": bresults 107 | } 108 | 109 | # pretty print the result from each engine 110 | for k, v in a.items(): 111 | print(f"-------------{k}------------") 112 | for result in v: 113 | pprint.pprint(result) 114 | 115 | # print first title from google search 116 | print(gresults["titles"][0]) 117 | # print 10th link from yahoo search 118 | print(yresults["links"][9]) 119 | # print 6th description from bing search 120 | print(bresults["descriptions"][5]) 121 | 122 | # print first result containing links, descriptions and title 123 | print(gresults[0]) 124 | ``` 125 | 126 | For localization, you can pass the `url` keyword and a localized url. 
This queries and parses the localized url using the same engine's parser: 127 | ```python 128 | # Use google.de instead of google.com 129 | results = gsearch.search(*search_args, url="google.de") 130 | ``` 131 | 132 | If you need results in a specific language you can pass the 'hl' keyword and the 2-letter country abbreviation (here's a [handy list](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)): 133 | ```python 134 | # Use 'it' to receive italian results 135 | results = gsearch.search(*search_args, hl="it") 136 | ``` 137 | 138 | #### Cache 139 | The results are automatically cached for engine searches. You can either bypass the cache by adding `cache=False` to the `search` or `async_search` method or clear the engine's cache 140 | ```python 141 | from search_engine_parser.core.engines.github import Search as GitHub 142 | github = GitHub() 143 | # bypass the cache 144 | github.search("search-engine-parser", cache=False) 145 | 146 | #OR 147 | # clear cache before search 148 | github.clear_cache() 149 | github.search("search-engine-parser") 150 | ``` 151 | 152 | #### Proxy 153 | Adding a proxy entails sending details to the search function 154 | ```python 155 | from search_engine_parser.core.engines.github import Search as GitHub 156 | github = GitHub() 157 | github.search("search-engine-parser", 158 | # http proxies supported only 159 | proxy='http://123.12.1.0', 160 | proxy_auth=('username', 'password')) 161 | ``` 162 | 163 | 164 | #### Async 165 | search-engine-parser supports `async`: 166 | ```python 167 | results = await gsearch.async_search(*search_args) 168 | ``` 169 | 170 | #### Results 171 | The `SearchResults` after searching: 172 | ```python 173 | >>> results = gsearch.search("preaching to the choir", 1) 174 | >>> results 175 | 176 | # the object supports retrieving individual results by iteration of just by type (links, descriptions, titles) 177 | >>> results[0] # returns the first 178 | >>> results[0]["description"] # gets the description of the first item 179 | >>> results[0]["link"] # gets the link of the first item 180 | >>> results["descriptions"] # returns a list of all descriptions from all results 181 | ``` 182 | It can be iterated like a normal list to return individual `SearchItem`s. 183 | 184 | ### Command line 185 | 186 | search-engine-parser comes with a CLI tool known as `pysearch`. You can use it as such: 187 | 188 | ```bash 189 | pysearch --engine bing --type descriptions "Preaching to the choir" 190 | ``` 191 | 192 | Result: 193 | 194 | ```bash 195 | 'Preaching to the choir' originated in the USA in the 1970s. It is a variant of the earlier 'preaching to the converted', which dates from England in the late 1800s and has the same meaning. Origin - the full story 'Preaching to the choir' (also sometimes spelled quire) is of US origin. 
196 | ``` 197 | 198 | ![Demo](https://github.com/bisoncorps/search-engine-parser/raw/master/assets/example.gif) 199 | 200 | ```bash 201 | usage: pysearch [-h] [-V] [-e ENGINE] [--show-summary] [-u URL] [-p PAGE] 202 | [-t TYPE] [-cc] [-r RANK] [--proxy PROXY] 203 | [--proxy-user PROXY_USER] [--proxy-password PROXY_PASSWORD] 204 | query 205 | 206 | SearchEngineParser 207 | 208 | positional arguments: 209 | query Query string to search engine for 210 | 211 | optional arguments: 212 | -h, --help show this help message and exit 213 | -V, --version show program's version number and exit 214 | -e ENGINE, --engine ENGINE 215 | Engine to use for parsing the query e.g google, yahoo, 216 | bing,duckduckgo (default: google) 217 | --show-summary Shows the summary of an engine 218 | -u URL, --url URL A custom link to use as base url for search e.g 219 | google.de 220 | -p PAGE, --page PAGE Page of the result to return details for (default: 1) 221 | -t TYPE, --type TYPE Type of detail to return i.e full, links, desciptions 222 | or titles (default: full) 223 | -cc, --clear-cache Clear cache of engine before searching 224 | -r RANK, --rank RANK ID of Detail to return e.g 5 (default: 0) 225 | --proxy PROXY Proxy address to make use of 226 | --proxy-user PROXY_USER 227 | Proxy user to make use of 228 | --proxy-password PROXY_PASSWORD 229 | Proxy password to make use of 230 | ``` 231 | 232 | 233 | 234 | ## Code of Conduct 235 | Make sure to adhere to the [code of conduct](CODE_OF_CONDUCT.md) at all times. 236 | 237 | ## Contribution 238 | Before making any contributions, please read the [contribution guide](CONTRIBUTING.md). 239 | 240 | ## License (MIT) 241 | This project is licensed under the [MIT 2.0 License](LICENSE) which allows very broad use for both academic and commercial purposes. 242 | 243 | ## Contributors ✨ 244 | 245 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 |

- Ed Luff: 💻
- Diretnan Domnan: 🚇 ⚠️ 🔧 💻
- MeNsaaH: 🚇 ⚠️ 🔧 💻
- Aditya Pal: ⚠️ 💻 📖
- Avinash Reddy: 🐛
- David Onuh: 💻 ⚠️
- Panagiotis Simakis: 💻 ⚠️
- reiarthur: 💻
- Ashokkumar TA: 💻
- Andreas Teuber: 💻
- mi096684: 🐛
- devajithvs: 💻
- Geg Zakaryan: 💻 🐛
- Hakan Boğan: 🐛
- NicKoehler: 🐛 💻
- ChrisLin: 🐛 💻
- Pietro: 💻 🐛
275 | 276 | 277 | 278 | 279 | 280 | 281 | This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! 282 | -------------------------------------------------------------------------------- /assets/animate.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisohns/search-engine-parser/0c2f4bde7dd21c10e64c9204417d9a228e96c187/assets/animate.gif -------------------------------------------------------------------------------- /assets/example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisohns/search-engine-parser/0c2f4bde7dd21c10e64c9204417d9a228e96c187/assets/example.gif -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/documentation.md: -------------------------------------------------------------------------------- 1 | ## Documentation 2 | 3 | The documentation for this project is generated by [Sphinx](https://sphinx-doc.org) and is hosted at [Read the Docs](https://search-engine-parser.readthedocs.io). 4 | On the root of the project, there exists a `docs` directory housing the sphinx configuration and rst files. 
5 | 6 | ### Understanding Sphinx 7 | 8 | If you have not made use of sphinx before, take a look at this explanatory [blogpost](https://medium.com/@richdayandnight/a-simple-tutorial-on-how-to-document-your-python-project-using-sphinx-and-rinohtype-177c22a15b5b) 9 | 10 | 11 | ### Documenting an Engine 12 | 13 | Write the appropriate summary, and document the class and every function as follows 14 | 15 | ```python 16 | """@desc 17 | # This is the module documentation 18 | Parser for FakeEngine search results 19 | """ 20 | 21 | 22 | class Search(BaseSearch): 23 | """ 24 | Searches FakeEngine for string 25 | """ 26 | name = "FakeEngine" 27 | summary = "\t Here lies the summary for a fake engine" 28 | 29 | def fake_function(self, input_1, input_2): 30 | """ 31 | Describe function here 32 | :param input_1: describe input 1 33 | :type single_result: str 34 | :param input_2: describe input 2 35 | :type input_2: int 36 | :return: this is an example return 37 | :rtype: str 38 | """ 39 | ``` 40 | 41 | ### Generating the files 42 | 43 | After including the necessary documentation 44 | 45 | * Go to the root of the project and then 46 | 47 | ```bash 48 | cd docs/ 49 | ``` 50 | 51 | * Ensure your virtualenv is enabled with all requirements listed in the [requirements-dev.txt](https://github.com/bisoncorps/search-engine-parser/blob/master/requirements-dev.txt) 52 | 53 | * Run the command 54 | 55 | ```bash 56 | sphinx-apidoc -f -o source/ ../search_engine_parser 57 | ``` 58 | 59 | * Write an appropriate commit message 60 | 61 | ```t 62 | Ex. Included documentation for the Yandex Engine 63 | ``` 64 | -------------------------------------------------------------------------------- /docs/engines.md: -------------------------------------------------------------------------------- 1 | ## Engines 2 | 3 | This document is dedicated to helping developers better understand how to include Engines to the SearchEngineParser OSS. 4 | 5 | ### What Search Engines are accepted 6 | 7 | This project was started primarily for general purpose search engines like Google and Bing. 8 | It has since surpassed that and aims to include all useful sites (termed `custom engines`). 9 | These custom engines include things like Youtube, GitHub, StackOverflow, e.t.c. 
10 | Basically, any site that is popular enough to search and that returns links can qualify.
11 |
12 | ### Skills Needed
13 |
14 | - Python (obviously)
15 | - Sphinx
16 | - Regular Expressions
17 | - Beautiful Soup
18 |
19 | ### Implementing an Engine
20 |
21 | The engine modules are in the [search_engine_parser/core/engines/](https://github.com/bisoncorps/search-engine-parser/blob/master/search_engine_parser/core/engines) directory.
22 |
23 | * Create a module for the new search engine
24 |
25 | * Create a class for the engine
26 |
27 | * The class should inherit from the base engine
28 |
29 | * An example for a fake engine is shown below
30 |
31 | ```python
32 |
33 | # fake.py
34 | from search_engine_parser.core.base import BaseSearch
35 | from search_engine_parser.core.exceptions import NoResultsOrTrafficError
36 |
37 | class FakeEngine(BaseSearch):
38 |     # name of the engine to be displayed on the CLI, preferably PascalCase
39 |     name = "FakeEngine"
40 |     # engine url to be searched, with parameters to be formatted e.g. query, page
41 |     search_url = "https://search.fake.com/fake/search"
42 |     # a short 2 or 3 line summary of the engine with some statistics, preferably obtained from Wikipedia
43 |     summary = "\t According to netmarketshare, this site is balderdash among "\
44 |         "search engines with a market share that is close to 100%. "\
45 |         "The fake engine includes many popular features but was solely created to show you an example."
46 |
47 |
48 |     # this function should return the dict of params to be passed to the search_url
49 |     def get_params(self, query=None, page=None, offset=None, **kwargs):
50 |         params = {}
51 |         params["q"] = query
52 |         params["page"] = page
53 |         return params
54 |
55 |     # This function should use Beautiful Soup (combined with regex if necessary)
56 |     # to return all the divs containing results
57 |     def parse_soup(self, soup):
58 |         return soup.find_all('div', class_='fake-result-div')
59 |
60 |     # This function should parse each result soup to return title, link, and description
61 |     # NOTE: The implementation may not be as straightforward as shown below
62 |     def parse_single_result(self, single_result):
63 |         title_div = single_result.find('div', class_='fake-title')
64 |         title = title_div.text
65 |         link_tag = title_div.find('a')
66 |         link = link_tag.get('href')
67 |         desc_span = single_result.find('span', class_='fake-description')
68 |         desc = desc_span.text
69 |         rdict = {
70 |             "titles": title,
71 |             "links": link,
72 |             "descriptions": desc,
73 |         }
74 |         return rdict
75 | ```
76 |
77 | * Import the engine by adding it to the following file
78 |
79 | [search_engine_parser/__init__.py](https://github.com/bisoncorps/search-engine-parser/blob/master/search_engine_parser/__init__.py)
80 |
81 | ```python
82 | ...
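# hypothetical import for the FakeEngine walked through above; real engines follow the same pattern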
83 | from search_engine_parser.core.engines.fake import Search as FakeEngineSearch 84 | ``` 85 | 86 | 87 | * Make sure to write code documentation by following the [documentation guide](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/documentation.md#documenting-an-engine) 88 | 89 | * [Generate the RST file](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/documentation.md#generating-the-files) 90 | 91 | * Add Engine to Supported Engines in [supported engines](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/supported_engines.md) 92 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Why do I get `RuntimeError: This event loop is already running` When running in Jupyter Notebook 4 | 5 | This is a popular issue on [Jupyter Notebook](https://github.com/jupyter/notebook/issues/5663). The solution: 6 | - try `pip install --upgrade ipykernel ipython` which should upgrade the ipykernet to a recent version with issue resolved 7 | - or add this to your notebook to allow nested asyncio loops 8 | ```bash 9 | !pip install nest-asyncio 10 | ``` 11 | 12 | ```python 13 | import nest_asyncio 14 | nest_asyncio.apply() 15 | ``` 16 | 17 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | import os 16 | import sys 17 | sys.path.insert(0, os.path.abspath('../..')) 18 | from search_engine_parser import __version__ as VERSION 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'Search Engine Parser' 23 | copyright = '2019, BisonCorps' 24 | author = 'Diretnan Domnan, Mmadu Manasseh' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = VERSION 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.todo', 44 | 'sphinx.ext.viewcode', 45 | 'sphinx.ext.githubpages', 46 | 'm2r', 47 | ] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | # source_suffix = ['.rst', '.md'] 56 | source_suffix = ['.rst', '.md'] 57 | 58 | # The master toctree document. 59 | master_doc = 'index' 60 | 61 | # The language for content autogenerated by Sphinx. Refer to documentation 62 | # for a list of supported languages. 63 | # 64 | # This is also used if you do content translation via gettext catalogs. 65 | # Usually you set "language" from the command line for these cases. 66 | language = None 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | # This pattern also affects html_static_path and html_extra_path. 71 | exclude_patterns = [] 72 | 73 | # The name of the Pygments (syntax highlighting) style to use. 74 | pygments_style = None 75 | 76 | 77 | # -- Options for HTML output ------------------------------------------------- 78 | 79 | # The theme to use for HTML and HTML Help pages. See the documentation for 80 | # a list of builtin themes. 81 | # 82 | html_theme = 'sphinx_rtd_theme' 83 | 84 | # Theme options are theme-specific and customize the look and feel of a theme 85 | # further. For a list of options available for each theme, see the 86 | # documentation. 87 | # 88 | # html_theme_options = {} 89 | 90 | # Add any paths that contain custom static files (such as style sheets) here, 91 | # relative to this directory. They are copied after the builtin static files, 92 | # so a file named "default.css" will overwrite the builtin "default.css". 93 | html_static_path = ['_static'] 94 | 95 | # Custom sidebar templates, must be a dictionary that maps document names 96 | # to template names. 97 | # 98 | # The default sidebars (for documents that don't match any pattern) are 99 | # defined by theme itself. Builtin themes are using these templates by 100 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 101 | # 'searchbox.html']``. 102 | # 103 | # html_sidebars = {} 104 | 105 | 106 | # -- Options for HTMLHelp output --------------------------------------------- 107 | 108 | # Output file base name for HTML help builder. 
109 | htmlhelp_basename = 'SearchEngineParserdoc' 110 | 111 | 112 | # -- Options for LaTeX output ------------------------------------------------ 113 | 114 | latex_elements = { 115 | # The paper size ('letterpaper' or 'a4paper'). 116 | # 117 | # 'papersize': 'letterpaper', 118 | 119 | # The font size ('10pt', '11pt' or '12pt'). 120 | # 121 | # 'pointsize': '10pt', 122 | 123 | # Additional stuff for the LaTeX preamble. 124 | # 125 | # 'preamble': '', 126 | 127 | # Latex figure (float) alignment 128 | # 129 | # 'figure_align': 'htbp', 130 | } 131 | 132 | # Grouping the document tree into LaTeX files. List of tuples 133 | # (source start file, target name, title, 134 | # author, documentclass [howto, manual, or own class]). 135 | latex_documents = [ 136 | (master_doc, 'SearchEngineParser.tex', 'Search Engine Parser Documentation', 137 | 'Diretnan Domnan, Mmadu Manasseh', 'manual'), 138 | ] 139 | 140 | 141 | # -- Options for manual page output ------------------------------------------ 142 | 143 | # One entry per manual page. List of tuples 144 | # (source start file, name, description, authors, manual section). 145 | man_pages = [ 146 | (master_doc, 'searchengineparser', 'Search Engine Parser Documentation', 147 | [author], 1) 148 | ] 149 | 150 | 151 | # -- Options for Texinfo output ---------------------------------------------- 152 | 153 | # Grouping the document tree into Texinfo files. List of tuples 154 | # (source start file, target name, title, author, 155 | # dir menu entry, description, category) 156 | texinfo_documents = [ 157 | (master_doc, 'SearchEngineParser', 'Search Engine Parser Documentation', 158 | author, 'SearchEngineParser', 'One line description of project.', 159 | 'Miscellaneous'), 160 | ] 161 | 162 | 163 | # -- Options for Epub output ------------------------------------------------- 164 | 165 | # Bibliographic Dublin Core info. 166 | epub_title = project 167 | 168 | # The unique identifier of the text. This can be a ISBN number 169 | # or the project homepage. 170 | # 171 | # epub_identifier = '' 172 | 173 | # A unique identification for the text. 174 | # 175 | # epub_uid = '' 176 | 177 | # A list of files that should not be packed into the epub file. 178 | epub_exclude_files = ['search.html'] 179 | 180 | 181 | # -- Extension configuration ------------------------------------------------- 182 | 183 | # -- Options for todo extension ---------------------------------------------- 184 | 185 | # If true, `todo` and `todoList` produce output, else they produce nothing. 186 | todo_include_todos = True 187 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Search Engine Parser documentation master file, created by 2 | sphinx-quickstart on Fri Feb 1 23:05:55 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Search Engine Parser's documentation! 7 | ================================================ 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | .. 
mdinclude:: ../../README.md 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | search_engine_parser 2 | ==================== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | search_engine_parser 8 | -------------------------------------------------------------------------------- /docs/source/search_engine_parser.core.engines.rst: -------------------------------------------------------------------------------- 1 | search\_engine\_parser.core.engines package 2 | =========================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | search\_engine\_parser.core.engines.aol module 8 | ---------------------------------------------- 9 | 10 | .. automodule:: search_engine_parser.core.engines.aol 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | search\_engine\_parser.core.engines.ask module 16 | ---------------------------------------------- 17 | 18 | .. automodule:: search_engine_parser.core.engines.ask 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | search\_engine\_parser.core.engines.baidu module 24 | ------------------------------------------------ 25 | 26 | .. automodule:: search_engine_parser.core.engines.baidu 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | search\_engine\_parser.core.engines.bing module 32 | ----------------------------------------------- 33 | 34 | .. automodule:: search_engine_parser.core.engines.bing 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | search\_engine\_parser.core.engines.coursera module 40 | --------------------------------------------------- 41 | 42 | .. automodule:: search_engine_parser.core.engines.coursera 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | search\_engine\_parser.core.engines.duckduckgo module 48 | ----------------------------------------------------- 49 | 50 | .. automodule:: search_engine_parser.core.engines.duckduckgo 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | search\_engine\_parser.core.engines.github module 56 | ------------------------------------------------- 57 | 58 | .. automodule:: search_engine_parser.core.engines.github 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | search\_engine\_parser.core.engines.google module 64 | ------------------------------------------------- 65 | 66 | .. automodule:: search_engine_parser.core.engines.google 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | search\_engine\_parser.core.engines.googlescholar module 72 | -------------------------------------------------------- 73 | 74 | .. automodule:: search_engine_parser.core.engines.googlescholar 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | search\_engine\_parser.core.engines.myanimelist module 80 | ------------------------------------------------------ 81 | 82 | .. automodule:: search_engine_parser.core.engines.myanimelist 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | search\_engine\_parser.core.engines.stackoverflow module 88 | -------------------------------------------------------- 89 | 90 | .. 
automodule:: search_engine_parser.core.engines.stackoverflow 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | search\_engine\_parser.core.engines.yahoo module 96 | ------------------------------------------------ 97 | 98 | .. automodule:: search_engine_parser.core.engines.yahoo 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | 103 | search\_engine\_parser.core.engines.yandex module 104 | ------------------------------------------------- 105 | 106 | .. automodule:: search_engine_parser.core.engines.yandex 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | 111 | search\_engine\_parser.core.engines.youtube module 112 | -------------------------------------------------- 113 | 114 | .. automodule:: search_engine_parser.core.engines.youtube 115 | :members: 116 | :undoc-members: 117 | :show-inheritance: 118 | 119 | 120 | Module contents 121 | --------------- 122 | 123 | .. automodule:: search_engine_parser.core.engines 124 | :members: 125 | :undoc-members: 126 | :show-inheritance: 127 | -------------------------------------------------------------------------------- /docs/source/search_engine_parser.core.rst: -------------------------------------------------------------------------------- 1 | search\_engine\_parser.core package 2 | =================================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | search_engine_parser.core.engines 10 | 11 | Submodules 12 | ---------- 13 | 14 | search\_engine\_parser.core.base module 15 | --------------------------------------- 16 | 17 | .. automodule:: search_engine_parser.core.base 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | search\_engine\_parser.core.cli module 23 | -------------------------------------- 24 | 25 | .. automodule:: search_engine_parser.core.cli 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | search\_engine\_parser.core.exceptions module 31 | --------------------------------------------- 32 | 33 | .. automodule:: search_engine_parser.core.exceptions 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: search_engine_parser.core 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/source/search_engine_parser.rst: -------------------------------------------------------------------------------- 1 | search\_engine\_parser package 2 | ============================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | search_engine_parser.core 10 | search_engine_parser.tests 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: search_engine_parser 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /docs/source/search_engine_parser.tests.rst: -------------------------------------------------------------------------------- 1 | search\_engine\_parser.tests package 2 | ==================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | search\_engine\_parser.tests.base module 8 | ---------------------------------------- 9 | 10 | .. automodule:: search_engine_parser.tests.base 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | search\_engine\_parser.tests.test\_search module 16 | ------------------------------------------------ 17 | 18 | .. 
automodule:: search_engine_parser.tests.test_search 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: search_engine_parser.tests 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/supported_engines.md: -------------------------------------------------------------------------------- 1 | ## Supported Engines 2 | 3 | Below is a list of supported engines and what they return. 4 | 5 | 6 | |No|Engine|Returns| 7 | |------|------|-----| 8 | 1|Google|titles, links, descriptions 9 | |2|Yahoo|titles, links, descriptions 10 | 3|Bing|titles, links, descriptions 11 | |4|DuckDuckGo|titles, links, descriptions 12 | 5|Baidu|titles, links, descriptions 13 | |6|Yandex|titles, links, descriptions 14 | 7|Aol|titles, links, descriptions 15 | 8|StackOverflow|titles, links, descriptions 16 | 9|GitHub|titles, links, descriptions, stars, languages 17 | 10|Ask|titles, links, descriptions 18 | 11|YouTube|titles, links, descriptions, channels, [single videos only: durations, views, upload_dates] 19 | 12|MyAnimeList|titles, links, descriptions, number of episodes, type of result (OVA, series, movie, etc.), ratings 20 | 13|GoogleScholar|titles, links, descriptions, type of results ([BOOK], [CITATION], etc.), links of files 21 | 14|GoogleNews|titles, links, descriptions, image links, date, news source 22 | 15|Coursera|titles,links,ratings count, ratings average, partners, difficulties, enrolments numbers 23 | -------------------------------------------------------------------------------- /requirements/cli.txt: -------------------------------------------------------------------------------- 1 | blessed >=1.15.0, < 2 2 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r main.txt 2 | blessed==1.17.9 3 | m2r==0.2.1 4 | parameterized==0.7.4 5 | pylint==2.5.3 6 | pytest==5.4.3 7 | sphinx==3.1.2 8 | sphinx-rtd-theme==0.5.0 9 | vcrpy==4.0.2 10 | -------------------------------------------------------------------------------- /requirements/main.txt: -------------------------------------------------------------------------------- 1 | lxml >=4.6.5, <5 2 | aiohttp >=3.6.2,<4 3 | beautifulsoup4 >=4.9.1,<5 4 | fake-useragent >=0.1.11, <0.2 5 | -------------------------------------------------------------------------------- /scripts/docs.sh: -------------------------------------------------------------------------------- 1 | cd ./docs 2 | sphinx-apidoc -f -o source/ ../search_engine_parser 3 | if [ $? -ne 0 ]; then 4 | echo "Failed to run sphinx-apidoc" 5 | exit 1 6 | fi 7 | make html 8 | if [ $? -ne 0 ]; then 9 | echo "Failed to make html" 10 | exit 1 11 | fi 12 | cd .. 13 | git commit -am "make html" 14 | git config --global push.default simple 15 | git config --global user.email "travis@travis-ci.com" 16 | git config --global user.name "Travis CI" 17 | 18 | 19 | #remove existing files except html 20 | shopt -s extglob 21 | rm -r ./!(docs)/ 22 | 23 | #copy contents of html to root 24 | cp -R ${TRAVIS_BUILD_DIR}/docs/build/html/. ${TRAVIS_BUILD_DIR}/ 25 | 26 | #remove html and accompanying docs 27 | rm -r ./docs 28 | echo "Viewing current files in directory" 29 | ls -lah 30 | # Checkout to gh-pages 31 | git checkout gh-pages 32 | if [ $? 
-eq 1 ]; then 33 | echo "Checked out to existing gh-pages branch" 34 | else 35 | git checkout -b gh-pages 36 | echo "Creating gh-pages branch" 37 | fi 38 | git add . 39 | git commit -am "rebuilt docs" 40 | git remote add origin-pages https://${GITHUB_TOKEN}@github.com/bisoncorps/search_engine_parser.git 41 | git push -u origin-pages gh-pages --force 42 | 43 | # echo if docs was succesfully pushed 44 | if [ $? -eq 0 ]; then 45 | echo "Docs successfully pushed to Github Pages" 46 | else 47 | echo "Failed to push docs" 48 | exit 1 49 | fi 50 | -------------------------------------------------------------------------------- /scripts/post_deploy_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # get current version 4 | VERSION="$(python setup.py --version)" 5 | echo "${VERSION}" 6 | 7 | # install python package 8 | pip uninstall search-engine-parser -y 9 | pip install search-engine-parser=="${VERSION}" 10 | python -c "import search_engine_parser" 11 | 12 | pip uninstall search-engine-parser -y 13 | 14 | pip install 'search-engine-parser[cli]=="${VERSION}"' 15 | 16 | # run the cli version to get a result 17 | python -m search_engine_parser.core.cli --engine bing search --query "Preaching to the choir" --type descriptions 18 | 19 | # run cli with pysearch 20 | pysearch -e youtube search -q "NoCopyrightSounds" 21 | 22 | if [ $? -eq 0 ]; then 23 | echo "Package works as expected" 24 | else 25 | echo "CLI handler of the package failed to execute" 26 | exit 1 27 | fi 28 | -------------------------------------------------------------------------------- /scripts/pre_deploy_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # change directory 4 | cd search_engine_parser/ 5 | 6 | python tests/__init__.py 7 | -------------------------------------------------------------------------------- /search_engine_parser/.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | ### Python template 142 | # Byte-compiled / optimized / DLL files 143 | __pycache__/ 144 | *.py[cod] 145 | *$py.class 146 | 147 | # C extensions 148 | *.so 149 | 150 | # Distribution / packaging 151 | .Python 152 | build/ 153 | develop-eggs/ 154 | dist/ 155 | downloads/ 156 | eggs/ 157 | .eggs/ 158 | lib/ 159 | lib64/ 160 | parts/ 161 | sdist/ 162 | var/ 163 | wheels/ 164 | share/python-wheels/ 165 | *.egg-info/ 166 | .installed.cfg 167 | *.egg 168 | MANIFEST 169 | 170 | # PyInstaller 171 | # Usually these files are written by a python script from a template 172 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
173 | *.manifest 174 | *.spec 175 | 176 | # Installer logs 177 | pip-log.txt 178 | pip-delete-this-directory.txt 179 | 180 | # Unit test / coverage reports 181 | htmlcov/ 182 | .tox/ 183 | .nox/ 184 | .coverage 185 | .coverage.* 186 | .cache 187 | nosetests.xml 188 | coverage.xml 189 | *.cover 190 | *.py,cover 191 | .hypothesis/ 192 | .pytest_cache/ 193 | cover/ 194 | 195 | # Translations 196 | *.mo 197 | *.pot 198 | 199 | # Django stuff: 200 | *.log 201 | local_settings.py 202 | db.sqlite3 203 | db.sqlite3-journal 204 | 205 | # Flask stuff: 206 | instance/ 207 | .webassets-cache 208 | 209 | # Scrapy stuff: 210 | .scrapy 211 | 212 | # Sphinx documentation 213 | docs/_build/ 214 | 215 | # PyBuilder 216 | .pybuilder/ 217 | target/ 218 | 219 | # Jupyter Notebook 220 | .ipynb_checkpoints 221 | 222 | # IPython 223 | profile_default/ 224 | ipython_config.py 225 | 226 | # pyenv 227 | # For a library or package, you might want to ignore these files since the code is 228 | # intended to run in multiple environments; otherwise, check them in: 229 | # .python-version 230 | 231 | # pipenv 232 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 233 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 234 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 235 | # install all needed dependencies. 236 | #Pipfile.lock 237 | 238 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 239 | __pypackages__/ 240 | 241 | # Celery stuff 242 | celerybeat-schedule 243 | celerybeat.pid 244 | 245 | # SageMath parsed files 246 | *.sage.py 247 | 248 | # Environments 249 | .env 250 | .venv 251 | env/ 252 | venv/ 253 | ENV/ 254 | env.bak/ 255 | venv.bak/ 256 | 257 | # Spyder project settings 258 | .spyderproject 259 | .spyproject 260 | 261 | # Rope project settings 262 | .ropeproject 263 | 264 | # mkdocs documentation 265 | /site 266 | 267 | # mypy 268 | .mypy_cache/ 269 | .dmypy.json 270 | dmypy.json 271 | 272 | # Pyre type checker 273 | .pyre/ 274 | 275 | # pytype static type analyzer 276 | .pytype/ 277 | 278 | # Cython debug symbols 279 | cython_debug/ 280 | 281 | #idea 282 | .idea/* 283 | -------------------------------------------------------------------------------- /search_engine_parser/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author 3 | Domnan Diretnan 4 | Artificial Intelligence Enthusiast & Software Engineer. 5 | Email: diretnandomnan@gmail.com 6 | Github: https://github.com/deven96 7 | GitLab: https://gitlab.com/Deven96 8 | 9 | Mmadu Manasseh 10 | Email: mmadumanasseh@gmail.com 11 | Github: https://github.com/mensaah 12 | GitLab: https://gitlab.com/mensaah 13 | 14 | @project 15 | @create date 2019-02-01 22:15:44 16 | @modify date 2019-02-01 22:15:44 17 | 18 | @license 19 | MIT License 20 | Copyright (c) 2018. Domnan Diretnan. 
All rights reserved 21 | 22 | """ 23 | 24 | # Allow import using `search_engine_parser.engines` 25 | from search_engine_parser.core import engines 26 | # Support for older versions of imports 27 | # DEPRECATION_WARNING: These imports will be removed in later versions 28 | from search_engine_parser.core.engines.aol import Search as AolSearch 29 | from search_engine_parser.core.engines.ask import Search as AskSearch 30 | from search_engine_parser.core.engines.baidu import Search as BaiduSearch 31 | from search_engine_parser.core.engines.bing import Search as BingSearch 32 | from search_engine_parser.core.engines.duckduckgo import \ 33 | Search as DuckDuckGoSearch 34 | from search_engine_parser.core.engines.github import Search as GithubSearch 35 | from search_engine_parser.core.engines.google import Search as GoogleSearch 36 | from search_engine_parser.core.engines.googlescholar import \ 37 | Search as GoogleScholarSearch 38 | from search_engine_parser.core.engines.stackoverflow import \ 39 | Search as StackOverflowSearch 40 | from search_engine_parser.core.engines.yahoo import Search as YahooSearch 41 | 42 | name = "search-engine-parser" # pylint: disable=invalid-name 43 | __version__ = "0.6.3" 44 | -------------------------------------------------------------------------------- /search_engine_parser/core/__init__.py: -------------------------------------------------------------------------------- 1 | import search_engine_parser.core.engines 2 | -------------------------------------------------------------------------------- /search_engine_parser/core/base.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Base class inherited by every search engine 3 | """ 4 | 5 | import asyncio 6 | import random 7 | from abc import ABCMeta, abstractmethod 8 | from contextlib import suppress 9 | from enum import Enum, unique 10 | from urllib.parse import urlencode, urlparse 11 | 12 | import aiohttp 13 | from bs4 import BeautifulSoup 14 | 15 | from search_engine_parser.core import utils 16 | from search_engine_parser.core.exceptions import NoResultsOrTrafficError 17 | 18 | 19 | @unique 20 | class ReturnType(Enum): 21 | FULL = "full" 22 | TITLE = "titles" 23 | DESCRIPTION = "descriptions" 24 | LINK = "links" 25 | 26 | 27 | # All results returned are each items of search 28 | class SearchItem(dict): 29 | """ 30 | SearchItem is a dict of results containing keys (titles, descriptions, links and other 31 | additional keys dependending on the engine) 32 | >>> result 33 | 34 | >>> result["description"] 35 | Some description 36 | >>> result["descriptions"] 37 | Same description 38 | """ 39 | def __getitem__(self, value): 40 | """ Allow getting by index and by type ('descriptions', 'links'...)""" 41 | try: 42 | return super().__getitem__(value) 43 | except KeyError: 44 | pass 45 | if not value.endswith('s'): 46 | value += 's' 47 | return super().__getitem__(value) 48 | 49 | 50 | class SearchResult(): 51 | """ 52 | The SearchResults after the searching 53 | 54 | >>> results = gsearch.search("preaching the choir", 1) 55 | >>> results 56 | 57 | 58 | The object supports retreiving individual results by iteration of just by type 59 | >>> results[0] # Returns the first result 60 | >>> results["descriptions"] # Returns a list of all descriptions from all results 61 | 62 | It can be iterated like a normal list to return individual SearchItem 63 | """ 64 | 65 | def __init__(self): 66 | self.results = [] 67 | 68 | def append(self, value): 69 | self.results.append(value) 70 | 
71 | def __getitem__(self, value): 72 | """ Allow getting by index and by type ('descriptions', 'links'...)""" 73 | if isinstance(value, int): 74 | return self.results[value] 75 | l = [] 76 | for x in self.results: 77 | with suppress(KeyError): 78 | l.append(x[value]) 79 | return l 80 | 81 | def keys(self): 82 | keys = {} 83 | with suppress(IndexError): 84 | x = self.results[0] 85 | keys = x.keys() 86 | return keys 87 | 88 | def __len__(self): 89 | return len(self.results) 90 | 91 | def __repr_(self): 92 | return "".format(len(self.results)) 93 | 94 | 95 | class BaseSearch: 96 | 97 | __metaclass__ = ABCMeta 98 | 99 | """ 100 | Search base to be extended by search parsers 101 | Every subclass must have two methods `search` amd `parse_single_result` 102 | """ 103 | # Summary of engine 104 | summary = None 105 | # Search Engine Name 106 | name = None 107 | # Search Engine unformatted URL 108 | search_url = None 109 | # The url after all query params have been set 110 | _parsed_url = None 111 | # boolean that indicates cache hit or miss 112 | _cache_hit = False 113 | 114 | @abstractmethod 115 | def parse_soup(self, soup): 116 | """ 117 | Defines the results contained in a soup 118 | """ 119 | raise NotImplementedError("subclasses must define method ") 120 | 121 | @abstractmethod 122 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 123 | """ 124 | Every div/span containing a result is passed here to retrieve 125 | `title`, `link` and `descr` 126 | """ 127 | raise NotImplementedError( 128 | "subclasses must define method ") 129 | 130 | def get_cache_handler(self): 131 | """ Return Cache Handler to use""" 132 | 133 | return utils.CacheHandler() 134 | 135 | @property 136 | def cache_handler(self): 137 | return self.get_cache_handler() 138 | 139 | def parse_result(self, results, **kwargs): 140 | """ 141 | Runs every entry on the page through parse_single_result 142 | 143 | :param results: Result of main search to extract individual results 144 | :type results: list[`bs4.element.ResultSet`] 145 | :returns: dictionary. Containing lists of titles, links, descriptions and other possible\ 146 | returns. 147 | :rtype: dict 148 | """ 149 | search_results = SearchResult() 150 | for each in results: 151 | rdict = self.parse_single_result(each, **kwargs) 152 | if rdict is not None: 153 | search_results.append(rdict) 154 | return search_results 155 | 156 | def get_params(self, query=None, page=None, offset=None, **kwargs): 157 | """ This function should be overwritten to return a dictionary of query params""" 158 | return {'q': query, 'page': page} 159 | 160 | def headers(self): 161 | headers = { 162 | "Cache-Control": 'no-cache', 163 | "Connection": "keep-alive", 164 | "User-Agent": utils.get_rand_user_agent() 165 | } 166 | return headers 167 | 168 | def clear_cache(self, all_cache=False): 169 | """ 170 | Triggers the clear cache function for a particular engine 171 | 172 | :param all_cache: if True, deletes for all engines 173 | """ 174 | if all_cache: 175 | return self.cache_handler.clear() 176 | return self.cache_handler.clear(self.name) 177 | 178 | async def get_source(self, url, cache=True, proxy=None, proxy_auth=None): 179 | """ 180 | Returns the source code of a webpage. 
181 | Also sets the _cache_hit if cache was used 182 | 183 | :rtype: string 184 | :param url: URL to pull it's source code 185 | :param proxy: proxy address to make use off 186 | :type proxy: str 187 | :param proxy_auth: (user, password) tuple to authenticate proxy 188 | :type proxy_auth: (str, str) 189 | :return: html source code of a given URL. 190 | """ 191 | try: 192 | html, cache_hit = await self.cache_handler.get_source(self.name, url, self.headers(), cache, proxy, proxy_auth) 193 | except Exception as exc: 194 | raise Exception('ERROR: {}\n'.format(exc)) 195 | self._cache_hit = cache_hit 196 | return html 197 | 198 | async def get_soup(self, url, cache, proxy, proxy_auth): 199 | """ 200 | Get the html soup of a query 201 | :param url: url to obrain soup from 202 | :type url: str 203 | :param cache: cache request or not 204 | :type cache: bool 205 | :param proxy: proxy address to make use off 206 | :type proxy: str 207 | :param proxy_auth: (user, password) tuple to authenticate proxy 208 | :type proxy_auth: (str, str) 209 | 210 | :rtype: `bs4.element.ResultSet` 211 | """ 212 | html = await self.get_source(url, cache, proxy, proxy_auth) 213 | return BeautifulSoup(html, 'lxml') 214 | 215 | def get_search_url(self, query=None, page=None, **kwargs): 216 | """ 217 | Return a formatted search url 218 | """ 219 | # Some URLs use offsets 220 | offset = (page * 10) - 9 221 | params = self.get_params( 222 | query=query, page=page, offset=offset, **kwargs) 223 | url = urlparse(self.search_url) 224 | # For localization purposes, custom urls can be parsed for the same engine 225 | # such as google.de and google.com 226 | if kwargs.get("url"): 227 | new_url = urlparse(kwargs.pop("url")) 228 | # When passing url without scheme e.g google.de, url is parsed as path 229 | if not new_url.netloc: 230 | url = url._replace(netloc=new_url.path) 231 | else: 232 | url = url._replace(netloc=new_url.netloc) 233 | self.base_url = url.geturl() 234 | self._parsed_url = url._replace(query=urlencode(params)) 235 | 236 | return self._parsed_url.geturl() 237 | 238 | def get_results(self, soup, **kwargs): 239 | """ Get results from soup""" 240 | 241 | search_results = None 242 | results = self.parse_soup(soup) 243 | # TODO Check if empty results is caused by traffic or answers to query 244 | # were not found 245 | if not results: 246 | print("ENGINE FAILURE: {}\n".format(self.name)) 247 | raise NoResultsOrTrafficError( 248 | "The result parsing was unsuccessful. It is either your query could not be found" 249 | " or it was flagged as unusual traffic") 250 | 251 | try: 252 | search_results = self.parse_result(results, **kwargs) 253 | # AttributeError occurs as it cannot pass the returned soup 254 | except AttributeError as e: 255 | raise NoResultsOrTrafficError( 256 | "The returned results could not be parsed. This might be due to site updates or " 257 | "server errors. Drop an issue at https://github.com/bisoncorps/search-engine-parser" 258 | " if this persists" 259 | ) 260 | 261 | return search_results 262 | 263 | def search(self, query=None, page=1, cache=True, proxy=None, proxy_auth=None, **kwargs): 264 | """ 265 | Query the search engine 266 | 267 | :param query: the query to search for 268 | :type query: str 269 | :param page: Page to be displayed, defaults to 1 270 | :type page: int 271 | :param proxy: proxy address to make use off 272 | :type proxy: str 273 | :param proxy_auth: (user, password) tuple to authenticate proxy 274 | :type proxy_auth: (str, str) 275 | :return: dictionary. 
Containing titles, links, netlocs and descriptions. 276 | """ 277 | # Pages can only be from 1-N 278 | if page <= 0: 279 | page = 1 280 | # Get search Page Results 281 | loop = asyncio.get_event_loop() 282 | url = self.get_search_url( 283 | query, page, **kwargs) 284 | soup = loop.run_until_complete( 285 | self.get_soup(url, cache=cache, 286 | proxy=proxy, 287 | proxy_auth=proxy_auth)) 288 | return self.get_results(soup, **kwargs) 289 | 290 | async def async_search(self, query=None, page=1, cache=True, proxy=None, proxy_auth=None, **kwargs): 291 | """ 292 | Query the search engine but in async mode 293 | 294 | :param query: the query to search for 295 | :type query: str 296 | :param page: Page to be displayed, defaults to 1 297 | :type page: int 298 | :param proxy: proxy address to make use off 299 | :type proxy: str 300 | :param proxy_auth: (user, password) tuple to authenticate proxy 301 | :type proxy_auth: (str, str) 302 | :return: dictionary. Containing titles, links, netlocs and descriptions. 303 | """ 304 | # Pages can only be from 1-N 305 | if page == 0: 306 | page = 1 307 | soup = await self.get_soup(self.get_search_url(query, page, **kwargs), cache=cache, proxy=proxy, proxy_auth=proxy_auth) 308 | return self.get_results(soup, **kwargs) 309 | -------------------------------------------------------------------------------- /search_engine_parser/core/cli.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Making use of the parser through cli 3 | """ 4 | from __future__ import print_function 5 | 6 | import argparse 7 | import sys 8 | from datetime import datetime 9 | from importlib import import_module 10 | 11 | from blessed import Terminal 12 | from search_engine_parser import __version__ 13 | from search_engine_parser.core.base import ReturnType 14 | from search_engine_parser.core.exceptions import NoResultsOrTrafficError 15 | 16 | 17 | def display(results, term, args): 18 | """ Displays search results 19 | """ 20 | def print_one(kwargs): 21 | """ Print one result to the console """ 22 | # Header 23 | if kwargs.get("titles"): 24 | print("\t{}".format(term.magenta(kwargs.pop("titles")))) 25 | if kwargs.get("links"): 26 | print("\t{}".format(kwargs.pop("links"))) 27 | print("\t-----------------------------------------------------") 28 | if kwargs.get("descriptions"): 29 | print(kwargs.pop("descriptions")) 30 | if kwargs.values(): 31 | for k, v in kwargs.items(): 32 | if v: 33 | print(k.strip(), " : ", v) 34 | print("\n") 35 | 36 | if args.rank and args.rank > 10: 37 | sys.exit( 38 | "Results are only limited to 10, specify a different page number instead") 39 | 40 | if not args.rank: 41 | for i in results: 42 | print_one(i) 43 | else: 44 | rank = args.rank 45 | print_one(results[rank]) 46 | 47 | 48 | def get_engine_class(engine): 49 | """ Return the Engine Class """ 50 | try: 51 | module = import_module( 52 | "search_engine_parser.core.engines.{}".format( 53 | engine.lower())) 54 | return getattr(module, "Search") 55 | except (ImportError, ModuleNotFoundError): 56 | sys.exit('Engine < {} > does not exist'.format(engine)) 57 | 58 | 59 | def show_summary(term, engine_class): 60 | """ Show the summary of an Engine""" 61 | print("\t{}".format(term.magenta(engine_class.name))) 62 | print("\t-----------------------------------------------------") 63 | print(engine_class.summary) 64 | 65 | 66 | def main(args): # pylint: disable=too-many-branches 67 | """ 68 | Executes logic from parsed arguments 69 | """ 70 | term = Terminal() 71 | 
engine_class = get_engine_class(args.engine) 72 | 73 | if args.show_summary: 74 | show_summary(term, engine_class) 75 | return 76 | 77 | if not args.query: 78 | print("--show-summary or --query argument must be passed") 79 | sys.exit(1) 80 | 81 | # Initialize search Engine with required params 82 | engine = engine_class() 83 | try: 84 | if args.clear_cache: 85 | engine.clear_cache() 86 | # Display full details: Header, Link, Description 87 | start = datetime.now() 88 | results = engine.search( 89 | args.query, args.page, return_type=ReturnType(args.type), url=args.url, proxy=args.proxy, proxy_auth=(args.proxy_user, args.proxy_password)) 90 | duration = datetime.now() - start 91 | display(results, term, args) 92 | print("Total search took -> %s seconds" % (duration)) 93 | except NoResultsOrTrafficError as exc: 94 | print('\n', '{}'.format(term.red(str(exc)))) 95 | 96 | 97 | def create_parser(): 98 | """ 99 | runner that handles parsing logic 100 | """ 101 | parser = argparse.ArgumentParser(description='SearchEngineParser', prog="pysearch") 102 | 103 | parser.add_argument('-V', '--version', action="version", version="%(prog)s v" + __version__) 104 | 105 | parser.add_argument( 106 | '-e', '--engine', 107 | help='Engine to use for parsing the query e.g google, yahoo, bing,' 108 | 'duckduckgo (default: google)', 109 | default='google') 110 | 111 | parser.add_argument( 112 | '--show-summary', 113 | action='store_true', 114 | help='Shows the summary of an engine') 115 | 116 | parser.add_argument( 117 | '-u', 118 | '--url', 119 | help='A custom link to use as base url for search e.g google.de') 120 | 121 | parser.add_argument( 122 | '-p', 123 | '--page', 124 | type=int, 125 | help='Page of the result to return details for (default: 1)', 126 | default=1) 127 | 128 | parser.add_argument( 129 | '-t', '--type', 130 | help='Type of detail to return i.e full, links, desciptions or titles (default: full)', 131 | default="full") 132 | 133 | parser.add_argument( 134 | '-cc', '--clear-cache', 135 | action='store_true', 136 | help='Clear cache of engine before searching') 137 | 138 | parser.add_argument( 139 | '-r', 140 | '--rank', 141 | type=int, 142 | help='ID of Detail to return e.g 5 (default: 0)') 143 | 144 | parser.add_argument( 145 | '--proxy', 146 | required=False, 147 | help='Proxy address to make use of') 148 | 149 | parser.add_argument( 150 | '--proxy-user', 151 | required='--proxy' in sys.argv, 152 | help='Proxy user to make use of') 153 | 154 | parser.add_argument( 155 | '--proxy-password', 156 | required='--proxy' in sys.argv, 157 | help='Proxy password to make use of') 158 | 159 | parser.add_argument( 160 | 'query', type=str, nargs='?', 161 | help='Query string to search engine for') 162 | 163 | return parser 164 | 165 | 166 | def runner(): 167 | parser = create_parser() 168 | args = parser.parse_args(sys.argv[1:]) 169 | main(args) 170 | 171 | 172 | if __name__ == '__main__': 173 | runner() 174 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisohns/search-engine-parser/0c2f4bde7dd21c10e64c9204417d9a228e96c187/search_engine_parser/core/engines/__init__.py -------------------------------------------------------------------------------- /search_engine_parser/core/engines/aol.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for AOL search 
results 3 | """ 4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 5 | 6 | 7 | class Search(BaseSearch): 8 | """ 9 | Searches Aol for string 10 | """ 11 | name = "AOL" 12 | search_url = "https://search.aol.com/aol/search?" 13 | summary = "\t According to netmarketshare, the old time famous AOL is still in the top 10 "\ 14 | "search engines with a market share that is close to 0.06%. "\ 15 | "The AOL network includes many popular web sites like engadget.com, techchrunch.com and "\ 16 | "the huffingtonpost.com. \nOn June 23, 2015, AOL was acquired by Verizon Communications." 17 | 18 | def parse_soup(self, soup): 19 | """ 20 | Parses AOL for a search query 21 | """ 22 | # find all divs 23 | return soup.find_all('div', class_='algo-sr') 24 | 25 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 26 | """ 27 | Parses the source code to return 28 | 29 | :param single_result: single result found in
30 | :type single_result: `bs4.element.ResultSet` 31 | :return: parsed title, link and description of single result 32 | :rtype: dict 33 | """ 34 | rdict = SearchItem() 35 | h3_tag = single_result.find('h3') 36 | link_tag = h3_tag.find('a') 37 | if return_type in (ReturnType.FULL, return_type.TITLE): 38 | # Get the text and link 39 | rdict["titles"] = link_tag.text 40 | 41 | if return_type in (ReturnType.FULL, ReturnType.LINK): 42 | rdict["links"] = link_tag.get("href") 43 | 44 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 45 | caption = single_result.find('div', class_='compText aAbs') 46 | desc = caption.find('p', class_='lh-16') 47 | rdict["descriptions"] = desc.text 48 | 49 | return rdict 50 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/ask.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for ask search results 3 | """ 4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 5 | 6 | 7 | class Search(BaseSearch): 8 | """ 9 | Searches Ask for string 10 | """ 11 | name = "Ask" 12 | 13 | search_url = "https://www.ask.com/web?" 14 | 15 | summary = "\t Formerly known as Ask Jeeves, Ask.com receives approximately 0.42% of the search"\ 16 | " share. ASK is based on a question/answer format where most questions are answered by "\ 17 | "other users or are in the form of polls.\nIt also has the general search functionality "\ 18 | "but the results returned lack quality compared to Google or even Bing and Yahoo." 19 | 20 | def get_params(self, query=None, page=None, offset=None, **kwargs): 21 | params = {} 22 | params["o"] = 0 23 | params["l"] = "dir" 24 | params["qo"] = "pagination" 25 | params["q"] = query 26 | params["qsrc"] = 998 27 | params["page"] = page 28 | return params 29 | 30 | def parse_soup(self, soup): 31 | """ 32 | Parses Ask Search Soup for results 33 | """ 34 | # find all class_='PartialSearchResults-item' => each result 35 | return soup.find_all('div', class_="PartialSearchResults-item") 36 | 37 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 38 | """ 39 | Parses the source code to return 40 | 41 | :param single_result: single result found in
42 | :type single_result: `bs4.element.ResultSet` 43 | :return: parsed title, link and description of single result 44 | :rtype: str, str, str 45 | """ 46 | 47 | rdict = SearchItem() 48 | if return_type in (ReturnType.FULL, return_type.TITLE): 49 | rdict["titles"] = single_result.find('a').text 50 | 51 | if return_type in (ReturnType.FULL, return_type.TITLE): 52 | rdict["links"] = single_result.a["href"] 53 | 54 | if return_type in (ReturnType.FULL, return_type.TITLE): 55 | rdict["descriptions"] = single_result.find( 56 | 'p', class_="PartialSearchResults-item-abstract").text 57 | 58 | return rdict 59 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/baidu.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for Baidu search results 3 | """ 4 | 5 | import re 6 | 7 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 8 | 9 | 10 | class Search(BaseSearch): 11 | """ 12 | Searches Baidu for string 13 | """ 14 | name = "Baidu" 15 | search_url = "https://www.baidu.com/s?" 16 | summary = "\tBaidu, Inc. is a Chinese multinational technology company specializing in"\ 17 | " Internet-related services and products and artificial intelligence (AI), headquartered"\ 18 | " in Beijing's Haidian District.\n\tIt is one of the largest AI and internet"\ 19 | " companies in the world.\n\tBaidu offers various services, including a"\ 20 | " Chinese search engine, as well as a mapping service called Baidu Maps." 21 | 22 | """Override get_search_url""" 23 | 24 | def get_params(self, query=None, page=None, offset=None, **kwargs): 25 | params = {} 26 | params["wd"] = query 27 | params["pn"] = (page - 1) * 10 28 | params["oq"] = query 29 | return params 30 | 31 | def parse_soup(self, soup): 32 | """ 33 | Parses Baidu for a search query 34 | """ 35 | 36 | # Baidu search can be made deterministic via an id 37 | # Hence, a regex is used to match all eligible ids 38 | 39 | return soup.find_all('div', {'id': re.compile(r"^\d{1,2}")}, class_="c-container") 40 | 41 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 42 | """ 43 | Parses the source code to return 44 | 45 | :param single_result: single result found in div with a numeric id 46 | :type single_result: `bs4.element.Tag` 47 | :return: parsed title, link and description of single result 48 | :rtype: dict 49 | """ 50 | rdict = SearchItem() 51 | if return_type in (ReturnType.FULL, return_type.TITLE): 52 | h3_tag = single_result.find('h3') 53 | 54 | # sometimes h3 tag is not found 55 | if h3_tag: 56 | rdict["title"] = h3_tag.text 57 | 58 | if return_type in (ReturnType.FULL, ReturnType.LINK): 59 | link_tag = single_result.find('a') 60 | # Get the text and link 61 | rdict["links"] = link_tag.get('href') 62 | 63 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 64 | desc = single_result.find('div', class_='c-abstract') 65 | rdict["descriptions"] = desc if desc else '' 66 | return rdict 67 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/bing.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for Bing search results 3 | """ 4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 5 | 6 | 7 | class Search(BaseSearch): 8 | """ 9 | Searches Bing for string 10 | """ 11 | name = "Bing" 12 | search_url = 
"https://www.bing.com/search?" 13 | summary = "\tBing is Microsoft’s attempt to challenge Google in search, but despite their "\ 14 | "efforts they still did not manage to convince users that their search engine can be"\ 15 | " an alternative to Google.\n\tTheir search engine market share is constantly below "\ 16 | "10%, even though Bing is the default search engine on Windows PCs." 17 | 18 | def get_params(self, query=None, page=None, offset=None, **kwargs): 19 | params = {} 20 | params["q"] = query 21 | params["offset"] = 0 22 | params["first"] = offset 23 | params["count"] = 10 24 | params["FORM"] = "PERE" 25 | return params 26 | 27 | def parse_soup(self, soup): 28 | """ 29 | Parses Bing for a search query. 30 | """ 31 | # find all li tags 32 | return soup.find_all('li', class_='b_algo') 33 | 34 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 35 | """ 36 | Parses the source code to return 37 | 38 | :param single_result: single result found in
  • 39 | :type single_result: `bs4.element.ResultSet` 40 | :return: parsed title, link and description of single result 41 | :rtype: dict 42 | """ 43 | rdict = SearchItem() 44 | h2_tag = single_result.find('h2') 45 | link_tag = h2_tag.find('a') 46 | 47 | if return_type in (ReturnType.FULL, return_type.TITLE): 48 | rdict["titles"] = link_tag.text 49 | 50 | if return_type in (ReturnType.FULL, return_type.LINK): 51 | link = link_tag.get('href') 52 | rdict["links"] = link 53 | 54 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 55 | caption = single_result.find('div', class_='b_caption') 56 | desc = caption.find('p') 57 | rdict["descriptions"] = desc.text 58 | 59 | return rdict 60 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/coursera.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for coursera search results 3 | """ 4 | 5 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 6 | from urllib.parse import urljoin 7 | 8 | 9 | class Search(BaseSearch): 10 | """ 11 | Searches Coursera for string 12 | """ 13 | name = "Coursera" 14 | search_url = "https://www.coursera.org/search?" 15 | summary = "\tCoursera is an American online learning platform founded by Stanford professors Andrew Ng and " \ 16 | "Daphne Koller that offers massive open online courses, specializations, and degrees." 17 | 18 | def get_params(self, query=None, page=None, offset=None, **kwargs): 19 | params = {} 20 | params["query"] =query 21 | params["page"] = page 22 | return params 23 | 24 | def parse_soup(self, soup): 25 | """ 26 | Parses Coursera Search Soup for results 27 | """ 28 | # find all class_='gs_r gs_or gs_scl' => each result 29 | return soup.find_all('li', class_='ais-InfiniteHits-item') 30 | 31 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 32 | """ 33 | Parses the source code to return 34 | 35 | :param single_result: single result found in
    36 | :type single_result: `bs4.element.ResultSet` 37 | :return: parsed title, link, description, file link, result type of single result 38 | :rtype: dict 39 | """ 40 | rdict = SearchItem() 41 | 42 | if return_type in (ReturnType.FULL, return_type.LINK): 43 | link = single_result.find('a', class_='rc-DesktopSearchCard anchor-wrapper').get('href') 44 | 45 | rdict["links"] = urljoin('https://www.coursera.org', link) 46 | 47 | if return_type in (ReturnType.FULL, return_type.TITLE): 48 | title = single_result.find('h2', class_="card-title").text 49 | rdict["titles"] = title 50 | 51 | if return_type in (ReturnType.FULL,): 52 | partner_elem = single_result.find('span', class_='partner-name') 53 | partner = '' 54 | if partner_elem: 55 | partner = partner_elem.text 56 | 57 | rating_avg_elem = single_result.find('span', class_='ratings-text') 58 | rating_avg = None 59 | if rating_avg_elem: 60 | rating_avg = float(rating_avg_elem.text) 61 | 62 | enrollment_elem = single_result.find('span', class_='enrollment-number') 63 | enrolment_number = None 64 | 65 | if enrollment_elem: 66 | enr_cl_txt = enrollment_elem.text.lower().replace(',', '').replace('.', '')\ 67 | .replace('m', '0' * 6).replace('k', '0' * 3) 68 | if enr_cl_txt.isdigit(): 69 | enrolment_number = int(enr_cl_txt) 70 | 71 | difficulty_elem = single_result.find('span', class_='difficulty') 72 | difficulty = '' 73 | if difficulty_elem: 74 | difficulty = difficulty_elem.text 75 | 76 | rating_count_elem = single_result.find('span', class_='ratings-count') 77 | rating_count = None 78 | if rating_count_elem: 79 | rating_count_elem = rating_count_elem.find('span') 80 | rating_count_cl = rating_count_elem.text.replace(',', '') 81 | if rating_count_cl.isdigit(): 82 | rating_count = int(rating_count_cl) 83 | 84 | rdict.update({ 85 | "partners": partner, 86 | "ratings_avg": rating_avg, 87 | "ratings_count": rating_count, 88 | "enrolments_numbers": enrolment_number, 89 | "difficulties": difficulty, 90 | }) 91 | return rdict 92 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/duckduckgo.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for DuckDuckGo search results 3 | """ 4 | import re 5 | 6 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 7 | 8 | 9 | class Search(BaseSearch): 10 | """ 11 | Searches DuckDuckGo for string 12 | """ 13 | name = "DuckDuckGo" 14 | base_url = "https://www.duckduckgo.com" 15 | search_url = "https://www.duckduckgo.com/html/?" 16 | summary = "\tHas a number of advantages over the other search engines. \n\tIt has a clean "\ 17 | "interface, it does not track users, it is not fully loaded with ads and has a number "\ 18 | "of very nice features (only one page of results, you can search directly other web "\ 19 | "sites etc).\n\tAccording to DuckDuckGo traffic stats [December, 2018], they are "\ 20 | "currently serving more than 30 million searches per day." 
21 | 22 | def get_params(self, query=None, page=None, offset=None, **kwargs): 23 | params = {} 24 | params["q"] = query 25 | params["s"] = 0 if (page < 2) else (((page-1) * 50) - 20) 26 | params["dc"] = offset 27 | params["o"] = "json" 28 | params["api"] = "d.js" 29 | return params 30 | 31 | def parse_soup(self, soup): 32 | """ 33 | Parses DuckDuckGo Search Soup for a query results 34 | """ 35 | # find all div tags 36 | return soup.find_all('div', class_='result') 37 | 38 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 39 | """ 40 | Parses the source code to return 41 | 42 | :param single_result: single result found in
    43 | :type single_result: `bs4.element.ResultSet` 44 | :return: parsed title, link and description of single result 45 | :rtype: dict 46 | """ 47 | 48 | rdict = SearchItem() 49 | 50 | if return_type in (ReturnType.FULL, return_type.TITLE): 51 | h2 = single_result.find( 52 | 'h2', class_="result__title") # pylint: disable=invalid-name 53 | # Get the text and link 54 | rdict["titles"] = h2.text.strip() 55 | 56 | if return_type in (ReturnType.FULL, ReturnType.LINK): 57 | link = None 58 | link_tag = single_result.find('a', class_="result__a") 59 | if link_tag is not None: 60 | rdict["links"] = link_tag.get('href') 61 | else: 62 | rdict['links'] = None 63 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 64 | desc = single_result.find(class_='result__snippet') 65 | if desc is not None: 66 | rdict["descriptions"] = desc.text 67 | else: 68 | rdict["descriptions"] = "" 69 | if rdict['links'] is None: 70 | rdict = None 71 | 72 | return rdict 73 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/github.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for GitHub search results 3 | """ 4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 5 | from search_engine_parser.core.exceptions import IncorrectKeyWord 6 | 7 | 8 | class Search(BaseSearch): 9 | """ 10 | Searches GitHub for string 11 | """ 12 | name = "GitHub" 13 | base_url = "https://github.com" 14 | search_url = base_url + "/search?" 15 | summary = "\tGitHub is an American company that provides hosting for software development "\ 16 | "version control using Git. It is a subsidiary of Microsoft, which acquired the company "\ 17 | "in 2018 for $7.5 billion.\n\tIt offers all of the distributed version control and source"\ 18 | " code management (SCM) functionality of Git as well as adding its own features."\ 19 | "\n\tAs of May 2019, GitHub reports having over 37 million users and more than 100 million"\ 20 | " repositories (including at least 28 million public repositories), making it the largest "\ 21 | "host of source code in the world." 22 | 23 | def get_params(self, query=None, page=None, offset=None, **kwargs): 24 | params = {} 25 | params["q"] = query 26 | params["p"] = page 27 | params["type"] = kwargs.get("type_", None) 28 | self.type = params["type"] 29 | return params 30 | 31 | def parse_soup(self, soup): 32 | """ 33 | Parses GitHub for a search query. 
34 | """ 35 | allowed_types = ( 36 | None, 37 | "Repositories", 38 | "Wikis", 39 | "Users", 40 | "Topics", 41 | "Marketplace", 42 | "RegistryPackages", 43 | "Issues", 44 | "Commits", 45 | "Code") 46 | if self.type not in allowed_types: 47 | raise IncorrectKeyWord( 48 | "No type <{type_}> exists".format(type_=self.type)) 49 | # find all li tags 50 | if self.type in (None, "Repositories"): 51 | return soup.find_all('li', class_='repo-list-item') 52 | elif self.type == "RegistryPackages": 53 | return soup.find_all("div", class_='hx_hit-package') 54 | # find all user divs 55 | elif self.type == "Users": 56 | return soup.find_all('div', class_='user-list-item') 57 | elif self.type == "Wikis": 58 | return soup.find_all('div', class_='hx_hit-wiki') 59 | elif self.type == "Topics": 60 | return soup.find_all('div', class_='topic-list-item') 61 | elif self.type == "Issues": 62 | return soup.find_all('div', class_='issue-list-item') 63 | elif self.type == "Marketplace": 64 | return soup.find_all('div', class_='hx_hit-marketplace') 65 | elif self.type == "Commits": 66 | return soup.find_all('div', class_='commits-list-item') 67 | 68 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 69 | """ 70 | Parses the source code to return 71 | 72 | :param single_result: single result found in container element 73 | :type single_result: `bs4.element.ResultSet` 74 | :return: parsed title, link and description of single result 75 | :rtype: dict 76 | """ 77 | rdict = SearchItem() 78 | if self.type in (None, "Repositories"): 79 | h3 = single_result.find( 80 | 'div', class_='f4') # pylint: disable=invalid-name 81 | link_tag = h3.find('a') 82 | # Get the text and link 83 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 84 | title = link_tag.text 85 | rdict["titles"] = title 86 | 87 | if return_type in (ReturnType.FULL, ReturnType.LINK): 88 | ref_link = link_tag.get('href') 89 | link = self.base_url + ref_link 90 | rdict["links"] = link 91 | 92 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 93 | desc = single_result.find('p', class_="mb-1") 94 | rdict["descriptions"] = getattr(desc, 'text', '') 95 | 96 | if return_type in (ReturnType.FULL,): 97 | stars_and_lang_div = single_result.find( 98 | 'div', class_='d-flex') 99 | lang = stars_and_lang_div.find( 100 | 'span', itemprop="programmingLanguage") 101 | stars = single_result.find('div', class_='mr-3').find( 102 | 'a') 103 | updated_on = single_result.find("relative-time").get("title") 104 | rdict.update({ 105 | "stars": "" if not stars else stars.text.strip(), 106 | "languages": lang.text if lang else "", 107 | "updated_on": updated_on, 108 | }) 109 | 110 | if self.type == "Users": 111 | title_tag = single_result.find('div', class_='f4') 112 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 113 | title = title_tag.text 114 | rdict["titles"] = title 115 | 116 | if return_type in (ReturnType.FULL, ReturnType.LINK): 117 | ref_link = title_tag.find('a').get('href') 118 | link = self.base_url + ref_link 119 | rdict["links"] = link 120 | 121 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 122 | desc_tag = single_result.find('p', class_='mb-1') 123 | desc = None 124 | if desc_tag: 125 | desc = desc_tag.text.strip(' \n') 126 | rdict["descriptions"] = desc 127 | 128 | if return_type in (ReturnType.FULL, ): 129 | location_div = single_result.find('div', class_='d-flex') 130 | location_and_email = location_div.find_all( 131 | 'div', class_='mr-3') 132 | location = email = None 133 | for single in 
location_and_email: 134 | if single.get('href') == None: 135 | location = single.text.strip(' \n') 136 | else: 137 | email = single.text 138 | 139 | rdict.update({ 140 | "locations": location, 141 | "emails": email, 142 | }) 143 | 144 | if self.type == "Wikis": 145 | title_tag = single_result.find('a', class_=None) 146 | 147 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 148 | title = title_tag.get('title') 149 | rdict["title"] = title 150 | 151 | if return_type in (ReturnType.FULL, ReturnType.LINK): 152 | ref_link = title_tag.get('href') 153 | link = self.base_url + ref_link 154 | rdict["links"] = link 155 | 156 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 157 | desc = single_result.find('p', class_="mb1").text 158 | rdict["descriptions"] = desc 159 | 160 | if return_type in (ReturnType.FULL, ): 161 | last_updated = single_result.find( 162 | 'relative-time').get('title') 163 | repository = single_result.find('a', class_='muted-link').text 164 | rdict.update({ 165 | "repositories": repository, 166 | "last_updated": last_updated, 167 | }) 168 | 169 | if self.type == "Topics": 170 | title_div = single_result.find('div', class_='f4') 171 | title_tag = title_div.find('a', class_=None) 172 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 173 | rdict["titles"] = title_tag.text 174 | if return_type in (ReturnType.FULL, ReturnType.LINK): 175 | ref_link = title_tag.get('href') 176 | link = self.base_url + ref_link 177 | rdict["links"] = link 178 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 179 | desc = None 180 | desc_tag = single_result.find('p', class_=None) 181 | if desc_tag: 182 | desc = desc_tag.text 183 | rdict["descriptions"] = desc 184 | 185 | if self.type == "Marketplace": 186 | title_tag = single_result.find('a', class_='no-underline') 187 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 188 | title = title_tag.get('title') 189 | rdict["titles"] = title_tag.text 190 | if return_type in (ReturnType.FULL, ReturnType.LINK): 191 | link = title_tag.get('href') 192 | rdict["links"] = link 193 | 194 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 195 | desc = None 196 | desc_tag = single_result.find('text-gray') 197 | if desc_tag: 198 | desc = desc_tag.text 199 | rdict["descriptions"] = desc 200 | 201 | if return_type in (ReturnType.FULL, ): 202 | categories = list() 203 | categories_tags = single_result.find_all('a', class_='Label') 204 | if categories_tags: 205 | for i in categories_tags: 206 | categories.append(str(i).strip('\n ')) 207 | rdict["categories"] = categories 208 | 209 | if self.type == "RegistryPackages": 210 | title_tag = single_result.find('a', class_='h4') 211 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 212 | title = title_tag.text 213 | rdict["titles"] = title_tag.text 214 | 215 | if return_type in (ReturnType.FULL, ReturnType.LINK): 216 | ref_link = title_tag.get('href') 217 | link = self.base_url + ref_link 218 | rdict["links"] = link 219 | 220 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 221 | desc = single_result.find( 222 | 'p', class_='mb-1').text.strip('\n ') 223 | rdict["descriptions"] = desc 224 | 225 | if self.type == "Issues": 226 | title_tag = single_result.find('a', class_=None) 227 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 228 | title = title_tag.text 229 | rdict["titles"] = title_tag.text 230 | 231 | if return_type in (ReturnType.FULL, ReturnType.LINK): 232 | ref_link = title_tag.get('href') 233 | link = self.base_url + ref_link 234 | 
rdict["links"] = link 235 | 236 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 237 | desc = single_result.find('p', class_='mb-0').text 238 | rdict["descriptions"] = desc 239 | 240 | if return_type in (ReturnType.FULL, ): 241 | repository = single_result.find( 242 | 'div', class_='ml-1').find('a', 'text-bold').text 243 | opened_by = self.base_url + \ 244 | single_result.find( 245 | 'div', class_='mr-3').find('a').get('href') 246 | opened_on = single_result.find('relative-time').get("title") 247 | rdict.update({ 248 | "opened_by": opened_by, 249 | "opened_on": opened_on, 250 | "respositories": repository, 251 | }) 252 | 253 | if self.type == "Commits": 254 | title_p = single_result.find('div', class_="f4") 255 | title_tag = title_p.find('a') 256 | 257 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 258 | title = title_tag.get('aria-label').strip("\n ") 259 | rdict["titles"] = title_tag.text 260 | 261 | if return_type in (ReturnType.FULL, ReturnType.LINK): 262 | ref_link = title_tag.get('href') 263 | if ref_link.startswith("http"): 264 | link = ref_link 265 | else: 266 | link = self.base_url + ref_link 267 | rdict["links"] = link 268 | 269 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 270 | opened_on = None 271 | author = None 272 | if single_result.find('relative-time'): 273 | opened_on = single_result.find( 274 | 'relative-time').get("title") 275 | desc = None 276 | if single_result.find('a', class_='commit-author'): 277 | author_tag = single_result.find( 278 | 'a', class_='commit-author') 279 | author = author_tag.text 280 | div = single_result.find('div', class_='d-flex') 281 | repo = div.find('a').text 282 | desc = "Committed to {}".format(repo) 283 | rdict["descriptions"] = desc 284 | if return_type == ReturnType.FULL: 285 | rdict.update({ 286 | "authors": author, 287 | "opened_on": opened_on, 288 | }) 289 | return rdict 290 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/google.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for google search results 3 | """ 4 | import sys 5 | from urllib.parse import ( 6 | urljoin, 7 | parse_qs, 8 | unquote 9 | ) 10 | import urllib.parse as urlparse 11 | 12 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 13 | 14 | 15 | EXTRA_PARAMS = ('hl', 'tbs') 16 | 17 | 18 | class Search(BaseSearch): 19 | """ 20 | Searches Google for string 21 | """ 22 | name = "Google" 23 | base_url = "https://www.google.com/" 24 | summary = "\tNo need for further introductions. The search engine giant holds the first "\ 25 | "place in search with a stunning difference of 65% from second in place Bing.\n"\ 26 | "\tAccording to the latest netmarketshare report (November 2018) 73% of searches "\ 27 | "were powered by Google and only 7.91% by Bing.\n\tGoogle is also dominating the "\ 28 | "mobile/tablet search engine market share with 81%!" 
29 | 30 | def __init__(self): 31 | super().__init__() 32 | self.search_url = urljoin(self.base_url, "search") 33 | 34 | def get_params(self, query=None, offset=None, page=None, **kwargs): 35 | params = {} 36 | params["start"] = (page-1) * 10 37 | params["q"] = query 38 | params["gbv"] = 1 39 | # additional parameters will be considered 40 | for param in EXTRA_PARAMS: 41 | if kwargs.get(param): 42 | params[param] = kwargs[param] 43 | return params 44 | 45 | def parse_url(self, url): 46 | return self.clean_url(urljoin(self.base_url, url)) 47 | 48 | def parse_soup(self, soup): 49 | """ 50 | Parses Google Search Soup for results 51 | """ 52 | # find all class_='g' => each result 53 | return soup.find_all('div', class_="Gx5Zad fP1Qef xpd EtOod pkphOe") 54 | 55 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 56 | """ 57 | Parses the source code to return 58 | 59 | :param single_result: single result found in
    60 | :type single_result: `bs4.element.ResultSet` 61 | :return: parsed title, link and description of single result 62 | :rtype: dict 63 | """ 64 | # Some unneeded details shown such as suggestions should be ignore 65 | if (single_result.find("h2", class_="wITvVb") and single_result.find("div", class_="LKSyXe"))\ 66 | or single_result.find("div", class_="X7NTVe"): 67 | return 68 | 69 | results = SearchItem() 70 | els = single_result.find_all('div', class_='kCrYT') 71 | if len(els) < 2: 72 | return 73 | 74 | # First div contains title and url 75 | r_elem = els[0] 76 | 77 | # Get the text and link 78 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 79 | link_tag = r_elem.find('a') 80 | if link_tag: 81 | title = link_tag.find('h3').text 82 | else: 83 | r_elem = els[1] 84 | title = r_elem.find('div', class_='BNeawe').text 85 | results['titles'] = title 86 | 87 | if return_type in (ReturnType.FULL, ReturnType.LINK): 88 | link_tag = r_elem.find('a') 89 | if link_tag: 90 | raw_link = link_tag.get('href') 91 | raw_url = urljoin(self.base_url, raw_link) 92 | results['raw_urls'] = raw_url 93 | results['links'] = self.clean_url(raw_url) 94 | 95 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 96 | # Second Div contains Description 97 | desc_tag = els[1] 98 | if return_type in (ReturnType.FULL, ReturnType.LINK) and not results.get('links'): 99 | link_tag = desc_tag.find('a') 100 | if link_tag: 101 | desc_tag = els[0] 102 | raw_link = link_tag.get('href') 103 | raw_url = urljoin(self.base_url, raw_link) 104 | results['raw_urls'] = raw_url 105 | results['links'] = self.clean_url(raw_url) 106 | desc = desc_tag.text 107 | results['descriptions'] = desc 108 | return results 109 | 110 | def clean_url(self, url): 111 | """ 112 | Extract clean URL from the SERP URL. 113 | 114 | >clean_url('https://www.google.com/url?q=https://english.stackexchange.com/questions/140710/what-is-the-opposite-of-preaching-to-the-choir&sa=U&ved=2ahUKEwi31MGyzvnuAhXyyDgGHXXACOYQFnoECAkQAg&usg=AOvVaw1GdXON-JIWGu-dGjHfgljl') 115 | https://english.stackexchange.com/questions/140710/what-is-the-opposite-of-preaching-to-the-choir 116 | """ 117 | parsed = urlparse.urlparse(url) 118 | url_qs = parse_qs(parsed.query) 119 | if 'q' in url_qs: 120 | return unquote(url_qs['q'][0]) 121 | elif 'url' in url_qs: 122 | return unquote(url_qs['url'][0]) 123 | # Add more cases here. 124 | return url 125 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/googlenews.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for google news search results 3 | """ 4 | 5 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 6 | 7 | 8 | class Search(BaseSearch): 9 | """ 10 | Searches Google News for string 11 | """ 12 | name = "GoogleNews" 13 | search_url = "https://www.google.com/search?" 14 | summary = "\tGoogle News is a news aggregator app developed by Google. It presents a "\ 15 | "continuous, customizable flow of articles organized from thousands of publishers "\ 16 | "and magazines. Google News is available as an app on Android, iOS, and the Web. "\ 17 | "Google released a beta version in September 2002 and the official app in January 2006." 
18 | 19 | def get_params(self, query=None, offset=None, page=None, **kwargs): 20 | params = {} 21 | params["num"] = 10 22 | params["start"] = page 23 | params["q"] = query 24 | params["client"] = "ubuntu" 25 | params["tbm"] = "nws" 26 | return params 27 | 28 | def parse_soup(self, soup): 29 | """ 30 | Parses Google News Search Soup for results 31 | """ 32 | # find all class_='g' => each result 33 | return soup.find_all('div', class_='g') 34 | 35 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 36 | """ 37 | Parses the source code to return 38 | 39 | :param single_result: single result found in
    40 | :type single_result: `bs4.element.ResultSet` 41 | :return: parsed title, link, description, imge link, news source, date of single result 42 | :rtype: dict 43 | """ 44 | rdict = SearchItem() 45 | 46 | if return_type in (ReturnType.FULL, return_type.TITLE): 47 | title_tag = single_result.find('h3') 48 | title = title_tag.text 49 | rdict["titles"] = title 50 | 51 | if return_type in (ReturnType.FULL, ReturnType.LINK): 52 | link_tag = single_result.find('a') 53 | rdict["links"] = link_tag.get('href') 54 | 55 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 56 | desc_tag = single_result.find('div', class_='st') 57 | rdict["descriptions"] = desc_tag.text 58 | 59 | if return_type in (ReturnType.FULL,): 60 | img_tag = single_result.find('img', class_='th') 61 | news_source_tag = single_result.find('span', class_='e8fRJf') 62 | date_tag = single_result.find('span', class_='f') 63 | 64 | rdict["image_url"] = img_tag.get('src') 65 | rdict["news_source"] = news_source_tag.text 66 | rdict["date"] = date_tag.text 67 | return rdict 68 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/googlescholar.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for google scholar search results 3 | """ 4 | 5 | import re 6 | 7 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 8 | 9 | 10 | class Search(BaseSearch): 11 | """ 12 | Searches Google Scholar for string 13 | """ 14 | name = "GoogleScholar" 15 | search_url = "https://scholar.google.gr/scholar?" 16 | summary = "\tGoogle Scholar is a freely accessible web search engine that indexes the full "\ 17 | "text or metadata of scholarly literature across an array of publishing formats and "\ 18 | "disciplines." 19 | 20 | def get_params(self, query=None, offset=None, page=None, **kwargs): 21 | params = {} 22 | params["hl"] = "en" 23 | params["start"] = page 24 | params["q"] = query 25 | return params 26 | 27 | def parse_soup(self, soup): 28 | """ 29 | Parses Google Scholar Search Soup for results 30 | """ 31 | # find all class_='gs_r gs_or gs_scl' => each result 32 | return soup.find_all('div', class_='gs_r gs_or gs_scl') 33 | 34 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 35 | """ 36 | Parses the source code to return 37 | 38 | :param single_result: single result found in
    39 | :type single_result: `bs4.element.ResultSet` 40 | :return: parsed title, link, description, file link, result type of single result 41 | :rtype: dict 42 | """ 43 | rdict = SearchItem() 44 | r_elem = single_result.find('h3', class_='gs_rt') 45 | if return_type in (ReturnType.FULL, ReturnType.LINK): 46 | link_tag = r_elem.find('a') 47 | if link_tag: 48 | raw_link = link_tag.get('href') 49 | else: 50 | raw_link = '' 51 | rdict["links"] = raw_link 52 | 53 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 54 | desc = single_result.find('div', class_='gs_rs') 55 | if desc: 56 | desc = desc.text 57 | else: 58 | desc = '' 59 | rdict["descriptions"] = desc 60 | 61 | if return_type in (ReturnType.FULL, return_type.TITLE): 62 | title = r_elem.text 63 | title = re.sub(r'^[\[\w+\]]+ ', '', title) 64 | rdict["titles"] = title 65 | 66 | if return_type == ReturnType.FULL: 67 | t_elem = single_result.find('span', class_='gs_ct1') 68 | if t_elem: 69 | result_type = t_elem.text 70 | else: 71 | result_type = '' 72 | 73 | f_elem = single_result.find('div', class_='gs_or_ggsm') 74 | if f_elem: 75 | flink_tag = r_elem.find('a') 76 | if flink_tag: 77 | file_link = flink_tag.get('href') 78 | else: 79 | file_link = '' 80 | else: 81 | file_link = '' 82 | 83 | rdict.update({ 84 | "result_types": result_type, 85 | "files_links": file_link 86 | }) 87 | 88 | return rdict 89 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/myanimelist.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for MyAnimeList search results 3 | """ 4 | 5 | import math 6 | import sys 7 | 8 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 9 | 10 | 11 | class Search(BaseSearch): 12 | """ 13 | Searches MyAnimeList for string 14 | """ 15 | name = "MyAnimeList" 16 | 17 | search_url = "https://myanimelist.net/anime.php?" 18 | summary = "\tMyAnimeList, often abbreviated as MAL, is an anime and manga social"\ 19 | "networking and social cataloging application website."\ 20 | "\n\tThe site provides its users with a list-like system to organize"\ 21 | "and score anime and manga.\n\tIt facilitates finding users who share"\ 22 | "similar tastes and provides a large database on anime and manga.\n\tThe"\ 23 | "site claims to have 4.4 million anime and 775,000 manga entries."\ 24 | "\n\tIn 2015, the site received over 120 million visitors a month." 
25 | 26 | def get_params(self, query=None, page=None, offset=None, **kwargs): 27 | params = {} 28 | params["show"] = (math.ceil(page / 5) - 1) * 50 29 | params["q"] = query 30 | return params 31 | 32 | def parse_soup(self, soup): 33 | """ 34 | Parses MyAnimeList for a search query 35 | """ 36 | 37 | # The data is stored in table so find all table rows 38 | # The first row is table header 39 | res = soup.find('div', class_='js-categories-seasonal js-block-list list') 40 | if res: 41 | return res.find_all('tr')[1:] 42 | 43 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 44 | """ 45 | Parses the source code to return 46 | 47 | :param single_result: single result found in a table row 48 | :type single_result: `bs4.element.Tag` 49 | :return: parsed title, link and description of single result 50 | :rtype: dict 51 | """ 52 | rdict = SearchItem() 53 | link_tag = single_result.find('a', class_='fw-b') 54 | 55 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 56 | title = link_tag.find('strong').text 57 | rdict["titles"] = title 58 | 59 | if return_type in (ReturnType.FULL, ReturnType.LINK): 60 | rdict["links"] = link_tag.get('href') 61 | 62 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 63 | desc = single_result.find('div', class_='pt4').text.strip() 64 | rdict["descriptions"] = desc 65 | 66 | if return_type == ReturnType.FULL: 67 | data = list(single_result.find_all('td', class_='ac')) 68 | animetype = data[0].text.strip() 69 | episodes = data[1].text.strip() 70 | score = data[2].text.strip() 71 | 72 | rdict.update({ 73 | "episode_count": episodes, 74 | "animetypes": animetype, 75 | "ratings": score 76 | }) 77 | return rdict 78 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/stackoverflow.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for StackOverflow search results 3 | """ 4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 5 | 6 | 7 | class Search(BaseSearch): 8 | """ 9 | Searches StackOverflow for string 10 | """ 11 | name = "StackOverflow" 12 | base_url = "https://stackoverflow.com" 13 | search_url = base_url + "/search?" 14 | summary = "\tStack Overflow is a question and answer site for professional and enthusiast "\ 15 | "programmers.\n\tIt is a privately held website, the flagship site of the Stack "\ 16 | "Exchange Network, created in 2008 by Jeff Atwood and Joel Spolsky.\n\tIt features "\ 17 | "questions and answers on a wide range of topics in computer programming. It was "\ 18 | "created to be a more open alternative to earlier question and answer sites "\ 19 | "such as Experts-Exchange." 20 | 21 | def get_params(self, query=None, offset=None, page=None, **kwargs): 22 | params = {} 23 | params["page"] = page 24 | params["q"] = query 25 | params["pagesize"] = 15 26 | return params 27 | 28 | def parse_soup(self, soup): 29 | """ 30 | Parses StackOverflow for a search query 31 | """ 32 | # find all divs 33 | return soup.find_all('div', class_='summary') 34 | 35 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 36 | """ 37 | Parses the source code to return 38 | 39 | :param single_result: single result found in
    40 | :type single_result: `bs4.element.ResultSet` 41 | :return: parsed title, link and description of single result 42 | :rtype: dict 43 | """ 44 | rdict = SearchItem() 45 | h3 = single_result.find('h3') # pylint: disable=invalid-name 46 | link_tag = h3.find('a') 47 | if return_type in (ReturnType.FULL, return_type.TITLE): 48 | # Get the text and link 49 | rdict["titles"] = link_tag.text 50 | 51 | if return_type in (ReturnType.FULL, return_type.LINK): 52 | ref_link = link_tag.get('href') 53 | link = self.base_url + ref_link 54 | rdict["links"] = link 55 | 56 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 57 | caption = single_result.find('div', class_='excerpt') 58 | rdict["descriptions"] = caption.text 59 | return rdict 60 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/yahoo.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for Yahoo search results 3 | """ 4 | import re 5 | 6 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 7 | 8 | 9 | class Search(BaseSearch): 10 | """ 11 | Searches Yahoo for string 12 | """ 13 | name = "Yahoo" 14 | search_url = "https://search.yahoo.com/search?" 15 | summary = "\tYahoo is one the most popular email providers and holds the fourth place in "\ 16 | "search with 3.90% market share.\n\tFrom October 2011 to October 2015, Yahoo search "\ 17 | "was powered exclusively by Bing. \n\tSince October 2015 Yahoo agreed with Google to "\ 18 | "provide search-related services and since then the results of Yahoo are powered both "\ 19 | "by Google and Bing. \n\tYahoo is also the default search engine for Firefox browsers "\ 20 | "in the United States (since 2014)." 21 | 22 | def get_params(self, query=None, page=None, offset=None, **kwargs): 23 | params = {} 24 | params["p"] = query 25 | params["b"] = offset 26 | return params 27 | 28 | def parse_soup(self, soup): 29 | """ 30 | Parses Yahoo for a search query 31 | """ 32 | # find all divs 33 | return soup.find_all('div', class_='Sr') 34 | 35 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 36 | """ 37 | Parses the source code to return 38 | 39 | :param single_result: single result found in
    40 | :type single_result: `bs4.element.ResultSet` 41 | :return: parsed title, link and description of single result 42 | :rtype: dict 43 | """ 44 | rdict = SearchItem() 45 | h3_tag = single_result.find('h3', class_='title') 46 | 47 | if return_type in (ReturnType.FULL, return_type.TITLE): 48 | title = h3_tag.text 49 | rdict["titles"] = title 50 | 51 | if return_type in (ReturnType.FULL, ReturnType.LINK): 52 | link_tag = h3_tag.find('a') 53 | raw_link = link_tag.get('href') 54 | re_str = re.findall("/RU=(.+)/RK", raw_link)[0] 55 | re_str = re_str.replace("%3a", ":") 56 | link = re_str.replace("%2f", "/") 57 | rdict["links"] = link 58 | 59 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 60 | desc = single_result.find('span', class_='fc-falcon') 61 | rdict["descriptions"] = desc.text 62 | 63 | return rdict 64 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/yandex.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for Yandex search results 3 | """ 4 | 5 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 6 | 7 | 8 | class Search(BaseSearch): 9 | """ 10 | Searches Yandex for string 11 | """ 12 | name = "Yandex" 13 | search_url = "https://yandex.com/search/?" 14 | summary = "\tYandex is the largest technology company in Russia and the"\ 15 | " largest search engine on the internet in Russian"\ 16 | ", with a market share of over 52%."\ 17 | "\n\tThe Yandex.ru home page is the 4th most popular website in Russia."\ 18 | "\n\tIt also has the largest market share of any search engine in the Commonwealth"\ 19 | " of Independent States and is the 5th largest search engine worldwide"\ 20 | " after Google, Baidu, Bing, and Yahoo!" 21 | 22 | def get_params(self, query=None, page=None, offset=None, **kwargs): 23 | params = {} 24 | params["text"] = query 25 | params["p"] = offset 26 | return params 27 | 28 | def parse_soup(self, soup): 29 | """ 30 | Parses Yandex for a search query 31 | """ 32 | return soup.find_all('li', class_="serp-item") 33 | 34 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 35 | """ 36 | Parses the source code to return 37 | 38 | :param single_result: single result found in
  • 39 | :type single_result: `bs4.element.ResultSet` 40 | :return: parsed title, link and description of single result 41 | :rtype: str, str, str 42 | """ 43 | rdict = SearchItem() 44 | h3_tag = single_result.find('div', class_="organic__url-text") 45 | 46 | if return_type in (ReturnType.FULL, return_type.TITLE): 47 | # Get the text and link 48 | title = h3_tag.text 49 | # Handle read more type texts 50 | index = title.find("Read more") 51 | if index >= 0: 52 | title = title[0:int(index)] 53 | rdict["titles"] = title 54 | 55 | if return_type in (ReturnType.FULL, ReturnType.LINK): 56 | link_tag = single_result.find('a') 57 | link = link_tag.get('href') 58 | rdict["links"] = link 59 | 60 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 61 | desc = single_result.find('div', class_="organic__content-wrapper") 62 | desc = desc.text 63 | rdict["descriptions"] = desc 64 | return rdict 65 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/youtube.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for YouTube search results 3 | """ 4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 5 | 6 | 7 | class Search(BaseSearch): 8 | """ 9 | Searches YouTube for string 10 | """ 11 | name = "YouTube" 12 | base_url = "https://youtube.com" 13 | search_url = base_url + "/results?" 14 | summary = "\tYouTube is an American video-sharing website headquartered in San Bruno, "\ 15 | "California. Three former PayPal employees—Chad Hurley, Steve Chen, and Jawed "\ 16 | "Karim—created the service in February 2005.\n\tGoogle bought the site in November "\ 17 | "2006 for US$1.65 billion; YouTube now operates as one of Google's subsidiaries. "\ 18 | "As of May 2019, more than 500 hours of video content are uploaded to YouTube every minute" 19 | 20 | def get_params(self, query=None, page=None, offset=None, **kwargs): 21 | params = {} 22 | params["search_query"] = query 23 | return params 24 | 25 | def parse_soup(self, soup): 26 | """ 27 | Parses YouTube for a search query. 
28 | """ 29 | # find all ytd-video-renderer tags 30 | return soup.find_all('div', class_='yt-lockup-content') 31 | 32 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 33 | """ 34 | Parses the source code to return 35 | 36 | :param single_result: single result found in 37 | :type single_result: `bs4.element.ResultSet` 38 | :return: parsed title, link and description of single result 39 | :rtype: dict 40 | """ 41 | rdict = SearchItem() 42 | # pylint: disable=too-many-locals 43 | title_tag = single_result.find('a', class_='yt-uix-tile-link') 44 | channel_name = "" 45 | 46 | if return_type in (ReturnType.FULL, return_type.TITLE): 47 | # Get the text and link 48 | rdict["titles"] = title_tag.text 49 | 50 | # try for single videos 51 | try: 52 | if return_type in (ReturnType.FULL, ReturnType.LINK): 53 | ref_link = title_tag.get('href') 54 | link = self.base_url + ref_link 55 | rdict["links"] = link 56 | 57 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 58 | desc = single_result.find( 59 | 'div', class_="yt-lockup-description").text 60 | rdict["descriptions"] = desc 61 | 62 | if return_type in (ReturnType.FULL, ): 63 | duration = single_result.find( 64 | 'span', class_='accessible-description').text 65 | ul_tag = single_result.find('ul', class_='yt-lockup-meta-info') 66 | 67 | channel_name = single_result.find( 68 | 'a', class_='yt-uix-sessionlink spf-link').text 69 | views_and_upload_date = ul_tag.find_all('li') 70 | upload_date = views_and_upload_date[0].text 71 | views = views_and_upload_date[1].text 72 | rdict.update({ 73 | "channels": channel_name, 74 | "durations": duration, 75 | "views": views, 76 | "upload_dates": upload_date, 77 | }) 78 | except BaseException: # pylint: disable=broad-except 79 | link_tags = single_result.find_all( 80 | 'a', class_='yt-uix-sessionlink spf-link') 81 | # TODO Optimize calls here so that we don't assign ref_link and channel_name 82 | # when we don't need them 83 | for i in link_tags: 84 | if i.get("href").startswith("/playlist"): 85 | ref_link = i.get("href") 86 | elif i.get("href").startswith("/user"): 87 | channel_name = i.text 88 | if return_type in (ReturnType.FULL, ReturnType.LINK): 89 | link = self.base_url + ref_link 90 | rdict["links"] = link 91 | 92 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 93 | desc = single_result.find( 94 | 'span', class_='accessible-description').text 95 | rdict["descriptions"] = desc 96 | if return_type in (ReturnType.FULL,): 97 | rdict.update({ 98 | "channels": channel_name, 99 | }) 100 | return rdict 101 | -------------------------------------------------------------------------------- /search_engine_parser/core/exceptions.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Exceptions 3 | """ 4 | 5 | 6 | class NoResultsFound(Exception): 7 | pass 8 | 9 | 10 | class NoResultsOrTrafficError(Exception): 11 | """ When No results is returned or unusual traffic caused app to return empty results """ 12 | 13 | class IncorrectKeyWord(Exception): 14 | """ When a wrong keyword argument is passed to the search function """ 15 | -------------------------------------------------------------------------------- /search_engine_parser/core/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import pickle 4 | import hashlib 5 | import aiohttp 6 | from fake_useragent import UserAgent 7 | 8 | FILEPATH = os.path.dirname(os.path.abspath(__file__)) 9 | 
10 | # prevent caching 11 | USER_AGENT_LIST = [ 12 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0", 13 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " 14 | "Chrome/72.0.3626.121 Safari/537.36", 15 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0", 16 | "Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0", 17 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) " 18 | "Chrome/19.0.1084.46 Safari/536.5", 19 | "Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) " 20 | "Chrome/19.0.1084.46 Safari/536.5", 21 | ] 22 | 23 | 24 | def get_rand_user_agent(): 25 | user_agent = random.choice(USER_AGENT_LIST) 26 | try: 27 | user_agent = UserAgent().random 28 | except: 29 | pass 30 | return user_agent 31 | 32 | 33 | 34 | class CacheHandler: 35 | def __init__(self): 36 | self.cache = os.path.join(FILEPATH, "cache") 37 | engine_path = os.path.join(FILEPATH, "engines") 38 | if not os.path.exists(self.cache): 39 | os.makedirs(self.cache) 40 | enginelist = os.listdir(engine_path) 41 | self.engine_cache = {i[:-3]: os.path.join(self.cache, i[:-3]) for i in enginelist if i not in 42 | ("__init__.py")} 43 | for cache in self.engine_cache.values(): 44 | if not os.path.exists(cache): 45 | os.makedirs(cache) 46 | 47 | async def get_source(self, engine, url, headers, cache=True, 48 | proxy=None, proxy_auth=None): 49 | """ 50 | Retrieves source code of webpage from internet or from cache 51 | 52 | :rtype: str, bool 53 | :param engine: engine of the engine saving 54 | :type engine: str 55 | :param url: URL to pull source code from 56 | :type url: str 57 | :param headers: request headers to make use of 58 | :type headers: dict 59 | :param cache: use cache or not 60 | :type cache: bool 61 | :param proxy: proxy address to make use off 62 | :type proxy: str 63 | :param proxy_auth: (user, password) tuple to authenticate proxy 64 | :type proxy_auth: (str, str) 65 | """ 66 | encodedUrl = url.encode("utf-8") 67 | urlhash = hashlib.sha256(encodedUrl).hexdigest() 68 | engine = engine.lower() 69 | cache_path = os.path.join(self.engine_cache[engine], urlhash) 70 | if os.path.exists(cache_path) and cache: 71 | with open(cache_path, 'rb') as stream: 72 | return pickle.load(stream), True 73 | get_vars = { 'url':url, 'headers':headers } 74 | if proxy and proxy_auth: 75 | auth = aiohttp.BasicAuth(*proxy_auth) 76 | get_vars.update({'proxy':proxy, 'proxy_auth': auth}) 77 | 78 | async with aiohttp.ClientSession() as session: 79 | async with session.get(**get_vars) as resp: 80 | html = await resp.text() 81 | with open(cache_path, 'wb') as stream: 82 | pickle.dump(str(html), stream) 83 | return str(html), False 84 | 85 | def clear(self, engine=None): 86 | """ 87 | Clear the entire cache either by engine name 88 | or just all 89 | 90 | :param engine: engine to clear 91 | """ 92 | if not engine: 93 | for engine_cache in self.engine_cache.values(): 94 | for root, dirs, files in os.walk(engine_cache): 95 | for f in files: 96 | os.remove(os.path.join(engine_cache, f)) 97 | else: 98 | engine_cache = self.engine_cache[engine.lower()] 99 | for _, _, files in os.walk(engine_cache): 100 | for f in files: 101 | os.remove(os.path.join(engine_cache, f)) 102 | -------------------------------------------------------------------------------- /search_engine_parser/tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bisohns/search-engine-parser/0c2f4bde7dd21c10e64c9204417d9a228e96c187/search_engine_parser/tests/__init__.py -------------------------------------------------------------------------------- /search_engine_parser/tests/test_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from importlib import import_module 4 | from urllib.parse import urlparse 5 | from unittest.mock import patch, MagicMock 6 | import vcr 7 | from parameterized import parameterized_class 8 | 9 | from search_engine_parser.core.exceptions import NoResultsOrTrafficError 10 | 11 | SEARCH_ARGS = ('Hello', 1) 12 | 13 | 14 | def get_engines(): 15 | """ Returns a list of all engines for tests """ 16 | engines = [] 17 | 18 | base_dir = os.getcwd() 19 | engines_dir = os.path.join(base_dir, 'search_engine_parser', 'core', 'engines') 20 | 21 | for filename in os.listdir(engines_dir): 22 | if os.path.isfile(os.path.join(engines_dir, filename)) and filename.endswith('.py') \ 23 | and filename != '__init__.py': 24 | engine = filename.split('.py')[0] 25 | module = import_module("search_engine_parser.core.engines.{}".format(engine.lower())) 26 | engine_class = getattr(module, "Search") 27 | engines.append([engine, engine_class(),]) 28 | return engines 29 | 30 | 31 | def validate_url(url): 32 | """ Checks if a url is valid 33 | urls must contain scheme, netloc and path 34 | """ 35 | try: 36 | result = urlparse(url) 37 | return all([result.scheme, result.netloc, result.path]) 38 | except BaseException: # pylint: disable=broad-except 39 | print("URL: %s\n" % url) 40 | return False 41 | 42 | 43 | # pylint: disable=no-member 44 | class EngineBaseTest(unittest.TestCase): 45 | """ Testbase for Engines 46 | 47 | provides tests for engine methods 48 | """ 49 | 50 | def setUp(self): 51 | from search_engine_parser.core.engines.google import Search # pylint: disable=import-outside-toplevel 52 | self.engine = Search() 53 | 54 | @patch('search_engine_parser.core.engines.google.Search.get_results') 55 | @patch('search_engine_parser.core.engines.google.Search.get_soup') 56 | async def test_urls(self, get_results_mock, get_soup_mock): 57 | """ Test that url updates work fine """ 58 | await self.engine.search(query="hello", url="google.com.tr") 59 | first_url = self.engine._parsed_url.geturl() 60 | self.assertTrue(validate_url(first_url)) 61 | 62 | self.engine.search(query="World", url="https://google.com.tr") 63 | second_url = self.engine._parsed_url.geturl() 64 | self.assertTrue(validate_url(second_url)) 65 | 66 | self.assertNotEqual(second_url, first_url) 67 | 68 | # Test for https://github.com/bisoncorps/search-engine-parser/issues/92 69 | def test_two_queries_different_results(self): 70 | """ Test that url updates work fine """ 71 | from search_engine_parser.core.engines.google import Search as GoogleSearch # pylint: disable=import-outside-toplevel 72 | from search_engine_parser.core.engines.yahoo import Search as YahooSearch # pylint: disable=import-outside-toplevel 73 | gengine = GoogleSearch() 74 | yahoo_engine = YahooSearch() 75 | gresults = None 76 | gresults = None 77 | with vcr.use_cassette('fixtures/google-test-diff-synopsis.yaml', record_mode='once'): 78 | gresults = gengine.search(query="What's up from this side") 79 | with vcr.use_cassette('fixtures/yahoo-test-diff-synopsis.yaml', record_mode='once'): 80 | yresults = yahoo_engine.search(query="this is example Bob") 81 | for key in gresults[0]: 82 | 
self.assertNotEqual(gresults[0].get(key, "GSearch"), yresults[0].get(key, "Ysearch")) 83 | 84 | self.assertNotEqual(gresults, yresults) 85 | 86 | # pylint: disable=no-member 87 | @parameterized_class(('name', 'engine'), get_engines()) 88 | class TestScraping(unittest.TestCase): 89 | """ Testbase for Engines 90 | 91 | provides tests for titles, description and return urls 92 | """ 93 | engine_class = None 94 | 95 | @classmethod 96 | def setUpClass(cls): 97 | super().setUpClass() 98 | 99 | try: 100 | cls.vcr_search(*SEARCH_ARGS) 101 | except NoResultsOrTrafficError: 102 | raise unittest.SkipTest( 103 | '{} failed due to traffic'.format( 104 | cls.engine)) 105 | 106 | @classmethod 107 | def vcr_search(cls, *args, **kwargs): 108 | print(cls.name) 109 | with vcr.use_cassette('fixtures/{}-{}-synopsis.yaml'.format(cls.name, args[0].replace(" ", "-")), record="once"): 110 | cls.results = cls.engine.search(*args, **kwargs) 111 | 112 | @classmethod 113 | def test_cache_used(cls): 114 | """ 115 | Test that the cache was used 116 | """ 117 | try: 118 | cls.vcr_search(*SEARCH_ARGS, cache=True) 119 | if cls.engine._cache_hit == False: 120 | assert False, "{} cache - unexpected miss".format( 121 | cls.engine.name) 122 | except NoResultsOrTrafficError: 123 | raise unittest.SkipTest( 124 | '{} failed due to traffic'.format( 125 | cls.engine)) 126 | 127 | @classmethod 128 | def test_cache_not_used(cls): 129 | """ 130 | Test that the cache was used 131 | """ 132 | try: 133 | cls.vcr_search(*SEARCH_ARGS, cache=False) 134 | if cls.engine._cache_hit == True: 135 | assert False, "{} cache - unexpected hit".format( 136 | cls.engine.name) 137 | except NoResultsOrTrafficError: 138 | raise unittest.SkipTest( 139 | '{} failed due to traffic'.format( 140 | cls.engine)) 141 | 142 | @classmethod 143 | def test_cache_bypassed(cls): 144 | """ 145 | Test that cache was bypassed 146 | """ 147 | # wrongly set cls.engine._cache_hit 148 | cls.engine._cache_hit = True 149 | try: 150 | cls.vcr_search(*SEARCH_ARGS, cache=False) 151 | if cls.engine._cache_hit == True: 152 | assert False, "{} cache - not bypassed".format( 153 | cls.engine.name) 154 | except NoResultsOrTrafficError: 155 | raise unittest.SkipTest( 156 | '{} failed due to traffic'.format( 157 | cls.engine)) 158 | 159 | def test_search_urls(self): 160 | """ 161 | Test that the search urls generated are valid 162 | """ 163 | self.assertTrue(validate_url(self.engine._parsed_url.geturl())) 164 | 165 | def test_returned_results(self): 166 | """ 167 | Test that the returned results have valid data. 8 is just a chosen value as most search 168 | engines return values more than that 169 | """ 170 | self.assertTrue(len(self.results['titles']) >= 4) 171 | self.assertTrue(len(self.results['links']) >= 4) 172 | # coursera does not return descriptions for 173 | # Preaching to the choir 174 | if not self.engine.name.lower() == "coursera": 175 | self.assertTrue(len(self.results['descriptions']) >= 4) 176 | else: 177 | self.assertTrue(len(self.results["difficulties"]) >= 4) 178 | 179 | def test_links(self): 180 | for link in self.results['links']: 181 | print("{}:::::{}".format(self.name, link)) 182 | # Sometimes googlescholar returns empty links for citation type results 183 | if not link and self.name.lower() == "googlescholar": 184 | continue 185 | self.assertTrue(validate_url(link)) 186 | 187 | def test_results_length_are_the_same(self): 188 | """ Tests if returned result items are equal. 
189 | :param args: a list/tuple of other keys returned 190 | """ 191 | # Different engines have different keys which may be returned or not returned 192 | # So if all keys are not the same length check that the titles and links length are 193 | # the same 194 | default_keys = ["titles", "links"] 195 | default_keys_set = set(map(lambda x: len(self.results[x]), default_keys)) 196 | 197 | items = self.results.keys() 198 | items_set = set(map(lambda x: len(self.results[x]), items)) 199 | 200 | self.assertTrue(len(items_set) == 1 or len(default_keys_set) == 1) 201 | -------------------------------------------------------------------------------- /search_engine_parser/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from unittest.mock import patch, MagicMock 4 | 5 | from search_engine_parser.core import cli 6 | 7 | engine_class_mock = MagicMock() 8 | engine_class_mock.name = "Random Engine Name" 9 | engine_class_mock.clear_cache = MagicMock() 10 | engine_class_mock.search = MagicMock() 11 | 12 | class CliTests(unittest.TestCase): 13 | 14 | def setUp(self): 15 | self.parser = cli.create_parser() 16 | 17 | def test_show_summary(self): 18 | args = self.parser.parse_args(["-e", "google", "--show-summary"]) 19 | # If it executes properly it should return None 20 | self.assertTrue(cli.main(args) is None) 21 | 22 | @patch('search_engine_parser.core.cli.get_engine_class', return_value=engine_class_mock) 23 | def test_query(self, engine_class): 24 | args = self.parser.parse_args(["-e", "google", "Preach"]) 25 | # If it executes properly it should return None 26 | self.assertTrue(cli.main(args) is None) 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | import setuptools 3 | 4 | REQUIRED_PYTHON = (3, 5) 5 | 6 | # Load requirements 7 | REQUIREMENTS = 'requirements/main.txt' 8 | CLI_REQUIREMENTS = 'requirements/cli.txt' 9 | REQUIREMENTS = [line.strip('\n') for line in open(REQUIREMENTS).readlines()] 10 | CLI_REQUIREMENTS = [line.strip('\n') for line in open(CLI_REQUIREMENTS).readlines()] 11 | 12 | with open("README.md", "r", encoding="utf8") as fh: 13 | LONG_DESCRIPTION = fh.read() 14 | 15 | # Trying to load version directly from `search-engine-parser` module attempts 16 | # to load __init__.py which will try to load other libraries not yet installed 17 | with open("search_engine_parser/__init__.py", "rt", encoding="utf8") as f: 18 | VERSION = re.search(r'__version__ = "(.*?)"', f.read(), re.M).group(1) 19 | 20 | setuptools.setup( 21 | name="search-engine-parser", 22 | version=VERSION, 23 | author='Domnan Diretnan, Mmadu Manasseh', 24 | author_email="diretnandomnan@gmail.com", 25 | description="scrapes search engine pages for query titles, descriptions and links", 26 | url="https://github.com/bisoncorps/search-engine-parser", 27 | project_urls={ 28 | "Documentation":"https://search-engine-parser.readthedocs.io/en/latest", 29 | "Source": "https://github.com/bisoncorps/search-engine-parser", 30 | }, 31 | packages=setuptools.find_packages(), 32 | install_requires=REQUIREMENTS, 33 | long_description=LONG_DESCRIPTION, 34 | long_description_content_type="text/markdown", 35 | license="MIT", 36 | keywords='\ 37 | search-engine \ 38 | search \ 39 | parser \ 40 | google \ 41 | yahoo \ 42 | bing \ 43 | yandex \ 44 | stackoverflow \ 45 | github \ 46 | baidu ', 47 | 
entry_points={'console_scripts': [ 48 | 'pysearch=search_engine_parser.core.cli:runner' 49 | ]}, 50 | classifiers=[ 51 | "Programming Language :: Python :: 3", 52 | "License :: OSI Approved :: MIT License", 53 | "Operating System :: OS Independent", 54 | ], 55 | package_data={ 56 | '': ['*.*'], 57 | 'requirements': ['*.*'], 58 | }, 59 | include_package_data=True, 60 | extras_require={ 61 | 'cli': CLI_REQUIREMENTS 62 | }, 63 | python_requires='>={}.{}'.format(*REQUIRED_PYTHON), 64 | ) 65 | --------------------------------------------------------------------------------
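For quick reference, the following is a minimal usage sketch of the package assembled from the files above. It relies only on the public surface exercised in search_engine_parser/tests/test_base.py and test_cli.py (engine classes named Search, a search(query, page) call, list-valued result keys such as "titles", "links" and "descriptions") and on the pysearch console script declared in setup.py; exact keys and extra fields vary per engine, and the CLI lines assume the optional "cli" extra is installed.

# Library usage (sketch based on the bundled tests)
from search_engine_parser.core.engines.google import Search as GoogleSearch

engine = GoogleSearch()
results = engine.search("preaching to the choir", 1)  # positional arguments: query, page
for title, link in zip(results["titles"], results["links"]):
    print(title, "->", link)

# CLI usage (sketch; the console script comes from setup.py's entry_points and
# its dependencies from the "cli" extra: pip install "search-engine-parser[cli]")
#   pysearch -e google "preaching to the choir"
#   pysearch -e google --show-summary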