├── .all-contributorsrc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE │ ├── bug_fix.md │ ├── documentation_related.md │ ├── engine_implementation.md │ └── feature_implementation.md └── workflows │ ├── deploy.yml │ └── test.yml ├── .gitignore ├── .pylintrc ├── .readthedocs.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── assets ├── animate.gif └── example.gif ├── docs ├── Makefile ├── documentation.md ├── engines.md ├── faq.md ├── make.bat ├── source │ ├── conf.py │ ├── index.rst │ ├── modules.rst │ ├── search_engine_parser.core.engines.rst │ ├── search_engine_parser.core.rst │ ├── search_engine_parser.rst │ └── search_engine_parser.tests.rst └── supported_engines.md ├── fixtures ├── aol-Hello-synopsis.yaml ├── ask-Hello-synopsis.yaml ├── baidu-Hello-synopsis.yaml ├── bing-Hello-synopsis.yaml ├── coursera-Hello-synopsis.yaml ├── duckduckgo-Hello-synopsis.yaml ├── github-Hello-synopsis.yaml ├── google-Hello-synopsis.yaml ├── google-test-diff-synopsis.yaml ├── googlenews-Hello-synopsis.yaml ├── googlescholar-Hello-synopsis.yaml ├── myanimelist-Hello-synopsis.yaml ├── stackoverflow-Hello-synopsis.yaml ├── yahoo-Hello-synopsis.yaml ├── yahoo-test-diff-synopsis.yaml ├── yandex-Hello-synopsis.yaml └── youtube-Hello-synopsis.yaml ├── requirements ├── cli.txt ├── dev.txt └── main.txt ├── scripts ├── docs.sh ├── post_deploy_test.sh └── pre_deploy_test.sh ├── search_engine_parser ├── .gitignore ├── __init__.py ├── core │ ├── __init__.py │ ├── base.py │ ├── cli.py │ ├── engines │ │ ├── __init__.py │ │ ├── aol.py │ │ ├── ask.py │ │ ├── baidu.py │ │ ├── bing.py │ │ ├── coursera.py │ │ ├── duckduckgo.py │ │ ├── github.py │ │ ├── google.py │ │ ├── googlenews.py │ │ ├── googlescholar.py │ │ ├── myanimelist.py │ │ ├── stackoverflow.py │ │ ├── yahoo.py │ │ ├── yandex.py │ │ └── youtube.py │ ├── exceptions.py │ └── utils.py └── tests │ ├── __init__.py │ ├── test_base.py │ └── test_cli.py └── setup.py /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | "README.md" 4 | ], 5 | "imageSize": 100, 6 | "commit": false, 7 | "contributors": [ 8 | { 9 | "login": "Rexogamer", 10 | "name": "Ed Luff", 11 | "avatar_url": "https://avatars0.githubusercontent.com/u/42586271?v=4", 12 | "profile": "https://github.com/Rexogamer", 13 | "contributions": [ 14 | "code" 15 | ] 16 | }, 17 | { 18 | "login": "deven96", 19 | "name": "Diretnan Domnan", 20 | "avatar_url": "https://avatars3.githubusercontent.com/u/23453888?v=4", 21 | "profile": "http://diretnandomnan.webnode.com", 22 | "contributions": [ 23 | "infra", 24 | "test", 25 | "tool", 26 | "code" 27 | ] 28 | }, 29 | { 30 | "login": "MeNsaaH", 31 | "name": "MeNsaaH", 32 | "avatar_url": "https://avatars3.githubusercontent.com/u/24734308?v=4", 33 | "profile": "http://mensaah.github.io", 34 | "contributions": [ 35 | "infra", 36 | "test", 37 | "tool", 38 | "code" 39 | ] 40 | }, 41 | { 42 | "login": "PalAditya", 43 | "name": "Aditya Pal", 44 | "avatar_url": "https://avatars2.githubusercontent.com/u/25523604?v=4", 45 | "profile": "https://github.com/PalAditya", 46 | "contributions": [ 47 | "test", 48 | "code", 49 | "doc" 50 | ] 51 | }, 52 | { 53 | "login": "AvinashReddy3108", 54 | "name": "Avinash Reddy", 55 | "avatar_url": "https://avatars1.githubusercontent.com/u/27774996?v=4", 56 | "profile": "http://energized.pro", 57 | "contributions": [ 58 | "bug" 59 | ] 60 | }, 61 | { 62 | "login": "Iamdavidonuh", 63 | 
"name": "David Onuh", 64 | "avatar_url": "https://avatars3.githubusercontent.com/u/37768509?v=4", 65 | "profile": "https://github.com/Iamdavidonuh", 66 | "contributions": [ 67 | "code", 68 | "test" 69 | ] 70 | }, 71 | { 72 | "login": "sp1thas", 73 | "name": "Panagiotis Simakis", 74 | "avatar_url": "https://avatars2.githubusercontent.com/u/8322266?v=4", 75 | "profile": "http://simakis.me", 76 | "contributions": [ 77 | "code", 78 | "test" 79 | ] 80 | }, 81 | { 82 | "login": "reiarthur", 83 | "name": "reiarthur", 84 | "avatar_url": "https://avatars2.githubusercontent.com/u/20190646?v=4", 85 | "profile": "https://github.com/reiarthur", 86 | "contributions": [ 87 | "code" 88 | ] 89 | }, 90 | { 91 | "login": "ashokkumarta", 92 | "name": "Ashokkumar TA", 93 | "avatar_url": "https://avatars0.githubusercontent.com/u/5450267?v=4", 94 | "profile": "http://ashokkumarta.blogspot.com/", 95 | "contributions": [ 96 | "code" 97 | ] 98 | }, 99 | { 100 | "login": "ateuber", 101 | "name": "Andreas Teuber", 102 | "avatar_url": "https://avatars2.githubusercontent.com/u/44349054?v=4", 103 | "profile": "https://github.com/ateuber", 104 | "contributions": [ 105 | "code" 106 | ] 107 | }, 108 | { 109 | "login": "mi096684", 110 | "name": "mi096684", 111 | "avatar_url": "https://avatars3.githubusercontent.com/u/22032932?v=4", 112 | "profile": "https://github.com/mi096684", 113 | "contributions": [ 114 | "bug" 115 | ] 116 | }, 117 | { 118 | "login": "devajithvs", 119 | "name": "devajithvs", 120 | "avatar_url": "https://avatars1.githubusercontent.com/u/29475282?v=4", 121 | "profile": "https://github.com/devajithvs", 122 | "contributions": [ 123 | "code" 124 | ] 125 | }, 126 | { 127 | "login": "zakaryan2004", 128 | "name": "Geg Zakaryan", 129 | "avatar_url": "https://avatars3.githubusercontent.com/u/29994884?v=4", 130 | "profile": "https://github.com/zakaryan2004", 131 | "contributions": [ 132 | "code", 133 | "bug" 134 | ] 135 | }, 136 | { 137 | "login": "redrussianarmy", 138 | "name": "Hakan Boğan", 139 | "avatar_url": "https://avatars1.githubusercontent.com/u/24498747?v=4", 140 | "profile": "https://www.hakanbogan.com", 141 | "contributions": [ 142 | "bug" 143 | ] 144 | }, 145 | { 146 | "login": "NicKoehler", 147 | "name": "NicKoehler", 148 | "avatar_url": "https://avatars3.githubusercontent.com/u/53040044?v=4", 149 | "profile": "https://github.com/NicKoehler", 150 | "contributions": [ 151 | "bug", 152 | "code" 153 | ] 154 | }, 155 | { 156 | "login": "chris4540", 157 | "name": "ChrisLin", 158 | "avatar_url": "https://avatars1.githubusercontent.com/u/12794588?v=4", 159 | "profile": "https://github.com/chris4540", 160 | "contributions": [ 161 | "bug", 162 | "code" 163 | ] 164 | }, 165 | { 166 | "login": "pgrandinetti", 167 | "name": "Pietro", 168 | "avatar_url": "https://avatars.githubusercontent.com/u/10454135?v=4", 169 | "profile": "http://pete.world", 170 | "contributions": [ 171 | "code", 172 | "bug" 173 | ] 174 | } 175 | ], 176 | "contributorsPerLine": 7, 177 | "projectName": "search-engine-parser", 178 | "projectOwner": "bisoncorps", 179 | "repoType": "github", 180 | "repoHost": "https://github.com", 181 | "skipCi": true 182 | } 183 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: 'bug' 6 | assignees: '@deven96' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A 
clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Open python console to 16 | 2. Import search_engine_parser 17 | 3. Search using .... Engine 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. Windows] 28 | - Python Version [e.g. 3.6.5] 29 | - Search-engine-parser version [e.g. 0.5.1] 30 | 31 | 32 | **Additional context** 33 | Add any other context about the problem here. 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: 'enhancement' 6 | assignees: '@deven96' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/bug_fix.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Fix 3 | title: '' 4 | labels: 'patch', 'needs-review' 5 | assignees: '@MeNsaaH' 6 | 7 | --- 8 | 9 | **Issue relating to the bug** 10 | Issue number relating to the bug e.g #13 11 | 12 | **Simple summary of steps Taken to fix the bug** 13 | A clear and concise description of what the fix is. Ex. I added a browser header to the base engine `search_engine_parser/core/engines/base` to prevent captchas. 14 | 15 | **Describe alternatives you've considered** 16 | A clear and concise description of any alternative solutions you've considered. 17 | 18 | **Additional context** 19 | Add any other context or screenshots about the fix here. 20 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/documentation_related.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation Related 3 | about: Added documentation to the project 4 | title: '' 5 | labels: 'documentation', 'needs-review' 6 | assignees: '@MenSaaH' 7 | 8 | --- 9 | 10 | **Describe the change to the documentation** 11 | A clear and concise description of what the change/addition is. 12 | 13 | **Issue fix?** 14 | Issue number that this documentation PR fixes. 15 | 16 | **Screenshots** 17 | If applicable, add screenshots of the sphinx documentation rendered on your local machine. 18 | 19 | **Additional context** 20 | Add any other context about the PR here. 
21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/engine_implementation.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Engine Implementation 3 | about: Implemented a new engine 4 | title: 'Name of Engine goes here' 5 | labels: 'engine', 'needs-review' 6 | assignees: '@deven96', '@MenSaaH' 7 | 8 | --- 9 | 10 | **Issue relating to the engine request** 11 | Issue number relating to the engine e.g #13 12 | 13 | **Summary of steps Taken to implement the engine** 14 | A clear and concise description of what the engine is. 15 | 16 | ```t 17 | Ex. I added the GitHub engine, `github.py` to the `search_engine_parser/core/engines` directory and made the necessary imports. 18 | This engine integrates GitHub search capabilities and returns stars, repository info, descriptions, links and titles. 19 | ``` 20 | 21 | **Describe any issues you've faced or inconsistencies in the engine** 22 | A clear and concise description of any issues you've faced. Ex. I was unable to parse 10 results per page due to [...] 23 | 24 | **Additional context** 25 | Add any other context or screenshots about the engine here. 26 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/feature_implementation.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Implementation 3 | about: '' 4 | title: '' 5 | labels: 'feature', 'needs-review' 6 | assignees: '@deven96', '@MenSaaH' 7 | 8 | --- 9 | 10 | **Issue relating to the feature** 11 | Issue number relating to the feature e.g #13 12 | 13 | **Summary of steps Taken to implement the feature** 14 | A clear and concise description of what the feature is. 15 | 16 | ```t 17 | Ex. I added a browser header to the base engine `search_engine_parser/core/engines/base` to prevent captchas. 18 | ``` 19 | 20 | **Describe any issues you've faced or inconsistencies in implementing the feature** 21 | A clear and concise description of any issues you've faced. Ex. Captchas still occur after a certain amount of usage [...] 22 | 23 | **Additional context** 24 | Add any other context or screenshots about the feature here. 25 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to Pypi 2 | on: 3 | push: 4 | tags: 5 | - 'v*.*.*' 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v1 12 | 13 | - name: Set up Python 3.7 14 | uses: actions/setup-python@v1 15 | with: 16 | python-version: 3.7 17 | 18 | - name: Install Dependencies 19 | run: pip install -r requirements/dev.txt 20 | 21 | - name: Set env 22 | run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV 23 | 24 | - name: update Package version 25 | run: sed -i "s/.*__version__.*/__version__ = \"${{ env.RELEASE_VERSION }}\"/g" search_engine_parser/__init__.py 26 | 27 | - name: Install pypa/build 28 | run: python -m pip install build --user 29 | 30 | - name: Build a binary wheel and a source tarball 31 | run: python -m build --sdist --wheel --outdir dist/ . 
32 | 33 | - name: Build Changelog 34 | id: github_release 35 | uses: mikepenz/release-changelog-builder-action@v3 36 | env: 37 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 38 | 39 | - name: Create Release 40 | uses: softprops/action-gh-release@v0.1.14 41 | with: 42 | body: ${{steps.github_release.outputs.changelog}} 43 | 44 | - name: Publish package 45 | uses: pypa/gh-action-pypi-publish@release/v1 46 | with: 47 | user: __token__ 48 | password: ${{ secrets.PYPI_API_TOKEN }} 49 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: 3 | push: 4 | branches: 5 | - master 6 | paths: 7 | - '**.py' 8 | - 'requirements/**' 9 | pull_request: 10 | branches: 11 | - master 12 | paths: 13 | - '**.py' 14 | - 'requirements/**' 15 | 16 | jobs: 17 | test: 18 | strategy: 19 | matrix: 20 | python: ["3.6", "3.7", "3.8", "3.9"] 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: actions/checkout@v1 24 | 25 | - name: Set up Python 3.7 26 | uses: actions/setup-python@v1 27 | with: 28 | python-version: ${{ matrix.python }} 29 | 30 | - name: Install Dependencies 31 | run: pip install -r requirements/dev.txt 32 | 33 | - name: Run tests 34 | run: pytest -s 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | #search_engine_parser cache 107 | **/cache/** 108 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. 
Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=1 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | 29 | # List of plugins (as comma separated values of python modules names) to load, 30 | # usually to register additional checkers. 31 | load-plugins= 32 | 33 | # Pickle collected data for later comparisons. 34 | persistent=yes 35 | 36 | # Specify a configuration file. #rcfile= 37 | 38 | # When enabled, pylint would attempt to guess common misconfiguration and emit 39 | # user-friendly hints instead of false-positive error messages. 40 | suggestion-mode=yes 41 | 42 | # Allow loading of arbitrary C extensions. Extensions are imported into the 43 | # active Python interpreter and may run arbitrary code. 44 | unsafe-load-any-extension=no 45 | 46 | 47 | [MESSAGES CONTROL] 48 | 49 | # Only show warnings with the listed confidence levels. Leave empty to show 50 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 51 | confidence= 52 | 53 | # Disable the message, report, category or checker with the given id(s). You 54 | # can either give multiple identifiers separated by comma (,) or put this 55 | # option multiple times (only on the command line, not in the configuration 56 | # file where it should appear only once). You can also use "--disable=all" to 57 | # disable everything first and then reenable specific checks. For example, if 58 | # you want to run only the similarities checker, you can use "--disable=all 59 | # --enable=similarities". If you want to run only the classes checker, but have 60 | # no Warning level messages displayed, use "--disable=all --enable=classes 61 | # --disable=W". 
62 | disable=print-statement, 63 | parameter-unpacking, 64 | unpacking-in-except, 65 | old-raise-syntax, 66 | backtick, 67 | long-suffix, 68 | old-ne-operator, 69 | old-octal-literal, 70 | import-star-module-level, 71 | non-ascii-bytes-literal, 72 | raw-checker-failed, 73 | bad-inline-option, 74 | locally-disabled, 75 | file-ignored, 76 | suppressed-message, 77 | useless-suppression, 78 | deprecated-pragma, 79 | use-symbolic-message-instead, 80 | apply-builtin, 81 | basestring-builtin, 82 | buffer-builtin, 83 | cmp-builtin, 84 | coerce-builtin, 85 | execfile-builtin, 86 | file-builtin, 87 | long-builtin, 88 | raw_input-builtin, 89 | reduce-builtin, 90 | standarderror-builtin, 91 | unicode-builtin, 92 | xrange-builtin, 93 | coerce-method, 94 | delslice-method, 95 | getslice-method, 96 | setslice-method, 97 | no-absolute-import, 98 | old-division, 99 | dict-iter-method, 100 | dict-view-method, 101 | next-method-called, 102 | metaclass-assignment, 103 | indexing-exception, 104 | raising-string, 105 | reload-builtin, 106 | oct-method, 107 | hex-method, 108 | nonzero-method, 109 | cmp-method, 110 | input-builtin, 111 | round-builtin, 112 | missing-docstring, 113 | intern-builtin, 114 | unichr-builtin, 115 | map-builtin-not-iterating, 116 | zip-builtin-not-iterating, 117 | range-builtin-not-iterating, 118 | filter-builtin-not-iterating, 119 | using-cmp-argument, 120 | eq-without-hash, 121 | div-method, 122 | idiv-method, 123 | rdiv-method, 124 | exception-message-attribute, 125 | invalid-str-codec, 126 | sys-max-int, 127 | bad-python3-import, 128 | deprecated-string-function, 129 | deprecated-str-translate-call, 130 | deprecated-itertools-function, 131 | deprecated-types-field, 132 | next-method-defined, 133 | dict-items-not-iterating, 134 | dict-keys-not-iterating, 135 | dict-values-not-iterating, 136 | deprecated-operator-function, 137 | deprecated-urllib-function, 138 | xreadlines-attribute, 139 | deprecated-sys-function, 140 | exception-escape, 141 | comprehension-escape, 142 | R0801 143 | 144 | # Enable the message, report, category or checker with the given id(s). You can 145 | # either give multiple identifier separated by comma (,) or put this option 146 | # multiple time (only on the command line, not in the configuration file where 147 | # it should appear only once). See also the "--disable" option for examples. 148 | enable=c-extension-no-member 149 | 150 | 151 | [REPORTS] 152 | 153 | # Python expression which should return a note less than 10 (10 is the highest 154 | # note). You have access to the variables errors warning, statement which 155 | # respectively contain the number of errors / warnings messages and the total 156 | # number of statements analyzed. This is used by the global evaluation report 157 | # (RP0004). 158 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 159 | 160 | # Template used to display messages. This is a python new-style format string 161 | # used to format the message information. See doc for all details. 162 | #msg-template= 163 | 164 | # Set the output format. Available formats are text, parseable, colorized, json 165 | # and msvs (visual studio). You can also give a reporter class, e.g. 166 | # mypackage.mymodule.MyReporterClass. 167 | output-format=text 168 | 169 | # Tells whether to display a full report or only the messages. 170 | reports=no 171 | 172 | # Activate the evaluation score. 
173 | score=yes 174 | 175 | 176 | [REFACTORING] 177 | 178 | # Maximum number of nested blocks for function / method body 179 | max-nested-blocks=5 180 | 181 | # Complete name of functions that never returns. When checking for 182 | # inconsistent-return-statements if a never returning function is called then 183 | # it will be considered as an explicit return statement and no message will be 184 | # printed. 185 | never-returning-functions=sys.exit 186 | 187 | 188 | [MISCELLANEOUS] 189 | 190 | # List of note tags to take in consideration, separated by a comma. 191 | notes=FIXME, 192 | XXX, 193 | TODO 194 | 195 | 196 | [LOGGING] 197 | 198 | # Format style used to check logging format string. `old` means using % 199 | # formatting, while `new` is for `{}` formatting. 200 | logging-format-style=old 201 | 202 | # Logging modules to check that the string format arguments are in logging 203 | # function parameter format. 204 | logging-modules=logging 205 | 206 | 207 | [STRING] 208 | 209 | # This flag controls whether the implicit-str-concat-in-sequence should 210 | # generate a warning on implicit string concatenation in sequences defined over 211 | # several lines. 212 | check-str-concat-over-line-jumps=no 213 | 214 | 215 | [SPELLING] 216 | 217 | # Limits count of emitted suggestions for spelling mistakes. 218 | max-spelling-suggestions=4 219 | 220 | # Spelling dictionary name. Available dictionaries: none. To make it working 221 | # install python-enchant package.. 222 | spelling-dict= 223 | 224 | # List of comma separated words that should not be checked. 225 | spelling-ignore-words= 226 | 227 | # A path to a file that contains private dictionary; one word per line. 228 | spelling-private-dict-file= 229 | 230 | # Tells whether to store unknown words to indicated private dictionary in 231 | # --spelling-private-dict-file option instead of raising a message. 232 | spelling-store-unknown-words=no 233 | 234 | 235 | [FORMAT] 236 | 237 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 238 | expected-line-ending-format= 239 | 240 | # Regexp for a line that is allowed to be longer than the limit. 241 | ignore-long-lines=^\s*(# )??$ 242 | 243 | # Number of spaces of indent required inside a hanging or continued line. 244 | indent-after-paren=4 245 | 246 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 247 | # tab). 248 | indent-string=' ' 249 | 250 | # Maximum number of characters on a single line. 251 | max-line-length=100 252 | 253 | # Maximum number of lines in a module. 254 | max-module-lines=1000 255 | 256 | # List of optional constructs for which whitespace checking is disabled. `dict- 257 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 258 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 259 | # `empty-line` allows space-only lines. 260 | no-space-check=trailing-comma, 261 | dict-separator 262 | 263 | # Allow the body of a class to be on the same line as the declaration if body 264 | # contains single statement. 265 | single-line-class-stmt=no 266 | 267 | # Allow the body of an if to be on the same line as the test if there is no 268 | # else. 269 | single-line-if-stmt=no 270 | 271 | 272 | [BASIC] 273 | 274 | # Naming style matching correct argument names. 275 | argument-naming-style=snake_case 276 | 277 | # Regular expression matching correct argument names. Overrides argument- 278 | # naming-style. 279 | #argument-rgx= 280 | 281 | # Naming style matching correct attribute names. 
282 | attr-naming-style=snake_case 283 | 284 | # Regular expression matching correct attribute names. Overrides attr-naming- 285 | # style. 286 | #attr-rgx= 287 | 288 | # Bad variable names which should always be refused, separated by a comma. 289 | bad-names=foo, 290 | bar, 291 | baz, 292 | toto, 293 | tutu, 294 | tata 295 | 296 | # Naming style matching correct class attribute names. 297 | class-attribute-naming-style=any 298 | 299 | # Regular expression matching correct class attribute names. Overrides class- 300 | # attribute-naming-style. 301 | #class-attribute-rgx= 302 | 303 | # Naming style matching correct class names. 304 | class-naming-style=PascalCase 305 | 306 | # Regular expression matching correct class names. Overrides class-naming- 307 | # style. 308 | #class-rgx= 309 | 310 | # Naming style matching correct constant names. 311 | const-naming-style=UPPER_CASE 312 | 313 | # Regular expression matching correct constant names. Overrides const-naming- 314 | # style. 315 | #const-rgx= 316 | 317 | # Minimum line length for functions/classes that require docstrings, shorter 318 | # ones are exempt. 319 | docstring-min-length=-1 320 | 321 | # Naming style matching correct function names. 322 | function-naming-style=snake_case 323 | 324 | # Regular expression matching correct function names. Overrides function- 325 | # naming-style. 326 | #function-rgx= 327 | 328 | # Good variable names which should always be accepted, separated by a comma. 329 | good-names=i, 330 | j, 331 | k, 332 | ex, 333 | Run, 334 | _ 335 | 336 | # Include a hint for the correct naming format with invalid-name. 337 | include-naming-hint=no 338 | 339 | # Naming style matching correct inline iteration names. 340 | inlinevar-naming-style=any 341 | 342 | # Regular expression matching correct inline iteration names. Overrides 343 | # inlinevar-naming-style. 344 | #inlinevar-rgx= 345 | 346 | # Naming style matching correct method names. 347 | method-naming-style=snake_case 348 | 349 | # Regular expression matching correct method names. Overrides method-naming- 350 | # style. 351 | #method-rgx= 352 | 353 | # Naming style matching correct module names. 354 | module-naming-style=snake_case 355 | 356 | # Regular expression matching correct module names. Overrides module-naming- 357 | # style. 358 | #module-rgx= 359 | 360 | # Colon-delimited sets of names that determine each other's naming style when 361 | # the name regexes allow several styles. 362 | name-group= 363 | 364 | # Regular expression which should only match function or class names that do 365 | # not require a docstring. 366 | no-docstring-rgx=^_ 367 | 368 | # List of decorators that produce properties, such as abc.abstractproperty. Add 369 | # to this list to register other decorators that produce valid properties. 370 | # These decorators are taken in consideration only for invalid-name. 371 | property-classes=abc.abstractproperty 372 | 373 | # Naming style matching correct variable names. 374 | variable-naming-style=snake_case 375 | 376 | # Regular expression matching correct variable names. Overrides variable- 377 | # naming-style. 378 | #variable-rgx= 379 | 380 | 381 | [TYPECHECK] 382 | 383 | # List of decorators that produce context managers, such as 384 | # contextlib.contextmanager. Add to this list to register other decorators that 385 | # produce valid context managers. 
386 | contextmanager-decorators=contextlib.contextmanager 387 | 388 | # List of members which are set dynamically and missed by pylint inference 389 | # system, and so shouldn't trigger E1101 when accessed. Python regular 390 | # expressions are accepted. 391 | generated-members= 392 | 393 | # Tells whether missing members accessed in mixin class should be ignored. A 394 | # mixin class is detected if its name ends with "mixin" (case insensitive). 395 | ignore-mixin-members=yes 396 | 397 | # Tells whether to warn about missing members when the owner of the attribute 398 | # is inferred to be None. 399 | ignore-none=yes 400 | 401 | # This flag controls whether pylint should warn about no-member and similar 402 | # checks whenever an opaque object is returned when inferring. The inference 403 | # can return multiple potential results while evaluating a Python object, but 404 | # some branches might not be evaluated, which results in partial inference. In 405 | # that case, it might be useful to still emit no-member and other checks for 406 | # the rest of the inferred objects. 407 | ignore-on-opaque-inference=yes 408 | 409 | # List of class names for which member attributes should not be checked (useful 410 | # for classes with dynamically set attributes). This supports the use of 411 | # qualified names. 412 | ignored-classes=optparse.Values,thread._local,_thread._local 413 | 414 | # List of module names for which member attributes should not be checked 415 | # (useful for modules/projects where namespaces are manipulated during runtime 416 | # and thus existing member attributes cannot be deduced by static analysis. It 417 | # supports qualified module names, as well as Unix pattern matching. 418 | ignored-modules= 419 | 420 | # Show a hint with possible names when a member name was not found. The aspect 421 | # of finding the hint is based on edit distance. 422 | missing-member-hint=yes 423 | 424 | # The minimum edit distance a name should have in order to be considered a 425 | # similar match for a missing member name. 426 | missing-member-hint-distance=1 427 | 428 | # The total number of similar names that should be taken in consideration when 429 | # showing a hint for a missing member. 430 | missing-member-max-choices=1 431 | 432 | 433 | [VARIABLES] 434 | 435 | # List of additional names supposed to be defined in builtins. Remember that 436 | # you should avoid defining new builtins when possible. 437 | additional-builtins= 438 | 439 | # Tells whether unused global variables should be treated as a violation. 440 | allow-global-unused-variables=yes 441 | 442 | # List of strings which can identify a callback function by name. A callback 443 | # name must start or end with one of those strings. 444 | callbacks=cb_, 445 | _cb 446 | 447 | # A regular expression matching the name of dummy variables (i.e. expected to 448 | # not be used). 449 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 450 | 451 | # Argument names that match this expression will be ignored. Default to name 452 | # with leading underscore. 453 | ignored-argument-names=_.*|^ignored_|^unused_ 454 | 455 | # Tells whether we should check for unused import in __init__ files. 456 | init-import=no 457 | 458 | # List of qualified module names which can have objects that can redefine 459 | # builtins. 460 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 461 | 462 | 463 | [SIMILARITIES] 464 | 465 | # Ignore comments when computing similarities. 
466 | ignore-comments=yes 467 | 468 | # Ignore docstrings when computing similarities. 469 | ignore-docstrings=yes 470 | 471 | # Ignore imports when computing similarities. 472 | ignore-imports=no 473 | 474 | # Minimum lines number of a similarity. 475 | min-similarity-lines=4 476 | 477 | 478 | [IMPORTS] 479 | 480 | # Allow wildcard imports from modules that define __all__. 481 | allow-wildcard-with-all=no 482 | 483 | # Analyse import fallback blocks. This can be used to support both Python 2 and 484 | # 3 compatible code, which means that the block might have code that exists 485 | # only in one or another interpreter, leading to false positives when analysed. 486 | analyse-fallback-blocks=no 487 | 488 | # Deprecated modules which should not be used, separated by a comma. 489 | deprecated-modules=optparse,tkinter.tix 490 | 491 | # Create a graph of external dependencies in the given file (report RP0402 must 492 | # not be disabled). 493 | ext-import-graph= 494 | 495 | # Create a graph of every (i.e. internal and external) dependencies in the 496 | # given file (report RP0402 must not be disabled). 497 | import-graph= 498 | 499 | # Create a graph of internal dependencies in the given file (report RP0402 must 500 | # not be disabled). 501 | int-import-graph= 502 | 503 | # Force import order to recognize a module as part of the standard 504 | # compatibility libraries. 505 | known-standard-library= 506 | 507 | # Force import order to recognize a module as part of a third party library. 508 | known-third-party=enchant 509 | 510 | 511 | [DESIGN] 512 | 513 | # Maximum number of arguments for function / method. 514 | max-args=5 515 | 516 | # Maximum number of attributes for a class (see R0902). 517 | max-attributes=7 518 | 519 | # Maximum number of boolean expressions in an if statement. 520 | max-bool-expr=5 521 | 522 | # Maximum number of branch for function / method body. 523 | max-branches=12 524 | 525 | # Maximum number of locals for function / method body. 526 | max-locals=15 527 | 528 | # Maximum number of parents for a class (see R0901). 529 | max-parents=7 530 | 531 | # Maximum number of public methods for a class (see R0904). 532 | max-public-methods=20 533 | 534 | # Maximum number of return / yield for function / method body. 535 | max-returns=6 536 | 537 | # Maximum number of statements in function / method body. 538 | max-statements=50 539 | 540 | # Minimum number of public methods for a class (see R0903). 541 | min-public-methods=2 542 | 543 | 544 | [CLASSES] 545 | 546 | # List of method names used to declare (i.e. assign) instance attributes. 547 | defining-attr-methods=__init__, 548 | __new__, 549 | setUp 550 | 551 | # List of member names, which should be excluded from the protected access 552 | # warning. 553 | exclude-protected=_asdict, 554 | _fields, 555 | _replace, 556 | _source, 557 | _make 558 | 559 | # List of valid names for the first argument in a class method. 560 | valid-classmethod-first-arg=cls 561 | 562 | # List of valid names for the first argument in a metaclass class method. 563 | valid-metaclass-classmethod-first-arg=cls 564 | 565 | 566 | [EXCEPTIONS] 567 | 568 | # Exceptions that will emit a warning when being caught. Defaults to 569 | # "BaseException, Exception". 
570 | overgeneral-exceptions=BaseException, 571 | Exception 572 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.7 22 | install: 23 | - requirements: requirements/dev.txt -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. 
Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at diretnan.bisoncorps@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## How to contribute to Search Engine Parser 2 | 3 | All Contributions to the code base or documentation must be done on a branch with intuitive name e.g `aol-#13-patch`, `yandex-engine-implementation` 4 | 5 | #### **Did you find a bug?** 6 | 7 | 8 | * **Ensure the bug was not already reported** by searching on GitHub under [Issues](https://github.com/bisoncorps/search-engine-parser/issues). 9 | 10 | * If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/bisoncorps/search-engine-parser/issues/new). If possible, be sure to make use of the [bug template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/ISSUE_TEMPLATE/bug_report.md) with label `bug` 11 | 12 | * Ensure the issue description clearly describes the bug.Include the relevant issue number if applicable. 13 | 14 | #### **Did you write a patch that fixes a bug?** 15 | 16 | * Ensure the bug is first reported by searching on GitHub under [Issues](https://github.com/bisoncorps/search-engine-parser/issues) using label `bug` 17 | 18 | * If issue does not exist, open an issue with the [bug report template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/ISSUE_TEMPLATE/bug_report.md) 19 | 20 | * Open a new GitHub pull request with the patch using [bug fix template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/PULL_REQUEST_TEMPLATE/bug_fix.md). 21 | 22 | * Ensure the PR description clearly describes the solution. Include the relevant issue number if applicable. 23 | 24 | 25 | #### **Do you intend to add a new feature or change an existing one?** 26 | 27 | * **Ensure the feature was not already requested** by searching on GitHub under [Issues](https://github.com/bisoncorps/search-engine-parser/issues). 
Search using the `enhancement` or `feature` labels 28 | 29 | * Suggest your feature/change in the [search-engine-parser mailing list](https://groups.google.com/forum/?fromgroups#!forum/searchengineparser) and start writing code. 30 | 31 | * Do not open an issue on GitHub until you have collected positive feedback about the change. 32 | 33 | * Raise an issue using the [feature request template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/ISSUE_TEMPLATE/feature_request.md) with labels `enhancement` 34 | 35 | * Upon implementing the feature, make a PR using the [feature implementation template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/PULL_REQUEST_TEMPLATE/feature_implementation.md) 36 | 37 | ##### **Engines** 38 | 39 | * Refer to the [SearchEngineParser Engines Documentation](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/engines.md) for help on implementing Engines 40 | 41 | * If an issue for the Engine does not already exist under [Issues], suggest the engine in the [search-engine-parser mailing list](https://groups.google.com/forum/?fromgroups#!forum/searchengineparser) 42 | 43 | * If the Engine to be included is accepted, raise an issue using the [feature template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/ISSUE_TEMPLATE/feature_request.md) and labels `enhancement` and `engine` 44 | 45 | * Upon implementing the Engine, make a PR using the [engine implementation template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/PULL_REQUEST_TEMPLATE/engine_implementation.md) 46 | 47 | 48 | #### **Do you have questions about the source code?** 49 | 50 | * Ask any question about how to use SearchEngineParser [search-engine-parser mailing list](https://groups.google.com/forum/?fromgroups#!forum/searchengineparser). 51 | 52 | #### **Do you want to contribute to the search-engine-parser documentation?** 53 | 54 | * Please read [Contributing to the SearchEngineParser Documentation](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/documentation.md). 55 | 56 | 57 | 58 | `NOTE: There are exceptions in every case and we know that too!` 59 | 60 | SearchEngineParser is a volunteer effort. We encourage you to pitch in and [join the team](https://github.com/bisoncorps/search-engine-parser/blob/master/README.md#contributors)! 61 | 62 | 63 | Thanks! 64 | 65 | Bisoncorps Team - `B`uilding `I`nteresting `S`oftware `O`pensourced for huma`NS` :heart: :heart: 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 bison_corps/search-engine-parser 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include requirements/*.txt 3 | include README.md 4 | recursive-include search_engine_parser *.py 5 | prune docs/ 6 | prune scripts/ 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Search Engine Parser 2 | 3 | "If it is a search engine, then it can be parsed" - some random guy 4 | 5 | ![Demo](https://github.com/bisoncorps/search-engine-parser/raw/master/assets/animate.gif) 6 | 7 | [![Python 3.6|3.7|3.8|3.9](https://img.shields.io/badge/python-3.5%7C3.6%7C3.7%7C3.8-blue)](https://www.python.org/downloads/) 8 | [![PyPI version](https://img.shields.io/pypi/v/search-engine-parser)](https://pypi.org/project/search-engine-parser/) 9 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/search-engine-parser)](https://pypi.org/project/search-engine-parser/) 10 | [![Deploy to Pypi](https://github.com/bisohns/search-engine-parser/actions/workflows/deploy.yml/badge.svg)](https://github.com/bisohns/search-engine-parser/actions/workflows/deploy.yml) 11 | [![Test](https://github.com/bisohns/search-engine-parser/actions/workflows/test.yml/badge.svg)](https://github.com/bisohns/search-engine-parser/actions/workflows/test.yml) 12 | [![Documentation Status](https://readthedocs.org/projects/search-engine-parser/badge/?version=latest)](https://search-engine-parser.readthedocs.io/en/latest/?badge=latest) 13 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 14 | [![All Contributors](https://img.shields.io/badge/all_contributors-10-orange.svg)](#contributors) 15 |
16 | 17 | search-engine-parser is a package that lets you query popular search engines and scrape for result titles, links, descriptions and more. It aims to scrape the widest range of search engines. 18 | View all supported engines [here.](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/supported_engines.md) 19 | 20 | - [Search Engine Parser](#search-engine-parser) 21 | - [Popular Supported Engines](#popular-supported-engines) 22 | - [Installation](#installation) 23 | - [Development](#development) 24 | - [Code Documentation](#code-documentation) 25 | - [Running the tests](#running-the-tests) 26 | - [Usage](#usage) 27 | - [Code](#code) 28 | - [Command line](#command-line) 29 | - [FAQ](docs/faq.md) 30 | - [Code of Conduct](#code-of-conduct) 31 | - [Contribution](#contribution) 32 | - [License (MIT)](#license-mit) 33 | 34 | ## Popular Supported Engines 35 | Popular search engines supported include: 36 | 37 | - Google 38 | - DuckDuckGo 39 | - GitHub 40 | - StackOverflow 41 | - Baidu 42 | - YouTube 43 | 44 | View all supported engines [here.](docs/supported_engines.md) 45 | 46 | ## Installation 47 | Install from PyPi: 48 | 49 | ```bash 50 | # install only package dependencies 51 | pip install search-engine-parser 52 | # Installs `pysearch` cli tool 53 | pip install "search-engine-parser[cli]" 54 | ``` 55 | 56 | or from master: 57 | ```bash 58 | pip install git+https://github.com/bisoncorps/search-engine-parser 59 | ``` 60 | 61 | ## Development 62 | Clone the repository: 63 | 64 | ```bash 65 | git clone git@github.com:bisoncorps/search-engine-parser.git 66 | ``` 67 | 68 | Then create a virtual environment and install the required packages: 69 | 70 | ```bash 71 | mkvirtualenv search_engine_parser 72 | pip install -r requirements/dev.txt 73 | ``` 74 | 75 | 76 | ## Code Documentation 77 | Code docs can be found on [Read the Docs](https://search-engine-parser.readthedocs.io/en/latest). 78 | 79 | ## Running the tests 80 | ```bash 81 | pytest 82 | ``` 83 | 84 | ## Usage 85 | 86 | ### Code 87 | Query results can be scraped from popular search engines, as shown in the example snippet below. 88 | 89 | ```python 90 | import pprint 91 | 92 | from search_engine_parser.core.engines.bing import Search as BingSearch 93 | from search_engine_parser.core.engines.google import Search as GoogleSearch 94 | from search_engine_parser.core.engines.yahoo import Search as YahooSearch 95 | 96 | search_args = ('preaching to the choir', 1) 97 | gsearch = GoogleSearch() 98 | ysearch = YahooSearch() 99 | bsearch = BingSearch() 100 | gresults = gsearch.search(*search_args) 101 | yresults = ysearch.search(*search_args) 102 | bresults = bsearch.search(*search_args) 103 | a = { 104 | "Google": gresults, 105 | "Yahoo": yresults, 106 | "Bing": bresults 107 | } 108 | 109 | # pretty print the result from each engine 110 | for k, v in a.items(): 111 | print(f"-------------{k}------------") 112 | for result in v: 113 | pprint.pprint(result) 114 | 115 | # print first title from google search 116 | print(gresults["titles"][0]) 117 | # print 10th link from yahoo search 118 | print(yresults["links"][9]) 119 | # print 6th description from bing search 120 | print(bresults["descriptions"][5]) 121 | 122 | # print first result containing links, descriptions and title 123 | print(gresults[0]) 124 | ``` 125 | 126 | For localization, you can pass the `url` keyword and a localized url. 
This queries and parses the localized url using the same engine's parser: 127 | ```python 128 | # Use google.de instead of google.com 129 | results = gsearch.search(*search_args, url="google.de") 130 | ``` 131 | 132 | If you need results in a specific language you can pass the 'hl' keyword and the 2-letter country abbreviation (here's a [handy list](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)): 133 | ```python 134 | # Use 'it' to receive italian results 135 | results = gsearch.search(*search_args, hl="it") 136 | ``` 137 | 138 | #### Cache 139 | The results are automatically cached for engine searches. You can either bypass the cache by adding `cache=False` to the `search` or `async_search` method or clear the engine's cache 140 | ```python 141 | from search_engine_parser.core.engines.github import Search as GitHub 142 | github = GitHub() 143 | # bypass the cache 144 | github.search("search-engine-parser", cache=False) 145 | 146 | #OR 147 | # clear cache before search 148 | github.clear_cache() 149 | github.search("search-engine-parser") 150 | ``` 151 | 152 | #### Proxy 153 | Adding a proxy entails sending details to the search function 154 | ```python 155 | from search_engine_parser.core.engines.github import Search as GitHub 156 | github = GitHub() 157 | github.search("search-engine-parser", 158 | # http proxies supported only 159 | proxy='http://123.12.1.0', 160 | proxy_auth=('username', 'password')) 161 | ``` 162 | 163 | 164 | #### Async 165 | search-engine-parser supports `async`: 166 | ```python 167 | results = await gsearch.async_search(*search_args) 168 | ``` 169 | 170 | #### Results 171 | The `SearchResults` after searching: 172 | ```python 173 | >>> results = gsearch.search("preaching to the choir", 1) 174 | >>> results 175 | 176 | # the object supports retrieving individual results by iteration of just by type (links, descriptions, titles) 177 | >>> results[0] # returns the first 178 | >>> results[0]["description"] # gets the description of the first item 179 | >>> results[0]["link"] # gets the link of the first item 180 | >>> results["descriptions"] # returns a list of all descriptions from all results 181 | ``` 182 | It can be iterated like a normal list to return individual `SearchItem`s. 183 | 184 | ### Command line 185 | 186 | search-engine-parser comes with a CLI tool known as `pysearch`. You can use it as such: 187 | 188 | ```bash 189 | pysearch --engine bing --type descriptions "Preaching to the choir" 190 | ``` 191 | 192 | Result: 193 | 194 | ```bash 195 | 'Preaching to the choir' originated in the USA in the 1970s. It is a variant of the earlier 'preaching to the converted', which dates from England in the late 1800s and has the same meaning. Origin - the full story 'Preaching to the choir' (also sometimes spelled quire) is of US origin. 
196 | ``` 197 | 198 | ![Demo](https://github.com/bisoncorps/search-engine-parser/raw/master/assets/example.gif) 199 | 200 | ```bash 201 | usage: pysearch [-h] [-V] [-e ENGINE] [--show-summary] [-u URL] [-p PAGE] 202 | [-t TYPE] [-cc] [-r RANK] [--proxy PROXY] 203 | [--proxy-user PROXY_USER] [--proxy-password PROXY_PASSWORD] 204 | query 205 | 206 | SearchEngineParser 207 | 208 | positional arguments: 209 | query Query string to search engine for 210 | 211 | optional arguments: 212 | -h, --help show this help message and exit 213 | -V, --version show program's version number and exit 214 | -e ENGINE, --engine ENGINE 215 | Engine to use for parsing the query e.g google, yahoo, 216 | bing,duckduckgo (default: google) 217 | --show-summary Shows the summary of an engine 218 | -u URL, --url URL A custom link to use as base url for search e.g 219 | google.de 220 | -p PAGE, --page PAGE Page of the result to return details for (default: 1) 221 | -t TYPE, --type TYPE Type of detail to return i.e full, links, desciptions 222 | or titles (default: full) 223 | -cc, --clear-cache Clear cache of engine before searching 224 | -r RANK, --rank RANK ID of Detail to return e.g 5 (default: 0) 225 | --proxy PROXY Proxy address to make use of 226 | --proxy-user PROXY_USER 227 | Proxy user to make use of 228 | --proxy-password PROXY_PASSWORD 229 | Proxy password to make use of 230 | ``` 231 | 232 | 233 | 234 | ## Code of Conduct 235 | Make sure to adhere to the [code of conduct](CODE_OF_CONDUCT.md) at all times. 236 | 237 | ## Contribution 238 | Before making any contributions, please read the [contribution guide](CONTRIBUTING.md). 239 | 240 | ## License (MIT) 241 | This project is licensed under the [MIT 2.0 License](LICENSE) which allows very broad use for both academic and commercial purposes. 242 | 243 | ## Contributors ✨ 244 | 245 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 |

- Ed Luff: 💻
- Diretnan Domnan: 🚇 ⚠️ 🔧 💻
- MeNsaaH: 🚇 ⚠️ 🔧 💻
- Aditya Pal: ⚠️ 💻 📖
- Avinash Reddy: 🐛
- David Onuh: 💻 ⚠️
- Panagiotis Simakis: 💻 ⚠️
- reiarthur: 💻
- Ashokkumar TA: 💻
- Andreas Teuber: 💻
- mi096684: 🐛
- devajithvs: 💻
- Geg Zakaryan: 💻 🐛
- Hakan Boğan: 🐛
- NicKoehler: 🐛 💻
- ChrisLin: 🐛 💻
- Pietro: 💻 🐛
275 | 276 | 277 | 278 | 279 | 280 | 281 | This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! 282 | -------------------------------------------------------------------------------- /assets/animate.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisohns/search-engine-parser/0c2f4bde7dd21c10e64c9204417d9a228e96c187/assets/animate.gif -------------------------------------------------------------------------------- /assets/example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisohns/search-engine-parser/0c2f4bde7dd21c10e64c9204417d9a228e96c187/assets/example.gif -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/documentation.md: -------------------------------------------------------------------------------- 1 | ## Documentation 2 | 3 | The documentation for this project is generated by [Sphinx](https://sphinx-doc.org) and is hosted at [Read the Docs](https://search-engine-parser.readthedocs.io). 4 | On the root of the project, there exists a `docs` directory housing the sphinx configuration and rst files. 
5 | 6 | ### Understanding Sphinx 7 | 8 | If you have not made use of sphinx before, take a look at this explanatory [blogpost](https://medium.com/@richdayandnight/a-simple-tutorial-on-how-to-document-your-python-project-using-sphinx-and-rinohtype-177c22a15b5b) 9 | 10 | 11 | ### Documenting an Engine 12 | 13 | Write the appropriate summary, and document the class and every function as follows 14 | 15 | ```python 16 | """@desc 17 | # This is the module documentation 18 | Parser for FakeEngine search results 19 | """ 20 | 21 | 22 | class Search(BaseSearch): 23 | """ 24 | Searches FakeEngine for string 25 | """ 26 | name = "FakeEngine" 27 | summary = "\t Here lies the summary for a fake engine" 28 | 29 | def fake_function(self, input_1, input_2): 30 | """ 31 | Describe function here 32 | :param input_1: describe input 1 33 | :type single_result: str 34 | :param input_2: describe input 2 35 | :type input_2: int 36 | :return: this is an example return 37 | :rtype: str 38 | """ 39 | ``` 40 | 41 | ### Generating the files 42 | 43 | After including the necessary documentation 44 | 45 | * Go to the root of the project and then 46 | 47 | ```bash 48 | cd docs/ 49 | ``` 50 | 51 | * Ensure your virtualenv is enabled with all requirements listed in the [requirements-dev.txt](https://github.com/bisoncorps/search-engine-parser/blob/master/requirements-dev.txt) 52 | 53 | * Run the command 54 | 55 | ```bash 56 | sphinx-apidoc -f -o source/ ../search_engine_parser 57 | ``` 58 | 59 | * Write an appropriate commit message 60 | 61 | ```t 62 | Ex. Included documentation for the Yandex Engine 63 | ``` 64 | -------------------------------------------------------------------------------- /docs/engines.md: -------------------------------------------------------------------------------- 1 | ## Engines 2 | 3 | This document is dedicated to helping developers better understand how to include Engines to the SearchEngineParser OSS. 4 | 5 | ### What Search Engines are accepted 6 | 7 | This project was started primarily for general purpose search engines like Google and Bing. 8 | It has since surpassed that and aims to include all useful sites (termed `custom engines`). 9 | These custom engines include things like Youtube, GitHub, StackOverflow, e.t.c. 
10 | Basically, any site that is popular enough to search and that returns links can qualify.
11 |
12 | ### Skills Needed
13 |
14 | - Python (obviously)
15 | - Sphinx
16 | - Regular Expressions
17 | - Beautiful Soup
18 |
19 | ### Implementing an Engine
20 |
21 | The engine modules are in the [search_engine_parser/core/engines/](https://github.com/bisoncorps/search-engine-parser/blob/master/search_engine_parser/core/engines) directory.
22 |
23 | * Create a module for the new search engine
24 |
25 | * Create a class for the engine
26 |
27 | * The class should inherit from the base engine
28 |
29 | * An example for a fake engine is shown below
30 |
31 | ```python
32 |
33 | # fake.py
34 | from search_engine_parser.core.base import BaseSearch
35 | from search_engine_parser.core.exceptions import NoResultsOrTrafficError
36 |
37 | class FakeEngine(BaseSearch):
38 |     # name of the engine to be displayed on the CLI, preferably PascalCase
39 |     name = "FakeEngine"
40 |     # engine url to be searched, with parameters to be formatted e.g. query, page
41 |     search_url = "https://search.fake.com/fake/search"
42 |     # a short 2 or 3 line summary of the engine with some statistics, preferably obtained from Wikipedia
43 |     summary = "\t According to netmarketshare, this site is balderdash among "\
44 |         "search engines with a market share that is close to 100%. "\
45 |         "The fake engine includes many popular features but was solely created to show you an example."
46 |
47 |
48 |     # this function should return the dict of params to be passed to the search_url
49 |     def get_params(self, query=None, page=None, offset=None, **kwargs):
50 |         params = {}
51 |         params["q"] = query
52 |         params["page"] = page
53 |         return params
54 |
55 |     # This function should use Beautiful Soup (combined with regex if necessary)
56 |     # to return all the divs containing results
57 |     def parse_soup(self, soup):
58 |         return soup.find_all('div', class_='fake-result-div')
59 |
60 |     # This function should parse each result soup to return title, link, and description
61 |     # NOTE: The implementation may not be as straightforward as shown below
62 |     def parse_single_result(self, single_result):
63 |         title_div = single_result.find('div', class_='fake-title')
64 |         title = title_div.text
65 |         link_tag = title_div.find('a')
66 |         link = link_tag.get('href')
67 |         desc_span = single_result.find('span', class_='fake-description')
68 |         desc = desc_span.text
69 |         rdict = {
70 |             "titles": title,
71 |             "links": link,
72 |             "descriptions": desc,
73 |         }
74 |         return rdict
75 | ```
76 |
77 | * Import the engine by adding it to the following file
78 |
79 | [search_engine_parser/__init__.py](https://github.com/bisoncorps/search-engine-parser/blob/master/search_engine_parser/__init__.py)
80 |
81 | ```python
82 | ...
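# hypothetical import for the FakeEngine walked through above; real engines follow the same pattern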
83 | from search_engine_parser.core.engines.fake import Search as FakeEngineSearch 84 | ``` 85 | 86 | 87 | * Make sure to write code documentation by following the [documentation guide](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/documentation.md#documenting-an-engine) 88 | 89 | * [Generate the RST file](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/documentation.md#generating-the-files) 90 | 91 | * Add Engine to Supported Engines in [supported engines](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/supported_engines.md) 92 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Why do I get `RuntimeError: This event loop is already running` When running in Jupyter Notebook 4 | 5 | This is a popular issue on [Jupyter Notebook](https://github.com/jupyter/notebook/issues/5663). The solution: 6 | - try `pip install --upgrade ipykernel ipython` which should upgrade the ipykernet to a recent version with issue resolved 7 | - or add this to your notebook to allow nested asyncio loops 8 | ```bash 9 | !pip install nest-asyncio 10 | ``` 11 | 12 | ```python 13 | import nest_asyncio 14 | nest_asyncio.apply() 15 | ``` 16 | 17 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | import os 16 | import sys 17 | sys.path.insert(0, os.path.abspath('../..')) 18 | from search_engine_parser import __version__ as VERSION 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'Search Engine Parser' 23 | copyright = '2019, BisonCorps' 24 | author = 'Diretnan Domnan, Mmadu Manasseh' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = VERSION 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.todo', 44 | 'sphinx.ext.viewcode', 45 | 'sphinx.ext.githubpages', 46 | 'm2r', 47 | ] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | # source_suffix = ['.rst', '.md'] 56 | source_suffix = ['.rst', '.md'] 57 | 58 | # The master toctree document. 59 | master_doc = 'index' 60 | 61 | # The language for content autogenerated by Sphinx. Refer to documentation 62 | # for a list of supported languages. 63 | # 64 | # This is also used if you do content translation via gettext catalogs. 65 | # Usually you set "language" from the command line for these cases. 66 | language = None 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | # This pattern also affects html_static_path and html_extra_path. 71 | exclude_patterns = [] 72 | 73 | # The name of the Pygments (syntax highlighting) style to use. 74 | pygments_style = None 75 | 76 | 77 | # -- Options for HTML output ------------------------------------------------- 78 | 79 | # The theme to use for HTML and HTML Help pages. See the documentation for 80 | # a list of builtin themes. 81 | # 82 | html_theme = 'sphinx_rtd_theme' 83 | 84 | # Theme options are theme-specific and customize the look and feel of a theme 85 | # further. For a list of options available for each theme, see the 86 | # documentation. 87 | # 88 | # html_theme_options = {} 89 | 90 | # Add any paths that contain custom static files (such as style sheets) here, 91 | # relative to this directory. They are copied after the builtin static files, 92 | # so a file named "default.css" will overwrite the builtin "default.css". 93 | html_static_path = ['_static'] 94 | 95 | # Custom sidebar templates, must be a dictionary that maps document names 96 | # to template names. 97 | # 98 | # The default sidebars (for documents that don't match any pattern) are 99 | # defined by theme itself. Builtin themes are using these templates by 100 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 101 | # 'searchbox.html']``. 102 | # 103 | # html_sidebars = {} 104 | 105 | 106 | # -- Options for HTMLHelp output --------------------------------------------- 107 | 108 | # Output file base name for HTML help builder. 
109 | htmlhelp_basename = 'SearchEngineParserdoc' 110 | 111 | 112 | # -- Options for LaTeX output ------------------------------------------------ 113 | 114 | latex_elements = { 115 | # The paper size ('letterpaper' or 'a4paper'). 116 | # 117 | # 'papersize': 'letterpaper', 118 | 119 | # The font size ('10pt', '11pt' or '12pt'). 120 | # 121 | # 'pointsize': '10pt', 122 | 123 | # Additional stuff for the LaTeX preamble. 124 | # 125 | # 'preamble': '', 126 | 127 | # Latex figure (float) alignment 128 | # 129 | # 'figure_align': 'htbp', 130 | } 131 | 132 | # Grouping the document tree into LaTeX files. List of tuples 133 | # (source start file, target name, title, 134 | # author, documentclass [howto, manual, or own class]). 135 | latex_documents = [ 136 | (master_doc, 'SearchEngineParser.tex', 'Search Engine Parser Documentation', 137 | 'Diretnan Domnan, Mmadu Manasseh', 'manual'), 138 | ] 139 | 140 | 141 | # -- Options for manual page output ------------------------------------------ 142 | 143 | # One entry per manual page. List of tuples 144 | # (source start file, name, description, authors, manual section). 145 | man_pages = [ 146 | (master_doc, 'searchengineparser', 'Search Engine Parser Documentation', 147 | [author], 1) 148 | ] 149 | 150 | 151 | # -- Options for Texinfo output ---------------------------------------------- 152 | 153 | # Grouping the document tree into Texinfo files. List of tuples 154 | # (source start file, target name, title, author, 155 | # dir menu entry, description, category) 156 | texinfo_documents = [ 157 | (master_doc, 'SearchEngineParser', 'Search Engine Parser Documentation', 158 | author, 'SearchEngineParser', 'One line description of project.', 159 | 'Miscellaneous'), 160 | ] 161 | 162 | 163 | # -- Options for Epub output ------------------------------------------------- 164 | 165 | # Bibliographic Dublin Core info. 166 | epub_title = project 167 | 168 | # The unique identifier of the text. This can be a ISBN number 169 | # or the project homepage. 170 | # 171 | # epub_identifier = '' 172 | 173 | # A unique identification for the text. 174 | # 175 | # epub_uid = '' 176 | 177 | # A list of files that should not be packed into the epub file. 178 | epub_exclude_files = ['search.html'] 179 | 180 | 181 | # -- Extension configuration ------------------------------------------------- 182 | 183 | # -- Options for todo extension ---------------------------------------------- 184 | 185 | # If true, `todo` and `todoList` produce output, else they produce nothing. 186 | todo_include_todos = True 187 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Search Engine Parser documentation master file, created by 2 | sphinx-quickstart on Fri Feb 1 23:05:55 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Search Engine Parser's documentation! 7 | ================================================ 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | .. 
mdinclude:: ../../README.md 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | search_engine_parser 2 | ==================== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | search_engine_parser 8 | -------------------------------------------------------------------------------- /docs/source/search_engine_parser.core.engines.rst: -------------------------------------------------------------------------------- 1 | search\_engine\_parser.core.engines package 2 | =========================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | search\_engine\_parser.core.engines.aol module 8 | ---------------------------------------------- 9 | 10 | .. automodule:: search_engine_parser.core.engines.aol 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | search\_engine\_parser.core.engines.ask module 16 | ---------------------------------------------- 17 | 18 | .. automodule:: search_engine_parser.core.engines.ask 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | search\_engine\_parser.core.engines.baidu module 24 | ------------------------------------------------ 25 | 26 | .. automodule:: search_engine_parser.core.engines.baidu 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | search\_engine\_parser.core.engines.bing module 32 | ----------------------------------------------- 33 | 34 | .. automodule:: search_engine_parser.core.engines.bing 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | search\_engine\_parser.core.engines.coursera module 40 | --------------------------------------------------- 41 | 42 | .. automodule:: search_engine_parser.core.engines.coursera 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | search\_engine\_parser.core.engines.duckduckgo module 48 | ----------------------------------------------------- 49 | 50 | .. automodule:: search_engine_parser.core.engines.duckduckgo 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | search\_engine\_parser.core.engines.github module 56 | ------------------------------------------------- 57 | 58 | .. automodule:: search_engine_parser.core.engines.github 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | search\_engine\_parser.core.engines.google module 64 | ------------------------------------------------- 65 | 66 | .. automodule:: search_engine_parser.core.engines.google 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | search\_engine\_parser.core.engines.googlescholar module 72 | -------------------------------------------------------- 73 | 74 | .. automodule:: search_engine_parser.core.engines.googlescholar 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | search\_engine\_parser.core.engines.myanimelist module 80 | ------------------------------------------------------ 81 | 82 | .. automodule:: search_engine_parser.core.engines.myanimelist 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | search\_engine\_parser.core.engines.stackoverflow module 88 | -------------------------------------------------------- 89 | 90 | .. 
automodule:: search_engine_parser.core.engines.stackoverflow 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | search\_engine\_parser.core.engines.yahoo module 96 | ------------------------------------------------ 97 | 98 | .. automodule:: search_engine_parser.core.engines.yahoo 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | 103 | search\_engine\_parser.core.engines.yandex module 104 | ------------------------------------------------- 105 | 106 | .. automodule:: search_engine_parser.core.engines.yandex 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | 111 | search\_engine\_parser.core.engines.youtube module 112 | -------------------------------------------------- 113 | 114 | .. automodule:: search_engine_parser.core.engines.youtube 115 | :members: 116 | :undoc-members: 117 | :show-inheritance: 118 | 119 | 120 | Module contents 121 | --------------- 122 | 123 | .. automodule:: search_engine_parser.core.engines 124 | :members: 125 | :undoc-members: 126 | :show-inheritance: 127 | -------------------------------------------------------------------------------- /docs/source/search_engine_parser.core.rst: -------------------------------------------------------------------------------- 1 | search\_engine\_parser.core package 2 | =================================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | search_engine_parser.core.engines 10 | 11 | Submodules 12 | ---------- 13 | 14 | search\_engine\_parser.core.base module 15 | --------------------------------------- 16 | 17 | .. automodule:: search_engine_parser.core.base 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | search\_engine\_parser.core.cli module 23 | -------------------------------------- 24 | 25 | .. automodule:: search_engine_parser.core.cli 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | search\_engine\_parser.core.exceptions module 31 | --------------------------------------------- 32 | 33 | .. automodule:: search_engine_parser.core.exceptions 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: search_engine_parser.core 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/source/search_engine_parser.rst: -------------------------------------------------------------------------------- 1 | search\_engine\_parser package 2 | ============================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | search_engine_parser.core 10 | search_engine_parser.tests 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: search_engine_parser 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /docs/source/search_engine_parser.tests.rst: -------------------------------------------------------------------------------- 1 | search\_engine\_parser.tests package 2 | ==================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | search\_engine\_parser.tests.base module 8 | ---------------------------------------- 9 | 10 | .. automodule:: search_engine_parser.tests.base 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | search\_engine\_parser.tests.test\_search module 16 | ------------------------------------------------ 17 | 18 | .. 
automodule:: search_engine_parser.tests.test_search 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: search_engine_parser.tests 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/supported_engines.md: -------------------------------------------------------------------------------- 1 | ## Supported Engines 2 | 3 | Below is a list of supported engines and what they return. 4 | 5 | 6 | |No|Engine|Returns| 7 | |------|------|-----| 8 | 1|Google|titles, links, descriptions 9 | |2|Yahoo|titles, links, descriptions 10 | 3|Bing|titles, links, descriptions 11 | |4|DuckDuckGo|titles, links, descriptions 12 | 5|Baidu|titles, links, descriptions 13 | |6|Yandex|titles, links, descriptions 14 | 7|Aol|titles, links, descriptions 15 | 8|StackOverflow|titles, links, descriptions 16 | 9|GitHub|titles, links, descriptions, stars, languages 17 | 10|Ask|titles, links, descriptions 18 | 11|YouTube|titles, links, descriptions, channels, [single videos only: durations, views, upload_dates] 19 | 12|MyAnimeList|titles, links, descriptions, number of episodes, type of result (OVA, series, movie, etc.), ratings 20 | 13|GoogleScholar|titles, links, descriptions, type of results ([BOOK], [CITATION], etc.), links of files 21 | 14|GoogleNews|titles, links, descriptions, image links, date, news source 22 | 15|Coursera|titles,links,ratings count, ratings average, partners, difficulties, enrolments numbers 23 | -------------------------------------------------------------------------------- /requirements/cli.txt: -------------------------------------------------------------------------------- 1 | blessed >=1.15.0, < 2 2 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r main.txt 2 | blessed==1.17.9 3 | m2r==0.2.1 4 | parameterized==0.7.4 5 | pylint==2.5.3 6 | pytest==5.4.3 7 | sphinx==3.1.2 8 | sphinx-rtd-theme==0.5.0 9 | vcrpy==4.0.2 10 | -------------------------------------------------------------------------------- /requirements/main.txt: -------------------------------------------------------------------------------- 1 | lxml >=4.6.5, <5 2 | aiohttp >=3.6.2,<4 3 | beautifulsoup4 >=4.9.1,<5 4 | fake-useragent >=0.1.11, <0.2 5 | -------------------------------------------------------------------------------- /scripts/docs.sh: -------------------------------------------------------------------------------- 1 | cd ./docs 2 | sphinx-apidoc -f -o source/ ../search_engine_parser 3 | if [ $? -ne 0 ]; then 4 | echo "Failed to run sphinx-apidoc" 5 | exit 1 6 | fi 7 | make html 8 | if [ $? -ne 0 ]; then 9 | echo "Failed to make html" 10 | exit 1 11 | fi 12 | cd .. 13 | git commit -am "make html" 14 | git config --global push.default simple 15 | git config --global user.email "travis@travis-ci.com" 16 | git config --global user.name "Travis CI" 17 | 18 | 19 | #remove existing files except html 20 | shopt -s extglob 21 | rm -r ./!(docs)/ 22 | 23 | #copy contents of html to root 24 | cp -R ${TRAVIS_BUILD_DIR}/docs/build/html/. ${TRAVIS_BUILD_DIR}/ 25 | 26 | #remove html and accompanying docs 27 | rm -r ./docs 28 | echo "Viewing current files in directory" 29 | ls -lah 30 | # Checkout to gh-pages 31 | git checkout gh-pages 32 | if [ $? 
-eq 1 ]; then 33 | echo "Checked out to existing gh-pages branch" 34 | else 35 | git checkout -b gh-pages 36 | echo "Creating gh-pages branch" 37 | fi 38 | git add . 39 | git commit -am "rebuilt docs" 40 | git remote add origin-pages https://${GITHUB_TOKEN}@github.com/bisoncorps/search_engine_parser.git 41 | git push -u origin-pages gh-pages --force 42 | 43 | # echo if docs was succesfully pushed 44 | if [ $? -eq 0 ]; then 45 | echo "Docs successfully pushed to Github Pages" 46 | else 47 | echo "Failed to push docs" 48 | exit 1 49 | fi 50 | -------------------------------------------------------------------------------- /scripts/post_deploy_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # get current version 4 | VERSION="$(python setup.py --version)" 5 | echo "${VERSION}" 6 | 7 | # install python package 8 | pip uninstall search-engine-parser -y 9 | pip install search-engine-parser=="${VERSION}" 10 | python -c "import search_engine_parser" 11 | 12 | pip uninstall search-engine-parser -y 13 | 14 | pip install 'search-engine-parser[cli]=="${VERSION}"' 15 | 16 | # run the cli version to get a result 17 | python -m search_engine_parser.core.cli --engine bing search --query "Preaching to the choir" --type descriptions 18 | 19 | # run cli with pysearch 20 | pysearch -e youtube search -q "NoCopyrightSounds" 21 | 22 | if [ $? -eq 0 ]; then 23 | echo "Package works as expected" 24 | else 25 | echo "CLI handler of the package failed to execute" 26 | exit 1 27 | fi 28 | -------------------------------------------------------------------------------- /scripts/pre_deploy_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # change directory 4 | cd search_engine_parser/ 5 | 6 | python tests/__init__.py 7 | -------------------------------------------------------------------------------- /search_engine_parser/.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | ### Python template 142 | # Byte-compiled / optimized / DLL files 143 | __pycache__/ 144 | *.py[cod] 145 | *$py.class 146 | 147 | # C extensions 148 | *.so 149 | 150 | # Distribution / packaging 151 | .Python 152 | build/ 153 | develop-eggs/ 154 | dist/ 155 | downloads/ 156 | eggs/ 157 | .eggs/ 158 | lib/ 159 | lib64/ 160 | parts/ 161 | sdist/ 162 | var/ 163 | wheels/ 164 | share/python-wheels/ 165 | *.egg-info/ 166 | .installed.cfg 167 | *.egg 168 | MANIFEST 169 | 170 | # PyInstaller 171 | # Usually these files are written by a python script from a template 172 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
173 | *.manifest 174 | *.spec 175 | 176 | # Installer logs 177 | pip-log.txt 178 | pip-delete-this-directory.txt 179 | 180 | # Unit test / coverage reports 181 | htmlcov/ 182 | .tox/ 183 | .nox/ 184 | .coverage 185 | .coverage.* 186 | .cache 187 | nosetests.xml 188 | coverage.xml 189 | *.cover 190 | *.py,cover 191 | .hypothesis/ 192 | .pytest_cache/ 193 | cover/ 194 | 195 | # Translations 196 | *.mo 197 | *.pot 198 | 199 | # Django stuff: 200 | *.log 201 | local_settings.py 202 | db.sqlite3 203 | db.sqlite3-journal 204 | 205 | # Flask stuff: 206 | instance/ 207 | .webassets-cache 208 | 209 | # Scrapy stuff: 210 | .scrapy 211 | 212 | # Sphinx documentation 213 | docs/_build/ 214 | 215 | # PyBuilder 216 | .pybuilder/ 217 | target/ 218 | 219 | # Jupyter Notebook 220 | .ipynb_checkpoints 221 | 222 | # IPython 223 | profile_default/ 224 | ipython_config.py 225 | 226 | # pyenv 227 | # For a library or package, you might want to ignore these files since the code is 228 | # intended to run in multiple environments; otherwise, check them in: 229 | # .python-version 230 | 231 | # pipenv 232 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 233 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 234 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 235 | # install all needed dependencies. 236 | #Pipfile.lock 237 | 238 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 239 | __pypackages__/ 240 | 241 | # Celery stuff 242 | celerybeat-schedule 243 | celerybeat.pid 244 | 245 | # SageMath parsed files 246 | *.sage.py 247 | 248 | # Environments 249 | .env 250 | .venv 251 | env/ 252 | venv/ 253 | ENV/ 254 | env.bak/ 255 | venv.bak/ 256 | 257 | # Spyder project settings 258 | .spyderproject 259 | .spyproject 260 | 261 | # Rope project settings 262 | .ropeproject 263 | 264 | # mkdocs documentation 265 | /site 266 | 267 | # mypy 268 | .mypy_cache/ 269 | .dmypy.json 270 | dmypy.json 271 | 272 | # Pyre type checker 273 | .pyre/ 274 | 275 | # pytype static type analyzer 276 | .pytype/ 277 | 278 | # Cython debug symbols 279 | cython_debug/ 280 | 281 | #idea 282 | .idea/* 283 | -------------------------------------------------------------------------------- /search_engine_parser/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author 3 | Domnan Diretnan 4 | Artificial Intelligence Enthusiast & Software Engineer. 5 | Email: diretnandomnan@gmail.com 6 | Github: https://github.com/deven96 7 | GitLab: https://gitlab.com/Deven96 8 | 9 | Mmadu Manasseh 10 | Email: mmadumanasseh@gmail.com 11 | Github: https://github.com/mensaah 12 | GitLab: https://gitlab.com/mensaah 13 | 14 | @project 15 | @create date 2019-02-01 22:15:44 16 | @modify date 2019-02-01 22:15:44 17 | 18 | @license 19 | MIT License 20 | Copyright (c) 2018. Domnan Diretnan. 
All rights reserved 21 | 22 | """ 23 | 24 | # Allow import using `search_engine_parser.engines` 25 | from search_engine_parser.core import engines 26 | # Support for older versions of imports 27 | # DEPRECATION_WARNING: These imports will be removed in later versions 28 | from search_engine_parser.core.engines.aol import Search as AolSearch 29 | from search_engine_parser.core.engines.ask import Search as AskSearch 30 | from search_engine_parser.core.engines.baidu import Search as BaiduSearch 31 | from search_engine_parser.core.engines.bing import Search as BingSearch 32 | from search_engine_parser.core.engines.duckduckgo import \ 33 | Search as DuckDuckGoSearch 34 | from search_engine_parser.core.engines.github import Search as GithubSearch 35 | from search_engine_parser.core.engines.google import Search as GoogleSearch 36 | from search_engine_parser.core.engines.googlescholar import \ 37 | Search as GoogleScholarSearch 38 | from search_engine_parser.core.engines.stackoverflow import \ 39 | Search as StackOverflowSearch 40 | from search_engine_parser.core.engines.yahoo import Search as YahooSearch 41 | 42 | name = "search-engine-parser" # pylint: disable=invalid-name 43 | __version__ = "0.6.3" 44 | -------------------------------------------------------------------------------- /search_engine_parser/core/__init__.py: -------------------------------------------------------------------------------- 1 | import search_engine_parser.core.engines 2 | -------------------------------------------------------------------------------- /search_engine_parser/core/base.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Base class inherited by every search engine 3 | """ 4 | 5 | import asyncio 6 | import random 7 | from abc import ABCMeta, abstractmethod 8 | from contextlib import suppress 9 | from enum import Enum, unique 10 | from urllib.parse import urlencode, urlparse 11 | 12 | import aiohttp 13 | from bs4 import BeautifulSoup 14 | 15 | from search_engine_parser.core import utils 16 | from search_engine_parser.core.exceptions import NoResultsOrTrafficError 17 | 18 | 19 | @unique 20 | class ReturnType(Enum): 21 | FULL = "full" 22 | TITLE = "titles" 23 | DESCRIPTION = "descriptions" 24 | LINK = "links" 25 | 26 | 27 | # All results returned are each items of search 28 | class SearchItem(dict): 29 | """ 30 | SearchItem is a dict of results containing keys (titles, descriptions, links and other 31 | additional keys dependending on the engine) 32 | >>> result 33 | 34 | >>> result["description"] 35 | Some description 36 | >>> result["descriptions"] 37 | Same description 38 | """ 39 | def __getitem__(self, value): 40 | """ Allow getting by index and by type ('descriptions', 'links'...)""" 41 | try: 42 | return super().__getitem__(value) 43 | except KeyError: 44 | pass 45 | if not value.endswith('s'): 46 | value += 's' 47 | return super().__getitem__(value) 48 | 49 | 50 | class SearchResult(): 51 | """ 52 | The SearchResults after the searching 53 | 54 | >>> results = gsearch.search("preaching the choir", 1) 55 | >>> results 56 | 57 | 58 | The object supports retreiving individual results by iteration of just by type 59 | >>> results[0] # Returns the first result 60 | >>> results["descriptions"] # Returns a list of all descriptions from all results 61 | 62 | It can be iterated like a normal list to return individual SearchItem 63 | """ 64 | 65 | def __init__(self): 66 | self.results = [] 67 | 68 | def append(self, value): 69 | self.results.append(value) 70 | 
71 | def __getitem__(self, value): 72 | """ Allow getting by index and by type ('descriptions', 'links'...)""" 73 | if isinstance(value, int): 74 | return self.results[value] 75 | l = [] 76 | for x in self.results: 77 | with suppress(KeyError): 78 | l.append(x[value]) 79 | return l 80 | 81 | def keys(self): 82 | keys = {} 83 | with suppress(IndexError): 84 | x = self.results[0] 85 | keys = x.keys() 86 | return keys 87 | 88 | def __len__(self): 89 | return len(self.results) 90 | 91 | def __repr_(self): 92 | return "".format(len(self.results)) 93 | 94 | 95 | class BaseSearch: 96 | 97 | __metaclass__ = ABCMeta 98 | 99 | """ 100 | Search base to be extended by search parsers 101 | Every subclass must have two methods `search` amd `parse_single_result` 102 | """ 103 | # Summary of engine 104 | summary = None 105 | # Search Engine Name 106 | name = None 107 | # Search Engine unformatted URL 108 | search_url = None 109 | # The url after all query params have been set 110 | _parsed_url = None 111 | # boolean that indicates cache hit or miss 112 | _cache_hit = False 113 | 114 | @abstractmethod 115 | def parse_soup(self, soup): 116 | """ 117 | Defines the results contained in a soup 118 | """ 119 | raise NotImplementedError("subclasses must define method ") 120 | 121 | @abstractmethod 122 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 123 | """ 124 | Every div/span containing a result is passed here to retrieve 125 | `title`, `link` and `descr` 126 | """ 127 | raise NotImplementedError( 128 | "subclasses must define method ") 129 | 130 | def get_cache_handler(self): 131 | """ Return Cache Handler to use""" 132 | 133 | return utils.CacheHandler() 134 | 135 | @property 136 | def cache_handler(self): 137 | return self.get_cache_handler() 138 | 139 | def parse_result(self, results, **kwargs): 140 | """ 141 | Runs every entry on the page through parse_single_result 142 | 143 | :param results: Result of main search to extract individual results 144 | :type results: list[`bs4.element.ResultSet`] 145 | :returns: dictionary. Containing lists of titles, links, descriptions and other possible\ 146 | returns. 147 | :rtype: dict 148 | """ 149 | search_results = SearchResult() 150 | for each in results: 151 | rdict = self.parse_single_result(each, **kwargs) 152 | if rdict is not None: 153 | search_results.append(rdict) 154 | return search_results 155 | 156 | def get_params(self, query=None, page=None, offset=None, **kwargs): 157 | """ This function should be overwritten to return a dictionary of query params""" 158 | return {'q': query, 'page': page} 159 | 160 | def headers(self): 161 | headers = { 162 | "Cache-Control": 'no-cache', 163 | "Connection": "keep-alive", 164 | "User-Agent": utils.get_rand_user_agent() 165 | } 166 | return headers 167 | 168 | def clear_cache(self, all_cache=False): 169 | """ 170 | Triggers the clear cache function for a particular engine 171 | 172 | :param all_cache: if True, deletes for all engines 173 | """ 174 | if all_cache: 175 | return self.cache_handler.clear() 176 | return self.cache_handler.clear(self.name) 177 | 178 | async def get_source(self, url, cache=True, proxy=None, proxy_auth=None): 179 | """ 180 | Returns the source code of a webpage. 
181 | Also sets the _cache_hit if cache was used 182 | 183 | :rtype: string 184 | :param url: URL to pull it's source code 185 | :param proxy: proxy address to make use off 186 | :type proxy: str 187 | :param proxy_auth: (user, password) tuple to authenticate proxy 188 | :type proxy_auth: (str, str) 189 | :return: html source code of a given URL. 190 | """ 191 | try: 192 | html, cache_hit = await self.cache_handler.get_source(self.name, url, self.headers(), cache, proxy, proxy_auth) 193 | except Exception as exc: 194 | raise Exception('ERROR: {}\n'.format(exc)) 195 | self._cache_hit = cache_hit 196 | return html 197 | 198 | async def get_soup(self, url, cache, proxy, proxy_auth): 199 | """ 200 | Get the html soup of a query 201 | :param url: url to obrain soup from 202 | :type url: str 203 | :param cache: cache request or not 204 | :type cache: bool 205 | :param proxy: proxy address to make use off 206 | :type proxy: str 207 | :param proxy_auth: (user, password) tuple to authenticate proxy 208 | :type proxy_auth: (str, str) 209 | 210 | :rtype: `bs4.element.ResultSet` 211 | """ 212 | html = await self.get_source(url, cache, proxy, proxy_auth) 213 | return BeautifulSoup(html, 'lxml') 214 | 215 | def get_search_url(self, query=None, page=None, **kwargs): 216 | """ 217 | Return a formatted search url 218 | """ 219 | # Some URLs use offsets 220 | offset = (page * 10) - 9 221 | params = self.get_params( 222 | query=query, page=page, offset=offset, **kwargs) 223 | url = urlparse(self.search_url) 224 | # For localization purposes, custom urls can be parsed for the same engine 225 | # such as google.de and google.com 226 | if kwargs.get("url"): 227 | new_url = urlparse(kwargs.pop("url")) 228 | # When passing url without scheme e.g google.de, url is parsed as path 229 | if not new_url.netloc: 230 | url = url._replace(netloc=new_url.path) 231 | else: 232 | url = url._replace(netloc=new_url.netloc) 233 | self.base_url = url.geturl() 234 | self._parsed_url = url._replace(query=urlencode(params)) 235 | 236 | return self._parsed_url.geturl() 237 | 238 | def get_results(self, soup, **kwargs): 239 | """ Get results from soup""" 240 | 241 | search_results = None 242 | results = self.parse_soup(soup) 243 | # TODO Check if empty results is caused by traffic or answers to query 244 | # were not found 245 | if not results: 246 | print("ENGINE FAILURE: {}\n".format(self.name)) 247 | raise NoResultsOrTrafficError( 248 | "The result parsing was unsuccessful. It is either your query could not be found" 249 | " or it was flagged as unusual traffic") 250 | 251 | try: 252 | search_results = self.parse_result(results, **kwargs) 253 | # AttributeError occurs as it cannot pass the returned soup 254 | except AttributeError as e: 255 | raise NoResultsOrTrafficError( 256 | "The returned results could not be parsed. This might be due to site updates or " 257 | "server errors. Drop an issue at https://github.com/bisoncorps/search-engine-parser" 258 | " if this persists" 259 | ) 260 | 261 | return search_results 262 | 263 | def search(self, query=None, page=1, cache=True, proxy=None, proxy_auth=None, **kwargs): 264 | """ 265 | Query the search engine 266 | 267 | :param query: the query to search for 268 | :type query: str 269 | :param page: Page to be displayed, defaults to 1 270 | :type page: int 271 | :param proxy: proxy address to make use off 272 | :type proxy: str 273 | :param proxy_auth: (user, password) tuple to authenticate proxy 274 | :type proxy_auth: (str, str) 275 | :return: dictionary. 
Containing titles, links, netlocs and descriptions. 276 | """ 277 | # Pages can only be from 1-N 278 | if page <= 0: 279 | page = 1 280 | # Get search Page Results 281 | loop = asyncio.get_event_loop() 282 | url = self.get_search_url( 283 | query, page, **kwargs) 284 | soup = loop.run_until_complete( 285 | self.get_soup(url, cache=cache, 286 | proxy=proxy, 287 | proxy_auth=proxy_auth)) 288 | return self.get_results(soup, **kwargs) 289 | 290 | async def async_search(self, query=None, page=1, cache=True, proxy=None, proxy_auth=None, **kwargs): 291 | """ 292 | Query the search engine but in async mode 293 | 294 | :param query: the query to search for 295 | :type query: str 296 | :param page: Page to be displayed, defaults to 1 297 | :type page: int 298 | :param proxy: proxy address to make use off 299 | :type proxy: str 300 | :param proxy_auth: (user, password) tuple to authenticate proxy 301 | :type proxy_auth: (str, str) 302 | :return: dictionary. Containing titles, links, netlocs and descriptions. 303 | """ 304 | # Pages can only be from 1-N 305 | if page == 0: 306 | page = 1 307 | soup = await self.get_soup(self.get_search_url(query, page, **kwargs), cache=cache, proxy=proxy, proxy_auth=proxy_auth) 308 | return self.get_results(soup, **kwargs) 309 | -------------------------------------------------------------------------------- /search_engine_parser/core/cli.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Making use of the parser through cli 3 | """ 4 | from __future__ import print_function 5 | 6 | import argparse 7 | import sys 8 | from datetime import datetime 9 | from importlib import import_module 10 | 11 | from blessed import Terminal 12 | from search_engine_parser import __version__ 13 | from search_engine_parser.core.base import ReturnType 14 | from search_engine_parser.core.exceptions import NoResultsOrTrafficError 15 | 16 | 17 | def display(results, term, args): 18 | """ Displays search results 19 | """ 20 | def print_one(kwargs): 21 | """ Print one result to the console """ 22 | # Header 23 | if kwargs.get("titles"): 24 | print("\t{}".format(term.magenta(kwargs.pop("titles")))) 25 | if kwargs.get("links"): 26 | print("\t{}".format(kwargs.pop("links"))) 27 | print("\t-----------------------------------------------------") 28 | if kwargs.get("descriptions"): 29 | print(kwargs.pop("descriptions")) 30 | if kwargs.values(): 31 | for k, v in kwargs.items(): 32 | if v: 33 | print(k.strip(), " : ", v) 34 | print("\n") 35 | 36 | if args.rank and args.rank > 10: 37 | sys.exit( 38 | "Results are only limited to 10, specify a different page number instead") 39 | 40 | if not args.rank: 41 | for i in results: 42 | print_one(i) 43 | else: 44 | rank = args.rank 45 | print_one(results[rank]) 46 | 47 | 48 | def get_engine_class(engine): 49 | """ Return the Engine Class """ 50 | try: 51 | module = import_module( 52 | "search_engine_parser.core.engines.{}".format( 53 | engine.lower())) 54 | return getattr(module, "Search") 55 | except (ImportError, ModuleNotFoundError): 56 | sys.exit('Engine < {} > does not exist'.format(engine)) 57 | 58 | 59 | def show_summary(term, engine_class): 60 | """ Show the summary of an Engine""" 61 | print("\t{}".format(term.magenta(engine_class.name))) 62 | print("\t-----------------------------------------------------") 63 | print(engine_class.summary) 64 | 65 | 66 | def main(args): # pylint: disable=too-many-branches 67 | """ 68 | Executes logic from parsed arguments 69 | """ 70 | term = Terminal() 71 | 
engine_class = get_engine_class(args.engine) 72 | 73 | if args.show_summary: 74 | show_summary(term, engine_class) 75 | return 76 | 77 | if not args.query: 78 | print("--show-summary or --query argument must be passed") 79 | sys.exit(1) 80 | 81 | # Initialize search Engine with required params 82 | engine = engine_class() 83 | try: 84 | if args.clear_cache: 85 | engine.clear_cache() 86 | # Display full details: Header, Link, Description 87 | start = datetime.now() 88 | results = engine.search( 89 | args.query, args.page, return_type=ReturnType(args.type), url=args.url, proxy=args.proxy, proxy_auth=(args.proxy_user, args.proxy_password)) 90 | duration = datetime.now() - start 91 | display(results, term, args) 92 | print("Total search took -> %s seconds" % (duration)) 93 | except NoResultsOrTrafficError as exc: 94 | print('\n', '{}'.format(term.red(str(exc)))) 95 | 96 | 97 | def create_parser(): 98 | """ 99 | runner that handles parsing logic 100 | """ 101 | parser = argparse.ArgumentParser(description='SearchEngineParser', prog="pysearch") 102 | 103 | parser.add_argument('-V', '--version', action="version", version="%(prog)s v" + __version__) 104 | 105 | parser.add_argument( 106 | '-e', '--engine', 107 | help='Engine to use for parsing the query e.g google, yahoo, bing,' 108 | 'duckduckgo (default: google)', 109 | default='google') 110 | 111 | parser.add_argument( 112 | '--show-summary', 113 | action='store_true', 114 | help='Shows the summary of an engine') 115 | 116 | parser.add_argument( 117 | '-u', 118 | '--url', 119 | help='A custom link to use as base url for search e.g google.de') 120 | 121 | parser.add_argument( 122 | '-p', 123 | '--page', 124 | type=int, 125 | help='Page of the result to return details for (default: 1)', 126 | default=1) 127 | 128 | parser.add_argument( 129 | '-t', '--type', 130 | help='Type of detail to return i.e full, links, desciptions or titles (default: full)', 131 | default="full") 132 | 133 | parser.add_argument( 134 | '-cc', '--clear-cache', 135 | action='store_true', 136 | help='Clear cache of engine before searching') 137 | 138 | parser.add_argument( 139 | '-r', 140 | '--rank', 141 | type=int, 142 | help='ID of Detail to return e.g 5 (default: 0)') 143 | 144 | parser.add_argument( 145 | '--proxy', 146 | required=False, 147 | help='Proxy address to make use of') 148 | 149 | parser.add_argument( 150 | '--proxy-user', 151 | required='--proxy' in sys.argv, 152 | help='Proxy user to make use of') 153 | 154 | parser.add_argument( 155 | '--proxy-password', 156 | required='--proxy' in sys.argv, 157 | help='Proxy password to make use of') 158 | 159 | parser.add_argument( 160 | 'query', type=str, nargs='?', 161 | help='Query string to search engine for') 162 | 163 | return parser 164 | 165 | 166 | def runner(): 167 | parser = create_parser() 168 | args = parser.parse_args(sys.argv[1:]) 169 | main(args) 170 | 171 | 172 | if __name__ == '__main__': 173 | runner() 174 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisohns/search-engine-parser/0c2f4bde7dd21c10e64c9204417d9a228e96c187/search_engine_parser/core/engines/__init__.py -------------------------------------------------------------------------------- /search_engine_parser/core/engines/aol.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for AOL search 
results 3 | """ 4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 5 | 6 | 7 | class Search(BaseSearch): 8 | """ 9 | Searches Aol for string 10 | """ 11 | name = "AOL" 12 | search_url = "https://search.aol.com/aol/search?" 13 | summary = "\t According to netmarketshare, the old time famous AOL is still in the top 10 "\ 14 | "search engines with a market share that is close to 0.06%. "\ 15 | "The AOL network includes many popular web sites like engadget.com, techchrunch.com and "\ 16 | "the huffingtonpost.com. \nOn June 23, 2015, AOL was acquired by Verizon Communications." 17 | 18 | def parse_soup(self, soup): 19 | """ 20 | Parses AOL for a search query 21 | """ 22 | # find all divs 23 | return soup.find_all('div', class_='algo-sr') 24 | 25 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 26 | """ 27 | Parses the source code to return 28 | 29 | :param single_result: single result found in
30 | :type single_result: `bs4.element.ResultSet` 31 | :return: parsed title, link and description of single result 32 | :rtype: dict 33 | """ 34 | rdict = SearchItem() 35 | h3_tag = single_result.find('h3') 36 | link_tag = h3_tag.find('a') 37 | if return_type in (ReturnType.FULL, return_type.TITLE): 38 | # Get the text and link 39 | rdict["titles"] = link_tag.text 40 | 41 | if return_type in (ReturnType.FULL, ReturnType.LINK): 42 | rdict["links"] = link_tag.get("href") 43 | 44 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 45 | caption = single_result.find('div', class_='compText aAbs') 46 | desc = caption.find('p', class_='lh-16') 47 | rdict["descriptions"] = desc.text 48 | 49 | return rdict 50 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/ask.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for ask search results 3 | """ 4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 5 | 6 | 7 | class Search(BaseSearch): 8 | """ 9 | Searches Ask for string 10 | """ 11 | name = "Ask" 12 | 13 | search_url = "https://www.ask.com/web?" 14 | 15 | summary = "\t Formerly known as Ask Jeeves, Ask.com receives approximately 0.42% of the search"\ 16 | " share. ASK is based on a question/answer format where most questions are answered by "\ 17 | "other users or are in the form of polls.\nIt also has the general search functionality "\ 18 | "but the results returned lack quality compared to Google or even Bing and Yahoo." 19 | 20 | def get_params(self, query=None, page=None, offset=None, **kwargs): 21 | params = {} 22 | params["o"] = 0 23 | params["l"] = "dir" 24 | params["qo"] = "pagination" 25 | params["q"] = query 26 | params["qsrc"] = 998 27 | params["page"] = page 28 | return params 29 | 30 | def parse_soup(self, soup): 31 | """ 32 | Parses Ask Search Soup for results 33 | """ 34 | # find all class_='PartialSearchResults-item' => each result 35 | return soup.find_all('div', class_="PartialSearchResults-item") 36 | 37 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 38 | """ 39 | Parses the source code to return 40 | 41 | :param single_result: single result found in
42 | :type single_result: `bs4.element.ResultSet` 43 | :return: parsed title, link and description of single result 44 | :rtype: str, str, str 45 | """ 46 | 47 | rdict = SearchItem() 48 | if return_type in (ReturnType.FULL, return_type.TITLE): 49 | rdict["titles"] = single_result.find('a').text 50 | 51 | if return_type in (ReturnType.FULL, return_type.TITLE): 52 | rdict["links"] = single_result.a["href"] 53 | 54 | if return_type in (ReturnType.FULL, return_type.TITLE): 55 | rdict["descriptions"] = single_result.find( 56 | 'p', class_="PartialSearchResults-item-abstract").text 57 | 58 | return rdict 59 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/baidu.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for Baidu search results 3 | """ 4 | 5 | import re 6 | 7 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 8 | 9 | 10 | class Search(BaseSearch): 11 | """ 12 | Searches Baidu for string 13 | """ 14 | name = "Baidu" 15 | search_url = "https://www.baidu.com/s?" 16 | summary = "\tBaidu, Inc. is a Chinese multinational technology company specializing in"\ 17 | " Internet-related services and products and artificial intelligence (AI), headquartered"\ 18 | " in Beijing's Haidian District.\n\tIt is one of the largest AI and internet"\ 19 | " companies in the world.\n\tBaidu offers various services, including a"\ 20 | " Chinese search engine, as well as a mapping service called Baidu Maps." 21 | 22 | """Override get_search_url""" 23 | 24 | def get_params(self, query=None, page=None, offset=None, **kwargs): 25 | params = {} 26 | params["wd"] = query 27 | params["pn"] = (page - 1) * 10 28 | params["oq"] = query 29 | return params 30 | 31 | def parse_soup(self, soup): 32 | """ 33 | Parses Baidu for a search query 34 | """ 35 | 36 | # Baidu search can be made deterministic via an id 37 | # Hence, a regex is used to match all eligible ids 38 | 39 | return soup.find_all('div', {'id': re.compile(r"^\d{1,2}")}, class_="c-container") 40 | 41 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 42 | """ 43 | Parses the source code to return 44 | 45 | :param single_result: single result found in div with a numeric id 46 | :type single_result: `bs4.element.Tag` 47 | :return: parsed title, link and description of single result 48 | :rtype: dict 49 | """ 50 | rdict = SearchItem() 51 | if return_type in (ReturnType.FULL, return_type.TITLE): 52 | h3_tag = single_result.find('h3') 53 | 54 | # sometimes h3 tag is not found 55 | if h3_tag: 56 | rdict["title"] = h3_tag.text 57 | 58 | if return_type in (ReturnType.FULL, ReturnType.LINK): 59 | link_tag = single_result.find('a') 60 | # Get the text and link 61 | rdict["links"] = link_tag.get('href') 62 | 63 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 64 | desc = single_result.find('div', class_='c-abstract') 65 | rdict["descriptions"] = desc if desc else '' 66 | return rdict 67 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/bing.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for Bing search results 3 | """ 4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 5 | 6 | 7 | class Search(BaseSearch): 8 | """ 9 | Searches Bing for string 10 | """ 11 | name = "Bing" 12 | search_url = 
"https://www.bing.com/search?" 13 | summary = "\tBing is Microsoft’s attempt to challenge Google in search, but despite their "\ 14 | "efforts they still did not manage to convince users that their search engine can be"\ 15 | " an alternative to Google.\n\tTheir search engine market share is constantly below "\ 16 | "10%, even though Bing is the default search engine on Windows PCs." 17 | 18 | def get_params(self, query=None, page=None, offset=None, **kwargs): 19 | params = {} 20 | params["q"] = query 21 | params["offset"] = 0 22 | params["first"] = offset 23 | params["count"] = 10 24 | params["FORM"] = "PERE" 25 | return params 26 | 27 | def parse_soup(self, soup): 28 | """ 29 | Parses Bing for a search query. 30 | """ 31 | # find all li tags 32 | return soup.find_all('li', class_='b_algo') 33 | 34 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 35 | """ 36 | Parses the source code to return 37 | 38 | :param single_result: single result found in
  • 39 | :type single_result: `bs4.element.ResultSet` 40 | :return: parsed title, link and description of single result 41 | :rtype: dict 42 | """ 43 | rdict = SearchItem() 44 | h2_tag = single_result.find('h2') 45 | link_tag = h2_tag.find('a') 46 | 47 | if return_type in (ReturnType.FULL, return_type.TITLE): 48 | rdict["titles"] = link_tag.text 49 | 50 | if return_type in (ReturnType.FULL, return_type.LINK): 51 | link = link_tag.get('href') 52 | rdict["links"] = link 53 | 54 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 55 | caption = single_result.find('div', class_='b_caption') 56 | desc = caption.find('p') 57 | rdict["descriptions"] = desc.text 58 | 59 | return rdict 60 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/coursera.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for coursera search results 3 | """ 4 | 5 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 6 | from urllib.parse import urljoin 7 | 8 | 9 | class Search(BaseSearch): 10 | """ 11 | Searches Coursera for string 12 | """ 13 | name = "Coursera" 14 | search_url = "https://www.coursera.org/search?" 15 | summary = "\tCoursera is an American online learning platform founded by Stanford professors Andrew Ng and " \ 16 | "Daphne Koller that offers massive open online courses, specializations, and degrees." 17 | 18 | def get_params(self, query=None, page=None, offset=None, **kwargs): 19 | params = {} 20 | params["query"] =query 21 | params["page"] = page 22 | return params 23 | 24 | def parse_soup(self, soup): 25 | """ 26 | Parses Coursera Search Soup for results 27 | """ 28 | # find all class_='gs_r gs_or gs_scl' => each result 29 | return soup.find_all('li', class_='ais-InfiniteHits-item') 30 | 31 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 32 | """ 33 | Parses the source code to return 34 | 35 | :param single_result: single result found in
    36 | :type single_result: `bs4.element.ResultSet` 37 | :return: parsed title, link, description, file link, result type of single result 38 | :rtype: dict 39 | """ 40 | rdict = SearchItem() 41 | 42 | if return_type in (ReturnType.FULL, return_type.LINK): 43 | link = single_result.find('a', class_='rc-DesktopSearchCard anchor-wrapper').get('href') 44 | 45 | rdict["links"] = urljoin('https://www.coursera.org', link) 46 | 47 | if return_type in (ReturnType.FULL, return_type.TITLE): 48 | title = single_result.find('h2', class_="card-title").text 49 | rdict["titles"] = title 50 | 51 | if return_type in (ReturnType.FULL,): 52 | partner_elem = single_result.find('span', class_='partner-name') 53 | partner = '' 54 | if partner_elem: 55 | partner = partner_elem.text 56 | 57 | rating_avg_elem = single_result.find('span', class_='ratings-text') 58 | rating_avg = None 59 | if rating_avg_elem: 60 | rating_avg = float(rating_avg_elem.text) 61 | 62 | enrollment_elem = single_result.find('span', class_='enrollment-number') 63 | enrolment_number = None 64 | 65 | if enrollment_elem: 66 | enr_cl_txt = enrollment_elem.text.lower().replace(',', '').replace('.', '')\ 67 | .replace('m', '0' * 6).replace('k', '0' * 3) 68 | if enr_cl_txt.isdigit(): 69 | enrolment_number = int(enr_cl_txt) 70 | 71 | difficulty_elem = single_result.find('span', class_='difficulty') 72 | difficulty = '' 73 | if difficulty_elem: 74 | difficulty = difficulty_elem.text 75 | 76 | rating_count_elem = single_result.find('span', class_='ratings-count') 77 | rating_count = None 78 | if rating_count_elem: 79 | rating_count_elem = rating_count_elem.find('span') 80 | rating_count_cl = rating_count_elem.text.replace(',', '') 81 | if rating_count_cl.isdigit(): 82 | rating_count = int(rating_count_cl) 83 | 84 | rdict.update({ 85 | "partners": partner, 86 | "ratings_avg": rating_avg, 87 | "ratings_count": rating_count, 88 | "enrolments_numbers": enrolment_number, 89 | "difficulties": difficulty, 90 | }) 91 | return rdict 92 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/duckduckgo.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for DuckDuckGo search results 3 | """ 4 | import re 5 | 6 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 7 | 8 | 9 | class Search(BaseSearch): 10 | """ 11 | Searches DuckDuckGo for string 12 | """ 13 | name = "DuckDuckGo" 14 | base_url = "https://www.duckduckgo.com" 15 | search_url = "https://www.duckduckgo.com/html/?" 16 | summary = "\tHas a number of advantages over the other search engines. \n\tIt has a clean "\ 17 | "interface, it does not track users, it is not fully loaded with ads and has a number "\ 18 | "of very nice features (only one page of results, you can search directly other web "\ 19 | "sites etc).\n\tAccording to DuckDuckGo traffic stats [December, 2018], they are "\ 20 | "currently serving more than 30 million searches per day." 
21 | 22 | def get_params(self, query=None, page=None, offset=None, **kwargs): 23 | params = {} 24 | params["q"] = query 25 | params["s"] = 0 if (page < 2) else (((page-1) * 50) - 20) 26 | params["dc"] = offset 27 | params["o"] = "json" 28 | params["api"] = "d.js" 29 | return params 30 | 31 | def parse_soup(self, soup): 32 | """ 33 | Parses DuckDuckGo Search Soup for a query results 34 | """ 35 | # find all div tags 36 | return soup.find_all('div', class_='result') 37 | 38 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 39 | """ 40 | Parses the source code to return 41 | 42 | :param single_result: single result found in
    43 | :type single_result: `bs4.element.ResultSet` 44 | :return: parsed title, link and description of single result 45 | :rtype: dict 46 | """ 47 | 48 | rdict = SearchItem() 49 | 50 | if return_type in (ReturnType.FULL, return_type.TITLE): 51 | h2 = single_result.find( 52 | 'h2', class_="result__title") # pylint: disable=invalid-name 53 | # Get the text and link 54 | rdict["titles"] = h2.text.strip() 55 | 56 | if return_type in (ReturnType.FULL, ReturnType.LINK): 57 | link = None 58 | link_tag = single_result.find('a', class_="result__a") 59 | if link_tag is not None: 60 | rdict["links"] = link_tag.get('href') 61 | else: 62 | rdict['links'] = None 63 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 64 | desc = single_result.find(class_='result__snippet') 65 | if desc is not None: 66 | rdict["descriptions"] = desc.text 67 | else: 68 | rdict["descriptions"] = "" 69 | if rdict['links'] is None: 70 | rdict = None 71 | 72 | return rdict 73 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/github.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for GitHub search results 3 | """ 4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 5 | from search_engine_parser.core.exceptions import IncorrectKeyWord 6 | 7 | 8 | class Search(BaseSearch): 9 | """ 10 | Searches GitHub for string 11 | """ 12 | name = "GitHub" 13 | base_url = "https://github.com" 14 | search_url = base_url + "/search?" 15 | summary = "\tGitHub is an American company that provides hosting for software development "\ 16 | "version control using Git. It is a subsidiary of Microsoft, which acquired the company "\ 17 | "in 2018 for $7.5 billion.\n\tIt offers all of the distributed version control and source"\ 18 | " code management (SCM) functionality of Git as well as adding its own features."\ 19 | "\n\tAs of May 2019, GitHub reports having over 37 million users and more than 100 million"\ 20 | " repositories (including at least 28 million public repositories), making it the largest "\ 21 | "host of source code in the world." 22 | 23 | def get_params(self, query=None, page=None, offset=None, **kwargs): 24 | params = {} 25 | params["q"] = query 26 | params["p"] = page 27 | params["type"] = kwargs.get("type_", None) 28 | self.type = params["type"] 29 | return params 30 | 31 | def parse_soup(self, soup): 32 | """ 33 | Parses GitHub for a search query. 
34 | """ 35 | allowed_types = ( 36 | None, 37 | "Repositories", 38 | "Wikis", 39 | "Users", 40 | "Topics", 41 | "Marketplace", 42 | "RegistryPackages", 43 | "Issues", 44 | "Commits", 45 | "Code") 46 | if self.type not in allowed_types: 47 | raise IncorrectKeyWord( 48 | "No type <{type_}> exists".format(type_=self.type)) 49 | # find all li tags 50 | if self.type in (None, "Repositories"): 51 | return soup.find_all('li', class_='repo-list-item') 52 | elif self.type == "RegistryPackages": 53 | return soup.find_all("div", class_='hx_hit-package') 54 | # find all user divs 55 | elif self.type == "Users": 56 | return soup.find_all('div', class_='user-list-item') 57 | elif self.type == "Wikis": 58 | return soup.find_all('div', class_='hx_hit-wiki') 59 | elif self.type == "Topics": 60 | return soup.find_all('div', class_='topic-list-item') 61 | elif self.type == "Issues": 62 | return soup.find_all('div', class_='issue-list-item') 63 | elif self.type == "Marketplace": 64 | return soup.find_all('div', class_='hx_hit-marketplace') 65 | elif self.type == "Commits": 66 | return soup.find_all('div', class_='commits-list-item') 67 | 68 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 69 | """ 70 | Parses the source code to return 71 | 72 | :param single_result: single result found in container element 73 | :type single_result: `bs4.element.ResultSet` 74 | :return: parsed title, link and description of single result 75 | :rtype: dict 76 | """ 77 | rdict = SearchItem() 78 | if self.type in (None, "Repositories"): 79 | h3 = single_result.find( 80 | 'div', class_='f4') # pylint: disable=invalid-name 81 | link_tag = h3.find('a') 82 | # Get the text and link 83 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 84 | title = link_tag.text 85 | rdict["titles"] = title 86 | 87 | if return_type in (ReturnType.FULL, ReturnType.LINK): 88 | ref_link = link_tag.get('href') 89 | link = self.base_url + ref_link 90 | rdict["links"] = link 91 | 92 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 93 | desc = single_result.find('p', class_="mb-1") 94 | rdict["descriptions"] = getattr(desc, 'text', '') 95 | 96 | if return_type in (ReturnType.FULL,): 97 | stars_and_lang_div = single_result.find( 98 | 'div', class_='d-flex') 99 | lang = stars_and_lang_div.find( 100 | 'span', itemprop="programmingLanguage") 101 | stars = single_result.find('div', class_='mr-3').find( 102 | 'a') 103 | updated_on = single_result.find("relative-time").get("title") 104 | rdict.update({ 105 | "stars": "" if not stars else stars.text.strip(), 106 | "languages": lang.text if lang else "", 107 | "updated_on": updated_on, 108 | }) 109 | 110 | if self.type == "Users": 111 | title_tag = single_result.find('div', class_='f4') 112 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 113 | title = title_tag.text 114 | rdict["titles"] = title 115 | 116 | if return_type in (ReturnType.FULL, ReturnType.LINK): 117 | ref_link = title_tag.find('a').get('href') 118 | link = self.base_url + ref_link 119 | rdict["links"] = link 120 | 121 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 122 | desc_tag = single_result.find('p', class_='mb-1') 123 | desc = None 124 | if desc_tag: 125 | desc = desc_tag.text.strip(' \n') 126 | rdict["descriptions"] = desc 127 | 128 | if return_type in (ReturnType.FULL, ): 129 | location_div = single_result.find('div', class_='d-flex') 130 | location_and_email = location_div.find_all( 131 | 'div', class_='mr-3') 132 | location = email = None 133 | for single in 
location_and_email: 134 | if single.get('href') == None: 135 | location = single.text.strip(' \n') 136 | else: 137 | email = single.text 138 | 139 | rdict.update({ 140 | "locations": location, 141 | "emails": email, 142 | }) 143 | 144 | if self.type == "Wikis": 145 | title_tag = single_result.find('a', class_=None) 146 | 147 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 148 | title = title_tag.get('title') 149 | rdict["title"] = title 150 | 151 | if return_type in (ReturnType.FULL, ReturnType.LINK): 152 | ref_link = title_tag.get('href') 153 | link = self.base_url + ref_link 154 | rdict["links"] = link 155 | 156 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 157 | desc = single_result.find('p', class_="mb1").text 158 | rdict["descriptions"] = desc 159 | 160 | if return_type in (ReturnType.FULL, ): 161 | last_updated = single_result.find( 162 | 'relative-time').get('title') 163 | repository = single_result.find('a', class_='muted-link').text 164 | rdict.update({ 165 | "repositories": repository, 166 | "last_updated": last_updated, 167 | }) 168 | 169 | if self.type == "Topics": 170 | title_div = single_result.find('div', class_='f4') 171 | title_tag = title_div.find('a', class_=None) 172 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 173 | rdict["titles"] = title_tag.text 174 | if return_type in (ReturnType.FULL, ReturnType.LINK): 175 | ref_link = title_tag.get('href') 176 | link = self.base_url + ref_link 177 | rdict["links"] = link 178 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 179 | desc = None 180 | desc_tag = single_result.find('p', class_=None) 181 | if desc_tag: 182 | desc = desc_tag.text 183 | rdict["descriptions"] = desc 184 | 185 | if self.type == "Marketplace": 186 | title_tag = single_result.find('a', class_='no-underline') 187 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 188 | title = title_tag.get('title') 189 | rdict["titles"] = title_tag.text 190 | if return_type in (ReturnType.FULL, ReturnType.LINK): 191 | link = title_tag.get('href') 192 | rdict["links"] = link 193 | 194 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 195 | desc = None 196 | desc_tag = single_result.find('text-gray') 197 | if desc_tag: 198 | desc = desc_tag.text 199 | rdict["descriptions"] = desc 200 | 201 | if return_type in (ReturnType.FULL, ): 202 | categories = list() 203 | categories_tags = single_result.find_all('a', class_='Label') 204 | if categories_tags: 205 | for i in categories_tags: 206 | categories.append(str(i).strip('\n ')) 207 | rdict["categories"] = categories 208 | 209 | if self.type == "RegistryPackages": 210 | title_tag = single_result.find('a', class_='h4') 211 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 212 | title = title_tag.text 213 | rdict["titles"] = title_tag.text 214 | 215 | if return_type in (ReturnType.FULL, ReturnType.LINK): 216 | ref_link = title_tag.get('href') 217 | link = self.base_url + ref_link 218 | rdict["links"] = link 219 | 220 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 221 | desc = single_result.find( 222 | 'p', class_='mb-1').text.strip('\n ') 223 | rdict["descriptions"] = desc 224 | 225 | if self.type == "Issues": 226 | title_tag = single_result.find('a', class_=None) 227 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 228 | title = title_tag.text 229 | rdict["titles"] = title_tag.text 230 | 231 | if return_type in (ReturnType.FULL, ReturnType.LINK): 232 | ref_link = title_tag.get('href') 233 | link = self.base_url + ref_link 234 | 
rdict["links"] = link 235 | 236 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 237 | desc = single_result.find('p', class_='mb-0').text 238 | rdict["descriptions"] = desc 239 | 240 | if return_type in (ReturnType.FULL, ): 241 | repository = single_result.find( 242 | 'div', class_='ml-1').find('a', 'text-bold').text 243 | opened_by = self.base_url + \ 244 | single_result.find( 245 | 'div', class_='mr-3').find('a').get('href') 246 | opened_on = single_result.find('relative-time').get("title") 247 | rdict.update({ 248 | "opened_by": opened_by, 249 | "opened_on": opened_on, 250 | "respositories": repository, 251 | }) 252 | 253 | if self.type == "Commits": 254 | title_p = single_result.find('div', class_="f4") 255 | title_tag = title_p.find('a') 256 | 257 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 258 | title = title_tag.get('aria-label').strip("\n ") 259 | rdict["titles"] = title_tag.text 260 | 261 | if return_type in (ReturnType.FULL, ReturnType.LINK): 262 | ref_link = title_tag.get('href') 263 | if ref_link.startswith("http"): 264 | link = ref_link 265 | else: 266 | link = self.base_url + ref_link 267 | rdict["links"] = link 268 | 269 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 270 | opened_on = None 271 | author = None 272 | if single_result.find('relative-time'): 273 | opened_on = single_result.find( 274 | 'relative-time').get("title") 275 | desc = None 276 | if single_result.find('a', class_='commit-author'): 277 | author_tag = single_result.find( 278 | 'a', class_='commit-author') 279 | author = author_tag.text 280 | div = single_result.find('div', class_='d-flex') 281 | repo = div.find('a').text 282 | desc = "Committed to {}".format(repo) 283 | rdict["descriptions"] = desc 284 | if return_type == ReturnType.FULL: 285 | rdict.update({ 286 | "authors": author, 287 | "opened_on": opened_on, 288 | }) 289 | return rdict 290 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/google.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for google search results 3 | """ 4 | import sys 5 | from urllib.parse import ( 6 | urljoin, 7 | parse_qs, 8 | unquote 9 | ) 10 | import urllib.parse as urlparse 11 | 12 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 13 | 14 | 15 | EXTRA_PARAMS = ('hl', 'tbs') 16 | 17 | 18 | class Search(BaseSearch): 19 | """ 20 | Searches Google for string 21 | """ 22 | name = "Google" 23 | base_url = "https://www.google.com/" 24 | summary = "\tNo need for further introductions. The search engine giant holds the first "\ 25 | "place in search with a stunning difference of 65% from second in place Bing.\n"\ 26 | "\tAccording to the latest netmarketshare report (November 2018) 73% of searches "\ 27 | "were powered by Google and only 7.91% by Bing.\n\tGoogle is also dominating the "\ 28 | "mobile/tablet search engine market share with 81%!" 
29 | 30 | def __init__(self): 31 | super().__init__() 32 | self.search_url = urljoin(self.base_url, "search") 33 | 34 | def get_params(self, query=None, offset=None, page=None, **kwargs): 35 | params = {} 36 | params["start"] = (page-1) * 10 37 | params["q"] = query 38 | params["gbv"] = 1 39 | # additional parameters will be considered 40 | for param in EXTRA_PARAMS: 41 | if kwargs.get(param): 42 | params[param] = kwargs[param] 43 | return params 44 | 45 | def parse_url(self, url): 46 | return self.clean_url(urljoin(self.base_url, url)) 47 | 48 | def parse_soup(self, soup): 49 | """ 50 | Parses Google Search Soup for results 51 | """ 52 | # find all class_='g' => each result 53 | return soup.find_all('div', class_="Gx5Zad fP1Qef xpd EtOod pkphOe") 54 | 55 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 56 | """ 57 | Parses the source code to return 58 | 59 | :param single_result: single result found in
    60 | :type single_result: `bs4.element.ResultSet` 61 | :return: parsed title, link and description of single result 62 | :rtype: dict 63 | """ 64 | # Some unneeded details shown such as suggestions should be ignore 65 | if (single_result.find("h2", class_="wITvVb") and single_result.find("div", class_="LKSyXe"))\ 66 | or single_result.find("div", class_="X7NTVe"): 67 | return 68 | 69 | results = SearchItem() 70 | els = single_result.find_all('div', class_='kCrYT') 71 | if len(els) < 2: 72 | return 73 | 74 | # First div contains title and url 75 | r_elem = els[0] 76 | 77 | # Get the text and link 78 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 79 | link_tag = r_elem.find('a') 80 | if link_tag: 81 | title = link_tag.find('h3').text 82 | else: 83 | r_elem = els[1] 84 | title = r_elem.find('div', class_='BNeawe').text 85 | results['titles'] = title 86 | 87 | if return_type in (ReturnType.FULL, ReturnType.LINK): 88 | link_tag = r_elem.find('a') 89 | if link_tag: 90 | raw_link = link_tag.get('href') 91 | raw_url = urljoin(self.base_url, raw_link) 92 | results['raw_urls'] = raw_url 93 | results['links'] = self.clean_url(raw_url) 94 | 95 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 96 | # Second Div contains Description 97 | desc_tag = els[1] 98 | if return_type in (ReturnType.FULL, ReturnType.LINK) and not results.get('links'): 99 | link_tag = desc_tag.find('a') 100 | if link_tag: 101 | desc_tag = els[0] 102 | raw_link = link_tag.get('href') 103 | raw_url = urljoin(self.base_url, raw_link) 104 | results['raw_urls'] = raw_url 105 | results['links'] = self.clean_url(raw_url) 106 | desc = desc_tag.text 107 | results['descriptions'] = desc 108 | return results 109 | 110 | def clean_url(self, url): 111 | """ 112 | Extract clean URL from the SERP URL. 113 | 114 | >clean_url('https://www.google.com/url?q=https://english.stackexchange.com/questions/140710/what-is-the-opposite-of-preaching-to-the-choir&sa=U&ved=2ahUKEwi31MGyzvnuAhXyyDgGHXXACOYQFnoECAkQAg&usg=AOvVaw1GdXON-JIWGu-dGjHfgljl') 115 | https://english.stackexchange.com/questions/140710/what-is-the-opposite-of-preaching-to-the-choir 116 | """ 117 | parsed = urlparse.urlparse(url) 118 | url_qs = parse_qs(parsed.query) 119 | if 'q' in url_qs: 120 | return unquote(url_qs['q'][0]) 121 | elif 'url' in url_qs: 122 | return unquote(url_qs['url'][0]) 123 | # Add more cases here. 124 | return url 125 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/googlenews.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for google news search results 3 | """ 4 | 5 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 6 | 7 | 8 | class Search(BaseSearch): 9 | """ 10 | Searches Google News for string 11 | """ 12 | name = "GoogleNews" 13 | search_url = "https://www.google.com/search?" 14 | summary = "\tGoogle News is a news aggregator app developed by Google. It presents a "\ 15 | "continuous, customizable flow of articles organized from thousands of publishers "\ 16 | "and magazines. Google News is available as an app on Android, iOS, and the Web. "\ 17 | "Google released a beta version in September 2002 and the official app in January 2006." 
18 | 19 | def get_params(self, query=None, offset=None, page=None, **kwargs): 20 | params = {} 21 | params["num"] = 10 22 | params["start"] = page 23 | params["q"] = query 24 | params["client"] = "ubuntu" 25 | params["tbm"] = "nws" 26 | return params 27 | 28 | def parse_soup(self, soup): 29 | """ 30 | Parses Google News Search Soup for results 31 | """ 32 | # find all class_='g' => each result 33 | return soup.find_all('div', class_='g') 34 | 35 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 36 | """ 37 | Parses the source code to return 38 | 39 | :param single_result: single result found in
    40 | :type single_result: `bs4.element.ResultSet` 41 | :return: parsed title, link, description, imge link, news source, date of single result 42 | :rtype: dict 43 | """ 44 | rdict = SearchItem() 45 | 46 | if return_type in (ReturnType.FULL, return_type.TITLE): 47 | title_tag = single_result.find('h3') 48 | title = title_tag.text 49 | rdict["titles"] = title 50 | 51 | if return_type in (ReturnType.FULL, ReturnType.LINK): 52 | link_tag = single_result.find('a') 53 | rdict["links"] = link_tag.get('href') 54 | 55 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 56 | desc_tag = single_result.find('div', class_='st') 57 | rdict["descriptions"] = desc_tag.text 58 | 59 | if return_type in (ReturnType.FULL,): 60 | img_tag = single_result.find('img', class_='th') 61 | news_source_tag = single_result.find('span', class_='e8fRJf') 62 | date_tag = single_result.find('span', class_='f') 63 | 64 | rdict["image_url"] = img_tag.get('src') 65 | rdict["news_source"] = news_source_tag.text 66 | rdict["date"] = date_tag.text 67 | return rdict 68 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/googlescholar.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for google scholar search results 3 | """ 4 | 5 | import re 6 | 7 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 8 | 9 | 10 | class Search(BaseSearch): 11 | """ 12 | Searches Google Scholar for string 13 | """ 14 | name = "GoogleScholar" 15 | search_url = "https://scholar.google.gr/scholar?" 16 | summary = "\tGoogle Scholar is a freely accessible web search engine that indexes the full "\ 17 | "text or metadata of scholarly literature across an array of publishing formats and "\ 18 | "disciplines." 19 | 20 | def get_params(self, query=None, offset=None, page=None, **kwargs): 21 | params = {} 22 | params["hl"] = "en" 23 | params["start"] = page 24 | params["q"] = query 25 | return params 26 | 27 | def parse_soup(self, soup): 28 | """ 29 | Parses Google Scholar Search Soup for results 30 | """ 31 | # find all class_='gs_r gs_or gs_scl' => each result 32 | return soup.find_all('div', class_='gs_r gs_or gs_scl') 33 | 34 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 35 | """ 36 | Parses the source code to return 37 | 38 | :param single_result: single result found in
    39 | :type single_result: `bs4.element.ResultSet` 40 | :return: parsed title, link, description, file link, result type of single result 41 | :rtype: dict 42 | """ 43 | rdict = SearchItem() 44 | r_elem = single_result.find('h3', class_='gs_rt') 45 | if return_type in (ReturnType.FULL, ReturnType.LINK): 46 | link_tag = r_elem.find('a') 47 | if link_tag: 48 | raw_link = link_tag.get('href') 49 | else: 50 | raw_link = '' 51 | rdict["links"] = raw_link 52 | 53 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 54 | desc = single_result.find('div', class_='gs_rs') 55 | if desc: 56 | desc = desc.text 57 | else: 58 | desc = '' 59 | rdict["descriptions"] = desc 60 | 61 | if return_type in (ReturnType.FULL, return_type.TITLE): 62 | title = r_elem.text 63 | title = re.sub(r'^[\[\w+\]]+ ', '', title) 64 | rdict["titles"] = title 65 | 66 | if return_type == ReturnType.FULL: 67 | t_elem = single_result.find('span', class_='gs_ct1') 68 | if t_elem: 69 | result_type = t_elem.text 70 | else: 71 | result_type = '' 72 | 73 | f_elem = single_result.find('div', class_='gs_or_ggsm') 74 | if f_elem: 75 | flink_tag = r_elem.find('a') 76 | if flink_tag: 77 | file_link = flink_tag.get('href') 78 | else: 79 | file_link = '' 80 | else: 81 | file_link = '' 82 | 83 | rdict.update({ 84 | "result_types": result_type, 85 | "files_links": file_link 86 | }) 87 | 88 | return rdict 89 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/myanimelist.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for MyAnimeList search results 3 | """ 4 | 5 | import math 6 | import sys 7 | 8 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 9 | 10 | 11 | class Search(BaseSearch): 12 | """ 13 | Searches MyAnimeList for string 14 | """ 15 | name = "MyAnimeList" 16 | 17 | search_url = "https://myanimelist.net/anime.php?" 18 | summary = "\tMyAnimeList, often abbreviated as MAL, is an anime and manga social"\ 19 | "networking and social cataloging application website."\ 20 | "\n\tThe site provides its users with a list-like system to organize"\ 21 | "and score anime and manga.\n\tIt facilitates finding users who share"\ 22 | "similar tastes and provides a large database on anime and manga.\n\tThe"\ 23 | "site claims to have 4.4 million anime and 775,000 manga entries."\ 24 | "\n\tIn 2015, the site received over 120 million visitors a month." 
25 | 26 | def get_params(self, query=None, page=None, offset=None, **kwargs): 27 | params = {} 28 | params["show"] = (math.ceil(page / 5) - 1) * 50 29 | params["q"] = query 30 | return params 31 | 32 | def parse_soup(self, soup): 33 | """ 34 | Parses MyAnimeList for a search query 35 | """ 36 | 37 | # The data is stored in table so find all table rows 38 | # The first row is table header 39 | res = soup.find('div', class_='js-categories-seasonal js-block-list list') 40 | if res: 41 | return res.find_all('tr')[1:] 42 | 43 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 44 | """ 45 | Parses the source code to return 46 | 47 | :param single_result: single result found in a table row 48 | :type single_result: `bs4.element.Tag` 49 | :return: parsed title, link and description of single result 50 | :rtype: dict 51 | """ 52 | rdict = SearchItem() 53 | link_tag = single_result.find('a', class_='fw-b') 54 | 55 | if return_type in (ReturnType.FULL, ReturnType.TITLE): 56 | title = link_tag.find('strong').text 57 | rdict["titles"] = title 58 | 59 | if return_type in (ReturnType.FULL, ReturnType.LINK): 60 | rdict["links"] = link_tag.get('href') 61 | 62 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 63 | desc = single_result.find('div', class_='pt4').text.strip() 64 | rdict["descriptions"] = desc 65 | 66 | if return_type == ReturnType.FULL: 67 | data = list(single_result.find_all('td', class_='ac')) 68 | animetype = data[0].text.strip() 69 | episodes = data[1].text.strip() 70 | score = data[2].text.strip() 71 | 72 | rdict.update({ 73 | "episode_count": episodes, 74 | "animetypes": animetype, 75 | "ratings": score 76 | }) 77 | return rdict 78 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/stackoverflow.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for StackOverflow search results 3 | """ 4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 5 | 6 | 7 | class Search(BaseSearch): 8 | """ 9 | Searches StackOverflow for string 10 | """ 11 | name = "StackOverflow" 12 | base_url = "https://stackoverflow.com" 13 | search_url = base_url + "/search?" 14 | summary = "\tStack Overflow is a question and answer site for professional and enthusiast "\ 15 | "programmers.\n\tIt is a privately held website, the flagship site of the Stack "\ 16 | "Exchange Network, created in 2008 by Jeff Atwood and Joel Spolsky.\n\tIt features "\ 17 | "questions and answers on a wide range of topics in computer programming. It was "\ 18 | "created to be a more open alternative to earlier question and answer sites "\ 19 | "such as Experts-Exchange." 20 | 21 | def get_params(self, query=None, offset=None, page=None, **kwargs): 22 | params = {} 23 | params["page"] = page 24 | params["q"] = query 25 | params["pagesize"] = 15 26 | return params 27 | 28 | def parse_soup(self, soup): 29 | """ 30 | Parses StackOverflow for a search query 31 | """ 32 | # find all divs 33 | return soup.find_all('div', class_='summary') 34 | 35 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 36 | """ 37 | Parses the source code to return 38 | 39 | :param single_result: single result found in
    40 | :type single_result: `bs4.element.ResultSet` 41 | :return: parsed title, link and description of single result 42 | :rtype: dict 43 | """ 44 | rdict = SearchItem() 45 | h3 = single_result.find('h3') # pylint: disable=invalid-name 46 | link_tag = h3.find('a') 47 | if return_type in (ReturnType.FULL, return_type.TITLE): 48 | # Get the text and link 49 | rdict["titles"] = link_tag.text 50 | 51 | if return_type in (ReturnType.FULL, return_type.LINK): 52 | ref_link = link_tag.get('href') 53 | link = self.base_url + ref_link 54 | rdict["links"] = link 55 | 56 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 57 | caption = single_result.find('div', class_='excerpt') 58 | rdict["descriptions"] = caption.text 59 | return rdict 60 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/yahoo.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for Yahoo search results 3 | """ 4 | import re 5 | 6 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 7 | 8 | 9 | class Search(BaseSearch): 10 | """ 11 | Searches Yahoo for string 12 | """ 13 | name = "Yahoo" 14 | search_url = "https://search.yahoo.com/search?" 15 | summary = "\tYahoo is one the most popular email providers and holds the fourth place in "\ 16 | "search with 3.90% market share.\n\tFrom October 2011 to October 2015, Yahoo search "\ 17 | "was powered exclusively by Bing. \n\tSince October 2015 Yahoo agreed with Google to "\ 18 | "provide search-related services and since then the results of Yahoo are powered both "\ 19 | "by Google and Bing. \n\tYahoo is also the default search engine for Firefox browsers "\ 20 | "in the United States (since 2014)." 21 | 22 | def get_params(self, query=None, page=None, offset=None, **kwargs): 23 | params = {} 24 | params["p"] = query 25 | params["b"] = offset 26 | return params 27 | 28 | def parse_soup(self, soup): 29 | """ 30 | Parses Yahoo for a search query 31 | """ 32 | # find all divs 33 | return soup.find_all('div', class_='Sr') 34 | 35 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 36 | """ 37 | Parses the source code to return 38 | 39 | :param single_result: single result found in
    40 | :type single_result: `bs4.element.ResultSet` 41 | :return: parsed title, link and description of single result 42 | :rtype: dict 43 | """ 44 | rdict = SearchItem() 45 | h3_tag = single_result.find('h3', class_='title') 46 | 47 | if return_type in (ReturnType.FULL, return_type.TITLE): 48 | title = h3_tag.text 49 | rdict["titles"] = title 50 | 51 | if return_type in (ReturnType.FULL, ReturnType.LINK): 52 | link_tag = h3_tag.find('a') 53 | raw_link = link_tag.get('href') 54 | re_str = re.findall("/RU=(.+)/RK", raw_link)[0] 55 | re_str = re_str.replace("%3a", ":") 56 | link = re_str.replace("%2f", "/") 57 | rdict["links"] = link 58 | 59 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 60 | desc = single_result.find('span', class_='fc-falcon') 61 | rdict["descriptions"] = desc.text 62 | 63 | return rdict 64 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/yandex.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for Yandex search results 3 | """ 4 | 5 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 6 | 7 | 8 | class Search(BaseSearch): 9 | """ 10 | Searches Yandex for string 11 | """ 12 | name = "Yandex" 13 | search_url = "https://yandex.com/search/?" 14 | summary = "\tYandex is the largest technology company in Russia and the"\ 15 | " largest search engine on the internet in Russian"\ 16 | ", with a market share of over 52%."\ 17 | "\n\tThe Yandex.ru home page is the 4th most popular website in Russia."\ 18 | "\n\tIt also has the largest market share of any search engine in the Commonwealth"\ 19 | " of Independent States and is the 5th largest search engine worldwide"\ 20 | " after Google, Baidu, Bing, and Yahoo!" 21 | 22 | def get_params(self, query=None, page=None, offset=None, **kwargs): 23 | params = {} 24 | params["text"] = query 25 | params["p"] = offset 26 | return params 27 | 28 | def parse_soup(self, soup): 29 | """ 30 | Parses Yandex for a search query 31 | """ 32 | return soup.find_all('li', class_="serp-item") 33 | 34 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 35 | """ 36 | Parses the source code to return 37 | 38 | :param single_result: single result found in
  • 39 | :type single_result: `bs4.element.ResultSet` 40 | :return: parsed title, link and description of single result 41 | :rtype: str, str, str 42 | """ 43 | rdict = SearchItem() 44 | h3_tag = single_result.find('div', class_="organic__url-text") 45 | 46 | if return_type in (ReturnType.FULL, return_type.TITLE): 47 | # Get the text and link 48 | title = h3_tag.text 49 | # Handle read more type texts 50 | index = title.find("Read more") 51 | if index >= 0: 52 | title = title[0:int(index)] 53 | rdict["titles"] = title 54 | 55 | if return_type in (ReturnType.FULL, ReturnType.LINK): 56 | link_tag = single_result.find('a') 57 | link = link_tag.get('href') 58 | rdict["links"] = link 59 | 60 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 61 | desc = single_result.find('div', class_="organic__content-wrapper") 62 | desc = desc.text 63 | rdict["descriptions"] = desc 64 | return rdict 65 | -------------------------------------------------------------------------------- /search_engine_parser/core/engines/youtube.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Parser for YouTube search results 3 | """ 4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem 5 | 6 | 7 | class Search(BaseSearch): 8 | """ 9 | Searches YouTube for string 10 | """ 11 | name = "YouTube" 12 | base_url = "https://youtube.com" 13 | search_url = base_url + "/results?" 14 | summary = "\tYouTube is an American video-sharing website headquartered in San Bruno, "\ 15 | "California. Three former PayPal employees—Chad Hurley, Steve Chen, and Jawed "\ 16 | "Karim—created the service in February 2005.\n\tGoogle bought the site in November "\ 17 | "2006 for US$1.65 billion; YouTube now operates as one of Google's subsidiaries. "\ 18 | "As of May 2019, more than 500 hours of video content are uploaded to YouTube every minute" 19 | 20 | def get_params(self, query=None, page=None, offset=None, **kwargs): 21 | params = {} 22 | params["search_query"] = query 23 | return params 24 | 25 | def parse_soup(self, soup): 26 | """ 27 | Parses YouTube for a search query. 
28 | """ 29 | # find all ytd-video-renderer tags 30 | return soup.find_all('div', class_='yt-lockup-content') 31 | 32 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs): 33 | """ 34 | Parses the source code to return 35 | 36 | :param single_result: single result found in 37 | :type single_result: `bs4.element.ResultSet` 38 | :return: parsed title, link and description of single result 39 | :rtype: dict 40 | """ 41 | rdict = SearchItem() 42 | # pylint: disable=too-many-locals 43 | title_tag = single_result.find('a', class_='yt-uix-tile-link') 44 | channel_name = "" 45 | 46 | if return_type in (ReturnType.FULL, return_type.TITLE): 47 | # Get the text and link 48 | rdict["titles"] = title_tag.text 49 | 50 | # try for single videos 51 | try: 52 | if return_type in (ReturnType.FULL, ReturnType.LINK): 53 | ref_link = title_tag.get('href') 54 | link = self.base_url + ref_link 55 | rdict["links"] = link 56 | 57 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION): 58 | desc = single_result.find( 59 | 'div', class_="yt-lockup-description").text 60 | rdict["descriptions"] = desc 61 | 62 | if return_type in (ReturnType.FULL, ): 63 | duration = single_result.find( 64 | 'span', class_='accessible-description').text 65 | ul_tag = single_result.find('ul', class_='yt-lockup-meta-info') 66 | 67 | channel_name = single_result.find( 68 | 'a', class_='yt-uix-sessionlink spf-link').text 69 | views_and_upload_date = ul_tag.find_all('li') 70 | upload_date = views_and_upload_date[0].text 71 | views = views_and_upload_date[1].text 72 | rdict.update({ 73 | "channels": channel_name, 74 | "durations": duration, 75 | "views": views, 76 | "upload_dates": upload_date, 77 | }) 78 | except BaseException: # pylint: disable=broad-except 79 | link_tags = single_result.find_all( 80 | 'a', class_='yt-uix-sessionlink spf-link') 81 | # TODO Optimize calls here so that we don't assign ref_link and channel_name 82 | # when we don't need them 83 | for i in link_tags: 84 | if i.get("href").startswith("/playlist"): 85 | ref_link = i.get("href") 86 | elif i.get("href").startswith("/user"): 87 | channel_name = i.text 88 | if return_type in (ReturnType.FULL, ReturnType.LINK): 89 | link = self.base_url + ref_link 90 | rdict["links"] = link 91 | 92 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION): 93 | desc = single_result.find( 94 | 'span', class_='accessible-description').text 95 | rdict["descriptions"] = desc 96 | if return_type in (ReturnType.FULL,): 97 | rdict.update({ 98 | "channels": channel_name, 99 | }) 100 | return rdict 101 | -------------------------------------------------------------------------------- /search_engine_parser/core/exceptions.py: -------------------------------------------------------------------------------- 1 | """@desc 2 | Exceptions 3 | """ 4 | 5 | 6 | class NoResultsFound(Exception): 7 | pass 8 | 9 | 10 | class NoResultsOrTrafficError(Exception): 11 | """ When No results is returned or unusual traffic caused app to return empty results """ 12 | 13 | class IncorrectKeyWord(Exception): 14 | """ When a wrong keyword argument is passed to the search function """ 15 | -------------------------------------------------------------------------------- /search_engine_parser/core/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import pickle 4 | import hashlib 5 | import aiohttp 6 | from fake_useragent import UserAgent 7 | 8 | FILEPATH = os.path.dirname(os.path.abspath(__file__)) 9 | 
10 | # prevent caching 11 | USER_AGENT_LIST = [ 12 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0", 13 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " 14 | "Chrome/72.0.3626.121 Safari/537.36", 15 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0", 16 | "Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0", 17 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) " 18 | "Chrome/19.0.1084.46 Safari/536.5", 19 | "Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) " 20 | "Chrome/19.0.1084.46 Safari/536.5", 21 | ] 22 | 23 | 24 | def get_rand_user_agent(): 25 | user_agent = random.choice(USER_AGENT_LIST) 26 | try: 27 | user_agent = UserAgent().random 28 | except: 29 | pass 30 | return user_agent 31 | 32 | 33 | 34 | class CacheHandler: 35 | def __init__(self): 36 | self.cache = os.path.join(FILEPATH, "cache") 37 | engine_path = os.path.join(FILEPATH, "engines") 38 | if not os.path.exists(self.cache): 39 | os.makedirs(self.cache) 40 | enginelist = os.listdir(engine_path) 41 | self.engine_cache = {i[:-3]: os.path.join(self.cache, i[:-3]) for i in enginelist if i not in 42 | ("__init__.py")} 43 | for cache in self.engine_cache.values(): 44 | if not os.path.exists(cache): 45 | os.makedirs(cache) 46 | 47 | async def get_source(self, engine, url, headers, cache=True, 48 | proxy=None, proxy_auth=None): 49 | """ 50 | Retrieves source code of webpage from internet or from cache 51 | 52 | :rtype: str, bool 53 | :param engine: engine of the engine saving 54 | :type engine: str 55 | :param url: URL to pull source code from 56 | :type url: str 57 | :param headers: request headers to make use of 58 | :type headers: dict 59 | :param cache: use cache or not 60 | :type cache: bool 61 | :param proxy: proxy address to make use off 62 | :type proxy: str 63 | :param proxy_auth: (user, password) tuple to authenticate proxy 64 | :type proxy_auth: (str, str) 65 | """ 66 | encodedUrl = url.encode("utf-8") 67 | urlhash = hashlib.sha256(encodedUrl).hexdigest() 68 | engine = engine.lower() 69 | cache_path = os.path.join(self.engine_cache[engine], urlhash) 70 | if os.path.exists(cache_path) and cache: 71 | with open(cache_path, 'rb') as stream: 72 | return pickle.load(stream), True 73 | get_vars = { 'url':url, 'headers':headers } 74 | if proxy and proxy_auth: 75 | auth = aiohttp.BasicAuth(*proxy_auth) 76 | get_vars.update({'proxy':proxy, 'proxy_auth': auth}) 77 | 78 | async with aiohttp.ClientSession() as session: 79 | async with session.get(**get_vars) as resp: 80 | html = await resp.text() 81 | with open(cache_path, 'wb') as stream: 82 | pickle.dump(str(html), stream) 83 | return str(html), False 84 | 85 | def clear(self, engine=None): 86 | """ 87 | Clear the entire cache either by engine name 88 | or just all 89 | 90 | :param engine: engine to clear 91 | """ 92 | if not engine: 93 | for engine_cache in self.engine_cache.values(): 94 | for root, dirs, files in os.walk(engine_cache): 95 | for f in files: 96 | os.remove(os.path.join(engine_cache, f)) 97 | else: 98 | engine_cache = self.engine_cache[engine.lower()] 99 | for _, _, files in os.walk(engine_cache): 100 | for f in files: 101 | os.remove(os.path.join(engine_cache, f)) 102 | -------------------------------------------------------------------------------- /search_engine_parser/tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bisohns/search-engine-parser/0c2f4bde7dd21c10e64c9204417d9a228e96c187/search_engine_parser/tests/__init__.py -------------------------------------------------------------------------------- /search_engine_parser/tests/test_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from importlib import import_module 4 | from urllib.parse import urlparse 5 | from unittest.mock import patch, MagicMock 6 | import vcr 7 | from parameterized import parameterized_class 8 | 9 | from search_engine_parser.core.exceptions import NoResultsOrTrafficError 10 | 11 | SEARCH_ARGS = ('Hello', 1) 12 | 13 | 14 | def get_engines(): 15 | """ Returns a list of all engines for tests """ 16 | engines = [] 17 | 18 | base_dir = os.getcwd() 19 | engines_dir = os.path.join(base_dir, 'search_engine_parser', 'core', 'engines') 20 | 21 | for filename in os.listdir(engines_dir): 22 | if os.path.isfile(os.path.join(engines_dir, filename)) and filename.endswith('.py') \ 23 | and filename != '__init__.py': 24 | engine = filename.split('.py')[0] 25 | module = import_module("search_engine_parser.core.engines.{}".format(engine.lower())) 26 | engine_class = getattr(module, "Search") 27 | engines.append([engine, engine_class(),]) 28 | return engines 29 | 30 | 31 | def validate_url(url): 32 | """ Checks if a url is valid 33 | urls must contain scheme, netloc and path 34 | """ 35 | try: 36 | result = urlparse(url) 37 | return all([result.scheme, result.netloc, result.path]) 38 | except BaseException: # pylint: disable=broad-except 39 | print("URL: %s\n" % url) 40 | return False 41 | 42 | 43 | # pylint: disable=no-member 44 | class EngineBaseTest(unittest.TestCase): 45 | """ Testbase for Engines 46 | 47 | provides tests for engine methods 48 | """ 49 | 50 | def setUp(self): 51 | from search_engine_parser.core.engines.google import Search # pylint: disable=import-outside-toplevel 52 | self.engine = Search() 53 | 54 | @patch('search_engine_parser.core.engines.google.Search.get_results') 55 | @patch('search_engine_parser.core.engines.google.Search.get_soup') 56 | async def test_urls(self, get_results_mock, get_soup_mock): 57 | """ Test that url updates work fine """ 58 | await self.engine.search(query="hello", url="google.com.tr") 59 | first_url = self.engine._parsed_url.geturl() 60 | self.assertTrue(validate_url(first_url)) 61 | 62 | self.engine.search(query="World", url="https://google.com.tr") 63 | second_url = self.engine._parsed_url.geturl() 64 | self.assertTrue(validate_url(second_url)) 65 | 66 | self.assertNotEqual(second_url, first_url) 67 | 68 | # Test for https://github.com/bisoncorps/search-engine-parser/issues/92 69 | def test_two_queries_different_results(self): 70 | """ Test that url updates work fine """ 71 | from search_engine_parser.core.engines.google import Search as GoogleSearch # pylint: disable=import-outside-toplevel 72 | from search_engine_parser.core.engines.yahoo import Search as YahooSearch # pylint: disable=import-outside-toplevel 73 | gengine = GoogleSearch() 74 | yahoo_engine = YahooSearch() 75 | gresults = None 76 | gresults = None 77 | with vcr.use_cassette('fixtures/google-test-diff-synopsis.yaml', record_mode='once'): 78 | gresults = gengine.search(query="What's up from this side") 79 | with vcr.use_cassette('fixtures/yahoo-test-diff-synopsis.yaml', record_mode='once'): 80 | yresults = yahoo_engine.search(query="this is example Bob") 81 | for key in gresults[0]: 82 | 
self.assertNotEqual(gresults[0].get(key, "GSearch"), yresults[0].get(key, "Ysearch")) 83 | 84 | self.assertNotEqual(gresults, yresults) 85 | 86 | # pylint: disable=no-member 87 | @parameterized_class(('name', 'engine'), get_engines()) 88 | class TestScraping(unittest.TestCase): 89 | """ Testbase for Engines 90 | 91 | provides tests for titles, description and return urls 92 | """ 93 | engine_class = None 94 | 95 | @classmethod 96 | def setUpClass(cls): 97 | super().setUpClass() 98 | 99 | try: 100 | cls.vcr_search(*SEARCH_ARGS) 101 | except NoResultsOrTrafficError: 102 | raise unittest.SkipTest( 103 | '{} failed due to traffic'.format( 104 | cls.engine)) 105 | 106 | @classmethod 107 | def vcr_search(cls, *args, **kwargs): 108 | print(cls.name) 109 | with vcr.use_cassette('fixtures/{}-{}-synopsis.yaml'.format(cls.name, args[0].replace(" ", "-")), record="once"): 110 | cls.results = cls.engine.search(*args, **kwargs) 111 | 112 | @classmethod 113 | def test_cache_used(cls): 114 | """ 115 | Test that the cache was used 116 | """ 117 | try: 118 | cls.vcr_search(*SEARCH_ARGS, cache=True) 119 | if cls.engine._cache_hit == False: 120 | assert False, "{} cache - unexpected miss".format( 121 | cls.engine.name) 122 | except NoResultsOrTrafficError: 123 | raise unittest.SkipTest( 124 | '{} failed due to traffic'.format( 125 | cls.engine)) 126 | 127 | @classmethod 128 | def test_cache_not_used(cls): 129 | """ 130 | Test that the cache was used 131 | """ 132 | try: 133 | cls.vcr_search(*SEARCH_ARGS, cache=False) 134 | if cls.engine._cache_hit == True: 135 | assert False, "{} cache - unexpected hit".format( 136 | cls.engine.name) 137 | except NoResultsOrTrafficError: 138 | raise unittest.SkipTest( 139 | '{} failed due to traffic'.format( 140 | cls.engine)) 141 | 142 | @classmethod 143 | def test_cache_bypassed(cls): 144 | """ 145 | Test that cache was bypassed 146 | """ 147 | # wrongly set cls.engine._cache_hit 148 | cls.engine._cache_hit = True 149 | try: 150 | cls.vcr_search(*SEARCH_ARGS, cache=False) 151 | if cls.engine._cache_hit == True: 152 | assert False, "{} cache - not bypassed".format( 153 | cls.engine.name) 154 | except NoResultsOrTrafficError: 155 | raise unittest.SkipTest( 156 | '{} failed due to traffic'.format( 157 | cls.engine)) 158 | 159 | def test_search_urls(self): 160 | """ 161 | Test that the search urls generated are valid 162 | """ 163 | self.assertTrue(validate_url(self.engine._parsed_url.geturl())) 164 | 165 | def test_returned_results(self): 166 | """ 167 | Test that the returned results have valid data. 8 is just a chosen value as most search 168 | engines return values more than that 169 | """ 170 | self.assertTrue(len(self.results['titles']) >= 4) 171 | self.assertTrue(len(self.results['links']) >= 4) 172 | # coursera does not return descriptions for 173 | # Preaching to the choir 174 | if not self.engine.name.lower() == "coursera": 175 | self.assertTrue(len(self.results['descriptions']) >= 4) 176 | else: 177 | self.assertTrue(len(self.results["difficulties"]) >= 4) 178 | 179 | def test_links(self): 180 | for link in self.results['links']: 181 | print("{}:::::{}".format(self.name, link)) 182 | # Sometimes googlescholar returns empty links for citation type results 183 | if not link and self.name.lower() == "googlescholar": 184 | continue 185 | self.assertTrue(validate_url(link)) 186 | 187 | def test_results_length_are_the_same(self): 188 | """ Tests if returned result items are equal. 
189 | :param args: a list/tuple of other keys returned 190 | """ 191 | # Different engines have different keys which may be returned or not returned 192 | # So if all keys are not the same length check that the titles and links length are 193 | # the same 194 | default_keys = ["titles", "links"] 195 | default_keys_set = set(map(lambda x: len(self.results[x]), default_keys)) 196 | 197 | items = self.results.keys() 198 | items_set = set(map(lambda x: len(self.results[x]), items)) 199 | 200 | self.assertTrue(len(items_set) == 1 or len(default_keys_set) == 1) 201 | -------------------------------------------------------------------------------- /search_engine_parser/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from unittest.mock import patch, MagicMock 4 | 5 | from search_engine_parser.core import cli 6 | 7 | engine_class_mock = MagicMock() 8 | engine_class_mock.name = "Random Engine Name" 9 | engine_class_mock.clear_cache = MagicMock() 10 | engine_class_mock.search = MagicMock() 11 | 12 | class CliTests(unittest.TestCase): 13 | 14 | def setUp(self): 15 | self.parser = cli.create_parser() 16 | 17 | def test_show_summary(self): 18 | args = self.parser.parse_args(["-e", "google", "--show-summary"]) 19 | # If it executes properly it should return None 20 | self.assertTrue(cli.main(args) is None) 21 | 22 | @patch('search_engine_parser.core.cli.get_engine_class', return_value=engine_class_mock) 23 | def test_query(self, engine_class): 24 | args = self.parser.parse_args(["-e", "google", "Preach"]) 25 | # If it executes properly it should return None 26 | self.assertTrue(cli.main(args) is None) 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | import setuptools 3 | 4 | REQUIRED_PYTHON = (3, 5) 5 | 6 | # Load requirements 7 | REQUIREMENTS = 'requirements/main.txt' 8 | CLI_REQUIREMENTS = 'requirements/cli.txt' 9 | REQUIREMENTS = [line.strip('\n') for line in open(REQUIREMENTS).readlines()] 10 | CLI_REQUIREMENTS = [line.strip('\n') for line in open(CLI_REQUIREMENTS).readlines()] 11 | 12 | with open("README.md", "r", encoding="utf8") as fh: 13 | LONG_DESCRIPTION = fh.read() 14 | 15 | # Trying to load version directly from `search-engine-parser` module attempts 16 | # to load __init__.py which will try to load other libraries not yet installed 17 | with open("search_engine_parser/__init__.py", "rt", encoding="utf8") as f: 18 | VERSION = re.search(r'__version__ = "(.*?)"', f.read(), re.M).group(1) 19 | 20 | setuptools.setup( 21 | name="search-engine-parser", 22 | version=VERSION, 23 | author='Domnan Diretnan, Mmadu Manasseh', 24 | author_email="diretnandomnan@gmail.com", 25 | description="scrapes search engine pages for query titles, descriptions and links", 26 | url="https://github.com/bisoncorps/search-engine-parser", 27 | project_urls={ 28 | "Documentation":"https://search-engine-parser.readthedocs.io/en/latest", 29 | "Source": "https://github.com/bisoncorps/search-engine-parser", 30 | }, 31 | packages=setuptools.find_packages(), 32 | install_requires=REQUIREMENTS, 33 | long_description=LONG_DESCRIPTION, 34 | long_description_content_type="text/markdown", 35 | license="MIT", 36 | keywords='\ 37 | search-engine \ 38 | search \ 39 | parser \ 40 | google \ 41 | yahoo \ 42 | bing \ 43 | yandex \ 44 | stackoverflow \ 45 | github \ 46 | baidu ', 47 | 
entry_points={'console_scripts': [ 48 | 'pysearch=search_engine_parser.core.cli:runner' 49 | ]}, 50 | classifiers=[ 51 | "Programming Language :: Python :: 3", 52 | "License :: OSI Approved :: MIT License", 53 | "Operating System :: OS Independent", 54 | ], 55 | package_data={ 56 | '': ['*.*'], 57 | 'requirements': ['*.*'], 58 | }, 59 | include_package_data=True, 60 | extras_require={ 61 | 'cli': CLI_REQUIREMENTS 62 | }, 63 | python_requires='>={}.{}'.format(*REQUIRED_PYTHON), 64 | ) 65 | --------------------------------------------------------------------------------
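For quick reference, the following is a minimal usage sketch of the package assembled from the files above. It relies only on the public surface exercised in search_engine_parser/tests/test_base.py and test_cli.py (engine classes named Search, a search(query, page) call, list-valued result keys such as "titles", "links" and "descriptions") and on the pysearch console script declared in setup.py; exact keys and extra fields vary per engine, and the CLI lines assume the optional "cli" extra is installed.

# Library usage (sketch based on the bundled tests)
from search_engine_parser.core.engines.google import Search as GoogleSearch

engine = GoogleSearch()
results = engine.search("preaching to the choir", 1)  # positional arguments: query, page
for title, link in zip(results["titles"], results["links"]):
    print(title, "->", link)

# CLI usage (sketch; the console script comes from setup.py's entry_points and
# its dependencies from the "cli" extra: pip install "search-engine-parser[cli]")
#   pysearch -e google "preaching to the choir"
#   pysearch -e google --show-summary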